{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 14860,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0002692333580130578,
"grad_norm": 0.482421875,
"learning_rate": 0.0,
"loss": 4.6193,
"num_input_tokens_seen": 65536,
"step": 1,
"train_runtime": 35.1493,
"train_tokens_per_second": 1864.502
},
{
"epoch": 0.002692333580130578,
"grad_norm": 0.470703125,
"learning_rate": 1.0089686098654709e-06,
"loss": 4.9615,
"num_input_tokens_seen": 655360,
"step": 10,
"train_runtime": 82.7179,
"train_tokens_per_second": 7922.835
},
{
"epoch": 0.005384667160261156,
"grad_norm": 0.478515625,
"learning_rate": 2.1300448430493275e-06,
"loss": 4.8258,
"num_input_tokens_seen": 1310720,
"step": 20,
"train_runtime": 166.8773,
"train_tokens_per_second": 7854.392
},
{
"epoch": 0.008077000740391735,
"grad_norm": 0.5078125,
"learning_rate": 3.251121076233184e-06,
"loss": 4.9214,
"num_input_tokens_seen": 1966080,
"step": 30,
"train_runtime": 256.0013,
"train_tokens_per_second": 7679.96
},
{
"epoch": 0.010769334320522312,
"grad_norm": 0.4765625,
"learning_rate": 4.372197309417041e-06,
"loss": 4.8702,
"num_input_tokens_seen": 2621440,
"step": 40,
"train_runtime": 346.1026,
"train_tokens_per_second": 7574.171
},
{
"epoch": 0.013461667900652891,
"grad_norm": 0.5,
"learning_rate": 5.493273542600897e-06,
"loss": 4.8648,
"num_input_tokens_seen": 3276800,
"step": 50,
"train_runtime": 436.0808,
"train_tokens_per_second": 7514.205
},
{
"epoch": 0.01615400148078347,
"grad_norm": 0.474609375,
"learning_rate": 6.614349775784753e-06,
"loss": 4.9171,
"num_input_tokens_seen": 3932160,
"step": 60,
"train_runtime": 526.5782,
"train_tokens_per_second": 7467.381
},
{
"epoch": 0.018846335060914047,
"grad_norm": 0.4921875,
"learning_rate": 7.73542600896861e-06,
"loss": 4.8927,
"num_input_tokens_seen": 4587520,
"step": 70,
"train_runtime": 616.6177,
"train_tokens_per_second": 7439.813
},
{
"epoch": 0.021538668641044624,
"grad_norm": 0.5078125,
"learning_rate": 8.856502242152467e-06,
"loss": 4.9268,
"num_input_tokens_seen": 5242880,
"step": 80,
"train_runtime": 707.2158,
"train_tokens_per_second": 7413.409
},
{
"epoch": 0.024231002221175205,
"grad_norm": 0.578125,
"learning_rate": 9.977578475336324e-06,
"loss": 4.9262,
"num_input_tokens_seen": 5898240,
"step": 90,
"train_runtime": 797.3568,
"train_tokens_per_second": 7397.241
},
{
"epoch": 0.026923335801305782,
"grad_norm": 0.5234375,
"learning_rate": 1.109865470852018e-05,
"loss": 4.8498,
"num_input_tokens_seen": 6553600,
"step": 100,
"train_runtime": 887.9768,
"train_tokens_per_second": 7380.373
},
{
"epoch": 0.02961566938143636,
"grad_norm": 0.470703125,
"learning_rate": 1.2219730941704037e-05,
"loss": 4.8954,
"num_input_tokens_seen": 7208960,
"step": 110,
"train_runtime": 978.7816,
"train_tokens_per_second": 7365.239
},
{
"epoch": 0.03230800296156694,
"grad_norm": 0.5390625,
"learning_rate": 1.3340807174887892e-05,
"loss": 4.8873,
"num_input_tokens_seen": 7864320,
"step": 120,
"train_runtime": 1070.0192,
"train_tokens_per_second": 7349.7
},
{
"epoch": 0.03500033654169751,
"grad_norm": 0.486328125,
"learning_rate": 1.4461883408071749e-05,
"loss": 4.9083,
"num_input_tokens_seen": 8519680,
"step": 130,
"train_runtime": 1160.919,
"train_tokens_per_second": 7338.738
},
{
"epoch": 0.037692670121828094,
"grad_norm": 0.48046875,
"learning_rate": 1.5582959641255608e-05,
"loss": 4.8152,
"num_input_tokens_seen": 9175040,
"step": 140,
"train_runtime": 1251.55,
"train_tokens_per_second": 7330.942
},
{
"epoch": 0.040385003701958674,
"grad_norm": 0.58984375,
"learning_rate": 1.6704035874439464e-05,
"loss": 4.8949,
"num_input_tokens_seen": 9830400,
"step": 150,
"train_runtime": 1342.147,
"train_tokens_per_second": 7324.384
},
{
"epoch": 0.04307733728208925,
"grad_norm": 0.5234375,
"learning_rate": 1.7825112107623318e-05,
"loss": 4.7979,
"num_input_tokens_seen": 10485760,
"step": 160,
"train_runtime": 1432.7472,
"train_tokens_per_second": 7318.639
},
{
"epoch": 0.04576967086221983,
"grad_norm": 0.4609375,
"learning_rate": 1.8946188340807175e-05,
"loss": 4.9055,
"num_input_tokens_seen": 11141120,
"step": 170,
"train_runtime": 1523.0122,
"train_tokens_per_second": 7315.187
},
{
"epoch": 0.04846200444235041,
"grad_norm": 0.462890625,
"learning_rate": 2.006726457399103e-05,
"loss": 4.79,
"num_input_tokens_seen": 11796480,
"step": 180,
"train_runtime": 1614.1075,
"train_tokens_per_second": 7308.361
},
{
"epoch": 0.05115433802248098,
"grad_norm": 0.50390625,
"learning_rate": 2.1188340807174888e-05,
"loss": 4.8685,
"num_input_tokens_seen": 12451840,
"step": 190,
"train_runtime": 1704.6942,
"train_tokens_per_second": 7304.442
},
{
"epoch": 0.053846671602611564,
"grad_norm": 0.55078125,
"learning_rate": 2.2309417040358745e-05,
"loss": 4.7882,
"num_input_tokens_seen": 13107200,
"step": 200,
"train_runtime": 1795.85,
"train_tokens_per_second": 7298.605
},
{
"epoch": 0.056539005182742144,
"grad_norm": 0.490234375,
"learning_rate": 2.3430493273542602e-05,
"loss": 4.918,
"num_input_tokens_seen": 13762560,
"step": 210,
"train_runtime": 1886.2187,
"train_tokens_per_second": 7296.376
},
{
"epoch": 0.05923133876287272,
"grad_norm": 0.546875,
"learning_rate": 2.455156950672646e-05,
"loss": 4.8228,
"num_input_tokens_seen": 14417920,
"step": 220,
"train_runtime": 1977.1485,
"train_tokens_per_second": 7292.28
},
{
"epoch": 0.0619236723430033,
"grad_norm": 0.5,
"learning_rate": 2.567264573991032e-05,
"loss": 4.8993,
"num_input_tokens_seen": 15073280,
"step": 230,
"train_runtime": 2067.7789,
"train_tokens_per_second": 7289.6
},
{
"epoch": 0.06461600592313388,
"grad_norm": 0.53515625,
"learning_rate": 2.6793721973094172e-05,
"loss": 4.8871,
"num_input_tokens_seen": 15728640,
"step": 240,
"train_runtime": 2158.7764,
"train_tokens_per_second": 7285.905
},
{
"epoch": 0.06730833950326445,
"grad_norm": 0.51953125,
"learning_rate": 2.7914798206278025e-05,
"loss": 4.8695,
"num_input_tokens_seen": 16384000,
"step": 250,
"train_runtime": 2249.0647,
"train_tokens_per_second": 7284.806
},
{
"epoch": 0.07000067308339503,
"grad_norm": 0.55859375,
"learning_rate": 2.9035874439461886e-05,
"loss": 4.8812,
"num_input_tokens_seen": 17039360,
"step": 260,
"train_runtime": 2339.559,
"train_tokens_per_second": 7283.15
},
{
"epoch": 0.07269300666352561,
"grad_norm": 0.52734375,
"learning_rate": 3.015695067264574e-05,
"loss": 4.9003,
"num_input_tokens_seen": 17694720,
"step": 270,
"train_runtime": 2429.9591,
"train_tokens_per_second": 7281.9
},
{
"epoch": 0.07538534024365619,
"grad_norm": 0.5859375,
"learning_rate": 3.12780269058296e-05,
"loss": 4.8836,
"num_input_tokens_seen": 18350080,
"step": 280,
"train_runtime": 2520.4305,
"train_tokens_per_second": 7280.534
},
{
"epoch": 0.07807767382378676,
"grad_norm": 0.5703125,
"learning_rate": 3.2399103139013456e-05,
"loss": 4.87,
"num_input_tokens_seen": 19005440,
"step": 290,
"train_runtime": 2610.6174,
"train_tokens_per_second": 7280.056
},
{
"epoch": 0.08077000740391735,
"grad_norm": 0.51953125,
"learning_rate": 3.3520179372197316e-05,
"loss": 4.9218,
"num_input_tokens_seen": 19660800,
"step": 300,
"train_runtime": 2701.2793,
"train_tokens_per_second": 7278.329
},
{
"epoch": 0.08346234098404792,
"grad_norm": 0.52734375,
"learning_rate": 3.464125560538117e-05,
"loss": 4.8102,
"num_input_tokens_seen": 20316160,
"step": 310,
"train_runtime": 2791.906,
"train_tokens_per_second": 7276.807
},
{
"epoch": 0.0861546745641785,
"grad_norm": 0.5546875,
"learning_rate": 3.576233183856502e-05,
"loss": 4.9781,
"num_input_tokens_seen": 20971520,
"step": 320,
"train_runtime": 2882.398,
"train_tokens_per_second": 7275.72
},
{
"epoch": 0.08884700814430908,
"grad_norm": 0.490234375,
"learning_rate": 3.688340807174888e-05,
"loss": 4.8558,
"num_input_tokens_seen": 21626880,
"step": 330,
"train_runtime": 2973.0085,
"train_tokens_per_second": 7274.409
},
{
"epoch": 0.09153934172443966,
"grad_norm": 0.546875,
"learning_rate": 3.8004484304932737e-05,
"loss": 4.9521,
"num_input_tokens_seen": 22282240,
"step": 340,
"train_runtime": 3063.4637,
"train_tokens_per_second": 7273.545
},
{
"epoch": 0.09423167530457023,
"grad_norm": 0.55859375,
"learning_rate": 3.91255605381166e-05,
"loss": 4.8852,
"num_input_tokens_seen": 22937600,
"step": 350,
"train_runtime": 3153.35,
"train_tokens_per_second": 7274.042
},
{
"epoch": 0.09692400888470082,
"grad_norm": 0.515625,
"learning_rate": 4.024663677130045e-05,
"loss": 4.7577,
"num_input_tokens_seen": 23592960,
"step": 360,
"train_runtime": 3244.206,
"train_tokens_per_second": 7272.337
},
{
"epoch": 0.09961634246483139,
"grad_norm": 0.546875,
"learning_rate": 4.1367713004484303e-05,
"loss": 4.8759,
"num_input_tokens_seen": 24248320,
"step": 370,
"train_runtime": 3334.7646,
"train_tokens_per_second": 7271.374
},
{
"epoch": 0.10230867604496197,
"grad_norm": 0.5546875,
"learning_rate": 4.2488789237668164e-05,
"loss": 4.811,
"num_input_tokens_seen": 24903680,
"step": 380,
"train_runtime": 3425.76,
"train_tokens_per_second": 7269.534
},
{
"epoch": 0.10500100962509255,
"grad_norm": 0.5703125,
"learning_rate": 4.360986547085202e-05,
"loss": 4.7531,
"num_input_tokens_seen": 25559040,
"step": 390,
"train_runtime": 3516.2291,
"train_tokens_per_second": 7268.878
},
{
"epoch": 0.10769334320522313,
"grad_norm": 0.546875,
"learning_rate": 4.473094170403588e-05,
"loss": 4.717,
"num_input_tokens_seen": 26214400,
"step": 400,
"train_runtime": 3606.4366,
"train_tokens_per_second": 7268.782
},
{
"epoch": 0.1103856767853537,
"grad_norm": 0.51953125,
"learning_rate": 4.585201793721973e-05,
"loss": 4.7769,
"num_input_tokens_seen": 26869760,
"step": 410,
"train_runtime": 3697.0154,
"train_tokens_per_second": 7267.96
},
{
"epoch": 0.11307801036548429,
"grad_norm": 0.5546875,
"learning_rate": 4.697309417040359e-05,
"loss": 4.7951,
"num_input_tokens_seen": 27525120,
"step": 420,
"train_runtime": 3787.3387,
"train_tokens_per_second": 7267.668
},
{
"epoch": 0.11577034394561486,
"grad_norm": 0.671875,
"learning_rate": 4.8094170403587444e-05,
"loss": 4.8145,
"num_input_tokens_seen": 28180480,
"step": 430,
"train_runtime": 3877.3269,
"train_tokens_per_second": 7268.018
},
{
"epoch": 0.11846267752574544,
"grad_norm": 0.6328125,
"learning_rate": 4.92152466367713e-05,
"loss": 4.7604,
"num_input_tokens_seen": 28835840,
"step": 440,
"train_runtime": 3967.668,
"train_tokens_per_second": 7267.705
},
{
"epoch": 0.12115501110587602,
"grad_norm": 0.54296875,
"learning_rate": 4.9999994655793676e-05,
"loss": 4.8413,
"num_input_tokens_seen": 29491200,
"step": 450,
"train_runtime": 4058.2143,
"train_tokens_per_second": 7267.039
},
{
"epoch": 0.1238473446860066,
"grad_norm": 0.578125,
"learning_rate": 4.999989964774474e-05,
"loss": 4.7132,
"num_input_tokens_seen": 30146560,
"step": 460,
"train_runtime": 4148.5833,
"train_tokens_per_second": 7266.712
},
{
"epoch": 0.12653967826613718,
"grad_norm": 0.546875,
"learning_rate": 4.99996858800747e-05,
"loss": 4.7559,
"num_input_tokens_seen": 30801920,
"step": 470,
"train_runtime": 4238.9481,
"train_tokens_per_second": 7266.407
},
{
"epoch": 0.12923201184626776,
"grad_norm": 0.578125,
"learning_rate": 4.999935335379901e-05,
"loss": 4.7794,
"num_input_tokens_seen": 31457280,
"step": 480,
"train_runtime": 4329.1664,
"train_tokens_per_second": 7266.36
},
{
"epoch": 0.13192434542639833,
"grad_norm": 0.58984375,
"learning_rate": 4.9998902070497324e-05,
"loss": 4.8323,
"num_input_tokens_seen": 32112640,
"step": 490,
"train_runtime": 4419.7433,
"train_tokens_per_second": 7265.725
},
{
"epoch": 0.1346166790065289,
"grad_norm": 0.58984375,
"learning_rate": 4.999833203231341e-05,
"loss": 4.6969,
"num_input_tokens_seen": 32768000,
"step": 500,
"train_runtime": 4510.5207,
"train_tokens_per_second": 7264.793
},
{
"epoch": 0.13730901258665948,
"grad_norm": 0.56640625,
"learning_rate": 4.9997643241955186e-05,
"loss": 4.7646,
"num_input_tokens_seen": 33423360,
"step": 510,
"train_runtime": 4620.4996,
"train_tokens_per_second": 7233.711
},
{
"epoch": 0.14000134616679005,
"grad_norm": 0.640625,
"learning_rate": 4.9996835702694675e-05,
"loss": 4.8101,
"num_input_tokens_seen": 34078720,
"step": 520,
"train_runtime": 4710.7686,
"train_tokens_per_second": 7234.217
},
{
"epoch": 0.14269367974692065,
"grad_norm": 0.7265625,
"learning_rate": 4.999590941836802e-05,
"loss": 4.7956,
"num_input_tokens_seen": 34734080,
"step": 530,
"train_runtime": 4801.1604,
"train_tokens_per_second": 7234.518
},
{
"epoch": 0.14538601332705123,
"grad_norm": 0.58203125,
"learning_rate": 4.999486439337546e-05,
"loss": 4.6643,
"num_input_tokens_seen": 35389440,
"step": 540,
"train_runtime": 4891.786,
"train_tokens_per_second": 7234.462
},
{
"epoch": 0.1480783469071818,
"grad_norm": 0.58203125,
"learning_rate": 4.999370063268126e-05,
"loss": 4.7488,
"num_input_tokens_seen": 36044800,
"step": 550,
"train_runtime": 4982.5576,
"train_tokens_per_second": 7234.196
},
{
"epoch": 0.15077068048731238,
"grad_norm": 0.625,
"learning_rate": 4.999241814181378e-05,
"loss": 4.7162,
"num_input_tokens_seen": 36700160,
"step": 560,
"train_runtime": 5073.5063,
"train_tokens_per_second": 7233.688
},
{
"epoch": 0.15346301406744295,
"grad_norm": 0.6328125,
"learning_rate": 4.999101692686534e-05,
"loss": 4.7735,
"num_input_tokens_seen": 37355520,
"step": 570,
"train_runtime": 5164.0672,
"train_tokens_per_second": 7233.74
},
{
"epoch": 0.15615534764757352,
"grad_norm": 0.63671875,
"learning_rate": 4.9989496994492305e-05,
"loss": 4.7035,
"num_input_tokens_seen": 38010880,
"step": 580,
"train_runtime": 5254.4846,
"train_tokens_per_second": 7233.988
},
{
"epoch": 0.15884768122770412,
"grad_norm": 0.61328125,
"learning_rate": 4.998785835191495e-05,
"loss": 4.6791,
"num_input_tokens_seen": 38666240,
"step": 590,
"train_runtime": 5344.8949,
"train_tokens_per_second": 7234.238
},
{
"epoch": 0.1615400148078347,
"grad_norm": 0.6796875,
"learning_rate": 4.9986101006917496e-05,
"loss": 4.6101,
"num_input_tokens_seen": 39321600,
"step": 600,
"train_runtime": 5435.7009,
"train_tokens_per_second": 7233.952
},
{
"epoch": 0.16423234838796527,
"grad_norm": 0.625,
"learning_rate": 4.9984224967848035e-05,
"loss": 4.6882,
"num_input_tokens_seen": 39976960,
"step": 610,
"train_runtime": 5525.6374,
"train_tokens_per_second": 7234.814
},
{
"epoch": 0.16692468196809584,
"grad_norm": 0.60546875,
"learning_rate": 4.998223024361852e-05,
"loss": 4.7693,
"num_input_tokens_seen": 40632320,
"step": 620,
"train_runtime": 5616.3935,
"train_tokens_per_second": 7234.593
},
{
"epoch": 0.16961701554822642,
"grad_norm": 0.66015625,
"learning_rate": 4.9980116843704694e-05,
"loss": 4.6585,
"num_input_tokens_seen": 41287680,
"step": 630,
"train_runtime": 5706.8994,
"train_tokens_per_second": 7234.696
},
{
"epoch": 0.172309349128357,
"grad_norm": 0.69921875,
"learning_rate": 4.997788477814606e-05,
"loss": 4.6682,
"num_input_tokens_seen": 41943040,
"step": 640,
"train_runtime": 5797.3335,
"train_tokens_per_second": 7234.885
},
{
"epoch": 0.1750016827084876,
"grad_norm": 0.6875,
"learning_rate": 4.9975534057545815e-05,
"loss": 4.6915,
"num_input_tokens_seen": 42598400,
"step": 650,
"train_runtime": 5888.074,
"train_tokens_per_second": 7234.692
},
{
"epoch": 0.17769401628861817,
"grad_norm": 0.609375,
"learning_rate": 4.997306469307086e-05,
"loss": 4.7539,
"num_input_tokens_seen": 43253760,
"step": 660,
"train_runtime": 5978.6175,
"train_tokens_per_second": 7234.743
},
{
"epoch": 0.18038634986874874,
"grad_norm": 0.6640625,
"learning_rate": 4.997047669645165e-05,
"loss": 4.5955,
"num_input_tokens_seen": 43909120,
"step": 670,
"train_runtime": 6069.1762,
"train_tokens_per_second": 7234.774
},
{
"epoch": 0.18307868344887931,
"grad_norm": 0.69140625,
"learning_rate": 4.9967770079982235e-05,
"loss": 4.6347,
"num_input_tokens_seen": 44564480,
"step": 680,
"train_runtime": 6160.0413,
"train_tokens_per_second": 7234.445
},
{
"epoch": 0.1857710170290099,
"grad_norm": 0.78515625,
"learning_rate": 4.9964944856520116e-05,
"loss": 4.6122,
"num_input_tokens_seen": 45219840,
"step": 690,
"train_runtime": 6250.9481,
"train_tokens_per_second": 7234.077
},
{
"epoch": 0.18846335060914046,
"grad_norm": 0.734375,
"learning_rate": 4.996200103948626e-05,
"loss": 4.6617,
"num_input_tokens_seen": 45875200,
"step": 700,
"train_runtime": 6341.6991,
"train_tokens_per_second": 7233.897
},
{
"epoch": 0.19115568418927106,
"grad_norm": 0.69921875,
"learning_rate": 4.995893864286498e-05,
"loss": 4.7138,
"num_input_tokens_seen": 46530560,
"step": 710,
"train_runtime": 6431.9076,
"train_tokens_per_second": 7234.333
},
{
"epoch": 0.19384801776940164,
"grad_norm": 0.703125,
"learning_rate": 4.9955757681203896e-05,
"loss": 4.5922,
"num_input_tokens_seen": 47185920,
"step": 720,
"train_runtime": 6522.506,
"train_tokens_per_second": 7234.324
},
{
"epoch": 0.1965403513495322,
"grad_norm": 0.7578125,
"learning_rate": 4.995245816961387e-05,
"loss": 4.7699,
"num_input_tokens_seen": 47841280,
"step": 730,
"train_runtime": 6613.3112,
"train_tokens_per_second": 7234.089
},
{
"epoch": 0.19923268492966278,
"grad_norm": 0.6484375,
"learning_rate": 4.9949040123768896e-05,
"loss": 4.7121,
"num_input_tokens_seen": 48496640,
"step": 740,
"train_runtime": 6703.96,
"train_tokens_per_second": 7234.029
},
{
"epoch": 0.20192501850979336,
"grad_norm": 0.63671875,
"learning_rate": 4.994550355990609e-05,
"loss": 4.7765,
"num_input_tokens_seen": 49152000,
"step": 750,
"train_runtime": 6794.2239,
"train_tokens_per_second": 7234.38
},
{
"epoch": 0.20461735208992393,
"grad_norm": 0.6171875,
"learning_rate": 4.994184849482556e-05,
"loss": 4.6858,
"num_input_tokens_seen": 49807360,
"step": 760,
"train_runtime": 6885.0159,
"train_tokens_per_second": 7234.168
},
{
"epoch": 0.20730968567005453,
"grad_norm": 0.6796875,
"learning_rate": 4.993807494589032e-05,
"loss": 4.6799,
"num_input_tokens_seen": 50462720,
"step": 770,
"train_runtime": 6975.1885,
"train_tokens_per_second": 7234.603
},
{
"epoch": 0.2100020192501851,
"grad_norm": 0.69140625,
"learning_rate": 4.9934182931026284e-05,
"loss": 4.6223,
"num_input_tokens_seen": 51118080,
"step": 780,
"train_runtime": 7066.0675,
"train_tokens_per_second": 7234.304
},
{
"epoch": 0.21269435283031568,
"grad_norm": 0.66015625,
"learning_rate": 4.993017246872207e-05,
"loss": 4.7387,
"num_input_tokens_seen": 51773440,
"step": 790,
"train_runtime": 7156.7484,
"train_tokens_per_second": 7234.213
},
{
"epoch": 0.21538668641044625,
"grad_norm": 0.59765625,
"learning_rate": 4.9926043578029e-05,
"loss": 4.6983,
"num_input_tokens_seen": 52428800,
"step": 800,
"train_runtime": 7247.421,
"train_tokens_per_second": 7234.132
},
{
"epoch": 0.21807901999057683,
"grad_norm": 0.65625,
"learning_rate": 4.992179627856097e-05,
"loss": 4.6995,
"num_input_tokens_seen": 53084160,
"step": 810,
"train_runtime": 7337.4649,
"train_tokens_per_second": 7234.673
},
{
"epoch": 0.2207713535707074,
"grad_norm": 0.625,
"learning_rate": 4.9917430590494375e-05,
"loss": 4.7396,
"num_input_tokens_seen": 53739520,
"step": 820,
"train_runtime": 7427.9552,
"train_tokens_per_second": 7234.766
},
{
"epoch": 0.22346368715083798,
"grad_norm": 0.63671875,
"learning_rate": 4.991294653456799e-05,
"loss": 4.7064,
"num_input_tokens_seen": 54394880,
"step": 830,
"train_runtime": 7518.6869,
"train_tokens_per_second": 7234.625
},
{
"epoch": 0.22615602073096858,
"grad_norm": 0.62109375,
"learning_rate": 4.9908344132082894e-05,
"loss": 4.7164,
"num_input_tokens_seen": 55050240,
"step": 840,
"train_runtime": 7609.0857,
"train_tokens_per_second": 7234.804
},
{
"epoch": 0.22884835431109915,
"grad_norm": 0.76953125,
"learning_rate": 4.9903623404902366e-05,
"loss": 4.6648,
"num_input_tokens_seen": 55705600,
"step": 850,
"train_runtime": 7699.3938,
"train_tokens_per_second": 7235.063
},
{
"epoch": 0.23154068789122972,
"grad_norm": 0.69921875,
"learning_rate": 4.989878437545175e-05,
"loss": 4.7801,
"num_input_tokens_seen": 56360960,
"step": 860,
"train_runtime": 7789.7806,
"train_tokens_per_second": 7235.244
},
{
"epoch": 0.2342330214713603,
"grad_norm": 0.7109375,
"learning_rate": 4.98938270667184e-05,
"loss": 4.697,
"num_input_tokens_seen": 57016320,
"step": 870,
"train_runtime": 7880.6012,
"train_tokens_per_second": 7235.022
},
{
"epoch": 0.23692535505149087,
"grad_norm": 0.69921875,
"learning_rate": 4.988875150225154e-05,
"loss": 4.6899,
"num_input_tokens_seen": 57671680,
"step": 880,
"train_runtime": 7970.6998,
"train_tokens_per_second": 7235.46
},
{
"epoch": 0.23961768863162144,
"grad_norm": 0.6875,
"learning_rate": 4.9883557706162146e-05,
"loss": 4.7879,
"num_input_tokens_seen": 58327040,
"step": 890,
"train_runtime": 8061.5195,
"train_tokens_per_second": 7235.241
},
{
"epoch": 0.24231002221175205,
"grad_norm": 0.66015625,
"learning_rate": 4.987824570312285e-05,
"loss": 4.7247,
"num_input_tokens_seen": 58982400,
"step": 900,
"train_runtime": 8152.3787,
"train_tokens_per_second": 7234.993
},
{
"epoch": 0.24500235579188262,
"grad_norm": 0.71484375,
"learning_rate": 4.98728155183678e-05,
"loss": 4.7704,
"num_input_tokens_seen": 59637760,
"step": 910,
"train_runtime": 8242.3679,
"train_tokens_per_second": 7235.513
},
{
"epoch": 0.2476946893720132,
"grad_norm": 0.7578125,
"learning_rate": 4.986726717769259e-05,
"loss": 4.7249,
"num_input_tokens_seen": 60293120,
"step": 920,
"train_runtime": 8333.1851,
"train_tokens_per_second": 7235.303
},
{
"epoch": 0.25038702295214377,
"grad_norm": 0.7578125,
"learning_rate": 4.986160070745405e-05,
"loss": 4.6593,
"num_input_tokens_seen": 60948480,
"step": 930,
"train_runtime": 8423.8926,
"train_tokens_per_second": 7235.192
},
{
"epoch": 0.25307935653227437,
"grad_norm": 0.6796875,
"learning_rate": 4.9855816134570233e-05,
"loss": 4.7105,
"num_input_tokens_seen": 61603840,
"step": 940,
"train_runtime": 8514.3643,
"train_tokens_per_second": 7235.284
},
{
"epoch": 0.2557716901124049,
"grad_norm": 0.6640625,
"learning_rate": 4.9849913486520174e-05,
"loss": 4.7001,
"num_input_tokens_seen": 62259200,
"step": 950,
"train_runtime": 8605.0711,
"train_tokens_per_second": 7235.176
},
{
"epoch": 0.2584640236925355,
"grad_norm": 0.60546875,
"learning_rate": 4.9843892791343835e-05,
"loss": 4.6863,
"num_input_tokens_seen": 62914560,
"step": 960,
"train_runtime": 8695.5454,
"train_tokens_per_second": 7235.263
},
{
"epoch": 0.26115635727266606,
"grad_norm": 0.6875,
"learning_rate": 4.983775407764197e-05,
"loss": 4.6958,
"num_input_tokens_seen": 63569920,
"step": 970,
"train_runtime": 8786.3319,
"train_tokens_per_second": 7235.092
},
{
"epoch": 0.26384869085279666,
"grad_norm": 0.7265625,
"learning_rate": 4.983149737457593e-05,
"loss": 4.6252,
"num_input_tokens_seen": 64225280,
"step": 980,
"train_runtime": 8876.7708,
"train_tokens_per_second": 7235.208
},
{
"epoch": 0.26654102443292726,
"grad_norm": 0.6875,
"learning_rate": 4.982512271186759e-05,
"loss": 4.6473,
"num_input_tokens_seen": 64880640,
"step": 990,
"train_runtime": 8966.8048,
"train_tokens_per_second": 7235.648
},
{
"epoch": 0.2692333580130578,
"grad_norm": 0.65625,
"learning_rate": 4.981863011979917e-05,
"loss": 4.6721,
"num_input_tokens_seen": 65536000,
"step": 1000,
"train_runtime": 9058.0411,
"train_tokens_per_second": 7235.118
},
{
"epoch": 0.2719256915931884,
"grad_norm": 0.69921875,
"learning_rate": 4.98120196292131e-05,
"loss": 4.6244,
"num_input_tokens_seen": 66191360,
"step": 1010,
"train_runtime": 9165.1685,
"train_tokens_per_second": 7222.056
},
{
"epoch": 0.27461802517331896,
"grad_norm": 0.7734375,
"learning_rate": 4.98052912715119e-05,
"loss": 4.7093,
"num_input_tokens_seen": 66846720,
"step": 1020,
"train_runtime": 9256.0483,
"train_tokens_per_second": 7221.95
},
{
"epoch": 0.27731035875344956,
"grad_norm": 0.8046875,
"learning_rate": 4.9798445078657964e-05,
"loss": 4.574,
"num_input_tokens_seen": 67502080,
"step": 1030,
"train_runtime": 9346.5081,
"train_tokens_per_second": 7222.171
},
{
"epoch": 0.2800026923335801,
"grad_norm": 0.6640625,
"learning_rate": 4.979148108317348e-05,
"loss": 4.6194,
"num_input_tokens_seen": 68157440,
"step": 1040,
"train_runtime": 9437.3179,
"train_tokens_per_second": 7222.12
},
{
"epoch": 0.2826950259137107,
"grad_norm": 0.74609375,
"learning_rate": 4.978439931814024e-05,
"loss": 4.5994,
"num_input_tokens_seen": 68812800,
"step": 1050,
"train_runtime": 9528.5122,
"train_tokens_per_second": 7221.778
},
{
"epoch": 0.2853873594938413,
"grad_norm": 0.74609375,
"learning_rate": 4.977719981719949e-05,
"loss": 4.719,
"num_input_tokens_seen": 69468160,
"step": 1060,
"train_runtime": 9619.4388,
"train_tokens_per_second": 7221.644
},
{
"epoch": 0.28807969307397185,
"grad_norm": 0.70703125,
"learning_rate": 4.9769882614551775e-05,
"loss": 4.6091,
"num_input_tokens_seen": 70123520,
"step": 1070,
"train_runtime": 9710.4832,
"train_tokens_per_second": 7221.424
},
{
"epoch": 0.29077202665410246,
"grad_norm": 0.69921875,
"learning_rate": 4.9762447744956754e-05,
"loss": 4.6141,
"num_input_tokens_seen": 70778880,
"step": 1080,
"train_runtime": 9801.2058,
"train_tokens_per_second": 7221.446
},
{
"epoch": 0.293464360234233,
"grad_norm": 0.671875,
"learning_rate": 4.975489524373306e-05,
"loss": 4.7019,
"num_input_tokens_seen": 71434240,
"step": 1090,
"train_runtime": 9891.44,
"train_tokens_per_second": 7221.824
},
{
"epoch": 0.2961566938143636,
"grad_norm": 0.91015625,
"learning_rate": 4.9747225146758127e-05,
"loss": 4.7062,
"num_input_tokens_seen": 72089600,
"step": 1100,
"train_runtime": 9981.7977,
"train_tokens_per_second": 7222.106
},
{
"epoch": 0.2988490273944942,
"grad_norm": 0.7890625,
"learning_rate": 4.973943749046801e-05,
"loss": 4.6744,
"num_input_tokens_seen": 72744960,
"step": 1110,
"train_runtime": 10072.6114,
"train_tokens_per_second": 7222.056
},
{
"epoch": 0.30154136097462475,
"grad_norm": 0.6875,
"learning_rate": 4.973153231185722e-05,
"loss": 4.6924,
"num_input_tokens_seen": 73400320,
"step": 1120,
"train_runtime": 10163.7303,
"train_tokens_per_second": 7221.789
},
{
"epoch": 0.30423369455475535,
"grad_norm": 0.71875,
"learning_rate": 4.972350964847856e-05,
"loss": 4.6235,
"num_input_tokens_seen": 74055680,
"step": 1130,
"train_runtime": 10254.3897,
"train_tokens_per_second": 7221.852
},
{
"epoch": 0.3069260281348859,
"grad_norm": 0.74609375,
"learning_rate": 4.971536953844292e-05,
"loss": 4.6009,
"num_input_tokens_seen": 74711040,
"step": 1140,
"train_runtime": 10344.7658,
"train_tokens_per_second": 7222.11
},
{
"epoch": 0.3096183617150165,
"grad_norm": 0.80859375,
"learning_rate": 4.97071120204191e-05,
"loss": 4.6823,
"num_input_tokens_seen": 75366400,
"step": 1150,
"train_runtime": 10435.3375,
"train_tokens_per_second": 7222.229
},
{
"epoch": 0.31231069529514704,
"grad_norm": 0.8046875,
"learning_rate": 4.969873713363365e-05,
"loss": 4.5743,
"num_input_tokens_seen": 76021760,
"step": 1160,
"train_runtime": 10525.9663,
"train_tokens_per_second": 7222.307
},
{
"epoch": 0.31500302887527765,
"grad_norm": 0.765625,
"learning_rate": 4.9690244917870666e-05,
"loss": 4.5756,
"num_input_tokens_seen": 76677120,
"step": 1170,
"train_runtime": 10616.9618,
"train_tokens_per_second": 7222.134
},
{
"epoch": 0.31769536245540825,
"grad_norm": 0.75390625,
"learning_rate": 4.96816354134716e-05,
"loss": 4.663,
"num_input_tokens_seen": 77332480,
"step": 1180,
"train_runtime": 10707.8238,
"train_tokens_per_second": 7222.054
},
{
"epoch": 0.3203876960355388,
"grad_norm": 0.8671875,
"learning_rate": 4.967290866133509e-05,
"loss": 4.6054,
"num_input_tokens_seen": 77987840,
"step": 1190,
"train_runtime": 10798.3303,
"train_tokens_per_second": 7222.213
},
{
"epoch": 0.3230800296156694,
"grad_norm": 0.734375,
"learning_rate": 4.9664064702916714e-05,
"loss": 4.6963,
"num_input_tokens_seen": 78643200,
"step": 1200,
"train_runtime": 10888.809,
"train_tokens_per_second": 7222.388
},
{
"epoch": 0.32577236319579994,
"grad_norm": 0.9296875,
"learning_rate": 4.965510358022886e-05,
"loss": 4.5616,
"num_input_tokens_seen": 79298560,
"step": 1210,
"train_runtime": 10979.4745,
"train_tokens_per_second": 7222.437
},
{
"epoch": 0.32846469677593054,
"grad_norm": 0.7265625,
"learning_rate": 4.964602533584046e-05,
"loss": 4.58,
"num_input_tokens_seen": 79953920,
"step": 1220,
"train_runtime": 11069.847,
"train_tokens_per_second": 7222.676
},
{
"epoch": 0.3311570303560611,
"grad_norm": 0.77734375,
"learning_rate": 4.9636830012876874e-05,
"loss": 4.5397,
"num_input_tokens_seen": 80609280,
"step": 1230,
"train_runtime": 11160.1095,
"train_tokens_per_second": 7222.983
},
{
"epoch": 0.3338493639361917,
"grad_norm": 0.78125,
"learning_rate": 4.9627517655019576e-05,
"loss": 4.7348,
"num_input_tokens_seen": 81264640,
"step": 1240,
"train_runtime": 11250.738,
"train_tokens_per_second": 7223.05
},
{
"epoch": 0.3365416975163223,
"grad_norm": 0.765625,
"learning_rate": 4.9618088306506033e-05,
"loss": 4.6434,
"num_input_tokens_seen": 81920000,
"step": 1250,
"train_runtime": 11341.6932,
"train_tokens_per_second": 7222.907
},
{
"epoch": 0.33923403109645284,
"grad_norm": 0.8984375,
"learning_rate": 4.9608542012129464e-05,
"loss": 4.5681,
"num_input_tokens_seen": 82575360,
"step": 1260,
"train_runtime": 11432.4636,
"train_tokens_per_second": 7222.884
},
{
"epoch": 0.34192636467658344,
"grad_norm": 0.79296875,
"learning_rate": 4.9598878817238614e-05,
"loss": 4.5804,
"num_input_tokens_seen": 83230720,
"step": 1270,
"train_runtime": 11523.2458,
"train_tokens_per_second": 7222.854
},
{
"epoch": 0.344618698256714,
"grad_norm": 0.7890625,
"learning_rate": 4.9589098767737576e-05,
"loss": 4.603,
"num_input_tokens_seen": 83886080,
"step": 1280,
"train_runtime": 11613.741,
"train_tokens_per_second": 7223.002
},
{
"epoch": 0.3473110318368446,
"grad_norm": 0.69140625,
"learning_rate": 4.9579201910085515e-05,
"loss": 4.6111,
"num_input_tokens_seen": 84541440,
"step": 1290,
"train_runtime": 11704.8318,
"train_tokens_per_second": 7222.781
},
{
"epoch": 0.3500033654169752,
"grad_norm": 0.82421875,
"learning_rate": 4.956918829129652e-05,
"loss": 4.5381,
"num_input_tokens_seen": 85196800,
"step": 1300,
"train_runtime": 11795.0371,
"train_tokens_per_second": 7223.106
},
{
"epoch": 0.35269569899710573,
"grad_norm": 0.76171875,
"learning_rate": 4.955905795893933e-05,
"loss": 4.5358,
"num_input_tokens_seen": 85852160,
"step": 1310,
"train_runtime": 11885.3743,
"train_tokens_per_second": 7223.345
},
{
"epoch": 0.35538803257723633,
"grad_norm": 0.72265625,
"learning_rate": 4.9548810961137084e-05,
"loss": 4.534,
"num_input_tokens_seen": 86507520,
"step": 1320,
"train_runtime": 11976.5494,
"train_tokens_per_second": 7223.075
},
{
"epoch": 0.3580803661573669,
"grad_norm": 0.7265625,
"learning_rate": 4.953844734656719e-05,
"loss": 4.5764,
"num_input_tokens_seen": 87162880,
"step": 1330,
"train_runtime": 12067.2148,
"train_tokens_per_second": 7223.115
},
{
"epoch": 0.3607726997374975,
"grad_norm": 0.734375,
"learning_rate": 4.9527967164460995e-05,
"loss": 4.5397,
"num_input_tokens_seen": 87818240,
"step": 1340,
"train_runtime": 12157.8843,
"train_tokens_per_second": 7223.152
},
{
"epoch": 0.36346503331762803,
"grad_norm": 0.765625,
"learning_rate": 4.9517370464603595e-05,
"loss": 4.6979,
"num_input_tokens_seen": 88473600,
"step": 1350,
"train_runtime": 12248.3715,
"train_tokens_per_second": 7223.295
},
{
"epoch": 0.36615736689775863,
"grad_norm": 0.77734375,
"learning_rate": 4.950665729733359e-05,
"loss": 4.6323,
"num_input_tokens_seen": 89128960,
"step": 1360,
"train_runtime": 12338.7122,
"train_tokens_per_second": 7223.522
},
{
"epoch": 0.36884970047788923,
"grad_norm": 0.8984375,
"learning_rate": 4.949582771354287e-05,
"loss": 4.5588,
"num_input_tokens_seen": 89784320,
"step": 1370,
"train_runtime": 12429.1409,
"train_tokens_per_second": 7223.695
},
{
"epoch": 0.3715420340580198,
"grad_norm": 0.75390625,
"learning_rate": 4.948488176467631e-05,
"loss": 4.5774,
"num_input_tokens_seen": 90439680,
"step": 1380,
"train_runtime": 12519.317,
"train_tokens_per_second": 7224.011
},
{
"epoch": 0.3742343676381504,
"grad_norm": 0.84765625,
"learning_rate": 4.94738195027316e-05,
"loss": 4.519,
"num_input_tokens_seen": 91095040,
"step": 1390,
"train_runtime": 12610.0394,
"train_tokens_per_second": 7224.009
},
{
"epoch": 0.3769267012182809,
"grad_norm": 0.8125,
"learning_rate": 4.946264098025895e-05,
"loss": 4.565,
"num_input_tokens_seen": 91750400,
"step": 1400,
"train_runtime": 12700.1302,
"train_tokens_per_second": 7224.367
},
{
"epoch": 0.3796190347984115,
"grad_norm": 0.83203125,
"learning_rate": 4.945134625036087e-05,
"loss": 4.5675,
"num_input_tokens_seen": 92405760,
"step": 1410,
"train_runtime": 12790.5268,
"train_tokens_per_second": 7224.547
},
{
"epoch": 0.3823113683785421,
"grad_norm": 0.77734375,
"learning_rate": 4.9439935366691855e-05,
"loss": 4.6965,
"num_input_tokens_seen": 93061120,
"step": 1420,
"train_runtime": 12880.7495,
"train_tokens_per_second": 7224.822
},
{
"epoch": 0.3850037019586727,
"grad_norm": 0.79296875,
"learning_rate": 4.9428408383458244e-05,
"loss": 4.5986,
"num_input_tokens_seen": 93716480,
"step": 1430,
"train_runtime": 12971.4303,
"train_tokens_per_second": 7224.838
},
{
"epoch": 0.3876960355388033,
"grad_norm": 0.71875,
"learning_rate": 4.941676535541785e-05,
"loss": 4.4871,
"num_input_tokens_seen": 94371840,
"step": 1440,
"train_runtime": 13061.9531,
"train_tokens_per_second": 7224.941
},
{
"epoch": 0.3903883691189338,
"grad_norm": 0.82421875,
"learning_rate": 4.940500633787976e-05,
"loss": 4.5071,
"num_input_tokens_seen": 95027200,
"step": 1450,
"train_runtime": 13152.757,
"train_tokens_per_second": 7224.888
},
{
"epoch": 0.3930807026990644,
"grad_norm": 0.75,
"learning_rate": 4.9393131386704054e-05,
"loss": 4.4905,
"num_input_tokens_seen": 95682560,
"step": 1460,
"train_runtime": 13243.3978,
"train_tokens_per_second": 7224.925
},
{
"epoch": 0.39577303627919497,
"grad_norm": 0.84375,
"learning_rate": 4.938114055830155e-05,
"loss": 4.4752,
"num_input_tokens_seen": 96337920,
"step": 1470,
"train_runtime": 13333.6559,
"train_tokens_per_second": 7225.169
},
{
"epoch": 0.39846536985932557,
"grad_norm": 0.859375,
"learning_rate": 4.936903390963353e-05,
"loss": 4.5764,
"num_input_tokens_seen": 96993280,
"step": 1480,
"train_runtime": 13424.6859,
"train_tokens_per_second": 7224.994
},
{
"epoch": 0.40115770343945617,
"grad_norm": 0.83984375,
"learning_rate": 4.935681149821147e-05,
"loss": 4.6151,
"num_input_tokens_seen": 97648640,
"step": 1490,
"train_runtime": 13515.4724,
"train_tokens_per_second": 7224.952
},
{
"epoch": 0.4038500370195867,
"grad_norm": 0.7734375,
"learning_rate": 4.9344473382096747e-05,
"loss": 4.4216,
"num_input_tokens_seen": 98304000,
"step": 1500,
"train_runtime": 13606.7544,
"train_tokens_per_second": 7224.647
},
{
"epoch": 0.4065423705997173,
"grad_norm": 0.77734375,
"learning_rate": 4.93320196199004e-05,
"loss": 4.6077,
"num_input_tokens_seen": 98959360,
"step": 1510,
"train_runtime": 13713.7734,
"train_tokens_per_second": 7216.056
},
{
"epoch": 0.40923470417984786,
"grad_norm": 0.84375,
"learning_rate": 4.931945027078283e-05,
"loss": 4.4706,
"num_input_tokens_seen": 99614720,
"step": 1520,
"train_runtime": 13803.8041,
"train_tokens_per_second": 7216.469
},
{
"epoch": 0.41192703775997846,
"grad_norm": 0.890625,
"learning_rate": 4.9306765394453524e-05,
"loss": 4.5019,
"num_input_tokens_seen": 100270080,
"step": 1530,
"train_runtime": 13894.7642,
"train_tokens_per_second": 7216.393
},
{
"epoch": 0.41461937134010907,
"grad_norm": 0.828125,
"learning_rate": 4.9293965051170775e-05,
"loss": 4.5346,
"num_input_tokens_seen": 100925440,
"step": 1540,
"train_runtime": 13985.7264,
"train_tokens_per_second": 7216.317
},
{
"epoch": 0.4173117049202396,
"grad_norm": 0.77734375,
"learning_rate": 4.928104930174137e-05,
"loss": 4.5612,
"num_input_tokens_seen": 101580800,
"step": 1550,
"train_runtime": 14076.8567,
"train_tokens_per_second": 7216.156
},
{
"epoch": 0.4200040385003702,
"grad_norm": 0.81640625,
"learning_rate": 4.926801820752035e-05,
"loss": 4.5452,
"num_input_tokens_seen": 102236160,
"step": 1560,
"train_runtime": 14167.6497,
"train_tokens_per_second": 7216.169
},
{
"epoch": 0.42269637208050076,
"grad_norm": 0.89453125,
"learning_rate": 4.925487183041065e-05,
"loss": 4.555,
"num_input_tokens_seen": 102891520,
"step": 1570,
"train_runtime": 14258.1592,
"train_tokens_per_second": 7216.326
},
{
"epoch": 0.42538870566063136,
"grad_norm": 0.99609375,
"learning_rate": 4.924161023286291e-05,
"loss": 4.4827,
"num_input_tokens_seen": 103546880,
"step": 1580,
"train_runtime": 14349.9231,
"train_tokens_per_second": 7215.849
},
{
"epoch": 0.4280810392407619,
"grad_norm": 0.84765625,
"learning_rate": 4.9228233477875044e-05,
"loss": 4.4487,
"num_input_tokens_seen": 104202240,
"step": 1590,
"train_runtime": 14440.8839,
"train_tokens_per_second": 7215.78
},
{
"epoch": 0.4307733728208925,
"grad_norm": 0.765625,
"learning_rate": 4.921474162899206e-05,
"loss": 4.3774,
"num_input_tokens_seen": 104857600,
"step": 1600,
"train_runtime": 14532.0545,
"train_tokens_per_second": 7215.607
},
{
"epoch": 0.4334657064010231,
"grad_norm": 0.7421875,
"learning_rate": 4.920113475030568e-05,
"loss": 4.5434,
"num_input_tokens_seen": 105512960,
"step": 1610,
"train_runtime": 14623.2846,
"train_tokens_per_second": 7215.408
},
{
"epoch": 0.43615803998115366,
"grad_norm": 0.8515625,
"learning_rate": 4.9187412906454066e-05,
"loss": 4.4334,
"num_input_tokens_seen": 106168320,
"step": 1620,
"train_runtime": 14714.7556,
"train_tokens_per_second": 7215.092
},
{
"epoch": 0.43885037356128426,
"grad_norm": 0.93359375,
"learning_rate": 4.917357616262153e-05,
"loss": 4.5523,
"num_input_tokens_seen": 106823680,
"step": 1630,
"train_runtime": 14805.4885,
"train_tokens_per_second": 7215.14
},
{
"epoch": 0.4415427071414148,
"grad_norm": 0.890625,
"learning_rate": 4.91596245845382e-05,
"loss": 4.5495,
"num_input_tokens_seen": 107479040,
"step": 1640,
"train_runtime": 14896.2954,
"train_tokens_per_second": 7215.152
},
{
"epoch": 0.4442350407215454,
"grad_norm": 0.828125,
"learning_rate": 4.914555823847969e-05,
"loss": 4.496,
"num_input_tokens_seen": 108134400,
"step": 1650,
"train_runtime": 14987.0507,
"train_tokens_per_second": 7215.189
},
{
"epoch": 0.44692737430167595,
"grad_norm": 0.8984375,
"learning_rate": 4.913137719126684e-05,
"loss": 4.4586,
"num_input_tokens_seen": 108789760,
"step": 1660,
"train_runtime": 15078.4493,
"train_tokens_per_second": 7214.917
},
{
"epoch": 0.44961970788180655,
"grad_norm": 0.87890625,
"learning_rate": 4.911708151026535e-05,
"loss": 4.4917,
"num_input_tokens_seen": 109445120,
"step": 1670,
"train_runtime": 15169.1949,
"train_tokens_per_second": 7214.959
},
{
"epoch": 0.45231204146193715,
"grad_norm": 0.80859375,
"learning_rate": 4.910267126338547e-05,
"loss": 4.444,
"num_input_tokens_seen": 110100480,
"step": 1680,
"train_runtime": 15260.849,
"train_tokens_per_second": 7214.571
},
{
"epoch": 0.4550043750420677,
"grad_norm": 0.859375,
"learning_rate": 4.90881465190817e-05,
"loss": 4.5261,
"num_input_tokens_seen": 110755840,
"step": 1690,
"train_runtime": 15351.7417,
"train_tokens_per_second": 7214.546
},
{
"epoch": 0.4576967086221983,
"grad_norm": 0.8671875,
"learning_rate": 4.9073507346352446e-05,
"loss": 4.4845,
"num_input_tokens_seen": 111411200,
"step": 1700,
"train_runtime": 15443.3127,
"train_tokens_per_second": 7214.203
},
{
"epoch": 0.46038904220232885,
"grad_norm": 0.83984375,
"learning_rate": 4.905875381473968e-05,
"loss": 4.4624,
"num_input_tokens_seen": 112066560,
"step": 1710,
"train_runtime": 15534.4543,
"train_tokens_per_second": 7214.065
},
{
"epoch": 0.46308137578245945,
"grad_norm": 0.86328125,
"learning_rate": 4.904388599432864e-05,
"loss": 4.4836,
"num_input_tokens_seen": 112721920,
"step": 1720,
"train_runtime": 15625.6018,
"train_tokens_per_second": 7213.925
},
{
"epoch": 0.46577370936259005,
"grad_norm": 0.91796875,
"learning_rate": 4.902890395574749e-05,
"loss": 4.4618,
"num_input_tokens_seen": 113377280,
"step": 1730,
"train_runtime": 15716.5331,
"train_tokens_per_second": 7213.886
},
{
"epoch": 0.4684660429427206,
"grad_norm": 0.86328125,
"learning_rate": 4.901380777016695e-05,
"loss": 4.3769,
"num_input_tokens_seen": 114032640,
"step": 1740,
"train_runtime": 15807.8819,
"train_tokens_per_second": 7213.657
},
{
"epoch": 0.4711583765228512,
"grad_norm": 0.77734375,
"learning_rate": 4.899859750930001e-05,
"loss": 4.553,
"num_input_tokens_seen": 114688000,
"step": 1750,
"train_runtime": 15899.4164,
"train_tokens_per_second": 7213.347
},
{
"epoch": 0.47385071010298174,
"grad_norm": 0.8671875,
"learning_rate": 4.898327324540154e-05,
"loss": 4.4616,
"num_input_tokens_seen": 115343360,
"step": 1760,
"train_runtime": 15990.6527,
"train_tokens_per_second": 7213.174
},
{
"epoch": 0.47654304368311234,
"grad_norm": 0.83984375,
"learning_rate": 4.8967835051267995e-05,
"loss": 4.455,
"num_input_tokens_seen": 115998720,
"step": 1770,
"train_runtime": 16082.1424,
"train_tokens_per_second": 7212.89
},
{
"epoch": 0.4792353772632429,
"grad_norm": 0.96484375,
"learning_rate": 4.895228300023703e-05,
"loss": 4.4788,
"num_input_tokens_seen": 116654080,
"step": 1780,
"train_runtime": 16173.2498,
"train_tokens_per_second": 7212.779
},
{
"epoch": 0.4819277108433735,
"grad_norm": 0.93359375,
"learning_rate": 4.893661716618716e-05,
"loss": 4.347,
"num_input_tokens_seen": 117309440,
"step": 1790,
"train_runtime": 16264.5135,
"train_tokens_per_second": 7212.601
},
{
"epoch": 0.4846200444235041,
"grad_norm": 0.8125,
"learning_rate": 4.892083762353744e-05,
"loss": 4.4545,
"num_input_tokens_seen": 117964800,
"step": 1800,
"train_runtime": 16355.8454,
"train_tokens_per_second": 7212.394
},
{
"epoch": 0.48731237800363464,
"grad_norm": 0.859375,
"learning_rate": 4.890494444724706e-05,
"loss": 4.4354,
"num_input_tokens_seen": 118620160,
"step": 1810,
"train_runtime": 16447.3794,
"train_tokens_per_second": 7212.101
},
{
"epoch": 0.49000471158376524,
"grad_norm": 0.85546875,
"learning_rate": 4.8888937712815034e-05,
"loss": 4.4927,
"num_input_tokens_seen": 119275520,
"step": 1820,
"train_runtime": 16538.3871,
"train_tokens_per_second": 7212.041
},
{
"epoch": 0.4926970451638958,
"grad_norm": 0.76171875,
"learning_rate": 4.887281749627981e-05,
"loss": 4.4122,
"num_input_tokens_seen": 119930880,
"step": 1830,
"train_runtime": 16629.7048,
"train_tokens_per_second": 7211.847
},
{
"epoch": 0.4953893787440264,
"grad_norm": 0.921875,
"learning_rate": 4.8856583874218926e-05,
"loss": 4.3483,
"num_input_tokens_seen": 120586240,
"step": 1840,
"train_runtime": 16721.1413,
"train_tokens_per_second": 7211.603
},
{
"epoch": 0.498081712324157,
"grad_norm": 0.953125,
"learning_rate": 4.884023692374865e-05,
"loss": 4.3592,
"num_input_tokens_seen": 121241600,
"step": 1850,
"train_runtime": 16812.5418,
"train_tokens_per_second": 7211.378
},
{
"epoch": 0.5007740459042875,
"grad_norm": 1.03125,
"learning_rate": 4.8823776722523596e-05,
"loss": 4.4645,
"num_input_tokens_seen": 121896960,
"step": 1860,
"train_runtime": 16903.8606,
"train_tokens_per_second": 7211.191
},
{
"epoch": 0.5034663794844181,
"grad_norm": 0.91796875,
"learning_rate": 4.880720334873638e-05,
"loss": 4.4003,
"num_input_tokens_seen": 122552320,
"step": 1870,
"train_runtime": 16995.2262,
"train_tokens_per_second": 7210.985
},
{
"epoch": 0.5061587130645487,
"grad_norm": 0.9140625,
"learning_rate": 4.879051688111719e-05,
"loss": 4.474,
"num_input_tokens_seen": 123207680,
"step": 1880,
"train_runtime": 17086.3329,
"train_tokens_per_second": 7210.891
},
{
"epoch": 0.5088510466446793,
"grad_norm": 0.91796875,
"learning_rate": 4.877371739893352e-05,
"loss": 4.4175,
"num_input_tokens_seen": 123863040,
"step": 1890,
"train_runtime": 17177.0386,
"train_tokens_per_second": 7210.966
},
{
"epoch": 0.5115433802248098,
"grad_norm": 0.87890625,
"learning_rate": 4.875680498198968e-05,
"loss": 4.3441,
"num_input_tokens_seen": 124518400,
"step": 1900,
"train_runtime": 17267.9962,
"train_tokens_per_second": 7210.935
},
{
"epoch": 0.5142357138049405,
"grad_norm": 0.890625,
"learning_rate": 4.873977971062649e-05,
"loss": 4.5013,
"num_input_tokens_seen": 125173760,
"step": 1910,
"train_runtime": 17359.2257,
"train_tokens_per_second": 7210.792
},
{
"epoch": 0.516928047385071,
"grad_norm": 0.84765625,
"learning_rate": 4.872264166572086e-05,
"loss": 4.3947,
"num_input_tokens_seen": 125829120,
"step": 1920,
"train_runtime": 17450.3296,
"train_tokens_per_second": 7210.702
},
{
"epoch": 0.5196203809652016,
"grad_norm": 0.92578125,
"learning_rate": 4.870539092868542e-05,
"loss": 4.5027,
"num_input_tokens_seen": 126484480,
"step": 1930,
"train_runtime": 17541.1892,
"train_tokens_per_second": 7210.713
},
{
"epoch": 0.5223127145453321,
"grad_norm": 0.92578125,
"learning_rate": 4.868802758146816e-05,
"loss": 4.4597,
"num_input_tokens_seen": 127139840,
"step": 1940,
"train_runtime": 17632.9992,
"train_tokens_per_second": 7210.335
},
{
"epoch": 0.5250050481254628,
"grad_norm": 0.84375,
"learning_rate": 4.867055170655197e-05,
"loss": 4.4397,
"num_input_tokens_seen": 127795200,
"step": 1950,
"train_runtime": 17724.0126,
"train_tokens_per_second": 7210.286
},
{
"epoch": 0.5276973817055933,
"grad_norm": 0.85546875,
"learning_rate": 4.865296338695432e-05,
"loss": 4.2699,
"num_input_tokens_seen": 128450560,
"step": 1960,
"train_runtime": 17815.026,
"train_tokens_per_second": 7210.237
},
{
"epoch": 0.5303897152857239,
"grad_norm": 0.94921875,
"learning_rate": 4.863526270622683e-05,
"loss": 4.4383,
"num_input_tokens_seen": 129105920,
"step": 1970,
"train_runtime": 17905.7807,
"train_tokens_per_second": 7210.293
},
{
"epoch": 0.5330820488658545,
"grad_norm": 0.94140625,
"learning_rate": 4.86174497484549e-05,
"loss": 4.4799,
"num_input_tokens_seen": 129761280,
"step": 1980,
"train_runtime": 17997.6594,
"train_tokens_per_second": 7209.898
},
{
"epoch": 0.5357743824459851,
"grad_norm": 0.8046875,
"learning_rate": 4.859952459825726e-05,
"loss": 4.4029,
"num_input_tokens_seen": 130416640,
"step": 1990,
"train_runtime": 18088.8465,
"train_tokens_per_second": 7209.782
},
{
"epoch": 0.5384667160261156,
"grad_norm": 0.86328125,
"learning_rate": 4.8581487340785614e-05,
"loss": 4.3795,
"num_input_tokens_seen": 131072000,
"step": 2000,
"train_runtime": 18180.2553,
"train_tokens_per_second": 7209.58
},
{
"epoch": 0.5411590496062462,
"grad_norm": 0.86328125,
"learning_rate": 4.856333806172422e-05,
"loss": 4.3712,
"num_input_tokens_seen": 131727360,
"step": 2010,
"train_runtime": 18289.7026,
"train_tokens_per_second": 7202.269
},
{
"epoch": 0.5438513831863768,
"grad_norm": 0.7890625,
"learning_rate": 4.8545076847289495e-05,
"loss": 4.4262,
"num_input_tokens_seen": 132382720,
"step": 2020,
"train_runtime": 18380.7598,
"train_tokens_per_second": 7202.244
},
{
"epoch": 0.5465437167665074,
"grad_norm": 0.91796875,
"learning_rate": 4.8526703784229566e-05,
"loss": 4.4754,
"num_input_tokens_seen": 133038080,
"step": 2030,
"train_runtime": 18471.8385,
"train_tokens_per_second": 7202.211
},
{
"epoch": 0.5492360503466379,
"grad_norm": 0.87109375,
"learning_rate": 4.8508218959823916e-05,
"loss": 4.409,
"num_input_tokens_seen": 133693440,
"step": 2040,
"train_runtime": 18563.4788,
"train_tokens_per_second": 7201.96
},
{
"epoch": 0.5519283839267686,
"grad_norm": 0.8984375,
"learning_rate": 4.848962246188292e-05,
"loss": 4.3891,
"num_input_tokens_seen": 134348800,
"step": 2050,
"train_runtime": 18654.1561,
"train_tokens_per_second": 7202.084
},
{
"epoch": 0.5546207175068991,
"grad_norm": 0.9375,
"learning_rate": 4.8470914378747464e-05,
"loss": 4.3823,
"num_input_tokens_seen": 135004160,
"step": 2060,
"train_runtime": 18745.2629,
"train_tokens_per_second": 7202.041
},
{
"epoch": 0.5573130510870297,
"grad_norm": 0.8671875,
"learning_rate": 4.845209479928849e-05,
"loss": 4.3553,
"num_input_tokens_seen": 135659520,
"step": 2070,
"train_runtime": 18836.6425,
"train_tokens_per_second": 7201.895
},
{
"epoch": 0.5600053846671602,
"grad_norm": 0.87109375,
"learning_rate": 4.843316381290661e-05,
"loss": 4.3418,
"num_input_tokens_seen": 136314880,
"step": 2080,
"train_runtime": 18927.6258,
"train_tokens_per_second": 7201.901
},
{
"epoch": 0.5626977182472909,
"grad_norm": 0.90625,
"learning_rate": 4.8414121509531645e-05,
"loss": 4.4231,
"num_input_tokens_seen": 136970240,
"step": 2090,
"train_runtime": 19018.1391,
"train_tokens_per_second": 7202.084
},
{
"epoch": 0.5653900518274214,
"grad_norm": 0.9375,
"learning_rate": 4.839496797962224e-05,
"loss": 4.4212,
"num_input_tokens_seen": 137625600,
"step": 2100,
"train_runtime": 19109.2304,
"train_tokens_per_second": 7202.048
},
{
"epoch": 0.568082385407552,
"grad_norm": 0.9375,
"learning_rate": 4.837570331416539e-05,
"loss": 4.4107,
"num_input_tokens_seen": 138280960,
"step": 2110,
"train_runtime": 19200.5208,
"train_tokens_per_second": 7201.938
},
{
"epoch": 0.5707747189876826,
"grad_norm": 0.921875,
"learning_rate": 4.835632760467604e-05,
"loss": 4.3413,
"num_input_tokens_seen": 138936320,
"step": 2120,
"train_runtime": 19292.2653,
"train_tokens_per_second": 7201.659
},
{
"epoch": 0.5734670525678132,
"grad_norm": 0.84375,
"learning_rate": 4.8336840943196636e-05,
"loss": 4.3488,
"num_input_tokens_seen": 139591680,
"step": 2130,
"train_runtime": 19382.8953,
"train_tokens_per_second": 7201.797
},
{
"epoch": 0.5761593861479437,
"grad_norm": 0.88671875,
"learning_rate": 4.8317243422296695e-05,
"loss": 4.3576,
"num_input_tokens_seen": 140247040,
"step": 2140,
"train_runtime": 19474.0961,
"train_tokens_per_second": 7201.723
},
{
"epoch": 0.5788517197280743,
"grad_norm": 0.8984375,
"learning_rate": 4.8297535135072345e-05,
"loss": 4.3835,
"num_input_tokens_seen": 140902400,
"step": 2150,
"train_runtime": 19565.1786,
"train_tokens_per_second": 7201.693
},
{
"epoch": 0.5815440533082049,
"grad_norm": 0.9140625,
"learning_rate": 4.8277716175145926e-05,
"loss": 4.4169,
"num_input_tokens_seen": 141557760,
"step": 2160,
"train_runtime": 19656.3068,
"train_tokens_per_second": 7201.646
},
{
"epoch": 0.5842363868883355,
"grad_norm": 0.8515625,
"learning_rate": 4.825778663666549e-05,
"loss": 4.3731,
"num_input_tokens_seen": 142213120,
"step": 2170,
"train_runtime": 19747.5839,
"train_tokens_per_second": 7201.545
},
{
"epoch": 0.586928720468466,
"grad_norm": 0.796875,
"learning_rate": 4.8237746614304404e-05,
"loss": 4.3626,
"num_input_tokens_seen": 142868480,
"step": 2180,
"train_runtime": 19838.3892,
"train_tokens_per_second": 7201.617
},
{
"epoch": 0.5896210540485967,
"grad_norm": 0.83203125,
"learning_rate": 4.821759620326086e-05,
"loss": 4.4174,
"num_input_tokens_seen": 143523840,
"step": 2190,
"train_runtime": 19929.5944,
"train_tokens_per_second": 7201.543
},
{
"epoch": 0.5923133876287272,
"grad_norm": 0.90234375,
"learning_rate": 4.819733549925746e-05,
"loss": 4.2844,
"num_input_tokens_seen": 144179200,
"step": 2200,
"train_runtime": 20020.9319,
"train_tokens_per_second": 7201.423
},
{
"epoch": 0.5950057212088578,
"grad_norm": 0.875,
"learning_rate": 4.817696459854072e-05,
"loss": 4.3374,
"num_input_tokens_seen": 144834560,
"step": 2210,
"train_runtime": 20111.5767,
"train_tokens_per_second": 7201.552
},
{
"epoch": 0.5976980547889884,
"grad_norm": 0.875,
"learning_rate": 4.815648359788065e-05,
"loss": 4.3078,
"num_input_tokens_seen": 145489920,
"step": 2220,
"train_runtime": 20203.0906,
"train_tokens_per_second": 7201.369
},
{
"epoch": 0.600390388369119,
"grad_norm": 0.84375,
"learning_rate": 4.8135892594570284e-05,
"loss": 4.422,
"num_input_tokens_seen": 146145280,
"step": 2230,
"train_runtime": 20294.0976,
"train_tokens_per_second": 7201.369
},
{
"epoch": 0.6030827219492495,
"grad_norm": 0.93359375,
"learning_rate": 4.81151916864252e-05,
"loss": 4.3721,
"num_input_tokens_seen": 146800640,
"step": 2240,
"train_runtime": 20384.6845,
"train_tokens_per_second": 7201.516
},
{
"epoch": 0.60577505552938,
"grad_norm": 0.921875,
"learning_rate": 4.809438097178306e-05,
"loss": 4.3855,
"num_input_tokens_seen": 147456000,
"step": 2250,
"train_runtime": 20476.0301,
"train_tokens_per_second": 7201.396
},
{
"epoch": 0.6084673891095107,
"grad_norm": 0.98046875,
"learning_rate": 4.807346054950319e-05,
"loss": 4.2855,
"num_input_tokens_seen": 148111360,
"step": 2260,
"train_runtime": 20567.2353,
"train_tokens_per_second": 7201.326
},
{
"epoch": 0.6111597226896412,
"grad_norm": 0.90625,
"learning_rate": 4.805243051896603e-05,
"loss": 4.3628,
"num_input_tokens_seen": 148766720,
"step": 2270,
"train_runtime": 20657.7792,
"train_tokens_per_second": 7201.487
},
{
"epoch": 0.6138520562697718,
"grad_norm": 0.8828125,
"learning_rate": 4.8031290980072714e-05,
"loss": 4.4122,
"num_input_tokens_seen": 149422080,
"step": 2280,
"train_runtime": 20749.1493,
"train_tokens_per_second": 7201.359
},
{
"epoch": 0.6165443898499025,
"grad_norm": 1.0625,
"learning_rate": 4.80100420332446e-05,
"loss": 4.4566,
"num_input_tokens_seen": 150077440,
"step": 2290,
"train_runtime": 20840.466,
"train_tokens_per_second": 7201.252
},
{
"epoch": 0.619236723430033,
"grad_norm": 0.890625,
"learning_rate": 4.798868377942276e-05,
"loss": 4.4265,
"num_input_tokens_seen": 150732800,
"step": 2300,
"train_runtime": 20931.623,
"train_tokens_per_second": 7201.2
},
{
"epoch": 0.6219290570101635,
"grad_norm": 0.9375,
"learning_rate": 4.796721632006754e-05,
"loss": 4.3916,
"num_input_tokens_seen": 151388160,
"step": 2310,
"train_runtime": 21022.8072,
"train_tokens_per_second": 7201.139
},
{
"epoch": 0.6246213905902941,
"grad_norm": 0.85546875,
"learning_rate": 4.794563975715803e-05,
"loss": 4.2368,
"num_input_tokens_seen": 152043520,
"step": 2320,
"train_runtime": 21114.2106,
"train_tokens_per_second": 7201.004
},
{
"epoch": 0.6273137241704247,
"grad_norm": 0.8359375,
"learning_rate": 4.792395419319163e-05,
"loss": 4.3248,
"num_input_tokens_seen": 152698880,
"step": 2330,
"train_runtime": 21205.6312,
"train_tokens_per_second": 7200.865
},
{
"epoch": 0.6300060577505553,
"grad_norm": 0.94921875,
"learning_rate": 4.7902159731183524e-05,
"loss": 4.2703,
"num_input_tokens_seen": 153354240,
"step": 2340,
"train_runtime": 21297.559,
"train_tokens_per_second": 7200.555
},
{
"epoch": 0.6326983913306858,
"grad_norm": 0.8359375,
"learning_rate": 4.7880256474666194e-05,
"loss": 4.3804,
"num_input_tokens_seen": 154009600,
"step": 2350,
"train_runtime": 21389.4114,
"train_tokens_per_second": 7200.273
},
{
"epoch": 0.6353907249108165,
"grad_norm": 1.0,
"learning_rate": 4.785824452768898e-05,
"loss": 4.3842,
"num_input_tokens_seen": 154664960,
"step": 2360,
"train_runtime": 21480.3832,
"train_tokens_per_second": 7200.289
},
{
"epoch": 0.638083058490947,
"grad_norm": 1.046875,
"learning_rate": 4.783612399481751e-05,
"loss": 4.2943,
"num_input_tokens_seen": 155320320,
"step": 2370,
"train_runtime": 21571.6877,
"train_tokens_per_second": 7200.193
},
{
"epoch": 0.6407753920710776,
"grad_norm": 1.015625,
"learning_rate": 4.781389498113324e-05,
"loss": 4.326,
"num_input_tokens_seen": 155975680,
"step": 2380,
"train_runtime": 21663.2694,
"train_tokens_per_second": 7200.006
},
{
"epoch": 0.6434677256512081,
"grad_norm": 0.828125,
"learning_rate": 4.779155759223298e-05,
"loss": 4.3084,
"num_input_tokens_seen": 156631040,
"step": 2390,
"train_runtime": 21754.438,
"train_tokens_per_second": 7199.958
},
{
"epoch": 0.6461600592313388,
"grad_norm": 0.859375,
"learning_rate": 4.776911193422835e-05,
"loss": 4.3083,
"num_input_tokens_seen": 157286400,
"step": 2400,
"train_runtime": 21845.5014,
"train_tokens_per_second": 7199.945
},
{
"epoch": 0.6488523928114693,
"grad_norm": 0.9296875,
"learning_rate": 4.7746558113745276e-05,
"loss": 4.3757,
"num_input_tokens_seen": 157941760,
"step": 2410,
"train_runtime": 21937.6885,
"train_tokens_per_second": 7199.563
},
{
"epoch": 0.6515447263915999,
"grad_norm": 1.0,
"learning_rate": 4.7723896237923526e-05,
"loss": 4.3263,
"num_input_tokens_seen": 158597120,
"step": 2420,
"train_runtime": 22028.8679,
"train_tokens_per_second": 7199.513
},
{
"epoch": 0.6542370599717305,
"grad_norm": 0.87109375,
"learning_rate": 4.770112641441616e-05,
"loss": 4.3565,
"num_input_tokens_seen": 159252480,
"step": 2430,
"train_runtime": 22120.3426,
"train_tokens_per_second": 7199.368
},
{
"epoch": 0.6569293935518611,
"grad_norm": 0.87109375,
"learning_rate": 4.767824875138904e-05,
"loss": 4.4035,
"num_input_tokens_seen": 159907840,
"step": 2440,
"train_runtime": 22211.4295,
"train_tokens_per_second": 7199.349
},
{
"epoch": 0.6596217271319916,
"grad_norm": 0.9375,
"learning_rate": 4.7655263357520304e-05,
"loss": 4.3352,
"num_input_tokens_seen": 160563200,
"step": 2450,
"train_runtime": 22302.9882,
"train_tokens_per_second": 7199.179
},
{
"epoch": 0.6623140607121222,
"grad_norm": 0.90234375,
"learning_rate": 4.763217034199986e-05,
"loss": 4.3358,
"num_input_tokens_seen": 161218560,
"step": 2460,
"train_runtime": 22394.5547,
"train_tokens_per_second": 7199.007
},
{
"epoch": 0.6650063942922528,
"grad_norm": 1.265625,
"learning_rate": 4.760896981452885e-05,
"loss": 4.3606,
"num_input_tokens_seen": 161873920,
"step": 2470,
"train_runtime": 22486.3138,
"train_tokens_per_second": 7198.775
},
{
"epoch": 0.6676987278723834,
"grad_norm": 0.8515625,
"learning_rate": 4.758566188531916e-05,
"loss": 4.2709,
"num_input_tokens_seen": 162529280,
"step": 2480,
"train_runtime": 22577.8549,
"train_tokens_per_second": 7198.615
},
{
"epoch": 0.6703910614525139,
"grad_norm": 0.8828125,
"learning_rate": 4.756224666509286e-05,
"loss": 4.2548,
"num_input_tokens_seen": 163184640,
"step": 2490,
"train_runtime": 22669.1281,
"train_tokens_per_second": 7198.541
},
{
"epoch": 0.6730833950326446,
"grad_norm": 0.87890625,
"learning_rate": 4.753872426508171e-05,
"loss": 4.2854,
"num_input_tokens_seen": 163840000,
"step": 2500,
"train_runtime": 22760.6762,
"train_tokens_per_second": 7198.38
},
{
"epoch": 0.6757757286127751,
"grad_norm": 1.03125,
"learning_rate": 4.751509479702662e-05,
"loss": 4.3176,
"num_input_tokens_seen": 164495360,
"step": 2510,
"train_runtime": 22869.0727,
"train_tokens_per_second": 7192.918
},
{
"epoch": 0.6784680621929057,
"grad_norm": 0.93359375,
"learning_rate": 4.749135837317709e-05,
"loss": 4.3155,
"num_input_tokens_seen": 165150720,
"step": 2520,
"train_runtime": 22959.138,
"train_tokens_per_second": 7193.246
},
{
"epoch": 0.6811603957730363,
"grad_norm": 0.91015625,
"learning_rate": 4.746751510629073e-05,
"loss": 4.322,
"num_input_tokens_seen": 165806080,
"step": 2530,
"train_runtime": 23050.3772,
"train_tokens_per_second": 7193.205
},
{
"epoch": 0.6838527293531669,
"grad_norm": 0.95703125,
"learning_rate": 4.744356510963268e-05,
"loss": 4.2189,
"num_input_tokens_seen": 166461440,
"step": 2540,
"train_runtime": 23141.7466,
"train_tokens_per_second": 7193.123
},
{
"epoch": 0.6865450629332974,
"grad_norm": 0.98046875,
"learning_rate": 4.741950849697512e-05,
"loss": 4.31,
"num_input_tokens_seen": 167116800,
"step": 2550,
"train_runtime": 23232.5085,
"train_tokens_per_second": 7193.231
},
{
"epoch": 0.689237396513428,
"grad_norm": 0.91796875,
"learning_rate": 4.7395345382596644e-05,
"loss": 4.457,
"num_input_tokens_seen": 167772160,
"step": 2560,
"train_runtime": 23323.2423,
"train_tokens_per_second": 7193.346
},
{
"epoch": 0.6919297300935586,
"grad_norm": 0.953125,
"learning_rate": 4.7371075881281826e-05,
"loss": 4.2846,
"num_input_tokens_seen": 168427520,
"step": 2570,
"train_runtime": 23414.198,
"train_tokens_per_second": 7193.393
},
{
"epoch": 0.6946220636736892,
"grad_norm": 1.0390625,
"learning_rate": 4.7346700108320605e-05,
"loss": 4.3602,
"num_input_tokens_seen": 169082880,
"step": 2580,
"train_runtime": 23505.6086,
"train_tokens_per_second": 7193.299
},
{
"epoch": 0.6973143972538197,
"grad_norm": 0.99609375,
"learning_rate": 4.732221817950773e-05,
"loss": 4.2612,
"num_input_tokens_seen": 169738240,
"step": 2590,
"train_runtime": 23596.4702,
"train_tokens_per_second": 7193.374
},
{
"epoch": 0.7000067308339504,
"grad_norm": 0.953125,
"learning_rate": 4.729763021114227e-05,
"loss": 4.3303,
"num_input_tokens_seen": 170393600,
"step": 2600,
"train_runtime": 23687.1091,
"train_tokens_per_second": 7193.516
},
{
"epoch": 0.7026990644140809,
"grad_norm": 0.93359375,
"learning_rate": 4.727293632002699e-05,
"loss": 4.3715,
"num_input_tokens_seen": 171048960,
"step": 2610,
"train_runtime": 23778.7081,
"train_tokens_per_second": 7193.366
},
{
"epoch": 0.7053913979942115,
"grad_norm": 0.92578125,
"learning_rate": 4.7248136623467855e-05,
"loss": 4.3184,
"num_input_tokens_seen": 171704320,
"step": 2620,
"train_runtime": 23869.4479,
"train_tokens_per_second": 7193.477
},
{
"epoch": 0.708083731574342,
"grad_norm": 0.97265625,
"learning_rate": 4.722323123927344e-05,
"loss": 4.2171,
"num_input_tokens_seen": 172359680,
"step": 2630,
"train_runtime": 23960.14,
"train_tokens_per_second": 7193.601
},
{
"epoch": 0.7107760651544727,
"grad_norm": 0.90625,
"learning_rate": 4.719822028575438e-05,
"loss": 4.3047,
"num_input_tokens_seen": 173015040,
"step": 2640,
"train_runtime": 24050.8669,
"train_tokens_per_second": 7193.713
},
{
"epoch": 0.7134683987346032,
"grad_norm": 0.98828125,
"learning_rate": 4.717310388172281e-05,
"loss": 4.3117,
"num_input_tokens_seen": 173670400,
"step": 2650,
"train_runtime": 24141.4796,
"train_tokens_per_second": 7193.859
},
{
"epoch": 0.7161607323147338,
"grad_norm": 1.0078125,
"learning_rate": 4.714788214649179e-05,
"loss": 4.2947,
"num_input_tokens_seen": 174325760,
"step": 2660,
"train_runtime": 24232.2176,
"train_tokens_per_second": 7193.966
},
{
"epoch": 0.7188530658948644,
"grad_norm": 1.03125,
"learning_rate": 4.712255519987474e-05,
"loss": 4.2709,
"num_input_tokens_seen": 174981120,
"step": 2670,
"train_runtime": 24322.9895,
"train_tokens_per_second": 7194.063
},
{
"epoch": 0.721545399474995,
"grad_norm": 0.88671875,
"learning_rate": 4.70971231621849e-05,
"loss": 4.3155,
"num_input_tokens_seen": 175636480,
"step": 2680,
"train_runtime": 24413.3721,
"train_tokens_per_second": 7194.274
},
{
"epoch": 0.7242377330551255,
"grad_norm": 0.91796875,
"learning_rate": 4.707158615423471e-05,
"loss": 4.2957,
"num_input_tokens_seen": 176291840,
"step": 2690,
"train_runtime": 24504.1257,
"train_tokens_per_second": 7194.374
},
{
"epoch": 0.7269300666352561,
"grad_norm": 0.94140625,
"learning_rate": 4.70459442973353e-05,
"loss": 4.2754,
"num_input_tokens_seen": 176947200,
"step": 2700,
"train_runtime": 24594.1858,
"train_tokens_per_second": 7194.676
},
{
"epoch": 0.7296224002153867,
"grad_norm": 0.89453125,
"learning_rate": 4.702019771329581e-05,
"loss": 4.2962,
"num_input_tokens_seen": 177602560,
"step": 2710,
"train_runtime": 24685.4443,
"train_tokens_per_second": 7194.627
},
{
"epoch": 0.7323147337955173,
"grad_norm": 0.9921875,
"learning_rate": 4.699434652442293e-05,
"loss": 4.2323,
"num_input_tokens_seen": 178257920,
"step": 2720,
"train_runtime": 24775.7353,
"train_tokens_per_second": 7194.859
},
{
"epoch": 0.7350070673756478,
"grad_norm": 0.90625,
"learning_rate": 4.696839085352026e-05,
"loss": 4.2707,
"num_input_tokens_seen": 178913280,
"step": 2730,
"train_runtime": 24866.4687,
"train_tokens_per_second": 7194.961
},
{
"epoch": 0.7376994009557785,
"grad_norm": 1.0,
"learning_rate": 4.6942330823887706e-05,
"loss": 4.2595,
"num_input_tokens_seen": 179568640,
"step": 2740,
"train_runtime": 24957.2903,
"train_tokens_per_second": 7195.038
},
{
"epoch": 0.740391734535909,
"grad_norm": 1.0859375,
"learning_rate": 4.691616655932094e-05,
"loss": 4.3064,
"num_input_tokens_seen": 180224000,
"step": 2750,
"train_runtime": 25047.8997,
"train_tokens_per_second": 7195.174
},
{
"epoch": 0.7430840681160396,
"grad_norm": 1.03125,
"learning_rate": 4.6889898184110784e-05,
"loss": 4.2911,
"num_input_tokens_seen": 180879360,
"step": 2760,
"train_runtime": 25138.519,
"train_tokens_per_second": 7195.307
},
{
"epoch": 0.7457764016961702,
"grad_norm": 0.8671875,
"learning_rate": 4.686352582304263e-05,
"loss": 4.2602,
"num_input_tokens_seen": 181534720,
"step": 2770,
"train_runtime": 25229.243,
"train_tokens_per_second": 7195.409
},
{
"epoch": 0.7484687352763008,
"grad_norm": 0.859375,
"learning_rate": 4.6837049601395845e-05,
"loss": 4.2676,
"num_input_tokens_seen": 182190080,
"step": 2780,
"train_runtime": 25319.4176,
"train_tokens_per_second": 7195.666
},
{
"epoch": 0.7511610688564313,
"grad_norm": 0.89453125,
"learning_rate": 4.6810469644943175e-05,
"loss": 4.2867,
"num_input_tokens_seen": 182845440,
"step": 2790,
"train_runtime": 25410.0216,
"train_tokens_per_second": 7195.8
},
{
"epoch": 0.7538534024365618,
"grad_norm": 0.91796875,
"learning_rate": 4.6783786079950165e-05,
"loss": 4.3149,
"num_input_tokens_seen": 183500800,
"step": 2800,
"train_runtime": 25500.6347,
"train_tokens_per_second": 7195.931
},
{
"epoch": 0.7565457360166925,
"grad_norm": 0.92578125,
"learning_rate": 4.67569990331745e-05,
"loss": 4.2937,
"num_input_tokens_seen": 184156160,
"step": 2810,
"train_runtime": 25591.425,
"train_tokens_per_second": 7196.01
},
{
"epoch": 0.759238069596823,
"grad_norm": 0.89453125,
"learning_rate": 4.67301086318655e-05,
"loss": 4.2444,
"num_input_tokens_seen": 184811520,
"step": 2820,
"train_runtime": 25681.4577,
"train_tokens_per_second": 7196.302
},
{
"epoch": 0.7619304031769536,
"grad_norm": 0.8828125,
"learning_rate": 4.6703115003763406e-05,
"loss": 4.263,
"num_input_tokens_seen": 185466880,
"step": 2830,
"train_runtime": 25772.2848,
"train_tokens_per_second": 7196.369
},
{
"epoch": 0.7646227367570843,
"grad_norm": 0.9375,
"learning_rate": 4.6676018277098874e-05,
"loss": 4.2548,
"num_input_tokens_seen": 186122240,
"step": 2840,
"train_runtime": 25863.216,
"train_tokens_per_second": 7196.407
},
{
"epoch": 0.7673150703372148,
"grad_norm": 0.984375,
"learning_rate": 4.664881858059229e-05,
"loss": 4.2596,
"num_input_tokens_seen": 186777600,
"step": 2850,
"train_runtime": 25953.599,
"train_tokens_per_second": 7196.597
},
{
"epoch": 0.7700074039173453,
"grad_norm": 0.94921875,
"learning_rate": 4.662151604345321e-05,
"loss": 4.2311,
"num_input_tokens_seen": 187432960,
"step": 2860,
"train_runtime": 26043.8592,
"train_tokens_per_second": 7196.82
},
{
"epoch": 0.7726997374974759,
"grad_norm": 1.015625,
"learning_rate": 4.6594110795379695e-05,
"loss": 4.3322,
"num_input_tokens_seen": 188088320,
"step": 2870,
"train_runtime": 26134.4702,
"train_tokens_per_second": 7196.944
},
{
"epoch": 0.7753920710776065,
"grad_norm": 0.94921875,
"learning_rate": 4.656660296655775e-05,
"loss": 4.3234,
"num_input_tokens_seen": 188743680,
"step": 2880,
"train_runtime": 26225.4231,
"train_tokens_per_second": 7196.974
},
{
"epoch": 0.7780844046577371,
"grad_norm": 0.9140625,
"learning_rate": 4.653899268766069e-05,
"loss": 4.1814,
"num_input_tokens_seen": 189399040,
"step": 2890,
"train_runtime": 26316.4752,
"train_tokens_per_second": 7196.976
},
{
"epoch": 0.7807767382378676,
"grad_norm": 0.953125,
"learning_rate": 4.6511280089848466e-05,
"loss": 4.3011,
"num_input_tokens_seen": 190054400,
"step": 2900,
"train_runtime": 26407.6103,
"train_tokens_per_second": 7196.956
},
{
"epoch": 0.7834690718179983,
"grad_norm": 0.90234375,
"learning_rate": 4.6483465304767124e-05,
"loss": 4.3018,
"num_input_tokens_seen": 190709760,
"step": 2910,
"train_runtime": 26498.1593,
"train_tokens_per_second": 7197.095
},
{
"epoch": 0.7861614053981288,
"grad_norm": 0.890625,
"learning_rate": 4.6455548464548126e-05,
"loss": 4.2726,
"num_input_tokens_seen": 191365120,
"step": 2920,
"train_runtime": 26589.1325,
"train_tokens_per_second": 7197.118
},
{
"epoch": 0.7888537389782594,
"grad_norm": 0.89453125,
"learning_rate": 4.642752970180774e-05,
"loss": 4.3334,
"num_input_tokens_seen": 192020480,
"step": 2930,
"train_runtime": 26680.0364,
"train_tokens_per_second": 7197.16
},
{
"epoch": 0.7915460725583899,
"grad_norm": 0.95703125,
"learning_rate": 4.639940914964641e-05,
"loss": 4.2296,
"num_input_tokens_seen": 192675840,
"step": 2940,
"train_runtime": 26770.9407,
"train_tokens_per_second": 7197.201
},
{
"epoch": 0.7942384061385206,
"grad_norm": 0.90234375,
"learning_rate": 4.6371186941648116e-05,
"loss": 4.2387,
"num_input_tokens_seen": 193331200,
"step": 2950,
"train_runtime": 26861.686,
"train_tokens_per_second": 7197.285
},
{
"epoch": 0.7969307397186511,
"grad_norm": 0.9375,
"learning_rate": 4.634286321187973e-05,
"loss": 4.1788,
"num_input_tokens_seen": 193986560,
"step": 2960,
"train_runtime": 26952.8251,
"train_tokens_per_second": 7197.263
},
{
"epoch": 0.7996230732987817,
"grad_norm": 0.87109375,
"learning_rate": 4.631443809489043e-05,
"loss": 4.2251,
"num_input_tokens_seen": 194641920,
"step": 2970,
"train_runtime": 27043.1608,
"train_tokens_per_second": 7197.455
},
{
"epoch": 0.8023154068789123,
"grad_norm": 0.91015625,
"learning_rate": 4.628591172571098e-05,
"loss": 4.1999,
"num_input_tokens_seen": 195297280,
"step": 2980,
"train_runtime": 27134.199,
"train_tokens_per_second": 7197.459
},
{
"epoch": 0.8050077404590429,
"grad_norm": 1.0625,
"learning_rate": 4.6257284239853186e-05,
"loss": 4.3704,
"num_input_tokens_seen": 195952640,
"step": 2990,
"train_runtime": 27224.5966,
"train_tokens_per_second": 7197.632
},
{
"epoch": 0.8077000740391734,
"grad_norm": 0.9296875,
"learning_rate": 4.6228555773309155e-05,
"loss": 4.2884,
"num_input_tokens_seen": 196608000,
"step": 3000,
"train_runtime": 27315.5079,
"train_tokens_per_second": 7197.67
},
{
"epoch": 0.810392407619304,
"grad_norm": 0.8828125,
"learning_rate": 4.619972646255069e-05,
"loss": 4.2231,
"num_input_tokens_seen": 197263360,
"step": 3010,
"train_runtime": 27424.1093,
"train_tokens_per_second": 7193.064
},
{
"epoch": 0.8130847411994346,
"grad_norm": 0.91015625,
"learning_rate": 4.617079644452869e-05,
"loss": 4.2115,
"num_input_tokens_seen": 197918720,
"step": 3020,
"train_runtime": 27514.8792,
"train_tokens_per_second": 7193.152
},
{
"epoch": 0.8157770747795652,
"grad_norm": 0.8984375,
"learning_rate": 4.614176585667239e-05,
"loss": 4.2903,
"num_input_tokens_seen": 198574080,
"step": 3030,
"train_runtime": 27605.6503,
"train_tokens_per_second": 7193.24
},
{
"epoch": 0.8184694083596957,
"grad_norm": 0.96484375,
"learning_rate": 4.611263483688885e-05,
"loss": 4.2218,
"num_input_tokens_seen": 199229440,
"step": 3040,
"train_runtime": 27697.4045,
"train_tokens_per_second": 7193.073
},
{
"epoch": 0.8211617419398264,
"grad_norm": 1.0078125,
"learning_rate": 4.608340352356215e-05,
"loss": 4.2355,
"num_input_tokens_seen": 199884800,
"step": 3050,
"train_runtime": 27787.9614,
"train_tokens_per_second": 7193.216
},
{
"epoch": 0.8238540755199569,
"grad_norm": 0.9453125,
"learning_rate": 4.605407205555285e-05,
"loss": 4.1975,
"num_input_tokens_seen": 200540160,
"step": 3060,
"train_runtime": 27878.9629,
"train_tokens_per_second": 7193.243
},
{
"epoch": 0.8265464091000875,
"grad_norm": 0.99609375,
"learning_rate": 4.602464057219727e-05,
"loss": 4.2887,
"num_input_tokens_seen": 201195520,
"step": 3070,
"train_runtime": 27970.4176,
"train_tokens_per_second": 7193.154
},
{
"epoch": 0.8292387426802181,
"grad_norm": 0.91796875,
"learning_rate": 4.599510921330683e-05,
"loss": 4.205,
"num_input_tokens_seen": 201850880,
"step": 3080,
"train_runtime": 28061.1136,
"train_tokens_per_second": 7193.26
},
{
"epoch": 0.8319310762603487,
"grad_norm": 0.92578125,
"learning_rate": 4.5965478119167424e-05,
"loss": 4.246,
"num_input_tokens_seen": 202506240,
"step": 3090,
"train_runtime": 28151.5595,
"train_tokens_per_second": 7193.429
},
{
"epoch": 0.8346234098404792,
"grad_norm": 0.94140625,
"learning_rate": 4.5935747430538726e-05,
"loss": 4.2395,
"num_input_tokens_seen": 203161600,
"step": 3100,
"train_runtime": 28242.6937,
"train_tokens_per_second": 7193.422
},
{
"epoch": 0.8373157434206098,
"grad_norm": 0.97265625,
"learning_rate": 4.59059172886535e-05,
"loss": 4.1347,
"num_input_tokens_seen": 203816960,
"step": 3110,
"train_runtime": 28334.0092,
"train_tokens_per_second": 7193.368
},
{
"epoch": 0.8400080770007404,
"grad_norm": 0.953125,
"learning_rate": 4.587598783521697e-05,
"loss": 4.2245,
"num_input_tokens_seen": 204472320,
"step": 3120,
"train_runtime": 28425.3655,
"train_tokens_per_second": 7193.305
},
{
"epoch": 0.842700410580871,
"grad_norm": 0.94921875,
"learning_rate": 4.584595921240614e-05,
"loss": 4.1877,
"num_input_tokens_seen": 205127680,
"step": 3130,
"train_runtime": 28516.1829,
"train_tokens_per_second": 7193.378
},
{
"epoch": 0.8453927441610015,
"grad_norm": 0.875,
"learning_rate": 4.581583156286908e-05,
"loss": 4.2275,
"num_input_tokens_seen": 205783040,
"step": 3140,
"train_runtime": 28607.2046,
"train_tokens_per_second": 7193.399
},
{
"epoch": 0.8480850777411322,
"grad_norm": 0.93359375,
"learning_rate": 4.5785605029724315e-05,
"loss": 4.1242,
"num_input_tokens_seen": 206438400,
"step": 3150,
"train_runtime": 28698.0895,
"train_tokens_per_second": 7193.454
},
{
"epoch": 0.8507774113212627,
"grad_norm": 1.0390625,
"learning_rate": 4.575527975656007e-05,
"loss": 4.2295,
"num_input_tokens_seen": 207093760,
"step": 3160,
"train_runtime": 28789.2485,
"train_tokens_per_second": 7193.441
},
{
"epoch": 0.8534697449013933,
"grad_norm": 0.9921875,
"learning_rate": 4.572485588743365e-05,
"loss": 4.1666,
"num_input_tokens_seen": 207749120,
"step": 3170,
"train_runtime": 28880.2276,
"train_tokens_per_second": 7193.472
},
{
"epoch": 0.8561620784815238,
"grad_norm": 1.03125,
"learning_rate": 4.569433356687072e-05,
"loss": 4.224,
"num_input_tokens_seen": 208404480,
"step": 3180,
"train_runtime": 28970.6224,
"train_tokens_per_second": 7193.649
},
{
"epoch": 0.8588544120616545,
"grad_norm": 0.98828125,
"learning_rate": 4.566371293986463e-05,
"loss": 4.1672,
"num_input_tokens_seen": 209059840,
"step": 3190,
"train_runtime": 29061.8644,
"train_tokens_per_second": 7193.614
},
{
"epoch": 0.861546745641785,
"grad_norm": 0.91015625,
"learning_rate": 4.563299415187572e-05,
"loss": 4.1537,
"num_input_tokens_seen": 209715200,
"step": 3200,
"train_runtime": 29152.5619,
"train_tokens_per_second": 7193.714
},
{
"epoch": 0.8642390792219156,
"grad_norm": 0.91015625,
"learning_rate": 4.560217734883066e-05,
"loss": 4.2219,
"num_input_tokens_seen": 210370560,
"step": 3210,
"train_runtime": 29243.5634,
"train_tokens_per_second": 7193.739
},
{
"epoch": 0.8669314128020462,
"grad_norm": 0.90234375,
"learning_rate": 4.557126267712169e-05,
"loss": 4.1849,
"num_input_tokens_seen": 211025920,
"step": 3220,
"train_runtime": 29333.9838,
"train_tokens_per_second": 7193.906
},
{
"epoch": 0.8696237463821768,
"grad_norm": 0.953125,
"learning_rate": 4.5540250283606e-05,
"loss": 4.2267,
"num_input_tokens_seen": 211681280,
"step": 3230,
"train_runtime": 29425.3427,
"train_tokens_per_second": 7193.842
},
{
"epoch": 0.8723160799623073,
"grad_norm": 0.9765625,
"learning_rate": 4.550914031560498e-05,
"loss": 4.2194,
"num_input_tokens_seen": 212336640,
"step": 3240,
"train_runtime": 29516.178,
"train_tokens_per_second": 7193.907
},
{
"epoch": 0.8750084135424379,
"grad_norm": 1.0625,
"learning_rate": 4.5477932920903546e-05,
"loss": 4.2358,
"num_input_tokens_seen": 212992000,
"step": 3250,
"train_runtime": 29606.7227,
"train_tokens_per_second": 7194.042
},
{
"epoch": 0.8777007471225685,
"grad_norm": 1.09375,
"learning_rate": 4.544662824774943e-05,
"loss": 4.2741,
"num_input_tokens_seen": 213647360,
"step": 3260,
"train_runtime": 29697.6281,
"train_tokens_per_second": 7194.088
},
{
"epoch": 0.8803930807026991,
"grad_norm": 1.1015625,
"learning_rate": 4.5415226444852464e-05,
"loss": 4.2825,
"num_input_tokens_seen": 214302720,
"step": 3270,
"train_runtime": 29788.4901,
"train_tokens_per_second": 7194.145
},
{
"epoch": 0.8830854142828296,
"grad_norm": 0.94921875,
"learning_rate": 4.538372766138391e-05,
"loss": 4.1546,
"num_input_tokens_seen": 214958080,
"step": 3280,
"train_runtime": 29879.3258,
"train_tokens_per_second": 7194.208
},
{
"epoch": 0.8857777478629603,
"grad_norm": 1.0390625,
"learning_rate": 4.535213204697571e-05,
"loss": 4.1899,
"num_input_tokens_seen": 215613440,
"step": 3290,
"train_runtime": 29970.0037,
"train_tokens_per_second": 7194.308
},
{
"epoch": 0.8884700814430908,
"grad_norm": 1.0703125,
"learning_rate": 4.5320439751719786e-05,
"loss": 4.2076,
"num_input_tokens_seen": 216268800,
"step": 3300,
"train_runtime": 30060.9314,
"train_tokens_per_second": 7194.348
},
{
"epoch": 0.8911624150232214,
"grad_norm": 0.99609375,
"learning_rate": 4.528865092616734e-05,
"loss": 4.1606,
"num_input_tokens_seen": 216924160,
"step": 3310,
"train_runtime": 30151.2458,
"train_tokens_per_second": 7194.534
},
{
"epoch": 0.8938547486033519,
"grad_norm": 0.9296875,
"learning_rate": 4.525676572132814e-05,
"loss": 4.1753,
"num_input_tokens_seen": 217579520,
"step": 3320,
"train_runtime": 30241.7705,
"train_tokens_per_second": 7194.669
},
{
"epoch": 0.8965470821834826,
"grad_norm": 0.9921875,
"learning_rate": 4.522478428866979e-05,
"loss": 4.2919,
"num_input_tokens_seen": 218234880,
"step": 3330,
"train_runtime": 30332.5616,
"train_tokens_per_second": 7194.74
},
{
"epoch": 0.8992394157636131,
"grad_norm": 0.9609375,
"learning_rate": 4.519270678011701e-05,
"loss": 4.1882,
"num_input_tokens_seen": 218890240,
"step": 3340,
"train_runtime": 30423.4601,
"train_tokens_per_second": 7194.785
},
{
"epoch": 0.9019317493437436,
"grad_norm": 1.0234375,
"learning_rate": 4.516053334805091e-05,
"loss": 4.2376,
"num_input_tokens_seen": 219545600,
"step": 3350,
"train_runtime": 30513.7311,
"train_tokens_per_second": 7194.977
},
{
"epoch": 0.9046240829238743,
"grad_norm": 0.88671875,
"learning_rate": 4.512826414530831e-05,
"loss": 4.1968,
"num_input_tokens_seen": 220200960,
"step": 3360,
"train_runtime": 30604.2642,
"train_tokens_per_second": 7195.107
},
{
"epoch": 0.9073164165040049,
"grad_norm": 0.8984375,
"learning_rate": 4.509589932518094e-05,
"loss": 4.1383,
"num_input_tokens_seen": 220856320,
"step": 3370,
"train_runtime": 30694.8063,
"train_tokens_per_second": 7195.234
},
{
"epoch": 0.9100087500841354,
"grad_norm": 0.98828125,
"learning_rate": 4.506343904141478e-05,
"loss": 4.2102,
"num_input_tokens_seen": 221511680,
"step": 3380,
"train_runtime": 30785.3617,
"train_tokens_per_second": 7195.357
},
{
"epoch": 0.912701083664266,
"grad_norm": 0.984375,
"learning_rate": 4.5030883448209276e-05,
"loss": 4.2352,
"num_input_tokens_seen": 222167040,
"step": 3390,
"train_runtime": 30875.9189,
"train_tokens_per_second": 7195.479
},
{
"epoch": 0.9153934172443966,
"grad_norm": 0.9296875,
"learning_rate": 4.499823270021666e-05,
"loss": 4.1812,
"num_input_tokens_seen": 222822400,
"step": 3400,
"train_runtime": 30966.6114,
"train_tokens_per_second": 7195.569
},
{
"epoch": 0.9180857508245271,
"grad_norm": 0.921875,
"learning_rate": 4.496548695254116e-05,
"loss": 4.1502,
"num_input_tokens_seen": 223477760,
"step": 3410,
"train_runtime": 31056.7817,
"train_tokens_per_second": 7195.78
},
{
"epoch": 0.9207780844046577,
"grad_norm": 1.0234375,
"learning_rate": 4.4932646360738305e-05,
"loss": 4.2178,
"num_input_tokens_seen": 224133120,
"step": 3420,
"train_runtime": 31147.1211,
"train_tokens_per_second": 7195.95
},
{
"epoch": 0.9234704179847883,
"grad_norm": 0.91015625,
"learning_rate": 4.489971108081418e-05,
"loss": 4.1711,
"num_input_tokens_seen": 224788480,
"step": 3430,
"train_runtime": 31237.7219,
"train_tokens_per_second": 7196.059
},
{
"epoch": 0.9261627515649189,
"grad_norm": 0.91015625,
"learning_rate": 4.486668126922466e-05,
"loss": 4.1814,
"num_input_tokens_seen": 225443840,
"step": 3440,
"train_runtime": 31328.2106,
"train_tokens_per_second": 7196.193
},
{
"epoch": 0.9288550851450494,
"grad_norm": 1.0,
"learning_rate": 4.48335570828747e-05,
"loss": 4.175,
"num_input_tokens_seen": 226099200,
"step": 3450,
"train_runtime": 31419.1368,
"train_tokens_per_second": 7196.226
},
{
"epoch": 0.9315474187251801,
"grad_norm": 0.9765625,
"learning_rate": 4.480033867911755e-05,
"loss": 4.2191,
"num_input_tokens_seen": 226754560,
"step": 3460,
"train_runtime": 31509.2505,
"train_tokens_per_second": 7196.444
},
{
"epoch": 0.9342397523053106,
"grad_norm": 0.953125,
"learning_rate": 4.476702621575406e-05,
"loss": 4.2263,
"num_input_tokens_seen": 227409920,
"step": 3470,
"train_runtime": 31600.9475,
"train_tokens_per_second": 7196.301
},
{
"epoch": 0.9369320858854412,
"grad_norm": 1.0625,
"learning_rate": 4.4733619851031885e-05,
"loss": 4.2899,
"num_input_tokens_seen": 228065280,
"step": 3480,
"train_runtime": 31691.2678,
"train_tokens_per_second": 7196.471
},
{
"epoch": 0.9396244194655717,
"grad_norm": 0.94140625,
"learning_rate": 4.470011974364474e-05,
"loss": 4.0834,
"num_input_tokens_seen": 228720640,
"step": 3490,
"train_runtime": 31782.787,
"train_tokens_per_second": 7196.368
},
{
"epoch": 0.9423167530457024,
"grad_norm": 0.9453125,
"learning_rate": 4.466652605273166e-05,
"loss": 4.2763,
"num_input_tokens_seen": 229376000,
"step": 3500,
"train_runtime": 31873.3236,
"train_tokens_per_second": 7196.488
},
{
"epoch": 0.9450090866258329,
"grad_norm": 0.90625,
"learning_rate": 4.463283893787628e-05,
"loss": 4.1294,
"num_input_tokens_seen": 230031360,
"step": 3510,
"train_runtime": 31997.0965,
"train_tokens_per_second": 7189.132
},
{
"epoch": 0.9477014202059635,
"grad_norm": 1.0078125,
"learning_rate": 4.459905855910597e-05,
"loss": 4.1761,
"num_input_tokens_seen": 230686720,
"step": 3520,
"train_runtime": 32087.0281,
"train_tokens_per_second": 7189.407
},
{
"epoch": 0.9503937537860941,
"grad_norm": 0.94140625,
"learning_rate": 4.4565185076891175e-05,
"loss": 4.1342,
"num_input_tokens_seen": 231342080,
"step": 3530,
"train_runtime": 32177.1945,
"train_tokens_per_second": 7189.629
},
{
"epoch": 0.9530860873662247,
"grad_norm": 0.9296875,
"learning_rate": 4.453121865214463e-05,
"loss": 4.1464,
"num_input_tokens_seen": 231997440,
"step": 3540,
"train_runtime": 32267.9925,
"train_tokens_per_second": 7189.708
},
{
"epoch": 0.9557784209463552,
"grad_norm": 0.921875,
"learning_rate": 4.449715944622057e-05,
"loss": 4.2182,
"num_input_tokens_seen": 232652800,
"step": 3550,
"train_runtime": 32358.5662,
"train_tokens_per_second": 7189.836
},
{
"epoch": 0.9584707545264858,
"grad_norm": 0.953125,
"learning_rate": 4.4463007620913975e-05,
"loss": 4.1777,
"num_input_tokens_seen": 233308160,
"step": 3560,
"train_runtime": 32449.4776,
"train_tokens_per_second": 7189.89
},
{
"epoch": 0.9611630881066164,
"grad_norm": 0.94140625,
"learning_rate": 4.442876333845982e-05,
"loss": 4.094,
"num_input_tokens_seen": 233963520,
"step": 3570,
"train_runtime": 32540.2049,
"train_tokens_per_second": 7189.983
},
{
"epoch": 0.963855421686747,
"grad_norm": 0.90625,
"learning_rate": 4.439442676153227e-05,
"loss": 4.1403,
"num_input_tokens_seen": 234618880,
"step": 3580,
"train_runtime": 32631.0037,
"train_tokens_per_second": 7190.06
},
{
"epoch": 0.9665477552668775,
"grad_norm": 0.953125,
"learning_rate": 4.4359998053243925e-05,
"loss": 4.1258,
"num_input_tokens_seen": 235274240,
"step": 3590,
"train_runtime": 32721.4244,
"train_tokens_per_second": 7190.22
},
{
"epoch": 0.9692400888470082,
"grad_norm": 0.984375,
"learning_rate": 4.432547737714508e-05,
"loss": 4.1197,
"num_input_tokens_seen": 235929600,
"step": 3600,
"train_runtime": 32812.1821,
"train_tokens_per_second": 7190.305
},
{
"epoch": 0.9719324224271387,
"grad_norm": 1.0078125,
"learning_rate": 4.429086489722287e-05,
"loss": 4.1698,
"num_input_tokens_seen": 236584960,
"step": 3610,
"train_runtime": 32903.0276,
"train_tokens_per_second": 7190.371
},
{
"epoch": 0.9746247560072693,
"grad_norm": 1.0625,
"learning_rate": 4.425616077790056e-05,
"loss": 4.2282,
"num_input_tokens_seen": 237240320,
"step": 3620,
"train_runtime": 32993.439,
"train_tokens_per_second": 7190.53
},
{
"epoch": 0.9773170895873999,
"grad_norm": 0.93359375,
"learning_rate": 4.422136518403673e-05,
"loss": 4.3195,
"num_input_tokens_seen": 237895680,
"step": 3630,
"train_runtime": 33084.7741,
"train_tokens_per_second": 7190.488
},
{
"epoch": 0.9800094231675305,
"grad_norm": 1.0,
"learning_rate": 4.4186478280924516e-05,
"loss": 4.1702,
"num_input_tokens_seen": 238551040,
"step": 3640,
"train_runtime": 33175.3782,
"train_tokens_per_second": 7190.605
},
{
"epoch": 0.982701756747661,
"grad_norm": 0.94140625,
"learning_rate": 4.4151500234290796e-05,
"loss": 4.1488,
"num_input_tokens_seen": 239206400,
"step": 3650,
"train_runtime": 33266.5303,
"train_tokens_per_second": 7190.603
},
{
"epoch": 0.9853940903277916,
"grad_norm": 1.0,
"learning_rate": 4.411643121029541e-05,
"loss": 4.2429,
"num_input_tokens_seen": 239861760,
"step": 3660,
"train_runtime": 33357.1993,
"train_tokens_per_second": 7190.704
},
{
"epoch": 0.9880864239079222,
"grad_norm": 1.046875,
"learning_rate": 4.40812713755304e-05,
"loss": 4.2334,
"num_input_tokens_seen": 240517120,
"step": 3670,
"train_runtime": 33447.6459,
"train_tokens_per_second": 7190.853
},
{
"epoch": 0.9907787574880528,
"grad_norm": 0.9921875,
"learning_rate": 4.4046020897019166e-05,
"loss": 4.1692,
"num_input_tokens_seen": 241172480,
"step": 3680,
"train_runtime": 33538.6244,
"train_tokens_per_second": 7190.888
},
{
"epoch": 0.9934710910681833,
"grad_norm": 1.0859375,
"learning_rate": 4.4010679942215745e-05,
"loss": 4.1434,
"num_input_tokens_seen": 241827840,
"step": 3690,
"train_runtime": 33629.0822,
"train_tokens_per_second": 7191.033
},
{
"epoch": 0.996163424648314,
"grad_norm": 0.9765625,
"learning_rate": 4.397524867900392e-05,
"loss": 4.2236,
"num_input_tokens_seen": 242483200,
"step": 3700,
"train_runtime": 33719.8306,
"train_tokens_per_second": 7191.116
},
{
"epoch": 0.9988557582284445,
"grad_norm": 0.9296875,
"learning_rate": 4.393972727569652e-05,
"loss": 4.1554,
"num_input_tokens_seen": 243138560,
"step": 3710,
"train_runtime": 33810.897,
"train_tokens_per_second": 7191.13
},
{
"epoch": 1.0013461667900654,
"grad_norm": 0.9453125,
"learning_rate": 4.390411590103455e-05,
"loss": 4.1643,
"num_input_tokens_seen": 243744768,
"step": 3720,
"train_runtime": 33895.1256,
"train_tokens_per_second": 7191.145
},
{
"epoch": 1.004038500370196,
"grad_norm": 0.9765625,
"learning_rate": 4.3868414724186424e-05,
"loss": 4.1963,
"num_input_tokens_seen": 244400128,
"step": 3730,
"train_runtime": 33986.4957,
"train_tokens_per_second": 7191.095
},
{
"epoch": 1.0067308339503265,
"grad_norm": 1.0390625,
"learning_rate": 4.3832623914747154e-05,
"loss": 4.1447,
"num_input_tokens_seen": 245055488,
"step": 3740,
"train_runtime": 34077.1663,
"train_tokens_per_second": 7191.193
},
{
"epoch": 1.009423167530457,
"grad_norm": 1.03125,
"learning_rate": 4.379674364273755e-05,
"loss": 4.1409,
"num_input_tokens_seen": 245710848,
"step": 3750,
"train_runtime": 34167.718,
"train_tokens_per_second": 7191.316
},
{
"epoch": 1.0121155011105876,
"grad_norm": 0.9140625,
"learning_rate": 4.3760774078603375e-05,
"loss": 4.1164,
"num_input_tokens_seen": 246366208,
"step": 3760,
"train_runtime": 34258.1953,
"train_tokens_per_second": 7191.453
},
{
"epoch": 1.0148078346907181,
"grad_norm": 0.984375,
"learning_rate": 4.372471539321461e-05,
"loss": 4.0986,
"num_input_tokens_seen": 247021568,
"step": 3770,
"train_runtime": 34349.3094,
"train_tokens_per_second": 7191.457
},
{
"epoch": 1.0175001682708487,
"grad_norm": 0.9453125,
"learning_rate": 4.368856775786456e-05,
"loss": 4.1294,
"num_input_tokens_seen": 247676928,
"step": 3780,
"train_runtime": 34439.4855,
"train_tokens_per_second": 7191.656
},
{
"epoch": 1.0201925018509794,
"grad_norm": 0.984375,
"learning_rate": 4.36523313442691e-05,
"loss": 4.0792,
"num_input_tokens_seen": 248332288,
"step": 3790,
"train_runtime": 34530.928,
"train_tokens_per_second": 7191.59
},
{
"epoch": 1.02288483543111,
"grad_norm": 1.0234375,
"learning_rate": 4.361600632456583e-05,
"loss": 4.1988,
"num_input_tokens_seen": 248987648,
"step": 3800,
"train_runtime": 34621.3595,
"train_tokens_per_second": 7191.735
},
{
"epoch": 1.0255771690112405,
"grad_norm": 0.93359375,
"learning_rate": 4.3579592871313265e-05,
"loss": 4.1271,
"num_input_tokens_seen": 249643008,
"step": 3810,
"train_runtime": 34712.6294,
"train_tokens_per_second": 7191.706
},
{
"epoch": 1.028269502591371,
"grad_norm": 1.0078125,
"learning_rate": 4.354309115749001e-05,
"loss": 4.1366,
"num_input_tokens_seen": 250298368,
"step": 3820,
"train_runtime": 34803.1708,
"train_tokens_per_second": 7191.827
},
{
"epoch": 1.0309618361715016,
"grad_norm": 0.97265625,
"learning_rate": 4.3506501356493965e-05,
"loss": 4.083,
"num_input_tokens_seen": 250953728,
"step": 3830,
"train_runtime": 34894.1642,
"train_tokens_per_second": 7191.854
},
{
"epoch": 1.0336541697516322,
"grad_norm": 0.921875,
"learning_rate": 4.346982364214145e-05,
"loss": 4.1208,
"num_input_tokens_seen": 251609088,
"step": 3840,
"train_runtime": 34985.3424,
"train_tokens_per_second": 7191.843
},
{
"epoch": 1.0363465033317627,
"grad_norm": 0.91796875,
"learning_rate": 4.343305818866643e-05,
"loss": 4.1373,
"num_input_tokens_seen": 252264448,
"step": 3850,
"train_runtime": 35076.5599,
"train_tokens_per_second": 7191.824
},
{
"epoch": 1.0390388369118935,
"grad_norm": 0.9765625,
"learning_rate": 4.339620517071965e-05,
"loss": 4.0883,
"num_input_tokens_seen": 252919808,
"step": 3860,
"train_runtime": 35167.2199,
"train_tokens_per_second": 7191.919
},
{
"epoch": 1.041731170492024,
"grad_norm": 0.9375,
"learning_rate": 4.335926476336785e-05,
"loss": 4.1411,
"num_input_tokens_seen": 253575168,
"step": 3870,
"train_runtime": 35257.9069,
"train_tokens_per_second": 7192.009
},
{
"epoch": 1.0444235040721546,
"grad_norm": 1.1796875,
"learning_rate": 4.332223714209286e-05,
"loss": 4.0782,
"num_input_tokens_seen": 254230528,
"step": 3880,
"train_runtime": 35348.4394,
"train_tokens_per_second": 7192.129
},
{
"epoch": 1.047115837652285,
"grad_norm": 1.046875,
"learning_rate": 4.328512248279085e-05,
"loss": 4.1105,
"num_input_tokens_seen": 254885888,
"step": 3890,
"train_runtime": 35439.1569,
"train_tokens_per_second": 7192.211
},
{
"epoch": 1.0498081712324157,
"grad_norm": 1.0390625,
"learning_rate": 4.3247920961771445e-05,
"loss": 4.1275,
"num_input_tokens_seen": 255541248,
"step": 3900,
"train_runtime": 35529.4127,
"train_tokens_per_second": 7192.386
},
{
"epoch": 1.0525005048125462,
"grad_norm": 1.0078125,
"learning_rate": 4.3210632755756884e-05,
"loss": 4.0792,
"num_input_tokens_seen": 256196608,
"step": 3910,
"train_runtime": 35620.2122,
"train_tokens_per_second": 7192.45
},
{
"epoch": 1.0551928383926767,
"grad_norm": 0.94140625,
"learning_rate": 4.3173258041881226e-05,
"loss": 4.1166,
"num_input_tokens_seen": 256851968,
"step": 3920,
"train_runtime": 35710.6281,
"train_tokens_per_second": 7192.592
},
{
"epoch": 1.0578851719728075,
"grad_norm": 0.984375,
"learning_rate": 4.313579699768945e-05,
"loss": 4.0884,
"num_input_tokens_seen": 257507328,
"step": 3930,
"train_runtime": 35801.49,
"train_tokens_per_second": 7192.643
},
{
"epoch": 1.060577505552938,
"grad_norm": 0.953125,
"learning_rate": 4.309824980113664e-05,
"loss": 4.0969,
"num_input_tokens_seen": 258162688,
"step": 3940,
"train_runtime": 35891.9587,
"train_tokens_per_second": 7192.772
},
{
"epoch": 1.0632698391330686,
"grad_norm": 1.0078125,
"learning_rate": 4.306061663058715e-05,
"loss": 4.1934,
"num_input_tokens_seen": 258818048,
"step": 3950,
"train_runtime": 35982.4641,
"train_tokens_per_second": 7192.894
},
{
"epoch": 1.0659621727131992,
"grad_norm": 0.921875,
"learning_rate": 4.302289766481374e-05,
"loss": 4.0758,
"num_input_tokens_seen": 259473408,
"step": 3960,
"train_runtime": 36073.417,
"train_tokens_per_second": 7192.926
},
{
"epoch": 1.0686545062933297,
"grad_norm": 0.92578125,
"learning_rate": 4.2985093082996744e-05,
"loss": 4.1234,
"num_input_tokens_seen": 260128768,
"step": 3970,
"train_runtime": 36164.2317,
"train_tokens_per_second": 7192.985
},
{
"epoch": 1.0713468398734602,
"grad_norm": 0.93359375,
"learning_rate": 4.294720306472317e-05,
"loss": 4.1192,
"num_input_tokens_seen": 260784128,
"step": 3980,
"train_runtime": 36255.0669,
"train_tokens_per_second": 7193.039
},
{
"epoch": 1.074039173453591,
"grad_norm": 1.0234375,
"learning_rate": 4.2909227789985935e-05,
"loss": 4.0864,
"num_input_tokens_seen": 261439488,
"step": 3990,
"train_runtime": 36345.6406,
"train_tokens_per_second": 7193.146
},
{
"epoch": 1.0767315070337216,
"grad_norm": 1.0625,
"learning_rate": 4.287116743918292e-05,
"loss": 4.1621,
"num_input_tokens_seen": 262094848,
"step": 4000,
"train_runtime": 36436.248,
"train_tokens_per_second": 7193.245
},
{
"epoch": 1.079423840613852,
"grad_norm": 0.95703125,
"learning_rate": 4.283302219311616e-05,
"loss": 4.1096,
"num_input_tokens_seen": 262750208,
"step": 4010,
"train_runtime": 36545.2724,
"train_tokens_per_second": 7189.718
},
{
"epoch": 1.0821161741939826,
"grad_norm": 0.984375,
"learning_rate": 4.279479223299099e-05,
"loss": 4.1474,
"num_input_tokens_seen": 263405568,
"step": 4020,
"train_runtime": 36636.1801,
"train_tokens_per_second": 7189.766
},
{
"epoch": 1.0848085077741132,
"grad_norm": 1.015625,
"learning_rate": 4.275647774041517e-05,
"loss": 4.1458,
"num_input_tokens_seen": 264060928,
"step": 4030,
"train_runtime": 36726.9109,
"train_tokens_per_second": 7189.849
},
{
"epoch": 1.0875008413542437,
"grad_norm": 0.953125,
"learning_rate": 4.2718078897397994e-05,
"loss": 4.0886,
"num_input_tokens_seen": 264716288,
"step": 4040,
"train_runtime": 36816.916,
"train_tokens_per_second": 7190.072
},
{
"epoch": 1.0901931749343743,
"grad_norm": 0.94921875,
"learning_rate": 4.267959588634949e-05,
"loss": 4.1204,
"num_input_tokens_seen": 265371648,
"step": 4050,
"train_runtime": 36907.5704,
"train_tokens_per_second": 7190.168
},
{
"epoch": 1.092885508514505,
"grad_norm": 0.9296875,
"learning_rate": 4.26410288900795e-05,
"loss": 4.0423,
"num_input_tokens_seen": 266027008,
"step": 4060,
"train_runtime": 36997.9691,
"train_tokens_per_second": 7190.314
},
{
"epoch": 1.0955778420946356,
"grad_norm": 0.94140625,
"learning_rate": 4.2602378091796834e-05,
"loss": 4.0993,
"num_input_tokens_seen": 266682368,
"step": 4070,
"train_runtime": 37088.8087,
"train_tokens_per_second": 7190.373
},
{
"epoch": 1.0982701756747661,
"grad_norm": 1.0078125,
"learning_rate": 4.25636436751084e-05,
"loss": 4.0873,
"num_input_tokens_seen": 267337728,
"step": 4080,
"train_runtime": 37179.7577,
"train_tokens_per_second": 7190.411
},
{
"epoch": 1.1009625092548967,
"grad_norm": 1.015625,
"learning_rate": 4.252482582401832e-05,
"loss": 4.0804,
"num_input_tokens_seen": 267993088,
"step": 4090,
"train_runtime": 37270.755,
"train_tokens_per_second": 7190.439
},
{
"epoch": 1.1036548428350272,
"grad_norm": 0.98046875,
"learning_rate": 4.248592472292707e-05,
"loss": 4.1809,
"num_input_tokens_seen": 268648448,
"step": 4100,
"train_runtime": 37361.4309,
"train_tokens_per_second": 7190.529
},
{
"epoch": 1.1063471764151578,
"grad_norm": 0.9375,
"learning_rate": 4.244694055663058e-05,
"loss": 4.1186,
"num_input_tokens_seen": 269303808,
"step": 4110,
"train_runtime": 37452.1799,
"train_tokens_per_second": 7190.604
},
{
"epoch": 1.1090395099952883,
"grad_norm": 0.953125,
"learning_rate": 4.24078735103194e-05,
"loss": 4.087,
"num_input_tokens_seen": 269959168,
"step": 4120,
"train_runtime": 37542.0572,
"train_tokens_per_second": 7190.846
},
{
"epoch": 1.111731843575419,
"grad_norm": 0.94140625,
"learning_rate": 4.236872376957777e-05,
"loss": 4.1264,
"num_input_tokens_seen": 270614528,
"step": 4130,
"train_runtime": 37633.1209,
"train_tokens_per_second": 7190.861
},
{
"epoch": 1.1144241771555496,
"grad_norm": 0.96484375,
"learning_rate": 4.232949152038277e-05,
"loss": 4.1552,
"num_input_tokens_seen": 271269888,
"step": 4140,
"train_runtime": 37723.8949,
"train_tokens_per_second": 7190.93
},
{
"epoch": 1.1171165107356802,
"grad_norm": 0.95703125,
"learning_rate": 4.2290176949103444e-05,
"loss": 4.1153,
"num_input_tokens_seen": 271925248,
"step": 4150,
"train_runtime": 37814.3548,
"train_tokens_per_second": 7191.059
},
{
"epoch": 1.1198088443158107,
"grad_norm": 0.96484375,
"learning_rate": 4.225078024249988e-05,
"loss": 3.9972,
"num_input_tokens_seen": 272580608,
"step": 4160,
"train_runtime": 37905.2274,
"train_tokens_per_second": 7191.109
},
{
"epoch": 1.1225011778959413,
"grad_norm": 0.96875,
"learning_rate": 4.221130158772234e-05,
"loss": 4.1237,
"num_input_tokens_seen": 273235968,
"step": 4170,
"train_runtime": 37995.663,
"train_tokens_per_second": 7191.241
},
{
"epoch": 1.1251935114760718,
"grad_norm": 0.98828125,
"learning_rate": 4.217174117231038e-05,
"loss": 4.0567,
"num_input_tokens_seen": 273891328,
"step": 4180,
"train_runtime": 38086.6329,
"train_tokens_per_second": 7191.272
},
{
"epoch": 1.1278858450562024,
"grad_norm": 1.203125,
"learning_rate": 4.2132099184191956e-05,
"loss": 4.1517,
"num_input_tokens_seen": 274546688,
"step": 4190,
"train_runtime": 38176.9747,
"train_tokens_per_second": 7191.421
},
{
"epoch": 1.1305781786363331,
"grad_norm": 0.98046875,
"learning_rate": 4.209237581168253e-05,
"loss": 4.1753,
"num_input_tokens_seen": 275202048,
"step": 4200,
"train_runtime": 38267.894,
"train_tokens_per_second": 7191.46
},
{
"epoch": 1.1332705122164637,
"grad_norm": 0.9609375,
"learning_rate": 4.205257124348416e-05,
"loss": 4.1628,
"num_input_tokens_seen": 275857408,
"step": 4210,
"train_runtime": 38358.472,
"train_tokens_per_second": 7191.564
},
{
"epoch": 1.1359628457965942,
"grad_norm": 1.0078125,
"learning_rate": 4.201268566868462e-05,
"loss": 4.0838,
"num_input_tokens_seen": 276512768,
"step": 4220,
"train_runtime": 38449.6346,
"train_tokens_per_second": 7191.558
},
{
"epoch": 1.1386551793767248,
"grad_norm": 0.94921875,
"learning_rate": 4.197271927675651e-05,
"loss": 4.0511,
"num_input_tokens_seen": 277168128,
"step": 4230,
"train_runtime": 38540.098,
"train_tokens_per_second": 7191.682
},
{
"epoch": 1.1413475129568553,
"grad_norm": 1.0,
"learning_rate": 4.1932672257556315e-05,
"loss": 4.0671,
"num_input_tokens_seen": 277823488,
"step": 4240,
"train_runtime": 38630.478,
"train_tokens_per_second": 7191.821
},
{
"epoch": 1.1440398465369859,
"grad_norm": 0.95703125,
"learning_rate": 4.189254480132357e-05,
"loss": 4.0356,
"num_input_tokens_seen": 278478848,
"step": 4250,
"train_runtime": 38722.0947,
"train_tokens_per_second": 7191.73
},
{
"epoch": 1.1467321801171164,
"grad_norm": 0.9609375,
"learning_rate": 4.1852337098679894e-05,
"loss": 4.1653,
"num_input_tokens_seen": 279134208,
"step": 4260,
"train_runtime": 38812.5776,
"train_tokens_per_second": 7191.849
},
{
"epoch": 1.1494245136972472,
"grad_norm": 0.9921875,
"learning_rate": 4.1812049340628126e-05,
"loss": 4.1336,
"num_input_tokens_seen": 279789568,
"step": 4270,
"train_runtime": 38903.0508,
"train_tokens_per_second": 7191.97
},
{
"epoch": 1.1521168472773777,
"grad_norm": 0.921875,
"learning_rate": 4.177168171855137e-05,
"loss": 4.0297,
"num_input_tokens_seen": 280444928,
"step": 4280,
"train_runtime": 38993.719,
"train_tokens_per_second": 7192.054
},
{
"epoch": 1.1548091808575083,
"grad_norm": 0.98046875,
"learning_rate": 4.173123442421214e-05,
"loss": 4.1431,
"num_input_tokens_seen": 281100288,
"step": 4290,
"train_runtime": 39084.2184,
"train_tokens_per_second": 7192.169
},
{
"epoch": 1.1575015144376388,
"grad_norm": 0.96484375,
"learning_rate": 4.1690707649751435e-05,
"loss": 4.1921,
"num_input_tokens_seen": 281755648,
"step": 4300,
"train_runtime": 39175.0783,
"train_tokens_per_second": 7192.217
},
{
"epoch": 1.1601938480177694,
"grad_norm": 0.9609375,
"learning_rate": 4.1650101587687795e-05,
"loss": 4.1002,
"num_input_tokens_seen": 282411008,
"step": 4310,
"train_runtime": 39265.6359,
"train_tokens_per_second": 7192.32
},
{
"epoch": 1.1628861815979,
"grad_norm": 0.9921875,
"learning_rate": 4.1609416430916417e-05,
"loss": 4.1173,
"num_input_tokens_seen": 283066368,
"step": 4320,
"train_runtime": 39356.8918,
"train_tokens_per_second": 7192.295
},
{
"epoch": 1.1655785151780305,
"grad_norm": 1.015625,
"learning_rate": 4.156865237270822e-05,
"loss": 4.1631,
"num_input_tokens_seen": 283721728,
"step": 4330,
"train_runtime": 39447.6761,
"train_tokens_per_second": 7192.356
},
{
"epoch": 1.1682708487581612,
"grad_norm": 0.9765625,
"learning_rate": 4.1527809606708955e-05,
"loss": 4.1155,
"num_input_tokens_seen": 284377088,
"step": 4340,
"train_runtime": 39538.1818,
"train_tokens_per_second": 7192.468
},
{
"epoch": 1.1709631823382918,
"grad_norm": 1.015625,
"learning_rate": 4.148688832693827e-05,
"loss": 4.0624,
"num_input_tokens_seen": 285032448,
"step": 4350,
"train_runtime": 39629.3436,
"train_tokens_per_second": 7192.459
},
{
"epoch": 1.1736555159184223,
"grad_norm": 1.1171875,
"learning_rate": 4.144588872778874e-05,
"loss": 4.0805,
"num_input_tokens_seen": 285687808,
"step": 4360,
"train_runtime": 39720.1522,
"train_tokens_per_second": 7192.515
},
{
"epoch": 1.1763478494985529,
"grad_norm": 1.03125,
"learning_rate": 4.1404811004025043e-05,
"loss": 4.1736,
"num_input_tokens_seen": 286343168,
"step": 4370,
"train_runtime": 39811.0389,
"train_tokens_per_second": 7192.557
},
{
"epoch": 1.1790401830786834,
"grad_norm": 0.9296875,
"learning_rate": 4.136365535078296e-05,
"loss": 4.0432,
"num_input_tokens_seen": 286998528,
"step": 4380,
"train_runtime": 39901.6261,
"train_tokens_per_second": 7192.652
},
{
"epoch": 1.181732516658814,
"grad_norm": 0.9765625,
"learning_rate": 4.132242196356846e-05,
"loss": 4.1073,
"num_input_tokens_seen": 287653888,
"step": 4390,
"train_runtime": 39992.7877,
"train_tokens_per_second": 7192.644
},
{
"epoch": 1.1844248502389445,
"grad_norm": 1.0,
"learning_rate": 4.128111103825679e-05,
"loss": 4.1021,
"num_input_tokens_seen": 288309248,
"step": 4400,
"train_runtime": 40083.5048,
"train_tokens_per_second": 7192.716
},
{
"epoch": 1.1871171838190753,
"grad_norm": 0.9765625,
"learning_rate": 4.123972277109153e-05,
"loss": 4.0843,
"num_input_tokens_seen": 288964608,
"step": 4410,
"train_runtime": 40174.7665,
"train_tokens_per_second": 7192.689
},
{
"epoch": 1.1898095173992058,
"grad_norm": 0.96484375,
"learning_rate": 4.119825735868367e-05,
"loss": 4.0853,
"num_input_tokens_seen": 289619968,
"step": 4420,
"train_runtime": 40265.4969,
"train_tokens_per_second": 7192.758
},
{
"epoch": 1.1925018509793364,
"grad_norm": 1.0078125,
"learning_rate": 4.115671499801066e-05,
"loss": 4.0748,
"num_input_tokens_seen": 290275328,
"step": 4430,
"train_runtime": 40356.3231,
"train_tokens_per_second": 7192.809
},
{
"epoch": 1.195194184559467,
"grad_norm": 1.0,
"learning_rate": 4.11150958864155e-05,
"loss": 4.1187,
"num_input_tokens_seen": 290930688,
"step": 4440,
"train_runtime": 40447.5539,
"train_tokens_per_second": 7192.788
},
{
"epoch": 1.1978865181395975,
"grad_norm": 0.94140625,
"learning_rate": 4.107340022160577e-05,
"loss": 4.1389,
"num_input_tokens_seen": 291586048,
"step": 4450,
"train_runtime": 40538.5359,
"train_tokens_per_second": 7192.812
},
{
"epoch": 1.200578851719728,
"grad_norm": 1.0078125,
"learning_rate": 4.1031628201652726e-05,
"loss": 4.0362,
"num_input_tokens_seen": 292241408,
"step": 4460,
"train_runtime": 40628.8905,
"train_tokens_per_second": 7192.946
},
{
"epoch": 1.2032711852998585,
"grad_norm": 1.0234375,
"learning_rate": 4.098978002499035e-05,
"loss": 4.1541,
"num_input_tokens_seen": 292896768,
"step": 4470,
"train_runtime": 40719.8886,
"train_tokens_per_second": 7192.966
},
{
"epoch": 1.2059635188799893,
"grad_norm": 1.0625,
"learning_rate": 4.094785589041436e-05,
"loss": 4.0388,
"num_input_tokens_seen": 293552128,
"step": 4480,
"train_runtime": 40810.8039,
"train_tokens_per_second": 7193.0
},
{
"epoch": 1.2086558524601199,
"grad_norm": 1.1171875,
"learning_rate": 4.0905855997081345e-05,
"loss": 4.1092,
"num_input_tokens_seen": 294207488,
"step": 4490,
"train_runtime": 40901.4178,
"train_tokens_per_second": 7193.088
},
{
"epoch": 1.2113481860402504,
"grad_norm": 0.9453125,
"learning_rate": 4.0863780544507756e-05,
"loss": 3.9746,
"num_input_tokens_seen": 294862848,
"step": 4500,
"train_runtime": 40992.0143,
"train_tokens_per_second": 7193.178
},
{
"epoch": 1.214040519620381,
"grad_norm": 0.984375,
"learning_rate": 4.082162973256898e-05,
"loss": 4.1136,
"num_input_tokens_seen": 295518208,
"step": 4510,
"train_runtime": 41103.9394,
"train_tokens_per_second": 7189.535
},
{
"epoch": 1.2167328532005115,
"grad_norm": 1.0625,
"learning_rate": 4.0779403761498414e-05,
"loss": 4.0529,
"num_input_tokens_seen": 296173568,
"step": 4520,
"train_runtime": 41197.3252,
"train_tokens_per_second": 7189.146
},
{
"epoch": 1.219425186780642,
"grad_norm": 0.95703125,
"learning_rate": 4.0737102831886465e-05,
"loss": 4.203,
"num_input_tokens_seen": 296828928,
"step": 4530,
"train_runtime": 41286.8522,
"train_tokens_per_second": 7189.43
},
{
"epoch": 1.2221175203607726,
"grad_norm": 0.96875,
"learning_rate": 4.069472714467965e-05,
"loss": 4.1483,
"num_input_tokens_seen": 297484288,
"step": 4540,
"train_runtime": 41377.9341,
"train_tokens_per_second": 7189.443
},
{
"epoch": 1.2248098539409034,
"grad_norm": 0.94140625,
"learning_rate": 4.0652276901179574e-05,
"loss": 4.0595,
"num_input_tokens_seen": 298139648,
"step": 4550,
"train_runtime": 41468.2417,
"train_tokens_per_second": 7189.59
},
{
"epoch": 1.227502187521034,
"grad_norm": 0.95703125,
"learning_rate": 4.0609752303042063e-05,
"loss": 4.0823,
"num_input_tokens_seen": 298795008,
"step": 4560,
"train_runtime": 41559.5192,
"train_tokens_per_second": 7189.568
},
{
"epoch": 1.2301945211011645,
"grad_norm": 1.09375,
"learning_rate": 4.0567153552276125e-05,
"loss": 4.1371,
"num_input_tokens_seen": 299450368,
"step": 4570,
"train_runtime": 41650.0624,
"train_tokens_per_second": 7189.674
},
{
"epoch": 1.232886854681295,
"grad_norm": 1.109375,
"learning_rate": 4.0524480851243026e-05,
"loss": 4.0857,
"num_input_tokens_seen": 300105728,
"step": 4580,
"train_runtime": 41741.0104,
"train_tokens_per_second": 7189.709
},
{
"epoch": 1.2355791882614255,
"grad_norm": 1.0390625,
"learning_rate": 4.048173440265535e-05,
"loss": 4.0981,
"num_input_tokens_seen": 300761088,
"step": 4590,
"train_runtime": 41831.73,
"train_tokens_per_second": 7189.784
},
{
"epoch": 1.238271521841556,
"grad_norm": 0.9609375,
"learning_rate": 4.043891440957598e-05,
"loss": 4.0393,
"num_input_tokens_seen": 301416448,
"step": 4600,
"train_runtime": 41922.3234,
"train_tokens_per_second": 7189.879
},
{
"epoch": 1.2409638554216866,
"grad_norm": 0.9296875,
"learning_rate": 4.039602107541717e-05,
"loss": 3.9847,
"num_input_tokens_seen": 302071808,
"step": 4610,
"train_runtime": 42012.9274,
"train_tokens_per_second": 7189.973
},
{
"epoch": 1.2436561890018174,
"grad_norm": 0.94140625,
"learning_rate": 4.035305460393961e-05,
"loss": 3.9565,
"num_input_tokens_seen": 302727168,
"step": 4620,
"train_runtime": 42104.2059,
"train_tokens_per_second": 7189.951
},
{
"epoch": 1.246348522581948,
"grad_norm": 1.0390625,
"learning_rate": 4.0310015199251375e-05,
"loss": 3.9737,
"num_input_tokens_seen": 303382528,
"step": 4630,
"train_runtime": 42194.6319,
"train_tokens_per_second": 7190.074
},
{
"epoch": 1.2490408561620785,
"grad_norm": 0.984375,
"learning_rate": 4.0266903065807013e-05,
"loss": 4.1593,
"num_input_tokens_seen": 304037888,
"step": 4640,
"train_runtime": 42285.6276,
"train_tokens_per_second": 7190.1
},
{
"epoch": 1.251733189742209,
"grad_norm": 0.96875,
"learning_rate": 4.0223718408406593e-05,
"loss": 4.0606,
"num_input_tokens_seen": 304693248,
"step": 4650,
"train_runtime": 42376.5765,
"train_tokens_per_second": 7190.134
},
{
"epoch": 1.2544255233223396,
"grad_norm": 0.9453125,
"learning_rate": 4.018046143219466e-05,
"loss": 4.0414,
"num_input_tokens_seen": 305348608,
"step": 4660,
"train_runtime": 42467.4287,
"train_tokens_per_second": 7190.184
},
{
"epoch": 1.2571178569024704,
"grad_norm": 1.046875,
"learning_rate": 4.0137132342659345e-05,
"loss": 4.0417,
"num_input_tokens_seen": 306003968,
"step": 4670,
"train_runtime": 42558.2623,
"train_tokens_per_second": 7190.236
},
{
"epoch": 1.2598101904826007,
"grad_norm": 1.03125,
"learning_rate": 4.00937313456313e-05,
"loss": 4.0062,
"num_input_tokens_seen": 306659328,
"step": 4680,
"train_runtime": 42649.2409,
"train_tokens_per_second": 7190.265
},
{
"epoch": 1.2625025240627314,
"grad_norm": 0.94921875,
"learning_rate": 4.0050258647282815e-05,
"loss": 4.0743,
"num_input_tokens_seen": 307314688,
"step": 4690,
"train_runtime": 42739.8579,
"train_tokens_per_second": 7190.354
},
{
"epoch": 1.265194857642862,
"grad_norm": 0.9765625,
"learning_rate": 4.0006714454126756e-05,
"loss": 4.063,
"num_input_tokens_seen": 307970048,
"step": 4700,
"train_runtime": 42830.7285,
"train_tokens_per_second": 7190.399
},
{
"epoch": 1.2678871912229925,
"grad_norm": 0.98828125,
"learning_rate": 3.9963098973015625e-05,
"loss": 4.1403,
"num_input_tokens_seen": 308625408,
"step": 4710,
"train_runtime": 42921.0004,
"train_tokens_per_second": 7190.546
},
{
"epoch": 1.270579524803123,
"grad_norm": 1.1015625,
"learning_rate": 3.991941241114057e-05,
"loss": 4.1257,
"num_input_tokens_seen": 309280768,
"step": 4720,
"train_runtime": 43011.9497,
"train_tokens_per_second": 7190.578
},
{
"epoch": 1.2732718583832536,
"grad_norm": 0.98046875,
"learning_rate": 3.9875654976030416e-05,
"loss": 4.0648,
"num_input_tokens_seen": 309936128,
"step": 4730,
"train_runtime": 43102.7908,
"train_tokens_per_second": 7190.628
},
{
"epoch": 1.2759641919633844,
"grad_norm": 1.0,
"learning_rate": 3.983182687555066e-05,
"loss": 3.9744,
"num_input_tokens_seen": 310591488,
"step": 4740,
"train_runtime": 43193.9112,
"train_tokens_per_second": 7190.631
},
{
"epoch": 1.2786565255435147,
"grad_norm": 0.984375,
"learning_rate": 3.9787928317902465e-05,
"loss": 4.0177,
"num_input_tokens_seen": 311246848,
"step": 4750,
"train_runtime": 43284.833,
"train_tokens_per_second": 7190.668
},
{
"epoch": 1.2813488591236455,
"grad_norm": 1.0546875,
"learning_rate": 3.974395951162172e-05,
"loss": 4.1278,
"num_input_tokens_seen": 311902208,
"step": 4760,
"train_runtime": 43375.2348,
"train_tokens_per_second": 7190.79
},
{
"epoch": 1.284041192703776,
"grad_norm": 0.94921875,
"learning_rate": 3.9699920665578016e-05,
"loss": 3.9982,
"num_input_tokens_seen": 312557568,
"step": 4770,
"train_runtime": 43466.3882,
"train_tokens_per_second": 7190.788
},
{
"epoch": 1.2867335262839066,
"grad_norm": 0.9296875,
"learning_rate": 3.965581198897366e-05,
"loss": 3.9781,
"num_input_tokens_seen": 313212928,
"step": 4780,
"train_runtime": 43556.6919,
"train_tokens_per_second": 7190.926
},
{
"epoch": 1.2894258598640371,
"grad_norm": 1.0078125,
"learning_rate": 3.961163369134266e-05,
"loss": 3.9825,
"num_input_tokens_seen": 313868288,
"step": 4790,
"train_runtime": 43647.9657,
"train_tokens_per_second": 7190.903
},
{
"epoch": 1.2921181934441677,
"grad_norm": 0.98828125,
"learning_rate": 3.956738598254981e-05,
"loss": 4.1024,
"num_input_tokens_seen": 314523648,
"step": 4800,
"train_runtime": 43738.6087,
"train_tokens_per_second": 7190.984
},
{
"epoch": 1.2948105270242984,
"grad_norm": 0.99609375,
"learning_rate": 3.9523069072789576e-05,
"loss": 4.0959,
"num_input_tokens_seen": 315179008,
"step": 4810,
"train_runtime": 43829.2011,
"train_tokens_per_second": 7191.074
},
{
"epoch": 1.2975028606044288,
"grad_norm": 1.0390625,
"learning_rate": 3.947868317258519e-05,
"loss": 4.0322,
"num_input_tokens_seen": 315834368,
"step": 4820,
"train_runtime": 43920.1552,
"train_tokens_per_second": 7191.103
},
{
"epoch": 1.3001951941845595,
"grad_norm": 1.046875,
"learning_rate": 3.943422849278759e-05,
"loss": 4.0566,
"num_input_tokens_seen": 316489728,
"step": 4830,
"train_runtime": 44010.9066,
"train_tokens_per_second": 7191.166
},
{
"epoch": 1.30288752776469,
"grad_norm": 1.078125,
"learning_rate": 3.938970524457449e-05,
"loss": 4.0945,
"num_input_tokens_seen": 317145088,
"step": 4840,
"train_runtime": 44101.059,
"train_tokens_per_second": 7191.326
},
{
"epoch": 1.3055798613448206,
"grad_norm": 1.125,
"learning_rate": 3.9345113639449274e-05,
"loss": 4.0747,
"num_input_tokens_seen": 317800448,
"step": 4850,
"train_runtime": 44191.7055,
"train_tokens_per_second": 7191.405
},
{
"epoch": 1.3082721949249512,
"grad_norm": 1.0703125,
"learning_rate": 3.930045388924008e-05,
"loss": 4.0492,
"num_input_tokens_seen": 318455808,
"step": 4860,
"train_runtime": 44282.9561,
"train_tokens_per_second": 7191.385
},
{
"epoch": 1.3109645285050817,
"grad_norm": 0.96875,
"learning_rate": 3.925572620609878e-05,
"loss": 4.0212,
"num_input_tokens_seen": 319111168,
"step": 4870,
"train_runtime": 44373.8261,
"train_tokens_per_second": 7191.428
},
{
"epoch": 1.3136568620852125,
"grad_norm": 0.95703125,
"learning_rate": 3.921093080249992e-05,
"loss": 4.0731,
"num_input_tokens_seen": 319766528,
"step": 4880,
"train_runtime": 44464.7093,
"train_tokens_per_second": 7191.468
},
{
"epoch": 1.3163491956653428,
"grad_norm": 0.98046875,
"learning_rate": 3.916606789123977e-05,
"loss": 4.094,
"num_input_tokens_seen": 320421888,
"step": 4890,
"train_runtime": 44554.734,
"train_tokens_per_second": 7191.646
},
{
"epoch": 1.3190415292454736,
"grad_norm": 1.03125,
"learning_rate": 3.912113768543526e-05,
"loss": 3.9722,
"num_input_tokens_seen": 321077248,
"step": 4900,
"train_runtime": 44645.687,
"train_tokens_per_second": 7191.674
},
{
"epoch": 1.3217338628256041,
"grad_norm": 0.9140625,
"learning_rate": 3.907614039852304e-05,
"loss": 4.0809,
"num_input_tokens_seen": 321732608,
"step": 4910,
"train_runtime": 44736.4925,
"train_tokens_per_second": 7191.726
},
{
"epoch": 1.3244261964057347,
"grad_norm": 0.98046875,
"learning_rate": 3.903107624425838e-05,
"loss": 4.0212,
"num_input_tokens_seen": 322387968,
"step": 4920,
"train_runtime": 44827.482,
"train_tokens_per_second": 7191.748
},
{
"epoch": 1.3271185299858652,
"grad_norm": 0.953125,
"learning_rate": 3.898594543671422e-05,
"loss": 4.0774,
"num_input_tokens_seen": 323043328,
"step": 4930,
"train_runtime": 44918.0801,
"train_tokens_per_second": 7191.833
},
{
"epoch": 1.3298108635659958,
"grad_norm": 1.046875,
"learning_rate": 3.894074819028013e-05,
"loss": 4.0552,
"num_input_tokens_seen": 323698688,
"step": 4940,
"train_runtime": 45008.9135,
"train_tokens_per_second": 7191.88
},
{
"epoch": 1.3325031971461265,
"grad_norm": 1.0234375,
"learning_rate": 3.889548471966128e-05,
"loss": 4.0884,
"num_input_tokens_seen": 324354048,
"step": 4950,
"train_runtime": 45099.1757,
"train_tokens_per_second": 7192.017
},
{
"epoch": 1.3351955307262569,
"grad_norm": 0.97265625,
"learning_rate": 3.885015523987744e-05,
"loss": 3.992,
"num_input_tokens_seen": 325009408,
"step": 4960,
"train_runtime": 45190.2096,
"train_tokens_per_second": 7192.031
},
{
"epoch": 1.3378878643063876,
"grad_norm": 0.9453125,
"learning_rate": 3.880475996626195e-05,
"loss": 4.0486,
"num_input_tokens_seen": 325664768,
"step": 4970,
"train_runtime": 45280.9408,
"train_tokens_per_second": 7192.094
},
{
"epoch": 1.3405801978865182,
"grad_norm": 1.015625,
"learning_rate": 3.875929911446069e-05,
"loss": 3.9412,
"num_input_tokens_seen": 326320128,
"step": 4980,
"train_runtime": 45371.3567,
"train_tokens_per_second": 7192.206
},
{
"epoch": 1.3432725314666487,
"grad_norm": 1.03125,
"learning_rate": 3.8713772900431075e-05,
"loss": 4.0498,
"num_input_tokens_seen": 326975488,
"step": 4990,
"train_runtime": 45462.0751,
"train_tokens_per_second": 7192.269
},
{
"epoch": 1.3459648650467793,
"grad_norm": 0.9765625,
"learning_rate": 3.866818154044101e-05,
"loss": 4.0712,
"num_input_tokens_seen": 327630848,
"step": 5000,
"train_runtime": 45552.9489,
"train_tokens_per_second": 7192.308
},
{
"epoch": 1.3486571986269098,
"grad_norm": 1.03125,
"learning_rate": 3.8622525251067864e-05,
"loss": 4.0049,
"num_input_tokens_seen": 328286208,
"step": 5010,
"train_runtime": 45661.5242,
"train_tokens_per_second": 7189.559
},
{
"epoch": 1.3513495322070406,
"grad_norm": 0.953125,
"learning_rate": 3.8576804249197456e-05,
"loss": 4.0657,
"num_input_tokens_seen": 328941568,
"step": 5020,
"train_runtime": 45752.0747,
"train_tokens_per_second": 7189.654
},
{
"epoch": 1.354041865787171,
"grad_norm": 1.09375,
"learning_rate": 3.853101875202301e-05,
"loss": 4.0619,
"num_input_tokens_seen": 329596928,
"step": 5030,
"train_runtime": 45843.0305,
"train_tokens_per_second": 7189.685
},
{
"epoch": 1.3567341993673017,
"grad_norm": 1.03125,
"learning_rate": 3.848516897704414e-05,
"loss": 4.1361,
"num_input_tokens_seen": 330252288,
"step": 5040,
"train_runtime": 45933.7716,
"train_tokens_per_second": 7189.749
},
{
"epoch": 1.3594265329474322,
"grad_norm": 0.9140625,
"learning_rate": 3.843925514206578e-05,
"loss": 4.083,
"num_input_tokens_seen": 330907648,
"step": 5050,
"train_runtime": 46024.504,
"train_tokens_per_second": 7189.815
},
{
"epoch": 1.3621188665275628,
"grad_norm": 1.0078125,
"learning_rate": 3.839327746519721e-05,
"loss": 4.0152,
"num_input_tokens_seen": 331563008,
"step": 5060,
"train_runtime": 46115.7035,
"train_tokens_per_second": 7189.807
},
{
"epoch": 1.3648112001076933,
"grad_norm": 0.99609375,
"learning_rate": 3.834723616485095e-05,
"loss": 4.0064,
"num_input_tokens_seen": 332218368,
"step": 5070,
"train_runtime": 46206.8061,
"train_tokens_per_second": 7189.815
},
{
"epoch": 1.3675035336878238,
"grad_norm": 1.0390625,
"learning_rate": 3.8301131459741786e-05,
"loss": 3.984,
"num_input_tokens_seen": 332873728,
"step": 5080,
"train_runtime": 46298.4821,
"train_tokens_per_second": 7189.733
},
{
"epoch": 1.3701958672679546,
"grad_norm": 1.03125,
"learning_rate": 3.825496356888568e-05,
"loss": 4.0889,
"num_input_tokens_seen": 333529088,
"step": 5090,
"train_runtime": 46389.6726,
"train_tokens_per_second": 7189.727
},
{
"epoch": 1.3728882008480852,
"grad_norm": 1.109375,
"learning_rate": 3.820873271159877e-05,
"loss": 3.9429,
"num_input_tokens_seen": 334184448,
"step": 5100,
"train_runtime": 46480.8345,
"train_tokens_per_second": 7189.726
},
{
"epoch": 1.3755805344282157,
"grad_norm": 1.03125,
"learning_rate": 3.816243910749629e-05,
"loss": 4.0672,
"num_input_tokens_seen": 334839808,
"step": 5110,
"train_runtime": 46572.051,
"train_tokens_per_second": 7189.716
},
{
"epoch": 1.3782728680083463,
"grad_norm": 1.0078125,
"learning_rate": 3.8116082976491576e-05,
"loss": 3.9873,
"num_input_tokens_seen": 335495168,
"step": 5120,
"train_runtime": 46663.0437,
"train_tokens_per_second": 7189.74
},
{
"epoch": 1.3809652015884768,
"grad_norm": 0.9609375,
"learning_rate": 3.8069664538794955e-05,
"loss": 4.0912,
"num_input_tokens_seen": 336150528,
"step": 5130,
"train_runtime": 46753.8595,
"train_tokens_per_second": 7189.792
},
{
"epoch": 1.3836575351686073,
"grad_norm": 0.97265625,
"learning_rate": 3.802318401491277e-05,
"loss": 4.057,
"num_input_tokens_seen": 336805888,
"step": 5140,
"train_runtime": 46844.4928,
"train_tokens_per_second": 7189.872
},
{
"epoch": 1.386349868748738,
"grad_norm": 0.96875,
"learning_rate": 3.797664162564626e-05,
"loss": 4.0411,
"num_input_tokens_seen": 337461248,
"step": 5150,
"train_runtime": 46935.7326,
"train_tokens_per_second": 7189.858
},
{
"epoch": 1.3890422023288687,
"grad_norm": 0.9765625,
"learning_rate": 3.7930037592090605e-05,
"loss": 4.0293,
"num_input_tokens_seen": 338116608,
"step": 5160,
"train_runtime": 47026.6833,
"train_tokens_per_second": 7189.888
},
{
"epoch": 1.3917345359089992,
"grad_norm": 0.96484375,
"learning_rate": 3.788337213563377e-05,
"loss": 3.9478,
"num_input_tokens_seen": 338771968,
"step": 5170,
"train_runtime": 47118.4452,
"train_tokens_per_second": 7189.795
},
{
"epoch": 1.3944268694891297,
"grad_norm": 1.046875,
"learning_rate": 3.7836645477955524e-05,
"loss": 4.0632,
"num_input_tokens_seen": 339427328,
"step": 5180,
"train_runtime": 47209.0334,
"train_tokens_per_second": 7189.881
},
{
"epoch": 1.3971192030692603,
"grad_norm": 0.94140625,
"learning_rate": 3.7789857841026363e-05,
"loss": 3.9842,
"num_input_tokens_seen": 340082688,
"step": 5190,
"train_runtime": 47299.8697,
"train_tokens_per_second": 7189.929
},
{
"epoch": 1.3998115366493908,
"grad_norm": 1.0703125,
"learning_rate": 3.774300944710647e-05,
"loss": 4.0714,
"num_input_tokens_seen": 340738048,
"step": 5200,
"train_runtime": 47390.6283,
"train_tokens_per_second": 7189.988
},
{
"epoch": 1.4025038702295214,
"grad_norm": 0.9140625,
"learning_rate": 3.769610051874463e-05,
"loss": 4.0991,
"num_input_tokens_seen": 341393408,
"step": 5210,
"train_runtime": 47481.7596,
"train_tokens_per_second": 7189.991
},
{
"epoch": 1.405196203809652,
"grad_norm": 0.95703125,
"learning_rate": 3.7649131278777194e-05,
"loss": 4.0515,
"num_input_tokens_seen": 342048768,
"step": 5220,
"train_runtime": 47572.5591,
"train_tokens_per_second": 7190.043
},
{
"epoch": 1.4078885373897827,
"grad_norm": 0.953125,
"learning_rate": 3.7602101950327036e-05,
"loss": 3.9602,
"num_input_tokens_seen": 342704128,
"step": 5230,
"train_runtime": 47663.3771,
"train_tokens_per_second": 7190.093
},
{
"epoch": 1.4105808709699132,
"grad_norm": 0.98828125,
"learning_rate": 3.755501275680245e-05,
"loss": 3.9706,
"num_input_tokens_seen": 343359488,
"step": 5240,
"train_runtime": 47754.3796,
"train_tokens_per_second": 7190.115
},
{
"epoch": 1.4132732045500438,
"grad_norm": 1.03125,
"learning_rate": 3.7507863921896144e-05,
"loss": 4.0457,
"num_input_tokens_seen": 344014848,
"step": 5250,
"train_runtime": 47845.8588,
"train_tokens_per_second": 7190.065
},
{
"epoch": 1.4159655381301743,
"grad_norm": 0.98046875,
"learning_rate": 3.7460655669584114e-05,
"loss": 4.0667,
"num_input_tokens_seen": 344670208,
"step": 5260,
"train_runtime": 47936.3933,
"train_tokens_per_second": 7190.157
},
{
"epoch": 1.4186578717103049,
"grad_norm": 0.96875,
"learning_rate": 3.741338822412463e-05,
"loss": 3.9787,
"num_input_tokens_seen": 345325568,
"step": 5270,
"train_runtime": 48027.5225,
"train_tokens_per_second": 7190.16
},
{
"epoch": 1.4213502052904354,
"grad_norm": 1.046875,
"learning_rate": 3.736606181005715e-05,
"loss": 4.092,
"num_input_tokens_seen": 345980928,
"step": 5280,
"train_runtime": 48118.394,
"train_tokens_per_second": 7190.201
},
{
"epoch": 1.424042538870566,
"grad_norm": 0.9765625,
"learning_rate": 3.731867665220124e-05,
"loss": 4.0368,
"num_input_tokens_seen": 346636288,
"step": 5290,
"train_runtime": 48209.5568,
"train_tokens_per_second": 7190.199
},
{
"epoch": 1.4267348724506967,
"grad_norm": 0.9453125,
"learning_rate": 3.727123297565557e-05,
"loss": 4.1142,
"num_input_tokens_seen": 347291648,
"step": 5300,
"train_runtime": 48300.6605,
"train_tokens_per_second": 7190.205
},
{
"epoch": 1.4294272060308273,
"grad_norm": 1.0390625,
"learning_rate": 3.722373100579674e-05,
"loss": 4.0787,
"num_input_tokens_seen": 347947008,
"step": 5310,
"train_runtime": 48391.9849,
"train_tokens_per_second": 7190.178
},
{
"epoch": 1.4321195396109578,
"grad_norm": 0.953125,
"learning_rate": 3.717617096827831e-05,
"loss": 3.9859,
"num_input_tokens_seen": 348602368,
"step": 5320,
"train_runtime": 48483.2543,
"train_tokens_per_second": 7190.16
},
{
"epoch": 1.4348118731910884,
"grad_norm": 1.0703125,
"learning_rate": 3.712855308902967e-05,
"loss": 4.0245,
"num_input_tokens_seen": 349257728,
"step": 5330,
"train_runtime": 48574.3918,
"train_tokens_per_second": 7190.162
},
{
"epoch": 1.437504206771219,
"grad_norm": 1.0625,
"learning_rate": 3.708087759425497e-05,
"loss": 3.9664,
"num_input_tokens_seen": 349913088,
"step": 5340,
"train_runtime": 48666.116,
"train_tokens_per_second": 7190.076
},
{
"epoch": 1.4401965403513495,
"grad_norm": 1.0078125,
"learning_rate": 3.703314471043206e-05,
"loss": 3.9756,
"num_input_tokens_seen": 350568448,
"step": 5350,
"train_runtime": 48757.3819,
"train_tokens_per_second": 7190.059
},
{
"epoch": 1.44288887393148,
"grad_norm": 0.9921875,
"learning_rate": 3.6985354664311434e-05,
"loss": 4.0478,
"num_input_tokens_seen": 351223808,
"step": 5360,
"train_runtime": 48847.8506,
"train_tokens_per_second": 7190.159
},
{
"epoch": 1.4455812075116108,
"grad_norm": 0.98046875,
"learning_rate": 3.6937507682915105e-05,
"loss": 4.1172,
"num_input_tokens_seen": 351879168,
"step": 5370,
"train_runtime": 48940.0294,
"train_tokens_per_second": 7190.007
},
{
"epoch": 1.4482735410917413,
"grad_norm": 1.015625,
"learning_rate": 3.688960399353557e-05,
"loss": 3.9969,
"num_input_tokens_seen": 352534528,
"step": 5380,
"train_runtime": 49030.843,
"train_tokens_per_second": 7190.056
},
{
"epoch": 1.4509658746718719,
"grad_norm": 1.0,
"learning_rate": 3.684164382373469e-05,
"loss": 4.0159,
"num_input_tokens_seen": 353189888,
"step": 5390,
"train_runtime": 49121.8378,
"train_tokens_per_second": 7190.079
},
{
"epoch": 1.4536582082520024,
"grad_norm": 1.0234375,
"learning_rate": 3.6793627401342655e-05,
"loss": 3.9705,
"num_input_tokens_seen": 353845248,
"step": 5400,
"train_runtime": 49213.3395,
"train_tokens_per_second": 7190.027
},
{
"epoch": 1.456350541832133,
"grad_norm": 1.015625,
"learning_rate": 3.6745554954456854e-05,
"loss": 4.0728,
"num_input_tokens_seen": 354500608,
"step": 5410,
"train_runtime": 49304.2357,
"train_tokens_per_second": 7190.064
},
{
"epoch": 1.4590428754122635,
"grad_norm": 0.98828125,
"learning_rate": 3.669742671144084e-05,
"loss": 4.0177,
"num_input_tokens_seen": 355155968,
"step": 5420,
"train_runtime": 49398.2246,
"train_tokens_per_second": 7189.65
},
{
"epoch": 1.461735208992394,
"grad_norm": 1.0390625,
"learning_rate": 3.66492429009232e-05,
"loss": 4.0315,
"num_input_tokens_seen": 355811328,
"step": 5430,
"train_runtime": 49488.7546,
"train_tokens_per_second": 7189.741
},
{
"epoch": 1.4644275425725248,
"grad_norm": 0.97265625,
"learning_rate": 3.660100375179649e-05,
"loss": 3.9762,
"num_input_tokens_seen": 356466688,
"step": 5440,
"train_runtime": 49580.1514,
"train_tokens_per_second": 7189.706
},
{
"epoch": 1.4671198761526554,
"grad_norm": 1.1015625,
"learning_rate": 3.655270949321616e-05,
"loss": 3.9728,
"num_input_tokens_seen": 357122048,
"step": 5450,
"train_runtime": 49671.019,
"train_tokens_per_second": 7189.747
},
{
"epoch": 1.469812209732786,
"grad_norm": 1.0078125,
"learning_rate": 3.650436035459945e-05,
"loss": 4.0728,
"num_input_tokens_seen": 357777408,
"step": 5460,
"train_runtime": 49761.4624,
"train_tokens_per_second": 7189.849
},
{
"epoch": 1.4725045433129165,
"grad_norm": 0.9453125,
"learning_rate": 3.6455956565624286e-05,
"loss": 4.0256,
"num_input_tokens_seen": 358432768,
"step": 5470,
"train_runtime": 49852.6514,
"train_tokens_per_second": 7189.844
},
{
"epoch": 1.475196876893047,
"grad_norm": 0.95703125,
"learning_rate": 3.6407498356228225e-05,
"loss": 3.9752,
"num_input_tokens_seen": 359088128,
"step": 5480,
"train_runtime": 49943.9357,
"train_tokens_per_second": 7189.824
},
{
"epoch": 1.4778892104731776,
"grad_norm": 1.0078125,
"learning_rate": 3.635898595660733e-05,
"loss": 4.0104,
"num_input_tokens_seen": 359743488,
"step": 5490,
"train_runtime": 50034.7254,
"train_tokens_per_second": 7189.876
},
{
"epoch": 1.480581544053308,
"grad_norm": 1.0078125,
"learning_rate": 3.63104195972151e-05,
"loss": 3.9878,
"num_input_tokens_seen": 360398848,
"step": 5500,
"train_runtime": 50125.3273,
"train_tokens_per_second": 7189.955
},
{
"epoch": 1.4832738776334389,
"grad_norm": 1.03125,
"learning_rate": 3.626179950876134e-05,
"loss": 4.1044,
"num_input_tokens_seen": 361054208,
"step": 5510,
"train_runtime": 50233.9465,
"train_tokens_per_second": 7187.455
},
{
"epoch": 1.4859662112135694,
"grad_norm": 1.046875,
"learning_rate": 3.6213125922211135e-05,
"loss": 3.9922,
"num_input_tokens_seen": 361709568,
"step": 5520,
"train_runtime": 50325.7876,
"train_tokens_per_second": 7187.36
},
{
"epoch": 1.4886585447937,
"grad_norm": 0.984375,
"learning_rate": 3.616439906878367e-05,
"loss": 4.009,
"num_input_tokens_seen": 362364928,
"step": 5530,
"train_runtime": 50417.0272,
"train_tokens_per_second": 7187.352
},
{
"epoch": 1.4913508783738305,
"grad_norm": 1.0078125,
"learning_rate": 3.611561917995117e-05,
"loss": 4.0106,
"num_input_tokens_seen": 363020288,
"step": 5540,
"train_runtime": 50508.3602,
"train_tokens_per_second": 7187.331
},
{
"epoch": 1.494043211953961,
"grad_norm": 0.98046875,
"learning_rate": 3.606678648743783e-05,
"loss": 3.9998,
"num_input_tokens_seen": 363675648,
"step": 5550,
"train_runtime": 50599.5219,
"train_tokens_per_second": 7187.334
},
{
"epoch": 1.4967355455340916,
"grad_norm": 1.0,
"learning_rate": 3.601790122321864e-05,
"loss": 4.0427,
"num_input_tokens_seen": 364331008,
"step": 5560,
"train_runtime": 50690.8134,
"train_tokens_per_second": 7187.318
},
{
"epoch": 1.4994278791142222,
"grad_norm": 0.921875,
"learning_rate": 3.5968963619518366e-05,
"loss": 3.9806,
"num_input_tokens_seen": 364986368,
"step": 5570,
"train_runtime": 50782.1551,
"train_tokens_per_second": 7187.296
},
{
"epoch": 1.502120212694353,
"grad_norm": 1.03125,
"learning_rate": 3.591997390881039e-05,
"loss": 4.0842,
"num_input_tokens_seen": 365641728,
"step": 5580,
"train_runtime": 50873.1287,
"train_tokens_per_second": 7187.325
},
{
"epoch": 1.5048125462744832,
"grad_norm": 1.0546875,
"learning_rate": 3.5870932323815624e-05,
"loss": 4.062,
"num_input_tokens_seen": 366297088,
"step": 5590,
"train_runtime": 50964.1659,
"train_tokens_per_second": 7187.346
},
{
"epoch": 1.507504879854614,
"grad_norm": 0.92578125,
"learning_rate": 3.582183909750141e-05,
"loss": 3.9796,
"num_input_tokens_seen": 366952448,
"step": 5600,
"train_runtime": 51055.039,
"train_tokens_per_second": 7187.389
},
{
"epoch": 1.5101972134347446,
"grad_norm": 1.0546875,
"learning_rate": 3.5772694463080394e-05,
"loss": 3.916,
"num_input_tokens_seen": 367607808,
"step": 5610,
"train_runtime": 51145.8693,
"train_tokens_per_second": 7187.439
},
{
"epoch": 1.512889547014875,
"grad_norm": 1.0234375,
"learning_rate": 3.572349865400944e-05,
"loss": 4.062,
"num_input_tokens_seen": 368263168,
"step": 5620,
"train_runtime": 51237.2955,
"train_tokens_per_second": 7187.404
},
{
"epoch": 1.5155818805950059,
"grad_norm": 1.0234375,
"learning_rate": 3.567425190398852e-05,
"loss": 4.0634,
"num_input_tokens_seen": 368918528,
"step": 5630,
"train_runtime": 51328.4263,
"train_tokens_per_second": 7187.412
},
{
"epoch": 1.5182742141751362,
"grad_norm": 0.9921875,
"learning_rate": 3.562495444695958e-05,
"loss": 3.9609,
"num_input_tokens_seen": 369573888,
"step": 5640,
"train_runtime": 51419.2465,
"train_tokens_per_second": 7187.462
},
{
"epoch": 1.520966547755267,
"grad_norm": 0.97265625,
"learning_rate": 3.557560651710546e-05,
"loss": 4.0745,
"num_input_tokens_seen": 370229248,
"step": 5650,
"train_runtime": 51510.6964,
"train_tokens_per_second": 7187.425
},
{
"epoch": 1.5236588813353975,
"grad_norm": 1.015625,
"learning_rate": 3.552620834884876e-05,
"loss": 4.0192,
"num_input_tokens_seen": 370884608,
"step": 5660,
"train_runtime": 51602.3065,
"train_tokens_per_second": 7187.365
},
{
"epoch": 1.526351214915528,
"grad_norm": 0.98046875,
"learning_rate": 3.547676017685072e-05,
"loss": 4.0617,
"num_input_tokens_seen": 371539968,
"step": 5670,
"train_runtime": 51693.1661,
"train_tokens_per_second": 7187.41
},
{
"epoch": 1.5290435484956586,
"grad_norm": 1.015625,
"learning_rate": 3.542726223601013e-05,
"loss": 4.0181,
"num_input_tokens_seen": 372195328,
"step": 5680,
"train_runtime": 51784.1565,
"train_tokens_per_second": 7187.436
},
{
"epoch": 1.5317358820757891,
"grad_norm": 1.0078125,
"learning_rate": 3.537771476146222e-05,
"loss": 3.9385,
"num_input_tokens_seen": 372850688,
"step": 5690,
"train_runtime": 51874.6753,
"train_tokens_per_second": 7187.528
},
{
"epoch": 1.53442821565592,
"grad_norm": 1.0078125,
"learning_rate": 3.532811798857749e-05,
"loss": 3.9561,
"num_input_tokens_seen": 373506048,
"step": 5700,
"train_runtime": 51965.8624,
"train_tokens_per_second": 7187.527
},
{
"epoch": 1.5371205492360502,
"grad_norm": 0.9921875,
"learning_rate": 3.527847215296065e-05,
"loss": 4.0395,
"num_input_tokens_seen": 374161408,
"step": 5710,
"train_runtime": 52056.8871,
"train_tokens_per_second": 7187.549
},
{
"epoch": 1.539812882816181,
"grad_norm": 0.94921875,
"learning_rate": 3.522877749044948e-05,
"loss": 4.0609,
"num_input_tokens_seen": 374816768,
"step": 5720,
"train_runtime": 52148.1425,
"train_tokens_per_second": 7187.538
},
{
"epoch": 1.5425052163963116,
"grad_norm": 0.984375,
"learning_rate": 3.517903423711368e-05,
"loss": 4.098,
"num_input_tokens_seen": 375472128,
"step": 5730,
"train_runtime": 52239.0426,
"train_tokens_per_second": 7187.577
},
{
"epoch": 1.545197549976442,
"grad_norm": 1.0234375,
"learning_rate": 3.5129242629253815e-05,
"loss": 4.0239,
"num_input_tokens_seen": 376127488,
"step": 5740,
"train_runtime": 52330.2141,
"train_tokens_per_second": 7187.578
},
{
"epoch": 1.5478898835565726,
"grad_norm": 0.9921875,
"learning_rate": 3.5079402903400124e-05,
"loss": 4.0425,
"num_input_tokens_seen": 376782848,
"step": 5750,
"train_runtime": 52421.5113,
"train_tokens_per_second": 7187.562
},
{
"epoch": 1.5505822171367032,
"grad_norm": 0.99609375,
"learning_rate": 3.502951529631143e-05,
"loss": 3.992,
"num_input_tokens_seen": 377438208,
"step": 5760,
"train_runtime": 52512.2071,
"train_tokens_per_second": 7187.628
},
{
"epoch": 1.553274550716834,
"grad_norm": 1.09375,
"learning_rate": 3.4979580044974026e-05,
"loss": 4.0186,
"num_input_tokens_seen": 378093568,
"step": 5770,
"train_runtime": 52603.1367,
"train_tokens_per_second": 7187.662
},
{
"epoch": 1.5559668842969643,
"grad_norm": 0.96484375,
"learning_rate": 3.492959738660052e-05,
"loss": 3.9302,
"num_input_tokens_seen": 378748928,
"step": 5780,
"train_runtime": 52694.318,
"train_tokens_per_second": 7187.662
},
{
"epoch": 1.558659217877095,
"grad_norm": 0.98046875,
"learning_rate": 3.487956755862874e-05,
"loss": 3.9378,
"num_input_tokens_seen": 379404288,
"step": 5790,
"train_runtime": 52784.9202,
"train_tokens_per_second": 7187.74
},
{
"epoch": 1.5613515514572256,
"grad_norm": 1.0234375,
"learning_rate": 3.482949079872056e-05,
"loss": 4.0948,
"num_input_tokens_seen": 380059648,
"step": 5800,
"train_runtime": 52875.9022,
"train_tokens_per_second": 7187.767
},
{
"epoch": 1.5640438850373561,
"grad_norm": 0.9296875,
"learning_rate": 3.4779367344760825e-05,
"loss": 4.0237,
"num_input_tokens_seen": 380715008,
"step": 5810,
"train_runtime": 52966.9492,
"train_tokens_per_second": 7187.784
},
{
"epoch": 1.5667362186174867,
"grad_norm": 1.0234375,
"learning_rate": 3.472919743485619e-05,
"loss": 4.0448,
"num_input_tokens_seen": 381370368,
"step": 5820,
"train_runtime": 53058.0795,
"train_tokens_per_second": 7187.791
},
{
"epoch": 1.5694285521976172,
"grad_norm": 0.94921875,
"learning_rate": 3.4678981307333985e-05,
"loss": 4.075,
"num_input_tokens_seen": 382025728,
"step": 5830,
"train_runtime": 53149.127,
"train_tokens_per_second": 7187.808
},
{
"epoch": 1.572120885777748,
"grad_norm": 1.03125,
"learning_rate": 3.46287192007411e-05,
"loss": 4.0984,
"num_input_tokens_seen": 382681088,
"step": 5840,
"train_runtime": 53239.7537,
"train_tokens_per_second": 7187.882
},
{
"epoch": 1.5748132193578783,
"grad_norm": 1.125,
"learning_rate": 3.457841135384284e-05,
"loss": 4.0243,
"num_input_tokens_seen": 383336448,
"step": 5850,
"train_runtime": 53330.9167,
"train_tokens_per_second": 7187.884
},
{
"epoch": 1.577505552938009,
"grad_norm": 1.0625,
"learning_rate": 3.452805800562181e-05,
"loss": 3.9802,
"num_input_tokens_seen": 383991808,
"step": 5860,
"train_runtime": 53421.5398,
"train_tokens_per_second": 7187.958
},
{
"epoch": 1.5801978865181396,
"grad_norm": 0.9296875,
"learning_rate": 3.447765939527673e-05,
"loss": 4.0104,
"num_input_tokens_seen": 384647168,
"step": 5870,
"train_runtime": 53512.1225,
"train_tokens_per_second": 7188.038
},
{
"epoch": 1.5828902200982702,
"grad_norm": 0.9296875,
"learning_rate": 3.442721576222139e-05,
"loss": 4.0854,
"num_input_tokens_seen": 385302528,
"step": 5880,
"train_runtime": 53603.182,
"train_tokens_per_second": 7188.053
},
{
"epoch": 1.5855825536784007,
"grad_norm": 0.97265625,
"learning_rate": 3.4376727346083384e-05,
"loss": 4.0028,
"num_input_tokens_seen": 385957888,
"step": 5890,
"train_runtime": 53694.6503,
"train_tokens_per_second": 7188.014
},
{
"epoch": 1.5882748872585313,
"grad_norm": 1.0,
"learning_rate": 3.43261943867031e-05,
"loss": 4.0289,
"num_input_tokens_seen": 386613248,
"step": 5900,
"train_runtime": 53785.2225,
"train_tokens_per_second": 7188.094
},
{
"epoch": 1.590967220838662,
"grad_norm": 0.9453125,
"learning_rate": 3.427561712413251e-05,
"loss": 3.9998,
"num_input_tokens_seen": 387268608,
"step": 5910,
"train_runtime": 53876.8747,
"train_tokens_per_second": 7188.03
},
{
"epoch": 1.5936595544187924,
"grad_norm": 0.9453125,
"learning_rate": 3.422499579863404e-05,
"loss": 4.0511,
"num_input_tokens_seen": 387923968,
"step": 5920,
"train_runtime": 53967.5433,
"train_tokens_per_second": 7188.098
},
{
"epoch": 1.5963518879989231,
"grad_norm": 1.078125,
"learning_rate": 3.417433065067942e-05,
"loss": 4.1121,
"num_input_tokens_seen": 388579328,
"step": 5930,
"train_runtime": 54058.0362,
"train_tokens_per_second": 7188.188
},
{
"epoch": 1.5990442215790537,
"grad_norm": 1.015625,
"learning_rate": 3.4123621920948577e-05,
"loss": 4.0063,
"num_input_tokens_seen": 389234688,
"step": 5940,
"train_runtime": 54149.1209,
"train_tokens_per_second": 7188.2
},
{
"epoch": 1.6017365551591842,
"grad_norm": 0.98046875,
"learning_rate": 3.407286985032846e-05,
"loss": 3.9714,
"num_input_tokens_seen": 389890048,
"step": 5950,
"train_runtime": 54240.6637,
"train_tokens_per_second": 7188.15
},
{
"epoch": 1.6044288887393148,
"grad_norm": 1.0546875,
"learning_rate": 3.40220746799119e-05,
"loss": 3.9757,
"num_input_tokens_seen": 390545408,
"step": 5960,
"train_runtime": 54331.414,
"train_tokens_per_second": 7188.206
},
{
"epoch": 1.6071212223194453,
"grad_norm": 1.0,
"learning_rate": 3.397123665099647e-05,
"loss": 3.9941,
"num_input_tokens_seen": 391200768,
"step": 5970,
"train_runtime": 54422.2535,
"train_tokens_per_second": 7188.25
},
{
"epoch": 1.609813555899576,
"grad_norm": 0.9921875,
"learning_rate": 3.3920356005083344e-05,
"loss": 3.991,
"num_input_tokens_seen": 391856128,
"step": 5980,
"train_runtime": 54512.9514,
"train_tokens_per_second": 7188.312
},
{
"epoch": 1.6125058894797064,
"grad_norm": 1.0078125,
"learning_rate": 3.386943298387615e-05,
"loss": 4.0555,
"num_input_tokens_seen": 392511488,
"step": 5990,
"train_runtime": 54604.0111,
"train_tokens_per_second": 7188.327
},
{
"epoch": 1.6151982230598372,
"grad_norm": 1.0,
"learning_rate": 3.38184678292798e-05,
"loss": 4.0397,
"num_input_tokens_seen": 393166848,
"step": 6000,
"train_runtime": 54695.0368,
"train_tokens_per_second": 7188.346
},
{
"epoch": 1.6178905566399677,
"grad_norm": 0.94921875,
"learning_rate": 3.3767460783399355e-05,
"loss": 4.0446,
"num_input_tokens_seen": 393822208,
"step": 6010,
"train_runtime": 54808.6027,
"train_tokens_per_second": 7185.409
},
{
"epoch": 1.6205828902200983,
"grad_norm": 1.015625,
"learning_rate": 3.3716412088538905e-05,
"loss": 3.9582,
"num_input_tokens_seen": 394477568,
"step": 6020,
"train_runtime": 54898.5567,
"train_tokens_per_second": 7185.573
},
{
"epoch": 1.6232752238002288,
"grad_norm": 0.9453125,
"learning_rate": 3.366532198720036e-05,
"loss": 3.9191,
"num_input_tokens_seen": 395132928,
"step": 6030,
"train_runtime": 54989.6002,
"train_tokens_per_second": 7185.594
},
{
"epoch": 1.6259675573803594,
"grad_norm": 1.0,
"learning_rate": 3.3614190722082327e-05,
"loss": 3.9044,
"num_input_tokens_seen": 395788288,
"step": 6040,
"train_runtime": 55080.1763,
"train_tokens_per_second": 7185.676
},
{
"epoch": 1.6286598909604901,
"grad_norm": 1.0546875,
"learning_rate": 3.356301853607898e-05,
"loss": 4.0952,
"num_input_tokens_seen": 396443648,
"step": 6050,
"train_runtime": 55171.3889,
"train_tokens_per_second": 7185.675
},
{
"epoch": 1.6313522245406205,
"grad_norm": 1.0078125,
"learning_rate": 3.3511805672278876e-05,
"loss": 4.0242,
"num_input_tokens_seen": 397099008,
"step": 6060,
"train_runtime": 55262.5814,
"train_tokens_per_second": 7185.676
},
{
"epoch": 1.6340445581207512,
"grad_norm": 1.0078125,
"learning_rate": 3.346055237396379e-05,
"loss": 3.9988,
"num_input_tokens_seen": 397754368,
"step": 6070,
"train_runtime": 55353.1031,
"train_tokens_per_second": 7185.765
},
{
"epoch": 1.6367368917008818,
"grad_norm": 1.0078125,
"learning_rate": 3.340925888460761e-05,
"loss": 3.9703,
"num_input_tokens_seen": 398409728,
"step": 6080,
"train_runtime": 55444.0405,
"train_tokens_per_second": 7185.799
},
{
"epoch": 1.6394292252810123,
"grad_norm": 0.9453125,
"learning_rate": 3.3357925447875105e-05,
"loss": 4.0456,
"num_input_tokens_seen": 399065088,
"step": 6090,
"train_runtime": 55535.1639,
"train_tokens_per_second": 7185.809
},
{
"epoch": 1.6421215588611429,
"grad_norm": 0.98046875,
"learning_rate": 3.330655230762085e-05,
"loss": 4.0108,
"num_input_tokens_seen": 399720448,
"step": 6100,
"train_runtime": 55625.2117,
"train_tokens_per_second": 7185.958
},
{
"epoch": 1.6448138924412734,
"grad_norm": 1.1328125,
"learning_rate": 3.3255139707888005e-05,
"loss": 3.9459,
"num_input_tokens_seen": 400375808,
"step": 6110,
"train_runtime": 55716.3074,
"train_tokens_per_second": 7185.972
},
{
"epoch": 1.6475062260214042,
"grad_norm": 0.95703125,
"learning_rate": 3.320368789290718e-05,
"loss": 3.9569,
"num_input_tokens_seen": 401031168,
"step": 6120,
"train_runtime": 55807.5546,
"train_tokens_per_second": 7185.966
},
{
"epoch": 1.6501985596015345,
"grad_norm": 0.99609375,
"learning_rate": 3.31521971070953e-05,
"loss": 3.8892,
"num_input_tokens_seen": 401686528,
"step": 6130,
"train_runtime": 55898.3697,
"train_tokens_per_second": 7186.015
},
{
"epoch": 1.6528908931816653,
"grad_norm": 1.046875,
"learning_rate": 3.310066759505437e-05,
"loss": 4.0045,
"num_input_tokens_seen": 402341888,
"step": 6140,
"train_runtime": 55989.3295,
"train_tokens_per_second": 7186.046
},
{
"epoch": 1.6555832267617958,
"grad_norm": 1.0234375,
"learning_rate": 3.3049099601570394e-05,
"loss": 4.0136,
"num_input_tokens_seen": 402997248,
"step": 6150,
"train_runtime": 56080.6133,
"train_tokens_per_second": 7186.035
},
{
"epoch": 1.6582755603419264,
"grad_norm": 0.9609375,
"learning_rate": 3.2997493371612176e-05,
"loss": 3.9676,
"num_input_tokens_seen": 403652608,
"step": 6160,
"train_runtime": 56171.1951,
"train_tokens_per_second": 7186.114
},
{
"epoch": 1.660967893922057,
"grad_norm": 1.0234375,
"learning_rate": 3.294584915033015e-05,
"loss": 4.0019,
"num_input_tokens_seen": 404307968,
"step": 6170,
"train_runtime": 56262.7175,
"train_tokens_per_second": 7186.073
},
{
"epoch": 1.6636602275021874,
"grad_norm": 0.95703125,
"learning_rate": 3.289416718305522e-05,
"loss": 3.9407,
"num_input_tokens_seen": 404963328,
"step": 6180,
"train_runtime": 56353.3711,
"train_tokens_per_second": 7186.142
},
{
"epoch": 1.6663525610823182,
"grad_norm": 1.1015625,
"learning_rate": 3.284244771529762e-05,
"loss": 4.0951,
"num_input_tokens_seen": 405618688,
"step": 6190,
"train_runtime": 56444.4303,
"train_tokens_per_second": 7186.16
},
{
"epoch": 1.6690448946624485,
"grad_norm": 1.0546875,
"learning_rate": 3.279069099274569e-05,
"loss": 4.0132,
"num_input_tokens_seen": 406274048,
"step": 6200,
"train_runtime": 56535.5036,
"train_tokens_per_second": 7186.175
},
{
"epoch": 1.6717372282425793,
"grad_norm": 1.0,
"learning_rate": 3.2738897261264796e-05,
"loss": 4.0009,
"num_input_tokens_seen": 406929408,
"step": 6210,
"train_runtime": 56625.8426,
"train_tokens_per_second": 7186.284
},
{
"epoch": 1.6744295618227099,
"grad_norm": 0.98046875,
"learning_rate": 3.2687066766896056e-05,
"loss": 4.0089,
"num_input_tokens_seen": 407584768,
"step": 6220,
"train_runtime": 56717.0759,
"train_tokens_per_second": 7186.28
},
{
"epoch": 1.6771218954028404,
"grad_norm": 1.015625,
"learning_rate": 3.263519975585527e-05,
"loss": 4.0005,
"num_input_tokens_seen": 408240128,
"step": 6230,
"train_runtime": 56807.9677,
"train_tokens_per_second": 7186.318
},
{
"epoch": 1.679814228982971,
"grad_norm": 1.0625,
"learning_rate": 3.258329647453169e-05,
"loss": 4.0387,
"num_input_tokens_seen": 408895488,
"step": 6240,
"train_runtime": 56899.2401,
"train_tokens_per_second": 7186.308
},
{
"epoch": 1.6825065625631015,
"grad_norm": 1.0703125,
"learning_rate": 3.253135716948685e-05,
"loss": 3.9846,
"num_input_tokens_seen": 409550848,
"step": 6250,
"train_runtime": 56989.5839,
"train_tokens_per_second": 7186.416
},
{
"epoch": 1.6851988961432323,
"grad_norm": 1.140625,
"learning_rate": 3.2479382087453445e-05,
"loss": 3.9494,
"num_input_tokens_seen": 410206208,
"step": 6260,
"train_runtime": 57081.0846,
"train_tokens_per_second": 7186.377
},
{
"epoch": 1.6878912297233626,
"grad_norm": 1.0703125,
"learning_rate": 3.24273714753341e-05,
"loss": 4.0214,
"num_input_tokens_seen": 410861568,
"step": 6270,
"train_runtime": 57172.0125,
"train_tokens_per_second": 7186.411
},
{
"epoch": 1.6905835633034934,
"grad_norm": 1.1015625,
"learning_rate": 3.237532558020023e-05,
"loss": 3.9752,
"num_input_tokens_seen": 411516928,
"step": 6280,
"train_runtime": 57263.3216,
"train_tokens_per_second": 7186.396
},
{
"epoch": 1.693275896883624,
"grad_norm": 1.03125,
"learning_rate": 3.232324464929087e-05,
"loss": 3.9727,
"num_input_tokens_seen": 412172288,
"step": 6290,
"train_runtime": 57353.7021,
"train_tokens_per_second": 7186.498
},
{
"epoch": 1.6959682304637544,
"grad_norm": 1.046875,
"learning_rate": 3.2271128930011466e-05,
"loss": 4.0558,
"num_input_tokens_seen": 412827648,
"step": 6300,
"train_runtime": 57444.8662,
"train_tokens_per_second": 7186.502
},
{
"epoch": 1.698660564043885,
"grad_norm": 1.0390625,
"learning_rate": 3.221897866993274e-05,
"loss": 4.0699,
"num_input_tokens_seen": 413483008,
"step": 6310,
"train_runtime": 57535.7838,
"train_tokens_per_second": 7186.536
},
{
"epoch": 1.7013528976240155,
"grad_norm": 1.0390625,
"learning_rate": 3.216679411678949e-05,
"loss": 3.97,
"num_input_tokens_seen": 414138368,
"step": 6320,
"train_runtime": 57626.993,
"train_tokens_per_second": 7186.534
},
{
"epoch": 1.7040452312041463,
"grad_norm": 1.03125,
"learning_rate": 3.2114575518479415e-05,
"loss": 4.1078,
"num_input_tokens_seen": 414793728,
"step": 6330,
"train_runtime": 57717.747,
"train_tokens_per_second": 7186.589
},
{
"epoch": 1.7067375647842766,
"grad_norm": 1.03125,
"learning_rate": 3.2062323123061964e-05,
"loss": 3.9,
"num_input_tokens_seen": 415449088,
"step": 6340,
"train_runtime": 57808.6745,
"train_tokens_per_second": 7186.622
},
{
"epoch": 1.7094298983644074,
"grad_norm": 0.94140625,
"learning_rate": 3.2010037178757116e-05,
"loss": 4.0176,
"num_input_tokens_seen": 416104448,
"step": 6350,
"train_runtime": 57899.7806,
"train_tokens_per_second": 7186.633
},
{
"epoch": 1.712122231944538,
"grad_norm": 0.97265625,
"learning_rate": 3.195771793394421e-05,
"loss": 4.0038,
"num_input_tokens_seen": 416759808,
"step": 6360,
"train_runtime": 57990.7902,
"train_tokens_per_second": 7186.655
},
{
"epoch": 1.7148145655246685,
"grad_norm": 0.96875,
"learning_rate": 3.1905365637160814e-05,
"loss": 3.9994,
"num_input_tokens_seen": 417415168,
"step": 6370,
"train_runtime": 58081.7605,
"train_tokens_per_second": 7186.682
},
{
"epoch": 1.717506899104799,
"grad_norm": 0.96484375,
"learning_rate": 3.1852980537101464e-05,
"loss": 4.0242,
"num_input_tokens_seen": 418070528,
"step": 6380,
"train_runtime": 58173.008,
"train_tokens_per_second": 7186.675
},
{
"epoch": 1.7201992326849296,
"grad_norm": 1.0234375,
"learning_rate": 3.180056288261655e-05,
"loss": 4.0096,
"num_input_tokens_seen": 418725888,
"step": 6390,
"train_runtime": 58263.8091,
"train_tokens_per_second": 7186.724
},
{
"epoch": 1.7228915662650603,
"grad_norm": 1.078125,
"learning_rate": 3.174811292271111e-05,
"loss": 4.0482,
"num_input_tokens_seen": 419381248,
"step": 6400,
"train_runtime": 58354.9614,
"train_tokens_per_second": 7186.728
},
{
"epoch": 1.7255838998451907,
"grad_norm": 0.97265625,
"learning_rate": 3.1695630906543636e-05,
"loss": 4.0342,
"num_input_tokens_seen": 420036608,
"step": 6410,
"train_runtime": 58445.4659,
"train_tokens_per_second": 7186.813
},
{
"epoch": 1.7282762334253214,
"grad_norm": 0.9921875,
"learning_rate": 3.1643117083424893e-05,
"loss": 3.9961,
"num_input_tokens_seen": 420691968,
"step": 6420,
"train_runtime": 58536.7942,
"train_tokens_per_second": 7186.795
},
{
"epoch": 1.730968567005452,
"grad_norm": 0.93359375,
"learning_rate": 3.1590571702816775e-05,
"loss": 3.9685,
"num_input_tokens_seen": 421347328,
"step": 6430,
"train_runtime": 58627.8959,
"train_tokens_per_second": 7186.806
},
{
"epoch": 1.7336609005855825,
"grad_norm": 1.0390625,
"learning_rate": 3.153799501433103e-05,
"loss": 3.9833,
"num_input_tokens_seen": 422002688,
"step": 6440,
"train_runtime": 58718.8139,
"train_tokens_per_second": 7186.839
},
{
"epoch": 1.736353234165713,
"grad_norm": 0.96875,
"learning_rate": 3.1485387267728195e-05,
"loss": 4.1101,
"num_input_tokens_seen": 422658048,
"step": 6450,
"train_runtime": 58809.7237,
"train_tokens_per_second": 7186.874
},
{
"epoch": 1.7390455677458436,
"grad_norm": 1.03125,
"learning_rate": 3.1432748712916304e-05,
"loss": 4.123,
"num_input_tokens_seen": 423313408,
"step": 6460,
"train_runtime": 58900.9514,
"train_tokens_per_second": 7186.869
},
{
"epoch": 1.7417379013259744,
"grad_norm": 1.03125,
"learning_rate": 3.138007959994976e-05,
"loss": 3.9765,
"num_input_tokens_seen": 423968768,
"step": 6470,
"train_runtime": 58992.128,
"train_tokens_per_second": 7186.87
},
{
"epoch": 1.7444302349061047,
"grad_norm": 1.0234375,
"learning_rate": 3.132738017902811e-05,
"loss": 4.003,
"num_input_tokens_seen": 424624128,
"step": 6480,
"train_runtime": 59083.3309,
"train_tokens_per_second": 7186.868
},
{
"epoch": 1.7471225684862355,
"grad_norm": 1.0234375,
"learning_rate": 3.1274650700494896e-05,
"loss": 3.9553,
"num_input_tokens_seen": 425279488,
"step": 6490,
"train_runtime": 59174.4889,
"train_tokens_per_second": 7186.872
},
{
"epoch": 1.749814902066366,
"grad_norm": 0.984375,
"learning_rate": 3.122189141483644e-05,
"loss": 4.123,
"num_input_tokens_seen": 425934848,
"step": 6500,
"train_runtime": 59264.7894,
"train_tokens_per_second": 7186.98
},
{
"epoch": 1.7525072356464966,
"grad_norm": 0.9921875,
"learning_rate": 3.116910257268066e-05,
"loss": 4.1079,
"num_input_tokens_seen": 426590208,
"step": 6510,
"train_runtime": 59373.0354,
"train_tokens_per_second": 7184.915
},
{
"epoch": 1.7551995692266273,
"grad_norm": 1.0,
"learning_rate": 3.1116284424795875e-05,
"loss": 4.033,
"num_input_tokens_seen": 427245568,
"step": 6520,
"train_runtime": 59463.6665,
"train_tokens_per_second": 7184.985
},
{
"epoch": 1.7578919028067577,
"grad_norm": 1.0234375,
"learning_rate": 3.106343722208962e-05,
"loss": 4.0505,
"num_input_tokens_seen": 427900928,
"step": 6530,
"train_runtime": 59554.4574,
"train_tokens_per_second": 7185.036
},
{
"epoch": 1.7605842363868884,
"grad_norm": 1.078125,
"learning_rate": 3.1010561215607464e-05,
"loss": 4.009,
"num_input_tokens_seen": 428556288,
"step": 6540,
"train_runtime": 59645.9384,
"train_tokens_per_second": 7185.004
},
{
"epoch": 1.7632765699670188,
"grad_norm": 1.109375,
"learning_rate": 3.0957656656531785e-05,
"loss": 3.8864,
"num_input_tokens_seen": 429211648,
"step": 6550,
"train_runtime": 59736.628,
"train_tokens_per_second": 7185.067
},
{
"epoch": 1.7659689035471495,
"grad_norm": 1.0078125,
"learning_rate": 3.090472379618061e-05,
"loss": 4.0765,
"num_input_tokens_seen": 429867008,
"step": 6560,
"train_runtime": 59827.3922,
"train_tokens_per_second": 7185.12
},
{
"epoch": 1.76866123712728,
"grad_norm": 1.03125,
"learning_rate": 3.0851762886006415e-05,
"loss": 3.9765,
"num_input_tokens_seen": 430522368,
"step": 6570,
"train_runtime": 59918.2991,
"train_tokens_per_second": 7185.157
},
{
"epoch": 1.7713535707074106,
"grad_norm": 1.0390625,
"learning_rate": 3.0798774177594894e-05,
"loss": 4.0562,
"num_input_tokens_seen": 431177728,
"step": 6580,
"train_runtime": 60009.4082,
"train_tokens_per_second": 7185.169
},
{
"epoch": 1.7740459042875414,
"grad_norm": 1.03125,
"learning_rate": 3.074575792266383e-05,
"loss": 4.0406,
"num_input_tokens_seen": 431833088,
"step": 6590,
"train_runtime": 60101.1242,
"train_tokens_per_second": 7185.108
},
{
"epoch": 1.7767382378676717,
"grad_norm": 0.99609375,
"learning_rate": 3.069271437306185e-05,
"loss": 3.9112,
"num_input_tokens_seen": 432488448,
"step": 6600,
"train_runtime": 60192.3902,
"train_tokens_per_second": 7185.102
},
{
"epoch": 1.7794305714478025,
"grad_norm": 1.03125,
"learning_rate": 3.063964378076723e-05,
"loss": 3.9743,
"num_input_tokens_seen": 433143808,
"step": 6610,
"train_runtime": 60283.5332,
"train_tokens_per_second": 7185.11
},
{
"epoch": 1.7821229050279328,
"grad_norm": 0.96875,
"learning_rate": 3.058654639788673e-05,
"loss": 4.0205,
"num_input_tokens_seen": 433799168,
"step": 6620,
"train_runtime": 60374.3752,
"train_tokens_per_second": 7185.154
},
{
"epoch": 1.7848152386080636,
"grad_norm": 1.0234375,
"learning_rate": 3.0533422476654355e-05,
"loss": 3.9233,
"num_input_tokens_seen": 434454528,
"step": 6630,
"train_runtime": 60464.7616,
"train_tokens_per_second": 7185.252
},
{
"epoch": 1.7875075721881941,
"grad_norm": 0.953125,
"learning_rate": 3.0480272269430193e-05,
"loss": 3.9677,
"num_input_tokens_seen": 435109888,
"step": 6640,
"train_runtime": 60555.7399,
"train_tokens_per_second": 7185.279
},
{
"epoch": 1.7901999057683247,
"grad_norm": 0.96875,
"learning_rate": 3.0427096028699192e-05,
"loss": 4.0849,
"num_input_tokens_seen": 435765248,
"step": 6650,
"train_runtime": 60646.5633,
"train_tokens_per_second": 7185.325
},
{
"epoch": 1.7928922393484554,
"grad_norm": 1.0703125,
"learning_rate": 3.0373894007069985e-05,
"loss": 3.9531,
"num_input_tokens_seen": 436420608,
"step": 6660,
"train_runtime": 60738.1385,
"train_tokens_per_second": 7185.281
},
{
"epoch": 1.7955845729285858,
"grad_norm": 1.015625,
"learning_rate": 3.0320666457273657e-05,
"loss": 3.9727,
"num_input_tokens_seen": 437075968,
"step": 6670,
"train_runtime": 60829.4049,
"train_tokens_per_second": 7185.274
},
{
"epoch": 1.7982769065087165,
"grad_norm": 1.015625,
"learning_rate": 3.0267413632162566e-05,
"loss": 4.005,
"num_input_tokens_seen": 437731328,
"step": 6680,
"train_runtime": 60920.3104,
"train_tokens_per_second": 7185.31
},
{
"epoch": 1.8009692400888468,
"grad_norm": 0.98046875,
"learning_rate": 3.0214135784709146e-05,
"loss": 3.9385,
"num_input_tokens_seen": 438386688,
"step": 6690,
"train_runtime": 61011.0006,
"train_tokens_per_second": 7185.371
},
{
"epoch": 1.8036615736689776,
"grad_norm": 0.94921875,
"learning_rate": 3.0160833168004694e-05,
"loss": 3.9885,
"num_input_tokens_seen": 439042048,
"step": 6700,
"train_runtime": 61101.2415,
"train_tokens_per_second": 7185.485
},
{
"epoch": 1.8063539072491082,
"grad_norm": 0.921875,
"learning_rate": 3.010750603525816e-05,
"loss": 4.0583,
"num_input_tokens_seen": 439697408,
"step": 6710,
"train_runtime": 61192.4341,
"train_tokens_per_second": 7185.486
},
{
"epoch": 1.8090462408292387,
"grad_norm": 1.0546875,
"learning_rate": 3.005415463979496e-05,
"loss": 3.9452,
"num_input_tokens_seen": 440352768,
"step": 6720,
"train_runtime": 61283.4288,
"train_tokens_per_second": 7185.511
},
{
"epoch": 1.8117385744093695,
"grad_norm": 1.015625,
"learning_rate": 3.000077923505579e-05,
"loss": 4.0071,
"num_input_tokens_seen": 441008128,
"step": 6730,
"train_runtime": 61374.2641,
"train_tokens_per_second": 7185.555
},
{
"epoch": 1.8144309079894998,
"grad_norm": 0.95703125,
"learning_rate": 2.9947380074595372e-05,
"loss": 3.9955,
"num_input_tokens_seen": 441663488,
"step": 6740,
"train_runtime": 61465.5537,
"train_tokens_per_second": 7185.545
},
{
"epoch": 1.8171232415696306,
"grad_norm": 1.0234375,
"learning_rate": 2.9893957412081286e-05,
"loss": 3.977,
"num_input_tokens_seen": 442318848,
"step": 6750,
"train_runtime": 61556.3225,
"train_tokens_per_second": 7185.596
},
{
"epoch": 1.8198155751497609,
"grad_norm": 1.0,
"learning_rate": 2.984051150129276e-05,
"loss": 3.976,
"num_input_tokens_seen": 442974208,
"step": 6760,
"train_runtime": 61647.2052,
"train_tokens_per_second": 7185.633
},
{
"epoch": 1.8225079087298917,
"grad_norm": 1.0234375,
"learning_rate": 2.9787042596119453e-05,
"loss": 4.016,
"num_input_tokens_seen": 443629568,
"step": 6770,
"train_runtime": 61738.2432,
"train_tokens_per_second": 7185.653
},
{
"epoch": 1.8252002423100222,
"grad_norm": 0.92578125,
"learning_rate": 2.9733550950560268e-05,
"loss": 4.0733,
"num_input_tokens_seen": 444284928,
"step": 6780,
"train_runtime": 61829.2087,
"train_tokens_per_second": 7185.68
},
{
"epoch": 1.8278925758901527,
"grad_norm": 0.953125,
"learning_rate": 2.9680036818722113e-05,
"loss": 3.9411,
"num_input_tokens_seen": 444940288,
"step": 6790,
"train_runtime": 61919.8373,
"train_tokens_per_second": 7185.747
},
{
"epoch": 1.8305849094702835,
"grad_norm": 0.96875,
"learning_rate": 2.962650045481875e-05,
"loss": 3.9126,
"num_input_tokens_seen": 445595648,
"step": 6800,
"train_runtime": 62011.0093,
"train_tokens_per_second": 7185.751
},
{
"epoch": 1.8332772430504138,
"grad_norm": 1.15625,
"learning_rate": 2.9572942113169515e-05,
"loss": 3.9839,
"num_input_tokens_seen": 446251008,
"step": 6810,
"train_runtime": 62101.7996,
"train_tokens_per_second": 7185.798
},
{
"epoch": 1.8359695766305446,
"grad_norm": 0.98828125,
"learning_rate": 2.951936204819818e-05,
"loss": 3.9176,
"num_input_tokens_seen": 446906368,
"step": 6820,
"train_runtime": 62192.8181,
"train_tokens_per_second": 7185.82
},
{
"epoch": 1.8386619102106752,
"grad_norm": 0.97265625,
"learning_rate": 2.946576051443168e-05,
"loss": 3.9816,
"num_input_tokens_seen": 447561728,
"step": 6830,
"train_runtime": 62283.5597,
"train_tokens_per_second": 7185.873
},
{
"epoch": 1.8413542437908057,
"grad_norm": 1.015625,
"learning_rate": 2.9412137766498952e-05,
"loss": 4.0402,
"num_input_tokens_seen": 448217088,
"step": 6840,
"train_runtime": 62374.6178,
"train_tokens_per_second": 7185.889
},
{
"epoch": 1.8440465773709362,
"grad_norm": 1.0546875,
"learning_rate": 2.9358494059129714e-05,
"loss": 4.0046,
"num_input_tokens_seen": 448872448,
"step": 6850,
"train_runtime": 62465.6805,
"train_tokens_per_second": 7185.905
},
{
"epoch": 1.8467389109510668,
"grad_norm": 1.0625,
"learning_rate": 2.9304829647153243e-05,
"loss": 4.0306,
"num_input_tokens_seen": 449527808,
"step": 6860,
"train_runtime": 62556.3264,
"train_tokens_per_second": 7185.969
},
{
"epoch": 1.8494312445311976,
"grad_norm": 0.97265625,
"learning_rate": 2.925114478549717e-05,
"loss": 3.8527,
"num_input_tokens_seen": 450183168,
"step": 6870,
"train_runtime": 62647.2501,
"train_tokens_per_second": 7186.0
},
{
"epoch": 1.8521235781113279,
"grad_norm": 0.97265625,
"learning_rate": 2.9197439729186272e-05,
"loss": 3.9802,
"num_input_tokens_seen": 450838528,
"step": 6880,
"train_runtime": 62738.6239,
"train_tokens_per_second": 7185.981
},
{
"epoch": 1.8548159116914587,
"grad_norm": 1.0234375,
"learning_rate": 2.914371473334126e-05,
"loss": 3.9518,
"num_input_tokens_seen": 451493888,
"step": 6890,
"train_runtime": 62829.5332,
"train_tokens_per_second": 7186.014
},
{
"epoch": 1.8575082452715892,
"grad_norm": 0.96484375,
"learning_rate": 2.908997005317756e-05,
"loss": 4.0215,
"num_input_tokens_seen": 452149248,
"step": 6900,
"train_runtime": 62920.7037,
"train_tokens_per_second": 7186.017
},
{
"epoch": 1.8602005788517197,
"grad_norm": 1.03125,
"learning_rate": 2.9036205944004114e-05,
"loss": 3.8857,
"num_input_tokens_seen": 452804608,
"step": 6910,
"train_runtime": 63012.0036,
"train_tokens_per_second": 7186.006
},
{
"epoch": 1.8628929124318503,
"grad_norm": 0.98046875,
"learning_rate": 2.8982422661222154e-05,
"loss": 4.0057,
"num_input_tokens_seen": 453459968,
"step": 6920,
"train_runtime": 63102.6813,
"train_tokens_per_second": 7186.065
},
{
"epoch": 1.8655852460119808,
"grad_norm": 0.9453125,
"learning_rate": 2.8928620460324007e-05,
"loss": 3.97,
"num_input_tokens_seen": 454115328,
"step": 6930,
"train_runtime": 63193.6179,
"train_tokens_per_second": 7186.095
},
{
"epoch": 1.8682775795921116,
"grad_norm": 0.953125,
"learning_rate": 2.887479959689185e-05,
"loss": 4.0438,
"num_input_tokens_seen": 454770688,
"step": 6940,
"train_runtime": 63284.6204,
"train_tokens_per_second": 7186.117
},
{
"epoch": 1.870969913172242,
"grad_norm": 1.0234375,
"learning_rate": 2.882096032659652e-05,
"loss": 3.9974,
"num_input_tokens_seen": 455426048,
"step": 6950,
"train_runtime": 63375.5135,
"train_tokens_per_second": 7186.152
},
{
"epoch": 1.8736622467523727,
"grad_norm": 0.95703125,
"learning_rate": 2.8767102905196308e-05,
"loss": 4.0249,
"num_input_tokens_seen": 456081408,
"step": 6960,
"train_runtime": 63466.6018,
"train_tokens_per_second": 7186.164
},
{
"epoch": 1.8763545803325032,
"grad_norm": 0.97265625,
"learning_rate": 2.8713227588535705e-05,
"loss": 3.9506,
"num_input_tokens_seen": 456736768,
"step": 6970,
"train_runtime": 63557.4534,
"train_tokens_per_second": 7186.203
},
{
"epoch": 1.8790469139126338,
"grad_norm": 0.96875,
"learning_rate": 2.8659334632544244e-05,
"loss": 4.0156,
"num_input_tokens_seen": 457392128,
"step": 6980,
"train_runtime": 63648.8191,
"train_tokens_per_second": 7186.184
},
{
"epoch": 1.8817392474927643,
"grad_norm": 1.0546875,
"learning_rate": 2.860542429323521e-05,
"loss": 4.0436,
"num_input_tokens_seen": 458047488,
"step": 6990,
"train_runtime": 63739.4385,
"train_tokens_per_second": 7186.249
},
{
"epoch": 1.8844315810728949,
"grad_norm": 0.953125,
"learning_rate": 2.8551496826704517e-05,
"loss": 3.9041,
"num_input_tokens_seen": 458702848,
"step": 7000,
"train_runtime": 63830.3107,
"train_tokens_per_second": 7186.286
},
{
"epoch": 1.8871239146530256,
"grad_norm": 0.99609375,
"learning_rate": 2.84975524891294e-05,
"loss": 3.9567,
"num_input_tokens_seen": 459358208,
"step": 7010,
"train_runtime": 63939.4485,
"train_tokens_per_second": 7184.269
},
{
"epoch": 1.889816248233156,
"grad_norm": 0.94921875,
"learning_rate": 2.8443591536767244e-05,
"loss": 3.9643,
"num_input_tokens_seen": 460013568,
"step": 7020,
"train_runtime": 64030.6445,
"train_tokens_per_second": 7184.272
},
{
"epoch": 1.8925085818132867,
"grad_norm": 0.97265625,
"learning_rate": 2.8389614225954382e-05,
"loss": 3.9844,
"num_input_tokens_seen": 460668928,
"step": 7030,
"train_runtime": 64121.8034,
"train_tokens_per_second": 7184.279
},
{
"epoch": 1.8952009153934173,
"grad_norm": 0.9609375,
"learning_rate": 2.8335620813104834e-05,
"loss": 4.0291,
"num_input_tokens_seen": 461324288,
"step": 7040,
"train_runtime": 64212.97,
"train_tokens_per_second": 7184.285
},
{
"epoch": 1.8978932489735478,
"grad_norm": 1.0,
"learning_rate": 2.828161155470912e-05,
"loss": 3.9941,
"num_input_tokens_seen": 461979648,
"step": 7050,
"train_runtime": 64304.1397,
"train_tokens_per_second": 7184.291
},
{
"epoch": 1.9005855825536784,
"grad_norm": 1.03125,
"learning_rate": 2.8227586707333035e-05,
"loss": 3.9564,
"num_input_tokens_seen": 462635008,
"step": 7060,
"train_runtime": 64395.1051,
"train_tokens_per_second": 7184.319
},
{
"epoch": 1.903277916133809,
"grad_norm": 0.97265625,
"learning_rate": 2.817354652761643e-05,
"loss": 3.964,
"num_input_tokens_seen": 463290368,
"step": 7070,
"train_runtime": 64485.9399,
"train_tokens_per_second": 7184.362
},
{
"epoch": 1.9059702497139397,
"grad_norm": 1.0,
"learning_rate": 2.811949127227198e-05,
"loss": 3.9875,
"num_input_tokens_seen": 463945728,
"step": 7080,
"train_runtime": 64577.2564,
"train_tokens_per_second": 7184.352
},
{
"epoch": 1.90866258329407,
"grad_norm": 1.09375,
"learning_rate": 2.806542119808398e-05,
"loss": 4.0569,
"num_input_tokens_seen": 464601088,
"step": 7090,
"train_runtime": 64668.0339,
"train_tokens_per_second": 7184.401
},
{
"epoch": 1.9113549168742008,
"grad_norm": 0.9609375,
"learning_rate": 2.8011336561907125e-05,
"loss": 3.9746,
"num_input_tokens_seen": 465256448,
"step": 7100,
"train_runtime": 64759.001,
"train_tokens_per_second": 7184.429
},
{
"epoch": 1.9140472504543313,
"grad_norm": 0.9921875,
"learning_rate": 2.7957237620665285e-05,
"loss": 4.0546,
"num_input_tokens_seen": 465911808,
"step": 7110,
"train_runtime": 64849.8303,
"train_tokens_per_second": 7184.472
},
{
"epoch": 1.9167395840344619,
"grad_norm": 1.0234375,
"learning_rate": 2.790312463135027e-05,
"loss": 4.0233,
"num_input_tokens_seen": 466567168,
"step": 7120,
"train_runtime": 64941.3773,
"train_tokens_per_second": 7184.436
},
{
"epoch": 1.9194319176145924,
"grad_norm": 0.97265625,
"learning_rate": 2.7848997851020652e-05,
"loss": 3.9364,
"num_input_tokens_seen": 467222528,
"step": 7130,
"train_runtime": 65032.2044,
"train_tokens_per_second": 7184.479
},
{
"epoch": 1.922124251194723,
"grad_norm": 0.9921875,
"learning_rate": 2.7794857536800496e-05,
"loss": 3.9537,
"num_input_tokens_seen": 467877888,
"step": 7140,
"train_runtime": 65123.1618,
"train_tokens_per_second": 7184.508
},
{
"epoch": 1.9248165847748537,
"grad_norm": 1.0078125,
"learning_rate": 2.774070394587816e-05,
"loss": 3.9303,
"num_input_tokens_seen": 468533248,
"step": 7150,
"train_runtime": 65214.4402,
"train_tokens_per_second": 7184.502
},
{
"epoch": 1.927508918354984,
"grad_norm": 1.0078125,
"learning_rate": 2.768653733550507e-05,
"loss": 3.9496,
"num_input_tokens_seen": 469188608,
"step": 7160,
"train_runtime": 65305.3984,
"train_tokens_per_second": 7184.53
},
{
"epoch": 1.9302012519351148,
"grad_norm": 0.98046875,
"learning_rate": 2.7632357962994508e-05,
"loss": 3.9278,
"num_input_tokens_seen": 469843968,
"step": 7170,
"train_runtime": 65396.6545,
"train_tokens_per_second": 7184.526
},
{
"epoch": 1.9328935855152454,
"grad_norm": 0.97265625,
"learning_rate": 2.757816608572038e-05,
"loss": 3.9639,
"num_input_tokens_seen": 470499328,
"step": 7180,
"train_runtime": 65487.742,
"train_tokens_per_second": 7184.54
},
{
"epoch": 1.935585919095376,
"grad_norm": 1.0078125,
"learning_rate": 2.7523961961115986e-05,
"loss": 3.9429,
"num_input_tokens_seen": 471154688,
"step": 7190,
"train_runtime": 65578.8448,
"train_tokens_per_second": 7184.553
},
{
"epoch": 1.9382782526755065,
"grad_norm": 0.98046875,
"learning_rate": 2.7469745846672818e-05,
"loss": 3.9361,
"num_input_tokens_seen": 471810048,
"step": 7200,
"train_runtime": 65670.0383,
"train_tokens_per_second": 7184.556
},
{
"epoch": 1.940970586255637,
"grad_norm": 0.98828125,
"learning_rate": 2.7415517999939316e-05,
"loss": 3.9282,
"num_input_tokens_seen": 472465408,
"step": 7210,
"train_runtime": 65761.1225,
"train_tokens_per_second": 7184.57
},
{
"epoch": 1.9436629198357678,
"grad_norm": 1.046875,
"learning_rate": 2.7361278678519654e-05,
"loss": 3.9673,
"num_input_tokens_seen": 473120768,
"step": 7220,
"train_runtime": 65852.8059,
"train_tokens_per_second": 7184.52
},
{
"epoch": 1.946355253415898,
"grad_norm": 0.9609375,
"learning_rate": 2.7307028140072515e-05,
"loss": 3.965,
"num_input_tokens_seen": 473776128,
"step": 7230,
"train_runtime": 65944.3346,
"train_tokens_per_second": 7184.486
},
{
"epoch": 1.9490475869960289,
"grad_norm": 1.0,
"learning_rate": 2.7252766642309873e-05,
"loss": 4.0568,
"num_input_tokens_seen": 474431488,
"step": 7240,
"train_runtime": 66035.3472,
"train_tokens_per_second": 7184.508
},
{
"epoch": 1.9517399205761594,
"grad_norm": 1.0078125,
"learning_rate": 2.7198494442995752e-05,
"loss": 4.0275,
"num_input_tokens_seen": 475086848,
"step": 7250,
"train_runtime": 66126.6077,
"train_tokens_per_second": 7184.504
},
{
"epoch": 1.95443225415629,
"grad_norm": 0.984375,
"learning_rate": 2.714421179994503e-05,
"loss": 3.9956,
"num_input_tokens_seen": 475742208,
"step": 7260,
"train_runtime": 66217.8895,
"train_tokens_per_second": 7184.497
},
{
"epoch": 1.9571245877364205,
"grad_norm": 1.0390625,
"learning_rate": 2.708991897102218e-05,
"loss": 3.9954,
"num_input_tokens_seen": 476397568,
"step": 7270,
"train_runtime": 66309.0076,
"train_tokens_per_second": 7184.508
},
{
"epoch": 1.959816921316551,
"grad_norm": 1.03125,
"learning_rate": 2.703561621414008e-05,
"loss": 4.0028,
"num_input_tokens_seen": 477052928,
"step": 7280,
"train_runtime": 66400.4447,
"train_tokens_per_second": 7184.484
},
{
"epoch": 1.9625092548966818,
"grad_norm": 0.9453125,
"learning_rate": 2.6981303787258744e-05,
"loss": 4.0518,
"num_input_tokens_seen": 477708288,
"step": 7290,
"train_runtime": 66491.4046,
"train_tokens_per_second": 7184.512
},
{
"epoch": 1.9652015884768121,
"grad_norm": 0.97265625,
"learning_rate": 2.6926981948384146e-05,
"loss": 3.926,
"num_input_tokens_seen": 478363648,
"step": 7300,
"train_runtime": 66582.7307,
"train_tokens_per_second": 7184.5
},
{
"epoch": 1.967893922056943,
"grad_norm": 0.9609375,
"learning_rate": 2.687265095556696e-05,
"loss": 4.0207,
"num_input_tokens_seen": 479019008,
"step": 7310,
"train_runtime": 66673.8653,
"train_tokens_per_second": 7184.509
},
{
"epoch": 1.9705862556370735,
"grad_norm": 0.96875,
"learning_rate": 2.6818311066901336e-05,
"loss": 3.9799,
"num_input_tokens_seen": 479674368,
"step": 7320,
"train_runtime": 66765.1344,
"train_tokens_per_second": 7184.504
},
{
"epoch": 1.973278589217204,
"grad_norm": 0.96875,
"learning_rate": 2.6763962540523714e-05,
"loss": 3.9499,
"num_input_tokens_seen": 480329728,
"step": 7330,
"train_runtime": 66856.3141,
"train_tokens_per_second": 7184.508
},
{
"epoch": 1.9759709227973346,
"grad_norm": 1.0234375,
"learning_rate": 2.6709605634611534e-05,
"loss": 3.9893,
"num_input_tokens_seen": 480985088,
"step": 7340,
"train_runtime": 66947.2542,
"train_tokens_per_second": 7184.538
},
{
"epoch": 1.978663256377465,
"grad_norm": 0.94921875,
"learning_rate": 2.665524060738206e-05,
"loss": 4.0101,
"num_input_tokens_seen": 481640448,
"step": 7350,
"train_runtime": 67038.5524,
"train_tokens_per_second": 7184.529
},
{
"epoch": 1.9813555899575959,
"grad_norm": 0.97265625,
"learning_rate": 2.660086771709112e-05,
"loss": 4.0528,
"num_input_tokens_seen": 482295808,
"step": 7360,
"train_runtime": 67129.9715,
"train_tokens_per_second": 7184.508
},
{
"epoch": 1.9840479235377262,
"grad_norm": 1.140625,
"learning_rate": 2.6546487222031918e-05,
"loss": 3.9714,
"num_input_tokens_seen": 482951168,
"step": 7370,
"train_runtime": 67220.7779,
"train_tokens_per_second": 7184.552
},
{
"epoch": 1.986740257117857,
"grad_norm": 1.015625,
"learning_rate": 2.6492099380533764e-05,
"loss": 4.0543,
"num_input_tokens_seen": 483606528,
"step": 7380,
"train_runtime": 67311.9445,
"train_tokens_per_second": 7184.557
},
{
"epoch": 1.9894325906979875,
"grad_norm": 1.0,
"learning_rate": 2.643770445096087e-05,
"loss": 4.017,
"num_input_tokens_seen": 484261888,
"step": 7390,
"train_runtime": 67403.3963,
"train_tokens_per_second": 7184.532
},
{
"epoch": 1.992124924278118,
"grad_norm": 0.98046875,
"learning_rate": 2.638330269171113e-05,
"loss": 4.0822,
"num_input_tokens_seen": 484917248,
"step": 7400,
"train_runtime": 67494.964,
"train_tokens_per_second": 7184.495
},
{
"epoch": 1.9948172578582486,
"grad_norm": 0.984375,
"learning_rate": 2.6328894361214867e-05,
"loss": 4.0175,
"num_input_tokens_seen": 485572608,
"step": 7410,
"train_runtime": 67586.4077,
"train_tokens_per_second": 7184.471
},
{
"epoch": 1.9975095914383791,
"grad_norm": 0.98828125,
"learning_rate": 2.6274479717933637e-05,
"loss": 4.0293,
"num_input_tokens_seen": 486227968,
"step": 7420,
"train_runtime": 67677.1172,
"train_tokens_per_second": 7184.525
},
{
"epoch": 2.0,
"grad_norm": 2.0625,
"learning_rate": 2.622005902035896e-05,
"loss": 3.9542,
"num_input_tokens_seen": 486834176,
"step": 7430,
"train_runtime": 67761.968,
"train_tokens_per_second": 7184.475
},
{
"epoch": 2.0026923335801308,
"grad_norm": 1.015625,
"learning_rate": 2.616563252701114e-05,
"loss": 4.0162,
"num_input_tokens_seen": 487489536,
"step": 7440,
"train_runtime": 67852.7594,
"train_tokens_per_second": 7184.52
},
{
"epoch": 2.005384667160261,
"grad_norm": 0.97265625,
"learning_rate": 2.6111200496438e-05,
"loss": 4.0107,
"num_input_tokens_seen": 488144896,
"step": 7450,
"train_runtime": 67944.3204,
"train_tokens_per_second": 7184.484
},
{
"epoch": 2.008077000740392,
"grad_norm": 1.015625,
"learning_rate": 2.6056763187213678e-05,
"loss": 3.87,
"num_input_tokens_seen": 488800256,
"step": 7460,
"train_runtime": 68035.4837,
"train_tokens_per_second": 7184.49
},
{
"epoch": 2.010769334320522,
"grad_norm": 0.97265625,
"learning_rate": 2.6002320857937373e-05,
"loss": 3.9323,
"num_input_tokens_seen": 489455616,
"step": 7470,
"train_runtime": 68126.8899,
"train_tokens_per_second": 7184.47
},
{
"epoch": 2.013461667900653,
"grad_norm": 0.94140625,
"learning_rate": 2.5947873767232146e-05,
"loss": 4.031,
"num_input_tokens_seen": 490110976,
"step": 7480,
"train_runtime": 68217.9602,
"train_tokens_per_second": 7184.486
},
{
"epoch": 2.0161540014807833,
"grad_norm": 0.97265625,
"learning_rate": 2.5893422173743664e-05,
"loss": 3.8792,
"num_input_tokens_seen": 490766336,
"step": 7490,
"train_runtime": 68309.4455,
"train_tokens_per_second": 7184.458
},
{
"epoch": 2.018846335060914,
"grad_norm": 0.9609375,
"learning_rate": 2.5838966336138992e-05,
"loss": 3.9895,
"num_input_tokens_seen": 491421696,
"step": 7500,
"train_runtime": 68400.9196,
"train_tokens_per_second": 7184.431
},
{
"epoch": 2.021538668641045,
"grad_norm": 1.0,
"learning_rate": 2.578450651310535e-05,
"loss": 3.875,
"num_input_tokens_seen": 492077056,
"step": 7510,
"train_runtime": 68510.0869,
"train_tokens_per_second": 7182.549
},
{
"epoch": 2.024231002221175,
"grad_norm": 0.96875,
"learning_rate": 2.5730042963348898e-05,
"loss": 4.0199,
"num_input_tokens_seen": 492732416,
"step": 7520,
"train_runtime": 68600.2883,
"train_tokens_per_second": 7182.658
},
{
"epoch": 2.026923335801306,
"grad_norm": 1.046875,
"learning_rate": 2.56755759455935e-05,
"loss": 3.9386,
"num_input_tokens_seen": 493387776,
"step": 7530,
"train_runtime": 68691.7421,
"train_tokens_per_second": 7182.636
},
{
"epoch": 2.0296156693814362,
"grad_norm": 0.9921875,
"learning_rate": 2.5621105718579484e-05,
"loss": 3.9828,
"num_input_tokens_seen": 494043136,
"step": 7540,
"train_runtime": 68782.8621,
"train_tokens_per_second": 7182.649
},
{
"epoch": 2.032308002961567,
"grad_norm": 0.98828125,
"learning_rate": 2.5566632541062435e-05,
"loss": 3.8517,
"num_input_tokens_seen": 494698496,
"step": 7550,
"train_runtime": 68874.1653,
"train_tokens_per_second": 7182.642
},
{
"epoch": 2.0350003365416973,
"grad_norm": 0.97265625,
"learning_rate": 2.5512156671811943e-05,
"loss": 4.0755,
"num_input_tokens_seen": 495353856,
"step": 7560,
"train_runtime": 68965.1858,
"train_tokens_per_second": 7182.665
},
{
"epoch": 2.037692670121828,
"grad_norm": 1.0625,
"learning_rate": 2.5457678369610394e-05,
"loss": 3.9296,
"num_input_tokens_seen": 496009216,
"step": 7570,
"train_runtime": 69056.7428,
"train_tokens_per_second": 7182.633
},
{
"epoch": 2.040385003701959,
"grad_norm": 0.91015625,
"learning_rate": 2.5403197893251723e-05,
"loss": 3.9863,
"num_input_tokens_seen": 496664576,
"step": 7580,
"train_runtime": 69147.3015,
"train_tokens_per_second": 7182.704
},
{
"epoch": 2.043077337282089,
"grad_norm": 1.0546875,
"learning_rate": 2.5348715501540203e-05,
"loss": 3.9717,
"num_input_tokens_seen": 497319936,
"step": 7590,
"train_runtime": 69238.82,
"train_tokens_per_second": 7182.675
},
{
"epoch": 2.04576967086222,
"grad_norm": 0.953125,
"learning_rate": 2.52942314532892e-05,
"loss": 3.8741,
"num_input_tokens_seen": 497975296,
"step": 7600,
"train_runtime": 69330.4323,
"train_tokens_per_second": 7182.637
},
{
"epoch": 2.0484620044423503,
"grad_norm": 0.9453125,
"learning_rate": 2.5239746007319954e-05,
"loss": 3.8784,
"num_input_tokens_seen": 498630656,
"step": 7610,
"train_runtime": 69421.1704,
"train_tokens_per_second": 7182.689
},
{
"epoch": 2.051154338022481,
"grad_norm": 1.03125,
"learning_rate": 2.5185259422460334e-05,
"loss": 3.9551,
"num_input_tokens_seen": 499286016,
"step": 7620,
"train_runtime": 69512.3931,
"train_tokens_per_second": 7182.691
},
{
"epoch": 2.0538466716026114,
"grad_norm": 0.98828125,
"learning_rate": 2.5130771957543632e-05,
"loss": 3.9249,
"num_input_tokens_seen": 499941376,
"step": 7630,
"train_runtime": 69603.3932,
"train_tokens_per_second": 7182.716
},
{
"epoch": 2.056539005182742,
"grad_norm": 0.99609375,
"learning_rate": 2.507628387140731e-05,
"loss": 3.948,
"num_input_tokens_seen": 500596736,
"step": 7640,
"train_runtime": 69694.7297,
"train_tokens_per_second": 7182.706
},
{
"epoch": 2.059231338762873,
"grad_norm": 0.97265625,
"learning_rate": 2.502179542289178e-05,
"loss": 3.9351,
"num_input_tokens_seen": 501252096,
"step": 7650,
"train_runtime": 69786.3479,
"train_tokens_per_second": 7182.667
},
{
"epoch": 2.0619236723430032,
"grad_norm": 1.015625,
"learning_rate": 2.4967306870839198e-05,
"loss": 4.0082,
"num_input_tokens_seen": 501907456,
"step": 7660,
"train_runtime": 69877.0105,
"train_tokens_per_second": 7182.727
},
{
"epoch": 2.064616005923134,
"grad_norm": 0.9609375,
"learning_rate": 2.4912818474092173e-05,
"loss": 3.9455,
"num_input_tokens_seen": 502562816,
"step": 7670,
"train_runtime": 69968.4304,
"train_tokens_per_second": 7182.708
},
{
"epoch": 2.0673083395032643,
"grad_norm": 1.0078125,
"learning_rate": 2.4858330491492624e-05,
"loss": 3.9224,
"num_input_tokens_seen": 503218176,
"step": 7680,
"train_runtime": 70060.2435,
"train_tokens_per_second": 7182.65
},
{
"epoch": 2.070000673083395,
"grad_norm": 0.98828125,
"learning_rate": 2.480384318188045e-05,
"loss": 3.9099,
"num_input_tokens_seen": 503873536,
"step": 7690,
"train_runtime": 70151.399,
"train_tokens_per_second": 7182.658
},
{
"epoch": 2.0726930066635254,
"grad_norm": 0.98046875,
"learning_rate": 2.4749356804092392e-05,
"loss": 3.8047,
"num_input_tokens_seen": 504528896,
"step": 7700,
"train_runtime": 70242.717,
"train_tokens_per_second": 7182.651
},
{
"epoch": 2.075385340243656,
"grad_norm": 1.0,
"learning_rate": 2.4694871616960764e-05,
"loss": 3.9816,
"num_input_tokens_seen": 505184256,
"step": 7710,
"train_runtime": 70334.448,
"train_tokens_per_second": 7182.601
},
{
"epoch": 2.078077673823787,
"grad_norm": 1.0078125,
"learning_rate": 2.464038787931219e-05,
"loss": 3.8623,
"num_input_tokens_seen": 505839616,
"step": 7720,
"train_runtime": 70425.3555,
"train_tokens_per_second": 7182.635
},
{
"epoch": 2.0807700074039173,
"grad_norm": 1.0234375,
"learning_rate": 2.4585905849966454e-05,
"loss": 3.9311,
"num_input_tokens_seen": 506494976,
"step": 7730,
"train_runtime": 70516.6306,
"train_tokens_per_second": 7182.632
},
{
"epoch": 2.083462340984048,
"grad_norm": 1.015625,
"learning_rate": 2.4531425787735175e-05,
"loss": 4.0023,
"num_input_tokens_seen": 507150336,
"step": 7740,
"train_runtime": 70608.3253,
"train_tokens_per_second": 7182.586
},
{
"epoch": 2.0861546745641784,
"grad_norm": 1.015625,
"learning_rate": 2.447694795142067e-05,
"loss": 3.9401,
"num_input_tokens_seen": 507805696,
"step": 7750,
"train_runtime": 70699.7316,
"train_tokens_per_second": 7182.569
},
{
"epoch": 2.088847008144309,
"grad_norm": 1.046875,
"learning_rate": 2.442247259981467e-05,
"loss": 3.9075,
"num_input_tokens_seen": 508461056,
"step": 7760,
"train_runtime": 70791.0769,
"train_tokens_per_second": 7182.559
},
{
"epoch": 2.0915393417244394,
"grad_norm": 1.0546875,
"learning_rate": 2.4367999991697086e-05,
"loss": 4.0235,
"num_input_tokens_seen": 509116416,
"step": 7770,
"train_runtime": 70882.845,
"train_tokens_per_second": 7182.505
},
{
"epoch": 2.09423167530457,
"grad_norm": 0.9453125,
"learning_rate": 2.4313530385834823e-05,
"loss": 4.0668,
"num_input_tokens_seen": 509771776,
"step": 7780,
"train_runtime": 70973.9031,
"train_tokens_per_second": 7182.524
},
{
"epoch": 2.096924008884701,
"grad_norm": 0.9765625,
"learning_rate": 2.4259064040980492e-05,
"loss": 3.9675,
"num_input_tokens_seen": 510427136,
"step": 7790,
"train_runtime": 71065.1618,
"train_tokens_per_second": 7182.523
},
{
"epoch": 2.0996163424648313,
"grad_norm": 0.9765625,
"learning_rate": 2.420460121587125e-05,
"loss": 3.9474,
"num_input_tokens_seen": 511082496,
"step": 7800,
"train_runtime": 71156.9178,
"train_tokens_per_second": 7182.471
},
{
"epoch": 2.102308676044962,
"grad_norm": 0.9921875,
"learning_rate": 2.4150142169227492e-05,
"loss": 3.9166,
"num_input_tokens_seen": 511737856,
"step": 7810,
"train_runtime": 71248.0112,
"train_tokens_per_second": 7182.486
},
{
"epoch": 2.1050010096250924,
"grad_norm": 0.9765625,
"learning_rate": 2.4095687159751703e-05,
"loss": 3.9603,
"num_input_tokens_seen": 512393216,
"step": 7820,
"train_runtime": 71339.3142,
"train_tokens_per_second": 7182.48
},
{
"epoch": 2.107693343205223,
"grad_norm": 0.98828125,
"learning_rate": 2.404123644612718e-05,
"loss": 3.9247,
"num_input_tokens_seen": 513048576,
"step": 7830,
"train_runtime": 71430.987,
"train_tokens_per_second": 7182.437
},
{
"epoch": 2.1103856767853535,
"grad_norm": 1.0234375,
"learning_rate": 2.3986790287016784e-05,
"loss": 3.9972,
"num_input_tokens_seen": 513703936,
"step": 7840,
"train_runtime": 71521.7458,
"train_tokens_per_second": 7182.486
},
{
"epoch": 2.1130780103654843,
"grad_norm": 0.98046875,
"learning_rate": 2.3932348941061783e-05,
"loss": 3.9288,
"num_input_tokens_seen": 514359296,
"step": 7850,
"train_runtime": 71613.2492,
"train_tokens_per_second": 7182.46
},
{
"epoch": 2.115770343945615,
"grad_norm": 0.953125,
"learning_rate": 2.387791266688054e-05,
"loss": 4.0332,
"num_input_tokens_seen": 515014656,
"step": 7860,
"train_runtime": 71704.5297,
"train_tokens_per_second": 7182.456
},
{
"epoch": 2.1184626775257454,
"grad_norm": 0.99609375,
"learning_rate": 2.3823481723067366e-05,
"loss": 3.9433,
"num_input_tokens_seen": 515670016,
"step": 7870,
"train_runtime": 71795.876,
"train_tokens_per_second": 7182.446
},
{
"epoch": 2.121155011105876,
"grad_norm": 0.95703125,
"learning_rate": 2.3769056368191208e-05,
"loss": 3.8531,
"num_input_tokens_seen": 516325376,
"step": 7880,
"train_runtime": 71887.3636,
"train_tokens_per_second": 7182.422
},
{
"epoch": 2.1238473446860064,
"grad_norm": 0.9296875,
"learning_rate": 2.3714636860794495e-05,
"loss": 3.9497,
"num_input_tokens_seen": 516980736,
"step": 7890,
"train_runtime": 71978.4865,
"train_tokens_per_second": 7182.434
},
{
"epoch": 2.126539678266137,
"grad_norm": 1.046875,
"learning_rate": 2.366022345939188e-05,
"loss": 3.9821,
"num_input_tokens_seen": 517636096,
"step": 7900,
"train_runtime": 72070.3063,
"train_tokens_per_second": 7182.377
},
{
"epoch": 2.129232011846268,
"grad_norm": 1.0,
"learning_rate": 2.3605816422468976e-05,
"loss": 3.9986,
"num_input_tokens_seen": 518291456,
"step": 7910,
"train_runtime": 72161.9237,
"train_tokens_per_second": 7182.34
},
{
"epoch": 2.1319243454263983,
"grad_norm": 1.0234375,
"learning_rate": 2.3551416008481205e-05,
"loss": 3.9442,
"num_input_tokens_seen": 518946816,
"step": 7920,
"train_runtime": 72253.5881,
"train_tokens_per_second": 7182.298
},
{
"epoch": 2.134616679006529,
"grad_norm": 1.0,
"learning_rate": 2.3497022475852487e-05,
"loss": 3.9685,
"num_input_tokens_seen": 519602176,
"step": 7930,
"train_runtime": 72344.6661,
"train_tokens_per_second": 7182.315
},
{
"epoch": 2.1373090125866594,
"grad_norm": 1.09375,
"learning_rate": 2.344263608297409e-05,
"loss": 4.0041,
"num_input_tokens_seen": 520257536,
"step": 7940,
"train_runtime": 72436.1344,
"train_tokens_per_second": 7182.293
},
{
"epoch": 2.14000134616679,
"grad_norm": 1.0078125,
"learning_rate": 2.3388257088203326e-05,
"loss": 3.9341,
"num_input_tokens_seen": 520912896,
"step": 7950,
"train_runtime": 72527.844,
"train_tokens_per_second": 7182.247
},
{
"epoch": 2.1426936797469205,
"grad_norm": 0.9765625,
"learning_rate": 2.33338857498624e-05,
"loss": 4.0325,
"num_input_tokens_seen": 521568256,
"step": 7960,
"train_runtime": 72619.3594,
"train_tokens_per_second": 7182.221
},
{
"epoch": 2.1453860133270513,
"grad_norm": 1.03125,
"learning_rate": 2.327952232623714e-05,
"loss": 3.937,
"num_input_tokens_seen": 522223616,
"step": 7970,
"train_runtime": 72710.6428,
"train_tokens_per_second": 7182.217
},
{
"epoch": 2.148078346907182,
"grad_norm": 1.0390625,
"learning_rate": 2.3225167075575744e-05,
"loss": 3.9797,
"num_input_tokens_seen": 522878976,
"step": 7980,
"train_runtime": 72802.8888,
"train_tokens_per_second": 7182.119
},
{
"epoch": 2.1507706804873123,
"grad_norm": 0.9921875,
"learning_rate": 2.3170820256087628e-05,
"loss": 3.933,
"num_input_tokens_seen": 523534336,
"step": 7990,
"train_runtime": 72895.0433,
"train_tokens_per_second": 7182.029
},
{
"epoch": 2.153463014067443,
"grad_norm": 0.9609375,
"learning_rate": 2.3116482125942112e-05,
"loss": 4.0139,
"num_input_tokens_seen": 524189696,
"step": 8000,
"train_runtime": 72987.0823,
"train_tokens_per_second": 7181.952
},
{
"epoch": 2.1561553476475734,
"grad_norm": 1.03125,
"learning_rate": 2.306215294326729e-05,
"loss": 3.8985,
"num_input_tokens_seen": 524845056,
"step": 8010,
"train_runtime": 73107.6287,
"train_tokens_per_second": 7179.074
},
{
"epoch": 2.158847681227704,
"grad_norm": 0.9609375,
"learning_rate": 2.3007832966148695e-05,
"loss": 3.9383,
"num_input_tokens_seen": 525500416,
"step": 8020,
"train_runtime": 73199.5993,
"train_tokens_per_second": 7179.007
},
{
"epoch": 2.1615400148078345,
"grad_norm": 0.95703125,
"learning_rate": 2.2953522452628176e-05,
"loss": 3.8953,
"num_input_tokens_seen": 526155776,
"step": 8030,
"train_runtime": 73291.6216,
"train_tokens_per_second": 7178.935
},
{
"epoch": 2.1642323483879653,
"grad_norm": 0.921875,
"learning_rate": 2.289922166070262e-05,
"loss": 3.9527,
"num_input_tokens_seen": 526811136,
"step": 8040,
"train_runtime": 73384.1019,
"train_tokens_per_second": 7178.818
},
{
"epoch": 2.166924681968096,
"grad_norm": 0.98046875,
"learning_rate": 2.2844930848322695e-05,
"loss": 4.0024,
"num_input_tokens_seen": 527466496,
"step": 8050,
"train_runtime": 73475.8145,
"train_tokens_per_second": 7178.777
},
{
"epoch": 2.1696170155482264,
"grad_norm": 0.9765625,
"learning_rate": 2.279065027339171e-05,
"loss": 3.8969,
"num_input_tokens_seen": 528121856,
"step": 8060,
"train_runtime": 73567.1917,
"train_tokens_per_second": 7178.769
},
{
"epoch": 2.172309349128357,
"grad_norm": 1.0234375,
"learning_rate": 2.27363801937643e-05,
"loss": 3.9977,
"num_input_tokens_seen": 528777216,
"step": 8070,
"train_runtime": 73658.6764,
"train_tokens_per_second": 7178.75
},
{
"epoch": 2.1750016827084875,
"grad_norm": 1.0390625,
"learning_rate": 2.268212086724528e-05,
"loss": 3.9677,
"num_input_tokens_seen": 529432576,
"step": 8080,
"train_runtime": 73750.8176,
"train_tokens_per_second": 7178.667
},
{
"epoch": 2.1776940162886183,
"grad_norm": 0.92578125,
"learning_rate": 2.262787255158837e-05,
"loss": 3.9226,
"num_input_tokens_seen": 530087936,
"step": 8090,
"train_runtime": 73842.6244,
"train_tokens_per_second": 7178.617
},
{
"epoch": 2.1803863498687486,
"grad_norm": 1.0625,
"learning_rate": 2.257363550449497e-05,
"loss": 4.0062,
"num_input_tokens_seen": 530743296,
"step": 8100,
"train_runtime": 73934.5186,
"train_tokens_per_second": 7178.559
},
{
"epoch": 2.1830786834488793,
"grad_norm": 1.0,
"learning_rate": 2.251940998361297e-05,
"loss": 3.9742,
"num_input_tokens_seen": 531398656,
"step": 8110,
"train_runtime": 74026.3714,
"train_tokens_per_second": 7178.505
},
{
"epoch": 2.18577101702901,
"grad_norm": 1.0625,
"learning_rate": 2.246519624653548e-05,
"loss": 4.0204,
"num_input_tokens_seen": 532054016,
"step": 8120,
"train_runtime": 74118.3082,
"train_tokens_per_second": 7178.443
},
{
"epoch": 2.1884633506091404,
"grad_norm": 0.9921875,
"learning_rate": 2.2410994550799674e-05,
"loss": 3.924,
"num_input_tokens_seen": 532709376,
"step": 8130,
"train_runtime": 74209.9458,
"train_tokens_per_second": 7178.409
},
{
"epoch": 2.191155684189271,
"grad_norm": 0.99609375,
"learning_rate": 2.2356805153885473e-05,
"loss": 3.8933,
"num_input_tokens_seen": 533364736,
"step": 8140,
"train_runtime": 74301.8978,
"train_tokens_per_second": 7178.346
},
{
"epoch": 2.1938480177694015,
"grad_norm": 0.96875,
"learning_rate": 2.230262831321441e-05,
"loss": 3.99,
"num_input_tokens_seen": 534020096,
"step": 8150,
"train_runtime": 74393.435,
"train_tokens_per_second": 7178.323
},
{
"epoch": 2.1965403513495323,
"grad_norm": 0.96484375,
"learning_rate": 2.224846428614838e-05,
"loss": 3.8841,
"num_input_tokens_seen": 534675456,
"step": 8160,
"train_runtime": 74485.4277,
"train_tokens_per_second": 7178.256
},
{
"epoch": 2.1992326849296626,
"grad_norm": 0.94921875,
"learning_rate": 2.219431332998836e-05,
"loss": 3.8712,
"num_input_tokens_seen": 535330816,
"step": 8170,
"train_runtime": 74577.0024,
"train_tokens_per_second": 7178.229
},
{
"epoch": 2.2019250185097934,
"grad_norm": 0.984375,
"learning_rate": 2.2140175701973283e-05,
"loss": 3.9662,
"num_input_tokens_seen": 535986176,
"step": 8180,
"train_runtime": 74668.8087,
"train_tokens_per_second": 7178.18
},
{
"epoch": 2.204617352089924,
"grad_norm": 0.94140625,
"learning_rate": 2.2086051659278738e-05,
"loss": 3.9555,
"num_input_tokens_seen": 536641536,
"step": 8190,
"train_runtime": 74760.7332,
"train_tokens_per_second": 7178.12
},
{
"epoch": 2.2073096856700545,
"grad_norm": 1.0234375,
"learning_rate": 2.20319414590158e-05,
"loss": 3.9335,
"num_input_tokens_seen": 537296896,
"step": 8200,
"train_runtime": 74851.828,
"train_tokens_per_second": 7178.14
},
{
"epoch": 2.2100020192501852,
"grad_norm": 0.9921875,
"learning_rate": 2.197784535822976e-05,
"loss": 3.9308,
"num_input_tokens_seen": 537952256,
"step": 8210,
"train_runtime": 74943.4274,
"train_tokens_per_second": 7178.111
},
{
"epoch": 2.2126943528303156,
"grad_norm": 0.9296875,
"learning_rate": 2.192376361389896e-05,
"loss": 3.9608,
"num_input_tokens_seen": 538607616,
"step": 8220,
"train_runtime": 75035.4487,
"train_tokens_per_second": 7178.042
},
{
"epoch": 2.2153866864104463,
"grad_norm": 1.046875,
"learning_rate": 2.1869696482933535e-05,
"loss": 3.9234,
"num_input_tokens_seen": 539262976,
"step": 8230,
"train_runtime": 75127.5116,
"train_tokens_per_second": 7177.969
},
{
"epoch": 2.2180790199905767,
"grad_norm": 1.0,
"learning_rate": 2.181564422217418e-05,
"loss": 3.8744,
"num_input_tokens_seen": 539918336,
"step": 8240,
"train_runtime": 75219.0028,
"train_tokens_per_second": 7177.951
},
{
"epoch": 2.2207713535707074,
"grad_norm": 0.99609375,
"learning_rate": 2.1761607088390982e-05,
"loss": 3.9672,
"num_input_tokens_seen": 540573696,
"step": 8250,
"train_runtime": 75311.1549,
"train_tokens_per_second": 7177.87
},
{
"epoch": 2.223463687150838,
"grad_norm": 0.98046875,
"learning_rate": 2.1707585338282134e-05,
"loss": 3.9026,
"num_input_tokens_seen": 541229056,
"step": 8260,
"train_runtime": 75402.1748,
"train_tokens_per_second": 7177.897
},
{
"epoch": 2.2261560207309685,
"grad_norm": 0.99609375,
"learning_rate": 2.1653579228472787e-05,
"loss": 3.946,
"num_input_tokens_seen": 541884416,
"step": 8270,
"train_runtime": 75494.0303,
"train_tokens_per_second": 7177.845
},
{
"epoch": 2.2288483543110993,
"grad_norm": 1.0625,
"learning_rate": 2.159958901551376e-05,
"loss": 3.9629,
"num_input_tokens_seen": 542539776,
"step": 8280,
"train_runtime": 75585.735,
"train_tokens_per_second": 7177.806
},
{
"epoch": 2.2315406878912296,
"grad_norm": 1.0078125,
"learning_rate": 2.154561495588038e-05,
"loss": 3.8611,
"num_input_tokens_seen": 543195136,
"step": 8290,
"train_runtime": 75677.483,
"train_tokens_per_second": 7177.764
},
{
"epoch": 2.2342330214713604,
"grad_norm": 0.9765625,
"learning_rate": 2.1491657305971244e-05,
"loss": 3.8859,
"num_input_tokens_seen": 543850496,
"step": 8300,
"train_runtime": 75768.9086,
"train_tokens_per_second": 7177.753
},
{
"epoch": 2.2369253550514907,
"grad_norm": 0.98046875,
"learning_rate": 2.143771632210696e-05,
"loss": 3.9045,
"num_input_tokens_seen": 544505856,
"step": 8310,
"train_runtime": 75860.6864,
"train_tokens_per_second": 7177.708
},
{
"epoch": 2.2396176886316215,
"grad_norm": 0.98828125,
"learning_rate": 2.138379226052901e-05,
"loss": 3.9382,
"num_input_tokens_seen": 545161216,
"step": 8320,
"train_runtime": 75952.4776,
"train_tokens_per_second": 7177.662
},
{
"epoch": 2.2423100222117522,
"grad_norm": 1.03125,
"learning_rate": 2.1329885377398446e-05,
"loss": 3.9432,
"num_input_tokens_seen": 545816576,
"step": 8330,
"train_runtime": 76044.383,
"train_tokens_per_second": 7177.605
},
{
"epoch": 2.2450023557918826,
"grad_norm": 0.98046875,
"learning_rate": 2.1275995928794758e-05,
"loss": 3.9323,
"num_input_tokens_seen": 546471936,
"step": 8340,
"train_runtime": 76136.3929,
"train_tokens_per_second": 7177.539
},
{
"epoch": 2.2476946893720133,
"grad_norm": 0.9921875,
"learning_rate": 2.1222124170714575e-05,
"loss": 3.9404,
"num_input_tokens_seen": 547127296,
"step": 8350,
"train_runtime": 76227.9893,
"train_tokens_per_second": 7177.512
},
{
"epoch": 2.2503870229521437,
"grad_norm": 0.94921875,
"learning_rate": 2.1168270359070514e-05,
"loss": 3.9462,
"num_input_tokens_seen": 547782656,
"step": 8360,
"train_runtime": 76319.1941,
"train_tokens_per_second": 7177.521
},
{
"epoch": 2.2530793565322744,
"grad_norm": 0.96484375,
"learning_rate": 2.1114434749689944e-05,
"loss": 3.9809,
"num_input_tokens_seen": 548438016,
"step": 8370,
"train_runtime": 76411.0251,
"train_tokens_per_second": 7177.472
},
{
"epoch": 2.2557716901124047,
"grad_norm": 0.9765625,
"learning_rate": 2.1060617598313733e-05,
"loss": 3.8728,
"num_input_tokens_seen": 549093376,
"step": 8380,
"train_runtime": 76502.5914,
"train_tokens_per_second": 7177.448
},
{
"epoch": 2.2584640236925355,
"grad_norm": 0.96875,
"learning_rate": 2.1006819160595108e-05,
"loss": 3.9735,
"num_input_tokens_seen": 549748736,
"step": 8390,
"train_runtime": 76595.1593,
"train_tokens_per_second": 7177.33
},
{
"epoch": 2.2611563572726663,
"grad_norm": 1.015625,
"learning_rate": 2.0953039692098364e-05,
"loss": 3.9304,
"num_input_tokens_seen": 550404096,
"step": 8400,
"train_runtime": 76686.6123,
"train_tokens_per_second": 7177.317
},
{
"epoch": 2.2638486908527966,
"grad_norm": 0.984375,
"learning_rate": 2.089927944829771e-05,
"loss": 3.9103,
"num_input_tokens_seen": 551059456,
"step": 8410,
"train_runtime": 76778.7614,
"train_tokens_per_second": 7177.238
},
{
"epoch": 2.2665410244329274,
"grad_norm": 0.98828125,
"learning_rate": 2.0845538684576005e-05,
"loss": 3.958,
"num_input_tokens_seen": 551714816,
"step": 8420,
"train_runtime": 76870.7391,
"train_tokens_per_second": 7177.176
},
{
"epoch": 2.2692333580130577,
"grad_norm": 0.9765625,
"learning_rate": 2.079181765622359e-05,
"loss": 3.9227,
"num_input_tokens_seen": 552370176,
"step": 8430,
"train_runtime": 76961.857,
"train_tokens_per_second": 7177.194
},
{
"epoch": 2.2719256915931885,
"grad_norm": 0.9375,
"learning_rate": 2.0738116618437055e-05,
"loss": 3.9221,
"num_input_tokens_seen": 553025536,
"step": 8440,
"train_runtime": 77053.3704,
"train_tokens_per_second": 7177.175
},
{
"epoch": 2.274618025173319,
"grad_norm": 0.984375,
"learning_rate": 2.0684435826318008e-05,
"loss": 3.954,
"num_input_tokens_seen": 553680896,
"step": 8450,
"train_runtime": 77145.0088,
"train_tokens_per_second": 7177.145
},
{
"epoch": 2.2773103587534496,
"grad_norm": 0.953125,
"learning_rate": 2.063077553487191e-05,
"loss": 3.9332,
"num_input_tokens_seen": 554336256,
"step": 8460,
"train_runtime": 77237.3794,
"train_tokens_per_second": 7177.046
},
{
"epoch": 2.2800026923335803,
"grad_norm": 0.98828125,
"learning_rate": 2.0577135999006798e-05,
"loss": 3.8525,
"num_input_tokens_seen": 554991616,
"step": 8470,
"train_runtime": 77328.7228,
"train_tokens_per_second": 7177.044
},
{
"epoch": 2.2826950259137107,
"grad_norm": 0.94140625,
"learning_rate": 2.0523517473532144e-05,
"loss": 3.9421,
"num_input_tokens_seen": 555646976,
"step": 8480,
"train_runtime": 77420.9627,
"train_tokens_per_second": 7176.958
},
{
"epoch": 2.2853873594938414,
"grad_norm": 0.94140625,
"learning_rate": 2.0469920213157613e-05,
"loss": 3.9269,
"num_input_tokens_seen": 556302336,
"step": 8490,
"train_runtime": 77512.6117,
"train_tokens_per_second": 7176.927
},
{
"epoch": 2.2880796930739717,
"grad_norm": 0.94921875,
"learning_rate": 2.0416344472491817e-05,
"loss": 4.017,
"num_input_tokens_seen": 556957696,
"step": 8500,
"train_runtime": 77604.4223,
"train_tokens_per_second": 7176.881
},
{
"epoch": 2.2907720266541025,
"grad_norm": 1.0,
"learning_rate": 2.0362790506041186e-05,
"loss": 3.8685,
"num_input_tokens_seen": 557613056,
"step": 8510,
"train_runtime": 77719.4257,
"train_tokens_per_second": 7174.693
},
{
"epoch": 2.293464360234233,
"grad_norm": 0.97265625,
"learning_rate": 2.0309258568208675e-05,
"loss": 4.0096,
"num_input_tokens_seen": 558268416,
"step": 8520,
"train_runtime": 77811.0797,
"train_tokens_per_second": 7174.665
},
{
"epoch": 2.2961566938143636,
"grad_norm": 0.9375,
"learning_rate": 2.0255748913292626e-05,
"loss": 4.0394,
"num_input_tokens_seen": 558923776,
"step": 8530,
"train_runtime": 77902.6246,
"train_tokens_per_second": 7174.646
},
{
"epoch": 2.2988490273944944,
"grad_norm": 0.98828125,
"learning_rate": 2.0202261795485495e-05,
"loss": 4.0142,
"num_input_tokens_seen": 559579136,
"step": 8540,
"train_runtime": 77994.7083,
"train_tokens_per_second": 7174.578
},
{
"epoch": 2.3015413609746247,
"grad_norm": 1.015625,
"learning_rate": 2.0148797468872704e-05,
"loss": 3.9483,
"num_input_tokens_seen": 560234496,
"step": 8550,
"train_runtime": 78086.5915,
"train_tokens_per_second": 7174.529
},
{
"epoch": 2.3042336945547555,
"grad_norm": 1.0,
"learning_rate": 2.0095356187431417e-05,
"loss": 4.0448,
"num_input_tokens_seen": 560889856,
"step": 8560,
"train_runtime": 78178.1001,
"train_tokens_per_second": 7174.514
},
{
"epoch": 2.306926028134886,
"grad_norm": 0.9765625,
"learning_rate": 2.0041938205029274e-05,
"loss": 3.9285,
"num_input_tokens_seen": 561545216,
"step": 8570,
"train_runtime": 78269.9302,
"train_tokens_per_second": 7174.469
},
{
"epoch": 2.3096183617150166,
"grad_norm": 0.95703125,
"learning_rate": 1.99885437754233e-05,
"loss": 4.0154,
"num_input_tokens_seen": 562200576,
"step": 8580,
"train_runtime": 78361.4008,
"train_tokens_per_second": 7174.458
},
{
"epoch": 2.312310695295147,
"grad_norm": 1.0078125,
"learning_rate": 1.9935173152258575e-05,
"loss": 3.9581,
"num_input_tokens_seen": 562855936,
"step": 8590,
"train_runtime": 78453.1924,
"train_tokens_per_second": 7174.417
},
{
"epoch": 2.3150030288752776,
"grad_norm": 0.96875,
"learning_rate": 1.9881826589067136e-05,
"loss": 3.8998,
"num_input_tokens_seen": 563511296,
"step": 8600,
"train_runtime": 78544.1591,
"train_tokens_per_second": 7174.452
},
{
"epoch": 2.3176953624554084,
"grad_norm": 0.99609375,
"learning_rate": 1.9828504339266686e-05,
"loss": 3.8947,
"num_input_tokens_seen": 564166656,
"step": 8610,
"train_runtime": 78636.1489,
"train_tokens_per_second": 7174.393
},
{
"epoch": 2.3203876960355387,
"grad_norm": 0.97265625,
"learning_rate": 1.9775206656159466e-05,
"loss": 3.9563,
"num_input_tokens_seen": 564822016,
"step": 8620,
"train_runtime": 78727.3979,
"train_tokens_per_second": 7174.402
},
{
"epoch": 2.3230800296156695,
"grad_norm": 0.9375,
"learning_rate": 1.9721933792931e-05,
"loss": 3.9166,
"num_input_tokens_seen": 565477376,
"step": 8630,
"train_runtime": 78819.0122,
"train_tokens_per_second": 7174.378
},
{
"epoch": 2.3257723631958,
"grad_norm": 0.96484375,
"learning_rate": 1.9668686002648887e-05,
"loss": 3.8972,
"num_input_tokens_seen": 566132736,
"step": 8640,
"train_runtime": 78910.9,
"train_tokens_per_second": 7174.329
},
{
"epoch": 2.3284646967759306,
"grad_norm": 0.96875,
"learning_rate": 1.9615463538261663e-05,
"loss": 3.7837,
"num_input_tokens_seen": 566788096,
"step": 8650,
"train_runtime": 79002.2736,
"train_tokens_per_second": 7174.326
},
{
"epoch": 2.331157030356061,
"grad_norm": 0.96484375,
"learning_rate": 1.9562266652597504e-05,
"loss": 3.9452,
"num_input_tokens_seen": 567443456,
"step": 8660,
"train_runtime": 79093.8075,
"train_tokens_per_second": 7174.31
},
{
"epoch": 2.3338493639361917,
"grad_norm": 0.984375,
"learning_rate": 1.9509095598363134e-05,
"loss": 3.8569,
"num_input_tokens_seen": 568098816,
"step": 8670,
"train_runtime": 79185.6114,
"train_tokens_per_second": 7174.268
},
{
"epoch": 2.3365416975163225,
"grad_norm": 1.0390625,
"learning_rate": 1.9455950628142508e-05,
"loss": 3.9199,
"num_input_tokens_seen": 568754176,
"step": 8680,
"train_runtime": 79277.143,
"train_tokens_per_second": 7174.252
},
{
"epoch": 2.339234031096453,
"grad_norm": 0.984375,
"learning_rate": 1.9402831994395712e-05,
"loss": 3.9144,
"num_input_tokens_seen": 569409536,
"step": 8690,
"train_runtime": 79368.6331,
"train_tokens_per_second": 7174.239
},
{
"epoch": 2.3419263646765835,
"grad_norm": 1.015625,
"learning_rate": 1.9349739949457725e-05,
"loss": 3.9841,
"num_input_tokens_seen": 570064896,
"step": 8700,
"train_runtime": 79459.4951,
"train_tokens_per_second": 7174.283
},
{
"epoch": 2.344618698256714,
"grad_norm": 0.99609375,
"learning_rate": 1.9296674745537187e-05,
"loss": 3.9162,
"num_input_tokens_seen": 570720256,
"step": 8710,
"train_runtime": 79551.4372,
"train_tokens_per_second": 7174.229
},
{
"epoch": 2.3473110318368446,
"grad_norm": 0.9609375,
"learning_rate": 1.924363663471526e-05,
"loss": 3.8481,
"num_input_tokens_seen": 571375616,
"step": 8720,
"train_runtime": 79642.3623,
"train_tokens_per_second": 7174.268
},
{
"epoch": 2.350003365416975,
"grad_norm": 0.96875,
"learning_rate": 1.9190625868944386e-05,
"loss": 3.9566,
"num_input_tokens_seen": 572030976,
"step": 8730,
"train_runtime": 79733.9595,
"train_tokens_per_second": 7174.245
},
{
"epoch": 2.3526956989971057,
"grad_norm": 0.93359375,
"learning_rate": 1.9137642700047126e-05,
"loss": 3.9721,
"num_input_tokens_seen": 572686336,
"step": 8740,
"train_runtime": 79825.6602,
"train_tokens_per_second": 7174.214
},
{
"epoch": 2.3553880325772365,
"grad_norm": 0.9375,
"learning_rate": 1.9084687379714914e-05,
"loss": 3.8708,
"num_input_tokens_seen": 573341696,
"step": 8750,
"train_runtime": 79917.2004,
"train_tokens_per_second": 7174.196
},
{
"epoch": 2.358080366157367,
"grad_norm": 0.97265625,
"learning_rate": 1.9031760159506923e-05,
"loss": 3.9549,
"num_input_tokens_seen": 573997056,
"step": 8760,
"train_runtime": 80008.431,
"train_tokens_per_second": 7174.207
},
{
"epoch": 2.3607726997374976,
"grad_norm": 0.92578125,
"learning_rate": 1.8978861290848836e-05,
"loss": 3.9081,
"num_input_tokens_seen": 574652416,
"step": 8770,
"train_runtime": 80099.4108,
"train_tokens_per_second": 7174.24
},
{
"epoch": 2.363465033317628,
"grad_norm": 0.94921875,
"learning_rate": 1.8925991025031623e-05,
"loss": 3.9408,
"num_input_tokens_seen": 575307776,
"step": 8780,
"train_runtime": 80191.0045,
"train_tokens_per_second": 7174.218
},
{
"epoch": 2.3661573668977587,
"grad_norm": 0.984375,
"learning_rate": 1.887314961321043e-05,
"loss": 3.8723,
"num_input_tokens_seen": 575963136,
"step": 8790,
"train_runtime": 80282.2053,
"train_tokens_per_second": 7174.232
},
{
"epoch": 2.368849700477889,
"grad_norm": 1.0234375,
"learning_rate": 1.8820337306403274e-05,
"loss": 3.9218,
"num_input_tokens_seen": 576618496,
"step": 8800,
"train_runtime": 80373.7228,
"train_tokens_per_second": 7174.217
},
{
"epoch": 2.3715420340580198,
"grad_norm": 1.03125,
"learning_rate": 1.8767554355489974e-05,
"loss": 3.921,
"num_input_tokens_seen": 577273856,
"step": 8810,
"train_runtime": 80464.6105,
"train_tokens_per_second": 7174.258
},
{
"epoch": 2.3742343676381505,
"grad_norm": 0.94921875,
"learning_rate": 1.8714801011210842e-05,
"loss": 3.854,
"num_input_tokens_seen": 577929216,
"step": 8820,
"train_runtime": 80556.5972,
"train_tokens_per_second": 7174.201
},
{
"epoch": 2.376926701218281,
"grad_norm": 0.96484375,
"learning_rate": 1.8662077524165583e-05,
"loss": 3.9925,
"num_input_tokens_seen": 578584576,
"step": 8830,
"train_runtime": 80647.9657,
"train_tokens_per_second": 7174.199
},
{
"epoch": 2.3796190347984116,
"grad_norm": 1.0234375,
"learning_rate": 1.8609384144812068e-05,
"loss": 3.9418,
"num_input_tokens_seen": 579239936,
"step": 8840,
"train_runtime": 80739.7775,
"train_tokens_per_second": 7174.158
},
{
"epoch": 2.382311368378542,
"grad_norm": 0.97265625,
"learning_rate": 1.8556721123465107e-05,
"loss": 3.955,
"num_input_tokens_seen": 579895296,
"step": 8850,
"train_runtime": 80831.2192,
"train_tokens_per_second": 7174.15
},
{
"epoch": 2.3850037019586727,
"grad_norm": 0.94921875,
"learning_rate": 1.8504088710295346e-05,
"loss": 3.9767,
"num_input_tokens_seen": 580550656,
"step": 8860,
"train_runtime": 80922.8283,
"train_tokens_per_second": 7174.127
},
{
"epoch": 2.387696035538803,
"grad_norm": 1.0234375,
"learning_rate": 1.8451487155327986e-05,
"loss": 3.9133,
"num_input_tokens_seen": 581206016,
"step": 8870,
"train_runtime": 81013.9561,
"train_tokens_per_second": 7174.147
},
{
"epoch": 2.390388369118934,
"grad_norm": 1.0,
"learning_rate": 1.8398916708441672e-05,
"loss": 4.023,
"num_input_tokens_seen": 581861376,
"step": 8880,
"train_runtime": 81105.7181,
"train_tokens_per_second": 7174.11
},
{
"epoch": 2.3930807026990646,
"grad_norm": 1.0234375,
"learning_rate": 1.8346377619367267e-05,
"loss": 3.8944,
"num_input_tokens_seen": 582516736,
"step": 8890,
"train_runtime": 81197.261,
"train_tokens_per_second": 7174.093
},
{
"epoch": 2.395773036279195,
"grad_norm": 0.99609375,
"learning_rate": 1.8293870137686648e-05,
"loss": 3.8692,
"num_input_tokens_seen": 583172096,
"step": 8900,
"train_runtime": 81289.0326,
"train_tokens_per_second": 7174.056
},
{
"epoch": 2.3984653698593257,
"grad_norm": 0.9765625,
"learning_rate": 1.8241394512831576e-05,
"loss": 3.9668,
"num_input_tokens_seen": 583827456,
"step": 8910,
"train_runtime": 81380.7073,
"train_tokens_per_second": 7174.028
},
{
"epoch": 2.401157703439456,
"grad_norm": 0.9921875,
"learning_rate": 1.8188950994082456e-05,
"loss": 3.8961,
"num_input_tokens_seen": 584482816,
"step": 8920,
"train_runtime": 81472.3337,
"train_tokens_per_second": 7174.004
},
{
"epoch": 2.4038500370195868,
"grad_norm": 0.9453125,
"learning_rate": 1.813653983056719e-05,
"loss": 3.9315,
"num_input_tokens_seen": 585138176,
"step": 8930,
"train_runtime": 81563.6768,
"train_tokens_per_second": 7174.004
},
{
"epoch": 2.406542370599717,
"grad_norm": 0.96484375,
"learning_rate": 1.8084161271259966e-05,
"loss": 3.9244,
"num_input_tokens_seen": 585793536,
"step": 8940,
"train_runtime": 81655.1173,
"train_tokens_per_second": 7173.997
},
{
"epoch": 2.409234704179848,
"grad_norm": 1.015625,
"learning_rate": 1.8031815564980104e-05,
"loss": 3.9473,
"num_input_tokens_seen": 586448896,
"step": 8950,
"train_runtime": 81747.0607,
"train_tokens_per_second": 7173.945
},
{
"epoch": 2.4119270377599786,
"grad_norm": 1.0234375,
"learning_rate": 1.797950296039086e-05,
"loss": 4.0002,
"num_input_tokens_seen": 587104256,
"step": 8960,
"train_runtime": 81838.6641,
"train_tokens_per_second": 7173.923
},
{
"epoch": 2.414619371340109,
"grad_norm": 1.015625,
"learning_rate": 1.7927223705998214e-05,
"loss": 3.8831,
"num_input_tokens_seen": 587759616,
"step": 8970,
"train_runtime": 81930.2317,
"train_tokens_per_second": 7173.904
},
{
"epoch": 2.4173117049202397,
"grad_norm": 1.015625,
"learning_rate": 1.7874978050149765e-05,
"loss": 3.9407,
"num_input_tokens_seen": 588414976,
"step": 8980,
"train_runtime": 82021.6546,
"train_tokens_per_second": 7173.898
},
{
"epoch": 2.42000403850037,
"grad_norm": 0.9375,
"learning_rate": 1.7822766241033456e-05,
"loss": 4.0271,
"num_input_tokens_seen": 589070336,
"step": 8990,
"train_runtime": 82113.2156,
"train_tokens_per_second": 7173.88
},
{
"epoch": 2.422696372080501,
"grad_norm": 0.984375,
"learning_rate": 1.7770588526676497e-05,
"loss": 3.9653,
"num_input_tokens_seen": 589725696,
"step": 9000,
"train_runtime": 82204.6575,
"train_tokens_per_second": 7173.872
},
{
"epoch": 2.425388705660631,
"grad_norm": 0.984375,
"learning_rate": 1.771844515494408e-05,
"loss": 4.016,
"num_input_tokens_seen": 590381056,
"step": 9010,
"train_runtime": 82312.9264,
"train_tokens_per_second": 7172.398
},
{
"epoch": 2.428081039240762,
"grad_norm": 0.953125,
"learning_rate": 1.7666336373538293e-05,
"loss": 3.8489,
"num_input_tokens_seen": 591036416,
"step": 9020,
"train_runtime": 82404.3747,
"train_tokens_per_second": 7172.391
},
{
"epoch": 2.4307733728208927,
"grad_norm": 1.015625,
"learning_rate": 1.7614262429996907e-05,
"loss": 3.954,
"num_input_tokens_seen": 591691776,
"step": 9030,
"train_runtime": 82495.3911,
"train_tokens_per_second": 7172.422
},
{
"epoch": 2.433465706401023,
"grad_norm": 1.0234375,
"learning_rate": 1.7562223571692164e-05,
"loss": 3.9313,
"num_input_tokens_seen": 592347136,
"step": 9040,
"train_runtime": 82586.7823,
"train_tokens_per_second": 7172.421
},
{
"epoch": 2.4361580399811538,
"grad_norm": 0.9375,
"learning_rate": 1.751022004582969e-05,
"loss": 3.8823,
"num_input_tokens_seen": 593002496,
"step": 9050,
"train_runtime": 82677.9376,
"train_tokens_per_second": 7172.439
},
{
"epoch": 2.438850373561284,
"grad_norm": 0.9765625,
"learning_rate": 1.7458252099447205e-05,
"loss": 3.9563,
"num_input_tokens_seen": 593657856,
"step": 9060,
"train_runtime": 82769.4217,
"train_tokens_per_second": 7172.43
},
{
"epoch": 2.441542707141415,
"grad_norm": 0.98828125,
"learning_rate": 1.7406319979413472e-05,
"loss": 3.9923,
"num_input_tokens_seen": 594313216,
"step": 9070,
"train_runtime": 82860.4577,
"train_tokens_per_second": 7172.459
},
{
"epoch": 2.444235040721545,
"grad_norm": 0.98828125,
"learning_rate": 1.735442393242701e-05,
"loss": 3.9398,
"num_input_tokens_seen": 594968576,
"step": 9080,
"train_runtime": 82951.846,
"train_tokens_per_second": 7172.457
},
{
"epoch": 2.446927374301676,
"grad_norm": 1.0,
"learning_rate": 1.7302564205015022e-05,
"loss": 3.9483,
"num_input_tokens_seen": 595623936,
"step": 9090,
"train_runtime": 83043.0825,
"train_tokens_per_second": 7172.469
},
{
"epoch": 2.4496197078818067,
"grad_norm": 1.015625,
"learning_rate": 1.725074104353217e-05,
"loss": 3.9473,
"num_input_tokens_seen": 596279296,
"step": 9100,
"train_runtime": 83134.9393,
"train_tokens_per_second": 7172.427
},
{
"epoch": 2.452312041461937,
"grad_norm": 0.99609375,
"learning_rate": 1.7198954694159374e-05,
"loss": 4.0133,
"num_input_tokens_seen": 596934656,
"step": 9110,
"train_runtime": 83226.6166,
"train_tokens_per_second": 7172.401
},
{
"epoch": 2.455004375042068,
"grad_norm": 0.98046875,
"learning_rate": 1.7147205402902746e-05,
"loss": 3.971,
"num_input_tokens_seen": 597590016,
"step": 9120,
"train_runtime": 83317.6613,
"train_tokens_per_second": 7172.429
},
{
"epoch": 2.457696708622198,
"grad_norm": 0.9609375,
"learning_rate": 1.7095493415592284e-05,
"loss": 3.9413,
"num_input_tokens_seen": 598245376,
"step": 9130,
"train_runtime": 83408.8708,
"train_tokens_per_second": 7172.443
},
{
"epoch": 2.460389042202329,
"grad_norm": 0.96484375,
"learning_rate": 1.704381897788085e-05,
"loss": 3.9971,
"num_input_tokens_seen": 598900736,
"step": 9140,
"train_runtime": 83500.1884,
"train_tokens_per_second": 7172.448
},
{
"epoch": 2.4630813757824592,
"grad_norm": 0.99609375,
"learning_rate": 1.699218233524286e-05,
"loss": 3.9231,
"num_input_tokens_seen": 599556096,
"step": 9150,
"train_runtime": 83591.4932,
"train_tokens_per_second": 7172.453
},
{
"epoch": 2.46577370936259,
"grad_norm": 0.95703125,
"learning_rate": 1.6940583732973248e-05,
"loss": 4.0531,
"num_input_tokens_seen": 600211456,
"step": 9160,
"train_runtime": 83683.1001,
"train_tokens_per_second": 7172.433
},
{
"epoch": 2.4684660429427208,
"grad_norm": 0.953125,
"learning_rate": 1.6889023416186223e-05,
"loss": 3.9988,
"num_input_tokens_seen": 600866816,
"step": 9170,
"train_runtime": 83773.9429,
"train_tokens_per_second": 7172.479
},
{
"epoch": 2.471158376522851,
"grad_norm": 0.984375,
"learning_rate": 1.6837501629814088e-05,
"loss": 3.9686,
"num_input_tokens_seen": 601522176,
"step": 9180,
"train_runtime": 83866.3011,
"train_tokens_per_second": 7172.394
},
{
"epoch": 2.473850710102982,
"grad_norm": 1.0078125,
"learning_rate": 1.6786018618606163e-05,
"loss": 3.9876,
"num_input_tokens_seen": 602177536,
"step": 9190,
"train_runtime": 83957.9068,
"train_tokens_per_second": 7172.374
},
{
"epoch": 2.476543043683112,
"grad_norm": 0.9765625,
"learning_rate": 1.6734574627127524e-05,
"loss": 3.9547,
"num_input_tokens_seen": 602832896,
"step": 9200,
"train_runtime": 84049.3473,
"train_tokens_per_second": 7172.369
},
{
"epoch": 2.479235377263243,
"grad_norm": 0.96875,
"learning_rate": 1.6683169899757938e-05,
"loss": 4.0162,
"num_input_tokens_seen": 603488256,
"step": 9210,
"train_runtime": 84140.7588,
"train_tokens_per_second": 7172.365
},
{
"epoch": 2.4819277108433733,
"grad_norm": 0.97265625,
"learning_rate": 1.6631804680690594e-05,
"loss": 3.9212,
"num_input_tokens_seen": 604143616,
"step": 9220,
"train_runtime": 84232.6052,
"train_tokens_per_second": 7172.325
},
{
"epoch": 2.484620044423504,
"grad_norm": 0.96875,
"learning_rate": 1.658047921393104e-05,
"loss": 4.011,
"num_input_tokens_seen": 604798976,
"step": 9230,
"train_runtime": 84324.1924,
"train_tokens_per_second": 7172.307
},
{
"epoch": 2.487312378003635,
"grad_norm": 1.0,
"learning_rate": 1.652919374329599e-05,
"loss": 3.9753,
"num_input_tokens_seen": 605454336,
"step": 9240,
"train_runtime": 84415.5753,
"train_tokens_per_second": 7172.306
},
{
"epoch": 2.490004711583765,
"grad_norm": 1.0390625,
"learning_rate": 1.6477948512412115e-05,
"loss": 3.8978,
"num_input_tokens_seen": 606109696,
"step": 9250,
"train_runtime": 84507.2116,
"train_tokens_per_second": 7172.284
},
{
"epoch": 2.492697045163896,
"grad_norm": 1.03125,
"learning_rate": 1.6426743764714993e-05,
"loss": 4.1365,
"num_input_tokens_seen": 606765056,
"step": 9260,
"train_runtime": 84599.1369,
"train_tokens_per_second": 7172.237
},
{
"epoch": 2.495389378744026,
"grad_norm": 0.94921875,
"learning_rate": 1.6375579743447827e-05,
"loss": 3.9571,
"num_input_tokens_seen": 607420416,
"step": 9270,
"train_runtime": 84690.608,
"train_tokens_per_second": 7172.229
},
{
"epoch": 2.498081712324157,
"grad_norm": 0.984375,
"learning_rate": 1.6324456691660402e-05,
"loss": 3.9577,
"num_input_tokens_seen": 608075776,
"step": 9280,
"train_runtime": 84782.2711,
"train_tokens_per_second": 7172.204
},
{
"epoch": 2.5007740459042873,
"grad_norm": 0.9609375,
"learning_rate": 1.6273374852207862e-05,
"loss": 3.9495,
"num_input_tokens_seen": 608731136,
"step": 9290,
"train_runtime": 84874.7787,
"train_tokens_per_second": 7172.109
},
{
"epoch": 2.503466379484418,
"grad_norm": 1.046875,
"learning_rate": 1.622233446774957e-05,
"loss": 3.9344,
"num_input_tokens_seen": 609386496,
"step": 9300,
"train_runtime": 84966.6974,
"train_tokens_per_second": 7172.063
},
{
"epoch": 2.506158713064549,
"grad_norm": 0.97265625,
"learning_rate": 1.6171335780747982e-05,
"loss": 3.9531,
"num_input_tokens_seen": 610041856,
"step": 9310,
"train_runtime": 85058.3056,
"train_tokens_per_second": 7172.043
},
{
"epoch": 2.508851046644679,
"grad_norm": 0.94921875,
"learning_rate": 1.6120379033467435e-05,
"loss": 3.9955,
"num_input_tokens_seen": 610697216,
"step": 9320,
"train_runtime": 85150.3157,
"train_tokens_per_second": 7171.99
},
{
"epoch": 2.51154338022481,
"grad_norm": 0.98828125,
"learning_rate": 1.6069464467973093e-05,
"loss": 3.8716,
"num_input_tokens_seen": 611352576,
"step": 9330,
"train_runtime": 85242.2507,
"train_tokens_per_second": 7171.943
},
{
"epoch": 2.5142357138049407,
"grad_norm": 0.98046875,
"learning_rate": 1.6018592326129678e-05,
"loss": 3.924,
"num_input_tokens_seen": 612007936,
"step": 9340,
"train_runtime": 85333.8473,
"train_tokens_per_second": 7171.925
},
{
"epoch": 2.516928047385071,
"grad_norm": 0.98046875,
"learning_rate": 1.5967762849600422e-05,
"loss": 3.936,
"num_input_tokens_seen": 612663296,
"step": 9350,
"train_runtime": 85425.593,
"train_tokens_per_second": 7171.894
},
{
"epoch": 2.5196203809652014,
"grad_norm": 0.96484375,
"learning_rate": 1.5916976279845884e-05,
"loss": 3.9575,
"num_input_tokens_seen": 613318656,
"step": 9360,
"train_runtime": 85517.71,
"train_tokens_per_second": 7171.832
},
{
"epoch": 2.522312714545332,
"grad_norm": 0.97265625,
"learning_rate": 1.5866232858122758e-05,
"loss": 3.9254,
"num_input_tokens_seen": 613974016,
"step": 9370,
"train_runtime": 85609.6872,
"train_tokens_per_second": 7171.782
},
{
"epoch": 2.525005048125463,
"grad_norm": 0.96484375,
"learning_rate": 1.5815532825482822e-05,
"loss": 3.8467,
"num_input_tokens_seen": 614629376,
"step": 9380,
"train_runtime": 85701.5941,
"train_tokens_per_second": 7171.738
},
{
"epoch": 2.527697381705593,
"grad_norm": 0.96484375,
"learning_rate": 1.576487642277168e-05,
"loss": 3.9184,
"num_input_tokens_seen": 615284736,
"step": 9390,
"train_runtime": 85794.0017,
"train_tokens_per_second": 7171.652
},
{
"epoch": 2.530389715285724,
"grad_norm": 0.9375,
"learning_rate": 1.571426389062773e-05,
"loss": 3.9802,
"num_input_tokens_seen": 615940096,
"step": 9400,
"train_runtime": 85886.318,
"train_tokens_per_second": 7171.574
},
{
"epoch": 2.5330820488658548,
"grad_norm": 0.9765625,
"learning_rate": 1.566369546948092e-05,
"loss": 3.884,
"num_input_tokens_seen": 616595456,
"step": 9410,
"train_runtime": 85978.2226,
"train_tokens_per_second": 7171.531
},
{
"epoch": 2.535774382445985,
"grad_norm": 1.0234375,
"learning_rate": 1.561317139955169e-05,
"loss": 3.9143,
"num_input_tokens_seen": 617250816,
"step": 9420,
"train_runtime": 86070.5047,
"train_tokens_per_second": 7171.456
},
{
"epoch": 2.5384667160261154,
"grad_norm": 0.984375,
"learning_rate": 1.5562691920849786e-05,
"loss": 4.0689,
"num_input_tokens_seen": 617906176,
"step": 9430,
"train_runtime": 86161.6282,
"train_tokens_per_second": 7171.477
},
{
"epoch": 2.541159049606246,
"grad_norm": 0.9453125,
"learning_rate": 1.5512257273173102e-05,
"loss": 3.8768,
"num_input_tokens_seen": 618561536,
"step": 9440,
"train_runtime": 86253.1252,
"train_tokens_per_second": 7171.468
},
{
"epoch": 2.543851383186377,
"grad_norm": 1.0,
"learning_rate": 1.546186769610661e-05,
"loss": 4.0194,
"num_input_tokens_seen": 619216896,
"step": 9450,
"train_runtime": 86345.733,
"train_tokens_per_second": 7171.366
},
{
"epoch": 2.5465437167665073,
"grad_norm": 0.96875,
"learning_rate": 1.5411523429021123e-05,
"loss": 3.976,
"num_input_tokens_seen": 619872256,
"step": 9460,
"train_runtime": 86437.2789,
"train_tokens_per_second": 7171.353
},
{
"epoch": 2.549236050346638,
"grad_norm": 0.9765625,
"learning_rate": 1.5361224711072277e-05,
"loss": 3.9104,
"num_input_tokens_seen": 620527616,
"step": 9470,
"train_runtime": 86529.5616,
"train_tokens_per_second": 7171.279
},
{
"epoch": 2.551928383926769,
"grad_norm": 0.9609375,
"learning_rate": 1.5310971781199273e-05,
"loss": 3.8955,
"num_input_tokens_seen": 621182976,
"step": 9480,
"train_runtime": 86621.4372,
"train_tokens_per_second": 7171.238
},
{
"epoch": 2.554620717506899,
"grad_norm": 0.96484375,
"learning_rate": 1.5260764878123833e-05,
"loss": 3.9429,
"num_input_tokens_seen": 621838336,
"step": 9490,
"train_runtime": 86712.9665,
"train_tokens_per_second": 7171.227
},
{
"epoch": 2.5573130510870294,
"grad_norm": 0.9609375,
"learning_rate": 1.5210604240349042e-05,
"loss": 3.921,
"num_input_tokens_seen": 622493696,
"step": 9500,
"train_runtime": 86805.012,
"train_tokens_per_second": 7171.172
},
{
"epoch": 2.56000538466716,
"grad_norm": 0.96875,
"learning_rate": 1.5160490106158164e-05,
"loss": 3.9021,
"num_input_tokens_seen": 623149056,
"step": 9510,
"train_runtime": 86914.2944,
"train_tokens_per_second": 7169.696
},
{
"epoch": 2.562697718247291,
"grad_norm": 0.94921875,
"learning_rate": 1.5110422713613603e-05,
"loss": 3.9917,
"num_input_tokens_seen": 623804416,
"step": 9520,
"train_runtime": 87006.6394,
"train_tokens_per_second": 7169.619
},
{
"epoch": 2.5653900518274213,
"grad_norm": 0.96875,
"learning_rate": 1.5060402300555677e-05,
"loss": 3.9477,
"num_input_tokens_seen": 624459776,
"step": 9530,
"train_runtime": 87098.3272,
"train_tokens_per_second": 7169.596
},
{
"epoch": 2.568082385407552,
"grad_norm": 1.0,
"learning_rate": 1.5010429104601565e-05,
"loss": 3.9293,
"num_input_tokens_seen": 625115136,
"step": 9540,
"train_runtime": 87190.5284,
"train_tokens_per_second": 7169.53
},
{
"epoch": 2.570774718987683,
"grad_norm": 0.9765625,
"learning_rate": 1.4960503363144116e-05,
"loss": 3.9775,
"num_input_tokens_seen": 625770496,
"step": 9550,
"train_runtime": 87282.3472,
"train_tokens_per_second": 7169.497
},
{
"epoch": 2.573467052567813,
"grad_norm": 0.9609375,
"learning_rate": 1.4910625313350778e-05,
"loss": 3.9451,
"num_input_tokens_seen": 626425856,
"step": 9560,
"train_runtime": 87374.4718,
"train_tokens_per_second": 7169.438
},
{
"epoch": 2.5761593861479435,
"grad_norm": 0.91796875,
"learning_rate": 1.486079519216245e-05,
"loss": 4.0295,
"num_input_tokens_seen": 627081216,
"step": 9570,
"train_runtime": 87467.3126,
"train_tokens_per_second": 7169.321
},
{
"epoch": 2.5788517197280743,
"grad_norm": 0.96875,
"learning_rate": 1.4811013236292304e-05,
"loss": 3.9548,
"num_input_tokens_seen": 627736576,
"step": 9580,
"train_runtime": 87558.703,
"train_tokens_per_second": 7169.322
},
{
"epoch": 2.581544053308205,
"grad_norm": 0.98046875,
"learning_rate": 1.4761279682224765e-05,
"loss": 3.96,
"num_input_tokens_seen": 628391936,
"step": 9590,
"train_runtime": 87650.9007,
"train_tokens_per_second": 7169.258
},
{
"epoch": 2.5842363868883353,
"grad_norm": 0.98046875,
"learning_rate": 1.4711594766214281e-05,
"loss": 4.0199,
"num_input_tokens_seen": 629047296,
"step": 9600,
"train_runtime": 87743.0317,
"train_tokens_per_second": 7169.199
},
{
"epoch": 2.586928720468466,
"grad_norm": 0.95703125,
"learning_rate": 1.4661958724284292e-05,
"loss": 3.8755,
"num_input_tokens_seen": 629702656,
"step": 9610,
"train_runtime": 87835.7675,
"train_tokens_per_second": 7169.092
},
{
"epoch": 2.589621054048597,
"grad_norm": 1.0,
"learning_rate": 1.4612371792226026e-05,
"loss": 3.9478,
"num_input_tokens_seen": 630358016,
"step": 9620,
"train_runtime": 87927.4765,
"train_tokens_per_second": 7169.068
},
{
"epoch": 2.592313387628727,
"grad_norm": 0.97265625,
"learning_rate": 1.456283420559745e-05,
"loss": 3.9678,
"num_input_tokens_seen": 631013376,
"step": 9630,
"train_runtime": 88019.1614,
"train_tokens_per_second": 7169.046
},
{
"epoch": 2.5950057212088575,
"grad_norm": 0.93359375,
"learning_rate": 1.4513346199722112e-05,
"loss": 3.9617,
"num_input_tokens_seen": 631668736,
"step": 9640,
"train_runtime": 88111.4148,
"train_tokens_per_second": 7168.977
},
{
"epoch": 2.5976980547889883,
"grad_norm": 0.9765625,
"learning_rate": 1.4463908009688021e-05,
"loss": 3.936,
"num_input_tokens_seen": 632324096,
"step": 9650,
"train_runtime": 88203.1555,
"train_tokens_per_second": 7168.951
},
{
"epoch": 2.600390388369119,
"grad_norm": 0.9453125,
"learning_rate": 1.4414519870346554e-05,
"loss": 3.9889,
"num_input_tokens_seen": 632979456,
"step": 9660,
"train_runtime": 88295.399,
"train_tokens_per_second": 7168.884
},
{
"epoch": 2.6030827219492494,
"grad_norm": 0.94921875,
"learning_rate": 1.4365182016311319e-05,
"loss": 3.9093,
"num_input_tokens_seen": 633634816,
"step": 9670,
"train_runtime": 88387.9014,
"train_tokens_per_second": 7168.796
},
{
"epoch": 2.60577505552938,
"grad_norm": 1.0078125,
"learning_rate": 1.4315894681957037e-05,
"loss": 3.9056,
"num_input_tokens_seen": 634290176,
"step": 9680,
"train_runtime": 88479.8584,
"train_tokens_per_second": 7168.752
},
{
"epoch": 2.608467389109511,
"grad_norm": 1.0078125,
"learning_rate": 1.4266658101418496e-05,
"loss": 4.0172,
"num_input_tokens_seen": 634945536,
"step": 9690,
"train_runtime": 88571.9004,
"train_tokens_per_second": 7168.702
},
{
"epoch": 2.6111597226896412,
"grad_norm": 0.97265625,
"learning_rate": 1.4217472508589286e-05,
"loss": 3.9201,
"num_input_tokens_seen": 635600896,
"step": 9700,
"train_runtime": 88663.5355,
"train_tokens_per_second": 7168.684
},
{
"epoch": 2.6138520562697716,
"grad_norm": 0.9765625,
"learning_rate": 1.4168338137120878e-05,
"loss": 3.895,
"num_input_tokens_seen": 636256256,
"step": 9710,
"train_runtime": 88755.9511,
"train_tokens_per_second": 7168.604
},
{
"epoch": 2.6165443898499023,
"grad_norm": 0.95703125,
"learning_rate": 1.4119255220421374e-05,
"loss": 3.9592,
"num_input_tokens_seen": 636911616,
"step": 9720,
"train_runtime": 88847.8034,
"train_tokens_per_second": 7168.569
},
{
"epoch": 2.619236723430033,
"grad_norm": 1.03125,
"learning_rate": 1.4070223991654452e-05,
"loss": 3.9153,
"num_input_tokens_seen": 637566976,
"step": 9730,
"train_runtime": 88939.424,
"train_tokens_per_second": 7168.553
},
{
"epoch": 2.6219290570101634,
"grad_norm": 0.94921875,
"learning_rate": 1.4021244683738249e-05,
"loss": 3.9623,
"num_input_tokens_seen": 638222336,
"step": 9740,
"train_runtime": 89031.2726,
"train_tokens_per_second": 7168.519
},
{
"epoch": 2.624621390590294,
"grad_norm": 0.984375,
"learning_rate": 1.3972317529344265e-05,
"loss": 3.9298,
"num_input_tokens_seen": 638877696,
"step": 9750,
"train_runtime": 89123.2096,
"train_tokens_per_second": 7168.477
},
{
"epoch": 2.627313724170425,
"grad_norm": 0.9609375,
"learning_rate": 1.3923442760896244e-05,
"loss": 3.9353,
"num_input_tokens_seen": 639533056,
"step": 9760,
"train_runtime": 89215.2725,
"train_tokens_per_second": 7168.426
},
{
"epoch": 2.6300060577505553,
"grad_norm": 0.96875,
"learning_rate": 1.3874620610569078e-05,
"loss": 3.9251,
"num_input_tokens_seen": 640188416,
"step": 9770,
"train_runtime": 89307.6683,
"train_tokens_per_second": 7168.348
},
{
"epoch": 2.6326983913306856,
"grad_norm": 0.95703125,
"learning_rate": 1.38258513102877e-05,
"loss": 3.8596,
"num_input_tokens_seen": 640843776,
"step": 9780,
"train_runtime": 89399.5391,
"train_tokens_per_second": 7168.312
},
{
"epoch": 2.6353907249108164,
"grad_norm": 1.0,
"learning_rate": 1.3777135091725985e-05,
"loss": 4.015,
"num_input_tokens_seen": 641499136,
"step": 9790,
"train_runtime": 89491.8769,
"train_tokens_per_second": 7168.239
},
{
"epoch": 2.638083058490947,
"grad_norm": 0.96875,
"learning_rate": 1.372847218630565e-05,
"loss": 3.9325,
"num_input_tokens_seen": 642154496,
"step": 9800,
"train_runtime": 89583.7249,
"train_tokens_per_second": 7168.205
},
{
"epoch": 2.6407753920710775,
"grad_norm": 0.96875,
"learning_rate": 1.367986282519515e-05,
"loss": 3.9479,
"num_input_tokens_seen": 642809856,
"step": 9810,
"train_runtime": 89676.1861,
"train_tokens_per_second": 7168.122
},
{
"epoch": 2.6434677256512082,
"grad_norm": 0.9765625,
"learning_rate": 1.3631307239308575e-05,
"loss": 3.9335,
"num_input_tokens_seen": 643465216,
"step": 9820,
"train_runtime": 89767.6734,
"train_tokens_per_second": 7168.117
},
{
"epoch": 2.646160059231339,
"grad_norm": 1.0234375,
"learning_rate": 1.3582805659304598e-05,
"loss": 4.004,
"num_input_tokens_seen": 644120576,
"step": 9830,
"train_runtime": 89859.7834,
"train_tokens_per_second": 7168.063
},
{
"epoch": 2.6488523928114693,
"grad_norm": 1.0234375,
"learning_rate": 1.3534358315585278e-05,
"loss": 3.9324,
"num_input_tokens_seen": 644775936,
"step": 9840,
"train_runtime": 89951.7227,
"train_tokens_per_second": 7168.022
},
{
"epoch": 2.6515447263915997,
"grad_norm": 0.9453125,
"learning_rate": 1.3485965438295095e-05,
"loss": 3.879,
"num_input_tokens_seen": 645431296,
"step": 9850,
"train_runtime": 90044.1085,
"train_tokens_per_second": 7167.946
},
{
"epoch": 2.6542370599717304,
"grad_norm": 0.9375,
"learning_rate": 1.3437627257319752e-05,
"loss": 3.9483,
"num_input_tokens_seen": 646086656,
"step": 9860,
"train_runtime": 90136.1279,
"train_tokens_per_second": 7167.899
},
{
"epoch": 2.656929393551861,
"grad_norm": 0.9921875,
"learning_rate": 1.3389344002285132e-05,
"loss": 3.9728,
"num_input_tokens_seen": 646742016,
"step": 9870,
"train_runtime": 90228.3459,
"train_tokens_per_second": 7167.836
},
{
"epoch": 2.6596217271319915,
"grad_norm": 0.94140625,
"learning_rate": 1.33411159025562e-05,
"loss": 3.9763,
"num_input_tokens_seen": 647397376,
"step": 9880,
"train_runtime": 90320.3635,
"train_tokens_per_second": 7167.79
},
{
"epoch": 2.6623140607121223,
"grad_norm": 0.9375,
"learning_rate": 1.329294318723591e-05,
"loss": 3.9931,
"num_input_tokens_seen": 648052736,
"step": 9890,
"train_runtime": 90412.7519,
"train_tokens_per_second": 7167.714
},
{
"epoch": 2.665006394292253,
"grad_norm": 0.94921875,
"learning_rate": 1.3244826085164116e-05,
"loss": 3.9671,
"num_input_tokens_seen": 648708096,
"step": 9900,
"train_runtime": 90504.3501,
"train_tokens_per_second": 7167.701
},
{
"epoch": 2.6676987278723834,
"grad_norm": 0.9765625,
"learning_rate": 1.319676482491649e-05,
"loss": 3.8095,
"num_input_tokens_seen": 649363456,
"step": 9910,
"train_runtime": 90596.3941,
"train_tokens_per_second": 7167.652
},
{
"epoch": 2.6703910614525137,
"grad_norm": 1.0078125,
"learning_rate": 1.3148759634803428e-05,
"loss": 3.8765,
"num_input_tokens_seen": 650018816,
"step": 9920,
"train_runtime": 90687.7916,
"train_tokens_per_second": 7167.655
},
{
"epoch": 2.6730833950326445,
"grad_norm": 1.0,
"learning_rate": 1.3100810742868975e-05,
"loss": 3.9887,
"num_input_tokens_seen": 650674176,
"step": 9930,
"train_runtime": 90779.4533,
"train_tokens_per_second": 7167.637
},
{
"epoch": 2.6757757286127752,
"grad_norm": 0.9765625,
"learning_rate": 1.3052918376889737e-05,
"loss": 3.8924,
"num_input_tokens_seen": 651329536,
"step": 9940,
"train_runtime": 90871.6076,
"train_tokens_per_second": 7167.58
},
{
"epoch": 2.6784680621929056,
"grad_norm": 0.9609375,
"learning_rate": 1.3005082764373791e-05,
"loss": 3.9264,
"num_input_tokens_seen": 651984896,
"step": 9950,
"train_runtime": 90963.4747,
"train_tokens_per_second": 7167.546
},
{
"epoch": 2.6811603957730363,
"grad_norm": 0.94140625,
"learning_rate": 1.295730413255961e-05,
"loss": 3.8486,
"num_input_tokens_seen": 652640256,
"step": 9960,
"train_runtime": 91055.2698,
"train_tokens_per_second": 7167.518
},
{
"epoch": 2.683852729353167,
"grad_norm": 0.98828125,
"learning_rate": 1.290958270841503e-05,
"loss": 4.0193,
"num_input_tokens_seen": 653295616,
"step": 9970,
"train_runtime": 91146.9224,
"train_tokens_per_second": 7167.501
},
{
"epoch": 2.6865450629332974,
"grad_norm": 0.98046875,
"learning_rate": 1.286191871863604e-05,
"loss": 3.9871,
"num_input_tokens_seen": 653950976,
"step": 9980,
"train_runtime": 91239.4581,
"train_tokens_per_second": 7167.414
},
{
"epoch": 2.6892373965134277,
"grad_norm": 0.94140625,
"learning_rate": 1.2814312389645882e-05,
"loss": 4.0445,
"num_input_tokens_seen": 654606336,
"step": 9990,
"train_runtime": 91330.7094,
"train_tokens_per_second": 7167.429
},
{
"epoch": 2.6919297300935585,
"grad_norm": 1.015625,
"learning_rate": 1.2766763947593835e-05,
"loss": 3.9427,
"num_input_tokens_seen": 655261696,
"step": 10000,
"train_runtime": 91422.3951,
"train_tokens_per_second": 7167.409
},
{
"epoch": 2.6946220636736893,
"grad_norm": 0.9609375,
"learning_rate": 1.2719273618354206e-05,
"loss": 3.9393,
"num_input_tokens_seen": 655917056,
"step": 10010,
"train_runtime": 91532.3928,
"train_tokens_per_second": 7165.956
},
{
"epoch": 2.6973143972538196,
"grad_norm": 1.03125,
"learning_rate": 1.2671841627525235e-05,
"loss": 4.0016,
"num_input_tokens_seen": 656572416,
"step": 10020,
"train_runtime": 91623.8029,
"train_tokens_per_second": 7165.959
},
{
"epoch": 2.7000067308339504,
"grad_norm": 1.0234375,
"learning_rate": 1.2624468200428041e-05,
"loss": 4.0109,
"num_input_tokens_seen": 657227776,
"step": 10030,
"train_runtime": 91715.694,
"train_tokens_per_second": 7165.925
},
{
"epoch": 2.702699064414081,
"grad_norm": 0.9765625,
"learning_rate": 1.257715356210554e-05,
"loss": 3.8675,
"num_input_tokens_seen": 657883136,
"step": 10040,
"train_runtime": 91807.8194,
"train_tokens_per_second": 7165.873
},
{
"epoch": 2.7053913979942115,
"grad_norm": 0.953125,
"learning_rate": 1.2529897937321369e-05,
"loss": 3.834,
"num_input_tokens_seen": 658538496,
"step": 10050,
"train_runtime": 91900.0271,
"train_tokens_per_second": 7165.814
},
{
"epoch": 2.708083731574342,
"grad_norm": 0.93359375,
"learning_rate": 1.2482701550558837e-05,
"loss": 3.8913,
"num_input_tokens_seen": 659193856,
"step": 10060,
"train_runtime": 91991.8394,
"train_tokens_per_second": 7165.786
},
{
"epoch": 2.7107760651544726,
"grad_norm": 0.94921875,
"learning_rate": 1.2435564626019844e-05,
"loss": 4.0047,
"num_input_tokens_seen": 659849216,
"step": 10070,
"train_runtime": 92083.4627,
"train_tokens_per_second": 7165.773
},
{
"epoch": 2.7134683987346033,
"grad_norm": 0.95703125,
"learning_rate": 1.2388487387623813e-05,
"loss": 4.1109,
"num_input_tokens_seen": 660504576,
"step": 10080,
"train_runtime": 92175.2186,
"train_tokens_per_second": 7165.75
},
{
"epoch": 2.7161607323147337,
"grad_norm": 0.98828125,
"learning_rate": 1.2341470059006663e-05,
"loss": 3.9107,
"num_input_tokens_seen": 661159936,
"step": 10090,
"train_runtime": 92267.2511,
"train_tokens_per_second": 7165.705
},
{
"epoch": 2.7188530658948644,
"grad_norm": 0.97265625,
"learning_rate": 1.2294512863519666e-05,
"loss": 3.924,
"num_input_tokens_seen": 661815296,
"step": 10100,
"train_runtime": 92359.1828,
"train_tokens_per_second": 7165.669
},
{
"epoch": 2.721545399474995,
"grad_norm": 0.9375,
"learning_rate": 1.2247616024228491e-05,
"loss": 3.9895,
"num_input_tokens_seen": 662470656,
"step": 10110,
"train_runtime": 92450.6455,
"train_tokens_per_second": 7165.668
},
{
"epoch": 2.7242377330551255,
"grad_norm": 1.0,
"learning_rate": 1.220077976391206e-05,
"loss": 4.095,
"num_input_tokens_seen": 663126016,
"step": 10120,
"train_runtime": 92542.7132,
"train_tokens_per_second": 7165.621
},
{
"epoch": 2.726930066635256,
"grad_norm": 0.9609375,
"learning_rate": 1.2154004305061525e-05,
"loss": 3.9587,
"num_input_tokens_seen": 663781376,
"step": 10130,
"train_runtime": 92634.7173,
"train_tokens_per_second": 7165.579
},
{
"epoch": 2.7296224002153866,
"grad_norm": 0.94140625,
"learning_rate": 1.210728986987921e-05,
"loss": 3.8686,
"num_input_tokens_seen": 664436736,
"step": 10140,
"train_runtime": 92726.4649,
"train_tokens_per_second": 7165.557
},
{
"epoch": 2.7323147337955174,
"grad_norm": 0.9609375,
"learning_rate": 1.2060636680277548e-05,
"loss": 3.7919,
"num_input_tokens_seen": 665092096,
"step": 10150,
"train_runtime": 92818.6506,
"train_tokens_per_second": 7165.501
},
{
"epoch": 2.7350070673756477,
"grad_norm": 0.9765625,
"learning_rate": 1.201404495787804e-05,
"loss": 3.8285,
"num_input_tokens_seen": 665747456,
"step": 10160,
"train_runtime": 92910.6656,
"train_tokens_per_second": 7165.458
},
{
"epoch": 2.7376994009557785,
"grad_norm": 0.9296875,
"learning_rate": 1.1967514924010182e-05,
"loss": 3.798,
"num_input_tokens_seen": 666402816,
"step": 10170,
"train_runtime": 93002.6363,
"train_tokens_per_second": 7165.419
},
{
"epoch": 2.7403917345359092,
"grad_norm": 0.9453125,
"learning_rate": 1.1921046799710425e-05,
"loss": 3.8745,
"num_input_tokens_seen": 667058176,
"step": 10180,
"train_runtime": 93094.608,
"train_tokens_per_second": 7165.379
},
{
"epoch": 2.7430840681160396,
"grad_norm": 1.0,
"learning_rate": 1.1874640805721137e-05,
"loss": 3.9656,
"num_input_tokens_seen": 667713536,
"step": 10190,
"train_runtime": 93187.4885,
"train_tokens_per_second": 7165.27
},
{
"epoch": 2.7457764016961703,
"grad_norm": 0.96484375,
"learning_rate": 1.1828297162489529e-05,
"loss": 3.9262,
"num_input_tokens_seen": 668368896,
"step": 10200,
"train_runtime": 93279.158,
"train_tokens_per_second": 7165.254
},
{
"epoch": 2.7484687352763006,
"grad_norm": 0.95703125,
"learning_rate": 1.1782016090166622e-05,
"loss": 3.9285,
"num_input_tokens_seen": 669024256,
"step": 10210,
"train_runtime": 93371.5394,
"train_tokens_per_second": 7165.184
},
{
"epoch": 2.7511610688564314,
"grad_norm": 0.921875,
"learning_rate": 1.17357978086062e-05,
"loss": 3.9583,
"num_input_tokens_seen": 669679616,
"step": 10220,
"train_runtime": 93463.6235,
"train_tokens_per_second": 7165.136
},
{
"epoch": 2.7538534024365617,
"grad_norm": 0.94921875,
"learning_rate": 1.1689642537363796e-05,
"loss": 3.9728,
"num_input_tokens_seen": 670334976,
"step": 10230,
"train_runtime": 93556.1642,
"train_tokens_per_second": 7165.054
},
{
"epoch": 2.7565457360166925,
"grad_norm": 0.96484375,
"learning_rate": 1.1643550495695549e-05,
"loss": 3.9459,
"num_input_tokens_seen": 670990336,
"step": 10240,
"train_runtime": 93648.1394,
"train_tokens_per_second": 7165.015
},
{
"epoch": 2.7592380695968233,
"grad_norm": 0.9453125,
"learning_rate": 1.1597521902557303e-05,
"loss": 4.0197,
"num_input_tokens_seen": 671645696,
"step": 10250,
"train_runtime": 93739.8958,
"train_tokens_per_second": 7164.993
},
{
"epoch": 2.7619304031769536,
"grad_norm": 0.94140625,
"learning_rate": 1.1551556976603461e-05,
"loss": 3.9275,
"num_input_tokens_seen": 672301056,
"step": 10260,
"train_runtime": 93831.7229,
"train_tokens_per_second": 7164.965
},
{
"epoch": 2.7646227367570844,
"grad_norm": 0.9921875,
"learning_rate": 1.1505655936185985e-05,
"loss": 3.8451,
"num_input_tokens_seen": 672956416,
"step": 10270,
"train_runtime": 93923.4861,
"train_tokens_per_second": 7164.943
},
{
"epoch": 2.7673150703372147,
"grad_norm": 0.96484375,
"learning_rate": 1.1459818999353358e-05,
"loss": 3.9533,
"num_input_tokens_seen": 673611776,
"step": 10280,
"train_runtime": 94015.5906,
"train_tokens_per_second": 7164.894
},
{
"epoch": 2.7700074039173455,
"grad_norm": 0.99609375,
"learning_rate": 1.1414046383849545e-05,
"loss": 3.8752,
"num_input_tokens_seen": 674267136,
"step": 10290,
"train_runtime": 94107.2512,
"train_tokens_per_second": 7164.88
},
{
"epoch": 2.772699737497476,
"grad_norm": 0.97265625,
"learning_rate": 1.1368338307112955e-05,
"loss": 3.9406,
"num_input_tokens_seen": 674922496,
"step": 10300,
"train_runtime": 94198.9926,
"train_tokens_per_second": 7164.859
},
{
"epoch": 2.7753920710776065,
"grad_norm": 0.93359375,
"learning_rate": 1.1322694986275414e-05,
"loss": 3.9558,
"num_input_tokens_seen": 675577856,
"step": 10310,
"train_runtime": 94291.3535,
"train_tokens_per_second": 7164.791
},
{
"epoch": 2.7780844046577373,
"grad_norm": 0.9609375,
"learning_rate": 1.1277116638161136e-05,
"loss": 3.9761,
"num_input_tokens_seen": 676233216,
"step": 10320,
"train_runtime": 94383.5654,
"train_tokens_per_second": 7164.735
},
{
"epoch": 2.7807767382378676,
"grad_norm": 0.95703125,
"learning_rate": 1.1231603479285683e-05,
"loss": 4.0315,
"num_input_tokens_seen": 676888576,
"step": 10330,
"train_runtime": 94475.3802,
"train_tokens_per_second": 7164.709
},
{
"epoch": 2.7834690718179984,
"grad_norm": 0.94140625,
"learning_rate": 1.1186155725854942e-05,
"loss": 3.8553,
"num_input_tokens_seen": 677543936,
"step": 10340,
"train_runtime": 94568.0217,
"train_tokens_per_second": 7164.62
},
{
"epoch": 2.7861614053981287,
"grad_norm": 0.921875,
"learning_rate": 1.1140773593764099e-05,
"loss": 3.8506,
"num_input_tokens_seen": 678199296,
"step": 10350,
"train_runtime": 94659.8542,
"train_tokens_per_second": 7164.593
},
{
"epoch": 2.7888537389782595,
"grad_norm": 0.91796875,
"learning_rate": 1.1095457298596598e-05,
"loss": 3.8205,
"num_input_tokens_seen": 678854656,
"step": 10360,
"train_runtime": 94752.2918,
"train_tokens_per_second": 7164.52
},
{
"epoch": 2.79154607255839,
"grad_norm": 1.03125,
"learning_rate": 1.1050207055623182e-05,
"loss": 3.9934,
"num_input_tokens_seen": 679510016,
"step": 10370,
"train_runtime": 94843.5667,
"train_tokens_per_second": 7164.535
},
{
"epoch": 2.7942384061385206,
"grad_norm": 0.9375,
"learning_rate": 1.100502307980074e-05,
"loss": 3.9097,
"num_input_tokens_seen": 680165376,
"step": 10380,
"train_runtime": 94935.0022,
"train_tokens_per_second": 7164.537
},
{
"epoch": 2.7969307397186514,
"grad_norm": 0.93359375,
"learning_rate": 1.0959905585771435e-05,
"loss": 3.9756,
"num_input_tokens_seen": 680820736,
"step": 10390,
"train_runtime": 95026.9066,
"train_tokens_per_second": 7164.505
},
{
"epoch": 2.7996230732987817,
"grad_norm": 0.96875,
"learning_rate": 1.0914854787861579e-05,
"loss": 3.9535,
"num_input_tokens_seen": 681476096,
"step": 10400,
"train_runtime": 95118.9361,
"train_tokens_per_second": 7164.463
},
{
"epoch": 2.8023154068789125,
"grad_norm": 0.953125,
"learning_rate": 1.086987090008066e-05,
"loss": 3.8648,
"num_input_tokens_seen": 682131456,
"step": 10410,
"train_runtime": 95210.7981,
"train_tokens_per_second": 7164.434
},
{
"epoch": 2.8050077404590428,
"grad_norm": 0.97265625,
"learning_rate": 1.082495413612031e-05,
"loss": 3.8493,
"num_input_tokens_seen": 682786816,
"step": 10420,
"train_runtime": 95302.6719,
"train_tokens_per_second": 7164.404
},
{
"epoch": 2.8077000740391735,
"grad_norm": 0.99609375,
"learning_rate": 1.0780104709353306e-05,
"loss": 4.052,
"num_input_tokens_seen": 683442176,
"step": 10430,
"train_runtime": 95394.912,
"train_tokens_per_second": 7164.346
},
{
"epoch": 2.810392407619304,
"grad_norm": 0.9609375,
"learning_rate": 1.0735322832832534e-05,
"loss": 3.9188,
"num_input_tokens_seen": 684097536,
"step": 10440,
"train_runtime": 95486.7033,
"train_tokens_per_second": 7164.322
},
{
"epoch": 2.8130847411994346,
"grad_norm": 0.953125,
"learning_rate": 1.0690608719290002e-05,
"loss": 3.9258,
"num_input_tokens_seen": 684752896,
"step": 10450,
"train_runtime": 95578.2682,
"train_tokens_per_second": 7164.316
},
{
"epoch": 2.8157770747795654,
"grad_norm": 0.9609375,
"learning_rate": 1.0645962581135807e-05,
"loss": 3.9355,
"num_input_tokens_seen": 685408256,
"step": 10460,
"train_runtime": 95670.2538,
"train_tokens_per_second": 7164.278
},
{
"epoch": 2.8184694083596957,
"grad_norm": 0.953125,
"learning_rate": 1.0601384630457139e-05,
"loss": 3.9595,
"num_input_tokens_seen": 686063616,
"step": 10470,
"train_runtime": 95761.844,
"train_tokens_per_second": 7164.269
},
{
"epoch": 2.8211617419398265,
"grad_norm": 1.0234375,
"learning_rate": 1.055687507901726e-05,
"loss": 3.8596,
"num_input_tokens_seen": 686718976,
"step": 10480,
"train_runtime": 95854.3168,
"train_tokens_per_second": 7164.195
},
{
"epoch": 2.823854075519957,
"grad_norm": 0.95703125,
"learning_rate": 1.0512434138254543e-05,
"loss": 3.9505,
"num_input_tokens_seen": 687374336,
"step": 10490,
"train_runtime": 95946.6439,
"train_tokens_per_second": 7164.131
},
{
"epoch": 2.8265464091000876,
"grad_norm": 0.9296875,
"learning_rate": 1.0468062019281375e-05,
"loss": 3.9638,
"num_input_tokens_seen": 688029696,
"step": 10500,
"train_runtime": 96038.3957,
"train_tokens_per_second": 7164.111
},
{
"epoch": 2.829238742680218,
"grad_norm": 0.96484375,
"learning_rate": 1.0423758932883274e-05,
"loss": 3.9239,
"num_input_tokens_seen": 688685056,
"step": 10510,
"train_runtime": 96147.6978,
"train_tokens_per_second": 7162.783
},
{
"epoch": 2.8319310762603487,
"grad_norm": 1.015625,
"learning_rate": 1.0379525089517762e-05,
"loss": 4.0036,
"num_input_tokens_seen": 689340416,
"step": 10520,
"train_runtime": 96239.3496,
"train_tokens_per_second": 7162.771
},
{
"epoch": 2.8346234098404794,
"grad_norm": 0.99609375,
"learning_rate": 1.0335360699313488e-05,
"loss": 3.996,
"num_input_tokens_seen": 689995776,
"step": 10530,
"train_runtime": 96331.657,
"train_tokens_per_second": 7162.711
},
{
"epoch": 2.8373157434206098,
"grad_norm": 1.0,
"learning_rate": 1.0291265972069136e-05,
"loss": 3.9246,
"num_input_tokens_seen": 690651136,
"step": 10540,
"train_runtime": 96423.5509,
"train_tokens_per_second": 7162.681
},
{
"epoch": 2.8400080770007405,
"grad_norm": 1.0234375,
"learning_rate": 1.024724111725247e-05,
"loss": 3.8981,
"num_input_tokens_seen": 691306496,
"step": 10550,
"train_runtime": 96515.4576,
"train_tokens_per_second": 7162.651
},
{
"epoch": 2.842700410580871,
"grad_norm": 0.9609375,
"learning_rate": 1.0203286343999336e-05,
"loss": 3.8913,
"num_input_tokens_seen": 691961856,
"step": 10560,
"train_runtime": 96608.1186,
"train_tokens_per_second": 7162.564
},
{
"epoch": 2.8453927441610016,
"grad_norm": 0.97265625,
"learning_rate": 1.0159401861112652e-05,
"loss": 3.9764,
"num_input_tokens_seen": 692617216,
"step": 10570,
"train_runtime": 96700.1482,
"train_tokens_per_second": 7162.525
},
{
"epoch": 2.848085077741132,
"grad_norm": 1.0234375,
"learning_rate": 1.0115587877061447e-05,
"loss": 3.848,
"num_input_tokens_seen": 693272576,
"step": 10580,
"train_runtime": 96792.4871,
"train_tokens_per_second": 7162.463
},
{
"epoch": 2.8507774113212627,
"grad_norm": 0.9140625,
"learning_rate": 1.0071844599979838e-05,
"loss": 3.9022,
"num_input_tokens_seen": 693927936,
"step": 10590,
"train_runtime": 96884.7813,
"train_tokens_per_second": 7162.404
},
{
"epoch": 2.8534697449013935,
"grad_norm": 0.96484375,
"learning_rate": 1.0028172237666061e-05,
"loss": 3.9427,
"num_input_tokens_seen": 694583296,
"step": 10600,
"train_runtime": 96977.1597,
"train_tokens_per_second": 7162.339
},
{
"epoch": 2.856162078481524,
"grad_norm": 0.99609375,
"learning_rate": 9.984570997581475e-06,
"loss": 3.9405,
"num_input_tokens_seen": 695238656,
"step": 10610,
"train_runtime": 97069.4636,
"train_tokens_per_second": 7162.28
},
{
"epoch": 2.8588544120616546,
"grad_norm": 0.9609375,
"learning_rate": 9.94104108684957e-06,
"loss": 3.8885,
"num_input_tokens_seen": 695894016,
"step": 10620,
"train_runtime": 97161.1311,
"train_tokens_per_second": 7162.268
},
{
"epoch": 2.861546745641785,
"grad_norm": 0.9765625,
"learning_rate": 9.897582712255037e-06,
"loss": 4.0121,
"num_input_tokens_seen": 696549376,
"step": 10630,
"train_runtime": 97253.0394,
"train_tokens_per_second": 7162.238
},
{
"epoch": 2.8642390792219157,
"grad_norm": 0.97265625,
"learning_rate": 9.854196080242672e-06,
"loss": 4.0188,
"num_input_tokens_seen": 697204736,
"step": 10640,
"train_runtime": 97345.146,
"train_tokens_per_second": 7162.193
},
{
"epoch": 2.866931412802046,
"grad_norm": 0.96875,
"learning_rate": 9.810881396916535e-06,
"loss": 4.0005,
"num_input_tokens_seen": 697860096,
"step": 10650,
"train_runtime": 97436.9304,
"train_tokens_per_second": 7162.172
},
{
"epoch": 2.8696237463821768,
"grad_norm": 0.93359375,
"learning_rate": 9.76763886803883e-06,
"loss": 3.8468,
"num_input_tokens_seen": 698515456,
"step": 10660,
"train_runtime": 97528.7654,
"train_tokens_per_second": 7162.148
},
{
"epoch": 2.8723160799623075,
"grad_norm": 0.91015625,
"learning_rate": 9.724468699029068e-06,
"loss": 3.9389,
"num_input_tokens_seen": 699170816,
"step": 10670,
"train_runtime": 97621.254,
"train_tokens_per_second": 7162.076
},
{
"epoch": 2.875008413542438,
"grad_norm": 0.9453125,
"learning_rate": 9.681371094962974e-06,
"loss": 3.9357,
"num_input_tokens_seen": 699826176,
"step": 10680,
"train_runtime": 97713.0596,
"train_tokens_per_second": 7162.054
},
{
"epoch": 2.8777007471225686,
"grad_norm": 0.95703125,
"learning_rate": 9.638346260571576e-06,
"loss": 3.9306,
"num_input_tokens_seen": 700481536,
"step": 10690,
"train_runtime": 97805.1444,
"train_tokens_per_second": 7162.011
},
{
"epoch": 2.880393080702699,
"grad_norm": 0.95703125,
"learning_rate": 9.595394400240218e-06,
"loss": 3.9851,
"num_input_tokens_seen": 701136896,
"step": 10700,
"train_runtime": 97897.8106,
"train_tokens_per_second": 7161.926
},
{
"epoch": 2.8830854142828297,
"grad_norm": 0.9609375,
"learning_rate": 9.552515718007584e-06,
"loss": 3.9531,
"num_input_tokens_seen": 701792256,
"step": 10710,
"train_runtime": 97990.2175,
"train_tokens_per_second": 7161.86
},
{
"epoch": 2.88577774786296,
"grad_norm": 0.9375,
"learning_rate": 9.509710417564738e-06,
"loss": 3.9513,
"num_input_tokens_seen": 702447616,
"step": 10720,
"train_runtime": 98081.4876,
"train_tokens_per_second": 7161.878
},
{
"epoch": 2.888470081443091,
"grad_norm": 0.92578125,
"learning_rate": 9.466978702254136e-06,
"loss": 3.9174,
"num_input_tokens_seen": 703102976,
"step": 10730,
"train_runtime": 98174.2376,
"train_tokens_per_second": 7161.787
},
{
"epoch": 2.8911624150232216,
"grad_norm": 0.9765625,
"learning_rate": 9.424320775068698e-06,
"loss": 3.9466,
"num_input_tokens_seen": 703758336,
"step": 10740,
"train_runtime": 98266.7376,
"train_tokens_per_second": 7161.715
},
{
"epoch": 2.893854748603352,
"grad_norm": 0.9609375,
"learning_rate": 9.381736838650801e-06,
"loss": 3.9308,
"num_input_tokens_seen": 704413696,
"step": 10750,
"train_runtime": 98358.7673,
"train_tokens_per_second": 7161.677
},
{
"epoch": 2.8965470821834827,
"grad_norm": 0.9375,
"learning_rate": 9.339227095291336e-06,
"loss": 3.9239,
"num_input_tokens_seen": 705069056,
"step": 10760,
"train_runtime": 98450.4054,
"train_tokens_per_second": 7161.667
},
{
"epoch": 2.899239415763613,
"grad_norm": 0.9453125,
"learning_rate": 9.296791746928782e-06,
"loss": 3.9094,
"num_input_tokens_seen": 705724416,
"step": 10770,
"train_runtime": 98542.8347,
"train_tokens_per_second": 7161.601
},
{
"epoch": 2.9019317493437438,
"grad_norm": 0.9609375,
"learning_rate": 9.254430995148147e-06,
"loss": 3.819,
"num_input_tokens_seen": 706379776,
"step": 10780,
"train_runtime": 98634.8775,
"train_tokens_per_second": 7161.562
},
{
"epoch": 2.904624082923874,
"grad_norm": 0.921875,
"learning_rate": 9.212145041180146e-06,
"loss": 3.9131,
"num_input_tokens_seen": 707035136,
"step": 10790,
"train_runtime": 98727.3024,
"train_tokens_per_second": 7161.496
},
{
"epoch": 2.907316416504005,
"grad_norm": 0.9375,
"learning_rate": 9.169934085900108e-06,
"loss": 3.931,
"num_input_tokens_seen": 707690496,
"step": 10800,
"train_runtime": 98819.7569,
"train_tokens_per_second": 7161.427
},
{
"epoch": 2.9100087500841356,
"grad_norm": 0.9453125,
"learning_rate": 9.127798329827144e-06,
"loss": 3.8224,
"num_input_tokens_seen": 708345856,
"step": 10810,
"train_runtime": 98911.3629,
"train_tokens_per_second": 7161.42
},
{
"epoch": 2.912701083664266,
"grad_norm": 0.96484375,
"learning_rate": 9.08573797312311e-06,
"loss": 3.9507,
"num_input_tokens_seen": 709001216,
"step": 10820,
"train_runtime": 99003.7864,
"train_tokens_per_second": 7161.355
},
{
"epoch": 2.9153934172443967,
"grad_norm": 0.97265625,
"learning_rate": 9.043753215591685e-06,
"loss": 3.8651,
"num_input_tokens_seen": 709656576,
"step": 10830,
"train_runtime": 99095.6505,
"train_tokens_per_second": 7161.329
},
{
"epoch": 2.918085750824527,
"grad_norm": 0.984375,
"learning_rate": 9.001844256677427e-06,
"loss": 3.9426,
"num_input_tokens_seen": 710311936,
"step": 10840,
"train_runtime": 99188.0519,
"train_tokens_per_second": 7161.265
},
{
"epoch": 2.920778084404658,
"grad_norm": 0.90234375,
"learning_rate": 8.960011295464815e-06,
"loss": 3.9152,
"num_input_tokens_seen": 710967296,
"step": 10850,
"train_runtime": 99280.5448,
"train_tokens_per_second": 7161.195
},
{
"epoch": 2.923470417984788,
"grad_norm": 0.9453125,
"learning_rate": 8.918254530677317e-06,
"loss": 3.9429,
"num_input_tokens_seen": 711622656,
"step": 10860,
"train_runtime": 99372.7163,
"train_tokens_per_second": 7161.147
},
{
"epoch": 2.926162751564919,
"grad_norm": 0.9765625,
"learning_rate": 8.876574160676432e-06,
"loss": 3.9166,
"num_input_tokens_seen": 712278016,
"step": 10870,
"train_runtime": 99464.7853,
"train_tokens_per_second": 7161.107
},
{
"epoch": 2.9288550851450497,
"grad_norm": 0.95703125,
"learning_rate": 8.834970383460738e-06,
"loss": 4.0155,
"num_input_tokens_seen": 712933376,
"step": 10880,
"train_runtime": 99556.9298,
"train_tokens_per_second": 7161.062
},
{
"epoch": 2.93154741872518,
"grad_norm": 0.96484375,
"learning_rate": 8.79344339666501e-06,
"loss": 3.8929,
"num_input_tokens_seen": 713588736,
"step": 10890,
"train_runtime": 99649.0027,
"train_tokens_per_second": 7161.022
},
{
"epoch": 2.9342397523053108,
"grad_norm": 0.9921875,
"learning_rate": 8.751993397559177e-06,
"loss": 3.8957,
"num_input_tokens_seen": 714244096,
"step": 10900,
"train_runtime": 99741.2487,
"train_tokens_per_second": 7160.97
},
{
"epoch": 2.936932085885441,
"grad_norm": 0.97265625,
"learning_rate": 8.71062058304751e-06,
"loss": 3.8997,
"num_input_tokens_seen": 714899456,
"step": 10910,
"train_runtime": 99833.4505,
"train_tokens_per_second": 7160.921
},
{
"epoch": 2.939624419465572,
"grad_norm": 0.98828125,
"learning_rate": 8.66932514966755e-06,
"loss": 3.9571,
"num_input_tokens_seen": 715554816,
"step": 10920,
"train_runtime": 99925.6375,
"train_tokens_per_second": 7160.873
},
{
"epoch": 2.942316753045702,
"grad_norm": 0.96875,
"learning_rate": 8.628107293589326e-06,
"loss": 3.9485,
"num_input_tokens_seen": 716210176,
"step": 10930,
"train_runtime": 100018.0382,
"train_tokens_per_second": 7160.81
},
{
"epoch": 2.945009086625833,
"grad_norm": 0.99609375,
"learning_rate": 8.586967210614267e-06,
"loss": 3.8976,
"num_input_tokens_seen": 716865536,
"step": 10940,
"train_runtime": 100109.7805,
"train_tokens_per_second": 7160.794
},
{
"epoch": 2.9477014202059637,
"grad_norm": 0.9609375,
"learning_rate": 8.545905096174409e-06,
"loss": 3.9325,
"num_input_tokens_seen": 717520896,
"step": 10950,
"train_runtime": 100202.1004,
"train_tokens_per_second": 7160.737
},
{
"epoch": 2.950393753786094,
"grad_norm": 0.98046875,
"learning_rate": 8.504921145331372e-06,
"loss": 3.9919,
"num_input_tokens_seen": 718176256,
"step": 10960,
"train_runtime": 100293.9173,
"train_tokens_per_second": 7160.716
},
{
"epoch": 2.953086087366225,
"grad_norm": 1.0078125,
"learning_rate": 8.464015552775473e-06,
"loss": 3.9009,
"num_input_tokens_seen": 718831616,
"step": 10970,
"train_runtime": 100385.6555,
"train_tokens_per_second": 7160.701
},
{
"epoch": 2.955778420946355,
"grad_norm": 0.95703125,
"learning_rate": 8.423188512824807e-06,
"loss": 3.7904,
"num_input_tokens_seen": 719486976,
"step": 10980,
"train_runtime": 100477.6356,
"train_tokens_per_second": 7160.668
},
{
"epoch": 2.958470754526486,
"grad_norm": 1.0078125,
"learning_rate": 8.382440219424297e-06,
"loss": 3.8862,
"num_input_tokens_seen": 720142336,
"step": 10990,
"train_runtime": 100569.8027,
"train_tokens_per_second": 7160.622
},
{
"epoch": 2.961163088106616,
"grad_norm": 0.94921875,
"learning_rate": 8.341770866144799e-06,
"loss": 4.0179,
"num_input_tokens_seen": 720797696,
"step": 11000,
"train_runtime": 100661.6627,
"train_tokens_per_second": 7160.598
},
{
"epoch": 2.963855421686747,
"grad_norm": 0.97265625,
"learning_rate": 8.301180646182169e-06,
"loss": 3.8564,
"num_input_tokens_seen": 721453056,
"step": 11010,
"train_runtime": 100773.8193,
"train_tokens_per_second": 7159.132
},
{
"epoch": 2.9665477552668778,
"grad_norm": 0.94921875,
"learning_rate": 8.260669752356337e-06,
"loss": 3.8763,
"num_input_tokens_seen": 722108416,
"step": 11020,
"train_runtime": 100865.0354,
"train_tokens_per_second": 7159.155
},
{
"epoch": 2.969240088847008,
"grad_norm": 0.9765625,
"learning_rate": 8.220238377110434e-06,
"loss": 3.8952,
"num_input_tokens_seen": 722763776,
"step": 11030,
"train_runtime": 100957.4463,
"train_tokens_per_second": 7159.093
},
{
"epoch": 2.971932422427139,
"grad_norm": 0.9453125,
"learning_rate": 8.179886712509796e-06,
"loss": 3.9574,
"num_input_tokens_seen": 723419136,
"step": 11040,
"train_runtime": 101049.1864,
"train_tokens_per_second": 7159.079
},
{
"epoch": 2.974624756007269,
"grad_norm": 0.94921875,
"learning_rate": 8.139614950241156e-06,
"loss": 3.9683,
"num_input_tokens_seen": 724074496,
"step": 11050,
"train_runtime": 101141.4786,
"train_tokens_per_second": 7159.026
},
{
"epoch": 2.9773170895874,
"grad_norm": 0.97265625,
"learning_rate": 8.099423281611621e-06,
"loss": 4.0283,
"num_input_tokens_seen": 724729856,
"step": 11060,
"train_runtime": 101233.3511,
"train_tokens_per_second": 7159.003
},
{
"epoch": 2.9800094231675303,
"grad_norm": 0.9609375,
"learning_rate": 8.05931189754788e-06,
"loss": 3.8768,
"num_input_tokens_seen": 725385216,
"step": 11070,
"train_runtime": 101325.3837,
"train_tokens_per_second": 7158.968
},
{
"epoch": 2.982701756747661,
"grad_norm": 0.95703125,
"learning_rate": 8.019280988595182e-06,
"loss": 3.9215,
"num_input_tokens_seen": 726040576,
"step": 11080,
"train_runtime": 101417.6096,
"train_tokens_per_second": 7158.92
},
{
"epoch": 2.985394090327792,
"grad_norm": 0.92578125,
"learning_rate": 7.979330744916536e-06,
"loss": 3.9027,
"num_input_tokens_seen": 726695936,
"step": 11090,
"train_runtime": 101509.4109,
"train_tokens_per_second": 7158.902
},
{
"epoch": 2.988086423907922,
"grad_norm": 0.96875,
"learning_rate": 7.939461356291722e-06,
"loss": 3.7375,
"num_input_tokens_seen": 727351296,
"step": 11100,
"train_runtime": 101601.1555,
"train_tokens_per_second": 7158.888
},
{
"epoch": 2.990778757488053,
"grad_norm": 0.92578125,
"learning_rate": 7.899673012116448e-06,
"loss": 3.9314,
"num_input_tokens_seen": 728006656,
"step": 11110,
"train_runtime": 101693.7993,
"train_tokens_per_second": 7158.811
},
{
"epoch": 2.993471091068183,
"grad_norm": 0.93359375,
"learning_rate": 7.859965901401417e-06,
"loss": 3.9704,
"num_input_tokens_seen": 728662016,
"step": 11120,
"train_runtime": 101785.7077,
"train_tokens_per_second": 7158.785
},
{
"epoch": 2.996163424648314,
"grad_norm": 0.93359375,
"learning_rate": 7.82034021277144e-06,
"loss": 3.9414,
"num_input_tokens_seen": 729317376,
"step": 11130,
"train_runtime": 101877.9311,
"train_tokens_per_second": 7158.738
},
{
"epoch": 2.9988557582284443,
"grad_norm": 0.98828125,
"learning_rate": 7.780796134464547e-06,
"loss": 3.8961,
"num_input_tokens_seen": 729972736,
"step": 11140,
"train_runtime": 101970.4027,
"train_tokens_per_second": 7158.673
},
{
"epoch": 3.001346166790065,
"grad_norm": 0.953125,
"learning_rate": 7.741333854331082e-06,
"loss": 3.9306,
"num_input_tokens_seen": 730578944,
"step": 11150,
"train_runtime": 102055.0045,
"train_tokens_per_second": 7158.678
},
{
"epoch": 3.004038500370196,
"grad_norm": 0.9296875,
"learning_rate": 7.701953559832803e-06,
"loss": 3.957,
"num_input_tokens_seen": 731234304,
"step": 11160,
"train_runtime": 102147.0284,
"train_tokens_per_second": 7158.645
},
{
"epoch": 3.0067308339503263,
"grad_norm": 0.94140625,
"learning_rate": 7.662655438042046e-06,
"loss": 3.9666,
"num_input_tokens_seen": 731889664,
"step": 11170,
"train_runtime": 102239.7093,
"train_tokens_per_second": 7158.566
},
{
"epoch": 3.009423167530457,
"grad_norm": 0.9375,
"learning_rate": 7.623439675640726e-06,
"loss": 3.872,
"num_input_tokens_seen": 732545024,
"step": 11180,
"train_runtime": 102331.4232,
"train_tokens_per_second": 7158.554
},
{
"epoch": 3.012115501110588,
"grad_norm": 0.98046875,
"learning_rate": 7.5843064589195854e-06,
"loss": 3.8952,
"num_input_tokens_seen": 733200384,
"step": 11190,
"train_runtime": 102423.6666,
"train_tokens_per_second": 7158.506
},
{
"epoch": 3.014807834690718,
"grad_norm": 0.93359375,
"learning_rate": 7.54525597377718e-06,
"loss": 3.8343,
"num_input_tokens_seen": 733855744,
"step": 11200,
"train_runtime": 102515.5914,
"train_tokens_per_second": 7158.479
},
{
"epoch": 3.017500168270849,
"grad_norm": 0.92578125,
"learning_rate": 7.506288405719111e-06,
"loss": 3.971,
"num_input_tokens_seen": 734511104,
"step": 11210,
"train_runtime": 102607.6931,
"train_tokens_per_second": 7158.441
},
{
"epoch": 3.020192501850979,
"grad_norm": 0.984375,
"learning_rate": 7.467403939857057e-06,
"loss": 3.9035,
"num_input_tokens_seen": 735166464,
"step": 11220,
"train_runtime": 102700.0139,
"train_tokens_per_second": 7158.387
},
{
"epoch": 3.02288483543111,
"grad_norm": 0.99609375,
"learning_rate": 7.428602760907941e-06,
"loss": 3.9609,
"num_input_tokens_seen": 735821824,
"step": 11230,
"train_runtime": 102791.8887,
"train_tokens_per_second": 7158.365
},
{
"epoch": 3.0255771690112403,
"grad_norm": 0.94140625,
"learning_rate": 7.3898850531930296e-06,
"loss": 3.824,
"num_input_tokens_seen": 736477184,
"step": 11240,
"train_runtime": 102883.9877,
"train_tokens_per_second": 7158.327
},
{
"epoch": 3.028269502591371,
"grad_norm": 0.9609375,
"learning_rate": 7.351251000637074e-06,
"loss": 3.918,
"num_input_tokens_seen": 737132544,
"step": 11250,
"train_runtime": 102976.1963,
"train_tokens_per_second": 7158.281
},
{
"epoch": 3.030961836171502,
"grad_norm": 0.9296875,
"learning_rate": 7.312700786767434e-06,
"loss": 3.9329,
"num_input_tokens_seen": 737787904,
"step": 11260,
"train_runtime": 103068.404,
"train_tokens_per_second": 7158.235
},
{
"epoch": 3.033654169751632,
"grad_norm": 1.0078125,
"learning_rate": 7.274234594713192e-06,
"loss": 3.9396,
"num_input_tokens_seen": 738443264,
"step": 11270,
"train_runtime": 103160.4604,
"train_tokens_per_second": 7158.201
},
{
"epoch": 3.036346503331763,
"grad_norm": 0.96875,
"learning_rate": 7.2358526072042884e-06,
"loss": 3.9039,
"num_input_tokens_seen": 739098624,
"step": 11280,
"train_runtime": 103252.2124,
"train_tokens_per_second": 7158.187
},
{
"epoch": 3.0390388369118932,
"grad_norm": 0.953125,
"learning_rate": 7.197555006570692e-06,
"loss": 3.8997,
"num_input_tokens_seen": 739753984,
"step": 11290,
"train_runtime": 103344.3731,
"train_tokens_per_second": 7158.145
},
{
"epoch": 3.041731170492024,
"grad_norm": 0.9453125,
"learning_rate": 7.159341974741443e-06,
"loss": 4.0359,
"num_input_tokens_seen": 740409344,
"step": 11300,
"train_runtime": 103436.4539,
"train_tokens_per_second": 7158.108
},
{
"epoch": 3.0444235040721543,
"grad_norm": 0.90625,
"learning_rate": 7.121213693243911e-06,
"loss": 3.8726,
"num_input_tokens_seen": 741064704,
"step": 11310,
"train_runtime": 103528.1729,
"train_tokens_per_second": 7158.097
},
{
"epoch": 3.047115837652285,
"grad_norm": 0.9375,
"learning_rate": 7.0831703432028e-06,
"loss": 3.8816,
"num_input_tokens_seen": 741720064,
"step": 11320,
"train_runtime": 103620.17,
"train_tokens_per_second": 7158.066
},
{
"epoch": 3.049808171232416,
"grad_norm": 0.984375,
"learning_rate": 7.0452121053394214e-06,
"loss": 3.9057,
"num_input_tokens_seen": 742375424,
"step": 11330,
"train_runtime": 103712.2381,
"train_tokens_per_second": 7158.031
},
{
"epoch": 3.052500504812546,
"grad_norm": 0.9609375,
"learning_rate": 7.007339159970702e-06,
"loss": 3.8628,
"num_input_tokens_seen": 743030784,
"step": 11340,
"train_runtime": 103804.578,
"train_tokens_per_second": 7157.977
},
{
"epoch": 3.055192838392677,
"grad_norm": 0.95703125,
"learning_rate": 6.9695516870084575e-06,
"loss": 3.8919,
"num_input_tokens_seen": 743686144,
"step": 11350,
"train_runtime": 103896.1137,
"train_tokens_per_second": 7157.978
},
{
"epoch": 3.0578851719728073,
"grad_norm": 0.921875,
"learning_rate": 6.93184986595844e-06,
"loss": 3.9367,
"num_input_tokens_seen": 744341504,
"step": 11360,
"train_runtime": 103988.6753,
"train_tokens_per_second": 7157.909
},
{
"epoch": 3.060577505552938,
"grad_norm": 0.9765625,
"learning_rate": 6.894233875919523e-06,
"loss": 3.9588,
"num_input_tokens_seen": 744996864,
"step": 11370,
"train_runtime": 104080.7445,
"train_tokens_per_second": 7157.874
},
{
"epoch": 3.0632698391330684,
"grad_norm": 0.92578125,
"learning_rate": 6.856703895582858e-06,
"loss": 3.9219,
"num_input_tokens_seen": 745652224,
"step": 11380,
"train_runtime": 104172.4223,
"train_tokens_per_second": 7157.866
},
{
"epoch": 3.065962172713199,
"grad_norm": 0.96484375,
"learning_rate": 6.819260103231007e-06,
"loss": 3.9588,
"num_input_tokens_seen": 746307584,
"step": 11390,
"train_runtime": 104264.209,
"train_tokens_per_second": 7157.85
},
{
"epoch": 3.06865450629333,
"grad_norm": 0.98046875,
"learning_rate": 6.781902676737106e-06,
"loss": 3.9269,
"num_input_tokens_seen": 746962944,
"step": 11400,
"train_runtime": 104356.0904,
"train_tokens_per_second": 7157.828
},
{
"epoch": 3.0713468398734602,
"grad_norm": 0.953125,
"learning_rate": 6.744631793564027e-06,
"loss": 3.9009,
"num_input_tokens_seen": 747618304,
"step": 11410,
"train_runtime": 104448.1552,
"train_tokens_per_second": 7157.793
},
{
"epoch": 3.074039173453591,
"grad_norm": 0.9375,
"learning_rate": 6.707447630763505e-06,
"loss": 3.9709,
"num_input_tokens_seen": 748273664,
"step": 11420,
"train_runtime": 104539.7768,
"train_tokens_per_second": 7157.789
},
{
"epoch": 3.0767315070337213,
"grad_norm": 0.9375,
"learning_rate": 6.670350364975358e-06,
"loss": 3.9597,
"num_input_tokens_seen": 748929024,
"step": 11430,
"train_runtime": 104631.99,
"train_tokens_per_second": 7157.744
},
{
"epoch": 3.079423840613852,
"grad_norm": 0.9375,
"learning_rate": 6.633340172426552e-06,
"loss": 3.9172,
"num_input_tokens_seen": 749584384,
"step": 11440,
"train_runtime": 104724.0749,
"train_tokens_per_second": 7157.708
},
{
"epoch": 3.0821161741939824,
"grad_norm": 0.953125,
"learning_rate": 6.596417228930482e-06,
"loss": 3.9498,
"num_input_tokens_seen": 750239744,
"step": 11450,
"train_runtime": 104816.6624,
"train_tokens_per_second": 7157.638
},
{
"epoch": 3.084808507774113,
"grad_norm": 0.96484375,
"learning_rate": 6.5595817098860095e-06,
"loss": 3.879,
"num_input_tokens_seen": 750895104,
"step": 11460,
"train_runtime": 104908.4207,
"train_tokens_per_second": 7157.625
},
{
"epoch": 3.087500841354244,
"grad_norm": 0.953125,
"learning_rate": 6.522833790276761e-06,
"loss": 3.9198,
"num_input_tokens_seen": 751550464,
"step": 11470,
"train_runtime": 105000.5214,
"train_tokens_per_second": 7157.588
},
{
"epoch": 3.0901931749343743,
"grad_norm": 0.94140625,
"learning_rate": 6.486173644670169e-06,
"loss": 4.0222,
"num_input_tokens_seen": 752205824,
"step": 11480,
"train_runtime": 105092.6597,
"train_tokens_per_second": 7157.549
},
{
"epoch": 3.092885508514505,
"grad_norm": 1.0078125,
"learning_rate": 6.449601447216752e-06,
"loss": 3.8615,
"num_input_tokens_seen": 752861184,
"step": 11490,
"train_runtime": 105184.1985,
"train_tokens_per_second": 7157.55
},
{
"epoch": 3.0955778420946354,
"grad_norm": 0.94921875,
"learning_rate": 6.413117371649216e-06,
"loss": 3.9926,
"num_input_tokens_seen": 753516544,
"step": 11500,
"train_runtime": 105276.9824,
"train_tokens_per_second": 7157.467
},
{
"epoch": 3.098270175674766,
"grad_norm": 0.95703125,
"learning_rate": 6.376721591281651e-06,
"loss": 4.0003,
"num_input_tokens_seen": 754171904,
"step": 11510,
"train_runtime": 105386.85,
"train_tokens_per_second": 7156.224
},
{
"epoch": 3.1009625092548965,
"grad_norm": 1.0625,
"learning_rate": 6.340414279008719e-06,
"loss": 3.9484,
"num_input_tokens_seen": 754827264,
"step": 11520,
"train_runtime": 105478.7999,
"train_tokens_per_second": 7156.199
},
{
"epoch": 3.1036548428350272,
"grad_norm": 0.9765625,
"learning_rate": 6.304195607304819e-06,
"loss": 3.9393,
"num_input_tokens_seen": 755482624,
"step": 11530,
"train_runtime": 105571.1372,
"train_tokens_per_second": 7156.147
},
{
"epoch": 3.106347176415158,
"grad_norm": 0.9453125,
"learning_rate": 6.268065748223268e-06,
"loss": 3.9351,
"num_input_tokens_seen": 756137984,
"step": 11540,
"train_runtime": 105662.9611,
"train_tokens_per_second": 7156.131
},
{
"epoch": 3.1090395099952883,
"grad_norm": 0.9921875,
"learning_rate": 6.2320248733954896e-06,
"loss": 3.8764,
"num_input_tokens_seen": 756793344,
"step": 11550,
"train_runtime": 105755.0931,
"train_tokens_per_second": 7156.094
},
{
"epoch": 3.111731843575419,
"grad_norm": 0.96875,
"learning_rate": 6.1960731540301905e-06,
"loss": 3.9,
"num_input_tokens_seen": 757448704,
"step": 11560,
"train_runtime": 105848.1538,
"train_tokens_per_second": 7155.994
},
{
"epoch": 3.1144241771555494,
"grad_norm": 0.9375,
"learning_rate": 6.16021076091258e-06,
"loss": 4.0642,
"num_input_tokens_seen": 758104064,
"step": 11570,
"train_runtime": 105940.0057,
"train_tokens_per_second": 7155.975
},
{
"epoch": 3.11711651073568,
"grad_norm": 0.96875,
"learning_rate": 6.1244378644034845e-06,
"loss": 3.8746,
"num_input_tokens_seen": 758759424,
"step": 11580,
"train_runtime": 106032.0039,
"train_tokens_per_second": 7155.947
},
{
"epoch": 3.1198088443158105,
"grad_norm": 1.0078125,
"learning_rate": 6.088754634438637e-06,
"loss": 3.9193,
"num_input_tokens_seen": 759414784,
"step": 11590,
"train_runtime": 106123.8501,
"train_tokens_per_second": 7155.929
},
{
"epoch": 3.1225011778959413,
"grad_norm": 0.96484375,
"learning_rate": 6.053161240527766e-06,
"loss": 3.9818,
"num_input_tokens_seen": 760070144,
"step": 11600,
"train_runtime": 106215.7767,
"train_tokens_per_second": 7155.906
},
{
"epoch": 3.125193511476072,
"grad_norm": 0.953125,
"learning_rate": 6.017657851753891e-06,
"loss": 3.9894,
"num_input_tokens_seen": 760725504,
"step": 11610,
"train_runtime": 106307.9492,
"train_tokens_per_second": 7155.867
},
{
"epoch": 3.1278858450562024,
"grad_norm": 0.9375,
"learning_rate": 5.982244636772441e-06,
"loss": 3.8536,
"num_input_tokens_seen": 761380864,
"step": 11620,
"train_runtime": 106399.9187,
"train_tokens_per_second": 7155.841
},
{
"epoch": 3.130578178636333,
"grad_norm": 0.94140625,
"learning_rate": 5.9469217638104894e-06,
"loss": 3.9471,
"num_input_tokens_seen": 762036224,
"step": 11630,
"train_runtime": 106492.0655,
"train_tokens_per_second": 7155.803
},
{
"epoch": 3.1332705122164635,
"grad_norm": 0.9296875,
"learning_rate": 5.911689400665954e-06,
"loss": 3.9905,
"num_input_tokens_seen": 762691584,
"step": 11640,
"train_runtime": 106584.3106,
"train_tokens_per_second": 7155.758
},
{
"epoch": 3.1359628457965942,
"grad_norm": 0.99609375,
"learning_rate": 5.876547714706787e-06,
"loss": 3.9328,
"num_input_tokens_seen": 763346944,
"step": 11650,
"train_runtime": 106676.6118,
"train_tokens_per_second": 7155.71
},
{
"epoch": 3.138655179376725,
"grad_norm": 0.9609375,
"learning_rate": 5.841496872870192e-06,
"loss": 3.8978,
"num_input_tokens_seen": 764002304,
"step": 11660,
"train_runtime": 106769.0125,
"train_tokens_per_second": 7155.656
},
{
"epoch": 3.1413475129568553,
"grad_norm": 0.94140625,
"learning_rate": 5.806537041661828e-06,
"loss": 3.953,
"num_input_tokens_seen": 764657664,
"step": 11670,
"train_runtime": 106861.4721,
"train_tokens_per_second": 7155.597
},
{
"epoch": 3.144039846536986,
"grad_norm": 0.98046875,
"learning_rate": 5.771668387155002e-06,
"loss": 3.9492,
"num_input_tokens_seen": 765313024,
"step": 11680,
"train_runtime": 106953.7644,
"train_tokens_per_second": 7155.55
},
{
"epoch": 3.1467321801171164,
"grad_norm": 1.0,
"learning_rate": 5.7368910749899305e-06,
"loss": 3.9586,
"num_input_tokens_seen": 765968384,
"step": 11690,
"train_runtime": 107045.9565,
"train_tokens_per_second": 7155.51
},
{
"epoch": 3.149424513697247,
"grad_norm": 0.96484375,
"learning_rate": 5.702205270372868e-06,
"loss": 3.9319,
"num_input_tokens_seen": 766623744,
"step": 11700,
"train_runtime": 107137.7942,
"train_tokens_per_second": 7155.493
},
{
"epoch": 3.1521168472773775,
"grad_norm": 0.97265625,
"learning_rate": 5.667611138075418e-06,
"loss": 3.9963,
"num_input_tokens_seen": 767279104,
"step": 11710,
"train_runtime": 107230.4527,
"train_tokens_per_second": 7155.422
},
{
"epoch": 3.1548091808575083,
"grad_norm": 0.92578125,
"learning_rate": 5.63310884243366e-06,
"loss": 3.9246,
"num_input_tokens_seen": 767934464,
"step": 11720,
"train_runtime": 107322.4378,
"train_tokens_per_second": 7155.395
},
{
"epoch": 3.157501514437639,
"grad_norm": 0.93359375,
"learning_rate": 5.598698547347458e-06,
"loss": 4.0042,
"num_input_tokens_seen": 768589824,
"step": 11730,
"train_runtime": 107414.9071,
"train_tokens_per_second": 7155.337
},
{
"epoch": 3.1601938480177694,
"grad_norm": 0.96875,
"learning_rate": 5.564380416279588e-06,
"loss": 3.8923,
"num_input_tokens_seen": 769245184,
"step": 11740,
"train_runtime": 107507.3025,
"train_tokens_per_second": 7155.283
},
{
"epoch": 3.1628861815979,
"grad_norm": 0.9296875,
"learning_rate": 5.530154612255054e-06,
"loss": 3.9094,
"num_input_tokens_seen": 769900544,
"step": 11750,
"train_runtime": 107599.2603,
"train_tokens_per_second": 7155.259
},
{
"epoch": 3.1655785151780305,
"grad_norm": 0.98828125,
"learning_rate": 5.496021297860237e-06,
"loss": 3.9055,
"num_input_tokens_seen": 770555904,
"step": 11760,
"train_runtime": 107691.7989,
"train_tokens_per_second": 7155.196
},
{
"epoch": 3.1682708487581612,
"grad_norm": 0.921875,
"learning_rate": 5.461980635242178e-06,
"loss": 3.9176,
"num_input_tokens_seen": 771211264,
"step": 11770,
"train_runtime": 107783.6916,
"train_tokens_per_second": 7155.176
},
{
"epoch": 3.1709631823382916,
"grad_norm": 0.98046875,
"learning_rate": 5.428032786107764e-06,
"loss": 3.9859,
"num_input_tokens_seen": 771866624,
"step": 11780,
"train_runtime": 107875.9842,
"train_tokens_per_second": 7155.129
},
{
"epoch": 3.1736555159184223,
"grad_norm": 0.93359375,
"learning_rate": 5.394177911722994e-06,
"loss": 3.8847,
"num_input_tokens_seen": 772521984,
"step": 11790,
"train_runtime": 107968.1245,
"train_tokens_per_second": 7155.093
},
{
"epoch": 3.176347849498553,
"grad_norm": 0.9375,
"learning_rate": 5.3604161729122e-06,
"loss": 4.0042,
"num_input_tokens_seen": 773177344,
"step": 11800,
"train_runtime": 108060.6872,
"train_tokens_per_second": 7155.029
},
{
"epoch": 3.1790401830786834,
"grad_norm": 0.98046875,
"learning_rate": 5.326747730057272e-06,
"loss": 3.96,
"num_input_tokens_seen": 773832704,
"step": 11810,
"train_runtime": 108152.2901,
"train_tokens_per_second": 7155.028
},
{
"epoch": 3.181732516658814,
"grad_norm": 0.94140625,
"learning_rate": 5.293172743096908e-06,
"loss": 3.9886,
"num_input_tokens_seen": 774488064,
"step": 11820,
"train_runtime": 108245.0968,
"train_tokens_per_second": 7154.948
},
{
"epoch": 3.1844248502389445,
"grad_norm": 0.9296875,
"learning_rate": 5.259691371525877e-06,
"loss": 3.9539,
"num_input_tokens_seen": 775143424,
"step": 11830,
"train_runtime": 108337.5367,
"train_tokens_per_second": 7154.892
},
{
"epoch": 3.1871171838190753,
"grad_norm": 0.92578125,
"learning_rate": 5.226303774394192e-06,
"loss": 3.8804,
"num_input_tokens_seen": 775798784,
"step": 11840,
"train_runtime": 108429.8092,
"train_tokens_per_second": 7154.848
},
{
"epoch": 3.1898095173992056,
"grad_norm": 0.953125,
"learning_rate": 5.193010110306454e-06,
"loss": 3.9077,
"num_input_tokens_seen": 776454144,
"step": 11850,
"train_runtime": 108521.9051,
"train_tokens_per_second": 7154.815
},
{
"epoch": 3.1925018509793364,
"grad_norm": 0.953125,
"learning_rate": 5.159810537420981e-06,
"loss": 3.9966,
"num_input_tokens_seen": 777109504,
"step": 11860,
"train_runtime": 108613.9934,
"train_tokens_per_second": 7154.783
},
{
"epoch": 3.195194184559467,
"grad_norm": 0.96484375,
"learning_rate": 5.12670521344919e-06,
"loss": 3.9349,
"num_input_tokens_seen": 777764864,
"step": 11870,
"train_runtime": 108706.2346,
"train_tokens_per_second": 7154.74
},
{
"epoch": 3.1978865181395975,
"grad_norm": 0.92578125,
"learning_rate": 5.0936942956547075e-06,
"loss": 3.9206,
"num_input_tokens_seen": 778420224,
"step": 11880,
"train_runtime": 108797.8689,
"train_tokens_per_second": 7154.738
},
{
"epoch": 3.2005788517197282,
"grad_norm": 1.0078125,
"learning_rate": 5.060777940852751e-06,
"loss": 3.9034,
"num_input_tokens_seen": 779075584,
"step": 11890,
"train_runtime": 108890.5878,
"train_tokens_per_second": 7154.664
},
{
"epoch": 3.2032711852998585,
"grad_norm": 0.94140625,
"learning_rate": 5.0279563054092924e-06,
"loss": 3.8623,
"num_input_tokens_seen": 779730944,
"step": 11900,
"train_runtime": 108982.9782,
"train_tokens_per_second": 7154.612
},
{
"epoch": 3.2059635188799893,
"grad_norm": 0.96875,
"learning_rate": 4.995229545240357e-06,
"loss": 3.9039,
"num_input_tokens_seen": 780386304,
"step": 11910,
"train_runtime": 109075.6339,
"train_tokens_per_second": 7154.543
},
{
"epoch": 3.2086558524601196,
"grad_norm": 0.96484375,
"learning_rate": 4.962597815811274e-06,
"loss": 3.9737,
"num_input_tokens_seen": 781041664,
"step": 11920,
"train_runtime": 109167.8315,
"train_tokens_per_second": 7154.504
},
{
"epoch": 3.2113481860402504,
"grad_norm": 1.0,
"learning_rate": 4.930061272135941e-06,
"loss": 4.0498,
"num_input_tokens_seen": 781697024,
"step": 11930,
"train_runtime": 109260.1843,
"train_tokens_per_second": 7154.455
},
{
"epoch": 3.214040519620381,
"grad_norm": 0.96484375,
"learning_rate": 4.897620068776077e-06,
"loss": 3.861,
"num_input_tokens_seen": 782352384,
"step": 11940,
"train_runtime": 109352.1011,
"train_tokens_per_second": 7154.434
},
{
"epoch": 3.2167328532005115,
"grad_norm": 0.95703125,
"learning_rate": 4.865274359840513e-06,
"loss": 3.9858,
"num_input_tokens_seen": 783007744,
"step": 11950,
"train_runtime": 109444.7089,
"train_tokens_per_second": 7154.368
},
{
"epoch": 3.2194251867806423,
"grad_norm": 0.9375,
"learning_rate": 4.833024298984415e-06,
"loss": 3.9379,
"num_input_tokens_seen": 783663104,
"step": 11960,
"train_runtime": 109536.6778,
"train_tokens_per_second": 7154.344
},
{
"epoch": 3.2221175203607726,
"grad_norm": 0.9921875,
"learning_rate": 4.80087003940862e-06,
"loss": 3.9185,
"num_input_tokens_seen": 784318464,
"step": 11970,
"train_runtime": 109629.3566,
"train_tokens_per_second": 7154.274
},
{
"epoch": 3.2248098539409034,
"grad_norm": 0.9453125,
"learning_rate": 4.768811733858819e-06,
"loss": 3.9845,
"num_input_tokens_seen": 784973824,
"step": 11980,
"train_runtime": 109720.7631,
"train_tokens_per_second": 7154.287
},
{
"epoch": 3.2275021875210337,
"grad_norm": 0.94140625,
"learning_rate": 4.736849534624946e-06,
"loss": 3.9469,
"num_input_tokens_seen": 785629184,
"step": 11990,
"train_runtime": 109814.1506,
"train_tokens_per_second": 7154.171
},
{
"epoch": 3.2301945211011645,
"grad_norm": 0.9453125,
"learning_rate": 4.704983593540324e-06,
"loss": 3.8453,
"num_input_tokens_seen": 786284544,
"step": 12000,
"train_runtime": 109905.8052,
"train_tokens_per_second": 7154.168
},
{
"epoch": 3.232886854681295,
"grad_norm": 0.96875,
"learning_rate": 4.673214061981068e-06,
"loss": 3.8652,
"num_input_tokens_seen": 786939904,
"step": 12010,
"train_runtime": 110015.6893,
"train_tokens_per_second": 7152.979
},
{
"epoch": 3.2355791882614255,
"grad_norm": 0.9765625,
"learning_rate": 4.641541090865276e-06,
"loss": 3.927,
"num_input_tokens_seen": 787595264,
"step": 12020,
"train_runtime": 110107.01,
"train_tokens_per_second": 7152.998
},
{
"epoch": 3.2382715218415563,
"grad_norm": 0.9375,
"learning_rate": 4.6099648306523556e-06,
"loss": 3.8873,
"num_input_tokens_seen": 788250624,
"step": 12030,
"train_runtime": 110198.8521,
"train_tokens_per_second": 7152.984
},
{
"epoch": 3.2409638554216866,
"grad_norm": 0.96484375,
"learning_rate": 4.578485431342297e-06,
"loss": 3.9165,
"num_input_tokens_seen": 788905984,
"step": 12040,
"train_runtime": 110291.0577,
"train_tokens_per_second": 7152.946
},
{
"epoch": 3.2436561890018174,
"grad_norm": 0.96875,
"learning_rate": 4.547103042474963e-06,
"loss": 3.8838,
"num_input_tokens_seen": 789561344,
"step": 12050,
"train_runtime": 110382.4853,
"train_tokens_per_second": 7152.959
},
{
"epoch": 3.2463485225819477,
"grad_norm": 0.9375,
"learning_rate": 4.515817813129372e-06,
"loss": 3.9319,
"num_input_tokens_seen": 790216704,
"step": 12060,
"train_runtime": 110475.1123,
"train_tokens_per_second": 7152.893
},
{
"epoch": 3.2490408561620785,
"grad_norm": 0.9453125,
"learning_rate": 4.484629891923004e-06,
"loss": 3.8857,
"num_input_tokens_seen": 790872064,
"step": 12070,
"train_runtime": 110566.7673,
"train_tokens_per_second": 7152.891
},
{
"epoch": 3.2517331897422093,
"grad_norm": 0.94921875,
"learning_rate": 4.453539427011072e-06,
"loss": 3.8892,
"num_input_tokens_seen": 791527424,
"step": 12080,
"train_runtime": 110659.4322,
"train_tokens_per_second": 7152.824
},
{
"epoch": 3.2544255233223396,
"grad_norm": 0.93359375,
"learning_rate": 4.4225465660858664e-06,
"loss": 3.9316,
"num_input_tokens_seen": 792182784,
"step": 12090,
"train_runtime": 110752.3031,
"train_tokens_per_second": 7152.743
},
{
"epoch": 3.2571178569024704,
"grad_norm": 0.9453125,
"learning_rate": 4.391651456375967e-06,
"loss": 3.9423,
"num_input_tokens_seen": 792838144,
"step": 12100,
"train_runtime": 110845.2252,
"train_tokens_per_second": 7152.659
},
{
"epoch": 3.2598101904826007,
"grad_norm": 0.97265625,
"learning_rate": 4.360854244645649e-06,
"loss": 3.8601,
"num_input_tokens_seen": 793493504,
"step": 12110,
"train_runtime": 110937.2339,
"train_tokens_per_second": 7152.635
},
{
"epoch": 3.2625025240627314,
"grad_norm": 0.96875,
"learning_rate": 4.330155077194078e-06,
"loss": 3.9145,
"num_input_tokens_seen": 794148864,
"step": 12120,
"train_runtime": 111030.4523,
"train_tokens_per_second": 7152.532
},
{
"epoch": 3.2651948576428618,
"grad_norm": 0.97265625,
"learning_rate": 4.299554099854733e-06,
"loss": 3.8768,
"num_input_tokens_seen": 794804224,
"step": 12130,
"train_runtime": 111122.8894,
"train_tokens_per_second": 7152.48
},
{
"epoch": 3.2678871912229925,
"grad_norm": 0.95703125,
"learning_rate": 4.269051457994586e-06,
"loss": 4.1122,
"num_input_tokens_seen": 795459584,
"step": 12140,
"train_runtime": 111215.379,
"train_tokens_per_second": 7152.424
},
{
"epoch": 3.2705795248031233,
"grad_norm": 0.9609375,
"learning_rate": 4.238647296513526e-06,
"loss": 3.8964,
"num_input_tokens_seen": 796114944,
"step": 12150,
"train_runtime": 111307.6864,
"train_tokens_per_second": 7152.381
},
{
"epoch": 3.2732718583832536,
"grad_norm": 0.9609375,
"learning_rate": 4.208341759843595e-06,
"loss": 3.9527,
"num_input_tokens_seen": 796770304,
"step": 12160,
"train_runtime": 111400.1018,
"train_tokens_per_second": 7152.33
},
{
"epoch": 3.2759641919633844,
"grad_norm": 0.90234375,
"learning_rate": 4.178134991948332e-06,
"loss": 3.8812,
"num_input_tokens_seen": 797425664,
"step": 12170,
"train_runtime": 111493.1246,
"train_tokens_per_second": 7152.241
},
{
"epoch": 3.2786565255435147,
"grad_norm": 0.9609375,
"learning_rate": 4.148027136322089e-06,
"loss": 3.9501,
"num_input_tokens_seen": 798081024,
"step": 12180,
"train_runtime": 111585.511,
"train_tokens_per_second": 7152.192
},
{
"epoch": 3.2813488591236455,
"grad_norm": 1.0,
"learning_rate": 4.118018335989335e-06,
"loss": 4.0134,
"num_input_tokens_seen": 798736384,
"step": 12190,
"train_runtime": 111678.0109,
"train_tokens_per_second": 7152.137
},
{
"epoch": 3.284041192703776,
"grad_norm": 0.9296875,
"learning_rate": 4.088108733503995e-06,
"loss": 3.8888,
"num_input_tokens_seen": 799391744,
"step": 12200,
"train_runtime": 111770.2228,
"train_tokens_per_second": 7152.099
},
{
"epoch": 3.2867335262839066,
"grad_norm": 0.9453125,
"learning_rate": 4.058298470948763e-06,
"loss": 3.9358,
"num_input_tokens_seen": 800047104,
"step": 12210,
"train_runtime": 111862.7143,
"train_tokens_per_second": 7152.044
},
{
"epoch": 3.2894258598640373,
"grad_norm": 1.015625,
"learning_rate": 4.028587689934421e-06,
"loss": 3.9023,
"num_input_tokens_seen": 800702464,
"step": 12220,
"train_runtime": 111955.0352,
"train_tokens_per_second": 7152.0
},
{
"epoch": 3.2921181934441677,
"grad_norm": 0.96484375,
"learning_rate": 3.998976531599197e-06,
"loss": 3.8785,
"num_input_tokens_seen": 801357824,
"step": 12230,
"train_runtime": 112047.4267,
"train_tokens_per_second": 7151.952
},
{
"epoch": 3.2948105270242984,
"grad_norm": 0.96875,
"learning_rate": 3.969465136608028e-06,
"loss": 4.003,
"num_input_tokens_seen": 802013184,
"step": 12240,
"train_runtime": 112139.6241,
"train_tokens_per_second": 7151.916
},
{
"epoch": 3.2975028606044288,
"grad_norm": 0.98046875,
"learning_rate": 3.940053645151984e-06,
"loss": 3.9166,
"num_input_tokens_seen": 802668544,
"step": 12250,
"train_runtime": 112231.7633,
"train_tokens_per_second": 7151.884
},
{
"epoch": 3.3001951941845595,
"grad_norm": 0.94140625,
"learning_rate": 3.910742196947509e-06,
"loss": 3.9437,
"num_input_tokens_seen": 803323904,
"step": 12260,
"train_runtime": 112324.1686,
"train_tokens_per_second": 7151.835
},
{
"epoch": 3.30288752776469,
"grad_norm": 0.9375,
"learning_rate": 3.881530931235841e-06,
"loss": 3.9045,
"num_input_tokens_seen": 803979264,
"step": 12270,
"train_runtime": 112416.5708,
"train_tokens_per_second": 7151.786
},
{
"epoch": 3.3055798613448206,
"grad_norm": 0.92578125,
"learning_rate": 3.852419986782271e-06,
"loss": 3.8566,
"num_input_tokens_seen": 804634624,
"step": 12280,
"train_runtime": 112509.2023,
"train_tokens_per_second": 7151.723
},
{
"epoch": 3.3082721949249514,
"grad_norm": 0.9453125,
"learning_rate": 3.823409501875558e-06,
"loss": 3.8607,
"num_input_tokens_seen": 805289984,
"step": 12290,
"train_runtime": 112601.4274,
"train_tokens_per_second": 7151.685
},
{
"epoch": 3.3109645285050817,
"grad_norm": 0.9453125,
"learning_rate": 3.7944996143272155e-06,
"loss": 3.9406,
"num_input_tokens_seen": 805945344,
"step": 12300,
"train_runtime": 112694.7222,
"train_tokens_per_second": 7151.58
},
{
"epoch": 3.3136568620852125,
"grad_norm": 0.9296875,
"learning_rate": 3.7656904614708917e-06,
"loss": 3.9762,
"num_input_tokens_seen": 806600704,
"step": 12310,
"train_runtime": 112786.9265,
"train_tokens_per_second": 7151.544
},
{
"epoch": 3.316349195665343,
"grad_norm": 0.9453125,
"learning_rate": 3.7369821801616966e-06,
"loss": 3.966,
"num_input_tokens_seen": 807256064,
"step": 12320,
"train_runtime": 112879.4784,
"train_tokens_per_second": 7151.486
},
{
"epoch": 3.3190415292454736,
"grad_norm": 0.9140625,
"learning_rate": 3.708374906775561e-06,
"loss": 3.9349,
"num_input_tokens_seen": 807911424,
"step": 12330,
"train_runtime": 112972.1945,
"train_tokens_per_second": 7151.418
},
{
"epoch": 3.321733862825604,
"grad_norm": 0.9609375,
"learning_rate": 3.679868777208584e-06,
"loss": 3.7792,
"num_input_tokens_seen": 808566784,
"step": 12340,
"train_runtime": 113064.7919,
"train_tokens_per_second": 7151.358
},
{
"epoch": 3.3244261964057347,
"grad_norm": 0.9296875,
"learning_rate": 3.6514639268764113e-06,
"loss": 3.8643,
"num_input_tokens_seen": 809222144,
"step": 12350,
"train_runtime": 113157.2744,
"train_tokens_per_second": 7151.305
},
{
"epoch": 3.3271185299858654,
"grad_norm": 0.94140625,
"learning_rate": 3.623160490713534e-06,
"loss": 3.8796,
"num_input_tokens_seen": 809877504,
"step": 12360,
"train_runtime": 113250.0104,
"train_tokens_per_second": 7151.236
},
{
"epoch": 3.3298108635659958,
"grad_norm": 0.96484375,
"learning_rate": 3.5949586031727267e-06,
"loss": 3.9112,
"num_input_tokens_seen": 810532864,
"step": 12370,
"train_runtime": 113342.2264,
"train_tokens_per_second": 7151.199
},
{
"epoch": 3.3325031971461265,
"grad_norm": 0.9453125,
"learning_rate": 3.5668583982243237e-06,
"loss": 3.8047,
"num_input_tokens_seen": 811188224,
"step": 12380,
"train_runtime": 113434.978,
"train_tokens_per_second": 7151.13
},
{
"epoch": 3.335195530726257,
"grad_norm": 0.97265625,
"learning_rate": 3.538860009355674e-06,
"loss": 3.8597,
"num_input_tokens_seen": 811843584,
"step": 12390,
"train_runtime": 113527.2728,
"train_tokens_per_second": 7151.089
},
{
"epoch": 3.3378878643063876,
"grad_norm": 0.9453125,
"learning_rate": 3.5109635695704053e-06,
"loss": 3.9051,
"num_input_tokens_seen": 812498944,
"step": 12400,
"train_runtime": 113619.9857,
"train_tokens_per_second": 7151.021
},
{
"epoch": 3.340580197886518,
"grad_norm": 0.9375,
"learning_rate": 3.483169211387899e-06,
"loss": 3.976,
"num_input_tokens_seen": 813154304,
"step": 12410,
"train_runtime": 113712.2758,
"train_tokens_per_second": 7150.981
},
{
"epoch": 3.3432725314666487,
"grad_norm": 0.96875,
"learning_rate": 3.455477066842569e-06,
"loss": 3.9116,
"num_input_tokens_seen": 813809664,
"step": 12420,
"train_runtime": 113804.6321,
"train_tokens_per_second": 7150.936
},
{
"epoch": 3.3459648650467795,
"grad_norm": 0.9609375,
"learning_rate": 3.4278872674832957e-06,
"loss": 3.9571,
"num_input_tokens_seen": 814465024,
"step": 12430,
"train_runtime": 113896.5008,
"train_tokens_per_second": 7150.922
},
{
"epoch": 3.34865719862691,
"grad_norm": 0.98828125,
"learning_rate": 3.4003999443727617e-06,
"loss": 3.8961,
"num_input_tokens_seen": 815120384,
"step": 12440,
"train_runtime": 113989.5994,
"train_tokens_per_second": 7150.831
},
{
"epoch": 3.3513495322070406,
"grad_norm": 0.92578125,
"learning_rate": 3.3730152280868623e-06,
"loss": 3.8976,
"num_input_tokens_seen": 815775744,
"step": 12450,
"train_runtime": 114081.8169,
"train_tokens_per_second": 7150.796
},
{
"epoch": 3.354041865787171,
"grad_norm": 0.9453125,
"learning_rate": 3.345733248714053e-06,
"loss": 3.9377,
"num_input_tokens_seen": 816431104,
"step": 12460,
"train_runtime": 114174.4767,
"train_tokens_per_second": 7150.732
},
{
"epoch": 3.3567341993673017,
"grad_norm": 0.94921875,
"learning_rate": 3.3185541358547596e-06,
"loss": 3.9536,
"num_input_tokens_seen": 817086464,
"step": 12470,
"train_runtime": 114266.5848,
"train_tokens_per_second": 7150.703
},
{
"epoch": 3.359426532947432,
"grad_norm": 1.0390625,
"learning_rate": 3.2914780186207416e-06,
"loss": 3.8851,
"num_input_tokens_seen": 817741824,
"step": 12480,
"train_runtime": 114359.449,
"train_tokens_per_second": 7150.628
},
{
"epoch": 3.3621188665275628,
"grad_norm": 0.91796875,
"learning_rate": 3.2645050256345066e-06,
"loss": 3.8909,
"num_input_tokens_seen": 818397184,
"step": 12490,
"train_runtime": 114451.976,
"train_tokens_per_second": 7150.573
},
{
"epoch": 3.3648112001076935,
"grad_norm": 0.96484375,
"learning_rate": 3.237635285028645e-06,
"loss": 3.9691,
"num_input_tokens_seen": 819052544,
"step": 12500,
"train_runtime": 114543.7532,
"train_tokens_per_second": 7150.565
},
{
"epoch": 3.367503533687824,
"grad_norm": 0.95703125,
"learning_rate": 3.2108689244453013e-06,
"loss": 3.8528,
"num_input_tokens_seen": 819707904,
"step": 12510,
"train_runtime": 114655.1115,
"train_tokens_per_second": 7149.336
},
{
"epoch": 3.3701958672679546,
"grad_norm": 0.95703125,
"learning_rate": 3.1842060710354755e-06,
"loss": 3.8692,
"num_input_tokens_seen": 820363264,
"step": 12520,
"train_runtime": 114746.4237,
"train_tokens_per_second": 7149.358
},
{
"epoch": 3.372888200848085,
"grad_norm": 0.97265625,
"learning_rate": 3.1576468514585123e-06,
"loss": 3.925,
"num_input_tokens_seen": 821018624,
"step": 12530,
"train_runtime": 114838.9916,
"train_tokens_per_second": 7149.302
},
{
"epoch": 3.3755805344282157,
"grad_norm": 0.96484375,
"learning_rate": 3.1311913918814106e-06,
"loss": 3.9613,
"num_input_tokens_seen": 821673984,
"step": 12540,
"train_runtime": 114931.5525,
"train_tokens_per_second": 7149.246
},
{
"epoch": 3.378272868008346,
"grad_norm": 0.98828125,
"learning_rate": 3.1048398179783055e-06,
"loss": 3.9994,
"num_input_tokens_seen": 822329344,
"step": 12550,
"train_runtime": 115023.9222,
"train_tokens_per_second": 7149.203
},
{
"epoch": 3.380965201588477,
"grad_norm": 0.9609375,
"learning_rate": 3.0785922549298127e-06,
"loss": 3.8232,
"num_input_tokens_seen": 822984704,
"step": 12560,
"train_runtime": 115116.901,
"train_tokens_per_second": 7149.121
},
{
"epoch": 3.3836575351686076,
"grad_norm": 0.96875,
"learning_rate": 3.0524488274224577e-06,
"loss": 3.9493,
"num_input_tokens_seen": 823640064,
"step": 12570,
"train_runtime": 115209.1857,
"train_tokens_per_second": 7149.083
},
{
"epoch": 3.386349868748738,
"grad_norm": 0.953125,
"learning_rate": 3.02640965964808e-06,
"loss": 3.8712,
"num_input_tokens_seen": 824295424,
"step": 12580,
"train_runtime": 115301.7326,
"train_tokens_per_second": 7149.029
},
{
"epoch": 3.3890422023288687,
"grad_norm": 0.98828125,
"learning_rate": 3.000474875303247e-06,
"loss": 3.8895,
"num_input_tokens_seen": 824950784,
"step": 12590,
"train_runtime": 115394.0046,
"train_tokens_per_second": 7148.992
},
{
"epoch": 3.391734535908999,
"grad_norm": 0.9375,
"learning_rate": 2.974644597588655e-06,
"loss": 3.8803,
"num_input_tokens_seen": 825606144,
"step": 12600,
"train_runtime": 115486.5639,
"train_tokens_per_second": 7148.937
},
{
"epoch": 3.3944268694891297,
"grad_norm": 0.9453125,
"learning_rate": 2.9489189492085622e-06,
"loss": 3.9924,
"num_input_tokens_seen": 826261504,
"step": 12610,
"train_runtime": 115579.424,
"train_tokens_per_second": 7148.863
},
{
"epoch": 3.39711920306926,
"grad_norm": 0.9609375,
"learning_rate": 2.923298052370177e-06,
"loss": 4.0049,
"num_input_tokens_seen": 826916864,
"step": 12620,
"train_runtime": 115672.0103,
"train_tokens_per_second": 7148.807
},
{
"epoch": 3.399811536649391,
"grad_norm": 0.90625,
"learning_rate": 2.8977820287831303e-06,
"loss": 3.9375,
"num_input_tokens_seen": 827572224,
"step": 12630,
"train_runtime": 115763.97,
"train_tokens_per_second": 7148.789
},
{
"epoch": 3.4025038702295216,
"grad_norm": 0.92578125,
"learning_rate": 2.872370999658816e-06,
"loss": 3.8849,
"num_input_tokens_seen": 828227584,
"step": 12640,
"train_runtime": 115856.9261,
"train_tokens_per_second": 7148.71
},
{
"epoch": 3.405196203809652,
"grad_norm": 0.9765625,
"learning_rate": 2.8470650857099073e-06,
"loss": 3.8561,
"num_input_tokens_seen": 828882944,
"step": 12650,
"train_runtime": 115948.9156,
"train_tokens_per_second": 7148.691
},
{
"epoch": 3.4078885373897827,
"grad_norm": 0.94921875,
"learning_rate": 2.8218644071496993e-06,
"loss": 3.9041,
"num_input_tokens_seen": 829538304,
"step": 12660,
"train_runtime": 116041.5761,
"train_tokens_per_second": 7148.63
},
{
"epoch": 3.410580870969913,
"grad_norm": 0.9375,
"learning_rate": 2.796769083691608e-06,
"loss": 3.9112,
"num_input_tokens_seen": 830193664,
"step": 12670,
"train_runtime": 116133.9458,
"train_tokens_per_second": 7148.587
},
{
"epoch": 3.413273204550044,
"grad_norm": 0.9453125,
"learning_rate": 2.7717792345485412e-06,
"loss": 3.908,
"num_input_tokens_seen": 830849024,
"step": 12680,
"train_runtime": 116226.1902,
"train_tokens_per_second": 7148.553
},
{
"epoch": 3.415965538130174,
"grad_norm": 0.98046875,
"learning_rate": 2.7468949784323905e-06,
"loss": 3.9629,
"num_input_tokens_seen": 831504384,
"step": 12690,
"train_runtime": 116318.5418,
"train_tokens_per_second": 7148.511
},
{
"epoch": 3.418657871710305,
"grad_norm": 0.953125,
"learning_rate": 2.722116433553418e-06,
"loss": 3.9296,
"num_input_tokens_seen": 832159744,
"step": 12700,
"train_runtime": 116411.3217,
"train_tokens_per_second": 7148.443
},
{
"epoch": 3.4213502052904357,
"grad_norm": 0.95703125,
"learning_rate": 2.6974437176197214e-06,
"loss": 3.9205,
"num_input_tokens_seen": 832815104,
"step": 12710,
"train_runtime": 116504.246,
"train_tokens_per_second": 7148.367
},
{
"epoch": 3.424042538870566,
"grad_norm": 0.97265625,
"learning_rate": 2.6728769478366638e-06,
"loss": 3.904,
"num_input_tokens_seen": 833470464,
"step": 12720,
"train_runtime": 116596.4746,
"train_tokens_per_second": 7148.333
},
{
"epoch": 3.4267348724506967,
"grad_norm": 0.96484375,
"learning_rate": 2.648416240906326e-06,
"loss": 4.0302,
"num_input_tokens_seen": 834125824,
"step": 12730,
"train_runtime": 116688.7306,
"train_tokens_per_second": 7148.298
},
{
"epoch": 3.429427206030827,
"grad_norm": 0.96875,
"learning_rate": 2.6240617130269428e-06,
"loss": 3.8861,
"num_input_tokens_seen": 834781184,
"step": 12740,
"train_runtime": 116781.4661,
"train_tokens_per_second": 7148.233
},
{
"epoch": 3.432119539610958,
"grad_norm": 0.95703125,
"learning_rate": 2.599813479892371e-06,
"loss": 3.9706,
"num_input_tokens_seen": 835436544,
"step": 12750,
"train_runtime": 116873.9549,
"train_tokens_per_second": 7148.184
},
{
"epoch": 3.434811873191088,
"grad_norm": 1.0703125,
"learning_rate": 2.5756716566914947e-06,
"loss": 3.963,
"num_input_tokens_seen": 836091904,
"step": 12760,
"train_runtime": 116966.4762,
"train_tokens_per_second": 7148.133
},
{
"epoch": 3.437504206771219,
"grad_norm": 0.9609375,
"learning_rate": 2.551636358107745e-06,
"loss": 3.9309,
"num_input_tokens_seen": 836747264,
"step": 12770,
"train_runtime": 117059.1439,
"train_tokens_per_second": 7148.073
},
{
"epoch": 3.4401965403513497,
"grad_norm": 0.9453125,
"learning_rate": 2.5277076983184765e-06,
"loss": 3.9271,
"num_input_tokens_seen": 837402624,
"step": 12780,
"train_runtime": 117151.9522,
"train_tokens_per_second": 7148.004
},
{
"epoch": 3.44288887393148,
"grad_norm": 0.984375,
"learning_rate": 2.503885790994509e-06,
"loss": 3.935,
"num_input_tokens_seen": 838057984,
"step": 12790,
"train_runtime": 117244.1649,
"train_tokens_per_second": 7147.972
},
{
"epoch": 3.445581207511611,
"grad_norm": 0.9296875,
"learning_rate": 2.480170749299504e-06,
"loss": 3.9176,
"num_input_tokens_seen": 838713344,
"step": 12800,
"train_runtime": 117337.2422,
"train_tokens_per_second": 7147.887
},
{
"epoch": 3.448273541091741,
"grad_norm": 0.9453125,
"learning_rate": 2.456562685889505e-06,
"loss": 3.9086,
"num_input_tokens_seen": 839368704,
"step": 12810,
"train_runtime": 117429.7527,
"train_tokens_per_second": 7147.837
},
{
"epoch": 3.450965874671872,
"grad_norm": 0.95703125,
"learning_rate": 2.4330617129123405e-06,
"loss": 3.882,
"num_input_tokens_seen": 840024064,
"step": 12820,
"train_runtime": 117522.192,
"train_tokens_per_second": 7147.791
},
{
"epoch": 3.453658208252002,
"grad_norm": 0.9453125,
"learning_rate": 2.4096679420071294e-06,
"loss": 4.0154,
"num_input_tokens_seen": 840679424,
"step": 12830,
"train_runtime": 117614.5624,
"train_tokens_per_second": 7147.749
},
{
"epoch": 3.456350541832133,
"grad_norm": 0.91796875,
"learning_rate": 2.386381484303729e-06,
"loss": 3.8888,
"num_input_tokens_seen": 841334784,
"step": 12840,
"train_runtime": 117707.487,
"train_tokens_per_second": 7147.674
},
{
"epoch": 3.4590428754122637,
"grad_norm": 0.93359375,
"learning_rate": 2.3632024504222195e-06,
"loss": 3.9835,
"num_input_tokens_seen": 841990144,
"step": 12850,
"train_runtime": 117800.2316,
"train_tokens_per_second": 7147.61
},
{
"epoch": 3.461735208992394,
"grad_norm": 0.96875,
"learning_rate": 2.3401309504723747e-06,
"loss": 3.934,
"num_input_tokens_seen": 842645504,
"step": 12860,
"train_runtime": 117892.9661,
"train_tokens_per_second": 7147.547
},
{
"epoch": 3.464427542572525,
"grad_norm": 0.9609375,
"learning_rate": 2.317167094053138e-06,
"loss": 3.9311,
"num_input_tokens_seen": 843300864,
"step": 12870,
"train_runtime": 117985.8884,
"train_tokens_per_second": 7147.472
},
{
"epoch": 3.467119876152655,
"grad_norm": 1.015625,
"learning_rate": 2.294310990252099e-06,
"loss": 3.9461,
"num_input_tokens_seen": 843956224,
"step": 12880,
"train_runtime": 118078.3568,
"train_tokens_per_second": 7147.425
},
{
"epoch": 3.469812209732786,
"grad_norm": 0.95703125,
"learning_rate": 2.2715627476449953e-06,
"loss": 3.9396,
"num_input_tokens_seen": 844611584,
"step": 12890,
"train_runtime": 118170.8013,
"train_tokens_per_second": 7147.38
},
{
"epoch": 3.4725045433129162,
"grad_norm": 0.9609375,
"learning_rate": 2.248922474295148e-06,
"loss": 3.8993,
"num_input_tokens_seen": 845266944,
"step": 12900,
"train_runtime": 118263.5744,
"train_tokens_per_second": 7147.314
},
{
"epoch": 3.475196876893047,
"grad_norm": 0.98046875,
"learning_rate": 2.226390277753024e-06,
"loss": 3.9836,
"num_input_tokens_seen": 845922304,
"step": 12910,
"train_runtime": 118356.2,
"train_tokens_per_second": 7147.258
},
{
"epoch": 3.477889210473178,
"grad_norm": 0.9140625,
"learning_rate": 2.2039662650556347e-06,
"loss": 3.949,
"num_input_tokens_seen": 846577664,
"step": 12920,
"train_runtime": 118449.0696,
"train_tokens_per_second": 7147.187
},
{
"epoch": 3.480581544053308,
"grad_norm": 0.9296875,
"learning_rate": 2.1816505427261158e-06,
"loss": 3.9043,
"num_input_tokens_seen": 847233024,
"step": 12930,
"train_runtime": 118541.3444,
"train_tokens_per_second": 7147.152
},
{
"epoch": 3.483273877633439,
"grad_norm": 0.953125,
"learning_rate": 2.1594432167731464e-06,
"loss": 3.9428,
"num_input_tokens_seen": 847888384,
"step": 12940,
"train_runtime": 118634.0314,
"train_tokens_per_second": 7147.092
},
{
"epoch": 3.485966211213569,
"grad_norm": 0.953125,
"learning_rate": 2.1373443926905076e-06,
"loss": 3.9676,
"num_input_tokens_seen": 848543744,
"step": 12950,
"train_runtime": 118726.1138,
"train_tokens_per_second": 7147.069
},
{
"epoch": 3.4886585447937,
"grad_norm": 0.92578125,
"learning_rate": 2.1153541754565326e-06,
"loss": 3.9707,
"num_input_tokens_seen": 849199104,
"step": 12960,
"train_runtime": 118818.8701,
"train_tokens_per_second": 7147.005
},
{
"epoch": 3.4913508783738303,
"grad_norm": 0.95703125,
"learning_rate": 2.0934726695336386e-06,
"loss": 3.8384,
"num_input_tokens_seen": 849854464,
"step": 12970,
"train_runtime": 118911.6083,
"train_tokens_per_second": 7146.943
},
{
"epoch": 3.494043211953961,
"grad_norm": 0.9921875,
"learning_rate": 2.0716999788678167e-06,
"loss": 3.9075,
"num_input_tokens_seen": 850509824,
"step": 12980,
"train_runtime": 119004.2495,
"train_tokens_per_second": 7146.886
},
{
"epoch": 3.496735545534092,
"grad_norm": 0.95703125,
"learning_rate": 2.0500362068881396e-06,
"loss": 4.0169,
"num_input_tokens_seen": 851165184,
"step": 12990,
"train_runtime": 119096.5756,
"train_tokens_per_second": 7146.849
},
{
"epoch": 3.499427879114222,
"grad_norm": 0.9765625,
"learning_rate": 2.028481456506276e-06,
"loss": 3.9316,
"num_input_tokens_seen": 851820544,
"step": 13000,
"train_runtime": 119189.4526,
"train_tokens_per_second": 7146.778
},
{
"epoch": 3.502120212694353,
"grad_norm": 0.9296875,
"learning_rate": 2.0070358301159996e-06,
"loss": 3.8656,
"num_input_tokens_seen": 852475904,
"step": 13010,
"train_runtime": 119304.0506,
"train_tokens_per_second": 7145.406
},
{
"epoch": 3.5048125462744832,
"grad_norm": 0.94921875,
"learning_rate": 1.9856994295926857e-06,
"loss": 3.8804,
"num_input_tokens_seen": 853131264,
"step": 13020,
"train_runtime": 119396.5114,
"train_tokens_per_second": 7145.362
},
{
"epoch": 3.507504879854614,
"grad_norm": 0.98828125,
"learning_rate": 1.964472356292876e-06,
"loss": 3.9249,
"num_input_tokens_seen": 853786624,
"step": 13030,
"train_runtime": 119489.3871,
"train_tokens_per_second": 7145.293
},
{
"epoch": 3.5101972134347443,
"grad_norm": 0.953125,
"learning_rate": 1.9433547110537214e-06,
"loss": 3.9338,
"num_input_tokens_seen": 854441984,
"step": 13040,
"train_runtime": 119581.9681,
"train_tokens_per_second": 7145.241
},
{
"epoch": 3.512889547014875,
"grad_norm": 0.953125,
"learning_rate": 1.922346594192581e-06,
"loss": 3.9438,
"num_input_tokens_seen": 855097344,
"step": 13050,
"train_runtime": 119675.863,
"train_tokens_per_second": 7145.111
},
{
"epoch": 3.515581880595006,
"grad_norm": 0.93359375,
"learning_rate": 1.9014481055064842e-06,
"loss": 3.9569,
"num_input_tokens_seen": 855752704,
"step": 13060,
"train_runtime": 119768.6888,
"train_tokens_per_second": 7145.045
},
{
"epoch": 3.518274214175136,
"grad_norm": 0.9140625,
"learning_rate": 1.880659344271707e-06,
"loss": 3.8864,
"num_input_tokens_seen": 856408064,
"step": 13070,
"train_runtime": 119861.1973,
"train_tokens_per_second": 7144.998
},
{
"epoch": 3.520966547755267,
"grad_norm": 0.97265625,
"learning_rate": 1.8599804092432477e-06,
"loss": 3.9652,
"num_input_tokens_seen": 857063424,
"step": 13080,
"train_runtime": 119954.2411,
"train_tokens_per_second": 7144.92
},
{
"epoch": 3.5236588813353977,
"grad_norm": 0.94921875,
"learning_rate": 1.8394113986544087e-06,
"loss": 3.9942,
"num_input_tokens_seen": 857718784,
"step": 13090,
"train_runtime": 120046.6443,
"train_tokens_per_second": 7144.879
},
{
"epoch": 3.526351214915528,
"grad_norm": 0.93359375,
"learning_rate": 1.8189524102162958e-06,
"loss": 3.9735,
"num_input_tokens_seen": 858374144,
"step": 13100,
"train_runtime": 120139.9898,
"train_tokens_per_second": 7144.783
},
{
"epoch": 3.5290435484956584,
"grad_norm": 0.93359375,
"learning_rate": 1.798603541117369e-06,
"loss": 3.9093,
"num_input_tokens_seen": 859029504,
"step": 13110,
"train_runtime": 120232.3957,
"train_tokens_per_second": 7144.742
},
{
"epoch": 3.531735882075789,
"grad_norm": 0.96484375,
"learning_rate": 1.7783648880229765e-06,
"loss": 3.9128,
"num_input_tokens_seen": 859684864,
"step": 13120,
"train_runtime": 120325.4141,
"train_tokens_per_second": 7144.666
},
{
"epoch": 3.53442821565592,
"grad_norm": 0.95703125,
"learning_rate": 1.758236547074893e-06,
"loss": 3.9422,
"num_input_tokens_seen": 860340224,
"step": 13130,
"train_runtime": 120417.7906,
"train_tokens_per_second": 7144.627
},
{
"epoch": 3.5371205492360502,
"grad_norm": 0.9375,
"learning_rate": 1.7382186138908629e-06,
"loss": 3.8821,
"num_input_tokens_seen": 860995584,
"step": 13140,
"train_runtime": 120510.245,
"train_tokens_per_second": 7144.584
},
{
"epoch": 3.539812882816181,
"grad_norm": 0.96484375,
"learning_rate": 1.7183111835641696e-06,
"loss": 4.0563,
"num_input_tokens_seen": 861650944,
"step": 13150,
"train_runtime": 120603.6381,
"train_tokens_per_second": 7144.486
},
{
"epoch": 3.5425052163963118,
"grad_norm": 0.93359375,
"learning_rate": 1.6985143506631301e-06,
"loss": 3.8831,
"num_input_tokens_seen": 862306304,
"step": 13160,
"train_runtime": 120696.122,
"train_tokens_per_second": 7144.441
},
{
"epoch": 3.545197549976442,
"grad_norm": 0.95703125,
"learning_rate": 1.6788282092307151e-06,
"loss": 3.8789,
"num_input_tokens_seen": 862961664,
"step": 13170,
"train_runtime": 120789.5116,
"train_tokens_per_second": 7144.343
},
{
"epoch": 3.5478898835565724,
"grad_norm": 0.95703125,
"learning_rate": 1.6592528527840296e-06,
"loss": 3.9771,
"num_input_tokens_seen": 863617024,
"step": 13180,
"train_runtime": 120882.5701,
"train_tokens_per_second": 7144.264
},
{
"epoch": 3.550582217136703,
"grad_norm": 0.9453125,
"learning_rate": 1.6397883743139387e-06,
"loss": 3.9567,
"num_input_tokens_seen": 864272384,
"step": 13190,
"train_runtime": 120975.3607,
"train_tokens_per_second": 7144.202
},
{
"epoch": 3.553274550716834,
"grad_norm": 0.94921875,
"learning_rate": 1.6204348662845648e-06,
"loss": 3.8517,
"num_input_tokens_seen": 864927744,
"step": 13200,
"train_runtime": 121068.3929,
"train_tokens_per_second": 7144.125
},
{
"epoch": 3.5559668842969643,
"grad_norm": 0.98046875,
"learning_rate": 1.6011924206328965e-06,
"loss": 3.9251,
"num_input_tokens_seen": 865583104,
"step": 13210,
"train_runtime": 121161.5615,
"train_tokens_per_second": 7144.041
},
{
"epoch": 3.558659217877095,
"grad_norm": 0.98828125,
"learning_rate": 1.5820611287683195e-06,
"loss": 3.8975,
"num_input_tokens_seen": 866238464,
"step": 13220,
"train_runtime": 121254.6622,
"train_tokens_per_second": 7143.96
},
{
"epoch": 3.561351551457226,
"grad_norm": 0.9375,
"learning_rate": 1.5630410815722001e-06,
"loss": 3.9955,
"num_input_tokens_seen": 866893824,
"step": 13230,
"train_runtime": 121347.1378,
"train_tokens_per_second": 7143.917
},
{
"epoch": 3.564043885037356,
"grad_norm": 0.94921875,
"learning_rate": 1.5441323693974441e-06,
"loss": 3.9784,
"num_input_tokens_seen": 867549184,
"step": 13240,
"train_runtime": 121440.003,
"train_tokens_per_second": 7143.85
},
{
"epoch": 3.5667362186174865,
"grad_norm": 0.94921875,
"learning_rate": 1.5253350820680689e-06,
"loss": 3.9543,
"num_input_tokens_seen": 868204544,
"step": 13250,
"train_runtime": 121533.1136,
"train_tokens_per_second": 7143.769
},
{
"epoch": 3.5694285521976172,
"grad_norm": 0.94140625,
"learning_rate": 1.5066493088787853e-06,
"loss": 3.9159,
"num_input_tokens_seen": 868859904,
"step": 13260,
"train_runtime": 121625.8864,
"train_tokens_per_second": 7143.709
},
{
"epoch": 3.572120885777748,
"grad_norm": 1.046875,
"learning_rate": 1.4880751385945608e-06,
"loss": 3.8722,
"num_input_tokens_seen": 869515264,
"step": 13270,
"train_runtime": 121719.2975,
"train_tokens_per_second": 7143.611
},
{
"epoch": 3.5748132193578783,
"grad_norm": 0.9921875,
"learning_rate": 1.4696126594502035e-06,
"loss": 3.9232,
"num_input_tokens_seen": 870170624,
"step": 13280,
"train_runtime": 121812.0493,
"train_tokens_per_second": 7143.551
},
{
"epoch": 3.577505552938009,
"grad_norm": 0.96484375,
"learning_rate": 1.4512619591499576e-06,
"loss": 3.8966,
"num_input_tokens_seen": 870825984,
"step": 13290,
"train_runtime": 121904.7883,
"train_tokens_per_second": 7143.493
},
{
"epoch": 3.58019788651814,
"grad_norm": 0.953125,
"learning_rate": 1.4330231248670501e-06,
"loss": 3.9572,
"num_input_tokens_seen": 871481344,
"step": 13300,
"train_runtime": 121997.553,
"train_tokens_per_second": 7143.433
},
{
"epoch": 3.58289022009827,
"grad_norm": 0.94921875,
"learning_rate": 1.414896243243319e-06,
"loss": 3.9553,
"num_input_tokens_seen": 872136704,
"step": 13310,
"train_runtime": 122091.1716,
"train_tokens_per_second": 7143.323
},
{
"epoch": 3.5855825536784005,
"grad_norm": 0.97265625,
"learning_rate": 1.3968814003887665e-06,
"loss": 3.8738,
"num_input_tokens_seen": 872792064,
"step": 13320,
"train_runtime": 122183.9693,
"train_tokens_per_second": 7143.262
},
{
"epoch": 3.5882748872585313,
"grad_norm": 0.97265625,
"learning_rate": 1.3789786818811823e-06,
"loss": 3.9605,
"num_input_tokens_seen": 873447424,
"step": 13330,
"train_runtime": 122276.6746,
"train_tokens_per_second": 7143.206
},
{
"epoch": 3.590967220838662,
"grad_norm": 0.9453125,
"learning_rate": 1.3611881727656956e-06,
"loss": 3.9756,
"num_input_tokens_seen": 874102784,
"step": 13340,
"train_runtime": 122369.8263,
"train_tokens_per_second": 7143.124
},
{
"epoch": 3.5936595544187924,
"grad_norm": 0.96484375,
"learning_rate": 1.3435099575544258e-06,
"loss": 3.9266,
"num_input_tokens_seen": 874758144,
"step": 13350,
"train_runtime": 122462.8819,
"train_tokens_per_second": 7143.047
},
{
"epoch": 3.596351887998923,
"grad_norm": 0.953125,
"learning_rate": 1.3259441202260276e-06,
"loss": 3.976,
"num_input_tokens_seen": 875413504,
"step": 13360,
"train_runtime": 122555.7889,
"train_tokens_per_second": 7142.98
},
{
"epoch": 3.599044221579054,
"grad_norm": 0.90625,
"learning_rate": 1.308490744225324e-06,
"loss": 4.0271,
"num_input_tokens_seen": 876068864,
"step": 13370,
"train_runtime": 122649.1213,
"train_tokens_per_second": 7142.887
},
{
"epoch": 3.6017365551591842,
"grad_norm": 0.9765625,
"learning_rate": 1.2911499124629023e-06,
"loss": 3.9439,
"num_input_tokens_seen": 876724224,
"step": 13380,
"train_runtime": 122742.1853,
"train_tokens_per_second": 7142.811
},
{
"epoch": 3.6044288887393146,
"grad_norm": 0.95703125,
"learning_rate": 1.2739217073147154e-06,
"loss": 3.9033,
"num_input_tokens_seen": 877379584,
"step": 13390,
"train_runtime": 122835.033,
"train_tokens_per_second": 7142.747
},
{
"epoch": 3.6071212223194453,
"grad_norm": 0.9375,
"learning_rate": 1.2568062106216998e-06,
"loss": 3.9033,
"num_input_tokens_seen": 878034944,
"step": 13400,
"train_runtime": 122928.5466,
"train_tokens_per_second": 7142.645
},
{
"epoch": 3.609813555899576,
"grad_norm": 0.93359375,
"learning_rate": 1.239803503689377e-06,
"loss": 3.9349,
"num_input_tokens_seen": 878690304,
"step": 13410,
"train_runtime": 123021.7118,
"train_tokens_per_second": 7142.563
},
{
"epoch": 3.6125058894797064,
"grad_norm": 0.94921875,
"learning_rate": 1.2229136672874674e-06,
"loss": 3.9461,
"num_input_tokens_seen": 879345664,
"step": 13420,
"train_runtime": 123114.5592,
"train_tokens_per_second": 7142.499
},
{
"epoch": 3.615198223059837,
"grad_norm": 0.9453125,
"learning_rate": 1.206136781649525e-06,
"loss": 3.895,
"num_input_tokens_seen": 880001024,
"step": 13430,
"train_runtime": 123207.418,
"train_tokens_per_second": 7142.435
},
{
"epoch": 3.617890556639968,
"grad_norm": 0.96875,
"learning_rate": 1.1894729264725235e-06,
"loss": 3.9249,
"num_input_tokens_seen": 880656384,
"step": 13440,
"train_runtime": 123300.3293,
"train_tokens_per_second": 7142.368
},
{
"epoch": 3.6205828902200983,
"grad_norm": 0.94140625,
"learning_rate": 1.1729221809165163e-06,
"loss": 3.9358,
"num_input_tokens_seen": 881311744,
"step": 13450,
"train_runtime": 123393.1214,
"train_tokens_per_second": 7142.309
},
{
"epoch": 3.6232752238002286,
"grad_norm": 0.9453125,
"learning_rate": 1.1564846236042177e-06,
"loss": 3.8576,
"num_input_tokens_seen": 881967104,
"step": 13460,
"train_runtime": 123485.088,
"train_tokens_per_second": 7142.296
},
{
"epoch": 3.6259675573803594,
"grad_norm": 0.96484375,
"learning_rate": 1.1401603326206767e-06,
"loss": 3.851,
"num_input_tokens_seen": 882622464,
"step": 13470,
"train_runtime": 123578.7292,
"train_tokens_per_second": 7142.188
},
{
"epoch": 3.62865989096049,
"grad_norm": 0.9375,
"learning_rate": 1.123949385512857e-06,
"loss": 3.993,
"num_input_tokens_seen": 883277824,
"step": 13480,
"train_runtime": 123671.2898,
"train_tokens_per_second": 7142.141
},
{
"epoch": 3.6313522245406205,
"grad_norm": 0.93359375,
"learning_rate": 1.1078518592893134e-06,
"loss": 3.9331,
"num_input_tokens_seen": 883933184,
"step": 13490,
"train_runtime": 123763.8687,
"train_tokens_per_second": 7142.094
},
{
"epoch": 3.6340445581207512,
"grad_norm": 0.9375,
"learning_rate": 1.091867830419796e-06,
"loss": 3.8876,
"num_input_tokens_seen": 884588544,
"step": 13500,
"train_runtime": 123856.936,
"train_tokens_per_second": 7142.019
},
{
"epoch": 3.636736891700882,
"grad_norm": 0.9296875,
"learning_rate": 1.0759973748348945e-06,
"loss": 3.8899,
"num_input_tokens_seen": 885243904,
"step": 13510,
"train_runtime": 123967.4324,
"train_tokens_per_second": 7140.939
},
{
"epoch": 3.6394292252810123,
"grad_norm": 0.94921875,
"learning_rate": 1.0602405679256883e-06,
"loss": 3.8452,
"num_input_tokens_seen": 885899264,
"step": 13520,
"train_runtime": 124060.5751,
"train_tokens_per_second": 7140.861
},
{
"epoch": 3.6421215588611426,
"grad_norm": 1.0078125,
"learning_rate": 1.044597484543372e-06,
"loss": 3.9445,
"num_input_tokens_seen": 886554624,
"step": 13530,
"train_runtime": 124153.6451,
"train_tokens_per_second": 7140.786
},
{
"epoch": 3.6448138924412734,
"grad_norm": 0.9765625,
"learning_rate": 1.0290681989989088e-06,
"loss": 3.9449,
"num_input_tokens_seen": 887209984,
"step": 13540,
"train_runtime": 124246.5598,
"train_tokens_per_second": 7140.721
},
{
"epoch": 3.647506226021404,
"grad_norm": 0.95703125,
"learning_rate": 1.0136527850626826e-06,
"loss": 3.8624,
"num_input_tokens_seen": 887865344,
"step": 13550,
"train_runtime": 124339.6232,
"train_tokens_per_second": 7140.647
},
{
"epoch": 3.6501985596015345,
"grad_norm": 0.95703125,
"learning_rate": 9.983513159641271e-07,
"loss": 3.8709,
"num_input_tokens_seen": 888520704,
"step": 13560,
"train_runtime": 124432.4044,
"train_tokens_per_second": 7140.589
},
{
"epoch": 3.6528908931816653,
"grad_norm": 0.9765625,
"learning_rate": 9.831638643914147e-07,
"loss": 3.9275,
"num_input_tokens_seen": 889176064,
"step": 13570,
"train_runtime": 124525.4167,
"train_tokens_per_second": 7140.519
},
{
"epoch": 3.655583226761796,
"grad_norm": 0.921875,
"learning_rate": 9.68090502491059e-07,
"loss": 3.9328,
"num_input_tokens_seen": 889831424,
"step": 13580,
"train_runtime": 124618.2504,
"train_tokens_per_second": 7140.458
},
{
"epoch": 3.6582755603419264,
"grad_norm": 0.9453125,
"learning_rate": 9.531313018676297e-07,
"loss": 3.8715,
"num_input_tokens_seen": 890486784,
"step": 13590,
"train_runtime": 124710.9332,
"train_tokens_per_second": 7140.407
},
{
"epoch": 3.6609678939220567,
"grad_norm": 0.93359375,
"learning_rate": 9.382863335833631e-07,
"loss": 3.8953,
"num_input_tokens_seen": 891142144,
"step": 13600,
"train_runtime": 124803.5328,
"train_tokens_per_second": 7140.36
},
{
"epoch": 3.6636602275021874,
"grad_norm": 0.90625,
"learning_rate": 9.235556681578605e-07,
"loss": 3.9701,
"num_input_tokens_seen": 891797504,
"step": 13610,
"train_runtime": 124896.9726,
"train_tokens_per_second": 7140.265
},
{
"epoch": 3.666352561082318,
"grad_norm": 0.94140625,
"learning_rate": 9.089393755677328e-07,
"loss": 3.9312,
"num_input_tokens_seen": 892452864,
"step": 13620,
"train_runtime": 124989.3517,
"train_tokens_per_second": 7140.231
},
{
"epoch": 3.6690448946624485,
"grad_norm": 0.9296875,
"learning_rate": 8.944375252462695e-07,
"loss": 3.8729,
"num_input_tokens_seen": 893108224,
"step": 13630,
"train_runtime": 125082.8191,
"train_tokens_per_second": 7140.135
},
{
"epoch": 3.6717372282425793,
"grad_norm": 0.96484375,
"learning_rate": 8.800501860831201e-07,
"loss": 3.952,
"num_input_tokens_seen": 893763584,
"step": 13640,
"train_runtime": 125175.6069,
"train_tokens_per_second": 7140.078
},
{
"epoch": 3.67442956182271,
"grad_norm": 0.96484375,
"learning_rate": 8.657774264239554e-07,
"loss": 3.9501,
"num_input_tokens_seen": 894418944,
"step": 13650,
"train_runtime": 125267.9245,
"train_tokens_per_second": 7140.048
},
{
"epoch": 3.6771218954028404,
"grad_norm": 0.93359375,
"learning_rate": 8.516193140701484e-07,
"loss": 3.7779,
"num_input_tokens_seen": 895074304,
"step": 13660,
"train_runtime": 125360.9193,
"train_tokens_per_second": 7139.979
},
{
"epoch": 3.6798142289829707,
"grad_norm": 0.96484375,
"learning_rate": 8.375759162784519e-07,
"loss": 3.9228,
"num_input_tokens_seen": 895729664,
"step": 13670,
"train_runtime": 125453.9842,
"train_tokens_per_second": 7139.906
},
{
"epoch": 3.6825065625631015,
"grad_norm": 0.9453125,
"learning_rate": 8.23647299760677e-07,
"loss": 3.8713,
"num_input_tokens_seen": 896385024,
"step": 13680,
"train_runtime": 125546.3023,
"train_tokens_per_second": 7139.876
},
{
"epoch": 3.6851988961432323,
"grad_norm": 0.9296875,
"learning_rate": 8.098335306833848e-07,
"loss": 3.9329,
"num_input_tokens_seen": 897040384,
"step": 13690,
"train_runtime": 125639.1859,
"train_tokens_per_second": 7139.814
},
{
"epoch": 3.6878912297233626,
"grad_norm": 0.9765625,
"learning_rate": 7.961346746675452e-07,
"loss": 3.8788,
"num_input_tokens_seen": 897695744,
"step": 13700,
"train_runtime": 125732.4529,
"train_tokens_per_second": 7139.73
},
{
"epoch": 3.6905835633034934,
"grad_norm": 0.93359375,
"learning_rate": 7.825507967882728e-07,
"loss": 3.8508,
"num_input_tokens_seen": 898351104,
"step": 13710,
"train_runtime": 125825.2377,
"train_tokens_per_second": 7139.673
},
{
"epoch": 3.693275896883624,
"grad_norm": 0.9921875,
"learning_rate": 7.690819615744582e-07,
"loss": 3.9309,
"num_input_tokens_seen": 899006464,
"step": 13720,
"train_runtime": 125918.1496,
"train_tokens_per_second": 7139.61
},
{
"epoch": 3.6959682304637544,
"grad_norm": 0.9375,
"learning_rate": 7.557282330085235e-07,
"loss": 3.8989,
"num_input_tokens_seen": 899661824,
"step": 13730,
"train_runtime": 126010.9722,
"train_tokens_per_second": 7139.551
},
{
"epoch": 3.6986605640438848,
"grad_norm": 0.9296875,
"learning_rate": 7.42489674526059e-07,
"loss": 3.9604,
"num_input_tokens_seen": 900317184,
"step": 13740,
"train_runtime": 126103.5634,
"train_tokens_per_second": 7139.506
},
{
"epoch": 3.7013528976240155,
"grad_norm": 0.9609375,
"learning_rate": 7.293663490155783e-07,
"loss": 3.8815,
"num_input_tokens_seen": 900972544,
"step": 13750,
"train_runtime": 126196.3334,
"train_tokens_per_second": 7139.451
},
{
"epoch": 3.7040452312041463,
"grad_norm": 0.96484375,
"learning_rate": 7.163583188181694e-07,
"loss": 3.9181,
"num_input_tokens_seen": 901627904,
"step": 13760,
"train_runtime": 126289.4151,
"train_tokens_per_second": 7139.378
},
{
"epoch": 3.7067375647842766,
"grad_norm": 0.921875,
"learning_rate": 7.034656457272332e-07,
"loss": 3.861,
"num_input_tokens_seen": 902283264,
"step": 13770,
"train_runtime": 126382.4093,
"train_tokens_per_second": 7139.311
},
{
"epoch": 3.7094298983644074,
"grad_norm": 0.93359375,
"learning_rate": 6.906883909881728e-07,
"loss": 3.8926,
"num_input_tokens_seen": 902938624,
"step": 13780,
"train_runtime": 126475.0511,
"train_tokens_per_second": 7139.263
},
{
"epoch": 3.712122231944538,
"grad_norm": 1.0078125,
"learning_rate": 6.780266152981107e-07,
"loss": 3.9374,
"num_input_tokens_seen": 903593984,
"step": 13790,
"train_runtime": 126568.1129,
"train_tokens_per_second": 7139.191
},
{
"epoch": 3.7148145655246685,
"grad_norm": 0.98046875,
"learning_rate": 6.654803788055968e-07,
"loss": 4.0054,
"num_input_tokens_seen": 904249344,
"step": 13800,
"train_runtime": 126661.0167,
"train_tokens_per_second": 7139.129
},
{
"epoch": 3.717506899104799,
"grad_norm": 0.9453125,
"learning_rate": 6.530497411103176e-07,
"loss": 3.8824,
"num_input_tokens_seen": 904904704,
"step": 13810,
"train_runtime": 126753.906,
"train_tokens_per_second": 7139.068
},
{
"epoch": 3.7201992326849296,
"grad_norm": 0.92578125,
"learning_rate": 6.407347612628234e-07,
"loss": 3.9466,
"num_input_tokens_seen": 905560064,
"step": 13820,
"train_runtime": 126847.0746,
"train_tokens_per_second": 7138.991
},
{
"epoch": 3.7228915662650603,
"grad_norm": 1.0,
"learning_rate": 6.285354977642516e-07,
"loss": 3.9317,
"num_input_tokens_seen": 906215424,
"step": 13830,
"train_runtime": 126939.4623,
"train_tokens_per_second": 7138.957
},
{
"epoch": 3.7255838998451907,
"grad_norm": 0.9140625,
"learning_rate": 6.164520085660208e-07,
"loss": 3.9766,
"num_input_tokens_seen": 906870784,
"step": 13840,
"train_runtime": 127032.893,
"train_tokens_per_second": 7138.866
},
{
"epoch": 3.7282762334253214,
"grad_norm": 0.91796875,
"learning_rate": 6.044843510695924e-07,
"loss": 3.8449,
"num_input_tokens_seen": 907526144,
"step": 13850,
"train_runtime": 127125.9777,
"train_tokens_per_second": 7138.794
},
{
"epoch": 3.730968567005452,
"grad_norm": 1.046875,
"learning_rate": 5.926325821261652e-07,
"loss": 4.0254,
"num_input_tokens_seen": 908181504,
"step": 13860,
"train_runtime": 127219.2626,
"train_tokens_per_second": 7138.711
},
{
"epoch": 3.7336609005855825,
"grad_norm": 0.98046875,
"learning_rate": 5.808967580364366e-07,
"loss": 3.9602,
"num_input_tokens_seen": 908836864,
"step": 13870,
"train_runtime": 127311.9513,
"train_tokens_per_second": 7138.661
},
{
"epoch": 3.736353234165713,
"grad_norm": 0.95703125,
"learning_rate": 5.692769345503057e-07,
"loss": 3.8487,
"num_input_tokens_seen": 909492224,
"step": 13880,
"train_runtime": 127405.1651,
"train_tokens_per_second": 7138.582
},
{
"epoch": 3.7390455677458436,
"grad_norm": 0.96875,
"learning_rate": 5.577731668666347e-07,
"loss": 3.9313,
"num_input_tokens_seen": 910147584,
"step": 13890,
"train_runtime": 127498.1611,
"train_tokens_per_second": 7138.515
},
{
"epoch": 3.7417379013259744,
"grad_norm": 0.9765625,
"learning_rate": 5.463855096329601e-07,
"loss": 3.9862,
"num_input_tokens_seen": 910802944,
"step": 13900,
"train_runtime": 127591.5238,
"train_tokens_per_second": 7138.428
},
{
"epoch": 3.7444302349061047,
"grad_norm": 0.9609375,
"learning_rate": 5.35114016945254e-07,
"loss": 3.8747,
"num_input_tokens_seen": 911458304,
"step": 13910,
"train_runtime": 127684.5276,
"train_tokens_per_second": 7138.361
},
{
"epoch": 3.7471225684862355,
"grad_norm": 0.90234375,
"learning_rate": 5.239587423476633e-07,
"loss": 3.8832,
"num_input_tokens_seen": 912113664,
"step": 13920,
"train_runtime": 127777.5312,
"train_tokens_per_second": 7138.295
},
{
"epoch": 3.7498149020663663,
"grad_norm": 1.0,
"learning_rate": 5.12919738832246e-07,
"loss": 3.9736,
"num_input_tokens_seen": 912769024,
"step": 13930,
"train_runtime": 127870.422,
"train_tokens_per_second": 7138.234
},
{
"epoch": 3.7525072356464966,
"grad_norm": 0.953125,
"learning_rate": 5.019970588387213e-07,
"loss": 3.9561,
"num_input_tokens_seen": 913424384,
"step": 13940,
"train_runtime": 127963.6033,
"train_tokens_per_second": 7138.158
},
{
"epoch": 3.7551995692266273,
"grad_norm": 0.921875,
"learning_rate": 4.911907542542449e-07,
"loss": 3.9202,
"num_input_tokens_seen": 914079744,
"step": 13950,
"train_runtime": 128056.3548,
"train_tokens_per_second": 7138.105
},
{
"epoch": 3.7578919028067577,
"grad_norm": 0.9453125,
"learning_rate": 4.805008764131147e-07,
"loss": 3.8571,
"num_input_tokens_seen": 914735104,
"step": 13960,
"train_runtime": 128149.0609,
"train_tokens_per_second": 7138.055
},
{
"epoch": 3.7605842363868884,
"grad_norm": 0.92578125,
"learning_rate": 4.699274760965794e-07,
"loss": 3.99,
"num_input_tokens_seen": 915390464,
"step": 13970,
"train_runtime": 128242.0726,
"train_tokens_per_second": 7137.989
},
{
"epoch": 3.7632765699670188,
"grad_norm": 0.98046875,
"learning_rate": 4.5947060353254967e-07,
"loss": 3.8639,
"num_input_tokens_seen": 916045824,
"step": 13980,
"train_runtime": 128335.1981,
"train_tokens_per_second": 7137.916
},
{
"epoch": 3.7659689035471495,
"grad_norm": 0.9453125,
"learning_rate": 4.4913030839540404e-07,
"loss": 3.9185,
"num_input_tokens_seen": 916701184,
"step": 13990,
"train_runtime": 128428.5155,
"train_tokens_per_second": 7137.832
},
{
"epoch": 3.7686612371272803,
"grad_norm": 0.9609375,
"learning_rate": 4.38906639805714e-07,
"loss": 3.9674,
"num_input_tokens_seen": 917356544,
"step": 14000,
"train_runtime": 128521.9811,
"train_tokens_per_second": 7137.74
},
{
"epoch": 3.7713535707074106,
"grad_norm": 0.9609375,
"learning_rate": 4.2879964633003867e-07,
"loss": 3.9561,
"num_input_tokens_seen": 918011904,
"step": 14010,
"train_runtime": 128632.4387,
"train_tokens_per_second": 7136.706
},
{
"epoch": 3.7740459042875414,
"grad_norm": 0.96875,
"learning_rate": 4.188093759806805e-07,
"loss": 3.941,
"num_input_tokens_seen": 918667264,
"step": 14020,
"train_runtime": 128724.8013,
"train_tokens_per_second": 7136.676
},
{
"epoch": 3.7767382378676717,
"grad_norm": 0.921875,
"learning_rate": 4.0893587621545493e-07,
"loss": 3.9297,
"num_input_tokens_seen": 919322624,
"step": 14030,
"train_runtime": 128818.3904,
"train_tokens_per_second": 7136.579
},
{
"epoch": 3.7794305714478025,
"grad_norm": 0.9609375,
"learning_rate": 3.9917919393747673e-07,
"loss": 3.9339,
"num_input_tokens_seen": 919977984,
"step": 14040,
"train_runtime": 128911.4414,
"train_tokens_per_second": 7136.512
},
{
"epoch": 3.782122905027933,
"grad_norm": 0.9296875,
"learning_rate": 3.895393754949267e-07,
"loss": 3.923,
"num_input_tokens_seen": 920633344,
"step": 14050,
"train_runtime": 129004.4875,
"train_tokens_per_second": 7136.444
},
{
"epoch": 3.7848152386080636,
"grad_norm": 0.95703125,
"learning_rate": 3.8001646668083537e-07,
"loss": 3.9791,
"num_input_tokens_seen": 921288704,
"step": 14060,
"train_runtime": 129097.3418,
"train_tokens_per_second": 7136.388
},
{
"epoch": 3.7875075721881943,
"grad_norm": 0.9765625,
"learning_rate": 3.706105127328663e-07,
"loss": 3.9704,
"num_input_tokens_seen": 921944064,
"step": 14070,
"train_runtime": 129190.3786,
"train_tokens_per_second": 7136.321
},
{
"epoch": 3.7901999057683247,
"grad_norm": 0.953125,
"learning_rate": 3.613215583330998e-07,
"loss": 3.8051,
"num_input_tokens_seen": 922599424,
"step": 14080,
"train_runtime": 129283.3427,
"train_tokens_per_second": 7136.259
},
{
"epoch": 3.7928922393484554,
"grad_norm": 0.9375,
"learning_rate": 3.521496476078245e-07,
"loss": 3.8645,
"num_input_tokens_seen": 923254784,
"step": 14090,
"train_runtime": 129376.1908,
"train_tokens_per_second": 7136.203
},
{
"epoch": 3.7955845729285858,
"grad_norm": 0.9296875,
"learning_rate": 3.4309482412731574e-07,
"loss": 3.9249,
"num_input_tokens_seen": 923910144,
"step": 14100,
"train_runtime": 129469.0328,
"train_tokens_per_second": 7136.148
},
{
"epoch": 3.7982769065087165,
"grad_norm": 0.94140625,
"learning_rate": 3.341571309056463e-07,
"loss": 3.8903,
"num_input_tokens_seen": 924565504,
"step": 14110,
"train_runtime": 129561.6755,
"train_tokens_per_second": 7136.103
},
{
"epoch": 3.800969240088847,
"grad_norm": 1.0234375,
"learning_rate": 3.253366104004646e-07,
"loss": 3.8958,
"num_input_tokens_seen": 925220864,
"step": 14120,
"train_runtime": 129654.5103,
"train_tokens_per_second": 7136.048
},
{
"epoch": 3.8036615736689776,
"grad_norm": 0.93359375,
"learning_rate": 3.1663330451281446e-07,
"loss": 3.8679,
"num_input_tokens_seen": 925876224,
"step": 14130,
"train_runtime": 129747.5313,
"train_tokens_per_second": 7135.983
},
{
"epoch": 3.8063539072491084,
"grad_norm": 0.98828125,
"learning_rate": 3.0804725458690177e-07,
"loss": 3.8652,
"num_input_tokens_seen": 926531584,
"step": 14140,
"train_runtime": 129840.1296,
"train_tokens_per_second": 7135.942
},
{
"epoch": 3.8090462408292387,
"grad_norm": 0.953125,
"learning_rate": 2.9957850140994447e-07,
"loss": 3.9356,
"num_input_tokens_seen": 927186944,
"step": 14150,
"train_runtime": 129933.2246,
"train_tokens_per_second": 7135.873
},
{
"epoch": 3.8117385744093695,
"grad_norm": 0.94140625,
"learning_rate": 2.912270852119314e-07,
"loss": 3.9382,
"num_input_tokens_seen": 927842304,
"step": 14160,
"train_runtime": 130026.321,
"train_tokens_per_second": 7135.804
},
{
"epoch": 3.8144309079895,
"grad_norm": 0.984375,
"learning_rate": 2.8299304566546667e-07,
"loss": 3.9925,
"num_input_tokens_seen": 928497664,
"step": 14170,
"train_runtime": 130118.8414,
"train_tokens_per_second": 7135.766
},
{
"epoch": 3.8171232415696306,
"grad_norm": 0.921875,
"learning_rate": 2.748764218855643e-07,
"loss": 3.9259,
"num_input_tokens_seen": 929153024,
"step": 14180,
"train_runtime": 130211.9958,
"train_tokens_per_second": 7135.695
},
{
"epoch": 3.819815575149761,
"grad_norm": 0.98828125,
"learning_rate": 2.668772524294649e-07,
"loss": 3.9098,
"num_input_tokens_seen": 929808384,
"step": 14190,
"train_runtime": 130304.4538,
"train_tokens_per_second": 7135.661
},
{
"epoch": 3.8225079087298917,
"grad_norm": 0.94921875,
"learning_rate": 2.589955752964529e-07,
"loss": 3.9708,
"num_input_tokens_seen": 930463744,
"step": 14200,
"train_runtime": 130397.4744,
"train_tokens_per_second": 7135.596
},
{
"epoch": 3.8252002423100224,
"grad_norm": 0.9453125,
"learning_rate": 2.5123142792768117e-07,
"loss": 3.9448,
"num_input_tokens_seen": 931119104,
"step": 14210,
"train_runtime": 130489.9438,
"train_tokens_per_second": 7135.562
},
{
"epoch": 3.8278925758901527,
"grad_norm": 1.0546875,
"learning_rate": 2.435848472059826e-07,
"loss": 3.8709,
"num_input_tokens_seen": 931774464,
"step": 14220,
"train_runtime": 130582.7644,
"train_tokens_per_second": 7135.509
},
{
"epoch": 3.8305849094702835,
"grad_norm": 0.94921875,
"learning_rate": 2.3605586945570635e-07,
"loss": 4.0202,
"num_input_tokens_seen": 932429824,
"step": 14230,
"train_runtime": 130675.8246,
"train_tokens_per_second": 7135.442
},
{
"epoch": 3.833277243050414,
"grad_norm": 1.0546875,
"learning_rate": 2.286445304425372e-07,
"loss": 4.0128,
"num_input_tokens_seen": 933085184,
"step": 14240,
"train_runtime": 130768.5266,
"train_tokens_per_second": 7135.396
},
{
"epoch": 3.8359695766305446,
"grad_norm": 0.9609375,
"learning_rate": 2.2135086537332926e-07,
"loss": 3.8913,
"num_input_tokens_seen": 933740544,
"step": 14250,
"train_runtime": 130861.4831,
"train_tokens_per_second": 7135.335
},
{
"epoch": 3.838661910210675,
"grad_norm": 0.9453125,
"learning_rate": 2.1417490889593661e-07,
"loss": 3.9366,
"num_input_tokens_seen": 934395904,
"step": 14260,
"train_runtime": 130954.3688,
"train_tokens_per_second": 7135.279
},
{
"epoch": 3.8413542437908057,
"grad_norm": 0.93359375,
"learning_rate": 2.0711669509905218e-07,
"loss": 3.8326,
"num_input_tokens_seen": 935051264,
"step": 14270,
"train_runtime": 131047.4893,
"train_tokens_per_second": 7135.209
},
{
"epoch": 3.8440465773709365,
"grad_norm": 0.9921875,
"learning_rate": 2.0017625751204138e-07,
"loss": 3.9629,
"num_input_tokens_seen": 935706624,
"step": 14280,
"train_runtime": 131140.2652,
"train_tokens_per_second": 7135.159
},
{
"epoch": 3.846738910951067,
"grad_norm": 0.984375,
"learning_rate": 1.933536291047866e-07,
"loss": 3.9492,
"num_input_tokens_seen": 936361984,
"step": 14290,
"train_runtime": 131233.5766,
"train_tokens_per_second": 7135.079
},
{
"epoch": 3.8494312445311976,
"grad_norm": 0.96484375,
"learning_rate": 1.86648842287529e-07,
"loss": 3.8845,
"num_input_tokens_seen": 937017344,
"step": 14300,
"train_runtime": 131326.1746,
"train_tokens_per_second": 7135.039
},
{
"epoch": 3.852123578111328,
"grad_norm": 0.9453125,
"learning_rate": 1.8006192891071581e-07,
"loss": 3.8817,
"num_input_tokens_seen": 937672704,
"step": 14310,
"train_runtime": 131419.3724,
"train_tokens_per_second": 7134.966
},
{
"epoch": 3.8548159116914587,
"grad_norm": 0.9140625,
"learning_rate": 1.735929202648423e-07,
"loss": 3.8734,
"num_input_tokens_seen": 938328064,
"step": 14320,
"train_runtime": 131512.5259,
"train_tokens_per_second": 7134.895
},
{
"epoch": 3.857508245271589,
"grad_norm": 1.0,
"learning_rate": 1.6724184708031276e-07,
"loss": 3.8662,
"num_input_tokens_seen": 938983424,
"step": 14330,
"train_runtime": 131605.8597,
"train_tokens_per_second": 7134.815
},
{
"epoch": 3.8602005788517197,
"grad_norm": 0.9296875,
"learning_rate": 1.6100873952729078e-07,
"loss": 4.0064,
"num_input_tokens_seen": 939638784,
"step": 14340,
"train_runtime": 131698.4257,
"train_tokens_per_second": 7134.776
},
{
"epoch": 3.8628929124318505,
"grad_norm": 0.921875,
"learning_rate": 1.5489362721556044e-07,
"loss": 3.8501,
"num_input_tokens_seen": 940294144,
"step": 14350,
"train_runtime": 131790.8987,
"train_tokens_per_second": 7134.743
},
{
"epoch": 3.865585246011981,
"grad_norm": 0.95703125,
"learning_rate": 1.488965391943653e-07,
"loss": 4.0097,
"num_input_tokens_seen": 940949504,
"step": 14360,
"train_runtime": 131884.3248,
"train_tokens_per_second": 7134.658
},
{
"epoch": 3.8682775795921116,
"grad_norm": 0.96484375,
"learning_rate": 1.4301750395230296e-07,
"loss": 3.8275,
"num_input_tokens_seen": 941604864,
"step": 14370,
"train_runtime": 131977.2636,
"train_tokens_per_second": 7134.599
},
{
"epoch": 3.870969913172242,
"grad_norm": 0.95703125,
"learning_rate": 1.3725654941716127e-07,
"loss": 3.9239,
"num_input_tokens_seen": 942260224,
"step": 14380,
"train_runtime": 132070.4041,
"train_tokens_per_second": 7134.53
},
{
"epoch": 3.8736622467523727,
"grad_norm": 0.95703125,
"learning_rate": 1.3161370295580734e-07,
"loss": 3.8412,
"num_input_tokens_seen": 942915584,
"step": 14390,
"train_runtime": 132163.8532,
"train_tokens_per_second": 7134.444
},
{
"epoch": 3.876354580332503,
"grad_norm": 0.96484375,
"learning_rate": 1.2608899137403207e-07,
"loss": 3.9429,
"num_input_tokens_seen": 943570944,
"step": 14400,
"train_runtime": 132256.8675,
"train_tokens_per_second": 7134.381
},
{
"epoch": 3.879046913912634,
"grad_norm": 0.9453125,
"learning_rate": 1.2068244091645588e-07,
"loss": 4.0124,
"num_input_tokens_seen": 944226304,
"step": 14410,
"train_runtime": 132349.5777,
"train_tokens_per_second": 7134.336
},
{
"epoch": 3.8817392474927646,
"grad_norm": 0.94140625,
"learning_rate": 1.153940772663703e-07,
"loss": 3.8483,
"num_input_tokens_seen": 944881664,
"step": 14420,
"train_runtime": 132442.9328,
"train_tokens_per_second": 7134.255
},
{
"epoch": 3.884431581072895,
"grad_norm": 0.95703125,
"learning_rate": 1.1022392554564387e-07,
"loss": 3.8698,
"num_input_tokens_seen": 945537024,
"step": 14430,
"train_runtime": 132535.4318,
"train_tokens_per_second": 7134.221
},
{
"epoch": 3.8871239146530256,
"grad_norm": 0.9609375,
"learning_rate": 1.0517201031458312e-07,
"loss": 3.8573,
"num_input_tokens_seen": 946192384,
"step": 14440,
"train_runtime": 132628.1579,
"train_tokens_per_second": 7134.174
},
{
"epoch": 3.889816248233156,
"grad_norm": 0.96484375,
"learning_rate": 1.0023835557182448e-07,
"loss": 3.9935,
"num_input_tokens_seen": 946847744,
"step": 14450,
"train_runtime": 132721.5337,
"train_tokens_per_second": 7134.093
},
{
"epoch": 3.8925085818132867,
"grad_norm": 0.953125,
"learning_rate": 9.542298475422318e-08,
"loss": 3.9391,
"num_input_tokens_seen": 947503104,
"step": 14460,
"train_runtime": 132814.2323,
"train_tokens_per_second": 7134.048
},
{
"epoch": 3.895200915393417,
"grad_norm": 0.9296875,
"learning_rate": 9.072592073673392e-08,
"loss": 3.9545,
"num_input_tokens_seen": 948158464,
"step": 14470,
"train_runtime": 132907.318,
"train_tokens_per_second": 7133.982
},
{
"epoch": 3.897893248973548,
"grad_norm": 0.9609375,
"learning_rate": 8.614718583230819e-08,
"loss": 3.9326,
"num_input_tokens_seen": 948813824,
"step": 14480,
"train_runtime": 133000.2245,
"train_tokens_per_second": 7133.926
},
{
"epoch": 3.9005855825536786,
"grad_norm": 0.9375,
"learning_rate": 8.168680179178879e-08,
"loss": 3.9629,
"num_input_tokens_seen": 949469184,
"step": 14490,
"train_runtime": 133092.8079,
"train_tokens_per_second": 7133.888
},
{
"epoch": 3.903277916133809,
"grad_norm": 0.9765625,
"learning_rate": 7.734478980379878e-08,
"loss": 3.8664,
"num_input_tokens_seen": 950124544,
"step": 14500,
"train_runtime": 133185.5074,
"train_tokens_per_second": 7133.843
},
{
"epoch": 3.9059702497139397,
"grad_norm": 0.953125,
"learning_rate": 7.312117049464995e-08,
"loss": 3.9334,
"num_input_tokens_seen": 950779904,
"step": 14510,
"train_runtime": 133295.6189,
"train_tokens_per_second": 7132.867
},
{
"epoch": 3.90866258329407,
"grad_norm": 0.91796875,
"learning_rate": 6.901596392824006e-08,
"loss": 3.9127,
"num_input_tokens_seen": 951435264,
"step": 14520,
"train_runtime": 133388.1264,
"train_tokens_per_second": 7132.833
},
{
"epoch": 3.911354916874201,
"grad_norm": 0.9609375,
"learning_rate": 6.502918960595849e-08,
"loss": 3.9548,
"num_input_tokens_seen": 952090624,
"step": 14530,
"train_runtime": 133480.8519,
"train_tokens_per_second": 7132.788
},
{
"epoch": 3.914047250454331,
"grad_norm": 0.9453125,
"learning_rate": 6.116086646659192e-08,
"loss": 3.9364,
"num_input_tokens_seen": 952745984,
"step": 14540,
"train_runtime": 133574.0035,
"train_tokens_per_second": 7132.72
},
{
"epoch": 3.916739584034462,
"grad_norm": 0.98046875,
"learning_rate": 5.741101288623818e-08,
"loss": 3.9602,
"num_input_tokens_seen": 953401344,
"step": 14550,
"train_runtime": 133666.7795,
"train_tokens_per_second": 7132.672
},
{
"epoch": 3.9194319176145926,
"grad_norm": 0.91015625,
"learning_rate": 5.377964667822033e-08,
"loss": 3.9444,
"num_input_tokens_seen": 954056704,
"step": 14560,
"train_runtime": 133758.9615,
"train_tokens_per_second": 7132.656
},
{
"epoch": 3.922124251194723,
"grad_norm": 0.953125,
"learning_rate": 5.026678509298943e-08,
"loss": 3.8375,
"num_input_tokens_seen": 954712064,
"step": 14570,
"train_runtime": 133852.3351,
"train_tokens_per_second": 7132.577
},
{
"epoch": 3.9248165847748537,
"grad_norm": 0.9453125,
"learning_rate": 4.687244481806075e-08,
"loss": 4.0509,
"num_input_tokens_seen": 955367424,
"step": 14580,
"train_runtime": 133945.6488,
"train_tokens_per_second": 7132.501
},
{
"epoch": 3.927508918354984,
"grad_norm": 0.953125,
"learning_rate": 4.3596641977916575e-08,
"loss": 3.8624,
"num_input_tokens_seen": 956022784,
"step": 14590,
"train_runtime": 134038.3984,
"train_tokens_per_second": 7132.455
},
{
"epoch": 3.930201251935115,
"grad_norm": 0.97265625,
"learning_rate": 4.043939213395076e-08,
"loss": 3.9485,
"num_input_tokens_seen": 956678144,
"step": 14600,
"train_runtime": 134131.6976,
"train_tokens_per_second": 7132.379
},
{
"epoch": 3.932893585515245,
"grad_norm": 1.03125,
"learning_rate": 3.740071028436876e-08,
"loss": 3.922,
"num_input_tokens_seen": 957333504,
"step": 14610,
"train_runtime": 134224.6729,
"train_tokens_per_second": 7132.321
},
{
"epoch": 3.935585919095376,
"grad_norm": 0.94140625,
"learning_rate": 3.448061086414045e-08,
"loss": 3.9343,
"num_input_tokens_seen": 957988864,
"step": 14620,
"train_runtime": 134317.4462,
"train_tokens_per_second": 7132.274
},
{
"epoch": 3.9382782526755067,
"grad_norm": 0.95703125,
"learning_rate": 3.167910774491412e-08,
"loss": 3.9556,
"num_input_tokens_seen": 958644224,
"step": 14630,
"train_runtime": 134410.7699,
"train_tokens_per_second": 7132.198
},
{
"epoch": 3.940970586255637,
"grad_norm": 0.94140625,
"learning_rate": 2.8996214234966456e-08,
"loss": 3.9511,
"num_input_tokens_seen": 959299584,
"step": 14640,
"train_runtime": 134503.2588,
"train_tokens_per_second": 7132.166
},
{
"epoch": 3.9436629198357678,
"grad_norm": 0.92578125,
"learning_rate": 2.6431943079122112e-08,
"loss": 3.9414,
"num_input_tokens_seen": 959954944,
"step": 14650,
"train_runtime": 134596.4574,
"train_tokens_per_second": 7132.097
},
{
"epoch": 3.946355253415898,
"grad_norm": 1.0078125,
"learning_rate": 2.39863064587037e-08,
"loss": 4.0138,
"num_input_tokens_seen": 960610304,
"step": 14660,
"train_runtime": 134689.6475,
"train_tokens_per_second": 7132.028
},
{
"epoch": 3.949047586996029,
"grad_norm": 0.953125,
"learning_rate": 2.165931599147353e-08,
"loss": 3.9296,
"num_input_tokens_seen": 961265664,
"step": 14670,
"train_runtime": 134782.3868,
"train_tokens_per_second": 7131.983
},
{
"epoch": 3.951739920576159,
"grad_norm": 0.9296875,
"learning_rate": 1.9450982731578082e-08,
"loss": 3.9363,
"num_input_tokens_seen": 961921024,
"step": 14680,
"train_runtime": 134875.6297,
"train_tokens_per_second": 7131.911
},
{
"epoch": 3.95443225415629,
"grad_norm": 0.99609375,
"learning_rate": 1.7361317169492518e-08,
"loss": 3.955,
"num_input_tokens_seen": 962576384,
"step": 14690,
"train_runtime": 134968.5816,
"train_tokens_per_second": 7131.855
},
{
"epoch": 3.9571245877364207,
"grad_norm": 0.98046875,
"learning_rate": 1.5390329231970703e-08,
"loss": 3.8801,
"num_input_tokens_seen": 963231744,
"step": 14700,
"train_runtime": 135061.2815,
"train_tokens_per_second": 7131.813
},
{
"epoch": 3.959816921316551,
"grad_norm": 0.9375,
"learning_rate": 1.3538028282000792e-08,
"loss": 3.9704,
"num_input_tokens_seen": 963887104,
"step": 14710,
"train_runtime": 135154.3462,
"train_tokens_per_second": 7131.751
},
{
"epoch": 3.962509254896682,
"grad_norm": 0.94140625,
"learning_rate": 1.1804423118760844e-08,
"loss": 3.8934,
"num_input_tokens_seen": 964542464,
"step": 14720,
"train_runtime": 135247.3859,
"train_tokens_per_second": 7131.69
},
{
"epoch": 3.965201588476812,
"grad_norm": 0.99609375,
"learning_rate": 1.0189521977577166e-08,
"loss": 4.0073,
"num_input_tokens_seen": 965197824,
"step": 14730,
"train_runtime": 135340.1518,
"train_tokens_per_second": 7131.644
},
{
"epoch": 3.967893922056943,
"grad_norm": 0.921875,
"learning_rate": 8.693332529879917e-09,
"loss": 3.9742,
"num_input_tokens_seen": 965853184,
"step": 14740,
"train_runtime": 135433.0047,
"train_tokens_per_second": 7131.594
},
{
"epoch": 3.9705862556370732,
"grad_norm": 0.9921875,
"learning_rate": 7.3158618831781215e-09,
"loss": 3.8715,
"num_input_tokens_seen": 966508544,
"step": 14750,
"train_runtime": 135526.1675,
"train_tokens_per_second": 7131.527
},
{
"epoch": 3.973278589217204,
"grad_norm": 0.9375,
"learning_rate": 6.057116581006939e-09,
"loss": 3.9439,
"num_input_tokens_seen": 967163904,
"step": 14760,
"train_runtime": 135619.1962,
"train_tokens_per_second": 7131.468
},
{
"epoch": 3.9759709227973348,
"grad_norm": 0.9765625,
"learning_rate": 4.917102602922108e-09,
"loss": 3.858,
"num_input_tokens_seen": 967819264,
"step": 14770,
"train_runtime": 135711.9871,
"train_tokens_per_second": 7131.421
},
{
"epoch": 3.978663256377465,
"grad_norm": 0.96484375,
"learning_rate": 3.895825364444438e-09,
"loss": 3.9277,
"num_input_tokens_seen": 968474624,
"step": 14780,
"train_runtime": 135804.7135,
"train_tokens_per_second": 7131.377
},
{
"epoch": 3.981355589957596,
"grad_norm": 0.9765625,
"learning_rate": 2.9932897170542594e-09,
"loss": 3.8246,
"num_input_tokens_seen": 969129984,
"step": 14790,
"train_runtime": 135897.7934,
"train_tokens_per_second": 7131.315
},
{
"epoch": 3.984047923537726,
"grad_norm": 0.94921875,
"learning_rate": 2.2094999481664382e-09,
"loss": 4.0439,
"num_input_tokens_seen": 969785344,
"step": 14800,
"train_runtime": 135990.5442,
"train_tokens_per_second": 7131.27
},
{
"epoch": 3.986740257117857,
"grad_norm": 0.9609375,
"learning_rate": 1.5444597810915228e-09,
"loss": 3.9585,
"num_input_tokens_seen": 970440704,
"step": 14810,
"train_runtime": 136083.8449,
"train_tokens_per_second": 7131.197
},
{
"epoch": 3.9894325906979873,
"grad_norm": 0.94921875,
"learning_rate": 9.981723750412951e-10,
"loss": 3.8607,
"num_input_tokens_seen": 971096064,
"step": 14820,
"train_runtime": 136176.9057,
"train_tokens_per_second": 7131.136
},
{
"epoch": 3.992124924278118,
"grad_norm": 1.0078125,
"learning_rate": 5.706403251037884e-10,
"loss": 3.9295,
"num_input_tokens_seen": 971751424,
"step": 14830,
"train_runtime": 136269.7783,
"train_tokens_per_second": 7131.085
},
{
"epoch": 3.994817257858249,
"grad_norm": 0.9375,
"learning_rate": 2.6186566222663465e-10,
"loss": 3.9395,
"num_input_tokens_seen": 972406784,
"step": 14840,
"train_runtime": 136362.3434,
"train_tokens_per_second": 7131.051
},
{
"epoch": 3.997509591438379,
"grad_norm": 0.89453125,
"learning_rate": 7.184985321706527e-11,
"loss": 3.9461,
"num_input_tokens_seen": 973062144,
"step": 14850,
"train_runtime": 136455.5477,
"train_tokens_per_second": 7130.983
},
{
"epoch": 4.0,
"grad_norm": 1.96875,
"learning_rate": 5.938007224814967e-13,
"loss": 3.9392,
"num_input_tokens_seen": 973668352,
"step": 14860,
"train_runtime": 136541.371,
"train_tokens_per_second": 7130.94
},
{
"epoch": 4.0,
"num_input_tokens_seen": 973668352,
"step": 14860,
"total_flos": 2.1227013472184697e+19,
"train_loss": 4.094350151065862,
"train_runtime": 136571.1716,
"train_samples_per_second": 3.481,
"train_steps_per_second": 0.109,
"train_tokens_per_second": 7129.384
}
],
"logging_steps": 10,
"max_steps": 14860,
"num_input_tokens_seen": 973668352,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.1227013472184697e+19,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}