Arabic-TTS-Spark / LLM /trainer_state.json
IbrahimSalah's picture
Upload LLM/trainer_state.json with huggingface_hub
589fba1 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 44.270833333333336,
"eval_steps": 576,
"global_step": 25500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001736111111111111,
"eval_loss": 9.594602584838867,
"eval_runtime": 41.3373,
"eval_samples_per_second": 90.209,
"eval_steps_per_second": 5.661,
"step": 1
},
{
"epoch": 0.08680555555555555,
"grad_norm": 12.75,
"learning_rate": 0.000196,
"loss": 7.4156,
"step": 50
},
{
"epoch": 0.1736111111111111,
"grad_norm": 13.3125,
"learning_rate": 0.0001999985665413352,
"loss": 4.4164,
"step": 100
},
{
"epoch": 0.2604166666666667,
"grad_norm": 5.78125,
"learning_rate": 0.00019999414859436728,
"loss": 4.1765,
"step": 150
},
{
"epoch": 0.3472222222222222,
"grad_norm": 11.0,
"learning_rate": 0.00019998674569395055,
"loss": 4.0896,
"step": 200
},
{
"epoch": 0.4340277777777778,
"grad_norm": 6.625,
"learning_rate": 0.000199976358061071,
"loss": 3.9586,
"step": 250
},
{
"epoch": 0.5208333333333334,
"grad_norm": 6.65625,
"learning_rate": 0.00019996298600581287,
"loss": 3.9273,
"step": 300
},
{
"epoch": 0.6076388888888888,
"grad_norm": 13.125,
"learning_rate": 0.0001999466299273491,
"loss": 3.8612,
"step": 350
},
{
"epoch": 0.6944444444444444,
"grad_norm": 7.0625,
"learning_rate": 0.00019992729031392958,
"loss": 3.8205,
"step": 400
},
{
"epoch": 0.78125,
"grad_norm": 8.75,
"learning_rate": 0.00019990496774286654,
"loss": 3.7956,
"step": 450
},
{
"epoch": 0.8680555555555556,
"grad_norm": 8.75,
"learning_rate": 0.00019987966288051735,
"loss": 3.7654,
"step": 500
},
{
"epoch": 0.9548611111111112,
"grad_norm": 14.0625,
"learning_rate": 0.00019985137648226457,
"loss": 3.6055,
"step": 550
},
{
"epoch": 1.0,
"eval_loss": 3.320380210876465,
"eval_runtime": 41.8114,
"eval_samples_per_second": 89.186,
"eval_steps_per_second": 5.597,
"step": 576
},
{
"epoch": 1.0416666666666667,
"grad_norm": 13.875,
"learning_rate": 0.00019982010939249346,
"loss": 3.4141,
"step": 600
},
{
"epoch": 1.1284722222222223,
"grad_norm": 15.125,
"learning_rate": 0.0001997858625445666,
"loss": 3.3461,
"step": 650
},
{
"epoch": 1.2152777777777777,
"grad_norm": 13.25,
"learning_rate": 0.0001997486369607964,
"loss": 3.2968,
"step": 700
},
{
"epoch": 1.3020833333333333,
"grad_norm": 11.25,
"learning_rate": 0.00019970843375241416,
"loss": 3.2924,
"step": 750
},
{
"epoch": 1.3888888888888888,
"grad_norm": 12.5,
"learning_rate": 0.00019966525411953717,
"loss": 3.2577,
"step": 800
},
{
"epoch": 1.4756944444444444,
"grad_norm": 13.0625,
"learning_rate": 0.00019961909935113284,
"loss": 3.2544,
"step": 850
},
{
"epoch": 1.5625,
"grad_norm": 14.125,
"learning_rate": 0.00019956997082498009,
"loss": 3.2245,
"step": 900
},
{
"epoch": 1.6493055555555556,
"grad_norm": 9.1875,
"learning_rate": 0.00019951787000762835,
"loss": 3.2121,
"step": 950
},
{
"epoch": 1.7361111111111112,
"grad_norm": 13.125,
"learning_rate": 0.00019946279845435382,
"loss": 3.1861,
"step": 1000
},
{
"epoch": 1.8229166666666665,
"grad_norm": 8.8125,
"learning_rate": 0.0001994047578091129,
"loss": 3.1813,
"step": 1050
},
{
"epoch": 1.9097222222222223,
"grad_norm": 10.5,
"learning_rate": 0.00019934374980449325,
"loss": 3.1483,
"step": 1100
},
{
"epoch": 1.9965277777777777,
"grad_norm": 11.875,
"learning_rate": 0.00019927977626166193,
"loss": 3.1491,
"step": 1150
},
{
"epoch": 2.0,
"eval_loss": 3.0681025981903076,
"eval_runtime": 41.9062,
"eval_samples_per_second": 88.984,
"eval_steps_per_second": 5.584,
"step": 1152
},
{
"epoch": 2.0833333333333335,
"grad_norm": 10.25,
"learning_rate": 0.00019921283909031114,
"loss": 3.1364,
"step": 1200
},
{
"epoch": 2.170138888888889,
"grad_norm": 10.375,
"learning_rate": 0.00019914294028860127,
"loss": 3.1123,
"step": 1250
},
{
"epoch": 2.2569444444444446,
"grad_norm": 10.3125,
"learning_rate": 0.00019907008194310102,
"loss": 3.1234,
"step": 1300
},
{
"epoch": 2.34375,
"grad_norm": 12.9375,
"learning_rate": 0.00019899426622872543,
"loss": 3.1215,
"step": 1350
},
{
"epoch": 2.4305555555555554,
"grad_norm": 12.5,
"learning_rate": 0.00019891549540867066,
"loss": 3.0999,
"step": 1400
},
{
"epoch": 2.517361111111111,
"grad_norm": 6.59375,
"learning_rate": 0.00019883377183434666,
"loss": 3.1192,
"step": 1450
},
{
"epoch": 2.6041666666666665,
"grad_norm": 7.6875,
"learning_rate": 0.00019874909794530675,
"loss": 3.0983,
"step": 1500
},
{
"epoch": 2.6909722222222223,
"grad_norm": 8.625,
"learning_rate": 0.0001986614762691751,
"loss": 3.0853,
"step": 1550
},
{
"epoch": 2.7777777777777777,
"grad_norm": 12.875,
"learning_rate": 0.00019857090942157092,
"loss": 3.0822,
"step": 1600
},
{
"epoch": 2.8645833333333335,
"grad_norm": 11.3125,
"learning_rate": 0.00019847740010603068,
"loss": 3.0779,
"step": 1650
},
{
"epoch": 2.951388888888889,
"grad_norm": 7.3125,
"learning_rate": 0.00019838095111392726,
"loss": 3.0747,
"step": 1700
},
{
"epoch": 3.0,
"eval_loss": 3.002568483352661,
"eval_runtime": 40.5832,
"eval_samples_per_second": 91.885,
"eval_steps_per_second": 5.766,
"step": 1728
},
{
"epoch": 3.0381944444444446,
"grad_norm": 11.625,
"learning_rate": 0.00019828156532438666,
"loss": 3.0638,
"step": 1750
},
{
"epoch": 3.125,
"grad_norm": 10.9375,
"learning_rate": 0.00019817924570420198,
"loss": 3.0585,
"step": 1800
},
{
"epoch": 3.2118055555555554,
"grad_norm": 7.0625,
"learning_rate": 0.00019807399530774502,
"loss": 3.0494,
"step": 1850
},
{
"epoch": 3.298611111111111,
"grad_norm": 9.125,
"learning_rate": 0.00019796581727687493,
"loss": 3.0628,
"step": 1900
},
{
"epoch": 3.3854166666666665,
"grad_norm": 11.875,
"learning_rate": 0.00019785471484084458,
"loss": 3.0529,
"step": 1950
},
{
"epoch": 3.4722222222222223,
"grad_norm": 14.9375,
"learning_rate": 0.00019774069131620398,
"loss": 3.0594,
"step": 2000
},
{
"epoch": 3.5590277777777777,
"grad_norm": 8.4375,
"learning_rate": 0.00019762375010670143,
"loss": 3.0478,
"step": 2050
},
{
"epoch": 3.6458333333333335,
"grad_norm": 9.8125,
"learning_rate": 0.0001975038947031819,
"loss": 3.0401,
"step": 2100
},
{
"epoch": 3.732638888888889,
"grad_norm": 11.0,
"learning_rate": 0.0001973811286834827,
"loss": 3.0339,
"step": 2150
},
{
"epoch": 3.8194444444444446,
"grad_norm": 9.0625,
"learning_rate": 0.00019725545571232686,
"loss": 3.0461,
"step": 2200
},
{
"epoch": 3.90625,
"grad_norm": 7.21875,
"learning_rate": 0.0001971268795412135,
"loss": 3.0156,
"step": 2250
},
{
"epoch": 3.9930555555555554,
"grad_norm": 9.75,
"learning_rate": 0.00019699540400830616,
"loss": 3.0261,
"step": 2300
},
{
"epoch": 4.0,
"eval_loss": 2.960036516189575,
"eval_runtime": 41.7286,
"eval_samples_per_second": 89.363,
"eval_steps_per_second": 5.608,
"step": 2304
},
{
"epoch": 4.079861111111111,
"grad_norm": 7.53125,
"learning_rate": 0.00019686103303831787,
"loss": 3.0194,
"step": 2350
},
{
"epoch": 4.166666666666667,
"grad_norm": 8.0,
"learning_rate": 0.0001967237706423943,
"loss": 2.9982,
"step": 2400
},
{
"epoch": 4.253472222222222,
"grad_norm": 10.0,
"learning_rate": 0.00019658362091799374,
"loss": 3.0147,
"step": 2450
},
{
"epoch": 4.340277777777778,
"grad_norm": 8.9375,
"learning_rate": 0.00019644058804876513,
"loss": 3.0187,
"step": 2500
},
{
"epoch": 4.427083333333333,
"grad_norm": 7.28125,
"learning_rate": 0.0001962946763044228,
"loss": 3.0009,
"step": 2550
},
{
"epoch": 4.513888888888889,
"grad_norm": 6.96875,
"learning_rate": 0.00019614589004061928,
"loss": 3.0264,
"step": 2600
},
{
"epoch": 4.600694444444445,
"grad_norm": 8.6875,
"learning_rate": 0.0001959942336988152,
"loss": 3.0037,
"step": 2650
},
{
"epoch": 4.6875,
"grad_norm": 8.25,
"learning_rate": 0.0001958397118061466,
"loss": 3.0003,
"step": 2700
},
{
"epoch": 4.774305555555555,
"grad_norm": 7.1875,
"learning_rate": 0.00019568232897529002,
"loss": 2.9937,
"step": 2750
},
{
"epoch": 4.861111111111111,
"grad_norm": 7.5,
"learning_rate": 0.00019552208990432457,
"loss": 2.9977,
"step": 2800
},
{
"epoch": 4.947916666666667,
"grad_norm": 11.8125,
"learning_rate": 0.0001953589993765918,
"loss": 2.992,
"step": 2850
},
{
"epoch": 5.0,
"eval_loss": 2.9334027767181396,
"eval_runtime": 42.3875,
"eval_samples_per_second": 87.974,
"eval_steps_per_second": 5.52,
"step": 2880
},
{
"epoch": 5.034722222222222,
"grad_norm": 8.8125,
"learning_rate": 0.000195193062260553,
"loss": 2.9851,
"step": 2900
},
{
"epoch": 5.121527777777778,
"grad_norm": 7.875,
"learning_rate": 0.00019502428350964355,
"loss": 2.9796,
"step": 2950
},
{
"epoch": 5.208333333333333,
"grad_norm": 6.53125,
"learning_rate": 0.00019485266816212548,
"loss": 2.977,
"step": 3000
},
{
"epoch": 5.295138888888889,
"grad_norm": 11.125,
"learning_rate": 0.00019467822134093684,
"loss": 2.9887,
"step": 3050
},
{
"epoch": 5.381944444444445,
"grad_norm": 8.0625,
"learning_rate": 0.00019450094825353864,
"loss": 2.982,
"step": 3100
},
{
"epoch": 5.46875,
"grad_norm": 8.75,
"learning_rate": 0.00019432085419175975,
"loss": 2.9896,
"step": 3150
},
{
"epoch": 5.555555555555555,
"grad_norm": 8.0625,
"learning_rate": 0.00019413794453163857,
"loss": 2.9854,
"step": 3200
},
{
"epoch": 5.642361111111111,
"grad_norm": 10.4375,
"learning_rate": 0.00019395222473326284,
"loss": 2.9749,
"step": 3250
},
{
"epoch": 5.729166666666667,
"grad_norm": 7.03125,
"learning_rate": 0.00019376370034060653,
"loss": 2.9705,
"step": 3300
},
{
"epoch": 5.815972222222222,
"grad_norm": 9.8125,
"learning_rate": 0.00019357237698136427,
"loss": 2.9855,
"step": 3350
},
{
"epoch": 5.902777777777778,
"grad_norm": 6.78125,
"learning_rate": 0.00019337826036678338,
"loss": 2.9596,
"step": 3400
},
{
"epoch": 5.989583333333333,
"grad_norm": 8.6875,
"learning_rate": 0.00019318135629149363,
"loss": 2.9692,
"step": 3450
},
{
"epoch": 6.0,
"eval_loss": 2.9161436557769775,
"eval_runtime": 41.8777,
"eval_samples_per_second": 89.045,
"eval_steps_per_second": 5.588,
"step": 3456
},
{
"epoch": 6.076388888888889,
"grad_norm": 8.5625,
"learning_rate": 0.0001929816706333339,
"loss": 2.9666,
"step": 3500
},
{
"epoch": 6.163194444444445,
"grad_norm": 11.625,
"learning_rate": 0.00019277920935317688,
"loss": 2.9451,
"step": 3550
},
{
"epoch": 6.25,
"grad_norm": 7.625,
"learning_rate": 0.00019257397849475124,
"loss": 2.9624,
"step": 3600
},
{
"epoch": 6.336805555555555,
"grad_norm": 7.34375,
"learning_rate": 0.00019236598418446098,
"loss": 2.9722,
"step": 3650
},
{
"epoch": 6.423611111111111,
"grad_norm": 7.5,
"learning_rate": 0.00019215523263120283,
"loss": 2.9552,
"step": 3700
},
{
"epoch": 6.510416666666667,
"grad_norm": 10.625,
"learning_rate": 0.0001919417301261806,
"loss": 2.9844,
"step": 3750
},
{
"epoch": 6.597222222222222,
"grad_norm": 6.25,
"learning_rate": 0.00019172548304271768,
"loss": 2.9576,
"step": 3800
},
{
"epoch": 6.684027777777778,
"grad_norm": 8.25,
"learning_rate": 0.00019150649783606646,
"loss": 2.9598,
"step": 3850
},
{
"epoch": 6.770833333333333,
"grad_norm": 6.25,
"learning_rate": 0.00019128478104321603,
"loss": 2.9488,
"step": 3900
},
{
"epoch": 6.857638888888889,
"grad_norm": 8.25,
"learning_rate": 0.00019106033928269667,
"loss": 2.9591,
"step": 3950
},
{
"epoch": 6.944444444444445,
"grad_norm": 5.8125,
"learning_rate": 0.00019083317925438248,
"loss": 2.9501,
"step": 4000
},
{
"epoch": 7.0,
"eval_loss": 2.90425968170166,
"eval_runtime": 41.3276,
"eval_samples_per_second": 90.23,
"eval_steps_per_second": 5.662,
"step": 4032
},
{
"epoch": 7.03125,
"grad_norm": 6.40625,
"learning_rate": 0.00019060330773929137,
"loss": 2.9478,
"step": 4050
},
{
"epoch": 7.118055555555555,
"grad_norm": 8.75,
"learning_rate": 0.00019037073159938256,
"loss": 2.9421,
"step": 4100
},
{
"epoch": 7.204861111111111,
"grad_norm": 6.1875,
"learning_rate": 0.00019013545777735183,
"loss": 2.9394,
"step": 4150
},
{
"epoch": 7.291666666666667,
"grad_norm": 6.71875,
"learning_rate": 0.00018989749329642418,
"loss": 2.9519,
"step": 4200
},
{
"epoch": 7.378472222222222,
"grad_norm": 6.25,
"learning_rate": 0.00018965684526014425,
"loss": 2.9475,
"step": 4250
},
{
"epoch": 7.465277777777778,
"grad_norm": 4.46875,
"learning_rate": 0.00018941352085216425,
"loss": 2.9507,
"step": 4300
},
{
"epoch": 7.552083333333333,
"grad_norm": 7.5625,
"learning_rate": 0.0001891675273360295,
"loss": 2.956,
"step": 4350
},
{
"epoch": 7.638888888888889,
"grad_norm": 8.1875,
"learning_rate": 0.00018891887205496163,
"loss": 2.9422,
"step": 4400
},
{
"epoch": 7.725694444444445,
"grad_norm": 6.625,
"learning_rate": 0.00018866756243163938,
"loss": 2.9379,
"step": 4450
},
{
"epoch": 7.8125,
"grad_norm": 7.46875,
"learning_rate": 0.00018841360596797695,
"loss": 2.9477,
"step": 4500
},
{
"epoch": 7.899305555555555,
"grad_norm": 9.3125,
"learning_rate": 0.0001881570102449002,
"loss": 2.9293,
"step": 4550
},
{
"epoch": 7.986111111111111,
"grad_norm": 8.375,
"learning_rate": 0.0001878977829221201,
"loss": 2.9379,
"step": 4600
},
{
"epoch": 8.0,
"eval_loss": 2.894627571105957,
"eval_runtime": 42.0326,
"eval_samples_per_second": 88.717,
"eval_steps_per_second": 5.567,
"step": 4608
},
{
"epoch": 8.072916666666666,
"grad_norm": 7.625,
"learning_rate": 0.00018763593173790454,
"loss": 2.9327,
"step": 4650
},
{
"epoch": 8.159722222222221,
"grad_norm": 6.25,
"learning_rate": 0.00018737146450884668,
"loss": 2.917,
"step": 4700
},
{
"epoch": 8.246527777777779,
"grad_norm": 5.28125,
"learning_rate": 0.00018710438912963225,
"loss": 2.9335,
"step": 4750
},
{
"epoch": 8.333333333333334,
"grad_norm": 5.90625,
"learning_rate": 0.00018683471357280347,
"loss": 2.9416,
"step": 4800
},
{
"epoch": 8.42013888888889,
"grad_norm": 6.84375,
"learning_rate": 0.00018656244588852124,
"loss": 2.9256,
"step": 4850
},
{
"epoch": 8.506944444444445,
"grad_norm": 5.5625,
"learning_rate": 0.00018628759420432473,
"loss": 2.9525,
"step": 4900
},
{
"epoch": 8.59375,
"grad_norm": 6.0625,
"learning_rate": 0.00018601016672488888,
"loss": 2.9268,
"step": 4950
},
{
"epoch": 8.680555555555555,
"grad_norm": 6.90625,
"learning_rate": 0.00018573017173177938,
"loss": 2.9347,
"step": 5000
},
{
"epoch": 8.76736111111111,
"grad_norm": 5.78125,
"learning_rate": 0.0001854476175832055,
"loss": 2.9267,
"step": 5050
},
{
"epoch": 8.854166666666666,
"grad_norm": 6.375,
"learning_rate": 0.00018516251271377064,
"loss": 2.9246,
"step": 5100
},
{
"epoch": 8.940972222222221,
"grad_norm": 6.0,
"learning_rate": 0.00018487486563422036,
"loss": 2.9221,
"step": 5150
},
{
"epoch": 9.0,
"eval_loss": 2.8883821964263916,
"eval_runtime": 41.385,
"eval_samples_per_second": 90.105,
"eval_steps_per_second": 5.654,
"step": 5184
},
{
"epoch": 9.027777777777779,
"grad_norm": 7.875,
"learning_rate": 0.00018458468493118857,
"loss": 2.9219,
"step": 5200
},
{
"epoch": 9.114583333333334,
"grad_norm": 7.90625,
"learning_rate": 0.000184291979266941,
"loss": 2.9209,
"step": 5250
},
{
"epoch": 9.20138888888889,
"grad_norm": 6.4375,
"learning_rate": 0.00018399675737911677,
"loss": 2.9127,
"step": 5300
},
{
"epoch": 9.288194444444445,
"grad_norm": 5.9375,
"learning_rate": 0.00018369902808046748,
"loss": 2.9262,
"step": 5350
},
{
"epoch": 9.375,
"grad_norm": 6.5625,
"learning_rate": 0.0001833988002585941,
"loss": 2.9258,
"step": 5400
},
{
"epoch": 9.461805555555555,
"grad_norm": 5.8125,
"learning_rate": 0.00018309608287568182,
"loss": 2.9275,
"step": 5450
},
{
"epoch": 9.54861111111111,
"grad_norm": 6.25,
"learning_rate": 0.00018279088496823235,
"loss": 2.9312,
"step": 5500
},
{
"epoch": 9.635416666666666,
"grad_norm": 6.09375,
"learning_rate": 0.00018248321564679425,
"loss": 2.9205,
"step": 5550
},
{
"epoch": 9.722222222222221,
"grad_norm": 8.0625,
"learning_rate": 0.0001821730840956909,
"loss": 2.9203,
"step": 5600
},
{
"epoch": 9.809027777777779,
"grad_norm": 4.6875,
"learning_rate": 0.00018186049957274656,
"loss": 2.9264,
"step": 5650
},
{
"epoch": 9.895833333333334,
"grad_norm": 5.0,
"learning_rate": 0.0001815454714090096,
"loss": 2.9109,
"step": 5700
},
{
"epoch": 9.98263888888889,
"grad_norm": 5.875,
"learning_rate": 0.0001812280090084744,
"loss": 2.9139,
"step": 5750
},
{
"epoch": 10.0,
"eval_loss": 2.8820853233337402,
"eval_runtime": 42.0383,
"eval_samples_per_second": 88.705,
"eval_steps_per_second": 5.566,
"step": 5760
},
{
"epoch": 10.069444444444445,
"grad_norm": 5.90625,
"learning_rate": 0.00018090812184780032,
"loss": 2.9105,
"step": 5800
},
{
"epoch": 10.15625,
"grad_norm": 5.59375,
"learning_rate": 0.000180585819476029,
"loss": 2.9039,
"step": 5850
},
{
"epoch": 10.243055555555555,
"grad_norm": 5.84375,
"learning_rate": 0.0001802611115142991,
"loss": 2.9122,
"step": 5900
},
{
"epoch": 10.32986111111111,
"grad_norm": 6.75,
"learning_rate": 0.00017993400765555932,
"loss": 2.9233,
"step": 5950
},
{
"epoch": 10.416666666666666,
"grad_norm": 5.71875,
"learning_rate": 0.00017960451766427897,
"loss": 2.9075,
"step": 6000
},
{
"epoch": 10.503472222222221,
"grad_norm": 6.09375,
"learning_rate": 0.00017927265137615637,
"loss": 2.937,
"step": 6050
},
{
"epoch": 10.590277777777779,
"grad_norm": 4.90625,
"learning_rate": 0.00017893841869782547,
"loss": 2.9075,
"step": 6100
},
{
"epoch": 10.677083333333334,
"grad_norm": 5.5625,
"learning_rate": 0.0001786018296065599,
"loss": 2.9184,
"step": 6150
},
{
"epoch": 10.76388888888889,
"grad_norm": 5.71875,
"learning_rate": 0.0001782628941499753,
"loss": 2.9093,
"step": 6200
},
{
"epoch": 10.850694444444445,
"grad_norm": 6.9375,
"learning_rate": 0.00017792162244572928,
"loss": 2.911,
"step": 6250
},
{
"epoch": 10.9375,
"grad_norm": 8.125,
"learning_rate": 0.00017757802468121946,
"loss": 2.9023,
"step": 6300
},
{
"epoch": 11.0,
"eval_loss": 2.8765242099761963,
"eval_runtime": 40.8481,
"eval_samples_per_second": 91.289,
"eval_steps_per_second": 5.729,
"step": 6336
},
{
"epoch": 11.024305555555555,
"grad_norm": 4.3125,
"learning_rate": 0.00017723211111327934,
"loss": 2.9075,
"step": 6350
},
{
"epoch": 11.11111111111111,
"grad_norm": 4.6875,
"learning_rate": 0.0001768838920678721,
"loss": 2.9027,
"step": 6400
},
{
"epoch": 11.197916666666666,
"grad_norm": 8.375,
"learning_rate": 0.00017653337793978237,
"loss": 2.8971,
"step": 6450
},
{
"epoch": 11.284722222222221,
"grad_norm": 6.34375,
"learning_rate": 0.00017618057919230597,
"loss": 2.9095,
"step": 6500
},
{
"epoch": 11.371527777777779,
"grad_norm": 10.125,
"learning_rate": 0.00017582550635693753,
"loss": 2.9108,
"step": 6550
},
{
"epoch": 11.458333333333334,
"grad_norm": 9.375,
"learning_rate": 0.0001754681700330561,
"loss": 2.9115,
"step": 6600
},
{
"epoch": 11.54513888888889,
"grad_norm": 5.96875,
"learning_rate": 0.00017510858088760876,
"loss": 2.9137,
"step": 6650
},
{
"epoch": 11.631944444444445,
"grad_norm": 6.9375,
"learning_rate": 0.00017474674965479222,
"loss": 2.91,
"step": 6700
},
{
"epoch": 11.71875,
"grad_norm": 9.8125,
"learning_rate": 0.00017438268713573237,
"loss": 2.9037,
"step": 6750
},
{
"epoch": 11.805555555555555,
"grad_norm": 4.75,
"learning_rate": 0.00017401640419816182,
"loss": 2.9103,
"step": 6800
},
{
"epoch": 11.89236111111111,
"grad_norm": 6.96875,
"learning_rate": 0.00017364791177609554,
"loss": 2.895,
"step": 6850
},
{
"epoch": 11.979166666666666,
"grad_norm": 6.0625,
"learning_rate": 0.00017327722086950446,
"loss": 2.8989,
"step": 6900
},
{
"epoch": 12.0,
"eval_loss": 2.872136116027832,
"eval_runtime": 41.6305,
"eval_samples_per_second": 89.574,
"eval_steps_per_second": 5.621,
"step": 6912
},
{
"epoch": 12.065972222222221,
"grad_norm": 7.75,
"learning_rate": 0.0001729043425439871,
"loss": 2.8952,
"step": 6950
},
{
"epoch": 12.152777777777779,
"grad_norm": 5.84375,
"learning_rate": 0.00017252928793043916,
"loss": 2.8915,
"step": 7000
},
{
"epoch": 12.239583333333334,
"grad_norm": 6.5625,
"learning_rate": 0.00017215206822472143,
"loss": 2.8955,
"step": 7050
},
{
"epoch": 12.32638888888889,
"grad_norm": 5.875,
"learning_rate": 0.00017177269468732535,
"loss": 2.9131,
"step": 7100
},
{
"epoch": 12.413194444444445,
"grad_norm": 6.65625,
"learning_rate": 0.00017139117864303714,
"loss": 2.8935,
"step": 7150
},
{
"epoch": 12.5,
"grad_norm": 6.96875,
"learning_rate": 0.0001710075314805995,
"loss": 2.9223,
"step": 7200
},
{
"epoch": 12.586805555555555,
"grad_norm": 5.71875,
"learning_rate": 0.00017062176465237175,
"loss": 2.8979,
"step": 7250
},
{
"epoch": 12.67361111111111,
"grad_norm": 7.28125,
"learning_rate": 0.00017023388967398796,
"loss": 2.9076,
"step": 7300
},
{
"epoch": 12.760416666666666,
"grad_norm": 6.0625,
"learning_rate": 0.00016984391812401316,
"loss": 2.8939,
"step": 7350
},
{
"epoch": 12.847222222222221,
"grad_norm": 5.03125,
"learning_rate": 0.00016945186164359782,
"loss": 2.9007,
"step": 7400
},
{
"epoch": 12.934027777777779,
"grad_norm": 5.46875,
"learning_rate": 0.00016905773193613013,
"loss": 2.891,
"step": 7450
},
{
"epoch": 13.0,
"eval_loss": 2.869907855987549,
"eval_runtime": 41.6939,
"eval_samples_per_second": 89.437,
"eval_steps_per_second": 5.612,
"step": 7488
},
{
"epoch": 13.020833333333334,
"grad_norm": 5.375,
"learning_rate": 0.00016866154076688683,
"loss": 2.8958,
"step": 7500
},
{
"epoch": 13.10763888888889,
"grad_norm": 5.03125,
"learning_rate": 0.00016826329996268196,
"loss": 2.8938,
"step": 7550
},
{
"epoch": 13.194444444444445,
"grad_norm": 6.0625,
"learning_rate": 0.00016786302141151368,
"loss": 2.8862,
"step": 7600
},
{
"epoch": 13.28125,
"grad_norm": 5.21875,
"learning_rate": 0.00016746071706220966,
"loss": 2.8969,
"step": 7650
},
{
"epoch": 13.368055555555555,
"grad_norm": 6.8125,
"learning_rate": 0.00016705639892407014,
"loss": 2.9042,
"step": 7700
},
{
"epoch": 13.45486111111111,
"grad_norm": 5.28125,
"learning_rate": 0.00016665007906650948,
"loss": 2.8953,
"step": 7750
},
{
"epoch": 13.541666666666666,
"grad_norm": 8.1875,
"learning_rate": 0.00016624176961869616,
"loss": 2.908,
"step": 7800
},
{
"epoch": 13.628472222222221,
"grad_norm": 5.34375,
"learning_rate": 0.0001658314827691902,
"loss": 2.8964,
"step": 7850
},
{
"epoch": 13.715277777777779,
"grad_norm": 5.0,
"learning_rate": 0.00016541923076557978,
"loss": 2.8924,
"step": 7900
},
{
"epoch": 13.802083333333334,
"grad_norm": 5.28125,
"learning_rate": 0.0001650050259141154,
"loss": 2.9024,
"step": 7950
},
{
"epoch": 13.88888888888889,
"grad_norm": 4.71875,
"learning_rate": 0.00016458888057934248,
"loss": 2.884,
"step": 8000
},
{
"epoch": 13.975694444444445,
"grad_norm": 11.4375,
"learning_rate": 0.0001641708071837325,
"loss": 2.8926,
"step": 8050
},
{
"epoch": 14.0,
"eval_loss": 2.8657476902008057,
"eval_runtime": 41.9302,
"eval_samples_per_second": 88.934,
"eval_steps_per_second": 5.581,
"step": 8064
},
{
"epoch": 14.0625,
"grad_norm": 6.40625,
"learning_rate": 0.00016375081820731193,
"loss": 2.8867,
"step": 8100
},
{
"epoch": 14.149305555555555,
"grad_norm": 4.625,
"learning_rate": 0.00016332892618728986,
"loss": 2.8829,
"step": 8150
},
{
"epoch": 14.23611111111111,
"grad_norm": 4.1875,
"learning_rate": 0.00016290514371768356,
"loss": 2.8852,
"step": 8200
},
{
"epoch": 14.322916666666666,
"grad_norm": 4.3125,
"learning_rate": 0.0001624794834489427,
"loss": 2.9058,
"step": 8250
},
{
"epoch": 14.409722222222221,
"grad_norm": 4.53125,
"learning_rate": 0.00016205195808757173,
"loss": 2.8848,
"step": 8300
},
{
"epoch": 14.496527777777779,
"grad_norm": 6.21875,
"learning_rate": 0.00016162258039575033,
"loss": 2.9088,
"step": 8350
},
{
"epoch": 14.583333333333334,
"grad_norm": 7.34375,
"learning_rate": 0.0001611913631909528,
"loss": 2.8913,
"step": 8400
},
{
"epoch": 14.67013888888889,
"grad_norm": 6.0,
"learning_rate": 0.00016075831934556518,
"loss": 2.9013,
"step": 8450
},
{
"epoch": 14.756944444444445,
"grad_norm": 8.9375,
"learning_rate": 0.00016032346178650105,
"loss": 2.8843,
"step": 8500
},
{
"epoch": 14.84375,
"grad_norm": 4.9375,
"learning_rate": 0.0001598868034948157,
"loss": 2.8901,
"step": 8550
},
{
"epoch": 14.930555555555555,
"grad_norm": 5.40625,
"learning_rate": 0.00015944835750531858,
"loss": 2.8824,
"step": 8600
},
{
"epoch": 15.0,
"eval_loss": 2.8647055625915527,
"eval_runtime": 41.7282,
"eval_samples_per_second": 89.364,
"eval_steps_per_second": 5.608,
"step": 8640
},
{
"epoch": 15.01736111111111,
"grad_norm": 8.125,
"learning_rate": 0.0001590081369061842,
"loss": 2.8874,
"step": 8650
},
{
"epoch": 15.104166666666666,
"grad_norm": 6.375,
"learning_rate": 0.00015856615483856153,
"loss": 2.8822,
"step": 8700
},
{
"epoch": 15.190972222222221,
"grad_norm": 6.21875,
"learning_rate": 0.00015812242449618147,
"loss": 2.8752,
"step": 8750
},
{
"epoch": 15.277777777777779,
"grad_norm": 7.15625,
"learning_rate": 0.0001576769591249633,
"loss": 2.8873,
"step": 8800
},
{
"epoch": 15.364583333333334,
"grad_norm": 5.0625,
"learning_rate": 0.0001572297720226191,
"loss": 2.8993,
"step": 8850
},
{
"epoch": 15.45138888888889,
"grad_norm": 5.8125,
"learning_rate": 0.00015678087653825675,
"loss": 2.8854,
"step": 8900
},
{
"epoch": 15.538194444444445,
"grad_norm": 4.03125,
"learning_rate": 0.0001563302860719816,
"loss": 2.8994,
"step": 8950
},
{
"epoch": 15.625,
"grad_norm": 6.59375,
"learning_rate": 0.00015587801407449648,
"loss": 2.8893,
"step": 9000
},
{
"epoch": 15.711805555555555,
"grad_norm": 6.25,
"learning_rate": 0.0001554240740466998,
"loss": 2.8871,
"step": 9050
},
{
"epoch": 15.79861111111111,
"grad_norm": 10.0625,
"learning_rate": 0.00015496847953928313,
"loss": 2.8935,
"step": 9100
},
{
"epoch": 15.885416666666666,
"grad_norm": 5.9375,
"learning_rate": 0.00015451124415232615,
"loss": 2.8775,
"step": 9150
},
{
"epoch": 15.972222222222221,
"grad_norm": 7.65625,
"learning_rate": 0.00015405238153489096,
"loss": 2.8831,
"step": 9200
},
{
"epoch": 16.0,
"eval_loss": 2.8630547523498535,
"eval_runtime": 40.8978,
"eval_samples_per_second": 91.179,
"eval_steps_per_second": 5.722,
"step": 9216
},
{
"epoch": 16.05902777777778,
"grad_norm": 5.78125,
"learning_rate": 0.00015359190538461462,
"loss": 2.88,
"step": 9250
},
{
"epoch": 16.145833333333332,
"grad_norm": 5.46875,
"learning_rate": 0.00015312982944730018,
"loss": 2.8777,
"step": 9300
},
{
"epoch": 16.23263888888889,
"grad_norm": 6.71875,
"learning_rate": 0.00015266616751650642,
"loss": 2.8785,
"step": 9350
},
{
"epoch": 16.319444444444443,
"grad_norm": 4.46875,
"learning_rate": 0.00015220093343313592,
"loss": 2.8968,
"step": 9400
},
{
"epoch": 16.40625,
"grad_norm": 5.125,
"learning_rate": 0.00015173414108502224,
"loss": 2.877,
"step": 9450
},
{
"epoch": 16.493055555555557,
"grad_norm": 5.03125,
"learning_rate": 0.00015126580440651496,
"loss": 2.9016,
"step": 9500
},
{
"epoch": 16.57986111111111,
"grad_norm": 6.71875,
"learning_rate": 0.00015079593737806399,
"loss": 2.8841,
"step": 9550
},
{
"epoch": 16.666666666666668,
"grad_norm": 5.21875,
"learning_rate": 0.00015032455402580217,
"loss": 2.8937,
"step": 9600
},
{
"epoch": 16.75347222222222,
"grad_norm": 5.59375,
"learning_rate": 0.00014985166842112644,
"loss": 2.8789,
"step": 9650
},
{
"epoch": 16.84027777777778,
"grad_norm": 5.5625,
"learning_rate": 0.00014937729468027797,
"loss": 2.8883,
"step": 9700
},
{
"epoch": 16.927083333333332,
"grad_norm": 5.15625,
"learning_rate": 0.00014890144696392074,
"loss": 2.8751,
"step": 9750
},
{
"epoch": 17.0,
"eval_loss": 2.862104892730713,
"eval_runtime": 41.9728,
"eval_samples_per_second": 88.843,
"eval_steps_per_second": 5.575,
"step": 9792
},
{
"epoch": 17.01388888888889,
"grad_norm": 8.0625,
"learning_rate": 0.00014842413947671872,
"loss": 2.8821,
"step": 9800
},
{
"epoch": 17.100694444444443,
"grad_norm": 5.71875,
"learning_rate": 0.0001479453864669119,
"loss": 2.8785,
"step": 9850
},
{
"epoch": 17.1875,
"grad_norm": 5.4375,
"learning_rate": 0.00014746520222589103,
"loss": 2.8715,
"step": 9900
},
{
"epoch": 17.274305555555557,
"grad_norm": 11.8125,
"learning_rate": 0.00014698360108777097,
"loss": 2.8826,
"step": 9950
},
{
"epoch": 17.36111111111111,
"grad_norm": 5.96875,
"learning_rate": 0.00014650059742896265,
"loss": 2.8958,
"step": 10000
},
{
"epoch": 17.447916666666668,
"grad_norm": 5.75,
"learning_rate": 0.00014601620566774415,
"loss": 2.8751,
"step": 10050
},
{
"epoch": 17.53472222222222,
"grad_norm": 4.53125,
"learning_rate": 0.00014553044026383014,
"loss": 2.8925,
"step": 10100
},
{
"epoch": 17.62152777777778,
"grad_norm": 5.28125,
"learning_rate": 0.0001450433157179403,
"loss": 2.8889,
"step": 10150
},
{
"epoch": 17.708333333333332,
"grad_norm": 6.21875,
"learning_rate": 0.00014455484657136642,
"loss": 2.8807,
"step": 10200
},
{
"epoch": 17.79513888888889,
"grad_norm": 5.375,
"learning_rate": 0.00014406504740553837,
"loss": 2.8836,
"step": 10250
},
{
"epoch": 17.881944444444443,
"grad_norm": 5.125,
"learning_rate": 0.00014357393284158878,
"loss": 2.8723,
"step": 10300
},
{
"epoch": 17.96875,
"grad_norm": 5.3125,
"learning_rate": 0.00014308151753991658,
"loss": 2.881,
"step": 10350
},
{
"epoch": 18.0,
"eval_loss": 2.8605997562408447,
"eval_runtime": 40.45,
"eval_samples_per_second": 92.188,
"eval_steps_per_second": 5.785,
"step": 10368
},
{
"epoch": 18.055555555555557,
"grad_norm": 4.5625,
"learning_rate": 0.00014258781619974945,
"loss": 2.8781,
"step": 10400
},
{
"epoch": 18.14236111111111,
"grad_norm": 4.625,
"learning_rate": 0.00014209284355870492,
"loss": 2.8705,
"step": 10450
},
{
"epoch": 18.229166666666668,
"grad_norm": 6.09375,
"learning_rate": 0.00014159661439235046,
"loss": 2.876,
"step": 10500
},
{
"epoch": 18.31597222222222,
"grad_norm": 4.875,
"learning_rate": 0.0001410991435137625,
"loss": 2.8918,
"step": 10550
},
{
"epoch": 18.40277777777778,
"grad_norm": 4.375,
"learning_rate": 0.00014060044577308408,
"loss": 2.8759,
"step": 10600
},
{
"epoch": 18.489583333333332,
"grad_norm": 5.90625,
"learning_rate": 0.00014010053605708174,
"loss": 2.8958,
"step": 10650
},
{
"epoch": 18.57638888888889,
"grad_norm": 5.125,
"learning_rate": 0.000139599429288701,
"loss": 2.8763,
"step": 10700
},
{
"epoch": 18.663194444444443,
"grad_norm": 4.96875,
"learning_rate": 0.00013909714042662085,
"loss": 2.8905,
"step": 10750
},
{
"epoch": 18.75,
"grad_norm": 5.625,
"learning_rate": 0.00013859368446480743,
"loss": 2.8782,
"step": 10800
},
{
"epoch": 18.836805555555557,
"grad_norm": 5.375,
"learning_rate": 0.0001380890764320662,
"loss": 2.8834,
"step": 10850
},
{
"epoch": 18.92361111111111,
"grad_norm": 3.984375,
"learning_rate": 0.00013758333139159343,
"loss": 2.8705,
"step": 10900
},
{
"epoch": 19.0,
"eval_loss": 2.8602795600891113,
"eval_runtime": 40.4167,
"eval_samples_per_second": 92.264,
"eval_steps_per_second": 5.79,
"step": 10944
},
{
"epoch": 19.010416666666668,
"grad_norm": 5.71875,
"learning_rate": 0.00013707646444052656,
"loss": 2.8757,
"step": 10950
},
{
"epoch": 19.09722222222222,
"grad_norm": 7.3125,
"learning_rate": 0.0001365684907094935,
"loss": 2.8753,
"step": 11000
},
{
"epoch": 19.18402777777778,
"grad_norm": 5.78125,
"learning_rate": 0.0001360594253621609,
"loss": 2.8632,
"step": 11050
},
{
"epoch": 19.270833333333332,
"grad_norm": 5.28125,
"learning_rate": 0.0001355492835947816,
"loss": 2.8771,
"step": 11100
},
{
"epoch": 19.35763888888889,
"grad_norm": 4.53125,
"learning_rate": 0.000135038080635741,
"loss": 2.8945,
"step": 11150
},
{
"epoch": 19.444444444444443,
"grad_norm": 4.25,
"learning_rate": 0.00013452583174510237,
"loss": 2.87,
"step": 11200
},
{
"epoch": 19.53125,
"grad_norm": 5.1875,
"learning_rate": 0.0001340125522141514,
"loss": 2.8948,
"step": 11250
},
{
"epoch": 19.618055555555557,
"grad_norm": 5.9375,
"learning_rate": 0.00013349825736493965,
"loss": 2.882,
"step": 11300
},
{
"epoch": 19.70486111111111,
"grad_norm": 7.0,
"learning_rate": 0.00013298296254982733,
"loss": 2.8753,
"step": 11350
},
{
"epoch": 19.791666666666668,
"grad_norm": 4.78125,
"learning_rate": 0.00013246668315102487,
"loss": 2.8823,
"step": 11400
},
{
"epoch": 19.87847222222222,
"grad_norm": 4.59375,
"learning_rate": 0.00013194943458013375,
"loss": 2.8675,
"step": 11450
},
{
"epoch": 19.96527777777778,
"grad_norm": 6.0,
"learning_rate": 0.00013143123227768658,
"loss": 2.8765,
"step": 11500
},
{
"epoch": 20.0,
"eval_loss": 2.8591601848602295,
"eval_runtime": 41.4885,
"eval_samples_per_second": 89.88,
"eval_steps_per_second": 5.64,
"step": 11520
},
{
"epoch": 20.052083333333332,
"grad_norm": 4.9375,
"learning_rate": 0.00013091209171268599,
"loss": 2.8735,
"step": 11550
},
{
"epoch": 20.13888888888889,
"grad_norm": 5.40625,
"learning_rate": 0.00013039202838214294,
"loss": 2.8698,
"step": 11600
},
{
"epoch": 20.225694444444443,
"grad_norm": 5.34375,
"learning_rate": 0.0001298710578106142,
"loss": 2.8702,
"step": 11650
},
{
"epoch": 20.3125,
"grad_norm": 6.59375,
"learning_rate": 0.00012934919554973874,
"loss": 2.8871,
"step": 11700
},
{
"epoch": 20.399305555555557,
"grad_norm": 4.375,
"learning_rate": 0.00012882645717777376,
"loss": 2.8752,
"step": 11750
},
{
"epoch": 20.48611111111111,
"grad_norm": 6.78125,
"learning_rate": 0.00012830285829912926,
"loss": 2.8896,
"step": 11800
},
{
"epoch": 20.572916666666668,
"grad_norm": 5.21875,
"learning_rate": 0.00012777841454390275,
"loss": 2.8768,
"step": 11850
},
{
"epoch": 20.65972222222222,
"grad_norm": 5.59375,
"learning_rate": 0.00012725314156741214,
"loss": 2.8846,
"step": 11900
},
{
"epoch": 20.74652777777778,
"grad_norm": 5.25,
"learning_rate": 0.00012672705504972884,
"loss": 2.873,
"step": 11950
},
{
"epoch": 20.833333333333332,
"grad_norm": 5.6875,
"learning_rate": 0.00012620017069520936,
"loss": 2.8809,
"step": 12000
},
{
"epoch": 20.92013888888889,
"grad_norm": 4.40625,
"learning_rate": 0.00012567250423202675,
"loss": 2.8656,
"step": 12050
},
{
"epoch": 21.0,
"eval_loss": 2.857980489730835,
"eval_runtime": 41.0572,
"eval_samples_per_second": 90.825,
"eval_steps_per_second": 5.699,
"step": 12096
},
{
"epoch": 21.006944444444443,
"grad_norm": 5.5,
"learning_rate": 0.00012514407141170104,
"loss": 2.8738,
"step": 12100
},
{
"epoch": 21.09375,
"grad_norm": 4.78125,
"learning_rate": 0.00012461488800862887,
"loss": 2.8725,
"step": 12150
},
{
"epoch": 21.180555555555557,
"grad_norm": 8.375,
"learning_rate": 0.00012408496981961288,
"loss": 2.8628,
"step": 12200
},
{
"epoch": 21.26736111111111,
"grad_norm": 7.375,
"learning_rate": 0.00012355433266338992,
"loss": 2.8733,
"step": 12250
},
{
"epoch": 21.354166666666668,
"grad_norm": 4.65625,
"learning_rate": 0.00012302299238015895,
"loss": 2.8901,
"step": 12300
},
{
"epoch": 21.44097222222222,
"grad_norm": 5.0625,
"learning_rate": 0.0001224909648311082,
"loss": 2.8696,
"step": 12350
},
{
"epoch": 21.52777777777778,
"grad_norm": 6.09375,
"learning_rate": 0.00012195826589794162,
"loss": 2.8925,
"step": 12400
},
{
"epoch": 21.614583333333332,
"grad_norm": 6.0625,
"learning_rate": 0.00012142491148240491,
"loss": 2.8764,
"step": 12450
},
{
"epoch": 21.70138888888889,
"grad_norm": 4.90625,
"learning_rate": 0.00012089091750581067,
"loss": 2.8716,
"step": 12500
},
{
"epoch": 21.788194444444443,
"grad_norm": 5.9375,
"learning_rate": 0.0001203562999085633,
"loss": 2.8816,
"step": 12550
},
{
"epoch": 21.875,
"grad_norm": 5.59375,
"learning_rate": 0.00011982107464968298,
"loss": 2.8677,
"step": 12600
},
{
"epoch": 21.961805555555557,
"grad_norm": 5.40625,
"learning_rate": 0.00011928525770632946,
"loss": 2.8729,
"step": 12650
},
{
"epoch": 22.0,
"eval_loss": 2.857877016067505,
"eval_runtime": 42.4855,
"eval_samples_per_second": 87.771,
"eval_steps_per_second": 5.508,
"step": 12672
},
{
"epoch": 22.04861111111111,
"grad_norm": 7.5,
"learning_rate": 0.000118748865073325,
"loss": 2.8712,
"step": 12700
},
{
"epoch": 22.135416666666668,
"grad_norm": 5.65625,
"learning_rate": 0.00011821191276267684,
"loss": 2.867,
"step": 12750
},
{
"epoch": 22.22222222222222,
"grad_norm": 6.1875,
"learning_rate": 0.00011767441680309955,
"loss": 2.8635,
"step": 12800
},
{
"epoch": 22.30902777777778,
"grad_norm": 8.625,
"learning_rate": 0.00011713639323953602,
"loss": 2.886,
"step": 12850
},
{
"epoch": 22.395833333333332,
"grad_norm": 6.09375,
"learning_rate": 0.00011659785813267905,
"loss": 2.872,
"step": 12900
},
{
"epoch": 22.48263888888889,
"grad_norm": 5.625,
"learning_rate": 0.0001160588275584915,
"loss": 2.8891,
"step": 12950
},
{
"epoch": 22.569444444444443,
"grad_norm": 4.59375,
"learning_rate": 0.00011551931760772661,
"loss": 2.8741,
"step": 13000
},
{
"epoch": 22.65625,
"grad_norm": 6.65625,
"learning_rate": 0.00011497934438544769,
"loss": 2.8815,
"step": 13050
},
{
"epoch": 22.743055555555557,
"grad_norm": 5.3125,
"learning_rate": 0.00011443892401054719,
"loss": 2.8705,
"step": 13100
},
{
"epoch": 22.82986111111111,
"grad_norm": 4.96875,
"learning_rate": 0.00011389807261526573,
"loss": 2.8823,
"step": 13150
},
{
"epoch": 22.916666666666668,
"grad_norm": 5.5,
"learning_rate": 0.00011335680634471035,
"loss": 2.8596,
"step": 13200
},
{
"epoch": 23.0,
"eval_loss": 2.856687545776367,
"eval_runtime": 42.2388,
"eval_samples_per_second": 88.284,
"eval_steps_per_second": 5.54,
"step": 13248
},
{
"epoch": 23.00347222222222,
"grad_norm": 7.3125,
"learning_rate": 0.00011281514135637278,
"loss": 2.8712,
"step": 13250
},
{
"epoch": 23.09027777777778,
"grad_norm": 5.46875,
"learning_rate": 0.00011227309381964684,
"loss": 2.8741,
"step": 13300
},
{
"epoch": 23.177083333333332,
"grad_norm": 4.875,
"learning_rate": 0.00011173067991534598,
"loss": 2.8567,
"step": 13350
},
{
"epoch": 23.26388888888889,
"grad_norm": 4.59375,
"learning_rate": 0.00011118791583522023,
"loss": 2.8739,
"step": 13400
},
{
"epoch": 23.350694444444443,
"grad_norm": 5.96875,
"learning_rate": 0.00011064481778147275,
"loss": 2.8865,
"step": 13450
},
{
"epoch": 23.4375,
"grad_norm": 5.6875,
"learning_rate": 0.00011010140196627627,
"loss": 2.8657,
"step": 13500
},
{
"epoch": 23.524305555555557,
"grad_norm": 5.15625,
"learning_rate": 0.00010955768461128911,
"loss": 2.8911,
"step": 13550
},
{
"epoch": 23.61111111111111,
"grad_norm": 5.0625,
"learning_rate": 0.00010901368194717091,
"loss": 2.8727,
"step": 13600
},
{
"epoch": 23.697916666666668,
"grad_norm": 5.34375,
"learning_rate": 0.00010846941021309817,
"loss": 2.8729,
"step": 13650
},
{
"epoch": 23.78472222222222,
"grad_norm": 5.53125,
"learning_rate": 0.00010792488565627953,
"loss": 2.8749,
"step": 13700
},
{
"epoch": 23.87152777777778,
"grad_norm": 4.84375,
"learning_rate": 0.00010738012453147062,
"loss": 2.87,
"step": 13750
},
{
"epoch": 23.958333333333332,
"grad_norm": 6.875,
"learning_rate": 0.00010683514310048894,
"loss": 2.8713,
"step": 13800
},
{
"epoch": 24.0,
"eval_loss": 2.856473922729492,
"eval_runtime": 40.551,
"eval_samples_per_second": 91.958,
"eval_steps_per_second": 5.771,
"step": 13824
},
{
"epoch": 24.04513888888889,
"grad_norm": 7.4375,
"learning_rate": 0.00010628995763172851,
"loss": 2.8675,
"step": 13850
},
{
"epoch": 24.131944444444443,
"grad_norm": 5.875,
"learning_rate": 0.00010574458439967401,
"loss": 2.8666,
"step": 13900
},
{
"epoch": 24.21875,
"grad_norm": 5.40625,
"learning_rate": 0.00010519903968441516,
"loss": 2.8586,
"step": 13950
},
{
"epoch": 24.305555555555557,
"grad_norm": 4.8125,
"learning_rate": 0.0001046533397711607,
"loss": 2.8836,
"step": 14000
},
{
"epoch": 24.39236111111111,
"grad_norm": 4.46875,
"learning_rate": 0.00010410750094975215,
"loss": 2.8711,
"step": 14050
},
{
"epoch": 24.479166666666668,
"grad_norm": 5.59375,
"learning_rate": 0.00010356153951417771,
"loss": 2.8866,
"step": 14100
},
{
"epoch": 24.56597222222222,
"grad_norm": 5.3125,
"learning_rate": 0.00010301547176208568,
"loss": 2.8723,
"step": 14150
},
{
"epoch": 24.65277777777778,
"grad_norm": 5.71875,
"learning_rate": 0.00010246931399429812,
"loss": 2.8754,
"step": 14200
},
{
"epoch": 24.739583333333332,
"grad_norm": 5.46875,
"learning_rate": 0.00010192308251432412,
"loss": 2.8733,
"step": 14250
},
{
"epoch": 24.82638888888889,
"grad_norm": 5.8125,
"learning_rate": 0.0001013767936278732,
"loss": 2.8821,
"step": 14300
},
{
"epoch": 24.913194444444443,
"grad_norm": 4.96875,
"learning_rate": 0.00010083046364236854,
"loss": 2.8564,
"step": 14350
},
{
"epoch": 25.0,
"grad_norm": 5.90625,
"learning_rate": 0.00010028410886646014,
"loss": 2.8708,
"step": 14400
},
{
"epoch": 25.0,
"eval_loss": 2.8556883335113525,
"eval_runtime": 39.6238,
"eval_samples_per_second": 94.11,
"eval_steps_per_second": 5.906,
"step": 14400
},
{
"epoch": 25.086805555555557,
"grad_norm": 4.46875,
"learning_rate": 9.97377456095381e-05,
"loss": 2.8732,
"step": 14450
},
{
"epoch": 25.17361111111111,
"grad_norm": 4.1875,
"learning_rate": 9.91913901812456e-05,
"loss": 2.8537,
"step": 14500
},
{
"epoch": 25.260416666666668,
"grad_norm": 3.640625,
"learning_rate": 9.864505889099217e-05,
"loss": 2.8704,
"step": 14550
},
{
"epoch": 25.34722222222222,
"grad_norm": 5.5,
"learning_rate": 9.809876804746683e-05,
"loss": 2.8865,
"step": 14600
},
{
"epoch": 25.43402777777778,
"grad_norm": 6.6875,
"learning_rate": 9.755253395815116e-05,
"loss": 2.8648,
"step": 14650
},
{
"epoch": 25.520833333333332,
"grad_norm": 5.46875,
"learning_rate": 9.700637292883252e-05,
"loss": 2.8886,
"step": 14700
},
{
"epoch": 25.60763888888889,
"grad_norm": 4.96875,
"learning_rate": 9.646030126311743e-05,
"loss": 2.872,
"step": 14750
},
{
"epoch": 25.694444444444443,
"grad_norm": 7.9375,
"learning_rate": 9.591433526194474e-05,
"loss": 2.8698,
"step": 14800
},
{
"epoch": 25.78125,
"grad_norm": 5.0,
"learning_rate": 9.536849122309901e-05,
"loss": 2.8718,
"step": 14850
},
{
"epoch": 25.868055555555557,
"grad_norm": 6.1875,
"learning_rate": 9.482278544072425e-05,
"loss": 2.8712,
"step": 14900
},
{
"epoch": 25.95486111111111,
"grad_norm": 5.34375,
"learning_rate": 9.427723420483717e-05,
"loss": 2.8674,
"step": 14950
},
{
"epoch": 26.0,
"eval_loss": 2.855642080307007,
"eval_runtime": 41.248,
"eval_samples_per_second": 90.404,
"eval_steps_per_second": 5.673,
"step": 14976
},
{
"epoch": 26.041666666666668,
"grad_norm": 6.46875,
"learning_rate": 9.373185380084113e-05,
"loss": 2.8681,
"step": 15000
},
{
"epoch": 26.12847222222222,
"grad_norm": 6.21875,
"learning_rate": 9.318666050903988e-05,
"loss": 2.8659,
"step": 15050
},
{
"epoch": 26.21527777777778,
"grad_norm": 5.65625,
"learning_rate": 9.264167060415178e-05,
"loss": 2.857,
"step": 15100
},
{
"epoch": 26.302083333333332,
"grad_norm": 4.4375,
"learning_rate": 9.209690035482372e-05,
"loss": 2.8821,
"step": 15150
},
{
"epoch": 26.38888888888889,
"grad_norm": 6.0,
"learning_rate": 9.155236602314552e-05,
"loss": 2.8707,
"step": 15200
},
{
"epoch": 26.475694444444443,
"grad_norm": 4.53125,
"learning_rate": 9.100808386416475e-05,
"loss": 2.8819,
"step": 15250
},
{
"epoch": 26.5625,
"grad_norm": 4.125,
"learning_rate": 9.046407012540115e-05,
"loss": 2.8716,
"step": 15300
},
{
"epoch": 26.649305555555557,
"grad_norm": 5.8125,
"learning_rate": 8.992034104636183e-05,
"loss": 2.8758,
"step": 15350
},
{
"epoch": 26.73611111111111,
"grad_norm": 4.84375,
"learning_rate": 8.937691285805634e-05,
"loss": 2.8716,
"step": 15400
},
{
"epoch": 26.822916666666668,
"grad_norm": 4.8125,
"learning_rate": 8.883380178251249e-05,
"loss": 2.8792,
"step": 15450
},
{
"epoch": 26.90972222222222,
"grad_norm": 4.59375,
"learning_rate": 8.829102403229163e-05,
"loss": 2.8585,
"step": 15500
},
{
"epoch": 26.99652777777778,
"grad_norm": 7.09375,
"learning_rate": 8.774859581000504e-05,
"loss": 2.8683,
"step": 15550
},
{
"epoch": 27.0,
"eval_loss": 2.8553037643432617,
"eval_runtime": 41.718,
"eval_samples_per_second": 89.386,
"eval_steps_per_second": 5.609,
"step": 15552
},
{
"epoch": 27.083333333333332,
"grad_norm": 4.78125,
"learning_rate": 8.720653330783013e-05,
"loss": 2.8705,
"step": 15600
},
{
"epoch": 27.17013888888889,
"grad_norm": 4.5,
"learning_rate": 8.666485270702704e-05,
"loss": 2.8559,
"step": 15650
},
{
"epoch": 27.256944444444443,
"grad_norm": 4.03125,
"learning_rate": 8.612357017745578e-05,
"loss": 2.87,
"step": 15700
},
{
"epoch": 27.34375,
"grad_norm": 5.65625,
"learning_rate": 8.558270187709328e-05,
"loss": 2.8804,
"step": 15750
},
{
"epoch": 27.430555555555557,
"grad_norm": 5.15625,
"learning_rate": 8.504226395155132e-05,
"loss": 2.8634,
"step": 15800
},
{
"epoch": 27.51736111111111,
"grad_norm": 5.40625,
"learning_rate": 8.450227253359439e-05,
"loss": 2.8878,
"step": 15850
},
{
"epoch": 27.604166666666668,
"grad_norm": 4.15625,
"learning_rate": 8.39627437426581e-05,
"loss": 2.8713,
"step": 15900
},
{
"epoch": 27.69097222222222,
"grad_norm": 4.84375,
"learning_rate": 8.34236936843682e-05,
"loss": 2.8689,
"step": 15950
},
{
"epoch": 27.77777777777778,
"grad_norm": 5.34375,
"learning_rate": 8.28851384500595e-05,
"loss": 2.8706,
"step": 16000
},
{
"epoch": 27.864583333333332,
"grad_norm": 4.3125,
"learning_rate": 8.234709411629572e-05,
"loss": 2.8689,
"step": 16050
},
{
"epoch": 27.95138888888889,
"grad_norm": 8.125,
"learning_rate": 8.180957674438966e-05,
"loss": 2.8677,
"step": 16100
},
{
"epoch": 28.0,
"eval_loss": 2.8553411960601807,
"eval_runtime": 41.9128,
"eval_samples_per_second": 88.97,
"eval_steps_per_second": 5.583,
"step": 16128
},
{
"epoch": 28.038194444444443,
"grad_norm": 4.34375,
"learning_rate": 8.12726023799235e-05,
"loss": 2.8652,
"step": 16150
},
{
"epoch": 28.125,
"grad_norm": 5.125,
"learning_rate": 8.073618705226998e-05,
"loss": 2.8667,
"step": 16200
},
{
"epoch": 28.211805555555557,
"grad_norm": 4.34375,
"learning_rate": 8.020034677411386e-05,
"loss": 2.8591,
"step": 16250
},
{
"epoch": 28.29861111111111,
"grad_norm": 4.9375,
"learning_rate": 7.966509754097404e-05,
"loss": 2.8778,
"step": 16300
},
{
"epoch": 28.385416666666668,
"grad_norm": 4.0625,
"learning_rate": 7.913045533072587e-05,
"loss": 2.8716,
"step": 16350
},
{
"epoch": 28.47222222222222,
"grad_norm": 5.125,
"learning_rate": 7.859643610312424e-05,
"loss": 2.8786,
"step": 16400
},
{
"epoch": 28.55902777777778,
"grad_norm": 5.375,
"learning_rate": 7.80630557993274e-05,
"loss": 2.8746,
"step": 16450
},
{
"epoch": 28.645833333333332,
"grad_norm": 4.75,
"learning_rate": 7.753033034142075e-05,
"loss": 2.871,
"step": 16500
},
{
"epoch": 28.73263888888889,
"grad_norm": 5.09375,
"learning_rate": 7.69982756319417e-05,
"loss": 2.8704,
"step": 16550
},
{
"epoch": 28.819444444444443,
"grad_norm": 5.03125,
"learning_rate": 7.646690755340504e-05,
"loss": 2.8813,
"step": 16600
},
{
"epoch": 28.90625,
"grad_norm": 4.53125,
"learning_rate": 7.59362419678287e-05,
"loss": 2.8563,
"step": 16650
},
{
"epoch": 28.993055555555557,
"grad_norm": 4.53125,
"learning_rate": 7.540629471626026e-05,
"loss": 2.868,
"step": 16700
},
{
"epoch": 29.0,
"eval_loss": 2.8549838066101074,
"eval_runtime": 40.2288,
"eval_samples_per_second": 92.695,
"eval_steps_per_second": 5.817,
"step": 16704
},
{
"epoch": 29.07986111111111,
"grad_norm": 4.90625,
"learning_rate": 7.48770816183042e-05,
"loss": 2.869,
"step": 16750
},
{
"epoch": 29.166666666666668,
"grad_norm": 4.1875,
"learning_rate": 7.434861847164955e-05,
"loss": 2.8525,
"step": 16800
},
{
"epoch": 29.25347222222222,
"grad_norm": 4.125,
"learning_rate": 7.382092105159825e-05,
"loss": 2.868,
"step": 16850
},
{
"epoch": 29.34027777777778,
"grad_norm": 6.125,
"learning_rate": 7.329400511059442e-05,
"loss": 2.8797,
"step": 16900
},
{
"epoch": 29.427083333333332,
"grad_norm": 4.71875,
"learning_rate": 7.276788637775393e-05,
"loss": 2.8629,
"step": 16950
},
{
"epoch": 29.51388888888889,
"grad_norm": 3.90625,
"learning_rate": 7.224258055839509e-05,
"loss": 2.8888,
"step": 17000
},
{
"epoch": 29.600694444444443,
"grad_norm": 4.8125,
"learning_rate": 7.171810333356961e-05,
"loss": 2.869,
"step": 17050
},
{
"epoch": 29.6875,
"grad_norm": 5.0625,
"learning_rate": 7.119447035959457e-05,
"loss": 2.8709,
"step": 17100
},
{
"epoch": 29.774305555555557,
"grad_norm": 3.828125,
"learning_rate": 7.067169726758522e-05,
"loss": 2.8669,
"step": 17150
},
{
"epoch": 29.86111111111111,
"grad_norm": 5.5625,
"learning_rate": 7.014979966298808e-05,
"loss": 2.8698,
"step": 17200
},
{
"epoch": 29.947916666666668,
"grad_norm": 3.546875,
"learning_rate": 6.962879312511531e-05,
"loss": 2.8669,
"step": 17250
},
{
"epoch": 30.0,
"eval_loss": 2.854860544204712,
"eval_runtime": 41.7924,
"eval_samples_per_second": 89.227,
"eval_steps_per_second": 5.599,
"step": 17280
},
{
"epoch": 30.03472222222222,
"grad_norm": 5.53125,
"learning_rate": 6.910869320667955e-05,
"loss": 2.8649,
"step": 17300
},
{
"epoch": 30.12152777777778,
"grad_norm": 6.15625,
"learning_rate": 6.858951543332978e-05,
"loss": 2.8648,
"step": 17350
},
{
"epoch": 30.208333333333332,
"grad_norm": 5.3125,
"learning_rate": 6.807127530318771e-05,
"loss": 2.8618,
"step": 17400
},
{
"epoch": 30.29513888888889,
"grad_norm": 4.625,
"learning_rate": 6.755398828638512e-05,
"loss": 2.8748,
"step": 17450
},
{
"epoch": 30.381944444444443,
"grad_norm": 4.6875,
"learning_rate": 6.703766982460231e-05,
"loss": 2.8702,
"step": 17500
},
{
"epoch": 30.46875,
"grad_norm": 4.5,
"learning_rate": 6.652233533060683e-05,
"loss": 2.8766,
"step": 17550
},
{
"epoch": 30.555555555555557,
"grad_norm": 4.65625,
"learning_rate": 6.600800018779356e-05,
"loss": 2.8766,
"step": 17600
},
{
"epoch": 30.64236111111111,
"grad_norm": 5.53125,
"learning_rate": 6.549467974972552e-05,
"loss": 2.8674,
"step": 17650
},
{
"epoch": 30.729166666666668,
"grad_norm": 5.28125,
"learning_rate": 6.498238933967544e-05,
"loss": 2.868,
"step": 17700
},
{
"epoch": 30.81597222222222,
"grad_norm": 4.09375,
"learning_rate": 6.44711442501684e-05,
"loss": 2.8798,
"step": 17750
},
{
"epoch": 30.90277777777778,
"grad_norm": 6.03125,
"learning_rate": 6.396095974252534e-05,
"loss": 2.8578,
"step": 17800
},
{
"epoch": 30.989583333333332,
"grad_norm": 5.59375,
"learning_rate": 6.345185104640747e-05,
"loss": 2.8672,
"step": 17850
},
{
"epoch": 31.0,
"eval_loss": 2.8543925285339355,
"eval_runtime": 41.2327,
"eval_samples_per_second": 90.438,
"eval_steps_per_second": 5.675,
"step": 17856
},
{
"epoch": 31.07638888888889,
"grad_norm": 4.625,
"learning_rate": 6.294383335936167e-05,
"loss": 2.87,
"step": 17900
},
{
"epoch": 31.163194444444443,
"grad_norm": 3.78125,
"learning_rate": 6.24369218463667e-05,
"loss": 2.8516,
"step": 17950
},
{
"epoch": 31.25,
"grad_norm": 5.4375,
"learning_rate": 6.193113163938075e-05,
"loss": 2.8673,
"step": 18000
},
{
"epoch": 31.336805555555557,
"grad_norm": 4.4375,
"learning_rate": 6.14264778368895e-05,
"loss": 2.8794,
"step": 18050
},
{
"epoch": 31.42361111111111,
"grad_norm": 5.125,
"learning_rate": 6.092297550345554e-05,
"loss": 2.8634,
"step": 18100
},
{
"epoch": 31.510416666666668,
"grad_norm": 5.46875,
"learning_rate": 6.0420639669268544e-05,
"loss": 2.8904,
"step": 18150
},
{
"epoch": 31.59722222222222,
"grad_norm": 4.21875,
"learning_rate": 5.991948532969685e-05,
"loss": 2.8651,
"step": 18200
},
{
"epoch": 31.68402777777778,
"grad_norm": 4.6875,
"learning_rate": 5.9419527444839515e-05,
"loss": 2.8727,
"step": 18250
},
{
"epoch": 31.770833333333332,
"grad_norm": 3.765625,
"learning_rate": 5.8920780939079955e-05,
"loss": 2.8645,
"step": 18300
},
{
"epoch": 31.85763888888889,
"grad_norm": 6.28125,
"learning_rate": 5.8423260700640417e-05,
"loss": 2.8713,
"step": 18350
},
{
"epoch": 31.944444444444443,
"grad_norm": 6.9375,
"learning_rate": 5.792698158113742e-05,
"loss": 2.8634,
"step": 18400
},
{
"epoch": 32.0,
"eval_loss": 2.8544044494628906,
"eval_runtime": 40.4905,
"eval_samples_per_second": 92.096,
"eval_steps_per_second": 5.779,
"step": 18432
},
{
"epoch": 32.03125,
"grad_norm": 3.84375,
"learning_rate": 5.743195839513852e-05,
"loss": 2.8657,
"step": 18450
},
{
"epoch": 32.11805555555556,
"grad_norm": 4.65625,
"learning_rate": 5.693820591971996e-05,
"loss": 2.8633,
"step": 18500
},
{
"epoch": 32.204861111111114,
"grad_norm": 5.25,
"learning_rate": 5.644573889402589e-05,
"loss": 2.8595,
"step": 18550
},
{
"epoch": 32.291666666666664,
"grad_norm": 4.8125,
"learning_rate": 5.5954572018827846e-05,
"loss": 2.8737,
"step": 18600
},
{
"epoch": 32.37847222222222,
"grad_norm": 5.46875,
"learning_rate": 5.5464719956086396e-05,
"loss": 2.8722,
"step": 18650
},
{
"epoch": 32.46527777777778,
"grad_norm": 5.15625,
"learning_rate": 5.49761973285132e-05,
"loss": 2.871,
"step": 18700
},
{
"epoch": 32.552083333333336,
"grad_norm": 4.1875,
"learning_rate": 5.4489018719134654e-05,
"loss": 2.8801,
"step": 18750
},
{
"epoch": 32.638888888888886,
"grad_norm": 4.875,
"learning_rate": 5.400319867085633e-05,
"loss": 2.8668,
"step": 18800
},
{
"epoch": 32.72569444444444,
"grad_norm": 4.8125,
"learning_rate": 5.3518751686029134e-05,
"loss": 2.8673,
"step": 18850
},
{
"epoch": 32.8125,
"grad_norm": 3.828125,
"learning_rate": 5.303569222601626e-05,
"loss": 2.875,
"step": 18900
},
{
"epoch": 32.89930555555556,
"grad_norm": 4.0625,
"learning_rate": 5.25540347107615e-05,
"loss": 2.8596,
"step": 18950
},
{
"epoch": 32.986111111111114,
"grad_norm": 4.25,
"learning_rate": 5.207379351835875e-05,
"loss": 2.8683,
"step": 19000
},
{
"epoch": 33.0,
"eval_loss": 2.854464054107666,
"eval_runtime": 40.2584,
"eval_samples_per_second": 92.627,
"eval_steps_per_second": 5.812,
"step": 19008
},
{
"epoch": 33.072916666666664,
"grad_norm": 4.5,
"learning_rate": 5.1594982984622906e-05,
"loss": 2.8657,
"step": 19050
},
{
"epoch": 33.15972222222222,
"grad_norm": 5.78125,
"learning_rate": 5.1117617402661865e-05,
"loss": 2.8538,
"step": 19100
},
{
"epoch": 33.24652777777778,
"grad_norm": 4.25,
"learning_rate": 5.064171102244985e-05,
"loss": 2.8671,
"step": 19150
},
{
"epoch": 33.333333333333336,
"grad_norm": 5.125,
"learning_rate": 5.0167278050402075e-05,
"loss": 2.879,
"step": 19200
},
{
"epoch": 33.420138888888886,
"grad_norm": 3.890625,
"learning_rate": 4.9694332648950536e-05,
"loss": 2.8637,
"step": 19250
},
{
"epoch": 33.50694444444444,
"grad_norm": 3.71875,
"learning_rate": 4.9222888936121494e-05,
"loss": 2.8891,
"step": 19300
},
{
"epoch": 33.59375,
"grad_norm": 3.578125,
"learning_rate": 4.875296098511365e-05,
"loss": 2.864,
"step": 19350
},
{
"epoch": 33.68055555555556,
"grad_norm": 4.9375,
"learning_rate": 4.828456282387859e-05,
"loss": 2.8731,
"step": 19400
},
{
"epoch": 33.767361111111114,
"grad_norm": 4.3125,
"learning_rate": 4.781770843470144e-05,
"loss": 2.8677,
"step": 19450
},
{
"epoch": 33.854166666666664,
"grad_norm": 5.15625,
"learning_rate": 4.735241175378386e-05,
"loss": 2.8649,
"step": 19500
},
{
"epoch": 33.94097222222222,
"grad_norm": 4.15625,
"learning_rate": 4.688868667082794e-05,
"loss": 2.8629,
"step": 19550
},
{
"epoch": 34.0,
"eval_loss": 2.8541414737701416,
"eval_runtime": 39.773,
"eval_samples_per_second": 93.757,
"eval_steps_per_second": 5.883,
"step": 19584
},
{
"epoch": 34.02777777777778,
"grad_norm": 4.09375,
"learning_rate": 4.642654702862157e-05,
"loss": 2.8661,
"step": 19600
},
{
"epoch": 34.114583333333336,
"grad_norm": 3.53125,
"learning_rate": 4.596600662262508e-05,
"loss": 2.8641,
"step": 19650
},
{
"epoch": 34.201388888888886,
"grad_norm": 4.46875,
"learning_rate": 4.55070792005597e-05,
"loss": 2.8574,
"step": 19700
},
{
"epoch": 34.28819444444444,
"grad_norm": 3.828125,
"learning_rate": 4.5049778461996926e-05,
"loss": 2.8735,
"step": 19750
},
{
"epoch": 34.375,
"grad_norm": 6.65625,
"learning_rate": 4.459411805794976e-05,
"loss": 2.8731,
"step": 19800
},
{
"epoch": 34.46180555555556,
"grad_norm": 4.25,
"learning_rate": 4.414011159046495e-05,
"loss": 2.8719,
"step": 19850
},
{
"epoch": 34.548611111111114,
"grad_norm": 3.90625,
"learning_rate": 4.368777261221737e-05,
"loss": 2.8769,
"step": 19900
},
{
"epoch": 34.635416666666664,
"grad_norm": 4.0625,
"learning_rate": 4.323711462610495e-05,
"loss": 2.8679,
"step": 19950
},
{
"epoch": 34.72222222222222,
"grad_norm": 3.953125,
"learning_rate": 4.278815108484602e-05,
"loss": 2.8681,
"step": 20000
},
{
"epoch": 34.80902777777778,
"grad_norm": 4.09375,
"learning_rate": 4.234089539057745e-05,
"loss": 2.8744,
"step": 20050
},
{
"epoch": 34.895833333333336,
"grad_norm": 4.03125,
"learning_rate": 4.1895360894454774e-05,
"loss": 2.8615,
"step": 20100
},
{
"epoch": 34.982638888888886,
"grad_norm": 4.21875,
"learning_rate": 4.1451560896253515e-05,
"loss": 2.8641,
"step": 20150
},
{
"epoch": 35.0,
"eval_loss": 2.8540618419647217,
"eval_runtime": 41.6189,
"eval_samples_per_second": 89.599,
"eval_steps_per_second": 5.622,
"step": 20160
},
{
"epoch": 35.06944444444444,
"grad_norm": 4.1875,
"learning_rate": 4.100950864397223e-05,
"loss": 2.8629,
"step": 20200
},
{
"epoch": 35.15625,
"grad_norm": 3.484375,
"learning_rate": 4.056921733343704e-05,
"loss": 2.8579,
"step": 20250
},
{
"epoch": 35.24305555555556,
"grad_norm": 5.28125,
"learning_rate": 4.013070010790759e-05,
"loss": 2.8641,
"step": 20300
},
{
"epoch": 35.329861111111114,
"grad_norm": 4.25,
"learning_rate": 3.9693970057684984e-05,
"loss": 2.8801,
"step": 20350
},
{
"epoch": 35.416666666666664,
"grad_norm": 4.84375,
"learning_rate": 3.9259040219720645e-05,
"loss": 2.8614,
"step": 20400
},
{
"epoch": 35.50347222222222,
"grad_norm": 5.21875,
"learning_rate": 3.882592357722761e-05,
"loss": 2.8902,
"step": 20450
},
{
"epoch": 35.59027777777778,
"grad_norm": 4.71875,
"learning_rate": 3.839463305929247e-05,
"loss": 2.8626,
"step": 20500
},
{
"epoch": 35.677083333333336,
"grad_norm": 4.3125,
"learning_rate": 3.7965181540489794e-05,
"loss": 2.8741,
"step": 20550
},
{
"epoch": 35.763888888888886,
"grad_norm": 3.46875,
"learning_rate": 3.753758184049764e-05,
"loss": 2.8656,
"step": 20600
},
{
"epoch": 35.85069444444444,
"grad_norm": 5.03125,
"learning_rate": 3.7111846723714916e-05,
"loss": 2.8661,
"step": 20650
},
{
"epoch": 35.9375,
"grad_norm": 3.25,
"learning_rate": 3.668798889888022e-05,
"loss": 2.8597,
"step": 20700
},
{
"epoch": 36.0,
"eval_loss": 2.853997230529785,
"eval_runtime": 41.7365,
"eval_samples_per_second": 89.346,
"eval_steps_per_second": 5.607,
"step": 20736
},
{
"epoch": 36.02430555555556,
"grad_norm": 4.5,
"learning_rate": 3.626602101869281e-05,
"loss": 2.8674,
"step": 20750
},
{
"epoch": 36.111111111111114,
"grad_norm": 5.15625,
"learning_rate": 3.5845955679434426e-05,
"loss": 2.8631,
"step": 20800
},
{
"epoch": 36.197916666666664,
"grad_norm": 3.6875,
"learning_rate": 3.542780542059373e-05,
"loss": 2.8576,
"step": 20850
},
{
"epoch": 36.28472222222222,
"grad_norm": 3.515625,
"learning_rate": 3.501158272449155e-05,
"loss": 2.8715,
"step": 20900
},
{
"epoch": 36.37152777777778,
"grad_norm": 3.4375,
"learning_rate": 3.45973000159088e-05,
"loss": 2.8754,
"step": 20950
},
{
"epoch": 36.458333333333336,
"grad_norm": 3.671875,
"learning_rate": 3.418496966171498e-05,
"loss": 2.8721,
"step": 21000
},
{
"epoch": 36.545138888888886,
"grad_norm": 3.84375,
"learning_rate": 3.377460397049951e-05,
"loss": 2.8741,
"step": 21050
},
{
"epoch": 36.63194444444444,
"grad_norm": 4.3125,
"learning_rate": 3.336621519220404e-05,
"loss": 2.8717,
"step": 21100
},
{
"epoch": 36.71875,
"grad_norm": 3.625,
"learning_rate": 3.295981551775679e-05,
"loss": 2.8655,
"step": 21150
},
{
"epoch": 36.80555555555556,
"grad_norm": 3.46875,
"learning_rate": 3.255541707870874e-05,
"loss": 2.8748,
"step": 21200
},
{
"epoch": 36.892361111111114,
"grad_norm": 3.640625,
"learning_rate": 3.2153031946871427e-05,
"loss": 2.8598,
"step": 21250
},
{
"epoch": 36.979166666666664,
"grad_norm": 3.875,
"learning_rate": 3.1752672133956596e-05,
"loss": 2.8632,
"step": 21300
},
{
"epoch": 37.0,
"eval_loss": 2.854156017303467,
"eval_runtime": 40.6449,
"eval_samples_per_second": 91.746,
"eval_steps_per_second": 5.757,
"step": 21312
},
{
"epoch": 37.06597222222222,
"grad_norm": 4.125,
"learning_rate": 3.135434959121756e-05,
"loss": 2.8613,
"step": 21350
},
{
"epoch": 37.15277777777778,
"grad_norm": 5.53125,
"learning_rate": 3.095807620909257e-05,
"loss": 2.859,
"step": 21400
},
{
"epoch": 37.239583333333336,
"grad_norm": 4.15625,
"learning_rate": 3.0563863816849795e-05,
"loss": 2.8618,
"step": 21450
},
{
"epoch": 37.326388888888886,
"grad_norm": 4.0625,
"learning_rate": 3.017172418223424e-05,
"loss": 2.8817,
"step": 21500
},
{
"epoch": 37.41319444444444,
"grad_norm": 3.953125,
"learning_rate": 2.9781669011116364e-05,
"loss": 2.8609,
"step": 21550
},
{
"epoch": 37.5,
"grad_norm": 3.96875,
"learning_rate": 2.939370994714278e-05,
"loss": 2.8872,
"step": 21600
},
{
"epoch": 37.58680555555556,
"grad_norm": 4.09375,
"learning_rate": 2.90078585713886e-05,
"loss": 2.864,
"step": 21650
},
{
"epoch": 37.673611111111114,
"grad_norm": 3.421875,
"learning_rate": 2.8624126402011798e-05,
"loss": 2.8757,
"step": 21700
},
{
"epoch": 37.760416666666664,
"grad_norm": 3.40625,
"learning_rate": 2.8242524893909162e-05,
"loss": 2.8623,
"step": 21750
},
{
"epoch": 37.84722222222222,
"grad_norm": 3.28125,
"learning_rate": 2.7863065438374748e-05,
"loss": 2.8695,
"step": 21800
},
{
"epoch": 37.93402777777778,
"grad_norm": 4.4375,
"learning_rate": 2.7485759362759378e-05,
"loss": 2.8596,
"step": 21850
},
{
"epoch": 38.0,
"eval_loss": 2.8540520668029785,
"eval_runtime": 41.5194,
"eval_samples_per_second": 89.813,
"eval_steps_per_second": 5.636,
"step": 21888
},
{
"epoch": 38.020833333333336,
"grad_norm": 3.25,
"learning_rate": 2.7110617930132877e-05,
"loss": 2.8658,
"step": 21900
},
{
"epoch": 38.107638888888886,
"grad_norm": 3.1875,
"learning_rate": 2.673765233894755e-05,
"loss": 2.8632,
"step": 21950
},
{
"epoch": 38.19444444444444,
"grad_norm": 3.765625,
"learning_rate": 2.6366873722704265e-05,
"loss": 2.8583,
"step": 22000
},
{
"epoch": 38.28125,
"grad_norm": 3.640625,
"learning_rate": 2.599829314961967e-05,
"loss": 2.8678,
"step": 22050
},
{
"epoch": 38.36805555555556,
"grad_norm": 3.609375,
"learning_rate": 2.5631921622296128e-05,
"loss": 2.8777,
"step": 22100
},
{
"epoch": 38.454861111111114,
"grad_norm": 3.15625,
"learning_rate": 2.526777007739316e-05,
"loss": 2.8671,
"step": 22150
},
{
"epoch": 38.541666666666664,
"grad_norm": 3.53125,
"learning_rate": 2.4905849385300883e-05,
"loss": 2.8782,
"step": 22200
},
{
"epoch": 38.62847222222222,
"grad_norm": 3.46875,
"learning_rate": 2.4546170349815666e-05,
"loss": 2.8699,
"step": 22250
},
{
"epoch": 38.71527777777778,
"grad_norm": 3.8125,
"learning_rate": 2.418874370781754e-05,
"loss": 2.8658,
"step": 22300
},
{
"epoch": 38.802083333333336,
"grad_norm": 3.9375,
"learning_rate": 2.3833580128949762e-05,
"loss": 2.8749,
"step": 22350
},
{
"epoch": 38.888888888888886,
"grad_norm": 3.75,
"learning_rate": 2.3480690215300105e-05,
"loss": 2.8573,
"step": 22400
},
{
"epoch": 38.97569444444444,
"grad_norm": 4.75,
"learning_rate": 2.313008450108468e-05,
"loss": 2.8656,
"step": 22450
},
{
"epoch": 39.0,
"eval_loss": 2.854092597961426,
"eval_runtime": 41.0282,
"eval_samples_per_second": 90.889,
"eval_steps_per_second": 5.703,
"step": 22464
},
{
"epoch": 39.0625,
"grad_norm": 3.40625,
"learning_rate": 2.278177345233323e-05,
"loss": 2.8622,
"step": 22500
},
{
"epoch": 39.14930555555556,
"grad_norm": 3.625,
"learning_rate": 2.2435767466576863e-05,
"loss": 2.8578,
"step": 22550
},
{
"epoch": 39.236111111111114,
"grad_norm": 4.15625,
"learning_rate": 2.209207687253746e-05,
"loss": 2.8602,
"step": 22600
},
{
"epoch": 39.322916666666664,
"grad_norm": 2.921875,
"learning_rate": 2.1750711929819723e-05,
"loss": 2.8825,
"step": 22650
},
{
"epoch": 39.40972222222222,
"grad_norm": 3.421875,
"learning_rate": 2.1411682828604452e-05,
"loss": 2.8618,
"step": 22700
},
{
"epoch": 39.49652777777778,
"grad_norm": 3.6875,
"learning_rate": 2.1074999689344755e-05,
"loss": 2.8834,
"step": 22750
},
{
"epoch": 39.583333333333336,
"grad_norm": 4.46875,
"learning_rate": 2.0740672562463602e-05,
"loss": 2.8664,
"step": 22800
},
{
"epoch": 39.670138888888886,
"grad_norm": 3.5625,
"learning_rate": 2.0408711428054195e-05,
"loss": 2.8771,
"step": 22850
},
{
"epoch": 39.75694444444444,
"grad_norm": 3.390625,
"learning_rate": 2.0079126195581612e-05,
"loss": 2.8629,
"step": 22900
},
{
"epoch": 39.84375,
"grad_norm": 4.28125,
"learning_rate": 1.9751926703587353e-05,
"loss": 2.867,
"step": 22950
},
{
"epoch": 39.93055555555556,
"grad_norm": 3.390625,
"learning_rate": 1.9427122719395452e-05,
"loss": 2.8591,
"step": 23000
},
{
"epoch": 40.0,
"eval_loss": 2.85404634475708,
"eval_runtime": 42.4648,
"eval_samples_per_second": 87.814,
"eval_steps_per_second": 5.51,
"step": 23040
},
{
"epoch": 40.017361111111114,
"grad_norm": 4.71875,
"learning_rate": 1.9104723938821012e-05,
"loss": 2.8661,
"step": 23050
},
{
"epoch": 40.104166666666664,
"grad_norm": 3.375,
"learning_rate": 1.8784739985880628e-05,
"loss": 2.8613,
"step": 23100
},
{
"epoch": 40.19097222222222,
"grad_norm": 3.078125,
"learning_rate": 1.8467180412505313e-05,
"loss": 2.8565,
"step": 23150
},
{
"epoch": 40.27777777777778,
"grad_norm": 3.171875,
"learning_rate": 1.8152054698255194e-05,
"loss": 2.8671,
"step": 23200
},
{
"epoch": 40.364583333333336,
"grad_norm": 3.609375,
"learning_rate": 1.7839372250036534e-05,
"loss": 2.8812,
"step": 23250
},
{
"epoch": 40.451388888888886,
"grad_norm": 3.9375,
"learning_rate": 1.7529142401821062e-05,
"loss": 2.8657,
"step": 23300
},
{
"epoch": 40.53819444444444,
"grad_norm": 3.875,
"learning_rate": 1.722137441436721e-05,
"loss": 2.8782,
"step": 23350
},
{
"epoch": 40.625,
"grad_norm": 3.984375,
"learning_rate": 1.6916077474943736e-05,
"loss": 2.8685,
"step": 23400
},
{
"epoch": 40.71180555555556,
"grad_norm": 2.90625,
"learning_rate": 1.66132606970554e-05,
"loss": 2.8671,
"step": 23450
},
{
"epoch": 40.798611111111114,
"grad_norm": 3.328125,
"learning_rate": 1.631293312017099e-05,
"loss": 2.8723,
"step": 23500
},
{
"epoch": 40.885416666666664,
"grad_norm": 3.171875,
"learning_rate": 1.6015103709453482e-05,
"loss": 2.8591,
"step": 23550
},
{
"epoch": 40.97222222222222,
"grad_norm": 3.375,
"learning_rate": 1.571978135549238e-05,
"loss": 2.8635,
"step": 23600
},
{
"epoch": 41.0,
"eval_loss": 2.8541696071624756,
"eval_runtime": 40.7796,
"eval_samples_per_second": 91.443,
"eval_steps_per_second": 5.738,
"step": 23616
},
{
"epoch": 41.05902777777778,
"grad_norm": 3.5625,
"learning_rate": 1.5426974874038247e-05,
"loss": 2.8627,
"step": 23650
},
{
"epoch": 41.145833333333336,
"grad_norm": 2.96875,
"learning_rate": 1.51366930057398e-05,
"loss": 2.8606,
"step": 23700
},
{
"epoch": 41.232638888888886,
"grad_norm": 3.25,
"learning_rate": 1.4848944415882648e-05,
"loss": 2.8608,
"step": 23750
},
{
"epoch": 41.31944444444444,
"grad_norm": 2.9375,
"learning_rate": 1.4563737694130885e-05,
"loss": 2.8802,
"step": 23800
},
{
"epoch": 41.40625,
"grad_norm": 3.21875,
"learning_rate": 1.4281081354270564e-05,
"loss": 2.8615,
"step": 23850
},
{
"epoch": 41.49305555555556,
"grad_norm": 3.59375,
"learning_rate": 1.4000983833955594e-05,
"loss": 2.8829,
"step": 23900
},
{
"epoch": 41.579861111111114,
"grad_norm": 2.828125,
"learning_rate": 1.3723453494455784e-05,
"loss": 2.8665,
"step": 23950
},
{
"epoch": 41.666666666666664,
"grad_norm": 3.015625,
"learning_rate": 1.3448498620407345e-05,
"loss": 2.8761,
"step": 24000
},
{
"epoch": 41.75347222222222,
"grad_norm": 4.03125,
"learning_rate": 1.3176127419565564e-05,
"loss": 2.8624,
"step": 24050
},
{
"epoch": 41.84027777777778,
"grad_norm": 3.09375,
"learning_rate": 1.2906348022559755e-05,
"loss": 2.8687,
"step": 24100
},
{
"epoch": 41.927083333333336,
"grad_norm": 3.078125,
"learning_rate": 1.2639168482650532e-05,
"loss": 2.8575,
"step": 24150
},
{
"epoch": 42.0,
"eval_loss": 2.854001045227051,
"eval_runtime": 40.3709,
"eval_samples_per_second": 92.368,
"eval_steps_per_second": 5.796,
"step": 24192
},
{
"epoch": 42.013888888888886,
"grad_norm": 3.96875,
"learning_rate": 1.2374596775489477e-05,
"loss": 2.8656,
"step": 24200
},
{
"epoch": 42.10069444444444,
"grad_norm": 3.4375,
"learning_rate": 1.2112640798881058e-05,
"loss": 2.8625,
"step": 24250
},
{
"epoch": 42.1875,
"grad_norm": 3.53125,
"learning_rate": 1.1853308372546756e-05,
"loss": 2.8571,
"step": 24300
},
{
"epoch": 42.27430555555556,
"grad_norm": 3.21875,
"learning_rate": 1.1596607237891766e-05,
"loss": 2.8664,
"step": 24350
},
{
"epoch": 42.361111111111114,
"grad_norm": 3.125,
"learning_rate": 1.1342545057773846e-05,
"loss": 2.881,
"step": 24400
},
{
"epoch": 42.447916666666664,
"grad_norm": 3.453125,
"learning_rate": 1.1091129416274603e-05,
"loss": 2.8614,
"step": 24450
},
{
"epoch": 42.53472222222222,
"grad_norm": 3.28125,
"learning_rate": 1.0842367818472988e-05,
"loss": 2.8773,
"step": 24500
},
{
"epoch": 42.62152777777778,
"grad_norm": 2.640625,
"learning_rate": 1.0596267690221496e-05,
"loss": 2.874,
"step": 24550
},
{
"epoch": 42.708333333333336,
"grad_norm": 4.0625,
"learning_rate": 1.0352836377924202e-05,
"loss": 2.8666,
"step": 24600
},
{
"epoch": 42.795138888888886,
"grad_norm": 3.203125,
"learning_rate": 1.0112081148317687e-05,
"loss": 2.8681,
"step": 24650
},
{
"epoch": 42.88194444444444,
"grad_norm": 3.15625,
"learning_rate": 9.874009188253974e-06,
"loss": 2.8575,
"step": 24700
},
{
"epoch": 42.96875,
"grad_norm": 3.046875,
"learning_rate": 9.63862760448616e-06,
"loss": 2.8666,
"step": 24750
},
{
"epoch": 43.0,
"eval_loss": 2.8540420532226562,
"eval_runtime": 42.3273,
"eval_samples_per_second": 88.099,
"eval_steps_per_second": 5.528,
"step": 24768
},
{
"epoch": 43.05555555555556,
"grad_norm": 3.265625,
"learning_rate": 9.405943423456043e-06,
"loss": 2.8636,
"step": 24800
},
{
"epoch": 43.142361111111114,
"grad_norm": 3.25,
"learning_rate": 9.175963591084546e-06,
"loss": 2.858,
"step": 24850
},
{
"epoch": 43.229166666666664,
"grad_norm": 4.59375,
"learning_rate": 8.948694972564343e-06,
"loss": 2.8629,
"step": 24900
},
{
"epoch": 43.31597222222222,
"grad_norm": 3.078125,
"learning_rate": 8.724144352154861e-06,
"loss": 2.8783,
"step": 24950
},
{
"epoch": 43.40277777777778,
"grad_norm": 3.515625,
"learning_rate": 8.502318432979806e-06,
"loss": 2.8623,
"step": 25000
},
{
"epoch": 43.489583333333336,
"grad_norm": 2.65625,
"learning_rate": 8.28322383682707e-06,
"loss": 2.8827,
"step": 25050
},
{
"epoch": 43.576388888888886,
"grad_norm": 3.203125,
"learning_rate": 8.066867103951082e-06,
"loss": 2.8631,
"step": 25100
},
{
"epoch": 43.66319444444444,
"grad_norm": 2.890625,
"learning_rate": 7.853254692877476e-06,
"loss": 2.8769,
"step": 25150
},
{
"epoch": 43.75,
"grad_norm": 3.078125,
"learning_rate": 7.642392980210423e-06,
"loss": 2.8654,
"step": 25200
},
{
"epoch": 43.83680555555556,
"grad_norm": 3.21875,
"learning_rate": 7.4342882604422125e-06,
"loss": 2.87,
"step": 25250
},
{
"epoch": 43.923611111111114,
"grad_norm": 3.34375,
"learning_rate": 7.228946745765364e-06,
"loss": 2.8584,
"step": 25300
},
{
"epoch": 44.0,
"eval_loss": 2.8539493083953857,
"eval_runtime": 42.0373,
"eval_samples_per_second": 88.707,
"eval_steps_per_second": 5.566,
"step": 25344
},
{
"epoch": 44.010416666666664,
"grad_norm": 2.90625,
"learning_rate": 7.026374565887117e-06,
"loss": 2.8638,
"step": 25350
},
{
"epoch": 44.09722222222222,
"grad_norm": 2.46875,
"learning_rate": 6.826577767846665e-06,
"loss": 2.8638,
"step": 25400
},
{
"epoch": 44.18402777777778,
"grad_norm": 3.078125,
"learning_rate": 6.629562315834348e-06,
"loss": 2.8536,
"step": 25450
},
{
"epoch": 44.270833333333336,
"grad_norm": 3.09375,
"learning_rate": 6.435334091013856e-06,
"loss": 2.8646,
"step": 25500
}
],
"logging_steps": 50,
"max_steps": 28800,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.177294293290189e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}