{
"best_metric": 34.45762252807617,
"best_model_checkpoint": "/kaggle/working/output/checkpoint-43065",
"epoch": 33.0,
"eval_steps": 500,
"global_step": 43065,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.07662835249042145,
"grad_norm": 9.545656204223633,
"learning_rate": 4.9952586206896554e-05,
"loss": 58.0015,
"step": 100
},
{
"epoch": 0.1532567049808429,
"grad_norm": 3.9482674598693848,
"learning_rate": 4.990469348659004e-05,
"loss": 38.502,
"step": 200
},
{
"epoch": 0.22988505747126436,
"grad_norm": 2.5423216819763184,
"learning_rate": 4.985680076628353e-05,
"loss": 35.7891,
"step": 300
},
{
"epoch": 0.3065134099616858,
"grad_norm": 3.6723568439483643,
"learning_rate": 4.9808908045977015e-05,
"loss": 34.9999,
"step": 400
},
{
"epoch": 0.3831417624521073,
"grad_norm": 2.0953221321105957,
"learning_rate": 4.97610153256705e-05,
"loss": 35.9283,
"step": 500
},
{
"epoch": 0.45977011494252873,
"grad_norm": 4.932604789733887,
"learning_rate": 4.971312260536399e-05,
"loss": 34.5531,
"step": 600
},
{
"epoch": 0.5363984674329502,
"grad_norm": 5.419522762298584,
"learning_rate": 4.9665229885057475e-05,
"loss": 34.7408,
"step": 700
},
{
"epoch": 0.6130268199233716,
"grad_norm": 3.9690020084381104,
"learning_rate": 4.961733716475096e-05,
"loss": 34.5521,
"step": 800
},
{
"epoch": 0.6896551724137931,
"grad_norm": 3.3197548389434814,
"learning_rate": 4.956944444444445e-05,
"loss": 33.4281,
"step": 900
},
{
"epoch": 0.7662835249042146,
"grad_norm": 4.233493328094482,
"learning_rate": 4.952155172413793e-05,
"loss": 34.3137,
"step": 1000
},
{
"epoch": 0.842911877394636,
"grad_norm": 5.390758037567139,
"learning_rate": 4.9473659003831416e-05,
"loss": 33.9454,
"step": 1100
},
{
"epoch": 0.9195402298850575,
"grad_norm": 3.419612407684326,
"learning_rate": 4.94257662835249e-05,
"loss": 34.2298,
"step": 1200
},
{
"epoch": 0.9961685823754789,
"grad_norm": 2.3791182041168213,
"learning_rate": 4.937787356321839e-05,
"loss": 33.5481,
"step": 1300
},
{
"epoch": 1.0,
"eval_loss": 35.558197021484375,
"eval_runtime": 49.3359,
"eval_samples_per_second": 26.451,
"eval_steps_per_second": 3.324,
"step": 1305
},
{
"epoch": 1.0727969348659003,
"grad_norm": 3.0501019954681396,
"learning_rate": 4.932998084291188e-05,
"loss": 34.3557,
"step": 1400
},
{
"epoch": 1.1494252873563218,
"grad_norm": 3.027714252471924,
"learning_rate": 4.928208812260537e-05,
"loss": 34.2442,
"step": 1500
},
{
"epoch": 1.2260536398467432,
"grad_norm": 3.693758249282837,
"learning_rate": 4.923419540229886e-05,
"loss": 33.5375,
"step": 1600
},
{
"epoch": 1.3026819923371646,
"grad_norm": 3.7679357528686523,
"learning_rate": 4.9186302681992344e-05,
"loss": 33.7891,
"step": 1700
},
{
"epoch": 1.3793103448275863,
"grad_norm": 3.2367331981658936,
"learning_rate": 4.9138409961685824e-05,
"loss": 33.4964,
"step": 1800
},
{
"epoch": 1.4559386973180077,
"grad_norm": 3.6876628398895264,
"learning_rate": 4.909051724137931e-05,
"loss": 34.7739,
"step": 1900
},
{
"epoch": 1.5325670498084292,
"grad_norm": 1.9550260305404663,
"learning_rate": 4.90426245210728e-05,
"loss": 34.2552,
"step": 2000
},
{
"epoch": 1.6091954022988506,
"grad_norm": 4.955118656158447,
"learning_rate": 4.8994731800766285e-05,
"loss": 33.9766,
"step": 2100
},
{
"epoch": 1.685823754789272,
"grad_norm": 6.145394802093506,
"learning_rate": 4.894683908045977e-05,
"loss": 34.1676,
"step": 2200
},
{
"epoch": 1.7624521072796935,
"grad_norm": 6.15125846862793,
"learning_rate": 4.889894636015326e-05,
"loss": 34.3084,
"step": 2300
},
{
"epoch": 1.839080459770115,
"grad_norm": 2.647857427597046,
"learning_rate": 4.8851053639846746e-05,
"loss": 34.6449,
"step": 2400
},
{
"epoch": 1.9157088122605364,
"grad_norm": 4.066762447357178,
"learning_rate": 4.880316091954023e-05,
"loss": 34.1318,
"step": 2500
},
{
"epoch": 1.9923371647509578,
"grad_norm": 5.785406589508057,
"learning_rate": 4.875526819923372e-05,
"loss": 34.1303,
"step": 2600
},
{
"epoch": 2.0,
"eval_loss": 35.211631774902344,
"eval_runtime": 49.3338,
"eval_samples_per_second": 26.452,
"eval_steps_per_second": 3.324,
"step": 2610
},
{
"epoch": 2.0689655172413794,
"grad_norm": 6.074384689331055,
"learning_rate": 4.8707375478927206e-05,
"loss": 33.6587,
"step": 2700
},
{
"epoch": 2.1455938697318007,
"grad_norm": 3.770009994506836,
"learning_rate": 4.865948275862069e-05,
"loss": 34.5023,
"step": 2800
},
{
"epoch": 2.2222222222222223,
"grad_norm": 4.6336140632629395,
"learning_rate": 4.861159003831418e-05,
"loss": 34.1806,
"step": 2900
},
{
"epoch": 2.2988505747126435,
"grad_norm": 5.440792083740234,
"learning_rate": 4.856369731800767e-05,
"loss": 34.6645,
"step": 3000
},
{
"epoch": 2.375478927203065,
"grad_norm": 2.98138165473938,
"learning_rate": 4.8515804597701154e-05,
"loss": 34.1371,
"step": 3100
},
{
"epoch": 2.4521072796934864,
"grad_norm": 2.4175803661346436,
"learning_rate": 4.846791187739464e-05,
"loss": 33.8015,
"step": 3200
},
{
"epoch": 2.528735632183908,
"grad_norm": 3.846370220184326,
"learning_rate": 4.842001915708813e-05,
"loss": 34.0589,
"step": 3300
},
{
"epoch": 2.6053639846743293,
"grad_norm": 4.001793384552002,
"learning_rate": 4.8372126436781614e-05,
"loss": 33.7327,
"step": 3400
},
{
"epoch": 2.681992337164751,
"grad_norm": 3.7779624462127686,
"learning_rate": 4.83242337164751e-05,
"loss": 34.3508,
"step": 3500
},
{
"epoch": 2.7586206896551726,
"grad_norm": 3.5112695693969727,
"learning_rate": 4.827634099616858e-05,
"loss": 33.5653,
"step": 3600
},
{
"epoch": 2.835249042145594,
"grad_norm": 2.3443048000335693,
"learning_rate": 4.822844827586207e-05,
"loss": 33.798,
"step": 3700
},
{
"epoch": 2.9118773946360155,
"grad_norm": 2.5035479068756104,
"learning_rate": 4.8180555555555555e-05,
"loss": 33.4353,
"step": 3800
},
{
"epoch": 2.9885057471264367,
"grad_norm": 3.4322028160095215,
"learning_rate": 4.813266283524904e-05,
"loss": 33.948,
"step": 3900
},
{
"epoch": 3.0,
"eval_loss": 35.00273132324219,
"eval_runtime": 49.3242,
"eval_samples_per_second": 26.458,
"eval_steps_per_second": 3.325,
"step": 3915
},
{
"epoch": 3.0651340996168583,
"grad_norm": 2.8833682537078857,
"learning_rate": 4.808477011494253e-05,
"loss": 34.523,
"step": 4000
},
{
"epoch": 3.1417624521072796,
"grad_norm": 2.8744261264801025,
"learning_rate": 4.803735632183908e-05,
"loss": 33.921,
"step": 4100
},
{
"epoch": 3.218390804597701,
"grad_norm": 2.928616762161255,
"learning_rate": 4.798946360153257e-05,
"loss": 33.6903,
"step": 4200
},
{
"epoch": 3.2950191570881224,
"grad_norm": 3.0579280853271484,
"learning_rate": 4.7941570881226054e-05,
"loss": 33.0608,
"step": 4300
},
{
"epoch": 3.371647509578544,
"grad_norm": 1.6688510179519653,
"learning_rate": 4.789367816091954e-05,
"loss": 33.8769,
"step": 4400
},
{
"epoch": 3.4482758620689653,
"grad_norm": 2.6190459728240967,
"learning_rate": 4.784578544061303e-05,
"loss": 33.2974,
"step": 4500
},
{
"epoch": 3.524904214559387,
"grad_norm": 2.6260671615600586,
"learning_rate": 4.7797892720306515e-05,
"loss": 34.0589,
"step": 4600
},
{
"epoch": 3.6015325670498086,
"grad_norm": 3.191978693008423,
"learning_rate": 4.775e-05,
"loss": 33.9493,
"step": 4700
},
{
"epoch": 3.67816091954023,
"grad_norm": 2.759941339492798,
"learning_rate": 4.770210727969349e-05,
"loss": 33.5936,
"step": 4800
},
{
"epoch": 3.7547892720306515,
"grad_norm": 2.262294054031372,
"learning_rate": 4.7654214559386976e-05,
"loss": 34.06,
"step": 4900
},
{
"epoch": 3.8314176245210727,
"grad_norm": 4.6808600425720215,
"learning_rate": 4.760632183908046e-05,
"loss": 34.1592,
"step": 5000
},
{
"epoch": 3.9080459770114944,
"grad_norm": 4.294464111328125,
"learning_rate": 4.755842911877395e-05,
"loss": 34.4652,
"step": 5100
},
{
"epoch": 3.9846743295019156,
"grad_norm": 2.7845072746276855,
"learning_rate": 4.7510536398467436e-05,
"loss": 34.2075,
"step": 5200
},
{
"epoch": 4.0,
"eval_loss": 34.954986572265625,
"eval_runtime": 49.2865,
"eval_samples_per_second": 26.478,
"eval_steps_per_second": 3.327,
"step": 5220
},
{
"epoch": 4.061302681992337,
"grad_norm": 4.420943260192871,
"learning_rate": 4.746264367816092e-05,
"loss": 34.5735,
"step": 5300
},
{
"epoch": 4.137931034482759,
"grad_norm": 2.898287534713745,
"learning_rate": 4.741475095785441e-05,
"loss": 34.0739,
"step": 5400
},
{
"epoch": 4.21455938697318,
"grad_norm": 4.703996658325195,
"learning_rate": 4.73668582375479e-05,
"loss": 33.7022,
"step": 5500
},
{
"epoch": 4.291187739463601,
"grad_norm": 2.2913658618927,
"learning_rate": 4.7318965517241384e-05,
"loss": 33.6581,
"step": 5600
},
{
"epoch": 4.3678160919540225,
"grad_norm": 3.895615339279175,
"learning_rate": 4.727107279693487e-05,
"loss": 34.0314,
"step": 5700
},
{
"epoch": 4.444444444444445,
"grad_norm": 4.635524749755859,
"learning_rate": 4.722318007662835e-05,
"loss": 34.5266,
"step": 5800
},
{
"epoch": 4.521072796934866,
"grad_norm": 3.451066017150879,
"learning_rate": 4.717528735632184e-05,
"loss": 33.1786,
"step": 5900
},
{
"epoch": 4.597701149425287,
"grad_norm": 2.552107810974121,
"learning_rate": 4.7127394636015325e-05,
"loss": 33.6118,
"step": 6000
},
{
"epoch": 4.674329501915709,
"grad_norm": 2.359786033630371,
"learning_rate": 4.707998084291188e-05,
"loss": 33.9903,
"step": 6100
},
{
"epoch": 4.75095785440613,
"grad_norm": 2.2611875534057617,
"learning_rate": 4.703208812260537e-05,
"loss": 34.0762,
"step": 6200
},
{
"epoch": 4.827586206896552,
"grad_norm": 1.8199210166931152,
"learning_rate": 4.698419540229885e-05,
"loss": 33.6635,
"step": 6300
},
{
"epoch": 4.904214559386973,
"grad_norm": 2.7332305908203125,
"learning_rate": 4.693630268199234e-05,
"loss": 33.0946,
"step": 6400
},
{
"epoch": 4.980842911877395,
"grad_norm": 2.9454078674316406,
"learning_rate": 4.6888409961685824e-05,
"loss": 33.9173,
"step": 6500
},
{
"epoch": 5.0,
"eval_loss": 34.924800872802734,
"eval_runtime": 49.3002,
"eval_samples_per_second": 26.47,
"eval_steps_per_second": 3.327,
"step": 6525
},
{
"epoch": 5.057471264367816,
"grad_norm": 2.3083884716033936,
"learning_rate": 4.684051724137931e-05,
"loss": 33.8987,
"step": 6600
},
{
"epoch": 5.134099616858237,
"grad_norm": 2.228327751159668,
"learning_rate": 4.67926245210728e-05,
"loss": 33.8189,
"step": 6700
},
{
"epoch": 5.210727969348659,
"grad_norm": 3.6814918518066406,
"learning_rate": 4.6744731800766284e-05,
"loss": 33.8364,
"step": 6800
},
{
"epoch": 5.287356321839081,
"grad_norm": 2.5758285522460938,
"learning_rate": 4.669683908045977e-05,
"loss": 33.7093,
"step": 6900
},
{
"epoch": 5.363984674329502,
"grad_norm": 4.175839900970459,
"learning_rate": 4.6648946360153265e-05,
"loss": 33.6689,
"step": 7000
},
{
"epoch": 5.440613026819923,
"grad_norm": 2.213092088699341,
"learning_rate": 4.6601053639846745e-05,
"loss": 33.7936,
"step": 7100
},
{
"epoch": 5.517241379310345,
"grad_norm": 2.4982571601867676,
"learning_rate": 4.655316091954023e-05,
"loss": 33.3686,
"step": 7200
},
{
"epoch": 5.593869731800766,
"grad_norm": 3.635983943939209,
"learning_rate": 4.6505747126436784e-05,
"loss": 33.5493,
"step": 7300
},
{
"epoch": 5.670498084291188,
"grad_norm": 4.315894603729248,
"learning_rate": 4.645785440613027e-05,
"loss": 33.6607,
"step": 7400
},
{
"epoch": 5.747126436781609,
"grad_norm": 2.6151223182678223,
"learning_rate": 4.640996168582376e-05,
"loss": 34.7535,
"step": 7500
},
{
"epoch": 5.823754789272031,
"grad_norm": 4.03953218460083,
"learning_rate": 4.6362068965517244e-05,
"loss": 33.9865,
"step": 7600
},
{
"epoch": 5.900383141762452,
"grad_norm": 2.512362480163574,
"learning_rate": 4.6314176245210724e-05,
"loss": 33.0343,
"step": 7700
},
{
"epoch": 5.977011494252873,
"grad_norm": 4.745575428009033,
"learning_rate": 4.626628352490422e-05,
"loss": 33.4544,
"step": 7800
},
{
"epoch": 6.0,
"eval_loss": 34.841033935546875,
"eval_runtime": 49.3059,
"eval_samples_per_second": 26.467,
"eval_steps_per_second": 3.326,
"step": 7830
},
{
"epoch": 6.053639846743295,
"grad_norm": 2.996056079864502,
"learning_rate": 4.6218390804597705e-05,
"loss": 33.631,
"step": 7900
},
{
"epoch": 6.130268199233717,
"grad_norm": 3.3260300159454346,
"learning_rate": 4.617049808429119e-05,
"loss": 33.9222,
"step": 8000
},
{
"epoch": 6.206896551724138,
"grad_norm": 2.214486598968506,
"learning_rate": 4.612260536398468e-05,
"loss": 32.9576,
"step": 8100
},
{
"epoch": 6.283524904214559,
"grad_norm": 3.6611664295196533,
"learning_rate": 4.6074712643678166e-05,
"loss": 33.5231,
"step": 8200
},
{
"epoch": 6.360153256704981,
"grad_norm": 2.582730770111084,
"learning_rate": 4.602681992337165e-05,
"loss": 33.6936,
"step": 8300
},
{
"epoch": 6.436781609195402,
"grad_norm": 2.739861488342285,
"learning_rate": 4.597892720306514e-05,
"loss": 33.3997,
"step": 8400
},
{
"epoch": 6.513409961685824,
"grad_norm": 2.2102463245391846,
"learning_rate": 4.593103448275862e-05,
"loss": 33.9374,
"step": 8500
},
{
"epoch": 6.590038314176245,
"grad_norm": 3.83150577545166,
"learning_rate": 4.5883141762452106e-05,
"loss": 33.9961,
"step": 8600
},
{
"epoch": 6.666666666666667,
"grad_norm": 3.981616735458374,
"learning_rate": 4.583524904214559e-05,
"loss": 33.5413,
"step": 8700
},
{
"epoch": 6.743295019157088,
"grad_norm": 2.3303332328796387,
"learning_rate": 4.578735632183908e-05,
"loss": 34.0529,
"step": 8800
},
{
"epoch": 6.819923371647509,
"grad_norm": 3.9573702812194824,
"learning_rate": 4.573946360153257e-05,
"loss": 33.2897,
"step": 8900
},
{
"epoch": 6.896551724137931,
"grad_norm": 2.6185879707336426,
"learning_rate": 4.5691570881226054e-05,
"loss": 34.0662,
"step": 9000
},
{
"epoch": 6.973180076628353,
"grad_norm": 3.1155271530151367,
"learning_rate": 4.564367816091955e-05,
"loss": 33.517,
"step": 9100
},
{
"epoch": 7.0,
"eval_loss": 34.818748474121094,
"eval_runtime": 49.3029,
"eval_samples_per_second": 26.469,
"eval_steps_per_second": 3.326,
"step": 9135
},
{
"epoch": 7.049808429118774,
"grad_norm": 3.117553472518921,
"learning_rate": 4.5595785440613034e-05,
"loss": 34.1218,
"step": 9200
},
{
"epoch": 7.126436781609195,
"grad_norm": 2.5572612285614014,
"learning_rate": 4.5547892720306515e-05,
"loss": 33.662,
"step": 9300
},
{
"epoch": 7.203065134099617,
"grad_norm": 3.5347042083740234,
"learning_rate": 4.55e-05,
"loss": 34.4668,
"step": 9400
},
{
"epoch": 7.2796934865900385,
"grad_norm": 1.9216647148132324,
"learning_rate": 4.545210727969349e-05,
"loss": 33.4468,
"step": 9500
},
{
"epoch": 7.35632183908046,
"grad_norm": 4.242152214050293,
"learning_rate": 4.5404214559386975e-05,
"loss": 33.5805,
"step": 9600
},
{
"epoch": 7.432950191570881,
"grad_norm": 2.9310567378997803,
"learning_rate": 4.535632183908046e-05,
"loss": 34.0603,
"step": 9700
},
{
"epoch": 7.509578544061303,
"grad_norm": 2.6573023796081543,
"learning_rate": 4.530842911877395e-05,
"loss": 33.8766,
"step": 9800
},
{
"epoch": 7.586206896551724,
"grad_norm": 2.7849409580230713,
"learning_rate": 4.5260536398467436e-05,
"loss": 33.6309,
"step": 9900
},
{
"epoch": 7.662835249042145,
"grad_norm": 2.7377357482910156,
"learning_rate": 4.521264367816092e-05,
"loss": 33.3621,
"step": 10000
},
{
"epoch": 7.739463601532567,
"grad_norm": 2.106233835220337,
"learning_rate": 4.516475095785441e-05,
"loss": 33.4172,
"step": 10100
},
{
"epoch": 7.816091954022989,
"grad_norm": 2.1989126205444336,
"learning_rate": 4.5116858237547896e-05,
"loss": 33.5937,
"step": 10200
},
{
"epoch": 7.89272030651341,
"grad_norm": 2.903721570968628,
"learning_rate": 4.5068965517241377e-05,
"loss": 33.7935,
"step": 10300
},
{
"epoch": 7.969348659003831,
"grad_norm": 2.061602830886841,
"learning_rate": 4.5021072796934863e-05,
"loss": 33.3289,
"step": 10400
},
{
"epoch": 8.0,
"eval_loss": 34.95075607299805,
"eval_runtime": 49.3237,
"eval_samples_per_second": 26.458,
"eval_steps_per_second": 3.325,
"step": 10440
},
{
"epoch": 8.045977011494253,
"grad_norm": 1.8656938076019287,
"learning_rate": 4.497318007662836e-05,
"loss": 33.8404,
"step": 10500
},
{
"epoch": 8.122605363984674,
"grad_norm": 2.783926486968994,
"learning_rate": 4.4925287356321844e-05,
"loss": 33.9544,
"step": 10600
},
{
"epoch": 8.199233716475096,
"grad_norm": 2.175081968307495,
"learning_rate": 4.487739463601533e-05,
"loss": 33.6405,
"step": 10700
},
{
"epoch": 8.275862068965518,
"grad_norm": 4.121524333953857,
"learning_rate": 4.482950191570882e-05,
"loss": 33.568,
"step": 10800
},
{
"epoch": 8.352490421455938,
"grad_norm": 3.978410482406616,
"learning_rate": 4.4781609195402305e-05,
"loss": 33.6659,
"step": 10900
},
{
"epoch": 8.42911877394636,
"grad_norm": 3.0454840660095215,
"learning_rate": 4.473419540229885e-05,
"loss": 33.2689,
"step": 11000
},
{
"epoch": 8.505747126436782,
"grad_norm": 3.169114828109741,
"learning_rate": 4.4686302681992336e-05,
"loss": 33.6227,
"step": 11100
},
{
"epoch": 8.582375478927203,
"grad_norm": 2.5880959033966064,
"learning_rate": 4.463840996168582e-05,
"loss": 33.3022,
"step": 11200
},
{
"epoch": 8.659003831417625,
"grad_norm": 2.1367762088775635,
"learning_rate": 4.459051724137932e-05,
"loss": 33.2851,
"step": 11300
},
{
"epoch": 8.735632183908045,
"grad_norm": 3.0278782844543457,
"learning_rate": 4.4542624521072804e-05,
"loss": 33.922,
"step": 11400
},
{
"epoch": 8.812260536398467,
"grad_norm": 2.6361653804779053,
"learning_rate": 4.4494731800766284e-05,
"loss": 33.1482,
"step": 11500
},
{
"epoch": 8.88888888888889,
"grad_norm": 2.7836809158325195,
"learning_rate": 4.444683908045977e-05,
"loss": 34.1345,
"step": 11600
},
{
"epoch": 8.96551724137931,
"grad_norm": 2.519681453704834,
"learning_rate": 4.439894636015326e-05,
"loss": 34.0642,
"step": 11700
},
{
"epoch": 9.0,
"eval_loss": 34.75983428955078,
"eval_runtime": 49.3463,
"eval_samples_per_second": 26.446,
"eval_steps_per_second": 3.323,
"step": 11745
},
{
"epoch": 9.042145593869732,
"grad_norm": 6.431031703948975,
"learning_rate": 4.4351053639846745e-05,
"loss": 33.6431,
"step": 11800
},
{
"epoch": 9.118773946360154,
"grad_norm": 3.262486457824707,
"learning_rate": 4.430316091954023e-05,
"loss": 32.9398,
"step": 11900
},
{
"epoch": 9.195402298850574,
"grad_norm": 1.945741057395935,
"learning_rate": 4.425526819923372e-05,
"loss": 32.7256,
"step": 12000
},
{
"epoch": 9.272030651340996,
"grad_norm": 5.09276008605957,
"learning_rate": 4.4207375478927205e-05,
"loss": 33.9015,
"step": 12100
},
{
"epoch": 9.348659003831418,
"grad_norm": 3.785059928894043,
"learning_rate": 4.415948275862069e-05,
"loss": 33.6765,
"step": 12200
},
{
"epoch": 9.425287356321839,
"grad_norm": 2.4255340099334717,
"learning_rate": 4.411159003831418e-05,
"loss": 33.1262,
"step": 12300
},
{
"epoch": 9.50191570881226,
"grad_norm": 5.869349479675293,
"learning_rate": 4.4063697318007666e-05,
"loss": 33.2205,
"step": 12400
},
{
"epoch": 9.578544061302683,
"grad_norm": 2.361865997314453,
"learning_rate": 4.4015804597701146e-05,
"loss": 34.0441,
"step": 12500
},
{
"epoch": 9.655172413793103,
"grad_norm": 2.6989896297454834,
"learning_rate": 4.396791187739464e-05,
"loss": 33.6812,
"step": 12600
},
{
"epoch": 9.731800766283525,
"grad_norm": 2.6094741821289062,
"learning_rate": 4.3920019157088127e-05,
"loss": 33.9178,
"step": 12700
},
{
"epoch": 9.808429118773946,
"grad_norm": 2.4616310596466064,
"learning_rate": 4.3872126436781613e-05,
"loss": 34.5233,
"step": 12800
},
{
"epoch": 9.885057471264368,
"grad_norm": 2.7729408740997314,
"learning_rate": 4.38242337164751e-05,
"loss": 33.378,
"step": 12900
},
{
"epoch": 9.96168582375479,
"grad_norm": 2.5230519771575928,
"learning_rate": 4.377634099616859e-05,
"loss": 33.442,
"step": 13000
},
{
"epoch": 10.0,
"eval_loss": 34.700294494628906,
"eval_runtime": 49.2926,
"eval_samples_per_second": 26.475,
"eval_steps_per_second": 3.327,
"step": 13050
},
{
"epoch": 10.03831417624521,
"grad_norm": 2.5322816371917725,
"learning_rate": 4.3728448275862074e-05,
"loss": 33.8873,
"step": 13100
},
{
"epoch": 10.114942528735632,
"grad_norm": 2.1063241958618164,
"learning_rate": 4.368103448275862e-05,
"loss": 33.871,
"step": 13200
},
{
"epoch": 10.191570881226054,
"grad_norm": 3.7001326084136963,
"learning_rate": 4.3633141762452106e-05,
"loss": 34.5129,
"step": 13300
},
{
"epoch": 10.268199233716475,
"grad_norm": 1.8534705638885498,
"learning_rate": 4.35852490421456e-05,
"loss": 33.7739,
"step": 13400
},
{
"epoch": 10.344827586206897,
"grad_norm": 1.9871069192886353,
"learning_rate": 4.3537356321839086e-05,
"loss": 33.4124,
"step": 13500
},
{
"epoch": 10.421455938697317,
"grad_norm": 2.264529228210449,
"learning_rate": 4.348946360153257e-05,
"loss": 33.24,
"step": 13600
},
{
"epoch": 10.49808429118774,
"grad_norm": 3.0297787189483643,
"learning_rate": 4.344157088122606e-05,
"loss": 33.2922,
"step": 13700
},
{
"epoch": 10.574712643678161,
"grad_norm": 2.7185864448547363,
"learning_rate": 4.339367816091954e-05,
"loss": 33.4859,
"step": 13800
},
{
"epoch": 10.651340996168582,
"grad_norm": 3.8887524604797363,
"learning_rate": 4.334578544061303e-05,
"loss": 33.4322,
"step": 13900
},
{
"epoch": 10.727969348659004,
"grad_norm": 2.5119857788085938,
"learning_rate": 4.3297892720306514e-05,
"loss": 33.6234,
"step": 14000
},
{
"epoch": 10.804597701149426,
"grad_norm": 3.2969565391540527,
"learning_rate": 4.325e-05,
"loss": 33.4341,
"step": 14100
},
{
"epoch": 10.881226053639846,
"grad_norm": 3.3629229068756104,
"learning_rate": 4.320210727969349e-05,
"loss": 32.7636,
"step": 14200
},
{
"epoch": 10.957854406130268,
"grad_norm": 3.0765013694763184,
"learning_rate": 4.3154214559386975e-05,
"loss": 33.7066,
"step": 14300
},
{
"epoch": 11.0,
"eval_loss": 34.70278549194336,
"eval_runtime": 49.2928,
"eval_samples_per_second": 26.474,
"eval_steps_per_second": 3.327,
"step": 14355
},
{
"epoch": 11.03448275862069,
"grad_norm": 2.7724273204803467,
"learning_rate": 4.310632183908046e-05,
"loss": 33.7759,
"step": 14400
},
{
"epoch": 11.11111111111111,
"grad_norm": 3.9663071632385254,
"learning_rate": 4.305842911877395e-05,
"loss": 33.6063,
"step": 14500
},
{
"epoch": 11.187739463601533,
"grad_norm": 2.53495717048645,
"learning_rate": 4.3010536398467435e-05,
"loss": 32.9251,
"step": 14600
},
{
"epoch": 11.264367816091955,
"grad_norm": 3.928633689880371,
"learning_rate": 4.296264367816092e-05,
"loss": 33.41,
"step": 14700
},
{
"epoch": 11.340996168582375,
"grad_norm": 1.888804316520691,
"learning_rate": 4.291475095785441e-05,
"loss": 33.147,
"step": 14800
},
{
"epoch": 11.417624521072797,
"grad_norm": 3.151488780975342,
"learning_rate": 4.2866858237547896e-05,
"loss": 34.011,
"step": 14900
},
{
"epoch": 11.494252873563218,
"grad_norm": 2.659867286682129,
"learning_rate": 4.281896551724138e-05,
"loss": 33.3559,
"step": 15000
},
{
"epoch": 11.57088122605364,
"grad_norm": 4.092405319213867,
"learning_rate": 4.277107279693487e-05,
"loss": 33.2301,
"step": 15100
},
{
"epoch": 11.647509578544062,
"grad_norm": 4.295740127563477,
"learning_rate": 4.2723659003831415e-05,
"loss": 33.1047,
"step": 15200
},
{
"epoch": 11.724137931034482,
"grad_norm": 2.4472806453704834,
"learning_rate": 4.26757662835249e-05,
"loss": 33.8206,
"step": 15300
},
{
"epoch": 11.800766283524904,
"grad_norm": 2.716550350189209,
"learning_rate": 4.262787356321839e-05,
"loss": 33.7173,
"step": 15400
},
{
"epoch": 11.877394636015326,
"grad_norm": 3.1278491020202637,
"learning_rate": 4.257998084291188e-05,
"loss": 34.0344,
"step": 15500
},
{
"epoch": 11.954022988505747,
"grad_norm": 2.4835212230682373,
"learning_rate": 4.253208812260537e-05,
"loss": 33.8397,
"step": 15600
},
{
"epoch": 12.0,
"eval_loss": 34.70100402832031,
"eval_runtime": 49.2554,
"eval_samples_per_second": 26.495,
"eval_steps_per_second": 3.33,
"step": 15660
},
{
"epoch": 12.030651340996169,
"grad_norm": 2.331453800201416,
"learning_rate": 4.2484195402298856e-05,
"loss": 32.9794,
"step": 15700
},
{
"epoch": 12.10727969348659,
"grad_norm": 2.2127463817596436,
"learning_rate": 4.243630268199234e-05,
"loss": 33.6367,
"step": 15800
},
{
"epoch": 12.183908045977011,
"grad_norm": 3.1127703189849854,
"learning_rate": 4.238840996168583e-05,
"loss": 32.7221,
"step": 15900
},
{
"epoch": 12.260536398467433,
"grad_norm": 2.5665576457977295,
"learning_rate": 4.234051724137931e-05,
"loss": 33.7796,
"step": 16000
},
{
"epoch": 12.337164750957854,
"grad_norm": 2.995265245437622,
"learning_rate": 4.22926245210728e-05,
"loss": 32.8062,
"step": 16100
},
{
"epoch": 12.413793103448276,
"grad_norm": 3.4698216915130615,
"learning_rate": 4.2244731800766284e-05,
"loss": 33.5182,
"step": 16200
},
{
"epoch": 12.490421455938698,
"grad_norm": 4.030599117279053,
"learning_rate": 4.219683908045977e-05,
"loss": 33.7621,
"step": 16300
},
{
"epoch": 12.567049808429118,
"grad_norm": 2.277189254760742,
"learning_rate": 4.214894636015326e-05,
"loss": 33.7926,
"step": 16400
},
{
"epoch": 12.64367816091954,
"grad_norm": 2.3156633377075195,
"learning_rate": 4.2101053639846744e-05,
"loss": 33.869,
"step": 16500
},
{
"epoch": 12.720306513409962,
"grad_norm": 3.5089361667633057,
"learning_rate": 4.205316091954023e-05,
"loss": 33.6732,
"step": 16600
},
{
"epoch": 12.796934865900383,
"grad_norm": 2.5379600524902344,
"learning_rate": 4.200526819923372e-05,
"loss": 33.5854,
"step": 16700
},
{
"epoch": 12.873563218390805,
"grad_norm": 2.5784411430358887,
"learning_rate": 4.1957375478927205e-05,
"loss": 33.2835,
"step": 16800
},
{
"epoch": 12.950191570881227,
"grad_norm": 2.574859380722046,
"learning_rate": 4.190948275862069e-05,
"loss": 33.8945,
"step": 16900
},
{
"epoch": 13.0,
"eval_loss": 34.72227478027344,
"eval_runtime": 49.2549,
"eval_samples_per_second": 26.495,
"eval_steps_per_second": 3.33,
"step": 16965
},
{
"epoch": 13.026819923371647,
"grad_norm": 3.8546385765075684,
"learning_rate": 4.186159003831418e-05,
"loss": 33.1455,
"step": 17000
},
{
"epoch": 13.10344827586207,
"grad_norm": 3.751404047012329,
"learning_rate": 4.1813697318007665e-05,
"loss": 33.7843,
"step": 17100
},
{
"epoch": 13.18007662835249,
"grad_norm": 3.0844898223876953,
"learning_rate": 4.176580459770115e-05,
"loss": 32.8163,
"step": 17200
},
{
"epoch": 13.256704980842912,
"grad_norm": 1.7570416927337646,
"learning_rate": 4.1718390804597704e-05,
"loss": 33.3296,
"step": 17300
},
{
"epoch": 13.333333333333334,
"grad_norm": 2.5809695720672607,
"learning_rate": 4.1670498084291184e-05,
"loss": 34.1621,
"step": 17400
},
{
"epoch": 13.409961685823754,
"grad_norm": 2.564545154571533,
"learning_rate": 4.162260536398467e-05,
"loss": 33.4641,
"step": 17500
},
{
"epoch": 13.486590038314176,
"grad_norm": 3.2340521812438965,
"learning_rate": 4.1574712643678165e-05,
"loss": 33.5958,
"step": 17600
},
{
"epoch": 13.563218390804598,
"grad_norm": 4.329983711242676,
"learning_rate": 4.152681992337165e-05,
"loss": 33.53,
"step": 17700
},
{
"epoch": 13.639846743295019,
"grad_norm": 2.3342621326446533,
"learning_rate": 4.147892720306514e-05,
"loss": 33.7702,
"step": 17800
},
{
"epoch": 13.71647509578544,
"grad_norm": 2.6764466762542725,
"learning_rate": 4.1431034482758625e-05,
"loss": 33.6024,
"step": 17900
},
{
"epoch": 13.793103448275861,
"grad_norm": 5.089807033538818,
"learning_rate": 4.138314176245211e-05,
"loss": 32.9291,
"step": 18000
},
{
"epoch": 13.869731800766283,
"grad_norm": 2.4803364276885986,
"learning_rate": 4.13352490421456e-05,
"loss": 33.2098,
"step": 18100
},
{
"epoch": 13.946360153256705,
"grad_norm": 3.0112080574035645,
"learning_rate": 4.128735632183908e-05,
"loss": 33.7988,
"step": 18200
},
{
"epoch": 14.0,
"eval_loss": 34.82696533203125,
"eval_runtime": 49.261,
"eval_samples_per_second": 26.492,
"eval_steps_per_second": 3.329,
"step": 18270
},
{
"epoch": 14.022988505747126,
"grad_norm": 3.0625782012939453,
"learning_rate": 4.1239463601532566e-05,
"loss": 33.4482,
"step": 18300
},
{
"epoch": 14.099616858237548,
"grad_norm": 2.5372705459594727,
"learning_rate": 4.119157088122605e-05,
"loss": 33.284,
"step": 18400
},
{
"epoch": 14.17624521072797,
"grad_norm": 2.9518911838531494,
"learning_rate": 4.114367816091954e-05,
"loss": 33.4866,
"step": 18500
},
{
"epoch": 14.25287356321839,
"grad_norm": 2.1386337280273438,
"learning_rate": 4.109578544061303e-05,
"loss": 33.653,
"step": 18600
},
{
"epoch": 14.329501915708812,
"grad_norm": 2.1180756092071533,
"learning_rate": 4.1047892720306514e-05,
"loss": 34.3663,
"step": 18700
},
{
"epoch": 14.406130268199234,
"grad_norm": 3.0451836585998535,
"learning_rate": 4.1e-05,
"loss": 32.7698,
"step": 18800
},
{
"epoch": 14.482758620689655,
"grad_norm": 3.8517203330993652,
"learning_rate": 4.0952107279693494e-05,
"loss": 33.3581,
"step": 18900
},
{
"epoch": 14.559386973180077,
"grad_norm": 2.322065830230713,
"learning_rate": 4.0904214559386974e-05,
"loss": 33.3386,
"step": 19000
},
{
"epoch": 14.636015325670499,
"grad_norm": 2.604886054992676,
"learning_rate": 4.085632183908046e-05,
"loss": 33.8964,
"step": 19100
},
{
"epoch": 14.71264367816092,
"grad_norm": 3.6753382682800293,
"learning_rate": 4.080842911877395e-05,
"loss": 32.9918,
"step": 19200
},
{
"epoch": 14.789272030651341,
"grad_norm": 3.1375985145568848,
"learning_rate": 4.07610153256705e-05,
"loss": 33.5981,
"step": 19300
},
{
"epoch": 14.865900383141762,
"grad_norm": 3.9305307865142822,
"learning_rate": 4.071312260536399e-05,
"loss": 34.0074,
"step": 19400
},
{
"epoch": 14.942528735632184,
"grad_norm": 3.2952847480773926,
"learning_rate": 4.0665229885057473e-05,
"loss": 33.0554,
"step": 19500
},
{
"epoch": 15.0,
"eval_loss": 34.7192268371582,
"eval_runtime": 49.2319,
"eval_samples_per_second": 26.507,
"eval_steps_per_second": 3.331,
"step": 19575
},
{
"epoch": 15.019157088122606,
"grad_norm": 3.291614294052124,
"learning_rate": 4.061733716475096e-05,
"loss": 32.9437,
"step": 19600
},
{
"epoch": 15.095785440613026,
"grad_norm": 4.4670867919921875,
"learning_rate": 4.056944444444445e-05,
"loss": 33.6879,
"step": 19700
},
{
"epoch": 15.172413793103448,
"grad_norm": 3.4122018814086914,
"learning_rate": 4.0521551724137934e-05,
"loss": 33.0167,
"step": 19800
},
{
"epoch": 15.24904214559387,
"grad_norm": 3.854083299636841,
"learning_rate": 4.047365900383142e-05,
"loss": 33.8342,
"step": 19900
},
{
"epoch": 15.32567049808429,
"grad_norm": 2.945396900177002,
"learning_rate": 4.042576628352491e-05,
"loss": 32.3812,
"step": 20000
},
{
"epoch": 15.402298850574713,
"grad_norm": 2.5246341228485107,
"learning_rate": 4.0377873563218395e-05,
"loss": 33.3573,
"step": 20100
},
{
"epoch": 15.478927203065133,
"grad_norm": 2.837134599685669,
"learning_rate": 4.032998084291188e-05,
"loss": 33.5981,
"step": 20200
},
{
"epoch": 15.555555555555555,
"grad_norm": 4.350450038909912,
"learning_rate": 4.028208812260537e-05,
"loss": 34.0699,
"step": 20300
},
{
"epoch": 15.632183908045977,
"grad_norm": 2.4908435344696045,
"learning_rate": 4.0234195402298855e-05,
"loss": 33.8105,
"step": 20400
},
{
"epoch": 15.708812260536398,
"grad_norm": 2.9461615085601807,
"learning_rate": 4.0186302681992336e-05,
"loss": 33.3251,
"step": 20500
},
{
"epoch": 15.78544061302682,
"grad_norm": 2.8716940879821777,
"learning_rate": 4.013840996168582e-05,
"loss": 33.7594,
"step": 20600
},
{
"epoch": 15.862068965517242,
"grad_norm": 2.7166991233825684,
"learning_rate": 4.009051724137931e-05,
"loss": 33.58,
"step": 20700
},
{
"epoch": 15.938697318007662,
"grad_norm": 2.2878618240356445,
"learning_rate": 4.0042624521072796e-05,
"loss": 33.4573,
"step": 20800
},
{
"epoch": 16.0,
"eval_loss": 34.54485321044922,
"eval_runtime": 49.3188,
"eval_samples_per_second": 26.46,
"eval_steps_per_second": 3.325,
"step": 20880
},
{
"epoch": 16.015325670498083,
"grad_norm": 2.970867156982422,
"learning_rate": 3.999473180076628e-05,
"loss": 33.5118,
"step": 20900
},
{
"epoch": 16.091954022988507,
"grad_norm": 2.395005464553833,
"learning_rate": 3.994683908045978e-05,
"loss": 34.1932,
"step": 21000
},
{
"epoch": 16.168582375478927,
"grad_norm": 2.8175065517425537,
"learning_rate": 3.9898946360153264e-05,
"loss": 32.9815,
"step": 21100
},
{
"epoch": 16.245210727969347,
"grad_norm": 4.665389537811279,
"learning_rate": 3.985105363984675e-05,
"loss": 33.8616,
"step": 21200
},
{
"epoch": 16.32183908045977,
"grad_norm": 3.425340175628662,
"learning_rate": 3.980316091954023e-05,
"loss": 33.2022,
"step": 21300
},
{
"epoch": 16.39846743295019,
"grad_norm": 5.212127685546875,
"learning_rate": 3.975574712643678e-05,
"loss": 33.3935,
"step": 21400
},
{
"epoch": 16.47509578544061,
"grad_norm": 1.9034606218338013,
"learning_rate": 3.970785440613027e-05,
"loss": 32.739,
"step": 21500
},
{
"epoch": 16.551724137931036,
"grad_norm": 2.024109125137329,
"learning_rate": 3.9659961685823756e-05,
"loss": 33.4628,
"step": 21600
},
{
"epoch": 16.628352490421456,
"grad_norm": 2.8185606002807617,
"learning_rate": 3.961206896551724e-05,
"loss": 33.7672,
"step": 21700
},
{
"epoch": 16.704980842911876,
"grad_norm": 3.2981534004211426,
"learning_rate": 3.956417624521073e-05,
"loss": 33.1976,
"step": 21800
},
{
"epoch": 16.7816091954023,
"grad_norm": 4.531330585479736,
"learning_rate": 3.951628352490422e-05,
"loss": 33.2379,
"step": 21900
},
{
"epoch": 16.85823754789272,
"grad_norm": 2.4455623626708984,
"learning_rate": 3.9468390804597704e-05,
"loss": 33.2898,
"step": 22000
},
{
"epoch": 16.93486590038314,
"grad_norm": 4.1596245765686035,
"learning_rate": 3.942049808429119e-05,
"loss": 33.2167,
"step": 22100
},
{
"epoch": 17.0,
"eval_loss": 34.65380096435547,
"eval_runtime": 49.3114,
"eval_samples_per_second": 26.464,
"eval_steps_per_second": 3.326,
"step": 22185
},
{
"epoch": 17.011494252873565,
"grad_norm": 4.7622528076171875,
"learning_rate": 3.937260536398468e-05,
"loss": 34.3268,
"step": 22200
},
{
"epoch": 17.088122605363985,
"grad_norm": 2.9908533096313477,
"learning_rate": 3.9324712643678164e-05,
"loss": 33.4477,
"step": 22300
},
{
"epoch": 17.164750957854405,
"grad_norm": 2.2341110706329346,
"learning_rate": 3.927681992337165e-05,
"loss": 33.6793,
"step": 22400
},
{
"epoch": 17.24137931034483,
"grad_norm": 2.3946852684020996,
"learning_rate": 3.922892720306514e-05,
"loss": 33.2578,
"step": 22500
},
{
"epoch": 17.31800766283525,
"grad_norm": 3.3899614810943604,
"learning_rate": 3.9181034482758625e-05,
"loss": 33.2486,
"step": 22600
},
{
"epoch": 17.39463601532567,
"grad_norm": 5.150006294250488,
"learning_rate": 3.9133141762452105e-05,
"loss": 33.0265,
"step": 22700
},
{
"epoch": 17.47126436781609,
"grad_norm": 2.8135523796081543,
"learning_rate": 3.908524904214559e-05,
"loss": 33.4384,
"step": 22800
},
{
"epoch": 17.547892720306514,
"grad_norm": 2.5454325675964355,
"learning_rate": 3.903735632183908e-05,
"loss": 33.4139,
"step": 22900
},
{
"epoch": 17.624521072796934,
"grad_norm": 4.680717945098877,
"learning_rate": 3.8989463601532566e-05,
"loss": 34.0209,
"step": 23000
},
{
"epoch": 17.701149425287355,
"grad_norm": 4.242103099822998,
"learning_rate": 3.894157088122606e-05,
"loss": 33.1372,
"step": 23100
},
{
"epoch": 17.77777777777778,
"grad_norm": 2.639352798461914,
"learning_rate": 3.8893678160919546e-05,
"loss": 33.3558,
"step": 23200
},
{
"epoch": 17.8544061302682,
"grad_norm": 1.9746617078781128,
"learning_rate": 3.884578544061303e-05,
"loss": 33.7639,
"step": 23300
},
{
"epoch": 17.93103448275862,
"grad_norm": 4.005228519439697,
"learning_rate": 3.879837164750958e-05,
"loss": 33.0241,
"step": 23400
},
{
"epoch": 18.0,
"eval_loss": 34.649261474609375,
"eval_runtime": 49.2606,
"eval_samples_per_second": 26.492,
"eval_steps_per_second": 3.329,
"step": 23490
},
{
"epoch": 18.007662835249043,
"grad_norm": 2.500631809234619,
"learning_rate": 3.8750478927203065e-05,
"loss": 33.3219,
"step": 23500
},
{
"epoch": 18.084291187739463,
"grad_norm": 3.90655255317688,
"learning_rate": 3.870258620689655e-05,
"loss": 33.4211,
"step": 23600
},
{
"epoch": 18.160919540229884,
"grad_norm": 2.702497720718384,
"learning_rate": 3.865469348659004e-05,
"loss": 33.2414,
"step": 23700
},
{
"epoch": 18.237547892720308,
"grad_norm": 1.9609768390655518,
"learning_rate": 3.8606800766283525e-05,
"loss": 34.0671,
"step": 23800
},
{
"epoch": 18.314176245210728,
"grad_norm": 2.072951316833496,
"learning_rate": 3.855890804597702e-05,
"loss": 33.6311,
"step": 23900
},
{
"epoch": 18.39080459770115,
"grad_norm": 3.249264717102051,
"learning_rate": 3.85110153256705e-05,
"loss": 32.9968,
"step": 24000
},
{
"epoch": 18.467432950191572,
"grad_norm": 4.439345359802246,
"learning_rate": 3.8463122605363986e-05,
"loss": 33.1314,
"step": 24100
},
{
"epoch": 18.544061302681992,
"grad_norm": 3.9109508991241455,
"learning_rate": 3.841522988505747e-05,
"loss": 33.3908,
"step": 24200
},
{
"epoch": 18.620689655172413,
"grad_norm": 2.539151668548584,
"learning_rate": 3.836733716475096e-05,
"loss": 33.5031,
"step": 24300
},
{
"epoch": 18.697318007662837,
"grad_norm": 2.6246118545532227,
"learning_rate": 3.831944444444445e-05,
"loss": 33.6923,
"step": 24400
},
{
"epoch": 18.773946360153257,
"grad_norm": 3.5379223823547363,
"learning_rate": 3.8271551724137934e-05,
"loss": 32.9198,
"step": 24500
},
{
"epoch": 18.850574712643677,
"grad_norm": 3.673536539077759,
"learning_rate": 3.822365900383142e-05,
"loss": 33.5072,
"step": 24600
},
{
"epoch": 18.9272030651341,
"grad_norm": 3.9377758502960205,
"learning_rate": 3.817576628352491e-05,
"loss": 32.8486,
"step": 24700
},
{
"epoch": 19.0,
"eval_loss": 34.617279052734375,
"eval_runtime": 49.3115,
"eval_samples_per_second": 26.464,
"eval_steps_per_second": 3.326,
"step": 24795
},
{
"epoch": 19.00383141762452,
"grad_norm": 3.04927659034729,
"learning_rate": 3.8127873563218394e-05,
"loss": 33.7055,
"step": 24800
},
{
"epoch": 19.080459770114942,
"grad_norm": 2.725443124771118,
"learning_rate": 3.8079980842911874e-05,
"loss": 33.5355,
"step": 24900
},
{
"epoch": 19.157088122605366,
"grad_norm": 3.853895425796509,
"learning_rate": 3.803208812260536e-05,
"loss": 33.5267,
"step": 25000
},
{
"epoch": 19.233716475095786,
"grad_norm": 2.666419267654419,
"learning_rate": 3.798419540229885e-05,
"loss": 33.4069,
"step": 25100
},
{
"epoch": 19.310344827586206,
"grad_norm": 3.5618317127227783,
"learning_rate": 3.793630268199234e-05,
"loss": 33.7295,
"step": 25200
},
{
"epoch": 19.386973180076627,
"grad_norm": 3.351062297821045,
"learning_rate": 3.788840996168583e-05,
"loss": 33.1994,
"step": 25300
},
{
"epoch": 19.46360153256705,
"grad_norm": 3.3226547241210938,
"learning_rate": 3.7840996168582374e-05,
"loss": 33.3149,
"step": 25400
},
{
"epoch": 19.54022988505747,
"grad_norm": 4.15867805480957,
"learning_rate": 3.779310344827586e-05,
"loss": 33.5592,
"step": 25500
},
{
"epoch": 19.61685823754789,
"grad_norm": 2.333674430847168,
"learning_rate": 3.774521072796935e-05,
"loss": 33.7336,
"step": 25600
},
{
"epoch": 19.693486590038315,
"grad_norm": 2.9516782760620117,
"learning_rate": 3.7697318007662834e-05,
"loss": 33.3228,
"step": 25700
},
{
"epoch": 19.770114942528735,
"grad_norm": 1.734508991241455,
"learning_rate": 3.764942528735632e-05,
"loss": 33.3216,
"step": 25800
},
{
"epoch": 19.846743295019156,
"grad_norm": 2.4886648654937744,
"learning_rate": 3.760153256704981e-05,
"loss": 33.5157,
"step": 25900
},
{
"epoch": 19.92337164750958,
"grad_norm": 3.6624252796173096,
"learning_rate": 3.75536398467433e-05,
"loss": 33.2399,
"step": 26000
},
{
"epoch": 20.0,
"grad_norm": 4.810445785522461,
"learning_rate": 3.750574712643679e-05,
"loss": 32.548,
"step": 26100
},
{
"epoch": 20.0,
"eval_loss": 34.746856689453125,
"eval_runtime": 49.2861,
"eval_samples_per_second": 26.478,
"eval_steps_per_second": 3.328,
"step": 26100
},
{
"epoch": 20.07662835249042,
"grad_norm": 4.07724142074585,
"learning_rate": 3.745785440613027e-05,
"loss": 33.562,
"step": 26200
},
{
"epoch": 20.153256704980844,
"grad_norm": 4.335379600524902,
"learning_rate": 3.7409961685823756e-05,
"loss": 33.166,
"step": 26300
},
{
"epoch": 20.229885057471265,
"grad_norm": 5.472820281982422,
"learning_rate": 3.736206896551724e-05,
"loss": 33.8918,
"step": 26400
},
{
"epoch": 20.306513409961685,
"grad_norm": 3.011789321899414,
"learning_rate": 3.731417624521073e-05,
"loss": 33.395,
"step": 26500
},
{
"epoch": 20.38314176245211,
"grad_norm": 3.251089096069336,
"learning_rate": 3.7266283524904216e-05,
"loss": 32.9072,
"step": 26600
},
{
"epoch": 20.45977011494253,
"grad_norm": 2.7508978843688965,
"learning_rate": 3.72183908045977e-05,
"loss": 33.92,
"step": 26700
},
{
"epoch": 20.53639846743295,
"grad_norm": 2.8051536083221436,
"learning_rate": 3.717049808429119e-05,
"loss": 33.9392,
"step": 26800
},
{
"epoch": 20.613026819923373,
"grad_norm": 7.377379417419434,
"learning_rate": 3.712260536398468e-05,
"loss": 33.0382,
"step": 26900
},
{
"epoch": 20.689655172413794,
"grad_norm": 3.7770464420318604,
"learning_rate": 3.7074712643678164e-05,
"loss": 32.6836,
"step": 27000
},
{
"epoch": 20.766283524904214,
"grad_norm": 4.923346996307373,
"learning_rate": 3.702681992337165e-05,
"loss": 33.2129,
"step": 27100
},
{
"epoch": 20.842911877394634,
"grad_norm": 4.790703773498535,
"learning_rate": 3.697892720306513e-05,
"loss": 33.5413,
"step": 27200
},
{
"epoch": 20.919540229885058,
"grad_norm": 4.592926025390625,
"learning_rate": 3.6931034482758624e-05,
"loss": 33.2436,
"step": 27300
},
{
"epoch": 20.99616858237548,
"grad_norm": 3.0529520511627197,
"learning_rate": 3.688314176245211e-05,
"loss": 33.2415,
"step": 27400
},
{
"epoch": 21.0,
"eval_loss": 34.59661865234375,
"eval_runtime": 49.3345,
"eval_samples_per_second": 26.452,
"eval_steps_per_second": 3.324,
"step": 27405
},
{
"epoch": 21.0727969348659,
"grad_norm": 2.287121534347534,
"learning_rate": 3.683572796934866e-05,
"loss": 32.9962,
"step": 27500
},
{
"epoch": 21.149425287356323,
"grad_norm": 2.5622124671936035,
"learning_rate": 3.678783524904214e-05,
"loss": 33.2565,
"step": 27600
},
{
"epoch": 21.226053639846743,
"grad_norm": 2.2134974002838135,
"learning_rate": 3.673994252873563e-05,
"loss": 33.7442,
"step": 27700
},
{
"epoch": 21.302681992337163,
"grad_norm": 2.574054002761841,
"learning_rate": 3.669204980842912e-05,
"loss": 33.7998,
"step": 27800
},
{
"epoch": 21.379310344827587,
"grad_norm": 2.8479721546173096,
"learning_rate": 3.6644157088122604e-05,
"loss": 33.2015,
"step": 27900
},
{
"epoch": 21.455938697318008,
"grad_norm": 4.845319747924805,
"learning_rate": 3.659626436781609e-05,
"loss": 33.7904,
"step": 28000
},
{
"epoch": 21.532567049808428,
"grad_norm": 2.353726863861084,
"learning_rate": 3.6548371647509584e-05,
"loss": 33.7207,
"step": 28100
},
{
"epoch": 21.60919540229885,
"grad_norm": 3.003556966781616,
"learning_rate": 3.650047892720307e-05,
"loss": 33.297,
"step": 28200
},
{
"epoch": 21.685823754789272,
"grad_norm": 4.815252304077148,
"learning_rate": 3.645258620689656e-05,
"loss": 33.3036,
"step": 28300
},
{
"epoch": 21.762452107279692,
"grad_norm": 3.0622081756591797,
"learning_rate": 3.640469348659004e-05,
"loss": 33.3661,
"step": 28400
},
{
"epoch": 21.839080459770116,
"grad_norm": 3.3728883266448975,
"learning_rate": 3.6356800766283525e-05,
"loss": 32.8782,
"step": 28500
},
{
"epoch": 21.915708812260537,
"grad_norm": 2.2338080406188965,
"learning_rate": 3.630890804597701e-05,
"loss": 33.0412,
"step": 28600
},
{
"epoch": 21.992337164750957,
"grad_norm": 3.717360019683838,
"learning_rate": 3.62610153256705e-05,
"loss": 33.0318,
"step": 28700
},
{
"epoch": 22.0,
"eval_loss": 34.53865432739258,
"eval_runtime": 49.3318,
"eval_samples_per_second": 26.454,
"eval_steps_per_second": 3.324,
"step": 28710
},
{
"epoch": 22.06896551724138,
"grad_norm": 2.77984356880188,
"learning_rate": 3.6213122605363986e-05,
"loss": 33.6688,
"step": 28800
},
{
"epoch": 22.1455938697318,
"grad_norm": 3.427570104598999,
"learning_rate": 3.616522988505747e-05,
"loss": 33.2569,
"step": 28900
},
{
"epoch": 22.22222222222222,
"grad_norm": 2.060288429260254,
"learning_rate": 3.611733716475096e-05,
"loss": 33.4445,
"step": 29000
},
{
"epoch": 22.298850574712645,
"grad_norm": 3.7918601036071777,
"learning_rate": 3.6069444444444446e-05,
"loss": 34.2303,
"step": 29100
},
{
"epoch": 22.375478927203066,
"grad_norm": 3.412705659866333,
"learning_rate": 3.602155172413793e-05,
"loss": 33.5679,
"step": 29200
},
{
"epoch": 22.452107279693486,
"grad_norm": 4.111233711242676,
"learning_rate": 3.597365900383142e-05,
"loss": 32.9136,
"step": 29300
},
{
"epoch": 22.52873563218391,
"grad_norm": 2.1312243938446045,
"learning_rate": 3.592576628352491e-05,
"loss": 32.8361,
"step": 29400
},
{
"epoch": 22.60536398467433,
"grad_norm": 2.0618536472320557,
"learning_rate": 3.587835249042146e-05,
"loss": 33.8499,
"step": 29500
},
{
"epoch": 22.68199233716475,
"grad_norm": 2.7332096099853516,
"learning_rate": 3.5830459770114946e-05,
"loss": 33.5899,
"step": 29600
},
{
"epoch": 22.75862068965517,
"grad_norm": 4.264729022979736,
"learning_rate": 3.578256704980843e-05,
"loss": 33.194,
"step": 29700
},
{
"epoch": 22.835249042145595,
"grad_norm": 4.755107402801514,
"learning_rate": 3.573467432950192e-05,
"loss": 33.2129,
"step": 29800
},
{
"epoch": 22.911877394636015,
"grad_norm": 3.751232147216797,
"learning_rate": 3.56867816091954e-05,
"loss": 33.2948,
"step": 29900
},
{
"epoch": 22.988505747126435,
"grad_norm": 3.3150830268859863,
"learning_rate": 3.5638888888888886e-05,
"loss": 32.452,
"step": 30000
},
{
"epoch": 23.0,
"eval_loss": 34.624755859375,
"eval_runtime": 49.3378,
"eval_samples_per_second": 26.45,
"eval_steps_per_second": 3.324,
"step": 30015
},
{
"epoch": 23.06513409961686,
"grad_norm": 1.9898459911346436,
"learning_rate": 3.559099616858237e-05,
"loss": 33.2659,
"step": 30100
},
{
"epoch": 23.14176245210728,
"grad_norm": 3.3541698455810547,
"learning_rate": 3.554310344827587e-05,
"loss": 33.4747,
"step": 30200
},
{
"epoch": 23.2183908045977,
"grad_norm": 2.298229694366455,
"learning_rate": 3.5495210727969354e-05,
"loss": 33.8791,
"step": 30300
},
{
"epoch": 23.295019157088124,
"grad_norm": 3.9336183071136475,
"learning_rate": 3.544731800766284e-05,
"loss": 33.8427,
"step": 30400
},
{
"epoch": 23.371647509578544,
"grad_norm": 2.9286720752716064,
"learning_rate": 3.539942528735633e-05,
"loss": 33.9572,
"step": 30500
},
{
"epoch": 23.448275862068964,
"grad_norm": 2.9716665744781494,
"learning_rate": 3.5351532567049814e-05,
"loss": 32.5295,
"step": 30600
},
{
"epoch": 23.52490421455939,
"grad_norm": 3.5073654651641846,
"learning_rate": 3.5303639846743294e-05,
"loss": 33.3511,
"step": 30700
},
{
"epoch": 23.60153256704981,
"grad_norm": 4.5670084953308105,
"learning_rate": 3.525574712643678e-05,
"loss": 33.4249,
"step": 30800
},
{
"epoch": 23.67816091954023,
"grad_norm": 2.563405990600586,
"learning_rate": 3.520785440613027e-05,
"loss": 33.821,
"step": 30900
},
{
"epoch": 23.754789272030653,
"grad_norm": 3.5928332805633545,
"learning_rate": 3.5159961685823755e-05,
"loss": 32.9252,
"step": 31000
},
{
"epoch": 23.831417624521073,
"grad_norm": 3.2677550315856934,
"learning_rate": 3.511206896551724e-05,
"loss": 33.4694,
"step": 31100
},
{
"epoch": 23.908045977011493,
"grad_norm": 3.8751015663146973,
"learning_rate": 3.506417624521073e-05,
"loss": 32.7835,
"step": 31200
},
{
"epoch": 23.984674329501917,
"grad_norm": 3.955101490020752,
"learning_rate": 3.5016283524904216e-05,
"loss": 32.6658,
"step": 31300
},
{
"epoch": 24.0,
"eval_loss": 34.550262451171875,
"eval_runtime": 49.3313,
"eval_samples_per_second": 26.454,
"eval_steps_per_second": 3.324,
"step": 31320
},
{
"epoch": 24.061302681992338,
"grad_norm": 3.885087013244629,
"learning_rate": 3.49683908045977e-05,
"loss": 33.5285,
"step": 31400
},
{
"epoch": 24.137931034482758,
"grad_norm": 8.908398628234863,
"learning_rate": 3.4920977011494254e-05,
"loss": 33.1673,
"step": 31500
},
{
"epoch": 24.21455938697318,
"grad_norm": 4.042150974273682,
"learning_rate": 3.487308429118774e-05,
"loss": 33.0384,
"step": 31600
},
{
"epoch": 24.291187739463602,
"grad_norm": 4.992551803588867,
"learning_rate": 3.482519157088123e-05,
"loss": 33.7439,
"step": 31700
},
{
"epoch": 24.367816091954023,
"grad_norm": 5.118918418884277,
"learning_rate": 3.4777298850574715e-05,
"loss": 33.5604,
"step": 31800
},
{
"epoch": 24.444444444444443,
"grad_norm": 3.2756083011627197,
"learning_rate": 3.47294061302682e-05,
"loss": 33.6225,
"step": 31900
},
{
"epoch": 24.521072796934867,
"grad_norm": 2.9864351749420166,
"learning_rate": 3.468151340996169e-05,
"loss": 34.0539,
"step": 32000
},
{
"epoch": 24.597701149425287,
"grad_norm": 2.945171356201172,
"learning_rate": 3.463362068965517e-05,
"loss": 33.2655,
"step": 32100
},
{
"epoch": 24.674329501915707,
"grad_norm": 4.09877347946167,
"learning_rate": 3.4585727969348656e-05,
"loss": 33.239,
"step": 32200
},
{
"epoch": 24.75095785440613,
"grad_norm": 3.7949306964874268,
"learning_rate": 3.453783524904215e-05,
"loss": 32.7246,
"step": 32300
},
{
"epoch": 24.82758620689655,
"grad_norm": 3.8750340938568115,
"learning_rate": 3.4489942528735636e-05,
"loss": 32.5477,
"step": 32400
},
{
"epoch": 24.904214559386972,
"grad_norm": 3.84676456451416,
"learning_rate": 3.444204980842912e-05,
"loss": 33.5781,
"step": 32500
},
{
"epoch": 24.980842911877396,
"grad_norm": 2.3316519260406494,
"learning_rate": 3.439415708812261e-05,
"loss": 33.0241,
"step": 32600
},
{
"epoch": 25.0,
"eval_loss": 34.565101623535156,
"eval_runtime": 49.343,
"eval_samples_per_second": 26.448,
"eval_steps_per_second": 3.324,
"step": 32625
},
{
"epoch": 25.057471264367816,
"grad_norm": 2.94795823097229,
"learning_rate": 3.43462643678161e-05,
"loss": 33.1012,
"step": 32700
},
{
"epoch": 25.134099616858236,
"grad_norm": 2.3455259799957275,
"learning_rate": 3.4298371647509584e-05,
"loss": 33.1345,
"step": 32800
},
{
"epoch": 25.21072796934866,
"grad_norm": 2.678739547729492,
"learning_rate": 3.4250478927203064e-05,
"loss": 33.2271,
"step": 32900
},
{
"epoch": 25.28735632183908,
"grad_norm": 4.3170952796936035,
"learning_rate": 3.420258620689655e-05,
"loss": 33.0392,
"step": 33000
},
{
"epoch": 25.3639846743295,
"grad_norm": 3.8895034790039062,
"learning_rate": 3.415469348659004e-05,
"loss": 33.2535,
"step": 33100
},
{
"epoch": 25.440613026819925,
"grad_norm": 3.693235158920288,
"learning_rate": 3.4106800766283525e-05,
"loss": 33.4471,
"step": 33200
},
{
"epoch": 25.517241379310345,
"grad_norm": 5.521793365478516,
"learning_rate": 3.405890804597701e-05,
"loss": 34.2142,
"step": 33300
},
{
"epoch": 25.593869731800766,
"grad_norm": 2.8983964920043945,
"learning_rate": 3.40110153256705e-05,
"loss": 34.362,
"step": 33400
},
{
"epoch": 25.67049808429119,
"grad_norm": 3.329155206680298,
"learning_rate": 3.396360153256705e-05,
"loss": 32.373,
"step": 33500
},
{
"epoch": 25.74712643678161,
"grad_norm": 2.6269519329071045,
"learning_rate": 3.391570881226054e-05,
"loss": 33.1401,
"step": 33600
},
{
"epoch": 25.82375478927203,
"grad_norm": 3.1628787517547607,
"learning_rate": 3.3867816091954024e-05,
"loss": 33.2718,
"step": 33700
},
{
"epoch": 25.900383141762454,
"grad_norm": 3.0653462409973145,
"learning_rate": 3.381992337164751e-05,
"loss": 33.481,
"step": 33800
},
{
"epoch": 25.977011494252874,
"grad_norm": 2.5874106884002686,
"learning_rate": 3.377250957854406e-05,
"loss": 33.2467,
"step": 33900
},
{
"epoch": 26.0,
"eval_loss": 34.54924392700195,
"eval_runtime": 49.3,
"eval_samples_per_second": 26.471,
"eval_steps_per_second": 3.327,
"step": 33930
},
{
"epoch": 26.053639846743295,
"grad_norm": 3.76274037361145,
"learning_rate": 3.372461685823755e-05,
"loss": 33.5457,
"step": 34000
},
{
"epoch": 26.130268199233715,
"grad_norm": 5.3265061378479,
"learning_rate": 3.3676724137931036e-05,
"loss": 33.2299,
"step": 34100
},
{
"epoch": 26.20689655172414,
"grad_norm": 4.5878987312316895,
"learning_rate": 3.362883141762452e-05,
"loss": 33.7869,
"step": 34200
},
{
"epoch": 26.28352490421456,
"grad_norm": 3.673882007598877,
"learning_rate": 3.358093869731801e-05,
"loss": 32.6976,
"step": 34300
},
{
"epoch": 26.36015325670498,
"grad_norm": 3.5689809322357178,
"learning_rate": 3.35330459770115e-05,
"loss": 33.6335,
"step": 34400
},
{
"epoch": 26.436781609195403,
"grad_norm": 5.735408306121826,
"learning_rate": 3.3485153256704984e-05,
"loss": 33.116,
"step": 34500
},
{
"epoch": 26.513409961685824,
"grad_norm": 1.9485822916030884,
"learning_rate": 3.343726053639847e-05,
"loss": 33.9434,
"step": 34600
},
{
"epoch": 26.590038314176244,
"grad_norm": 4.049289226531982,
"learning_rate": 3.338936781609196e-05,
"loss": 33.2275,
"step": 34700
},
{
"epoch": 26.666666666666668,
"grad_norm": 2.392415761947632,
"learning_rate": 3.334147509578544e-05,
"loss": 33.1124,
"step": 34800
},
{
"epoch": 26.743295019157088,
"grad_norm": 3.01650333404541,
"learning_rate": 3.3293582375478924e-05,
"loss": 33.0733,
"step": 34900
},
{
"epoch": 26.81992337164751,
"grad_norm": 3.1701831817626953,
"learning_rate": 3.324568965517241e-05,
"loss": 33.0029,
"step": 35000
},
{
"epoch": 26.896551724137932,
"grad_norm": 2.6294217109680176,
"learning_rate": 3.3197796934865905e-05,
"loss": 32.9881,
"step": 35100
},
{
"epoch": 26.973180076628353,
"grad_norm": 3.4342799186706543,
"learning_rate": 3.314990421455939e-05,
"loss": 33.047,
"step": 35200
},
{
"epoch": 27.0,
"eval_loss": 34.47444152832031,
"eval_runtime": 49.2859,
"eval_samples_per_second": 26.478,
"eval_steps_per_second": 3.328,
"step": 35235
},
{
"epoch": 27.049808429118773,
"grad_norm": 2.2080352306365967,
"learning_rate": 3.310201149425288e-05,
"loss": 33.0059,
"step": 35300
},
{
"epoch": 27.126436781609197,
"grad_norm": 3.0985817909240723,
"learning_rate": 3.3054118773946366e-05,
"loss": 33.8201,
"step": 35400
},
{
"epoch": 27.203065134099617,
"grad_norm": 3.165069103240967,
"learning_rate": 3.300622605363985e-05,
"loss": 33.7343,
"step": 35500
},
{
"epoch": 27.279693486590038,
"grad_norm": 3.2427308559417725,
"learning_rate": 3.295833333333333e-05,
"loss": 32.8829,
"step": 35600
},
{
"epoch": 27.35632183908046,
"grad_norm": 2.973548412322998,
"learning_rate": 3.291044061302682e-05,
"loss": 33.2656,
"step": 35700
},
{
"epoch": 27.43295019157088,
"grad_norm": 2.892834424972534,
"learning_rate": 3.2862547892720306e-05,
"loss": 33.085,
"step": 35800
},
{
"epoch": 27.509578544061302,
"grad_norm": 2.4037787914276123,
"learning_rate": 3.281465517241379e-05,
"loss": 32.7549,
"step": 35900
},
{
"epoch": 27.586206896551722,
"grad_norm": 3.7890052795410156,
"learning_rate": 3.276676245210728e-05,
"loss": 33.4256,
"step": 36000
},
{
"epoch": 27.662835249042146,
"grad_norm": 3.4910600185394287,
"learning_rate": 3.271886973180077e-05,
"loss": 33.3707,
"step": 36100
},
{
"epoch": 27.739463601532567,
"grad_norm": 2.895573854446411,
"learning_rate": 3.2670977011494254e-05,
"loss": 32.699,
"step": 36200
},
{
"epoch": 27.816091954022987,
"grad_norm": 4.670979022979736,
"learning_rate": 3.262308429118774e-05,
"loss": 33.5898,
"step": 36300
},
{
"epoch": 27.89272030651341,
"grad_norm": 2.362605571746826,
"learning_rate": 3.257519157088123e-05,
"loss": 33.4235,
"step": 36400
},
{
"epoch": 27.96934865900383,
"grad_norm": 4.695677280426025,
"learning_rate": 3.2527298850574715e-05,
"loss": 33.9318,
"step": 36500
},
{
"epoch": 28.0,
"eval_loss": 34.4583740234375,
"eval_runtime": 49.3101,
"eval_samples_per_second": 26.465,
"eval_steps_per_second": 3.326,
"step": 36540
},
{
"epoch": 28.04597701149425,
"grad_norm": 6.301197052001953,
"learning_rate": 3.24794061302682e-05,
"loss": 33.1986,
"step": 36600
},
{
"epoch": 28.122605363984675,
"grad_norm": 3.1395254135131836,
"learning_rate": 3.243151340996169e-05,
"loss": 32.8468,
"step": 36700
},
{
"epoch": 28.199233716475096,
"grad_norm": 2.673875331878662,
"learning_rate": 3.2383620689655175e-05,
"loss": 32.7804,
"step": 36800
},
{
"epoch": 28.275862068965516,
"grad_norm": 3.807201862335205,
"learning_rate": 3.233572796934866e-05,
"loss": 34.128,
"step": 36900
},
{
"epoch": 28.35249042145594,
"grad_norm": 3.2160332202911377,
"learning_rate": 3.228783524904215e-05,
"loss": 33.0419,
"step": 37000
},
{
"epoch": 28.42911877394636,
"grad_norm": 3.2508413791656494,
"learning_rate": 3.2239942528735636e-05,
"loss": 33.3642,
"step": 37100
},
{
"epoch": 28.50574712643678,
"grad_norm": 4.088146209716797,
"learning_rate": 3.219204980842912e-05,
"loss": 33.1516,
"step": 37200
},
{
"epoch": 28.582375478927204,
"grad_norm": 3.4091460704803467,
"learning_rate": 3.214415708812261e-05,
"loss": 33.5824,
"step": 37300
},
{
"epoch": 28.659003831417625,
"grad_norm": 3.113368034362793,
"learning_rate": 3.209626436781609e-05,
"loss": 33.2279,
"step": 37400
},
{
"epoch": 28.735632183908045,
"grad_norm": 3.7009544372558594,
"learning_rate": 3.2048371647509577e-05,
"loss": 33.0744,
"step": 37500
},
{
"epoch": 28.81226053639847,
"grad_norm": 2.046365261077881,
"learning_rate": 3.2000478927203063e-05,
"loss": 33.1949,
"step": 37600
},
{
"epoch": 28.88888888888889,
"grad_norm": 3.8142659664154053,
"learning_rate": 3.195258620689655e-05,
"loss": 33.3156,
"step": 37700
},
{
"epoch": 28.96551724137931,
"grad_norm": 3.120384454727173,
"learning_rate": 3.190469348659004e-05,
"loss": 32.828,
"step": 37800
},
{
"epoch": 29.0,
"eval_loss": 34.519615173339844,
"eval_runtime": 49.3498,
"eval_samples_per_second": 26.444,
"eval_steps_per_second": 3.323,
"step": 37845
},
{
"epoch": 29.042145593869733,
"grad_norm": 4.515305042266846,
"learning_rate": 3.185727969348659e-05,
"loss": 33.5753,
"step": 37900
},
{
"epoch": 29.118773946360154,
"grad_norm": 3.7501096725463867,
"learning_rate": 3.1809386973180076e-05,
"loss": 33.3131,
"step": 38000
},
{
"epoch": 29.195402298850574,
"grad_norm": 3.431818723678589,
"learning_rate": 3.176149425287356e-05,
"loss": 33.085,
"step": 38100
},
{
"epoch": 29.272030651340994,
"grad_norm": 3.4503543376922607,
"learning_rate": 3.171360153256705e-05,
"loss": 32.7894,
"step": 38200
},
{
"epoch": 29.34865900383142,
"grad_norm": 4.361378192901611,
"learning_rate": 3.1665708812260536e-05,
"loss": 33.4922,
"step": 38300
},
{
"epoch": 29.42528735632184,
"grad_norm": 2.354480504989624,
"learning_rate": 3.161781609195402e-05,
"loss": 33.3214,
"step": 38400
},
{
"epoch": 29.50191570881226,
"grad_norm": 3.3123044967651367,
"learning_rate": 3.156992337164751e-05,
"loss": 33.3181,
"step": 38500
},
{
"epoch": 29.578544061302683,
"grad_norm": 2.3824117183685303,
"learning_rate": 3.1522030651341e-05,
"loss": 33.0926,
"step": 38600
},
{
"epoch": 29.655172413793103,
"grad_norm": 2.811178684234619,
"learning_rate": 3.1474137931034484e-05,
"loss": 33.3361,
"step": 38700
},
{
"epoch": 29.731800766283524,
"grad_norm": 4.715090751647949,
"learning_rate": 3.142624521072797e-05,
"loss": 32.8444,
"step": 38800
},
{
"epoch": 29.808429118773947,
"grad_norm": 2.191209316253662,
"learning_rate": 3.137835249042146e-05,
"loss": 33.9677,
"step": 38900
},
{
"epoch": 29.885057471264368,
"grad_norm": 2.606814384460449,
"learning_rate": 3.1330459770114945e-05,
"loss": 33.3536,
"step": 39000
},
{
"epoch": 29.961685823754788,
"grad_norm": 4.8533172607421875,
"learning_rate": 3.128256704980843e-05,
"loss": 33.2721,
"step": 39100
},
{
"epoch": 30.0,
"eval_loss": 34.46094512939453,
"eval_runtime": 49.4265,
"eval_samples_per_second": 26.403,
"eval_steps_per_second": 3.318,
"step": 39150
},
{
"epoch": 30.038314176245212,
"grad_norm": 4.915451526641846,
"learning_rate": 3.123467432950192e-05,
"loss": 33.0369,
"step": 39200
},
{
"epoch": 30.114942528735632,
"grad_norm": 4.369636058807373,
"learning_rate": 3.1186781609195405e-05,
"loss": 33.1459,
"step": 39300
},
{
"epoch": 30.191570881226053,
"grad_norm": 2.9162957668304443,
"learning_rate": 3.113888888888889e-05,
"loss": 32.9688,
"step": 39400
},
{
"epoch": 30.268199233716476,
"grad_norm": 4.7777628898620605,
"learning_rate": 3.109099616858238e-05,
"loss": 33.7249,
"step": 39500
},
{
"epoch": 30.344827586206897,
"grad_norm": 3.651850700378418,
"learning_rate": 3.104310344827586e-05,
"loss": 33.4887,
"step": 39600
},
{
"epoch": 30.421455938697317,
"grad_norm": 3.29491925239563,
"learning_rate": 3.0995210727969346e-05,
"loss": 33.5714,
"step": 39700
},
{
"epoch": 30.49808429118774,
"grad_norm": 3.9116616249084473,
"learning_rate": 3.094731800766283e-05,
"loss": 33.7763,
"step": 39800
},
{
"epoch": 30.57471264367816,
"grad_norm": null,
"learning_rate": 3.089990421455939e-05,
"loss": 32.1907,
"step": 39900
},
{
"epoch": 30.65134099616858,
"grad_norm": 3.237652063369751,
"learning_rate": 3.085201149425287e-05,
"loss": 33.344,
"step": 40000
},
{
"epoch": 30.727969348659006,
"grad_norm": 4.286235809326172,
"learning_rate": 3.080459770114943e-05,
"loss": 33.181,
"step": 40100
},
{
"epoch": 30.804597701149426,
"grad_norm": 2.6222527027130127,
"learning_rate": 3.075670498084292e-05,
"loss": 33.3407,
"step": 40200
},
{
"epoch": 30.881226053639846,
"grad_norm": 3.7431180477142334,
"learning_rate": 3.0708812260536404e-05,
"loss": 33.1109,
"step": 40300
},
{
"epoch": 30.957854406130267,
"grad_norm": 3.0706677436828613,
"learning_rate": 3.066091954022989e-05,
"loss": 33.3504,
"step": 40400
},
{
"epoch": 31.0,
"eval_loss": 34.48047637939453,
"eval_runtime": 49.4044,
"eval_samples_per_second": 26.415,
"eval_steps_per_second": 3.32,
"step": 40455
},
{
"epoch": 31.03448275862069,
"grad_norm": 3.288548231124878,
"learning_rate": 3.061302681992337e-05,
"loss": 33.4014,
"step": 40500
},
{
"epoch": 31.11111111111111,
"grad_norm": 4.078604221343994,
"learning_rate": 3.056513409961686e-05,
"loss": 33.5796,
"step": 40600
},
{
"epoch": 31.18773946360153,
"grad_norm": 3.589484691619873,
"learning_rate": 3.0517241379310348e-05,
"loss": 32.9547,
"step": 40700
},
{
"epoch": 31.264367816091955,
"grad_norm": 3.1043126583099365,
"learning_rate": 3.046934865900383e-05,
"loss": 33.2105,
"step": 40800
},
{
"epoch": 31.340996168582375,
"grad_norm": 2.446356773376465,
"learning_rate": 3.0421455938697318e-05,
"loss": 33.1642,
"step": 40900
},
{
"epoch": 31.417624521072796,
"grad_norm": 2.966627597808838,
"learning_rate": 3.0373563218390805e-05,
"loss": 32.7751,
"step": 41000
},
{
"epoch": 31.49425287356322,
"grad_norm": 4.547020435333252,
"learning_rate": 3.0325670498084292e-05,
"loss": 33.8578,
"step": 41100
},
{
"epoch": 31.57088122605364,
"grad_norm": 3.151139259338379,
"learning_rate": 3.0277777777777776e-05,
"loss": 33.2976,
"step": 41200
},
{
"epoch": 31.64750957854406,
"grad_norm": 2.8900582790374756,
"learning_rate": 3.0229885057471262e-05,
"loss": 33.1161,
"step": 41300
},
{
"epoch": 31.724137931034484,
"grad_norm": 2.5485446453094482,
"learning_rate": 3.0181992337164756e-05,
"loss": 33.596,
"step": 41400
},
{
"epoch": 31.800766283524904,
"grad_norm": 2.5474777221679688,
"learning_rate": 3.0134099616858243e-05,
"loss": 33.3569,
"step": 41500
},
{
"epoch": 31.877394636015325,
"grad_norm": 3.6182713508605957,
"learning_rate": 3.0086206896551726e-05,
"loss": 32.824,
"step": 41600
},
{
"epoch": 31.95402298850575,
"grad_norm": 3.898332118988037,
"learning_rate": 3.0038314176245213e-05,
"loss": 32.8775,
"step": 41700
},
{
"epoch": 32.0,
"eval_loss": 34.500526428222656,
"eval_runtime": 49.4041,
"eval_samples_per_second": 26.415,
"eval_steps_per_second": 3.32,
"step": 41760
},
{
"epoch": 32.030651340996165,
"grad_norm": 3.481757164001465,
"learning_rate": 2.99904214559387e-05,
"loss": 33.4618,
"step": 41800
},
{
"epoch": 32.10727969348659,
"grad_norm": 3.9191551208496094,
"learning_rate": 2.9942528735632187e-05,
"loss": 33.627,
"step": 41900
},
{
"epoch": 32.18390804597701,
"grad_norm": 5.722991466522217,
"learning_rate": 2.989463601532567e-05,
"loss": 32.705,
"step": 42000
},
{
"epoch": 32.26053639846743,
"grad_norm": 4.626276016235352,
"learning_rate": 2.9846743295019157e-05,
"loss": 33.4211,
"step": 42100
},
{
"epoch": 32.337164750957854,
"grad_norm": 2.526745557785034,
"learning_rate": 2.9798850574712644e-05,
"loss": 32.9605,
"step": 42200
},
{
"epoch": 32.41379310344828,
"grad_norm": 2.2517364025115967,
"learning_rate": 2.975095785440613e-05,
"loss": 33.1264,
"step": 42300
},
{
"epoch": 32.490421455938694,
"grad_norm": 5.5678606033325195,
"learning_rate": 2.9703065134099618e-05,
"loss": 33.1141,
"step": 42400
},
{
"epoch": 32.56704980842912,
"grad_norm": 3.7891595363616943,
"learning_rate": 2.96551724137931e-05,
"loss": 33.3294,
"step": 42500
},
{
"epoch": 32.64367816091954,
"grad_norm": 3.350956916809082,
"learning_rate": 2.960727969348659e-05,
"loss": 33.6182,
"step": 42600
},
{
"epoch": 32.72030651340996,
"grad_norm": 3.138821601867676,
"learning_rate": 2.9559386973180075e-05,
"loss": 33.2219,
"step": 42700
},
{
"epoch": 32.79693486590038,
"grad_norm": 3.301961898803711,
"learning_rate": 2.9511494252873566e-05,
"loss": 33.5015,
"step": 42800
},
{
"epoch": 32.87356321839081,
"grad_norm": 3.0760138034820557,
"learning_rate": 2.9463601532567052e-05,
"loss": 33.3376,
"step": 42900
},
{
"epoch": 32.95019157088122,
"grad_norm": 2.474372625350952,
"learning_rate": 2.941570881226054e-05,
"loss": 32.9016,
"step": 43000
},
{
"epoch": 33.0,
"eval_loss": 34.45762252807617,
"eval_runtime": 49.4143,
"eval_samples_per_second": 26.409,
"eval_steps_per_second": 3.319,
"step": 43065
}
],
"logging_steps": 100,
"max_steps": 104400,
"num_input_tokens_seen": 0,
"num_train_epochs": 80,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 10,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.644125980055654e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}