{
"best_metric": 34.700294494628906,
"best_model_checkpoint": "/kaggle/working/output/checkpoint-13050",
"epoch": 11.0,
"eval_steps": 500,
"global_step": 14355,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.07662835249042145,
"grad_norm": 9.545656204223633,
"learning_rate": 4.9952586206896554e-05,
"loss": 58.0015,
"step": 100
},
{
"epoch": 0.1532567049808429,
"grad_norm": 3.9482674598693848,
"learning_rate": 4.990469348659004e-05,
"loss": 38.502,
"step": 200
},
{
"epoch": 0.22988505747126436,
"grad_norm": 2.5423216819763184,
"learning_rate": 4.985680076628353e-05,
"loss": 35.7891,
"step": 300
},
{
"epoch": 0.3065134099616858,
"grad_norm": 3.6723568439483643,
"learning_rate": 4.9808908045977015e-05,
"loss": 34.9999,
"step": 400
},
{
"epoch": 0.3831417624521073,
"grad_norm": 2.0953221321105957,
"learning_rate": 4.97610153256705e-05,
"loss": 35.9283,
"step": 500
},
{
"epoch": 0.45977011494252873,
"grad_norm": 4.932604789733887,
"learning_rate": 4.971312260536399e-05,
"loss": 34.5531,
"step": 600
},
{
"epoch": 0.5363984674329502,
"grad_norm": 5.419522762298584,
"learning_rate": 4.9665229885057475e-05,
"loss": 34.7408,
"step": 700
},
{
"epoch": 0.6130268199233716,
"grad_norm": 3.9690020084381104,
"learning_rate": 4.961733716475096e-05,
"loss": 34.5521,
"step": 800
},
{
"epoch": 0.6896551724137931,
"grad_norm": 3.3197548389434814,
"learning_rate": 4.956944444444445e-05,
"loss": 33.4281,
"step": 900
},
{
"epoch": 0.7662835249042146,
"grad_norm": 4.233493328094482,
"learning_rate": 4.952155172413793e-05,
"loss": 34.3137,
"step": 1000
},
{
"epoch": 0.842911877394636,
"grad_norm": 5.390758037567139,
"learning_rate": 4.9473659003831416e-05,
"loss": 33.9454,
"step": 1100
},
{
"epoch": 0.9195402298850575,
"grad_norm": 3.419612407684326,
"learning_rate": 4.94257662835249e-05,
"loss": 34.2298,
"step": 1200
},
{
"epoch": 0.9961685823754789,
"grad_norm": 2.3791182041168213,
"learning_rate": 4.937787356321839e-05,
"loss": 33.5481,
"step": 1300
},
{
"epoch": 1.0,
"eval_loss": 35.558197021484375,
"eval_runtime": 49.3359,
"eval_samples_per_second": 26.451,
"eval_steps_per_second": 3.324,
"step": 1305
},
{
"epoch": 1.0727969348659003,
"grad_norm": 3.0501019954681396,
"learning_rate": 4.932998084291188e-05,
"loss": 34.3557,
"step": 1400
},
{
"epoch": 1.1494252873563218,
"grad_norm": 3.027714252471924,
"learning_rate": 4.928208812260537e-05,
"loss": 34.2442,
"step": 1500
},
{
"epoch": 1.2260536398467432,
"grad_norm": 3.693758249282837,
"learning_rate": 4.923419540229886e-05,
"loss": 33.5375,
"step": 1600
},
{
"epoch": 1.3026819923371646,
"grad_norm": 3.7679357528686523,
"learning_rate": 4.9186302681992344e-05,
"loss": 33.7891,
"step": 1700
},
{
"epoch": 1.3793103448275863,
"grad_norm": 3.2367331981658936,
"learning_rate": 4.9138409961685824e-05,
"loss": 33.4964,
"step": 1800
},
{
"epoch": 1.4559386973180077,
"grad_norm": 3.6876628398895264,
"learning_rate": 4.909051724137931e-05,
"loss": 34.7739,
"step": 1900
},
{
"epoch": 1.5325670498084292,
"grad_norm": 1.9550260305404663,
"learning_rate": 4.90426245210728e-05,
"loss": 34.2552,
"step": 2000
},
{
"epoch": 1.6091954022988506,
"grad_norm": 4.955118656158447,
"learning_rate": 4.8994731800766285e-05,
"loss": 33.9766,
"step": 2100
},
{
"epoch": 1.685823754789272,
"grad_norm": 6.145394802093506,
"learning_rate": 4.894683908045977e-05,
"loss": 34.1676,
"step": 2200
},
{
"epoch": 1.7624521072796935,
"grad_norm": 6.15125846862793,
"learning_rate": 4.889894636015326e-05,
"loss": 34.3084,
"step": 2300
},
{
"epoch": 1.839080459770115,
"grad_norm": 2.647857427597046,
"learning_rate": 4.8851053639846746e-05,
"loss": 34.6449,
"step": 2400
},
{
"epoch": 1.9157088122605364,
"grad_norm": 4.066762447357178,
"learning_rate": 4.880316091954023e-05,
"loss": 34.1318,
"step": 2500
},
{
"epoch": 1.9923371647509578,
"grad_norm": 5.785406589508057,
"learning_rate": 4.875526819923372e-05,
"loss": 34.1303,
"step": 2600
},
{
"epoch": 2.0,
"eval_loss": 35.211631774902344,
"eval_runtime": 49.3338,
"eval_samples_per_second": 26.452,
"eval_steps_per_second": 3.324,
"step": 2610
},
{
"epoch": 2.0689655172413794,
"grad_norm": 6.074384689331055,
"learning_rate": 4.8707375478927206e-05,
"loss": 33.6587,
"step": 2700
},
{
"epoch": 2.1455938697318007,
"grad_norm": 3.770009994506836,
"learning_rate": 4.865948275862069e-05,
"loss": 34.5023,
"step": 2800
},
{
"epoch": 2.2222222222222223,
"grad_norm": 4.6336140632629395,
"learning_rate": 4.861159003831418e-05,
"loss": 34.1806,
"step": 2900
},
{
"epoch": 2.2988505747126435,
"grad_norm": 5.440792083740234,
"learning_rate": 4.856369731800767e-05,
"loss": 34.6645,
"step": 3000
},
{
"epoch": 2.375478927203065,
"grad_norm": 2.98138165473938,
"learning_rate": 4.8515804597701154e-05,
"loss": 34.1371,
"step": 3100
},
{
"epoch": 2.4521072796934864,
"grad_norm": 2.4175803661346436,
"learning_rate": 4.846791187739464e-05,
"loss": 33.8015,
"step": 3200
},
{
"epoch": 2.528735632183908,
"grad_norm": 3.846370220184326,
"learning_rate": 4.842001915708813e-05,
"loss": 34.0589,
"step": 3300
},
{
"epoch": 2.6053639846743293,
"grad_norm": 4.001793384552002,
"learning_rate": 4.8372126436781614e-05,
"loss": 33.7327,
"step": 3400
},
{
"epoch": 2.681992337164751,
"grad_norm": 3.7779624462127686,
"learning_rate": 4.83242337164751e-05,
"loss": 34.3508,
"step": 3500
},
{
"epoch": 2.7586206896551726,
"grad_norm": 3.5112695693969727,
"learning_rate": 4.827634099616858e-05,
"loss": 33.5653,
"step": 3600
},
{
"epoch": 2.835249042145594,
"grad_norm": 2.3443048000335693,
"learning_rate": 4.822844827586207e-05,
"loss": 33.798,
"step": 3700
},
{
"epoch": 2.9118773946360155,
"grad_norm": 2.5035479068756104,
"learning_rate": 4.8180555555555555e-05,
"loss": 33.4353,
"step": 3800
},
{
"epoch": 2.9885057471264367,
"grad_norm": 3.4322028160095215,
"learning_rate": 4.813266283524904e-05,
"loss": 33.948,
"step": 3900
},
{
"epoch": 3.0,
"eval_loss": 35.00273132324219,
"eval_runtime": 49.3242,
"eval_samples_per_second": 26.458,
"eval_steps_per_second": 3.325,
"step": 3915
},
{
"epoch": 3.0651340996168583,
"grad_norm": 2.8833682537078857,
"learning_rate": 4.808477011494253e-05,
"loss": 34.523,
"step": 4000
},
{
"epoch": 3.1417624521072796,
"grad_norm": 2.8744261264801025,
"learning_rate": 4.803735632183908e-05,
"loss": 33.921,
"step": 4100
},
{
"epoch": 3.218390804597701,
"grad_norm": 2.928616762161255,
"learning_rate": 4.798946360153257e-05,
"loss": 33.6903,
"step": 4200
},
{
"epoch": 3.2950191570881224,
"grad_norm": 3.0579280853271484,
"learning_rate": 4.7941570881226054e-05,
"loss": 33.0608,
"step": 4300
},
{
"epoch": 3.371647509578544,
"grad_norm": 1.6688510179519653,
"learning_rate": 4.789367816091954e-05,
"loss": 33.8769,
"step": 4400
},
{
"epoch": 3.4482758620689653,
"grad_norm": 2.6190459728240967,
"learning_rate": 4.784578544061303e-05,
"loss": 33.2974,
"step": 4500
},
{
"epoch": 3.524904214559387,
"grad_norm": 2.6260671615600586,
"learning_rate": 4.7797892720306515e-05,
"loss": 34.0589,
"step": 4600
},
{
"epoch": 3.6015325670498086,
"grad_norm": 3.191978693008423,
"learning_rate": 4.775e-05,
"loss": 33.9493,
"step": 4700
},
{
"epoch": 3.67816091954023,
"grad_norm": 2.759941339492798,
"learning_rate": 4.770210727969349e-05,
"loss": 33.5936,
"step": 4800
},
{
"epoch": 3.7547892720306515,
"grad_norm": 2.262294054031372,
"learning_rate": 4.7654214559386976e-05,
"loss": 34.06,
"step": 4900
},
{
"epoch": 3.8314176245210727,
"grad_norm": 4.6808600425720215,
"learning_rate": 4.760632183908046e-05,
"loss": 34.1592,
"step": 5000
},
{
"epoch": 3.9080459770114944,
"grad_norm": 4.294464111328125,
"learning_rate": 4.755842911877395e-05,
"loss": 34.4652,
"step": 5100
},
{
"epoch": 3.9846743295019156,
"grad_norm": 2.7845072746276855,
"learning_rate": 4.7510536398467436e-05,
"loss": 34.2075,
"step": 5200
},
{
"epoch": 4.0,
"eval_loss": 34.954986572265625,
"eval_runtime": 49.2865,
"eval_samples_per_second": 26.478,
"eval_steps_per_second": 3.327,
"step": 5220
},
{
"epoch": 4.061302681992337,
"grad_norm": 4.420943260192871,
"learning_rate": 4.746264367816092e-05,
"loss": 34.5735,
"step": 5300
},
{
"epoch": 4.137931034482759,
"grad_norm": 2.898287534713745,
"learning_rate": 4.741475095785441e-05,
"loss": 34.0739,
"step": 5400
},
{
"epoch": 4.21455938697318,
"grad_norm": 4.703996658325195,
"learning_rate": 4.73668582375479e-05,
"loss": 33.7022,
"step": 5500
},
{
"epoch": 4.291187739463601,
"grad_norm": 2.2913658618927,
"learning_rate": 4.7318965517241384e-05,
"loss": 33.6581,
"step": 5600
},
{
"epoch": 4.3678160919540225,
"grad_norm": 3.895615339279175,
"learning_rate": 4.727107279693487e-05,
"loss": 34.0314,
"step": 5700
},
{
"epoch": 4.444444444444445,
"grad_norm": 4.635524749755859,
"learning_rate": 4.722318007662835e-05,
"loss": 34.5266,
"step": 5800
},
{
"epoch": 4.521072796934866,
"grad_norm": 3.451066017150879,
"learning_rate": 4.717528735632184e-05,
"loss": 33.1786,
"step": 5900
},
{
"epoch": 4.597701149425287,
"grad_norm": 2.552107810974121,
"learning_rate": 4.7127394636015325e-05,
"loss": 33.6118,
"step": 6000
},
{
"epoch": 4.674329501915709,
"grad_norm": 2.359786033630371,
"learning_rate": 4.707998084291188e-05,
"loss": 33.9903,
"step": 6100
},
{
"epoch": 4.75095785440613,
"grad_norm": 2.2611875534057617,
"learning_rate": 4.703208812260537e-05,
"loss": 34.0762,
"step": 6200
},
{
"epoch": 4.827586206896552,
"grad_norm": 1.8199210166931152,
"learning_rate": 4.698419540229885e-05,
"loss": 33.6635,
"step": 6300
},
{
"epoch": 4.904214559386973,
"grad_norm": 2.7332305908203125,
"learning_rate": 4.693630268199234e-05,
"loss": 33.0946,
"step": 6400
},
{
"epoch": 4.980842911877395,
"grad_norm": 2.9454078674316406,
"learning_rate": 4.6888409961685824e-05,
"loss": 33.9173,
"step": 6500
},
{
"epoch": 5.0,
"eval_loss": 34.924800872802734,
"eval_runtime": 49.3002,
"eval_samples_per_second": 26.47,
"eval_steps_per_second": 3.327,
"step": 6525
},
{
"epoch": 5.057471264367816,
"grad_norm": 2.3083884716033936,
"learning_rate": 4.684051724137931e-05,
"loss": 33.8987,
"step": 6600
},
{
"epoch": 5.134099616858237,
"grad_norm": 2.228327751159668,
"learning_rate": 4.67926245210728e-05,
"loss": 33.8189,
"step": 6700
},
{
"epoch": 5.210727969348659,
"grad_norm": 3.6814918518066406,
"learning_rate": 4.6744731800766284e-05,
"loss": 33.8364,
"step": 6800
},
{
"epoch": 5.287356321839081,
"grad_norm": 2.5758285522460938,
"learning_rate": 4.669683908045977e-05,
"loss": 33.7093,
"step": 6900
},
{
"epoch": 5.363984674329502,
"grad_norm": 4.175839900970459,
"learning_rate": 4.6648946360153265e-05,
"loss": 33.6689,
"step": 7000
},
{
"epoch": 5.440613026819923,
"grad_norm": 2.213092088699341,
"learning_rate": 4.6601053639846745e-05,
"loss": 33.7936,
"step": 7100
},
{
"epoch": 5.517241379310345,
"grad_norm": 2.4982571601867676,
"learning_rate": 4.655316091954023e-05,
"loss": 33.3686,
"step": 7200
},
{
"epoch": 5.593869731800766,
"grad_norm": 3.635983943939209,
"learning_rate": 4.6505747126436784e-05,
"loss": 33.5493,
"step": 7300
},
{
"epoch": 5.670498084291188,
"grad_norm": 4.315894603729248,
"learning_rate": 4.645785440613027e-05,
"loss": 33.6607,
"step": 7400
},
{
"epoch": 5.747126436781609,
"grad_norm": 2.6151223182678223,
"learning_rate": 4.640996168582376e-05,
"loss": 34.7535,
"step": 7500
},
{
"epoch": 5.823754789272031,
"grad_norm": 4.03953218460083,
"learning_rate": 4.6362068965517244e-05,
"loss": 33.9865,
"step": 7600
},
{
"epoch": 5.900383141762452,
"grad_norm": 2.512362480163574,
"learning_rate": 4.6314176245210724e-05,
"loss": 33.0343,
"step": 7700
},
{
"epoch": 5.977011494252873,
"grad_norm": 4.745575428009033,
"learning_rate": 4.626628352490422e-05,
"loss": 33.4544,
"step": 7800
},
{
"epoch": 6.0,
"eval_loss": 34.841033935546875,
"eval_runtime": 49.3059,
"eval_samples_per_second": 26.467,
"eval_steps_per_second": 3.326,
"step": 7830
},
{
"epoch": 6.053639846743295,
"grad_norm": 2.996056079864502,
"learning_rate": 4.6218390804597705e-05,
"loss": 33.631,
"step": 7900
},
{
"epoch": 6.130268199233717,
"grad_norm": 3.3260300159454346,
"learning_rate": 4.617049808429119e-05,
"loss": 33.9222,
"step": 8000
},
{
"epoch": 6.206896551724138,
"grad_norm": 2.214486598968506,
"learning_rate": 4.612260536398468e-05,
"loss": 32.9576,
"step": 8100
},
{
"epoch": 6.283524904214559,
"grad_norm": 3.6611664295196533,
"learning_rate": 4.6074712643678166e-05,
"loss": 33.5231,
"step": 8200
},
{
"epoch": 6.360153256704981,
"grad_norm": 2.582730770111084,
"learning_rate": 4.602681992337165e-05,
"loss": 33.6936,
"step": 8300
},
{
"epoch": 6.436781609195402,
"grad_norm": 2.739861488342285,
"learning_rate": 4.597892720306514e-05,
"loss": 33.3997,
"step": 8400
},
{
"epoch": 6.513409961685824,
"grad_norm": 2.2102463245391846,
"learning_rate": 4.593103448275862e-05,
"loss": 33.9374,
"step": 8500
},
{
"epoch": 6.590038314176245,
"grad_norm": 3.83150577545166,
"learning_rate": 4.5883141762452106e-05,
"loss": 33.9961,
"step": 8600
},
{
"epoch": 6.666666666666667,
"grad_norm": 3.981616735458374,
"learning_rate": 4.583524904214559e-05,
"loss": 33.5413,
"step": 8700
},
{
"epoch": 6.743295019157088,
"grad_norm": 2.3303332328796387,
"learning_rate": 4.578735632183908e-05,
"loss": 34.0529,
"step": 8800
},
{
"epoch": 6.819923371647509,
"grad_norm": 3.9573702812194824,
"learning_rate": 4.573946360153257e-05,
"loss": 33.2897,
"step": 8900
},
{
"epoch": 6.896551724137931,
"grad_norm": 2.6185879707336426,
"learning_rate": 4.5691570881226054e-05,
"loss": 34.0662,
"step": 9000
},
{
"epoch": 6.973180076628353,
"grad_norm": 3.1155271530151367,
"learning_rate": 4.564367816091955e-05,
"loss": 33.517,
"step": 9100
},
{
"epoch": 7.0,
"eval_loss": 34.818748474121094,
"eval_runtime": 49.3029,
"eval_samples_per_second": 26.469,
"eval_steps_per_second": 3.326,
"step": 9135
},
{
"epoch": 7.049808429118774,
"grad_norm": 3.117553472518921,
"learning_rate": 4.5595785440613034e-05,
"loss": 34.1218,
"step": 9200
},
{
"epoch": 7.126436781609195,
"grad_norm": 2.5572612285614014,
"learning_rate": 4.5547892720306515e-05,
"loss": 33.662,
"step": 9300
},
{
"epoch": 7.203065134099617,
"grad_norm": 3.5347042083740234,
"learning_rate": 4.55e-05,
"loss": 34.4668,
"step": 9400
},
{
"epoch": 7.2796934865900385,
"grad_norm": 1.9216647148132324,
"learning_rate": 4.545210727969349e-05,
"loss": 33.4468,
"step": 9500
},
{
"epoch": 7.35632183908046,
"grad_norm": 4.242152214050293,
"learning_rate": 4.5404214559386975e-05,
"loss": 33.5805,
"step": 9600
},
{
"epoch": 7.432950191570881,
"grad_norm": 2.9310567378997803,
"learning_rate": 4.535632183908046e-05,
"loss": 34.0603,
"step": 9700
},
{
"epoch": 7.509578544061303,
"grad_norm": 2.6573023796081543,
"learning_rate": 4.530842911877395e-05,
"loss": 33.8766,
"step": 9800
},
{
"epoch": 7.586206896551724,
"grad_norm": 2.7849409580230713,
"learning_rate": 4.5260536398467436e-05,
"loss": 33.6309,
"step": 9900
},
{
"epoch": 7.662835249042145,
"grad_norm": 2.7377357482910156,
"learning_rate": 4.521264367816092e-05,
"loss": 33.3621,
"step": 10000
},
{
"epoch": 7.739463601532567,
"grad_norm": 2.106233835220337,
"learning_rate": 4.516475095785441e-05,
"loss": 33.4172,
"step": 10100
},
{
"epoch": 7.816091954022989,
"grad_norm": 2.1989126205444336,
"learning_rate": 4.5116858237547896e-05,
"loss": 33.5937,
"step": 10200
},
{
"epoch": 7.89272030651341,
"grad_norm": 2.903721570968628,
"learning_rate": 4.5068965517241377e-05,
"loss": 33.7935,
"step": 10300
},
{
"epoch": 7.969348659003831,
"grad_norm": 2.061602830886841,
"learning_rate": 4.5021072796934863e-05,
"loss": 33.3289,
"step": 10400
},
{
"epoch": 8.0,
"eval_loss": 34.95075607299805,
"eval_runtime": 49.3237,
"eval_samples_per_second": 26.458,
"eval_steps_per_second": 3.325,
"step": 10440
},
{
"epoch": 8.045977011494253,
"grad_norm": 1.8656938076019287,
"learning_rate": 4.497318007662836e-05,
"loss": 33.8404,
"step": 10500
},
{
"epoch": 8.122605363984674,
"grad_norm": 2.783926486968994,
"learning_rate": 4.4925287356321844e-05,
"loss": 33.9544,
"step": 10600
},
{
"epoch": 8.199233716475096,
"grad_norm": 2.175081968307495,
"learning_rate": 4.487739463601533e-05,
"loss": 33.6405,
"step": 10700
},
{
"epoch": 8.275862068965518,
"grad_norm": 4.121524333953857,
"learning_rate": 4.482950191570882e-05,
"loss": 33.568,
"step": 10800
},
{
"epoch": 8.352490421455938,
"grad_norm": 3.978410482406616,
"learning_rate": 4.4781609195402305e-05,
"loss": 33.6659,
"step": 10900
},
{
"epoch": 8.42911877394636,
"grad_norm": 3.0454840660095215,
"learning_rate": 4.473419540229885e-05,
"loss": 33.2689,
"step": 11000
},
{
"epoch": 8.505747126436782,
"grad_norm": 3.169114828109741,
"learning_rate": 4.4686302681992336e-05,
"loss": 33.6227,
"step": 11100
},
{
"epoch": 8.582375478927203,
"grad_norm": 2.5880959033966064,
"learning_rate": 4.463840996168582e-05,
"loss": 33.3022,
"step": 11200
},
{
"epoch": 8.659003831417625,
"grad_norm": 2.1367762088775635,
"learning_rate": 4.459051724137932e-05,
"loss": 33.2851,
"step": 11300
},
{
"epoch": 8.735632183908045,
"grad_norm": 3.0278782844543457,
"learning_rate": 4.4542624521072804e-05,
"loss": 33.922,
"step": 11400
},
{
"epoch": 8.812260536398467,
"grad_norm": 2.6361653804779053,
"learning_rate": 4.4494731800766284e-05,
"loss": 33.1482,
"step": 11500
},
{
"epoch": 8.88888888888889,
"grad_norm": 2.7836809158325195,
"learning_rate": 4.444683908045977e-05,
"loss": 34.1345,
"step": 11600
},
{
"epoch": 8.96551724137931,
"grad_norm": 2.519681453704834,
"learning_rate": 4.439894636015326e-05,
"loss": 34.0642,
"step": 11700
},
{
"epoch": 9.0,
"eval_loss": 34.75983428955078,
"eval_runtime": 49.3463,
"eval_samples_per_second": 26.446,
"eval_steps_per_second": 3.323,
"step": 11745
},
{
"epoch": 9.042145593869732,
"grad_norm": 6.431031703948975,
"learning_rate": 4.4351053639846745e-05,
"loss": 33.6431,
"step": 11800
},
{
"epoch": 9.118773946360154,
"grad_norm": 3.262486457824707,
"learning_rate": 4.430316091954023e-05,
"loss": 32.9398,
"step": 11900
},
{
"epoch": 9.195402298850574,
"grad_norm": 1.945741057395935,
"learning_rate": 4.425526819923372e-05,
"loss": 32.7256,
"step": 12000
},
{
"epoch": 9.272030651340996,
"grad_norm": 5.09276008605957,
"learning_rate": 4.4207375478927205e-05,
"loss": 33.9015,
"step": 12100
},
{
"epoch": 9.348659003831418,
"grad_norm": 3.785059928894043,
"learning_rate": 4.415948275862069e-05,
"loss": 33.6765,
"step": 12200
},
{
"epoch": 9.425287356321839,
"grad_norm": 2.4255340099334717,
"learning_rate": 4.411159003831418e-05,
"loss": 33.1262,
"step": 12300
},
{
"epoch": 9.50191570881226,
"grad_norm": 5.869349479675293,
"learning_rate": 4.4063697318007666e-05,
"loss": 33.2205,
"step": 12400
},
{
"epoch": 9.578544061302683,
"grad_norm": 2.361865997314453,
"learning_rate": 4.4015804597701146e-05,
"loss": 34.0441,
"step": 12500
},
{
"epoch": 9.655172413793103,
"grad_norm": 2.6989896297454834,
"learning_rate": 4.396791187739464e-05,
"loss": 33.6812,
"step": 12600
},
{
"epoch": 9.731800766283525,
"grad_norm": 2.6094741821289062,
"learning_rate": 4.3920019157088127e-05,
"loss": 33.9178,
"step": 12700
},
{
"epoch": 9.808429118773946,
"grad_norm": 2.4616310596466064,
"learning_rate": 4.3872126436781613e-05,
"loss": 34.5233,
"step": 12800
},
{
"epoch": 9.885057471264368,
"grad_norm": 2.7729408740997314,
"learning_rate": 4.38242337164751e-05,
"loss": 33.378,
"step": 12900
},
{
"epoch": 9.96168582375479,
"grad_norm": 2.5230519771575928,
"learning_rate": 4.377634099616859e-05,
"loss": 33.442,
"step": 13000
},
{
"epoch": 10.0,
"eval_loss": 34.700294494628906,
"eval_runtime": 49.2926,
"eval_samples_per_second": 26.475,
"eval_steps_per_second": 3.327,
"step": 13050
},
{
"epoch": 10.03831417624521,
"grad_norm": 2.5322816371917725,
"learning_rate": 4.3728448275862074e-05,
"loss": 33.8873,
"step": 13100
},
{
"epoch": 10.114942528735632,
"grad_norm": 2.1063241958618164,
"learning_rate": 4.368103448275862e-05,
"loss": 33.871,
"step": 13200
},
{
"epoch": 10.191570881226054,
"grad_norm": 3.7001326084136963,
"learning_rate": 4.3633141762452106e-05,
"loss": 34.5129,
"step": 13300
},
{
"epoch": 10.268199233716475,
"grad_norm": 1.8534705638885498,
"learning_rate": 4.35852490421456e-05,
"loss": 33.7739,
"step": 13400
},
{
"epoch": 10.344827586206897,
"grad_norm": 1.9871069192886353,
"learning_rate": 4.3537356321839086e-05,
"loss": 33.4124,
"step": 13500
},
{
"epoch": 10.421455938697317,
"grad_norm": 2.264529228210449,
"learning_rate": 4.348946360153257e-05,
"loss": 33.24,
"step": 13600
},
{
"epoch": 10.49808429118774,
"grad_norm": 3.0297787189483643,
"learning_rate": 4.344157088122606e-05,
"loss": 33.2922,
"step": 13700
},
{
"epoch": 10.574712643678161,
"grad_norm": 2.7185864448547363,
"learning_rate": 4.339367816091954e-05,
"loss": 33.4859,
"step": 13800
},
{
"epoch": 10.651340996168582,
"grad_norm": 3.8887524604797363,
"learning_rate": 4.334578544061303e-05,
"loss": 33.4322,
"step": 13900
},
{
"epoch": 10.727969348659004,
"grad_norm": 2.5119857788085938,
"learning_rate": 4.3297892720306514e-05,
"loss": 33.6234,
"step": 14000
},
{
"epoch": 10.804597701149426,
"grad_norm": 3.2969565391540527,
"learning_rate": 4.325e-05,
"loss": 33.4341,
"step": 14100
},
{
"epoch": 10.881226053639846,
"grad_norm": 3.3629229068756104,
"learning_rate": 4.320210727969349e-05,
"loss": 32.7636,
"step": 14200
},
{
"epoch": 10.957854406130268,
"grad_norm": 3.0765013694763184,
"learning_rate": 4.3154214559386975e-05,
"loss": 33.7066,
"step": 14300
},
{
"epoch": 11.0,
"eval_loss": 34.70278549194336,
"eval_runtime": 49.2928,
"eval_samples_per_second": 26.474,
"eval_steps_per_second": 3.327,
"step": 14355
}
],
"logging_steps": 100,
"max_steps": 104400,
"num_input_tokens_seen": 0,
"num_train_epochs": 80,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 10,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 1
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.5480419933518848e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}