{
  "best_metric": 34.700294494628906,
  "best_model_checkpoint": "/kaggle/working/output/checkpoint-13050",
  "epoch": 11.0,
  "eval_steps": 500,
  "global_step": 14355,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.07662835249042145,
      "grad_norm": 9.545656204223633,
      "learning_rate": 4.9952586206896554e-05,
      "loss": 58.0015,
      "step": 100
    },
    {
      "epoch": 0.1532567049808429,
      "grad_norm": 3.9482674598693848,
      "learning_rate": 4.990469348659004e-05,
      "loss": 38.502,
      "step": 200
    },
    {
      "epoch": 0.22988505747126436,
      "grad_norm": 2.5423216819763184,
      "learning_rate": 4.985680076628353e-05,
      "loss": 35.7891,
      "step": 300
    },
    {
      "epoch": 0.3065134099616858,
      "grad_norm": 3.6723568439483643,
      "learning_rate": 4.9808908045977015e-05,
      "loss": 34.9999,
      "step": 400
    },
    {
      "epoch": 0.3831417624521073,
      "grad_norm": 2.0953221321105957,
      "learning_rate": 4.97610153256705e-05,
      "loss": 35.9283,
      "step": 500
    },
    {
      "epoch": 0.45977011494252873,
      "grad_norm": 4.932604789733887,
      "learning_rate": 4.971312260536399e-05,
      "loss": 34.5531,
      "step": 600
    },
    {
      "epoch": 0.5363984674329502,
      "grad_norm": 5.419522762298584,
      "learning_rate": 4.9665229885057475e-05,
      "loss": 34.7408,
      "step": 700
    },
    {
      "epoch": 0.6130268199233716,
      "grad_norm": 3.9690020084381104,
      "learning_rate": 4.961733716475096e-05,
      "loss": 34.5521,
      "step": 800
    },
    {
      "epoch": 0.6896551724137931,
      "grad_norm": 3.3197548389434814,
      "learning_rate": 4.956944444444445e-05,
      "loss": 33.4281,
      "step": 900
    },
    {
      "epoch": 0.7662835249042146,
      "grad_norm": 4.233493328094482,
      "learning_rate": 4.952155172413793e-05,
      "loss": 34.3137,
      "step": 1000
    },
    {
      "epoch": 0.842911877394636,
      "grad_norm": 5.390758037567139,
      "learning_rate": 4.9473659003831416e-05,
      "loss": 33.9454,
      "step": 1100
    },
    {
      "epoch": 0.9195402298850575,
      "grad_norm": 3.419612407684326,
      "learning_rate": 4.94257662835249e-05,
      "loss": 34.2298,
      "step": 1200
    },
    {
      "epoch": 0.9961685823754789,
      "grad_norm": 2.3791182041168213,
      "learning_rate": 4.937787356321839e-05,
      "loss": 33.5481,
      "step": 1300
    },
    {
      "epoch": 1.0,
      "eval_loss": 35.558197021484375,
      "eval_runtime": 49.3359,
      "eval_samples_per_second": 26.451,
      "eval_steps_per_second": 3.324,
      "step": 1305
    },
    {
      "epoch": 1.0727969348659003,
      "grad_norm": 3.0501019954681396,
      "learning_rate": 4.932998084291188e-05,
      "loss": 34.3557,
      "step": 1400
    },
    {
      "epoch": 1.1494252873563218,
      "grad_norm": 3.027714252471924,
      "learning_rate": 4.928208812260537e-05,
      "loss": 34.2442,
      "step": 1500
    },
    {
      "epoch": 1.2260536398467432,
      "grad_norm": 3.693758249282837,
      "learning_rate": 4.923419540229886e-05,
      "loss": 33.5375,
      "step": 1600
    },
    {
      "epoch": 1.3026819923371646,
      "grad_norm": 3.7679357528686523,
      "learning_rate": 4.9186302681992344e-05,
      "loss": 33.7891,
      "step": 1700
    },
    {
      "epoch": 1.3793103448275863,
      "grad_norm": 3.2367331981658936,
      "learning_rate": 4.9138409961685824e-05,
      "loss": 33.4964,
      "step": 1800
    },
    {
      "epoch": 1.4559386973180077,
      "grad_norm": 3.6876628398895264,
      "learning_rate": 4.909051724137931e-05,
      "loss": 34.7739,
      "step": 1900
    },
    {
      "epoch": 1.5325670498084292,
      "grad_norm": 1.9550260305404663,
      "learning_rate": 4.90426245210728e-05,
      "loss": 34.2552,
      "step": 2000
    },
    {
      "epoch": 1.6091954022988506,
      "grad_norm": 4.955118656158447,
      "learning_rate": 4.8994731800766285e-05,
      "loss": 33.9766,
      "step": 2100
    },
    {
      "epoch": 1.685823754789272,
      "grad_norm": 6.145394802093506,
      "learning_rate": 4.894683908045977e-05,
      "loss": 34.1676,
      "step": 2200
    },
    {
      "epoch": 1.7624521072796935,
      "grad_norm": 6.15125846862793,
      "learning_rate": 4.889894636015326e-05,
      "loss": 34.3084,
      "step": 2300
    },
    {
      "epoch": 1.839080459770115,
      "grad_norm": 2.647857427597046,
      "learning_rate": 4.8851053639846746e-05,
      "loss": 34.6449,
      "step": 2400
    },
    {
      "epoch": 1.9157088122605364,
      "grad_norm": 4.066762447357178,
      "learning_rate": 4.880316091954023e-05,
      "loss": 34.1318,
      "step": 2500
    },
    {
      "epoch": 1.9923371647509578,
      "grad_norm": 5.785406589508057,
      "learning_rate": 4.875526819923372e-05,
      "loss": 34.1303,
      "step": 2600
    },
    {
      "epoch": 2.0,
      "eval_loss": 35.211631774902344,
      "eval_runtime": 49.3338,
      "eval_samples_per_second": 26.452,
      "eval_steps_per_second": 3.324,
      "step": 2610
    },
    {
      "epoch": 2.0689655172413794,
      "grad_norm": 6.074384689331055,
      "learning_rate": 4.8707375478927206e-05,
      "loss": 33.6587,
      "step": 2700
    },
    {
      "epoch": 2.1455938697318007,
      "grad_norm": 3.770009994506836,
      "learning_rate": 4.865948275862069e-05,
      "loss": 34.5023,
      "step": 2800
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 4.6336140632629395,
      "learning_rate": 4.861159003831418e-05,
      "loss": 34.1806,
      "step": 2900
    },
    {
      "epoch": 2.2988505747126435,
      "grad_norm": 5.440792083740234,
      "learning_rate": 4.856369731800767e-05,
      "loss": 34.6645,
      "step": 3000
    },
    {
      "epoch": 2.375478927203065,
      "grad_norm": 2.98138165473938,
      "learning_rate": 4.8515804597701154e-05,
      "loss": 34.1371,
      "step": 3100
    },
    {
      "epoch": 2.4521072796934864,
      "grad_norm": 2.4175803661346436,
      "learning_rate": 4.846791187739464e-05,
      "loss": 33.8015,
      "step": 3200
    },
    {
      "epoch": 2.528735632183908,
      "grad_norm": 3.846370220184326,
      "learning_rate": 4.842001915708813e-05,
      "loss": 34.0589,
      "step": 3300
    },
    {
      "epoch": 2.6053639846743293,
      "grad_norm": 4.001793384552002,
      "learning_rate": 4.8372126436781614e-05,
      "loss": 33.7327,
      "step": 3400
    },
    {
      "epoch": 2.681992337164751,
      "grad_norm": 3.7779624462127686,
      "learning_rate": 4.83242337164751e-05,
      "loss": 34.3508,
      "step": 3500
    },
    {
      "epoch": 2.7586206896551726,
      "grad_norm": 3.5112695693969727,
      "learning_rate": 4.827634099616858e-05,
      "loss": 33.5653,
      "step": 3600
    },
    {
      "epoch": 2.835249042145594,
      "grad_norm": 2.3443048000335693,
      "learning_rate": 4.822844827586207e-05,
      "loss": 33.798,
      "step": 3700
    },
    {
      "epoch": 2.9118773946360155,
      "grad_norm": 2.5035479068756104,
      "learning_rate": 4.8180555555555555e-05,
      "loss": 33.4353,
      "step": 3800
    },
    {
      "epoch": 2.9885057471264367,
      "grad_norm": 3.4322028160095215,
      "learning_rate": 4.813266283524904e-05,
      "loss": 33.948,
      "step": 3900
    },
    {
      "epoch": 3.0,
      "eval_loss": 35.00273132324219,
      "eval_runtime": 49.3242,
      "eval_samples_per_second": 26.458,
      "eval_steps_per_second": 3.325,
      "step": 3915
    },
    {
      "epoch": 3.0651340996168583,
      "grad_norm": 2.8833682537078857,
      "learning_rate": 4.808477011494253e-05,
      "loss": 34.523,
      "step": 4000
    },
    {
      "epoch": 3.1417624521072796,
      "grad_norm": 2.8744261264801025,
      "learning_rate": 4.803735632183908e-05,
      "loss": 33.921,
      "step": 4100
    },
    {
      "epoch": 3.218390804597701,
      "grad_norm": 2.928616762161255,
      "learning_rate": 4.798946360153257e-05,
      "loss": 33.6903,
      "step": 4200
    },
    {
      "epoch": 3.2950191570881224,
      "grad_norm": 3.0579280853271484,
      "learning_rate": 4.7941570881226054e-05,
      "loss": 33.0608,
      "step": 4300
    },
    {
      "epoch": 3.371647509578544,
      "grad_norm": 1.6688510179519653,
      "learning_rate": 4.789367816091954e-05,
      "loss": 33.8769,
      "step": 4400
    },
    {
      "epoch": 3.4482758620689653,
      "grad_norm": 2.6190459728240967,
      "learning_rate": 4.784578544061303e-05,
      "loss": 33.2974,
      "step": 4500
    },
    {
      "epoch": 3.524904214559387,
      "grad_norm": 2.6260671615600586,
      "learning_rate": 4.7797892720306515e-05,
      "loss": 34.0589,
      "step": 4600
    },
    {
      "epoch": 3.6015325670498086,
      "grad_norm": 3.191978693008423,
      "learning_rate": 4.775e-05,
      "loss": 33.9493,
      "step": 4700
    },
    {
      "epoch": 3.67816091954023,
      "grad_norm": 2.759941339492798,
      "learning_rate": 4.770210727969349e-05,
      "loss": 33.5936,
      "step": 4800
    },
    {
      "epoch": 3.7547892720306515,
      "grad_norm": 2.262294054031372,
      "learning_rate": 4.7654214559386976e-05,
      "loss": 34.06,
      "step": 4900
    },
    {
      "epoch": 3.8314176245210727,
      "grad_norm": 4.6808600425720215,
      "learning_rate": 4.760632183908046e-05,
      "loss": 34.1592,
      "step": 5000
    },
    {
      "epoch": 3.9080459770114944,
      "grad_norm": 4.294464111328125,
      "learning_rate": 4.755842911877395e-05,
      "loss": 34.4652,
      "step": 5100
    },
    {
      "epoch": 3.9846743295019156,
      "grad_norm": 2.7845072746276855,
      "learning_rate": 4.7510536398467436e-05,
      "loss": 34.2075,
      "step": 5200
    },
    {
      "epoch": 4.0,
      "eval_loss": 34.954986572265625,
      "eval_runtime": 49.2865,
      "eval_samples_per_second": 26.478,
      "eval_steps_per_second": 3.327,
      "step": 5220
    },
    {
      "epoch": 4.061302681992337,
      "grad_norm": 4.420943260192871,
      "learning_rate": 4.746264367816092e-05,
      "loss": 34.5735,
      "step": 5300
    },
    {
      "epoch": 4.137931034482759,
      "grad_norm": 2.898287534713745,
      "learning_rate": 4.741475095785441e-05,
      "loss": 34.0739,
      "step": 5400
    },
    {
      "epoch": 4.21455938697318,
      "grad_norm": 4.703996658325195,
      "learning_rate": 4.73668582375479e-05,
      "loss": 33.7022,
      "step": 5500
    },
    {
      "epoch": 4.291187739463601,
      "grad_norm": 2.2913658618927,
      "learning_rate": 4.7318965517241384e-05,
      "loss": 33.6581,
      "step": 5600
    },
    {
      "epoch": 4.3678160919540225,
      "grad_norm": 3.895615339279175,
      "learning_rate": 4.727107279693487e-05,
      "loss": 34.0314,
      "step": 5700
    },
    {
      "epoch": 4.444444444444445,
      "grad_norm": 4.635524749755859,
      "learning_rate": 4.722318007662835e-05,
      "loss": 34.5266,
      "step": 5800
    },
    {
      "epoch": 4.521072796934866,
      "grad_norm": 3.451066017150879,
      "learning_rate": 4.717528735632184e-05,
      "loss": 33.1786,
      "step": 5900
    },
    {
      "epoch": 4.597701149425287,
      "grad_norm": 2.552107810974121,
      "learning_rate": 4.7127394636015325e-05,
      "loss": 33.6118,
      "step": 6000
    },
    {
      "epoch": 4.674329501915709,
      "grad_norm": 2.359786033630371,
      "learning_rate": 4.707998084291188e-05,
      "loss": 33.9903,
      "step": 6100
    },
    {
      "epoch": 4.75095785440613,
      "grad_norm": 2.2611875534057617,
      "learning_rate": 4.703208812260537e-05,
      "loss": 34.0762,
      "step": 6200
    },
    {
      "epoch": 4.827586206896552,
      "grad_norm": 1.8199210166931152,
      "learning_rate": 4.698419540229885e-05,
      "loss": 33.6635,
      "step": 6300
    },
    {
      "epoch": 4.904214559386973,
      "grad_norm": 2.7332305908203125,
      "learning_rate": 4.693630268199234e-05,
      "loss": 33.0946,
      "step": 6400
    },
    {
      "epoch": 4.980842911877395,
      "grad_norm": 2.9454078674316406,
      "learning_rate": 4.6888409961685824e-05,
      "loss": 33.9173,
      "step": 6500
    },
    {
      "epoch": 5.0,
      "eval_loss": 34.924800872802734,
      "eval_runtime": 49.3002,
      "eval_samples_per_second": 26.47,
      "eval_steps_per_second": 3.327,
      "step": 6525
    },
    {
      "epoch": 5.057471264367816,
      "grad_norm": 2.3083884716033936,
      "learning_rate": 4.684051724137931e-05,
      "loss": 33.8987,
      "step": 6600
    },
    {
      "epoch": 5.134099616858237,
      "grad_norm": 2.228327751159668,
      "learning_rate": 4.67926245210728e-05,
      "loss": 33.8189,
      "step": 6700
    },
    {
      "epoch": 5.210727969348659,
      "grad_norm": 3.6814918518066406,
      "learning_rate": 4.6744731800766284e-05,
      "loss": 33.8364,
      "step": 6800
    },
    {
      "epoch": 5.287356321839081,
      "grad_norm": 2.5758285522460938,
      "learning_rate": 4.669683908045977e-05,
      "loss": 33.7093,
      "step": 6900
    },
    {
      "epoch": 5.363984674329502,
      "grad_norm": 4.175839900970459,
      "learning_rate": 4.6648946360153265e-05,
      "loss": 33.6689,
      "step": 7000
    },
    {
      "epoch": 5.440613026819923,
      "grad_norm": 2.213092088699341,
      "learning_rate": 4.6601053639846745e-05,
      "loss": 33.7936,
      "step": 7100
    },
    {
      "epoch": 5.517241379310345,
      "grad_norm": 2.4982571601867676,
      "learning_rate": 4.655316091954023e-05,
      "loss": 33.3686,
      "step": 7200
    },
    {
      "epoch": 5.593869731800766,
      "grad_norm": 3.635983943939209,
      "learning_rate": 4.6505747126436784e-05,
      "loss": 33.5493,
      "step": 7300
    },
    {
      "epoch": 5.670498084291188,
      "grad_norm": 4.315894603729248,
      "learning_rate": 4.645785440613027e-05,
      "loss": 33.6607,
      "step": 7400
    },
    {
      "epoch": 5.747126436781609,
      "grad_norm": 2.6151223182678223,
      "learning_rate": 4.640996168582376e-05,
      "loss": 34.7535,
      "step": 7500
    },
    {
      "epoch": 5.823754789272031,
      "grad_norm": 4.03953218460083,
      "learning_rate": 4.6362068965517244e-05,
      "loss": 33.9865,
      "step": 7600
    },
    {
      "epoch": 5.900383141762452,
      "grad_norm": 2.512362480163574,
      "learning_rate": 4.6314176245210724e-05,
      "loss": 33.0343,
      "step": 7700
    },
    {
      "epoch": 5.977011494252873,
      "grad_norm": 4.745575428009033,
      "learning_rate": 4.626628352490422e-05,
      "loss": 33.4544,
      "step": 7800
    },
    {
      "epoch": 6.0,
      "eval_loss": 34.841033935546875,
      "eval_runtime": 49.3059,
      "eval_samples_per_second": 26.467,
      "eval_steps_per_second": 3.326,
      "step": 7830
    },
    {
      "epoch": 6.053639846743295,
      "grad_norm": 2.996056079864502,
      "learning_rate": 4.6218390804597705e-05,
      "loss": 33.631,
      "step": 7900
    },
    {
      "epoch": 6.130268199233717,
      "grad_norm": 3.3260300159454346,
      "learning_rate": 4.617049808429119e-05,
      "loss": 33.9222,
      "step": 8000
    },
    {
      "epoch": 6.206896551724138,
      "grad_norm": 2.214486598968506,
      "learning_rate": 4.612260536398468e-05,
      "loss": 32.9576,
      "step": 8100
    },
    {
      "epoch": 6.283524904214559,
      "grad_norm": 3.6611664295196533,
      "learning_rate": 4.6074712643678166e-05,
      "loss": 33.5231,
      "step": 8200
    },
    {
      "epoch": 6.360153256704981,
      "grad_norm": 2.582730770111084,
      "learning_rate": 4.602681992337165e-05,
      "loss": 33.6936,
      "step": 8300
    },
    {
      "epoch": 6.436781609195402,
      "grad_norm": 2.739861488342285,
      "learning_rate": 4.597892720306514e-05,
      "loss": 33.3997,
      "step": 8400
    },
    {
      "epoch": 6.513409961685824,
      "grad_norm": 2.2102463245391846,
      "learning_rate": 4.593103448275862e-05,
      "loss": 33.9374,
      "step": 8500
    },
    {
      "epoch": 6.590038314176245,
      "grad_norm": 3.83150577545166,
      "learning_rate": 4.5883141762452106e-05,
      "loss": 33.9961,
      "step": 8600
    },
    {
      "epoch": 6.666666666666667,
      "grad_norm": 3.981616735458374,
      "learning_rate": 4.583524904214559e-05,
      "loss": 33.5413,
      "step": 8700
    },
    {
      "epoch": 6.743295019157088,
      "grad_norm": 2.3303332328796387,
      "learning_rate": 4.578735632183908e-05,
      "loss": 34.0529,
      "step": 8800
    },
    {
      "epoch": 6.819923371647509,
      "grad_norm": 3.9573702812194824,
      "learning_rate": 4.573946360153257e-05,
      "loss": 33.2897,
      "step": 8900
    },
    {
      "epoch": 6.896551724137931,
      "grad_norm": 2.6185879707336426,
      "learning_rate": 4.5691570881226054e-05,
      "loss": 34.0662,
      "step": 9000
    },
    {
      "epoch": 6.973180076628353,
      "grad_norm": 3.1155271530151367,
      "learning_rate": 4.564367816091955e-05,
      "loss": 33.517,
      "step": 9100
    },
    {
      "epoch": 7.0,
      "eval_loss": 34.818748474121094,
      "eval_runtime": 49.3029,
      "eval_samples_per_second": 26.469,
      "eval_steps_per_second": 3.326,
      "step": 9135
    },
    {
      "epoch": 7.049808429118774,
      "grad_norm": 3.117553472518921,
      "learning_rate": 4.5595785440613034e-05,
      "loss": 34.1218,
      "step": 9200
    },
    {
      "epoch": 7.126436781609195,
      "grad_norm": 2.5572612285614014,
      "learning_rate": 4.5547892720306515e-05,
      "loss": 33.662,
      "step": 9300
    },
    {
      "epoch": 7.203065134099617,
      "grad_norm": 3.5347042083740234,
      "learning_rate": 4.55e-05,
      "loss": 34.4668,
      "step": 9400
    },
    {
      "epoch": 7.2796934865900385,
      "grad_norm": 1.9216647148132324,
      "learning_rate": 4.545210727969349e-05,
      "loss": 33.4468,
      "step": 9500
    },
    {
      "epoch": 7.35632183908046,
      "grad_norm": 4.242152214050293,
      "learning_rate": 4.5404214559386975e-05,
      "loss": 33.5805,
      "step": 9600
    },
    {
      "epoch": 7.432950191570881,
      "grad_norm": 2.9310567378997803,
      "learning_rate": 4.535632183908046e-05,
      "loss": 34.0603,
      "step": 9700
    },
    {
      "epoch": 7.509578544061303,
      "grad_norm": 2.6573023796081543,
      "learning_rate": 4.530842911877395e-05,
      "loss": 33.8766,
      "step": 9800
    },
    {
      "epoch": 7.586206896551724,
      "grad_norm": 2.7849409580230713,
      "learning_rate": 4.5260536398467436e-05,
      "loss": 33.6309,
      "step": 9900
    },
    {
      "epoch": 7.662835249042145,
      "grad_norm": 2.7377357482910156,
      "learning_rate": 4.521264367816092e-05,
      "loss": 33.3621,
      "step": 10000
    },
    {
      "epoch": 7.739463601532567,
      "grad_norm": 2.106233835220337,
      "learning_rate": 4.516475095785441e-05,
      "loss": 33.4172,
      "step": 10100
    },
    {
      "epoch": 7.816091954022989,
      "grad_norm": 2.1989126205444336,
      "learning_rate": 4.5116858237547896e-05,
      "loss": 33.5937,
      "step": 10200
    },
    {
      "epoch": 7.89272030651341,
      "grad_norm": 2.903721570968628,
      "learning_rate": 4.5068965517241377e-05,
      "loss": 33.7935,
      "step": 10300
    },
    {
      "epoch": 7.969348659003831,
      "grad_norm": 2.061602830886841,
      "learning_rate": 4.5021072796934863e-05,
      "loss": 33.3289,
      "step": 10400
    },
    {
      "epoch": 8.0,
      "eval_loss": 34.95075607299805,
      "eval_runtime": 49.3237,
      "eval_samples_per_second": 26.458,
      "eval_steps_per_second": 3.325,
      "step": 10440
    },
    {
      "epoch": 8.045977011494253,
      "grad_norm": 1.8656938076019287,
      "learning_rate": 4.497318007662836e-05,
      "loss": 33.8404,
      "step": 10500
    },
    {
      "epoch": 8.122605363984674,
      "grad_norm": 2.783926486968994,
      "learning_rate": 4.4925287356321844e-05,
      "loss": 33.9544,
      "step": 10600
    },
    {
      "epoch": 8.199233716475096,
      "grad_norm": 2.175081968307495,
      "learning_rate": 4.487739463601533e-05,
      "loss": 33.6405,
      "step": 10700
    },
    {
      "epoch": 8.275862068965518,
      "grad_norm": 4.121524333953857,
      "learning_rate": 4.482950191570882e-05,
      "loss": 33.568,
      "step": 10800
    },
    {
      "epoch": 8.352490421455938,
      "grad_norm": 3.978410482406616,
      "learning_rate": 4.4781609195402305e-05,
      "loss": 33.6659,
      "step": 10900
    },
    {
      "epoch": 8.42911877394636,
      "grad_norm": 3.0454840660095215,
      "learning_rate": 4.473419540229885e-05,
      "loss": 33.2689,
      "step": 11000
    },
    {
      "epoch": 8.505747126436782,
      "grad_norm": 3.169114828109741,
      "learning_rate": 4.4686302681992336e-05,
      "loss": 33.6227,
      "step": 11100
    },
    {
      "epoch": 8.582375478927203,
      "grad_norm": 2.5880959033966064,
      "learning_rate": 4.463840996168582e-05,
      "loss": 33.3022,
      "step": 11200
    },
    {
      "epoch": 8.659003831417625,
      "grad_norm": 2.1367762088775635,
      "learning_rate": 4.459051724137932e-05,
      "loss": 33.2851,
      "step": 11300
    },
    {
      "epoch": 8.735632183908045,
      "grad_norm": 3.0278782844543457,
      "learning_rate": 4.4542624521072804e-05,
      "loss": 33.922,
      "step": 11400
    },
    {
      "epoch": 8.812260536398467,
      "grad_norm": 2.6361653804779053,
      "learning_rate": 4.4494731800766284e-05,
      "loss": 33.1482,
      "step": 11500
    },
    {
      "epoch": 8.88888888888889,
      "grad_norm": 2.7836809158325195,
      "learning_rate": 4.444683908045977e-05,
      "loss": 34.1345,
      "step": 11600
    },
    {
      "epoch": 8.96551724137931,
      "grad_norm": 2.519681453704834,
      "learning_rate": 4.439894636015326e-05,
      "loss": 34.0642,
      "step": 11700
    },
    {
      "epoch": 9.0,
      "eval_loss": 34.75983428955078,
      "eval_runtime": 49.3463,
      "eval_samples_per_second": 26.446,
      "eval_steps_per_second": 3.323,
      "step": 11745
    },
    {
      "epoch": 9.042145593869732,
      "grad_norm": 6.431031703948975,
      "learning_rate": 4.4351053639846745e-05,
      "loss": 33.6431,
      "step": 11800
    },
    {
      "epoch": 9.118773946360154,
      "grad_norm": 3.262486457824707,
      "learning_rate": 4.430316091954023e-05,
      "loss": 32.9398,
      "step": 11900
    },
    {
      "epoch": 9.195402298850574,
      "grad_norm": 1.945741057395935,
      "learning_rate": 4.425526819923372e-05,
      "loss": 32.7256,
      "step": 12000
    },
    {
      "epoch": 9.272030651340996,
      "grad_norm": 5.09276008605957,
      "learning_rate": 4.4207375478927205e-05,
      "loss": 33.9015,
      "step": 12100
    },
    {
      "epoch": 9.348659003831418,
      "grad_norm": 3.785059928894043,
      "learning_rate": 4.415948275862069e-05,
      "loss": 33.6765,
      "step": 12200
    },
    {
      "epoch": 9.425287356321839,
      "grad_norm": 2.4255340099334717,
      "learning_rate": 4.411159003831418e-05,
      "loss": 33.1262,
      "step": 12300
    },
    {
      "epoch": 9.50191570881226,
      "grad_norm": 5.869349479675293,
      "learning_rate": 4.4063697318007666e-05,
      "loss": 33.2205,
      "step": 12400
    },
    {
      "epoch": 9.578544061302683,
      "grad_norm": 2.361865997314453,
      "learning_rate": 4.4015804597701146e-05,
      "loss": 34.0441,
      "step": 12500
    },
    {
      "epoch": 9.655172413793103,
      "grad_norm": 2.6989896297454834,
      "learning_rate": 4.396791187739464e-05,
      "loss": 33.6812,
      "step": 12600
    },
    {
      "epoch": 9.731800766283525,
      "grad_norm": 2.6094741821289062,
      "learning_rate": 4.3920019157088127e-05,
      "loss": 33.9178,
      "step": 12700
    },
    {
      "epoch": 9.808429118773946,
      "grad_norm": 2.4616310596466064,
      "learning_rate": 4.3872126436781613e-05,
      "loss": 34.5233,
      "step": 12800
    },
    {
      "epoch": 9.885057471264368,
      "grad_norm": 2.7729408740997314,
      "learning_rate": 4.38242337164751e-05,
      "loss": 33.378,
      "step": 12900
    },
    {
      "epoch": 9.96168582375479,
      "grad_norm": 2.5230519771575928,
      "learning_rate": 4.377634099616859e-05,
      "loss": 33.442,
      "step": 13000
    },
    {
      "epoch": 10.0,
      "eval_loss": 34.700294494628906,
      "eval_runtime": 49.2926,
      "eval_samples_per_second": 26.475,
      "eval_steps_per_second": 3.327,
      "step": 13050
    },
    {
      "epoch": 10.03831417624521,
      "grad_norm": 2.5322816371917725,
      "learning_rate": 4.3728448275862074e-05,
      "loss": 33.8873,
      "step": 13100
    },
    {
      "epoch": 10.114942528735632,
      "grad_norm": 2.1063241958618164,
      "learning_rate": 4.368103448275862e-05,
      "loss": 33.871,
      "step": 13200
    },
    {
      "epoch": 10.191570881226054,
      "grad_norm": 3.7001326084136963,
      "learning_rate": 4.3633141762452106e-05,
      "loss": 34.5129,
      "step": 13300
    },
    {
      "epoch": 10.268199233716475,
      "grad_norm": 1.8534705638885498,
      "learning_rate": 4.35852490421456e-05,
      "loss": 33.7739,
      "step": 13400
    },
    {
      "epoch": 10.344827586206897,
      "grad_norm": 1.9871069192886353,
      "learning_rate": 4.3537356321839086e-05,
      "loss": 33.4124,
      "step": 13500
    },
    {
      "epoch": 10.421455938697317,
      "grad_norm": 2.264529228210449,
      "learning_rate": 4.348946360153257e-05,
      "loss": 33.24,
      "step": 13600
    },
    {
      "epoch": 10.49808429118774,
      "grad_norm": 3.0297787189483643,
      "learning_rate": 4.344157088122606e-05,
      "loss": 33.2922,
      "step": 13700
    },
    {
      "epoch": 10.574712643678161,
      "grad_norm": 2.7185864448547363,
      "learning_rate": 4.339367816091954e-05,
      "loss": 33.4859,
      "step": 13800
    },
    {
      "epoch": 10.651340996168582,
      "grad_norm": 3.8887524604797363,
      "learning_rate": 4.334578544061303e-05,
      "loss": 33.4322,
      "step": 13900
    },
    {
      "epoch": 10.727969348659004,
      "grad_norm": 2.5119857788085938,
      "learning_rate": 4.3297892720306514e-05,
      "loss": 33.6234,
      "step": 14000
    },
    {
      "epoch": 10.804597701149426,
      "grad_norm": 3.2969565391540527,
      "learning_rate": 4.325e-05,
      "loss": 33.4341,
      "step": 14100
    },
    {
      "epoch": 10.881226053639846,
      "grad_norm": 3.3629229068756104,
      "learning_rate": 4.320210727969349e-05,
      "loss": 32.7636,
      "step": 14200
    },
    {
      "epoch": 10.957854406130268,
      "grad_norm": 3.0765013694763184,
      "learning_rate": 4.3154214559386975e-05,
      "loss": 33.7066,
      "step": 14300
    },
    {
      "epoch": 11.0,
      "eval_loss": 34.70278549194336,
      "eval_runtime": 49.2928,
      "eval_samples_per_second": 26.474,
      "eval_steps_per_second": 3.327,
      "step": 14355
    }
  ],
  "logging_steps": 100,
  "max_steps": 104400,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 80,
  "save_steps": 500,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 10,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 1
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.5480419933518848e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}