{
  "best_metric": 34.45762252807617,
  "best_model_checkpoint": "/kaggle/working/output/checkpoint-43065",
  "epoch": 33.0,
  "eval_steps": 500,
  "global_step": 43065,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
| { | |
| "epoch": 0.07662835249042145, | |
| "grad_norm": 9.545656204223633, | |
| "learning_rate": 4.9952586206896554e-05, | |
| "loss": 58.0015, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.1532567049808429, | |
| "grad_norm": 3.9482674598693848, | |
| "learning_rate": 4.990469348659004e-05, | |
| "loss": 38.502, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.22988505747126436, | |
| "grad_norm": 2.5423216819763184, | |
| "learning_rate": 4.985680076628353e-05, | |
| "loss": 35.7891, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.3065134099616858, | |
| "grad_norm": 3.6723568439483643, | |
| "learning_rate": 4.9808908045977015e-05, | |
| "loss": 34.9999, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.3831417624521073, | |
| "grad_norm": 2.0953221321105957, | |
| "learning_rate": 4.97610153256705e-05, | |
| "loss": 35.9283, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.45977011494252873, | |
| "grad_norm": 4.932604789733887, | |
| "learning_rate": 4.971312260536399e-05, | |
| "loss": 34.5531, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.5363984674329502, | |
| "grad_norm": 5.419522762298584, | |
| "learning_rate": 4.9665229885057475e-05, | |
| "loss": 34.7408, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.6130268199233716, | |
| "grad_norm": 3.9690020084381104, | |
| "learning_rate": 4.961733716475096e-05, | |
| "loss": 34.5521, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.6896551724137931, | |
| "grad_norm": 3.3197548389434814, | |
| "learning_rate": 4.956944444444445e-05, | |
| "loss": 33.4281, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.7662835249042146, | |
| "grad_norm": 4.233493328094482, | |
| "learning_rate": 4.952155172413793e-05, | |
| "loss": 34.3137, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.842911877394636, | |
| "grad_norm": 5.390758037567139, | |
| "learning_rate": 4.9473659003831416e-05, | |
| "loss": 33.9454, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.9195402298850575, | |
| "grad_norm": 3.419612407684326, | |
| "learning_rate": 4.94257662835249e-05, | |
| "loss": 34.2298, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.9961685823754789, | |
| "grad_norm": 2.3791182041168213, | |
| "learning_rate": 4.937787356321839e-05, | |
| "loss": 33.5481, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 35.558197021484375, | |
| "eval_runtime": 49.3359, | |
| "eval_samples_per_second": 26.451, | |
| "eval_steps_per_second": 3.324, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 1.0727969348659003, | |
| "grad_norm": 3.0501019954681396, | |
| "learning_rate": 4.932998084291188e-05, | |
| "loss": 34.3557, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.1494252873563218, | |
| "grad_norm": 3.027714252471924, | |
| "learning_rate": 4.928208812260537e-05, | |
| "loss": 34.2442, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.2260536398467432, | |
| "grad_norm": 3.693758249282837, | |
| "learning_rate": 4.923419540229886e-05, | |
| "loss": 33.5375, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.3026819923371646, | |
| "grad_norm": 3.7679357528686523, | |
| "learning_rate": 4.9186302681992344e-05, | |
| "loss": 33.7891, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.3793103448275863, | |
| "grad_norm": 3.2367331981658936, | |
| "learning_rate": 4.9138409961685824e-05, | |
| "loss": 33.4964, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.4559386973180077, | |
| "grad_norm": 3.6876628398895264, | |
| "learning_rate": 4.909051724137931e-05, | |
| "loss": 34.7739, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.5325670498084292, | |
| "grad_norm": 1.9550260305404663, | |
| "learning_rate": 4.90426245210728e-05, | |
| "loss": 34.2552, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.6091954022988506, | |
| "grad_norm": 4.955118656158447, | |
| "learning_rate": 4.8994731800766285e-05, | |
| "loss": 33.9766, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.685823754789272, | |
| "grad_norm": 6.145394802093506, | |
| "learning_rate": 4.894683908045977e-05, | |
| "loss": 34.1676, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.7624521072796935, | |
| "grad_norm": 6.15125846862793, | |
| "learning_rate": 4.889894636015326e-05, | |
| "loss": 34.3084, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.839080459770115, | |
| "grad_norm": 2.647857427597046, | |
| "learning_rate": 4.8851053639846746e-05, | |
| "loss": 34.6449, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.9157088122605364, | |
| "grad_norm": 4.066762447357178, | |
| "learning_rate": 4.880316091954023e-05, | |
| "loss": 34.1318, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.9923371647509578, | |
| "grad_norm": 5.785406589508057, | |
| "learning_rate": 4.875526819923372e-05, | |
| "loss": 34.1303, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 35.211631774902344, | |
| "eval_runtime": 49.3338, | |
| "eval_samples_per_second": 26.452, | |
| "eval_steps_per_second": 3.324, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 2.0689655172413794, | |
| "grad_norm": 6.074384689331055, | |
| "learning_rate": 4.8707375478927206e-05, | |
| "loss": 33.6587, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 2.1455938697318007, | |
| "grad_norm": 3.770009994506836, | |
| "learning_rate": 4.865948275862069e-05, | |
| "loss": 34.5023, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 2.2222222222222223, | |
| "grad_norm": 4.6336140632629395, | |
| "learning_rate": 4.861159003831418e-05, | |
| "loss": 34.1806, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 2.2988505747126435, | |
| "grad_norm": 5.440792083740234, | |
| "learning_rate": 4.856369731800767e-05, | |
| "loss": 34.6645, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 2.375478927203065, | |
| "grad_norm": 2.98138165473938, | |
| "learning_rate": 4.8515804597701154e-05, | |
| "loss": 34.1371, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 2.4521072796934864, | |
| "grad_norm": 2.4175803661346436, | |
| "learning_rate": 4.846791187739464e-05, | |
| "loss": 33.8015, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 2.528735632183908, | |
| "grad_norm": 3.846370220184326, | |
| "learning_rate": 4.842001915708813e-05, | |
| "loss": 34.0589, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 2.6053639846743293, | |
| "grad_norm": 4.001793384552002, | |
| "learning_rate": 4.8372126436781614e-05, | |
| "loss": 33.7327, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 2.681992337164751, | |
| "grad_norm": 3.7779624462127686, | |
| "learning_rate": 4.83242337164751e-05, | |
| "loss": 34.3508, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 2.7586206896551726, | |
| "grad_norm": 3.5112695693969727, | |
| "learning_rate": 4.827634099616858e-05, | |
| "loss": 33.5653, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 2.835249042145594, | |
| "grad_norm": 2.3443048000335693, | |
| "learning_rate": 4.822844827586207e-05, | |
| "loss": 33.798, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 2.9118773946360155, | |
| "grad_norm": 2.5035479068756104, | |
| "learning_rate": 4.8180555555555555e-05, | |
| "loss": 33.4353, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 2.9885057471264367, | |
| "grad_norm": 3.4322028160095215, | |
| "learning_rate": 4.813266283524904e-05, | |
| "loss": 33.948, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 35.00273132324219, | |
| "eval_runtime": 49.3242, | |
| "eval_samples_per_second": 26.458, | |
| "eval_steps_per_second": 3.325, | |
| "step": 3915 | |
| }, | |
| { | |
| "epoch": 3.0651340996168583, | |
| "grad_norm": 2.8833682537078857, | |
| "learning_rate": 4.808477011494253e-05, | |
| "loss": 34.523, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 3.1417624521072796, | |
| "grad_norm": 2.8744261264801025, | |
| "learning_rate": 4.803735632183908e-05, | |
| "loss": 33.921, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 3.218390804597701, | |
| "grad_norm": 2.928616762161255, | |
| "learning_rate": 4.798946360153257e-05, | |
| "loss": 33.6903, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 3.2950191570881224, | |
| "grad_norm": 3.0579280853271484, | |
| "learning_rate": 4.7941570881226054e-05, | |
| "loss": 33.0608, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 3.371647509578544, | |
| "grad_norm": 1.6688510179519653, | |
| "learning_rate": 4.789367816091954e-05, | |
| "loss": 33.8769, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 3.4482758620689653, | |
| "grad_norm": 2.6190459728240967, | |
| "learning_rate": 4.784578544061303e-05, | |
| "loss": 33.2974, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 3.524904214559387, | |
| "grad_norm": 2.6260671615600586, | |
| "learning_rate": 4.7797892720306515e-05, | |
| "loss": 34.0589, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 3.6015325670498086, | |
| "grad_norm": 3.191978693008423, | |
| "learning_rate": 4.775e-05, | |
| "loss": 33.9493, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 3.67816091954023, | |
| "grad_norm": 2.759941339492798, | |
| "learning_rate": 4.770210727969349e-05, | |
| "loss": 33.5936, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 3.7547892720306515, | |
| "grad_norm": 2.262294054031372, | |
| "learning_rate": 4.7654214559386976e-05, | |
| "loss": 34.06, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 3.8314176245210727, | |
| "grad_norm": 4.6808600425720215, | |
| "learning_rate": 4.760632183908046e-05, | |
| "loss": 34.1592, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 3.9080459770114944, | |
| "grad_norm": 4.294464111328125, | |
| "learning_rate": 4.755842911877395e-05, | |
| "loss": 34.4652, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 3.9846743295019156, | |
| "grad_norm": 2.7845072746276855, | |
| "learning_rate": 4.7510536398467436e-05, | |
| "loss": 34.2075, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 34.954986572265625, | |
| "eval_runtime": 49.2865, | |
| "eval_samples_per_second": 26.478, | |
| "eval_steps_per_second": 3.327, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 4.061302681992337, | |
| "grad_norm": 4.420943260192871, | |
| "learning_rate": 4.746264367816092e-05, | |
| "loss": 34.5735, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 4.137931034482759, | |
| "grad_norm": 2.898287534713745, | |
| "learning_rate": 4.741475095785441e-05, | |
| "loss": 34.0739, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 4.21455938697318, | |
| "grad_norm": 4.703996658325195, | |
| "learning_rate": 4.73668582375479e-05, | |
| "loss": 33.7022, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 4.291187739463601, | |
| "grad_norm": 2.2913658618927, | |
| "learning_rate": 4.7318965517241384e-05, | |
| "loss": 33.6581, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 4.3678160919540225, | |
| "grad_norm": 3.895615339279175, | |
| "learning_rate": 4.727107279693487e-05, | |
| "loss": 34.0314, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 4.444444444444445, | |
| "grad_norm": 4.635524749755859, | |
| "learning_rate": 4.722318007662835e-05, | |
| "loss": 34.5266, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 4.521072796934866, | |
| "grad_norm": 3.451066017150879, | |
| "learning_rate": 4.717528735632184e-05, | |
| "loss": 33.1786, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 4.597701149425287, | |
| "grad_norm": 2.552107810974121, | |
| "learning_rate": 4.7127394636015325e-05, | |
| "loss": 33.6118, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 4.674329501915709, | |
| "grad_norm": 2.359786033630371, | |
| "learning_rate": 4.707998084291188e-05, | |
| "loss": 33.9903, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 4.75095785440613, | |
| "grad_norm": 2.2611875534057617, | |
| "learning_rate": 4.703208812260537e-05, | |
| "loss": 34.0762, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 4.827586206896552, | |
| "grad_norm": 1.8199210166931152, | |
| "learning_rate": 4.698419540229885e-05, | |
| "loss": 33.6635, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 4.904214559386973, | |
| "grad_norm": 2.7332305908203125, | |
| "learning_rate": 4.693630268199234e-05, | |
| "loss": 33.0946, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 4.980842911877395, | |
| "grad_norm": 2.9454078674316406, | |
| "learning_rate": 4.6888409961685824e-05, | |
| "loss": 33.9173, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 34.924800872802734, | |
| "eval_runtime": 49.3002, | |
| "eval_samples_per_second": 26.47, | |
| "eval_steps_per_second": 3.327, | |
| "step": 6525 | |
| }, | |
| { | |
| "epoch": 5.057471264367816, | |
| "grad_norm": 2.3083884716033936, | |
| "learning_rate": 4.684051724137931e-05, | |
| "loss": 33.8987, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 5.134099616858237, | |
| "grad_norm": 2.228327751159668, | |
| "learning_rate": 4.67926245210728e-05, | |
| "loss": 33.8189, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 5.210727969348659, | |
| "grad_norm": 3.6814918518066406, | |
| "learning_rate": 4.6744731800766284e-05, | |
| "loss": 33.8364, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 5.287356321839081, | |
| "grad_norm": 2.5758285522460938, | |
| "learning_rate": 4.669683908045977e-05, | |
| "loss": 33.7093, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 5.363984674329502, | |
| "grad_norm": 4.175839900970459, | |
| "learning_rate": 4.6648946360153265e-05, | |
| "loss": 33.6689, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 5.440613026819923, | |
| "grad_norm": 2.213092088699341, | |
| "learning_rate": 4.6601053639846745e-05, | |
| "loss": 33.7936, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 5.517241379310345, | |
| "grad_norm": 2.4982571601867676, | |
| "learning_rate": 4.655316091954023e-05, | |
| "loss": 33.3686, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 5.593869731800766, | |
| "grad_norm": 3.635983943939209, | |
| "learning_rate": 4.6505747126436784e-05, | |
| "loss": 33.5493, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 5.670498084291188, | |
| "grad_norm": 4.315894603729248, | |
| "learning_rate": 4.645785440613027e-05, | |
| "loss": 33.6607, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 5.747126436781609, | |
| "grad_norm": 2.6151223182678223, | |
| "learning_rate": 4.640996168582376e-05, | |
| "loss": 34.7535, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 5.823754789272031, | |
| "grad_norm": 4.03953218460083, | |
| "learning_rate": 4.6362068965517244e-05, | |
| "loss": 33.9865, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 5.900383141762452, | |
| "grad_norm": 2.512362480163574, | |
| "learning_rate": 4.6314176245210724e-05, | |
| "loss": 33.0343, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 5.977011494252873, | |
| "grad_norm": 4.745575428009033, | |
| "learning_rate": 4.626628352490422e-05, | |
| "loss": 33.4544, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_loss": 34.841033935546875, | |
| "eval_runtime": 49.3059, | |
| "eval_samples_per_second": 26.467, | |
| "eval_steps_per_second": 3.326, | |
| "step": 7830 | |
| }, | |
| { | |
| "epoch": 6.053639846743295, | |
| "grad_norm": 2.996056079864502, | |
| "learning_rate": 4.6218390804597705e-05, | |
| "loss": 33.631, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 6.130268199233717, | |
| "grad_norm": 3.3260300159454346, | |
| "learning_rate": 4.617049808429119e-05, | |
| "loss": 33.9222, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 6.206896551724138, | |
| "grad_norm": 2.214486598968506, | |
| "learning_rate": 4.612260536398468e-05, | |
| "loss": 32.9576, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 6.283524904214559, | |
| "grad_norm": 3.6611664295196533, | |
| "learning_rate": 4.6074712643678166e-05, | |
| "loss": 33.5231, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 6.360153256704981, | |
| "grad_norm": 2.582730770111084, | |
| "learning_rate": 4.602681992337165e-05, | |
| "loss": 33.6936, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 6.436781609195402, | |
| "grad_norm": 2.739861488342285, | |
| "learning_rate": 4.597892720306514e-05, | |
| "loss": 33.3997, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 6.513409961685824, | |
| "grad_norm": 2.2102463245391846, | |
| "learning_rate": 4.593103448275862e-05, | |
| "loss": 33.9374, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 6.590038314176245, | |
| "grad_norm": 3.83150577545166, | |
| "learning_rate": 4.5883141762452106e-05, | |
| "loss": 33.9961, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 6.666666666666667, | |
| "grad_norm": 3.981616735458374, | |
| "learning_rate": 4.583524904214559e-05, | |
| "loss": 33.5413, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 6.743295019157088, | |
| "grad_norm": 2.3303332328796387, | |
| "learning_rate": 4.578735632183908e-05, | |
| "loss": 34.0529, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 6.819923371647509, | |
| "grad_norm": 3.9573702812194824, | |
| "learning_rate": 4.573946360153257e-05, | |
| "loss": 33.2897, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 6.896551724137931, | |
| "grad_norm": 2.6185879707336426, | |
| "learning_rate": 4.5691570881226054e-05, | |
| "loss": 34.0662, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 6.973180076628353, | |
| "grad_norm": 3.1155271530151367, | |
| "learning_rate": 4.564367816091955e-05, | |
| "loss": 33.517, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "eval_loss": 34.818748474121094, | |
| "eval_runtime": 49.3029, | |
| "eval_samples_per_second": 26.469, | |
| "eval_steps_per_second": 3.326, | |
| "step": 9135 | |
| }, | |
| { | |
| "epoch": 7.049808429118774, | |
| "grad_norm": 3.117553472518921, | |
| "learning_rate": 4.5595785440613034e-05, | |
| "loss": 34.1218, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 7.126436781609195, | |
| "grad_norm": 2.5572612285614014, | |
| "learning_rate": 4.5547892720306515e-05, | |
| "loss": 33.662, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 7.203065134099617, | |
| "grad_norm": 3.5347042083740234, | |
| "learning_rate": 4.55e-05, | |
| "loss": 34.4668, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 7.2796934865900385, | |
| "grad_norm": 1.9216647148132324, | |
| "learning_rate": 4.545210727969349e-05, | |
| "loss": 33.4468, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 7.35632183908046, | |
| "grad_norm": 4.242152214050293, | |
| "learning_rate": 4.5404214559386975e-05, | |
| "loss": 33.5805, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 7.432950191570881, | |
| "grad_norm": 2.9310567378997803, | |
| "learning_rate": 4.535632183908046e-05, | |
| "loss": 34.0603, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 7.509578544061303, | |
| "grad_norm": 2.6573023796081543, | |
| "learning_rate": 4.530842911877395e-05, | |
| "loss": 33.8766, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 7.586206896551724, | |
| "grad_norm": 2.7849409580230713, | |
| "learning_rate": 4.5260536398467436e-05, | |
| "loss": 33.6309, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 7.662835249042145, | |
| "grad_norm": 2.7377357482910156, | |
| "learning_rate": 4.521264367816092e-05, | |
| "loss": 33.3621, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 7.739463601532567, | |
| "grad_norm": 2.106233835220337, | |
| "learning_rate": 4.516475095785441e-05, | |
| "loss": 33.4172, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 7.816091954022989, | |
| "grad_norm": 2.1989126205444336, | |
| "learning_rate": 4.5116858237547896e-05, | |
| "loss": 33.5937, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 7.89272030651341, | |
| "grad_norm": 2.903721570968628, | |
| "learning_rate": 4.5068965517241377e-05, | |
| "loss": 33.7935, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 7.969348659003831, | |
| "grad_norm": 2.061602830886841, | |
| "learning_rate": 4.5021072796934863e-05, | |
| "loss": 33.3289, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_loss": 34.95075607299805, | |
| "eval_runtime": 49.3237, | |
| "eval_samples_per_second": 26.458, | |
| "eval_steps_per_second": 3.325, | |
| "step": 10440 | |
| }, | |
| { | |
| "epoch": 8.045977011494253, | |
| "grad_norm": 1.8656938076019287, | |
| "learning_rate": 4.497318007662836e-05, | |
| "loss": 33.8404, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 8.122605363984674, | |
| "grad_norm": 2.783926486968994, | |
| "learning_rate": 4.4925287356321844e-05, | |
| "loss": 33.9544, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 8.199233716475096, | |
| "grad_norm": 2.175081968307495, | |
| "learning_rate": 4.487739463601533e-05, | |
| "loss": 33.6405, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 8.275862068965518, | |
| "grad_norm": 4.121524333953857, | |
| "learning_rate": 4.482950191570882e-05, | |
| "loss": 33.568, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 8.352490421455938, | |
| "grad_norm": 3.978410482406616, | |
| "learning_rate": 4.4781609195402305e-05, | |
| "loss": 33.6659, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 8.42911877394636, | |
| "grad_norm": 3.0454840660095215, | |
| "learning_rate": 4.473419540229885e-05, | |
| "loss": 33.2689, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 8.505747126436782, | |
| "grad_norm": 3.169114828109741, | |
| "learning_rate": 4.4686302681992336e-05, | |
| "loss": 33.6227, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 8.582375478927203, | |
| "grad_norm": 2.5880959033966064, | |
| "learning_rate": 4.463840996168582e-05, | |
| "loss": 33.3022, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 8.659003831417625, | |
| "grad_norm": 2.1367762088775635, | |
| "learning_rate": 4.459051724137932e-05, | |
| "loss": 33.2851, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 8.735632183908045, | |
| "grad_norm": 3.0278782844543457, | |
| "learning_rate": 4.4542624521072804e-05, | |
| "loss": 33.922, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 8.812260536398467, | |
| "grad_norm": 2.6361653804779053, | |
| "learning_rate": 4.4494731800766284e-05, | |
| "loss": 33.1482, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 8.88888888888889, | |
| "grad_norm": 2.7836809158325195, | |
| "learning_rate": 4.444683908045977e-05, | |
| "loss": 34.1345, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 8.96551724137931, | |
| "grad_norm": 2.519681453704834, | |
| "learning_rate": 4.439894636015326e-05, | |
| "loss": 34.0642, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_loss": 34.75983428955078, | |
| "eval_runtime": 49.3463, | |
| "eval_samples_per_second": 26.446, | |
| "eval_steps_per_second": 3.323, | |
| "step": 11745 | |
| }, | |
| { | |
| "epoch": 9.042145593869732, | |
| "grad_norm": 6.431031703948975, | |
| "learning_rate": 4.4351053639846745e-05, | |
| "loss": 33.6431, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 9.118773946360154, | |
| "grad_norm": 3.262486457824707, | |
| "learning_rate": 4.430316091954023e-05, | |
| "loss": 32.9398, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 9.195402298850574, | |
| "grad_norm": 1.945741057395935, | |
| "learning_rate": 4.425526819923372e-05, | |
| "loss": 32.7256, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 9.272030651340996, | |
| "grad_norm": 5.09276008605957, | |
| "learning_rate": 4.4207375478927205e-05, | |
| "loss": 33.9015, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 9.348659003831418, | |
| "grad_norm": 3.785059928894043, | |
| "learning_rate": 4.415948275862069e-05, | |
| "loss": 33.6765, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 9.425287356321839, | |
| "grad_norm": 2.4255340099334717, | |
| "learning_rate": 4.411159003831418e-05, | |
| "loss": 33.1262, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 9.50191570881226, | |
| "grad_norm": 5.869349479675293, | |
| "learning_rate": 4.4063697318007666e-05, | |
| "loss": 33.2205, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 9.578544061302683, | |
| "grad_norm": 2.361865997314453, | |
| "learning_rate": 4.4015804597701146e-05, | |
| "loss": 34.0441, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 9.655172413793103, | |
| "grad_norm": 2.6989896297454834, | |
| "learning_rate": 4.396791187739464e-05, | |
| "loss": 33.6812, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 9.731800766283525, | |
| "grad_norm": 2.6094741821289062, | |
| "learning_rate": 4.3920019157088127e-05, | |
| "loss": 33.9178, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 9.808429118773946, | |
| "grad_norm": 2.4616310596466064, | |
| "learning_rate": 4.3872126436781613e-05, | |
| "loss": 34.5233, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 9.885057471264368, | |
| "grad_norm": 2.7729408740997314, | |
| "learning_rate": 4.38242337164751e-05, | |
| "loss": 33.378, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 9.96168582375479, | |
| "grad_norm": 2.5230519771575928, | |
| "learning_rate": 4.377634099616859e-05, | |
| "loss": 33.442, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_loss": 34.700294494628906, | |
| "eval_runtime": 49.2926, | |
| "eval_samples_per_second": 26.475, | |
| "eval_steps_per_second": 3.327, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 10.03831417624521, | |
| "grad_norm": 2.5322816371917725, | |
| "learning_rate": 4.3728448275862074e-05, | |
| "loss": 33.8873, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 10.114942528735632, | |
| "grad_norm": 2.1063241958618164, | |
| "learning_rate": 4.368103448275862e-05, | |
| "loss": 33.871, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 10.191570881226054, | |
| "grad_norm": 3.7001326084136963, | |
| "learning_rate": 4.3633141762452106e-05, | |
| "loss": 34.5129, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 10.268199233716475, | |
| "grad_norm": 1.8534705638885498, | |
| "learning_rate": 4.35852490421456e-05, | |
| "loss": 33.7739, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 10.344827586206897, | |
| "grad_norm": 1.9871069192886353, | |
| "learning_rate": 4.3537356321839086e-05, | |
| "loss": 33.4124, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 10.421455938697317, | |
| "grad_norm": 2.264529228210449, | |
| "learning_rate": 4.348946360153257e-05, | |
| "loss": 33.24, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 10.49808429118774, | |
| "grad_norm": 3.0297787189483643, | |
| "learning_rate": 4.344157088122606e-05, | |
| "loss": 33.2922, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 10.574712643678161, | |
| "grad_norm": 2.7185864448547363, | |
| "learning_rate": 4.339367816091954e-05, | |
| "loss": 33.4859, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 10.651340996168582, | |
| "grad_norm": 3.8887524604797363, | |
| "learning_rate": 4.334578544061303e-05, | |
| "loss": 33.4322, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 10.727969348659004, | |
| "grad_norm": 2.5119857788085938, | |
| "learning_rate": 4.3297892720306514e-05, | |
| "loss": 33.6234, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 10.804597701149426, | |
| "grad_norm": 3.2969565391540527, | |
| "learning_rate": 4.325e-05, | |
| "loss": 33.4341, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 10.881226053639846, | |
| "grad_norm": 3.3629229068756104, | |
| "learning_rate": 4.320210727969349e-05, | |
| "loss": 32.7636, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 10.957854406130268, | |
| "grad_norm": 3.0765013694763184, | |
| "learning_rate": 4.3154214559386975e-05, | |
| "loss": 33.7066, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "eval_loss": 34.70278549194336, | |
| "eval_runtime": 49.2928, | |
| "eval_samples_per_second": 26.474, | |
| "eval_steps_per_second": 3.327, | |
| "step": 14355 | |
| }, | |
| { | |
| "epoch": 11.03448275862069, | |
| "grad_norm": 2.7724273204803467, | |
| "learning_rate": 4.310632183908046e-05, | |
| "loss": 33.7759, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 11.11111111111111, | |
| "grad_norm": 3.9663071632385254, | |
| "learning_rate": 4.305842911877395e-05, | |
| "loss": 33.6063, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 11.187739463601533, | |
| "grad_norm": 2.53495717048645, | |
| "learning_rate": 4.3010536398467435e-05, | |
| "loss": 32.9251, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 11.264367816091955, | |
| "grad_norm": 3.928633689880371, | |
| "learning_rate": 4.296264367816092e-05, | |
| "loss": 33.41, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 11.340996168582375, | |
| "grad_norm": 1.888804316520691, | |
| "learning_rate": 4.291475095785441e-05, | |
| "loss": 33.147, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 11.417624521072797, | |
| "grad_norm": 3.151488780975342, | |
| "learning_rate": 4.2866858237547896e-05, | |
| "loss": 34.011, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 11.494252873563218, | |
| "grad_norm": 2.659867286682129, | |
| "learning_rate": 4.281896551724138e-05, | |
| "loss": 33.3559, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 11.57088122605364, | |
| "grad_norm": 4.092405319213867, | |
| "learning_rate": 4.277107279693487e-05, | |
| "loss": 33.2301, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 11.647509578544062, | |
| "grad_norm": 4.295740127563477, | |
| "learning_rate": 4.2723659003831415e-05, | |
| "loss": 33.1047, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 11.724137931034482, | |
| "grad_norm": 2.4472806453704834, | |
| "learning_rate": 4.26757662835249e-05, | |
| "loss": 33.8206, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 11.800766283524904, | |
| "grad_norm": 2.716550350189209, | |
| "learning_rate": 4.262787356321839e-05, | |
| "loss": 33.7173, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 11.877394636015326, | |
| "grad_norm": 3.1278491020202637, | |
| "learning_rate": 4.257998084291188e-05, | |
| "loss": 34.0344, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 11.954022988505747, | |
| "grad_norm": 2.4835212230682373, | |
| "learning_rate": 4.253208812260537e-05, | |
| "loss": 33.8397, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "eval_loss": 34.70100402832031, | |
| "eval_runtime": 49.2554, | |
| "eval_samples_per_second": 26.495, | |
| "eval_steps_per_second": 3.33, | |
| "step": 15660 | |
| }, | |
| { | |
| "epoch": 12.030651340996169, | |
| "grad_norm": 2.331453800201416, | |
| "learning_rate": 4.2484195402298856e-05, | |
| "loss": 32.9794, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 12.10727969348659, | |
| "grad_norm": 2.2127463817596436, | |
| "learning_rate": 4.243630268199234e-05, | |
| "loss": 33.6367, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 12.183908045977011, | |
| "grad_norm": 3.1127703189849854, | |
| "learning_rate": 4.238840996168583e-05, | |
| "loss": 32.7221, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 12.260536398467433, | |
| "grad_norm": 2.5665576457977295, | |
| "learning_rate": 4.234051724137931e-05, | |
| "loss": 33.7796, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 12.337164750957854, | |
| "grad_norm": 2.995265245437622, | |
| "learning_rate": 4.22926245210728e-05, | |
| "loss": 32.8062, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 12.413793103448276, | |
| "grad_norm": 3.4698216915130615, | |
| "learning_rate": 4.2244731800766284e-05, | |
| "loss": 33.5182, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 12.490421455938698, | |
| "grad_norm": 4.030599117279053, | |
| "learning_rate": 4.219683908045977e-05, | |
| "loss": 33.7621, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 12.567049808429118, | |
| "grad_norm": 2.277189254760742, | |
| "learning_rate": 4.214894636015326e-05, | |
| "loss": 33.7926, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 12.64367816091954, | |
| "grad_norm": 2.3156633377075195, | |
| "learning_rate": 4.2101053639846744e-05, | |
| "loss": 33.869, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 12.720306513409962, | |
| "grad_norm": 3.5089361667633057, | |
| "learning_rate": 4.205316091954023e-05, | |
| "loss": 33.6732, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 12.796934865900383, | |
| "grad_norm": 2.5379600524902344, | |
| "learning_rate": 4.200526819923372e-05, | |
| "loss": 33.5854, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 12.873563218390805, | |
| "grad_norm": 2.5784411430358887, | |
| "learning_rate": 4.1957375478927205e-05, | |
| "loss": 33.2835, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 12.950191570881227, | |
| "grad_norm": 2.574859380722046, | |
| "learning_rate": 4.190948275862069e-05, | |
| "loss": 33.8945, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "eval_loss": 34.72227478027344, | |
| "eval_runtime": 49.2549, | |
| "eval_samples_per_second": 26.495, | |
| "eval_steps_per_second": 3.33, | |
| "step": 16965 | |
| }, | |
| { | |
| "epoch": 13.026819923371647, | |
| "grad_norm": 3.8546385765075684, | |
| "learning_rate": 4.186159003831418e-05, | |
| "loss": 33.1455, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 13.10344827586207, | |
| "grad_norm": 3.751404047012329, | |
| "learning_rate": 4.1813697318007665e-05, | |
| "loss": 33.7843, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 13.18007662835249, | |
| "grad_norm": 3.0844898223876953, | |
| "learning_rate": 4.176580459770115e-05, | |
| "loss": 32.8163, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 13.256704980842912, | |
| "grad_norm": 1.7570416927337646, | |
| "learning_rate": 4.1718390804597704e-05, | |
| "loss": 33.3296, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 13.333333333333334, | |
| "grad_norm": 2.5809695720672607, | |
| "learning_rate": 4.1670498084291184e-05, | |
| "loss": 34.1621, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 13.409961685823754, | |
| "grad_norm": 2.564545154571533, | |
| "learning_rate": 4.162260536398467e-05, | |
| "loss": 33.4641, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 13.486590038314176, | |
| "grad_norm": 3.2340521812438965, | |
| "learning_rate": 4.1574712643678165e-05, | |
| "loss": 33.5958, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 13.563218390804598, | |
| "grad_norm": 4.329983711242676, | |
| "learning_rate": 4.152681992337165e-05, | |
| "loss": 33.53, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 13.639846743295019, | |
| "grad_norm": 2.3342621326446533, | |
| "learning_rate": 4.147892720306514e-05, | |
| "loss": 33.7702, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 13.71647509578544, | |
| "grad_norm": 2.6764466762542725, | |
| "learning_rate": 4.1431034482758625e-05, | |
| "loss": 33.6024, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 13.793103448275861, | |
| "grad_norm": 5.089807033538818, | |
| "learning_rate": 4.138314176245211e-05, | |
| "loss": 32.9291, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 13.869731800766283, | |
| "grad_norm": 2.4803364276885986, | |
| "learning_rate": 4.13352490421456e-05, | |
| "loss": 33.2098, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 13.946360153256705, | |
| "grad_norm": 3.0112080574035645, | |
| "learning_rate": 4.128735632183908e-05, | |
| "loss": 33.7988, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "eval_loss": 34.82696533203125, | |
| "eval_runtime": 49.261, | |
| "eval_samples_per_second": 26.492, | |
| "eval_steps_per_second": 3.329, | |
| "step": 18270 | |
| }, | |
| { | |
| "epoch": 14.022988505747126, | |
| "grad_norm": 3.0625782012939453, | |
| "learning_rate": 4.1239463601532566e-05, | |
| "loss": 33.4482, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 14.099616858237548, | |
| "grad_norm": 2.5372705459594727, | |
| "learning_rate": 4.119157088122605e-05, | |
| "loss": 33.284, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 14.17624521072797, | |
| "grad_norm": 2.9518911838531494, | |
| "learning_rate": 4.114367816091954e-05, | |
| "loss": 33.4866, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 14.25287356321839, | |
| "grad_norm": 2.1386337280273438, | |
| "learning_rate": 4.109578544061303e-05, | |
| "loss": 33.653, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 14.329501915708812, | |
| "grad_norm": 2.1180756092071533, | |
| "learning_rate": 4.1047892720306514e-05, | |
| "loss": 34.3663, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 14.406130268199234, | |
| "grad_norm": 3.0451836585998535, | |
| "learning_rate": 4.1e-05, | |
| "loss": 32.7698, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 14.482758620689655, | |
| "grad_norm": 3.8517203330993652, | |
| "learning_rate": 4.0952107279693494e-05, | |
| "loss": 33.3581, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 14.559386973180077, | |
| "grad_norm": 2.322065830230713, | |
| "learning_rate": 4.0904214559386974e-05, | |
| "loss": 33.3386, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 14.636015325670499, | |
| "grad_norm": 2.604886054992676, | |
| "learning_rate": 4.085632183908046e-05, | |
| "loss": 33.8964, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 14.71264367816092, | |
| "grad_norm": 3.6753382682800293, | |
| "learning_rate": 4.080842911877395e-05, | |
| "loss": 32.9918, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 14.789272030651341, | |
| "grad_norm": 3.1375985145568848, | |
| "learning_rate": 4.07610153256705e-05, | |
| "loss": 33.5981, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 14.865900383141762, | |
| "grad_norm": 3.9305307865142822, | |
| "learning_rate": 4.071312260536399e-05, | |
| "loss": 34.0074, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 14.942528735632184, | |
| "grad_norm": 3.2952847480773926, | |
| "learning_rate": 4.0665229885057473e-05, | |
| "loss": 33.0554, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "eval_loss": 34.7192268371582, | |
| "eval_runtime": 49.2319, | |
| "eval_samples_per_second": 26.507, | |
| "eval_steps_per_second": 3.331, | |
| "step": 19575 | |
| }, | |
| { | |
| "epoch": 15.019157088122606, | |
| "grad_norm": 3.291614294052124, | |
| "learning_rate": 4.061733716475096e-05, | |
| "loss": 32.9437, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 15.095785440613026, | |
| "grad_norm": 4.4670867919921875, | |
| "learning_rate": 4.056944444444445e-05, | |
| "loss": 33.6879, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 15.172413793103448, | |
| "grad_norm": 3.4122018814086914, | |
| "learning_rate": 4.0521551724137934e-05, | |
| "loss": 33.0167, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 15.24904214559387, | |
| "grad_norm": 3.854083299636841, | |
| "learning_rate": 4.047365900383142e-05, | |
| "loss": 33.8342, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 15.32567049808429, | |
| "grad_norm": 2.945396900177002, | |
| "learning_rate": 4.042576628352491e-05, | |
| "loss": 32.3812, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 15.402298850574713, | |
| "grad_norm": 2.5246341228485107, | |
| "learning_rate": 4.0377873563218395e-05, | |
| "loss": 33.3573, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 15.478927203065133, | |
| "grad_norm": 2.837134599685669, | |
| "learning_rate": 4.032998084291188e-05, | |
| "loss": 33.5981, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 15.555555555555555, | |
| "grad_norm": 4.350450038909912, | |
| "learning_rate": 4.028208812260537e-05, | |
| "loss": 34.0699, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 15.632183908045977, | |
| "grad_norm": 2.4908435344696045, | |
| "learning_rate": 4.0234195402298855e-05, | |
| "loss": 33.8105, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 15.708812260536398, | |
| "grad_norm": 2.9461615085601807, | |
| "learning_rate": 4.0186302681992336e-05, | |
| "loss": 33.3251, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 15.78544061302682, | |
| "grad_norm": 2.8716940879821777, | |
| "learning_rate": 4.013840996168582e-05, | |
| "loss": 33.7594, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 15.862068965517242, | |
| "grad_norm": 2.7166991233825684, | |
| "learning_rate": 4.009051724137931e-05, | |
| "loss": 33.58, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 15.938697318007662, | |
| "grad_norm": 2.2878618240356445, | |
| "learning_rate": 4.0042624521072796e-05, | |
| "loss": 33.4573, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "eval_loss": 34.54485321044922, | |
| "eval_runtime": 49.3188, | |
| "eval_samples_per_second": 26.46, | |
| "eval_steps_per_second": 3.325, | |
| "step": 20880 | |
| }, | |
| { | |
| "epoch": 16.015325670498083, | |
| "grad_norm": 2.970867156982422, | |
| "learning_rate": 3.999473180076628e-05, | |
| "loss": 33.5118, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 16.091954022988507, | |
| "grad_norm": 2.395005464553833, | |
| "learning_rate": 3.994683908045978e-05, | |
| "loss": 34.1932, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 16.168582375478927, | |
| "grad_norm": 2.8175065517425537, | |
| "learning_rate": 3.9898946360153264e-05, | |
| "loss": 32.9815, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 16.245210727969347, | |
| "grad_norm": 4.665389537811279, | |
| "learning_rate": 3.985105363984675e-05, | |
| "loss": 33.8616, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 16.32183908045977, | |
| "grad_norm": 3.425340175628662, | |
| "learning_rate": 3.980316091954023e-05, | |
| "loss": 33.2022, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 16.39846743295019, | |
| "grad_norm": 5.212127685546875, | |
| "learning_rate": 3.975574712643678e-05, | |
| "loss": 33.3935, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 16.47509578544061, | |
| "grad_norm": 1.9034606218338013, | |
| "learning_rate": 3.970785440613027e-05, | |
| "loss": 32.739, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 16.551724137931036, | |
| "grad_norm": 2.024109125137329, | |
| "learning_rate": 3.9659961685823756e-05, | |
| "loss": 33.4628, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 16.628352490421456, | |
| "grad_norm": 2.8185606002807617, | |
| "learning_rate": 3.961206896551724e-05, | |
| "loss": 33.7672, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 16.704980842911876, | |
| "grad_norm": 3.2981534004211426, | |
| "learning_rate": 3.956417624521073e-05, | |
| "loss": 33.1976, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 16.7816091954023, | |
| "grad_norm": 4.531330585479736, | |
| "learning_rate": 3.951628352490422e-05, | |
| "loss": 33.2379, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 16.85823754789272, | |
| "grad_norm": 2.4455623626708984, | |
| "learning_rate": 3.9468390804597704e-05, | |
| "loss": 33.2898, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 16.93486590038314, | |
| "grad_norm": 4.1596245765686035, | |
| "learning_rate": 3.942049808429119e-05, | |
| "loss": 33.2167, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "eval_loss": 34.65380096435547, | |
| "eval_runtime": 49.3114, | |
| "eval_samples_per_second": 26.464, | |
| "eval_steps_per_second": 3.326, | |
| "step": 22185 | |
| }, | |
| { | |
| "epoch": 17.011494252873565, | |
| "grad_norm": 4.7622528076171875, | |
| "learning_rate": 3.937260536398468e-05, | |
| "loss": 34.3268, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 17.088122605363985, | |
| "grad_norm": 2.9908533096313477, | |
| "learning_rate": 3.9324712643678164e-05, | |
| "loss": 33.4477, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 17.164750957854405, | |
| "grad_norm": 2.2341110706329346, | |
| "learning_rate": 3.927681992337165e-05, | |
| "loss": 33.6793, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 17.24137931034483, | |
| "grad_norm": 2.3946852684020996, | |
| "learning_rate": 3.922892720306514e-05, | |
| "loss": 33.2578, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 17.31800766283525, | |
| "grad_norm": 3.3899614810943604, | |
| "learning_rate": 3.9181034482758625e-05, | |
| "loss": 33.2486, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 17.39463601532567, | |
| "grad_norm": 5.150006294250488, | |
| "learning_rate": 3.9133141762452105e-05, | |
| "loss": 33.0265, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 17.47126436781609, | |
| "grad_norm": 2.8135523796081543, | |
| "learning_rate": 3.908524904214559e-05, | |
| "loss": 33.4384, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 17.547892720306514, | |
| "grad_norm": 2.5454325675964355, | |
| "learning_rate": 3.903735632183908e-05, | |
| "loss": 33.4139, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 17.624521072796934, | |
| "grad_norm": 4.680717945098877, | |
| "learning_rate": 3.8989463601532566e-05, | |
| "loss": 34.0209, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 17.701149425287355, | |
| "grad_norm": 4.242103099822998, | |
| "learning_rate": 3.894157088122606e-05, | |
| "loss": 33.1372, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 17.77777777777778, | |
| "grad_norm": 2.639352798461914, | |
| "learning_rate": 3.8893678160919546e-05, | |
| "loss": 33.3558, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 17.8544061302682, | |
| "grad_norm": 1.9746617078781128, | |
| "learning_rate": 3.884578544061303e-05, | |
| "loss": 33.7639, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 17.93103448275862, | |
| "grad_norm": 4.005228519439697, | |
| "learning_rate": 3.879837164750958e-05, | |
| "loss": 33.0241, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "eval_loss": 34.649261474609375, | |
| "eval_runtime": 49.2606, | |
| "eval_samples_per_second": 26.492, | |
| "eval_steps_per_second": 3.329, | |
| "step": 23490 | |
| }, | |
| { | |
| "epoch": 18.007662835249043, | |
| "grad_norm": 2.500631809234619, | |
| "learning_rate": 3.8750478927203065e-05, | |
| "loss": 33.3219, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 18.084291187739463, | |
| "grad_norm": 3.90655255317688, | |
| "learning_rate": 3.870258620689655e-05, | |
| "loss": 33.4211, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 18.160919540229884, | |
| "grad_norm": 2.702497720718384, | |
| "learning_rate": 3.865469348659004e-05, | |
| "loss": 33.2414, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 18.237547892720308, | |
| "grad_norm": 1.9609768390655518, | |
| "learning_rate": 3.8606800766283525e-05, | |
| "loss": 34.0671, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 18.314176245210728, | |
| "grad_norm": 2.072951316833496, | |
| "learning_rate": 3.855890804597702e-05, | |
| "loss": 33.6311, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 18.39080459770115, | |
| "grad_norm": 3.249264717102051, | |
| "learning_rate": 3.85110153256705e-05, | |
| "loss": 32.9968, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 18.467432950191572, | |
| "grad_norm": 4.439345359802246, | |
| "learning_rate": 3.8463122605363986e-05, | |
| "loss": 33.1314, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 18.544061302681992, | |
| "grad_norm": 3.9109508991241455, | |
| "learning_rate": 3.841522988505747e-05, | |
| "loss": 33.3908, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 18.620689655172413, | |
| "grad_norm": 2.539151668548584, | |
| "learning_rate": 3.836733716475096e-05, | |
| "loss": 33.5031, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 18.697318007662837, | |
| "grad_norm": 2.6246118545532227, | |
| "learning_rate": 3.831944444444445e-05, | |
| "loss": 33.6923, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 18.773946360153257, | |
| "grad_norm": 3.5379223823547363, | |
| "learning_rate": 3.8271551724137934e-05, | |
| "loss": 32.9198, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 18.850574712643677, | |
| "grad_norm": 3.673536539077759, | |
| "learning_rate": 3.822365900383142e-05, | |
| "loss": 33.5072, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 18.9272030651341, | |
| "grad_norm": 3.9377758502960205, | |
| "learning_rate": 3.817576628352491e-05, | |
| "loss": 32.8486, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "eval_loss": 34.617279052734375, | |
| "eval_runtime": 49.3115, | |
| "eval_samples_per_second": 26.464, | |
| "eval_steps_per_second": 3.326, | |
| "step": 24795 | |
| }, | |
| { | |
| "epoch": 19.00383141762452, | |
| "grad_norm": 3.04927659034729, | |
| "learning_rate": 3.8127873563218394e-05, | |
| "loss": 33.7055, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 19.080459770114942, | |
| "grad_norm": 2.725443124771118, | |
| "learning_rate": 3.8079980842911874e-05, | |
| "loss": 33.5355, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 19.157088122605366, | |
| "grad_norm": 3.853895425796509, | |
| "learning_rate": 3.803208812260536e-05, | |
| "loss": 33.5267, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 19.233716475095786, | |
| "grad_norm": 2.666419267654419, | |
| "learning_rate": 3.798419540229885e-05, | |
| "loss": 33.4069, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 19.310344827586206, | |
| "grad_norm": 3.5618317127227783, | |
| "learning_rate": 3.793630268199234e-05, | |
| "loss": 33.7295, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 19.386973180076627, | |
| "grad_norm": 3.351062297821045, | |
| "learning_rate": 3.788840996168583e-05, | |
| "loss": 33.1994, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 19.46360153256705, | |
| "grad_norm": 3.3226547241210938, | |
| "learning_rate": 3.7840996168582374e-05, | |
| "loss": 33.3149, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 19.54022988505747, | |
| "grad_norm": 4.15867805480957, | |
| "learning_rate": 3.779310344827586e-05, | |
| "loss": 33.5592, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 19.61685823754789, | |
| "grad_norm": 2.333674430847168, | |
| "learning_rate": 3.774521072796935e-05, | |
| "loss": 33.7336, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 19.693486590038315, | |
| "grad_norm": 2.9516782760620117, | |
| "learning_rate": 3.7697318007662834e-05, | |
| "loss": 33.3228, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 19.770114942528735, | |
| "grad_norm": 1.734508991241455, | |
| "learning_rate": 3.764942528735632e-05, | |
| "loss": 33.3216, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 19.846743295019156, | |
| "grad_norm": 2.4886648654937744, | |
| "learning_rate": 3.760153256704981e-05, | |
| "loss": 33.5157, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 19.92337164750958, | |
| "grad_norm": 3.6624252796173096, | |
| "learning_rate": 3.75536398467433e-05, | |
| "loss": 33.2399, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "grad_norm": 4.810445785522461, | |
| "learning_rate": 3.750574712643679e-05, | |
| "loss": 32.548, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "eval_loss": 34.746856689453125, | |
| "eval_runtime": 49.2861, | |
| "eval_samples_per_second": 26.478, | |
| "eval_steps_per_second": 3.328, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 20.07662835249042, | |
| "grad_norm": 4.07724142074585, | |
| "learning_rate": 3.745785440613027e-05, | |
| "loss": 33.562, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 20.153256704980844, | |
| "grad_norm": 4.335379600524902, | |
| "learning_rate": 3.7409961685823756e-05, | |
| "loss": 33.166, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 20.229885057471265, | |
| "grad_norm": 5.472820281982422, | |
| "learning_rate": 3.736206896551724e-05, | |
| "loss": 33.8918, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 20.306513409961685, | |
| "grad_norm": 3.011789321899414, | |
| "learning_rate": 3.731417624521073e-05, | |
| "loss": 33.395, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 20.38314176245211, | |
| "grad_norm": 3.251089096069336, | |
| "learning_rate": 3.7266283524904216e-05, | |
| "loss": 32.9072, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 20.45977011494253, | |
| "grad_norm": 2.7508978843688965, | |
| "learning_rate": 3.72183908045977e-05, | |
| "loss": 33.92, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 20.53639846743295, | |
| "grad_norm": 2.8051536083221436, | |
| "learning_rate": 3.717049808429119e-05, | |
| "loss": 33.9392, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 20.613026819923373, | |
| "grad_norm": 7.377379417419434, | |
| "learning_rate": 3.712260536398468e-05, | |
| "loss": 33.0382, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 20.689655172413794, | |
| "grad_norm": 3.7770464420318604, | |
| "learning_rate": 3.7074712643678164e-05, | |
| "loss": 32.6836, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 20.766283524904214, | |
| "grad_norm": 4.923346996307373, | |
| "learning_rate": 3.702681992337165e-05, | |
| "loss": 33.2129, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 20.842911877394634, | |
| "grad_norm": 4.790703773498535, | |
| "learning_rate": 3.697892720306513e-05, | |
| "loss": 33.5413, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 20.919540229885058, | |
| "grad_norm": 4.592926025390625, | |
| "learning_rate": 3.6931034482758624e-05, | |
| "loss": 33.2436, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 20.99616858237548, | |
| "grad_norm": 3.0529520511627197, | |
| "learning_rate": 3.688314176245211e-05, | |
| "loss": 33.2415, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 21.0, | |
| "eval_loss": 34.59661865234375, | |
| "eval_runtime": 49.3345, | |
| "eval_samples_per_second": 26.452, | |
| "eval_steps_per_second": 3.324, | |
| "step": 27405 | |
| }, | |
| { | |
| "epoch": 21.0727969348659, | |
| "grad_norm": 2.287121534347534, | |
| "learning_rate": 3.683572796934866e-05, | |
| "loss": 32.9962, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 21.149425287356323, | |
| "grad_norm": 2.5622124671936035, | |
| "learning_rate": 3.678783524904214e-05, | |
| "loss": 33.2565, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 21.226053639846743, | |
| "grad_norm": 2.2134974002838135, | |
| "learning_rate": 3.673994252873563e-05, | |
| "loss": 33.7442, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 21.302681992337163, | |
| "grad_norm": 2.574054002761841, | |
| "learning_rate": 3.669204980842912e-05, | |
| "loss": 33.7998, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 21.379310344827587, | |
| "grad_norm": 2.8479721546173096, | |
| "learning_rate": 3.6644157088122604e-05, | |
| "loss": 33.2015, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 21.455938697318008, | |
| "grad_norm": 4.845319747924805, | |
| "learning_rate": 3.659626436781609e-05, | |
| "loss": 33.7904, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 21.532567049808428, | |
| "grad_norm": 2.353726863861084, | |
| "learning_rate": 3.6548371647509584e-05, | |
| "loss": 33.7207, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 21.60919540229885, | |
| "grad_norm": 3.003556966781616, | |
| "learning_rate": 3.650047892720307e-05, | |
| "loss": 33.297, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 21.685823754789272, | |
| "grad_norm": 4.815252304077148, | |
| "learning_rate": 3.645258620689656e-05, | |
| "loss": 33.3036, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 21.762452107279692, | |
| "grad_norm": 3.0622081756591797, | |
| "learning_rate": 3.640469348659004e-05, | |
| "loss": 33.3661, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 21.839080459770116, | |
| "grad_norm": 3.3728883266448975, | |
| "learning_rate": 3.6356800766283525e-05, | |
| "loss": 32.8782, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 21.915708812260537, | |
| "grad_norm": 2.2338080406188965, | |
| "learning_rate": 3.630890804597701e-05, | |
| "loss": 33.0412, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 21.992337164750957, | |
| "grad_norm": 3.717360019683838, | |
| "learning_rate": 3.62610153256705e-05, | |
| "loss": 33.0318, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 22.0, | |
| "eval_loss": 34.53865432739258, | |
| "eval_runtime": 49.3318, | |
| "eval_samples_per_second": 26.454, | |
| "eval_steps_per_second": 3.324, | |
| "step": 28710 | |
| }, | |
| { | |
| "epoch": 22.06896551724138, | |
| "grad_norm": 2.77984356880188, | |
| "learning_rate": 3.6213122605363986e-05, | |
| "loss": 33.6688, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 22.1455938697318, | |
| "grad_norm": 3.427570104598999, | |
| "learning_rate": 3.616522988505747e-05, | |
| "loss": 33.2569, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 22.22222222222222, | |
| "grad_norm": 2.060288429260254, | |
| "learning_rate": 3.611733716475096e-05, | |
| "loss": 33.4445, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 22.298850574712645, | |
| "grad_norm": 3.7918601036071777, | |
| "learning_rate": 3.6069444444444446e-05, | |
| "loss": 34.2303, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 22.375478927203066, | |
| "grad_norm": 3.412705659866333, | |
| "learning_rate": 3.602155172413793e-05, | |
| "loss": 33.5679, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 22.452107279693486, | |
| "grad_norm": 4.111233711242676, | |
| "learning_rate": 3.597365900383142e-05, | |
| "loss": 32.9136, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 22.52873563218391, | |
| "grad_norm": 2.1312243938446045, | |
| "learning_rate": 3.592576628352491e-05, | |
| "loss": 32.8361, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 22.60536398467433, | |
| "grad_norm": 2.0618536472320557, | |
| "learning_rate": 3.587835249042146e-05, | |
| "loss": 33.8499, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 22.68199233716475, | |
| "grad_norm": 2.7332096099853516, | |
| "learning_rate": 3.5830459770114946e-05, | |
| "loss": 33.5899, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 22.75862068965517, | |
| "grad_norm": 4.264729022979736, | |
| "learning_rate": 3.578256704980843e-05, | |
| "loss": 33.194, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 22.835249042145595, | |
| "grad_norm": 4.755107402801514, | |
| "learning_rate": 3.573467432950192e-05, | |
| "loss": 33.2129, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 22.911877394636015, | |
| "grad_norm": 3.751232147216797, | |
| "learning_rate": 3.56867816091954e-05, | |
| "loss": 33.2948, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 22.988505747126435, | |
| "grad_norm": 3.3150830268859863, | |
| "learning_rate": 3.5638888888888886e-05, | |
| "loss": 32.452, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 23.0, | |
| "eval_loss": 34.624755859375, | |
| "eval_runtime": 49.3378, | |
| "eval_samples_per_second": 26.45, | |
| "eval_steps_per_second": 3.324, | |
| "step": 30015 | |
| }, | |
| { | |
| "epoch": 23.06513409961686, | |
| "grad_norm": 1.9898459911346436, | |
| "learning_rate": 3.559099616858237e-05, | |
| "loss": 33.2659, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 23.14176245210728, | |
| "grad_norm": 3.3541698455810547, | |
| "learning_rate": 3.554310344827587e-05, | |
| "loss": 33.4747, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 23.2183908045977, | |
| "grad_norm": 2.298229694366455, | |
| "learning_rate": 3.5495210727969354e-05, | |
| "loss": 33.8791, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 23.295019157088124, | |
| "grad_norm": 3.9336183071136475, | |
| "learning_rate": 3.544731800766284e-05, | |
| "loss": 33.8427, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 23.371647509578544, | |
| "grad_norm": 2.9286720752716064, | |
| "learning_rate": 3.539942528735633e-05, | |
| "loss": 33.9572, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 23.448275862068964, | |
| "grad_norm": 2.9716665744781494, | |
| "learning_rate": 3.5351532567049814e-05, | |
| "loss": 32.5295, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 23.52490421455939, | |
| "grad_norm": 3.5073654651641846, | |
| "learning_rate": 3.5303639846743294e-05, | |
| "loss": 33.3511, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 23.60153256704981, | |
| "grad_norm": 4.5670084953308105, | |
| "learning_rate": 3.525574712643678e-05, | |
| "loss": 33.4249, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 23.67816091954023, | |
| "grad_norm": 2.563405990600586, | |
| "learning_rate": 3.520785440613027e-05, | |
| "loss": 33.821, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 23.754789272030653, | |
| "grad_norm": 3.5928332805633545, | |
| "learning_rate": 3.5159961685823755e-05, | |
| "loss": 32.9252, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 23.831417624521073, | |
| "grad_norm": 3.2677550315856934, | |
| "learning_rate": 3.511206896551724e-05, | |
| "loss": 33.4694, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 23.908045977011493, | |
| "grad_norm": 3.8751015663146973, | |
| "learning_rate": 3.506417624521073e-05, | |
| "loss": 32.7835, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 23.984674329501917, | |
| "grad_norm": 3.955101490020752, | |
| "learning_rate": 3.5016283524904216e-05, | |
| "loss": 32.6658, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 24.0, | |
| "eval_loss": 34.550262451171875, | |
| "eval_runtime": 49.3313, | |
| "eval_samples_per_second": 26.454, | |
| "eval_steps_per_second": 3.324, | |
| "step": 31320 | |
| }, | |
| { | |
| "epoch": 24.061302681992338, | |
| "grad_norm": 3.885087013244629, | |
| "learning_rate": 3.49683908045977e-05, | |
| "loss": 33.5285, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 24.137931034482758, | |
| "grad_norm": 8.908398628234863, | |
| "learning_rate": 3.4920977011494254e-05, | |
| "loss": 33.1673, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 24.21455938697318, | |
| "grad_norm": 4.042150974273682, | |
| "learning_rate": 3.487308429118774e-05, | |
| "loss": 33.0384, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 24.291187739463602, | |
| "grad_norm": 4.992551803588867, | |
| "learning_rate": 3.482519157088123e-05, | |
| "loss": 33.7439, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 24.367816091954023, | |
| "grad_norm": 5.118918418884277, | |
| "learning_rate": 3.4777298850574715e-05, | |
| "loss": 33.5604, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 24.444444444444443, | |
| "grad_norm": 3.2756083011627197, | |
| "learning_rate": 3.47294061302682e-05, | |
| "loss": 33.6225, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 24.521072796934867, | |
| "grad_norm": 2.9864351749420166, | |
| "learning_rate": 3.468151340996169e-05, | |
| "loss": 34.0539, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 24.597701149425287, | |
| "grad_norm": 2.945171356201172, | |
| "learning_rate": 3.463362068965517e-05, | |
| "loss": 33.2655, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 24.674329501915707, | |
| "grad_norm": 4.09877347946167, | |
| "learning_rate": 3.4585727969348656e-05, | |
| "loss": 33.239, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 24.75095785440613, | |
| "grad_norm": 3.7949306964874268, | |
| "learning_rate": 3.453783524904215e-05, | |
| "loss": 32.7246, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 24.82758620689655, | |
| "grad_norm": 3.8750340938568115, | |
| "learning_rate": 3.4489942528735636e-05, | |
| "loss": 32.5477, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 24.904214559386972, | |
| "grad_norm": 3.84676456451416, | |
| "learning_rate": 3.444204980842912e-05, | |
| "loss": 33.5781, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 24.980842911877396, | |
| "grad_norm": 2.3316519260406494, | |
| "learning_rate": 3.439415708812261e-05, | |
| "loss": 33.0241, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "eval_loss": 34.565101623535156, | |
| "eval_runtime": 49.343, | |
| "eval_samples_per_second": 26.448, | |
| "eval_steps_per_second": 3.324, | |
| "step": 32625 | |
| }, | |
| { | |
| "epoch": 25.057471264367816, | |
| "grad_norm": 2.94795823097229, | |
| "learning_rate": 3.43462643678161e-05, | |
| "loss": 33.1012, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 25.134099616858236, | |
| "grad_norm": 2.3455259799957275, | |
| "learning_rate": 3.4298371647509584e-05, | |
| "loss": 33.1345, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 25.21072796934866, | |
| "grad_norm": 2.678739547729492, | |
| "learning_rate": 3.4250478927203064e-05, | |
| "loss": 33.2271, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 25.28735632183908, | |
| "grad_norm": 4.3170952796936035, | |
| "learning_rate": 3.420258620689655e-05, | |
| "loss": 33.0392, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 25.3639846743295, | |
| "grad_norm": 3.8895034790039062, | |
| "learning_rate": 3.415469348659004e-05, | |
| "loss": 33.2535, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 25.440613026819925, | |
| "grad_norm": 3.693235158920288, | |
| "learning_rate": 3.4106800766283525e-05, | |
| "loss": 33.4471, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 25.517241379310345, | |
| "grad_norm": 5.521793365478516, | |
| "learning_rate": 3.405890804597701e-05, | |
| "loss": 34.2142, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 25.593869731800766, | |
| "grad_norm": 2.8983964920043945, | |
| "learning_rate": 3.40110153256705e-05, | |
| "loss": 34.362, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 25.67049808429119, | |
| "grad_norm": 3.329155206680298, | |
| "learning_rate": 3.396360153256705e-05, | |
| "loss": 32.373, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 25.74712643678161, | |
| "grad_norm": 2.6269519329071045, | |
| "learning_rate": 3.391570881226054e-05, | |
| "loss": 33.1401, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 25.82375478927203, | |
| "grad_norm": 3.1628787517547607, | |
| "learning_rate": 3.3867816091954024e-05, | |
| "loss": 33.2718, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 25.900383141762454, | |
| "grad_norm": 3.0653462409973145, | |
| "learning_rate": 3.381992337164751e-05, | |
| "loss": 33.481, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 25.977011494252874, | |
| "grad_norm": 2.5874106884002686, | |
| "learning_rate": 3.377250957854406e-05, | |
| "loss": 33.2467, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 26.0, | |
| "eval_loss": 34.54924392700195, | |
| "eval_runtime": 49.3, | |
| "eval_samples_per_second": 26.471, | |
| "eval_steps_per_second": 3.327, | |
| "step": 33930 | |
| }, | |
| { | |
| "epoch": 26.053639846743295, | |
| "grad_norm": 3.76274037361145, | |
| "learning_rate": 3.372461685823755e-05, | |
| "loss": 33.5457, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 26.130268199233715, | |
| "grad_norm": 5.3265061378479, | |
| "learning_rate": 3.3676724137931036e-05, | |
| "loss": 33.2299, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 26.20689655172414, | |
| "grad_norm": 4.5878987312316895, | |
| "learning_rate": 3.362883141762452e-05, | |
| "loss": 33.7869, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 26.28352490421456, | |
| "grad_norm": 3.673882007598877, | |
| "learning_rate": 3.358093869731801e-05, | |
| "loss": 32.6976, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 26.36015325670498, | |
| "grad_norm": 3.5689809322357178, | |
| "learning_rate": 3.35330459770115e-05, | |
| "loss": 33.6335, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 26.436781609195403, | |
| "grad_norm": 5.735408306121826, | |
| "learning_rate": 3.3485153256704984e-05, | |
| "loss": 33.116, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 26.513409961685824, | |
| "grad_norm": 1.9485822916030884, | |
| "learning_rate": 3.343726053639847e-05, | |
| "loss": 33.9434, | |
| "step": 34600 | |
| }, | |
| { | |
| "epoch": 26.590038314176244, | |
| "grad_norm": 4.049289226531982, | |
| "learning_rate": 3.338936781609196e-05, | |
| "loss": 33.2275, | |
| "step": 34700 | |
| }, | |
| { | |
| "epoch": 26.666666666666668, | |
| "grad_norm": 2.392415761947632, | |
| "learning_rate": 3.334147509578544e-05, | |
| "loss": 33.1124, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 26.743295019157088, | |
| "grad_norm": 3.01650333404541, | |
| "learning_rate": 3.3293582375478924e-05, | |
| "loss": 33.0733, | |
| "step": 34900 | |
| }, | |
| { | |
| "epoch": 26.81992337164751, | |
| "grad_norm": 3.1701831817626953, | |
| "learning_rate": 3.324568965517241e-05, | |
| "loss": 33.0029, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 26.896551724137932, | |
| "grad_norm": 2.6294217109680176, | |
| "learning_rate": 3.3197796934865905e-05, | |
| "loss": 32.9881, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 26.973180076628353, | |
| "grad_norm": 3.4342799186706543, | |
| "learning_rate": 3.314990421455939e-05, | |
| "loss": 33.047, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 27.0, | |
| "eval_loss": 34.47444152832031, | |
| "eval_runtime": 49.2859, | |
| "eval_samples_per_second": 26.478, | |
| "eval_steps_per_second": 3.328, | |
| "step": 35235 | |
| }, | |
| { | |
| "epoch": 27.049808429118773, | |
| "grad_norm": 2.2080352306365967, | |
| "learning_rate": 3.310201149425288e-05, | |
| "loss": 33.0059, | |
| "step": 35300 | |
| }, | |
| { | |
| "epoch": 27.126436781609197, | |
| "grad_norm": 3.0985817909240723, | |
| "learning_rate": 3.3054118773946366e-05, | |
| "loss": 33.8201, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 27.203065134099617, | |
| "grad_norm": 3.165069103240967, | |
| "learning_rate": 3.300622605363985e-05, | |
| "loss": 33.7343, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 27.279693486590038, | |
| "grad_norm": 3.2427308559417725, | |
| "learning_rate": 3.295833333333333e-05, | |
| "loss": 32.8829, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 27.35632183908046, | |
| "grad_norm": 2.973548412322998, | |
| "learning_rate": 3.291044061302682e-05, | |
| "loss": 33.2656, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 27.43295019157088, | |
| "grad_norm": 2.892834424972534, | |
| "learning_rate": 3.2862547892720306e-05, | |
| "loss": 33.085, | |
| "step": 35800 | |
| }, | |
| { | |
| "epoch": 27.509578544061302, | |
| "grad_norm": 2.4037787914276123, | |
| "learning_rate": 3.281465517241379e-05, | |
| "loss": 32.7549, | |
| "step": 35900 | |
| }, | |
| { | |
| "epoch": 27.586206896551722, | |
| "grad_norm": 3.7890052795410156, | |
| "learning_rate": 3.276676245210728e-05, | |
| "loss": 33.4256, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 27.662835249042146, | |
| "grad_norm": 3.4910600185394287, | |
| "learning_rate": 3.271886973180077e-05, | |
| "loss": 33.3707, | |
| "step": 36100 | |
| }, | |
| { | |
| "epoch": 27.739463601532567, | |
| "grad_norm": 2.895573854446411, | |
| "learning_rate": 3.2670977011494254e-05, | |
| "loss": 32.699, | |
| "step": 36200 | |
| }, | |
| { | |
| "epoch": 27.816091954022987, | |
| "grad_norm": 4.670979022979736, | |
| "learning_rate": 3.262308429118774e-05, | |
| "loss": 33.5898, | |
| "step": 36300 | |
| }, | |
| { | |
| "epoch": 27.89272030651341, | |
| "grad_norm": 2.362605571746826, | |
| "learning_rate": 3.257519157088123e-05, | |
| "loss": 33.4235, | |
| "step": 36400 | |
| }, | |
| { | |
| "epoch": 27.96934865900383, | |
| "grad_norm": 4.695677280426025, | |
| "learning_rate": 3.2527298850574715e-05, | |
| "loss": 33.9318, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 28.0, | |
| "eval_loss": 34.4583740234375, | |
| "eval_runtime": 49.3101, | |
| "eval_samples_per_second": 26.465, | |
| "eval_steps_per_second": 3.326, | |
| "step": 36540 | |
| }, | |
| { | |
| "epoch": 28.04597701149425, | |
| "grad_norm": 6.301197052001953, | |
| "learning_rate": 3.24794061302682e-05, | |
| "loss": 33.1986, | |
| "step": 36600 | |
| }, | |
| { | |
| "epoch": 28.122605363984675, | |
| "grad_norm": 3.1395254135131836, | |
| "learning_rate": 3.243151340996169e-05, | |
| "loss": 32.8468, | |
| "step": 36700 | |
| }, | |
| { | |
| "epoch": 28.199233716475096, | |
| "grad_norm": 2.673875331878662, | |
| "learning_rate": 3.2383620689655175e-05, | |
| "loss": 32.7804, | |
| "step": 36800 | |
| }, | |
| { | |
| "epoch": 28.275862068965516, | |
| "grad_norm": 3.807201862335205, | |
| "learning_rate": 3.233572796934866e-05, | |
| "loss": 34.128, | |
| "step": 36900 | |
| }, | |
| { | |
| "epoch": 28.35249042145594, | |
| "grad_norm": 3.2160332202911377, | |
| "learning_rate": 3.228783524904215e-05, | |
| "loss": 33.0419, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 28.42911877394636, | |
| "grad_norm": 3.2508413791656494, | |
| "learning_rate": 3.2239942528735636e-05, | |
| "loss": 33.3642, | |
| "step": 37100 | |
| }, | |
| { | |
| "epoch": 28.50574712643678, | |
| "grad_norm": 4.088146209716797, | |
| "learning_rate": 3.219204980842912e-05, | |
| "loss": 33.1516, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 28.582375478927204, | |
| "grad_norm": 3.4091460704803467, | |
| "learning_rate": 3.214415708812261e-05, | |
| "loss": 33.5824, | |
| "step": 37300 | |
| }, | |
| { | |
| "epoch": 28.659003831417625, | |
| "grad_norm": 3.113368034362793, | |
| "learning_rate": 3.209626436781609e-05, | |
| "loss": 33.2279, | |
| "step": 37400 | |
| }, | |
| { | |
| "epoch": 28.735632183908045, | |
| "grad_norm": 3.7009544372558594, | |
| "learning_rate": 3.2048371647509577e-05, | |
| "loss": 33.0744, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 28.81226053639847, | |
| "grad_norm": 2.046365261077881, | |
| "learning_rate": 3.2000478927203063e-05, | |
| "loss": 33.1949, | |
| "step": 37600 | |
| }, | |
| { | |
| "epoch": 28.88888888888889, | |
| "grad_norm": 3.8142659664154053, | |
| "learning_rate": 3.195258620689655e-05, | |
| "loss": 33.3156, | |
| "step": 37700 | |
| }, | |
| { | |
| "epoch": 28.96551724137931, | |
| "grad_norm": 3.120384454727173, | |
| "learning_rate": 3.190469348659004e-05, | |
| "loss": 32.828, | |
| "step": 37800 | |
| }, | |
| { | |
| "epoch": 29.0, | |
| "eval_loss": 34.519615173339844, | |
| "eval_runtime": 49.3498, | |
| "eval_samples_per_second": 26.444, | |
| "eval_steps_per_second": 3.323, | |
| "step": 37845 | |
| }, | |
| { | |
| "epoch": 29.042145593869733, | |
| "grad_norm": 4.515305042266846, | |
| "learning_rate": 3.185727969348659e-05, | |
| "loss": 33.5753, | |
| "step": 37900 | |
| }, | |
| { | |
| "epoch": 29.118773946360154, | |
| "grad_norm": 3.7501096725463867, | |
| "learning_rate": 3.1809386973180076e-05, | |
| "loss": 33.3131, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 29.195402298850574, | |
| "grad_norm": 3.431818723678589, | |
| "learning_rate": 3.176149425287356e-05, | |
| "loss": 33.085, | |
| "step": 38100 | |
| }, | |
| { | |
| "epoch": 29.272030651340994, | |
| "grad_norm": 3.4503543376922607, | |
| "learning_rate": 3.171360153256705e-05, | |
| "loss": 32.7894, | |
| "step": 38200 | |
| }, | |
| { | |
| "epoch": 29.34865900383142, | |
| "grad_norm": 4.361378192901611, | |
| "learning_rate": 3.1665708812260536e-05, | |
| "loss": 33.4922, | |
| "step": 38300 | |
| }, | |
| { | |
| "epoch": 29.42528735632184, | |
| "grad_norm": 2.354480504989624, | |
| "learning_rate": 3.161781609195402e-05, | |
| "loss": 33.3214, | |
| "step": 38400 | |
| }, | |
| { | |
| "epoch": 29.50191570881226, | |
| "grad_norm": 3.3123044967651367, | |
| "learning_rate": 3.156992337164751e-05, | |
| "loss": 33.3181, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 29.578544061302683, | |
| "grad_norm": 2.3824117183685303, | |
| "learning_rate": 3.1522030651341e-05, | |
| "loss": 33.0926, | |
| "step": 38600 | |
| }, | |
| { | |
| "epoch": 29.655172413793103, | |
| "grad_norm": 2.811178684234619, | |
| "learning_rate": 3.1474137931034484e-05, | |
| "loss": 33.3361, | |
| "step": 38700 | |
| }, | |
| { | |
| "epoch": 29.731800766283524, | |
| "grad_norm": 4.715090751647949, | |
| "learning_rate": 3.142624521072797e-05, | |
| "loss": 32.8444, | |
| "step": 38800 | |
| }, | |
| { | |
| "epoch": 29.808429118773947, | |
| "grad_norm": 2.191209316253662, | |
| "learning_rate": 3.137835249042146e-05, | |
| "loss": 33.9677, | |
| "step": 38900 | |
| }, | |
| { | |
| "epoch": 29.885057471264368, | |
| "grad_norm": 2.606814384460449, | |
| "learning_rate": 3.1330459770114945e-05, | |
| "loss": 33.3536, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 29.961685823754788, | |
| "grad_norm": 4.8533172607421875, | |
| "learning_rate": 3.128256704980843e-05, | |
| "loss": 33.2721, | |
| "step": 39100 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "eval_loss": 34.46094512939453, | |
| "eval_runtime": 49.4265, | |
| "eval_samples_per_second": 26.403, | |
| "eval_steps_per_second": 3.318, | |
| "step": 39150 | |
| }, | |
| { | |
| "epoch": 30.038314176245212, | |
| "grad_norm": 4.915451526641846, | |
| "learning_rate": 3.123467432950192e-05, | |
| "loss": 33.0369, | |
| "step": 39200 | |
| }, | |
| { | |
| "epoch": 30.114942528735632, | |
| "grad_norm": 4.369636058807373, | |
| "learning_rate": 3.1186781609195405e-05, | |
| "loss": 33.1459, | |
| "step": 39300 | |
| }, | |
| { | |
| "epoch": 30.191570881226053, | |
| "grad_norm": 2.9162957668304443, | |
| "learning_rate": 3.113888888888889e-05, | |
| "loss": 32.9688, | |
| "step": 39400 | |
| }, | |
| { | |
| "epoch": 30.268199233716476, | |
| "grad_norm": 4.7777628898620605, | |
| "learning_rate": 3.109099616858238e-05, | |
| "loss": 33.7249, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 30.344827586206897, | |
| "grad_norm": 3.651850700378418, | |
| "learning_rate": 3.104310344827586e-05, | |
| "loss": 33.4887, | |
| "step": 39600 | |
| }, | |
| { | |
| "epoch": 30.421455938697317, | |
| "grad_norm": 3.29491925239563, | |
| "learning_rate": 3.0995210727969346e-05, | |
| "loss": 33.5714, | |
| "step": 39700 | |
| }, | |
| { | |
| "epoch": 30.49808429118774, | |
| "grad_norm": 3.9116616249084473, | |
| "learning_rate": 3.094731800766283e-05, | |
| "loss": 33.7763, | |
| "step": 39800 | |
| }, | |
| { | |
| "epoch": 30.57471264367816, | |
| "grad_norm": null, | |
| "learning_rate": 3.089990421455939e-05, | |
| "loss": 32.1907, | |
| "step": 39900 | |
| }, | |
| { | |
| "epoch": 30.65134099616858, | |
| "grad_norm": 3.237652063369751, | |
| "learning_rate": 3.085201149425287e-05, | |
| "loss": 33.344, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 30.727969348659006, | |
| "grad_norm": 4.286235809326172, | |
| "learning_rate": 3.080459770114943e-05, | |
| "loss": 33.181, | |
| "step": 40100 | |
| }, | |
| { | |
| "epoch": 30.804597701149426, | |
| "grad_norm": 2.6222527027130127, | |
| "learning_rate": 3.075670498084292e-05, | |
| "loss": 33.3407, | |
| "step": 40200 | |
| }, | |
| { | |
| "epoch": 30.881226053639846, | |
| "grad_norm": 3.7431180477142334, | |
| "learning_rate": 3.0708812260536404e-05, | |
| "loss": 33.1109, | |
| "step": 40300 | |
| }, | |
| { | |
| "epoch": 30.957854406130267, | |
| "grad_norm": 3.0706677436828613, | |
| "learning_rate": 3.066091954022989e-05, | |
| "loss": 33.3504, | |
| "step": 40400 | |
| }, | |
| { | |
| "epoch": 31.0, | |
| "eval_loss": 34.48047637939453, | |
| "eval_runtime": 49.4044, | |
| "eval_samples_per_second": 26.415, | |
| "eval_steps_per_second": 3.32, | |
| "step": 40455 | |
| }, | |
| { | |
| "epoch": 31.03448275862069, | |
| "grad_norm": 3.288548231124878, | |
| "learning_rate": 3.061302681992337e-05, | |
| "loss": 33.4014, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 31.11111111111111, | |
| "grad_norm": 4.078604221343994, | |
| "learning_rate": 3.056513409961686e-05, | |
| "loss": 33.5796, | |
| "step": 40600 | |
| }, | |
| { | |
| "epoch": 31.18773946360153, | |
| "grad_norm": 3.589484691619873, | |
| "learning_rate": 3.0517241379310348e-05, | |
| "loss": 32.9547, | |
| "step": 40700 | |
| }, | |
| { | |
| "epoch": 31.264367816091955, | |
| "grad_norm": 3.1043126583099365, | |
| "learning_rate": 3.046934865900383e-05, | |
| "loss": 33.2105, | |
| "step": 40800 | |
| }, | |
| { | |
| "epoch": 31.340996168582375, | |
| "grad_norm": 2.446356773376465, | |
| "learning_rate": 3.0421455938697318e-05, | |
| "loss": 33.1642, | |
| "step": 40900 | |
| }, | |
| { | |
| "epoch": 31.417624521072796, | |
| "grad_norm": 2.966627597808838, | |
| "learning_rate": 3.0373563218390805e-05, | |
| "loss": 32.7751, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 31.49425287356322, | |
| "grad_norm": 4.547020435333252, | |
| "learning_rate": 3.0325670498084292e-05, | |
| "loss": 33.8578, | |
| "step": 41100 | |
| }, | |
| { | |
| "epoch": 31.57088122605364, | |
| "grad_norm": 3.151139259338379, | |
| "learning_rate": 3.0277777777777776e-05, | |
| "loss": 33.2976, | |
| "step": 41200 | |
| }, | |
| { | |
| "epoch": 31.64750957854406, | |
| "grad_norm": 2.8900582790374756, | |
| "learning_rate": 3.0229885057471262e-05, | |
| "loss": 33.1161, | |
| "step": 41300 | |
| }, | |
| { | |
| "epoch": 31.724137931034484, | |
| "grad_norm": 2.5485446453094482, | |
| "learning_rate": 3.0181992337164756e-05, | |
| "loss": 33.596, | |
| "step": 41400 | |
| }, | |
| { | |
| "epoch": 31.800766283524904, | |
| "grad_norm": 2.5474777221679688, | |
| "learning_rate": 3.0134099616858243e-05, | |
| "loss": 33.3569, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 31.877394636015325, | |
| "grad_norm": 3.6182713508605957, | |
| "learning_rate": 3.0086206896551726e-05, | |
| "loss": 32.824, | |
| "step": 41600 | |
| }, | |
| { | |
| "epoch": 31.95402298850575, | |
| "grad_norm": 3.898332118988037, | |
| "learning_rate": 3.0038314176245213e-05, | |
| "loss": 32.8775, | |
| "step": 41700 | |
| }, | |
| { | |
| "epoch": 32.0, | |
| "eval_loss": 34.500526428222656, | |
| "eval_runtime": 49.4041, | |
| "eval_samples_per_second": 26.415, | |
| "eval_steps_per_second": 3.32, | |
| "step": 41760 | |
| }, | |
| { | |
| "epoch": 32.030651340996165, | |
| "grad_norm": 3.481757164001465, | |
| "learning_rate": 2.99904214559387e-05, | |
| "loss": 33.4618, | |
| "step": 41800 | |
| }, | |
| { | |
| "epoch": 32.10727969348659, | |
| "grad_norm": 3.9191551208496094, | |
| "learning_rate": 2.9942528735632187e-05, | |
| "loss": 33.627, | |
| "step": 41900 | |
| }, | |
| { | |
| "epoch": 32.18390804597701, | |
| "grad_norm": 5.722991466522217, | |
| "learning_rate": 2.989463601532567e-05, | |
| "loss": 32.705, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 32.26053639846743, | |
| "grad_norm": 4.626276016235352, | |
| "learning_rate": 2.9846743295019157e-05, | |
| "loss": 33.4211, | |
| "step": 42100 | |
| }, | |
| { | |
| "epoch": 32.337164750957854, | |
| "grad_norm": 2.526745557785034, | |
| "learning_rate": 2.9798850574712644e-05, | |
| "loss": 32.9605, | |
| "step": 42200 | |
| }, | |
| { | |
| "epoch": 32.41379310344828, | |
| "grad_norm": 2.2517364025115967, | |
| "learning_rate": 2.975095785440613e-05, | |
| "loss": 33.1264, | |
| "step": 42300 | |
| }, | |
| { | |
| "epoch": 32.490421455938694, | |
| "grad_norm": 5.5678606033325195, | |
| "learning_rate": 2.9703065134099618e-05, | |
| "loss": 33.1141, | |
| "step": 42400 | |
| }, | |
| { | |
| "epoch": 32.56704980842912, | |
| "grad_norm": 3.7891595363616943, | |
| "learning_rate": 2.96551724137931e-05, | |
| "loss": 33.3294, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 32.64367816091954, | |
| "grad_norm": 3.350956916809082, | |
| "learning_rate": 2.960727969348659e-05, | |
| "loss": 33.6182, | |
| "step": 42600 | |
| }, | |
| { | |
| "epoch": 32.72030651340996, | |
| "grad_norm": 3.138821601867676, | |
| "learning_rate": 2.9559386973180075e-05, | |
| "loss": 33.2219, | |
| "step": 42700 | |
| }, | |
| { | |
| "epoch": 32.79693486590038, | |
| "grad_norm": 3.301961898803711, | |
| "learning_rate": 2.9511494252873566e-05, | |
| "loss": 33.5015, | |
| "step": 42800 | |
| }, | |
| { | |
| "epoch": 32.87356321839081, | |
| "grad_norm": 3.0760138034820557, | |
| "learning_rate": 2.9463601532567052e-05, | |
| "loss": 33.3376, | |
| "step": 42900 | |
| }, | |
| { | |
| "epoch": 32.95019157088122, | |
| "grad_norm": 2.474372625350952, | |
| "learning_rate": 2.941570881226054e-05, | |
| "loss": 32.9016, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 33.0, | |
| "eval_loss": 34.45762252807617, | |
| "eval_runtime": 49.4143, | |
| "eval_samples_per_second": 26.409, | |
| "eval_steps_per_second": 3.319, | |
| "step": 43065 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 104400, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 80, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 10, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 0 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4.644125980055654e+16, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |