{ "best_metric": 34.45762252807617, "best_model_checkpoint": "/kaggle/working/output/checkpoint-43065", "epoch": 33.0, "eval_steps": 500, "global_step": 43065, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07662835249042145, "grad_norm": 9.545656204223633, "learning_rate": 4.9952586206896554e-05, "loss": 58.0015, "step": 100 }, { "epoch": 0.1532567049808429, "grad_norm": 3.9482674598693848, "learning_rate": 4.990469348659004e-05, "loss": 38.502, "step": 200 }, { "epoch": 0.22988505747126436, "grad_norm": 2.5423216819763184, "learning_rate": 4.985680076628353e-05, "loss": 35.7891, "step": 300 }, { "epoch": 0.3065134099616858, "grad_norm": 3.6723568439483643, "learning_rate": 4.9808908045977015e-05, "loss": 34.9999, "step": 400 }, { "epoch": 0.3831417624521073, "grad_norm": 2.0953221321105957, "learning_rate": 4.97610153256705e-05, "loss": 35.9283, "step": 500 }, { "epoch": 0.45977011494252873, "grad_norm": 4.932604789733887, "learning_rate": 4.971312260536399e-05, "loss": 34.5531, "step": 600 }, { "epoch": 0.5363984674329502, "grad_norm": 5.419522762298584, "learning_rate": 4.9665229885057475e-05, "loss": 34.7408, "step": 700 }, { "epoch": 0.6130268199233716, "grad_norm": 3.9690020084381104, "learning_rate": 4.961733716475096e-05, "loss": 34.5521, "step": 800 }, { "epoch": 0.6896551724137931, "grad_norm": 3.3197548389434814, "learning_rate": 4.956944444444445e-05, "loss": 33.4281, "step": 900 }, { "epoch": 0.7662835249042146, "grad_norm": 4.233493328094482, "learning_rate": 4.952155172413793e-05, "loss": 34.3137, "step": 1000 }, { "epoch": 0.842911877394636, "grad_norm": 5.390758037567139, "learning_rate": 4.9473659003831416e-05, "loss": 33.9454, "step": 1100 }, { "epoch": 0.9195402298850575, "grad_norm": 3.419612407684326, "learning_rate": 4.94257662835249e-05, "loss": 34.2298, "step": 1200 }, { "epoch": 0.9961685823754789, "grad_norm": 2.3791182041168213, "learning_rate": 
4.937787356321839e-05, "loss": 33.5481, "step": 1300 }, { "epoch": 1.0, "eval_loss": 35.558197021484375, "eval_runtime": 49.3359, "eval_samples_per_second": 26.451, "eval_steps_per_second": 3.324, "step": 1305 }, { "epoch": 1.0727969348659003, "grad_norm": 3.0501019954681396, "learning_rate": 4.932998084291188e-05, "loss": 34.3557, "step": 1400 }, { "epoch": 1.1494252873563218, "grad_norm": 3.027714252471924, "learning_rate": 4.928208812260537e-05, "loss": 34.2442, "step": 1500 }, { "epoch": 1.2260536398467432, "grad_norm": 3.693758249282837, "learning_rate": 4.923419540229886e-05, "loss": 33.5375, "step": 1600 }, { "epoch": 1.3026819923371646, "grad_norm": 3.7679357528686523, "learning_rate": 4.9186302681992344e-05, "loss": 33.7891, "step": 1700 }, { "epoch": 1.3793103448275863, "grad_norm": 3.2367331981658936, "learning_rate": 4.9138409961685824e-05, "loss": 33.4964, "step": 1800 }, { "epoch": 1.4559386973180077, "grad_norm": 3.6876628398895264, "learning_rate": 4.909051724137931e-05, "loss": 34.7739, "step": 1900 }, { "epoch": 1.5325670498084292, "grad_norm": 1.9550260305404663, "learning_rate": 4.90426245210728e-05, "loss": 34.2552, "step": 2000 }, { "epoch": 1.6091954022988506, "grad_norm": 4.955118656158447, "learning_rate": 4.8994731800766285e-05, "loss": 33.9766, "step": 2100 }, { "epoch": 1.685823754789272, "grad_norm": 6.145394802093506, "learning_rate": 4.894683908045977e-05, "loss": 34.1676, "step": 2200 }, { "epoch": 1.7624521072796935, "grad_norm": 6.15125846862793, "learning_rate": 4.889894636015326e-05, "loss": 34.3084, "step": 2300 }, { "epoch": 1.839080459770115, "grad_norm": 2.647857427597046, "learning_rate": 4.8851053639846746e-05, "loss": 34.6449, "step": 2400 }, { "epoch": 1.9157088122605364, "grad_norm": 4.066762447357178, "learning_rate": 4.880316091954023e-05, "loss": 34.1318, "step": 2500 }, { "epoch": 1.9923371647509578, "grad_norm": 5.785406589508057, "learning_rate": 4.875526819923372e-05, "loss": 34.1303, "step": 2600 }, { "epoch": 
2.0, "eval_loss": 35.211631774902344, "eval_runtime": 49.3338, "eval_samples_per_second": 26.452, "eval_steps_per_second": 3.324, "step": 2610 }, { "epoch": 2.0689655172413794, "grad_norm": 6.074384689331055, "learning_rate": 4.8707375478927206e-05, "loss": 33.6587, "step": 2700 }, { "epoch": 2.1455938697318007, "grad_norm": 3.770009994506836, "learning_rate": 4.865948275862069e-05, "loss": 34.5023, "step": 2800 }, { "epoch": 2.2222222222222223, "grad_norm": 4.6336140632629395, "learning_rate": 4.861159003831418e-05, "loss": 34.1806, "step": 2900 }, { "epoch": 2.2988505747126435, "grad_norm": 5.440792083740234, "learning_rate": 4.856369731800767e-05, "loss": 34.6645, "step": 3000 }, { "epoch": 2.375478927203065, "grad_norm": 2.98138165473938, "learning_rate": 4.8515804597701154e-05, "loss": 34.1371, "step": 3100 }, { "epoch": 2.4521072796934864, "grad_norm": 2.4175803661346436, "learning_rate": 4.846791187739464e-05, "loss": 33.8015, "step": 3200 }, { "epoch": 2.528735632183908, "grad_norm": 3.846370220184326, "learning_rate": 4.842001915708813e-05, "loss": 34.0589, "step": 3300 }, { "epoch": 2.6053639846743293, "grad_norm": 4.001793384552002, "learning_rate": 4.8372126436781614e-05, "loss": 33.7327, "step": 3400 }, { "epoch": 2.681992337164751, "grad_norm": 3.7779624462127686, "learning_rate": 4.83242337164751e-05, "loss": 34.3508, "step": 3500 }, { "epoch": 2.7586206896551726, "grad_norm": 3.5112695693969727, "learning_rate": 4.827634099616858e-05, "loss": 33.5653, "step": 3600 }, { "epoch": 2.835249042145594, "grad_norm": 2.3443048000335693, "learning_rate": 4.822844827586207e-05, "loss": 33.798, "step": 3700 }, { "epoch": 2.9118773946360155, "grad_norm": 2.5035479068756104, "learning_rate": 4.8180555555555555e-05, "loss": 33.4353, "step": 3800 }, { "epoch": 2.9885057471264367, "grad_norm": 3.4322028160095215, "learning_rate": 4.813266283524904e-05, "loss": 33.948, "step": 3900 }, { "epoch": 3.0, "eval_loss": 35.00273132324219, "eval_runtime": 49.3242, 
"eval_samples_per_second": 26.458, "eval_steps_per_second": 3.325, "step": 3915 }, { "epoch": 3.0651340996168583, "grad_norm": 2.8833682537078857, "learning_rate": 4.808477011494253e-05, "loss": 34.523, "step": 4000 }, { "epoch": 3.1417624521072796, "grad_norm": 2.8744261264801025, "learning_rate": 4.803735632183908e-05, "loss": 33.921, "step": 4100 }, { "epoch": 3.218390804597701, "grad_norm": 2.928616762161255, "learning_rate": 4.798946360153257e-05, "loss": 33.6903, "step": 4200 }, { "epoch": 3.2950191570881224, "grad_norm": 3.0579280853271484, "learning_rate": 4.7941570881226054e-05, "loss": 33.0608, "step": 4300 }, { "epoch": 3.371647509578544, "grad_norm": 1.6688510179519653, "learning_rate": 4.789367816091954e-05, "loss": 33.8769, "step": 4400 }, { "epoch": 3.4482758620689653, "grad_norm": 2.6190459728240967, "learning_rate": 4.784578544061303e-05, "loss": 33.2974, "step": 4500 }, { "epoch": 3.524904214559387, "grad_norm": 2.6260671615600586, "learning_rate": 4.7797892720306515e-05, "loss": 34.0589, "step": 4600 }, { "epoch": 3.6015325670498086, "grad_norm": 3.191978693008423, "learning_rate": 4.775e-05, "loss": 33.9493, "step": 4700 }, { "epoch": 3.67816091954023, "grad_norm": 2.759941339492798, "learning_rate": 4.770210727969349e-05, "loss": 33.5936, "step": 4800 }, { "epoch": 3.7547892720306515, "grad_norm": 2.262294054031372, "learning_rate": 4.7654214559386976e-05, "loss": 34.06, "step": 4900 }, { "epoch": 3.8314176245210727, "grad_norm": 4.6808600425720215, "learning_rate": 4.760632183908046e-05, "loss": 34.1592, "step": 5000 }, { "epoch": 3.9080459770114944, "grad_norm": 4.294464111328125, "learning_rate": 4.755842911877395e-05, "loss": 34.4652, "step": 5100 }, { "epoch": 3.9846743295019156, "grad_norm": 2.7845072746276855, "learning_rate": 4.7510536398467436e-05, "loss": 34.2075, "step": 5200 }, { "epoch": 4.0, "eval_loss": 34.954986572265625, "eval_runtime": 49.2865, "eval_samples_per_second": 26.478, "eval_steps_per_second": 3.327, "step": 5220 }, 
{ "epoch": 4.061302681992337, "grad_norm": 4.420943260192871, "learning_rate": 4.746264367816092e-05, "loss": 34.5735, "step": 5300 }, { "epoch": 4.137931034482759, "grad_norm": 2.898287534713745, "learning_rate": 4.741475095785441e-05, "loss": 34.0739, "step": 5400 }, { "epoch": 4.21455938697318, "grad_norm": 4.703996658325195, "learning_rate": 4.73668582375479e-05, "loss": 33.7022, "step": 5500 }, { "epoch": 4.291187739463601, "grad_norm": 2.2913658618927, "learning_rate": 4.7318965517241384e-05, "loss": 33.6581, "step": 5600 }, { "epoch": 4.3678160919540225, "grad_norm": 3.895615339279175, "learning_rate": 4.727107279693487e-05, "loss": 34.0314, "step": 5700 }, { "epoch": 4.444444444444445, "grad_norm": 4.635524749755859, "learning_rate": 4.722318007662835e-05, "loss": 34.5266, "step": 5800 }, { "epoch": 4.521072796934866, "grad_norm": 3.451066017150879, "learning_rate": 4.717528735632184e-05, "loss": 33.1786, "step": 5900 }, { "epoch": 4.597701149425287, "grad_norm": 2.552107810974121, "learning_rate": 4.7127394636015325e-05, "loss": 33.6118, "step": 6000 }, { "epoch": 4.674329501915709, "grad_norm": 2.359786033630371, "learning_rate": 4.707998084291188e-05, "loss": 33.9903, "step": 6100 }, { "epoch": 4.75095785440613, "grad_norm": 2.2611875534057617, "learning_rate": 4.703208812260537e-05, "loss": 34.0762, "step": 6200 }, { "epoch": 4.827586206896552, "grad_norm": 1.8199210166931152, "learning_rate": 4.698419540229885e-05, "loss": 33.6635, "step": 6300 }, { "epoch": 4.904214559386973, "grad_norm": 2.7332305908203125, "learning_rate": 4.693630268199234e-05, "loss": 33.0946, "step": 6400 }, { "epoch": 4.980842911877395, "grad_norm": 2.9454078674316406, "learning_rate": 4.6888409961685824e-05, "loss": 33.9173, "step": 6500 }, { "epoch": 5.0, "eval_loss": 34.924800872802734, "eval_runtime": 49.3002, "eval_samples_per_second": 26.47, "eval_steps_per_second": 3.327, "step": 6525 }, { "epoch": 5.057471264367816, "grad_norm": 2.3083884716033936, "learning_rate": 
4.684051724137931e-05, "loss": 33.8987, "step": 6600 }, { "epoch": 5.134099616858237, "grad_norm": 2.228327751159668, "learning_rate": 4.67926245210728e-05, "loss": 33.8189, "step": 6700 }, { "epoch": 5.210727969348659, "grad_norm": 3.6814918518066406, "learning_rate": 4.6744731800766284e-05, "loss": 33.8364, "step": 6800 }, { "epoch": 5.287356321839081, "grad_norm": 2.5758285522460938, "learning_rate": 4.669683908045977e-05, "loss": 33.7093, "step": 6900 }, { "epoch": 5.363984674329502, "grad_norm": 4.175839900970459, "learning_rate": 4.6648946360153265e-05, "loss": 33.6689, "step": 7000 }, { "epoch": 5.440613026819923, "grad_norm": 2.213092088699341, "learning_rate": 4.6601053639846745e-05, "loss": 33.7936, "step": 7100 }, { "epoch": 5.517241379310345, "grad_norm": 2.4982571601867676, "learning_rate": 4.655316091954023e-05, "loss": 33.3686, "step": 7200 }, { "epoch": 5.593869731800766, "grad_norm": 3.635983943939209, "learning_rate": 4.6505747126436784e-05, "loss": 33.5493, "step": 7300 }, { "epoch": 5.670498084291188, "grad_norm": 4.315894603729248, "learning_rate": 4.645785440613027e-05, "loss": 33.6607, "step": 7400 }, { "epoch": 5.747126436781609, "grad_norm": 2.6151223182678223, "learning_rate": 4.640996168582376e-05, "loss": 34.7535, "step": 7500 }, { "epoch": 5.823754789272031, "grad_norm": 4.03953218460083, "learning_rate": 4.6362068965517244e-05, "loss": 33.9865, "step": 7600 }, { "epoch": 5.900383141762452, "grad_norm": 2.512362480163574, "learning_rate": 4.6314176245210724e-05, "loss": 33.0343, "step": 7700 }, { "epoch": 5.977011494252873, "grad_norm": 4.745575428009033, "learning_rate": 4.626628352490422e-05, "loss": 33.4544, "step": 7800 }, { "epoch": 6.0, "eval_loss": 34.841033935546875, "eval_runtime": 49.3059, "eval_samples_per_second": 26.467, "eval_steps_per_second": 3.326, "step": 7830 }, { "epoch": 6.053639846743295, "grad_norm": 2.996056079864502, "learning_rate": 4.6218390804597705e-05, "loss": 33.631, "step": 7900 }, { "epoch": 
6.130268199233717, "grad_norm": 3.3260300159454346, "learning_rate": 4.617049808429119e-05, "loss": 33.9222, "step": 8000 }, { "epoch": 6.206896551724138, "grad_norm": 2.214486598968506, "learning_rate": 4.612260536398468e-05, "loss": 32.9576, "step": 8100 }, { "epoch": 6.283524904214559, "grad_norm": 3.6611664295196533, "learning_rate": 4.6074712643678166e-05, "loss": 33.5231, "step": 8200 }, { "epoch": 6.360153256704981, "grad_norm": 2.582730770111084, "learning_rate": 4.602681992337165e-05, "loss": 33.6936, "step": 8300 }, { "epoch": 6.436781609195402, "grad_norm": 2.739861488342285, "learning_rate": 4.597892720306514e-05, "loss": 33.3997, "step": 8400 }, { "epoch": 6.513409961685824, "grad_norm": 2.2102463245391846, "learning_rate": 4.593103448275862e-05, "loss": 33.9374, "step": 8500 }, { "epoch": 6.590038314176245, "grad_norm": 3.83150577545166, "learning_rate": 4.5883141762452106e-05, "loss": 33.9961, "step": 8600 }, { "epoch": 6.666666666666667, "grad_norm": 3.981616735458374, "learning_rate": 4.583524904214559e-05, "loss": 33.5413, "step": 8700 }, { "epoch": 6.743295019157088, "grad_norm": 2.3303332328796387, "learning_rate": 4.578735632183908e-05, "loss": 34.0529, "step": 8800 }, { "epoch": 6.819923371647509, "grad_norm": 3.9573702812194824, "learning_rate": 4.573946360153257e-05, "loss": 33.2897, "step": 8900 }, { "epoch": 6.896551724137931, "grad_norm": 2.6185879707336426, "learning_rate": 4.5691570881226054e-05, "loss": 34.0662, "step": 9000 }, { "epoch": 6.973180076628353, "grad_norm": 3.1155271530151367, "learning_rate": 4.564367816091955e-05, "loss": 33.517, "step": 9100 }, { "epoch": 7.0, "eval_loss": 34.818748474121094, "eval_runtime": 49.3029, "eval_samples_per_second": 26.469, "eval_steps_per_second": 3.326, "step": 9135 }, { "epoch": 7.049808429118774, "grad_norm": 3.117553472518921, "learning_rate": 4.5595785440613034e-05, "loss": 34.1218, "step": 9200 }, { "epoch": 7.126436781609195, "grad_norm": 2.5572612285614014, "learning_rate": 
4.5547892720306515e-05, "loss": 33.662, "step": 9300 }, { "epoch": 7.203065134099617, "grad_norm": 3.5347042083740234, "learning_rate": 4.55e-05, "loss": 34.4668, "step": 9400 }, { "epoch": 7.2796934865900385, "grad_norm": 1.9216647148132324, "learning_rate": 4.545210727969349e-05, "loss": 33.4468, "step": 9500 }, { "epoch": 7.35632183908046, "grad_norm": 4.242152214050293, "learning_rate": 4.5404214559386975e-05, "loss": 33.5805, "step": 9600 }, { "epoch": 7.432950191570881, "grad_norm": 2.9310567378997803, "learning_rate": 4.535632183908046e-05, "loss": 34.0603, "step": 9700 }, { "epoch": 7.509578544061303, "grad_norm": 2.6573023796081543, "learning_rate": 4.530842911877395e-05, "loss": 33.8766, "step": 9800 }, { "epoch": 7.586206896551724, "grad_norm": 2.7849409580230713, "learning_rate": 4.5260536398467436e-05, "loss": 33.6309, "step": 9900 }, { "epoch": 7.662835249042145, "grad_norm": 2.7377357482910156, "learning_rate": 4.521264367816092e-05, "loss": 33.3621, "step": 10000 }, { "epoch": 7.739463601532567, "grad_norm": 2.106233835220337, "learning_rate": 4.516475095785441e-05, "loss": 33.4172, "step": 10100 }, { "epoch": 7.816091954022989, "grad_norm": 2.1989126205444336, "learning_rate": 4.5116858237547896e-05, "loss": 33.5937, "step": 10200 }, { "epoch": 7.89272030651341, "grad_norm": 2.903721570968628, "learning_rate": 4.5068965517241377e-05, "loss": 33.7935, "step": 10300 }, { "epoch": 7.969348659003831, "grad_norm": 2.061602830886841, "learning_rate": 4.5021072796934863e-05, "loss": 33.3289, "step": 10400 }, { "epoch": 8.0, "eval_loss": 34.95075607299805, "eval_runtime": 49.3237, "eval_samples_per_second": 26.458, "eval_steps_per_second": 3.325, "step": 10440 }, { "epoch": 8.045977011494253, "grad_norm": 1.8656938076019287, "learning_rate": 4.497318007662836e-05, "loss": 33.8404, "step": 10500 }, { "epoch": 8.122605363984674, "grad_norm": 2.783926486968994, "learning_rate": 4.4925287356321844e-05, "loss": 33.9544, "step": 10600 }, { "epoch": 
8.199233716475096, "grad_norm": 2.175081968307495, "learning_rate": 4.487739463601533e-05, "loss": 33.6405, "step": 10700 }, { "epoch": 8.275862068965518, "grad_norm": 4.121524333953857, "learning_rate": 4.482950191570882e-05, "loss": 33.568, "step": 10800 }, { "epoch": 8.352490421455938, "grad_norm": 3.978410482406616, "learning_rate": 4.4781609195402305e-05, "loss": 33.6659, "step": 10900 }, { "epoch": 8.42911877394636, "grad_norm": 3.0454840660095215, "learning_rate": 4.473419540229885e-05, "loss": 33.2689, "step": 11000 }, { "epoch": 8.505747126436782, "grad_norm": 3.169114828109741, "learning_rate": 4.4686302681992336e-05, "loss": 33.6227, "step": 11100 }, { "epoch": 8.582375478927203, "grad_norm": 2.5880959033966064, "learning_rate": 4.463840996168582e-05, "loss": 33.3022, "step": 11200 }, { "epoch": 8.659003831417625, "grad_norm": 2.1367762088775635, "learning_rate": 4.459051724137932e-05, "loss": 33.2851, "step": 11300 }, { "epoch": 8.735632183908045, "grad_norm": 3.0278782844543457, "learning_rate": 4.4542624521072804e-05, "loss": 33.922, "step": 11400 }, { "epoch": 8.812260536398467, "grad_norm": 2.6361653804779053, "learning_rate": 4.4494731800766284e-05, "loss": 33.1482, "step": 11500 }, { "epoch": 8.88888888888889, "grad_norm": 2.7836809158325195, "learning_rate": 4.444683908045977e-05, "loss": 34.1345, "step": 11600 }, { "epoch": 8.96551724137931, "grad_norm": 2.519681453704834, "learning_rate": 4.439894636015326e-05, "loss": 34.0642, "step": 11700 }, { "epoch": 9.0, "eval_loss": 34.75983428955078, "eval_runtime": 49.3463, "eval_samples_per_second": 26.446, "eval_steps_per_second": 3.323, "step": 11745 }, { "epoch": 9.042145593869732, "grad_norm": 6.431031703948975, "learning_rate": 4.4351053639846745e-05, "loss": 33.6431, "step": 11800 }, { "epoch": 9.118773946360154, "grad_norm": 3.262486457824707, "learning_rate": 4.430316091954023e-05, "loss": 32.9398, "step": 11900 }, { "epoch": 9.195402298850574, "grad_norm": 1.945741057395935, "learning_rate": 
4.425526819923372e-05, "loss": 32.7256, "step": 12000 }, { "epoch": 9.272030651340996, "grad_norm": 5.09276008605957, "learning_rate": 4.4207375478927205e-05, "loss": 33.9015, "step": 12100 }, { "epoch": 9.348659003831418, "grad_norm": 3.785059928894043, "learning_rate": 4.415948275862069e-05, "loss": 33.6765, "step": 12200 }, { "epoch": 9.425287356321839, "grad_norm": 2.4255340099334717, "learning_rate": 4.411159003831418e-05, "loss": 33.1262, "step": 12300 }, { "epoch": 9.50191570881226, "grad_norm": 5.869349479675293, "learning_rate": 4.4063697318007666e-05, "loss": 33.2205, "step": 12400 }, { "epoch": 9.578544061302683, "grad_norm": 2.361865997314453, "learning_rate": 4.4015804597701146e-05, "loss": 34.0441, "step": 12500 }, { "epoch": 9.655172413793103, "grad_norm": 2.6989896297454834, "learning_rate": 4.396791187739464e-05, "loss": 33.6812, "step": 12600 }, { "epoch": 9.731800766283525, "grad_norm": 2.6094741821289062, "learning_rate": 4.3920019157088127e-05, "loss": 33.9178, "step": 12700 }, { "epoch": 9.808429118773946, "grad_norm": 2.4616310596466064, "learning_rate": 4.3872126436781613e-05, "loss": 34.5233, "step": 12800 }, { "epoch": 9.885057471264368, "grad_norm": 2.7729408740997314, "learning_rate": 4.38242337164751e-05, "loss": 33.378, "step": 12900 }, { "epoch": 9.96168582375479, "grad_norm": 2.5230519771575928, "learning_rate": 4.377634099616859e-05, "loss": 33.442, "step": 13000 }, { "epoch": 10.0, "eval_loss": 34.700294494628906, "eval_runtime": 49.2926, "eval_samples_per_second": 26.475, "eval_steps_per_second": 3.327, "step": 13050 }, { "epoch": 10.03831417624521, "grad_norm": 2.5322816371917725, "learning_rate": 4.3728448275862074e-05, "loss": 33.8873, "step": 13100 }, { "epoch": 10.114942528735632, "grad_norm": 2.1063241958618164, "learning_rate": 4.368103448275862e-05, "loss": 33.871, "step": 13200 }, { "epoch": 10.191570881226054, "grad_norm": 3.7001326084136963, "learning_rate": 4.3633141762452106e-05, "loss": 34.5129, "step": 13300 }, { 
"epoch": 10.268199233716475, "grad_norm": 1.8534705638885498, "learning_rate": 4.35852490421456e-05, "loss": 33.7739, "step": 13400 }, { "epoch": 10.344827586206897, "grad_norm": 1.9871069192886353, "learning_rate": 4.3537356321839086e-05, "loss": 33.4124, "step": 13500 }, { "epoch": 10.421455938697317, "grad_norm": 2.264529228210449, "learning_rate": 4.348946360153257e-05, "loss": 33.24, "step": 13600 }, { "epoch": 10.49808429118774, "grad_norm": 3.0297787189483643, "learning_rate": 4.344157088122606e-05, "loss": 33.2922, "step": 13700 }, { "epoch": 10.574712643678161, "grad_norm": 2.7185864448547363, "learning_rate": 4.339367816091954e-05, "loss": 33.4859, "step": 13800 }, { "epoch": 10.651340996168582, "grad_norm": 3.8887524604797363, "learning_rate": 4.334578544061303e-05, "loss": 33.4322, "step": 13900 }, { "epoch": 10.727969348659004, "grad_norm": 2.5119857788085938, "learning_rate": 4.3297892720306514e-05, "loss": 33.6234, "step": 14000 }, { "epoch": 10.804597701149426, "grad_norm": 3.2969565391540527, "learning_rate": 4.325e-05, "loss": 33.4341, "step": 14100 }, { "epoch": 10.881226053639846, "grad_norm": 3.3629229068756104, "learning_rate": 4.320210727969349e-05, "loss": 32.7636, "step": 14200 }, { "epoch": 10.957854406130268, "grad_norm": 3.0765013694763184, "learning_rate": 4.3154214559386975e-05, "loss": 33.7066, "step": 14300 }, { "epoch": 11.0, "eval_loss": 34.70278549194336, "eval_runtime": 49.2928, "eval_samples_per_second": 26.474, "eval_steps_per_second": 3.327, "step": 14355 }, { "epoch": 11.03448275862069, "grad_norm": 2.7724273204803467, "learning_rate": 4.310632183908046e-05, "loss": 33.7759, "step": 14400 }, { "epoch": 11.11111111111111, "grad_norm": 3.9663071632385254, "learning_rate": 4.305842911877395e-05, "loss": 33.6063, "step": 14500 }, { "epoch": 11.187739463601533, "grad_norm": 2.53495717048645, "learning_rate": 4.3010536398467435e-05, "loss": 32.9251, "step": 14600 }, { "epoch": 11.264367816091955, "grad_norm": 3.928633689880371, 
"learning_rate": 4.296264367816092e-05, "loss": 33.41, "step": 14700 }, { "epoch": 11.340996168582375, "grad_norm": 1.888804316520691, "learning_rate": 4.291475095785441e-05, "loss": 33.147, "step": 14800 }, { "epoch": 11.417624521072797, "grad_norm": 3.151488780975342, "learning_rate": 4.2866858237547896e-05, "loss": 34.011, "step": 14900 }, { "epoch": 11.494252873563218, "grad_norm": 2.659867286682129, "learning_rate": 4.281896551724138e-05, "loss": 33.3559, "step": 15000 }, { "epoch": 11.57088122605364, "grad_norm": 4.092405319213867, "learning_rate": 4.277107279693487e-05, "loss": 33.2301, "step": 15100 }, { "epoch": 11.647509578544062, "grad_norm": 4.295740127563477, "learning_rate": 4.2723659003831415e-05, "loss": 33.1047, "step": 15200 }, { "epoch": 11.724137931034482, "grad_norm": 2.4472806453704834, "learning_rate": 4.26757662835249e-05, "loss": 33.8206, "step": 15300 }, { "epoch": 11.800766283524904, "grad_norm": 2.716550350189209, "learning_rate": 4.262787356321839e-05, "loss": 33.7173, "step": 15400 }, { "epoch": 11.877394636015326, "grad_norm": 3.1278491020202637, "learning_rate": 4.257998084291188e-05, "loss": 34.0344, "step": 15500 }, { "epoch": 11.954022988505747, "grad_norm": 2.4835212230682373, "learning_rate": 4.253208812260537e-05, "loss": 33.8397, "step": 15600 }, { "epoch": 12.0, "eval_loss": 34.70100402832031, "eval_runtime": 49.2554, "eval_samples_per_second": 26.495, "eval_steps_per_second": 3.33, "step": 15660 }, { "epoch": 12.030651340996169, "grad_norm": 2.331453800201416, "learning_rate": 4.2484195402298856e-05, "loss": 32.9794, "step": 15700 }, { "epoch": 12.10727969348659, "grad_norm": 2.2127463817596436, "learning_rate": 4.243630268199234e-05, "loss": 33.6367, "step": 15800 }, { "epoch": 12.183908045977011, "grad_norm": 3.1127703189849854, "learning_rate": 4.238840996168583e-05, "loss": 32.7221, "step": 15900 }, { "epoch": 12.260536398467433, "grad_norm": 2.5665576457977295, "learning_rate": 4.234051724137931e-05, "loss": 33.7796, 
"step": 16000 }, { "epoch": 12.337164750957854, "grad_norm": 2.995265245437622, "learning_rate": 4.22926245210728e-05, "loss": 32.8062, "step": 16100 }, { "epoch": 12.413793103448276, "grad_norm": 3.4698216915130615, "learning_rate": 4.2244731800766284e-05, "loss": 33.5182, "step": 16200 }, { "epoch": 12.490421455938698, "grad_norm": 4.030599117279053, "learning_rate": 4.219683908045977e-05, "loss": 33.7621, "step": 16300 }, { "epoch": 12.567049808429118, "grad_norm": 2.277189254760742, "learning_rate": 4.214894636015326e-05, "loss": 33.7926, "step": 16400 }, { "epoch": 12.64367816091954, "grad_norm": 2.3156633377075195, "learning_rate": 4.2101053639846744e-05, "loss": 33.869, "step": 16500 }, { "epoch": 12.720306513409962, "grad_norm": 3.5089361667633057, "learning_rate": 4.205316091954023e-05, "loss": 33.6732, "step": 16600 }, { "epoch": 12.796934865900383, "grad_norm": 2.5379600524902344, "learning_rate": 4.200526819923372e-05, "loss": 33.5854, "step": 16700 }, { "epoch": 12.873563218390805, "grad_norm": 2.5784411430358887, "learning_rate": 4.1957375478927205e-05, "loss": 33.2835, "step": 16800 }, { "epoch": 12.950191570881227, "grad_norm": 2.574859380722046, "learning_rate": 4.190948275862069e-05, "loss": 33.8945, "step": 16900 }, { "epoch": 13.0, "eval_loss": 34.72227478027344, "eval_runtime": 49.2549, "eval_samples_per_second": 26.495, "eval_steps_per_second": 3.33, "step": 16965 }, { "epoch": 13.026819923371647, "grad_norm": 3.8546385765075684, "learning_rate": 4.186159003831418e-05, "loss": 33.1455, "step": 17000 }, { "epoch": 13.10344827586207, "grad_norm": 3.751404047012329, "learning_rate": 4.1813697318007665e-05, "loss": 33.7843, "step": 17100 }, { "epoch": 13.18007662835249, "grad_norm": 3.0844898223876953, "learning_rate": 4.176580459770115e-05, "loss": 32.8163, "step": 17200 }, { "epoch": 13.256704980842912, "grad_norm": 1.7570416927337646, "learning_rate": 4.1718390804597704e-05, "loss": 33.3296, "step": 17300 }, { "epoch": 13.333333333333334, 
"grad_norm": 2.5809695720672607, "learning_rate": 4.1670498084291184e-05, "loss": 34.1621, "step": 17400 }, { "epoch": 13.409961685823754, "grad_norm": 2.564545154571533, "learning_rate": 4.162260536398467e-05, "loss": 33.4641, "step": 17500 }, { "epoch": 13.486590038314176, "grad_norm": 3.2340521812438965, "learning_rate": 4.1574712643678165e-05, "loss": 33.5958, "step": 17600 }, { "epoch": 13.563218390804598, "grad_norm": 4.329983711242676, "learning_rate": 4.152681992337165e-05, "loss": 33.53, "step": 17700 }, { "epoch": 13.639846743295019, "grad_norm": 2.3342621326446533, "learning_rate": 4.147892720306514e-05, "loss": 33.7702, "step": 17800 }, { "epoch": 13.71647509578544, "grad_norm": 2.6764466762542725, "learning_rate": 4.1431034482758625e-05, "loss": 33.6024, "step": 17900 }, { "epoch": 13.793103448275861, "grad_norm": 5.089807033538818, "learning_rate": 4.138314176245211e-05, "loss": 32.9291, "step": 18000 }, { "epoch": 13.869731800766283, "grad_norm": 2.4803364276885986, "learning_rate": 4.13352490421456e-05, "loss": 33.2098, "step": 18100 }, { "epoch": 13.946360153256705, "grad_norm": 3.0112080574035645, "learning_rate": 4.128735632183908e-05, "loss": 33.7988, "step": 18200 }, { "epoch": 14.0, "eval_loss": 34.82696533203125, "eval_runtime": 49.261, "eval_samples_per_second": 26.492, "eval_steps_per_second": 3.329, "step": 18270 }, { "epoch": 14.022988505747126, "grad_norm": 3.0625782012939453, "learning_rate": 4.1239463601532566e-05, "loss": 33.4482, "step": 18300 }, { "epoch": 14.099616858237548, "grad_norm": 2.5372705459594727, "learning_rate": 4.119157088122605e-05, "loss": 33.284, "step": 18400 }, { "epoch": 14.17624521072797, "grad_norm": 2.9518911838531494, "learning_rate": 4.114367816091954e-05, "loss": 33.4866, "step": 18500 }, { "epoch": 14.25287356321839, "grad_norm": 2.1386337280273438, "learning_rate": 4.109578544061303e-05, "loss": 33.653, "step": 18600 }, { "epoch": 14.329501915708812, "grad_norm": 2.1180756092071533, "learning_rate": 
4.1047892720306514e-05, "loss": 34.3663, "step": 18700 }, { "epoch": 14.406130268199234, "grad_norm": 3.0451836585998535, "learning_rate": 4.1e-05, "loss": 32.7698, "step": 18800 }, { "epoch": 14.482758620689655, "grad_norm": 3.8517203330993652, "learning_rate": 4.0952107279693494e-05, "loss": 33.3581, "step": 18900 }, { "epoch": 14.559386973180077, "grad_norm": 2.322065830230713, "learning_rate": 4.0904214559386974e-05, "loss": 33.3386, "step": 19000 }, { "epoch": 14.636015325670499, "grad_norm": 2.604886054992676, "learning_rate": 4.085632183908046e-05, "loss": 33.8964, "step": 19100 }, { "epoch": 14.71264367816092, "grad_norm": 3.6753382682800293, "learning_rate": 4.080842911877395e-05, "loss": 32.9918, "step": 19200 }, { "epoch": 14.789272030651341, "grad_norm": 3.1375985145568848, "learning_rate": 4.07610153256705e-05, "loss": 33.5981, "step": 19300 }, { "epoch": 14.865900383141762, "grad_norm": 3.9305307865142822, "learning_rate": 4.071312260536399e-05, "loss": 34.0074, "step": 19400 }, { "epoch": 14.942528735632184, "grad_norm": 3.2952847480773926, "learning_rate": 4.0665229885057473e-05, "loss": 33.0554, "step": 19500 }, { "epoch": 15.0, "eval_loss": 34.7192268371582, "eval_runtime": 49.2319, "eval_samples_per_second": 26.507, "eval_steps_per_second": 3.331, "step": 19575 }, { "epoch": 15.019157088122606, "grad_norm": 3.291614294052124, "learning_rate": 4.061733716475096e-05, "loss": 32.9437, "step": 19600 }, { "epoch": 15.095785440613026, "grad_norm": 4.4670867919921875, "learning_rate": 4.056944444444445e-05, "loss": 33.6879, "step": 19700 }, { "epoch": 15.172413793103448, "grad_norm": 3.4122018814086914, "learning_rate": 4.0521551724137934e-05, "loss": 33.0167, "step": 19800 }, { "epoch": 15.24904214559387, "grad_norm": 3.854083299636841, "learning_rate": 4.047365900383142e-05, "loss": 33.8342, "step": 19900 }, { "epoch": 15.32567049808429, "grad_norm": 2.945396900177002, "learning_rate": 4.042576628352491e-05, "loss": 32.3812, "step": 20000 }, { 
"epoch": 15.402298850574713, "grad_norm": 2.5246341228485107, "learning_rate": 4.0377873563218395e-05, "loss": 33.3573, "step": 20100 }, { "epoch": 15.478927203065133, "grad_norm": 2.837134599685669, "learning_rate": 4.032998084291188e-05, "loss": 33.5981, "step": 20200 }, { "epoch": 15.555555555555555, "grad_norm": 4.350450038909912, "learning_rate": 4.028208812260537e-05, "loss": 34.0699, "step": 20300 }, { "epoch": 15.632183908045977, "grad_norm": 2.4908435344696045, "learning_rate": 4.0234195402298855e-05, "loss": 33.8105, "step": 20400 }, { "epoch": 15.708812260536398, "grad_norm": 2.9461615085601807, "learning_rate": 4.0186302681992336e-05, "loss": 33.3251, "step": 20500 }, { "epoch": 15.78544061302682, "grad_norm": 2.8716940879821777, "learning_rate": 4.013840996168582e-05, "loss": 33.7594, "step": 20600 }, { "epoch": 15.862068965517242, "grad_norm": 2.7166991233825684, "learning_rate": 4.009051724137931e-05, "loss": 33.58, "step": 20700 }, { "epoch": 15.938697318007662, "grad_norm": 2.2878618240356445, "learning_rate": 4.0042624521072796e-05, "loss": 33.4573, "step": 20800 }, { "epoch": 16.0, "eval_loss": 34.54485321044922, "eval_runtime": 49.3188, "eval_samples_per_second": 26.46, "eval_steps_per_second": 3.325, "step": 20880 }, { "epoch": 16.015325670498083, "grad_norm": 2.970867156982422, "learning_rate": 3.999473180076628e-05, "loss": 33.5118, "step": 20900 }, { "epoch": 16.091954022988507, "grad_norm": 2.395005464553833, "learning_rate": 3.994683908045978e-05, "loss": 34.1932, "step": 21000 }, { "epoch": 16.168582375478927, "grad_norm": 2.8175065517425537, "learning_rate": 3.9898946360153264e-05, "loss": 32.9815, "step": 21100 }, { "epoch": 16.245210727969347, "grad_norm": 4.665389537811279, "learning_rate": 3.985105363984675e-05, "loss": 33.8616, "step": 21200 }, { "epoch": 16.32183908045977, "grad_norm": 3.425340175628662, "learning_rate": 3.980316091954023e-05, "loss": 33.2022, "step": 21300 }, { "epoch": 16.39846743295019, "grad_norm": 
5.212127685546875, "learning_rate": 3.975574712643678e-05, "loss": 33.3935, "step": 21400 }, { "epoch": 16.47509578544061, "grad_norm": 1.9034606218338013, "learning_rate": 3.970785440613027e-05, "loss": 32.739, "step": 21500 }, { "epoch": 16.551724137931036, "grad_norm": 2.024109125137329, "learning_rate": 3.9659961685823756e-05, "loss": 33.4628, "step": 21600 }, { "epoch": 16.628352490421456, "grad_norm": 2.8185606002807617, "learning_rate": 3.961206896551724e-05, "loss": 33.7672, "step": 21700 }, { "epoch": 16.704980842911876, "grad_norm": 3.2981534004211426, "learning_rate": 3.956417624521073e-05, "loss": 33.1976, "step": 21800 }, { "epoch": 16.7816091954023, "grad_norm": 4.531330585479736, "learning_rate": 3.951628352490422e-05, "loss": 33.2379, "step": 21900 }, { "epoch": 16.85823754789272, "grad_norm": 2.4455623626708984, "learning_rate": 3.9468390804597704e-05, "loss": 33.2898, "step": 22000 }, { "epoch": 16.93486590038314, "grad_norm": 4.1596245765686035, "learning_rate": 3.942049808429119e-05, "loss": 33.2167, "step": 22100 }, { "epoch": 17.0, "eval_loss": 34.65380096435547, "eval_runtime": 49.3114, "eval_samples_per_second": 26.464, "eval_steps_per_second": 3.326, "step": 22185 }, { "epoch": 17.011494252873565, "grad_norm": 4.7622528076171875, "learning_rate": 3.937260536398468e-05, "loss": 34.3268, "step": 22200 }, { "epoch": 17.088122605363985, "grad_norm": 2.9908533096313477, "learning_rate": 3.9324712643678164e-05, "loss": 33.4477, "step": 22300 }, { "epoch": 17.164750957854405, "grad_norm": 2.2341110706329346, "learning_rate": 3.927681992337165e-05, "loss": 33.6793, "step": 22400 }, { "epoch": 17.24137931034483, "grad_norm": 2.3946852684020996, "learning_rate": 3.922892720306514e-05, "loss": 33.2578, "step": 22500 }, { "epoch": 17.31800766283525, "grad_norm": 3.3899614810943604, "learning_rate": 3.9181034482758625e-05, "loss": 33.2486, "step": 22600 }, { "epoch": 17.39463601532567, "grad_norm": 5.150006294250488, "learning_rate": 
3.9133141762452105e-05, "loss": 33.0265, "step": 22700 }, { "epoch": 17.47126436781609, "grad_norm": 2.8135523796081543, "learning_rate": 3.908524904214559e-05, "loss": 33.4384, "step": 22800 }, { "epoch": 17.547892720306514, "grad_norm": 2.5454325675964355, "learning_rate": 3.903735632183908e-05, "loss": 33.4139, "step": 22900 }, { "epoch": 17.624521072796934, "grad_norm": 4.680717945098877, "learning_rate": 3.8989463601532566e-05, "loss": 34.0209, "step": 23000 }, { "epoch": 17.701149425287355, "grad_norm": 4.242103099822998, "learning_rate": 3.894157088122606e-05, "loss": 33.1372, "step": 23100 }, { "epoch": 17.77777777777778, "grad_norm": 2.639352798461914, "learning_rate": 3.8893678160919546e-05, "loss": 33.3558, "step": 23200 }, { "epoch": 17.8544061302682, "grad_norm": 1.9746617078781128, "learning_rate": 3.884578544061303e-05, "loss": 33.7639, "step": 23300 }, { "epoch": 17.93103448275862, "grad_norm": 4.005228519439697, "learning_rate": 3.879837164750958e-05, "loss": 33.0241, "step": 23400 }, { "epoch": 18.0, "eval_loss": 34.649261474609375, "eval_runtime": 49.2606, "eval_samples_per_second": 26.492, "eval_steps_per_second": 3.329, "step": 23490 }, { "epoch": 18.007662835249043, "grad_norm": 2.500631809234619, "learning_rate": 3.8750478927203065e-05, "loss": 33.3219, "step": 23500 }, { "epoch": 18.084291187739463, "grad_norm": 3.90655255317688, "learning_rate": 3.870258620689655e-05, "loss": 33.4211, "step": 23600 }, { "epoch": 18.160919540229884, "grad_norm": 2.702497720718384, "learning_rate": 3.865469348659004e-05, "loss": 33.2414, "step": 23700 }, { "epoch": 18.237547892720308, "grad_norm": 1.9609768390655518, "learning_rate": 3.8606800766283525e-05, "loss": 34.0671, "step": 23800 }, { "epoch": 18.314176245210728, "grad_norm": 2.072951316833496, "learning_rate": 3.855890804597702e-05, "loss": 33.6311, "step": 23900 }, { "epoch": 18.39080459770115, "grad_norm": 3.249264717102051, "learning_rate": 3.85110153256705e-05, "loss": 32.9968, "step": 24000 }, { 
"epoch": 18.467432950191572, "grad_norm": 4.439345359802246, "learning_rate": 3.8463122605363986e-05, "loss": 33.1314, "step": 24100 }, { "epoch": 18.544061302681992, "grad_norm": 3.9109508991241455, "learning_rate": 3.841522988505747e-05, "loss": 33.3908, "step": 24200 }, { "epoch": 18.620689655172413, "grad_norm": 2.539151668548584, "learning_rate": 3.836733716475096e-05, "loss": 33.5031, "step": 24300 }, { "epoch": 18.697318007662837, "grad_norm": 2.6246118545532227, "learning_rate": 3.831944444444445e-05, "loss": 33.6923, "step": 24400 }, { "epoch": 18.773946360153257, "grad_norm": 3.5379223823547363, "learning_rate": 3.8271551724137934e-05, "loss": 32.9198, "step": 24500 }, { "epoch": 18.850574712643677, "grad_norm": 3.673536539077759, "learning_rate": 3.822365900383142e-05, "loss": 33.5072, "step": 24600 }, { "epoch": 18.9272030651341, "grad_norm": 3.9377758502960205, "learning_rate": 3.817576628352491e-05, "loss": 32.8486, "step": 24700 }, { "epoch": 19.0, "eval_loss": 34.617279052734375, "eval_runtime": 49.3115, "eval_samples_per_second": 26.464, "eval_steps_per_second": 3.326, "step": 24795 }, { "epoch": 19.00383141762452, "grad_norm": 3.04927659034729, "learning_rate": 3.8127873563218394e-05, "loss": 33.7055, "step": 24800 }, { "epoch": 19.080459770114942, "grad_norm": 2.725443124771118, "learning_rate": 3.8079980842911874e-05, "loss": 33.5355, "step": 24900 }, { "epoch": 19.157088122605366, "grad_norm": 3.853895425796509, "learning_rate": 3.803208812260536e-05, "loss": 33.5267, "step": 25000 }, { "epoch": 19.233716475095786, "grad_norm": 2.666419267654419, "learning_rate": 3.798419540229885e-05, "loss": 33.4069, "step": 25100 }, { "epoch": 19.310344827586206, "grad_norm": 3.5618317127227783, "learning_rate": 3.793630268199234e-05, "loss": 33.7295, "step": 25200 }, { "epoch": 19.386973180076627, "grad_norm": 3.351062297821045, "learning_rate": 3.788840996168583e-05, "loss": 33.1994, "step": 25300 }, { "epoch": 19.46360153256705, "grad_norm": 
3.3226547241210938, "learning_rate": 3.7840996168582374e-05, "loss": 33.3149, "step": 25400 }, { "epoch": 19.54022988505747, "grad_norm": 4.15867805480957, "learning_rate": 3.779310344827586e-05, "loss": 33.5592, "step": 25500 }, { "epoch": 19.61685823754789, "grad_norm": 2.333674430847168, "learning_rate": 3.774521072796935e-05, "loss": 33.7336, "step": 25600 }, { "epoch": 19.693486590038315, "grad_norm": 2.9516782760620117, "learning_rate": 3.7697318007662834e-05, "loss": 33.3228, "step": 25700 }, { "epoch": 19.770114942528735, "grad_norm": 1.734508991241455, "learning_rate": 3.764942528735632e-05, "loss": 33.3216, "step": 25800 }, { "epoch": 19.846743295019156, "grad_norm": 2.4886648654937744, "learning_rate": 3.760153256704981e-05, "loss": 33.5157, "step": 25900 }, { "epoch": 19.92337164750958, "grad_norm": 3.6624252796173096, "learning_rate": 3.75536398467433e-05, "loss": 33.2399, "step": 26000 }, { "epoch": 20.0, "grad_norm": 4.810445785522461, "learning_rate": 3.750574712643679e-05, "loss": 32.548, "step": 26100 }, { "epoch": 20.0, "eval_loss": 34.746856689453125, "eval_runtime": 49.2861, "eval_samples_per_second": 26.478, "eval_steps_per_second": 3.328, "step": 26100 }, { "epoch": 20.07662835249042, "grad_norm": 4.07724142074585, "learning_rate": 3.745785440613027e-05, "loss": 33.562, "step": 26200 }, { "epoch": 20.153256704980844, "grad_norm": 4.335379600524902, "learning_rate": 3.7409961685823756e-05, "loss": 33.166, "step": 26300 }, { "epoch": 20.229885057471265, "grad_norm": 5.472820281982422, "learning_rate": 3.736206896551724e-05, "loss": 33.8918, "step": 26400 }, { "epoch": 20.306513409961685, "grad_norm": 3.011789321899414, "learning_rate": 3.731417624521073e-05, "loss": 33.395, "step": 26500 }, { "epoch": 20.38314176245211, "grad_norm": 3.251089096069336, "learning_rate": 3.7266283524904216e-05, "loss": 32.9072, "step": 26600 }, { "epoch": 20.45977011494253, "grad_norm": 2.7508978843688965, "learning_rate": 3.72183908045977e-05, "loss": 33.92, 
"step": 26700 }, { "epoch": 20.53639846743295, "grad_norm": 2.8051536083221436, "learning_rate": 3.717049808429119e-05, "loss": 33.9392, "step": 26800 }, { "epoch": 20.613026819923373, "grad_norm": 7.377379417419434, "learning_rate": 3.712260536398468e-05, "loss": 33.0382, "step": 26900 }, { "epoch": 20.689655172413794, "grad_norm": 3.7770464420318604, "learning_rate": 3.7074712643678164e-05, "loss": 32.6836, "step": 27000 }, { "epoch": 20.766283524904214, "grad_norm": 4.923346996307373, "learning_rate": 3.702681992337165e-05, "loss": 33.2129, "step": 27100 }, { "epoch": 20.842911877394634, "grad_norm": 4.790703773498535, "learning_rate": 3.697892720306513e-05, "loss": 33.5413, "step": 27200 }, { "epoch": 20.919540229885058, "grad_norm": 4.592926025390625, "learning_rate": 3.6931034482758624e-05, "loss": 33.2436, "step": 27300 }, { "epoch": 20.99616858237548, "grad_norm": 3.0529520511627197, "learning_rate": 3.688314176245211e-05, "loss": 33.2415, "step": 27400 }, { "epoch": 21.0, "eval_loss": 34.59661865234375, "eval_runtime": 49.3345, "eval_samples_per_second": 26.452, "eval_steps_per_second": 3.324, "step": 27405 }, { "epoch": 21.0727969348659, "grad_norm": 2.287121534347534, "learning_rate": 3.683572796934866e-05, "loss": 32.9962, "step": 27500 }, { "epoch": 21.149425287356323, "grad_norm": 2.5622124671936035, "learning_rate": 3.678783524904214e-05, "loss": 33.2565, "step": 27600 }, { "epoch": 21.226053639846743, "grad_norm": 2.2134974002838135, "learning_rate": 3.673994252873563e-05, "loss": 33.7442, "step": 27700 }, { "epoch": 21.302681992337163, "grad_norm": 2.574054002761841, "learning_rate": 3.669204980842912e-05, "loss": 33.7998, "step": 27800 }, { "epoch": 21.379310344827587, "grad_norm": 2.8479721546173096, "learning_rate": 3.6644157088122604e-05, "loss": 33.2015, "step": 27900 }, { "epoch": 21.455938697318008, "grad_norm": 4.845319747924805, "learning_rate": 3.659626436781609e-05, "loss": 33.7904, "step": 28000 }, { "epoch": 21.532567049808428, 
"grad_norm": 2.353726863861084, "learning_rate": 3.6548371647509584e-05, "loss": 33.7207, "step": 28100 }, { "epoch": 21.60919540229885, "grad_norm": 3.003556966781616, "learning_rate": 3.650047892720307e-05, "loss": 33.297, "step": 28200 }, { "epoch": 21.685823754789272, "grad_norm": 4.815252304077148, "learning_rate": 3.645258620689656e-05, "loss": 33.3036, "step": 28300 }, { "epoch": 21.762452107279692, "grad_norm": 3.0622081756591797, "learning_rate": 3.640469348659004e-05, "loss": 33.3661, "step": 28400 }, { "epoch": 21.839080459770116, "grad_norm": 3.3728883266448975, "learning_rate": 3.6356800766283525e-05, "loss": 32.8782, "step": 28500 }, { "epoch": 21.915708812260537, "grad_norm": 2.2338080406188965, "learning_rate": 3.630890804597701e-05, "loss": 33.0412, "step": 28600 }, { "epoch": 21.992337164750957, "grad_norm": 3.717360019683838, "learning_rate": 3.62610153256705e-05, "loss": 33.0318, "step": 28700 }, { "epoch": 22.0, "eval_loss": 34.53865432739258, "eval_runtime": 49.3318, "eval_samples_per_second": 26.454, "eval_steps_per_second": 3.324, "step": 28710 }, { "epoch": 22.06896551724138, "grad_norm": 2.77984356880188, "learning_rate": 3.6213122605363986e-05, "loss": 33.6688, "step": 28800 }, { "epoch": 22.1455938697318, "grad_norm": 3.427570104598999, "learning_rate": 3.616522988505747e-05, "loss": 33.2569, "step": 28900 }, { "epoch": 22.22222222222222, "grad_norm": 2.060288429260254, "learning_rate": 3.611733716475096e-05, "loss": 33.4445, "step": 29000 }, { "epoch": 22.298850574712645, "grad_norm": 3.7918601036071777, "learning_rate": 3.6069444444444446e-05, "loss": 34.2303, "step": 29100 }, { "epoch": 22.375478927203066, "grad_norm": 3.412705659866333, "learning_rate": 3.602155172413793e-05, "loss": 33.5679, "step": 29200 }, { "epoch": 22.452107279693486, "grad_norm": 4.111233711242676, "learning_rate": 3.597365900383142e-05, "loss": 32.9136, "step": 29300 }, { "epoch": 22.52873563218391, "grad_norm": 2.1312243938446045, "learning_rate": 
3.592576628352491e-05, "loss": 32.8361, "step": 29400 }, { "epoch": 22.60536398467433, "grad_norm": 2.0618536472320557, "learning_rate": 3.587835249042146e-05, "loss": 33.8499, "step": 29500 }, { "epoch": 22.68199233716475, "grad_norm": 2.7332096099853516, "learning_rate": 3.5830459770114946e-05, "loss": 33.5899, "step": 29600 }, { "epoch": 22.75862068965517, "grad_norm": 4.264729022979736, "learning_rate": 3.578256704980843e-05, "loss": 33.194, "step": 29700 }, { "epoch": 22.835249042145595, "grad_norm": 4.755107402801514, "learning_rate": 3.573467432950192e-05, "loss": 33.2129, "step": 29800 }, { "epoch": 22.911877394636015, "grad_norm": 3.751232147216797, "learning_rate": 3.56867816091954e-05, "loss": 33.2948, "step": 29900 }, { "epoch": 22.988505747126435, "grad_norm": 3.3150830268859863, "learning_rate": 3.5638888888888886e-05, "loss": 32.452, "step": 30000 }, { "epoch": 23.0, "eval_loss": 34.624755859375, "eval_runtime": 49.3378, "eval_samples_per_second": 26.45, "eval_steps_per_second": 3.324, "step": 30015 }, { "epoch": 23.06513409961686, "grad_norm": 1.9898459911346436, "learning_rate": 3.559099616858237e-05, "loss": 33.2659, "step": 30100 }, { "epoch": 23.14176245210728, "grad_norm": 3.3541698455810547, "learning_rate": 3.554310344827587e-05, "loss": 33.4747, "step": 30200 }, { "epoch": 23.2183908045977, "grad_norm": 2.298229694366455, "learning_rate": 3.5495210727969354e-05, "loss": 33.8791, "step": 30300 }, { "epoch": 23.295019157088124, "grad_norm": 3.9336183071136475, "learning_rate": 3.544731800766284e-05, "loss": 33.8427, "step": 30400 }, { "epoch": 23.371647509578544, "grad_norm": 2.9286720752716064, "learning_rate": 3.539942528735633e-05, "loss": 33.9572, "step": 30500 }, { "epoch": 23.448275862068964, "grad_norm": 2.9716665744781494, "learning_rate": 3.5351532567049814e-05, "loss": 32.5295, "step": 30600 }, { "epoch": 23.52490421455939, "grad_norm": 3.5073654651641846, "learning_rate": 3.5303639846743294e-05, "loss": 33.3511, "step": 30700 }, { 
"epoch": 23.60153256704981, "grad_norm": 4.5670084953308105, "learning_rate": 3.525574712643678e-05, "loss": 33.4249, "step": 30800 }, { "epoch": 23.67816091954023, "grad_norm": 2.563405990600586, "learning_rate": 3.520785440613027e-05, "loss": 33.821, "step": 30900 }, { "epoch": 23.754789272030653, "grad_norm": 3.5928332805633545, "learning_rate": 3.5159961685823755e-05, "loss": 32.9252, "step": 31000 }, { "epoch": 23.831417624521073, "grad_norm": 3.2677550315856934, "learning_rate": 3.511206896551724e-05, "loss": 33.4694, "step": 31100 }, { "epoch": 23.908045977011493, "grad_norm": 3.8751015663146973, "learning_rate": 3.506417624521073e-05, "loss": 32.7835, "step": 31200 }, { "epoch": 23.984674329501917, "grad_norm": 3.955101490020752, "learning_rate": 3.5016283524904216e-05, "loss": 32.6658, "step": 31300 }, { "epoch": 24.0, "eval_loss": 34.550262451171875, "eval_runtime": 49.3313, "eval_samples_per_second": 26.454, "eval_steps_per_second": 3.324, "step": 31320 }, { "epoch": 24.061302681992338, "grad_norm": 3.885087013244629, "learning_rate": 3.49683908045977e-05, "loss": 33.5285, "step": 31400 }, { "epoch": 24.137931034482758, "grad_norm": 8.908398628234863, "learning_rate": 3.4920977011494254e-05, "loss": 33.1673, "step": 31500 }, { "epoch": 24.21455938697318, "grad_norm": 4.042150974273682, "learning_rate": 3.487308429118774e-05, "loss": 33.0384, "step": 31600 }, { "epoch": 24.291187739463602, "grad_norm": 4.992551803588867, "learning_rate": 3.482519157088123e-05, "loss": 33.7439, "step": 31700 }, { "epoch": 24.367816091954023, "grad_norm": 5.118918418884277, "learning_rate": 3.4777298850574715e-05, "loss": 33.5604, "step": 31800 }, { "epoch": 24.444444444444443, "grad_norm": 3.2756083011627197, "learning_rate": 3.47294061302682e-05, "loss": 33.6225, "step": 31900 }, { "epoch": 24.521072796934867, "grad_norm": 2.9864351749420166, "learning_rate": 3.468151340996169e-05, "loss": 34.0539, "step": 32000 }, { "epoch": 24.597701149425287, "grad_norm": 
2.945171356201172, "learning_rate": 3.463362068965517e-05, "loss": 33.2655, "step": 32100 }, { "epoch": 24.674329501915707, "grad_norm": 4.09877347946167, "learning_rate": 3.4585727969348656e-05, "loss": 33.239, "step": 32200 }, { "epoch": 24.75095785440613, "grad_norm": 3.7949306964874268, "learning_rate": 3.453783524904215e-05, "loss": 32.7246, "step": 32300 }, { "epoch": 24.82758620689655, "grad_norm": 3.8750340938568115, "learning_rate": 3.4489942528735636e-05, "loss": 32.5477, "step": 32400 }, { "epoch": 24.904214559386972, "grad_norm": 3.84676456451416, "learning_rate": 3.444204980842912e-05, "loss": 33.5781, "step": 32500 }, { "epoch": 24.980842911877396, "grad_norm": 2.3316519260406494, "learning_rate": 3.439415708812261e-05, "loss": 33.0241, "step": 32600 }, { "epoch": 25.0, "eval_loss": 34.565101623535156, "eval_runtime": 49.343, "eval_samples_per_second": 26.448, "eval_steps_per_second": 3.324, "step": 32625 }, { "epoch": 25.057471264367816, "grad_norm": 2.94795823097229, "learning_rate": 3.43462643678161e-05, "loss": 33.1012, "step": 32700 }, { "epoch": 25.134099616858236, "grad_norm": 2.3455259799957275, "learning_rate": 3.4298371647509584e-05, "loss": 33.1345, "step": 32800 }, { "epoch": 25.21072796934866, "grad_norm": 2.678739547729492, "learning_rate": 3.4250478927203064e-05, "loss": 33.2271, "step": 32900 }, { "epoch": 25.28735632183908, "grad_norm": 4.3170952796936035, "learning_rate": 3.420258620689655e-05, "loss": 33.0392, "step": 33000 }, { "epoch": 25.3639846743295, "grad_norm": 3.8895034790039062, "learning_rate": 3.415469348659004e-05, "loss": 33.2535, "step": 33100 }, { "epoch": 25.440613026819925, "grad_norm": 3.693235158920288, "learning_rate": 3.4106800766283525e-05, "loss": 33.4471, "step": 33200 }, { "epoch": 25.517241379310345, "grad_norm": 5.521793365478516, "learning_rate": 3.405890804597701e-05, "loss": 34.2142, "step": 33300 }, { "epoch": 25.593869731800766, "grad_norm": 2.8983964920043945, "learning_rate": 3.40110153256705e-05, 
"loss": 34.362, "step": 33400 }, { "epoch": 25.67049808429119, "grad_norm": 3.329155206680298, "learning_rate": 3.396360153256705e-05, "loss": 32.373, "step": 33500 }, { "epoch": 25.74712643678161, "grad_norm": 2.6269519329071045, "learning_rate": 3.391570881226054e-05, "loss": 33.1401, "step": 33600 }, { "epoch": 25.82375478927203, "grad_norm": 3.1628787517547607, "learning_rate": 3.3867816091954024e-05, "loss": 33.2718, "step": 33700 }, { "epoch": 25.900383141762454, "grad_norm": 3.0653462409973145, "learning_rate": 3.381992337164751e-05, "loss": 33.481, "step": 33800 }, { "epoch": 25.977011494252874, "grad_norm": 2.5874106884002686, "learning_rate": 3.377250957854406e-05, "loss": 33.2467, "step": 33900 }, { "epoch": 26.0, "eval_loss": 34.54924392700195, "eval_runtime": 49.3, "eval_samples_per_second": 26.471, "eval_steps_per_second": 3.327, "step": 33930 }, { "epoch": 26.053639846743295, "grad_norm": 3.76274037361145, "learning_rate": 3.372461685823755e-05, "loss": 33.5457, "step": 34000 }, { "epoch": 26.130268199233715, "grad_norm": 5.3265061378479, "learning_rate": 3.3676724137931036e-05, "loss": 33.2299, "step": 34100 }, { "epoch": 26.20689655172414, "grad_norm": 4.5878987312316895, "learning_rate": 3.362883141762452e-05, "loss": 33.7869, "step": 34200 }, { "epoch": 26.28352490421456, "grad_norm": 3.673882007598877, "learning_rate": 3.358093869731801e-05, "loss": 32.6976, "step": 34300 }, { "epoch": 26.36015325670498, "grad_norm": 3.5689809322357178, "learning_rate": 3.35330459770115e-05, "loss": 33.6335, "step": 34400 }, { "epoch": 26.436781609195403, "grad_norm": 5.735408306121826, "learning_rate": 3.3485153256704984e-05, "loss": 33.116, "step": 34500 }, { "epoch": 26.513409961685824, "grad_norm": 1.9485822916030884, "learning_rate": 3.343726053639847e-05, "loss": 33.9434, "step": 34600 }, { "epoch": 26.590038314176244, "grad_norm": 4.049289226531982, "learning_rate": 3.338936781609196e-05, "loss": 33.2275, "step": 34700 }, { "epoch": 26.666666666666668, 
"grad_norm": 2.392415761947632, "learning_rate": 3.334147509578544e-05, "loss": 33.1124, "step": 34800 }, { "epoch": 26.743295019157088, "grad_norm": 3.01650333404541, "learning_rate": 3.3293582375478924e-05, "loss": 33.0733, "step": 34900 }, { "epoch": 26.81992337164751, "grad_norm": 3.1701831817626953, "learning_rate": 3.324568965517241e-05, "loss": 33.0029, "step": 35000 }, { "epoch": 26.896551724137932, "grad_norm": 2.6294217109680176, "learning_rate": 3.3197796934865905e-05, "loss": 32.9881, "step": 35100 }, { "epoch": 26.973180076628353, "grad_norm": 3.4342799186706543, "learning_rate": 3.314990421455939e-05, "loss": 33.047, "step": 35200 }, { "epoch": 27.0, "eval_loss": 34.47444152832031, "eval_runtime": 49.2859, "eval_samples_per_second": 26.478, "eval_steps_per_second": 3.328, "step": 35235 }, { "epoch": 27.049808429118773, "grad_norm": 2.2080352306365967, "learning_rate": 3.310201149425288e-05, "loss": 33.0059, "step": 35300 }, { "epoch": 27.126436781609197, "grad_norm": 3.0985817909240723, "learning_rate": 3.3054118773946366e-05, "loss": 33.8201, "step": 35400 }, { "epoch": 27.203065134099617, "grad_norm": 3.165069103240967, "learning_rate": 3.300622605363985e-05, "loss": 33.7343, "step": 35500 }, { "epoch": 27.279693486590038, "grad_norm": 3.2427308559417725, "learning_rate": 3.295833333333333e-05, "loss": 32.8829, "step": 35600 }, { "epoch": 27.35632183908046, "grad_norm": 2.973548412322998, "learning_rate": 3.291044061302682e-05, "loss": 33.2656, "step": 35700 }, { "epoch": 27.43295019157088, "grad_norm": 2.892834424972534, "learning_rate": 3.2862547892720306e-05, "loss": 33.085, "step": 35800 }, { "epoch": 27.509578544061302, "grad_norm": 2.4037787914276123, "learning_rate": 3.281465517241379e-05, "loss": 32.7549, "step": 35900 }, { "epoch": 27.586206896551722, "grad_norm": 3.7890052795410156, "learning_rate": 3.276676245210728e-05, "loss": 33.4256, "step": 36000 }, { "epoch": 27.662835249042146, "grad_norm": 3.4910600185394287, "learning_rate": 
3.271886973180077e-05, "loss": 33.3707, "step": 36100 }, { "epoch": 27.739463601532567, "grad_norm": 2.895573854446411, "learning_rate": 3.2670977011494254e-05, "loss": 32.699, "step": 36200 }, { "epoch": 27.816091954022987, "grad_norm": 4.670979022979736, "learning_rate": 3.262308429118774e-05, "loss": 33.5898, "step": 36300 }, { "epoch": 27.89272030651341, "grad_norm": 2.362605571746826, "learning_rate": 3.257519157088123e-05, "loss": 33.4235, "step": 36400 }, { "epoch": 27.96934865900383, "grad_norm": 4.695677280426025, "learning_rate": 3.2527298850574715e-05, "loss": 33.9318, "step": 36500 }, { "epoch": 28.0, "eval_loss": 34.4583740234375, "eval_runtime": 49.3101, "eval_samples_per_second": 26.465, "eval_steps_per_second": 3.326, "step": 36540 }, { "epoch": 28.04597701149425, "grad_norm": 6.301197052001953, "learning_rate": 3.24794061302682e-05, "loss": 33.1986, "step": 36600 }, { "epoch": 28.122605363984675, "grad_norm": 3.1395254135131836, "learning_rate": 3.243151340996169e-05, "loss": 32.8468, "step": 36700 }, { "epoch": 28.199233716475096, "grad_norm": 2.673875331878662, "learning_rate": 3.2383620689655175e-05, "loss": 32.7804, "step": 36800 }, { "epoch": 28.275862068965516, "grad_norm": 3.807201862335205, "learning_rate": 3.233572796934866e-05, "loss": 34.128, "step": 36900 }, { "epoch": 28.35249042145594, "grad_norm": 3.2160332202911377, "learning_rate": 3.228783524904215e-05, "loss": 33.0419, "step": 37000 }, { "epoch": 28.42911877394636, "grad_norm": 3.2508413791656494, "learning_rate": 3.2239942528735636e-05, "loss": 33.3642, "step": 37100 }, { "epoch": 28.50574712643678, "grad_norm": 4.088146209716797, "learning_rate": 3.219204980842912e-05, "loss": 33.1516, "step": 37200 }, { "epoch": 28.582375478927204, "grad_norm": 3.4091460704803467, "learning_rate": 3.214415708812261e-05, "loss": 33.5824, "step": 37300 }, { "epoch": 28.659003831417625, "grad_norm": 3.113368034362793, "learning_rate": 3.209626436781609e-05, "loss": 33.2279, "step": 37400 }, { 
"epoch": 28.735632183908045, "grad_norm": 3.7009544372558594, "learning_rate": 3.2048371647509577e-05, "loss": 33.0744, "step": 37500 }, { "epoch": 28.81226053639847, "grad_norm": 2.046365261077881, "learning_rate": 3.2000478927203063e-05, "loss": 33.1949, "step": 37600 }, { "epoch": 28.88888888888889, "grad_norm": 3.8142659664154053, "learning_rate": 3.195258620689655e-05, "loss": 33.3156, "step": 37700 }, { "epoch": 28.96551724137931, "grad_norm": 3.120384454727173, "learning_rate": 3.190469348659004e-05, "loss": 32.828, "step": 37800 }, { "epoch": 29.0, "eval_loss": 34.519615173339844, "eval_runtime": 49.3498, "eval_samples_per_second": 26.444, "eval_steps_per_second": 3.323, "step": 37845 }, { "epoch": 29.042145593869733, "grad_norm": 4.515305042266846, "learning_rate": 3.185727969348659e-05, "loss": 33.5753, "step": 37900 }, { "epoch": 29.118773946360154, "grad_norm": 3.7501096725463867, "learning_rate": 3.1809386973180076e-05, "loss": 33.3131, "step": 38000 }, { "epoch": 29.195402298850574, "grad_norm": 3.431818723678589, "learning_rate": 3.176149425287356e-05, "loss": 33.085, "step": 38100 }, { "epoch": 29.272030651340994, "grad_norm": 3.4503543376922607, "learning_rate": 3.171360153256705e-05, "loss": 32.7894, "step": 38200 }, { "epoch": 29.34865900383142, "grad_norm": 4.361378192901611, "learning_rate": 3.1665708812260536e-05, "loss": 33.4922, "step": 38300 }, { "epoch": 29.42528735632184, "grad_norm": 2.354480504989624, "learning_rate": 3.161781609195402e-05, "loss": 33.3214, "step": 38400 }, { "epoch": 29.50191570881226, "grad_norm": 3.3123044967651367, "learning_rate": 3.156992337164751e-05, "loss": 33.3181, "step": 38500 }, { "epoch": 29.578544061302683, "grad_norm": 2.3824117183685303, "learning_rate": 3.1522030651341e-05, "loss": 33.0926, "step": 38600 }, { "epoch": 29.655172413793103, "grad_norm": 2.811178684234619, "learning_rate": 3.1474137931034484e-05, "loss": 33.3361, "step": 38700 }, { "epoch": 29.731800766283524, "grad_norm": 
4.715090751647949, "learning_rate": 3.142624521072797e-05, "loss": 32.8444, "step": 38800 }, { "epoch": 29.808429118773947, "grad_norm": 2.191209316253662, "learning_rate": 3.137835249042146e-05, "loss": 33.9677, "step": 38900 }, { "epoch": 29.885057471264368, "grad_norm": 2.606814384460449, "learning_rate": 3.1330459770114945e-05, "loss": 33.3536, "step": 39000 }, { "epoch": 29.961685823754788, "grad_norm": 4.8533172607421875, "learning_rate": 3.128256704980843e-05, "loss": 33.2721, "step": 39100 }, { "epoch": 30.0, "eval_loss": 34.46094512939453, "eval_runtime": 49.4265, "eval_samples_per_second": 26.403, "eval_steps_per_second": 3.318, "step": 39150 }, { "epoch": 30.038314176245212, "grad_norm": 4.915451526641846, "learning_rate": 3.123467432950192e-05, "loss": 33.0369, "step": 39200 }, { "epoch": 30.114942528735632, "grad_norm": 4.369636058807373, "learning_rate": 3.1186781609195405e-05, "loss": 33.1459, "step": 39300 }, { "epoch": 30.191570881226053, "grad_norm": 2.9162957668304443, "learning_rate": 3.113888888888889e-05, "loss": 32.9688, "step": 39400 }, { "epoch": 30.268199233716476, "grad_norm": 4.7777628898620605, "learning_rate": 3.109099616858238e-05, "loss": 33.7249, "step": 39500 }, { "epoch": 30.344827586206897, "grad_norm": 3.651850700378418, "learning_rate": 3.104310344827586e-05, "loss": 33.4887, "step": 39600 }, { "epoch": 30.421455938697317, "grad_norm": 3.29491925239563, "learning_rate": 3.0995210727969346e-05, "loss": 33.5714, "step": 39700 }, { "epoch": 30.49808429118774, "grad_norm": 3.9116616249084473, "learning_rate": 3.094731800766283e-05, "loss": 33.7763, "step": 39800 }, { "epoch": 30.57471264367816, "grad_norm": null, "learning_rate": 3.089990421455939e-05, "loss": 32.1907, "step": 39900 }, { "epoch": 30.65134099616858, "grad_norm": 3.237652063369751, "learning_rate": 3.085201149425287e-05, "loss": 33.344, "step": 40000 }, { "epoch": 30.727969348659006, "grad_norm": 4.286235809326172, "learning_rate": 3.080459770114943e-05, "loss": 
33.181, "step": 40100 }, { "epoch": 30.804597701149426, "grad_norm": 2.6222527027130127, "learning_rate": 3.075670498084292e-05, "loss": 33.3407, "step": 40200 }, { "epoch": 30.881226053639846, "grad_norm": 3.7431180477142334, "learning_rate": 3.0708812260536404e-05, "loss": 33.1109, "step": 40300 }, { "epoch": 30.957854406130267, "grad_norm": 3.0706677436828613, "learning_rate": 3.066091954022989e-05, "loss": 33.3504, "step": 40400 }, { "epoch": 31.0, "eval_loss": 34.48047637939453, "eval_runtime": 49.4044, "eval_samples_per_second": 26.415, "eval_steps_per_second": 3.32, "step": 40455 }, { "epoch": 31.03448275862069, "grad_norm": 3.288548231124878, "learning_rate": 3.061302681992337e-05, "loss": 33.4014, "step": 40500 }, { "epoch": 31.11111111111111, "grad_norm": 4.078604221343994, "learning_rate": 3.056513409961686e-05, "loss": 33.5796, "step": 40600 }, { "epoch": 31.18773946360153, "grad_norm": 3.589484691619873, "learning_rate": 3.0517241379310348e-05, "loss": 32.9547, "step": 40700 }, { "epoch": 31.264367816091955, "grad_norm": 3.1043126583099365, "learning_rate": 3.046934865900383e-05, "loss": 33.2105, "step": 40800 }, { "epoch": 31.340996168582375, "grad_norm": 2.446356773376465, "learning_rate": 3.0421455938697318e-05, "loss": 33.1642, "step": 40900 }, { "epoch": 31.417624521072796, "grad_norm": 2.966627597808838, "learning_rate": 3.0373563218390805e-05, "loss": 32.7751, "step": 41000 }, { "epoch": 31.49425287356322, "grad_norm": 4.547020435333252, "learning_rate": 3.0325670498084292e-05, "loss": 33.8578, "step": 41100 }, { "epoch": 31.57088122605364, "grad_norm": 3.151139259338379, "learning_rate": 3.0277777777777776e-05, "loss": 33.2976, "step": 41200 }, { "epoch": 31.64750957854406, "grad_norm": 2.8900582790374756, "learning_rate": 3.0229885057471262e-05, "loss": 33.1161, "step": 41300 }, { "epoch": 31.724137931034484, "grad_norm": 2.5485446453094482, "learning_rate": 3.0181992337164756e-05, "loss": 33.596, "step": 41400 }, { "epoch": 
31.800766283524904, "grad_norm": 2.5474777221679688, "learning_rate": 3.0134099616858243e-05, "loss": 33.3569, "step": 41500 }, { "epoch": 31.877394636015325, "grad_norm": 3.6182713508605957, "learning_rate": 3.0086206896551726e-05, "loss": 32.824, "step": 41600 }, { "epoch": 31.95402298850575, "grad_norm": 3.898332118988037, "learning_rate": 3.0038314176245213e-05, "loss": 32.8775, "step": 41700 }, { "epoch": 32.0, "eval_loss": 34.500526428222656, "eval_runtime": 49.4041, "eval_samples_per_second": 26.415, "eval_steps_per_second": 3.32, "step": 41760 }, { "epoch": 32.030651340996165, "grad_norm": 3.481757164001465, "learning_rate": 2.99904214559387e-05, "loss": 33.4618, "step": 41800 }, { "epoch": 32.10727969348659, "grad_norm": 3.9191551208496094, "learning_rate": 2.9942528735632187e-05, "loss": 33.627, "step": 41900 }, { "epoch": 32.18390804597701, "grad_norm": 5.722991466522217, "learning_rate": 2.989463601532567e-05, "loss": 32.705, "step": 42000 }, { "epoch": 32.26053639846743, "grad_norm": 4.626276016235352, "learning_rate": 2.9846743295019157e-05, "loss": 33.4211, "step": 42100 }, { "epoch": 32.337164750957854, "grad_norm": 2.526745557785034, "learning_rate": 2.9798850574712644e-05, "loss": 32.9605, "step": 42200 }, { "epoch": 32.41379310344828, "grad_norm": 2.2517364025115967, "learning_rate": 2.975095785440613e-05, "loss": 33.1264, "step": 42300 }, { "epoch": 32.490421455938694, "grad_norm": 5.5678606033325195, "learning_rate": 2.9703065134099618e-05, "loss": 33.1141, "step": 42400 }, { "epoch": 32.56704980842912, "grad_norm": 3.7891595363616943, "learning_rate": 2.96551724137931e-05, "loss": 33.3294, "step": 42500 }, { "epoch": 32.64367816091954, "grad_norm": 3.350956916809082, "learning_rate": 2.960727969348659e-05, "loss": 33.6182, "step": 42600 }, { "epoch": 32.72030651340996, "grad_norm": 3.138821601867676, "learning_rate": 2.9559386973180075e-05, "loss": 33.2219, "step": 42700 }, { "epoch": 32.79693486590038, "grad_norm": 3.301961898803711, 
"learning_rate": 2.9511494252873566e-05, "loss": 33.5015, "step": 42800 }, { "epoch": 32.87356321839081, "grad_norm": 3.0760138034820557, "learning_rate": 2.9463601532567052e-05, "loss": 33.3376, "step": 42900 }, { "epoch": 32.95019157088122, "grad_norm": 2.474372625350952, "learning_rate": 2.941570881226054e-05, "loss": 32.9016, "step": 43000 }, { "epoch": 33.0, "eval_loss": 34.45762252807617, "eval_runtime": 49.4143, "eval_samples_per_second": 26.409, "eval_steps_per_second": 3.319, "step": 43065 } ], "logging_steps": 100, "max_steps": 104400, "num_input_tokens_seen": 0, "num_train_epochs": 80, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 10, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.644125980055654e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }