diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,528 +1,5028 @@ { - "best_global_step": 520, - "best_metric": 0.06546766310930252, - "best_model_checkpoint": "./beans_outputs/checkpoint-520", - "epoch": 5.0, + "best_global_step": 4420, + "best_metric": 0.0026711083482950926, + "best_model_checkpoint": "./beans_outputs/checkpoint-4420", + "epoch": 50.0, "eval_steps": 500, - "global_step": 650, + "global_step": 6500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07692307692307693, - "grad_norm": 2.198843479156494, - "learning_rate": 1.9723076923076924e-05, - "loss": 1.0245, + "grad_norm": 2.1993725299835205, + "learning_rate": 1.9972307692307693e-05, + "loss": 1.0244, "step": 10 }, { "epoch": 0.15384615384615385, - "grad_norm": 1.917884111404419, - "learning_rate": 1.9415384615384615e-05, - "loss": 0.9454, + "grad_norm": 1.8771612644195557, + "learning_rate": 1.9941538461538464e-05, + "loss": 0.9441, "step": 20 }, { "epoch": 0.23076923076923078, - "grad_norm": 2.078744411468506, - "learning_rate": 1.910769230769231e-05, - "loss": 0.8398, + "grad_norm": 2.052964210510254, + "learning_rate": 1.9910769230769232e-05, + "loss": 0.8359, "step": 30 }, { "epoch": 0.3076923076923077, - "grad_norm": 2.6160151958465576, - "learning_rate": 1.88e-05, - "loss": 0.6955, + "grad_norm": 2.6124305725097656, + "learning_rate": 1.9880000000000003e-05, + "loss": 0.6878, "step": 40 }, { "epoch": 0.38461538461538464, - "grad_norm": 4.149175643920898, - "learning_rate": 1.8492307692307694e-05, - "loss": 0.661, + "grad_norm": 4.064327239990234, + "learning_rate": 1.984923076923077e-05, + "loss": 0.6501, "step": 50 }, { "epoch": 0.46153846153846156, - "grad_norm": 2.1062347888946533, - "learning_rate": 1.8184615384615384e-05, - "loss": 0.5797, + "grad_norm": 2.0996179580688477, + "learning_rate": 1.9818461538461538e-05, + "loss": 0.5661, "step": 60 }, { "epoch": 0.5384615384615384, - "grad_norm": 1.8043235540390015, - "learning_rate": 1.7876923076923078e-05, - "loss": 0.5192, + "grad_norm": 2.3283865451812744, + "learning_rate": 1.978769230769231e-05, + "loss": 0.5052, "step": 70 }, { "epoch": 0.6153846153846154, - "grad_norm": 1.217357873916626, - "learning_rate": 1.7569230769230772e-05, - "loss": 0.3928, + "grad_norm": 1.2207870483398438, + "learning_rate": 1.9756923076923077e-05, + "loss": 0.3754, "step": 80 }, { "epoch": 0.6923076923076923, - "grad_norm": 2.427462577819824, - "learning_rate": 1.7261538461538463e-05, - "loss": 0.3534, + "grad_norm": 2.5365116596221924, + "learning_rate": 1.9726153846153848e-05, + "loss": 0.3324, "step": 90 }, { "epoch": 0.7692307692307693, - "grad_norm": 3.0030345916748047, - "learning_rate": 1.6953846153846156e-05, - "loss": 0.3683, + "grad_norm": 2.8520209789276123, + "learning_rate": 1.9695384615384616e-05, + "loss": 0.3458, "step": 100 }, { "epoch": 0.8461538461538461, - "grad_norm": 2.384122371673584, - "learning_rate": 1.6646153846153847e-05, - "loss": 0.3326, + "grad_norm": 2.122269630432129, + "learning_rate": 1.9664615384615387e-05, + "loss": 0.3181, "step": 110 }, { "epoch": 0.9230769230769231, - "grad_norm": 2.4714367389678955, - "learning_rate": 1.633846153846154e-05, - "loss": 0.2446, + "grad_norm": 2.8259902000427246, + "learning_rate": 1.9633846153846155e-05, + "loss": 0.2241, "step": 120 }, { "epoch": 1.0, - "grad_norm": 7.0579328536987305, - "learning_rate": 1.603076923076923e-05, - "loss": 0.281, + "grad_norm": 7.2732014656066895, + "learning_rate": 1.9603076923076926e-05, + "loss": 0.2628, "step": 130 }, { "epoch": 1.0, "eval_accuracy": 0.9624060150375939, - "eval_loss": 0.21590378880500793, - "eval_runtime": 0.6008, - "eval_samples_per_second": 221.361, - "eval_steps_per_second": 28.294, + "eval_loss": 0.19485221803188324, + "eval_runtime": 0.5484, + "eval_samples_per_second": 242.538, + "eval_steps_per_second": 31.001, "step": 130 }, { "epoch": 1.0769230769230769, - "grad_norm": 1.5222105979919434, - "learning_rate": 1.5723076923076926e-05, - "loss": 0.1984, + "grad_norm": 1.3019007444381714, + "learning_rate": 1.9572307692307693e-05, + "loss": 0.1757, "step": 140 }, { "epoch": 1.1538461538461537, - "grad_norm": 3.3074021339416504, - "learning_rate": 1.5415384615384616e-05, - "loss": 0.2224, + "grad_norm": 3.8632516860961914, + "learning_rate": 1.9541538461538464e-05, + "loss": 0.2064, "step": 150 }, { "epoch": 1.2307692307692308, - "grad_norm": 1.5715149641036987, - "learning_rate": 1.510769230769231e-05, - "loss": 0.2032, + "grad_norm": 1.4258320331573486, + "learning_rate": 1.9510769230769232e-05, + "loss": 0.176, "step": 160 }, { "epoch": 1.3076923076923077, - "grad_norm": 0.5117770433425903, - "learning_rate": 1.48e-05, - "loss": 0.2197, + "grad_norm": 0.4442344605922699, + "learning_rate": 1.948e-05, + "loss": 0.2113, "step": 170 }, { "epoch": 1.3846153846153846, - "grad_norm": 0.6276758909225464, - "learning_rate": 1.4492307692307695e-05, - "loss": 0.209, + "grad_norm": 0.5212724208831787, + "learning_rate": 1.944923076923077e-05, + "loss": 0.1956, "step": 180 }, { "epoch": 1.4615384615384617, - "grad_norm": 2.73740553855896, - "learning_rate": 1.4184615384615385e-05, - "loss": 0.1603, + "grad_norm": 2.0352931022644043, + "learning_rate": 1.941846153846154e-05, + "loss": 0.1544, "step": 190 }, { "epoch": 1.5384615384615383, - "grad_norm": 0.4705379605293274, - "learning_rate": 1.3876923076923079e-05, - "loss": 0.2545, + "grad_norm": 0.39393723011016846, + "learning_rate": 1.938769230769231e-05, + "loss": 0.2483, "step": 200 }, { "epoch": 1.6153846153846154, - "grad_norm": 0.6391307711601257, - "learning_rate": 1.356923076923077e-05, - "loss": 0.1793, + "grad_norm": 0.556025505065918, + "learning_rate": 1.9356923076923077e-05, + "loss": 0.1702, "step": 210 }, { "epoch": 1.6923076923076923, - "grad_norm": 3.0284461975097656, - "learning_rate": 1.3261538461538464e-05, - "loss": 0.2168, + "grad_norm": 1.02499520778656, + "learning_rate": 1.932615384615385e-05, + "loss": 0.2031, "step": 220 }, { "epoch": 1.7692307692307692, - "grad_norm": 2.021752119064331, - "learning_rate": 1.2953846153846154e-05, - "loss": 0.1622, + "grad_norm": 1.9602783918380737, + "learning_rate": 1.929538461538462e-05, + "loss": 0.1452, "step": 230 }, { "epoch": 1.8461538461538463, - "grad_norm": 5.118215560913086, - "learning_rate": 1.2646153846153848e-05, - "loss": 0.2609, + "grad_norm": 5.003844261169434, + "learning_rate": 1.9264615384615387e-05, + "loss": 0.2707, "step": 240 }, { "epoch": 1.9230769230769231, - "grad_norm": 0.380063533782959, - "learning_rate": 1.2338461538461539e-05, - "loss": 0.137, + "grad_norm": 0.3506212830543518, + "learning_rate": 1.9233846153846155e-05, + "loss": 0.1291, "step": 250 }, { "epoch": 2.0, - "grad_norm": 0.34861981868743896, - "learning_rate": 1.2030769230769233e-05, - "loss": 0.1296, + "grad_norm": 0.2888948917388916, + "learning_rate": 1.9203076923076923e-05, + "loss": 0.118, "step": 260 }, { "epoch": 2.0, "eval_accuracy": 0.9699248120300752, - "eval_loss": 0.13036151230335236, - "eval_runtime": 0.607, - "eval_samples_per_second": 219.099, - "eval_steps_per_second": 28.005, + "eval_loss": 0.1250736266374588, + "eval_runtime": 0.5315, + "eval_samples_per_second": 250.252, + "eval_steps_per_second": 31.987, "step": 260 }, { "epoch": 2.076923076923077, - "grad_norm": 0.48520398139953613, - "learning_rate": 1.1723076923076923e-05, - "loss": 0.0894, + "grad_norm": 0.7948436141014099, + "learning_rate": 1.9172307692307694e-05, + "loss": 0.0888, "step": 270 }, { "epoch": 2.1538461538461537, - "grad_norm": 0.3357318639755249, - "learning_rate": 1.1415384615384617e-05, - "loss": 0.1421, + "grad_norm": 0.2710164785385132, + "learning_rate": 1.914153846153846e-05, + "loss": 0.1268, "step": 280 }, { "epoch": 2.230769230769231, - "grad_norm": 7.024319648742676, - "learning_rate": 1.1107692307692308e-05, - "loss": 0.1226, + "grad_norm": 8.435958862304688, + "learning_rate": 1.9110769230769233e-05, + "loss": 0.1177, "step": 290 }, { "epoch": 2.3076923076923075, - "grad_norm": 8.50007152557373, - "learning_rate": 1.0800000000000002e-05, - "loss": 0.1355, + "grad_norm": 9.71842098236084, + "learning_rate": 1.908e-05, + "loss": 0.1279, "step": 300 }, { "epoch": 2.3846153846153846, - "grad_norm": 4.990291118621826, - "learning_rate": 1.0492307692307692e-05, - "loss": 0.1761, + "grad_norm": 8.280940055847168, + "learning_rate": 1.904923076923077e-05, + "loss": 0.1861, "step": 310 }, { "epoch": 2.4615384615384617, - "grad_norm": 0.3203364908695221, - "learning_rate": 1.0184615384615386e-05, - "loss": 0.1472, + "grad_norm": 0.23196564614772797, + "learning_rate": 1.901846153846154e-05, + "loss": 0.1407, "step": 320 }, { "epoch": 2.5384615384615383, - "grad_norm": 6.037258148193359, - "learning_rate": 9.876923076923077e-06, - "loss": 0.1877, + "grad_norm": 3.181715488433838, + "learning_rate": 1.898769230769231e-05, + "loss": 0.1969, "step": 330 }, { "epoch": 2.6153846153846154, - "grad_norm": 1.1843386888504028, - "learning_rate": 9.569230769230769e-06, - "loss": 0.0712, + "grad_norm": 0.7809241414070129, + "learning_rate": 1.8956923076923078e-05, + "loss": 0.056, "step": 340 }, { "epoch": 2.6923076923076925, - "grad_norm": 0.35253170132637024, - "learning_rate": 9.261538461538461e-06, - "loss": 0.1313, + "grad_norm": 0.2504461407661438, + "learning_rate": 1.892615384615385e-05, + "loss": 0.1216, "step": 350 }, { "epoch": 2.769230769230769, - "grad_norm": 9.631648063659668, - "learning_rate": 8.953846153846153e-06, - "loss": 0.1164, + "grad_norm": 4.153248310089111, + "learning_rate": 1.8895384615384617e-05, + "loss": 0.0784, "step": 360 }, { "epoch": 2.8461538461538463, - "grad_norm": 0.31370919942855835, - "learning_rate": 8.646153846153846e-06, - "loss": 0.1535, + "grad_norm": 0.228765606880188, + "learning_rate": 1.8864615384615384e-05, + "loss": 0.1656, "step": 370 }, { "epoch": 2.9230769230769234, - "grad_norm": 1.1416504383087158, - "learning_rate": 8.338461538461538e-06, - "loss": 0.1321, + "grad_norm": 2.024319887161255, + "learning_rate": 1.8833846153846155e-05, + "loss": 0.1252, "step": 380 }, { "epoch": 3.0, - "grad_norm": 0.2953951358795166, - "learning_rate": 8.03076923076923e-06, - "loss": 0.1373, + "grad_norm": 0.2130887359380722, + "learning_rate": 1.8803076923076923e-05, + "loss": 0.1361, "step": 390 }, { "epoch": 3.0, - "eval_accuracy": 0.9774436090225563, - "eval_loss": 0.09720253199338913, - "eval_runtime": 0.5922, - "eval_samples_per_second": 224.569, - "eval_steps_per_second": 28.704, + "eval_accuracy": 0.9924812030075187, + "eval_loss": 0.061659153550863266, + "eval_runtime": 0.5223, + "eval_samples_per_second": 254.66, + "eval_steps_per_second": 32.551, "step": 390 }, { "epoch": 3.076923076923077, - "grad_norm": 2.0271453857421875, - "learning_rate": 7.723076923076924e-06, - "loss": 0.1345, + "grad_norm": 3.039837121963501, + "learning_rate": 1.8772307692307694e-05, + "loss": 0.1213, "step": 400 }, { "epoch": 3.1538461538461537, - "grad_norm": 7.004184722900391, - "learning_rate": 7.4153846153846164e-06, - "loss": 0.1496, + "grad_norm": 7.426234245300293, + "learning_rate": 1.8741538461538462e-05, + "loss": 0.1952, "step": 410 }, { "epoch": 3.230769230769231, - "grad_norm": 9.225290298461914, - "learning_rate": 7.107692307692309e-06, - "loss": 0.1388, + "grad_norm": 7.860882759094238, + "learning_rate": 1.8710769230769233e-05, + "loss": 0.1215, "step": 420 }, { "epoch": 3.3076923076923075, - "grad_norm": 0.2332003116607666, - "learning_rate": 6.800000000000001e-06, - "loss": 0.0772, + "grad_norm": 0.1747923642396927, + "learning_rate": 1.8680000000000004e-05, + "loss": 0.085, "step": 430 }, { "epoch": 3.3846153846153846, - "grad_norm": 3.141547918319702, - "learning_rate": 6.492307692307693e-06, - "loss": 0.1145, + "grad_norm": 9.794089317321777, + "learning_rate": 1.8649230769230772e-05, + "loss": 0.1086, "step": 440 }, { "epoch": 3.4615384615384617, - "grad_norm": 12.893486022949219, - "learning_rate": 6.1846153846153855e-06, - "loss": 0.0818, + "grad_norm": 8.472834587097168, + "learning_rate": 1.861846153846154e-05, + "loss": 0.091, "step": 450 }, { "epoch": 3.5384615384615383, - "grad_norm": 5.630913257598877, - "learning_rate": 5.876923076923078e-06, - "loss": 0.0948, + "grad_norm": 3.2180655002593994, + "learning_rate": 1.8587692307692307e-05, + "loss": 0.1159, "step": 460 }, { "epoch": 3.6153846153846154, - "grad_norm": 7.8553266525268555, - "learning_rate": 5.56923076923077e-06, - "loss": 0.0719, + "grad_norm": 0.7684321403503418, + "learning_rate": 1.8556923076923078e-05, + "loss": 0.0745, "step": 470 }, { "epoch": 3.6923076923076925, - "grad_norm": 0.21685053408145905, - "learning_rate": 5.261538461538462e-06, - "loss": 0.1, + "grad_norm": 0.15339425206184387, + "learning_rate": 1.8526153846153846e-05, + "loss": 0.0811, "step": 480 }, { "epoch": 3.769230769230769, - "grad_norm": 0.21081575751304626, - "learning_rate": 4.9538461538461545e-06, - "loss": 0.069, + "grad_norm": 0.15312495827674866, + "learning_rate": 1.8495384615384617e-05, + "loss": 0.0538, "step": 490 }, { "epoch": 3.8461538461538463, - "grad_norm": 0.8678991198539734, - "learning_rate": 4.646153846153847e-06, - "loss": 0.1374, + "grad_norm": 0.9928426742553711, + "learning_rate": 1.8464615384615385e-05, + "loss": 0.1629, "step": 500 }, { "epoch": 3.9230769230769234, - "grad_norm": 0.20829035341739655, - "learning_rate": 4.338461538461539e-06, - "loss": 0.1311, + "grad_norm": 0.13902151584625244, + "learning_rate": 1.8433846153846156e-05, + "loss": 0.1372, "step": 510 }, { "epoch": 4.0, - "grad_norm": 0.30338263511657715, - "learning_rate": 4.030769230769231e-06, - "loss": 0.0845, + "grad_norm": 0.21010972559452057, + "learning_rate": 1.8403076923076924e-05, + "loss": 0.0528, "step": 520 }, { "epoch": 4.0, - "eval_accuracy": 0.9924812030075187, - "eval_loss": 0.06546766310930252, - "eval_runtime": 0.5826, - "eval_samples_per_second": 228.276, - "eval_steps_per_second": 29.178, + "eval_accuracy": 0.9774436090225563, + "eval_loss": 0.07376394420862198, + "eval_runtime": 0.5377, + "eval_samples_per_second": 247.352, + "eval_steps_per_second": 31.616, "step": 520 }, { "epoch": 4.076923076923077, - "grad_norm": 7.523567199707031, - "learning_rate": 3.723076923076923e-06, - "loss": 0.0997, + "grad_norm": 15.824823379516602, + "learning_rate": 1.8372307692307695e-05, + "loss": 0.0982, "step": 530 }, { "epoch": 4.153846153846154, - "grad_norm": 1.7107905149459839, - "learning_rate": 3.4153846153846154e-06, - "loss": 0.1668, + "grad_norm": 1.9137977361679077, + "learning_rate": 1.8341538461538462e-05, + "loss": 0.1502, "step": 540 }, { "epoch": 4.230769230769231, - "grad_norm": 0.3665030300617218, - "learning_rate": 3.1076923076923076e-06, - "loss": 0.1545, + "grad_norm": 0.3407939672470093, + "learning_rate": 1.8310769230769233e-05, + "loss": 0.1394, "step": 550 }, { "epoch": 4.3076923076923075, - "grad_norm": 1.344637155532837, - "learning_rate": 2.8000000000000003e-06, - "loss": 0.0503, + "grad_norm": 0.6323396563529968, + "learning_rate": 1.828e-05, + "loss": 0.0307, "step": 560 }, { "epoch": 4.384615384615385, - "grad_norm": 0.27009326219558716, - "learning_rate": 2.4923076923076926e-06, - "loss": 0.135, + "grad_norm": 0.1906716227531433, + "learning_rate": 1.824923076923077e-05, + "loss": 0.13, "step": 570 }, { "epoch": 4.461538461538462, - "grad_norm": 0.2649720311164856, - "learning_rate": 2.184615384615385e-06, - "loss": 0.083, + "grad_norm": 0.17275558412075043, + "learning_rate": 1.821846153846154e-05, + "loss": 0.0686, "step": 580 }, { "epoch": 4.538461538461538, - "grad_norm": 9.79839038848877, - "learning_rate": 1.876923076923077e-06, - "loss": 0.072, + "grad_norm": 17.331369400024414, + "learning_rate": 1.8187692307692308e-05, + "loss": 0.0953, "step": 590 }, { "epoch": 4.615384615384615, - "grad_norm": 2.8558285236358643, - "learning_rate": 1.5692307692307693e-06, - "loss": 0.0905, + "grad_norm": 1.0825220346450806, + "learning_rate": 1.815692307692308e-05, + "loss": 0.1503, "step": 600 }, { "epoch": 4.6923076923076925, - "grad_norm": 5.410094738006592, - "learning_rate": 1.2615384615384616e-06, - "loss": 0.0874, + "grad_norm": 1.982507586479187, + "learning_rate": 1.8126153846153846e-05, + "loss": 0.041, "step": 610 }, { "epoch": 4.769230769230769, - "grad_norm": 0.26254087686538696, - "learning_rate": 9.53846153846154e-07, - "loss": 0.0866, + "grad_norm": 0.3985183537006378, + "learning_rate": 1.8095384615384618e-05, + "loss": 0.2082, "step": 620 }, { "epoch": 4.846153846153846, - "grad_norm": 0.2139281928539276, - "learning_rate": 6.461538461538462e-07, - "loss": 0.0662, + "grad_norm": 2.1958301067352295, + "learning_rate": 1.806461538461539e-05, + "loss": 0.0991, "step": 630 }, { "epoch": 4.923076923076923, - "grad_norm": 0.23087060451507568, - "learning_rate": 3.3846153846153845e-07, - "loss": 0.0822, + "grad_norm": 0.1799912005662918, + "learning_rate": 1.8033846153846156e-05, + "loss": 0.0802, "step": 640 }, { "epoch": 5.0, - "grad_norm": 0.618672788143158, - "learning_rate": 3.076923076923077e-08, - "loss": 0.1147, + "grad_norm": 0.19620178639888763, + "learning_rate": 1.8003076923076924e-05, + "loss": 0.1193, "step": 650 }, { "epoch": 5.0, - "eval_accuracy": 0.9774436090225563, - "eval_loss": 0.08272561430931091, - "eval_runtime": 0.6349, - "eval_samples_per_second": 209.467, - "eval_steps_per_second": 26.774, + "eval_accuracy": 0.9924812030075187, + "eval_loss": 0.04503392055630684, + "eval_runtime": 0.5386, + "eval_samples_per_second": 246.939, + "eval_steps_per_second": 31.564, "step": 650 }, { - "epoch": 5.0, - "step": 650, - "total_flos": 4.006371770595533e+17, - "train_loss": 0.0, - "train_runtime": 0.1521, - "train_samples_per_second": 33994.24, - "train_steps_per_second": 4273.937 + "epoch": 5.076923076923077, + "grad_norm": 0.13532300293445587, + "learning_rate": 1.7972307692307692e-05, + "loss": 0.0844, + "step": 660 + }, + { + "epoch": 5.153846153846154, + "grad_norm": 1.1429115533828735, + "learning_rate": 1.7941538461538463e-05, + "loss": 0.0569, + "step": 670 + }, + { + "epoch": 5.230769230769231, + "grad_norm": 0.21620318293571472, + "learning_rate": 1.791076923076923e-05, + "loss": 0.1509, + "step": 680 + }, + { + "epoch": 5.3076923076923075, + "grad_norm": 0.22496506571769714, + "learning_rate": 1.788e-05, + "loss": 0.0691, + "step": 690 + }, + { + "epoch": 5.384615384615385, + "grad_norm": 2.199425220489502, + "learning_rate": 1.784923076923077e-05, + "loss": 0.1, + "step": 700 + }, + { + "epoch": 5.461538461538462, + "grad_norm": 0.22637762129306793, + "learning_rate": 1.781846153846154e-05, + "loss": 0.0602, + "step": 710 + }, + { + "epoch": 5.538461538461538, + "grad_norm": 0.10512543469667435, + "learning_rate": 1.778769230769231e-05, + "loss": 0.1046, + "step": 720 + }, + { + "epoch": 5.615384615384615, + "grad_norm": 9.640692710876465, + "learning_rate": 1.775692307692308e-05, + "loss": 0.1422, + "step": 730 + }, + { + "epoch": 5.6923076923076925, + "grad_norm": 0.10444699972867966, + "learning_rate": 1.7726153846153847e-05, + "loss": 0.0954, + "step": 740 + }, + { + "epoch": 5.769230769230769, + "grad_norm": 0.1038907989859581, + "learning_rate": 1.7695384615384618e-05, + "loss": 0.069, + "step": 750 + }, + { + "epoch": 5.846153846153846, + "grad_norm": 0.10659747570753098, + "learning_rate": 1.7664615384615386e-05, + "loss": 0.0995, + "step": 760 + }, + { + "epoch": 5.923076923076923, + "grad_norm": 13.881101608276367, + "learning_rate": 1.7633846153846153e-05, + "loss": 0.0691, + "step": 770 + }, + { + "epoch": 6.0, + "grad_norm": 0.13533784449100494, + "learning_rate": 1.7603076923076924e-05, + "loss": 0.0533, + "step": 780 + }, + { + "epoch": 6.0, + "eval_accuracy": 0.9849624060150376, + "eval_loss": 0.044003281742334366, + "eval_runtime": 0.5995, + "eval_samples_per_second": 221.869, + "eval_steps_per_second": 28.359, + "step": 780 + }, + { + "epoch": 6.076923076923077, + "grad_norm": 0.10555137693881989, + "learning_rate": 1.7572307692307692e-05, + "loss": 0.0309, + "step": 790 + }, + { + "epoch": 6.153846153846154, + "grad_norm": 7.061936855316162, + "learning_rate": 1.7541538461538463e-05, + "loss": 0.1161, + "step": 800 + }, + { + "epoch": 6.230769230769231, + "grad_norm": 0.22455041110515594, + "learning_rate": 1.751076923076923e-05, + "loss": 0.0206, + "step": 810 + }, + { + "epoch": 6.3076923076923075, + "grad_norm": 0.5675930976867676, + "learning_rate": 1.7480000000000002e-05, + "loss": 0.0572, + "step": 820 + }, + { + "epoch": 6.384615384615385, + "grad_norm": 2.61474347114563, + "learning_rate": 1.7449230769230773e-05, + "loss": 0.1632, + "step": 830 + }, + { + "epoch": 6.461538461538462, + "grad_norm": 17.26417350769043, + "learning_rate": 1.741846153846154e-05, + "loss": 0.1166, + "step": 840 + }, + { + "epoch": 6.538461538461538, + "grad_norm": 0.13066354393959045, + "learning_rate": 1.738769230769231e-05, + "loss": 0.1699, + "step": 850 + }, + { + "epoch": 6.615384615384615, + "grad_norm": 3.2680106163024902, + "learning_rate": 1.7356923076923076e-05, + "loss": 0.0347, + "step": 860 + }, + { + "epoch": 6.6923076923076925, + "grad_norm": 15.424432754516602, + "learning_rate": 1.7326153846153847e-05, + "loss": 0.2143, + "step": 870 + }, + { + "epoch": 6.769230769230769, + "grad_norm": 0.2559768855571747, + "learning_rate": 1.7295384615384615e-05, + "loss": 0.0605, + "step": 880 + }, + { + "epoch": 6.846153846153846, + "grad_norm": 1.5335848331451416, + "learning_rate": 1.7264615384615386e-05, + "loss": 0.0956, + "step": 890 + }, + { + "epoch": 6.923076923076923, + "grad_norm": 18.936418533325195, + "learning_rate": 1.7233846153846154e-05, + "loss": 0.0894, + "step": 900 + }, + { + "epoch": 7.0, + "grad_norm": 0.13383910059928894, + "learning_rate": 1.7203076923076925e-05, + "loss": 0.112, + "step": 910 + }, + { + "epoch": 7.0, + "eval_accuracy": 0.9849624060150376, + "eval_loss": 0.08170107752084732, + "eval_runtime": 0.5145, + "eval_samples_per_second": 258.513, + "eval_steps_per_second": 33.043, + "step": 910 + }, + { + "epoch": 7.076923076923077, + "grad_norm": 0.10564497113227844, + "learning_rate": 1.7172307692307696e-05, + "loss": 0.0313, + "step": 920 + }, + { + "epoch": 7.153846153846154, + "grad_norm": 0.09875310212373734, + "learning_rate": 1.7141538461538464e-05, + "loss": 0.0195, + "step": 930 + }, + { + "epoch": 7.230769230769231, + "grad_norm": 1.6062630414962769, + "learning_rate": 1.711076923076923e-05, + "loss": 0.0202, + "step": 940 + }, + { + "epoch": 7.3076923076923075, + "grad_norm": 2.8777565956115723, + "learning_rate": 1.7080000000000002e-05, + "loss": 0.0695, + "step": 950 + }, + { + "epoch": 7.384615384615385, + "grad_norm": 6.561578750610352, + "learning_rate": 1.704923076923077e-05, + "loss": 0.0224, + "step": 960 + }, + { + "epoch": 7.461538461538462, + "grad_norm": 0.3698536157608032, + "learning_rate": 1.7018461538461538e-05, + "loss": 0.0832, + "step": 970 + }, + { + "epoch": 7.538461538461538, + "grad_norm": 0.08061320334672928, + "learning_rate": 1.698769230769231e-05, + "loss": 0.0374, + "step": 980 + }, + { + "epoch": 7.615384615384615, + "grad_norm": 0.13726599514484406, + "learning_rate": 1.6956923076923077e-05, + "loss": 0.0735, + "step": 990 + }, + { + "epoch": 7.6923076923076925, + "grad_norm": 0.08616241812705994, + "learning_rate": 1.6926153846153848e-05, + "loss": 0.0581, + "step": 1000 + }, + { + "epoch": 7.769230769230769, + "grad_norm": 0.09052889049053192, + "learning_rate": 1.6895384615384615e-05, + "loss": 0.1407, + "step": 1010 + }, + { + "epoch": 7.846153846153846, + "grad_norm": 0.08195500820875168, + "learning_rate": 1.6864615384615387e-05, + "loss": 0.0783, + "step": 1020 + }, + { + "epoch": 7.923076923076923, + "grad_norm": 0.24290509521961212, + "learning_rate": 1.6833846153846158e-05, + "loss": 0.0338, + "step": 1030 + }, + { + "epoch": 8.0, + "grad_norm": 0.24887411296367645, + "learning_rate": 1.6803076923076925e-05, + "loss": 0.1805, + "step": 1040 + }, + { + "epoch": 8.0, + "eval_accuracy": 0.9849624060150376, + "eval_loss": 0.056554585695266724, + "eval_runtime": 0.527, + "eval_samples_per_second": 252.351, + "eval_steps_per_second": 32.255, + "step": 1040 + }, + { + "epoch": 8.076923076923077, + "grad_norm": 0.1855037808418274, + "learning_rate": 1.6772307692307693e-05, + "loss": 0.0593, + "step": 1050 + }, + { + "epoch": 8.153846153846153, + "grad_norm": 0.06974730640649796, + "learning_rate": 1.674153846153846e-05, + "loss": 0.0604, + "step": 1060 + }, + { + "epoch": 8.23076923076923, + "grad_norm": 0.09743643552064896, + "learning_rate": 1.6710769230769232e-05, + "loss": 0.0998, + "step": 1070 + }, + { + "epoch": 8.307692307692308, + "grad_norm": 0.48972249031066895, + "learning_rate": 1.668e-05, + "loss": 0.0862, + "step": 1080 + }, + { + "epoch": 8.384615384615385, + "grad_norm": 0.10573729872703552, + "learning_rate": 1.664923076923077e-05, + "loss": 0.0965, + "step": 1090 + }, + { + "epoch": 8.461538461538462, + "grad_norm": 13.329763412475586, + "learning_rate": 1.661846153846154e-05, + "loss": 0.1358, + "step": 1100 + }, + { + "epoch": 8.538461538461538, + "grad_norm": 0.06869436055421829, + "learning_rate": 1.658769230769231e-05, + "loss": 0.0472, + "step": 1110 + }, + { + "epoch": 8.615384615384615, + "grad_norm": 0.08724185079336166, + "learning_rate": 1.655692307692308e-05, + "loss": 0.0192, + "step": 1120 + }, + { + "epoch": 8.692307692307692, + "grad_norm": 0.31756719946861267, + "learning_rate": 1.6526153846153848e-05, + "loss": 0.0923, + "step": 1130 + }, + { + "epoch": 8.76923076923077, + "grad_norm": 0.07281168550252914, + "learning_rate": 1.6495384615384616e-05, + "loss": 0.0417, + "step": 1140 + }, + { + "epoch": 8.846153846153847, + "grad_norm": 0.0760447159409523, + "learning_rate": 1.6464615384615387e-05, + "loss": 0.0289, + "step": 1150 + }, + { + "epoch": 8.923076923076923, + "grad_norm": 0.0711609274148941, + "learning_rate": 1.6433846153846155e-05, + "loss": 0.0205, + "step": 1160 + }, + { + "epoch": 9.0, + "grad_norm": 4.698291778564453, + "learning_rate": 1.6403076923076922e-05, + "loss": 0.0257, + "step": 1170 + }, + { + "epoch": 9.0, + "eval_accuracy": 0.9924812030075187, + "eval_loss": 0.01932494528591633, + "eval_runtime": 0.5334, + "eval_samples_per_second": 249.357, + "eval_steps_per_second": 31.873, + "step": 1170 + }, + { + "epoch": 9.076923076923077, + "grad_norm": 0.06923072040081024, + "learning_rate": 1.6372307692307693e-05, + "loss": 0.0771, + "step": 1180 + }, + { + "epoch": 9.153846153846153, + "grad_norm": 0.06963079422712326, + "learning_rate": 1.634153846153846e-05, + "loss": 0.1029, + "step": 1190 + }, + { + "epoch": 9.23076923076923, + "grad_norm": 0.09271117299795151, + "learning_rate": 1.6310769230769232e-05, + "loss": 0.0865, + "step": 1200 + }, + { + "epoch": 9.307692307692308, + "grad_norm": 1.8971734046936035, + "learning_rate": 1.628e-05, + "loss": 0.0927, + "step": 1210 + }, + { + "epoch": 9.384615384615385, + "grad_norm": 2.8679933547973633, + "learning_rate": 1.624923076923077e-05, + "loss": 0.0529, + "step": 1220 + }, + { + "epoch": 9.461538461538462, + "grad_norm": 0.09798012673854828, + "learning_rate": 1.6218461538461542e-05, + "loss": 0.0703, + "step": 1230 + }, + { + "epoch": 9.538461538461538, + "grad_norm": 0.2857028841972351, + "learning_rate": 1.618769230769231e-05, + "loss": 0.0428, + "step": 1240 + }, + { + "epoch": 9.615384615384615, + "grad_norm": 1.7935677766799927, + "learning_rate": 1.6156923076923078e-05, + "loss": 0.1339, + "step": 1250 + }, + { + "epoch": 9.692307692307692, + "grad_norm": 5.323243141174316, + "learning_rate": 1.6126153846153845e-05, + "loss": 0.1784, + "step": 1260 + }, + { + "epoch": 9.76923076923077, + "grad_norm": 0.06728469580411911, + "learning_rate": 1.6095384615384616e-05, + "loss": 0.0586, + "step": 1270 + }, + { + "epoch": 9.846153846153847, + "grad_norm": 0.07235895097255707, + "learning_rate": 1.6064615384615384e-05, + "loss": 0.1575, + "step": 1280 + }, + { + "epoch": 9.923076923076923, + "grad_norm": 0.7870832681655884, + "learning_rate": 1.6033846153846155e-05, + "loss": 0.0551, + "step": 1290 + }, + { + "epoch": 10.0, + "grad_norm": 0.06598883867263794, + "learning_rate": 1.6003076923076923e-05, + "loss": 0.0132, + "step": 1300 + }, + { + "epoch": 10.0, + "eval_accuracy": 1.0, + "eval_loss": 0.012247535400092602, + "eval_runtime": 0.5465, + "eval_samples_per_second": 243.345, + "eval_steps_per_second": 31.104, + "step": 1300 + }, + { + "epoch": 10.076923076923077, + "grad_norm": 5.756060600280762, + "learning_rate": 1.5972307692307694e-05, + "loss": 0.0639, + "step": 1310 + }, + { + "epoch": 10.153846153846153, + "grad_norm": 0.08128657191991806, + "learning_rate": 1.5941538461538465e-05, + "loss": 0.0372, + "step": 1320 + }, + { + "epoch": 10.23076923076923, + "grad_norm": 0.05877137556672096, + "learning_rate": 1.5910769230769233e-05, + "loss": 0.0418, + "step": 1330 + }, + { + "epoch": 10.307692307692308, + "grad_norm": 0.059279099106788635, + "learning_rate": 1.588e-05, + "loss": 0.0562, + "step": 1340 + }, + { + "epoch": 10.384615384615385, + "grad_norm": 0.06010574847459793, + "learning_rate": 1.584923076923077e-05, + "loss": 0.1116, + "step": 1350 + }, + { + "epoch": 10.461538461538462, + "grad_norm": 0.056922800838947296, + "learning_rate": 1.581846153846154e-05, + "loss": 0.0187, + "step": 1360 + }, + { + "epoch": 10.538461538461538, + "grad_norm": 7.122900009155273, + "learning_rate": 1.5787692307692307e-05, + "loss": 0.0636, + "step": 1370 + }, + { + "epoch": 10.615384615384615, + "grad_norm": 0.05465436354279518, + "learning_rate": 1.5756923076923078e-05, + "loss": 0.0202, + "step": 1380 + }, + { + "epoch": 10.692307692307692, + "grad_norm": 7.826301097869873, + "learning_rate": 1.5726153846153846e-05, + "loss": 0.014, + "step": 1390 + }, + { + "epoch": 10.76923076923077, + "grad_norm": 0.18159383535385132, + "learning_rate": 1.5695384615384617e-05, + "loss": 0.0751, + "step": 1400 + }, + { + "epoch": 10.846153846153847, + "grad_norm": 0.14412198960781097, + "learning_rate": 1.5664615384615388e-05, + "loss": 0.1063, + "step": 1410 + }, + { + "epoch": 10.923076923076923, + "grad_norm": 15.912820816040039, + "learning_rate": 1.5633846153846156e-05, + "loss": 0.1204, + "step": 1420 + }, + { + "epoch": 11.0, + "grad_norm": 0.054850250482559204, + "learning_rate": 1.5603076923076927e-05, + "loss": 0.0138, + "step": 1430 + }, + { + "epoch": 11.0, + "eval_accuracy": 1.0, + "eval_loss": 0.011250641196966171, + "eval_runtime": 0.5237, + "eval_samples_per_second": 253.964, + "eval_steps_per_second": 32.462, + "step": 1430 + }, + { + "epoch": 11.076923076923077, + "grad_norm": 0.9215047955513, + "learning_rate": 1.5572307692307694e-05, + "loss": 0.0599, + "step": 1440 + }, + { + "epoch": 11.153846153846153, + "grad_norm": 0.0585063137114048, + "learning_rate": 1.5541538461538462e-05, + "loss": 0.0105, + "step": 1450 + }, + { + "epoch": 11.23076923076923, + "grad_norm": 0.07215956598520279, + "learning_rate": 1.551076923076923e-05, + "loss": 0.1199, + "step": 1460 + }, + { + "epoch": 11.307692307692308, + "grad_norm": 0.31457674503326416, + "learning_rate": 1.548e-05, + "loss": 0.0243, + "step": 1470 + }, + { + "epoch": 11.384615384615385, + "grad_norm": 6.269303798675537, + "learning_rate": 1.544923076923077e-05, + "loss": 0.1006, + "step": 1480 + }, + { + "epoch": 11.461538461538462, + "grad_norm": 0.10126815736293793, + "learning_rate": 1.541846153846154e-05, + "loss": 0.0529, + "step": 1490 + }, + { + "epoch": 11.538461538461538, + "grad_norm": 0.0713447853922844, + "learning_rate": 1.5387692307692307e-05, + "loss": 0.1265, + "step": 1500 + }, + { + "epoch": 11.615384615384615, + "grad_norm": 0.05555682256817818, + "learning_rate": 1.535692307692308e-05, + "loss": 0.0312, + "step": 1510 + }, + { + "epoch": 11.692307692307692, + "grad_norm": 0.6526745557785034, + "learning_rate": 1.532615384615385e-05, + "loss": 0.0198, + "step": 1520 + }, + { + "epoch": 11.76923076923077, + "grad_norm": 26.136056900024414, + "learning_rate": 1.5295384615384617e-05, + "loss": 0.1143, + "step": 1530 + }, + { + "epoch": 11.846153846153847, + "grad_norm": 0.0794009119272232, + "learning_rate": 1.5264615384615385e-05, + "loss": 0.0798, + "step": 1540 + }, + { + "epoch": 11.923076923076923, + "grad_norm": 0.05675533041357994, + "learning_rate": 1.5233846153846154e-05, + "loss": 0.0175, + "step": 1550 + }, + { + "epoch": 12.0, + "grad_norm": 0.05353838950395584, + "learning_rate": 1.5203076923076925e-05, + "loss": 0.0702, + "step": 1560 + }, + { + "epoch": 12.0, + "eval_accuracy": 0.9849624060150376, + "eval_loss": 0.07327966392040253, + "eval_runtime": 0.5368, + "eval_samples_per_second": 247.745, + "eval_steps_per_second": 31.667, + "step": 1560 + }, + { + "epoch": 12.076923076923077, + "grad_norm": 0.06556718796491623, + "learning_rate": 1.5172307692307693e-05, + "loss": 0.062, + "step": 1570 + }, + { + "epoch": 12.153846153846153, + "grad_norm": 0.05220413580536842, + "learning_rate": 1.5141538461538463e-05, + "loss": 0.0536, + "step": 1580 + }, + { + "epoch": 12.23076923076923, + "grad_norm": 0.046761319041252136, + "learning_rate": 1.5110769230769232e-05, + "loss": 0.0165, + "step": 1590 + }, + { + "epoch": 12.307692307692308, + "grad_norm": 0.04752703011035919, + "learning_rate": 1.5080000000000001e-05, + "loss": 0.0492, + "step": 1600 + }, + { + "epoch": 12.384615384615385, + "grad_norm": 4.9522857666015625, + "learning_rate": 1.504923076923077e-05, + "loss": 0.0227, + "step": 1610 + }, + { + "epoch": 12.461538461538462, + "grad_norm": 0.06048072502017021, + "learning_rate": 1.501846153846154e-05, + "loss": 0.009, + "step": 1620 + }, + { + "epoch": 12.538461538461538, + "grad_norm": 0.6236754655838013, + "learning_rate": 1.498769230769231e-05, + "loss": 0.0093, + "step": 1630 + }, + { + "epoch": 12.615384615384615, + "grad_norm": 0.043698158115148544, + "learning_rate": 1.4956923076923077e-05, + "loss": 0.0532, + "step": 1640 + }, + { + "epoch": 12.692307692307692, + "grad_norm": 0.2654157876968384, + "learning_rate": 1.4926153846153848e-05, + "loss": 0.0689, + "step": 1650 + }, + { + "epoch": 12.76923076923077, + "grad_norm": 0.07314817607402802, + "learning_rate": 1.4895384615384616e-05, + "loss": 0.113, + "step": 1660 + }, + { + "epoch": 12.846153846153847, + "grad_norm": 0.047497883439064026, + "learning_rate": 1.4864615384615385e-05, + "loss": 0.0242, + "step": 1670 + }, + { + "epoch": 12.923076923076923, + "grad_norm": 0.045534830540418625, + "learning_rate": 1.4833846153846155e-05, + "loss": 0.0662, + "step": 1680 + }, + { + "epoch": 13.0, + "grad_norm": 0.05489093437790871, + "learning_rate": 1.4803076923076924e-05, + "loss": 0.0631, + "step": 1690 + }, + { + "epoch": 13.0, + "eval_accuracy": 0.9624060150375939, + "eval_loss": 0.16809675097465515, + "eval_runtime": 0.5346, + "eval_samples_per_second": 248.782, + "eval_steps_per_second": 31.799, + "step": 1690 + }, + { + "epoch": 13.076923076923077, + "grad_norm": 4.369393348693848, + "learning_rate": 1.4772307692307692e-05, + "loss": 0.0094, + "step": 1700 + }, + { + "epoch": 13.153846153846153, + "grad_norm": 0.11855673789978027, + "learning_rate": 1.4741538461538463e-05, + "loss": 0.0861, + "step": 1710 + }, + { + "epoch": 13.23076923076923, + "grad_norm": 0.04420981928706169, + "learning_rate": 1.4710769230769232e-05, + "loss": 0.0554, + "step": 1720 + }, + { + "epoch": 13.307692307692308, + "grad_norm": 17.567378997802734, + "learning_rate": 1.4680000000000002e-05, + "loss": 0.1276, + "step": 1730 + }, + { + "epoch": 13.384615384615385, + "grad_norm": 27.38726806640625, + "learning_rate": 1.4649230769230771e-05, + "loss": 0.1181, + "step": 1740 + }, + { + "epoch": 13.461538461538462, + "grad_norm": 0.4835070073604584, + "learning_rate": 1.4618461538461539e-05, + "loss": 0.0877, + "step": 1750 + }, + { + "epoch": 13.538461538461538, + "grad_norm": 0.04729830101132393, + "learning_rate": 1.458769230769231e-05, + "loss": 0.0901, + "step": 1760 + }, + { + "epoch": 13.615384615384615, + "grad_norm": 28.740734100341797, + "learning_rate": 1.4556923076923078e-05, + "loss": 0.0895, + "step": 1770 + }, + { + "epoch": 13.692307692307692, + "grad_norm": 0.03954683616757393, + "learning_rate": 1.4526153846153847e-05, + "loss": 0.0626, + "step": 1780 + }, + { + "epoch": 13.76923076923077, + "grad_norm": 0.042213909327983856, + "learning_rate": 1.4495384615384616e-05, + "loss": 0.0087, + "step": 1790 + }, + { + "epoch": 13.846153846153847, + "grad_norm": 0.047844819724559784, + "learning_rate": 1.4464615384615386e-05, + "loss": 0.0796, + "step": 1800 + }, + { + "epoch": 13.923076923076923, + "grad_norm": 0.04067637026309967, + "learning_rate": 1.4433846153846155e-05, + "loss": 0.0091, + "step": 1810 + }, + { + "epoch": 14.0, + "grad_norm": 0.04228312522172928, + "learning_rate": 1.4403076923076925e-05, + "loss": 0.0234, + "step": 1820 + }, + { + "epoch": 14.0, + "eval_accuracy": 1.0, + "eval_loss": 0.007983488962054253, + "eval_runtime": 0.6132, + "eval_samples_per_second": 216.892, + "eval_steps_per_second": 27.723, + "step": 1820 + }, + { + "epoch": 14.076923076923077, + "grad_norm": 0.041289396584033966, + "learning_rate": 1.4372307692307694e-05, + "loss": 0.0741, + "step": 1830 + }, + { + "epoch": 14.153846153846153, + "grad_norm": 21.82209014892578, + "learning_rate": 1.4341538461538462e-05, + "loss": 0.0756, + "step": 1840 + }, + { + "epoch": 14.23076923076923, + "grad_norm": 0.0443301722407341, + "learning_rate": 1.4310769230769233e-05, + "loss": 0.0114, + "step": 1850 + }, + { + "epoch": 14.307692307692308, + "grad_norm": 17.30769920349121, + "learning_rate": 1.428e-05, + "loss": 0.0443, + "step": 1860 + }, + { + "epoch": 14.384615384615385, + "grad_norm": 0.08275499939918518, + "learning_rate": 1.4249230769230772e-05, + "loss": 0.0816, + "step": 1870 + }, + { + "epoch": 14.461538461538462, + "grad_norm": 0.06993326544761658, + "learning_rate": 1.421846153846154e-05, + "loss": 0.02, + "step": 1880 + }, + { + "epoch": 14.538461538461538, + "grad_norm": 0.038256365805864334, + "learning_rate": 1.4187692307692309e-05, + "loss": 0.0378, + "step": 1890 + }, + { + "epoch": 14.615384615384615, + "grad_norm": 0.03897516429424286, + "learning_rate": 1.4156923076923076e-05, + "loss": 0.0339, + "step": 1900 + }, + { + "epoch": 14.692307692307692, + "grad_norm": 0.11164209991693497, + "learning_rate": 1.4126153846153847e-05, + "loss": 0.0585, + "step": 1910 + }, + { + "epoch": 14.76923076923077, + "grad_norm": 0.09699133783578873, + "learning_rate": 1.4095384615384617e-05, + "loss": 0.0692, + "step": 1920 + }, + { + "epoch": 14.846153846153847, + "grad_norm": 3.6259686946868896, + "learning_rate": 1.4064615384615386e-05, + "loss": 0.1692, + "step": 1930 + }, + { + "epoch": 14.923076923076923, + "grad_norm": 0.03823119401931763, + "learning_rate": 1.4033846153846156e-05, + "loss": 0.0761, + "step": 1940 + }, + { + "epoch": 15.0, + "grad_norm": 0.7793135643005371, + "learning_rate": 1.4003076923076923e-05, + "loss": 0.088, + "step": 1950 + }, + { + "epoch": 15.0, + "eval_accuracy": 1.0, + "eval_loss": 0.007652882486581802, + "eval_runtime": 0.5238, + "eval_samples_per_second": 253.898, + "eval_steps_per_second": 32.453, + "step": 1950 + }, + { + "epoch": 15.076923076923077, + "grad_norm": 0.278807133436203, + "learning_rate": 1.3972307692307694e-05, + "loss": 0.0556, + "step": 1960 + }, + { + "epoch": 15.153846153846153, + "grad_norm": 0.7071412801742554, + "learning_rate": 1.3941538461538462e-05, + "loss": 0.1088, + "step": 1970 + }, + { + "epoch": 15.23076923076923, + "grad_norm": 0.04095473885536194, + "learning_rate": 1.3910769230769232e-05, + "loss": 0.1378, + "step": 1980 + }, + { + "epoch": 15.307692307692308, + "grad_norm": 0.04028699919581413, + "learning_rate": 1.3880000000000001e-05, + "loss": 0.0086, + "step": 1990 + }, + { + "epoch": 15.384615384615385, + "grad_norm": 0.03561926260590553, + "learning_rate": 1.384923076923077e-05, + "loss": 0.0732, + "step": 2000 + }, + { + "epoch": 15.461538461538462, + "grad_norm": 2.915605068206787, + "learning_rate": 1.3818461538461541e-05, + "loss": 0.0727, + "step": 2010 + }, + { + "epoch": 15.538461538461538, + "grad_norm": 0.04950162023305893, + "learning_rate": 1.3787692307692309e-05, + "loss": 0.0578, + "step": 2020 + }, + { + "epoch": 15.615384615384615, + "grad_norm": 23.74346351623535, + "learning_rate": 1.3756923076923079e-05, + "loss": 0.0422, + "step": 2030 + }, + { + "epoch": 15.692307692307692, + "grad_norm": 0.20986884832382202, + "learning_rate": 1.3726153846153846e-05, + "loss": 0.2549, + "step": 2040 + }, + { + "epoch": 15.76923076923077, + "grad_norm": 0.03871215134859085, + "learning_rate": 1.3695384615384617e-05, + "loss": 0.0654, + "step": 2050 + }, + { + "epoch": 15.846153846153847, + "grad_norm": 0.037374626845121384, + "learning_rate": 1.3664615384615385e-05, + "loss": 0.0069, + "step": 2060 + }, + { + "epoch": 15.923076923076923, + "grad_norm": 0.03548239544034004, + "learning_rate": 1.3633846153846156e-05, + "loss": 0.033, + "step": 2070 + }, + { + "epoch": 16.0, + "grad_norm": 0.039899665862321854, + "learning_rate": 1.3603076923076924e-05, + "loss": 0.0502, + "step": 2080 + }, + { + "epoch": 16.0, + "eval_accuracy": 1.0, + "eval_loss": 0.006897523999214172, + "eval_runtime": 0.533, + "eval_samples_per_second": 249.532, + "eval_steps_per_second": 31.895, + "step": 2080 + }, + { + "epoch": 16.076923076923077, + "grad_norm": 0.12189177423715591, + "learning_rate": 1.3572307692307693e-05, + "loss": 0.0118, + "step": 2090 + }, + { + "epoch": 16.153846153846153, + "grad_norm": 0.06642794609069824, + "learning_rate": 1.3541538461538464e-05, + "loss": 0.1052, + "step": 2100 + }, + { + "epoch": 16.23076923076923, + "grad_norm": 2.3815836906433105, + "learning_rate": 1.3510769230769232e-05, + "loss": 0.0742, + "step": 2110 + }, + { + "epoch": 16.307692307692307, + "grad_norm": 3.2235050201416016, + "learning_rate": 1.3480000000000001e-05, + "loss": 0.0083, + "step": 2120 + }, + { + "epoch": 16.384615384615383, + "grad_norm": 0.035906847566366196, + "learning_rate": 1.344923076923077e-05, + "loss": 0.0064, + "step": 2130 + }, + { + "epoch": 16.46153846153846, + "grad_norm": 5.5031938552856445, + "learning_rate": 1.341846153846154e-05, + "loss": 0.1338, + "step": 2140 + }, + { + "epoch": 16.53846153846154, + "grad_norm": 0.0375455841422081, + "learning_rate": 1.3387692307692308e-05, + "loss": 0.0152, + "step": 2150 + }, + { + "epoch": 16.615384615384617, + "grad_norm": 0.07156344503164291, + "learning_rate": 1.3356923076923079e-05, + "loss": 0.0069, + "step": 2160 + }, + { + "epoch": 16.692307692307693, + "grad_norm": 0.04594796895980835, + "learning_rate": 1.3326153846153847e-05, + "loss": 0.0227, + "step": 2170 + }, + { + "epoch": 16.76923076923077, + "grad_norm": 0.07690643519163132, + "learning_rate": 1.3295384615384616e-05, + "loss": 0.0082, + "step": 2180 + }, + { + "epoch": 16.846153846153847, + "grad_norm": 26.13251304626465, + "learning_rate": 1.3264615384615385e-05, + "loss": 0.132, + "step": 2190 + }, + { + "epoch": 16.923076923076923, + "grad_norm": 0.03183867409825325, + "learning_rate": 1.3233846153846155e-05, + "loss": 0.0681, + "step": 2200 + }, + { + "epoch": 17.0, + "grad_norm": 0.06133001297712326, + "learning_rate": 1.3203076923076926e-05, + "loss": 0.007, + "step": 2210 + }, + { + "epoch": 17.0, + "eval_accuracy": 1.0, + "eval_loss": 0.006952174473553896, + "eval_runtime": 0.539, + "eval_samples_per_second": 246.77, + "eval_steps_per_second": 31.542, + "step": 2210 + }, + { + "epoch": 17.076923076923077, + "grad_norm": 0.03733190521597862, + "learning_rate": 1.3172307692307694e-05, + "loss": 0.0135, + "step": 2220 + }, + { + "epoch": 17.153846153846153, + "grad_norm": 0.035284917801618576, + "learning_rate": 1.3141538461538463e-05, + "loss": 0.007, + "step": 2230 + }, + { + "epoch": 17.23076923076923, + "grad_norm": 0.034291546791791916, + "learning_rate": 1.311076923076923e-05, + "loss": 0.0199, + "step": 2240 + }, + { + "epoch": 17.307692307692307, + "grad_norm": 0.05276218429207802, + "learning_rate": 1.3080000000000002e-05, + "loss": 0.0739, + "step": 2250 + }, + { + "epoch": 17.384615384615383, + "grad_norm": 0.03207957744598389, + "learning_rate": 1.304923076923077e-05, + "loss": 0.0629, + "step": 2260 + }, + { + "epoch": 17.46153846153846, + "grad_norm": 0.041123662143945694, + "learning_rate": 1.301846153846154e-05, + "loss": 0.0075, + "step": 2270 + }, + { + "epoch": 17.53846153846154, + "grad_norm": 0.6720069646835327, + "learning_rate": 1.2987692307692308e-05, + "loss": 0.0622, + "step": 2280 + }, + { + "epoch": 17.615384615384617, + "grad_norm": 0.03161491081118584, + "learning_rate": 1.2956923076923078e-05, + "loss": 0.006, + "step": 2290 + }, + { + "epoch": 17.692307692307693, + "grad_norm": 0.031039714813232422, + "learning_rate": 1.2926153846153849e-05, + "loss": 0.0193, + "step": 2300 + }, + { + "epoch": 17.76923076923077, + "grad_norm": 0.03025674633681774, + "learning_rate": 1.2895384615384616e-05, + "loss": 0.0739, + "step": 2310 + }, + { + "epoch": 17.846153846153847, + "grad_norm": 0.035373855382204056, + "learning_rate": 1.2864615384615386e-05, + "loss": 0.0702, + "step": 2320 + }, + { + "epoch": 17.923076923076923, + "grad_norm": 0.033173635601997375, + "learning_rate": 1.2833846153846155e-05, + "loss": 0.0189, + "step": 2330 + }, + { + "epoch": 18.0, + "grad_norm": 0.04258832708001137, + "learning_rate": 1.2803076923076925e-05, + "loss": 0.0787, + "step": 2340 + }, + { + "epoch": 18.0, + "eval_accuracy": 0.9924812030075187, + "eval_loss": 0.015911292284727097, + "eval_runtime": 0.5392, + "eval_samples_per_second": 246.66, + "eval_steps_per_second": 31.528, + "step": 2340 + }, + { + "epoch": 18.076923076923077, + "grad_norm": 0.06672647595405579, + "learning_rate": 1.2772307692307692e-05, + "loss": 0.006, + "step": 2350 + }, + { + "epoch": 18.153846153846153, + "grad_norm": 0.031002268195152283, + "learning_rate": 1.2741538461538463e-05, + "loss": 0.0087, + "step": 2360 + }, + { + "epoch": 18.23076923076923, + "grad_norm": 0.0316181555390358, + "learning_rate": 1.2710769230769231e-05, + "loss": 0.2059, + "step": 2370 + }, + { + "epoch": 18.307692307692307, + "grad_norm": 0.03107195906341076, + "learning_rate": 1.268e-05, + "loss": 0.0062, + "step": 2380 + }, + { + "epoch": 18.384615384615383, + "grad_norm": 0.03275629132986069, + "learning_rate": 1.264923076923077e-05, + "loss": 0.0189, + "step": 2390 + }, + { + "epoch": 18.46153846153846, + "grad_norm": 7.005457401275635, + "learning_rate": 1.261846153846154e-05, + "loss": 0.0309, + "step": 2400 + }, + { + "epoch": 18.53846153846154, + "grad_norm": 0.03404508903622627, + "learning_rate": 1.258769230769231e-05, + "loss": 0.0107, + "step": 2410 + }, + { + "epoch": 18.615384615384617, + "grad_norm": 0.03446439281105995, + "learning_rate": 1.2556923076923078e-05, + "loss": 0.0166, + "step": 2420 + }, + { + "epoch": 18.692307692307693, + "grad_norm": 26.720441818237305, + "learning_rate": 1.2526153846153848e-05, + "loss": 0.0371, + "step": 2430 + }, + { + "epoch": 18.76923076923077, + "grad_norm": 0.031138837337493896, + "learning_rate": 1.2495384615384615e-05, + "loss": 0.0291, + "step": 2440 + }, + { + "epoch": 18.846153846153847, + "grad_norm": 4.7112627029418945, + "learning_rate": 1.2464615384615386e-05, + "loss": 0.0811, + "step": 2450 + }, + { + "epoch": 18.923076923076923, + "grad_norm": 15.13769245147705, + "learning_rate": 1.2433846153846154e-05, + "loss": 0.0315, + "step": 2460 + }, + { + "epoch": 19.0, + "grad_norm": 0.06720714271068573, + "learning_rate": 1.2403076923076925e-05, + "loss": 0.0322, + "step": 2470 + }, + { + "epoch": 19.0, + "eval_accuracy": 0.9699248120300752, + "eval_loss": 0.09268058836460114, + "eval_runtime": 0.5134, + "eval_samples_per_second": 259.076, + "eval_steps_per_second": 33.115, + "step": 2470 + }, + { + "epoch": 19.076923076923077, + "grad_norm": 1.727834701538086, + "learning_rate": 1.2372307692307693e-05, + "loss": 0.1479, + "step": 2480 + }, + { + "epoch": 19.153846153846153, + "grad_norm": 0.03252265602350235, + "learning_rate": 1.2341538461538462e-05, + "loss": 0.008, + "step": 2490 + }, + { + "epoch": 19.23076923076923, + "grad_norm": 0.03733864426612854, + "learning_rate": 1.2310769230769233e-05, + "loss": 0.0717, + "step": 2500 + }, + { + "epoch": 19.307692307692307, + "grad_norm": 0.029602911323308945, + "learning_rate": 1.2280000000000001e-05, + "loss": 0.0272, + "step": 2510 + }, + { + "epoch": 19.384615384615383, + "grad_norm": 0.031687181442976, + "learning_rate": 1.224923076923077e-05, + "loss": 0.033, + "step": 2520 + }, + { + "epoch": 19.46153846153846, + "grad_norm": 0.03254169225692749, + "learning_rate": 1.221846153846154e-05, + "loss": 0.0058, + "step": 2530 + }, + { + "epoch": 19.53846153846154, + "grad_norm": 0.03204645216464996, + "learning_rate": 1.218769230769231e-05, + "loss": 0.0081, + "step": 2540 + }, + { + "epoch": 19.615384615384617, + "grad_norm": 2.3990302085876465, + "learning_rate": 1.2156923076923077e-05, + "loss": 0.009, + "step": 2550 + }, + { + "epoch": 19.692307692307693, + "grad_norm": 8.247553825378418, + "learning_rate": 1.2126153846153848e-05, + "loss": 0.065, + "step": 2560 + }, + { + "epoch": 19.76923076923077, + "grad_norm": 0.027142079547047615, + "learning_rate": 1.2095384615384616e-05, + "loss": 0.0936, + "step": 2570 + }, + { + "epoch": 19.846153846153847, + "grad_norm": 4.044205188751221, + "learning_rate": 1.2064615384615385e-05, + "loss": 0.0684, + "step": 2580 + }, + { + "epoch": 19.923076923076923, + "grad_norm": 0.025813696905970573, + "learning_rate": 1.2033846153846154e-05, + "loss": 0.0103, + "step": 2590 + }, + { + "epoch": 20.0, + "grad_norm": 0.03822549432516098, + "learning_rate": 1.2003076923076924e-05, + "loss": 0.0051, + "step": 2600 + }, + { + "epoch": 20.0, + "eval_accuracy": 0.9774436090225563, + "eval_loss": 0.07036413252353668, + "eval_runtime": 0.539, + "eval_samples_per_second": 246.732, + "eval_steps_per_second": 31.537, + "step": 2600 + }, + { + "epoch": 20.076923076923077, + "grad_norm": 29.656103134155273, + "learning_rate": 1.1972307692307695e-05, + "loss": 0.0669, + "step": 2610 + }, + { + "epoch": 20.153846153846153, + "grad_norm": 0.08279704302549362, + "learning_rate": 1.1941538461538463e-05, + "loss": 0.058, + "step": 2620 + }, + { + "epoch": 20.23076923076923, + "grad_norm": 0.02927885204553604, + "learning_rate": 1.1910769230769232e-05, + "loss": 0.1304, + "step": 2630 + }, + { + "epoch": 20.307692307692307, + "grad_norm": 0.033018048852682114, + "learning_rate": 1.188e-05, + "loss": 0.0088, + "step": 2640 + }, + { + "epoch": 20.384615384615383, + "grad_norm": 0.027642302215099335, + "learning_rate": 1.1849230769230771e-05, + "loss": 0.0681, + "step": 2650 + }, + { + "epoch": 20.46153846153846, + "grad_norm": 0.025126196444034576, + "learning_rate": 1.1818461538461539e-05, + "loss": 0.0054, + "step": 2660 + }, + { + "epoch": 20.53846153846154, + "grad_norm": 0.03173169121146202, + "learning_rate": 1.178769230769231e-05, + "loss": 0.0643, + "step": 2670 + }, + { + "epoch": 20.615384615384617, + "grad_norm": 0.04188225790858269, + "learning_rate": 1.1756923076923077e-05, + "loss": 0.0655, + "step": 2680 + }, + { + "epoch": 20.692307692307693, + "grad_norm": 6.354015827178955, + "learning_rate": 1.1726153846153847e-05, + "loss": 0.0074, + "step": 2690 + }, + { + "epoch": 20.76923076923077, + "grad_norm": 0.02860679291188717, + "learning_rate": 1.1695384615384618e-05, + "loss": 0.0086, + "step": 2700 + }, + { + "epoch": 20.846153846153847, + "grad_norm": 0.032081782817840576, + "learning_rate": 1.1664615384615386e-05, + "loss": 0.0742, + "step": 2710 + }, + { + "epoch": 20.923076923076923, + "grad_norm": 104.71792602539062, + "learning_rate": 1.1633846153846155e-05, + "loss": 0.1029, + "step": 2720 + }, + { + "epoch": 21.0, + "grad_norm": 0.03403322771191597, + "learning_rate": 1.1603076923076924e-05, + "loss": 0.0053, + "step": 2730 + }, + { + "epoch": 21.0, + "eval_accuracy": 1.0, + "eval_loss": 0.00510411849245429, + "eval_runtime": 0.5492, + "eval_samples_per_second": 242.177, + "eval_steps_per_second": 30.955, + "step": 2730 + }, + { + "epoch": 21.076923076923077, + "grad_norm": 1.5433504581451416, + "learning_rate": 1.1572307692307694e-05, + "loss": 0.0367, + "step": 2740 + }, + { + "epoch": 21.153846153846153, + "grad_norm": 3.759187936782837, + "learning_rate": 1.1541538461538461e-05, + "loss": 0.0811, + "step": 2750 + }, + { + "epoch": 21.23076923076923, + "grad_norm": 0.02730879932641983, + "learning_rate": 1.1510769230769232e-05, + "loss": 0.1553, + "step": 2760 + }, + { + "epoch": 21.307692307692307, + "grad_norm": 0.025684796273708344, + "learning_rate": 1.148e-05, + "loss": 0.005, + "step": 2770 + }, + { + "epoch": 21.384615384615383, + "grad_norm": 1.0041700601577759, + "learning_rate": 1.144923076923077e-05, + "loss": 0.0223, + "step": 2780 + }, + { + "epoch": 21.46153846153846, + "grad_norm": 0.03505957871675491, + "learning_rate": 1.141846153846154e-05, + "loss": 0.0052, + "step": 2790 + }, + { + "epoch": 21.53846153846154, + "grad_norm": 0.028406942263245583, + "learning_rate": 1.1387692307692308e-05, + "loss": 0.0054, + "step": 2800 + }, + { + "epoch": 21.615384615384617, + "grad_norm": 0.03344385325908661, + "learning_rate": 1.135692307692308e-05, + "loss": 0.0092, + "step": 2810 + }, + { + "epoch": 21.692307692307693, + "grad_norm": 0.0294520054012537, + "learning_rate": 1.1326153846153847e-05, + "loss": 0.0064, + "step": 2820 + }, + { + "epoch": 21.76923076923077, + "grad_norm": 0.027606450021266937, + "learning_rate": 1.1295384615384617e-05, + "loss": 0.0066, + "step": 2830 + }, + { + "epoch": 21.846153846153847, + "grad_norm": 0.025868358090519905, + "learning_rate": 1.1264615384615384e-05, + "loss": 0.0152, + "step": 2840 + }, + { + "epoch": 21.923076923076923, + "grad_norm": 0.05131072178483009, + "learning_rate": 1.1233846153846155e-05, + "loss": 0.0049, + "step": 2850 + }, + { + "epoch": 22.0, + "grad_norm": 0.20607870817184448, + "learning_rate": 1.1203076923076923e-05, + "loss": 0.0056, + "step": 2860 + }, + { + "epoch": 22.0, + "eval_accuracy": 0.9924812030075187, + "eval_loss": 0.03114698827266693, + "eval_runtime": 0.53, + "eval_samples_per_second": 250.951, + "eval_steps_per_second": 32.076, + "step": 2860 + }, + { + "epoch": 22.076923076923077, + "grad_norm": 0.024041753262281418, + "learning_rate": 1.1172307692307694e-05, + "loss": 0.1269, + "step": 2870 + }, + { + "epoch": 22.153846153846153, + "grad_norm": 0.030343621969223022, + "learning_rate": 1.1141538461538462e-05, + "loss": 0.0422, + "step": 2880 + }, + { + "epoch": 22.23076923076923, + "grad_norm": 11.643026351928711, + "learning_rate": 1.1110769230769231e-05, + "loss": 0.0574, + "step": 2890 + }, + { + "epoch": 22.307692307692307, + "grad_norm": 0.04696710407733917, + "learning_rate": 1.1080000000000002e-05, + "loss": 0.0809, + "step": 2900 + }, + { + "epoch": 22.384615384615383, + "grad_norm": 0.03564168140292168, + "learning_rate": 1.104923076923077e-05, + "loss": 0.037, + "step": 2910 + }, + { + "epoch": 22.46153846153846, + "grad_norm": 0.024806920439004898, + "learning_rate": 1.101846153846154e-05, + "loss": 0.0421, + "step": 2920 + }, + { + "epoch": 22.53846153846154, + "grad_norm": 0.024994414299726486, + "learning_rate": 1.0987692307692309e-05, + "loss": 0.0052, + "step": 2930 + }, + { + "epoch": 22.615384615384617, + "grad_norm": 0.023204075172543526, + "learning_rate": 1.0956923076923078e-05, + "loss": 0.0055, + "step": 2940 + }, + { + "epoch": 22.692307692307693, + "grad_norm": 0.034952543675899506, + "learning_rate": 1.0926153846153846e-05, + "loss": 0.0184, + "step": 2950 + }, + { + "epoch": 22.76923076923077, + "grad_norm": 0.022795893251895905, + "learning_rate": 1.0895384615384617e-05, + "loss": 0.0048, + "step": 2960 + }, + { + "epoch": 22.846153846153847, + "grad_norm": 0.09495553374290466, + "learning_rate": 1.0864615384615385e-05, + "loss": 0.0045, + "step": 2970 + }, + { + "epoch": 22.923076923076923, + "grad_norm": 0.02656714804470539, + "learning_rate": 1.0833846153846154e-05, + "loss": 0.0768, + "step": 2980 + }, + { + "epoch": 23.0, + "grad_norm": 0.02597396820783615, + "learning_rate": 1.0803076923076925e-05, + "loss": 0.0763, + "step": 2990 + }, + { + "epoch": 23.0, + "eval_accuracy": 1.0, + "eval_loss": 0.004323595203459263, + "eval_runtime": 0.5703, + "eval_samples_per_second": 233.208, + "eval_steps_per_second": 29.809, + "step": 2990 + }, + { + "epoch": 23.076923076923077, + "grad_norm": 0.9274821877479553, + "learning_rate": 1.0772307692307693e-05, + "loss": 0.0526, + "step": 3000 + }, + { + "epoch": 23.153846153846153, + "grad_norm": 0.02383274771273136, + "learning_rate": 1.0741538461538464e-05, + "loss": 0.0426, + "step": 3010 + }, + { + "epoch": 23.23076923076923, + "grad_norm": 0.02221047319471836, + "learning_rate": 1.0710769230769232e-05, + "loss": 0.0153, + "step": 3020 + }, + { + "epoch": 23.307692307692307, + "grad_norm": 0.023653149604797363, + "learning_rate": 1.0680000000000001e-05, + "loss": 0.0051, + "step": 3030 + }, + { + "epoch": 23.384615384615383, + "grad_norm": 5.931399345397949, + "learning_rate": 1.0649230769230769e-05, + "loss": 0.1295, + "step": 3040 + }, + { + "epoch": 23.46153846153846, + "grad_norm": 0.025742702186107635, + "learning_rate": 1.061846153846154e-05, + "loss": 0.0043, + "step": 3050 + }, + { + "epoch": 23.53846153846154, + "grad_norm": 4.639406681060791, + "learning_rate": 1.0587692307692308e-05, + "loss": 0.0775, + "step": 3060 + }, + { + "epoch": 23.615384615384617, + "grad_norm": 0.02598275989294052, + "learning_rate": 1.0556923076923079e-05, + "loss": 0.0736, + "step": 3070 + }, + { + "epoch": 23.692307692307693, + "grad_norm": 0.06597548723220825, + "learning_rate": 1.0526153846153846e-05, + "loss": 0.0087, + "step": 3080 + }, + { + "epoch": 23.76923076923077, + "grad_norm": 0.027538040652871132, + "learning_rate": 1.0495384615384616e-05, + "loss": 0.0045, + "step": 3090 + }, + { + "epoch": 23.846153846153847, + "grad_norm": 1.389827847480774, + "learning_rate": 1.0464615384615387e-05, + "loss": 0.006, + "step": 3100 + }, + { + "epoch": 23.923076923076923, + "grad_norm": 0.021650169044733047, + "learning_rate": 1.0433846153846155e-05, + "loss": 0.1117, + "step": 3110 + }, + { + "epoch": 24.0, + "grad_norm": 0.03035648725926876, + "learning_rate": 1.0403076923076924e-05, + "loss": 0.0045, + "step": 3120 + }, + { + "epoch": 24.0, + "eval_accuracy": 1.0, + "eval_loss": 0.004528364632278681, + "eval_runtime": 0.5128, + "eval_samples_per_second": 259.38, + "eval_steps_per_second": 33.154, + "step": 3120 + }, + { + "epoch": 24.076923076923077, + "grad_norm": 0.030537929385900497, + "learning_rate": 1.0372307692307693e-05, + "loss": 0.0586, + "step": 3130 + }, + { + "epoch": 24.153846153846153, + "grad_norm": 0.028254270553588867, + "learning_rate": 1.0341538461538463e-05, + "loss": 0.0153, + "step": 3140 + }, + { + "epoch": 24.23076923076923, + "grad_norm": 0.02149919420480728, + "learning_rate": 1.031076923076923e-05, + "loss": 0.0256, + "step": 3150 + }, + { + "epoch": 24.307692307692307, + "grad_norm": 0.01989160105586052, + "learning_rate": 1.0280000000000002e-05, + "loss": 0.0502, + "step": 3160 + }, + { + "epoch": 24.384615384615383, + "grad_norm": 0.02913454733788967, + "learning_rate": 1.024923076923077e-05, + "loss": 0.0203, + "step": 3170 + }, + { + "epoch": 24.46153846153846, + "grad_norm": 36.81732940673828, + "learning_rate": 1.0218461538461539e-05, + "loss": 0.1064, + "step": 3180 + }, + { + "epoch": 24.53846153846154, + "grad_norm": 0.0211233738809824, + "learning_rate": 1.018769230769231e-05, + "loss": 0.056, + "step": 3190 + }, + { + "epoch": 24.615384615384617, + "grad_norm": 0.025757772848010063, + "learning_rate": 1.0156923076923077e-05, + "loss": 0.0058, + "step": 3200 + }, + { + "epoch": 24.692307692307693, + "grad_norm": 0.04154786840081215, + "learning_rate": 1.0126153846153849e-05, + "loss": 0.0697, + "step": 3210 + }, + { + "epoch": 24.76923076923077, + "grad_norm": 0.026883425191044807, + "learning_rate": 1.0095384615384616e-05, + "loss": 0.0823, + "step": 3220 + }, + { + "epoch": 24.846153846153847, + "grad_norm": 0.022140992805361748, + "learning_rate": 1.0064615384615386e-05, + "loss": 0.0039, + "step": 3230 + }, + { + "epoch": 24.923076923076923, + "grad_norm": 0.02101975493133068, + "learning_rate": 1.0033846153846153e-05, + "loss": 0.0742, + "step": 3240 + }, + { + "epoch": 25.0, + "grad_norm": 0.031036680564284325, + "learning_rate": 1.0003076923076924e-05, + "loss": 0.0039, + "step": 3250 + }, + { + "epoch": 25.0, + "eval_accuracy": 1.0, + "eval_loss": 0.004188508726656437, + "eval_runtime": 0.5295, + "eval_samples_per_second": 251.176, + "eval_steps_per_second": 32.105, + "step": 3250 + }, + { + "epoch": 25.076923076923077, + "grad_norm": 0.09134573489427567, + "learning_rate": 9.972307692307694e-06, + "loss": 0.0492, + "step": 3260 + }, + { + "epoch": 25.153846153846153, + "grad_norm": 0.024105530232191086, + "learning_rate": 9.941538461538463e-06, + "loss": 0.1689, + "step": 3270 + }, + { + "epoch": 25.23076923076923, + "grad_norm": 0.4104077219963074, + "learning_rate": 9.910769230769231e-06, + "loss": 0.0042, + "step": 3280 + }, + { + "epoch": 25.307692307692307, + "grad_norm": 0.03301569074392319, + "learning_rate": 9.88e-06, + "loss": 0.0042, + "step": 3290 + }, + { + "epoch": 25.384615384615383, + "grad_norm": 0.02022700011730194, + "learning_rate": 9.84923076923077e-06, + "loss": 0.005, + "step": 3300 + }, + { + "epoch": 25.46153846153846, + "grad_norm": 2.8255412578582764, + "learning_rate": 9.818461538461539e-06, + "loss": 0.0047, + "step": 3310 + }, + { + "epoch": 25.53846153846154, + "grad_norm": 0.025504756718873978, + "learning_rate": 9.787692307692308e-06, + "loss": 0.136, + "step": 3320 + }, + { + "epoch": 25.615384615384617, + "grad_norm": 0.025238120928406715, + "learning_rate": 9.756923076923078e-06, + "loss": 0.0477, + "step": 3330 + }, + { + "epoch": 25.692307692307693, + "grad_norm": 0.027804501354694366, + "learning_rate": 9.726153846153847e-06, + "loss": 0.0737, + "step": 3340 + }, + { + "epoch": 25.76923076923077, + "grad_norm": 0.03406287729740143, + "learning_rate": 9.695384615384617e-06, + "loss": 0.0039, + "step": 3350 + }, + { + "epoch": 25.846153846153847, + "grad_norm": 0.034681614488363266, + "learning_rate": 9.664615384615386e-06, + "loss": 0.0458, + "step": 3360 + }, + { + "epoch": 25.923076923076923, + "grad_norm": 24.834915161132812, + "learning_rate": 9.633846153846155e-06, + "loss": 0.14, + "step": 3370 + }, + { + "epoch": 26.0, + "grad_norm": 0.05473438277840614, + "learning_rate": 9.603076923076923e-06, + "loss": 0.0041, + "step": 3380 + }, + { + "epoch": 26.0, + "eval_accuracy": 1.0, + "eval_loss": 0.003798810066655278, + "eval_runtime": 0.4905, + "eval_samples_per_second": 271.129, + "eval_steps_per_second": 34.656, + "step": 3380 + }, + { + "epoch": 26.076923076923077, + "grad_norm": 0.1843373030424118, + "learning_rate": 9.572307692307693e-06, + "loss": 0.0235, + "step": 3390 + }, + { + "epoch": 26.153846153846153, + "grad_norm": 27.810468673706055, + "learning_rate": 9.541538461538462e-06, + "loss": 0.04, + "step": 3400 + }, + { + "epoch": 26.23076923076923, + "grad_norm": 0.02020885981619358, + "learning_rate": 9.510769230769231e-06, + "loss": 0.0578, + "step": 3410 + }, + { + "epoch": 26.307692307692307, + "grad_norm": 0.03883889317512512, + "learning_rate": 9.48e-06, + "loss": 0.0144, + "step": 3420 + }, + { + "epoch": 26.384615384615383, + "grad_norm": 1.0686793327331543, + "learning_rate": 9.44923076923077e-06, + "loss": 0.0896, + "step": 3430 + }, + { + "epoch": 26.46153846153846, + "grad_norm": 0.02395572140812874, + "learning_rate": 9.41846153846154e-06, + "loss": 0.0683, + "step": 3440 + }, + { + "epoch": 26.53846153846154, + "grad_norm": 0.043965164572000504, + "learning_rate": 9.387692307692309e-06, + "loss": 0.0492, + "step": 3450 + }, + { + "epoch": 26.615384615384617, + "grad_norm": 0.020178133621811867, + "learning_rate": 9.356923076923078e-06, + "loss": 0.0761, + "step": 3460 + }, + { + "epoch": 26.692307692307693, + "grad_norm": 0.021391689777374268, + "learning_rate": 9.326153846153848e-06, + "loss": 0.0035, + "step": 3470 + }, + { + "epoch": 26.76923076923077, + "grad_norm": 0.023591719567775726, + "learning_rate": 9.295384615384615e-06, + "loss": 0.0035, + "step": 3480 + }, + { + "epoch": 26.846153846153847, + "grad_norm": 27.220369338989258, + "learning_rate": 9.264615384615385e-06, + "loss": 0.0936, + "step": 3490 + }, + { + "epoch": 26.923076923076923, + "grad_norm": 5.122826099395752, + "learning_rate": 9.233846153846154e-06, + "loss": 0.0087, + "step": 3500 + }, + { + "epoch": 27.0, + "grad_norm": 0.025866487994790077, + "learning_rate": 9.203076923076924e-06, + "loss": 0.0038, + "step": 3510 + }, + { + "epoch": 27.0, + "eval_accuracy": 1.0, + "eval_loss": 0.0037521023768931627, + "eval_runtime": 0.5235, + "eval_samples_per_second": 254.041, + "eval_steps_per_second": 32.471, + "step": 3510 + }, + { + "epoch": 27.076923076923077, + "grad_norm": 0.02617652341723442, + "learning_rate": 9.172307692307693e-06, + "loss": 0.0035, + "step": 3520 + }, + { + "epoch": 27.153846153846153, + "grad_norm": 0.019553804770112038, + "learning_rate": 9.141538461538462e-06, + "loss": 0.0056, + "step": 3530 + }, + { + "epoch": 27.23076923076923, + "grad_norm": 0.3406715393066406, + "learning_rate": 9.110769230769232e-06, + "loss": 0.0052, + "step": 3540 + }, + { + "epoch": 27.307692307692307, + "grad_norm": 0.021406322717666626, + "learning_rate": 9.080000000000001e-06, + "loss": 0.0649, + "step": 3550 + }, + { + "epoch": 27.384615384615383, + "grad_norm": 0.017564058303833008, + "learning_rate": 9.04923076923077e-06, + "loss": 0.0556, + "step": 3560 + }, + { + "epoch": 27.46153846153846, + "grad_norm": 0.017913702875375748, + "learning_rate": 9.01846153846154e-06, + "loss": 0.0063, + "step": 3570 + }, + { + "epoch": 27.53846153846154, + "grad_norm": 12.03849983215332, + "learning_rate": 8.987692307692308e-06, + "loss": 0.0492, + "step": 3580 + }, + { + "epoch": 27.615384615384617, + "grad_norm": 0.032500751316547394, + "learning_rate": 8.956923076923077e-06, + "loss": 0.0391, + "step": 3590 + }, + { + "epoch": 27.692307692307693, + "grad_norm": 0.02035779319703579, + "learning_rate": 8.926153846153846e-06, + "loss": 0.0034, + "step": 3600 + }, + { + "epoch": 27.76923076923077, + "grad_norm": 0.09579632431268692, + "learning_rate": 8.895384615384616e-06, + "loss": 0.0251, + "step": 3610 + }, + { + "epoch": 27.846153846153847, + "grad_norm": 37.06322479248047, + "learning_rate": 8.864615384615385e-06, + "loss": 0.0454, + "step": 3620 + }, + { + "epoch": 27.923076923076923, + "grad_norm": 0.017491474747657776, + "learning_rate": 8.833846153846155e-06, + "loss": 0.0699, + "step": 3630 + }, + { + "epoch": 28.0, + "grad_norm": 0.02647402323782444, + "learning_rate": 8.803076923076924e-06, + "loss": 0.0732, + "step": 3640 + }, + { + "epoch": 28.0, + "eval_accuracy": 0.9924812030075187, + "eval_loss": 0.036787230521440506, + "eval_runtime": 0.5259, + "eval_samples_per_second": 252.922, + "eval_steps_per_second": 32.328, + "step": 3640 + }, + { + "epoch": 28.076923076923077, + "grad_norm": 38.54633712768555, + "learning_rate": 8.772307692307693e-06, + "loss": 0.0992, + "step": 3650 + }, + { + "epoch": 28.153846153846153, + "grad_norm": 0.01895446516573429, + "learning_rate": 8.741538461538463e-06, + "loss": 0.0039, + "step": 3660 + }, + { + "epoch": 28.23076923076923, + "grad_norm": 0.0215084720402956, + "learning_rate": 8.710769230769232e-06, + "loss": 0.0569, + "step": 3670 + }, + { + "epoch": 28.307692307692307, + "grad_norm": 6.288870334625244, + "learning_rate": 8.68e-06, + "loss": 0.0322, + "step": 3680 + }, + { + "epoch": 28.384615384615383, + "grad_norm": 0.01768365502357483, + "learning_rate": 8.64923076923077e-06, + "loss": 0.0037, + "step": 3690 + }, + { + "epoch": 28.46153846153846, + "grad_norm": 0.039491429924964905, + "learning_rate": 8.618461538461539e-06, + "loss": 0.0033, + "step": 3700 + }, + { + "epoch": 28.53846153846154, + "grad_norm": 0.016860023140907288, + "learning_rate": 8.587692307692308e-06, + "loss": 0.0082, + "step": 3710 + }, + { + "epoch": 28.615384615384617, + "grad_norm": 3.477928638458252, + "learning_rate": 8.556923076923077e-06, + "loss": 0.0584, + "step": 3720 + }, + { + "epoch": 28.692307692307693, + "grad_norm": 0.01949097402393818, + "learning_rate": 8.526153846153847e-06, + "loss": 0.0036, + "step": 3730 + }, + { + "epoch": 28.76923076923077, + "grad_norm": 0.017446843907237053, + "learning_rate": 8.495384615384616e-06, + "loss": 0.0351, + "step": 3740 + }, + { + "epoch": 28.846153846153847, + "grad_norm": 0.017422709614038467, + "learning_rate": 8.464615384615386e-06, + "loss": 0.0033, + "step": 3750 + }, + { + "epoch": 28.923076923076923, + "grad_norm": 0.016815312206745148, + "learning_rate": 8.433846153846155e-06, + "loss": 0.0112, + "step": 3760 + }, + { + "epoch": 29.0, + "grad_norm": 0.01812721975147724, + "learning_rate": 8.403076923076924e-06, + "loss": 0.003, + "step": 3770 + }, + { + "epoch": 29.0, + "eval_accuracy": 0.9774436090225563, + "eval_loss": 0.061834584921598434, + "eval_runtime": 0.5311, + "eval_samples_per_second": 250.417, + "eval_steps_per_second": 32.008, + "step": 3770 + }, + { + "epoch": 29.076923076923077, + "grad_norm": 0.01886649616062641, + "learning_rate": 8.372307692307692e-06, + "loss": 0.0031, + "step": 3780 + }, + { + "epoch": 29.153846153846153, + "grad_norm": 0.01647001877427101, + "learning_rate": 8.341538461538462e-06, + "loss": 0.003, + "step": 3790 + }, + { + "epoch": 29.23076923076923, + "grad_norm": 0.39807796478271484, + "learning_rate": 8.310769230769231e-06, + "loss": 0.0052, + "step": 3800 + }, + { + "epoch": 29.307692307692307, + "grad_norm": 0.018272781744599342, + "learning_rate": 8.28e-06, + "loss": 0.0037, + "step": 3810 + }, + { + "epoch": 29.384615384615383, + "grad_norm": 0.016053559258580208, + "learning_rate": 8.24923076923077e-06, + "loss": 0.004, + "step": 3820 + }, + { + "epoch": 29.46153846153846, + "grad_norm": 36.70539474487305, + "learning_rate": 8.218461538461539e-06, + "loss": 0.0204, + "step": 3830 + }, + { + "epoch": 29.53846153846154, + "grad_norm": 0.015738273039460182, + "learning_rate": 8.187692307692309e-06, + "loss": 0.009, + "step": 3840 + }, + { + "epoch": 29.615384615384617, + "grad_norm": 0.023203525692224503, + "learning_rate": 8.156923076923078e-06, + "loss": 0.0034, + "step": 3850 + }, + { + "epoch": 29.692307692307693, + "grad_norm": 0.016458503901958466, + "learning_rate": 8.126153846153847e-06, + "loss": 0.0601, + "step": 3860 + }, + { + "epoch": 29.76923076923077, + "grad_norm": 0.01801135018467903, + "learning_rate": 8.095384615384617e-06, + "loss": 0.0039, + "step": 3870 + }, + { + "epoch": 29.846153846153847, + "grad_norm": 0.016389012336730957, + "learning_rate": 8.064615384615384e-06, + "loss": 0.003, + "step": 3880 + }, + { + "epoch": 29.923076923076923, + "grad_norm": 0.016634546220302582, + "learning_rate": 8.033846153846154e-06, + "loss": 0.1151, + "step": 3890 + }, + { + "epoch": 30.0, + "grad_norm": 0.018060820177197456, + "learning_rate": 8.003076923076923e-06, + "loss": 0.003, + "step": 3900 + }, + { + "epoch": 30.0, + "eval_accuracy": 0.9774436090225563, + "eval_loss": 0.07695284485816956, + "eval_runtime": 0.5033, + "eval_samples_per_second": 264.262, + "eval_steps_per_second": 33.778, + "step": 3900 + }, + { + "epoch": 30.076923076923077, + "grad_norm": 0.015643548220396042, + "learning_rate": 7.972307692307693e-06, + "loss": 0.003, + "step": 3910 + }, + { + "epoch": 30.153846153846153, + "grad_norm": 0.02010912075638771, + "learning_rate": 7.941538461538462e-06, + "loss": 0.0032, + "step": 3920 + }, + { + "epoch": 30.23076923076923, + "grad_norm": 0.018395811319351196, + "learning_rate": 7.910769230769231e-06, + "loss": 0.0028, + "step": 3930 + }, + { + "epoch": 30.307692307692307, + "grad_norm": 0.016207564622163773, + "learning_rate": 7.88e-06, + "loss": 0.1054, + "step": 3940 + }, + { + "epoch": 30.384615384615383, + "grad_norm": 0.016036609187722206, + "learning_rate": 7.84923076923077e-06, + "loss": 0.0083, + "step": 3950 + }, + { + "epoch": 30.46153846153846, + "grad_norm": 0.019265053793787956, + "learning_rate": 7.81846153846154e-06, + "loss": 0.0029, + "step": 3960 + }, + { + "epoch": 30.53846153846154, + "grad_norm": 0.015535300597548485, + "learning_rate": 7.787692307692309e-06, + "loss": 0.019, + "step": 3970 + }, + { + "epoch": 30.615384615384617, + "grad_norm": 0.017704902216792107, + "learning_rate": 7.756923076923077e-06, + "loss": 0.0029, + "step": 3980 + }, + { + "epoch": 30.692307692307693, + "grad_norm": 0.015535766258835793, + "learning_rate": 7.726153846153846e-06, + "loss": 0.003, + "step": 3990 + }, + { + "epoch": 30.76923076923077, + "grad_norm": 0.015465916134417057, + "learning_rate": 7.695384615384615e-06, + "loss": 0.0107, + "step": 4000 + }, + { + "epoch": 30.846153846153847, + "grad_norm": 0.015191212296485901, + "learning_rate": 7.664615384615385e-06, + "loss": 0.0028, + "step": 4010 + }, + { + "epoch": 30.923076923076923, + "grad_norm": 0.03314828500151634, + "learning_rate": 7.633846153846154e-06, + "loss": 0.0803, + "step": 4020 + }, + { + "epoch": 31.0, + "grad_norm": 0.022903915494680405, + "learning_rate": 7.6030769230769245e-06, + "loss": 0.0029, + "step": 4030 + }, + { + "epoch": 31.0, + "eval_accuracy": 0.9924812030075187, + "eval_loss": 0.027967238798737526, + "eval_runtime": 0.5156, + "eval_samples_per_second": 257.974, + "eval_steps_per_second": 32.974, + "step": 4030 + }, + { + "epoch": 31.076923076923077, + "grad_norm": 0.015114635229110718, + "learning_rate": 7.572307692307693e-06, + "loss": 0.0032, + "step": 4040 + }, + { + "epoch": 31.153846153846153, + "grad_norm": 0.01759064383804798, + "learning_rate": 7.5415384615384624e-06, + "loss": 0.0046, + "step": 4050 + }, + { + "epoch": 31.23076923076923, + "grad_norm": 0.11509862542152405, + "learning_rate": 7.510769230769232e-06, + "loss": 0.0044, + "step": 4060 + }, + { + "epoch": 31.307692307692307, + "grad_norm": 0.014955403283238411, + "learning_rate": 7.48e-06, + "loss": 0.0026, + "step": 4070 + }, + { + "epoch": 31.384615384615383, + "grad_norm": 0.015624427236616611, + "learning_rate": 7.44923076923077e-06, + "loss": 0.0676, + "step": 4080 + }, + { + "epoch": 31.46153846153846, + "grad_norm": 0.01573682203888893, + "learning_rate": 7.418461538461539e-06, + "loss": 0.0035, + "step": 4090 + }, + { + "epoch": 31.53846153846154, + "grad_norm": 3.9268758296966553, + "learning_rate": 7.387692307692308e-06, + "loss": 0.1539, + "step": 4100 + }, + { + "epoch": 31.615384615384617, + "grad_norm": 0.015769314020872116, + "learning_rate": 7.356923076923077e-06, + "loss": 0.0028, + "step": 4110 + }, + { + "epoch": 31.692307692307693, + "grad_norm": 0.02034025639295578, + "learning_rate": 7.326153846153847e-06, + "loss": 0.0069, + "step": 4120 + }, + { + "epoch": 31.76923076923077, + "grad_norm": 0.2609105706214905, + "learning_rate": 7.295384615384617e-06, + "loss": 0.0033, + "step": 4130 + }, + { + "epoch": 31.846153846153847, + "grad_norm": 0.02863130159676075, + "learning_rate": 7.264615384615385e-06, + "loss": 0.0028, + "step": 4140 + }, + { + "epoch": 31.923076923076923, + "grad_norm": 0.014573452062904835, + "learning_rate": 7.233846153846155e-06, + "loss": 0.0027, + "step": 4150 + }, + { + "epoch": 32.0, + "grad_norm": 0.02208390086889267, + "learning_rate": 7.203076923076924e-06, + "loss": 0.0027, + "step": 4160 + }, + { + "epoch": 32.0, + "eval_accuracy": 1.0, + "eval_loss": 0.00548548111692071, + "eval_runtime": 0.528, + "eval_samples_per_second": 251.887, + "eval_steps_per_second": 32.196, + "step": 4160 + }, + { + "epoch": 32.07692307692308, + "grad_norm": 0.01504305750131607, + "learning_rate": 7.172307692307693e-06, + "loss": 0.0617, + "step": 4170 + }, + { + "epoch": 32.15384615384615, + "grad_norm": 0.013871644623577595, + "learning_rate": 7.141538461538462e-06, + "loss": 0.0446, + "step": 4180 + }, + { + "epoch": 32.23076923076923, + "grad_norm": 1.4766546487808228, + "learning_rate": 7.1107692307692314e-06, + "loss": 0.0036, + "step": 4190 + }, + { + "epoch": 32.30769230769231, + "grad_norm": 0.01372003834694624, + "learning_rate": 7.08e-06, + "loss": 0.0041, + "step": 4200 + }, + { + "epoch": 32.38461538461539, + "grad_norm": 0.020989365875720978, + "learning_rate": 7.049230769230769e-06, + "loss": 0.0101, + "step": 4210 + }, + { + "epoch": 32.46153846153846, + "grad_norm": 0.013711328618228436, + "learning_rate": 7.01846153846154e-06, + "loss": 0.0027, + "step": 4220 + }, + { + "epoch": 32.53846153846154, + "grad_norm": 0.014522346667945385, + "learning_rate": 6.987692307692309e-06, + "loss": 0.0131, + "step": 4230 + }, + { + "epoch": 32.61538461538461, + "grad_norm": 0.015881482511758804, + "learning_rate": 6.9569230769230776e-06, + "loss": 0.0121, + "step": 4240 + }, + { + "epoch": 32.69230769230769, + "grad_norm": 0.01881629228591919, + "learning_rate": 6.926153846153847e-06, + "loss": 0.0025, + "step": 4250 + }, + { + "epoch": 32.76923076923077, + "grad_norm": 0.014320643618702888, + "learning_rate": 6.895384615384616e-06, + "loss": 0.0025, + "step": 4260 + }, + { + "epoch": 32.84615384615385, + "grad_norm": 0.014492589980363846, + "learning_rate": 6.864615384615385e-06, + "loss": 0.0024, + "step": 4270 + }, + { + "epoch": 32.92307692307692, + "grad_norm": 0.054243363440036774, + "learning_rate": 6.833846153846154e-06, + "loss": 0.0026, + "step": 4280 + }, + { + "epoch": 33.0, + "grad_norm": 0.015907688066363335, + "learning_rate": 6.803076923076924e-06, + "loss": 0.0027, + "step": 4290 + }, + { + "epoch": 33.0, + "eval_accuracy": 1.0, + "eval_loss": 0.004582146182656288, + "eval_runtime": 0.5289, + "eval_samples_per_second": 251.469, + "eval_steps_per_second": 32.143, + "step": 4290 + }, + { + "epoch": 33.07692307692308, + "grad_norm": 0.4758153259754181, + "learning_rate": 6.772307692307692e-06, + "loss": 0.0034, + "step": 4300 + }, + { + "epoch": 33.15384615384615, + "grad_norm": 0.013688137754797935, + "learning_rate": 6.741538461538462e-06, + "loss": 0.0801, + "step": 4310 + }, + { + "epoch": 33.23076923076923, + "grad_norm": 0.014032394625246525, + "learning_rate": 6.710769230769232e-06, + "loss": 0.0831, + "step": 4320 + }, + { + "epoch": 33.30769230769231, + "grad_norm": 0.013409950770437717, + "learning_rate": 6.680000000000001e-06, + "loss": 0.0568, + "step": 4330 + }, + { + "epoch": 33.38461538461539, + "grad_norm": 0.01506001129746437, + "learning_rate": 6.64923076923077e-06, + "loss": 0.0027, + "step": 4340 + }, + { + "epoch": 33.46153846153846, + "grad_norm": 0.015481417998671532, + "learning_rate": 6.618461538461539e-06, + "loss": 0.0842, + "step": 4350 + }, + { + "epoch": 33.53846153846154, + "grad_norm": 0.01454329676926136, + "learning_rate": 6.587692307692309e-06, + "loss": 0.0729, + "step": 4360 + }, + { + "epoch": 33.61538461538461, + "grad_norm": 0.014938593842089176, + "learning_rate": 6.556923076923077e-06, + "loss": 0.0035, + "step": 4370 + }, + { + "epoch": 33.69230769230769, + "grad_norm": 0.026904650032520294, + "learning_rate": 6.5261538461538465e-06, + "loss": 0.0029, + "step": 4380 + }, + { + "epoch": 33.76923076923077, + "grad_norm": 0.015914857387542725, + "learning_rate": 6.495384615384616e-06, + "loss": 0.0027, + "step": 4390 + }, + { + "epoch": 33.84615384615385, + "grad_norm": 0.017130790278315544, + "learning_rate": 6.4646153846153845e-06, + "loss": 0.0153, + "step": 4400 + }, + { + "epoch": 33.92307692307692, + "grad_norm": 0.013609221205115318, + "learning_rate": 6.433846153846154e-06, + "loss": 0.078, + "step": 4410 + }, + { + "epoch": 34.0, + "grad_norm": 0.017566576600074768, + "learning_rate": 6.403076923076924e-06, + "loss": 0.0073, + "step": 4420 + }, + { + "epoch": 34.0, + "eval_accuracy": 1.0, + "eval_loss": 0.0026711083482950926, + "eval_runtime": 0.5155, + "eval_samples_per_second": 257.978, + "eval_steps_per_second": 32.975, + "step": 4420 + }, + { + "epoch": 34.07692307692308, + "grad_norm": 0.01437960471957922, + "learning_rate": 6.3723076923076935e-06, + "loss": 0.05, + "step": 4430 + }, + { + "epoch": 34.15384615384615, + "grad_norm": 0.014740287326276302, + "learning_rate": 6.341538461538462e-06, + "loss": 0.0559, + "step": 4440 + }, + { + "epoch": 34.23076923076923, + "grad_norm": 0.014450889080762863, + "learning_rate": 6.3107692307692315e-06, + "loss": 0.0715, + "step": 4450 + }, + { + "epoch": 34.30769230769231, + "grad_norm": 0.014556027948856354, + "learning_rate": 6.280000000000001e-06, + "loss": 0.0027, + "step": 4460 + }, + { + "epoch": 34.38461538461539, + "grad_norm": 0.021746527403593063, + "learning_rate": 6.249230769230769e-06, + "loss": 0.0026, + "step": 4470 + }, + { + "epoch": 34.46153846153846, + "grad_norm": 23.73759651184082, + "learning_rate": 6.218461538461539e-06, + "loss": 0.0572, + "step": 4480 + }, + { + "epoch": 34.53846153846154, + "grad_norm": 0.019620612263679504, + "learning_rate": 6.187692307692308e-06, + "loss": 0.0028, + "step": 4490 + }, + { + "epoch": 34.61538461538461, + "grad_norm": 0.017097901552915573, + "learning_rate": 6.156923076923077e-06, + "loss": 0.0044, + "step": 4500 + }, + { + "epoch": 34.69230769230769, + "grad_norm": 0.022046837955713272, + "learning_rate": 6.126153846153846e-06, + "loss": 0.0026, + "step": 4510 + }, + { + "epoch": 34.76923076923077, + "grad_norm": 0.014298142865300179, + "learning_rate": 6.095384615384616e-06, + "loss": 0.0026, + "step": 4520 + }, + { + "epoch": 34.84615384615385, + "grad_norm": 0.013737565837800503, + "learning_rate": 6.064615384615386e-06, + "loss": 0.107, + "step": 4530 + }, + { + "epoch": 34.92307692307692, + "grad_norm": 0.015088734216988087, + "learning_rate": 6.033846153846154e-06, + "loss": 0.0539, + "step": 4540 + }, + { + "epoch": 35.0, + "grad_norm": 21.94456672668457, + "learning_rate": 6.003076923076924e-06, + "loss": 0.0325, + "step": 4550 + }, + { + "epoch": 35.0, + "eval_accuracy": 0.9924812030075187, + "eval_loss": 0.010248888283967972, + "eval_runtime": 0.5513, + "eval_samples_per_second": 241.259, + "eval_steps_per_second": 30.838, + "step": 4550 + }, + { + "epoch": 35.07692307692308, + "grad_norm": 0.023278767243027687, + "learning_rate": 5.972307692307693e-06, + "loss": 0.0564, + "step": 4560 + }, + { + "epoch": 35.15384615384615, + "grad_norm": 0.012957429513335228, + "learning_rate": 5.941538461538462e-06, + "loss": 0.0564, + "step": 4570 + }, + { + "epoch": 35.23076923076923, + "grad_norm": 0.01405767910182476, + "learning_rate": 5.910769230769231e-06, + "loss": 0.0053, + "step": 4580 + }, + { + "epoch": 35.30769230769231, + "grad_norm": 0.015820490196347237, + "learning_rate": 5.8800000000000005e-06, + "loss": 0.0024, + "step": 4590 + }, + { + "epoch": 35.38461538461539, + "grad_norm": 0.017123892903327942, + "learning_rate": 5.849230769230769e-06, + "loss": 0.0185, + "step": 4600 + }, + { + "epoch": 35.46153846153846, + "grad_norm": 0.019515695050358772, + "learning_rate": 5.818461538461538e-06, + "loss": 0.0389, + "step": 4610 + }, + { + "epoch": 35.53846153846154, + "grad_norm": 0.014827900566160679, + "learning_rate": 5.787692307692309e-06, + "loss": 0.0414, + "step": 4620 + }, + { + "epoch": 35.61538461538461, + "grad_norm": 0.012985621578991413, + "learning_rate": 5.756923076923078e-06, + "loss": 0.0144, + "step": 4630 + }, + { + "epoch": 35.69230769230769, + "grad_norm": 0.017493218183517456, + "learning_rate": 5.726153846153847e-06, + "loss": 0.0653, + "step": 4640 + }, + { + "epoch": 35.76923076923077, + "grad_norm": 0.22555764019489288, + "learning_rate": 5.695384615384616e-06, + "loss": 0.0037, + "step": 4650 + }, + { + "epoch": 35.84615384615385, + "grad_norm": 0.013455439358949661, + "learning_rate": 5.664615384615385e-06, + "loss": 0.0034, + "step": 4660 + }, + { + "epoch": 35.92307692307692, + "grad_norm": 0.015202301554381847, + "learning_rate": 5.633846153846154e-06, + "loss": 0.0777, + "step": 4670 + }, + { + "epoch": 36.0, + "grad_norm": 0.01825718767940998, + "learning_rate": 5.603076923076923e-06, + "loss": 0.003, + "step": 4680 + }, + { + "epoch": 36.0, + "eval_accuracy": 0.9924812030075187, + "eval_loss": 0.033365920186042786, + "eval_runtime": 0.5589, + "eval_samples_per_second": 237.953, + "eval_steps_per_second": 30.415, + "step": 4680 + }, + { + "epoch": 36.07692307692308, + "grad_norm": 0.013083132915198803, + "learning_rate": 5.572307692307693e-06, + "loss": 0.0025, + "step": 4690 + }, + { + "epoch": 36.15384615384615, + "grad_norm": 0.016317158937454224, + "learning_rate": 5.541538461538461e-06, + "loss": 0.0618, + "step": 4700 + }, + { + "epoch": 36.23076923076923, + "grad_norm": 0.015961136668920517, + "learning_rate": 5.5107692307692315e-06, + "loss": 0.0024, + "step": 4710 + }, + { + "epoch": 36.30769230769231, + "grad_norm": 0.01492154598236084, + "learning_rate": 5.480000000000001e-06, + "loss": 0.0517, + "step": 4720 + }, + { + "epoch": 36.38461538461539, + "grad_norm": 0.014177103526890278, + "learning_rate": 5.44923076923077e-06, + "loss": 0.003, + "step": 4730 + }, + { + "epoch": 36.46153846153846, + "grad_norm": 0.016013240441679955, + "learning_rate": 5.418461538461539e-06, + "loss": 0.0141, + "step": 4740 + }, + { + "epoch": 36.53846153846154, + "grad_norm": 0.014002722688019276, + "learning_rate": 5.387692307692308e-06, + "loss": 0.1037, + "step": 4750 + }, + { + "epoch": 36.61538461538461, + "grad_norm": 0.013050459325313568, + "learning_rate": 5.356923076923078e-06, + "loss": 0.0305, + "step": 4760 + }, + { + "epoch": 36.69230769230769, + "grad_norm": 0.06147696077823639, + "learning_rate": 5.326153846153846e-06, + "loss": 0.0047, + "step": 4770 + }, + { + "epoch": 36.76923076923077, + "grad_norm": 0.015133769251406193, + "learning_rate": 5.2953846153846156e-06, + "loss": 0.0122, + "step": 4780 + }, + { + "epoch": 36.84615384615385, + "grad_norm": 0.09745892137289047, + "learning_rate": 5.264615384615385e-06, + "loss": 0.0685, + "step": 4790 + }, + { + "epoch": 36.92307692307692, + "grad_norm": 0.3880118429660797, + "learning_rate": 5.2338461538461535e-06, + "loss": 0.0025, + "step": 4800 + }, + { + "epoch": 37.0, + "grad_norm": 0.025298159569501877, + "learning_rate": 5.203076923076924e-06, + "loss": 0.0023, + "step": 4810 + }, + { + "epoch": 37.0, + "eval_accuracy": 0.9924812030075187, + "eval_loss": 0.03191983327269554, + "eval_runtime": 0.5391, + "eval_samples_per_second": 246.729, + "eval_steps_per_second": 31.537, + "step": 4810 + }, + { + "epoch": 37.07692307692308, + "grad_norm": 0.017688415944576263, + "learning_rate": 5.172307692307693e-06, + "loss": 0.062, + "step": 4820 + }, + { + "epoch": 37.15384615384615, + "grad_norm": 0.04807035252451897, + "learning_rate": 5.1415384615384625e-06, + "loss": 0.004, + "step": 4830 + }, + { + "epoch": 37.23076923076923, + "grad_norm": 8.656044006347656, + "learning_rate": 5.110769230769231e-06, + "loss": 0.0693, + "step": 4840 + }, + { + "epoch": 37.30769230769231, + "grad_norm": 0.03971487283706665, + "learning_rate": 5.0800000000000005e-06, + "loss": 0.0029, + "step": 4850 + }, + { + "epoch": 37.38461538461539, + "grad_norm": 0.017609858885407448, + "learning_rate": 5.04923076923077e-06, + "loss": 0.3001, + "step": 4860 + }, + { + "epoch": 37.46153846153846, + "grad_norm": 0.013561651110649109, + "learning_rate": 5.0184615384615384e-06, + "loss": 0.0219, + "step": 4870 + }, + { + "epoch": 37.53846153846154, + "grad_norm": 0.012035545893013477, + "learning_rate": 4.987692307692308e-06, + "loss": 0.0624, + "step": 4880 + }, + { + "epoch": 37.61538461538461, + "grad_norm": 0.015826474875211716, + "learning_rate": 4.956923076923077e-06, + "loss": 0.0025, + "step": 4890 + }, + { + "epoch": 37.69230769230769, + "grad_norm": 0.012562999501824379, + "learning_rate": 4.926153846153847e-06, + "loss": 0.0023, + "step": 4900 + }, + { + "epoch": 37.76923076923077, + "grad_norm": 0.014352110214531422, + "learning_rate": 4.895384615384616e-06, + "loss": 0.0039, + "step": 4910 + }, + { + "epoch": 37.84615384615385, + "grad_norm": 0.014295834116637707, + "learning_rate": 4.8646153846153846e-06, + "loss": 0.0024, + "step": 4920 + }, + { + "epoch": 37.92307692307692, + "grad_norm": 0.6250506639480591, + "learning_rate": 4.833846153846154e-06, + "loss": 0.0474, + "step": 4930 + }, + { + "epoch": 38.0, + "grad_norm": 0.013108393177390099, + "learning_rate": 4.803076923076923e-06, + "loss": 0.0042, + "step": 4940 + }, + { + "epoch": 38.0, + "eval_accuracy": 1.0, + "eval_loss": 0.003052742453292012, + "eval_runtime": 0.5546, + "eval_samples_per_second": 239.827, + "eval_steps_per_second": 30.655, + "step": 4940 + }, + { + "epoch": 38.07692307692308, + "grad_norm": 0.018184663727879524, + "learning_rate": 4.772307692307693e-06, + "loss": 0.0135, + "step": 4950 + }, + { + "epoch": 38.15384615384615, + "grad_norm": 0.015793485566973686, + "learning_rate": 4.741538461538462e-06, + "loss": 0.195, + "step": 4960 + }, + { + "epoch": 38.23076923076923, + "grad_norm": 0.011755319312214851, + "learning_rate": 4.710769230769231e-06, + "loss": 0.0321, + "step": 4970 + }, + { + "epoch": 38.30769230769231, + "grad_norm": 0.012798692099750042, + "learning_rate": 4.680000000000001e-06, + "loss": 0.0024, + "step": 4980 + }, + { + "epoch": 38.38461538461539, + "grad_norm": 0.012951038777828217, + "learning_rate": 4.6492307692307695e-06, + "loss": 0.0106, + "step": 4990 + }, + { + "epoch": 38.46153846153846, + "grad_norm": 14.449468612670898, + "learning_rate": 4.618461538461539e-06, + "loss": 0.0132, + "step": 5000 + }, + { + "epoch": 38.53846153846154, + "grad_norm": 0.012457410804927349, + "learning_rate": 4.587692307692308e-06, + "loss": 0.0022, + "step": 5010 + }, + { + "epoch": 38.61538461538461, + "grad_norm": 0.012001272290945053, + "learning_rate": 4.556923076923077e-06, + "loss": 0.0142, + "step": 5020 + }, + { + "epoch": 38.69230769230769, + "grad_norm": 0.012857157737016678, + "learning_rate": 4.526153846153847e-06, + "loss": 0.1611, + "step": 5030 + }, + { + "epoch": 38.76923076923077, + "grad_norm": 9.153227806091309, + "learning_rate": 4.495384615384616e-06, + "loss": 0.2075, + "step": 5040 + }, + { + "epoch": 38.84615384615385, + "grad_norm": 0.014607460238039494, + "learning_rate": 4.464615384615385e-06, + "loss": 0.0048, + "step": 5050 + }, + { + "epoch": 38.92307692307692, + "grad_norm": 0.01707644946873188, + "learning_rate": 4.433846153846154e-06, + "loss": 0.0065, + "step": 5060 + }, + { + "epoch": 39.0, + "grad_norm": 0.04221203178167343, + "learning_rate": 4.403076923076923e-06, + "loss": 0.0024, + "step": 5070 + }, + { + "epoch": 39.0, + "eval_accuracy": 0.9924812030075187, + "eval_loss": 0.019144639372825623, + "eval_runtime": 0.5241, + "eval_samples_per_second": 253.781, + "eval_steps_per_second": 32.438, + "step": 5070 + }, + { + "epoch": 39.07692307692308, + "grad_norm": 0.01679709553718567, + "learning_rate": 4.372307692307693e-06, + "loss": 0.0186, + "step": 5080 + }, + { + "epoch": 39.15384615384615, + "grad_norm": 0.0380309522151947, + "learning_rate": 4.341538461538462e-06, + "loss": 0.0284, + "step": 5090 + }, + { + "epoch": 39.23076923076923, + "grad_norm": 0.062268923968076706, + "learning_rate": 4.310769230769231e-06, + "loss": 0.0023, + "step": 5100 + }, + { + "epoch": 39.30769230769231, + "grad_norm": 0.012896365486085415, + "learning_rate": 4.2800000000000005e-06, + "loss": 0.0621, + "step": 5110 + }, + { + "epoch": 39.38461538461539, + "grad_norm": 0.012767019681632519, + "learning_rate": 4.249230769230769e-06, + "loss": 0.0466, + "step": 5120 + }, + { + "epoch": 39.46153846153846, + "grad_norm": 0.01415946800261736, + "learning_rate": 4.218461538461539e-06, + "loss": 0.0819, + "step": 5130 + }, + { + "epoch": 39.53846153846154, + "grad_norm": 0.013272907584905624, + "learning_rate": 4.187692307692308e-06, + "loss": 0.003, + "step": 5140 + }, + { + "epoch": 39.61538461538461, + "grad_norm": 0.014489632099866867, + "learning_rate": 4.156923076923077e-06, + "loss": 0.0025, + "step": 5150 + }, + { + "epoch": 39.69230769230769, + "grad_norm": 0.013894579373300076, + "learning_rate": 4.126153846153847e-06, + "loss": 0.0023, + "step": 5160 + }, + { + "epoch": 39.76923076923077, + "grad_norm": 9.107804298400879, + "learning_rate": 4.095384615384615e-06, + "loss": 0.0515, + "step": 5170 + }, + { + "epoch": 39.84615384615385, + "grad_norm": 0.014358878135681152, + "learning_rate": 4.0646153846153854e-06, + "loss": 0.0026, + "step": 5180 + }, + { + "epoch": 39.92307692307692, + "grad_norm": 0.013104984536767006, + "learning_rate": 4.033846153846154e-06, + "loss": 0.0024, + "step": 5190 + }, + { + "epoch": 40.0, + "grad_norm": 0.01427317876368761, + "learning_rate": 4.003076923076923e-06, + "loss": 0.0022, + "step": 5200 + }, + { + "epoch": 40.0, + "eval_accuracy": 1.0, + "eval_loss": 0.003598709125071764, + "eval_runtime": 0.5381, + "eval_samples_per_second": 247.161, + "eval_steps_per_second": 31.592, + "step": 5200 + }, + { + "epoch": 40.07692307692308, + "grad_norm": 0.03101656772196293, + "learning_rate": 3.972307692307693e-06, + "loss": 0.0023, + "step": 5210 + }, + { + "epoch": 40.15384615384615, + "grad_norm": 0.2715461552143097, + "learning_rate": 3.941538461538461e-06, + "loss": 0.0036, + "step": 5220 + }, + { + "epoch": 40.23076923076923, + "grad_norm": 0.01134204026311636, + "learning_rate": 3.9107692307692316e-06, + "loss": 0.0023, + "step": 5230 + }, + { + "epoch": 40.30769230769231, + "grad_norm": 0.14256364107131958, + "learning_rate": 3.88e-06, + "loss": 0.0045, + "step": 5240 + }, + { + "epoch": 40.38461538461539, + "grad_norm": 0.012102825567126274, + "learning_rate": 3.8492307692307695e-06, + "loss": 0.0409, + "step": 5250 + }, + { + "epoch": 40.46153846153846, + "grad_norm": 0.02875143103301525, + "learning_rate": 3.818461538461539e-06, + "loss": 0.0126, + "step": 5260 + }, + { + "epoch": 40.53846153846154, + "grad_norm": 1.0032603740692139, + "learning_rate": 3.787692307692308e-06, + "loss": 0.0254, + "step": 5270 + }, + { + "epoch": 40.61538461538461, + "grad_norm": 0.031576529145240784, + "learning_rate": 3.7569230769230773e-06, + "loss": 0.0478, + "step": 5280 + }, + { + "epoch": 40.69230769230769, + "grad_norm": 0.015850715339183807, + "learning_rate": 3.7261538461538467e-06, + "loss": 0.0024, + "step": 5290 + }, + { + "epoch": 40.76923076923077, + "grad_norm": 0.024947721511125565, + "learning_rate": 3.6953846153846156e-06, + "loss": 0.0024, + "step": 5300 + }, + { + "epoch": 40.84615384615385, + "grad_norm": 0.015615087933838367, + "learning_rate": 3.6646153846153846e-06, + "loss": 0.0021, + "step": 5310 + }, + { + "epoch": 40.92307692307692, + "grad_norm": 0.01173191424459219, + "learning_rate": 3.633846153846154e-06, + "loss": 0.0179, + "step": 5320 + }, + { + "epoch": 41.0, + "grad_norm": 0.013337279669940472, + "learning_rate": 3.6030769230769234e-06, + "loss": 0.0029, + "step": 5330 + }, + { + "epoch": 41.0, + "eval_accuracy": 0.9924812030075187, + "eval_loss": 0.010145832784473896, + "eval_runtime": 0.5302, + "eval_samples_per_second": 250.839, + "eval_steps_per_second": 32.062, + "step": 5330 + }, + { + "epoch": 41.07692307692308, + "grad_norm": 0.016314612701535225, + "learning_rate": 3.572307692307693e-06, + "loss": 0.0137, + "step": 5340 + }, + { + "epoch": 41.15384615384615, + "grad_norm": 0.012036019004881382, + "learning_rate": 3.5415384615384618e-06, + "loss": 0.1352, + "step": 5350 + }, + { + "epoch": 41.23076923076923, + "grad_norm": 0.014739703387022018, + "learning_rate": 3.5107692307692307e-06, + "loss": 0.0021, + "step": 5360 + }, + { + "epoch": 41.30769230769231, + "grad_norm": 0.012463758699595928, + "learning_rate": 3.48e-06, + "loss": 0.0021, + "step": 5370 + }, + { + "epoch": 41.38461538461539, + "grad_norm": 0.01311911828815937, + "learning_rate": 3.4492307692307695e-06, + "loss": 0.072, + "step": 5380 + }, + { + "epoch": 41.46153846153846, + "grad_norm": 0.021092034876346588, + "learning_rate": 3.418461538461539e-06, + "loss": 0.0022, + "step": 5390 + }, + { + "epoch": 41.53846153846154, + "grad_norm": 18.661046981811523, + "learning_rate": 3.387692307692308e-06, + "loss": 0.0754, + "step": 5400 + }, + { + "epoch": 41.61538461538461, + "grad_norm": 1.5773985385894775, + "learning_rate": 3.356923076923077e-06, + "loss": 0.029, + "step": 5410 + }, + { + "epoch": 41.69230769230769, + "grad_norm": 0.012229927815496922, + "learning_rate": 3.3261538461538463e-06, + "loss": 0.0613, + "step": 5420 + }, + { + "epoch": 41.76923076923077, + "grad_norm": 0.011011325754225254, + "learning_rate": 3.2953846153846157e-06, + "loss": 0.0029, + "step": 5430 + }, + { + "epoch": 41.84615384615385, + "grad_norm": 0.019086536020040512, + "learning_rate": 3.264615384615385e-06, + "loss": 0.0023, + "step": 5440 + }, + { + "epoch": 41.92307692307692, + "grad_norm": 0.015000764280557632, + "learning_rate": 3.233846153846154e-06, + "loss": 0.0754, + "step": 5450 + }, + { + "epoch": 42.0, + "grad_norm": 0.01666330359876156, + "learning_rate": 3.203076923076923e-06, + "loss": 0.0021, + "step": 5460 + }, + { + "epoch": 42.0, + "eval_accuracy": 0.9924812030075187, + "eval_loss": 0.014437055215239525, + "eval_runtime": 0.5274, + "eval_samples_per_second": 252.194, + "eval_steps_per_second": 32.235, + "step": 5460 + }, + { + "epoch": 42.07692307692308, + "grad_norm": 0.013320401310920715, + "learning_rate": 3.1723076923076924e-06, + "loss": 0.0427, + "step": 5470 + }, + { + "epoch": 42.15384615384615, + "grad_norm": 0.015172663144767284, + "learning_rate": 3.141538461538462e-06, + "loss": 0.0756, + "step": 5480 + }, + { + "epoch": 42.23076923076923, + "grad_norm": 0.01375211589038372, + "learning_rate": 3.110769230769231e-06, + "loss": 0.0095, + "step": 5490 + }, + { + "epoch": 42.30769230769231, + "grad_norm": 0.010913485661149025, + "learning_rate": 3.08e-06, + "loss": 0.045, + "step": 5500 + }, + { + "epoch": 42.38461538461539, + "grad_norm": 0.012315131723880768, + "learning_rate": 3.049230769230769e-06, + "loss": 0.0166, + "step": 5510 + }, + { + "epoch": 42.46153846153846, + "grad_norm": 0.011989817023277283, + "learning_rate": 3.0184615384615385e-06, + "loss": 0.0023, + "step": 5520 + }, + { + "epoch": 42.53846153846154, + "grad_norm": 0.1684093326330185, + "learning_rate": 2.987692307692308e-06, + "loss": 0.0024, + "step": 5530 + }, + { + "epoch": 42.61538461538461, + "grad_norm": 0.014550294727087021, + "learning_rate": 2.9569230769230773e-06, + "loss": 0.1139, + "step": 5540 + }, + { + "epoch": 42.69230769230769, + "grad_norm": 0.012002137489616871, + "learning_rate": 2.9261538461538463e-06, + "loss": 0.0134, + "step": 5550 + }, + { + "epoch": 42.76923076923077, + "grad_norm": 0.8407074213027954, + "learning_rate": 2.8953846153846153e-06, + "loss": 0.0025, + "step": 5560 + }, + { + "epoch": 42.84615384615385, + "grad_norm": 0.012318179942667484, + "learning_rate": 2.8646153846153847e-06, + "loss": 0.0021, + "step": 5570 + }, + { + "epoch": 42.92307692307692, + "grad_norm": 0.01188538409769535, + "learning_rate": 2.833846153846154e-06, + "loss": 0.0022, + "step": 5580 + }, + { + "epoch": 43.0, + "grad_norm": 0.017193341627717018, + "learning_rate": 2.8030769230769234e-06, + "loss": 0.0021, + "step": 5590 + }, + { + "epoch": 43.0, + "eval_accuracy": 1.0, + "eval_loss": 0.006857139058411121, + "eval_runtime": 0.5239, + "eval_samples_per_second": 253.888, + "eval_steps_per_second": 32.452, + "step": 5590 + }, + { + "epoch": 43.07692307692308, + "grad_norm": 0.015062793157994747, + "learning_rate": 2.7723076923076924e-06, + "loss": 0.0202, + "step": 5600 + }, + { + "epoch": 43.15384615384615, + "grad_norm": 0.032106004655361176, + "learning_rate": 2.7415384615384614e-06, + "loss": 0.0117, + "step": 5610 + }, + { + "epoch": 43.23076923076923, + "grad_norm": 0.017983099445700645, + "learning_rate": 2.710769230769231e-06, + "loss": 0.0205, + "step": 5620 + }, + { + "epoch": 43.30769230769231, + "grad_norm": 0.01667805388569832, + "learning_rate": 2.68e-06, + "loss": 0.0035, + "step": 5630 + }, + { + "epoch": 43.38461538461539, + "grad_norm": 18.033634185791016, + "learning_rate": 2.6492307692307696e-06, + "loss": 0.063, + "step": 5640 + }, + { + "epoch": 43.46153846153846, + "grad_norm": 0.011769862845540047, + "learning_rate": 2.6184615384615385e-06, + "loss": 0.0021, + "step": 5650 + }, + { + "epoch": 43.53846153846154, + "grad_norm": 0.010967381298542023, + "learning_rate": 2.587692307692308e-06, + "loss": 0.0223, + "step": 5660 + }, + { + "epoch": 43.61538461538461, + "grad_norm": 0.013407750055193901, + "learning_rate": 2.5569230769230773e-06, + "loss": 0.0728, + "step": 5670 + }, + { + "epoch": 43.69230769230769, + "grad_norm": 0.015275663696229458, + "learning_rate": 2.5261538461538463e-06, + "loss": 0.0368, + "step": 5680 + }, + { + "epoch": 43.76923076923077, + "grad_norm": 0.011243674904108047, + "learning_rate": 2.4953846153846157e-06, + "loss": 0.1436, + "step": 5690 + }, + { + "epoch": 43.84615384615385, + "grad_norm": 0.014015651308000088, + "learning_rate": 2.4646153846153847e-06, + "loss": 0.0067, + "step": 5700 + }, + { + "epoch": 43.92307692307692, + "grad_norm": 0.01901082880795002, + "learning_rate": 2.433846153846154e-06, + "loss": 0.0022, + "step": 5710 + }, + { + "epoch": 44.0, + "grad_norm": 0.012149263173341751, + "learning_rate": 2.4030769230769235e-06, + "loss": 0.065, + "step": 5720 + }, + { + "epoch": 44.0, + "eval_accuracy": 0.9924812030075187, + "eval_loss": 0.010318690910935402, + "eval_runtime": 0.5282, + "eval_samples_per_second": 251.784, + "eval_steps_per_second": 32.183, + "step": 5720 + }, + { + "epoch": 44.07692307692308, + "grad_norm": 0.3261222243309021, + "learning_rate": 2.3723076923076924e-06, + "loss": 0.0022, + "step": 5730 + }, + { + "epoch": 44.15384615384615, + "grad_norm": 0.012312835082411766, + "learning_rate": 2.341538461538462e-06, + "loss": 0.0084, + "step": 5740 + }, + { + "epoch": 44.23076923076923, + "grad_norm": 0.011203623376786709, + "learning_rate": 2.310769230769231e-06, + "loss": 0.0622, + "step": 5750 + }, + { + "epoch": 44.30769230769231, + "grad_norm": 0.012114602141082287, + "learning_rate": 2.28e-06, + "loss": 0.0023, + "step": 5760 + }, + { + "epoch": 44.38461538461539, + "grad_norm": 0.011456971988081932, + "learning_rate": 2.2492307692307696e-06, + "loss": 0.0779, + "step": 5770 + }, + { + "epoch": 44.46153846153846, + "grad_norm": 0.0135037275031209, + "learning_rate": 2.218461538461539e-06, + "loss": 0.0084, + "step": 5780 + }, + { + "epoch": 44.53846153846154, + "grad_norm": 0.015234984457492828, + "learning_rate": 2.187692307692308e-06, + "loss": 0.0547, + "step": 5790 + }, + { + "epoch": 44.61538461538461, + "grad_norm": 0.04381181672215462, + "learning_rate": 2.156923076923077e-06, + "loss": 0.0023, + "step": 5800 + }, + { + "epoch": 44.69230769230769, + "grad_norm": 0.010934427380561829, + "learning_rate": 2.1261538461538463e-06, + "loss": 0.0021, + "step": 5810 + }, + { + "epoch": 44.76923076923077, + "grad_norm": 27.930889129638672, + "learning_rate": 2.0953846153846157e-06, + "loss": 0.0159, + "step": 5820 + }, + { + "epoch": 44.84615384615385, + "grad_norm": 0.012389708310365677, + "learning_rate": 2.064615384615385e-06, + "loss": 0.0793, + "step": 5830 + }, + { + "epoch": 44.92307692307692, + "grad_norm": 0.028808044269680977, + "learning_rate": 2.033846153846154e-06, + "loss": 0.0022, + "step": 5840 + }, + { + "epoch": 45.0, + "grad_norm": 0.01526501402258873, + "learning_rate": 2.003076923076923e-06, + "loss": 0.0022, + "step": 5850 + }, + { + "epoch": 45.0, + "eval_accuracy": 0.9924812030075187, + "eval_loss": 0.010875669308006763, + "eval_runtime": 0.5276, + "eval_samples_per_second": 252.076, + "eval_steps_per_second": 32.22, + "step": 5850 + }, + { + "epoch": 45.07692307692308, + "grad_norm": 0.013048418797552586, + "learning_rate": 1.9723076923076924e-06, + "loss": 0.0698, + "step": 5860 + }, + { + "epoch": 45.15384615384615, + "grad_norm": 0.01143215224146843, + "learning_rate": 1.941538461538462e-06, + "loss": 0.0634, + "step": 5870 + }, + { + "epoch": 45.23076923076923, + "grad_norm": 0.010755735449492931, + "learning_rate": 1.9107692307692312e-06, + "loss": 0.0019, + "step": 5880 + }, + { + "epoch": 45.30769230769231, + "grad_norm": 0.012564443983137608, + "learning_rate": 1.8800000000000002e-06, + "loss": 0.0239, + "step": 5890 + }, + { + "epoch": 45.38461538461539, + "grad_norm": 0.01237222459167242, + "learning_rate": 1.8492307692307692e-06, + "loss": 0.002, + "step": 5900 + }, + { + "epoch": 45.46153846153846, + "grad_norm": 0.014862455427646637, + "learning_rate": 1.8184615384615386e-06, + "loss": 0.0617, + "step": 5910 + }, + { + "epoch": 45.53846153846154, + "grad_norm": 0.8376421928405762, + "learning_rate": 1.7876923076923078e-06, + "loss": 0.0703, + "step": 5920 + }, + { + "epoch": 45.61538461538461, + "grad_norm": 0.013092657551169395, + "learning_rate": 1.7569230769230772e-06, + "loss": 0.0517, + "step": 5930 + }, + { + "epoch": 45.69230769230769, + "grad_norm": 0.01345751527696848, + "learning_rate": 1.7261538461538463e-06, + "loss": 0.0137, + "step": 5940 + }, + { + "epoch": 45.76923076923077, + "grad_norm": 0.014570233412086964, + "learning_rate": 1.6953846153846153e-06, + "loss": 0.0526, + "step": 5950 + }, + { + "epoch": 45.84615384615385, + "grad_norm": 0.012618360109627247, + "learning_rate": 1.6646153846153847e-06, + "loss": 0.0021, + "step": 5960 + }, + { + "epoch": 45.92307692307692, + "grad_norm": 0.012154246680438519, + "learning_rate": 1.6338461538461539e-06, + "loss": 0.002, + "step": 5970 + }, + { + "epoch": 46.0, + "grad_norm": 0.012495743110775948, + "learning_rate": 1.6030769230769233e-06, + "loss": 0.002, + "step": 5980 + }, + { + "epoch": 46.0, + "eval_accuracy": 1.0, + "eval_loss": 0.0075845057144761086, + "eval_runtime": 0.5229, + "eval_samples_per_second": 254.371, + "eval_steps_per_second": 32.514, + "step": 5980 + }, + { + "epoch": 46.07692307692308, + "grad_norm": 0.011556745506823063, + "learning_rate": 1.5723076923076925e-06, + "loss": 0.0023, + "step": 5990 + }, + { + "epoch": 46.15384615384615, + "grad_norm": 0.012387405149638653, + "learning_rate": 1.5415384615384614e-06, + "loss": 0.0021, + "step": 6000 + }, + { + "epoch": 46.23076923076923, + "grad_norm": 0.010581187903881073, + "learning_rate": 1.5107692307692308e-06, + "loss": 0.0042, + "step": 6010 + }, + { + "epoch": 46.30769230769231, + "grad_norm": 0.015102666802704334, + "learning_rate": 1.48e-06, + "loss": 0.0984, + "step": 6020 + }, + { + "epoch": 46.38461538461539, + "grad_norm": 0.01553966011852026, + "learning_rate": 1.4492307692307694e-06, + "loss": 0.0019, + "step": 6030 + }, + { + "epoch": 46.46153846153846, + "grad_norm": 0.011307159438729286, + "learning_rate": 1.4184615384615386e-06, + "loss": 0.0041, + "step": 6040 + }, + { + "epoch": 46.53846153846154, + "grad_norm": 0.013432678766548634, + "learning_rate": 1.3876923076923076e-06, + "loss": 0.0218, + "step": 6050 + }, + { + "epoch": 46.61538461538461, + "grad_norm": 0.240656316280365, + "learning_rate": 1.356923076923077e-06, + "loss": 0.0027, + "step": 6060 + }, + { + "epoch": 46.69230769230769, + "grad_norm": 0.2763911783695221, + "learning_rate": 1.3261538461538461e-06, + "loss": 0.0021, + "step": 6070 + }, + { + "epoch": 46.76923076923077, + "grad_norm": 0.01159702893346548, + "learning_rate": 1.2953846153846155e-06, + "loss": 0.002, + "step": 6080 + }, + { + "epoch": 46.84615384615385, + "grad_norm": 0.010589174926280975, + "learning_rate": 1.2646153846153847e-06, + "loss": 0.002, + "step": 6090 + }, + { + "epoch": 46.92307692307692, + "grad_norm": 0.013224811293184757, + "learning_rate": 1.233846153846154e-06, + "loss": 0.0023, + "step": 6100 + }, + { + "epoch": 47.0, + "grad_norm": 0.014801280573010445, + "learning_rate": 1.2030769230769233e-06, + "loss": 0.0021, + "step": 6110 + }, + { + "epoch": 47.0, + "eval_accuracy": 0.9924812030075187, + "eval_loss": 0.010350220836699009, + "eval_runtime": 0.5437, + "eval_samples_per_second": 244.619, + "eval_steps_per_second": 31.267, + "step": 6110 + }, + { + "epoch": 47.07692307692308, + "grad_norm": 0.01060748752206564, + "learning_rate": 1.1723076923076925e-06, + "loss": 0.0556, + "step": 6120 + }, + { + "epoch": 47.15384615384615, + "grad_norm": 0.012824353761970997, + "learning_rate": 1.1415384615384617e-06, + "loss": 0.002, + "step": 6130 + }, + { + "epoch": 47.23076923076923, + "grad_norm": 0.012337896041572094, + "learning_rate": 1.1107692307692309e-06, + "loss": 0.0019, + "step": 6140 + }, + { + "epoch": 47.30769230769231, + "grad_norm": 0.011829494498670101, + "learning_rate": 1.08e-06, + "loss": 0.0026, + "step": 6150 + }, + { + "epoch": 47.38461538461539, + "grad_norm": 0.013274445198476315, + "learning_rate": 1.0492307692307694e-06, + "loss": 0.0046, + "step": 6160 + }, + { + "epoch": 47.46153846153846, + "grad_norm": 0.011121121235191822, + "learning_rate": 1.0184615384615386e-06, + "loss": 0.0023, + "step": 6170 + }, + { + "epoch": 47.53846153846154, + "grad_norm": 0.05423010513186455, + "learning_rate": 9.876923076923078e-07, + "loss": 0.002, + "step": 6180 + }, + { + "epoch": 47.61538461538461, + "grad_norm": 0.012248422019183636, + "learning_rate": 9.56923076923077e-07, + "loss": 0.0531, + "step": 6190 + }, + { + "epoch": 47.69230769230769, + "grad_norm": 0.011545045301318169, + "learning_rate": 9.261538461538462e-07, + "loss": 0.0021, + "step": 6200 + }, + { + "epoch": 47.76923076923077, + "grad_norm": 0.010758856311440468, + "learning_rate": 8.953846153846155e-07, + "loss": 0.0536, + "step": 6210 + }, + { + "epoch": 47.84615384615385, + "grad_norm": 0.012839927338063717, + "learning_rate": 8.646153846153847e-07, + "loss": 0.0021, + "step": 6220 + }, + { + "epoch": 47.92307692307692, + "grad_norm": 0.010332600213587284, + "learning_rate": 8.338461538461539e-07, + "loss": 0.0029, + "step": 6230 + }, + { + "epoch": 48.0, + "grad_norm": 0.011800854466855526, + "learning_rate": 8.030769230769231e-07, + "loss": 0.0034, + "step": 6240 + }, + { + "epoch": 48.0, + "eval_accuracy": 0.9849624060150376, + "eval_loss": 0.02306681126356125, + "eval_runtime": 0.5276, + "eval_samples_per_second": 252.072, + "eval_steps_per_second": 32.22, + "step": 6240 + }, + { + "epoch": 48.07692307692308, + "grad_norm": 0.011218088679015636, + "learning_rate": 7.723076923076923e-07, + "loss": 0.002, + "step": 6250 + }, + { + "epoch": 48.15384615384615, + "grad_norm": 0.01928931288421154, + "learning_rate": 7.415384615384616e-07, + "loss": 0.0398, + "step": 6260 + }, + { + "epoch": 48.23076923076923, + "grad_norm": 0.011039414443075657, + "learning_rate": 7.107692307692309e-07, + "loss": 0.0612, + "step": 6270 + }, + { + "epoch": 48.30769230769231, + "grad_norm": 0.013023107312619686, + "learning_rate": 6.800000000000001e-07, + "loss": 0.0024, + "step": 6280 + }, + { + "epoch": 48.38461538461539, + "grad_norm": 0.011004856787621975, + "learning_rate": 6.492307692307692e-07, + "loss": 0.055, + "step": 6290 + }, + { + "epoch": 48.46153846153846, + "grad_norm": 0.010889323428273201, + "learning_rate": 6.184615384615385e-07, + "loss": 0.0019, + "step": 6300 + }, + { + "epoch": 48.53846153846154, + "grad_norm": 0.014071095734834671, + "learning_rate": 5.876923076923077e-07, + "loss": 0.0023, + "step": 6310 + }, + { + "epoch": 48.61538461538461, + "grad_norm": 0.012683231383562088, + "learning_rate": 5.56923076923077e-07, + "loss": 0.0048, + "step": 6320 + }, + { + "epoch": 48.69230769230769, + "grad_norm": 12.02895450592041, + "learning_rate": 5.261538461538462e-07, + "loss": 0.0344, + "step": 6330 + }, + { + "epoch": 48.76923076923077, + "grad_norm": 0.035363540053367615, + "learning_rate": 4.953846153846155e-07, + "loss": 0.0024, + "step": 6340 + }, + { + "epoch": 48.84615384615385, + "grad_norm": 0.014501710422337055, + "learning_rate": 4.6461538461538465e-07, + "loss": 0.0035, + "step": 6350 + }, + { + "epoch": 48.92307692307692, + "grad_norm": 0.01146631222218275, + "learning_rate": 4.3384615384615384e-07, + "loss": 0.0303, + "step": 6360 + }, + { + "epoch": 49.0, + "grad_norm": 0.012319709174335003, + "learning_rate": 4.0307692307692313e-07, + "loss": 0.0578, + "step": 6370 + }, + { + "epoch": 49.0, + "eval_accuracy": 0.9924812030075187, + "eval_loss": 0.027832742780447006, + "eval_runtime": 0.4975, + "eval_samples_per_second": 267.35, + "eval_steps_per_second": 34.173, + "step": 6370 + }, + { + "epoch": 49.07692307692308, + "grad_norm": 0.01587672345340252, + "learning_rate": 3.7230769230769236e-07, + "loss": 0.006, + "step": 6380 + }, + { + "epoch": 49.15384615384615, + "grad_norm": 0.011370908468961716, + "learning_rate": 3.4153846153846155e-07, + "loss": 0.0021, + "step": 6390 + }, + { + "epoch": 49.23076923076923, + "grad_norm": 0.011154048144817352, + "learning_rate": 3.107692307692308e-07, + "loss": 0.0019, + "step": 6400 + }, + { + "epoch": 49.30769230769231, + "grad_norm": 0.011250234209001064, + "learning_rate": 2.8e-07, + "loss": 0.0021, + "step": 6410 + }, + { + "epoch": 49.38461538461539, + "grad_norm": 0.012271963059902191, + "learning_rate": 2.4923076923076926e-07, + "loss": 0.0021, + "step": 6420 + }, + { + "epoch": 49.46153846153846, + "grad_norm": 0.0118482680991292, + "learning_rate": 2.1846153846153847e-07, + "loss": 0.0023, + "step": 6430 + }, + { + "epoch": 49.53846153846154, + "grad_norm": 0.01767592318356037, + "learning_rate": 1.8769230769230773e-07, + "loss": 0.0037, + "step": 6440 + }, + { + "epoch": 49.61538461538461, + "grad_norm": 0.01147297490388155, + "learning_rate": 1.5692307692307694e-07, + "loss": 0.002, + "step": 6450 + }, + { + "epoch": 49.69230769230769, + "grad_norm": 0.023704219609498978, + "learning_rate": 1.2615384615384617e-07, + "loss": 0.002, + "step": 6460 + }, + { + "epoch": 49.76923076923077, + "grad_norm": 1.87815260887146, + "learning_rate": 9.53846153846154e-08, + "loss": 0.0973, + "step": 6470 + }, + { + "epoch": 49.84615384615385, + "grad_norm": 0.013064450584352016, + "learning_rate": 6.461538461538462e-08, + "loss": 0.0388, + "step": 6480 + }, + { + "epoch": 49.92307692307692, + "grad_norm": 0.013756626285612583, + "learning_rate": 3.384615384615385e-08, + "loss": 0.0052, + "step": 6490 + }, + { + "epoch": 50.0, + "grad_norm": 0.12089427560567856, + "learning_rate": 3.0769230769230774e-09, + "loss": 0.0041, + "step": 6500 + }, + { + "epoch": 50.0, + "eval_accuracy": 0.9924812030075187, + "eval_loss": 0.028640763834118843, + "eval_runtime": 0.4928, + "eval_samples_per_second": 269.898, + "eval_steps_per_second": 34.498, + "step": 6500 + }, + { + "epoch": 50.0, + "step": 6500, + "total_flos": 4.006371770595533e+18, + "train_loss": 0.058502563614971366, + "train_runtime": 459.1371, + "train_samples_per_second": 112.603, + "train_steps_per_second": 14.157 } ], "logging_steps": 10, - "max_steps": 650, + "max_steps": 6500, "num_input_tokens_seen": 0, - "num_train_epochs": 5, + "num_train_epochs": 50, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { @@ -536,7 +5036,7 @@ "attributes": {} } }, - "total_flos": 4.006371770595533e+17, + "total_flos": 4.006371770595533e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null