diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { - "best_metric": 0.4903908967971802, - "best_model_checkpoint": "./beans_outputs/checkpoint-621", - "epoch": 50.0, + "best_metric": 0.4896911084651947, + "best_model_checkpoint": "./beans_outputs/checkpoint-1495", + "epoch": 200.0, "eval_steps": 500, - "global_step": 1150, + "global_step": 4600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -1264,19 +1264,3784 @@ "step": 1150 }, { - "epoch": 50.0, - "step": 1150, - "total_flos": 2.72467378584576e+17, - "train_loss": 0.4791690407628598, - "train_runtime": 1616.3187, - "train_samples_per_second": 44.391, - "train_steps_per_second": 0.711 + "epoch": 50.43478260869565, + "grad_norm": 2.7469470500946045, + "learning_rate": 1.4956521739130436e-05, + "loss": 0.4877, + "step": 1160 + }, + { + "epoch": 50.869565217391305, + "grad_norm": 2.5254504680633545, + "learning_rate": 1.491304347826087e-05, + "loss": 0.4366, + "step": 1170 + }, + { + "epoch": 51.0, + "eval_accuracy": 0.7755905511811023, + "eval_loss": 0.497986376285553, + "eval_runtime": 3.9676, + "eval_samples_per_second": 64.019, + "eval_steps_per_second": 0.504, + "step": 1173 + }, + { + "epoch": 51.30434782608695, + "grad_norm": 2.032457113265991, + "learning_rate": 1.4869565217391306e-05, + "loss": 0.4663, + "step": 1180 + }, + { + "epoch": 51.73913043478261, + "grad_norm": 2.802882432937622, + "learning_rate": 1.4826086956521741e-05, + "loss": 0.4467, + "step": 1190 + }, + { + "epoch": 52.0, + "eval_accuracy": 0.7795275590551181, + "eval_loss": 0.49471431970596313, + "eval_runtime": 4.8494, + "eval_samples_per_second": 52.378, + "eval_steps_per_second": 0.412, + "step": 1196 + }, + { + "epoch": 52.17391304347826, + "grad_norm": 2.2359466552734375, + "learning_rate": 1.4782608695652174e-05, + "loss": 0.4424, + "step": 1200 + }, + { + "epoch": 52.608695652173914, + "grad_norm": 3.219308376312256, + "learning_rate": 1.473913043478261e-05, + "loss": 0.4797, + "step": 1210 + }, + { + "epoch": 53.0, + "eval_accuracy": 0.7755905511811023, + "eval_loss": 0.4950390160083771, + "eval_runtime": 3.8997, + "eval_samples_per_second": 65.133, + "eval_steps_per_second": 0.513, + "step": 1219 + }, + { + "epoch": 53.04347826086956, + "grad_norm": 2.6939969062805176, + "learning_rate": 1.4695652173913045e-05, + "loss": 0.4256, + "step": 1220 + }, + { + "epoch": 53.47826086956522, + "grad_norm": 2.6343085765838623, + "learning_rate": 1.465217391304348e-05, + "loss": 0.4192, + "step": 1230 + }, + { + "epoch": 53.91304347826087, + "grad_norm": 10.655885696411133, + "learning_rate": 1.4608695652173915e-05, + "loss": 0.4544, + "step": 1240 + }, + { + "epoch": 54.0, + "eval_accuracy": 0.7716535433070866, + "eval_loss": 0.49978330731391907, + "eval_runtime": 3.7833, + "eval_samples_per_second": 67.137, + "eval_steps_per_second": 0.529, + "step": 1242 + }, + { + "epoch": 54.34782608695652, + "grad_norm": 4.838284969329834, + "learning_rate": 1.456521739130435e-05, + "loss": 0.4361, + "step": 1250 + }, + { + "epoch": 54.78260869565217, + "grad_norm": 3.1171820163726807, + "learning_rate": 1.4521739130434785e-05, + "loss": 0.4466, + "step": 1260 + }, + { + "epoch": 55.0, + "eval_accuracy": 0.7795275590551181, + "eval_loss": 0.49803978204727173, + "eval_runtime": 3.7735, + "eval_samples_per_second": 67.311, + "eval_steps_per_second": 0.53, + "step": 1265 + }, + { + "epoch": 55.21739130434783, + "grad_norm": 2.5744450092315674, + "learning_rate": 1.447826086956522e-05, + "loss": 0.4511, + "step": 1270 + }, + { + "epoch": 55.65217391304348, + "grad_norm": 7.211576461791992, + "learning_rate": 1.4434782608695654e-05, + "loss": 0.4599, + "step": 1280 + }, + { + "epoch": 56.0, + "eval_accuracy": 0.7834645669291339, + "eval_loss": 0.4962967336177826, + "eval_runtime": 4.8613, + "eval_samples_per_second": 52.249, + "eval_steps_per_second": 0.411, + "step": 1288 + }, + { + "epoch": 56.08695652173913, + "grad_norm": 1.932460069656372, + "learning_rate": 1.4391304347826087e-05, + "loss": 0.4168, + "step": 1290 + }, + { + "epoch": 56.52173913043478, + "grad_norm": 5.841196537017822, + "learning_rate": 1.4347826086956522e-05, + "loss": 0.4622, + "step": 1300 + }, + { + "epoch": 56.95652173913044, + "grad_norm": 1.948188066482544, + "learning_rate": 1.4304347826086957e-05, + "loss": 0.4458, + "step": 1310 + }, + { + "epoch": 57.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.49557480216026306, + "eval_runtime": 3.6949, + "eval_samples_per_second": 68.744, + "eval_steps_per_second": 0.541, + "step": 1311 + }, + { + "epoch": 57.391304347826086, + "grad_norm": 4.304020881652832, + "learning_rate": 1.4260869565217392e-05, + "loss": 0.4378, + "step": 1320 + }, + { + "epoch": 57.82608695652174, + "grad_norm": 2.710130453109741, + "learning_rate": 1.4217391304347828e-05, + "loss": 0.4296, + "step": 1330 + }, + { + "epoch": 58.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.49939388036727905, + "eval_runtime": 3.7356, + "eval_samples_per_second": 67.995, + "eval_steps_per_second": 0.535, + "step": 1334 + }, + { + "epoch": 58.26086956521739, + "grad_norm": 3.730140209197998, + "learning_rate": 1.4173913043478263e-05, + "loss": 0.4664, + "step": 1340 + }, + { + "epoch": 58.69565217391305, + "grad_norm": 9.71405029296875, + "learning_rate": 1.4130434782608698e-05, + "loss": 0.4415, + "step": 1350 + }, + { + "epoch": 59.0, + "eval_accuracy": 0.7834645669291339, + "eval_loss": 0.4997561573982239, + "eval_runtime": 4.9859, + "eval_samples_per_second": 50.943, + "eval_steps_per_second": 0.401, + "step": 1357 + }, + { + "epoch": 59.130434782608695, + "grad_norm": 2.7752935886383057, + "learning_rate": 1.4086956521739133e-05, + "loss": 0.4436, + "step": 1360 + }, + { + "epoch": 59.56521739130435, + "grad_norm": 4.0491251945495605, + "learning_rate": 1.4043478260869568e-05, + "loss": 0.4442, + "step": 1370 + }, + { + "epoch": 60.0, + "grad_norm": 3.6015145778656006, + "learning_rate": 1.4e-05, + "loss": 0.4036, + "step": 1380 + }, + { + "epoch": 60.0, + "eval_accuracy": 0.7795275590551181, + "eval_loss": 0.4996122717857361, + "eval_runtime": 3.8039, + "eval_samples_per_second": 66.774, + "eval_steps_per_second": 0.526, + "step": 1380 + }, + { + "epoch": 60.43478260869565, + "grad_norm": 2.5297908782958984, + "learning_rate": 1.3956521739130435e-05, + "loss": 0.4364, + "step": 1390 + }, + { + "epoch": 60.869565217391305, + "grad_norm": 2.8682429790496826, + "learning_rate": 1.391304347826087e-05, + "loss": 0.4406, + "step": 1400 + }, + { + "epoch": 61.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.5022182464599609, + "eval_runtime": 3.7333, + "eval_samples_per_second": 68.036, + "eval_steps_per_second": 0.536, + "step": 1403 + }, + { + "epoch": 61.30434782608695, + "grad_norm": 3.3014872074127197, + "learning_rate": 1.3869565217391305e-05, + "loss": 0.4346, + "step": 1410 + }, + { + "epoch": 61.73913043478261, + "grad_norm": 3.4654860496520996, + "learning_rate": 1.382608695652174e-05, + "loss": 0.4235, + "step": 1420 + }, + { + "epoch": 62.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.5018435120582581, + "eval_runtime": 4.9797, + "eval_samples_per_second": 51.007, + "eval_steps_per_second": 0.402, + "step": 1426 + }, + { + "epoch": 62.17391304347826, + "grad_norm": 6.609365940093994, + "learning_rate": 1.3782608695652175e-05, + "loss": 0.4884, + "step": 1430 + }, + { + "epoch": 62.608695652173914, + "grad_norm": 3.425076484680176, + "learning_rate": 1.373913043478261e-05, + "loss": 0.4492, + "step": 1440 + }, + { + "epoch": 63.0, + "eval_accuracy": 0.8031496062992126, + "eval_loss": 0.4963783323764801, + "eval_runtime": 3.7677, + "eval_samples_per_second": 67.416, + "eval_steps_per_second": 0.531, + "step": 1449 + }, + { + "epoch": 63.04347826086956, + "grad_norm": 4.069096088409424, + "learning_rate": 1.3695652173913046e-05, + "loss": 0.4119, + "step": 1450 + }, + { + "epoch": 63.47826086956522, + "grad_norm": 2.3584377765655518, + "learning_rate": 1.3652173913043479e-05, + "loss": 0.4356, + "step": 1460 + }, + { + "epoch": 63.91304347826087, + "grad_norm": 12.776151657104492, + "learning_rate": 1.3608695652173913e-05, + "loss": 0.4065, + "step": 1470 + }, + { + "epoch": 64.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.4952879250049591, + "eval_runtime": 3.7751, + "eval_samples_per_second": 67.284, + "eval_steps_per_second": 0.53, + "step": 1472 + }, + { + "epoch": 64.34782608695652, + "grad_norm": 2.501909017562866, + "learning_rate": 1.3565217391304348e-05, + "loss": 0.4356, + "step": 1480 + }, + { + "epoch": 64.78260869565217, + "grad_norm": 2.650075674057007, + "learning_rate": 1.3521739130434783e-05, + "loss": 0.4474, + "step": 1490 + }, + { + "epoch": 65.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.4896911084651947, + "eval_runtime": 5.1494, + "eval_samples_per_second": 49.326, + "eval_steps_per_second": 0.388, + "step": 1495 + }, + { + "epoch": 65.21739130434783, + "grad_norm": 4.465973854064941, + "learning_rate": 1.3478260869565218e-05, + "loss": 0.4246, + "step": 1500 + }, + { + "epoch": 65.65217391304348, + "grad_norm": 2.057035207748413, + "learning_rate": 1.3434782608695653e-05, + "loss": 0.4605, + "step": 1510 + }, + { + "epoch": 66.0, + "eval_accuracy": 0.7795275590551181, + "eval_loss": 0.5039426684379578, + "eval_runtime": 3.7424, + "eval_samples_per_second": 67.871, + "eval_steps_per_second": 0.534, + "step": 1518 + }, + { + "epoch": 66.08695652173913, + "grad_norm": 3.064012050628662, + "learning_rate": 1.3391304347826088e-05, + "loss": 0.4157, + "step": 1520 + }, + { + "epoch": 66.52173913043478, + "grad_norm": 3.7584011554718018, + "learning_rate": 1.3347826086956523e-05, + "loss": 0.4553, + "step": 1530 + }, + { + "epoch": 66.95652173913044, + "grad_norm": 2.945054054260254, + "learning_rate": 1.3304347826086958e-05, + "loss": 0.436, + "step": 1540 + }, + { + "epoch": 67.0, + "eval_accuracy": 0.7755905511811023, + "eval_loss": 0.5024412274360657, + "eval_runtime": 3.7621, + "eval_samples_per_second": 67.516, + "eval_steps_per_second": 0.532, + "step": 1541 + }, + { + "epoch": 67.3913043478261, + "grad_norm": 3.1257166862487793, + "learning_rate": 1.3260869565217392e-05, + "loss": 0.4173, + "step": 1550 + }, + { + "epoch": 67.82608695652173, + "grad_norm": 5.225259304046631, + "learning_rate": 1.3217391304347827e-05, + "loss": 0.4746, + "step": 1560 + }, + { + "epoch": 68.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.5006521344184875, + "eval_runtime": 5.0155, + "eval_samples_per_second": 50.643, + "eval_steps_per_second": 0.399, + "step": 1564 + }, + { + "epoch": 68.26086956521739, + "grad_norm": 3.3438003063201904, + "learning_rate": 1.3173913043478262e-05, + "loss": 0.3957, + "step": 1570 + }, + { + "epoch": 68.69565217391305, + "grad_norm": 2.6640641689300537, + "learning_rate": 1.3130434782608697e-05, + "loss": 0.4555, + "step": 1580 + }, + { + "epoch": 69.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.5053796172142029, + "eval_runtime": 3.7207, + "eval_samples_per_second": 68.266, + "eval_steps_per_second": 0.538, + "step": 1587 + }, + { + "epoch": 69.1304347826087, + "grad_norm": 6.726771831512451, + "learning_rate": 1.308695652173913e-05, + "loss": 0.4322, + "step": 1590 + }, + { + "epoch": 69.56521739130434, + "grad_norm": 3.231029748916626, + "learning_rate": 1.3043478260869566e-05, + "loss": 0.3949, + "step": 1600 + }, + { + "epoch": 70.0, + "grad_norm": 6.560612201690674, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.433, + "step": 1610 + }, + { + "epoch": 70.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.49738696217536926, + "eval_runtime": 3.716, + "eval_samples_per_second": 68.353, + "eval_steps_per_second": 0.538, + "step": 1610 + }, + { + "epoch": 70.43478260869566, + "grad_norm": 2.6361474990844727, + "learning_rate": 1.2956521739130436e-05, + "loss": 0.4158, + "step": 1620 + }, + { + "epoch": 70.8695652173913, + "grad_norm": 2.7182960510253906, + "learning_rate": 1.2913043478260871e-05, + "loss": 0.4503, + "step": 1630 + }, + { + "epoch": 71.0, + "eval_accuracy": 0.7795275590551181, + "eval_loss": 0.509588897228241, + "eval_runtime": 5.0488, + "eval_samples_per_second": 50.309, + "eval_steps_per_second": 0.396, + "step": 1633 + }, + { + "epoch": 71.30434782608695, + "grad_norm": 1.938330888748169, + "learning_rate": 1.2869565217391305e-05, + "loss": 0.4285, + "step": 1640 + }, + { + "epoch": 71.73913043478261, + "grad_norm": 3.1797378063201904, + "learning_rate": 1.282608695652174e-05, + "loss": 0.4424, + "step": 1650 + }, + { + "epoch": 72.0, + "eval_accuracy": 0.7755905511811023, + "eval_loss": 0.5040333867073059, + "eval_runtime": 3.7598, + "eval_samples_per_second": 67.557, + "eval_steps_per_second": 0.532, + "step": 1656 + }, + { + "epoch": 72.17391304347827, + "grad_norm": 3.028841257095337, + "learning_rate": 1.2782608695652175e-05, + "loss": 0.4278, + "step": 1660 + }, + { + "epoch": 72.6086956521739, + "grad_norm": 3.0137178897857666, + "learning_rate": 1.273913043478261e-05, + "loss": 0.4331, + "step": 1670 + }, + { + "epoch": 73.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.505591869354248, + "eval_runtime": 3.7202, + "eval_samples_per_second": 68.277, + "eval_steps_per_second": 0.538, + "step": 1679 + }, + { + "epoch": 73.04347826086956, + "grad_norm": 3.519934892654419, + "learning_rate": 1.2695652173913045e-05, + "loss": 0.4523, + "step": 1680 + }, + { + "epoch": 73.47826086956522, + "grad_norm": 2.6839394569396973, + "learning_rate": 1.265217391304348e-05, + "loss": 0.4143, + "step": 1690 + }, + { + "epoch": 73.91304347826087, + "grad_norm": 4.223355770111084, + "learning_rate": 1.2608695652173915e-05, + "loss": 0.4263, + "step": 1700 + }, + { + "epoch": 74.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.5025500059127808, + "eval_runtime": 5.0354, + "eval_samples_per_second": 50.442, + "eval_steps_per_second": 0.397, + "step": 1702 + }, + { + "epoch": 74.34782608695652, + "grad_norm": 2.633610248565674, + "learning_rate": 1.2565217391304349e-05, + "loss": 0.4451, + "step": 1710 + }, + { + "epoch": 74.78260869565217, + "grad_norm": 4.227041721343994, + "learning_rate": 1.2521739130434784e-05, + "loss": 0.4305, + "step": 1720 + }, + { + "epoch": 75.0, + "eval_accuracy": 0.7834645669291339, + "eval_loss": 0.5032832026481628, + "eval_runtime": 3.7074, + "eval_samples_per_second": 68.512, + "eval_steps_per_second": 0.539, + "step": 1725 + }, + { + "epoch": 75.21739130434783, + "grad_norm": 3.885732412338257, + "learning_rate": 1.2478260869565217e-05, + "loss": 0.4177, + "step": 1730 + }, + { + "epoch": 75.65217391304348, + "grad_norm": 6.669870853424072, + "learning_rate": 1.2434782608695652e-05, + "loss": 0.4271, + "step": 1740 + }, + { + "epoch": 76.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.5014809966087341, + "eval_runtime": 3.6911, + "eval_samples_per_second": 68.814, + "eval_steps_per_second": 0.542, + "step": 1748 + }, + { + "epoch": 76.08695652173913, + "grad_norm": 1.820388913154602, + "learning_rate": 1.2391304347826088e-05, + "loss": 0.4457, + "step": 1750 + }, + { + "epoch": 76.52173913043478, + "grad_norm": 2.142805337905884, + "learning_rate": 1.2347826086956523e-05, + "loss": 0.3962, + "step": 1760 + }, + { + "epoch": 76.95652173913044, + "grad_norm": 3.5151073932647705, + "learning_rate": 1.2304347826086958e-05, + "loss": 0.4635, + "step": 1770 + }, + { + "epoch": 77.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.49884113669395447, + "eval_runtime": 4.3354, + "eval_samples_per_second": 58.587, + "eval_steps_per_second": 0.461, + "step": 1771 + }, + { + "epoch": 77.3913043478261, + "grad_norm": 3.867955207824707, + "learning_rate": 1.2260869565217393e-05, + "loss": 0.4616, + "step": 1780 + }, + { + "epoch": 77.82608695652173, + "grad_norm": 2.6050870418548584, + "learning_rate": 1.2217391304347828e-05, + "loss": 0.4212, + "step": 1790 + }, + { + "epoch": 78.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.4993511438369751, + "eval_runtime": 4.267, + "eval_samples_per_second": 59.527, + "eval_steps_per_second": 0.469, + "step": 1794 + }, + { + "epoch": 78.26086956521739, + "grad_norm": 2.1961538791656494, + "learning_rate": 1.2173913043478263e-05, + "loss": 0.4191, + "step": 1800 + }, + { + "epoch": 78.69565217391305, + "grad_norm": 6.02454948425293, + "learning_rate": 1.2130434782608698e-05, + "loss": 0.4154, + "step": 1810 + }, + { + "epoch": 79.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.5044043660163879, + "eval_runtime": 3.7036, + "eval_samples_per_second": 68.581, + "eval_steps_per_second": 0.54, + "step": 1817 + }, + { + "epoch": 79.1304347826087, + "grad_norm": 2.1048858165740967, + "learning_rate": 1.208695652173913e-05, + "loss": 0.4196, + "step": 1820 + }, + { + "epoch": 79.56521739130434, + "grad_norm": 2.8622193336486816, + "learning_rate": 1.2043478260869565e-05, + "loss": 0.4314, + "step": 1830 + }, + { + "epoch": 80.0, + "grad_norm": 6.1558427810668945, + "learning_rate": 1.2e-05, + "loss": 0.4288, + "step": 1840 + }, + { + "epoch": 80.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.5033003687858582, + "eval_runtime": 3.7575, + "eval_samples_per_second": 67.598, + "eval_steps_per_second": 0.532, + "step": 1840 + }, + { + "epoch": 80.43478260869566, + "grad_norm": 3.3254945278167725, + "learning_rate": 1.1956521739130435e-05, + "loss": 0.4297, + "step": 1850 + }, + { + "epoch": 80.8695652173913, + "grad_norm": 2.2818620204925537, + "learning_rate": 1.191304347826087e-05, + "loss": 0.4211, + "step": 1860 + }, + { + "epoch": 81.0, + "eval_accuracy": 0.7834645669291339, + "eval_loss": 0.5050157904624939, + "eval_runtime": 5.0113, + "eval_samples_per_second": 50.685, + "eval_steps_per_second": 0.399, + "step": 1863 + }, + { + "epoch": 81.30434782608695, + "grad_norm": 4.174459934234619, + "learning_rate": 1.1869565217391306e-05, + "loss": 0.4229, + "step": 1870 + }, + { + "epoch": 81.73913043478261, + "grad_norm": 2.87514066696167, + "learning_rate": 1.182608695652174e-05, + "loss": 0.4022, + "step": 1880 + }, + { + "epoch": 82.0, + "eval_accuracy": 0.7834645669291339, + "eval_loss": 0.5021248459815979, + "eval_runtime": 3.7629, + "eval_samples_per_second": 67.5, + "eval_steps_per_second": 0.531, + "step": 1886 + }, + { + "epoch": 82.17391304347827, + "grad_norm": 5.307149410247803, + "learning_rate": 1.1782608695652176e-05, + "loss": 0.4564, + "step": 1890 + }, + { + "epoch": 82.6086956521739, + "grad_norm": 4.411511421203613, + "learning_rate": 1.1739130434782611e-05, + "loss": 0.4477, + "step": 1900 + }, + { + "epoch": 83.0, + "eval_accuracy": 0.7755905511811023, + "eval_loss": 0.509568452835083, + "eval_runtime": 5.3605, + "eval_samples_per_second": 47.384, + "eval_steps_per_second": 0.373, + "step": 1909 + }, + { + "epoch": 83.04347826086956, + "grad_norm": 2.478482246398926, + "learning_rate": 1.1695652173913043e-05, + "loss": 0.4118, + "step": 1910 + }, + { + "epoch": 83.47826086956522, + "grad_norm": 2.000185012817383, + "learning_rate": 1.1652173913043478e-05, + "loss": 0.4486, + "step": 1920 + }, + { + "epoch": 83.91304347826087, + "grad_norm": 4.231175422668457, + "learning_rate": 1.1608695652173913e-05, + "loss": 0.4091, + "step": 1930 + }, + { + "epoch": 84.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.501672625541687, + "eval_runtime": 4.6714, + "eval_samples_per_second": 54.374, + "eval_steps_per_second": 0.428, + "step": 1932 + }, + { + "epoch": 84.34782608695652, + "grad_norm": 8.062799453735352, + "learning_rate": 1.1565217391304348e-05, + "loss": 0.4108, + "step": 1940 + }, + { + "epoch": 84.78260869565217, + "grad_norm": 3.525912046432495, + "learning_rate": 1.1521739130434783e-05, + "loss": 0.4284, + "step": 1950 + }, + { + "epoch": 85.0, + "eval_accuracy": 0.7795275590551181, + "eval_loss": 0.5094006061553955, + "eval_runtime": 3.7057, + "eval_samples_per_second": 68.544, + "eval_steps_per_second": 0.54, + "step": 1955 + }, + { + "epoch": 85.21739130434783, + "grad_norm": 2.8294172286987305, + "learning_rate": 1.1478260869565218e-05, + "loss": 0.4341, + "step": 1960 + }, + { + "epoch": 85.65217391304348, + "grad_norm": 2.6164603233337402, + "learning_rate": 1.1434782608695654e-05, + "loss": 0.4317, + "step": 1970 + }, + { + "epoch": 86.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.5055702328681946, + "eval_runtime": 3.78, + "eval_samples_per_second": 67.195, + "eval_steps_per_second": 0.529, + "step": 1978 + }, + { + "epoch": 86.08695652173913, + "grad_norm": 5.29531717300415, + "learning_rate": 1.1391304347826089e-05, + "loss": 0.3842, + "step": 1980 + }, + { + "epoch": 86.52173913043478, + "grad_norm": 3.8016159534454346, + "learning_rate": 1.1347826086956524e-05, + "loss": 0.4294, + "step": 1990 + }, + { + "epoch": 86.95652173913044, + "grad_norm": 2.229055643081665, + "learning_rate": 1.1304347826086957e-05, + "loss": 0.4011, + "step": 2000 + }, + { + "epoch": 87.0, + "eval_accuracy": 0.7952755905511811, + "eval_loss": 0.4991566836833954, + "eval_runtime": 4.9616, + "eval_samples_per_second": 51.194, + "eval_steps_per_second": 0.403, + "step": 2001 + }, + { + "epoch": 87.3913043478261, + "grad_norm": 4.449975490570068, + "learning_rate": 1.1260869565217392e-05, + "loss": 0.4413, + "step": 2010 + }, + { + "epoch": 87.82608695652173, + "grad_norm": 3.4843342304229736, + "learning_rate": 1.1217391304347827e-05, + "loss": 0.4043, + "step": 2020 + }, + { + "epoch": 88.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.5106358528137207, + "eval_runtime": 3.759, + "eval_samples_per_second": 67.571, + "eval_steps_per_second": 0.532, + "step": 2024 + }, + { + "epoch": 88.26086956521739, + "grad_norm": 3.2311477661132812, + "learning_rate": 1.1173913043478261e-05, + "loss": 0.4127, + "step": 2030 + }, + { + "epoch": 88.69565217391305, + "grad_norm": 3.511033058166504, + "learning_rate": 1.1130434782608696e-05, + "loss": 0.4233, + "step": 2040 + }, + { + "epoch": 89.0, + "eval_accuracy": 0.7834645669291339, + "eval_loss": 0.5082967877388, + "eval_runtime": 3.7279, + "eval_samples_per_second": 68.135, + "eval_steps_per_second": 0.536, + "step": 2047 + }, + { + "epoch": 89.1304347826087, + "grad_norm": 3.1737847328186035, + "learning_rate": 1.1086956521739131e-05, + "loss": 0.4449, + "step": 2050 + }, + { + "epoch": 89.56521739130434, + "grad_norm": 3.3332552909851074, + "learning_rate": 1.1043478260869566e-05, + "loss": 0.4148, + "step": 2060 + }, + { + "epoch": 90.0, + "grad_norm": 5.011209487915039, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.4383, + "step": 2070 + }, + { + "epoch": 90.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.5015798211097717, + "eval_runtime": 5.0126, + "eval_samples_per_second": 50.672, + "eval_steps_per_second": 0.399, + "step": 2070 + }, + { + "epoch": 90.43478260869566, + "grad_norm": 2.4368808269500732, + "learning_rate": 1.0956521739130435e-05, + "loss": 0.4133, + "step": 2080 + }, + { + "epoch": 90.8695652173913, + "grad_norm": 5.885110378265381, + "learning_rate": 1.091304347826087e-05, + "loss": 0.4328, + "step": 2090 + }, + { + "epoch": 91.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.5062097311019897, + "eval_runtime": 3.7299, + "eval_samples_per_second": 68.098, + "eval_steps_per_second": 0.536, + "step": 2093 + }, + { + "epoch": 91.30434782608695, + "grad_norm": 2.2072901725769043, + "learning_rate": 1.0869565217391305e-05, + "loss": 0.4137, + "step": 2100 + }, + { + "epoch": 91.73913043478261, + "grad_norm": 1.9848076105117798, + "learning_rate": 1.082608695652174e-05, + "loss": 0.3978, + "step": 2110 + }, + { + "epoch": 92.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.5026075839996338, + "eval_runtime": 3.7759, + "eval_samples_per_second": 67.268, + "eval_steps_per_second": 0.53, + "step": 2116 + }, + { + "epoch": 92.17391304347827, + "grad_norm": 3.738398313522339, + "learning_rate": 1.0782608695652175e-05, + "loss": 0.4459, + "step": 2120 + }, + { + "epoch": 92.6086956521739, + "grad_norm": 3.0096168518066406, + "learning_rate": 1.073913043478261e-05, + "loss": 0.4052, + "step": 2130 + }, + { + "epoch": 93.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.49642127752304077, + "eval_runtime": 5.0236, + "eval_samples_per_second": 50.562, + "eval_steps_per_second": 0.398, + "step": 2139 + }, + { + "epoch": 93.04347826086956, + "grad_norm": 3.7452170848846436, + "learning_rate": 1.0695652173913046e-05, + "loss": 0.4205, + "step": 2140 + }, + { + "epoch": 93.47826086956522, + "grad_norm": 3.8985049724578857, + "learning_rate": 1.0652173913043479e-05, + "loss": 0.4171, + "step": 2150 + }, + { + "epoch": 93.91304347826087, + "grad_norm": 2.283020496368408, + "learning_rate": 1.0608695652173914e-05, + "loss": 0.3938, + "step": 2160 + }, + { + "epoch": 94.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.5036487579345703, + "eval_runtime": 3.7298, + "eval_samples_per_second": 68.101, + "eval_steps_per_second": 0.536, + "step": 2162 + }, + { + "epoch": 94.34782608695652, + "grad_norm": 7.054046630859375, + "learning_rate": 1.0565217391304348e-05, + "loss": 0.4336, + "step": 2170 + }, + { + "epoch": 94.78260869565217, + "grad_norm": 3.131002902984619, + "learning_rate": 1.0521739130434783e-05, + "loss": 0.393, + "step": 2180 + }, + { + "epoch": 95.0, + "eval_accuracy": 0.7834645669291339, + "eval_loss": 0.5102458596229553, + "eval_runtime": 3.6839, + "eval_samples_per_second": 68.949, + "eval_steps_per_second": 0.543, + "step": 2185 + }, + { + "epoch": 95.21739130434783, + "grad_norm": 2.4622268676757812, + "learning_rate": 1.0478260869565218e-05, + "loss": 0.3997, + "step": 2190 + }, + { + "epoch": 95.65217391304348, + "grad_norm": 3.815375566482544, + "learning_rate": 1.0434782608695653e-05, + "loss": 0.4294, + "step": 2200 + }, + { + "epoch": 96.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.5002910494804382, + "eval_runtime": 4.8997, + "eval_samples_per_second": 51.84, + "eval_steps_per_second": 0.408, + "step": 2208 + }, + { + "epoch": 96.08695652173913, + "grad_norm": 8.787290573120117, + "learning_rate": 1.0391304347826088e-05, + "loss": 0.4155, + "step": 2210 + }, + { + "epoch": 96.52173913043478, + "grad_norm": 2.8499906063079834, + "learning_rate": 1.0347826086956523e-05, + "loss": 0.4095, + "step": 2220 + }, + { + "epoch": 96.95652173913044, + "grad_norm": 6.26355504989624, + "learning_rate": 1.0304347826086958e-05, + "loss": 0.4122, + "step": 2230 + }, + { + "epoch": 97.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.5013226270675659, + "eval_runtime": 3.7744, + "eval_samples_per_second": 67.296, + "eval_steps_per_second": 0.53, + "step": 2231 + }, + { + "epoch": 97.3913043478261, + "grad_norm": 3.257772445678711, + "learning_rate": 1.0260869565217393e-05, + "loss": 0.3522, + "step": 2240 + }, + { + "epoch": 97.82608695652173, + "grad_norm": 2.788611888885498, + "learning_rate": 1.0217391304347829e-05, + "loss": 0.4207, + "step": 2250 + }, + { + "epoch": 98.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.507587730884552, + "eval_runtime": 3.8534, + "eval_samples_per_second": 65.915, + "eval_steps_per_second": 0.519, + "step": 2254 + }, + { + "epoch": 98.26086956521739, + "grad_norm": 2.974043846130371, + "learning_rate": 1.017391304347826e-05, + "loss": 0.4352, + "step": 2260 + }, + { + "epoch": 98.69565217391305, + "grad_norm": 3.231869697570801, + "learning_rate": 1.0130434782608695e-05, + "loss": 0.4127, + "step": 2270 + }, + { + "epoch": 99.0, + "eval_accuracy": 0.7834645669291339, + "eval_loss": 0.503979504108429, + "eval_runtime": 5.0998, + "eval_samples_per_second": 49.806, + "eval_steps_per_second": 0.392, + "step": 2277 + }, + { + "epoch": 99.1304347826087, + "grad_norm": 2.597999095916748, + "learning_rate": 1.008695652173913e-05, + "loss": 0.3888, + "step": 2280 + }, + { + "epoch": 99.56521739130434, + "grad_norm": 4.4219889640808105, + "learning_rate": 1.0043478260869566e-05, + "loss": 0.3921, + "step": 2290 + }, + { + "epoch": 100.0, + "grad_norm": 4.641758441925049, + "learning_rate": 1e-05, + "loss": 0.441, + "step": 2300 + }, + { + "epoch": 100.0, + "eval_accuracy": 0.7834645669291339, + "eval_loss": 0.502194881439209, + "eval_runtime": 3.7077, + "eval_samples_per_second": 68.506, + "eval_steps_per_second": 0.539, + "step": 2300 + }, + { + "epoch": 100.43478260869566, + "grad_norm": 2.948529005050659, + "learning_rate": 9.956521739130436e-06, + "loss": 0.4324, + "step": 2310 + }, + { + "epoch": 100.8695652173913, + "grad_norm": 2.4855594635009766, + "learning_rate": 9.913043478260871e-06, + "loss": 0.3938, + "step": 2320 + }, + { + "epoch": 101.0, + "eval_accuracy": 0.7992125984251969, + "eval_loss": 0.4974897503852844, + "eval_runtime": 3.7364, + "eval_samples_per_second": 67.98, + "eval_steps_per_second": 0.535, + "step": 2323 + }, + { + "epoch": 101.30434782608695, + "grad_norm": 4.753269195556641, + "learning_rate": 9.869565217391304e-06, + "loss": 0.3918, + "step": 2330 + }, + { + "epoch": 101.73913043478261, + "grad_norm": 5.000470161437988, + "learning_rate": 9.82608695652174e-06, + "loss": 0.4109, + "step": 2340 + }, + { + "epoch": 102.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.5018798112869263, + "eval_runtime": 4.8425, + "eval_samples_per_second": 52.452, + "eval_steps_per_second": 0.413, + "step": 2346 + }, + { + "epoch": 102.17391304347827, + "grad_norm": 2.8584697246551514, + "learning_rate": 9.782608695652175e-06, + "loss": 0.4199, + "step": 2350 + }, + { + "epoch": 102.6086956521739, + "grad_norm": 2.773083448410034, + "learning_rate": 9.73913043478261e-06, + "loss": 0.4299, + "step": 2360 + }, + { + "epoch": 103.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.5060404539108276, + "eval_runtime": 3.7179, + "eval_samples_per_second": 68.318, + "eval_steps_per_second": 0.538, + "step": 2369 + }, + { + "epoch": 103.04347826086956, + "grad_norm": 1.847158670425415, + "learning_rate": 9.695652173913043e-06, + "loss": 0.3834, + "step": 2370 + }, + { + "epoch": 103.47826086956522, + "grad_norm": 4.114128112792969, + "learning_rate": 9.652173913043478e-06, + "loss": 0.4061, + "step": 2380 + }, + { + "epoch": 103.91304347826087, + "grad_norm": 5.080406665802002, + "learning_rate": 9.608695652173914e-06, + "loss": 0.4148, + "step": 2390 + }, + { + "epoch": 104.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.5038026571273804, + "eval_runtime": 3.7535, + "eval_samples_per_second": 67.671, + "eval_steps_per_second": 0.533, + "step": 2392 + }, + { + "epoch": 104.34782608695652, + "grad_norm": 3.291896104812622, + "learning_rate": 9.565217391304349e-06, + "loss": 0.4272, + "step": 2400 + }, + { + "epoch": 104.78260869565217, + "grad_norm": 2.7959041595458984, + "learning_rate": 9.521739130434784e-06, + "loss": 0.4179, + "step": 2410 + }, + { + "epoch": 105.0, + "eval_accuracy": 0.7834645669291339, + "eval_loss": 0.5064316391944885, + "eval_runtime": 4.8627, + "eval_samples_per_second": 52.235, + "eval_steps_per_second": 0.411, + "step": 2415 + }, + { + "epoch": 105.21739130434783, + "grad_norm": 5.880518913269043, + "learning_rate": 9.478260869565217e-06, + "loss": 0.4155, + "step": 2420 + }, + { + "epoch": 105.65217391304348, + "grad_norm": 2.2435200214385986, + "learning_rate": 9.434782608695652e-06, + "loss": 0.4352, + "step": 2430 + }, + { + "epoch": 106.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.5059410929679871, + "eval_runtime": 3.7149, + "eval_samples_per_second": 68.373, + "eval_steps_per_second": 0.538, + "step": 2438 + }, + { + "epoch": 106.08695652173913, + "grad_norm": 3.1865811347961426, + "learning_rate": 9.391304347826087e-06, + "loss": 0.3997, + "step": 2440 + }, + { + "epoch": 106.52173913043478, + "grad_norm": 4.0479936599731445, + "learning_rate": 9.347826086956523e-06, + "loss": 0.401, + "step": 2450 + }, + { + "epoch": 106.95652173913044, + "grad_norm": 2.87663197517395, + "learning_rate": 9.304347826086956e-06, + "loss": 0.4027, + "step": 2460 + }, + { + "epoch": 107.0, + "eval_accuracy": 0.7952755905511811, + "eval_loss": 0.5025486350059509, + "eval_runtime": 3.7614, + "eval_samples_per_second": 67.528, + "eval_steps_per_second": 0.532, + "step": 2461 + }, + { + "epoch": 107.3913043478261, + "grad_norm": 2.630986452102661, + "learning_rate": 9.260869565217391e-06, + "loss": 0.3828, + "step": 2470 + }, + { + "epoch": 107.82608695652173, + "grad_norm": 2.9700822830200195, + "learning_rate": 9.217391304347826e-06, + "loss": 0.4002, + "step": 2480 + }, + { + "epoch": 108.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.5020495653152466, + "eval_runtime": 4.6331, + "eval_samples_per_second": 54.823, + "eval_steps_per_second": 0.432, + "step": 2484 + }, + { + "epoch": 108.26086956521739, + "grad_norm": 4.361221790313721, + "learning_rate": 9.173913043478261e-06, + "loss": 0.405, + "step": 2490 + }, + { + "epoch": 108.69565217391305, + "grad_norm": 2.9328296184539795, + "learning_rate": 9.130434782608697e-06, + "loss": 0.3988, + "step": 2500 + }, + { + "epoch": 109.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.5063354969024658, + "eval_runtime": 3.8012, + "eval_samples_per_second": 66.821, + "eval_steps_per_second": 0.526, + "step": 2507 + }, + { + "epoch": 109.1304347826087, + "grad_norm": 2.3236513137817383, + "learning_rate": 9.086956521739132e-06, + "loss": 0.3894, + "step": 2510 + }, + { + "epoch": 109.56521739130434, + "grad_norm": 3.4379804134368896, + "learning_rate": 9.043478260869565e-06, + "loss": 0.4023, + "step": 2520 + }, + { + "epoch": 110.0, + "grad_norm": 4.300137042999268, + "learning_rate": 9e-06, + "loss": 0.4095, + "step": 2530 + }, + { + "epoch": 110.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.5034452676773071, + "eval_runtime": 3.7021, + "eval_samples_per_second": 68.61, + "eval_steps_per_second": 0.54, + "step": 2530 + }, + { + "epoch": 110.43478260869566, + "grad_norm": 2.190524101257324, + "learning_rate": 8.956521739130435e-06, + "loss": 0.4072, + "step": 2540 + }, + { + "epoch": 110.8695652173913, + "grad_norm": 2.2291879653930664, + "learning_rate": 8.91304347826087e-06, + "loss": 0.4001, + "step": 2550 + }, + { + "epoch": 111.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.505436360836029, + "eval_runtime": 4.2919, + "eval_samples_per_second": 59.182, + "eval_steps_per_second": 0.466, + "step": 2553 + }, + { + "epoch": 111.30434782608695, + "grad_norm": 3.1182541847229004, + "learning_rate": 8.869565217391306e-06, + "loss": 0.3904, + "step": 2560 + }, + { + "epoch": 111.73913043478261, + "grad_norm": 3.8375625610351562, + "learning_rate": 8.82608695652174e-06, + "loss": 0.4201, + "step": 2570 + }, + { + "epoch": 112.0, + "eval_accuracy": 0.7992125984251969, + "eval_loss": 0.5076125860214233, + "eval_runtime": 4.2691, + "eval_samples_per_second": 59.497, + "eval_steps_per_second": 0.468, + "step": 2576 + }, + { + "epoch": 112.17391304347827, + "grad_norm": 2.4231808185577393, + "learning_rate": 8.782608695652174e-06, + "loss": 0.3925, + "step": 2580 + }, + { + "epoch": 112.6086956521739, + "grad_norm": 4.854309558868408, + "learning_rate": 8.73913043478261e-06, + "loss": 0.4134, + "step": 2590 + }, + { + "epoch": 113.0, + "eval_accuracy": 0.7952755905511811, + "eval_loss": 0.5069688558578491, + "eval_runtime": 3.7367, + "eval_samples_per_second": 67.974, + "eval_steps_per_second": 0.535, + "step": 2599 + }, + { + "epoch": 113.04347826086956, + "grad_norm": 4.327704429626465, + "learning_rate": 8.695652173913044e-06, + "loss": 0.3959, + "step": 2600 + }, + { + "epoch": 113.47826086956522, + "grad_norm": 2.8718910217285156, + "learning_rate": 8.65217391304348e-06, + "loss": 0.3806, + "step": 2610 + }, + { + "epoch": 113.91304347826087, + "grad_norm": 5.400497913360596, + "learning_rate": 8.608695652173915e-06, + "loss": 0.3614, + "step": 2620 + }, + { + "epoch": 114.0, + "eval_accuracy": 0.7834645669291339, + "eval_loss": 0.5032684206962585, + "eval_runtime": 4.4717, + "eval_samples_per_second": 56.802, + "eval_steps_per_second": 0.447, + "step": 2622 + }, + { + "epoch": 114.34782608695652, + "grad_norm": 2.7276597023010254, + "learning_rate": 8.56521739130435e-06, + "loss": 0.3956, + "step": 2630 + }, + { + "epoch": 114.78260869565217, + "grad_norm": 3.339860200881958, + "learning_rate": 8.521739130434783e-06, + "loss": 0.3928, + "step": 2640 + }, + { + "epoch": 115.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.5042973160743713, + "eval_runtime": 4.1216, + "eval_samples_per_second": 61.627, + "eval_steps_per_second": 0.485, + "step": 2645 + }, + { + "epoch": 115.21739130434783, + "grad_norm": 2.435579538345337, + "learning_rate": 8.478260869565218e-06, + "loss": 0.4149, + "step": 2650 + }, + { + "epoch": 115.65217391304348, + "grad_norm": 3.9001612663269043, + "learning_rate": 8.434782608695653e-06, + "loss": 0.435, + "step": 2660 + }, + { + "epoch": 116.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.4998602271080017, + "eval_runtime": 3.7703, + "eval_samples_per_second": 67.369, + "eval_steps_per_second": 0.53, + "step": 2668 + }, + { + "epoch": 116.08695652173913, + "grad_norm": 4.031954288482666, + "learning_rate": 8.391304347826089e-06, + "loss": 0.3575, + "step": 2670 + }, + { + "epoch": 116.52173913043478, + "grad_norm": 3.1172120571136475, + "learning_rate": 8.347826086956522e-06, + "loss": 0.4062, + "step": 2680 + }, + { + "epoch": 116.95652173913044, + "grad_norm": 2.6061761379241943, + "learning_rate": 8.304347826086957e-06, + "loss": 0.4162, + "step": 2690 + }, + { + "epoch": 117.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.5132189393043518, + "eval_runtime": 4.4279, + "eval_samples_per_second": 57.363, + "eval_steps_per_second": 0.452, + "step": 2691 + }, + { + "epoch": 117.3913043478261, + "grad_norm": 1.7457960844039917, + "learning_rate": 8.260869565217392e-06, + "loss": 0.3887, + "step": 2700 + }, + { + "epoch": 117.82608695652173, + "grad_norm": 5.013397216796875, + "learning_rate": 8.217391304347827e-06, + "loss": 0.4078, + "step": 2710 + }, + { + "epoch": 118.0, + "eval_accuracy": 0.7795275590551181, + "eval_loss": 0.5088200569152832, + "eval_runtime": 4.1897, + "eval_samples_per_second": 60.625, + "eval_steps_per_second": 0.477, + "step": 2714 + }, + { + "epoch": 118.26086956521739, + "grad_norm": 3.4758872985839844, + "learning_rate": 8.173913043478263e-06, + "loss": 0.4251, + "step": 2720 + }, + { + "epoch": 118.69565217391305, + "grad_norm": 1.8225319385528564, + "learning_rate": 8.130434782608696e-06, + "loss": 0.4025, + "step": 2730 + }, + { + "epoch": 119.0, + "eval_accuracy": 0.7834645669291339, + "eval_loss": 0.507527768611908, + "eval_runtime": 3.721, + "eval_samples_per_second": 68.261, + "eval_steps_per_second": 0.537, + "step": 2737 + }, + { + "epoch": 119.1304347826087, + "grad_norm": 4.636626720428467, + "learning_rate": 8.086956521739131e-06, + "loss": 0.4024, + "step": 2740 + }, + { + "epoch": 119.56521739130434, + "grad_norm": 2.249758720397949, + "learning_rate": 8.043478260869566e-06, + "loss": 0.3917, + "step": 2750 + }, + { + "epoch": 120.0, + "grad_norm": 6.408204555511475, + "learning_rate": 8.000000000000001e-06, + "loss": 0.4096, + "step": 2760 + }, + { + "epoch": 120.0, + "eval_accuracy": 0.7834645669291339, + "eval_loss": 0.502310574054718, + "eval_runtime": 4.469, + "eval_samples_per_second": 56.836, + "eval_steps_per_second": 0.448, + "step": 2760 + }, + { + "epoch": 120.43478260869566, + "grad_norm": 2.495302200317383, + "learning_rate": 7.956521739130435e-06, + "loss": 0.3791, + "step": 2770 + }, + { + "epoch": 120.8695652173913, + "grad_norm": 2.840449571609497, + "learning_rate": 7.91304347826087e-06, + "loss": 0.3879, + "step": 2780 + }, + { + "epoch": 121.0, + "eval_accuracy": 0.7834645669291339, + "eval_loss": 0.5062641501426697, + "eval_runtime": 3.9315, + "eval_samples_per_second": 64.606, + "eval_steps_per_second": 0.509, + "step": 2783 + }, + { + "epoch": 121.30434782608695, + "grad_norm": 4.82555627822876, + "learning_rate": 7.869565217391305e-06, + "loss": 0.4232, + "step": 2790 + }, + { + "epoch": 121.73913043478261, + "grad_norm": 3.220736503601074, + "learning_rate": 7.82608695652174e-06, + "loss": 0.4033, + "step": 2800 + }, + { + "epoch": 122.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.5001329183578491, + "eval_runtime": 3.7903, + "eval_samples_per_second": 67.012, + "eval_steps_per_second": 0.528, + "step": 2806 + }, + { + "epoch": 122.17391304347827, + "grad_norm": 4.516547203063965, + "learning_rate": 7.782608695652174e-06, + "loss": 0.4144, + "step": 2810 + }, + { + "epoch": 122.6086956521739, + "grad_norm": 2.559272289276123, + "learning_rate": 7.739130434782609e-06, + "loss": 0.3927, + "step": 2820 + }, + { + "epoch": 123.0, + "eval_accuracy": 0.7795275590551181, + "eval_loss": 0.5087068676948547, + "eval_runtime": 4.404, + "eval_samples_per_second": 57.675, + "eval_steps_per_second": 0.454, + "step": 2829 + }, + { + "epoch": 123.04347826086956, + "grad_norm": 3.344332695007324, + "learning_rate": 7.695652173913044e-06, + "loss": 0.4016, + "step": 2830 + }, + { + "epoch": 123.47826086956522, + "grad_norm": 2.610856533050537, + "learning_rate": 7.652173913043479e-06, + "loss": 0.3925, + "step": 2840 + }, + { + "epoch": 123.91304347826087, + "grad_norm": 3.501596689224243, + "learning_rate": 7.608695652173914e-06, + "loss": 0.3803, + "step": 2850 + }, + { + "epoch": 124.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.5149940848350525, + "eval_runtime": 4.2503, + "eval_samples_per_second": 59.76, + "eval_steps_per_second": 0.471, + "step": 2852 + }, + { + "epoch": 124.34782608695652, + "grad_norm": 4.040353298187256, + "learning_rate": 7.565217391304348e-06, + "loss": 0.4101, + "step": 2860 + }, + { + "epoch": 124.78260869565217, + "grad_norm": 3.1806752681732178, + "learning_rate": 7.5217391304347835e-06, + "loss": 0.4248, + "step": 2870 + }, + { + "epoch": 125.0, + "eval_accuracy": 0.7834645669291339, + "eval_loss": 0.515027642250061, + "eval_runtime": 3.7006, + "eval_samples_per_second": 68.638, + "eval_steps_per_second": 0.54, + "step": 2875 + }, + { + "epoch": 125.21739130434783, + "grad_norm": 2.976123332977295, + "learning_rate": 7.478260869565218e-06, + "loss": 0.3806, + "step": 2880 + }, + { + "epoch": 125.65217391304348, + "grad_norm": 4.0399250984191895, + "learning_rate": 7.434782608695653e-06, + "loss": 0.3874, + "step": 2890 + }, + { + "epoch": 126.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.5157892107963562, + "eval_runtime": 3.8292, + "eval_samples_per_second": 66.332, + "eval_steps_per_second": 0.522, + "step": 2898 + }, + { + "epoch": 126.08695652173913, + "grad_norm": 2.8186984062194824, + "learning_rate": 7.391304347826087e-06, + "loss": 0.4068, + "step": 2900 + }, + { + "epoch": 126.52173913043478, + "grad_norm": 1.7811031341552734, + "learning_rate": 7.347826086956522e-06, + "loss": 0.4188, + "step": 2910 + }, + { + "epoch": 126.95652173913044, + "grad_norm": 2.591479539871216, + "learning_rate": 7.304347826086957e-06, + "loss": 0.3646, + "step": 2920 + }, + { + "epoch": 127.0, + "eval_accuracy": 0.8031496062992126, + "eval_loss": 0.4979710578918457, + "eval_runtime": 4.9476, + "eval_samples_per_second": 51.338, + "eval_steps_per_second": 0.404, + "step": 2921 + }, + { + "epoch": 127.3913043478261, + "grad_norm": 3.097064733505249, + "learning_rate": 7.2608695652173925e-06, + "loss": 0.3809, + "step": 2930 + }, + { + "epoch": 127.82608695652173, + "grad_norm": 4.5358805656433105, + "learning_rate": 7.217391304347827e-06, + "loss": 0.4115, + "step": 2940 + }, + { + "epoch": 128.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.507692813873291, + "eval_runtime": 3.7086, + "eval_samples_per_second": 68.489, + "eval_steps_per_second": 0.539, + "step": 2944 + }, + { + "epoch": 128.2608695652174, + "grad_norm": 4.192093372344971, + "learning_rate": 7.173913043478261e-06, + "loss": 0.3931, + "step": 2950 + }, + { + "epoch": 128.69565217391303, + "grad_norm": 2.4763779640197754, + "learning_rate": 7.130434782608696e-06, + "loss": 0.385, + "step": 2960 + }, + { + "epoch": 129.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.5153175592422485, + "eval_runtime": 3.726, + "eval_samples_per_second": 68.17, + "eval_steps_per_second": 0.537, + "step": 2967 + }, + { + "epoch": 129.1304347826087, + "grad_norm": 2.906510353088379, + "learning_rate": 7.086956521739131e-06, + "loss": 0.4009, + "step": 2970 + }, + { + "epoch": 129.56521739130434, + "grad_norm": 5.497567653656006, + "learning_rate": 7.0434782608695665e-06, + "loss": 0.4091, + "step": 2980 + }, + { + "epoch": 130.0, + "grad_norm": 4.277368068695068, + "learning_rate": 7e-06, + "loss": 0.4064, + "step": 2990 + }, + { + "epoch": 130.0, + "eval_accuracy": 0.7952755905511811, + "eval_loss": 0.511443018913269, + "eval_runtime": 5.0723, + "eval_samples_per_second": 50.076, + "eval_steps_per_second": 0.394, + "step": 2990 + }, + { + "epoch": 130.43478260869566, + "grad_norm": 2.3368613719940186, + "learning_rate": 6.956521739130435e-06, + "loss": 0.3762, + "step": 3000 + }, + { + "epoch": 130.8695652173913, + "grad_norm": 2.983280897140503, + "learning_rate": 6.91304347826087e-06, + "loss": 0.4168, + "step": 3010 + }, + { + "epoch": 131.0, + "eval_accuracy": 0.7992125984251969, + "eval_loss": 0.5056832432746887, + "eval_runtime": 3.7472, + "eval_samples_per_second": 67.785, + "eval_steps_per_second": 0.534, + "step": 3013 + }, + { + "epoch": 131.30434782608697, + "grad_norm": 5.6472978591918945, + "learning_rate": 6.869565217391305e-06, + "loss": 0.3454, + "step": 3020 + }, + { + "epoch": 131.7391304347826, + "grad_norm": 2.710934638977051, + "learning_rate": 6.8260869565217395e-06, + "loss": 0.4319, + "step": 3030 + }, + { + "epoch": 132.0, + "eval_accuracy": 0.7952755905511811, + "eval_loss": 0.5041180849075317, + "eval_runtime": 3.7165, + "eval_samples_per_second": 68.344, + "eval_steps_per_second": 0.538, + "step": 3036 + }, + { + "epoch": 132.17391304347825, + "grad_norm": 2.8998305797576904, + "learning_rate": 6.782608695652174e-06, + "loss": 0.3769, + "step": 3040 + }, + { + "epoch": 132.6086956521739, + "grad_norm": 3.503068208694458, + "learning_rate": 6.739130434782609e-06, + "loss": 0.4234, + "step": 3050 + }, + { + "epoch": 133.0, + "eval_accuracy": 0.7992125984251969, + "eval_loss": 0.5119389891624451, + "eval_runtime": 4.8197, + "eval_samples_per_second": 52.701, + "eval_steps_per_second": 0.415, + "step": 3059 + }, + { + "epoch": 133.04347826086956, + "grad_norm": 2.628817319869995, + "learning_rate": 6.695652173913044e-06, + "loss": 0.3984, + "step": 3060 + }, + { + "epoch": 133.47826086956522, + "grad_norm": 3.1060750484466553, + "learning_rate": 6.652173913043479e-06, + "loss": 0.4147, + "step": 3070 + }, + { + "epoch": 133.91304347826087, + "grad_norm": 6.7668328285217285, + "learning_rate": 6.6086956521739135e-06, + "loss": 0.3721, + "step": 3080 + }, + { + "epoch": 134.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.51175457239151, + "eval_runtime": 3.7909, + "eval_samples_per_second": 67.003, + "eval_steps_per_second": 0.528, + "step": 3082 + }, + { + "epoch": 134.34782608695653, + "grad_norm": 6.763729572296143, + "learning_rate": 6.565217391304349e-06, + "loss": 0.386, + "step": 3090 + }, + { + "epoch": 134.7826086956522, + "grad_norm": 4.876804828643799, + "learning_rate": 6.521739130434783e-06, + "loss": 0.3709, + "step": 3100 + }, + { + "epoch": 135.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.5078221559524536, + "eval_runtime": 3.7684, + "eval_samples_per_second": 67.402, + "eval_steps_per_second": 0.531, + "step": 3105 + }, + { + "epoch": 135.2173913043478, + "grad_norm": 3.7445313930511475, + "learning_rate": 6.478260869565218e-06, + "loss": 0.3592, + "step": 3110 + }, + { + "epoch": 135.65217391304347, + "grad_norm": 5.715231418609619, + "learning_rate": 6.434782608695652e-06, + "loss": 0.4149, + "step": 3120 + }, + { + "epoch": 136.0, + "eval_accuracy": 0.7795275590551181, + "eval_loss": 0.5163589715957642, + "eval_runtime": 4.6746, + "eval_samples_per_second": 54.336, + "eval_steps_per_second": 0.428, + "step": 3128 + }, + { + "epoch": 136.08695652173913, + "grad_norm": 3.3850629329681396, + "learning_rate": 6.391304347826087e-06, + "loss": 0.3681, + "step": 3130 + }, + { + "epoch": 136.52173913043478, + "grad_norm": 5.502380847930908, + "learning_rate": 6.3478260869565225e-06, + "loss": 0.3629, + "step": 3140 + }, + { + "epoch": 136.95652173913044, + "grad_norm": 4.158088684082031, + "learning_rate": 6.304347826086958e-06, + "loss": 0.416, + "step": 3150 + }, + { + "epoch": 137.0, + "eval_accuracy": 0.7834645669291339, + "eval_loss": 0.5123007297515869, + "eval_runtime": 3.789, + "eval_samples_per_second": 67.036, + "eval_steps_per_second": 0.528, + "step": 3151 + }, + { + "epoch": 137.3913043478261, + "grad_norm": 2.241478681564331, + "learning_rate": 6.260869565217392e-06, + "loss": 0.4089, + "step": 3160 + }, + { + "epoch": 137.82608695652175, + "grad_norm": 4.336514472961426, + "learning_rate": 6.217391304347826e-06, + "loss": 0.406, + "step": 3170 + }, + { + "epoch": 138.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.5115824937820435, + "eval_runtime": 3.7195, + "eval_samples_per_second": 68.288, + "eval_steps_per_second": 0.538, + "step": 3174 + }, + { + "epoch": 138.2608695652174, + "grad_norm": 2.154179334640503, + "learning_rate": 6.173913043478261e-06, + "loss": 0.4018, + "step": 3180 + }, + { + "epoch": 138.69565217391303, + "grad_norm": 3.2215845584869385, + "learning_rate": 6.1304347826086965e-06, + "loss": 0.3613, + "step": 3190 + }, + { + "epoch": 139.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.5169662237167358, + "eval_runtime": 4.4593, + "eval_samples_per_second": 56.959, + "eval_steps_per_second": 0.448, + "step": 3197 + }, + { + "epoch": 139.1304347826087, + "grad_norm": 2.800915241241455, + "learning_rate": 6.086956521739132e-06, + "loss": 0.3863, + "step": 3200 + }, + { + "epoch": 139.56521739130434, + "grad_norm": 7.433578014373779, + "learning_rate": 6.043478260869565e-06, + "loss": 0.4278, + "step": 3210 + }, + { + "epoch": 140.0, + "grad_norm": 3.887300968170166, + "learning_rate": 6e-06, + "loss": 0.3786, + "step": 3220 + }, + { + "epoch": 140.0, + "eval_accuracy": 0.8031496062992126, + "eval_loss": 0.5098868608474731, + "eval_runtime": 4.1343, + "eval_samples_per_second": 61.437, + "eval_steps_per_second": 0.484, + "step": 3220 + }, + { + "epoch": 140.43478260869566, + "grad_norm": 3.3379013538360596, + "learning_rate": 5.956521739130435e-06, + "loss": 0.405, + "step": 3230 + }, + { + "epoch": 140.8695652173913, + "grad_norm": 3.2763419151306152, + "learning_rate": 5.91304347826087e-06, + "loss": 0.3976, + "step": 3240 + }, + { + "epoch": 141.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.5111474394798279, + "eval_runtime": 3.8029, + "eval_samples_per_second": 66.792, + "eval_steps_per_second": 0.526, + "step": 3243 + }, + { + "epoch": 141.30434782608697, + "grad_norm": 3.1908023357391357, + "learning_rate": 5.8695652173913055e-06, + "loss": 0.3856, + "step": 3250 + }, + { + "epoch": 141.7391304347826, + "grad_norm": 3.875778913497925, + "learning_rate": 5.826086956521739e-06, + "loss": 0.371, + "step": 3260 + }, + { + "epoch": 142.0, + "eval_accuracy": 0.7952755905511811, + "eval_loss": 0.5081124901771545, + "eval_runtime": 4.5605, + "eval_samples_per_second": 55.696, + "eval_steps_per_second": 0.439, + "step": 3266 + }, + { + "epoch": 142.17391304347825, + "grad_norm": 2.925506353378296, + "learning_rate": 5.782608695652174e-06, + "loss": 0.4169, + "step": 3270 + }, + { + "epoch": 142.6086956521739, + "grad_norm": 9.266388893127441, + "learning_rate": 5.739130434782609e-06, + "loss": 0.4056, + "step": 3280 + }, + { + "epoch": 143.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.5098369717597961, + "eval_runtime": 3.9928, + "eval_samples_per_second": 63.615, + "eval_steps_per_second": 0.501, + "step": 3289 + }, + { + "epoch": 143.04347826086956, + "grad_norm": 25.856365203857422, + "learning_rate": 5.695652173913044e-06, + "loss": 0.3757, + "step": 3290 + }, + { + "epoch": 143.47826086956522, + "grad_norm": 2.937258720397949, + "learning_rate": 5.652173913043479e-06, + "loss": 0.3745, + "step": 3300 + }, + { + "epoch": 143.91304347826087, + "grad_norm": 3.236806631088257, + "learning_rate": 5.608695652173914e-06, + "loss": 0.4214, + "step": 3310 + }, + { + "epoch": 144.0, + "eval_accuracy": 0.7952755905511811, + "eval_loss": 0.5085259675979614, + "eval_runtime": 3.7621, + "eval_samples_per_second": 67.516, + "eval_steps_per_second": 0.532, + "step": 3312 + }, + { + "epoch": 144.34782608695653, + "grad_norm": 3.6454241275787354, + "learning_rate": 5.565217391304348e-06, + "loss": 0.3659, + "step": 3320 + }, + { + "epoch": 144.7826086956522, + "grad_norm": 3.4510464668273926, + "learning_rate": 5.521739130434783e-06, + "loss": 0.3832, + "step": 3330 + }, + { + "epoch": 145.0, + "eval_accuracy": 0.7952755905511811, + "eval_loss": 0.5084368586540222, + "eval_runtime": 4.2307, + "eval_samples_per_second": 60.037, + "eval_steps_per_second": 0.473, + "step": 3335 + }, + { + "epoch": 145.2173913043478, + "grad_norm": 2.4478542804718018, + "learning_rate": 5.478260869565217e-06, + "loss": 0.3522, + "step": 3340 + }, + { + "epoch": 145.65217391304347, + "grad_norm": 4.097745895385742, + "learning_rate": 5.4347826086956525e-06, + "loss": 0.3762, + "step": 3350 + }, + { + "epoch": 146.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.5060733556747437, + "eval_runtime": 4.203, + "eval_samples_per_second": 60.433, + "eval_steps_per_second": 0.476, + "step": 3358 + }, + { + "epoch": 146.08695652173913, + "grad_norm": 3.384960651397705, + "learning_rate": 5.391304347826088e-06, + "loss": 0.3761, + "step": 3360 + }, + { + "epoch": 146.52173913043478, + "grad_norm": 2.909395217895508, + "learning_rate": 5.347826086956523e-06, + "loss": 0.3902, + "step": 3370 + }, + { + "epoch": 146.95652173913044, + "grad_norm": 2.538163900375366, + "learning_rate": 5.304347826086957e-06, + "loss": 0.4118, + "step": 3380 + }, + { + "epoch": 147.0, + "eval_accuracy": 0.7992125984251969, + "eval_loss": 0.5111083984375, + "eval_runtime": 3.8334, + "eval_samples_per_second": 66.26, + "eval_steps_per_second": 0.522, + "step": 3381 + }, + { + "epoch": 147.3913043478261, + "grad_norm": 2.9644970893859863, + "learning_rate": 5.260869565217391e-06, + "loss": 0.3802, + "step": 3390 + }, + { + "epoch": 147.82608695652175, + "grad_norm": 3.0972464084625244, + "learning_rate": 5.2173913043478265e-06, + "loss": 0.3866, + "step": 3400 + }, + { + "epoch": 148.0, + "eval_accuracy": 0.8070866141732284, + "eval_loss": 0.5092455148696899, + "eval_runtime": 5.6174, + "eval_samples_per_second": 45.216, + "eval_steps_per_second": 0.356, + "step": 3404 + }, + { + "epoch": 148.2608695652174, + "grad_norm": 1.625214695930481, + "learning_rate": 5.173913043478262e-06, + "loss": 0.3584, + "step": 3410 + }, + { + "epoch": 148.69565217391303, + "grad_norm": 15.01403522491455, + "learning_rate": 5.130434782608697e-06, + "loss": 0.3869, + "step": 3420 + }, + { + "epoch": 149.0, + "eval_accuracy": 0.7952755905511811, + "eval_loss": 0.512187659740448, + "eval_runtime": 3.8515, + "eval_samples_per_second": 65.948, + "eval_steps_per_second": 0.519, + "step": 3427 + }, + { + "epoch": 149.1304347826087, + "grad_norm": 1.9776344299316406, + "learning_rate": 5.08695652173913e-06, + "loss": 0.3921, + "step": 3430 + }, + { + "epoch": 149.56521739130434, + "grad_norm": 2.336129665374756, + "learning_rate": 5.043478260869565e-06, + "loss": 0.4048, + "step": 3440 + }, + { + "epoch": 150.0, + "grad_norm": 3.6398816108703613, + "learning_rate": 5e-06, + "loss": 0.3734, + "step": 3450 + }, + { + "epoch": 150.0, + "eval_accuracy": 0.7952755905511811, + "eval_loss": 0.5116916298866272, + "eval_runtime": 3.7475, + "eval_samples_per_second": 67.779, + "eval_steps_per_second": 0.534, + "step": 3450 + }, + { + "epoch": 150.43478260869566, + "grad_norm": 2.299021005630493, + "learning_rate": 4.9565217391304355e-06, + "loss": 0.3734, + "step": 3460 + }, + { + "epoch": 150.8695652173913, + "grad_norm": 3.107494831085205, + "learning_rate": 4.91304347826087e-06, + "loss": 0.4061, + "step": 3470 + }, + { + "epoch": 151.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.5094764232635498, + "eval_runtime": 4.4075, + "eval_samples_per_second": 57.629, + "eval_steps_per_second": 0.454, + "step": 3473 + }, + { + "epoch": 151.30434782608697, + "grad_norm": 2.319066286087036, + "learning_rate": 4.869565217391305e-06, + "loss": 0.3681, + "step": 3480 + }, + { + "epoch": 151.7391304347826, + "grad_norm": 2.7603538036346436, + "learning_rate": 4.826086956521739e-06, + "loss": 0.3705, + "step": 3490 + }, + { + "epoch": 152.0, + "eval_accuracy": 0.7952755905511811, + "eval_loss": 0.5171404480934143, + "eval_runtime": 4.362, + "eval_samples_per_second": 58.23, + "eval_steps_per_second": 0.459, + "step": 3496 + }, + { + "epoch": 152.17391304347825, + "grad_norm": 2.0375826358795166, + "learning_rate": 4.782608695652174e-06, + "loss": 0.3882, + "step": 3500 + }, + { + "epoch": 152.6086956521739, + "grad_norm": 2.8498833179473877, + "learning_rate": 4.739130434782609e-06, + "loss": 0.3873, + "step": 3510 + }, + { + "epoch": 153.0, + "eval_accuracy": 0.7952755905511811, + "eval_loss": 0.5179200768470764, + "eval_runtime": 3.7588, + "eval_samples_per_second": 67.575, + "eval_steps_per_second": 0.532, + "step": 3519 + }, + { + "epoch": 153.04347826086956, + "grad_norm": 2.707977533340454, + "learning_rate": 4.695652173913044e-06, + "loss": 0.3979, + "step": 3520 + }, + { + "epoch": 153.47826086956522, + "grad_norm": 3.5183486938476562, + "learning_rate": 4.652173913043478e-06, + "loss": 0.4025, + "step": 3530 + }, + { + "epoch": 153.91304347826087, + "grad_norm": 2.90291166305542, + "learning_rate": 4.608695652173913e-06, + "loss": 0.3927, + "step": 3540 + }, + { + "epoch": 154.0, + "eval_accuracy": 0.7992125984251969, + "eval_loss": 0.5117496252059937, + "eval_runtime": 3.7541, + "eval_samples_per_second": 67.659, + "eval_steps_per_second": 0.533, + "step": 3542 + }, + { + "epoch": 154.34782608695653, + "grad_norm": 4.005958080291748, + "learning_rate": 4.565217391304348e-06, + "loss": 0.4011, + "step": 3550 + }, + { + "epoch": 154.7826086956522, + "grad_norm": 2.469202995300293, + "learning_rate": 4.5217391304347826e-06, + "loss": 0.3807, + "step": 3560 + }, + { + "epoch": 155.0, + "eval_accuracy": 0.7952755905511811, + "eval_loss": 0.5133464932441711, + "eval_runtime": 5.8154, + "eval_samples_per_second": 43.677, + "eval_steps_per_second": 0.344, + "step": 3565 + }, + { + "epoch": 155.2173913043478, + "grad_norm": 3.2248237133026123, + "learning_rate": 4.478260869565218e-06, + "loss": 0.4498, + "step": 3570 + }, + { + "epoch": 155.65217391304347, + "grad_norm": 3.463270425796509, + "learning_rate": 4.434782608695653e-06, + "loss": 0.3761, + "step": 3580 + }, + { + "epoch": 156.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.5140319466590881, + "eval_runtime": 3.7668, + "eval_samples_per_second": 67.432, + "eval_steps_per_second": 0.531, + "step": 3588 + }, + { + "epoch": 156.08695652173913, + "grad_norm": 3.640611171722412, + "learning_rate": 4.391304347826087e-06, + "loss": 0.3609, + "step": 3590 + }, + { + "epoch": 156.52173913043478, + "grad_norm": 4.198793888092041, + "learning_rate": 4.347826086956522e-06, + "loss": 0.3984, + "step": 3600 + }, + { + "epoch": 156.95652173913044, + "grad_norm": 2.9035775661468506, + "learning_rate": 4.304347826086957e-06, + "loss": 0.3964, + "step": 3610 + }, + { + "epoch": 157.0, + "eval_accuracy": 0.7952755905511811, + "eval_loss": 0.5117691159248352, + "eval_runtime": 3.7832, + "eval_samples_per_second": 67.138, + "eval_steps_per_second": 0.529, + "step": 3611 + }, + { + "epoch": 157.3913043478261, + "grad_norm": 5.13762092590332, + "learning_rate": 4.260869565217392e-06, + "loss": 0.3818, + "step": 3620 + }, + { + "epoch": 157.82608695652175, + "grad_norm": 8.948963165283203, + "learning_rate": 4.217391304347827e-06, + "loss": 0.39, + "step": 3630 + }, + { + "epoch": 158.0, + "eval_accuracy": 0.8031496062992126, + "eval_loss": 0.5122236609458923, + "eval_runtime": 4.6309, + "eval_samples_per_second": 54.849, + "eval_steps_per_second": 0.432, + "step": 3634 + }, + { + "epoch": 158.2608695652174, + "grad_norm": 2.4759654998779297, + "learning_rate": 4.173913043478261e-06, + "loss": 0.3784, + "step": 3640 + }, + { + "epoch": 158.69565217391303, + "grad_norm": 2.407663106918335, + "learning_rate": 4.130434782608696e-06, + "loss": 0.3943, + "step": 3650 + }, + { + "epoch": 159.0, + "eval_accuracy": 0.8031496062992126, + "eval_loss": 0.5125917196273804, + "eval_runtime": 3.7278, + "eval_samples_per_second": 68.138, + "eval_steps_per_second": 0.537, + "step": 3657 + }, + { + "epoch": 159.1304347826087, + "grad_norm": 2.2464840412139893, + "learning_rate": 4.086956521739131e-06, + "loss": 0.3675, + "step": 3660 + }, + { + "epoch": 159.56521739130434, + "grad_norm": 3.0186944007873535, + "learning_rate": 4.0434782608695655e-06, + "loss": 0.355, + "step": 3670 + }, + { + "epoch": 160.0, + "grad_norm": 9.606362342834473, + "learning_rate": 4.000000000000001e-06, + "loss": 0.3417, + "step": 3680 + }, + { + "epoch": 160.0, + "eval_accuracy": 0.7992125984251969, + "eval_loss": 0.5096677541732788, + "eval_runtime": 3.7505, + "eval_samples_per_second": 67.724, + "eval_steps_per_second": 0.533, + "step": 3680 + }, + { + "epoch": 160.43478260869566, + "grad_norm": 3.155024766921997, + "learning_rate": 3.956521739130435e-06, + "loss": 0.3951, + "step": 3690 + }, + { + "epoch": 160.8695652173913, + "grad_norm": 2.3195645809173584, + "learning_rate": 3.91304347826087e-06, + "loss": 0.3996, + "step": 3700 + }, + { + "epoch": 161.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.5048008561134338, + "eval_runtime": 4.9463, + "eval_samples_per_second": 51.351, + "eval_steps_per_second": 0.404, + "step": 3703 + }, + { + "epoch": 161.30434782608697, + "grad_norm": 16.818618774414062, + "learning_rate": 3.869565217391304e-06, + "loss": 0.3613, + "step": 3710 + }, + { + "epoch": 161.7391304347826, + "grad_norm": 5.290389060974121, + "learning_rate": 3.8260869565217395e-06, + "loss": 0.4, + "step": 3720 + }, + { + "epoch": 162.0, + "eval_accuracy": 0.7952755905511811, + "eval_loss": 0.5148473978042603, + "eval_runtime": 3.7348, + "eval_samples_per_second": 68.008, + "eval_steps_per_second": 0.535, + "step": 3726 + }, + { + "epoch": 162.17391304347825, + "grad_norm": 4.7519330978393555, + "learning_rate": 3.782608695652174e-06, + "loss": 0.3983, + "step": 3730 + }, + { + "epoch": 162.6086956521739, + "grad_norm": 2.433164358139038, + "learning_rate": 3.739130434782609e-06, + "loss": 0.4051, + "step": 3740 + }, + { + "epoch": 163.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.5150399804115295, + "eval_runtime": 3.7013, + "eval_samples_per_second": 68.625, + "eval_steps_per_second": 0.54, + "step": 3749 + }, + { + "epoch": 163.04347826086956, + "grad_norm": 2.870962381362915, + "learning_rate": 3.6956521739130436e-06, + "loss": 0.3903, + "step": 3750 + }, + { + "epoch": 163.47826086956522, + "grad_norm": 3.3795669078826904, + "learning_rate": 3.6521739130434787e-06, + "loss": 0.3981, + "step": 3760 + }, + { + "epoch": 163.91304347826087, + "grad_norm": 4.447073936462402, + "learning_rate": 3.6086956521739134e-06, + "loss": 0.3973, + "step": 3770 + }, + { + "epoch": 164.0, + "eval_accuracy": 0.8031496062992126, + "eval_loss": 0.5036624073982239, + "eval_runtime": 4.8343, + "eval_samples_per_second": 52.541, + "eval_steps_per_second": 0.414, + "step": 3772 + }, + { + "epoch": 164.34782608695653, + "grad_norm": 2.5403716564178467, + "learning_rate": 3.565217391304348e-06, + "loss": 0.3586, + "step": 3780 + }, + { + "epoch": 164.7826086956522, + "grad_norm": 2.5216853618621826, + "learning_rate": 3.5217391304347832e-06, + "loss": 0.3963, + "step": 3790 + }, + { + "epoch": 165.0, + "eval_accuracy": 0.7952755905511811, + "eval_loss": 0.5048288702964783, + "eval_runtime": 3.7404, + "eval_samples_per_second": 67.907, + "eval_steps_per_second": 0.535, + "step": 3795 + }, + { + "epoch": 165.2173913043478, + "grad_norm": 3.382376194000244, + "learning_rate": 3.4782608695652175e-06, + "loss": 0.4012, + "step": 3800 + }, + { + "epoch": 165.65217391304347, + "grad_norm": 3.0021872520446777, + "learning_rate": 3.4347826086956526e-06, + "loss": 0.3568, + "step": 3810 + }, + { + "epoch": 166.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.5167564749717712, + "eval_runtime": 3.6895, + "eval_samples_per_second": 68.845, + "eval_steps_per_second": 0.542, + "step": 3818 + }, + { + "epoch": 166.08695652173913, + "grad_norm": 4.209798812866211, + "learning_rate": 3.391304347826087e-06, + "loss": 0.4217, + "step": 3820 + }, + { + "epoch": 166.52173913043478, + "grad_norm": 2.3605332374572754, + "learning_rate": 3.347826086956522e-06, + "loss": 0.3897, + "step": 3830 + }, + { + "epoch": 166.95652173913044, + "grad_norm": 7.9494733810424805, + "learning_rate": 3.3043478260869567e-06, + "loss": 0.3995, + "step": 3840 + }, + { + "epoch": 167.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.5096150636672974, + "eval_runtime": 4.9956, + "eval_samples_per_second": 50.845, + "eval_steps_per_second": 0.4, + "step": 3841 + }, + { + "epoch": 167.3913043478261, + "grad_norm": 3.431043863296509, + "learning_rate": 3.2608695652173914e-06, + "loss": 0.3765, + "step": 3850 + }, + { + "epoch": 167.82608695652175, + "grad_norm": 3.4384922981262207, + "learning_rate": 3.217391304347826e-06, + "loss": 0.3628, + "step": 3860 + }, + { + "epoch": 168.0, + "eval_accuracy": 0.7952755905511811, + "eval_loss": 0.5101594924926758, + "eval_runtime": 3.7705, + "eval_samples_per_second": 67.365, + "eval_steps_per_second": 0.53, + "step": 3864 + }, + { + "epoch": 168.2608695652174, + "grad_norm": 8.502880096435547, + "learning_rate": 3.1739130434782613e-06, + "loss": 0.3857, + "step": 3870 + }, + { + "epoch": 168.69565217391303, + "grad_norm": 2.5634241104125977, + "learning_rate": 3.130434782608696e-06, + "loss": 0.3836, + "step": 3880 + }, + { + "epoch": 169.0, + "eval_accuracy": 0.7952755905511811, + "eval_loss": 0.5133307576179504, + "eval_runtime": 3.7532, + "eval_samples_per_second": 67.676, + "eval_steps_per_second": 0.533, + "step": 3887 + }, + { + "epoch": 169.1304347826087, + "grad_norm": 3.617677927017212, + "learning_rate": 3.0869565217391307e-06, + "loss": 0.4251, + "step": 3890 + }, + { + "epoch": 169.56521739130434, + "grad_norm": 3.9091439247131348, + "learning_rate": 3.043478260869566e-06, + "loss": 0.3747, + "step": 3900 + }, + { + "epoch": 170.0, + "grad_norm": 12.626005172729492, + "learning_rate": 3e-06, + "loss": 0.3646, + "step": 3910 + }, + { + "epoch": 170.0, + "eval_accuracy": 0.8031496062992126, + "eval_loss": 0.5099019408226013, + "eval_runtime": 4.85, + "eval_samples_per_second": 52.372, + "eval_steps_per_second": 0.412, + "step": 3910 + }, + { + "epoch": 170.43478260869566, + "grad_norm": 5.3712263107299805, + "learning_rate": 2.956521739130435e-06, + "loss": 0.3732, + "step": 3920 + }, + { + "epoch": 170.8695652173913, + "grad_norm": 2.489645481109619, + "learning_rate": 2.9130434782608695e-06, + "loss": 0.3789, + "step": 3930 + }, + { + "epoch": 171.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.5151440501213074, + "eval_runtime": 3.7265, + "eval_samples_per_second": 68.161, + "eval_steps_per_second": 0.537, + "step": 3933 + }, + { + "epoch": 171.30434782608697, + "grad_norm": 60.279747009277344, + "learning_rate": 2.8695652173913046e-06, + "loss": 0.3372, + "step": 3940 + }, + { + "epoch": 171.7391304347826, + "grad_norm": 5.177385330200195, + "learning_rate": 2.8260869565217393e-06, + "loss": 0.3832, + "step": 3950 + }, + { + "epoch": 172.0, + "eval_accuracy": 0.8031496062992126, + "eval_loss": 0.5148643255233765, + "eval_runtime": 3.7835, + "eval_samples_per_second": 67.134, + "eval_steps_per_second": 0.529, + "step": 3956 + }, + { + "epoch": 172.17391304347825, + "grad_norm": 2.5841851234436035, + "learning_rate": 2.782608695652174e-06, + "loss": 0.405, + "step": 3960 + }, + { + "epoch": 172.6086956521739, + "grad_norm": 2.6472222805023193, + "learning_rate": 2.7391304347826087e-06, + "loss": 0.3476, + "step": 3970 + }, + { + "epoch": 173.0, + "eval_accuracy": 0.7834645669291339, + "eval_loss": 0.5178123116493225, + "eval_runtime": 5.0055, + "eval_samples_per_second": 50.745, + "eval_steps_per_second": 0.4, + "step": 3979 + }, + { + "epoch": 173.04347826086956, + "grad_norm": 2.3995625972747803, + "learning_rate": 2.695652173913044e-06, + "loss": 0.4347, + "step": 3980 + }, + { + "epoch": 173.47826086956522, + "grad_norm": 4.958439826965332, + "learning_rate": 2.6521739130434785e-06, + "loss": 0.3886, + "step": 3990 + }, + { + "epoch": 173.91304347826087, + "grad_norm": 4.661713600158691, + "learning_rate": 2.6086956521739132e-06, + "loss": 0.3806, + "step": 4000 + }, + { + "epoch": 174.0, + "eval_accuracy": 0.7992125984251969, + "eval_loss": 0.5080812573432922, + "eval_runtime": 3.6838, + "eval_samples_per_second": 68.951, + "eval_steps_per_second": 0.543, + "step": 4002 + }, + { + "epoch": 174.34782608695653, + "grad_norm": 2.979862928390503, + "learning_rate": 2.5652173913043484e-06, + "loss": 0.3429, + "step": 4010 + }, + { + "epoch": 174.7826086956522, + "grad_norm": 1.8571139574050903, + "learning_rate": 2.5217391304347826e-06, + "loss": 0.4053, + "step": 4020 + }, + { + "epoch": 175.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.5099707245826721, + "eval_runtime": 3.7194, + "eval_samples_per_second": 68.291, + "eval_steps_per_second": 0.538, + "step": 4025 + }, + { + "epoch": 175.2173913043478, + "grad_norm": 2.364047050476074, + "learning_rate": 2.4782608695652178e-06, + "loss": 0.3774, + "step": 4030 + }, + { + "epoch": 175.65217391304347, + "grad_norm": 4.220658779144287, + "learning_rate": 2.4347826086956525e-06, + "loss": 0.3986, + "step": 4040 + }, + { + "epoch": 176.0, + "eval_accuracy": 0.7992125984251969, + "eval_loss": 0.5189133286476135, + "eval_runtime": 5.0478, + "eval_samples_per_second": 50.319, + "eval_steps_per_second": 0.396, + "step": 4048 + }, + { + "epoch": 176.08695652173913, + "grad_norm": 2.9689295291900635, + "learning_rate": 2.391304347826087e-06, + "loss": 0.4225, + "step": 4050 + }, + { + "epoch": 176.52173913043478, + "grad_norm": 3.78476881980896, + "learning_rate": 2.347826086956522e-06, + "loss": 0.3798, + "step": 4060 + }, + { + "epoch": 176.95652173913044, + "grad_norm": 2.3258774280548096, + "learning_rate": 2.3043478260869566e-06, + "loss": 0.3827, + "step": 4070 + }, + { + "epoch": 177.0, + "eval_accuracy": 0.7992125984251969, + "eval_loss": 0.5128843784332275, + "eval_runtime": 3.7539, + "eval_samples_per_second": 67.663, + "eval_steps_per_second": 0.533, + "step": 4071 + }, + { + "epoch": 177.3913043478261, + "grad_norm": 2.329585313796997, + "learning_rate": 2.2608695652173913e-06, + "loss": 0.329, + "step": 4080 + }, + { + "epoch": 177.82608695652175, + "grad_norm": 3.0889029502868652, + "learning_rate": 2.2173913043478264e-06, + "loss": 0.3892, + "step": 4090 + }, + { + "epoch": 178.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.5099364519119263, + "eval_runtime": 3.764, + "eval_samples_per_second": 67.482, + "eval_steps_per_second": 0.531, + "step": 4094 + }, + { + "epoch": 178.2608695652174, + "grad_norm": 3.167226791381836, + "learning_rate": 2.173913043478261e-06, + "loss": 0.3801, + "step": 4100 + }, + { + "epoch": 178.69565217391303, + "grad_norm": 2.857957601547241, + "learning_rate": 2.130434782608696e-06, + "loss": 0.3955, + "step": 4110 + }, + { + "epoch": 179.0, + "eval_accuracy": 0.7992125984251969, + "eval_loss": 0.5212357640266418, + "eval_runtime": 4.8308, + "eval_samples_per_second": 52.579, + "eval_steps_per_second": 0.414, + "step": 4117 + }, + { + "epoch": 179.1304347826087, + "grad_norm": 8.153979301452637, + "learning_rate": 2.0869565217391305e-06, + "loss": 0.4062, + "step": 4120 + }, + { + "epoch": 179.56521739130434, + "grad_norm": 3.2647910118103027, + "learning_rate": 2.0434782608695656e-06, + "loss": 0.3603, + "step": 4130 + }, + { + "epoch": 180.0, + "grad_norm": 4.87031364440918, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.4077, + "step": 4140 + }, + { + "epoch": 180.0, + "eval_accuracy": 0.7952755905511811, + "eval_loss": 0.5101702213287354, + "eval_runtime": 3.8052, + "eval_samples_per_second": 66.75, + "eval_steps_per_second": 0.526, + "step": 4140 + }, + { + "epoch": 180.43478260869566, + "grad_norm": 3.3625569343566895, + "learning_rate": 1.956521739130435e-06, + "loss": 0.3881, + "step": 4150 + }, + { + "epoch": 180.8695652173913, + "grad_norm": 3.717646360397339, + "learning_rate": 1.9130434782608697e-06, + "loss": 0.3579, + "step": 4160 + }, + { + "epoch": 181.0, + "eval_accuracy": 0.7952755905511811, + "eval_loss": 0.5099858641624451, + "eval_runtime": 3.707, + "eval_samples_per_second": 68.52, + "eval_steps_per_second": 0.54, + "step": 4163 + }, + { + "epoch": 181.30434782608697, + "grad_norm": 2.5178964138031006, + "learning_rate": 1.8695652173913044e-06, + "loss": 0.3828, + "step": 4170 + }, + { + "epoch": 181.7391304347826, + "grad_norm": 3.244948625564575, + "learning_rate": 1.8260869565217394e-06, + "loss": 0.3666, + "step": 4180 + }, + { + "epoch": 182.0, + "eval_accuracy": 0.7834645669291339, + "eval_loss": 0.5247715711593628, + "eval_runtime": 4.2228, + "eval_samples_per_second": 60.149, + "eval_steps_per_second": 0.474, + "step": 4186 + }, + { + "epoch": 182.17391304347825, + "grad_norm": 3.418851613998413, + "learning_rate": 1.782608695652174e-06, + "loss": 0.4, + "step": 4190 + }, + { + "epoch": 182.6086956521739, + "grad_norm": 2.247349262237549, + "learning_rate": 1.7391304347826088e-06, + "loss": 0.3746, + "step": 4200 + }, + { + "epoch": 183.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.5220462083816528, + "eval_runtime": 4.3605, + "eval_samples_per_second": 58.25, + "eval_steps_per_second": 0.459, + "step": 4209 + }, + { + "epoch": 183.04347826086956, + "grad_norm": 5.591789245605469, + "learning_rate": 1.6956521739130435e-06, + "loss": 0.3971, + "step": 4210 + }, + { + "epoch": 183.47826086956522, + "grad_norm": 2.8663575649261475, + "learning_rate": 1.6521739130434784e-06, + "loss": 0.3516, + "step": 4220 + }, + { + "epoch": 183.91304347826087, + "grad_norm": 5.791408061981201, + "learning_rate": 1.608695652173913e-06, + "loss": 0.3867, + "step": 4230 + }, + { + "epoch": 184.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.5172824859619141, + "eval_runtime": 3.8331, + "eval_samples_per_second": 66.265, + "eval_steps_per_second": 0.522, + "step": 4232 + }, + { + "epoch": 184.34782608695653, + "grad_norm": 3.3605191707611084, + "learning_rate": 1.565217391304348e-06, + "loss": 0.3911, + "step": 4240 + }, + { + "epoch": 184.7826086956522, + "grad_norm": 3.4683103561401367, + "learning_rate": 1.521739130434783e-06, + "loss": 0.4024, + "step": 4250 + }, + { + "epoch": 185.0, + "eval_accuracy": 0.7874015748031497, + "eval_loss": 0.5248106122016907, + "eval_runtime": 4.5705, + "eval_samples_per_second": 55.574, + "eval_steps_per_second": 0.438, + "step": 4255 + }, + { + "epoch": 185.2173913043478, + "grad_norm": 4.495180130004883, + "learning_rate": 1.4782608695652176e-06, + "loss": 0.3931, + "step": 4260 + }, + { + "epoch": 185.65217391304347, + "grad_norm": 4.51051139831543, + "learning_rate": 1.4347826086956523e-06, + "loss": 0.4014, + "step": 4270 + }, + { + "epoch": 186.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.5084752440452576, + "eval_runtime": 4.1594, + "eval_samples_per_second": 61.066, + "eval_steps_per_second": 0.481, + "step": 4278 + }, + { + "epoch": 186.08695652173913, + "grad_norm": 6.847979545593262, + "learning_rate": 1.391304347826087e-06, + "loss": 0.3887, + "step": 4280 + }, + { + "epoch": 186.52173913043478, + "grad_norm": 8.414494514465332, + "learning_rate": 1.347826086956522e-06, + "loss": 0.3876, + "step": 4290 + }, + { + "epoch": 186.95652173913044, + "grad_norm": 2.0459609031677246, + "learning_rate": 1.3043478260869566e-06, + "loss": 0.3445, + "step": 4300 + }, + { + "epoch": 187.0, + "eval_accuracy": 0.8031496062992126, + "eval_loss": 0.5136986970901489, + "eval_runtime": 3.7104, + "eval_samples_per_second": 68.456, + "eval_steps_per_second": 0.539, + "step": 4301 + }, + { + "epoch": 187.3913043478261, + "grad_norm": 2.7707877159118652, + "learning_rate": 1.2608695652173913e-06, + "loss": 0.4067, + "step": 4310 + }, + { + "epoch": 187.82608695652175, + "grad_norm": 2.2277884483337402, + "learning_rate": 1.2173913043478262e-06, + "loss": 0.382, + "step": 4320 + }, + { + "epoch": 188.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.521314799785614, + "eval_runtime": 4.1528, + "eval_samples_per_second": 61.164, + "eval_steps_per_second": 0.482, + "step": 4324 + }, + { + "epoch": 188.2608695652174, + "grad_norm": 4.299314498901367, + "learning_rate": 1.173913043478261e-06, + "loss": 0.3717, + "step": 4330 + }, + { + "epoch": 188.69565217391303, + "grad_norm": 2.479510545730591, + "learning_rate": 1.1304347826086956e-06, + "loss": 0.3673, + "step": 4340 + }, + { + "epoch": 189.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.5241702795028687, + "eval_runtime": 4.1853, + "eval_samples_per_second": 60.689, + "eval_steps_per_second": 0.478, + "step": 4347 + }, + { + "epoch": 189.1304347826087, + "grad_norm": 3.9942944049835205, + "learning_rate": 1.0869565217391306e-06, + "loss": 0.4158, + "step": 4350 + }, + { + "epoch": 189.56521739130434, + "grad_norm": 2.8651175498962402, + "learning_rate": 1.0434782608695653e-06, + "loss": 0.3919, + "step": 4360 + }, + { + "epoch": 190.0, + "grad_norm": 3.1065168380737305, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.3631, + "step": 4370 + }, + { + "epoch": 190.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.5146118402481079, + "eval_runtime": 3.7356, + "eval_samples_per_second": 67.995, + "eval_steps_per_second": 0.535, + "step": 4370 + }, + { + "epoch": 190.43478260869566, + "grad_norm": 3.8796093463897705, + "learning_rate": 9.565217391304349e-07, + "loss": 0.3893, + "step": 4380 + }, + { + "epoch": 190.8695652173913, + "grad_norm": 3.2894842624664307, + "learning_rate": 9.130434782608697e-07, + "loss": 0.393, + "step": 4390 + }, + { + "epoch": 191.0, + "eval_accuracy": 0.7834645669291339, + "eval_loss": 0.5097819566726685, + "eval_runtime": 3.7404, + "eval_samples_per_second": 67.908, + "eval_steps_per_second": 0.535, + "step": 4393 + }, + { + "epoch": 191.30434782608697, + "grad_norm": 2.4112348556518555, + "learning_rate": 8.695652173913044e-07, + "loss": 0.4037, + "step": 4400 + }, + { + "epoch": 191.7391304347826, + "grad_norm": 2.4510791301727295, + "learning_rate": 8.260869565217392e-07, + "loss": 0.3806, + "step": 4410 + }, + { + "epoch": 192.0, + "eval_accuracy": 0.7992125984251969, + "eval_loss": 0.5133717656135559, + "eval_runtime": 5.016, + "eval_samples_per_second": 50.638, + "eval_steps_per_second": 0.399, + "step": 4416 + }, + { + "epoch": 192.17391304347825, + "grad_norm": 3.1017332077026367, + "learning_rate": 7.82608695652174e-07, + "loss": 0.3598, + "step": 4420 + }, + { + "epoch": 192.6086956521739, + "grad_norm": 3.5164568424224854, + "learning_rate": 7.391304347826088e-07, + "loss": 0.3789, + "step": 4430 + }, + { + "epoch": 193.0, + "eval_accuracy": 0.7992125984251969, + "eval_loss": 0.5127285718917847, + "eval_runtime": 3.736, + "eval_samples_per_second": 67.988, + "eval_steps_per_second": 0.535, + "step": 4439 + }, + { + "epoch": 193.04347826086956, + "grad_norm": 86.44344329833984, + "learning_rate": 6.956521739130435e-07, + "loss": 0.3858, + "step": 4440 + }, + { + "epoch": 193.47826086956522, + "grad_norm": 2.892185688018799, + "learning_rate": 6.521739130434783e-07, + "loss": 0.3894, + "step": 4450 + }, + { + "epoch": 193.91304347826087, + "grad_norm": 2.0254733562469482, + "learning_rate": 6.086956521739131e-07, + "loss": 0.3717, + "step": 4460 + }, + { + "epoch": 194.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.5183544158935547, + "eval_runtime": 3.7197, + "eval_samples_per_second": 68.285, + "eval_steps_per_second": 0.538, + "step": 4462 + }, + { + "epoch": 194.34782608695653, + "grad_norm": 4.124297618865967, + "learning_rate": 5.652173913043478e-07, + "loss": 0.4098, + "step": 4470 + }, + { + "epoch": 194.7826086956522, + "grad_norm": 4.1497955322265625, + "learning_rate": 5.217391304347826e-07, + "loss": 0.361, + "step": 4480 + }, + { + "epoch": 195.0, + "eval_accuracy": 0.7834645669291339, + "eval_loss": 0.5185708999633789, + "eval_runtime": 4.9741, + "eval_samples_per_second": 51.064, + "eval_steps_per_second": 0.402, + "step": 4485 + }, + { + "epoch": 195.2173913043478, + "grad_norm": 11.268845558166504, + "learning_rate": 4.782608695652174e-07, + "loss": 0.3786, + "step": 4490 + }, + { + "epoch": 195.65217391304347, + "grad_norm": 3.9937920570373535, + "learning_rate": 4.347826086956522e-07, + "loss": 0.3722, + "step": 4500 + }, + { + "epoch": 196.0, + "eval_accuracy": 0.7952755905511811, + "eval_loss": 0.5107359886169434, + "eval_runtime": 3.7506, + "eval_samples_per_second": 67.723, + "eval_steps_per_second": 0.533, + "step": 4508 + }, + { + "epoch": 196.08695652173913, + "grad_norm": 2.869596004486084, + "learning_rate": 3.91304347826087e-07, + "loss": 0.3985, + "step": 4510 + }, + { + "epoch": 196.52173913043478, + "grad_norm": 6.21280574798584, + "learning_rate": 3.4782608695652175e-07, + "loss": 0.4019, + "step": 4520 + }, + { + "epoch": 196.95652173913044, + "grad_norm": 2.2324206829071045, + "learning_rate": 3.0434782608695656e-07, + "loss": 0.3551, + "step": 4530 + }, + { + "epoch": 197.0, + "eval_accuracy": 0.7952755905511811, + "eval_loss": 0.5174936056137085, + "eval_runtime": 3.6975, + "eval_samples_per_second": 68.695, + "eval_steps_per_second": 0.541, + "step": 4531 + }, + { + "epoch": 197.3913043478261, + "grad_norm": 2.6415905952453613, + "learning_rate": 2.608695652173913e-07, + "loss": 0.3919, + "step": 4540 + }, + { + "epoch": 197.82608695652175, + "grad_norm": 5.146513938903809, + "learning_rate": 2.173913043478261e-07, + "loss": 0.3649, + "step": 4550 + }, + { + "epoch": 198.0, + "eval_accuracy": 0.7992125984251969, + "eval_loss": 0.5135703682899475, + "eval_runtime": 4.9875, + "eval_samples_per_second": 50.928, + "eval_steps_per_second": 0.401, + "step": 4554 + }, + { + "epoch": 198.2608695652174, + "grad_norm": 3.1943461894989014, + "learning_rate": 1.7391304347826088e-07, + "loss": 0.3763, + "step": 4560 + }, + { + "epoch": 198.69565217391303, + "grad_norm": 2.8955743312835693, + "learning_rate": 1.3043478260869566e-07, + "loss": 0.3749, + "step": 4570 + }, + { + "epoch": 199.0, + "eval_accuracy": 0.7913385826771654, + "eval_loss": 0.5192672610282898, + "eval_runtime": 3.6944, + "eval_samples_per_second": 68.753, + "eval_steps_per_second": 0.541, + "step": 4577 + }, + { + "epoch": 199.1304347826087, + "grad_norm": 12.166488647460938, + "learning_rate": 8.695652173913044e-08, + "loss": 0.3869, + "step": 4580 + }, + { + "epoch": 199.56521739130434, + "grad_norm": 2.9687561988830566, + "learning_rate": 4.347826086956522e-08, + "loss": 0.3926, + "step": 4590 + }, + { + "epoch": 200.0, + "grad_norm": 4.834624290466309, + "learning_rate": 0.0, + "loss": 0.3782, + "step": 4600 + }, + { + "epoch": 200.0, + "eval_accuracy": 0.7992125984251969, + "eval_loss": 0.5181651711463928, + "eval_runtime": 3.7789, + "eval_samples_per_second": 67.216, + "eval_steps_per_second": 0.529, + "step": 4600 + }, + { + "epoch": 200.0, + "step": 4600, + "total_flos": 1.089869514338304e+18, + "train_loss": 0.30267460563908455, + "train_runtime": 4787.5341, + "train_samples_per_second": 59.947, + "train_steps_per_second": 0.961 } ], "logging_steps": 10, - "max_steps": 1150, + "max_steps": 4600, "num_input_tokens_seen": 0, - "num_train_epochs": 50, + "num_train_epochs": 200, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { @@ -1290,7 +5055,7 @@ "attributes": {} } }, - "total_flos": 2.72467378584576e+17, + "total_flos": 1.089869514338304e+18, "train_batch_size": 64, "trial_name": null, "trial_params": null