diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8309 @@ +{ + "best_metric": 0.9622641509433962, + "best_model_checkpoint": "wav2vec2-2Class-easy-train-test-large/checkpoint-2520", + "epoch": 782.2222222222222, + "eval_steps": 500, + "global_step": 8800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.98, + "eval_accuracy": 0.4088050314465409, + "eval_loss": 0.7003181576728821, + "eval_runtime": 1.8048, + "eval_samples_per_second": 88.1, + "eval_steps_per_second": 5.541, + "step": 11 + }, + { + "epoch": 1.96, + "eval_accuracy": 0.4088050314465409, + "eval_loss": 0.7001124620437622, + "eval_runtime": 1.7728, + "eval_samples_per_second": 89.69, + "eval_steps_per_second": 5.641, + "step": 22 + }, + { + "epoch": 2.93, + "eval_accuracy": 0.41509433962264153, + "eval_loss": 0.69970703125, + "eval_runtime": 1.7593, + "eval_samples_per_second": 90.375, + "eval_steps_per_second": 5.684, + "step": 33 + }, + { + "epoch": 4.0, + "eval_accuracy": 0.42138364779874216, + "eval_loss": 0.6991450786590576, + "eval_runtime": 1.7582, + "eval_samples_per_second": 90.433, + "eval_steps_per_second": 5.688, + "step": 45 + }, + { + "epoch": 4.44, + "grad_norm": 0.8353477716445923, + "learning_rate": 1.7045454545454546e-06, + "loss": 0.6976, + "step": 50 + }, + { + "epoch": 4.98, + "eval_accuracy": 0.4276729559748428, + "eval_loss": 0.6984724998474121, + "eval_runtime": 1.7849, + "eval_samples_per_second": 89.08, + "eval_steps_per_second": 5.603, + "step": 56 + }, + { + "epoch": 5.96, + "eval_accuracy": 0.44025157232704404, + "eval_loss": 0.697744607925415, + "eval_runtime": 2.127, + "eval_samples_per_second": 74.753, + "eval_steps_per_second": 4.701, + "step": 67 + }, + { + "epoch": 6.93, + "eval_accuracy": 0.44654088050314467, + "eval_loss": 0.6968724727630615, + "eval_runtime": 2.2513, + "eval_samples_per_second": 70.624, + "eval_steps_per_second": 4.442, + "step": 78 + }, + { + "epoch": 8.0, + "eval_accuracy": 0.46540880503144655, + "eval_loss": 0.6957085728645325, + "eval_runtime": 2.1194, + "eval_samples_per_second": 75.021, + "eval_steps_per_second": 4.718, + "step": 90 + }, + { + "epoch": 8.89, + "grad_norm": 0.45805710554122925, + "learning_rate": 3.409090909090909e-06, + "loss": 0.6952, + "step": 100 + }, + { + "epoch": 8.98, + "eval_accuracy": 0.46540880503144655, + "eval_loss": 0.6945385932922363, + "eval_runtime": 2.2918, + "eval_samples_per_second": 69.378, + "eval_steps_per_second": 4.363, + "step": 101 + }, + { + "epoch": 9.96, + "eval_accuracy": 0.4779874213836478, + "eval_loss": 0.6933900117874146, + "eval_runtime": 2.2504, + "eval_samples_per_second": 70.654, + "eval_steps_per_second": 4.444, + "step": 112 + }, + { + "epoch": 10.93, + "eval_accuracy": 0.49056603773584906, + "eval_loss": 0.692146360874176, + "eval_runtime": 2.1543, + "eval_samples_per_second": 73.804, + "eval_steps_per_second": 4.642, + "step": 123 + }, + { + "epoch": 12.0, + "eval_accuracy": 0.5471698113207547, + "eval_loss": 0.6906170845031738, + "eval_runtime": 2.0832, + "eval_samples_per_second": 76.326, + "eval_steps_per_second": 4.8, + "step": 135 + }, + { + "epoch": 12.98, + "eval_accuracy": 0.610062893081761, + "eval_loss": 0.6892228722572327, + "eval_runtime": 2.0269, + "eval_samples_per_second": 78.443, + "eval_steps_per_second": 4.934, + "step": 146 + }, + { + "epoch": 13.33, + "grad_norm": 0.6493268609046936, + "learning_rate": 5.1136363636363635e-06, + "loss": 0.6911, + "step": 150 + }, + { + "epoch": 13.96, + "eval_accuracy": 0.6037735849056604, + "eval_loss": 0.6878040432929993, + "eval_runtime": 2.1502, + "eval_samples_per_second": 73.946, + "eval_steps_per_second": 4.651, + "step": 157 + }, + { + "epoch": 14.93, + "eval_accuracy": 0.5911949685534591, + "eval_loss": 0.6863483190536499, + "eval_runtime": 2.0844, + "eval_samples_per_second": 76.279, + "eval_steps_per_second": 4.797, + "step": 168 + }, + { + "epoch": 16.0, + "eval_accuracy": 0.5911949685534591, + "eval_loss": 0.6847361326217651, + "eval_runtime": 2.1372, + "eval_samples_per_second": 74.395, + "eval_steps_per_second": 4.679, + "step": 180 + }, + { + "epoch": 16.98, + "eval_accuracy": 0.5849056603773585, + "eval_loss": 0.6830993294715881, + "eval_runtime": 2.3473, + "eval_samples_per_second": 67.739, + "eval_steps_per_second": 4.26, + "step": 191 + }, + { + "epoch": 17.78, + "grad_norm": 0.5862739086151123, + "learning_rate": 6.818181818181818e-06, + "loss": 0.6852, + "step": 200 + }, + { + "epoch": 17.96, + "eval_accuracy": 0.5849056603773585, + "eval_loss": 0.6815393567085266, + "eval_runtime": 2.1307, + "eval_samples_per_second": 74.623, + "eval_steps_per_second": 4.693, + "step": 202 + }, + { + "epoch": 18.93, + "eval_accuracy": 0.5849056603773585, + "eval_loss": 0.679994523525238, + "eval_runtime": 2.082, + "eval_samples_per_second": 76.37, + "eval_steps_per_second": 4.803, + "step": 213 + }, + { + "epoch": 20.0, + "eval_accuracy": 0.5849056603773585, + "eval_loss": 0.6782289147377014, + "eval_runtime": 2.1302, + "eval_samples_per_second": 74.641, + "eval_steps_per_second": 4.694, + "step": 225 + }, + { + "epoch": 20.98, + "eval_accuracy": 0.5849056603773585, + "eval_loss": 0.6765275001525879, + "eval_runtime": 2.0229, + "eval_samples_per_second": 78.601, + "eval_steps_per_second": 4.943, + "step": 236 + }, + { + "epoch": 21.96, + "eval_accuracy": 0.5849056603773585, + "eval_loss": 0.6749551892280579, + "eval_runtime": 2.0505, + "eval_samples_per_second": 77.542, + "eval_steps_per_second": 4.877, + "step": 247 + }, + { + "epoch": 22.22, + "grad_norm": 0.10243403911590576, + "learning_rate": 8.522727272727273e-06, + "loss": 0.6783, + "step": 250 + }, + { + "epoch": 22.93, + "eval_accuracy": 0.5849056603773585, + "eval_loss": 0.6732170581817627, + "eval_runtime": 2.0616, + "eval_samples_per_second": 77.125, + "eval_steps_per_second": 4.851, + "step": 258 + }, + { + "epoch": 24.0, + "eval_accuracy": 0.5849056603773585, + "eval_loss": 0.6713252067565918, + "eval_runtime": 2.1605, + "eval_samples_per_second": 73.595, + "eval_steps_per_second": 4.629, + "step": 270 + }, + { + "epoch": 24.98, + "eval_accuracy": 0.5849056603773585, + "eval_loss": 0.6694673895835876, + "eval_runtime": 2.0526, + "eval_samples_per_second": 77.462, + "eval_steps_per_second": 4.872, + "step": 281 + }, + { + "epoch": 25.96, + "eval_accuracy": 0.5849056603773585, + "eval_loss": 0.6674391031265259, + "eval_runtime": 2.1284, + "eval_samples_per_second": 74.704, + "eval_steps_per_second": 4.698, + "step": 292 + }, + { + "epoch": 26.67, + "grad_norm": 0.3114006221294403, + "learning_rate": 1.0227272727272727e-05, + "loss": 0.6676, + "step": 300 + }, + { + "epoch": 26.93, + "eval_accuracy": 0.5849056603773585, + "eval_loss": 0.6654335856437683, + "eval_runtime": 1.9991, + "eval_samples_per_second": 79.535, + "eval_steps_per_second": 5.002, + "step": 303 + }, + { + "epoch": 28.0, + "eval_accuracy": 0.5849056603773585, + "eval_loss": 0.6630644202232361, + "eval_runtime": 2.0451, + "eval_samples_per_second": 77.745, + "eval_steps_per_second": 4.89, + "step": 315 + }, + { + "epoch": 28.98, + "eval_accuracy": 0.5849056603773585, + "eval_loss": 0.6605831980705261, + "eval_runtime": 2.0625, + "eval_samples_per_second": 77.092, + "eval_steps_per_second": 4.849, + "step": 326 + }, + { + "epoch": 29.96, + "eval_accuracy": 0.5849056603773585, + "eval_loss": 0.6578991413116455, + "eval_runtime": 2.0381, + "eval_samples_per_second": 78.014, + "eval_steps_per_second": 4.907, + "step": 337 + }, + { + "epoch": 30.93, + "eval_accuracy": 0.5849056603773585, + "eval_loss": 0.6539114713668823, + "eval_runtime": 1.9774, + "eval_samples_per_second": 80.407, + "eval_steps_per_second": 5.057, + "step": 348 + }, + { + "epoch": 31.11, + "grad_norm": 0.2134709656238556, + "learning_rate": 1.1931818181818181e-05, + "loss": 0.6516, + "step": 350 + }, + { + "epoch": 32.0, + "eval_accuracy": 0.5974842767295597, + "eval_loss": 0.6492742896080017, + "eval_runtime": 2.0601, + "eval_samples_per_second": 77.182, + "eval_steps_per_second": 4.854, + "step": 360 + }, + { + "epoch": 32.98, + "eval_accuracy": 0.610062893081761, + "eval_loss": 0.6441397070884705, + "eval_runtime": 2.0739, + "eval_samples_per_second": 76.667, + "eval_steps_per_second": 4.822, + "step": 371 + }, + { + "epoch": 33.96, + "eval_accuracy": 0.6226415094339622, + "eval_loss": 0.6348815560340881, + "eval_runtime": 2.1526, + "eval_samples_per_second": 73.865, + "eval_steps_per_second": 4.646, + "step": 382 + }, + { + "epoch": 34.93, + "eval_accuracy": 0.6289308176100629, + "eval_loss": 0.6257140040397644, + "eval_runtime": 2.0081, + "eval_samples_per_second": 79.179, + "eval_steps_per_second": 4.98, + "step": 393 + }, + { + "epoch": 35.56, + "grad_norm": 0.8974349498748779, + "learning_rate": 1.3636363636363637e-05, + "loss": 0.6124, + "step": 400 + }, + { + "epoch": 36.0, + "eval_accuracy": 0.6415094339622641, + "eval_loss": 0.611738920211792, + "eval_runtime": 1.9854, + "eval_samples_per_second": 80.083, + "eval_steps_per_second": 5.037, + "step": 405 + }, + { + "epoch": 36.98, + "eval_accuracy": 0.6666666666666666, + "eval_loss": 0.5910706520080566, + "eval_runtime": 2.0618, + "eval_samples_per_second": 77.117, + "eval_steps_per_second": 4.85, + "step": 416 + }, + { + "epoch": 37.96, + "eval_accuracy": 0.6918238993710691, + "eval_loss": 0.5672016143798828, + "eval_runtime": 2.0402, + "eval_samples_per_second": 77.932, + "eval_steps_per_second": 4.901, + "step": 427 + }, + { + "epoch": 38.93, + "eval_accuracy": 0.7232704402515723, + "eval_loss": 0.5392354130744934, + "eval_runtime": 2.2936, + "eval_samples_per_second": 69.324, + "eval_steps_per_second": 4.36, + "step": 438 + }, + { + "epoch": 40.0, + "grad_norm": 0.7736309170722961, + "learning_rate": 1.534090909090909e-05, + "loss": 0.5073, + "step": 450 + }, + { + "epoch": 40.0, + "eval_accuracy": 0.7547169811320755, + "eval_loss": 0.5041937232017517, + "eval_runtime": 2.1247, + "eval_samples_per_second": 74.835, + "eval_steps_per_second": 4.707, + "step": 450 + }, + { + "epoch": 40.98, + "eval_accuracy": 0.7672955974842768, + "eval_loss": 0.47902750968933105, + "eval_runtime": 2.163, + "eval_samples_per_second": 73.509, + "eval_steps_per_second": 4.623, + "step": 461 + }, + { + "epoch": 41.96, + "eval_accuracy": 0.779874213836478, + "eval_loss": 0.47594940662384033, + "eval_runtime": 2.1321, + "eval_samples_per_second": 74.574, + "eval_steps_per_second": 4.69, + "step": 472 + }, + { + "epoch": 42.93, + "eval_accuracy": 0.7987421383647799, + "eval_loss": 0.4369964003562927, + "eval_runtime": 2.1555, + "eval_samples_per_second": 73.765, + "eval_steps_per_second": 4.639, + "step": 483 + }, + { + "epoch": 44.0, + "eval_accuracy": 0.7987421383647799, + "eval_loss": 0.43516698479652405, + "eval_runtime": 2.032, + "eval_samples_per_second": 78.249, + "eval_steps_per_second": 4.921, + "step": 495 + }, + { + "epoch": 44.44, + "grad_norm": 0.4976819157600403, + "learning_rate": 1.7045454545454546e-05, + "loss": 0.3489, + "step": 500 + }, + { + "epoch": 44.98, + "eval_accuracy": 0.7987421383647799, + "eval_loss": 0.4422326385974884, + "eval_runtime": 2.1135, + "eval_samples_per_second": 75.231, + "eval_steps_per_second": 4.732, + "step": 506 + }, + { + "epoch": 45.96, + "eval_accuracy": 0.8050314465408805, + "eval_loss": 0.41540881991386414, + "eval_runtime": 2.0847, + "eval_samples_per_second": 76.27, + "eval_steps_per_second": 4.797, + "step": 517 + }, + { + "epoch": 46.93, + "eval_accuracy": 0.8050314465408805, + "eval_loss": 0.4131433367729187, + "eval_runtime": 1.9752, + "eval_samples_per_second": 80.498, + "eval_steps_per_second": 5.063, + "step": 528 + }, + { + "epoch": 48.0, + "eval_accuracy": 0.8113207547169812, + "eval_loss": 0.3975575864315033, + "eval_runtime": 2.01, + "eval_samples_per_second": 79.104, + "eval_steps_per_second": 4.975, + "step": 540 + }, + { + "epoch": 48.89, + "grad_norm": 0.5197520852088928, + "learning_rate": 1.8750000000000002e-05, + "loss": 0.2962, + "step": 550 + }, + { + "epoch": 48.98, + "eval_accuracy": 0.8113207547169812, + "eval_loss": 0.39397454261779785, + "eval_runtime": 2.0261, + "eval_samples_per_second": 78.474, + "eval_steps_per_second": 4.935, + "step": 551 + }, + { + "epoch": 49.96, + "eval_accuracy": 0.8238993710691824, + "eval_loss": 0.371494859457016, + "eval_runtime": 2.0246, + "eval_samples_per_second": 78.535, + "eval_steps_per_second": 4.939, + "step": 562 + }, + { + "epoch": 50.93, + "eval_accuracy": 0.8427672955974843, + "eval_loss": 0.34951409697532654, + "eval_runtime": 2.3286, + "eval_samples_per_second": 68.281, + "eval_steps_per_second": 4.294, + "step": 573 + }, + { + "epoch": 52.0, + "eval_accuracy": 0.8364779874213837, + "eval_loss": 0.3481156826019287, + "eval_runtime": 1.9542, + "eval_samples_per_second": 81.362, + "eval_steps_per_second": 5.117, + "step": 585 + }, + { + "epoch": 52.98, + "eval_accuracy": 0.8176100628930818, + "eval_loss": 0.3817409873008728, + "eval_runtime": 2.0789, + "eval_samples_per_second": 76.484, + "eval_steps_per_second": 4.81, + "step": 596 + }, + { + "epoch": 53.33, + "grad_norm": 0.5608111023902893, + "learning_rate": 2.0454545454545454e-05, + "loss": 0.2573, + "step": 600 + }, + { + "epoch": 53.96, + "eval_accuracy": 0.8490566037735849, + "eval_loss": 0.3412492871284485, + "eval_runtime": 2.0746, + "eval_samples_per_second": 76.642, + "eval_steps_per_second": 4.82, + "step": 607 + }, + { + "epoch": 54.93, + "eval_accuracy": 0.8490566037735849, + "eval_loss": 0.32929155230522156, + "eval_runtime": 1.9991, + "eval_samples_per_second": 79.538, + "eval_steps_per_second": 5.002, + "step": 618 + }, + { + "epoch": 56.0, + "eval_accuracy": 0.8427672955974843, + "eval_loss": 0.3547687232494354, + "eval_runtime": 2.1242, + "eval_samples_per_second": 74.851, + "eval_steps_per_second": 4.708, + "step": 630 + }, + { + "epoch": 56.98, + "eval_accuracy": 0.8427672955974843, + "eval_loss": 0.3044220209121704, + "eval_runtime": 2.0508, + "eval_samples_per_second": 77.532, + "eval_steps_per_second": 4.876, + "step": 641 + }, + { + "epoch": 57.78, + "grad_norm": 0.894092321395874, + "learning_rate": 2.215909090909091e-05, + "loss": 0.2279, + "step": 650 + }, + { + "epoch": 57.96, + "eval_accuracy": 0.8490566037735849, + "eval_loss": 0.32347577810287476, + "eval_runtime": 2.2095, + "eval_samples_per_second": 71.963, + "eval_steps_per_second": 4.526, + "step": 652 + }, + { + "epoch": 58.93, + "eval_accuracy": 0.8490566037735849, + "eval_loss": 0.3371436297893524, + "eval_runtime": 2.1055, + "eval_samples_per_second": 75.518, + "eval_steps_per_second": 4.75, + "step": 663 + }, + { + "epoch": 60.0, + "eval_accuracy": 0.8490566037735849, + "eval_loss": 0.31275492906570435, + "eval_runtime": 2.1311, + "eval_samples_per_second": 74.61, + "eval_steps_per_second": 4.692, + "step": 675 + }, + { + "epoch": 60.98, + "eval_accuracy": 0.8553459119496856, + "eval_loss": 0.32111966609954834, + "eval_runtime": 2.0639, + "eval_samples_per_second": 77.038, + "eval_steps_per_second": 4.845, + "step": 686 + }, + { + "epoch": 61.96, + "eval_accuracy": 0.8616352201257862, + "eval_loss": 0.302960604429245, + "eval_runtime": 2.0241, + "eval_samples_per_second": 78.552, + "eval_steps_per_second": 4.94, + "step": 697 + }, + { + "epoch": 62.22, + "grad_norm": 0.4315973222255707, + "learning_rate": 2.3863636363636362e-05, + "loss": 0.2167, + "step": 700 + }, + { + "epoch": 62.93, + "eval_accuracy": 0.8616352201257862, + "eval_loss": 0.29696550965309143, + "eval_runtime": 2.034, + "eval_samples_per_second": 78.169, + "eval_steps_per_second": 4.916, + "step": 708 + }, + { + "epoch": 64.0, + "eval_accuracy": 0.8679245283018868, + "eval_loss": 0.29949402809143066, + "eval_runtime": 2.095, + "eval_samples_per_second": 75.897, + "eval_steps_per_second": 4.773, + "step": 720 + }, + { + "epoch": 64.98, + "eval_accuracy": 0.8742138364779874, + "eval_loss": 0.2867083251476288, + "eval_runtime": 2.0417, + "eval_samples_per_second": 77.876, + "eval_steps_per_second": 4.898, + "step": 731 + }, + { + "epoch": 65.96, + "eval_accuracy": 0.8930817610062893, + "eval_loss": 0.26363295316696167, + "eval_runtime": 2.1382, + "eval_samples_per_second": 74.363, + "eval_steps_per_second": 4.677, + "step": 742 + }, + { + "epoch": 66.67, + "grad_norm": 0.37665870785713196, + "learning_rate": 2.556818181818182e-05, + "loss": 0.207, + "step": 750 + }, + { + "epoch": 66.93, + "eval_accuracy": 0.8805031446540881, + "eval_loss": 0.28482353687286377, + "eval_runtime": 2.1166, + "eval_samples_per_second": 75.119, + "eval_steps_per_second": 4.724, + "step": 753 + }, + { + "epoch": 68.0, + "eval_accuracy": 0.8867924528301887, + "eval_loss": 0.2750767767429352, + "eval_runtime": 2.1981, + "eval_samples_per_second": 72.336, + "eval_steps_per_second": 4.549, + "step": 765 + }, + { + "epoch": 68.98, + "eval_accuracy": 0.8930817610062893, + "eval_loss": 0.256393700838089, + "eval_runtime": 2.033, + "eval_samples_per_second": 78.211, + "eval_steps_per_second": 4.919, + "step": 776 + }, + { + "epoch": 69.96, + "eval_accuracy": 0.8930817610062893, + "eval_loss": 0.25443732738494873, + "eval_runtime": 2.0096, + "eval_samples_per_second": 79.121, + "eval_steps_per_second": 4.976, + "step": 787 + }, + { + "epoch": 70.93, + "eval_accuracy": 0.8742138364779874, + "eval_loss": 0.2954423129558563, + "eval_runtime": 2.1018, + "eval_samples_per_second": 75.649, + "eval_steps_per_second": 4.758, + "step": 798 + }, + { + "epoch": 71.11, + "grad_norm": 0.7302255630493164, + "learning_rate": 2.7272727272727273e-05, + "loss": 0.1899, + "step": 800 + }, + { + "epoch": 72.0, + "eval_accuracy": 0.8930817610062893, + "eval_loss": 0.25169771909713745, + "eval_runtime": 2.041, + "eval_samples_per_second": 77.904, + "eval_steps_per_second": 4.9, + "step": 810 + }, + { + "epoch": 72.98, + "eval_accuracy": 0.8930817610062893, + "eval_loss": 0.2506076693534851, + "eval_runtime": 2.0257, + "eval_samples_per_second": 78.49, + "eval_steps_per_second": 4.936, + "step": 821 + }, + { + "epoch": 73.96, + "eval_accuracy": 0.8930817610062893, + "eval_loss": 0.2434261441230774, + "eval_runtime": 2.0325, + "eval_samples_per_second": 78.23, + "eval_steps_per_second": 4.92, + "step": 832 + }, + { + "epoch": 74.93, + "eval_accuracy": 0.89937106918239, + "eval_loss": 0.23832084238529205, + "eval_runtime": 2.1871, + "eval_samples_per_second": 72.699, + "eval_steps_per_second": 4.572, + "step": 843 + }, + { + "epoch": 75.56, + "grad_norm": 0.5180615186691284, + "learning_rate": 2.897727272727273e-05, + "loss": 0.1801, + "step": 850 + }, + { + "epoch": 76.0, + "eval_accuracy": 0.89937106918239, + "eval_loss": 0.23464229702949524, + "eval_runtime": 2.026, + "eval_samples_per_second": 78.48, + "eval_steps_per_second": 4.936, + "step": 855 + }, + { + "epoch": 76.98, + "eval_accuracy": 0.89937106918239, + "eval_loss": 0.22975026071071625, + "eval_runtime": 2.0881, + "eval_samples_per_second": 76.147, + "eval_steps_per_second": 4.789, + "step": 866 + }, + { + "epoch": 77.96, + "eval_accuracy": 0.9056603773584906, + "eval_loss": 0.2403678596019745, + "eval_runtime": 2.075, + "eval_samples_per_second": 76.626, + "eval_steps_per_second": 4.819, + "step": 877 + }, + { + "epoch": 78.93, + "eval_accuracy": 0.8930817610062893, + "eval_loss": 0.2674010097980499, + "eval_runtime": 2.037, + "eval_samples_per_second": 78.057, + "eval_steps_per_second": 4.909, + "step": 888 + }, + { + "epoch": 80.0, + "grad_norm": 1.2135472297668457, + "learning_rate": 2.9924242424242427e-05, + "loss": 0.1692, + "step": 900 + }, + { + "epoch": 80.0, + "eval_accuracy": 0.89937106918239, + "eval_loss": 0.2231501042842865, + "eval_runtime": 2.0398, + "eval_samples_per_second": 77.949, + "eval_steps_per_second": 4.902, + "step": 900 + }, + { + "epoch": 80.98, + "eval_accuracy": 0.89937106918239, + "eval_loss": 0.2390480935573578, + "eval_runtime": 1.9822, + "eval_samples_per_second": 80.213, + "eval_steps_per_second": 5.045, + "step": 911 + }, + { + "epoch": 81.96, + "eval_accuracy": 0.8930817610062893, + "eval_loss": 0.20583955943584442, + "eval_runtime": 2.0665, + "eval_samples_per_second": 76.94, + "eval_steps_per_second": 4.839, + "step": 922 + }, + { + "epoch": 82.93, + "eval_accuracy": 0.9056603773584906, + "eval_loss": 0.2114023119211197, + "eval_runtime": 2.0736, + "eval_samples_per_second": 76.678, + "eval_steps_per_second": 4.823, + "step": 933 + }, + { + "epoch": 84.0, + "eval_accuracy": 0.89937106918239, + "eval_loss": 0.24830691516399384, + "eval_runtime": 2.0148, + "eval_samples_per_second": 78.915, + "eval_steps_per_second": 4.963, + "step": 945 + }, + { + "epoch": 84.44, + "grad_norm": 0.5111488103866577, + "learning_rate": 2.9734848484848486e-05, + "loss": 0.1691, + "step": 950 + }, + { + "epoch": 84.98, + "eval_accuracy": 0.9119496855345912, + "eval_loss": 0.2259017676115036, + "eval_runtime": 2.2201, + "eval_samples_per_second": 71.618, + "eval_steps_per_second": 4.504, + "step": 956 + }, + { + "epoch": 85.96, + "eval_accuracy": 0.9119496855345912, + "eval_loss": 0.20239894092082977, + "eval_runtime": 2.0671, + "eval_samples_per_second": 76.918, + "eval_steps_per_second": 4.838, + "step": 967 + }, + { + "epoch": 86.93, + "eval_accuracy": 0.89937106918239, + "eval_loss": 0.20193150639533997, + "eval_runtime": 2.0416, + "eval_samples_per_second": 77.879, + "eval_steps_per_second": 4.898, + "step": 978 + }, + { + "epoch": 88.0, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.19625458121299744, + "eval_runtime": 2.0196, + "eval_samples_per_second": 78.73, + "eval_steps_per_second": 4.952, + "step": 990 + }, + { + "epoch": 88.89, + "grad_norm": 0.4683234989643097, + "learning_rate": 2.9545454545454545e-05, + "loss": 0.1609, + "step": 1000 + }, + { + "epoch": 88.98, + "eval_accuracy": 0.9119496855345912, + "eval_loss": 0.21583892405033112, + "eval_runtime": 2.0254, + "eval_samples_per_second": 78.503, + "eval_steps_per_second": 4.937, + "step": 1001 + }, + { + "epoch": 89.96, + "eval_accuracy": 0.9119496855345912, + "eval_loss": 0.197691410779953, + "eval_runtime": 1.9978, + "eval_samples_per_second": 79.586, + "eval_steps_per_second": 5.005, + "step": 1012 + }, + { + "epoch": 90.93, + "eval_accuracy": 0.9182389937106918, + "eval_loss": 0.19791610538959503, + "eval_runtime": 2.0853, + "eval_samples_per_second": 76.248, + "eval_steps_per_second": 4.795, + "step": 1023 + }, + { + "epoch": 92.0, + "eval_accuracy": 0.9119496855345912, + "eval_loss": 0.20358721911907196, + "eval_runtime": 2.1963, + "eval_samples_per_second": 72.393, + "eval_steps_per_second": 4.553, + "step": 1035 + }, + { + "epoch": 92.98, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.19769711792469025, + "eval_runtime": 2.0089, + "eval_samples_per_second": 79.146, + "eval_steps_per_second": 4.978, + "step": 1046 + }, + { + "epoch": 93.33, + "grad_norm": 0.6099847555160522, + "learning_rate": 2.9356060606060604e-05, + "loss": 0.1516, + "step": 1050 + }, + { + "epoch": 93.96, + "eval_accuracy": 0.9182389937106918, + "eval_loss": 0.1974458247423172, + "eval_runtime": 2.1182, + "eval_samples_per_second": 75.065, + "eval_steps_per_second": 4.721, + "step": 1057 + }, + { + "epoch": 94.93, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.1993919163942337, + "eval_runtime": 2.0707, + "eval_samples_per_second": 76.787, + "eval_steps_per_second": 4.829, + "step": 1068 + }, + { + "epoch": 96.0, + "eval_accuracy": 0.9119496855345912, + "eval_loss": 0.1955273449420929, + "eval_runtime": 2.0163, + "eval_samples_per_second": 78.858, + "eval_steps_per_second": 4.96, + "step": 1080 + }, + { + "epoch": 96.98, + "eval_accuracy": 0.9119496855345912, + "eval_loss": 0.19483698904514313, + "eval_runtime": 2.0495, + "eval_samples_per_second": 77.581, + "eval_steps_per_second": 4.879, + "step": 1091 + }, + { + "epoch": 97.78, + "grad_norm": 1.0578981637954712, + "learning_rate": 2.9166666666666666e-05, + "loss": 0.1386, + "step": 1100 + }, + { + "epoch": 97.96, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.19463855028152466, + "eval_runtime": 2.0625, + "eval_samples_per_second": 77.091, + "eval_steps_per_second": 4.849, + "step": 1102 + }, + { + "epoch": 98.93, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.19323910772800446, + "eval_runtime": 2.0028, + "eval_samples_per_second": 79.389, + "eval_steps_per_second": 4.993, + "step": 1113 + }, + { + "epoch": 100.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.1841806173324585, + "eval_runtime": 2.1056, + "eval_samples_per_second": 75.512, + "eval_steps_per_second": 4.749, + "step": 1125 + }, + { + "epoch": 100.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.18839451670646667, + "eval_runtime": 1.9858, + "eval_samples_per_second": 80.07, + "eval_steps_per_second": 5.036, + "step": 1136 + }, + { + "epoch": 101.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.1899903267621994, + "eval_runtime": 2.2196, + "eval_samples_per_second": 71.635, + "eval_steps_per_second": 4.505, + "step": 1147 + }, + { + "epoch": 102.22, + "grad_norm": 0.6229210495948792, + "learning_rate": 2.897727272727273e-05, + "loss": 0.1279, + "step": 1150 + }, + { + "epoch": 102.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.184115469455719, + "eval_runtime": 2.0229, + "eval_samples_per_second": 78.602, + "eval_steps_per_second": 4.944, + "step": 1158 + }, + { + "epoch": 104.0, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.19207227230072021, + "eval_runtime": 1.9639, + "eval_samples_per_second": 80.962, + "eval_steps_per_second": 5.092, + "step": 1170 + }, + { + "epoch": 104.98, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.19926591217517853, + "eval_runtime": 2.0509, + "eval_samples_per_second": 77.526, + "eval_steps_per_second": 4.876, + "step": 1181 + }, + { + "epoch": 105.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.19455212354660034, + "eval_runtime": 2.0496, + "eval_samples_per_second": 77.577, + "eval_steps_per_second": 4.879, + "step": 1192 + }, + { + "epoch": 106.67, + "grad_norm": 1.2741256952285767, + "learning_rate": 2.8787878787878788e-05, + "loss": 0.1258, + "step": 1200 + }, + { + "epoch": 106.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.18963727355003357, + "eval_runtime": 2.0026, + "eval_samples_per_second": 79.395, + "eval_steps_per_second": 4.993, + "step": 1203 + }, + { + "epoch": 108.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.1884273737668991, + "eval_runtime": 2.0343, + "eval_samples_per_second": 78.16, + "eval_steps_per_second": 4.916, + "step": 1215 + }, + { + "epoch": 108.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.17940251529216766, + "eval_runtime": 2.1734, + "eval_samples_per_second": 73.156, + "eval_steps_per_second": 4.601, + "step": 1226 + }, + { + "epoch": 109.96, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.18589730560779572, + "eval_runtime": 2.0874, + "eval_samples_per_second": 76.17, + "eval_steps_per_second": 4.791, + "step": 1237 + }, + { + "epoch": 110.93, + "eval_accuracy": 0.9119496855345912, + "eval_loss": 0.2194768339395523, + "eval_runtime": 2.0717, + "eval_samples_per_second": 76.747, + "eval_steps_per_second": 4.827, + "step": 1248 + }, + { + "epoch": 111.11, + "grad_norm": 0.3613344430923462, + "learning_rate": 2.859848484848485e-05, + "loss": 0.1258, + "step": 1250 + }, + { + "epoch": 112.0, + "eval_accuracy": 0.9182389937106918, + "eval_loss": 0.20826272666454315, + "eval_runtime": 1.9861, + "eval_samples_per_second": 80.057, + "eval_steps_per_second": 5.035, + "step": 1260 + }, + { + "epoch": 112.98, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.21202689409255981, + "eval_runtime": 2.0132, + "eval_samples_per_second": 78.98, + "eval_steps_per_second": 4.967, + "step": 1271 + }, + { + "epoch": 113.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.20663346350193024, + "eval_runtime": 2.02, + "eval_samples_per_second": 78.711, + "eval_steps_per_second": 4.95, + "step": 1282 + }, + { + "epoch": 114.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.1931203156709671, + "eval_runtime": 2.033, + "eval_samples_per_second": 78.208, + "eval_steps_per_second": 4.919, + "step": 1293 + }, + { + "epoch": 115.56, + "grad_norm": 0.7503376007080078, + "learning_rate": 2.8409090909090912e-05, + "loss": 0.1023, + "step": 1300 + }, + { + "epoch": 116.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.19000084698200226, + "eval_runtime": 2.0014, + "eval_samples_per_second": 79.446, + "eval_steps_per_second": 4.997, + "step": 1305 + }, + { + "epoch": 116.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.20288796722888947, + "eval_runtime": 2.0774, + "eval_samples_per_second": 76.539, + "eval_steps_per_second": 4.814, + "step": 1316 + }, + { + "epoch": 117.96, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.19505923986434937, + "eval_runtime": 2.0552, + "eval_samples_per_second": 77.366, + "eval_steps_per_second": 4.866, + "step": 1327 + }, + { + "epoch": 118.93, + "eval_accuracy": 0.9119496855345912, + "eval_loss": 0.20838169753551483, + "eval_runtime": 2.2371, + "eval_samples_per_second": 71.074, + "eval_steps_per_second": 4.47, + "step": 1338 + }, + { + "epoch": 120.0, + "grad_norm": 0.2376416176557541, + "learning_rate": 2.821969696969697e-05, + "loss": 0.0997, + "step": 1350 + }, + { + "epoch": 120.0, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.2159019112586975, + "eval_runtime": 2.0579, + "eval_samples_per_second": 77.264, + "eval_steps_per_second": 4.859, + "step": 1350 + }, + { + "epoch": 120.98, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.21662545204162598, + "eval_runtime": 2.0756, + "eval_samples_per_second": 76.605, + "eval_steps_per_second": 4.818, + "step": 1361 + }, + { + "epoch": 121.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.197323277592659, + "eval_runtime": 2.0227, + "eval_samples_per_second": 78.607, + "eval_steps_per_second": 4.944, + "step": 1372 + }, + { + "epoch": 122.93, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.18507684767246246, + "eval_runtime": 2.0728, + "eval_samples_per_second": 76.706, + "eval_steps_per_second": 4.824, + "step": 1383 + }, + { + "epoch": 124.0, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.20666691660881042, + "eval_runtime": 1.9717, + "eval_samples_per_second": 80.642, + "eval_steps_per_second": 5.072, + "step": 1395 + }, + { + "epoch": 124.44, + "grad_norm": 0.3115290403366089, + "learning_rate": 2.803030303030303e-05, + "loss": 0.1021, + "step": 1400 + }, + { + "epoch": 124.98, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.19534242153167725, + "eval_runtime": 2.0497, + "eval_samples_per_second": 77.571, + "eval_steps_per_second": 4.879, + "step": 1406 + }, + { + "epoch": 125.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.17650572955608368, + "eval_runtime": 2.239, + "eval_samples_per_second": 71.015, + "eval_steps_per_second": 4.466, + "step": 1417 + }, + { + "epoch": 126.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.18782062828540802, + "eval_runtime": 2.0533, + "eval_samples_per_second": 77.437, + "eval_steps_per_second": 4.87, + "step": 1428 + }, + { + "epoch": 128.0, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.20708344876766205, + "eval_runtime": 2.0414, + "eval_samples_per_second": 77.887, + "eval_steps_per_second": 4.899, + "step": 1440 + }, + { + "epoch": 128.89, + "grad_norm": 1.2413551807403564, + "learning_rate": 2.784090909090909e-05, + "loss": 0.0883, + "step": 1450 + }, + { + "epoch": 128.98, + "eval_accuracy": 0.9182389937106918, + "eval_loss": 0.2241077572107315, + "eval_runtime": 1.9826, + "eval_samples_per_second": 80.197, + "eval_steps_per_second": 5.044, + "step": 1451 + }, + { + "epoch": 129.96, + "eval_accuracy": 0.9119496855345912, + "eval_loss": 0.23481474816799164, + "eval_runtime": 1.9747, + "eval_samples_per_second": 80.518, + "eval_steps_per_second": 5.064, + "step": 1462 + }, + { + "epoch": 130.93, + "eval_accuracy": 0.9056603773584906, + "eval_loss": 0.24748335778713226, + "eval_runtime": 1.9737, + "eval_samples_per_second": 80.559, + "eval_steps_per_second": 5.067, + "step": 1473 + }, + { + "epoch": 132.0, + "eval_accuracy": 0.9182389937106918, + "eval_loss": 0.21596243977546692, + "eval_runtime": 2.0455, + "eval_samples_per_second": 77.733, + "eval_steps_per_second": 4.889, + "step": 1485 + }, + { + "epoch": 132.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.20896825194358826, + "eval_runtime": 2.047, + "eval_samples_per_second": 77.675, + "eval_steps_per_second": 4.885, + "step": 1496 + }, + { + "epoch": 133.33, + "grad_norm": 0.56540846824646, + "learning_rate": 2.7651515151515152e-05, + "loss": 0.0769, + "step": 1500 + }, + { + "epoch": 133.96, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.21468934416770935, + "eval_runtime": 1.9936, + "eval_samples_per_second": 79.754, + "eval_steps_per_second": 5.016, + "step": 1507 + }, + { + "epoch": 134.93, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.22008037567138672, + "eval_runtime": 2.0857, + "eval_samples_per_second": 76.234, + "eval_steps_per_second": 4.795, + "step": 1518 + }, + { + "epoch": 136.0, + "eval_accuracy": 0.9182389937106918, + "eval_loss": 0.23723578453063965, + "eval_runtime": 2.1872, + "eval_samples_per_second": 72.695, + "eval_steps_per_second": 4.572, + "step": 1530 + }, + { + "epoch": 136.98, + "eval_accuracy": 0.9182389937106918, + "eval_loss": 0.21990692615509033, + "eval_runtime": 2.0473, + "eval_samples_per_second": 77.664, + "eval_steps_per_second": 4.885, + "step": 1541 + }, + { + "epoch": 137.78, + "grad_norm": 1.0245180130004883, + "learning_rate": 2.7462121212121214e-05, + "loss": 0.0786, + "step": 1550 + }, + { + "epoch": 137.96, + "eval_accuracy": 0.9182389937106918, + "eval_loss": 0.2087443619966507, + "eval_runtime": 2.0577, + "eval_samples_per_second": 77.271, + "eval_steps_per_second": 4.86, + "step": 1552 + }, + { + "epoch": 138.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.18779344856739044, + "eval_runtime": 2.0799, + "eval_samples_per_second": 76.447, + "eval_steps_per_second": 4.808, + "step": 1563 + }, + { + "epoch": 140.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.1914655864238739, + "eval_runtime": 2.043, + "eval_samples_per_second": 77.827, + "eval_steps_per_second": 4.895, + "step": 1575 + }, + { + "epoch": 140.98, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.23168283700942993, + "eval_runtime": 2.0313, + "eval_samples_per_second": 78.277, + "eval_steps_per_second": 4.923, + "step": 1586 + }, + { + "epoch": 141.96, + "eval_accuracy": 0.8930817610062893, + "eval_loss": 0.2865447700023651, + "eval_runtime": 2.0095, + "eval_samples_per_second": 79.125, + "eval_steps_per_second": 4.976, + "step": 1597 + }, + { + "epoch": 142.22, + "grad_norm": 1.393044352531433, + "learning_rate": 2.7272727272727273e-05, + "loss": 0.0714, + "step": 1600 + }, + { + "epoch": 142.93, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.22998519241809845, + "eval_runtime": 2.1842, + "eval_samples_per_second": 72.794, + "eval_steps_per_second": 4.578, + "step": 1608 + }, + { + "epoch": 144.0, + "eval_accuracy": 0.9056603773584906, + "eval_loss": 0.27265357971191406, + "eval_runtime": 2.0318, + "eval_samples_per_second": 78.258, + "eval_steps_per_second": 4.922, + "step": 1620 + }, + { + "epoch": 144.98, + "eval_accuracy": 0.9056603773584906, + "eval_loss": 0.28114742040634155, + "eval_runtime": 2.0949, + "eval_samples_per_second": 75.9, + "eval_steps_per_second": 4.774, + "step": 1631 + }, + { + "epoch": 145.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.21014899015426636, + "eval_runtime": 2.0829, + "eval_samples_per_second": 76.335, + "eval_steps_per_second": 4.801, + "step": 1642 + }, + { + "epoch": 146.67, + "grad_norm": 1.1527929306030273, + "learning_rate": 2.7083333333333335e-05, + "loss": 0.0702, + "step": 1650 + }, + { + "epoch": 146.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.20363318920135498, + "eval_runtime": 2.0224, + "eval_samples_per_second": 78.618, + "eval_steps_per_second": 4.945, + "step": 1653 + }, + { + "epoch": 148.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.22154641151428223, + "eval_runtime": 2.0286, + "eval_samples_per_second": 78.378, + "eval_steps_per_second": 4.929, + "step": 1665 + }, + { + "epoch": 148.98, + "eval_accuracy": 0.9182389937106918, + "eval_loss": 0.21356013417243958, + "eval_runtime": 1.9745, + "eval_samples_per_second": 80.526, + "eval_steps_per_second": 5.065, + "step": 1676 + }, + { + "epoch": 149.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.20560431480407715, + "eval_runtime": 2.0343, + "eval_samples_per_second": 78.161, + "eval_steps_per_second": 4.916, + "step": 1687 + }, + { + "epoch": 150.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.20028233528137207, + "eval_runtime": 2.0476, + "eval_samples_per_second": 77.65, + "eval_steps_per_second": 4.884, + "step": 1698 + }, + { + "epoch": 151.11, + "grad_norm": 0.6037131547927856, + "learning_rate": 2.6893939393939398e-05, + "loss": 0.0676, + "step": 1700 + }, + { + "epoch": 152.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.22495229542255402, + "eval_runtime": 2.0653, + "eval_samples_per_second": 76.985, + "eval_steps_per_second": 4.842, + "step": 1710 + }, + { + "epoch": 152.98, + "eval_accuracy": 0.9559748427672956, + "eval_loss": 0.1910940259695053, + "eval_runtime": 2.2097, + "eval_samples_per_second": 71.955, + "eval_steps_per_second": 4.525, + "step": 1721 + }, + { + "epoch": 153.96, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.2189728170633316, + "eval_runtime": 2.049, + "eval_samples_per_second": 77.598, + "eval_steps_per_second": 4.88, + "step": 1732 + }, + { + "epoch": 154.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.1975589245557785, + "eval_runtime": 2.0536, + "eval_samples_per_second": 77.426, + "eval_steps_per_second": 4.87, + "step": 1743 + }, + { + "epoch": 155.56, + "grad_norm": 0.9841188788414001, + "learning_rate": 2.6704545454545453e-05, + "loss": 0.0674, + "step": 1750 + }, + { + "epoch": 156.0, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.18743836879730225, + "eval_runtime": 2.0593, + "eval_samples_per_second": 77.211, + "eval_steps_per_second": 4.856, + "step": 1755 + }, + { + "epoch": 156.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2022770792245865, + "eval_runtime": 2.0432, + "eval_samples_per_second": 77.821, + "eval_steps_per_second": 4.894, + "step": 1766 + }, + { + "epoch": 157.96, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.21527531743049622, + "eval_runtime": 1.9951, + "eval_samples_per_second": 79.694, + "eval_steps_per_second": 5.012, + "step": 1777 + }, + { + "epoch": 158.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.22451625764369965, + "eval_runtime": 2.1442, + "eval_samples_per_second": 74.155, + "eval_steps_per_second": 4.664, + "step": 1788 + }, + { + "epoch": 160.0, + "grad_norm": 0.5377254486083984, + "learning_rate": 2.6515151515151516e-05, + "loss": 0.0548, + "step": 1800 + }, + { + "epoch": 160.0, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.2431740015745163, + "eval_runtime": 2.2699, + "eval_samples_per_second": 70.046, + "eval_steps_per_second": 4.405, + "step": 1800 + }, + { + "epoch": 160.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.2071038782596588, + "eval_runtime": 2.0506, + "eval_samples_per_second": 77.538, + "eval_steps_per_second": 4.877, + "step": 1811 + }, + { + "epoch": 161.96, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.18368059396743774, + "eval_runtime": 2.2081, + "eval_samples_per_second": 72.006, + "eval_steps_per_second": 4.529, + "step": 1822 + }, + { + "epoch": 162.93, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.19161438941955566, + "eval_runtime": 1.9999, + "eval_samples_per_second": 79.505, + "eval_steps_per_second": 5.0, + "step": 1833 + }, + { + "epoch": 164.0, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.22212089598178864, + "eval_runtime": 2.0001, + "eval_samples_per_second": 79.497, + "eval_steps_per_second": 5.0, + "step": 1845 + }, + { + "epoch": 164.44, + "grad_norm": 0.5433365702629089, + "learning_rate": 2.6325757575757575e-05, + "loss": 0.0616, + "step": 1850 + }, + { + "epoch": 164.98, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.21204246580600739, + "eval_runtime": 2.035, + "eval_samples_per_second": 78.132, + "eval_steps_per_second": 4.914, + "step": 1856 + }, + { + "epoch": 165.96, + "eval_accuracy": 0.9559748427672956, + "eval_loss": 0.18882697820663452, + "eval_runtime": 2.0581, + "eval_samples_per_second": 77.256, + "eval_steps_per_second": 4.859, + "step": 1867 + }, + { + "epoch": 166.93, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.19714578986167908, + "eval_runtime": 2.002, + "eval_samples_per_second": 79.422, + "eval_steps_per_second": 4.995, + "step": 1878 + }, + { + "epoch": 168.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.21613995730876923, + "eval_runtime": 2.0979, + "eval_samples_per_second": 75.789, + "eval_steps_per_second": 4.767, + "step": 1890 + }, + { + "epoch": 168.89, + "grad_norm": 0.4616011083126068, + "learning_rate": 2.6136363636363637e-05, + "loss": 0.0467, + "step": 1900 + }, + { + "epoch": 168.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.22824302315711975, + "eval_runtime": 2.0023, + "eval_samples_per_second": 79.407, + "eval_steps_per_second": 4.994, + "step": 1901 + }, + { + "epoch": 169.96, + "eval_accuracy": 0.9056603773584906, + "eval_loss": 0.31181007623672485, + "eval_runtime": 2.2272, + "eval_samples_per_second": 71.39, + "eval_steps_per_second": 4.49, + "step": 1912 + }, + { + "epoch": 170.93, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.23191651701927185, + "eval_runtime": 2.0759, + "eval_samples_per_second": 76.592, + "eval_steps_per_second": 4.817, + "step": 1923 + }, + { + "epoch": 172.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.27404358983039856, + "eval_runtime": 2.0769, + "eval_samples_per_second": 76.555, + "eval_steps_per_second": 4.815, + "step": 1935 + }, + { + "epoch": 172.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.2666384279727936, + "eval_runtime": 2.1046, + "eval_samples_per_second": 75.548, + "eval_steps_per_second": 4.751, + "step": 1946 + }, + { + "epoch": 173.33, + "grad_norm": 1.0961925983428955, + "learning_rate": 2.59469696969697e-05, + "loss": 0.0609, + "step": 1950 + }, + { + "epoch": 173.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.23152852058410645, + "eval_runtime": 2.0323, + "eval_samples_per_second": 78.237, + "eval_steps_per_second": 4.921, + "step": 1957 + }, + { + "epoch": 174.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.22292692959308624, + "eval_runtime": 2.0749, + "eval_samples_per_second": 76.629, + "eval_steps_per_second": 4.819, + "step": 1968 + }, + { + "epoch": 176.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.21578945219516754, + "eval_runtime": 2.0472, + "eval_samples_per_second": 77.668, + "eval_steps_per_second": 4.885, + "step": 1980 + }, + { + "epoch": 176.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.22257991135120392, + "eval_runtime": 2.1698, + "eval_samples_per_second": 73.278, + "eval_steps_per_second": 4.609, + "step": 1991 + }, + { + "epoch": 177.78, + "grad_norm": 1.6022953987121582, + "learning_rate": 2.575757575757576e-05, + "loss": 0.0522, + "step": 2000 + }, + { + "epoch": 177.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.22241446375846863, + "eval_runtime": 2.0341, + "eval_samples_per_second": 78.167, + "eval_steps_per_second": 4.916, + "step": 2002 + }, + { + "epoch": 178.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.21375904977321625, + "eval_runtime": 2.1094, + "eval_samples_per_second": 75.377, + "eval_steps_per_second": 4.741, + "step": 2013 + }, + { + "epoch": 180.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.21769364178180695, + "eval_runtime": 1.9898, + "eval_samples_per_second": 79.909, + "eval_steps_per_second": 5.026, + "step": 2025 + }, + { + "epoch": 180.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.19169649481773376, + "eval_runtime": 2.1326, + "eval_samples_per_second": 74.558, + "eval_steps_per_second": 4.689, + "step": 2036 + }, + { + "epoch": 181.96, + "eval_accuracy": 0.9559748427672956, + "eval_loss": 0.19741381704807281, + "eval_runtime": 2.1931, + "eval_samples_per_second": 72.5, + "eval_steps_per_second": 4.56, + "step": 2047 + }, + { + "epoch": 182.22, + "grad_norm": 0.7399430274963379, + "learning_rate": 2.556818181818182e-05, + "loss": 0.0515, + "step": 2050 + }, + { + "epoch": 182.93, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.21981187164783478, + "eval_runtime": 2.0417, + "eval_samples_per_second": 77.878, + "eval_steps_per_second": 4.898, + "step": 2058 + }, + { + "epoch": 184.0, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.24247391521930695, + "eval_runtime": 2.1999, + "eval_samples_per_second": 72.278, + "eval_steps_per_second": 4.546, + "step": 2070 + }, + { + "epoch": 184.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.24488882720470428, + "eval_runtime": 2.0767, + "eval_samples_per_second": 76.565, + "eval_steps_per_second": 4.815, + "step": 2081 + }, + { + "epoch": 185.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.23463451862335205, + "eval_runtime": 2.0674, + "eval_samples_per_second": 76.907, + "eval_steps_per_second": 4.837, + "step": 2092 + }, + { + "epoch": 186.67, + "grad_norm": 0.67291659116745, + "learning_rate": 2.5378787878787876e-05, + "loss": 0.045, + "step": 2100 + }, + { + "epoch": 186.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.23308323323726654, + "eval_runtime": 2.2603, + "eval_samples_per_second": 70.346, + "eval_steps_per_second": 4.424, + "step": 2103 + }, + { + "epoch": 188.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2660614252090454, + "eval_runtime": 2.0509, + "eval_samples_per_second": 77.527, + "eval_steps_per_second": 4.876, + "step": 2115 + }, + { + "epoch": 188.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.22910529375076294, + "eval_runtime": 2.0536, + "eval_samples_per_second": 77.423, + "eval_steps_per_second": 4.869, + "step": 2126 + }, + { + "epoch": 189.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.23477251827716827, + "eval_runtime": 2.0092, + "eval_samples_per_second": 79.134, + "eval_steps_per_second": 4.977, + "step": 2137 + }, + { + "epoch": 190.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.23087622225284576, + "eval_runtime": 2.0403, + "eval_samples_per_second": 77.929, + "eval_steps_per_second": 4.901, + "step": 2148 + }, + { + "epoch": 191.11, + "grad_norm": 0.11660194396972656, + "learning_rate": 2.518939393939394e-05, + "loss": 0.0403, + "step": 2150 + }, + { + "epoch": 192.0, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.27889564633369446, + "eval_runtime": 2.0147, + "eval_samples_per_second": 78.921, + "eval_steps_per_second": 4.964, + "step": 2160 + }, + { + "epoch": 192.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.2540048658847809, + "eval_runtime": 2.1082, + "eval_samples_per_second": 75.42, + "eval_steps_per_second": 4.743, + "step": 2171 + }, + { + "epoch": 193.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.23720349371433258, + "eval_runtime": 2.1791, + "eval_samples_per_second": 72.966, + "eval_steps_per_second": 4.589, + "step": 2182 + }, + { + "epoch": 194.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.2507873773574829, + "eval_runtime": 1.986, + "eval_samples_per_second": 80.061, + "eval_steps_per_second": 5.035, + "step": 2193 + }, + { + "epoch": 195.56, + "grad_norm": 0.8518453240394592, + "learning_rate": 2.5e-05, + "loss": 0.0476, + "step": 2200 + }, + { + "epoch": 196.0, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.2193620353937149, + "eval_runtime": 2.1819, + "eval_samples_per_second": 72.874, + "eval_steps_per_second": 4.583, + "step": 2205 + }, + { + "epoch": 196.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.23066306114196777, + "eval_runtime": 2.0482, + "eval_samples_per_second": 77.628, + "eval_steps_per_second": 4.882, + "step": 2216 + }, + { + "epoch": 197.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.2719472646713257, + "eval_runtime": 1.9901, + "eval_samples_per_second": 79.896, + "eval_steps_per_second": 5.025, + "step": 2227 + }, + { + "epoch": 198.93, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.28040099143981934, + "eval_runtime": 2.0617, + "eval_samples_per_second": 77.122, + "eval_steps_per_second": 4.85, + "step": 2238 + }, + { + "epoch": 200.0, + "grad_norm": 0.09039253741502762, + "learning_rate": 2.481060606060606e-05, + "loss": 0.0457, + "step": 2250 + }, + { + "epoch": 200.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.2755438983440399, + "eval_runtime": 2.0773, + "eval_samples_per_second": 76.541, + "eval_steps_per_second": 4.814, + "step": 2250 + }, + { + "epoch": 200.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.2353052794933319, + "eval_runtime": 1.9899, + "eval_samples_per_second": 79.904, + "eval_steps_per_second": 5.025, + "step": 2261 + }, + { + "epoch": 201.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.21893078088760376, + "eval_runtime": 2.1045, + "eval_samples_per_second": 75.552, + "eval_steps_per_second": 4.752, + "step": 2272 + }, + { + "epoch": 202.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.21625204384326935, + "eval_runtime": 2.0731, + "eval_samples_per_second": 76.697, + "eval_steps_per_second": 4.824, + "step": 2283 + }, + { + "epoch": 204.0, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.2110479772090912, + "eval_runtime": 2.1463, + "eval_samples_per_second": 74.079, + "eval_steps_per_second": 4.659, + "step": 2295 + }, + { + "epoch": 204.44, + "grad_norm": 0.9943685531616211, + "learning_rate": 2.4621212121212123e-05, + "loss": 0.0393, + "step": 2300 + }, + { + "epoch": 204.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.23164410889148712, + "eval_runtime": 2.0606, + "eval_samples_per_second": 77.162, + "eval_steps_per_second": 4.853, + "step": 2306 + }, + { + "epoch": 205.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.24650876224040985, + "eval_runtime": 2.0011, + "eval_samples_per_second": 79.455, + "eval_steps_per_second": 4.997, + "step": 2317 + }, + { + "epoch": 206.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.23763243854045868, + "eval_runtime": 2.0999, + "eval_samples_per_second": 75.719, + "eval_steps_per_second": 4.762, + "step": 2328 + }, + { + "epoch": 208.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.2170635461807251, + "eval_runtime": 2.1575, + "eval_samples_per_second": 73.697, + "eval_steps_per_second": 4.635, + "step": 2340 + }, + { + "epoch": 208.89, + "grad_norm": 0.46173095703125, + "learning_rate": 2.4431818181818185e-05, + "loss": 0.0443, + "step": 2350 + }, + { + "epoch": 208.98, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.23952844738960266, + "eval_runtime": 2.0014, + "eval_samples_per_second": 79.445, + "eval_steps_per_second": 4.997, + "step": 2351 + }, + { + "epoch": 209.96, + "eval_accuracy": 0.8930817610062893, + "eval_loss": 0.2906019687652588, + "eval_runtime": 2.0133, + "eval_samples_per_second": 78.977, + "eval_steps_per_second": 4.967, + "step": 2362 + }, + { + "epoch": 210.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2608316242694855, + "eval_runtime": 2.1558, + "eval_samples_per_second": 73.755, + "eval_steps_per_second": 4.639, + "step": 2373 + }, + { + "epoch": 212.0, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.23210321366786957, + "eval_runtime": 2.0606, + "eval_samples_per_second": 77.161, + "eval_steps_per_second": 4.853, + "step": 2385 + }, + { + "epoch": 212.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.24640053510665894, + "eval_runtime": 2.2148, + "eval_samples_per_second": 71.79, + "eval_steps_per_second": 4.515, + "step": 2396 + }, + { + "epoch": 213.33, + "grad_norm": 0.94215327501297, + "learning_rate": 2.4242424242424244e-05, + "loss": 0.0539, + "step": 2400 + }, + { + "epoch": 213.96, + "eval_accuracy": 0.9182389937106918, + "eval_loss": 0.2441636025905609, + "eval_runtime": 2.172, + "eval_samples_per_second": 73.203, + "eval_steps_per_second": 4.604, + "step": 2407 + }, + { + "epoch": 214.93, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.2511676847934723, + "eval_runtime": 2.0176, + "eval_samples_per_second": 78.806, + "eval_steps_per_second": 4.956, + "step": 2418 + }, + { + "epoch": 216.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.22649481892585754, + "eval_runtime": 2.0103, + "eval_samples_per_second": 79.091, + "eval_steps_per_second": 4.974, + "step": 2430 + }, + { + "epoch": 216.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.21274729073047638, + "eval_runtime": 2.0508, + "eval_samples_per_second": 77.529, + "eval_steps_per_second": 4.876, + "step": 2441 + }, + { + "epoch": 217.78, + "grad_norm": 0.7381362318992615, + "learning_rate": 2.4053030303030303e-05, + "loss": 0.0415, + "step": 2450 + }, + { + "epoch": 217.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.284365177154541, + "eval_runtime": 2.0321, + "eval_samples_per_second": 78.244, + "eval_steps_per_second": 4.921, + "step": 2452 + }, + { + "epoch": 218.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.24891048669815063, + "eval_runtime": 2.0843, + "eval_samples_per_second": 76.285, + "eval_steps_per_second": 4.798, + "step": 2463 + }, + { + "epoch": 220.0, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.21200108528137207, + "eval_runtime": 1.9938, + "eval_samples_per_second": 79.748, + "eval_steps_per_second": 5.016, + "step": 2475 + }, + { + "epoch": 220.98, + "eval_accuracy": 0.9559748427672956, + "eval_loss": 0.2015109807252884, + "eval_runtime": 2.2098, + "eval_samples_per_second": 71.951, + "eval_steps_per_second": 4.525, + "step": 2486 + }, + { + "epoch": 221.96, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.25095799565315247, + "eval_runtime": 2.0817, + "eval_samples_per_second": 76.381, + "eval_steps_per_second": 4.804, + "step": 2497 + }, + { + "epoch": 222.22, + "grad_norm": 0.3756774961948395, + "learning_rate": 2.3863636363636362e-05, + "loss": 0.0325, + "step": 2500 + }, + { + "epoch": 222.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2875436246395111, + "eval_runtime": 2.0148, + "eval_samples_per_second": 78.915, + "eval_steps_per_second": 4.963, + "step": 2508 + }, + { + "epoch": 224.0, + "eval_accuracy": 0.9622641509433962, + "eval_loss": 0.19936275482177734, + "eval_runtime": 2.0208, + "eval_samples_per_second": 78.682, + "eval_steps_per_second": 4.949, + "step": 2520 + }, + { + "epoch": 224.98, + "eval_accuracy": 0.9622641509433962, + "eval_loss": 0.20330873131752014, + "eval_runtime": 2.1708, + "eval_samples_per_second": 73.243, + "eval_steps_per_second": 4.606, + "step": 2531 + }, + { + "epoch": 225.96, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.2391451746225357, + "eval_runtime": 1.9988, + "eval_samples_per_second": 79.549, + "eval_steps_per_second": 5.003, + "step": 2542 + }, + { + "epoch": 226.67, + "grad_norm": 0.6930297017097473, + "learning_rate": 2.3674242424242424e-05, + "loss": 0.0249, + "step": 2550 + }, + { + "epoch": 226.93, + "eval_accuracy": 0.9182389937106918, + "eval_loss": 0.30440014600753784, + "eval_runtime": 2.0166, + "eval_samples_per_second": 78.847, + "eval_steps_per_second": 4.959, + "step": 2553 + }, + { + "epoch": 228.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2825218439102173, + "eval_runtime": 2.2235, + "eval_samples_per_second": 71.51, + "eval_steps_per_second": 4.497, + "step": 2565 + }, + { + "epoch": 228.98, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.234725683927536, + "eval_runtime": 2.0151, + "eval_samples_per_second": 78.905, + "eval_steps_per_second": 4.963, + "step": 2576 + }, + { + "epoch": 229.96, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.24049904942512512, + "eval_runtime": 2.0305, + "eval_samples_per_second": 78.304, + "eval_steps_per_second": 4.925, + "step": 2587 + }, + { + "epoch": 230.93, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.25367188453674316, + "eval_runtime": 2.1765, + "eval_samples_per_second": 73.054, + "eval_steps_per_second": 4.595, + "step": 2598 + }, + { + "epoch": 231.11, + "grad_norm": 0.8203662037849426, + "learning_rate": 2.3484848484848487e-05, + "loss": 0.0358, + "step": 2600 + }, + { + "epoch": 232.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.27088040113449097, + "eval_runtime": 2.0677, + "eval_samples_per_second": 76.895, + "eval_steps_per_second": 4.836, + "step": 2610 + }, + { + "epoch": 232.98, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.2444712519645691, + "eval_runtime": 2.123, + "eval_samples_per_second": 74.893, + "eval_steps_per_second": 4.71, + "step": 2621 + }, + { + "epoch": 233.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.24358882009983063, + "eval_runtime": 2.0612, + "eval_samples_per_second": 77.139, + "eval_steps_per_second": 4.852, + "step": 2632 + }, + { + "epoch": 234.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.22266939282417297, + "eval_runtime": 2.0145, + "eval_samples_per_second": 78.929, + "eval_steps_per_second": 4.964, + "step": 2643 + }, + { + "epoch": 235.56, + "grad_norm": 0.7004448771476746, + "learning_rate": 2.3295454545454546e-05, + "loss": 0.0345, + "step": 2650 + }, + { + "epoch": 236.0, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.22081993520259857, + "eval_runtime": 2.0852, + "eval_samples_per_second": 76.252, + "eval_steps_per_second": 4.796, + "step": 2655 + }, + { + "epoch": 236.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.22930140793323517, + "eval_runtime": 2.038, + "eval_samples_per_second": 78.019, + "eval_steps_per_second": 4.907, + "step": 2666 + }, + { + "epoch": 237.96, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.2159855216741562, + "eval_runtime": 2.2011, + "eval_samples_per_second": 72.236, + "eval_steps_per_second": 4.543, + "step": 2677 + }, + { + "epoch": 238.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.2085605412721634, + "eval_runtime": 2.0845, + "eval_samples_per_second": 76.277, + "eval_steps_per_second": 4.797, + "step": 2688 + }, + { + "epoch": 240.0, + "grad_norm": 1.642115592956543, + "learning_rate": 2.3106060606060608e-05, + "loss": 0.0339, + "step": 2700 + }, + { + "epoch": 240.0, + "eval_accuracy": 0.9182389937106918, + "eval_loss": 0.26398828625679016, + "eval_runtime": 1.9895, + "eval_samples_per_second": 79.918, + "eval_steps_per_second": 5.026, + "step": 2700 + }, + { + "epoch": 240.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2953893542289734, + "eval_runtime": 2.0677, + "eval_samples_per_second": 76.899, + "eval_steps_per_second": 4.836, + "step": 2711 + }, + { + "epoch": 241.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.2507174611091614, + "eval_runtime": 2.1213, + "eval_samples_per_second": 74.953, + "eval_steps_per_second": 4.714, + "step": 2722 + }, + { + "epoch": 242.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.227327361702919, + "eval_runtime": 1.9774, + "eval_samples_per_second": 80.407, + "eval_steps_per_second": 5.057, + "step": 2733 + }, + { + "epoch": 244.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.24215646088123322, + "eval_runtime": 2.0297, + "eval_samples_per_second": 78.336, + "eval_steps_per_second": 4.927, + "step": 2745 + }, + { + "epoch": 244.44, + "grad_norm": 1.2598336935043335, + "learning_rate": 2.2916666666666667e-05, + "loss": 0.0309, + "step": 2750 + }, + { + "epoch": 244.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2931080758571625, + "eval_runtime": 2.1459, + "eval_samples_per_second": 74.093, + "eval_steps_per_second": 4.66, + "step": 2756 + }, + { + "epoch": 245.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2694746255874634, + "eval_runtime": 2.0392, + "eval_samples_per_second": 77.97, + "eval_steps_per_second": 4.904, + "step": 2767 + }, + { + "epoch": 246.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.26456066966056824, + "eval_runtime": 2.1011, + "eval_samples_per_second": 75.673, + "eval_steps_per_second": 4.759, + "step": 2778 + }, + { + "epoch": 248.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.23147591948509216, + "eval_runtime": 2.0349, + "eval_samples_per_second": 78.135, + "eval_steps_per_second": 4.914, + "step": 2790 + }, + { + "epoch": 248.89, + "grad_norm": 1.3385041952133179, + "learning_rate": 2.272727272727273e-05, + "loss": 0.0301, + "step": 2800 + }, + { + "epoch": 248.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.2269720882177353, + "eval_runtime": 2.0267, + "eval_samples_per_second": 78.453, + "eval_steps_per_second": 4.934, + "step": 2801 + }, + { + "epoch": 249.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.244718536734581, + "eval_runtime": 2.0507, + "eval_samples_per_second": 77.533, + "eval_steps_per_second": 4.876, + "step": 2812 + }, + { + "epoch": 250.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2586061358451843, + "eval_runtime": 2.0836, + "eval_samples_per_second": 76.312, + "eval_steps_per_second": 4.799, + "step": 2823 + }, + { + "epoch": 252.0, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.3038959503173828, + "eval_runtime": 2.0093, + "eval_samples_per_second": 79.132, + "eval_steps_per_second": 4.977, + "step": 2835 + }, + { + "epoch": 252.98, + "eval_accuracy": 0.9182389937106918, + "eval_loss": 0.27771249413490295, + "eval_runtime": 2.1305, + "eval_samples_per_second": 74.63, + "eval_steps_per_second": 4.694, + "step": 2846 + }, + { + "epoch": 253.33, + "grad_norm": 0.40545353293418884, + "learning_rate": 2.2537878787878788e-05, + "loss": 0.0335, + "step": 2850 + }, + { + "epoch": 253.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.256588876247406, + "eval_runtime": 2.1001, + "eval_samples_per_second": 75.709, + "eval_steps_per_second": 4.762, + "step": 2857 + }, + { + "epoch": 254.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.26031869649887085, + "eval_runtime": 2.2094, + "eval_samples_per_second": 71.966, + "eval_steps_per_second": 4.526, + "step": 2868 + }, + { + "epoch": 256.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.26985806226730347, + "eval_runtime": 1.9916, + "eval_samples_per_second": 79.835, + "eval_steps_per_second": 5.021, + "step": 2880 + }, + { + "epoch": 256.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2838137149810791, + "eval_runtime": 1.9992, + "eval_samples_per_second": 79.532, + "eval_steps_per_second": 5.002, + "step": 2891 + }, + { + "epoch": 257.78, + "grad_norm": 0.2661449611186981, + "learning_rate": 2.2348484848484847e-05, + "loss": 0.0249, + "step": 2900 + }, + { + "epoch": 257.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2572626769542694, + "eval_runtime": 2.0448, + "eval_samples_per_second": 77.758, + "eval_steps_per_second": 4.89, + "step": 2902 + }, + { + "epoch": 258.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2652382254600525, + "eval_runtime": 2.0483, + "eval_samples_per_second": 77.627, + "eval_steps_per_second": 4.882, + "step": 2913 + }, + { + "epoch": 260.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.26221606135368347, + "eval_runtime": 1.9761, + "eval_samples_per_second": 80.461, + "eval_steps_per_second": 5.06, + "step": 2925 + }, + { + "epoch": 260.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2583387494087219, + "eval_runtime": 2.0285, + "eval_samples_per_second": 78.384, + "eval_steps_per_second": 4.93, + "step": 2936 + }, + { + "epoch": 261.96, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.23241400718688965, + "eval_runtime": 2.1753, + "eval_samples_per_second": 73.094, + "eval_steps_per_second": 4.597, + "step": 2947 + }, + { + "epoch": 262.22, + "grad_norm": 0.5177292227745056, + "learning_rate": 2.215909090909091e-05, + "loss": 0.0308, + "step": 2950 + }, + { + "epoch": 262.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2781696319580078, + "eval_runtime": 2.0731, + "eval_samples_per_second": 76.695, + "eval_steps_per_second": 4.824, + "step": 2958 + }, + { + "epoch": 264.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.2519301474094391, + "eval_runtime": 2.1326, + "eval_samples_per_second": 74.556, + "eval_steps_per_second": 4.689, + "step": 2970 + }, + { + "epoch": 264.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2634475529193878, + "eval_runtime": 2.0868, + "eval_samples_per_second": 76.194, + "eval_steps_per_second": 4.792, + "step": 2981 + }, + { + "epoch": 265.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.2647358775138855, + "eval_runtime": 2.023, + "eval_samples_per_second": 78.596, + "eval_steps_per_second": 4.943, + "step": 2992 + }, + { + "epoch": 266.67, + "grad_norm": 0.311382532119751, + "learning_rate": 2.1969696969696972e-05, + "loss": 0.0282, + "step": 3000 + }, + { + "epoch": 266.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.25880536437034607, + "eval_runtime": 2.0166, + "eval_samples_per_second": 78.845, + "eval_steps_per_second": 4.959, + "step": 3003 + }, + { + "epoch": 268.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.23151087760925293, + "eval_runtime": 2.1955, + "eval_samples_per_second": 72.42, + "eval_steps_per_second": 4.555, + "step": 3015 + }, + { + "epoch": 268.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.22928977012634277, + "eval_runtime": 2.1352, + "eval_samples_per_second": 74.465, + "eval_steps_per_second": 4.683, + "step": 3026 + }, + { + "epoch": 269.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.23751527070999146, + "eval_runtime": 2.0031, + "eval_samples_per_second": 79.378, + "eval_steps_per_second": 4.992, + "step": 3037 + }, + { + "epoch": 270.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.24385805428028107, + "eval_runtime": 2.109, + "eval_samples_per_second": 75.392, + "eval_steps_per_second": 4.742, + "step": 3048 + }, + { + "epoch": 271.11, + "grad_norm": 0.8252888321876526, + "learning_rate": 2.178030303030303e-05, + "loss": 0.0347, + "step": 3050 + }, + { + "epoch": 272.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.2541854679584503, + "eval_runtime": 2.1279, + "eval_samples_per_second": 74.722, + "eval_steps_per_second": 4.699, + "step": 3060 + }, + { + "epoch": 272.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.24015812575817108, + "eval_runtime": 1.9697, + "eval_samples_per_second": 80.724, + "eval_steps_per_second": 5.077, + "step": 3071 + }, + { + "epoch": 273.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2365039885044098, + "eval_runtime": 2.1369, + "eval_samples_per_second": 74.406, + "eval_steps_per_second": 4.68, + "step": 3082 + }, + { + "epoch": 274.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2757132053375244, + "eval_runtime": 2.0094, + "eval_samples_per_second": 79.128, + "eval_steps_per_second": 4.977, + "step": 3093 + }, + { + "epoch": 275.56, + "grad_norm": 0.06441498547792435, + "learning_rate": 2.1590909090909093e-05, + "loss": 0.0211, + "step": 3100 + }, + { + "epoch": 276.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.25078749656677246, + "eval_runtime": 2.0059, + "eval_samples_per_second": 79.266, + "eval_steps_per_second": 4.985, + "step": 3105 + }, + { + "epoch": 276.98, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.23951387405395508, + "eval_runtime": 2.174, + "eval_samples_per_second": 73.137, + "eval_steps_per_second": 4.6, + "step": 3116 + }, + { + "epoch": 277.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.25363460183143616, + "eval_runtime": 2.0281, + "eval_samples_per_second": 78.399, + "eval_steps_per_second": 4.931, + "step": 3127 + }, + { + "epoch": 278.93, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.26847586035728455, + "eval_runtime": 2.2802, + "eval_samples_per_second": 69.729, + "eval_steps_per_second": 4.385, + "step": 3138 + }, + { + "epoch": 280.0, + "grad_norm": 0.5554720759391785, + "learning_rate": 2.1401515151515152e-05, + "loss": 0.0248, + "step": 3150 + }, + { + "epoch": 280.0, + "eval_accuracy": 0.9182389937106918, + "eval_loss": 0.2974900007247925, + "eval_runtime": 2.0423, + "eval_samples_per_second": 77.852, + "eval_steps_per_second": 4.896, + "step": 3150 + }, + { + "epoch": 280.98, + "eval_accuracy": 0.9182389937106918, + "eval_loss": 0.3234010636806488, + "eval_runtime": 2.0793, + "eval_samples_per_second": 76.469, + "eval_steps_per_second": 4.809, + "step": 3161 + }, + { + "epoch": 281.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.2707124352455139, + "eval_runtime": 2.0919, + "eval_samples_per_second": 76.007, + "eval_steps_per_second": 4.78, + "step": 3172 + }, + { + "epoch": 282.93, + "eval_accuracy": 0.9559748427672956, + "eval_loss": 0.22501063346862793, + "eval_runtime": 1.9726, + "eval_samples_per_second": 80.606, + "eval_steps_per_second": 5.07, + "step": 3183 + }, + { + "epoch": 284.0, + "eval_accuracy": 0.9559748427672956, + "eval_loss": 0.23188871145248413, + "eval_runtime": 1.9745, + "eval_samples_per_second": 80.526, + "eval_steps_per_second": 5.065, + "step": 3195 + }, + { + "epoch": 284.44, + "grad_norm": 0.20468498766422272, + "learning_rate": 2.121212121212121e-05, + "loss": 0.0243, + "step": 3200 + }, + { + "epoch": 284.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.25254714488983154, + "eval_runtime": 2.1319, + "eval_samples_per_second": 74.582, + "eval_steps_per_second": 4.691, + "step": 3206 + }, + { + "epoch": 285.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.26610061526298523, + "eval_runtime": 2.0326, + "eval_samples_per_second": 78.226, + "eval_steps_per_second": 4.92, + "step": 3217 + }, + { + "epoch": 286.93, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.28444719314575195, + "eval_runtime": 2.0467, + "eval_samples_per_second": 77.687, + "eval_steps_per_second": 4.886, + "step": 3228 + }, + { + "epoch": 288.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.2571127116680145, + "eval_runtime": 2.1631, + "eval_samples_per_second": 73.504, + "eval_steps_per_second": 4.623, + "step": 3240 + }, + { + "epoch": 288.89, + "grad_norm": 1.0598843097686768, + "learning_rate": 2.1022727272727274e-05, + "loss": 0.0223, + "step": 3250 + }, + { + "epoch": 288.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.251703679561615, + "eval_runtime": 2.09, + "eval_samples_per_second": 76.075, + "eval_steps_per_second": 4.785, + "step": 3251 + }, + { + "epoch": 289.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.2636191248893738, + "eval_runtime": 2.0348, + "eval_samples_per_second": 78.14, + "eval_steps_per_second": 4.914, + "step": 3262 + }, + { + "epoch": 290.93, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.26941102743148804, + "eval_runtime": 2.0598, + "eval_samples_per_second": 77.193, + "eval_steps_per_second": 4.855, + "step": 3273 + }, + { + "epoch": 292.0, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.23060773313045502, + "eval_runtime": 2.0528, + "eval_samples_per_second": 77.454, + "eval_steps_per_second": 4.871, + "step": 3285 + }, + { + "epoch": 292.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.23769862949848175, + "eval_runtime": 2.0936, + "eval_samples_per_second": 75.945, + "eval_steps_per_second": 4.776, + "step": 3296 + }, + { + "epoch": 293.33, + "grad_norm": 0.6022414565086365, + "learning_rate": 2.0833333333333333e-05, + "loss": 0.0234, + "step": 3300 + }, + { + "epoch": 293.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.26981261372566223, + "eval_runtime": 2.0959, + "eval_samples_per_second": 75.861, + "eval_steps_per_second": 4.771, + "step": 3307 + }, + { + "epoch": 294.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.28393277525901794, + "eval_runtime": 2.0125, + "eval_samples_per_second": 79.007, + "eval_steps_per_second": 4.969, + "step": 3318 + }, + { + "epoch": 296.0, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.25016099214553833, + "eval_runtime": 2.1941, + "eval_samples_per_second": 72.467, + "eval_steps_per_second": 4.558, + "step": 3330 + }, + { + "epoch": 296.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.27042049169540405, + "eval_runtime": 2.0192, + "eval_samples_per_second": 78.742, + "eval_steps_per_second": 4.952, + "step": 3341 + }, + { + "epoch": 297.78, + "grad_norm": 0.03581221029162407, + "learning_rate": 2.0643939393939395e-05, + "loss": 0.0256, + "step": 3350 + }, + { + "epoch": 297.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.28789857029914856, + "eval_runtime": 2.1148, + "eval_samples_per_second": 75.183, + "eval_steps_per_second": 4.729, + "step": 3352 + }, + { + "epoch": 298.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3078269362449646, + "eval_runtime": 2.0062, + "eval_samples_per_second": 79.253, + "eval_steps_per_second": 4.984, + "step": 3363 + }, + { + "epoch": 300.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.31602492928504944, + "eval_runtime": 2.0641, + "eval_samples_per_second": 77.031, + "eval_steps_per_second": 4.845, + "step": 3375 + }, + { + "epoch": 300.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2705954313278198, + "eval_runtime": 2.0316, + "eval_samples_per_second": 78.263, + "eval_steps_per_second": 4.922, + "step": 3386 + }, + { + "epoch": 301.96, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.2504004240036011, + "eval_runtime": 2.1492, + "eval_samples_per_second": 73.982, + "eval_steps_per_second": 4.653, + "step": 3397 + }, + { + "epoch": 302.22, + "grad_norm": 2.553766965866089, + "learning_rate": 2.0454545454545454e-05, + "loss": 0.0224, + "step": 3400 + }, + { + "epoch": 302.93, + "eval_accuracy": 0.9559748427672956, + "eval_loss": 0.24540336430072784, + "eval_runtime": 2.0269, + "eval_samples_per_second": 78.443, + "eval_steps_per_second": 4.934, + "step": 3408 + }, + { + "epoch": 304.0, + "eval_accuracy": 0.9559748427672956, + "eval_loss": 0.24798454344272614, + "eval_runtime": 2.0863, + "eval_samples_per_second": 76.213, + "eval_steps_per_second": 4.793, + "step": 3420 + }, + { + "epoch": 304.98, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.2511013150215149, + "eval_runtime": 2.0476, + "eval_samples_per_second": 77.651, + "eval_steps_per_second": 4.884, + "step": 3431 + }, + { + "epoch": 305.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.2796252369880676, + "eval_runtime": 2.1539, + "eval_samples_per_second": 73.819, + "eval_steps_per_second": 4.643, + "step": 3442 + }, + { + "epoch": 306.67, + "grad_norm": 0.41460466384887695, + "learning_rate": 2.0265151515151516e-05, + "loss": 0.0155, + "step": 3450 + }, + { + "epoch": 306.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.29322367906570435, + "eval_runtime": 2.093, + "eval_samples_per_second": 75.966, + "eval_steps_per_second": 4.778, + "step": 3453 + }, + { + "epoch": 308.0, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.2996874153614044, + "eval_runtime": 2.0951, + "eval_samples_per_second": 75.893, + "eval_steps_per_second": 4.773, + "step": 3465 + }, + { + "epoch": 308.98, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.3044210970401764, + "eval_runtime": 1.9749, + "eval_samples_per_second": 80.512, + "eval_steps_per_second": 5.064, + "step": 3476 + }, + { + "epoch": 309.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3255678415298462, + "eval_runtime": 2.0175, + "eval_samples_per_second": 78.81, + "eval_steps_per_second": 4.957, + "step": 3487 + }, + { + "epoch": 310.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3400976359844208, + "eval_runtime": 2.0285, + "eval_samples_per_second": 78.381, + "eval_steps_per_second": 4.93, + "step": 3498 + }, + { + "epoch": 311.11, + "grad_norm": 0.5975369811058044, + "learning_rate": 2.007575757575758e-05, + "loss": 0.0226, + "step": 3500 + }, + { + "epoch": 312.0, + "eval_accuracy": 0.9182389937106918, + "eval_loss": 0.30681127309799194, + "eval_runtime": 2.0805, + "eval_samples_per_second": 76.424, + "eval_steps_per_second": 4.807, + "step": 3510 + }, + { + "epoch": 312.98, + "eval_accuracy": 0.9182389937106918, + "eval_loss": 0.30169352889060974, + "eval_runtime": 2.1998, + "eval_samples_per_second": 72.279, + "eval_steps_per_second": 4.546, + "step": 3521 + }, + { + "epoch": 313.96, + "eval_accuracy": 0.9119496855345912, + "eval_loss": 0.29409661889076233, + "eval_runtime": 2.1625, + "eval_samples_per_second": 73.527, + "eval_steps_per_second": 4.624, + "step": 3532 + }, + { + "epoch": 314.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.2840117812156677, + "eval_runtime": 2.0614, + "eval_samples_per_second": 77.134, + "eval_steps_per_second": 4.851, + "step": 3543 + }, + { + "epoch": 315.56, + "grad_norm": 0.4768455922603607, + "learning_rate": 1.9886363636363634e-05, + "loss": 0.0153, + "step": 3550 + }, + { + "epoch": 316.0, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.28999558091163635, + "eval_runtime": 2.0423, + "eval_samples_per_second": 77.855, + "eval_steps_per_second": 4.897, + "step": 3555 + }, + { + "epoch": 316.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.29232266545295715, + "eval_runtime": 2.0108, + "eval_samples_per_second": 79.073, + "eval_steps_per_second": 4.973, + "step": 3566 + }, + { + "epoch": 317.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.2964979112148285, + "eval_runtime": 1.9633, + "eval_samples_per_second": 80.988, + "eval_steps_per_second": 5.094, + "step": 3577 + }, + { + "epoch": 318.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3201989531517029, + "eval_runtime": 2.0683, + "eval_samples_per_second": 76.876, + "eval_steps_per_second": 4.835, + "step": 3588 + }, + { + "epoch": 320.0, + "grad_norm": 0.01774447225034237, + "learning_rate": 1.9696969696969697e-05, + "loss": 0.0183, + "step": 3600 + }, + { + "epoch": 320.0, + "eval_accuracy": 0.9119496855345912, + "eval_loss": 0.33252981305122375, + "eval_runtime": 1.9991, + "eval_samples_per_second": 79.534, + "eval_steps_per_second": 5.002, + "step": 3600 + }, + { + "epoch": 320.98, + "eval_accuracy": 0.9182389937106918, + "eval_loss": 0.34411394596099854, + "eval_runtime": 1.9595, + "eval_samples_per_second": 81.143, + "eval_steps_per_second": 5.103, + "step": 3611 + }, + { + "epoch": 321.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3004206120967865, + "eval_runtime": 2.102, + "eval_samples_per_second": 75.644, + "eval_steps_per_second": 4.757, + "step": 3622 + }, + { + "epoch": 322.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3022076487541199, + "eval_runtime": 2.1248, + "eval_samples_per_second": 74.83, + "eval_steps_per_second": 4.706, + "step": 3633 + }, + { + "epoch": 324.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.29579004645347595, + "eval_runtime": 2.073, + "eval_samples_per_second": 76.702, + "eval_steps_per_second": 4.824, + "step": 3645 + }, + { + "epoch": 324.44, + "grad_norm": 0.43064549565315247, + "learning_rate": 1.950757575757576e-05, + "loss": 0.0257, + "step": 3650 + }, + { + "epoch": 324.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2942567765712738, + "eval_runtime": 2.08, + "eval_samples_per_second": 76.442, + "eval_steps_per_second": 4.808, + "step": 3656 + }, + { + "epoch": 325.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.2944892942905426, + "eval_runtime": 1.9313, + "eval_samples_per_second": 82.326, + "eval_steps_per_second": 5.178, + "step": 3667 + }, + { + "epoch": 326.93, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.29099544882774353, + "eval_runtime": 2.085, + "eval_samples_per_second": 76.26, + "eval_steps_per_second": 4.796, + "step": 3678 + }, + { + "epoch": 328.0, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.2856423258781433, + "eval_runtime": 2.1029, + "eval_samples_per_second": 75.609, + "eval_steps_per_second": 4.755, + "step": 3690 + }, + { + "epoch": 328.89, + "grad_norm": 0.7020539045333862, + "learning_rate": 1.9318181818181818e-05, + "loss": 0.0164, + "step": 3700 + }, + { + "epoch": 328.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.25798845291137695, + "eval_runtime": 2.0372, + "eval_samples_per_second": 78.047, + "eval_steps_per_second": 4.909, + "step": 3701 + }, + { + "epoch": 329.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2566261291503906, + "eval_runtime": 2.1479, + "eval_samples_per_second": 74.027, + "eval_steps_per_second": 4.656, + "step": 3712 + }, + { + "epoch": 330.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2538098394870758, + "eval_runtime": 2.0665, + "eval_samples_per_second": 76.941, + "eval_steps_per_second": 4.839, + "step": 3723 + }, + { + "epoch": 332.0, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.24481499195098877, + "eval_runtime": 2.0898, + "eval_samples_per_second": 76.084, + "eval_steps_per_second": 4.785, + "step": 3735 + }, + { + "epoch": 332.98, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.2543666958808899, + "eval_runtime": 2.035, + "eval_samples_per_second": 78.134, + "eval_steps_per_second": 4.914, + "step": 3746 + }, + { + "epoch": 333.33, + "grad_norm": 0.9068632125854492, + "learning_rate": 1.912878787878788e-05, + "loss": 0.0222, + "step": 3750 + }, + { + "epoch": 333.96, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.3075094223022461, + "eval_runtime": 2.101, + "eval_samples_per_second": 75.678, + "eval_steps_per_second": 4.76, + "step": 3757 + }, + { + "epoch": 334.93, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.27574771642684937, + "eval_runtime": 2.0253, + "eval_samples_per_second": 78.507, + "eval_steps_per_second": 4.938, + "step": 3768 + }, + { + "epoch": 336.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2714598774909973, + "eval_runtime": 2.2265, + "eval_samples_per_second": 71.412, + "eval_steps_per_second": 4.491, + "step": 3780 + }, + { + "epoch": 336.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3330034911632538, + "eval_runtime": 2.0552, + "eval_samples_per_second": 77.365, + "eval_steps_per_second": 4.866, + "step": 3791 + }, + { + "epoch": 337.78, + "grad_norm": 0.03231671825051308, + "learning_rate": 1.893939393939394e-05, + "loss": 0.0212, + "step": 3800 + }, + { + "epoch": 337.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.35598525404930115, + "eval_runtime": 2.0188, + "eval_samples_per_second": 78.762, + "eval_steps_per_second": 4.954, + "step": 3802 + }, + { + "epoch": 338.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.28320637345314026, + "eval_runtime": 2.1352, + "eval_samples_per_second": 74.467, + "eval_steps_per_second": 4.683, + "step": 3813 + }, + { + "epoch": 340.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.2855217754840851, + "eval_runtime": 2.1886, + "eval_samples_per_second": 72.648, + "eval_steps_per_second": 4.569, + "step": 3825 + }, + { + "epoch": 340.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.30631041526794434, + "eval_runtime": 2.0061, + "eval_samples_per_second": 79.26, + "eval_steps_per_second": 4.985, + "step": 3836 + }, + { + "epoch": 341.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.29151424765586853, + "eval_runtime": 2.0201, + "eval_samples_per_second": 78.71, + "eval_steps_per_second": 4.95, + "step": 3847 + }, + { + "epoch": 342.22, + "grad_norm": 0.07481174916028976, + "learning_rate": 1.8750000000000002e-05, + "loss": 0.016, + "step": 3850 + }, + { + "epoch": 342.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.28358563780784607, + "eval_runtime": 1.9309, + "eval_samples_per_second": 82.344, + "eval_steps_per_second": 5.179, + "step": 3858 + }, + { + "epoch": 344.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.28052231669425964, + "eval_runtime": 1.9926, + "eval_samples_per_second": 79.797, + "eval_steps_per_second": 5.019, + "step": 3870 + }, + { + "epoch": 344.98, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.26776131987571716, + "eval_runtime": 2.1218, + "eval_samples_per_second": 74.936, + "eval_steps_per_second": 4.713, + "step": 3881 + }, + { + "epoch": 345.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2839824855327606, + "eval_runtime": 2.0764, + "eval_samples_per_second": 76.575, + "eval_steps_per_second": 4.816, + "step": 3892 + }, + { + "epoch": 346.67, + "grad_norm": 1.4776334762573242, + "learning_rate": 1.856060606060606e-05, + "loss": 0.0163, + "step": 3900 + }, + { + "epoch": 346.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3048093914985657, + "eval_runtime": 2.1233, + "eval_samples_per_second": 74.885, + "eval_steps_per_second": 4.71, + "step": 3903 + }, + { + "epoch": 348.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.27605798840522766, + "eval_runtime": 1.9601, + "eval_samples_per_second": 81.117, + "eval_steps_per_second": 5.102, + "step": 3915 + }, + { + "epoch": 348.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.30447614192962646, + "eval_runtime": 2.0457, + "eval_samples_per_second": 77.724, + "eval_steps_per_second": 4.888, + "step": 3926 + }, + { + "epoch": 349.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.26728910207748413, + "eval_runtime": 2.0205, + "eval_samples_per_second": 78.692, + "eval_steps_per_second": 4.949, + "step": 3937 + }, + { + "epoch": 350.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.2830033600330353, + "eval_runtime": 2.0741, + "eval_samples_per_second": 76.66, + "eval_steps_per_second": 4.821, + "step": 3948 + }, + { + "epoch": 351.11, + "grad_norm": 0.30603834986686707, + "learning_rate": 1.837121212121212e-05, + "loss": 0.0185, + "step": 3950 + }, + { + "epoch": 352.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.31495675444602966, + "eval_runtime": 2.0088, + "eval_samples_per_second": 79.152, + "eval_steps_per_second": 4.978, + "step": 3960 + }, + { + "epoch": 352.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.2967083156108856, + "eval_runtime": 2.0921, + "eval_samples_per_second": 75.999, + "eval_steps_per_second": 4.78, + "step": 3971 + }, + { + "epoch": 353.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.2917640507221222, + "eval_runtime": 2.1439, + "eval_samples_per_second": 74.165, + "eval_steps_per_second": 4.664, + "step": 3982 + }, + { + "epoch": 354.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2848517894744873, + "eval_runtime": 2.0244, + "eval_samples_per_second": 78.541, + "eval_steps_per_second": 4.94, + "step": 3993 + }, + { + "epoch": 355.56, + "grad_norm": 0.6905023455619812, + "learning_rate": 1.8181818181818182e-05, + "loss": 0.0189, + "step": 4000 + }, + { + "epoch": 356.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.28043246269226074, + "eval_runtime": 2.0697, + "eval_samples_per_second": 76.823, + "eval_steps_per_second": 4.832, + "step": 4005 + }, + { + "epoch": 356.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.29090604186058044, + "eval_runtime": 2.3048, + "eval_samples_per_second": 68.987, + "eval_steps_per_second": 4.339, + "step": 4016 + }, + { + "epoch": 357.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3029940724372864, + "eval_runtime": 2.0213, + "eval_samples_per_second": 78.661, + "eval_steps_per_second": 4.947, + "step": 4027 + }, + { + "epoch": 358.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.316310852766037, + "eval_runtime": 2.0126, + "eval_samples_per_second": 79.004, + "eval_steps_per_second": 4.969, + "step": 4038 + }, + { + "epoch": 360.0, + "grad_norm": 0.09516480565071106, + "learning_rate": 1.799242424242424e-05, + "loss": 0.0153, + "step": 4050 + }, + { + "epoch": 360.0, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.32167917490005493, + "eval_runtime": 1.9486, + "eval_samples_per_second": 81.598, + "eval_steps_per_second": 5.132, + "step": 4050 + }, + { + "epoch": 360.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3025132715702057, + "eval_runtime": 2.0179, + "eval_samples_per_second": 78.794, + "eval_steps_per_second": 4.956, + "step": 4061 + }, + { + "epoch": 361.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.297443687915802, + "eval_runtime": 1.9969, + "eval_samples_per_second": 79.622, + "eval_steps_per_second": 5.008, + "step": 4072 + }, + { + "epoch": 362.93, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.28664350509643555, + "eval_runtime": 2.0131, + "eval_samples_per_second": 78.984, + "eval_steps_per_second": 4.968, + "step": 4083 + }, + { + "epoch": 364.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.32455363869667053, + "eval_runtime": 2.1216, + "eval_samples_per_second": 74.943, + "eval_steps_per_second": 4.713, + "step": 4095 + }, + { + "epoch": 364.44, + "grad_norm": 0.14960724115371704, + "learning_rate": 1.7803030303030303e-05, + "loss": 0.0169, + "step": 4100 + }, + { + "epoch": 364.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.2801210880279541, + "eval_runtime": 1.87, + "eval_samples_per_second": 85.025, + "eval_steps_per_second": 5.347, + "step": 4106 + }, + { + "epoch": 365.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.31326618790626526, + "eval_runtime": 1.8975, + "eval_samples_per_second": 83.793, + "eval_steps_per_second": 5.27, + "step": 4117 + }, + { + "epoch": 366.93, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.3283620774745941, + "eval_runtime": 1.8154, + "eval_samples_per_second": 87.585, + "eval_steps_per_second": 5.509, + "step": 4128 + }, + { + "epoch": 368.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2716998755931854, + "eval_runtime": 1.7785, + "eval_samples_per_second": 89.401, + "eval_steps_per_second": 5.623, + "step": 4140 + }, + { + "epoch": 368.89, + "grad_norm": 1.529534935951233, + "learning_rate": 1.7613636363636366e-05, + "loss": 0.0207, + "step": 4150 + }, + { + "epoch": 368.98, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.26920509338378906, + "eval_runtime": 1.777, + "eval_samples_per_second": 89.477, + "eval_steps_per_second": 5.627, + "step": 4151 + }, + { + "epoch": 369.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.2673673927783966, + "eval_runtime": 1.8105, + "eval_samples_per_second": 87.819, + "eval_steps_per_second": 5.523, + "step": 4162 + }, + { + "epoch": 370.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.26433154940605164, + "eval_runtime": 1.8098, + "eval_samples_per_second": 87.857, + "eval_steps_per_second": 5.526, + "step": 4173 + }, + { + "epoch": 372.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.2969939410686493, + "eval_runtime": 1.7874, + "eval_samples_per_second": 88.954, + "eval_steps_per_second": 5.595, + "step": 4185 + }, + { + "epoch": 372.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.2687932550907135, + "eval_runtime": 1.9292, + "eval_samples_per_second": 82.418, + "eval_steps_per_second": 5.184, + "step": 4196 + }, + { + "epoch": 373.33, + "grad_norm": 0.41630563139915466, + "learning_rate": 1.7424242424242425e-05, + "loss": 0.0213, + "step": 4200 + }, + { + "epoch": 373.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2765069603919983, + "eval_runtime": 1.9392, + "eval_samples_per_second": 81.994, + "eval_steps_per_second": 5.157, + "step": 4207 + }, + { + "epoch": 374.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.28704383969306946, + "eval_runtime": 1.8427, + "eval_samples_per_second": 86.287, + "eval_steps_per_second": 5.427, + "step": 4218 + }, + { + "epoch": 376.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.30059266090393066, + "eval_runtime": 1.8146, + "eval_samples_per_second": 87.624, + "eval_steps_per_second": 5.511, + "step": 4230 + }, + { + "epoch": 376.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.2943706512451172, + "eval_runtime": 1.7941, + "eval_samples_per_second": 88.625, + "eval_steps_per_second": 5.574, + "step": 4241 + }, + { + "epoch": 377.78, + "grad_norm": 1.3894481658935547, + "learning_rate": 1.7234848484848487e-05, + "loss": 0.02, + "step": 4250 + }, + { + "epoch": 377.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3019978106021881, + "eval_runtime": 1.8046, + "eval_samples_per_second": 88.107, + "eval_steps_per_second": 5.541, + "step": 4252 + }, + { + "epoch": 378.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3074227571487427, + "eval_runtime": 1.7835, + "eval_samples_per_second": 89.152, + "eval_steps_per_second": 5.607, + "step": 4263 + }, + { + "epoch": 380.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.29427269101142883, + "eval_runtime": 1.8177, + "eval_samples_per_second": 87.473, + "eval_steps_per_second": 5.501, + "step": 4275 + }, + { + "epoch": 380.98, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.2825266420841217, + "eval_runtime": 1.8911, + "eval_samples_per_second": 84.077, + "eval_steps_per_second": 5.288, + "step": 4286 + }, + { + "epoch": 381.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.2760971188545227, + "eval_runtime": 1.9521, + "eval_samples_per_second": 81.451, + "eval_steps_per_second": 5.123, + "step": 4297 + }, + { + "epoch": 382.22, + "grad_norm": 0.021462175995111465, + "learning_rate": 1.7045454545454546e-05, + "loss": 0.0143, + "step": 4300 + }, + { + "epoch": 382.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.29204800724983215, + "eval_runtime": 1.9261, + "eval_samples_per_second": 82.551, + "eval_steps_per_second": 5.192, + "step": 4308 + }, + { + "epoch": 384.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.29515865445137024, + "eval_runtime": 1.8478, + "eval_samples_per_second": 86.046, + "eval_steps_per_second": 5.412, + "step": 4320 + }, + { + "epoch": 384.98, + "eval_accuracy": 0.949685534591195, + "eval_loss": 0.3164711594581604, + "eval_runtime": 1.7929, + "eval_samples_per_second": 88.684, + "eval_steps_per_second": 5.578, + "step": 4331 + }, + { + "epoch": 385.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2803152799606323, + "eval_runtime": 1.8039, + "eval_samples_per_second": 88.141, + "eval_steps_per_second": 5.543, + "step": 4342 + }, + { + "epoch": 386.67, + "grad_norm": 0.4159376621246338, + "learning_rate": 1.6856060606060605e-05, + "loss": 0.0196, + "step": 4350 + }, + { + "epoch": 386.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.28756093978881836, + "eval_runtime": 1.7845, + "eval_samples_per_second": 89.1, + "eval_steps_per_second": 5.604, + "step": 4353 + }, + { + "epoch": 388.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.2759377956390381, + "eval_runtime": 1.8441, + "eval_samples_per_second": 86.221, + "eval_steps_per_second": 5.423, + "step": 4365 + }, + { + "epoch": 388.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.2701479494571686, + "eval_runtime": 1.7826, + "eval_samples_per_second": 89.198, + "eval_steps_per_second": 5.61, + "step": 4376 + }, + { + "epoch": 389.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.2951464354991913, + "eval_runtime": 1.9039, + "eval_samples_per_second": 83.514, + "eval_steps_per_second": 5.252, + "step": 4387 + }, + { + "epoch": 390.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2950435280799866, + "eval_runtime": 1.8731, + "eval_samples_per_second": 84.885, + "eval_steps_per_second": 5.339, + "step": 4398 + }, + { + "epoch": 391.11, + "grad_norm": 0.057938866317272186, + "learning_rate": 1.6670454545454544e-05, + "loss": 0.0234, + "step": 4400 + }, + { + "epoch": 392.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.29603102803230286, + "eval_runtime": 1.9831, + "eval_samples_per_second": 80.176, + "eval_steps_per_second": 5.043, + "step": 4410 + }, + { + "epoch": 392.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3337320387363434, + "eval_runtime": 1.847, + "eval_samples_per_second": 86.084, + "eval_steps_per_second": 5.414, + "step": 4421 + }, + { + "epoch": 393.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.33828112483024597, + "eval_runtime": 1.8496, + "eval_samples_per_second": 85.964, + "eval_steps_per_second": 5.407, + "step": 4432 + }, + { + "epoch": 394.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3078320026397705, + "eval_runtime": 1.8258, + "eval_samples_per_second": 87.084, + "eval_steps_per_second": 5.477, + "step": 4443 + }, + { + "epoch": 395.56, + "grad_norm": 0.39662787318229675, + "learning_rate": 1.6481060606060606e-05, + "loss": 0.0161, + "step": 4450 + }, + { + "epoch": 396.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3138676881790161, + "eval_runtime": 1.7627, + "eval_samples_per_second": 90.205, + "eval_steps_per_second": 5.673, + "step": 4455 + }, + { + "epoch": 396.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.31875431537628174, + "eval_runtime": 1.7584, + "eval_samples_per_second": 90.422, + "eval_steps_per_second": 5.687, + "step": 4466 + }, + { + "epoch": 397.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3307281732559204, + "eval_runtime": 1.7976, + "eval_samples_per_second": 88.452, + "eval_steps_per_second": 5.563, + "step": 4477 + }, + { + "epoch": 398.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.31634414196014404, + "eval_runtime": 1.8551, + "eval_samples_per_second": 85.711, + "eval_steps_per_second": 5.391, + "step": 4488 + }, + { + "epoch": 400.0, + "grad_norm": 0.7240819931030273, + "learning_rate": 1.6291666666666665e-05, + "loss": 0.0162, + "step": 4500 + }, + { + "epoch": 400.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3018243908882141, + "eval_runtime": 1.9085, + "eval_samples_per_second": 83.313, + "eval_steps_per_second": 5.24, + "step": 4500 + }, + { + "epoch": 400.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2813258469104767, + "eval_runtime": 2.0304, + "eval_samples_per_second": 78.308, + "eval_steps_per_second": 4.925, + "step": 4511 + }, + { + "epoch": 401.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3019176125526428, + "eval_runtime": 1.8259, + "eval_samples_per_second": 87.08, + "eval_steps_per_second": 5.477, + "step": 4522 + }, + { + "epoch": 402.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.28099265694618225, + "eval_runtime": 1.7238, + "eval_samples_per_second": 92.239, + "eval_steps_per_second": 5.801, + "step": 4533 + }, + { + "epoch": 404.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2745566666126251, + "eval_runtime": 1.7857, + "eval_samples_per_second": 89.039, + "eval_steps_per_second": 5.6, + "step": 4545 + }, + { + "epoch": 404.44, + "grad_norm": 0.8649039268493652, + "learning_rate": 1.6102272727272727e-05, + "loss": 0.023, + "step": 4550 + }, + { + "epoch": 404.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2850847542285919, + "eval_runtime": 1.8274, + "eval_samples_per_second": 87.011, + "eval_steps_per_second": 5.472, + "step": 4556 + }, + { + "epoch": 405.96, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.31582126021385193, + "eval_runtime": 1.742, + "eval_samples_per_second": 91.274, + "eval_steps_per_second": 5.741, + "step": 4567 + }, + { + "epoch": 406.93, + "eval_accuracy": 0.9119496855345912, + "eval_loss": 0.34668126702308655, + "eval_runtime": 1.8815, + "eval_samples_per_second": 84.506, + "eval_steps_per_second": 5.315, + "step": 4578 + }, + { + "epoch": 408.0, + "eval_accuracy": 0.9182389937106918, + "eval_loss": 0.34958958625793457, + "eval_runtime": 2.0856, + "eval_samples_per_second": 76.236, + "eval_steps_per_second": 4.795, + "step": 4590 + }, + { + "epoch": 408.89, + "grad_norm": 1.8184185028076172, + "learning_rate": 1.591287878787879e-05, + "loss": 0.0164, + "step": 4600 + }, + { + "epoch": 408.98, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.33241006731987, + "eval_runtime": 2.1278, + "eval_samples_per_second": 74.727, + "eval_steps_per_second": 4.7, + "step": 4601 + }, + { + "epoch": 409.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.32462239265441895, + "eval_runtime": 2.221, + "eval_samples_per_second": 71.589, + "eval_steps_per_second": 4.502, + "step": 4612 + }, + { + "epoch": 410.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3765309154987335, + "eval_runtime": 2.0273, + "eval_samples_per_second": 78.43, + "eval_steps_per_second": 4.933, + "step": 4623 + }, + { + "epoch": 412.0, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.3543161451816559, + "eval_runtime": 2.0351, + "eval_samples_per_second": 78.129, + "eval_steps_per_second": 4.914, + "step": 4635 + }, + { + "epoch": 412.98, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.3280029594898224, + "eval_runtime": 2.1541, + "eval_samples_per_second": 73.813, + "eval_steps_per_second": 4.642, + "step": 4646 + }, + { + "epoch": 413.33, + "grad_norm": 1.7262401580810547, + "learning_rate": 1.572348484848485e-05, + "loss": 0.0189, + "step": 4650 + }, + { + "epoch": 413.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.30754944682121277, + "eval_runtime": 1.987, + "eval_samples_per_second": 80.018, + "eval_steps_per_second": 5.033, + "step": 4657 + }, + { + "epoch": 414.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3012823462486267, + "eval_runtime": 2.084, + "eval_samples_per_second": 76.297, + "eval_steps_per_second": 4.799, + "step": 4668 + }, + { + "epoch": 416.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3047963082790375, + "eval_runtime": 2.1147, + "eval_samples_per_second": 75.187, + "eval_steps_per_second": 4.729, + "step": 4680 + }, + { + "epoch": 416.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.297464519739151, + "eval_runtime": 2.0439, + "eval_samples_per_second": 77.791, + "eval_steps_per_second": 4.893, + "step": 4691 + }, + { + "epoch": 417.78, + "grad_norm": 0.03005032427608967, + "learning_rate": 1.553409090909091e-05, + "loss": 0.018, + "step": 4700 + }, + { + "epoch": 417.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.30111947655677795, + "eval_runtime": 2.0823, + "eval_samples_per_second": 76.356, + "eval_steps_per_second": 4.802, + "step": 4702 + }, + { + "epoch": 418.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3059113621711731, + "eval_runtime": 2.0164, + "eval_samples_per_second": 78.853, + "eval_steps_per_second": 4.959, + "step": 4713 + }, + { + "epoch": 420.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3002815544605255, + "eval_runtime": 2.0599, + "eval_samples_per_second": 77.187, + "eval_steps_per_second": 4.855, + "step": 4725 + }, + { + "epoch": 420.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.2898853123188019, + "eval_runtime": 2.1653, + "eval_samples_per_second": 73.43, + "eval_steps_per_second": 4.618, + "step": 4736 + }, + { + "epoch": 421.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.27394920587539673, + "eval_runtime": 1.976, + "eval_samples_per_second": 80.464, + "eval_steps_per_second": 5.061, + "step": 4747 + }, + { + "epoch": 422.22, + "grad_norm": 0.05734672769904137, + "learning_rate": 1.534469696969697e-05, + "loss": 0.014, + "step": 4750 + }, + { + "epoch": 422.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.28232210874557495, + "eval_runtime": 2.0336, + "eval_samples_per_second": 78.186, + "eval_steps_per_second": 4.917, + "step": 4758 + }, + { + "epoch": 424.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3002234697341919, + "eval_runtime": 2.1015, + "eval_samples_per_second": 75.661, + "eval_steps_per_second": 4.759, + "step": 4770 + }, + { + "epoch": 424.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.31039535999298096, + "eval_runtime": 2.0591, + "eval_samples_per_second": 77.218, + "eval_steps_per_second": 4.856, + "step": 4781 + }, + { + "epoch": 425.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2992786467075348, + "eval_runtime": 2.219, + "eval_samples_per_second": 71.652, + "eval_steps_per_second": 4.506, + "step": 4792 + }, + { + "epoch": 426.67, + "grad_norm": 0.20316560566425323, + "learning_rate": 1.5155303030303031e-05, + "loss": 0.0161, + "step": 4800 + }, + { + "epoch": 426.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.28384503722190857, + "eval_runtime": 2.3212, + "eval_samples_per_second": 68.5, + "eval_steps_per_second": 4.308, + "step": 4803 + }, + { + "epoch": 428.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.303459495306015, + "eval_runtime": 2.0531, + "eval_samples_per_second": 77.442, + "eval_steps_per_second": 4.871, + "step": 4815 + }, + { + "epoch": 428.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.31719303131103516, + "eval_runtime": 2.0034, + "eval_samples_per_second": 79.365, + "eval_steps_per_second": 4.992, + "step": 4826 + }, + { + "epoch": 429.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2884739935398102, + "eval_runtime": 2.1854, + "eval_samples_per_second": 72.756, + "eval_steps_per_second": 4.576, + "step": 4837 + }, + { + "epoch": 430.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.2915368676185608, + "eval_runtime": 2.0672, + "eval_samples_per_second": 76.914, + "eval_steps_per_second": 4.837, + "step": 4848 + }, + { + "epoch": 431.11, + "grad_norm": 0.1926555037498474, + "learning_rate": 1.496590909090909e-05, + "loss": 0.0181, + "step": 4850 + }, + { + "epoch": 432.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.32380226254463196, + "eval_runtime": 2.0107, + "eval_samples_per_second": 79.076, + "eval_steps_per_second": 4.973, + "step": 4860 + }, + { + "epoch": 432.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3051411807537079, + "eval_runtime": 2.0979, + "eval_samples_per_second": 75.789, + "eval_steps_per_second": 4.767, + "step": 4871 + }, + { + "epoch": 433.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.2746570408344269, + "eval_runtime": 2.0978, + "eval_samples_per_second": 75.795, + "eval_steps_per_second": 4.767, + "step": 4882 + }, + { + "epoch": 434.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.27779048681259155, + "eval_runtime": 2.1681, + "eval_samples_per_second": 73.336, + "eval_steps_per_second": 4.612, + "step": 4893 + }, + { + "epoch": 435.56, + "grad_norm": 0.2639506757259369, + "learning_rate": 1.4776515151515152e-05, + "loss": 0.0152, + "step": 4900 + }, + { + "epoch": 436.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3142688274383545, + "eval_runtime": 2.0074, + "eval_samples_per_second": 79.208, + "eval_steps_per_second": 4.982, + "step": 4905 + }, + { + "epoch": 436.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.29534852504730225, + "eval_runtime": 2.0119, + "eval_samples_per_second": 79.031, + "eval_steps_per_second": 4.97, + "step": 4916 + }, + { + "epoch": 437.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.2987271249294281, + "eval_runtime": 2.0466, + "eval_samples_per_second": 77.691, + "eval_steps_per_second": 4.886, + "step": 4927 + }, + { + "epoch": 438.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3240003287792206, + "eval_runtime": 2.1303, + "eval_samples_per_second": 74.638, + "eval_steps_per_second": 4.694, + "step": 4938 + }, + { + "epoch": 440.0, + "grad_norm": 1.0273933410644531, + "learning_rate": 1.4587121212121213e-05, + "loss": 0.0233, + "step": 4950 + }, + { + "epoch": 440.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.2931964099407196, + "eval_runtime": 2.0028, + "eval_samples_per_second": 79.388, + "eval_steps_per_second": 4.993, + "step": 4950 + }, + { + "epoch": 440.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.30667683482170105, + "eval_runtime": 2.1028, + "eval_samples_per_second": 75.614, + "eval_steps_per_second": 4.756, + "step": 4961 + }, + { + "epoch": 441.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.31695908308029175, + "eval_runtime": 2.1429, + "eval_samples_per_second": 74.198, + "eval_steps_per_second": 4.667, + "step": 4972 + }, + { + "epoch": 442.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.33484575152397156, + "eval_runtime": 2.2487, + "eval_samples_per_second": 70.709, + "eval_steps_per_second": 4.447, + "step": 4983 + }, + { + "epoch": 444.0, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.3350779116153717, + "eval_runtime": 2.2089, + "eval_samples_per_second": 71.981, + "eval_steps_per_second": 4.527, + "step": 4995 + }, + { + "epoch": 444.44, + "grad_norm": 0.05571739375591278, + "learning_rate": 1.4397727272727274e-05, + "loss": 0.0134, + "step": 5000 + }, + { + "epoch": 444.98, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.33779439330101013, + "eval_runtime": 2.155, + "eval_samples_per_second": 73.781, + "eval_steps_per_second": 4.64, + "step": 5006 + }, + { + "epoch": 445.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.32037729024887085, + "eval_runtime": 2.1415, + "eval_samples_per_second": 74.247, + "eval_steps_per_second": 4.67, + "step": 5017 + }, + { + "epoch": 446.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.30960965156555176, + "eval_runtime": 2.0664, + "eval_samples_per_second": 76.947, + "eval_steps_per_second": 4.839, + "step": 5028 + }, + { + "epoch": 448.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3135194480419159, + "eval_runtime": 2.1609, + "eval_samples_per_second": 73.581, + "eval_steps_per_second": 4.628, + "step": 5040 + }, + { + "epoch": 448.89, + "grad_norm": 1.2499555349349976, + "learning_rate": 1.4208333333333333e-05, + "loss": 0.0185, + "step": 5050 + }, + { + "epoch": 448.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.32047778367996216, + "eval_runtime": 2.0116, + "eval_samples_per_second": 79.04, + "eval_steps_per_second": 4.971, + "step": 5051 + }, + { + "epoch": 449.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3151703476905823, + "eval_runtime": 1.9982, + "eval_samples_per_second": 79.571, + "eval_steps_per_second": 5.004, + "step": 5062 + }, + { + "epoch": 450.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.32720035314559937, + "eval_runtime": 2.0554, + "eval_samples_per_second": 77.357, + "eval_steps_per_second": 4.865, + "step": 5073 + }, + { + "epoch": 452.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.31637299060821533, + "eval_runtime": 2.0655, + "eval_samples_per_second": 76.978, + "eval_steps_per_second": 4.841, + "step": 5085 + }, + { + "epoch": 452.98, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.3297300934791565, + "eval_runtime": 2.0194, + "eval_samples_per_second": 78.737, + "eval_steps_per_second": 4.952, + "step": 5096 + }, + { + "epoch": 453.33, + "grad_norm": 0.4623982012271881, + "learning_rate": 1.4018939393939395e-05, + "loss": 0.0149, + "step": 5100 + }, + { + "epoch": 453.96, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.3299054801464081, + "eval_runtime": 2.027, + "eval_samples_per_second": 78.441, + "eval_steps_per_second": 4.933, + "step": 5107 + }, + { + "epoch": 454.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.34270188212394714, + "eval_runtime": 2.0462, + "eval_samples_per_second": 77.705, + "eval_steps_per_second": 4.887, + "step": 5118 + }, + { + "epoch": 456.0, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.3775523006916046, + "eval_runtime": 2.0532, + "eval_samples_per_second": 77.442, + "eval_steps_per_second": 4.871, + "step": 5130 + }, + { + "epoch": 456.98, + "eval_accuracy": 0.9182389937106918, + "eval_loss": 0.376447468996048, + "eval_runtime": 2.0839, + "eval_samples_per_second": 76.298, + "eval_steps_per_second": 4.799, + "step": 5141 + }, + { + "epoch": 457.78, + "grad_norm": 0.23284748196601868, + "learning_rate": 1.3829545454545456e-05, + "loss": 0.0099, + "step": 5150 + }, + { + "epoch": 457.96, + "eval_accuracy": 0.9182389937106918, + "eval_loss": 0.3852477967739105, + "eval_runtime": 2.0765, + "eval_samples_per_second": 76.569, + "eval_steps_per_second": 4.816, + "step": 5152 + }, + { + "epoch": 458.93, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.35552406311035156, + "eval_runtime": 2.0834, + "eval_samples_per_second": 76.318, + "eval_steps_per_second": 4.8, + "step": 5163 + }, + { + "epoch": 460.0, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.3497180640697479, + "eval_runtime": 2.1727, + "eval_samples_per_second": 73.182, + "eval_steps_per_second": 4.603, + "step": 5175 + }, + { + "epoch": 460.98, + "eval_accuracy": 0.9119496855345912, + "eval_loss": 0.3959099054336548, + "eval_runtime": 2.2063, + "eval_samples_per_second": 72.066, + "eval_steps_per_second": 4.532, + "step": 5186 + }, + { + "epoch": 461.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3428646922111511, + "eval_runtime": 2.0667, + "eval_samples_per_second": 76.934, + "eval_steps_per_second": 4.839, + "step": 5197 + }, + { + "epoch": 462.22, + "grad_norm": 0.01973637193441391, + "learning_rate": 1.3640151515151516e-05, + "loss": 0.0123, + "step": 5200 + }, + { + "epoch": 462.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3277600407600403, + "eval_runtime": 2.0262, + "eval_samples_per_second": 78.472, + "eval_steps_per_second": 4.935, + "step": 5208 + }, + { + "epoch": 464.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.307450532913208, + "eval_runtime": 2.1318, + "eval_samples_per_second": 74.586, + "eval_steps_per_second": 4.691, + "step": 5220 + }, + { + "epoch": 464.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.30191025137901306, + "eval_runtime": 2.0236, + "eval_samples_per_second": 78.574, + "eval_steps_per_second": 4.942, + "step": 5231 + }, + { + "epoch": 465.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3069049119949341, + "eval_runtime": 1.9794, + "eval_samples_per_second": 80.326, + "eval_steps_per_second": 5.052, + "step": 5242 + }, + { + "epoch": 466.67, + "grad_norm": 1.7077068090438843, + "learning_rate": 1.3450757575757575e-05, + "loss": 0.0169, + "step": 5250 + }, + { + "epoch": 466.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3036327362060547, + "eval_runtime": 2.2515, + "eval_samples_per_second": 70.62, + "eval_steps_per_second": 4.442, + "step": 5253 + }, + { + "epoch": 468.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.32558977603912354, + "eval_runtime": 2.0075, + "eval_samples_per_second": 79.202, + "eval_steps_per_second": 4.981, + "step": 5265 + }, + { + "epoch": 468.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3241185247898102, + "eval_runtime": 2.079, + "eval_samples_per_second": 76.48, + "eval_steps_per_second": 4.81, + "step": 5276 + }, + { + "epoch": 469.96, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.32361313700675964, + "eval_runtime": 2.2276, + "eval_samples_per_second": 71.378, + "eval_steps_per_second": 4.489, + "step": 5287 + }, + { + "epoch": 470.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.32213094830513, + "eval_runtime": 2.0555, + "eval_samples_per_second": 77.353, + "eval_steps_per_second": 4.865, + "step": 5298 + }, + { + "epoch": 471.11, + "grad_norm": 2.2473459243774414, + "learning_rate": 1.3261363636363636e-05, + "loss": 0.0114, + "step": 5300 + }, + { + "epoch": 472.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2958085536956787, + "eval_runtime": 2.1042, + "eval_samples_per_second": 75.563, + "eval_steps_per_second": 4.752, + "step": 5310 + }, + { + "epoch": 472.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.2994365692138672, + "eval_runtime": 2.0536, + "eval_samples_per_second": 77.424, + "eval_steps_per_second": 4.869, + "step": 5321 + }, + { + "epoch": 473.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.29937687516212463, + "eval_runtime": 2.0807, + "eval_samples_per_second": 76.417, + "eval_steps_per_second": 4.806, + "step": 5332 + }, + { + "epoch": 474.93, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.4239935576915741, + "eval_runtime": 2.0885, + "eval_samples_per_second": 76.13, + "eval_steps_per_second": 4.788, + "step": 5343 + }, + { + "epoch": 475.56, + "grad_norm": 0.01770736277103424, + "learning_rate": 1.3071969696969698e-05, + "loss": 0.0148, + "step": 5350 + }, + { + "epoch": 476.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.32858237624168396, + "eval_runtime": 2.0527, + "eval_samples_per_second": 77.46, + "eval_steps_per_second": 4.872, + "step": 5355 + }, + { + "epoch": 476.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.2954269051551819, + "eval_runtime": 2.1594, + "eval_samples_per_second": 73.63, + "eval_steps_per_second": 4.631, + "step": 5366 + }, + { + "epoch": 477.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.29593905806541443, + "eval_runtime": 2.1654, + "eval_samples_per_second": 73.426, + "eval_steps_per_second": 4.618, + "step": 5377 + }, + { + "epoch": 478.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.2928108274936676, + "eval_runtime": 2.2433, + "eval_samples_per_second": 70.877, + "eval_steps_per_second": 4.458, + "step": 5388 + }, + { + "epoch": 480.0, + "grad_norm": 1.7406607866287231, + "learning_rate": 1.2882575757575757e-05, + "loss": 0.0171, + "step": 5400 + }, + { + "epoch": 480.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.2977100610733032, + "eval_runtime": 2.0243, + "eval_samples_per_second": 78.544, + "eval_steps_per_second": 4.94, + "step": 5400 + }, + { + "epoch": 480.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.30747535824775696, + "eval_runtime": 2.0298, + "eval_samples_per_second": 78.334, + "eval_steps_per_second": 4.927, + "step": 5411 + }, + { + "epoch": 481.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3572753071784973, + "eval_runtime": 2.0524, + "eval_samples_per_second": 77.47, + "eval_steps_per_second": 4.872, + "step": 5422 + }, + { + "epoch": 482.93, + "eval_accuracy": 0.9182389937106918, + "eval_loss": 0.3878822326660156, + "eval_runtime": 2.0986, + "eval_samples_per_second": 75.766, + "eval_steps_per_second": 4.765, + "step": 5433 + }, + { + "epoch": 484.0, + "eval_accuracy": 0.9119496855345912, + "eval_loss": 0.3886529803276062, + "eval_runtime": 2.078, + "eval_samples_per_second": 76.517, + "eval_steps_per_second": 4.812, + "step": 5445 + }, + { + "epoch": 484.44, + "grad_norm": 0.06283226609230042, + "learning_rate": 1.2693181818181818e-05, + "loss": 0.0166, + "step": 5450 + }, + { + "epoch": 484.98, + "eval_accuracy": 0.9182389937106918, + "eval_loss": 0.3698625862598419, + "eval_runtime": 2.094, + "eval_samples_per_second": 75.932, + "eval_steps_per_second": 4.776, + "step": 5456 + }, + { + "epoch": 485.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.351385235786438, + "eval_runtime": 2.0668, + "eval_samples_per_second": 76.93, + "eval_steps_per_second": 4.838, + "step": 5467 + }, + { + "epoch": 486.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.34395086765289307, + "eval_runtime": 2.12, + "eval_samples_per_second": 74.999, + "eval_steps_per_second": 4.717, + "step": 5478 + }, + { + "epoch": 488.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.31205570697784424, + "eval_runtime": 2.2336, + "eval_samples_per_second": 71.184, + "eval_steps_per_second": 4.477, + "step": 5490 + }, + { + "epoch": 488.89, + "grad_norm": 1.9271873235702515, + "learning_rate": 1.2503787878787879e-05, + "loss": 0.0169, + "step": 5500 + }, + { + "epoch": 488.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3185611069202423, + "eval_runtime": 1.9966, + "eval_samples_per_second": 79.635, + "eval_steps_per_second": 5.008, + "step": 5501 + }, + { + "epoch": 489.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3383605182170868, + "eval_runtime": 2.0874, + "eval_samples_per_second": 76.17, + "eval_steps_per_second": 4.791, + "step": 5512 + }, + { + "epoch": 490.93, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.35870400071144104, + "eval_runtime": 2.2491, + "eval_samples_per_second": 70.694, + "eval_steps_per_second": 4.446, + "step": 5523 + }, + { + "epoch": 492.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3265625238418579, + "eval_runtime": 2.0134, + "eval_samples_per_second": 78.971, + "eval_steps_per_second": 4.967, + "step": 5535 + }, + { + "epoch": 492.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3273981213569641, + "eval_runtime": 2.0972, + "eval_samples_per_second": 75.815, + "eval_steps_per_second": 4.768, + "step": 5546 + }, + { + "epoch": 493.33, + "grad_norm": 0.3140685260295868, + "learning_rate": 1.2314393939393941e-05, + "loss": 0.0162, + "step": 5550 + }, + { + "epoch": 493.96, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.3433980345726013, + "eval_runtime": 4.4757, + "eval_samples_per_second": 35.525, + "eval_steps_per_second": 2.234, + "step": 5557 + }, + { + "epoch": 494.93, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.3295518755912781, + "eval_runtime": 2.0317, + "eval_samples_per_second": 78.259, + "eval_steps_per_second": 4.922, + "step": 5568 + }, + { + "epoch": 496.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.31786414980888367, + "eval_runtime": 2.1435, + "eval_samples_per_second": 74.179, + "eval_steps_per_second": 4.665, + "step": 5580 + }, + { + "epoch": 496.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.32228994369506836, + "eval_runtime": 2.0036, + "eval_samples_per_second": 79.357, + "eval_steps_per_second": 4.991, + "step": 5591 + }, + { + "epoch": 497.78, + "grad_norm": 1.7739616632461548, + "learning_rate": 1.2125e-05, + "loss": 0.0128, + "step": 5600 + }, + { + "epoch": 497.96, + "eval_accuracy": 0.9182389937106918, + "eval_loss": 0.3525673747062683, + "eval_runtime": 2.0848, + "eval_samples_per_second": 76.266, + "eval_steps_per_second": 4.797, + "step": 5602 + }, + { + "epoch": 498.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3345227539539337, + "eval_runtime": 2.0597, + "eval_samples_per_second": 77.194, + "eval_steps_per_second": 4.855, + "step": 5613 + }, + { + "epoch": 500.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3081194758415222, + "eval_runtime": 2.14, + "eval_samples_per_second": 74.297, + "eval_steps_per_second": 4.673, + "step": 5625 + }, + { + "epoch": 500.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3136290907859802, + "eval_runtime": 2.0866, + "eval_samples_per_second": 76.201, + "eval_steps_per_second": 4.793, + "step": 5636 + }, + { + "epoch": 501.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.31603533029556274, + "eval_runtime": 2.272, + "eval_samples_per_second": 69.983, + "eval_steps_per_second": 4.401, + "step": 5647 + }, + { + "epoch": 502.22, + "grad_norm": 0.024508927017450333, + "learning_rate": 1.193560606060606e-05, + "loss": 0.0089, + "step": 5650 + }, + { + "epoch": 502.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3217502236366272, + "eval_runtime": 2.2568, + "eval_samples_per_second": 70.454, + "eval_steps_per_second": 4.431, + "step": 5658 + }, + { + "epoch": 504.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3330020606517792, + "eval_runtime": 2.1528, + "eval_samples_per_second": 73.857, + "eval_steps_per_second": 4.645, + "step": 5670 + }, + { + "epoch": 504.98, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.3610976040363312, + "eval_runtime": 2.1981, + "eval_samples_per_second": 72.335, + "eval_steps_per_second": 4.549, + "step": 5681 + }, + { + "epoch": 505.96, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.3819771111011505, + "eval_runtime": 2.0605, + "eval_samples_per_second": 77.167, + "eval_steps_per_second": 4.853, + "step": 5692 + }, + { + "epoch": 506.67, + "grad_norm": 0.13250546157360077, + "learning_rate": 1.1746212121212121e-05, + "loss": 0.0168, + "step": 5700 + }, + { + "epoch": 506.93, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.3471725881099701, + "eval_runtime": 2.0816, + "eval_samples_per_second": 76.384, + "eval_steps_per_second": 4.804, + "step": 5703 + }, + { + "epoch": 508.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3075188100337982, + "eval_runtime": 2.1057, + "eval_samples_per_second": 75.51, + "eval_steps_per_second": 4.749, + "step": 5715 + }, + { + "epoch": 508.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.30466988682746887, + "eval_runtime": 2.1027, + "eval_samples_per_second": 75.617, + "eval_steps_per_second": 4.756, + "step": 5726 + }, + { + "epoch": 509.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.314418226480484, + "eval_runtime": 2.1578, + "eval_samples_per_second": 73.686, + "eval_steps_per_second": 4.634, + "step": 5737 + }, + { + "epoch": 510.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3144315183162689, + "eval_runtime": 2.2895, + "eval_samples_per_second": 69.447, + "eval_steps_per_second": 4.368, + "step": 5748 + }, + { + "epoch": 511.11, + "grad_norm": 1.371584415435791, + "learning_rate": 1.1556818181818184e-05, + "loss": 0.0143, + "step": 5750 + }, + { + "epoch": 512.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.30977222323417664, + "eval_runtime": 2.0905, + "eval_samples_per_second": 76.059, + "eval_steps_per_second": 4.784, + "step": 5760 + }, + { + "epoch": 512.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.31324854493141174, + "eval_runtime": 2.2018, + "eval_samples_per_second": 72.212, + "eval_steps_per_second": 4.542, + "step": 5771 + }, + { + "epoch": 513.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3324536979198456, + "eval_runtime": 2.096, + "eval_samples_per_second": 75.859, + "eval_steps_per_second": 4.771, + "step": 5782 + }, + { + "epoch": 514.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.32093632221221924, + "eval_runtime": 2.049, + "eval_samples_per_second": 77.599, + "eval_steps_per_second": 4.88, + "step": 5793 + }, + { + "epoch": 515.56, + "grad_norm": 1.4226562976837158, + "learning_rate": 1.1367424242424243e-05, + "loss": 0.014, + "step": 5800 + }, + { + "epoch": 516.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3191947937011719, + "eval_runtime": 2.0898, + "eval_samples_per_second": 76.083, + "eval_steps_per_second": 4.785, + "step": 5805 + }, + { + "epoch": 516.98, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.311814546585083, + "eval_runtime": 2.0315, + "eval_samples_per_second": 78.269, + "eval_steps_per_second": 4.923, + "step": 5816 + }, + { + "epoch": 517.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.31416967511177063, + "eval_runtime": 2.0132, + "eval_samples_per_second": 78.978, + "eval_steps_per_second": 4.967, + "step": 5827 + }, + { + "epoch": 518.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3255424201488495, + "eval_runtime": 2.4361, + "eval_samples_per_second": 65.269, + "eval_steps_per_second": 4.105, + "step": 5838 + }, + { + "epoch": 520.0, + "grad_norm": 0.1621515154838562, + "learning_rate": 1.1178030303030303e-05, + "loss": 0.0111, + "step": 5850 + }, + { + "epoch": 520.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.32208895683288574, + "eval_runtime": 2.0821, + "eval_samples_per_second": 76.364, + "eval_steps_per_second": 4.803, + "step": 5850 + }, + { + "epoch": 520.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3211723566055298, + "eval_runtime": 2.0312, + "eval_samples_per_second": 78.28, + "eval_steps_per_second": 4.923, + "step": 5861 + }, + { + "epoch": 521.96, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.32905757427215576, + "eval_runtime": 2.0294, + "eval_samples_per_second": 78.349, + "eval_steps_per_second": 4.928, + "step": 5872 + }, + { + "epoch": 522.93, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.33144110441207886, + "eval_runtime": 2.032, + "eval_samples_per_second": 78.249, + "eval_steps_per_second": 4.921, + "step": 5883 + }, + { + "epoch": 524.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3268250823020935, + "eval_runtime": 2.0687, + "eval_samples_per_second": 76.859, + "eval_steps_per_second": 4.834, + "step": 5895 + }, + { + "epoch": 524.44, + "grad_norm": 0.008243849501013756, + "learning_rate": 1.0988636363636364e-05, + "loss": 0.0107, + "step": 5900 + }, + { + "epoch": 524.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3351696729660034, + "eval_runtime": 2.155, + "eval_samples_per_second": 73.782, + "eval_steps_per_second": 4.64, + "step": 5906 + }, + { + "epoch": 525.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.34242841601371765, + "eval_runtime": 2.0063, + "eval_samples_per_second": 79.249, + "eval_steps_per_second": 4.984, + "step": 5917 + }, + { + "epoch": 526.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.33888906240463257, + "eval_runtime": 2.2365, + "eval_samples_per_second": 71.093, + "eval_steps_per_second": 4.471, + "step": 5928 + }, + { + "epoch": 528.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3547358810901642, + "eval_runtime": 2.0755, + "eval_samples_per_second": 76.609, + "eval_steps_per_second": 4.818, + "step": 5940 + }, + { + "epoch": 528.89, + "grad_norm": 0.47511938214302063, + "learning_rate": 1.0799242424242423e-05, + "loss": 0.01, + "step": 5950 + }, + { + "epoch": 528.98, + "eval_accuracy": 0.9182389937106918, + "eval_loss": 0.34747716784477234, + "eval_runtime": 2.0823, + "eval_samples_per_second": 76.358, + "eval_steps_per_second": 4.802, + "step": 5951 + }, + { + "epoch": 529.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.35945838689804077, + "eval_runtime": 2.0524, + "eval_samples_per_second": 77.469, + "eval_steps_per_second": 4.872, + "step": 5962 + }, + { + "epoch": 530.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3673442602157593, + "eval_runtime": 2.0276, + "eval_samples_per_second": 78.419, + "eval_steps_per_second": 4.932, + "step": 5973 + }, + { + "epoch": 532.0, + "eval_accuracy": 0.9119496855345912, + "eval_loss": 0.41652363538742065, + "eval_runtime": 2.0573, + "eval_samples_per_second": 77.285, + "eval_steps_per_second": 4.861, + "step": 5985 + }, + { + "epoch": 532.98, + "eval_accuracy": 0.9182389937106918, + "eval_loss": 0.42472416162490845, + "eval_runtime": 2.1003, + "eval_samples_per_second": 75.704, + "eval_steps_per_second": 4.761, + "step": 5996 + }, + { + "epoch": 533.33, + "grad_norm": 0.15851238369941711, + "learning_rate": 1.0609848484848485e-05, + "loss": 0.0126, + "step": 6000 + }, + { + "epoch": 533.96, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.4061521589756012, + "eval_runtime": 2.0889, + "eval_samples_per_second": 76.116, + "eval_steps_per_second": 4.787, + "step": 6007 + }, + { + "epoch": 534.93, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.3752112090587616, + "eval_runtime": 2.0476, + "eval_samples_per_second": 77.651, + "eval_steps_per_second": 4.884, + "step": 6018 + }, + { + "epoch": 536.0, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.35743284225463867, + "eval_runtime": 2.2159, + "eval_samples_per_second": 71.753, + "eval_steps_per_second": 4.513, + "step": 6030 + }, + { + "epoch": 536.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3824201226234436, + "eval_runtime": 2.0455, + "eval_samples_per_second": 77.732, + "eval_steps_per_second": 4.889, + "step": 6041 + }, + { + "epoch": 537.78, + "grad_norm": 0.0922364741563797, + "learning_rate": 1.0420454545454546e-05, + "loss": 0.0126, + "step": 6050 + }, + { + "epoch": 537.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3730430006980896, + "eval_runtime": 2.1192, + "eval_samples_per_second": 75.028, + "eval_steps_per_second": 4.719, + "step": 6052 + }, + { + "epoch": 538.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3703514337539673, + "eval_runtime": 2.2056, + "eval_samples_per_second": 72.091, + "eval_steps_per_second": 4.534, + "step": 6063 + }, + { + "epoch": 540.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.38142630457878113, + "eval_runtime": 2.0818, + "eval_samples_per_second": 76.376, + "eval_steps_per_second": 4.804, + "step": 6075 + }, + { + "epoch": 540.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3648853302001953, + "eval_runtime": 2.2199, + "eval_samples_per_second": 71.625, + "eval_steps_per_second": 4.505, + "step": 6086 + }, + { + "epoch": 541.96, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.3810517489910126, + "eval_runtime": 2.0826, + "eval_samples_per_second": 76.345, + "eval_steps_per_second": 4.802, + "step": 6097 + }, + { + "epoch": 542.22, + "grad_norm": 0.04241061210632324, + "learning_rate": 1.0231060606060607e-05, + "loss": 0.012, + "step": 6100 + }, + { + "epoch": 542.93, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.3544082045555115, + "eval_runtime": 2.08, + "eval_samples_per_second": 76.442, + "eval_steps_per_second": 4.808, + "step": 6108 + }, + { + "epoch": 544.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3614555597305298, + "eval_runtime": 2.2123, + "eval_samples_per_second": 71.871, + "eval_steps_per_second": 4.52, + "step": 6120 + }, + { + "epoch": 544.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.35575661063194275, + "eval_runtime": 2.1324, + "eval_samples_per_second": 74.564, + "eval_steps_per_second": 4.69, + "step": 6131 + }, + { + "epoch": 545.96, + "eval_accuracy": 0.9182389937106918, + "eval_loss": 0.34816914796829224, + "eval_runtime": 2.0819, + "eval_samples_per_second": 76.371, + "eval_steps_per_second": 4.803, + "step": 6142 + }, + { + "epoch": 546.67, + "grad_norm": 0.5738076567649841, + "learning_rate": 1.0041666666666666e-05, + "loss": 0.0135, + "step": 6150 + }, + { + "epoch": 546.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.36677080392837524, + "eval_runtime": 2.1421, + "eval_samples_per_second": 74.226, + "eval_steps_per_second": 4.668, + "step": 6153 + }, + { + "epoch": 548.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.34037116169929504, + "eval_runtime": 2.0657, + "eval_samples_per_second": 76.972, + "eval_steps_per_second": 4.841, + "step": 6165 + }, + { + "epoch": 548.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.33401021361351013, + "eval_runtime": 2.0325, + "eval_samples_per_second": 78.229, + "eval_steps_per_second": 4.92, + "step": 6176 + }, + { + "epoch": 549.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3377488851547241, + "eval_runtime": 2.1646, + "eval_samples_per_second": 73.456, + "eval_steps_per_second": 4.62, + "step": 6187 + }, + { + "epoch": 550.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3406839966773987, + "eval_runtime": 2.1382, + "eval_samples_per_second": 74.36, + "eval_steps_per_second": 4.677, + "step": 6198 + }, + { + "epoch": 551.11, + "grad_norm": 0.34322044253349304, + "learning_rate": 9.852272727272728e-06, + "loss": 0.0101, + "step": 6200 + }, + { + "epoch": 552.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.33890071511268616, + "eval_runtime": 2.0917, + "eval_samples_per_second": 76.015, + "eval_steps_per_second": 4.781, + "step": 6210 + }, + { + "epoch": 552.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.33053550124168396, + "eval_runtime": 2.2779, + "eval_samples_per_second": 69.8, + "eval_steps_per_second": 4.39, + "step": 6221 + }, + { + "epoch": 553.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.31986501812934875, + "eval_runtime": 1.9669, + "eval_samples_per_second": 80.836, + "eval_steps_per_second": 5.084, + "step": 6232 + }, + { + "epoch": 554.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.33377256989479065, + "eval_runtime": 2.0395, + "eval_samples_per_second": 77.96, + "eval_steps_per_second": 4.903, + "step": 6243 + }, + { + "epoch": 555.56, + "grad_norm": 0.1416609138250351, + "learning_rate": 9.662878787878789e-06, + "loss": 0.0175, + "step": 6250 + }, + { + "epoch": 556.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.33228349685668945, + "eval_runtime": 2.1542, + "eval_samples_per_second": 73.811, + "eval_steps_per_second": 4.642, + "step": 6255 + }, + { + "epoch": 556.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.340250164270401, + "eval_runtime": 2.0563, + "eval_samples_per_second": 77.325, + "eval_steps_per_second": 4.863, + "step": 6266 + }, + { + "epoch": 557.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.34735485911369324, + "eval_runtime": 2.0285, + "eval_samples_per_second": 78.384, + "eval_steps_per_second": 4.93, + "step": 6277 + }, + { + "epoch": 558.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.34990042448043823, + "eval_runtime": 2.1967, + "eval_samples_per_second": 72.38, + "eval_steps_per_second": 4.552, + "step": 6288 + }, + { + "epoch": 560.0, + "grad_norm": 0.09764547646045685, + "learning_rate": 9.473484848484848e-06, + "loss": 0.0108, + "step": 6300 + }, + { + "epoch": 560.0, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.342894971370697, + "eval_runtime": 2.026, + "eval_samples_per_second": 78.479, + "eval_steps_per_second": 4.936, + "step": 6300 + }, + { + "epoch": 560.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3396158218383789, + "eval_runtime": 2.3052, + "eval_samples_per_second": 68.976, + "eval_steps_per_second": 4.338, + "step": 6311 + }, + { + "epoch": 561.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3467164933681488, + "eval_runtime": 2.0425, + "eval_samples_per_second": 77.846, + "eval_steps_per_second": 4.896, + "step": 6322 + }, + { + "epoch": 562.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3349219858646393, + "eval_runtime": 2.0651, + "eval_samples_per_second": 76.992, + "eval_steps_per_second": 4.842, + "step": 6333 + }, + { + "epoch": 564.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3380991518497467, + "eval_runtime": 2.111, + "eval_samples_per_second": 75.32, + "eval_steps_per_second": 4.737, + "step": 6345 + }, + { + "epoch": 564.44, + "grad_norm": 0.021107789129018784, + "learning_rate": 9.284090909090908e-06, + "loss": 0.0139, + "step": 6350 + }, + { + "epoch": 564.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.32741737365722656, + "eval_runtime": 2.1143, + "eval_samples_per_second": 75.203, + "eval_steps_per_second": 4.73, + "step": 6356 + }, + { + "epoch": 565.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3318650722503662, + "eval_runtime": 1.9953, + "eval_samples_per_second": 79.688, + "eval_steps_per_second": 5.012, + "step": 6367 + }, + { + "epoch": 566.93, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.33214500546455383, + "eval_runtime": 2.0923, + "eval_samples_per_second": 75.992, + "eval_steps_per_second": 4.779, + "step": 6378 + }, + { + "epoch": 568.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3546938896179199, + "eval_runtime": 2.1191, + "eval_samples_per_second": 75.033, + "eval_steps_per_second": 4.719, + "step": 6390 + }, + { + "epoch": 568.89, + "grad_norm": 1.3278522491455078, + "learning_rate": 9.09469696969697e-06, + "loss": 0.0138, + "step": 6400 + }, + { + "epoch": 568.98, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.366202175617218, + "eval_runtime": 2.0849, + "eval_samples_per_second": 76.261, + "eval_steps_per_second": 4.796, + "step": 6401 + }, + { + "epoch": 569.96, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.34554189443588257, + "eval_runtime": 2.2433, + "eval_samples_per_second": 70.878, + "eval_steps_per_second": 4.458, + "step": 6412 + }, + { + "epoch": 570.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3477872908115387, + "eval_runtime": 2.0921, + "eval_samples_per_second": 76.0, + "eval_steps_per_second": 4.78, + "step": 6423 + }, + { + "epoch": 572.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3400007486343384, + "eval_runtime": 2.0746, + "eval_samples_per_second": 76.641, + "eval_steps_per_second": 4.82, + "step": 6435 + }, + { + "epoch": 572.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3512841463088989, + "eval_runtime": 2.0975, + "eval_samples_per_second": 75.803, + "eval_steps_per_second": 4.767, + "step": 6446 + }, + { + "epoch": 573.33, + "grad_norm": 0.1855485886335373, + "learning_rate": 8.905303030303031e-06, + "loss": 0.0095, + "step": 6450 + }, + { + "epoch": 573.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3461546301841736, + "eval_runtime": 2.067, + "eval_samples_per_second": 76.921, + "eval_steps_per_second": 4.838, + "step": 6457 + }, + { + "epoch": 574.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.33488187193870544, + "eval_runtime": 2.0691, + "eval_samples_per_second": 76.846, + "eval_steps_per_second": 4.833, + "step": 6468 + }, + { + "epoch": 576.0, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.337620347738266, + "eval_runtime": 2.018, + "eval_samples_per_second": 78.793, + "eval_steps_per_second": 4.956, + "step": 6480 + }, + { + "epoch": 576.98, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.33732709288597107, + "eval_runtime": 2.0922, + "eval_samples_per_second": 75.996, + "eval_steps_per_second": 4.78, + "step": 6491 + }, + { + "epoch": 577.78, + "grad_norm": 0.9204933643341064, + "learning_rate": 8.71590909090909e-06, + "loss": 0.0138, + "step": 6500 + }, + { + "epoch": 577.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3310604989528656, + "eval_runtime": 2.1334, + "eval_samples_per_second": 74.528, + "eval_steps_per_second": 4.687, + "step": 6502 + }, + { + "epoch": 578.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.33120694756507874, + "eval_runtime": 2.1395, + "eval_samples_per_second": 74.316, + "eval_steps_per_second": 4.674, + "step": 6513 + }, + { + "epoch": 580.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3291258215904236, + "eval_runtime": 2.1193, + "eval_samples_per_second": 75.024, + "eval_steps_per_second": 4.719, + "step": 6525 + }, + { + "epoch": 580.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3441867232322693, + "eval_runtime": 2.081, + "eval_samples_per_second": 76.405, + "eval_steps_per_second": 4.805, + "step": 6536 + }, + { + "epoch": 581.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3806348145008087, + "eval_runtime": 2.08, + "eval_samples_per_second": 76.443, + "eval_steps_per_second": 4.808, + "step": 6547 + }, + { + "epoch": 582.22, + "grad_norm": 1.3162257671356201, + "learning_rate": 8.526515151515151e-06, + "loss": 0.0163, + "step": 6550 + }, + { + "epoch": 582.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.39340561628341675, + "eval_runtime": 2.0419, + "eval_samples_per_second": 77.868, + "eval_steps_per_second": 4.897, + "step": 6558 + }, + { + "epoch": 584.0, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.3990216851234436, + "eval_runtime": 2.049, + "eval_samples_per_second": 77.599, + "eval_steps_per_second": 4.88, + "step": 6570 + }, + { + "epoch": 584.98, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.353302925825119, + "eval_runtime": 2.1595, + "eval_samples_per_second": 73.629, + "eval_steps_per_second": 4.631, + "step": 6581 + }, + { + "epoch": 585.96, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.34103333950042725, + "eval_runtime": 2.2099, + "eval_samples_per_second": 71.948, + "eval_steps_per_second": 4.525, + "step": 6592 + }, + { + "epoch": 586.67, + "grad_norm": 0.35993504524230957, + "learning_rate": 8.337121212121213e-06, + "loss": 0.0152, + "step": 6600 + }, + { + "epoch": 586.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3351433575153351, + "eval_runtime": 2.2699, + "eval_samples_per_second": 70.046, + "eval_steps_per_second": 4.405, + "step": 6603 + }, + { + "epoch": 588.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3369242250919342, + "eval_runtime": 2.117, + "eval_samples_per_second": 75.106, + "eval_steps_per_second": 4.724, + "step": 6615 + }, + { + "epoch": 588.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.35417425632476807, + "eval_runtime": 2.144, + "eval_samples_per_second": 74.161, + "eval_steps_per_second": 4.664, + "step": 6626 + }, + { + "epoch": 589.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3728938102722168, + "eval_runtime": 2.0531, + "eval_samples_per_second": 77.443, + "eval_steps_per_second": 4.871, + "step": 6637 + }, + { + "epoch": 590.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.34069618582725525, + "eval_runtime": 2.1327, + "eval_samples_per_second": 74.555, + "eval_steps_per_second": 4.689, + "step": 6648 + }, + { + "epoch": 591.11, + "grad_norm": 0.19336657226085663, + "learning_rate": 8.147727272727274e-06, + "loss": 0.017, + "step": 6650 + }, + { + "epoch": 592.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3440462052822113, + "eval_runtime": 2.0686, + "eval_samples_per_second": 76.865, + "eval_steps_per_second": 4.834, + "step": 6660 + }, + { + "epoch": 592.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3493140935897827, + "eval_runtime": 2.0648, + "eval_samples_per_second": 77.004, + "eval_steps_per_second": 4.843, + "step": 6671 + }, + { + "epoch": 593.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.37120524048805237, + "eval_runtime": 2.2033, + "eval_samples_per_second": 72.165, + "eval_steps_per_second": 4.539, + "step": 6682 + }, + { + "epoch": 594.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.36460721492767334, + "eval_runtime": 2.2563, + "eval_samples_per_second": 70.47, + "eval_steps_per_second": 4.432, + "step": 6693 + }, + { + "epoch": 595.56, + "grad_norm": 0.017406007274985313, + "learning_rate": 7.958333333333333e-06, + "loss": 0.0113, + "step": 6700 + }, + { + "epoch": 596.0, + "eval_accuracy": 0.9182389937106918, + "eval_loss": 0.36630791425704956, + "eval_runtime": 2.0788, + "eval_samples_per_second": 76.486, + "eval_steps_per_second": 4.81, + "step": 6705 + }, + { + "epoch": 596.98, + "eval_accuracy": 0.9245283018867925, + "eval_loss": 0.3725621700286865, + "eval_runtime": 2.226, + "eval_samples_per_second": 71.429, + "eval_steps_per_second": 4.492, + "step": 6716 + }, + { + "epoch": 597.96, + "eval_accuracy": 0.9182389937106918, + "eval_loss": 0.35295018553733826, + "eval_runtime": 2.16, + "eval_samples_per_second": 73.611, + "eval_steps_per_second": 4.63, + "step": 6727 + }, + { + "epoch": 598.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3451589047908783, + "eval_runtime": 2.0598, + "eval_samples_per_second": 77.193, + "eval_steps_per_second": 4.855, + "step": 6738 + }, + { + "epoch": 600.0, + "grad_norm": 0.1029694527387619, + "learning_rate": 7.768939393939394e-06, + "loss": 0.0115, + "step": 6750 + }, + { + "epoch": 600.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3340095281600952, + "eval_runtime": 2.1945, + "eval_samples_per_second": 72.455, + "eval_steps_per_second": 4.557, + "step": 6750 + }, + { + "epoch": 600.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.34892547130584717, + "eval_runtime": 2.1247, + "eval_samples_per_second": 74.836, + "eval_steps_per_second": 4.707, + "step": 6761 + }, + { + "epoch": 601.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3408372402191162, + "eval_runtime": 2.1827, + "eval_samples_per_second": 72.846, + "eval_steps_per_second": 4.582, + "step": 6772 + }, + { + "epoch": 602.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3423627018928528, + "eval_runtime": 2.2182, + "eval_samples_per_second": 71.68, + "eval_steps_per_second": 4.508, + "step": 6783 + }, + { + "epoch": 604.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.34804755449295044, + "eval_runtime": 2.1754, + "eval_samples_per_second": 73.091, + "eval_steps_per_second": 4.597, + "step": 6795 + }, + { + "epoch": 604.44, + "grad_norm": 0.7808576822280884, + "learning_rate": 7.579545454545454e-06, + "loss": 0.0132, + "step": 6800 + }, + { + "epoch": 604.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.34386932849884033, + "eval_runtime": 2.0311, + "eval_samples_per_second": 78.283, + "eval_steps_per_second": 4.923, + "step": 6806 + }, + { + "epoch": 605.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3530921936035156, + "eval_runtime": 2.102, + "eval_samples_per_second": 75.641, + "eval_steps_per_second": 4.757, + "step": 6817 + }, + { + "epoch": 606.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3807942271232605, + "eval_runtime": 2.1101, + "eval_samples_per_second": 75.351, + "eval_steps_per_second": 4.739, + "step": 6828 + }, + { + "epoch": 608.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3441016674041748, + "eval_runtime": 2.1163, + "eval_samples_per_second": 75.133, + "eval_steps_per_second": 4.725, + "step": 6840 + }, + { + "epoch": 608.89, + "grad_norm": 0.31322968006134033, + "learning_rate": 7.390151515151515e-06, + "loss": 0.014, + "step": 6850 + }, + { + "epoch": 608.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3534349203109741, + "eval_runtime": 2.0731, + "eval_samples_per_second": 76.696, + "eval_steps_per_second": 4.824, + "step": 6851 + }, + { + "epoch": 609.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3583095371723175, + "eval_runtime": 2.1365, + "eval_samples_per_second": 74.419, + "eval_steps_per_second": 4.68, + "step": 6862 + }, + { + "epoch": 610.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3640231490135193, + "eval_runtime": 2.3226, + "eval_samples_per_second": 68.457, + "eval_steps_per_second": 4.305, + "step": 6873 + }, + { + "epoch": 612.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3587685227394104, + "eval_runtime": 2.0532, + "eval_samples_per_second": 77.44, + "eval_steps_per_second": 4.87, + "step": 6885 + }, + { + "epoch": 612.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3662501275539398, + "eval_runtime": 2.1672, + "eval_samples_per_second": 73.368, + "eval_steps_per_second": 4.614, + "step": 6896 + }, + { + "epoch": 613.33, + "grad_norm": 1.508801817893982, + "learning_rate": 7.200757575757576e-06, + "loss": 0.0089, + "step": 6900 + }, + { + "epoch": 613.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3788923919200897, + "eval_runtime": 2.0361, + "eval_samples_per_second": 78.092, + "eval_steps_per_second": 4.911, + "step": 6907 + }, + { + "epoch": 614.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.378842294216156, + "eval_runtime": 2.0538, + "eval_samples_per_second": 77.417, + "eval_steps_per_second": 4.869, + "step": 6918 + }, + { + "epoch": 616.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3528358042240143, + "eval_runtime": 2.0973, + "eval_samples_per_second": 75.811, + "eval_steps_per_second": 4.768, + "step": 6930 + }, + { + "epoch": 616.98, + "eval_accuracy": 0.9182389937106918, + "eval_loss": 0.3626009523868561, + "eval_runtime": 2.1285, + "eval_samples_per_second": 74.701, + "eval_steps_per_second": 4.698, + "step": 6941 + }, + { + "epoch": 617.78, + "grad_norm": 0.027906352654099464, + "learning_rate": 7.0113636363636365e-06, + "loss": 0.0135, + "step": 6950 + }, + { + "epoch": 617.96, + "eval_accuracy": 0.9182389937106918, + "eval_loss": 0.3760795593261719, + "eval_runtime": 2.0573, + "eval_samples_per_second": 77.285, + "eval_steps_per_second": 4.861, + "step": 6952 + }, + { + "epoch": 618.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3911431133747101, + "eval_runtime": 2.3187, + "eval_samples_per_second": 68.573, + "eval_steps_per_second": 4.313, + "step": 6963 + }, + { + "epoch": 620.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3900914192199707, + "eval_runtime": 2.1186, + "eval_samples_per_second": 75.049, + "eval_steps_per_second": 4.72, + "step": 6975 + }, + { + "epoch": 620.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.4003194272518158, + "eval_runtime": 2.1007, + "eval_samples_per_second": 75.689, + "eval_steps_per_second": 4.76, + "step": 6986 + }, + { + "epoch": 621.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.36526620388031006, + "eval_runtime": 2.1753, + "eval_samples_per_second": 73.093, + "eval_steps_per_second": 4.597, + "step": 6997 + }, + { + "epoch": 622.22, + "grad_norm": 0.05157339572906494, + "learning_rate": 6.821969696969697e-06, + "loss": 0.0071, + "step": 7000 + }, + { + "epoch": 622.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.33499374985694885, + "eval_runtime": 2.11, + "eval_samples_per_second": 75.356, + "eval_steps_per_second": 4.739, + "step": 7008 + }, + { + "epoch": 624.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3353654444217682, + "eval_runtime": 2.0902, + "eval_samples_per_second": 76.069, + "eval_steps_per_second": 4.784, + "step": 7020 + }, + { + "epoch": 624.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.37156394124031067, + "eval_runtime": 2.0375, + "eval_samples_per_second": 78.038, + "eval_steps_per_second": 4.908, + "step": 7031 + }, + { + "epoch": 625.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3520486354827881, + "eval_runtime": 2.0907, + "eval_samples_per_second": 76.051, + "eval_steps_per_second": 4.783, + "step": 7042 + }, + { + "epoch": 626.67, + "grad_norm": 0.5914948582649231, + "learning_rate": 6.632575757575758e-06, + "loss": 0.0129, + "step": 7050 + }, + { + "epoch": 626.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3306971490383148, + "eval_runtime": 2.0739, + "eval_samples_per_second": 76.667, + "eval_steps_per_second": 4.822, + "step": 7053 + }, + { + "epoch": 628.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.33053889870643616, + "eval_runtime": 2.2479, + "eval_samples_per_second": 70.731, + "eval_steps_per_second": 4.449, + "step": 7065 + }, + { + "epoch": 628.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3301643431186676, + "eval_runtime": 1.9998, + "eval_samples_per_second": 79.509, + "eval_steps_per_second": 5.001, + "step": 7076 + }, + { + "epoch": 629.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3290785253047943, + "eval_runtime": 2.0356, + "eval_samples_per_second": 78.11, + "eval_steps_per_second": 4.913, + "step": 7087 + }, + { + "epoch": 630.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3329908847808838, + "eval_runtime": 2.0116, + "eval_samples_per_second": 79.04, + "eval_steps_per_second": 4.971, + "step": 7098 + }, + { + "epoch": 631.11, + "grad_norm": 1.9344037771224976, + "learning_rate": 6.4431818181818185e-06, + "loss": 0.0091, + "step": 7100 + }, + { + "epoch": 632.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3331502079963684, + "eval_runtime": 2.1322, + "eval_samples_per_second": 74.572, + "eval_steps_per_second": 4.69, + "step": 7110 + }, + { + "epoch": 632.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.33215317130088806, + "eval_runtime": 2.0089, + "eval_samples_per_second": 79.146, + "eval_steps_per_second": 4.978, + "step": 7121 + }, + { + "epoch": 633.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3437711000442505, + "eval_runtime": 2.1614, + "eval_samples_per_second": 73.562, + "eval_steps_per_second": 4.627, + "step": 7132 + }, + { + "epoch": 634.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.36110153794288635, + "eval_runtime": 2.1038, + "eval_samples_per_second": 75.577, + "eval_steps_per_second": 4.753, + "step": 7143 + }, + { + "epoch": 635.56, + "grad_norm": 0.008998346514999866, + "learning_rate": 6.253787878787879e-06, + "loss": 0.0107, + "step": 7150 + }, + { + "epoch": 636.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.34894272685050964, + "eval_runtime": 2.1178, + "eval_samples_per_second": 75.077, + "eval_steps_per_second": 4.722, + "step": 7155 + }, + { + "epoch": 636.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3357524573802948, + "eval_runtime": 2.1256, + "eval_samples_per_second": 74.803, + "eval_steps_per_second": 4.705, + "step": 7166 + }, + { + "epoch": 637.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3372538983821869, + "eval_runtime": 2.0938, + "eval_samples_per_second": 75.939, + "eval_steps_per_second": 4.776, + "step": 7177 + }, + { + "epoch": 638.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3444075584411621, + "eval_runtime": 2.1377, + "eval_samples_per_second": 74.379, + "eval_steps_per_second": 4.678, + "step": 7188 + }, + { + "epoch": 640.0, + "grad_norm": 0.753413736820221, + "learning_rate": 6.06439393939394e-06, + "loss": 0.0125, + "step": 7200 + }, + { + "epoch": 640.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.36328038573265076, + "eval_runtime": 2.0555, + "eval_samples_per_second": 77.354, + "eval_steps_per_second": 4.865, + "step": 7200 + }, + { + "epoch": 640.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3562980592250824, + "eval_runtime": 2.0343, + "eval_samples_per_second": 78.159, + "eval_steps_per_second": 4.916, + "step": 7211 + }, + { + "epoch": 641.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.35727426409721375, + "eval_runtime": 2.0513, + "eval_samples_per_second": 77.513, + "eval_steps_per_second": 4.875, + "step": 7222 + }, + { + "epoch": 642.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3534907400608063, + "eval_runtime": 2.109, + "eval_samples_per_second": 75.393, + "eval_steps_per_second": 4.742, + "step": 7233 + }, + { + "epoch": 644.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.34685295820236206, + "eval_runtime": 2.1171, + "eval_samples_per_second": 75.104, + "eval_steps_per_second": 4.724, + "step": 7245 + }, + { + "epoch": 644.44, + "grad_norm": 0.040267378091812134, + "learning_rate": 5.8750000000000005e-06, + "loss": 0.0071, + "step": 7250 + }, + { + "epoch": 644.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.34481677412986755, + "eval_runtime": 2.2433, + "eval_samples_per_second": 70.878, + "eval_steps_per_second": 4.458, + "step": 7256 + }, + { + "epoch": 645.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3445126414299011, + "eval_runtime": 2.09, + "eval_samples_per_second": 76.075, + "eval_steps_per_second": 4.785, + "step": 7267 + }, + { + "epoch": 646.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3418070077896118, + "eval_runtime": 2.1179, + "eval_samples_per_second": 75.074, + "eval_steps_per_second": 4.722, + "step": 7278 + }, + { + "epoch": 648.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3541422188282013, + "eval_runtime": 2.0491, + "eval_samples_per_second": 77.596, + "eval_steps_per_second": 4.88, + "step": 7290 + }, + { + "epoch": 648.89, + "grad_norm": 0.02006547898054123, + "learning_rate": 5.685606060606061e-06, + "loss": 0.0076, + "step": 7300 + }, + { + "epoch": 648.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.34063196182250977, + "eval_runtime": 2.1334, + "eval_samples_per_second": 74.528, + "eval_steps_per_second": 4.687, + "step": 7301 + }, + { + "epoch": 649.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3326892852783203, + "eval_runtime": 2.0215, + "eval_samples_per_second": 78.656, + "eval_steps_per_second": 4.947, + "step": 7312 + }, + { + "epoch": 650.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3381519019603729, + "eval_runtime": 2.1234, + "eval_samples_per_second": 74.878, + "eval_steps_per_second": 4.709, + "step": 7323 + }, + { + "epoch": 652.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3574288785457611, + "eval_runtime": 2.2212, + "eval_samples_per_second": 71.583, + "eval_steps_per_second": 4.502, + "step": 7335 + }, + { + "epoch": 652.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3462476134300232, + "eval_runtime": 2.3846, + "eval_samples_per_second": 66.678, + "eval_steps_per_second": 4.194, + "step": 7346 + }, + { + "epoch": 653.33, + "grad_norm": 0.1632642298936844, + "learning_rate": 5.5e-06, + "loss": 0.0131, + "step": 7350 + }, + { + "epoch": 653.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.33882516622543335, + "eval_runtime": 2.0171, + "eval_samples_per_second": 78.826, + "eval_steps_per_second": 4.958, + "step": 7357 + }, + { + "epoch": 654.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.337929904460907, + "eval_runtime": 2.1283, + "eval_samples_per_second": 74.708, + "eval_steps_per_second": 4.699, + "step": 7368 + }, + { + "epoch": 656.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3396049737930298, + "eval_runtime": 2.0868, + "eval_samples_per_second": 76.193, + "eval_steps_per_second": 4.792, + "step": 7380 + }, + { + "epoch": 656.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3436720371246338, + "eval_runtime": 2.0283, + "eval_samples_per_second": 78.391, + "eval_steps_per_second": 4.93, + "step": 7391 + }, + { + "epoch": 657.78, + "grad_norm": 1.5342937707901, + "learning_rate": 5.3106060606060605e-06, + "loss": 0.0086, + "step": 7400 + }, + { + "epoch": 657.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3466395139694214, + "eval_runtime": 2.1077, + "eval_samples_per_second": 75.438, + "eval_steps_per_second": 4.745, + "step": 7402 + }, + { + "epoch": 658.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3453463315963745, + "eval_runtime": 2.0776, + "eval_samples_per_second": 76.532, + "eval_steps_per_second": 4.813, + "step": 7413 + }, + { + "epoch": 660.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3420422077178955, + "eval_runtime": 2.0546, + "eval_samples_per_second": 77.386, + "eval_steps_per_second": 4.867, + "step": 7425 + }, + { + "epoch": 660.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.33706873655319214, + "eval_runtime": 2.1267, + "eval_samples_per_second": 74.764, + "eval_steps_per_second": 4.702, + "step": 7436 + }, + { + "epoch": 661.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.34426021575927734, + "eval_runtime": 2.0904, + "eval_samples_per_second": 76.061, + "eval_steps_per_second": 4.784, + "step": 7447 + }, + { + "epoch": 662.22, + "grad_norm": 0.16996954381465912, + "learning_rate": 5.121212121212121e-06, + "loss": 0.0123, + "step": 7450 + }, + { + "epoch": 662.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3473140299320221, + "eval_runtime": 2.0509, + "eval_samples_per_second": 77.526, + "eval_steps_per_second": 4.876, + "step": 7458 + }, + { + "epoch": 664.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3424939215183258, + "eval_runtime": 2.0641, + "eval_samples_per_second": 77.031, + "eval_steps_per_second": 4.845, + "step": 7470 + }, + { + "epoch": 664.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.345442533493042, + "eval_runtime": 2.0612, + "eval_samples_per_second": 77.138, + "eval_steps_per_second": 4.851, + "step": 7481 + }, + { + "epoch": 665.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3493753969669342, + "eval_runtime": 1.9848, + "eval_samples_per_second": 80.108, + "eval_steps_per_second": 5.038, + "step": 7492 + }, + { + "epoch": 666.67, + "grad_norm": 0.08370883017778397, + "learning_rate": 4.931818181818182e-06, + "loss": 0.0083, + "step": 7500 + }, + { + "epoch": 666.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.35356974601745605, + "eval_runtime": 2.1097, + "eval_samples_per_second": 75.368, + "eval_steps_per_second": 4.74, + "step": 7503 + }, + { + "epoch": 668.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.34760990738868713, + "eval_runtime": 2.1147, + "eval_samples_per_second": 75.188, + "eval_steps_per_second": 4.729, + "step": 7515 + }, + { + "epoch": 668.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.34870967268943787, + "eval_runtime": 2.0331, + "eval_samples_per_second": 78.206, + "eval_steps_per_second": 4.919, + "step": 7526 + }, + { + "epoch": 669.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.35328802466392517, + "eval_runtime": 2.4514, + "eval_samples_per_second": 64.861, + "eval_steps_per_second": 4.079, + "step": 7537 + }, + { + "epoch": 670.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.35539668798446655, + "eval_runtime": 2.1199, + "eval_samples_per_second": 75.003, + "eval_steps_per_second": 4.717, + "step": 7548 + }, + { + "epoch": 671.11, + "grad_norm": 2.100541353225708, + "learning_rate": 4.7424242424242426e-06, + "loss": 0.0079, + "step": 7550 + }, + { + "epoch": 672.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3482361435890198, + "eval_runtime": 2.1456, + "eval_samples_per_second": 74.104, + "eval_steps_per_second": 4.661, + "step": 7560 + }, + { + "epoch": 672.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.34814804792404175, + "eval_runtime": 2.0856, + "eval_samples_per_second": 76.239, + "eval_steps_per_second": 4.795, + "step": 7571 + }, + { + "epoch": 673.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.344621866941452, + "eval_runtime": 2.2762, + "eval_samples_per_second": 69.852, + "eval_steps_per_second": 4.393, + "step": 7582 + }, + { + "epoch": 674.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3432255983352661, + "eval_runtime": 2.0507, + "eval_samples_per_second": 77.533, + "eval_steps_per_second": 4.876, + "step": 7593 + }, + { + "epoch": 675.56, + "grad_norm": 0.598809003829956, + "learning_rate": 4.553030303030303e-06, + "loss": 0.0111, + "step": 7600 + }, + { + "epoch": 676.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.34696489572525024, + "eval_runtime": 2.1457, + "eval_samples_per_second": 74.102, + "eval_steps_per_second": 4.66, + "step": 7605 + }, + { + "epoch": 676.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.33925533294677734, + "eval_runtime": 2.134, + "eval_samples_per_second": 74.507, + "eval_steps_per_second": 4.686, + "step": 7616 + }, + { + "epoch": 677.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3386417627334595, + "eval_runtime": 2.0634, + "eval_samples_per_second": 77.059, + "eval_steps_per_second": 4.846, + "step": 7627 + }, + { + "epoch": 678.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3310278058052063, + "eval_runtime": 2.0308, + "eval_samples_per_second": 78.293, + "eval_steps_per_second": 4.924, + "step": 7638 + }, + { + "epoch": 680.0, + "grad_norm": 0.0734761655330658, + "learning_rate": 4.363636363636364e-06, + "loss": 0.0107, + "step": 7650 + }, + { + "epoch": 680.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.329887717962265, + "eval_runtime": 2.214, + "eval_samples_per_second": 71.816, + "eval_steps_per_second": 4.517, + "step": 7650 + }, + { + "epoch": 680.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.33161696791648865, + "eval_runtime": 2.0168, + "eval_samples_per_second": 78.839, + "eval_steps_per_second": 4.958, + "step": 7661 + }, + { + "epoch": 681.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.33317527174949646, + "eval_runtime": 2.1533, + "eval_samples_per_second": 73.84, + "eval_steps_per_second": 4.644, + "step": 7672 + }, + { + "epoch": 682.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3443678021430969, + "eval_runtime": 2.1824, + "eval_samples_per_second": 72.855, + "eval_steps_per_second": 4.582, + "step": 7683 + }, + { + "epoch": 684.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3444632291793823, + "eval_runtime": 1.9773, + "eval_samples_per_second": 80.414, + "eval_steps_per_second": 5.058, + "step": 7695 + }, + { + "epoch": 684.44, + "grad_norm": 1.5188406705856323, + "learning_rate": 4.1742424242424246e-06, + "loss": 0.0091, + "step": 7700 + }, + { + "epoch": 684.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3443754017353058, + "eval_runtime": 2.0477, + "eval_samples_per_second": 77.647, + "eval_steps_per_second": 4.883, + "step": 7706 + }, + { + "epoch": 685.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.34085437655448914, + "eval_runtime": 2.2252, + "eval_samples_per_second": 71.453, + "eval_steps_per_second": 4.494, + "step": 7717 + }, + { + "epoch": 686.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.34413453936576843, + "eval_runtime": 2.1451, + "eval_samples_per_second": 74.121, + "eval_steps_per_second": 4.662, + "step": 7728 + }, + { + "epoch": 688.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.35173678398132324, + "eval_runtime": 2.0413, + "eval_samples_per_second": 77.89, + "eval_steps_per_second": 4.899, + "step": 7740 + }, + { + "epoch": 688.89, + "grad_norm": 1.0382517576217651, + "learning_rate": 3.984848484848484e-06, + "loss": 0.0081, + "step": 7750 + }, + { + "epoch": 688.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3521307110786438, + "eval_runtime": 2.0937, + "eval_samples_per_second": 75.942, + "eval_steps_per_second": 4.776, + "step": 7751 + }, + { + "epoch": 689.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.350664883852005, + "eval_runtime": 2.1003, + "eval_samples_per_second": 75.703, + "eval_steps_per_second": 4.761, + "step": 7762 + }, + { + "epoch": 690.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3460524082183838, + "eval_runtime": 2.0791, + "eval_samples_per_second": 76.475, + "eval_steps_per_second": 4.81, + "step": 7773 + }, + { + "epoch": 692.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.349832683801651, + "eval_runtime": 2.0457, + "eval_samples_per_second": 77.724, + "eval_steps_per_second": 4.888, + "step": 7785 + }, + { + "epoch": 692.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.35444310307502747, + "eval_runtime": 2.1547, + "eval_samples_per_second": 73.793, + "eval_steps_per_second": 4.641, + "step": 7796 + }, + { + "epoch": 693.33, + "grad_norm": 0.36742502450942993, + "learning_rate": 3.795454545454546e-06, + "loss": 0.009, + "step": 7800 + }, + { + "epoch": 693.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.35569891333580017, + "eval_runtime": 2.0236, + "eval_samples_per_second": 78.575, + "eval_steps_per_second": 4.942, + "step": 7807 + }, + { + "epoch": 694.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.35327550768852234, + "eval_runtime": 2.057, + "eval_samples_per_second": 77.297, + "eval_steps_per_second": 4.861, + "step": 7818 + }, + { + "epoch": 696.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3559163212776184, + "eval_runtime": 2.203, + "eval_samples_per_second": 72.173, + "eval_steps_per_second": 4.539, + "step": 7830 + }, + { + "epoch": 696.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.35951152443885803, + "eval_runtime": 2.0835, + "eval_samples_per_second": 76.315, + "eval_steps_per_second": 4.8, + "step": 7841 + }, + { + "epoch": 697.78, + "grad_norm": 0.10021142661571503, + "learning_rate": 3.606060606060606e-06, + "loss": 0.0078, + "step": 7850 + }, + { + "epoch": 697.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3617425560951233, + "eval_runtime": 2.0937, + "eval_samples_per_second": 75.941, + "eval_steps_per_second": 4.776, + "step": 7852 + }, + { + "epoch": 698.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3614467978477478, + "eval_runtime": 2.2589, + "eval_samples_per_second": 70.389, + "eval_steps_per_second": 4.427, + "step": 7863 + }, + { + "epoch": 700.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.34519079327583313, + "eval_runtime": 2.046, + "eval_samples_per_second": 77.712, + "eval_steps_per_second": 4.888, + "step": 7875 + }, + { + "epoch": 700.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.34312644600868225, + "eval_runtime": 2.143, + "eval_samples_per_second": 74.196, + "eval_steps_per_second": 4.666, + "step": 7886 + }, + { + "epoch": 701.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.34687530994415283, + "eval_runtime": 2.1317, + "eval_samples_per_second": 74.59, + "eval_steps_per_second": 4.691, + "step": 7897 + }, + { + "epoch": 702.22, + "grad_norm": 0.013305970467627048, + "learning_rate": 3.416666666666667e-06, + "loss": 0.0102, + "step": 7900 + }, + { + "epoch": 702.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3564489483833313, + "eval_runtime": 2.0468, + "eval_samples_per_second": 77.682, + "eval_steps_per_second": 4.886, + "step": 7908 + }, + { + "epoch": 704.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.35935157537460327, + "eval_runtime": 2.0233, + "eval_samples_per_second": 78.584, + "eval_steps_per_second": 4.942, + "step": 7920 + }, + { + "epoch": 704.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3517804443836212, + "eval_runtime": 2.2107, + "eval_samples_per_second": 71.924, + "eval_steps_per_second": 4.524, + "step": 7931 + }, + { + "epoch": 705.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3444287180900574, + "eval_runtime": 2.0396, + "eval_samples_per_second": 77.958, + "eval_steps_per_second": 4.903, + "step": 7942 + }, + { + "epoch": 706.67, + "grad_norm": 1.1949517726898193, + "learning_rate": 3.2272727272727275e-06, + "loss": 0.008, + "step": 7950 + }, + { + "epoch": 706.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.34264177083969116, + "eval_runtime": 2.0811, + "eval_samples_per_second": 76.402, + "eval_steps_per_second": 4.805, + "step": 7953 + }, + { + "epoch": 708.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.34593525528907776, + "eval_runtime": 2.1049, + "eval_samples_per_second": 75.537, + "eval_steps_per_second": 4.751, + "step": 7965 + }, + { + "epoch": 708.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3511156439781189, + "eval_runtime": 2.0385, + "eval_samples_per_second": 77.999, + "eval_steps_per_second": 4.906, + "step": 7976 + }, + { + "epoch": 709.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.35437288880348206, + "eval_runtime": 2.0421, + "eval_samples_per_second": 77.862, + "eval_steps_per_second": 4.897, + "step": 7987 + }, + { + "epoch": 710.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3566732704639435, + "eval_runtime": 2.2624, + "eval_samples_per_second": 70.28, + "eval_steps_per_second": 4.42, + "step": 7998 + }, + { + "epoch": 711.11, + "grad_norm": 0.8354963660240173, + "learning_rate": 3.0378787878787878e-06, + "loss": 0.0053, + "step": 8000 + }, + { + "epoch": 712.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3673837184906006, + "eval_runtime": 2.0161, + "eval_samples_per_second": 78.866, + "eval_steps_per_second": 4.96, + "step": 8010 + }, + { + "epoch": 712.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3630300760269165, + "eval_runtime": 2.0691, + "eval_samples_per_second": 76.844, + "eval_steps_per_second": 4.833, + "step": 8021 + }, + { + "epoch": 713.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3602018654346466, + "eval_runtime": 2.0814, + "eval_samples_per_second": 76.389, + "eval_steps_per_second": 4.804, + "step": 8032 + }, + { + "epoch": 714.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.35657405853271484, + "eval_runtime": 2.0547, + "eval_samples_per_second": 77.384, + "eval_steps_per_second": 4.867, + "step": 8043 + }, + { + "epoch": 715.56, + "grad_norm": 0.17041368782520294, + "learning_rate": 2.8484848484848484e-06, + "loss": 0.0071, + "step": 8050 + }, + { + "epoch": 716.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3645796477794647, + "eval_runtime": 2.0104, + "eval_samples_per_second": 79.087, + "eval_steps_per_second": 4.974, + "step": 8055 + }, + { + "epoch": 716.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.364641398191452, + "eval_runtime": 2.0723, + "eval_samples_per_second": 76.725, + "eval_steps_per_second": 4.825, + "step": 8066 + }, + { + "epoch": 717.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3593458831310272, + "eval_runtime": 2.017, + "eval_samples_per_second": 78.83, + "eval_steps_per_second": 4.958, + "step": 8077 + }, + { + "epoch": 718.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3625403344631195, + "eval_runtime": 2.1034, + "eval_samples_per_second": 75.591, + "eval_steps_per_second": 4.754, + "step": 8088 + }, + { + "epoch": 720.0, + "grad_norm": 0.7891609072685242, + "learning_rate": 2.659090909090909e-06, + "loss": 0.0071, + "step": 8100 + }, + { + "epoch": 720.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.36099299788475037, + "eval_runtime": 2.0137, + "eval_samples_per_second": 78.958, + "eval_steps_per_second": 4.966, + "step": 8100 + }, + { + "epoch": 720.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.35885581374168396, + "eval_runtime": 2.0236, + "eval_samples_per_second": 78.572, + "eval_steps_per_second": 4.942, + "step": 8111 + }, + { + "epoch": 721.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3528722822666168, + "eval_runtime": 2.0499, + "eval_samples_per_second": 77.565, + "eval_steps_per_second": 4.878, + "step": 8122 + }, + { + "epoch": 722.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.34843915700912476, + "eval_runtime": 2.0515, + "eval_samples_per_second": 77.504, + "eval_steps_per_second": 4.874, + "step": 8133 + }, + { + "epoch": 724.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3468559682369232, + "eval_runtime": 2.0267, + "eval_samples_per_second": 78.452, + "eval_steps_per_second": 4.934, + "step": 8145 + }, + { + "epoch": 724.44, + "grad_norm": 0.013204416260123253, + "learning_rate": 2.46969696969697e-06, + "loss": 0.0098, + "step": 8150 + }, + { + "epoch": 724.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.34806957840919495, + "eval_runtime": 2.0094, + "eval_samples_per_second": 79.126, + "eval_steps_per_second": 4.976, + "step": 8156 + }, + { + "epoch": 725.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.34636813402175903, + "eval_runtime": 2.1662, + "eval_samples_per_second": 73.4, + "eval_steps_per_second": 4.616, + "step": 8167 + }, + { + "epoch": 726.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.34824779629707336, + "eval_runtime": 2.0311, + "eval_samples_per_second": 78.282, + "eval_steps_per_second": 4.923, + "step": 8178 + }, + { + "epoch": 728.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.34667864441871643, + "eval_runtime": 2.2582, + "eval_samples_per_second": 70.411, + "eval_steps_per_second": 4.428, + "step": 8190 + }, + { + "epoch": 728.89, + "grad_norm": 1.7239004373550415, + "learning_rate": 2.2803030303030305e-06, + "loss": 0.0159, + "step": 8200 + }, + { + "epoch": 728.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.346113383769989, + "eval_runtime": 2.0824, + "eval_samples_per_second": 76.353, + "eval_steps_per_second": 4.802, + "step": 8201 + }, + { + "epoch": 729.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3437664210796356, + "eval_runtime": 2.0394, + "eval_samples_per_second": 77.966, + "eval_steps_per_second": 4.904, + "step": 8212 + }, + { + "epoch": 730.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.33936139941215515, + "eval_runtime": 2.0701, + "eval_samples_per_second": 76.809, + "eval_steps_per_second": 4.831, + "step": 8223 + }, + { + "epoch": 732.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3355594277381897, + "eval_runtime": 2.1359, + "eval_samples_per_second": 74.442, + "eval_steps_per_second": 4.682, + "step": 8235 + }, + { + "epoch": 732.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3355758488178253, + "eval_runtime": 2.0241, + "eval_samples_per_second": 78.553, + "eval_steps_per_second": 4.94, + "step": 8246 + }, + { + "epoch": 733.33, + "grad_norm": 1.1134917736053467, + "learning_rate": 2.090909090909091e-06, + "loss": 0.0128, + "step": 8250 + }, + { + "epoch": 733.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.337179034948349, + "eval_runtime": 2.162, + "eval_samples_per_second": 73.543, + "eval_steps_per_second": 4.625, + "step": 8257 + }, + { + "epoch": 734.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3391708731651306, + "eval_runtime": 2.0183, + "eval_samples_per_second": 78.778, + "eval_steps_per_second": 4.955, + "step": 8268 + }, + { + "epoch": 736.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3454706072807312, + "eval_runtime": 2.037, + "eval_samples_per_second": 78.056, + "eval_steps_per_second": 4.909, + "step": 8280 + }, + { + "epoch": 736.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.34865179657936096, + "eval_runtime": 2.1268, + "eval_samples_per_second": 74.76, + "eval_steps_per_second": 4.702, + "step": 8291 + }, + { + "epoch": 737.78, + "grad_norm": 0.008208476938307285, + "learning_rate": 1.9015151515151518e-06, + "loss": 0.0086, + "step": 8300 + }, + { + "epoch": 737.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3467850983142853, + "eval_runtime": 2.1854, + "eval_samples_per_second": 72.756, + "eval_steps_per_second": 4.576, + "step": 8302 + }, + { + "epoch": 738.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.344488263130188, + "eval_runtime": 2.0623, + "eval_samples_per_second": 77.099, + "eval_steps_per_second": 4.849, + "step": 8313 + }, + { + "epoch": 740.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.34248578548431396, + "eval_runtime": 2.0582, + "eval_samples_per_second": 77.254, + "eval_steps_per_second": 4.859, + "step": 8325 + }, + { + "epoch": 740.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3452531397342682, + "eval_runtime": 2.1556, + "eval_samples_per_second": 73.762, + "eval_steps_per_second": 4.639, + "step": 8336 + }, + { + "epoch": 741.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.34475868940353394, + "eval_runtime": 2.0516, + "eval_samples_per_second": 77.5, + "eval_steps_per_second": 4.874, + "step": 8347 + }, + { + "epoch": 742.22, + "grad_norm": 0.2444353848695755, + "learning_rate": 1.712121212121212e-06, + "loss": 0.011, + "step": 8350 + }, + { + "epoch": 742.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.34120240807533264, + "eval_runtime": 2.0936, + "eval_samples_per_second": 75.945, + "eval_steps_per_second": 4.776, + "step": 8358 + }, + { + "epoch": 744.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.33924660086631775, + "eval_runtime": 2.2099, + "eval_samples_per_second": 71.948, + "eval_steps_per_second": 4.525, + "step": 8370 + }, + { + "epoch": 744.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3390309512615204, + "eval_runtime": 1.9925, + "eval_samples_per_second": 79.801, + "eval_steps_per_second": 5.019, + "step": 8381 + }, + { + "epoch": 745.96, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3395291268825531, + "eval_runtime": 1.9807, + "eval_samples_per_second": 80.274, + "eval_steps_per_second": 5.049, + "step": 8392 + }, + { + "epoch": 746.67, + "grad_norm": 0.8103430867195129, + "learning_rate": 1.5227272727272727e-06, + "loss": 0.0074, + "step": 8400 + }, + { + "epoch": 746.93, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3383350074291229, + "eval_runtime": 2.2625, + "eval_samples_per_second": 70.276, + "eval_steps_per_second": 4.42, + "step": 8403 + }, + { + "epoch": 748.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.33776676654815674, + "eval_runtime": 2.0087, + "eval_samples_per_second": 79.157, + "eval_steps_per_second": 4.978, + "step": 8415 + }, + { + "epoch": 748.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3348415195941925, + "eval_runtime": 2.0796, + "eval_samples_per_second": 76.457, + "eval_steps_per_second": 4.809, + "step": 8426 + }, + { + "epoch": 749.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.33347979187965393, + "eval_runtime": 2.1871, + "eval_samples_per_second": 72.698, + "eval_steps_per_second": 4.572, + "step": 8437 + }, + { + "epoch": 750.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.33422428369522095, + "eval_runtime": 2.069, + "eval_samples_per_second": 76.849, + "eval_steps_per_second": 4.833, + "step": 8448 + }, + { + "epoch": 751.11, + "grad_norm": 1.5617446899414062, + "learning_rate": 1.3333333333333334e-06, + "loss": 0.0087, + "step": 8450 + }, + { + "epoch": 752.0, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.33466464281082153, + "eval_runtime": 2.0175, + "eval_samples_per_second": 78.81, + "eval_steps_per_second": 4.957, + "step": 8460 + }, + { + "epoch": 752.98, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.33632901310920715, + "eval_runtime": 2.2613, + "eval_samples_per_second": 70.315, + "eval_steps_per_second": 4.422, + "step": 8471 + }, + { + "epoch": 753.96, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3377835154533386, + "eval_runtime": 2.0093, + "eval_samples_per_second": 79.131, + "eval_steps_per_second": 4.977, + "step": 8482 + }, + { + "epoch": 754.93, + "eval_accuracy": 0.9433962264150944, + "eval_loss": 0.3383637070655823, + "eval_runtime": 2.0348, + "eval_samples_per_second": 78.139, + "eval_steps_per_second": 4.914, + "step": 8493 + }, + { + "epoch": 755.56, + "grad_norm": 1.2671109437942505, + "learning_rate": 1.143939393939394e-06, + "loss": 0.0061, + "step": 8500 + }, + { + "epoch": 756.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3406466245651245, + "eval_runtime": 2.0595, + "eval_samples_per_second": 77.203, + "eval_steps_per_second": 4.856, + "step": 8505 + }, + { + "epoch": 756.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.34400761127471924, + "eval_runtime": 1.9798, + "eval_samples_per_second": 80.313, + "eval_steps_per_second": 5.051, + "step": 8516 + }, + { + "epoch": 757.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.34409239888191223, + "eval_runtime": 2.0569, + "eval_samples_per_second": 77.301, + "eval_steps_per_second": 4.862, + "step": 8527 + }, + { + "epoch": 758.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.34241315722465515, + "eval_runtime": 2.0733, + "eval_samples_per_second": 76.691, + "eval_steps_per_second": 4.823, + "step": 8538 + }, + { + "epoch": 760.0, + "grad_norm": 2.0512726306915283, + "learning_rate": 9.545454545454546e-07, + "loss": 0.0119, + "step": 8550 + }, + { + "epoch": 760.0, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3426421582698822, + "eval_runtime": 2.0315, + "eval_samples_per_second": 78.268, + "eval_steps_per_second": 4.922, + "step": 8550 + }, + { + "epoch": 760.98, + "eval_accuracy": 0.9371069182389937, + "eval_loss": 0.3427829444408417, + "eval_runtime": 2.1633, + "eval_samples_per_second": 73.499, + "eval_steps_per_second": 4.623, + "step": 8561 + }, + { + "epoch": 761.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.34399789571762085, + "eval_runtime": 2.1363, + "eval_samples_per_second": 74.428, + "eval_steps_per_second": 4.681, + "step": 8572 + }, + { + "epoch": 762.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3443286418914795, + "eval_runtime": 2.0533, + "eval_samples_per_second": 77.437, + "eval_steps_per_second": 4.87, + "step": 8583 + }, + { + "epoch": 764.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.345469206571579, + "eval_runtime": 1.9651, + "eval_samples_per_second": 80.911, + "eval_steps_per_second": 5.089, + "step": 8595 + }, + { + "epoch": 764.44, + "grad_norm": 0.15614187717437744, + "learning_rate": 7.651515151515152e-07, + "loss": 0.0056, + "step": 8600 + }, + { + "epoch": 764.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.34602606296539307, + "eval_runtime": 2.0712, + "eval_samples_per_second": 76.769, + "eval_steps_per_second": 4.828, + "step": 8606 + }, + { + "epoch": 765.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3463137745857239, + "eval_runtime": 1.9634, + "eval_samples_per_second": 80.983, + "eval_steps_per_second": 5.093, + "step": 8617 + }, + { + "epoch": 766.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.34662124514579773, + "eval_runtime": 2.0264, + "eval_samples_per_second": 78.466, + "eval_steps_per_second": 4.935, + "step": 8628 + }, + { + "epoch": 768.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3465888202190399, + "eval_runtime": 2.1276, + "eval_samples_per_second": 74.732, + "eval_steps_per_second": 4.7, + "step": 8640 + }, + { + "epoch": 768.89, + "grad_norm": 0.13273529708385468, + "learning_rate": 5.757575757575757e-07, + "loss": 0.0094, + "step": 8650 + }, + { + "epoch": 768.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.34740516543388367, + "eval_runtime": 1.986, + "eval_samples_per_second": 80.062, + "eval_steps_per_second": 5.035, + "step": 8651 + }, + { + "epoch": 769.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3476426601409912, + "eval_runtime": 2.2993, + "eval_samples_per_second": 69.152, + "eval_steps_per_second": 4.349, + "step": 8662 + }, + { + "epoch": 770.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.34822559356689453, + "eval_runtime": 2.054, + "eval_samples_per_second": 77.411, + "eval_steps_per_second": 4.869, + "step": 8673 + }, + { + "epoch": 772.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.348609060049057, + "eval_runtime": 2.0775, + "eval_samples_per_second": 76.533, + "eval_steps_per_second": 4.813, + "step": 8685 + }, + { + "epoch": 772.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.34849491715431213, + "eval_runtime": 1.9848, + "eval_samples_per_second": 80.11, + "eval_steps_per_second": 5.038, + "step": 8696 + }, + { + "epoch": 773.33, + "grad_norm": 2.092862606048584, + "learning_rate": 3.8636363636363636e-07, + "loss": 0.014, + "step": 8700 + }, + { + "epoch": 773.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3478315770626068, + "eval_runtime": 2.0318, + "eval_samples_per_second": 78.257, + "eval_steps_per_second": 4.922, + "step": 8707 + }, + { + "epoch": 774.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.347221702337265, + "eval_runtime": 2.0723, + "eval_samples_per_second": 76.726, + "eval_steps_per_second": 4.826, + "step": 8718 + }, + { + "epoch": 776.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.34651002287864685, + "eval_runtime": 1.9895, + "eval_samples_per_second": 79.92, + "eval_steps_per_second": 5.026, + "step": 8730 + }, + { + "epoch": 776.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3461478352546692, + "eval_runtime": 2.0438, + "eval_samples_per_second": 77.796, + "eval_steps_per_second": 4.893, + "step": 8741 + }, + { + "epoch": 777.78, + "grad_norm": 0.42866629362106323, + "learning_rate": 1.9696969696969696e-07, + "loss": 0.0126, + "step": 8750 + }, + { + "epoch": 777.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3467194736003876, + "eval_runtime": 2.0767, + "eval_samples_per_second": 76.564, + "eval_steps_per_second": 4.815, + "step": 8752 + }, + { + "epoch": 778.93, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3471050262451172, + "eval_runtime": 2.0846, + "eval_samples_per_second": 76.272, + "eval_steps_per_second": 4.797, + "step": 8763 + }, + { + "epoch": 780.0, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.34714454412460327, + "eval_runtime": 2.1516, + "eval_samples_per_second": 73.897, + "eval_steps_per_second": 4.648, + "step": 8775 + }, + { + "epoch": 780.98, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3471665382385254, + "eval_runtime": 2.0781, + "eval_samples_per_second": 76.511, + "eval_steps_per_second": 4.812, + "step": 8786 + }, + { + "epoch": 781.96, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3471100628376007, + "eval_runtime": 2.0029, + "eval_samples_per_second": 79.386, + "eval_steps_per_second": 4.993, + "step": 8797 + }, + { + "epoch": 782.22, + "grad_norm": 0.060126595199108124, + "learning_rate": 7.575757575757576e-09, + "loss": 0.0048, + "step": 8800 + }, + { + "epoch": 782.22, + "eval_accuracy": 0.9308176100628931, + "eval_loss": 0.3471885025501251, + "eval_runtime": 2.0337, + "eval_samples_per_second": 78.181, + "eval_steps_per_second": 4.917, + "step": 8800 + }, + { + "epoch": 782.22, + "step": 8800, + "total_flos": 4.912188447589224e+18, + "train_loss": 0.0709631282125007, + "train_runtime": 5794.2307, + "train_samples_per_second": 98.995, + "train_steps_per_second": 1.519 + } + ], + "logging_steps": 50, + "max_steps": 8800, + "num_input_tokens_seen": 0, + "num_train_epochs": 800, + "save_steps": 500, + "total_flos": 4.912188447589224e+18, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}