diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,7710 +1,606 @@ { - "best_metric": 0.3460787236690521, - "best_model_checkpoint": "outputs/checkpoint-672", - "epoch": 24.041085840058695, - "eval_steps": 8, - "global_step": 4096, + "best_metric": 0.43469348549842834, + "best_model_checkpoint": "outputs/checkpoint-512", + "epoch": 3.002932551319648, + "eval_steps": 32, + "global_step": 512, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, - "grad_norm": 0.55859375, - "learning_rate": 1.25e-05, - "loss": 0.981, - "step": 8 - }, - { - "epoch": 0.05, - "eval_loss": 0.9143704771995544, - "eval_runtime": 4.8662, - "eval_samples_per_second": 9.864, - "eval_steps_per_second": 1.233, - "step": 8 - }, - { - "epoch": 0.09, - "grad_norm": 0.494140625, + "grad_norm": 0.103515625, "learning_rate": 2.5e-05, - "loss": 0.9604, - "step": 16 + "loss": 1.2622, + "step": 8 }, { "epoch": 0.09, - "eval_loss": 0.8369872570037842, - "eval_runtime": 4.8785, - "eval_samples_per_second": 9.839, - "eval_steps_per_second": 1.23, + "grad_norm": 0.08984375, + "learning_rate": 5e-05, + "loss": 1.2695, "step": 16 }, { "epoch": 0.14, - "grad_norm": 0.447265625, - "learning_rate": 3.7500000000000003e-05, - "loss": 0.8553, - "step": 24 - }, - { - "epoch": 0.14, - "eval_loss": 0.7113822102546692, - "eval_runtime": 4.8736, - "eval_samples_per_second": 9.849, - "eval_steps_per_second": 1.231, + "grad_norm": 0.1181640625, + "learning_rate": 7.500000000000001e-05, + "loss": 1.1917, "step": 24 }, { "epoch": 0.19, - "grad_norm": 0.4453125, - "learning_rate": 5e-05, - "loss": 0.7358, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001, + "loss": 1.0581, "step": 32 }, { "epoch": 0.19, - "eval_loss": 0.5980567336082458, - "eval_runtime": 4.8799, - "eval_samples_per_second": 9.836, - "eval_steps_per_second": 1.23, + "eval_loss": 0.8670921325683594, + "eval_runtime": 2.7914, + "eval_samples_per_second": 17.195, + "eval_steps_per_second": 2.149, "step": 32 }, { "epoch": 0.23, - "grad_norm": 0.33984375, - "learning_rate": 4.9901574803149606e-05, - "loss": 0.6122, + "grad_norm": 0.12158203125, + "learning_rate": 9.833333333333333e-05, + "loss": 0.8582, "step": 40 }, - { - "epoch": 0.23, - "eval_loss": 0.5387040972709656, - "eval_runtime": 4.8813, - "eval_samples_per_second": 9.834, - "eval_steps_per_second": 1.229, - "step": 40 - }, - { - "epoch": 0.28, - "grad_norm": 0.353515625, - "learning_rate": 4.9803149606299216e-05, - "loss": 0.5819, - "step": 48 - }, { "epoch": 0.28, - "eval_loss": 0.5024240016937256, - "eval_runtime": 4.8808, - "eval_samples_per_second": 9.834, - "eval_steps_per_second": 1.229, + "grad_norm": 0.09619140625, + "learning_rate": 9.666666666666667e-05, + "loss": 0.7709, "step": 48 }, { "epoch": 0.33, - "grad_norm": 0.328125, - "learning_rate": 4.970472440944882e-05, - "loss": 0.547, - "step": 56 - }, - { - "epoch": 0.33, - "eval_loss": 0.47695469856262207, - "eval_runtime": 4.8838, - "eval_samples_per_second": 9.828, - "eval_steps_per_second": 1.229, + "grad_norm": 0.08642578125, + "learning_rate": 9.5e-05, + "loss": 0.7284, "step": 56 }, { "epoch": 0.38, - "grad_norm": 0.345703125, - "learning_rate": 4.960629921259843e-05, - "loss": 0.5432, + "grad_norm": 0.1318359375, + "learning_rate": 9.333333333333334e-05, + "loss": 0.7043, "step": 64 }, { "epoch": 0.38, - "eval_loss": 0.46160802245140076, - "eval_runtime": 4.8825, - "eval_samples_per_second": 9.831, - "eval_steps_per_second": 1.229, + "eval_loss": 0.6009237170219421, + "eval_runtime": 2.1819, + "eval_samples_per_second": 22.0, + "eval_steps_per_second": 2.75, "step": 64 }, { "epoch": 0.42, - "grad_norm": 0.328125, - "learning_rate": 4.950787401574803e-05, - "loss": 0.5187, - "step": 72 - }, - { - "epoch": 0.42, - "eval_loss": 0.4508754014968872, - "eval_runtime": 4.8793, - "eval_samples_per_second": 9.837, - "eval_steps_per_second": 1.23, + "grad_norm": 0.0986328125, + "learning_rate": 9.166666666666667e-05, + "loss": 0.6693, "step": 72 }, { "epoch": 0.47, - "grad_norm": 0.341796875, - "learning_rate": 4.940944881889764e-05, - "loss": 0.5056, + "grad_norm": 0.09814453125, + "learning_rate": 9e-05, + "loss": 0.6409, "step": 80 }, - { - "epoch": 0.47, - "eval_loss": 0.4417424201965332, - "eval_runtime": 4.8827, - "eval_samples_per_second": 9.831, - "eval_steps_per_second": 1.229, - "step": 80 - }, - { - "epoch": 0.52, - "grad_norm": 0.32421875, - "learning_rate": 4.9311023622047246e-05, - "loss": 0.4723, - "step": 88 - }, { "epoch": 0.52, - "eval_loss": 0.4341174364089966, - "eval_runtime": 4.8789, - "eval_samples_per_second": 9.838, - "eval_steps_per_second": 1.23, + "grad_norm": 0.1279296875, + "learning_rate": 8.833333333333333e-05, + "loss": 0.6038, "step": 88 }, { "epoch": 0.56, - "grad_norm": 0.349609375, - "learning_rate": 4.9212598425196856e-05, - "loss": 0.4813, + "grad_norm": 0.0947265625, + "learning_rate": 8.666666666666667e-05, + "loss": 0.6093, "step": 96 }, { "epoch": 0.56, - "eval_loss": 0.429939866065979, - "eval_runtime": 4.878, - "eval_samples_per_second": 9.84, - "eval_steps_per_second": 1.23, + "eval_loss": 0.5342517495155334, + "eval_runtime": 2.1808, + "eval_samples_per_second": 22.01, + "eval_steps_per_second": 2.751, "step": 96 }, { "epoch": 0.61, - "grad_norm": 0.333984375, - "learning_rate": 4.911417322834646e-05, - "loss": 0.4873, + "grad_norm": 0.1005859375, + "learning_rate": 8.5e-05, + "loss": 0.5974, "step": 104 }, - { - "epoch": 0.61, - "eval_loss": 0.42384615540504456, - "eval_runtime": 4.8764, - "eval_samples_per_second": 9.843, - "eval_steps_per_second": 1.23, - "step": 104 - }, - { - "epoch": 0.66, - "grad_norm": 0.376953125, - "learning_rate": 4.901574803149607e-05, - "loss": 0.464, - "step": 112 - }, { "epoch": 0.66, - "eval_loss": 0.41946908831596375, - "eval_runtime": 4.8802, - "eval_samples_per_second": 9.836, - "eval_steps_per_second": 1.229, + "grad_norm": 0.11474609375, + "learning_rate": 8.333333333333334e-05, + "loss": 0.5732, "step": 112 }, { "epoch": 0.7, - "grad_norm": 0.37109375, - "learning_rate": 4.891732283464567e-05, - "loss": 0.4859, - "step": 120 - }, - { - "epoch": 0.7, - "eval_loss": 0.41445252299308777, - "eval_runtime": 4.8881, - "eval_samples_per_second": 9.82, - "eval_steps_per_second": 1.227, + "grad_norm": 0.11669921875, + "learning_rate": 8.166666666666667e-05, + "loss": 0.5907, "step": 120 }, { "epoch": 0.75, - "grad_norm": 0.35546875, - "learning_rate": 4.881889763779528e-05, - "loss": 0.4579, + "grad_norm": 0.0966796875, + "learning_rate": 8e-05, + "loss": 0.5548, "step": 128 }, { "epoch": 0.75, - "eval_loss": 0.4118093252182007, - "eval_runtime": 4.8842, - "eval_samples_per_second": 9.828, - "eval_steps_per_second": 1.228, + "eval_loss": 0.5006037950515747, + "eval_runtime": 2.1797, + "eval_samples_per_second": 22.021, + "eval_steps_per_second": 2.753, "step": 128 }, { "epoch": 0.8, - "grad_norm": 0.33984375, - "learning_rate": 4.872047244094489e-05, - "loss": 0.4549, - "step": 136 - }, - { - "epoch": 0.8, - "eval_loss": 0.40753862261772156, - "eval_runtime": 4.8904, - "eval_samples_per_second": 9.815, - "eval_steps_per_second": 1.227, + "grad_norm": 0.095703125, + "learning_rate": 7.833333333333333e-05, + "loss": 0.5563, "step": 136 }, { - "epoch": 0.85, - "grad_norm": 0.365234375, - "learning_rate": 4.862204724409449e-05, - "loss": 0.4374, + "epoch": 0.84, + "grad_norm": 0.1142578125, + "learning_rate": 7.666666666666667e-05, + "loss": 0.5311, "step": 144 }, - { - "epoch": 0.85, - "eval_loss": 0.4049430787563324, - "eval_runtime": 4.8885, - "eval_samples_per_second": 9.819, - "eval_steps_per_second": 1.227, - "step": 144 - }, - { - "epoch": 0.89, - "grad_norm": 0.369140625, - "learning_rate": 4.85236220472441e-05, - "loss": 0.438, - "step": 152 - }, { "epoch": 0.89, - "eval_loss": 0.4011856019496918, - "eval_runtime": 4.8787, - "eval_samples_per_second": 9.839, - "eval_steps_per_second": 1.23, + "grad_norm": 0.126953125, + "learning_rate": 7.500000000000001e-05, + "loss": 0.5284, "step": 152 }, { "epoch": 0.94, - "grad_norm": 0.333984375, - "learning_rate": 4.84251968503937e-05, - "loss": 0.4347, + "grad_norm": 0.09814453125, + "learning_rate": 7.333333333333333e-05, + "loss": 0.5316, "step": 160 }, { "epoch": 0.94, - "eval_loss": 0.39831605553627014, - "eval_runtime": 4.8796, - "eval_samples_per_second": 9.837, - "eval_steps_per_second": 1.23, + "eval_loss": 0.48313403129577637, + "eval_runtime": 2.1847, + "eval_samples_per_second": 21.971, + "eval_steps_per_second": 2.746, "step": 160 }, { "epoch": 0.99, - "grad_norm": 0.37109375, - "learning_rate": 4.832677165354331e-05, - "loss": 0.4388, - "step": 168 - }, - { - "epoch": 0.99, - "eval_loss": 0.39434146881103516, - "eval_runtime": 4.8874, - "eval_samples_per_second": 9.821, - "eval_steps_per_second": 1.228, + "grad_norm": 0.09912109375, + "learning_rate": 7.166666666666667e-05, + "loss": 0.5324, "step": 168 }, { "epoch": 1.03, - "grad_norm": 0.38671875, - "learning_rate": 4.822834645669291e-05, - "loss": 0.4519, - "step": 176 - }, - { - "epoch": 1.03, - "eval_loss": 0.39422157406806946, - "eval_runtime": 4.8818, - "eval_samples_per_second": 9.832, - "eval_steps_per_second": 1.229, + "grad_norm": 0.1455078125, + "learning_rate": 7e-05, + "loss": 0.5456, "step": 176 }, { "epoch": 1.08, - "grad_norm": 0.34375, - "learning_rate": 4.812992125984252e-05, - "loss": 0.4273, - "step": 184 - }, - { - "epoch": 1.08, - "eval_loss": 0.3909244239330292, - "eval_runtime": 4.8834, - "eval_samples_per_second": 9.829, - "eval_steps_per_second": 1.229, + "grad_norm": 0.1025390625, + "learning_rate": 6.833333333333333e-05, + "loss": 0.5274, "step": 184 }, { "epoch": 1.13, - "grad_norm": 0.3828125, - "learning_rate": 4.8031496062992124e-05, - "loss": 0.4169, + "grad_norm": 0.11181640625, + "learning_rate": 6.666666666666667e-05, + "loss": 0.5161, "step": 192 }, { "epoch": 1.13, - "eval_loss": 0.38786780834198, - "eval_runtime": 4.8897, - "eval_samples_per_second": 9.816, - "eval_steps_per_second": 1.227, + "eval_loss": 0.4703657329082489, + "eval_runtime": 2.1798, + "eval_samples_per_second": 22.021, + "eval_steps_per_second": 2.753, "step": 192 }, { "epoch": 1.17, - "grad_norm": 0.3828125, - "learning_rate": 4.7933070866141734e-05, - "loss": 0.4217, - "step": 200 - }, - { - "epoch": 1.17, - "eval_loss": 0.38564667105674744, - "eval_runtime": 4.884, - "eval_samples_per_second": 9.828, - "eval_steps_per_second": 1.229, + "grad_norm": 0.12060546875, + "learning_rate": 6.500000000000001e-05, + "loss": 0.5129, "step": 200 }, { "epoch": 1.22, - "grad_norm": 0.404296875, - "learning_rate": 4.783464566929134e-05, - "loss": 0.4473, - "step": 208 - }, - { - "epoch": 1.22, - "eval_loss": 0.3840738534927368, - "eval_runtime": 4.8763, - "eval_samples_per_second": 9.844, - "eval_steps_per_second": 1.23, + "grad_norm": 0.11083984375, + "learning_rate": 6.333333333333333e-05, + "loss": 0.5455, "step": 208 }, { "epoch": 1.27, - "grad_norm": 0.357421875, - "learning_rate": 4.773622047244095e-05, - "loss": 0.4158, - "step": 216 - }, - { - "epoch": 1.27, - "eval_loss": 0.3802422285079956, - "eval_runtime": 4.8799, - "eval_samples_per_second": 9.836, - "eval_steps_per_second": 1.23, + "grad_norm": 0.1142578125, + "learning_rate": 6.166666666666667e-05, + "loss": 0.5113, "step": 216 }, { "epoch": 1.31, - "grad_norm": 0.3671875, - "learning_rate": 4.763779527559055e-05, - "loss": 0.4112, + "grad_norm": 0.134765625, + "learning_rate": 6e-05, + "loss": 0.4937, "step": 224 }, { "epoch": 1.31, - "eval_loss": 0.38026484847068787, - "eval_runtime": 4.8808, - "eval_samples_per_second": 9.834, - "eval_steps_per_second": 1.229, + "eval_loss": 0.4608299434185028, + "eval_runtime": 2.1784, + "eval_samples_per_second": 22.035, + "eval_steps_per_second": 2.754, "step": 224 }, { "epoch": 1.36, - "grad_norm": 0.37109375, - "learning_rate": 4.753937007874016e-05, - "loss": 0.4317, - "step": 232 - }, - { - "epoch": 1.36, - "eval_loss": 0.3779226839542389, - "eval_runtime": 4.8807, - "eval_samples_per_second": 9.835, - "eval_steps_per_second": 1.229, + "grad_norm": 0.10791015625, + "learning_rate": 5.833333333333334e-05, + "loss": 0.5279, "step": 232 }, { "epoch": 1.41, - "grad_norm": 0.408203125, - "learning_rate": 4.7440944881889765e-05, - "loss": 0.4105, - "step": 240 - }, - { - "epoch": 1.41, - "eval_loss": 0.37829089164733887, - "eval_runtime": 4.8824, - "eval_samples_per_second": 9.831, - "eval_steps_per_second": 1.229, + "grad_norm": 0.1181640625, + "learning_rate": 5.666666666666667e-05, + "loss": 0.5057, "step": 240 }, { - "epoch": 1.46, - "grad_norm": 0.41015625, - "learning_rate": 4.7342519685039375e-05, - "loss": 0.4015, - "step": 248 - }, - { - "epoch": 1.46, - "eval_loss": 0.3772072494029999, - "eval_runtime": 4.8791, - "eval_samples_per_second": 9.838, - "eval_steps_per_second": 1.23, + "epoch": 1.45, + "grad_norm": 0.11328125, + "learning_rate": 5.500000000000001e-05, + "loss": 0.4863, "step": 248 }, { "epoch": 1.5, - "grad_norm": 0.451171875, - "learning_rate": 4.724409448818898e-05, - "loss": 0.4262, + "grad_norm": 0.1376953125, + "learning_rate": 5.333333333333333e-05, + "loss": 0.5236, "step": 256 }, { "epoch": 1.5, - "eval_loss": 0.37624046206474304, - "eval_runtime": 4.8828, - "eval_samples_per_second": 9.83, - "eval_steps_per_second": 1.229, + "eval_loss": 0.454235315322876, + "eval_runtime": 2.1818, + "eval_samples_per_second": 22.0, + "eval_steps_per_second": 2.75, "step": 256 }, { "epoch": 1.55, - "grad_norm": 0.361328125, - "learning_rate": 4.714566929133858e-05, - "loss": 0.4135, - "step": 264 - }, - { - "epoch": 1.55, - "eval_loss": 0.37519559264183044, - "eval_runtime": 4.8797, - "eval_samples_per_second": 9.837, - "eval_steps_per_second": 1.23, + "grad_norm": 0.11181640625, + "learning_rate": 5.166666666666667e-05, + "loss": 0.5, "step": 264 }, { "epoch": 1.6, - "grad_norm": 0.4453125, - "learning_rate": 4.704724409448819e-05, - "loss": 0.4099, - "step": 272 - }, - { - "epoch": 1.6, - "eval_loss": 0.37314102053642273, - "eval_runtime": 4.8876, - "eval_samples_per_second": 9.821, - "eval_steps_per_second": 1.228, + "grad_norm": 0.1259765625, + "learning_rate": 5e-05, + "loss": 0.4989, "step": 272 }, { "epoch": 1.64, - "grad_norm": 0.408203125, - "learning_rate": 4.6948818897637795e-05, - "loss": 0.4074, - "step": 280 - }, - { - "epoch": 1.64, - "eval_loss": 0.3711419999599457, - "eval_runtime": 4.8865, - "eval_samples_per_second": 9.823, - "eval_steps_per_second": 1.228, + "grad_norm": 0.12451171875, + "learning_rate": 4.8333333333333334e-05, + "loss": 0.4944, "step": 280 }, { "epoch": 1.69, - "grad_norm": 0.41015625, - "learning_rate": 4.6850393700787405e-05, - "loss": 0.4203, + "grad_norm": 0.11962890625, + "learning_rate": 4.666666666666667e-05, + "loss": 0.5064, "step": 288 }, { "epoch": 1.69, - "eval_loss": 0.3701172173023224, - "eval_runtime": 4.8785, - "eval_samples_per_second": 9.839, - "eval_steps_per_second": 1.23, + "eval_loss": 0.4474702775478363, + "eval_runtime": 2.1767, + "eval_samples_per_second": 22.052, + "eval_steps_per_second": 2.756, "step": 288 }, { "epoch": 1.74, - "grad_norm": 0.35546875, - "learning_rate": 4.675196850393701e-05, - "loss": 0.4384, - "step": 296 - }, - { - "epoch": 1.74, - "eval_loss": 0.36824676394462585, - "eval_runtime": 4.8786, - "eval_samples_per_second": 9.839, - "eval_steps_per_second": 1.23, + "grad_norm": 0.103515625, + "learning_rate": 4.5e-05, + "loss": 0.5323, "step": 296 }, { "epoch": 1.78, - "grad_norm": 0.43359375, - "learning_rate": 4.665354330708662e-05, - "loss": 0.4047, - "step": 304 - }, - { - "epoch": 1.78, - "eval_loss": 0.36743244528770447, - "eval_runtime": 4.8787, - "eval_samples_per_second": 9.839, - "eval_steps_per_second": 1.23, + "grad_norm": 0.1328125, + "learning_rate": 4.3333333333333334e-05, + "loss": 0.4955, "step": 304 }, { "epoch": 1.83, - "grad_norm": 0.33984375, - "learning_rate": 4.655511811023622e-05, - "loss": 0.4196, - "step": 312 - }, - { - "epoch": 1.83, - "eval_loss": 0.36576294898986816, - "eval_runtime": 4.8821, - "eval_samples_per_second": 9.832, - "eval_steps_per_second": 1.229, + "grad_norm": 0.10546875, + "learning_rate": 4.166666666666667e-05, + "loss": 0.5137, "step": 312 }, { "epoch": 1.88, - "grad_norm": 0.423828125, - "learning_rate": 4.645669291338583e-05, - "loss": 0.4044, + "grad_norm": 0.1337890625, + "learning_rate": 4e-05, + "loss": 0.497, "step": 320 }, { "epoch": 1.88, - "eval_loss": 0.3646230697631836, - "eval_runtime": 4.8839, - "eval_samples_per_second": 9.828, - "eval_steps_per_second": 1.229, + "eval_loss": 0.442803293466568, + "eval_runtime": 2.1775, + "eval_samples_per_second": 22.044, + "eval_steps_per_second": 2.755, "step": 320 }, { - "epoch": 1.93, - "grad_norm": 0.39453125, - "learning_rate": 4.6358267716535436e-05, - "loss": 0.4169, - "step": 328 - }, - { - "epoch": 1.93, - "eval_loss": 0.363146036863327, - "eval_runtime": 4.8824, - "eval_samples_per_second": 9.831, - "eval_steps_per_second": 1.229, + "epoch": 1.92, + "grad_norm": 0.10791015625, + "learning_rate": 3.8333333333333334e-05, + "loss": 0.5032, "step": 328 }, { "epoch": 1.97, - "grad_norm": 0.41015625, - "learning_rate": 4.6259842519685046e-05, - "loss": 0.4047, - "step": 336 - }, - { - "epoch": 1.97, - "eval_loss": 0.36223217844963074, - "eval_runtime": 4.8806, - "eval_samples_per_second": 9.835, - "eval_steps_per_second": 1.229, + "grad_norm": 0.130859375, + "learning_rate": 3.6666666666666666e-05, + "loss": 0.4914, "step": 336 }, { "epoch": 2.02, - "grad_norm": 0.3515625, - "learning_rate": 4.616141732283465e-05, - "loss": 0.4016, - "step": 344 - }, - { - "epoch": 2.02, - "eval_loss": 0.3615216314792633, - "eval_runtime": 4.8812, - "eval_samples_per_second": 9.834, - "eval_steps_per_second": 1.229, + "grad_norm": 0.1171875, + "learning_rate": 3.5e-05, + "loss": 0.5013, "step": 344 }, { - "epoch": 2.07, - "grad_norm": 0.447265625, - "learning_rate": 4.606299212598425e-05, - "loss": 0.3953, + "epoch": 2.06, + "grad_norm": 0.1328125, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.4933, "step": 352 }, { - "epoch": 2.07, - "eval_loss": 0.3632339537143707, - "eval_runtime": 4.8829, - "eval_samples_per_second": 9.83, - "eval_steps_per_second": 1.229, + "epoch": 2.06, + "eval_loss": 0.4411480128765106, + "eval_runtime": 2.1813, + "eval_samples_per_second": 22.005, + "eval_steps_per_second": 2.751, "step": 352 }, { "epoch": 2.11, - "grad_norm": 0.384765625, - "learning_rate": 4.596456692913386e-05, - "loss": 0.3771, - "step": 360 - }, - { - "epoch": 2.11, - "eval_loss": 0.36078453063964844, - "eval_runtime": 4.8875, - "eval_samples_per_second": 9.821, - "eval_steps_per_second": 1.228, + "grad_norm": 0.1201171875, + "learning_rate": 3.1666666666666666e-05, + "loss": 0.4741, "step": 360 }, { "epoch": 2.16, - "grad_norm": 0.40625, - "learning_rate": 4.5866141732283466e-05, - "loss": 0.3837, - "step": 368 - }, - { - "epoch": 2.16, - "eval_loss": 0.36075982451438904, - "eval_runtime": 4.8905, - "eval_samples_per_second": 9.815, - "eval_steps_per_second": 1.227, + "grad_norm": 0.1181640625, + "learning_rate": 3e-05, + "loss": 0.4802, "step": 368 }, { "epoch": 2.21, - "grad_norm": 0.42578125, - "learning_rate": 4.5767716535433076e-05, - "loss": 0.396, - "step": 376 - }, - { - "epoch": 2.21, - "eval_loss": 0.36021366715431213, - "eval_runtime": 4.8856, - "eval_samples_per_second": 9.825, - "eval_steps_per_second": 1.228, + "grad_norm": 0.1181640625, + "learning_rate": 2.8333333333333335e-05, + "loss": 0.4911, "step": 376 }, { "epoch": 2.25, - "grad_norm": 0.423828125, - "learning_rate": 4.566929133858268e-05, - "loss": 0.4079, + "grad_norm": 0.1455078125, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.517, "step": 384 }, { "epoch": 2.25, - "eval_loss": 0.36072564125061035, - "eval_runtime": 4.8735, - "eval_samples_per_second": 9.849, - "eval_steps_per_second": 1.231, + "eval_loss": 0.4384828805923462, + "eval_runtime": 2.1808, + "eval_samples_per_second": 22.01, + "eval_steps_per_second": 2.751, "step": 384 }, { "epoch": 2.3, - "grad_norm": 0.4609375, - "learning_rate": 4.557086614173229e-05, - "loss": 0.4058, - "step": 392 - }, - { - "epoch": 2.3, - "eval_loss": 0.35943296551704407, - "eval_runtime": 4.8875, - "eval_samples_per_second": 9.821, - "eval_steps_per_second": 1.228, + "grad_norm": 0.1318359375, + "learning_rate": 2.5e-05, + "loss": 0.5024, "step": 392 }, { "epoch": 2.35, - "grad_norm": 0.4140625, - "learning_rate": 4.547244094488189e-05, - "loss": 0.3831, - "step": 400 - }, - { - "epoch": 2.35, - "eval_loss": 0.35869550704956055, - "eval_runtime": 4.8806, - "eval_samples_per_second": 9.835, - "eval_steps_per_second": 1.229, + "grad_norm": 0.1259765625, + "learning_rate": 2.3333333333333336e-05, + "loss": 0.4802, "step": 400 }, { "epoch": 2.39, - "grad_norm": 0.404296875, - "learning_rate": 4.53740157480315e-05, - "loss": 0.3945, - "step": 408 - }, - { - "epoch": 2.39, - "eval_loss": 0.3580925762653351, - "eval_runtime": 4.8786, - "eval_samples_per_second": 9.839, - "eval_steps_per_second": 1.23, + "grad_norm": 0.12109375, + "learning_rate": 2.1666666666666667e-05, + "loss": 0.4857, "step": 408 }, { "epoch": 2.44, - "grad_norm": 0.40625, - "learning_rate": 4.52755905511811e-05, - "loss": 0.3953, + "grad_norm": 0.11962890625, + "learning_rate": 2e-05, + "loss": 0.4949, "step": 416 }, { "epoch": 2.44, - "eval_loss": 0.3581734001636505, - "eval_runtime": 4.8757, - "eval_samples_per_second": 9.845, - "eval_steps_per_second": 1.231, + "eval_loss": 0.4360348880290985, + "eval_runtime": 2.179, + "eval_samples_per_second": 22.029, + "eval_steps_per_second": 2.754, "step": 416 }, { "epoch": 2.49, - "grad_norm": 0.435546875, - "learning_rate": 4.517716535433071e-05, - "loss": 0.4043, - "step": 424 - }, - { - "epoch": 2.49, - "eval_loss": 0.3573249280452728, - "eval_runtime": 4.8833, - "eval_samples_per_second": 9.829, - "eval_steps_per_second": 1.229, + "grad_norm": 0.12890625, + "learning_rate": 1.8333333333333333e-05, + "loss": 0.501, "step": 424 }, { - "epoch": 2.54, - "grad_norm": 0.404296875, - "learning_rate": 4.507874015748031e-05, - "loss": 0.3674, - "step": 432 - }, - { - "epoch": 2.54, - "eval_loss": 0.358001708984375, - "eval_runtime": 4.8899, - "eval_samples_per_second": 9.816, - "eval_steps_per_second": 1.227, + "epoch": 2.53, + "grad_norm": 0.1318359375, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.4595, "step": 432 }, { "epoch": 2.58, - "grad_norm": 0.427734375, - "learning_rate": 4.4980314960629924e-05, - "loss": 0.3783, - "step": 440 - }, - { - "epoch": 2.58, - "eval_loss": 0.35629966855049133, - "eval_runtime": 4.8821, - "eval_samples_per_second": 9.832, - "eval_steps_per_second": 1.229, + "grad_norm": 0.11962890625, + "learning_rate": 1.5e-05, + "loss": 0.4715, "step": 440 }, { "epoch": 2.63, - "grad_norm": 0.44921875, - "learning_rate": 4.488188976377953e-05, - "loss": 0.3981, + "grad_norm": 0.12255859375, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.4946, "step": 448 }, { "epoch": 2.63, - "eval_loss": 0.35627031326293945, - "eval_runtime": 4.8857, - "eval_samples_per_second": 9.825, - "eval_steps_per_second": 1.228, + "eval_loss": 0.4355371296405792, + "eval_runtime": 2.1775, + "eval_samples_per_second": 22.044, + "eval_steps_per_second": 2.755, "step": 448 }, { - "epoch": 2.68, - "grad_norm": 0.4609375, - "learning_rate": 4.478346456692914e-05, - "loss": 0.3795, - "step": 456 - }, - { - "epoch": 2.68, - "eval_loss": 0.3570912182331085, - "eval_runtime": 4.883, - "eval_samples_per_second": 9.83, - "eval_steps_per_second": 1.229, + "epoch": 2.67, + "grad_norm": 0.123046875, + "learning_rate": 1.1666666666666668e-05, + "loss": 0.4692, "step": 456 }, { "epoch": 2.72, - "grad_norm": 0.453125, - "learning_rate": 4.468503937007874e-05, - "loss": 0.3744, - "step": 464 - }, - { - "epoch": 2.72, - "eval_loss": 0.35693350434303284, - "eval_runtime": 4.8794, - "eval_samples_per_second": 9.837, - "eval_steps_per_second": 1.23, + "grad_norm": 0.125, + "learning_rate": 1e-05, + "loss": 0.4751, "step": 464 }, { "epoch": 2.77, - "grad_norm": 0.416015625, - "learning_rate": 4.4586614173228344e-05, - "loss": 0.4174, - "step": 472 - }, - { - "epoch": 2.77, - "eval_loss": 0.35543546080589294, - "eval_runtime": 4.88, - "eval_samples_per_second": 9.836, - "eval_steps_per_second": 1.23, + "grad_norm": 0.12060546875, + "learning_rate": 8.333333333333334e-06, + "loss": 0.5144, "step": 472 }, { "epoch": 2.82, - "grad_norm": 0.423828125, - "learning_rate": 4.4488188976377954e-05, - "loss": 0.3639, + "grad_norm": 0.119140625, + "learning_rate": 6.666666666666667e-06, + "loss": 0.4589, "step": 480 }, { "epoch": 2.82, - "eval_loss": 0.35410675406455994, - "eval_runtime": 4.8843, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, + "eval_loss": 0.4347504675388336, + "eval_runtime": 2.1766, + "eval_samples_per_second": 22.053, + "eval_steps_per_second": 2.757, "step": 480 }, { "epoch": 2.86, - "grad_norm": 0.40625, - "learning_rate": 4.438976377952756e-05, - "loss": 0.3877, - "step": 488 - }, - { - "epoch": 2.86, - "eval_loss": 0.35232722759246826, - "eval_runtime": 4.8764, - "eval_samples_per_second": 9.843, - "eval_steps_per_second": 1.23, + "grad_norm": 0.1162109375, + "learning_rate": 5e-06, + "loss": 0.4768, "step": 488 }, { "epoch": 2.91, - "grad_norm": 0.484375, - "learning_rate": 4.429133858267717e-05, - "loss": 0.3805, - "step": 496 - }, - { - "epoch": 2.91, - "eval_loss": 0.3522258996963501, - "eval_runtime": 4.8775, - "eval_samples_per_second": 9.841, - "eval_steps_per_second": 1.23, + "grad_norm": 0.130859375, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.4795, "step": 496 }, { "epoch": 2.96, - "grad_norm": 0.423828125, - "learning_rate": 4.419291338582677e-05, - "loss": 0.3964, - "step": 504 - }, - { - "epoch": 2.96, - "eval_loss": 0.3513607680797577, - "eval_runtime": 4.8776, - "eval_samples_per_second": 9.841, - "eval_steps_per_second": 1.23, + "grad_norm": 0.1201171875, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.4992, "step": 504 }, { - "epoch": 3.01, - "grad_norm": 0.42578125, - "learning_rate": 4.409448818897638e-05, - "loss": 0.3824, + "epoch": 3.0, + "grad_norm": 0.11328125, + "learning_rate": 0.0, + "loss": 0.466, "step": 512 }, { - "epoch": 3.01, - "eval_loss": 0.350473016500473, - "eval_runtime": 4.8912, - "eval_samples_per_second": 9.814, - "eval_steps_per_second": 1.227, + "epoch": 3.0, + "eval_loss": 0.43469348549842834, + "eval_runtime": 2.179, + "eval_samples_per_second": 22.029, + "eval_steps_per_second": 2.754, "step": 512 }, { - "epoch": 3.05, - "grad_norm": 0.5078125, - "learning_rate": 4.3996062992125984e-05, - "loss": 0.3683, - "step": 520 - }, - { - "epoch": 3.05, - "eval_loss": 0.3539038896560669, - "eval_runtime": 4.8794, - "eval_samples_per_second": 9.837, - "eval_steps_per_second": 1.23, - "step": 520 - }, - { - "epoch": 3.1, - "grad_norm": 0.44140625, - "learning_rate": 4.3897637795275594e-05, - "loss": 0.3829, - "step": 528 - }, - { - "epoch": 3.1, - "eval_loss": 0.3512038290500641, - "eval_runtime": 4.8797, - "eval_samples_per_second": 9.837, - "eval_steps_per_second": 1.23, - "step": 528 - }, - { - "epoch": 3.15, - "grad_norm": 0.44921875, - "learning_rate": 4.37992125984252e-05, - "loss": 0.3675, - "step": 536 - }, - { - "epoch": 3.15, - "eval_loss": 0.3524076044559479, - "eval_runtime": 4.8842, - "eval_samples_per_second": 9.828, - "eval_steps_per_second": 1.228, - "step": 536 - }, - { - "epoch": 3.19, - "grad_norm": 0.46484375, - "learning_rate": 4.370078740157481e-05, - "loss": 0.3751, - "step": 544 - }, - { - "epoch": 3.19, - "eval_loss": 0.351447194814682, - "eval_runtime": 4.8802, - "eval_samples_per_second": 9.836, - "eval_steps_per_second": 1.229, - "step": 544 - }, - { - "epoch": 3.24, - "grad_norm": 0.44140625, - "learning_rate": 4.360236220472441e-05, - "loss": 0.3767, - "step": 552 - }, - { - "epoch": 3.24, - "eval_loss": 0.35135617852211, - "eval_runtime": 4.8875, - "eval_samples_per_second": 9.821, - "eval_steps_per_second": 1.228, - "step": 552 - }, - { - "epoch": 3.29, - "grad_norm": 0.474609375, - "learning_rate": 4.350393700787402e-05, - "loss": 0.347, - "step": 560 - }, - { - "epoch": 3.29, - "eval_loss": 0.35124826431274414, - "eval_runtime": 4.879, - "eval_samples_per_second": 9.838, - "eval_steps_per_second": 1.23, - "step": 560 - }, - { - "epoch": 3.33, - "grad_norm": 0.458984375, - "learning_rate": 4.3405511811023625e-05, - "loss": 0.3565, - "step": 568 - }, - { - "epoch": 3.33, - "eval_loss": 0.3507702648639679, - "eval_runtime": 4.88, - "eval_samples_per_second": 9.836, - "eval_steps_per_second": 1.23, - "step": 568 - }, - { - "epoch": 3.38, - "grad_norm": 0.44921875, - "learning_rate": 4.330708661417323e-05, - "loss": 0.3986, - "step": 576 - }, - { - "epoch": 3.38, - "eval_loss": 0.35253670811653137, - "eval_runtime": 4.887, - "eval_samples_per_second": 9.822, - "eval_steps_per_second": 1.228, - "step": 576 - }, - { - "epoch": 3.43, - "grad_norm": 0.458984375, - "learning_rate": 4.320866141732284e-05, - "loss": 0.3802, - "step": 584 - }, - { - "epoch": 3.43, - "eval_loss": 0.35116323828697205, - "eval_runtime": 4.8807, - "eval_samples_per_second": 9.835, - "eval_steps_per_second": 1.229, - "step": 584 - }, - { - "epoch": 3.47, - "grad_norm": 0.451171875, - "learning_rate": 4.311023622047244e-05, - "loss": 0.3556, - "step": 592 - }, - { - "epoch": 3.47, - "eval_loss": 0.35001620650291443, - "eval_runtime": 4.8879, - "eval_samples_per_second": 9.82, - "eval_steps_per_second": 1.228, - "step": 592 - }, - { - "epoch": 3.52, - "grad_norm": 0.451171875, - "learning_rate": 4.301181102362205e-05, - "loss": 0.3681, - "step": 600 - }, - { - "epoch": 3.52, - "eval_loss": 0.3492867946624756, - "eval_runtime": 4.8869, - "eval_samples_per_second": 9.822, - "eval_steps_per_second": 1.228, - "step": 600 - }, - { - "epoch": 3.57, - "grad_norm": 0.478515625, - "learning_rate": 4.2913385826771655e-05, - "loss": 0.3753, - "step": 608 - }, - { - "epoch": 3.57, - "eval_loss": 0.3498481810092926, - "eval_runtime": 4.8796, - "eval_samples_per_second": 9.837, - "eval_steps_per_second": 1.23, - "step": 608 - }, - { - "epoch": 3.62, - "grad_norm": 0.56640625, - "learning_rate": 4.2814960629921265e-05, - "loss": 0.3694, - "step": 616 - }, - { - "epoch": 3.62, - "eval_loss": 0.3518795073032379, - "eval_runtime": 4.8802, - "eval_samples_per_second": 9.836, - "eval_steps_per_second": 1.229, - "step": 616 - }, - { - "epoch": 3.66, - "grad_norm": 0.470703125, - "learning_rate": 4.271653543307087e-05, - "loss": 0.3619, - "step": 624 - }, - { - "epoch": 3.66, - "eval_loss": 0.35094156861305237, - "eval_runtime": 4.8819, - "eval_samples_per_second": 9.832, - "eval_steps_per_second": 1.229, - "step": 624 - }, - { - "epoch": 3.71, - "grad_norm": 0.494140625, - "learning_rate": 4.261811023622048e-05, - "loss": 0.3703, - "step": 632 - }, - { - "epoch": 3.71, - "eval_loss": 0.3496703803539276, - "eval_runtime": 4.8781, - "eval_samples_per_second": 9.84, - "eval_steps_per_second": 1.23, - "step": 632 - }, - { - "epoch": 3.76, - "grad_norm": 0.5234375, - "learning_rate": 4.251968503937008e-05, - "loss": 0.3391, - "step": 640 - }, - { - "epoch": 3.76, - "eval_loss": 0.3481089770793915, - "eval_runtime": 4.8794, - "eval_samples_per_second": 9.837, - "eval_steps_per_second": 1.23, - "step": 640 - }, - { - "epoch": 3.8, - "grad_norm": 0.46875, - "learning_rate": 4.242125984251969e-05, - "loss": 0.3731, - "step": 648 - }, - { - "epoch": 3.8, - "eval_loss": 0.3471480906009674, - "eval_runtime": 4.8836, - "eval_samples_per_second": 9.829, - "eval_steps_per_second": 1.229, - "step": 648 - }, - { - "epoch": 3.85, - "grad_norm": 0.47265625, - "learning_rate": 4.2322834645669296e-05, - "loss": 0.3825, - "step": 656 - }, - { - "epoch": 3.85, - "eval_loss": 0.34693393111228943, - "eval_runtime": 4.884, - "eval_samples_per_second": 9.828, - "eval_steps_per_second": 1.228, - "step": 656 - }, - { - "epoch": 3.9, - "grad_norm": 0.48046875, - "learning_rate": 4.22244094488189e-05, - "loss": 0.3597, - "step": 664 - }, - { - "epoch": 3.9, - "eval_loss": 0.34674397110939026, - "eval_runtime": 4.8793, - "eval_samples_per_second": 9.837, - "eval_steps_per_second": 1.23, - "step": 664 - }, - { - "epoch": 3.94, - "grad_norm": 0.470703125, - "learning_rate": 4.21259842519685e-05, - "loss": 0.3801, - "step": 672 - }, - { - "epoch": 3.94, - "eval_loss": 0.3460787236690521, - "eval_runtime": 4.8876, - "eval_samples_per_second": 9.821, - "eval_steps_per_second": 1.228, - "step": 672 - }, - { - "epoch": 3.99, - "grad_norm": 0.5, - "learning_rate": 4.202755905511811e-05, - "loss": 0.3605, - "step": 680 - }, - { - "epoch": 3.99, - "eval_loss": 0.3465350568294525, - "eval_runtime": 4.8783, - "eval_samples_per_second": 9.84, - "eval_steps_per_second": 1.23, - "step": 680 - }, - { - "epoch": 4.04, - "grad_norm": 0.49609375, - "learning_rate": 4.1929133858267716e-05, - "loss": 0.3543, - "step": 688 - }, - { - "epoch": 4.04, - "eval_loss": 0.3508842885494232, - "eval_runtime": 4.8828, - "eval_samples_per_second": 9.83, - "eval_steps_per_second": 1.229, - "step": 688 - }, - { - "epoch": 4.09, - "grad_norm": 0.4609375, - "learning_rate": 4.183070866141732e-05, - "loss": 0.3491, - "step": 696 - }, - { - "epoch": 4.09, - "eval_loss": 0.3495161533355713, - "eval_runtime": 4.8837, - "eval_samples_per_second": 9.829, - "eval_steps_per_second": 1.229, - "step": 696 - }, - { - "epoch": 4.13, - "grad_norm": 0.5234375, - "learning_rate": 4.173228346456693e-05, - "loss": 0.3486, - "step": 704 - }, - { - "epoch": 4.13, - "eval_loss": 0.352257639169693, - "eval_runtime": 4.8831, - "eval_samples_per_second": 9.83, - "eval_steps_per_second": 1.229, - "step": 704 - }, - { - "epoch": 4.18, - "grad_norm": 0.51953125, - "learning_rate": 4.163385826771653e-05, - "loss": 0.3514, - "step": 712 - }, - { - "epoch": 4.18, - "eval_loss": 0.3515308201313019, - "eval_runtime": 4.8782, - "eval_samples_per_second": 9.84, - "eval_steps_per_second": 1.23, - "step": 712 - }, - { - "epoch": 4.23, - "grad_norm": 0.546875, - "learning_rate": 4.153543307086614e-05, - "loss": 0.3599, - "step": 720 - }, - { - "epoch": 4.23, - "eval_loss": 0.3499787747859955, - "eval_runtime": 4.8802, - "eval_samples_per_second": 9.836, - "eval_steps_per_second": 1.229, - "step": 720 - }, - { - "epoch": 4.27, - "grad_norm": 0.54296875, - "learning_rate": 4.1437007874015747e-05, - "loss": 0.337, - "step": 728 - }, - { - "epoch": 4.27, - "eval_loss": 0.35002824664115906, - "eval_runtime": 4.8833, - "eval_samples_per_second": 9.829, - "eval_steps_per_second": 1.229, - "step": 728 - }, - { - "epoch": 4.32, - "grad_norm": 0.5703125, - "learning_rate": 4.133858267716536e-05, - "loss": 0.3466, - "step": 736 - }, - { - "epoch": 4.32, - "eval_loss": 0.3474964201450348, - "eval_runtime": 4.8824, - "eval_samples_per_second": 9.831, - "eval_steps_per_second": 1.229, - "step": 736 - }, - { - "epoch": 4.37, - "grad_norm": 0.53125, - "learning_rate": 4.124015748031496e-05, - "loss": 0.3566, - "step": 744 - }, - { - "epoch": 4.37, - "eval_loss": 0.34953927993774414, - "eval_runtime": 4.8828, - "eval_samples_per_second": 9.83, - "eval_steps_per_second": 1.229, - "step": 744 - }, - { - "epoch": 4.41, - "grad_norm": 0.53515625, - "learning_rate": 4.114173228346457e-05, - "loss": 0.3448, - "step": 752 - }, - { - "epoch": 4.41, - "eval_loss": 0.34765908122062683, - "eval_runtime": 4.8762, - "eval_samples_per_second": 9.844, - "eval_steps_per_second": 1.23, - "step": 752 - }, - { - "epoch": 4.46, - "grad_norm": 0.5, - "learning_rate": 4.1043307086614174e-05, - "loss": 0.3518, - "step": 760 - }, - { - "epoch": 4.46, - "eval_loss": 0.3473018407821655, - "eval_runtime": 4.8872, - "eval_samples_per_second": 9.822, - "eval_steps_per_second": 1.228, - "step": 760 - }, - { - "epoch": 4.51, - "grad_norm": 0.55078125, - "learning_rate": 4.0944881889763784e-05, - "loss": 0.3608, - "step": 768 - }, - { - "epoch": 4.51, - "eval_loss": 0.3483521044254303, - "eval_runtime": 4.8829, - "eval_samples_per_second": 9.83, - "eval_steps_per_second": 1.229, - "step": 768 - }, - { - "epoch": 4.55, - "grad_norm": 0.578125, - "learning_rate": 4.084645669291339e-05, - "loss": 0.3535, - "step": 776 - }, - { - "epoch": 4.55, - "eval_loss": 0.3491772413253784, - "eval_runtime": 4.8826, - "eval_samples_per_second": 9.831, - "eval_steps_per_second": 1.229, - "step": 776 - }, - { - "epoch": 4.6, - "grad_norm": 0.5390625, - "learning_rate": 4.074803149606299e-05, - "loss": 0.3513, - "step": 784 - }, - { - "epoch": 4.6, - "eval_loss": 0.34816303849220276, - "eval_runtime": 4.8828, - "eval_samples_per_second": 9.83, - "eval_steps_per_second": 1.229, - "step": 784 - }, - { - "epoch": 4.65, - "grad_norm": 0.53515625, - "learning_rate": 4.06496062992126e-05, - "loss": 0.3424, - "step": 792 - }, - { - "epoch": 4.65, - "eval_loss": 0.3502413332462311, - "eval_runtime": 4.881, - "eval_samples_per_second": 9.834, - "eval_steps_per_second": 1.229, - "step": 792 - }, - { - "epoch": 4.7, - "grad_norm": 0.515625, - "learning_rate": 4.0551181102362204e-05, - "loss": 0.3481, - "step": 800 - }, - { - "epoch": 4.7, - "eval_loss": 0.3473189175128937, - "eval_runtime": 4.8816, - "eval_samples_per_second": 9.833, - "eval_steps_per_second": 1.229, - "step": 800 - }, - { - "epoch": 4.74, - "grad_norm": 0.5078125, - "learning_rate": 4.0452755905511814e-05, - "loss": 0.3772, - "step": 808 - }, - { - "epoch": 4.74, - "eval_loss": 0.3471733033657074, - "eval_runtime": 4.8804, - "eval_samples_per_second": 9.835, - "eval_steps_per_second": 1.229, - "step": 808 - }, - { - "epoch": 4.79, - "grad_norm": 0.515625, - "learning_rate": 4.035433070866142e-05, - "loss": 0.3542, - "step": 816 - }, - { - "epoch": 4.79, - "eval_loss": 0.3472949266433716, - "eval_runtime": 4.8862, - "eval_samples_per_second": 9.824, - "eval_steps_per_second": 1.228, - "step": 816 - }, - { - "epoch": 4.84, - "grad_norm": 0.51171875, - "learning_rate": 4.025590551181103e-05, - "loss": 0.3712, - "step": 824 - }, - { - "epoch": 4.84, - "eval_loss": 0.3476013243198395, - "eval_runtime": 4.8816, - "eval_samples_per_second": 9.833, - "eval_steps_per_second": 1.229, - "step": 824 - }, - { - "epoch": 4.88, - "grad_norm": 0.5234375, - "learning_rate": 4.015748031496063e-05, - "loss": 0.3496, - "step": 832 - }, - { - "epoch": 4.88, - "eval_loss": 0.3474799394607544, - "eval_runtime": 4.8815, - "eval_samples_per_second": 9.833, - "eval_steps_per_second": 1.229, - "step": 832 - }, - { - "epoch": 4.93, - "grad_norm": 0.58984375, - "learning_rate": 4.005905511811024e-05, - "loss": 0.3481, - "step": 840 - }, - { - "epoch": 4.93, - "eval_loss": 0.3481266498565674, - "eval_runtime": 4.8813, - "eval_samples_per_second": 9.833, - "eval_steps_per_second": 1.229, - "step": 840 - }, - { - "epoch": 4.98, - "grad_norm": 0.53125, - "learning_rate": 3.9960629921259845e-05, - "loss": 0.3416, - "step": 848 - }, - { - "epoch": 4.98, - "eval_loss": 0.3473891317844391, - "eval_runtime": 4.8872, - "eval_samples_per_second": 9.822, - "eval_steps_per_second": 1.228, - "step": 848 - }, - { - "epoch": 5.02, - "grad_norm": 0.54296875, - "learning_rate": 3.9862204724409455e-05, - "loss": 0.3518, - "step": 856 - }, - { - "epoch": 5.02, - "eval_loss": 0.34982213377952576, - "eval_runtime": 4.8796, - "eval_samples_per_second": 9.837, - "eval_steps_per_second": 1.23, - "step": 856 - }, - { - "epoch": 5.07, - "grad_norm": 0.62890625, - "learning_rate": 3.976377952755906e-05, - "loss": 0.3239, - "step": 864 - }, - { - "epoch": 5.07, - "eval_loss": 0.35387638211250305, - "eval_runtime": 4.8826, - "eval_samples_per_second": 9.831, - "eval_steps_per_second": 1.229, - "step": 864 - }, - { - "epoch": 5.12, - "grad_norm": 0.578125, - "learning_rate": 3.966535433070867e-05, - "loss": 0.3389, - "step": 872 - }, - { - "epoch": 5.12, - "eval_loss": 0.3555631935596466, - "eval_runtime": 4.8804, - "eval_samples_per_second": 9.835, - "eval_steps_per_second": 1.229, - "step": 872 - }, - { - "epoch": 5.17, - "grad_norm": 0.5703125, - "learning_rate": 3.956692913385827e-05, - "loss": 0.3357, - "step": 880 - }, - { - "epoch": 5.17, - "eval_loss": 0.35232648253440857, - "eval_runtime": 4.8819, - "eval_samples_per_second": 9.832, - "eval_steps_per_second": 1.229, - "step": 880 - }, - { - "epoch": 5.21, - "grad_norm": 0.5859375, - "learning_rate": 3.9468503937007875e-05, - "loss": 0.3305, - "step": 888 - }, - { - "epoch": 5.21, - "eval_loss": 0.3552100956439972, - "eval_runtime": 4.8799, - "eval_samples_per_second": 9.836, - "eval_steps_per_second": 1.23, - "step": 888 - }, - { - "epoch": 5.26, - "grad_norm": 0.56640625, - "learning_rate": 3.9370078740157485e-05, - "loss": 0.3308, - "step": 896 - }, - { - "epoch": 5.26, - "eval_loss": 0.35367465019226074, - "eval_runtime": 4.8807, - "eval_samples_per_second": 9.835, - "eval_steps_per_second": 1.229, - "step": 896 - }, - { - "epoch": 5.31, - "grad_norm": 0.625, - "learning_rate": 3.927165354330709e-05, - "loss": 0.3646, - "step": 904 - }, - { - "epoch": 5.31, - "eval_loss": 0.3549775183200836, - "eval_runtime": 4.8817, - "eval_samples_per_second": 9.833, - "eval_steps_per_second": 1.229, - "step": 904 - }, - { - "epoch": 5.35, - "grad_norm": 0.60546875, - "learning_rate": 3.91732283464567e-05, - "loss": 0.3369, - "step": 912 - }, - { - "epoch": 5.35, - "eval_loss": 0.352167010307312, - "eval_runtime": 4.8815, - "eval_samples_per_second": 9.833, - "eval_steps_per_second": 1.229, - "step": 912 - }, - { - "epoch": 5.4, - "grad_norm": 0.66015625, - "learning_rate": 3.90748031496063e-05, - "loss": 0.3391, - "step": 920 - }, - { - "epoch": 5.4, - "eval_loss": 0.35382068157196045, - "eval_runtime": 4.8737, - "eval_samples_per_second": 9.849, - "eval_steps_per_second": 1.231, - "step": 920 - }, - { - "epoch": 5.45, - "grad_norm": 0.5703125, - "learning_rate": 3.8976377952755905e-05, - "loss": 0.3343, - "step": 928 - }, - { - "epoch": 5.45, - "eval_loss": 0.35162171721458435, - "eval_runtime": 4.8953, - "eval_samples_per_second": 9.805, - "eval_steps_per_second": 1.226, - "step": 928 - }, - { - "epoch": 5.49, - "grad_norm": 0.61328125, - "learning_rate": 3.887795275590551e-05, - "loss": 0.3333, - "step": 936 - }, - { - "epoch": 5.49, - "eval_loss": 0.3531843423843384, - "eval_runtime": 4.8803, - "eval_samples_per_second": 9.835, - "eval_steps_per_second": 1.229, - "step": 936 - }, - { - "epoch": 5.54, - "grad_norm": 0.58984375, - "learning_rate": 3.877952755905512e-05, - "loss": 0.3337, - "step": 944 - }, - { - "epoch": 5.54, - "eval_loss": 0.3522440493106842, - "eval_runtime": 4.8853, - "eval_samples_per_second": 9.825, - "eval_steps_per_second": 1.228, - "step": 944 - }, - { - "epoch": 5.59, - "grad_norm": 0.64453125, - "learning_rate": 3.868110236220472e-05, - "loss": 0.3281, - "step": 952 - }, - { - "epoch": 5.59, - "eval_loss": 0.3523660898208618, - "eval_runtime": 4.8767, - "eval_samples_per_second": 9.843, - "eval_steps_per_second": 1.23, - "step": 952 - }, - { - "epoch": 5.63, - "grad_norm": 0.5625, - "learning_rate": 3.858267716535433e-05, - "loss": 0.3268, - "step": 960 - }, - { - "epoch": 5.63, - "eval_loss": 0.35167932510375977, - "eval_runtime": 4.8799, - "eval_samples_per_second": 9.836, - "eval_steps_per_second": 1.23, - "step": 960 - }, - { - "epoch": 5.68, - "grad_norm": 0.55078125, - "learning_rate": 3.8484251968503936e-05, - "loss": 0.3407, - "step": 968 - }, - { - "epoch": 5.68, - "eval_loss": 0.3501236140727997, - "eval_runtime": 4.8881, - "eval_samples_per_second": 9.82, - "eval_steps_per_second": 1.227, - "step": 968 - }, - { - "epoch": 5.73, - "grad_norm": 0.625, - "learning_rate": 3.8385826771653546e-05, - "loss": 0.3221, - "step": 976 - }, - { - "epoch": 5.73, - "eval_loss": 0.35152295231819153, - "eval_runtime": 4.8854, - "eval_samples_per_second": 9.825, - "eval_steps_per_second": 1.228, - "step": 976 - }, - { - "epoch": 5.78, - "grad_norm": 0.57421875, - "learning_rate": 3.828740157480315e-05, - "loss": 0.3506, - "step": 984 - }, - { - "epoch": 5.78, - "eval_loss": 0.3493311107158661, - "eval_runtime": 4.8909, - "eval_samples_per_second": 9.814, - "eval_steps_per_second": 1.227, - "step": 984 - }, - { - "epoch": 5.82, - "grad_norm": 0.546875, - "learning_rate": 3.818897637795276e-05, - "loss": 0.3393, - "step": 992 - }, - { - "epoch": 5.82, - "eval_loss": 0.34996774792671204, - "eval_runtime": 4.8838, - "eval_samples_per_second": 9.829, - "eval_steps_per_second": 1.229, - "step": 992 - }, - { - "epoch": 5.87, - "grad_norm": 0.61328125, - "learning_rate": 3.809055118110236e-05, - "loss": 0.3431, - "step": 1000 - }, - { - "epoch": 5.87, - "eval_loss": 0.3525948226451874, - "eval_runtime": 4.889, - "eval_samples_per_second": 9.818, - "eval_steps_per_second": 1.227, - "step": 1000 - }, - { - "epoch": 5.92, - "grad_norm": 0.5625, - "learning_rate": 3.7992125984251966e-05, - "loss": 0.3454, - "step": 1008 - }, - { - "epoch": 5.92, - "eval_loss": 0.349923700094223, - "eval_runtime": 4.8841, - "eval_samples_per_second": 9.828, - "eval_steps_per_second": 1.228, - "step": 1008 - }, - { - "epoch": 5.96, - "grad_norm": 0.6015625, - "learning_rate": 3.7893700787401576e-05, - "loss": 0.3276, - "step": 1016 - }, - { - "epoch": 5.96, - "eval_loss": 0.3507179021835327, - "eval_runtime": 4.8837, - "eval_samples_per_second": 9.829, - "eval_steps_per_second": 1.229, - "step": 1016 - }, - { - "epoch": 6.01, - "grad_norm": 0.61328125, - "learning_rate": 3.779527559055118e-05, - "loss": 0.3146, - "step": 1024 - }, - { - "epoch": 6.01, - "eval_loss": 0.3517909049987793, - "eval_runtime": 4.8786, - "eval_samples_per_second": 9.839, - "eval_steps_per_second": 1.23, - "step": 1024 - }, - { - "epoch": 6.06, - "grad_norm": 0.6875, - "learning_rate": 3.769685039370079e-05, - "loss": 0.3186, - "step": 1032 - }, - { - "epoch": 6.06, - "eval_loss": 0.3593278229236603, - "eval_runtime": 4.8816, - "eval_samples_per_second": 9.833, - "eval_steps_per_second": 1.229, - "step": 1032 - }, - { - "epoch": 6.1, - "grad_norm": 0.63671875, - "learning_rate": 3.759842519685039e-05, - "loss": 0.3197, - "step": 1040 - }, - { - "epoch": 6.1, - "eval_loss": 0.3572365343570709, - "eval_runtime": 4.8841, - "eval_samples_per_second": 9.828, - "eval_steps_per_second": 1.228, - "step": 1040 - }, - { - "epoch": 6.15, - "grad_norm": 0.625, - "learning_rate": 3.7500000000000003e-05, - "loss": 0.3081, - "step": 1048 - }, - { - "epoch": 6.15, - "eval_loss": 0.35952886939048767, - "eval_runtime": 4.8823, - "eval_samples_per_second": 9.831, - "eval_steps_per_second": 1.229, - "step": 1048 - }, - { - "epoch": 6.2, - "grad_norm": 0.69921875, - "learning_rate": 3.740157480314961e-05, - "loss": 0.3225, - "step": 1056 - }, - { - "epoch": 6.2, - "eval_loss": 0.3600703179836273, - "eval_runtime": 4.8852, - "eval_samples_per_second": 9.826, - "eval_steps_per_second": 1.228, - "step": 1056 - }, - { - "epoch": 6.25, - "grad_norm": 0.734375, - "learning_rate": 3.730314960629922e-05, - "loss": 0.3154, - "step": 1064 - }, - { - "epoch": 6.25, - "eval_loss": 0.3606225550174713, - "eval_runtime": 4.8812, - "eval_samples_per_second": 9.834, - "eval_steps_per_second": 1.229, - "step": 1064 - }, - { - "epoch": 6.29, - "grad_norm": 0.62890625, - "learning_rate": 3.720472440944882e-05, - "loss": 0.3043, - "step": 1072 - }, - { - "epoch": 6.29, - "eval_loss": 0.35906192660331726, - "eval_runtime": 4.8866, - "eval_samples_per_second": 9.823, - "eval_steps_per_second": 1.228, - "step": 1072 - }, - { - "epoch": 6.34, - "grad_norm": 0.7109375, - "learning_rate": 3.710629921259843e-05, - "loss": 0.3131, - "step": 1080 - }, - { - "epoch": 6.34, - "eval_loss": 0.3606855869293213, - "eval_runtime": 4.8845, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, - "step": 1080 - }, - { - "epoch": 6.39, - "grad_norm": 0.6640625, - "learning_rate": 3.7007874015748034e-05, - "loss": 0.3131, - "step": 1088 - }, - { - "epoch": 6.39, - "eval_loss": 0.35807374119758606, - "eval_runtime": 4.8819, - "eval_samples_per_second": 9.832, - "eval_steps_per_second": 1.229, - "step": 1088 - }, - { - "epoch": 6.43, - "grad_norm": 0.70703125, - "learning_rate": 3.690944881889764e-05, - "loss": 0.3277, - "step": 1096 - }, - { - "epoch": 6.43, - "eval_loss": 0.35696378350257874, - "eval_runtime": 4.89, - "eval_samples_per_second": 9.816, - "eval_steps_per_second": 1.227, - "step": 1096 - }, - { - "epoch": 6.48, - "grad_norm": 0.66796875, - "learning_rate": 3.681102362204725e-05, - "loss": 0.3051, - "step": 1104 - }, - { - "epoch": 6.48, - "eval_loss": 0.359957218170166, - "eval_runtime": 4.8831, - "eval_samples_per_second": 9.83, - "eval_steps_per_second": 1.229, - "step": 1104 - }, - { - "epoch": 6.53, - "grad_norm": 0.73828125, - "learning_rate": 3.671259842519685e-05, - "loss": 0.3308, - "step": 1112 - }, - { - "epoch": 6.53, - "eval_loss": 0.35956406593322754, - "eval_runtime": 4.8844, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, - "step": 1112 - }, - { - "epoch": 6.57, - "grad_norm": 0.68359375, - "learning_rate": 3.661417322834646e-05, - "loss": 0.3162, - "step": 1120 - }, - { - "epoch": 6.57, - "eval_loss": 0.3570359945297241, - "eval_runtime": 4.8827, - "eval_samples_per_second": 9.831, - "eval_steps_per_second": 1.229, - "step": 1120 - }, - { - "epoch": 6.62, - "grad_norm": 0.76953125, - "learning_rate": 3.6515748031496064e-05, - "loss": 0.3212, - "step": 1128 - }, - { - "epoch": 6.62, - "eval_loss": 0.3584875762462616, - "eval_runtime": 4.8859, - "eval_samples_per_second": 9.824, - "eval_steps_per_second": 1.228, - "step": 1128 - }, - { - "epoch": 6.67, - "grad_norm": 0.6875, - "learning_rate": 3.6417322834645674e-05, - "loss": 0.3191, - "step": 1136 - }, - { - "epoch": 6.67, - "eval_loss": 0.3554946482181549, - "eval_runtime": 4.8817, - "eval_samples_per_second": 9.833, - "eval_steps_per_second": 1.229, - "step": 1136 - }, - { - "epoch": 6.71, - "grad_norm": 0.66015625, - "learning_rate": 3.631889763779528e-05, - "loss": 0.3206, - "step": 1144 - }, - { - "epoch": 6.71, - "eval_loss": 0.3560527265071869, - "eval_runtime": 4.8883, - "eval_samples_per_second": 9.819, - "eval_steps_per_second": 1.227, - "step": 1144 - }, - { - "epoch": 6.76, - "grad_norm": 0.66796875, - "learning_rate": 3.622047244094489e-05, - "loss": 0.3411, - "step": 1152 - }, - { - "epoch": 6.76, - "eval_loss": 0.3585592210292816, - "eval_runtime": 4.8873, - "eval_samples_per_second": 9.821, - "eval_steps_per_second": 1.228, - "step": 1152 - }, - { - "epoch": 6.81, - "grad_norm": 0.62890625, - "learning_rate": 3.612204724409449e-05, - "loss": 0.3278, - "step": 1160 - }, - { - "epoch": 6.81, - "eval_loss": 0.3558274507522583, - "eval_runtime": 4.8914, - "eval_samples_per_second": 9.813, - "eval_steps_per_second": 1.227, - "step": 1160 - }, - { - "epoch": 6.86, - "grad_norm": 0.62890625, - "learning_rate": 3.60236220472441e-05, - "loss": 0.3273, - "step": 1168 - }, - { - "epoch": 6.86, - "eval_loss": 0.3541262447834015, - "eval_runtime": 4.8855, - "eval_samples_per_second": 9.825, - "eval_steps_per_second": 1.228, - "step": 1168 - }, - { - "epoch": 6.9, - "grad_norm": 0.72265625, - "learning_rate": 3.59251968503937e-05, - "loss": 0.3275, - "step": 1176 - }, - { - "epoch": 6.9, - "eval_loss": 0.35615813732147217, - "eval_runtime": 4.8897, - "eval_samples_per_second": 9.817, - "eval_steps_per_second": 1.227, - "step": 1176 - }, - { - "epoch": 6.95, - "grad_norm": 0.69140625, - "learning_rate": 3.582677165354331e-05, - "loss": 0.3283, - "step": 1184 - }, - { - "epoch": 6.95, - "eval_loss": 0.35849082469940186, - "eval_runtime": 4.8881, - "eval_samples_per_second": 9.82, - "eval_steps_per_second": 1.227, - "step": 1184 - }, - { - "epoch": 7.0, - "grad_norm": 0.6328125, - "learning_rate": 3.572834645669291e-05, - "loss": 0.3107, - "step": 1192 - }, - { - "epoch": 7.0, - "eval_loss": 0.3558361530303955, - "eval_runtime": 4.8883, - "eval_samples_per_second": 9.819, - "eval_steps_per_second": 1.227, - "step": 1192 - }, - { - "epoch": 7.04, - "grad_norm": 0.859375, - "learning_rate": 3.562992125984252e-05, - "loss": 0.3027, - "step": 1200 - }, - { - "epoch": 7.04, - "eval_loss": 0.37191081047058105, - "eval_runtime": 4.8846, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, - "step": 1200 - }, - { - "epoch": 7.09, - "grad_norm": 0.7890625, - "learning_rate": 3.5531496062992125e-05, - "loss": 0.2883, - "step": 1208 - }, - { - "epoch": 7.09, - "eval_loss": 0.3696741759777069, - "eval_runtime": 4.8811, - "eval_samples_per_second": 9.834, - "eval_steps_per_second": 1.229, - "step": 1208 - }, - { - "epoch": 7.14, - "grad_norm": 0.73046875, - "learning_rate": 3.5433070866141735e-05, - "loss": 0.2871, - "step": 1216 - }, - { - "epoch": 7.14, - "eval_loss": 0.36751437187194824, - "eval_runtime": 4.8847, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, - "step": 1216 - }, - { - "epoch": 7.18, - "grad_norm": 0.74609375, - "learning_rate": 3.533464566929134e-05, - "loss": 0.3094, - "step": 1224 - }, - { - "epoch": 7.18, - "eval_loss": 0.3699875771999359, - "eval_runtime": 4.8886, - "eval_samples_per_second": 9.819, - "eval_steps_per_second": 1.227, - "step": 1224 - }, - { - "epoch": 7.23, - "grad_norm": 0.765625, - "learning_rate": 3.523622047244094e-05, - "loss": 0.3026, - "step": 1232 - }, - { - "epoch": 7.23, - "eval_loss": 0.368384450674057, - "eval_runtime": 4.8856, - "eval_samples_per_second": 9.825, - "eval_steps_per_second": 1.228, - "step": 1232 - }, - { - "epoch": 7.28, - "grad_norm": 0.7890625, - "learning_rate": 3.513779527559055e-05, - "loss": 0.2984, - "step": 1240 - }, - { - "epoch": 7.28, - "eval_loss": 0.3673759400844574, - "eval_runtime": 4.8881, - "eval_samples_per_second": 9.82, - "eval_steps_per_second": 1.227, - "step": 1240 - }, - { - "epoch": 7.33, - "grad_norm": 0.76953125, - "learning_rate": 3.5039370078740156e-05, - "loss": 0.3057, - "step": 1248 - }, - { - "epoch": 7.33, - "eval_loss": 0.37147435545921326, - "eval_runtime": 4.8851, - "eval_samples_per_second": 9.826, - "eval_steps_per_second": 1.228, - "step": 1248 - }, - { - "epoch": 7.37, - "grad_norm": 0.77734375, - "learning_rate": 3.4940944881889766e-05, - "loss": 0.277, - "step": 1256 - }, - { - "epoch": 7.37, - "eval_loss": 0.3669518232345581, - "eval_runtime": 4.8848, - "eval_samples_per_second": 9.826, - "eval_steps_per_second": 1.228, - "step": 1256 - }, - { - "epoch": 7.42, - "grad_norm": 0.71484375, - "learning_rate": 3.484251968503937e-05, - "loss": 0.2999, - "step": 1264 - }, - { - "epoch": 7.42, - "eval_loss": 0.3678833246231079, - "eval_runtime": 4.8848, - "eval_samples_per_second": 9.826, - "eval_steps_per_second": 1.228, - "step": 1264 - }, - { - "epoch": 7.47, - "grad_norm": 0.66015625, - "learning_rate": 3.474409448818898e-05, - "loss": 0.3124, - "step": 1272 - }, - { - "epoch": 7.47, - "eval_loss": 0.3651003837585449, - "eval_runtime": 4.8819, - "eval_samples_per_second": 9.832, - "eval_steps_per_second": 1.229, - "step": 1272 - }, - { - "epoch": 7.51, - "grad_norm": 0.8359375, - "learning_rate": 3.464566929133858e-05, - "loss": 0.2919, - "step": 1280 - }, - { - "epoch": 7.51, - "eval_loss": 0.36933448910713196, - "eval_runtime": 4.8856, - "eval_samples_per_second": 9.825, - "eval_steps_per_second": 1.228, - "step": 1280 - }, - { - "epoch": 7.56, - "grad_norm": 0.84375, - "learning_rate": 3.454724409448819e-05, - "loss": 0.3111, - "step": 1288 - }, - { - "epoch": 7.56, - "eval_loss": 0.3658764362335205, - "eval_runtime": 4.8883, - "eval_samples_per_second": 9.819, - "eval_steps_per_second": 1.227, - "step": 1288 - }, - { - "epoch": 7.61, - "grad_norm": 0.7890625, - "learning_rate": 3.4448818897637796e-05, - "loss": 0.3025, - "step": 1296 - }, - { - "epoch": 7.61, - "eval_loss": 0.36660221219062805, - "eval_runtime": 4.8886, - "eval_samples_per_second": 9.819, - "eval_steps_per_second": 1.227, - "step": 1296 - }, - { - "epoch": 7.65, - "grad_norm": 0.78125, - "learning_rate": 3.4350393700787406e-05, - "loss": 0.3028, - "step": 1304 - }, - { - "epoch": 7.65, - "eval_loss": 0.36641809344291687, - "eval_runtime": 4.8816, - "eval_samples_per_second": 9.833, - "eval_steps_per_second": 1.229, - "step": 1304 - }, - { - "epoch": 7.7, - "grad_norm": 0.765625, - "learning_rate": 3.425196850393701e-05, - "loss": 0.3075, - "step": 1312 - }, - { - "epoch": 7.7, - "eval_loss": 0.362444132566452, - "eval_runtime": 4.8873, - "eval_samples_per_second": 9.821, - "eval_steps_per_second": 1.228, - "step": 1312 - }, - { - "epoch": 7.75, - "grad_norm": 0.765625, - "learning_rate": 3.415354330708661e-05, - "loss": 0.3134, - "step": 1320 - }, - { - "epoch": 7.75, - "eval_loss": 0.36705541610717773, - "eval_runtime": 4.8843, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, - "step": 1320 - }, - { - "epoch": 7.79, - "grad_norm": 0.7265625, - "learning_rate": 3.405511811023622e-05, - "loss": 0.299, - "step": 1328 - }, - { - "epoch": 7.79, - "eval_loss": 0.366720050573349, - "eval_runtime": 4.8831, - "eval_samples_per_second": 9.83, - "eval_steps_per_second": 1.229, - "step": 1328 - }, - { - "epoch": 7.84, - "grad_norm": 0.671875, - "learning_rate": 3.3956692913385827e-05, - "loss": 0.3067, - "step": 1336 - }, - { - "epoch": 7.84, - "eval_loss": 0.3674195110797882, - "eval_runtime": 4.8885, - "eval_samples_per_second": 9.819, - "eval_steps_per_second": 1.227, - "step": 1336 - }, - { - "epoch": 7.89, - "grad_norm": 0.75, - "learning_rate": 3.385826771653544e-05, - "loss": 0.3073, - "step": 1344 - }, - { - "epoch": 7.89, - "eval_loss": 0.3649815618991852, - "eval_runtime": 4.8905, - "eval_samples_per_second": 9.815, - "eval_steps_per_second": 1.227, - "step": 1344 - }, - { - "epoch": 7.94, - "grad_norm": 0.796875, - "learning_rate": 3.375984251968504e-05, - "loss": 0.3187, - "step": 1352 - }, - { - "epoch": 7.94, - "eval_loss": 0.3641921281814575, - "eval_runtime": 4.8875, - "eval_samples_per_second": 9.821, - "eval_steps_per_second": 1.228, - "step": 1352 - }, - { - "epoch": 7.98, - "grad_norm": 0.83203125, - "learning_rate": 3.366141732283465e-05, - "loss": 0.2982, - "step": 1360 - }, - { - "epoch": 7.98, - "eval_loss": 0.36615875363349915, - "eval_runtime": 4.8874, - "eval_samples_per_second": 9.821, - "eval_steps_per_second": 1.228, - "step": 1360 - }, - { - "epoch": 8.03, - "grad_norm": 0.77734375, - "learning_rate": 3.3562992125984254e-05, - "loss": 0.2816, - "step": 1368 - }, - { - "epoch": 8.03, - "eval_loss": 0.3784830570220947, - "eval_runtime": 4.8814, - "eval_samples_per_second": 9.833, - "eval_steps_per_second": 1.229, - "step": 1368 - }, - { - "epoch": 8.08, - "grad_norm": 0.8984375, - "learning_rate": 3.3464566929133864e-05, - "loss": 0.2811, - "step": 1376 - }, - { - "epoch": 8.08, - "eval_loss": 0.37378013134002686, - "eval_runtime": 4.8805, - "eval_samples_per_second": 9.835, - "eval_steps_per_second": 1.229, - "step": 1376 - }, - { - "epoch": 8.12, - "grad_norm": 0.86328125, - "learning_rate": 3.336614173228347e-05, - "loss": 0.2771, - "step": 1384 - }, - { - "epoch": 8.12, - "eval_loss": 0.37527838349342346, - "eval_runtime": 4.8863, - "eval_samples_per_second": 9.823, - "eval_steps_per_second": 1.228, - "step": 1384 - }, - { - "epoch": 8.17, - "grad_norm": 0.95703125, - "learning_rate": 3.326771653543308e-05, - "loss": 0.2884, - "step": 1392 - }, - { - "epoch": 8.17, - "eval_loss": 0.37870660424232483, - "eval_runtime": 4.8831, - "eval_samples_per_second": 9.83, - "eval_steps_per_second": 1.229, - "step": 1392 - }, - { - "epoch": 8.22, - "grad_norm": 0.99609375, - "learning_rate": 3.316929133858268e-05, - "loss": 0.2671, - "step": 1400 - }, - { - "epoch": 8.22, - "eval_loss": 0.3793374001979828, - "eval_runtime": 4.8799, - "eval_samples_per_second": 9.836, - "eval_steps_per_second": 1.23, - "step": 1400 - }, - { - "epoch": 8.26, - "grad_norm": 0.95703125, - "learning_rate": 3.3070866141732284e-05, - "loss": 0.2861, - "step": 1408 - }, - { - "epoch": 8.26, - "eval_loss": 0.38111141324043274, - "eval_runtime": 4.8809, - "eval_samples_per_second": 9.834, - "eval_steps_per_second": 1.229, - "step": 1408 - }, - { - "epoch": 8.31, - "grad_norm": 0.83203125, - "learning_rate": 3.2972440944881894e-05, - "loss": 0.29, - "step": 1416 - }, - { - "epoch": 8.31, - "eval_loss": 0.3800075948238373, - "eval_runtime": 4.8832, - "eval_samples_per_second": 9.83, - "eval_steps_per_second": 1.229, - "step": 1416 - }, - { - "epoch": 8.36, - "grad_norm": 0.8515625, - "learning_rate": 3.28740157480315e-05, - "loss": 0.2739, - "step": 1424 - }, - { - "epoch": 8.36, - "eval_loss": 0.37995681166648865, - "eval_runtime": 4.8873, - "eval_samples_per_second": 9.821, - "eval_steps_per_second": 1.228, - "step": 1424 - }, - { - "epoch": 8.4, - "grad_norm": 0.84375, - "learning_rate": 3.27755905511811e-05, - "loss": 0.2803, - "step": 1432 - }, - { - "epoch": 8.4, - "eval_loss": 0.37614905834198, - "eval_runtime": 4.8889, - "eval_samples_per_second": 9.818, - "eval_steps_per_second": 1.227, - "step": 1432 - }, - { - "epoch": 8.45, - "grad_norm": 1.0703125, - "learning_rate": 3.2677165354330704e-05, - "loss": 0.282, - "step": 1440 - }, - { - "epoch": 8.45, - "eval_loss": 0.37973472476005554, - "eval_runtime": 4.8822, - "eval_samples_per_second": 9.832, - "eval_steps_per_second": 1.229, - "step": 1440 - }, - { - "epoch": 8.5, - "grad_norm": 0.87890625, - "learning_rate": 3.2578740157480314e-05, - "loss": 0.2858, - "step": 1448 - }, - { - "epoch": 8.5, - "eval_loss": 0.3786061704158783, - "eval_runtime": 4.8817, - "eval_samples_per_second": 9.833, - "eval_steps_per_second": 1.229, - "step": 1448 - }, - { - "epoch": 8.55, - "grad_norm": 0.87109375, - "learning_rate": 3.248031496062992e-05, - "loss": 0.2853, - "step": 1456 - }, - { - "epoch": 8.55, - "eval_loss": 0.38181185722351074, - "eval_runtime": 4.8849, - "eval_samples_per_second": 9.826, - "eval_steps_per_second": 1.228, - "step": 1456 - }, - { - "epoch": 8.59, - "grad_norm": 0.79296875, - "learning_rate": 3.238188976377953e-05, - "loss": 0.2978, - "step": 1464 - }, - { - "epoch": 8.59, - "eval_loss": 0.3762815296649933, - "eval_runtime": 4.8805, - "eval_samples_per_second": 9.835, - "eval_steps_per_second": 1.229, - "step": 1464 - }, - { - "epoch": 8.64, - "grad_norm": 0.90625, - "learning_rate": 3.228346456692913e-05, - "loss": 0.2948, - "step": 1472 - }, - { - "epoch": 8.64, - "eval_loss": 0.3758275508880615, - "eval_runtime": 4.8797, - "eval_samples_per_second": 9.837, - "eval_steps_per_second": 1.23, - "step": 1472 - }, - { - "epoch": 8.69, - "grad_norm": 0.953125, - "learning_rate": 3.218503937007874e-05, - "loss": 0.2876, - "step": 1480 - }, - { - "epoch": 8.69, - "eval_loss": 0.3739815950393677, - "eval_runtime": 4.8829, - "eval_samples_per_second": 9.83, - "eval_steps_per_second": 1.229, - "step": 1480 - }, - { - "epoch": 8.73, - "grad_norm": 0.94921875, - "learning_rate": 3.2086614173228345e-05, - "loss": 0.27, - "step": 1488 - }, - { - "epoch": 8.73, - "eval_loss": 0.38057199120521545, - "eval_runtime": 4.8809, - "eval_samples_per_second": 9.834, - "eval_steps_per_second": 1.229, - "step": 1488 - }, - { - "epoch": 8.78, - "grad_norm": 0.9609375, - "learning_rate": 3.1988188976377955e-05, - "loss": 0.2898, - "step": 1496 - }, - { - "epoch": 8.78, - "eval_loss": 0.3779284954071045, - "eval_runtime": 4.8853, - "eval_samples_per_second": 9.825, - "eval_steps_per_second": 1.228, - "step": 1496 - }, - { - "epoch": 8.83, - "grad_norm": 0.80078125, - "learning_rate": 3.188976377952756e-05, - "loss": 0.2696, - "step": 1504 - }, - { - "epoch": 8.83, - "eval_loss": 0.3726881742477417, - "eval_runtime": 4.8862, - "eval_samples_per_second": 9.824, - "eval_steps_per_second": 1.228, - "step": 1504 - }, - { - "epoch": 8.87, - "grad_norm": 0.92578125, - "learning_rate": 3.179133858267717e-05, - "loss": 0.2998, - "step": 1512 - }, - { - "epoch": 8.87, - "eval_loss": 0.37989136576652527, - "eval_runtime": 4.8764, - "eval_samples_per_second": 9.843, - "eval_steps_per_second": 1.23, - "step": 1512 - }, - { - "epoch": 8.92, - "grad_norm": 0.96875, - "learning_rate": 3.169291338582677e-05, - "loss": 0.2733, - "step": 1520 - }, - { - "epoch": 8.92, - "eval_loss": 0.37501704692840576, - "eval_runtime": 4.8844, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, - "step": 1520 - }, - { - "epoch": 8.97, - "grad_norm": 0.87109375, - "learning_rate": 3.159448818897638e-05, - "loss": 0.2873, - "step": 1528 - }, - { - "epoch": 8.97, - "eval_loss": 0.37481260299682617, - "eval_runtime": 4.88, - "eval_samples_per_second": 9.836, - "eval_steps_per_second": 1.23, - "step": 1528 - }, - { - "epoch": 9.02, - "grad_norm": 0.76171875, - "learning_rate": 3.1496062992125985e-05, - "loss": 0.2747, - "step": 1536 - }, - { - "epoch": 9.02, - "eval_loss": 0.3823326528072357, - "eval_runtime": 4.8824, - "eval_samples_per_second": 9.831, - "eval_steps_per_second": 1.229, - "step": 1536 - }, - { - "epoch": 9.06, - "grad_norm": 1.0625, - "learning_rate": 3.139763779527559e-05, - "loss": 0.266, - "step": 1544 - }, - { - "epoch": 9.06, - "eval_loss": 0.38916265964508057, - "eval_runtime": 4.88, - "eval_samples_per_second": 9.836, - "eval_steps_per_second": 1.229, - "step": 1544 - }, - { - "epoch": 9.11, - "grad_norm": 1.0234375, - "learning_rate": 3.12992125984252e-05, - "loss": 0.264, - "step": 1552 - }, - { - "epoch": 9.11, - "eval_loss": 0.3951665163040161, - "eval_runtime": 4.8839, - "eval_samples_per_second": 9.828, - "eval_steps_per_second": 1.229, - "step": 1552 - }, - { - "epoch": 9.16, - "grad_norm": 1.046875, - "learning_rate": 3.12007874015748e-05, - "loss": 0.2638, - "step": 1560 - }, - { - "epoch": 9.16, - "eval_loss": 0.3967585861682892, - "eval_runtime": 4.8805, - "eval_samples_per_second": 9.835, - "eval_steps_per_second": 1.229, - "step": 1560 - }, - { - "epoch": 9.2, - "grad_norm": 0.88671875, - "learning_rate": 3.110236220472441e-05, - "loss": 0.2597, - "step": 1568 - }, - { - "epoch": 9.2, - "eval_loss": 0.39212462306022644, - "eval_runtime": 4.8807, - "eval_samples_per_second": 9.835, - "eval_steps_per_second": 1.229, - "step": 1568 - }, - { - "epoch": 9.25, - "grad_norm": 1.1015625, - "learning_rate": 3.1003937007874016e-05, - "loss": 0.2663, - "step": 1576 - }, - { - "epoch": 9.25, - "eval_loss": 0.3920614421367645, - "eval_runtime": 4.8778, - "eval_samples_per_second": 9.841, - "eval_steps_per_second": 1.23, - "step": 1576 - }, - { - "epoch": 9.3, - "grad_norm": 1.0546875, - "learning_rate": 3.0905511811023626e-05, - "loss": 0.2518, - "step": 1584 - }, - { - "epoch": 9.3, - "eval_loss": 0.393242746591568, - "eval_runtime": 4.8823, - "eval_samples_per_second": 9.831, - "eval_steps_per_second": 1.229, - "step": 1584 - }, - { - "epoch": 9.34, - "grad_norm": 0.98046875, - "learning_rate": 3.080708661417323e-05, - "loss": 0.261, - "step": 1592 - }, - { - "epoch": 9.34, - "eval_loss": 0.39824220538139343, - "eval_runtime": 4.8878, - "eval_samples_per_second": 9.82, - "eval_steps_per_second": 1.228, - "step": 1592 - }, - { - "epoch": 9.39, - "grad_norm": 0.9609375, - "learning_rate": 3.070866141732284e-05, - "loss": 0.2499, - "step": 1600 - }, - { - "epoch": 9.39, - "eval_loss": 0.3865773677825928, - "eval_runtime": 4.8853, - "eval_samples_per_second": 9.825, - "eval_steps_per_second": 1.228, - "step": 1600 - }, - { - "epoch": 9.44, - "grad_norm": 1.046875, - "learning_rate": 3.061023622047244e-05, - "loss": 0.2721, - "step": 1608 - }, - { - "epoch": 9.44, - "eval_loss": 0.387246698141098, - "eval_runtime": 4.8866, - "eval_samples_per_second": 9.823, - "eval_steps_per_second": 1.228, - "step": 1608 - }, - { - "epoch": 9.48, - "grad_norm": 1.0703125, - "learning_rate": 3.051181102362205e-05, - "loss": 0.2621, - "step": 1616 - }, - { - "epoch": 9.48, - "eval_loss": 0.3960321843624115, - "eval_runtime": 4.8825, - "eval_samples_per_second": 9.831, - "eval_steps_per_second": 1.229, - "step": 1616 - }, - { - "epoch": 9.53, - "grad_norm": 0.8671875, - "learning_rate": 3.0413385826771656e-05, - "loss": 0.2597, - "step": 1624 - }, - { - "epoch": 9.53, - "eval_loss": 0.3888339102268219, - "eval_runtime": 4.8833, - "eval_samples_per_second": 9.829, - "eval_steps_per_second": 1.229, - "step": 1624 - }, - { - "epoch": 9.58, - "grad_norm": 1.0703125, - "learning_rate": 3.0314960629921263e-05, - "loss": 0.2549, - "step": 1632 - }, - { - "epoch": 9.58, - "eval_loss": 0.3892790377140045, - "eval_runtime": 4.8822, - "eval_samples_per_second": 9.832, - "eval_steps_per_second": 1.229, - "step": 1632 - }, - { - "epoch": 9.63, - "grad_norm": 1.1015625, - "learning_rate": 3.021653543307087e-05, - "loss": 0.2712, - "step": 1640 - }, - { - "epoch": 9.63, - "eval_loss": 0.3891643285751343, - "eval_runtime": 4.8808, - "eval_samples_per_second": 9.834, - "eval_steps_per_second": 1.229, - "step": 1640 - }, - { - "epoch": 9.67, - "grad_norm": 0.9921875, - "learning_rate": 3.0118110236220477e-05, - "loss": 0.2586, - "step": 1648 - }, - { - "epoch": 9.67, - "eval_loss": 0.38737013936042786, - "eval_runtime": 4.88, - "eval_samples_per_second": 9.836, - "eval_steps_per_second": 1.23, - "step": 1648 - }, - { - "epoch": 9.72, - "grad_norm": 1.0625, - "learning_rate": 3.0019685039370083e-05, - "loss": 0.2572, - "step": 1656 - }, - { - "epoch": 9.72, - "eval_loss": 0.3896384537220001, - "eval_runtime": 4.879, - "eval_samples_per_second": 9.838, - "eval_steps_per_second": 1.23, - "step": 1656 - }, - { - "epoch": 9.77, - "grad_norm": 0.98046875, - "learning_rate": 2.992125984251969e-05, - "loss": 0.2642, - "step": 1664 - }, - { - "epoch": 9.77, - "eval_loss": 0.39220455288887024, - "eval_runtime": 4.8805, - "eval_samples_per_second": 9.835, - "eval_steps_per_second": 1.229, - "step": 1664 - }, - { - "epoch": 9.81, - "grad_norm": 0.9609375, - "learning_rate": 2.9822834645669297e-05, - "loss": 0.2555, - "step": 1672 - }, - { - "epoch": 9.81, - "eval_loss": 0.3866095244884491, - "eval_runtime": 4.8806, - "eval_samples_per_second": 9.835, - "eval_steps_per_second": 1.229, - "step": 1672 - }, - { - "epoch": 9.86, - "grad_norm": 0.97265625, - "learning_rate": 2.97244094488189e-05, - "loss": 0.2639, - "step": 1680 - }, - { - "epoch": 9.86, - "eval_loss": 0.3891311585903168, - "eval_runtime": 4.8849, - "eval_samples_per_second": 9.826, - "eval_steps_per_second": 1.228, - "step": 1680 - }, - { - "epoch": 9.91, - "grad_norm": 1.171875, - "learning_rate": 2.9625984251968504e-05, - "loss": 0.2634, - "step": 1688 - }, - { - "epoch": 9.91, - "eval_loss": 0.38789328932762146, - "eval_runtime": 4.882, - "eval_samples_per_second": 9.832, - "eval_steps_per_second": 1.229, - "step": 1688 - }, - { - "epoch": 9.95, - "grad_norm": 0.953125, - "learning_rate": 2.952755905511811e-05, - "loss": 0.2638, - "step": 1696 - }, - { - "epoch": 9.95, - "eval_loss": 0.38731780648231506, - "eval_runtime": 4.8797, - "eval_samples_per_second": 9.837, - "eval_steps_per_second": 1.23, - "step": 1696 - }, - { - "epoch": 10.0, - "grad_norm": 1.03125, - "learning_rate": 2.9429133858267717e-05, - "loss": 0.2873, - "step": 1704 - }, - { - "epoch": 10.0, - "eval_loss": 0.3895009458065033, - "eval_runtime": 4.8797, - "eval_samples_per_second": 9.837, - "eval_steps_per_second": 1.23, - "step": 1704 - }, - { - "epoch": 10.05, - "grad_norm": 1.1796875, - "learning_rate": 2.933070866141732e-05, - "loss": 0.23, - "step": 1712 - }, - { - "epoch": 10.05, - "eval_loss": 0.41275954246520996, - "eval_runtime": 4.8847, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, - "step": 1712 - }, - { - "epoch": 10.1, - "grad_norm": 1.09375, - "learning_rate": 2.9232283464566927e-05, - "loss": 0.2327, - "step": 1720 - }, - { - "epoch": 10.1, - "eval_loss": 0.41290533542633057, - "eval_runtime": 4.8825, - "eval_samples_per_second": 9.831, - "eval_steps_per_second": 1.229, - "step": 1720 - }, - { - "epoch": 10.14, - "grad_norm": 1.1015625, - "learning_rate": 2.9133858267716534e-05, - "loss": 0.2434, - "step": 1728 - }, - { - "epoch": 10.14, - "eval_loss": 0.4091669023036957, - "eval_runtime": 4.8855, - "eval_samples_per_second": 9.825, - "eval_steps_per_second": 1.228, - "step": 1728 - }, - { - "epoch": 10.19, - "grad_norm": 1.21875, - "learning_rate": 2.903543307086614e-05, - "loss": 0.2254, - "step": 1736 - }, - { - "epoch": 10.19, - "eval_loss": 0.4104800224304199, - "eval_runtime": 4.8818, - "eval_samples_per_second": 9.832, - "eval_steps_per_second": 1.229, - "step": 1736 - }, - { - "epoch": 10.24, - "grad_norm": 1.015625, - "learning_rate": 2.8937007874015748e-05, - "loss": 0.2461, - "step": 1744 - }, - { - "epoch": 10.24, - "eval_loss": 0.4064279794692993, - "eval_runtime": 4.883, - "eval_samples_per_second": 9.83, - "eval_steps_per_second": 1.229, - "step": 1744 - }, - { - "epoch": 10.28, - "grad_norm": 1.046875, - "learning_rate": 2.8838582677165354e-05, - "loss": 0.2318, - "step": 1752 - }, - { - "epoch": 10.28, - "eval_loss": 0.4084310829639435, - "eval_runtime": 4.8848, - "eval_samples_per_second": 9.826, - "eval_steps_per_second": 1.228, - "step": 1752 - }, - { - "epoch": 10.33, - "grad_norm": 1.1171875, - "learning_rate": 2.874015748031496e-05, - "loss": 0.2486, - "step": 1760 - }, - { - "epoch": 10.33, - "eval_loss": 0.40453585982322693, - "eval_runtime": 4.8825, - "eval_samples_per_second": 9.831, - "eval_steps_per_second": 1.229, - "step": 1760 - }, - { - "epoch": 10.38, - "grad_norm": 1.0546875, - "learning_rate": 2.8641732283464568e-05, - "loss": 0.2425, - "step": 1768 - }, - { - "epoch": 10.38, - "eval_loss": 0.4052884578704834, - "eval_runtime": 4.8845, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, - "step": 1768 - }, - { - "epoch": 10.42, - "grad_norm": 1.2734375, - "learning_rate": 2.8543307086614175e-05, - "loss": 0.2369, - "step": 1776 - }, - { - "epoch": 10.42, - "eval_loss": 0.4092380702495575, - "eval_runtime": 4.8838, - "eval_samples_per_second": 9.828, - "eval_steps_per_second": 1.229, - "step": 1776 - }, - { - "epoch": 10.47, - "grad_norm": 1.234375, - "learning_rate": 2.844488188976378e-05, - "loss": 0.226, - "step": 1784 - }, - { - "epoch": 10.47, - "eval_loss": 0.40878424048423767, - "eval_runtime": 4.8818, - "eval_samples_per_second": 9.833, - "eval_steps_per_second": 1.229, - "step": 1784 - }, - { - "epoch": 10.52, - "grad_norm": 1.0234375, - "learning_rate": 2.8346456692913388e-05, - "loss": 0.242, - "step": 1792 - }, - { - "epoch": 10.52, - "eval_loss": 0.40601250529289246, - "eval_runtime": 4.889, - "eval_samples_per_second": 9.818, - "eval_steps_per_second": 1.227, - "step": 1792 - }, - { - "epoch": 10.56, - "grad_norm": 1.09375, - "learning_rate": 2.824803149606299e-05, - "loss": 0.261, - "step": 1800 - }, - { - "epoch": 10.56, - "eval_loss": 0.4090254604816437, - "eval_runtime": 4.877, - "eval_samples_per_second": 9.842, - "eval_steps_per_second": 1.23, - "step": 1800 - }, - { - "epoch": 10.61, - "grad_norm": 1.1015625, - "learning_rate": 2.81496062992126e-05, - "loss": 0.2375, - "step": 1808 - }, - { - "epoch": 10.61, - "eval_loss": 0.4072287082672119, - "eval_runtime": 4.8878, - "eval_samples_per_second": 9.82, - "eval_steps_per_second": 1.228, - "step": 1808 - }, - { - "epoch": 10.66, - "grad_norm": 1.046875, - "learning_rate": 2.8051181102362205e-05, - "loss": 0.2557, - "step": 1816 - }, - { - "epoch": 10.66, - "eval_loss": 0.4026179611682892, - "eval_runtime": 4.8857, - "eval_samples_per_second": 9.825, - "eval_steps_per_second": 1.228, - "step": 1816 - }, - { - "epoch": 10.71, - "grad_norm": 1.15625, - "learning_rate": 2.7952755905511812e-05, - "loss": 0.2389, - "step": 1824 - }, - { - "epoch": 10.71, - "eval_loss": 0.4051065742969513, - "eval_runtime": 4.879, - "eval_samples_per_second": 9.838, - "eval_steps_per_second": 1.23, - "step": 1824 - }, - { - "epoch": 10.75, - "grad_norm": 1.375, - "learning_rate": 2.785433070866142e-05, - "loss": 0.2481, - "step": 1832 - }, - { - "epoch": 10.75, - "eval_loss": 0.4018687307834625, - "eval_runtime": 4.8831, - "eval_samples_per_second": 9.83, - "eval_steps_per_second": 1.229, - "step": 1832 - }, - { - "epoch": 10.8, - "grad_norm": 1.046875, - "learning_rate": 2.7755905511811025e-05, - "loss": 0.2612, - "step": 1840 - }, - { - "epoch": 10.8, - "eval_loss": 0.40441223978996277, - "eval_runtime": 4.8793, - "eval_samples_per_second": 9.837, - "eval_steps_per_second": 1.23, - "step": 1840 - }, - { - "epoch": 10.85, - "grad_norm": 1.2890625, - "learning_rate": 2.7657480314960632e-05, - "loss": 0.2487, - "step": 1848 - }, - { - "epoch": 10.85, - "eval_loss": 0.40596404671669006, - "eval_runtime": 4.8789, - "eval_samples_per_second": 9.838, - "eval_steps_per_second": 1.23, - "step": 1848 - }, - { - "epoch": 10.89, - "grad_norm": 1.109375, - "learning_rate": 2.755905511811024e-05, - "loss": 0.2339, - "step": 1856 - }, - { - "epoch": 10.89, - "eval_loss": 0.4074425995349884, - "eval_runtime": 4.8799, - "eval_samples_per_second": 9.836, - "eval_steps_per_second": 1.23, - "step": 1856 - }, - { - "epoch": 10.94, - "grad_norm": 1.046875, - "learning_rate": 2.7460629921259846e-05, - "loss": 0.2356, - "step": 1864 - }, - { - "epoch": 10.94, - "eval_loss": 0.4032224714756012, - "eval_runtime": 4.8798, - "eval_samples_per_second": 9.836, - "eval_steps_per_second": 1.23, - "step": 1864 - }, - { - "epoch": 10.99, - "grad_norm": 1.125, - "learning_rate": 2.7362204724409452e-05, - "loss": 0.2452, - "step": 1872 - }, - { - "epoch": 10.99, - "eval_loss": 0.40676093101501465, - "eval_runtime": 4.8865, - "eval_samples_per_second": 9.823, - "eval_steps_per_second": 1.228, - "step": 1872 - }, - { - "epoch": 11.03, - "grad_norm": 1.2890625, - "learning_rate": 2.726377952755906e-05, - "loss": 0.2151, - "step": 1880 - }, - { - "epoch": 11.03, - "eval_loss": 0.4335891306400299, - "eval_runtime": 4.8848, - "eval_samples_per_second": 9.826, - "eval_steps_per_second": 1.228, - "step": 1880 - }, - { - "epoch": 11.08, - "grad_norm": 1.1796875, - "learning_rate": 2.7165354330708666e-05, - "loss": 0.2191, - "step": 1888 - }, - { - "epoch": 11.08, - "eval_loss": 0.4158717095851898, - "eval_runtime": 4.8824, - "eval_samples_per_second": 9.831, - "eval_steps_per_second": 1.229, - "step": 1888 - }, - { - "epoch": 11.13, - "grad_norm": 1.15625, - "learning_rate": 2.706692913385827e-05, - "loss": 0.2186, - "step": 1896 - }, - { - "epoch": 11.13, - "eval_loss": 0.41939032077789307, - "eval_runtime": 4.8848, - "eval_samples_per_second": 9.826, - "eval_steps_per_second": 1.228, - "step": 1896 - }, - { - "epoch": 11.18, - "grad_norm": 1.1875, - "learning_rate": 2.6968503937007876e-05, - "loss": 0.2177, - "step": 1904 - }, - { - "epoch": 11.18, - "eval_loss": 0.4248329699039459, - "eval_runtime": 4.8806, - "eval_samples_per_second": 9.835, - "eval_steps_per_second": 1.229, - "step": 1904 - }, - { - "epoch": 11.22, - "grad_norm": 1.28125, - "learning_rate": 2.6870078740157483e-05, - "loss": 0.2203, - "step": 1912 - }, - { - "epoch": 11.22, - "eval_loss": 0.4253775179386139, - "eval_runtime": 4.878, - "eval_samples_per_second": 9.84, - "eval_steps_per_second": 1.23, - "step": 1912 - }, - { - "epoch": 11.27, - "grad_norm": 1.265625, - "learning_rate": 2.677165354330709e-05, - "loss": 0.2151, - "step": 1920 - }, - { - "epoch": 11.27, - "eval_loss": 0.4222745895385742, - "eval_runtime": 4.8805, - "eval_samples_per_second": 9.835, - "eval_steps_per_second": 1.229, - "step": 1920 - }, - { - "epoch": 11.32, - "grad_norm": 1.2109375, - "learning_rate": 2.6673228346456696e-05, - "loss": 0.2197, - "step": 1928 - }, - { - "epoch": 11.32, - "eval_loss": 0.42230936884880066, - "eval_runtime": 4.8826, - "eval_samples_per_second": 9.831, - "eval_steps_per_second": 1.229, - "step": 1928 - }, - { - "epoch": 11.36, - "grad_norm": 1.3515625, - "learning_rate": 2.6574803149606303e-05, - "loss": 0.2138, - "step": 1936 - }, - { - "epoch": 11.36, - "eval_loss": 0.4267289936542511, - "eval_runtime": 4.8854, - "eval_samples_per_second": 9.825, - "eval_steps_per_second": 1.228, - "step": 1936 - }, - { - "epoch": 11.41, - "grad_norm": 1.2421875, - "learning_rate": 2.6476377952755903e-05, - "loss": 0.2217, - "step": 1944 - }, - { - "epoch": 11.41, - "eval_loss": 0.4230763018131256, - "eval_runtime": 4.8832, - "eval_samples_per_second": 9.83, - "eval_steps_per_second": 1.229, - "step": 1944 - }, - { - "epoch": 11.46, - "grad_norm": 1.1640625, - "learning_rate": 2.637795275590551e-05, - "loss": 0.2189, - "step": 1952 - }, - { - "epoch": 11.46, - "eval_loss": 0.42412805557250977, - "eval_runtime": 4.8848, - "eval_samples_per_second": 9.826, - "eval_steps_per_second": 1.228, - "step": 1952 - }, - { - "epoch": 11.5, - "grad_norm": 1.2265625, - "learning_rate": 2.6279527559055117e-05, - "loss": 0.2336, - "step": 1960 - }, - { - "epoch": 11.5, - "eval_loss": 0.42926904559135437, - "eval_runtime": 4.8945, - "eval_samples_per_second": 9.807, - "eval_steps_per_second": 1.226, - "step": 1960 - }, - { - "epoch": 11.55, - "grad_norm": 1.390625, - "learning_rate": 2.6181102362204723e-05, - "loss": 0.2146, - "step": 1968 - }, - { - "epoch": 11.55, - "eval_loss": 0.4218541085720062, - "eval_runtime": 4.8946, - "eval_samples_per_second": 9.807, - "eval_steps_per_second": 1.226, - "step": 1968 - }, - { - "epoch": 11.6, - "grad_norm": 1.28125, - "learning_rate": 2.608267716535433e-05, - "loss": 0.2332, - "step": 1976 - }, - { - "epoch": 11.6, - "eval_loss": 0.42434871196746826, - "eval_runtime": 4.8844, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, - "step": 1976 - }, - { - "epoch": 11.64, - "grad_norm": 0.98828125, - "learning_rate": 2.5984251968503937e-05, - "loss": 0.214, - "step": 1984 - }, - { - "epoch": 11.64, - "eval_loss": 0.42285728454589844, - "eval_runtime": 4.8863, - "eval_samples_per_second": 9.823, - "eval_steps_per_second": 1.228, - "step": 1984 - }, - { - "epoch": 11.69, - "grad_norm": 1.2265625, - "learning_rate": 2.5885826771653544e-05, - "loss": 0.2278, - "step": 1992 - }, - { - "epoch": 11.69, - "eval_loss": 0.4248940050601959, - "eval_runtime": 4.8801, - "eval_samples_per_second": 9.836, - "eval_steps_per_second": 1.229, - "step": 1992 - }, - { - "epoch": 11.74, - "grad_norm": 1.4140625, - "learning_rate": 2.578740157480315e-05, - "loss": 0.2165, - "step": 2000 - }, - { - "epoch": 11.74, - "eval_loss": 0.4275287687778473, - "eval_runtime": 4.8821, - "eval_samples_per_second": 9.832, - "eval_steps_per_second": 1.229, - "step": 2000 - }, - { - "epoch": 11.79, - "grad_norm": 1.171875, - "learning_rate": 2.5688976377952757e-05, - "loss": 0.2221, - "step": 2008 - }, - { - "epoch": 11.79, - "eval_loss": 0.4152206480503082, - "eval_runtime": 4.8863, - "eval_samples_per_second": 9.823, - "eval_steps_per_second": 1.228, - "step": 2008 - }, - { - "epoch": 11.83, - "grad_norm": 1.2109375, - "learning_rate": 2.5590551181102364e-05, - "loss": 0.2238, - "step": 2016 - }, - { - "epoch": 11.83, - "eval_loss": 0.4216756522655487, - "eval_runtime": 4.8871, - "eval_samples_per_second": 9.822, - "eval_steps_per_second": 1.228, - "step": 2016 - }, - { - "epoch": 11.88, - "grad_norm": 1.171875, - "learning_rate": 2.5492125984251967e-05, - "loss": 0.2166, - "step": 2024 - }, - { - "epoch": 11.88, - "eval_loss": 0.4253371059894562, - "eval_runtime": 4.8804, - "eval_samples_per_second": 9.835, - "eval_steps_per_second": 1.229, - "step": 2024 - }, - { - "epoch": 11.93, - "grad_norm": 1.234375, - "learning_rate": 2.5393700787401574e-05, - "loss": 0.2366, - "step": 2032 - }, - { - "epoch": 11.93, - "eval_loss": 0.42123886942863464, - "eval_runtime": 4.8787, - "eval_samples_per_second": 9.839, - "eval_steps_per_second": 1.23, - "step": 2032 - }, - { - "epoch": 11.97, - "grad_norm": 1.4140625, - "learning_rate": 2.529527559055118e-05, - "loss": 0.2384, - "step": 2040 - }, - { - "epoch": 11.97, - "eval_loss": 0.42442429065704346, - "eval_runtime": 4.8882, - "eval_samples_per_second": 9.82, - "eval_steps_per_second": 1.227, - "step": 2040 - }, - { - "epoch": 12.02, - "grad_norm": 1.328125, - "learning_rate": 2.5196850393700788e-05, - "loss": 0.2215, - "step": 2048 - }, - { - "epoch": 12.02, - "eval_loss": 0.4381261169910431, - "eval_runtime": 4.8867, - "eval_samples_per_second": 9.823, - "eval_steps_per_second": 1.228, - "step": 2048 - }, - { - "epoch": 12.07, - "grad_norm": 1.3671875, - "learning_rate": 2.5098425196850394e-05, - "loss": 0.1966, - "step": 2056 - }, - { - "epoch": 12.07, - "eval_loss": 0.4352915585041046, - "eval_runtime": 4.8879, - "eval_samples_per_second": 9.82, - "eval_steps_per_second": 1.228, - "step": 2056 - }, - { - "epoch": 12.11, - "grad_norm": 1.2265625, - "learning_rate": 2.5e-05, - "loss": 0.1956, - "step": 2064 - }, - { - "epoch": 12.11, - "eval_loss": 0.44628992676734924, - "eval_runtime": 4.8866, - "eval_samples_per_second": 9.823, - "eval_steps_per_second": 1.228, - "step": 2064 - }, - { - "epoch": 12.16, - "grad_norm": 1.28125, - "learning_rate": 2.4901574803149608e-05, - "loss": 0.193, - "step": 2072 - }, - { - "epoch": 12.16, - "eval_loss": 0.44343388080596924, - "eval_runtime": 4.8863, - "eval_samples_per_second": 9.823, - "eval_steps_per_second": 1.228, - "step": 2072 - }, - { - "epoch": 12.21, - "grad_norm": 1.390625, - "learning_rate": 2.4803149606299215e-05, - "loss": 0.1983, - "step": 2080 - }, - { - "epoch": 12.21, - "eval_loss": 0.4414764642715454, - "eval_runtime": 4.8847, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, - "step": 2080 - }, - { - "epoch": 12.26, - "grad_norm": 1.2578125, - "learning_rate": 2.470472440944882e-05, - "loss": 0.205, - "step": 2088 - }, - { - "epoch": 12.26, - "eval_loss": 0.44560471177101135, - "eval_runtime": 4.8846, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, - "step": 2088 - }, - { - "epoch": 12.3, - "grad_norm": 1.2578125, - "learning_rate": 2.4606299212598428e-05, - "loss": 0.2081, - "step": 2096 - }, - { - "epoch": 12.3, - "eval_loss": 0.44146040081977844, - "eval_runtime": 4.8849, - "eval_samples_per_second": 9.826, - "eval_steps_per_second": 1.228, - "step": 2096 - }, - { - "epoch": 12.35, - "grad_norm": 1.6953125, - "learning_rate": 2.4507874015748035e-05, - "loss": 0.1933, - "step": 2104 - }, - { - "epoch": 12.35, - "eval_loss": 0.44217804074287415, - "eval_runtime": 4.8814, - "eval_samples_per_second": 9.833, - "eval_steps_per_second": 1.229, - "step": 2104 - }, - { - "epoch": 12.4, - "grad_norm": 1.3984375, - "learning_rate": 2.440944881889764e-05, - "loss": 0.2039, - "step": 2112 - }, - { - "epoch": 12.4, - "eval_loss": 0.4430825710296631, - "eval_runtime": 4.8902, - "eval_samples_per_second": 9.815, - "eval_steps_per_second": 1.227, - "step": 2112 - }, - { - "epoch": 12.44, - "grad_norm": 1.3046875, - "learning_rate": 2.4311023622047245e-05, - "loss": 0.1976, - "step": 2120 - }, - { - "epoch": 12.44, - "eval_loss": 0.43755796551704407, - "eval_runtime": 4.8815, - "eval_samples_per_second": 9.833, - "eval_steps_per_second": 1.229, - "step": 2120 - }, - { - "epoch": 12.49, - "grad_norm": 1.296875, - "learning_rate": 2.421259842519685e-05, - "loss": 0.2136, - "step": 2128 - }, - { - "epoch": 12.49, - "eval_loss": 0.43662428855895996, - "eval_runtime": 4.8797, - "eval_samples_per_second": 9.837, - "eval_steps_per_second": 1.23, - "step": 2128 - }, - { - "epoch": 12.54, - "grad_norm": 1.328125, - "learning_rate": 2.4114173228346455e-05, - "loss": 0.2026, - "step": 2136 - }, - { - "epoch": 12.54, - "eval_loss": 0.44358769059181213, - "eval_runtime": 4.8878, - "eval_samples_per_second": 9.82, - "eval_steps_per_second": 1.228, - "step": 2136 - }, - { - "epoch": 12.58, - "grad_norm": 1.2421875, - "learning_rate": 2.4015748031496062e-05, - "loss": 0.2116, - "step": 2144 - }, - { - "epoch": 12.58, - "eval_loss": 0.44140565395355225, - "eval_runtime": 4.8848, - "eval_samples_per_second": 9.826, - "eval_steps_per_second": 1.228, - "step": 2144 - }, - { - "epoch": 12.63, - "grad_norm": 1.3046875, - "learning_rate": 2.391732283464567e-05, - "loss": 0.2021, - "step": 2152 - }, - { - "epoch": 12.63, - "eval_loss": 0.4431957006454468, - "eval_runtime": 4.8844, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, - "step": 2152 - }, - { - "epoch": 12.68, - "grad_norm": 1.4453125, - "learning_rate": 2.3818897637795276e-05, - "loss": 0.1976, - "step": 2160 - }, - { - "epoch": 12.68, - "eval_loss": 0.4439380466938019, - "eval_runtime": 4.8845, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, - "step": 2160 - }, - { - "epoch": 12.72, - "grad_norm": 1.21875, - "learning_rate": 2.3720472440944882e-05, - "loss": 0.2031, - "step": 2168 - }, - { - "epoch": 12.72, - "eval_loss": 0.4403323233127594, - "eval_runtime": 4.8836, - "eval_samples_per_second": 9.829, - "eval_steps_per_second": 1.229, - "step": 2168 - }, - { - "epoch": 12.77, - "grad_norm": 1.375, - "learning_rate": 2.362204724409449e-05, - "loss": 0.222, - "step": 2176 - }, - { - "epoch": 12.77, - "eval_loss": 0.4379936158657074, - "eval_runtime": 4.8834, - "eval_samples_per_second": 9.829, - "eval_steps_per_second": 1.229, - "step": 2176 - }, - { - "epoch": 12.82, - "grad_norm": 1.2890625, - "learning_rate": 2.3523622047244096e-05, - "loss": 0.2128, - "step": 2184 - }, - { - "epoch": 12.82, - "eval_loss": 0.43629226088523865, - "eval_runtime": 4.8828, - "eval_samples_per_second": 9.83, - "eval_steps_per_second": 1.229, - "step": 2184 - }, - { - "epoch": 12.87, - "grad_norm": 1.453125, - "learning_rate": 2.3425196850393703e-05, - "loss": 0.2014, - "step": 2192 - }, - { - "epoch": 12.87, - "eval_loss": 0.4432964324951172, - "eval_runtime": 4.887, - "eval_samples_per_second": 9.822, - "eval_steps_per_second": 1.228, - "step": 2192 - }, - { - "epoch": 12.91, - "grad_norm": 1.296875, - "learning_rate": 2.332677165354331e-05, - "loss": 0.2151, - "step": 2200 - }, - { - "epoch": 12.91, - "eval_loss": 0.4419555366039276, - "eval_runtime": 4.8841, - "eval_samples_per_second": 9.828, - "eval_steps_per_second": 1.228, - "step": 2200 - }, - { - "epoch": 12.96, - "grad_norm": 1.4765625, - "learning_rate": 2.3228346456692916e-05, - "loss": 0.194, - "step": 2208 - }, - { - "epoch": 12.96, - "eval_loss": 0.44093310832977295, - "eval_runtime": 4.8894, - "eval_samples_per_second": 9.817, - "eval_steps_per_second": 1.227, - "step": 2208 - }, - { - "epoch": 13.01, - "grad_norm": 1.1875, - "learning_rate": 2.3129921259842523e-05, - "loss": 0.2048, - "step": 2216 - }, - { - "epoch": 13.01, - "eval_loss": 0.44801631569862366, - "eval_runtime": 4.8873, - "eval_samples_per_second": 9.821, - "eval_steps_per_second": 1.228, - "step": 2216 - }, - { - "epoch": 13.05, - "grad_norm": 1.25, - "learning_rate": 2.3031496062992126e-05, - "loss": 0.1913, - "step": 2224 - }, - { - "epoch": 13.05, - "eval_loss": 0.45641574263572693, - "eval_runtime": 4.8946, - "eval_samples_per_second": 9.807, - "eval_steps_per_second": 1.226, - "step": 2224 - }, - { - "epoch": 13.1, - "grad_norm": 1.5625, - "learning_rate": 2.2933070866141733e-05, - "loss": 0.1817, - "step": 2232 - }, - { - "epoch": 13.1, - "eval_loss": 0.4665268361568451, - "eval_runtime": 4.8876, - "eval_samples_per_second": 9.821, - "eval_steps_per_second": 1.228, - "step": 2232 - }, - { - "epoch": 13.15, - "grad_norm": 1.5, - "learning_rate": 2.283464566929134e-05, - "loss": 0.1855, - "step": 2240 - }, - { - "epoch": 13.15, - "eval_loss": 0.462589293718338, - "eval_runtime": 4.8824, - "eval_samples_per_second": 9.831, - "eval_steps_per_second": 1.229, - "step": 2240 - }, - { - "epoch": 13.19, - "grad_norm": 1.3671875, - "learning_rate": 2.2736220472440947e-05, - "loss": 0.1776, - "step": 2248 - }, - { - "epoch": 13.19, - "eval_loss": 0.45993486046791077, - "eval_runtime": 4.8821, - "eval_samples_per_second": 9.832, - "eval_steps_per_second": 1.229, - "step": 2248 - }, - { - "epoch": 13.24, - "grad_norm": 1.3828125, - "learning_rate": 2.263779527559055e-05, - "loss": 0.1895, - "step": 2256 - }, - { - "epoch": 13.24, - "eval_loss": 0.46107491850852966, - "eval_runtime": 4.8877, - "eval_samples_per_second": 9.821, - "eval_steps_per_second": 1.228, - "step": 2256 - }, - { - "epoch": 13.29, - "grad_norm": 1.421875, - "learning_rate": 2.2539370078740157e-05, - "loss": 0.1871, - "step": 2264 - }, - { - "epoch": 13.29, - "eval_loss": 0.4591566324234009, - "eval_runtime": 4.8853, - "eval_samples_per_second": 9.825, - "eval_steps_per_second": 1.228, - "step": 2264 - }, - { - "epoch": 13.34, - "grad_norm": 1.3125, - "learning_rate": 2.2440944881889763e-05, - "loss": 0.1834, - "step": 2272 - }, - { - "epoch": 13.34, - "eval_loss": 0.45331981778144836, - "eval_runtime": 4.8872, - "eval_samples_per_second": 9.822, - "eval_steps_per_second": 1.228, - "step": 2272 - }, - { - "epoch": 13.38, - "grad_norm": 1.390625, - "learning_rate": 2.234251968503937e-05, - "loss": 0.1726, - "step": 2280 - }, - { - "epoch": 13.38, - "eval_loss": 0.46644964814186096, - "eval_runtime": 4.8868, - "eval_samples_per_second": 9.822, - "eval_steps_per_second": 1.228, - "step": 2280 - }, - { - "epoch": 13.43, - "grad_norm": 1.390625, - "learning_rate": 2.2244094488188977e-05, - "loss": 0.1846, - "step": 2288 - }, - { - "epoch": 13.43, - "eval_loss": 0.45397841930389404, - "eval_runtime": 4.8828, - "eval_samples_per_second": 9.83, - "eval_steps_per_second": 1.229, - "step": 2288 - }, - { - "epoch": 13.48, - "grad_norm": 1.5703125, - "learning_rate": 2.2145669291338584e-05, - "loss": 0.187, - "step": 2296 - }, - { - "epoch": 13.48, - "eval_loss": 0.459914892911911, - "eval_runtime": 4.8861, - "eval_samples_per_second": 9.824, - "eval_steps_per_second": 1.228, - "step": 2296 - }, - { - "epoch": 13.52, - "grad_norm": 1.4140625, - "learning_rate": 2.204724409448819e-05, - "loss": 0.1731, - "step": 2304 - }, - { - "epoch": 13.52, - "eval_loss": 0.45958980917930603, - "eval_runtime": 4.8898, - "eval_samples_per_second": 9.816, - "eval_steps_per_second": 1.227, - "step": 2304 - }, - { - "epoch": 13.57, - "grad_norm": 1.578125, - "learning_rate": 2.1948818897637797e-05, - "loss": 0.1923, - "step": 2312 - }, - { - "epoch": 13.57, - "eval_loss": 0.4603557884693146, - "eval_runtime": 4.8813, - "eval_samples_per_second": 9.833, - "eval_steps_per_second": 1.229, - "step": 2312 - }, - { - "epoch": 13.62, - "grad_norm": 1.328125, - "learning_rate": 2.1850393700787404e-05, - "loss": 0.1867, - "step": 2320 - }, - { - "epoch": 13.62, - "eval_loss": 0.4594487249851227, - "eval_runtime": 4.8808, - "eval_samples_per_second": 9.834, - "eval_steps_per_second": 1.229, - "step": 2320 - }, - { - "epoch": 13.66, - "grad_norm": 1.390625, - "learning_rate": 2.175196850393701e-05, - "loss": 0.2047, - "step": 2328 - }, - { - "epoch": 13.66, - "eval_loss": 0.46039581298828125, - "eval_runtime": 4.8857, - "eval_samples_per_second": 9.824, - "eval_steps_per_second": 1.228, - "step": 2328 - }, - { - "epoch": 13.71, - "grad_norm": 1.59375, - "learning_rate": 2.1653543307086614e-05, - "loss": 0.1851, - "step": 2336 - }, - { - "epoch": 13.71, - "eval_loss": 0.4601996839046478, - "eval_runtime": 4.8881, - "eval_samples_per_second": 9.82, - "eval_steps_per_second": 1.227, - "step": 2336 - }, - { - "epoch": 13.76, - "grad_norm": 1.4765625, - "learning_rate": 2.155511811023622e-05, - "loss": 0.1905, - "step": 2344 - }, - { - "epoch": 13.76, - "eval_loss": 0.4591521918773651, - "eval_runtime": 4.8801, - "eval_samples_per_second": 9.836, - "eval_steps_per_second": 1.229, - "step": 2344 - }, - { - "epoch": 13.8, - "grad_norm": 1.3515625, - "learning_rate": 2.1456692913385828e-05, - "loss": 0.1998, - "step": 2352 - }, - { - "epoch": 13.8, - "eval_loss": 0.4632813036441803, - "eval_runtime": 4.8892, - "eval_samples_per_second": 9.818, - "eval_steps_per_second": 1.227, - "step": 2352 - }, - { - "epoch": 13.85, - "grad_norm": 1.34375, - "learning_rate": 2.1358267716535434e-05, - "loss": 0.1889, - "step": 2360 - }, - { - "epoch": 13.85, - "eval_loss": 0.45855093002319336, - "eval_runtime": 4.8879, - "eval_samples_per_second": 9.82, - "eval_steps_per_second": 1.228, - "step": 2360 - }, - { - "epoch": 13.9, - "grad_norm": 1.3984375, - "learning_rate": 2.125984251968504e-05, - "loss": 0.2007, - "step": 2368 - }, - { - "epoch": 13.9, - "eval_loss": 0.46005454659461975, - "eval_runtime": 4.8909, - "eval_samples_per_second": 9.814, - "eval_steps_per_second": 1.227, - "step": 2368 - }, - { - "epoch": 13.95, - "grad_norm": 1.484375, - "learning_rate": 2.1161417322834648e-05, - "loss": 0.1929, - "step": 2376 - }, - { - "epoch": 13.95, - "eval_loss": 0.46431541442871094, - "eval_runtime": 4.8851, - "eval_samples_per_second": 9.826, - "eval_steps_per_second": 1.228, - "step": 2376 - }, - { - "epoch": 13.99, - "grad_norm": 1.375, - "learning_rate": 2.106299212598425e-05, - "loss": 0.1928, - "step": 2384 - }, - { - "epoch": 13.99, - "eval_loss": 0.4559873342514038, - "eval_runtime": 4.8813, - "eval_samples_per_second": 9.833, - "eval_steps_per_second": 1.229, - "step": 2384 - }, - { - "epoch": 14.04, - "grad_norm": 1.234375, - "learning_rate": 2.0964566929133858e-05, - "loss": 0.1781, - "step": 2392 - }, - { - "epoch": 14.04, - "eval_loss": 0.4926118850708008, - "eval_runtime": 4.8853, - "eval_samples_per_second": 9.825, - "eval_steps_per_second": 1.228, - "step": 2392 - }, - { - "epoch": 14.09, - "grad_norm": 1.4453125, - "learning_rate": 2.0866141732283465e-05, - "loss": 0.1716, - "step": 2400 - }, - { - "epoch": 14.09, - "eval_loss": 0.4850791394710541, - "eval_runtime": 4.8833, - "eval_samples_per_second": 9.829, - "eval_steps_per_second": 1.229, - "step": 2400 - }, - { - "epoch": 14.13, - "grad_norm": 1.421875, - "learning_rate": 2.076771653543307e-05, - "loss": 0.1757, - "step": 2408 - }, - { - "epoch": 14.13, - "eval_loss": 0.4749085605144501, - "eval_runtime": 4.8853, - "eval_samples_per_second": 9.825, - "eval_steps_per_second": 1.228, - "step": 2408 - }, - { - "epoch": 14.18, - "grad_norm": 1.515625, - "learning_rate": 2.066929133858268e-05, - "loss": 0.1738, - "step": 2416 - }, - { - "epoch": 14.18, - "eval_loss": 0.4851483404636383, - "eval_runtime": 4.891, - "eval_samples_per_second": 9.814, - "eval_steps_per_second": 1.227, - "step": 2416 - }, - { - "epoch": 14.23, - "grad_norm": 1.4296875, - "learning_rate": 2.0570866141732285e-05, - "loss": 0.1738, - "step": 2424 - }, - { - "epoch": 14.23, - "eval_loss": 0.4827621281147003, - "eval_runtime": 4.8822, - "eval_samples_per_second": 9.832, - "eval_steps_per_second": 1.229, - "step": 2424 - }, - { - "epoch": 14.27, - "grad_norm": 1.4609375, - "learning_rate": 2.0472440944881892e-05, - "loss": 0.1717, - "step": 2432 - }, - { - "epoch": 14.27, - "eval_loss": 0.4806636571884155, - "eval_runtime": 4.8847, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, - "step": 2432 - }, - { - "epoch": 14.32, - "grad_norm": 2.1875, - "learning_rate": 2.0374015748031495e-05, - "loss": 0.1725, - "step": 2440 - }, - { - "epoch": 14.32, - "eval_loss": 0.48100754618644714, - "eval_runtime": 4.8883, - "eval_samples_per_second": 9.819, - "eval_steps_per_second": 1.227, - "step": 2440 - }, - { - "epoch": 14.37, - "grad_norm": 1.59375, - "learning_rate": 2.0275590551181102e-05, - "loss": 0.1798, - "step": 2448 - }, - { - "epoch": 14.37, - "eval_loss": 0.48213496804237366, - "eval_runtime": 4.8914, - "eval_samples_per_second": 9.813, - "eval_steps_per_second": 1.227, - "step": 2448 - }, - { - "epoch": 14.42, - "grad_norm": 1.3046875, - "learning_rate": 2.017716535433071e-05, - "loss": 0.1827, - "step": 2456 - }, - { - "epoch": 14.42, - "eval_loss": 0.48051026463508606, - "eval_runtime": 4.8852, - "eval_samples_per_second": 9.826, - "eval_steps_per_second": 1.228, - "step": 2456 - }, - { - "epoch": 14.46, - "grad_norm": 1.4375, - "learning_rate": 2.0078740157480316e-05, - "loss": 0.1718, - "step": 2464 - }, - { - "epoch": 14.46, - "eval_loss": 0.4809204638004303, - "eval_runtime": 4.8859, - "eval_samples_per_second": 9.824, - "eval_steps_per_second": 1.228, - "step": 2464 - }, - { - "epoch": 14.51, - "grad_norm": 1.453125, - "learning_rate": 1.9980314960629922e-05, - "loss": 0.1827, - "step": 2472 - }, - { - "epoch": 14.51, - "eval_loss": 0.48105254769325256, - "eval_runtime": 4.8876, - "eval_samples_per_second": 9.821, - "eval_steps_per_second": 1.228, - "step": 2472 - }, - { - "epoch": 14.56, - "grad_norm": 1.4921875, - "learning_rate": 1.988188976377953e-05, - "loss": 0.169, - "step": 2480 - }, - { - "epoch": 14.56, - "eval_loss": 0.4779665768146515, - "eval_runtime": 4.8866, - "eval_samples_per_second": 9.823, - "eval_steps_per_second": 1.228, - "step": 2480 - }, - { - "epoch": 14.6, - "grad_norm": 1.53125, - "learning_rate": 1.9783464566929136e-05, - "loss": 0.1802, - "step": 2488 - }, - { - "epoch": 14.6, - "eval_loss": 0.4763511121273041, - "eval_runtime": 4.882, - "eval_samples_per_second": 9.832, - "eval_steps_per_second": 1.229, - "step": 2488 - }, - { - "epoch": 14.65, - "grad_norm": 1.484375, - "learning_rate": 1.9685039370078743e-05, - "loss": 0.1742, - "step": 2496 - }, - { - "epoch": 14.65, - "eval_loss": 0.4802992641925812, - "eval_runtime": 4.8869, - "eval_samples_per_second": 9.822, - "eval_steps_per_second": 1.228, - "step": 2496 - }, - { - "epoch": 14.7, - "grad_norm": 1.5390625, - "learning_rate": 1.958661417322835e-05, - "loss": 0.1724, - "step": 2504 - }, - { - "epoch": 14.7, - "eval_loss": 0.47574201226234436, - "eval_runtime": 4.8863, - "eval_samples_per_second": 9.823, - "eval_steps_per_second": 1.228, - "step": 2504 - }, - { - "epoch": 14.74, - "grad_norm": 1.5234375, - "learning_rate": 1.9488188976377953e-05, - "loss": 0.1697, - "step": 2512 - }, - { - "epoch": 14.74, - "eval_loss": 0.47717785835266113, - "eval_runtime": 4.8813, - "eval_samples_per_second": 9.833, - "eval_steps_per_second": 1.229, - "step": 2512 - }, - { - "epoch": 14.79, - "grad_norm": 1.40625, - "learning_rate": 1.938976377952756e-05, - "loss": 0.1757, - "step": 2520 - }, - { - "epoch": 14.79, - "eval_loss": 0.48033222556114197, - "eval_runtime": 4.8883, - "eval_samples_per_second": 9.819, - "eval_steps_per_second": 1.227, - "step": 2520 - }, - { - "epoch": 14.84, - "grad_norm": 1.515625, - "learning_rate": 1.9291338582677166e-05, - "loss": 0.1758, - "step": 2528 - }, - { - "epoch": 14.84, - "eval_loss": 0.4787493050098419, - "eval_runtime": 4.8859, - "eval_samples_per_second": 9.824, - "eval_steps_per_second": 1.228, - "step": 2528 - }, - { - "epoch": 14.88, - "grad_norm": 1.3828125, - "learning_rate": 1.9192913385826773e-05, - "loss": 0.1643, - "step": 2536 - }, - { - "epoch": 14.88, - "eval_loss": 0.4781215488910675, - "eval_runtime": 4.8837, - "eval_samples_per_second": 9.829, - "eval_steps_per_second": 1.229, - "step": 2536 - }, - { - "epoch": 14.93, - "grad_norm": 1.4453125, - "learning_rate": 1.909448818897638e-05, - "loss": 0.1714, - "step": 2544 - }, - { - "epoch": 14.93, - "eval_loss": 0.47745776176452637, - "eval_runtime": 4.8844, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, - "step": 2544 - }, - { - "epoch": 14.98, - "grad_norm": 1.5, - "learning_rate": 1.8996062992125983e-05, - "loss": 0.176, - "step": 2552 - }, - { - "epoch": 14.98, - "eval_loss": 0.47671830654144287, - "eval_runtime": 4.8827, - "eval_samples_per_second": 9.831, - "eval_steps_per_second": 1.229, - "step": 2552 - }, - { - "epoch": 15.03, - "grad_norm": 1.703125, - "learning_rate": 1.889763779527559e-05, - "loss": 0.1609, - "step": 2560 - }, - { - "epoch": 15.03, - "eval_loss": 0.5071304440498352, - "eval_runtime": 4.8789, - "eval_samples_per_second": 9.838, - "eval_steps_per_second": 1.23, - "step": 2560 - }, - { - "epoch": 15.07, - "grad_norm": 1.28125, - "learning_rate": 1.8799212598425197e-05, - "loss": 0.1545, - "step": 2568 - }, - { - "epoch": 15.07, - "eval_loss": 0.49048110842704773, - "eval_runtime": 4.8841, - "eval_samples_per_second": 9.828, - "eval_steps_per_second": 1.228, - "step": 2568 - }, - { - "epoch": 15.12, - "grad_norm": 1.4140625, - "learning_rate": 1.8700787401574803e-05, - "loss": 0.1591, - "step": 2576 - }, - { - "epoch": 15.12, - "eval_loss": 0.5035098195075989, - "eval_runtime": 4.8866, - "eval_samples_per_second": 9.823, - "eval_steps_per_second": 1.228, - "step": 2576 - }, - { - "epoch": 15.17, - "grad_norm": 1.3828125, - "learning_rate": 1.860236220472441e-05, - "loss": 0.1488, - "step": 2584 - }, - { - "epoch": 15.17, - "eval_loss": 0.49650588631629944, - "eval_runtime": 4.8876, - "eval_samples_per_second": 9.821, - "eval_steps_per_second": 1.228, - "step": 2584 - }, - { - "epoch": 15.21, - "grad_norm": 1.546875, - "learning_rate": 1.8503937007874017e-05, - "loss": 0.1605, - "step": 2592 - }, - { - "epoch": 15.21, - "eval_loss": 0.5021148920059204, - "eval_runtime": 4.8938, - "eval_samples_per_second": 9.808, - "eval_steps_per_second": 1.226, - "step": 2592 - }, - { - "epoch": 15.26, - "grad_norm": 1.53125, - "learning_rate": 1.8405511811023624e-05, - "loss": 0.1453, - "step": 2600 - }, - { - "epoch": 15.26, - "eval_loss": 0.49250075221061707, - "eval_runtime": 4.8904, - "eval_samples_per_second": 9.815, - "eval_steps_per_second": 1.227, - "step": 2600 - }, - { - "epoch": 15.31, - "grad_norm": 1.6328125, - "learning_rate": 1.830708661417323e-05, - "loss": 0.1577, - "step": 2608 - }, - { - "epoch": 15.31, - "eval_loss": 0.503433883190155, - "eval_runtime": 4.8812, - "eval_samples_per_second": 9.834, - "eval_steps_per_second": 1.229, - "step": 2608 - }, - { - "epoch": 15.35, - "grad_norm": 1.421875, - "learning_rate": 1.8208661417322837e-05, - "loss": 0.1643, - "step": 2616 - }, - { - "epoch": 15.35, - "eval_loss": 0.4985417127609253, - "eval_runtime": 4.8869, - "eval_samples_per_second": 9.822, - "eval_steps_per_second": 1.228, - "step": 2616 - }, - { - "epoch": 15.4, - "grad_norm": 1.59375, - "learning_rate": 1.8110236220472444e-05, - "loss": 0.1581, - "step": 2624 - }, - { - "epoch": 15.4, - "eval_loss": 0.500441312789917, - "eval_runtime": 4.889, - "eval_samples_per_second": 9.818, - "eval_steps_per_second": 1.227, - "step": 2624 - }, - { - "epoch": 15.45, - "grad_norm": 1.53125, - "learning_rate": 1.801181102362205e-05, - "loss": 0.1597, - "step": 2632 - }, - { - "epoch": 15.45, - "eval_loss": 0.5028078556060791, - "eval_runtime": 4.8809, - "eval_samples_per_second": 9.834, - "eval_steps_per_second": 1.229, - "step": 2632 - }, - { - "epoch": 15.5, - "grad_norm": 1.671875, - "learning_rate": 1.7913385826771654e-05, - "loss": 0.1689, - "step": 2640 - }, - { - "epoch": 15.5, - "eval_loss": 0.49254146218299866, - "eval_runtime": 4.8836, - "eval_samples_per_second": 9.829, - "eval_steps_per_second": 1.229, - "step": 2640 - }, - { - "epoch": 15.54, - "grad_norm": 1.5625, - "learning_rate": 1.781496062992126e-05, - "loss": 0.1709, - "step": 2648 - }, - { - "epoch": 15.54, - "eval_loss": 0.4996674358844757, - "eval_runtime": 4.8881, - "eval_samples_per_second": 9.82, - "eval_steps_per_second": 1.227, - "step": 2648 - }, - { - "epoch": 15.59, - "grad_norm": 1.5, - "learning_rate": 1.7716535433070868e-05, - "loss": 0.1579, - "step": 2656 - }, - { - "epoch": 15.59, - "eval_loss": 0.49186739325523376, - "eval_runtime": 4.8863, - "eval_samples_per_second": 9.823, - "eval_steps_per_second": 1.228, - "step": 2656 - }, - { - "epoch": 15.64, - "grad_norm": 1.40625, - "learning_rate": 1.761811023622047e-05, - "loss": 0.1564, - "step": 2664 - }, - { - "epoch": 15.64, - "eval_loss": 0.5025432705879211, - "eval_runtime": 4.8881, - "eval_samples_per_second": 9.82, - "eval_steps_per_second": 1.227, - "step": 2664 - }, - { - "epoch": 15.68, - "grad_norm": 1.640625, - "learning_rate": 1.7519685039370078e-05, - "loss": 0.1708, - "step": 2672 - }, - { - "epoch": 15.68, - "eval_loss": 0.49223020672798157, - "eval_runtime": 4.8883, - "eval_samples_per_second": 9.819, - "eval_steps_per_second": 1.227, - "step": 2672 - }, - { - "epoch": 15.73, - "grad_norm": 1.5390625, - "learning_rate": 1.7421259842519685e-05, - "loss": 0.1723, - "step": 2680 - }, - { - "epoch": 15.73, - "eval_loss": 0.49755749106407166, - "eval_runtime": 4.8924, - "eval_samples_per_second": 9.811, - "eval_steps_per_second": 1.226, - "step": 2680 - }, - { - "epoch": 15.78, - "grad_norm": 1.546875, - "learning_rate": 1.732283464566929e-05, - "loss": 0.1603, - "step": 2688 - }, - { - "epoch": 15.78, - "eval_loss": 0.4947017431259155, - "eval_runtime": 4.8859, - "eval_samples_per_second": 9.824, - "eval_steps_per_second": 1.228, - "step": 2688 - }, - { - "epoch": 15.82, - "grad_norm": 1.6328125, - "learning_rate": 1.7224409448818898e-05, - "loss": 0.1679, - "step": 2696 - }, - { - "epoch": 15.82, - "eval_loss": 0.4993188679218292, - "eval_runtime": 4.8796, - "eval_samples_per_second": 9.837, - "eval_steps_per_second": 1.23, - "step": 2696 - }, - { - "epoch": 15.87, - "grad_norm": 1.4140625, - "learning_rate": 1.7125984251968505e-05, - "loss": 0.1681, - "step": 2704 - }, - { - "epoch": 15.87, - "eval_loss": 0.4942636489868164, - "eval_runtime": 4.8866, - "eval_samples_per_second": 9.823, - "eval_steps_per_second": 1.228, - "step": 2704 - }, - { - "epoch": 15.92, - "grad_norm": 1.4765625, - "learning_rate": 1.702755905511811e-05, - "loss": 0.1762, - "step": 2712 - }, - { - "epoch": 15.92, - "eval_loss": 0.49583515524864197, - "eval_runtime": 4.8908, - "eval_samples_per_second": 9.814, - "eval_steps_per_second": 1.227, - "step": 2712 - }, - { - "epoch": 15.96, - "grad_norm": 1.6015625, - "learning_rate": 1.692913385826772e-05, - "loss": 0.1639, - "step": 2720 - }, - { - "epoch": 15.96, - "eval_loss": 0.4938255846500397, - "eval_runtime": 4.8932, - "eval_samples_per_second": 9.809, - "eval_steps_per_second": 1.226, - "step": 2720 - }, - { - "epoch": 16.01, - "grad_norm": 1.2890625, - "learning_rate": 1.6830708661417325e-05, - "loss": 0.1606, - "step": 2728 - }, - { - "epoch": 16.01, - "eval_loss": 0.50501948595047, - "eval_runtime": 4.8843, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, - "step": 2728 - }, - { - "epoch": 16.06, - "grad_norm": 1.5546875, - "learning_rate": 1.6732283464566932e-05, - "loss": 0.1362, - "step": 2736 - }, - { - "epoch": 16.06, - "eval_loss": 0.5190303921699524, - "eval_runtime": 4.8864, - "eval_samples_per_second": 9.823, - "eval_steps_per_second": 1.228, - "step": 2736 - }, - { - "epoch": 16.11, - "grad_norm": 1.3359375, - "learning_rate": 1.663385826771654e-05, - "loss": 0.151, - "step": 2744 - }, - { - "epoch": 16.11, - "eval_loss": 0.5132401585578918, - "eval_runtime": 4.8954, - "eval_samples_per_second": 9.805, - "eval_steps_per_second": 1.226, - "step": 2744 - }, - { - "epoch": 16.15, - "grad_norm": 1.5078125, - "learning_rate": 1.6535433070866142e-05, - "loss": 0.152, - "step": 2752 - }, - { - "epoch": 16.15, - "eval_loss": 0.5155924558639526, - "eval_runtime": 4.8785, - "eval_samples_per_second": 9.839, - "eval_steps_per_second": 1.23, - "step": 2752 - }, - { - "epoch": 16.2, - "grad_norm": 1.546875, - "learning_rate": 1.643700787401575e-05, - "loss": 0.1403, - "step": 2760 - }, - { - "epoch": 16.2, - "eval_loss": 0.5209617018699646, - "eval_runtime": 4.8897, - "eval_samples_per_second": 9.817, - "eval_steps_per_second": 1.227, - "step": 2760 - }, - { - "epoch": 16.25, - "grad_norm": 1.453125, - "learning_rate": 1.6338582677165352e-05, - "loss": 0.1486, - "step": 2768 - }, - { - "epoch": 16.25, - "eval_loss": 0.5135500431060791, - "eval_runtime": 4.8876, - "eval_samples_per_second": 9.821, - "eval_steps_per_second": 1.228, - "step": 2768 - }, - { - "epoch": 16.29, - "grad_norm": 1.4921875, - "learning_rate": 1.624015748031496e-05, - "loss": 0.1498, - "step": 2776 - }, - { - "epoch": 16.29, - "eval_loss": 0.5194461345672607, - "eval_runtime": 4.8786, - "eval_samples_per_second": 9.839, - "eval_steps_per_second": 1.23, - "step": 2776 - }, - { - "epoch": 16.34, - "grad_norm": 1.4921875, - "learning_rate": 1.6141732283464566e-05, - "loss": 0.1471, - "step": 2784 - }, - { - "epoch": 16.34, - "eval_loss": 0.5094698071479797, - "eval_runtime": 4.8841, - "eval_samples_per_second": 9.828, - "eval_steps_per_second": 1.228, - "step": 2784 - }, - { - "epoch": 16.39, - "grad_norm": 1.5078125, - "learning_rate": 1.6043307086614172e-05, - "loss": 0.1514, - "step": 2792 - }, - { - "epoch": 16.39, - "eval_loss": 0.5184895992279053, - "eval_runtime": 4.8815, - "eval_samples_per_second": 9.833, - "eval_steps_per_second": 1.229, - "step": 2792 - }, - { - "epoch": 16.43, - "grad_norm": 1.53125, - "learning_rate": 1.594488188976378e-05, - "loss": 0.1508, - "step": 2800 - }, - { - "epoch": 16.43, - "eval_loss": 0.5169212222099304, - "eval_runtime": 4.8823, - "eval_samples_per_second": 9.831, - "eval_steps_per_second": 1.229, - "step": 2800 - }, - { - "epoch": 16.48, - "grad_norm": 1.5390625, - "learning_rate": 1.5846456692913386e-05, - "loss": 0.1459, - "step": 2808 - }, - { - "epoch": 16.48, - "eval_loss": 0.5152599215507507, - "eval_runtime": 4.8859, - "eval_samples_per_second": 9.824, - "eval_steps_per_second": 1.228, - "step": 2808 - }, - { - "epoch": 16.53, - "grad_norm": 1.6015625, - "learning_rate": 1.5748031496062993e-05, - "loss": 0.1647, - "step": 2816 - }, - { - "epoch": 16.53, - "eval_loss": 0.5134853720664978, - "eval_runtime": 4.8828, - "eval_samples_per_second": 9.831, - "eval_steps_per_second": 1.229, - "step": 2816 - }, - { - "epoch": 16.58, - "grad_norm": 1.625, - "learning_rate": 1.56496062992126e-05, - "loss": 0.1541, - "step": 2824 - }, - { - "epoch": 16.58, - "eval_loss": 0.520317018032074, - "eval_runtime": 4.8877, - "eval_samples_per_second": 9.821, - "eval_steps_per_second": 1.228, - "step": 2824 - }, - { - "epoch": 16.62, - "grad_norm": 1.53125, - "learning_rate": 1.5551181102362206e-05, - "loss": 0.1538, - "step": 2832 - }, - { - "epoch": 16.62, - "eval_loss": 0.5117902159690857, - "eval_runtime": 4.8856, - "eval_samples_per_second": 9.825, - "eval_steps_per_second": 1.228, - "step": 2832 - }, - { - "epoch": 16.67, - "grad_norm": 1.640625, - "learning_rate": 1.5452755905511813e-05, - "loss": 0.1645, - "step": 2840 - }, - { - "epoch": 16.67, - "eval_loss": 0.5128079056739807, - "eval_runtime": 4.886, - "eval_samples_per_second": 9.824, - "eval_steps_per_second": 1.228, - "step": 2840 - }, - { - "epoch": 16.72, - "grad_norm": 1.4765625, - "learning_rate": 1.535433070866142e-05, - "loss": 0.1544, - "step": 2848 - }, - { - "epoch": 16.72, - "eval_loss": 0.5158179402351379, - "eval_runtime": 4.8867, - "eval_samples_per_second": 9.823, - "eval_steps_per_second": 1.228, - "step": 2848 - }, - { - "epoch": 16.76, - "grad_norm": 1.6875, - "learning_rate": 1.5255905511811025e-05, - "loss": 0.1552, - "step": 2856 - }, - { - "epoch": 16.76, - "eval_loss": 0.5202960968017578, - "eval_runtime": 4.8838, - "eval_samples_per_second": 9.828, - "eval_steps_per_second": 1.229, - "step": 2856 - }, - { - "epoch": 16.81, - "grad_norm": 1.4921875, - "learning_rate": 1.5157480314960632e-05, - "loss": 0.1481, - "step": 2864 - }, - { - "epoch": 16.81, - "eval_loss": 0.5155553221702576, - "eval_runtime": 4.8836, - "eval_samples_per_second": 9.829, - "eval_steps_per_second": 1.229, - "step": 2864 - }, - { - "epoch": 16.86, - "grad_norm": 1.3984375, - "learning_rate": 1.5059055118110238e-05, - "loss": 0.1477, - "step": 2872 - }, - { - "epoch": 16.86, - "eval_loss": 0.5139450430870056, - "eval_runtime": 4.8861, - "eval_samples_per_second": 9.824, - "eval_steps_per_second": 1.228, - "step": 2872 - }, - { - "epoch": 16.9, - "grad_norm": 1.546875, - "learning_rate": 1.4960629921259845e-05, - "loss": 0.1561, - "step": 2880 - }, - { - "epoch": 16.9, - "eval_loss": 0.518673837184906, - "eval_runtime": 4.8833, - "eval_samples_per_second": 9.829, - "eval_steps_per_second": 1.229, - "step": 2880 - }, - { - "epoch": 16.95, - "grad_norm": 1.4375, - "learning_rate": 1.486220472440945e-05, - "loss": 0.1398, - "step": 2888 - }, - { - "epoch": 16.95, - "eval_loss": 0.5137103796005249, - "eval_runtime": 4.8804, - "eval_samples_per_second": 9.835, - "eval_steps_per_second": 1.229, - "step": 2888 - }, - { - "epoch": 17.0, - "grad_norm": 1.5859375, - "learning_rate": 1.4763779527559055e-05, - "loss": 0.16, - "step": 2896 - }, - { - "epoch": 17.0, - "eval_loss": 0.5180995464324951, - "eval_runtime": 4.8791, - "eval_samples_per_second": 9.838, - "eval_steps_per_second": 1.23, - "step": 2896 - }, - { - "epoch": 17.04, - "grad_norm": 1.8125, - "learning_rate": 1.466535433070866e-05, - "loss": 0.1346, - "step": 2904 - }, - { - "epoch": 17.04, - "eval_loss": 0.5409136414527893, - "eval_runtime": 4.8801, - "eval_samples_per_second": 9.836, - "eval_steps_per_second": 1.229, - "step": 2904 - }, - { - "epoch": 17.09, - "grad_norm": 1.296875, - "learning_rate": 1.4566929133858267e-05, - "loss": 0.1391, - "step": 2912 - }, - { - "epoch": 17.09, - "eval_loss": 0.527400553226471, - "eval_runtime": 4.8828, - "eval_samples_per_second": 9.831, - "eval_steps_per_second": 1.229, - "step": 2912 - }, - { - "epoch": 17.14, - "grad_norm": 1.5234375, - "learning_rate": 1.4468503937007874e-05, - "loss": 0.1392, - "step": 2920 - }, - { - "epoch": 17.14, - "eval_loss": 0.5358954071998596, - "eval_runtime": 4.885, - "eval_samples_per_second": 9.826, - "eval_steps_per_second": 1.228, - "step": 2920 - }, - { - "epoch": 17.19, - "grad_norm": 1.65625, - "learning_rate": 1.437007874015748e-05, - "loss": 0.1457, - "step": 2928 - }, - { - "epoch": 17.19, - "eval_loss": 0.5281966328620911, - "eval_runtime": 4.885, - "eval_samples_per_second": 9.826, - "eval_steps_per_second": 1.228, - "step": 2928 - }, - { - "epoch": 17.23, - "grad_norm": 1.796875, - "learning_rate": 1.4271653543307087e-05, - "loss": 0.1423, - "step": 2936 - }, - { - "epoch": 17.23, - "eval_loss": 0.539531409740448, - "eval_runtime": 4.8847, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, - "step": 2936 - }, - { - "epoch": 17.28, - "grad_norm": 1.25, - "learning_rate": 1.4173228346456694e-05, - "loss": 0.1406, - "step": 2944 - }, - { - "epoch": 17.28, - "eval_loss": 0.5257437825202942, - "eval_runtime": 4.8836, - "eval_samples_per_second": 9.829, - "eval_steps_per_second": 1.229, - "step": 2944 - }, - { - "epoch": 17.33, - "grad_norm": 1.3984375, - "learning_rate": 1.40748031496063e-05, - "loss": 0.1457, - "step": 2952 - }, - { - "epoch": 17.33, - "eval_loss": 0.5385013222694397, - "eval_runtime": 4.8868, - "eval_samples_per_second": 9.822, - "eval_steps_per_second": 1.228, - "step": 2952 - }, - { - "epoch": 17.37, - "grad_norm": 1.4140625, - "learning_rate": 1.3976377952755906e-05, - "loss": 0.1472, - "step": 2960 - }, - { - "epoch": 17.37, - "eval_loss": 0.530069887638092, - "eval_runtime": 4.8843, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, - "step": 2960 - }, - { - "epoch": 17.42, - "grad_norm": 1.6015625, - "learning_rate": 1.3877952755905513e-05, - "loss": 0.1429, - "step": 2968 - }, - { - "epoch": 17.42, - "eval_loss": 0.5348724722862244, - "eval_runtime": 4.8847, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, - "step": 2968 - }, - { - "epoch": 17.47, - "grad_norm": 1.7734375, - "learning_rate": 1.377952755905512e-05, - "loss": 0.141, - "step": 2976 - }, - { - "epoch": 17.47, - "eval_loss": 0.5355264544487, - "eval_runtime": 4.8845, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, - "step": 2976 - }, - { - "epoch": 17.51, - "grad_norm": 1.515625, - "learning_rate": 1.3681102362204726e-05, - "loss": 0.1391, - "step": 2984 - }, - { - "epoch": 17.51, - "eval_loss": 0.530957043170929, - "eval_runtime": 4.8859, - "eval_samples_per_second": 9.824, - "eval_steps_per_second": 1.228, - "step": 2984 - }, - { - "epoch": 17.56, - "grad_norm": 1.5859375, - "learning_rate": 1.3582677165354333e-05, - "loss": 0.145, - "step": 2992 - }, - { - "epoch": 17.56, - "eval_loss": 0.5390105843544006, - "eval_runtime": 4.8852, - "eval_samples_per_second": 9.826, - "eval_steps_per_second": 1.228, - "step": 2992 - }, - { - "epoch": 17.61, - "grad_norm": 1.3125, - "learning_rate": 1.3484251968503938e-05, - "loss": 0.1378, - "step": 3000 - }, - { - "epoch": 17.61, - "eval_loss": 0.5294584631919861, - "eval_runtime": 4.8791, - "eval_samples_per_second": 9.838, - "eval_steps_per_second": 1.23, - "step": 3000 - }, - { - "epoch": 17.66, - "grad_norm": 1.7265625, - "learning_rate": 1.3385826771653545e-05, - "loss": 0.1464, - "step": 3008 - }, - { - "epoch": 17.66, - "eval_loss": 0.534781813621521, - "eval_runtime": 4.8826, - "eval_samples_per_second": 9.831, - "eval_steps_per_second": 1.229, - "step": 3008 - }, - { - "epoch": 17.7, - "grad_norm": 1.6171875, - "learning_rate": 1.3287401574803152e-05, - "loss": 0.142, - "step": 3016 - }, - { - "epoch": 17.7, - "eval_loss": 0.5333996415138245, - "eval_runtime": 4.8814, - "eval_samples_per_second": 9.833, - "eval_steps_per_second": 1.229, - "step": 3016 - }, - { - "epoch": 17.75, - "grad_norm": 1.484375, - "learning_rate": 1.3188976377952755e-05, - "loss": 0.1484, - "step": 3024 - }, - { - "epoch": 17.75, - "eval_loss": 0.5344775319099426, - "eval_runtime": 4.8867, - "eval_samples_per_second": 9.823, - "eval_steps_per_second": 1.228, - "step": 3024 - }, - { - "epoch": 17.8, - "grad_norm": 1.4296875, - "learning_rate": 1.3090551181102362e-05, - "loss": 0.1424, - "step": 3032 - }, - { - "epoch": 17.8, - "eval_loss": 0.5302088260650635, - "eval_runtime": 4.8838, - "eval_samples_per_second": 9.828, - "eval_steps_per_second": 1.229, - "step": 3032 - }, - { - "epoch": 17.84, - "grad_norm": 1.421875, - "learning_rate": 1.2992125984251968e-05, - "loss": 0.1373, - "step": 3040 - }, - { - "epoch": 17.84, - "eval_loss": 0.5336248278617859, - "eval_runtime": 4.8857, - "eval_samples_per_second": 9.825, - "eval_steps_per_second": 1.228, - "step": 3040 - }, - { - "epoch": 17.89, - "grad_norm": 1.4765625, - "learning_rate": 1.2893700787401575e-05, - "loss": 0.1415, - "step": 3048 - }, - { - "epoch": 17.89, - "eval_loss": 0.5314746499061584, - "eval_runtime": 4.8897, - "eval_samples_per_second": 9.817, - "eval_steps_per_second": 1.227, - "step": 3048 - }, - { - "epoch": 17.94, - "grad_norm": 1.546875, - "learning_rate": 1.2795275590551182e-05, - "loss": 0.1497, - "step": 3056 - }, - { - "epoch": 17.94, - "eval_loss": 0.534315824508667, - "eval_runtime": 4.8827, - "eval_samples_per_second": 9.831, - "eval_steps_per_second": 1.229, - "step": 3056 - }, - { - "epoch": 17.98, - "grad_norm": 1.3828125, - "learning_rate": 1.2696850393700787e-05, - "loss": 0.1413, - "step": 3064 - }, - { - "epoch": 17.98, - "eval_loss": 0.5319940447807312, - "eval_runtime": 4.8902, - "eval_samples_per_second": 9.815, - "eval_steps_per_second": 1.227, - "step": 3064 - }, - { - "epoch": 18.03, - "grad_norm": 1.375, - "learning_rate": 1.2598425196850394e-05, - "loss": 0.1353, - "step": 3072 - }, - { - "epoch": 18.03, - "eval_loss": 0.5491126179695129, - "eval_runtime": 4.884, - "eval_samples_per_second": 9.828, - "eval_steps_per_second": 1.228, - "step": 3072 - }, - { - "epoch": 18.08, - "grad_norm": 1.421875, - "learning_rate": 1.25e-05, - "loss": 0.1313, - "step": 3080 - }, - { - "epoch": 18.08, - "eval_loss": 0.5500498414039612, - "eval_runtime": 4.8805, - "eval_samples_per_second": 9.835, - "eval_steps_per_second": 1.229, - "step": 3080 - }, - { - "epoch": 18.12, - "grad_norm": 1.578125, - "learning_rate": 1.2401574803149607e-05, - "loss": 0.1281, - "step": 3088 - }, - { - "epoch": 18.12, - "eval_loss": 0.5494930744171143, - "eval_runtime": 4.8801, - "eval_samples_per_second": 9.836, - "eval_steps_per_second": 1.229, - "step": 3088 - }, - { - "epoch": 18.17, - "grad_norm": 1.4375, - "learning_rate": 1.2303149606299214e-05, - "loss": 0.1309, - "step": 3096 - }, - { - "epoch": 18.17, - "eval_loss": 0.5453672409057617, - "eval_runtime": 4.8903, - "eval_samples_per_second": 9.815, - "eval_steps_per_second": 1.227, - "step": 3096 - }, - { - "epoch": 18.22, - "grad_norm": 1.3984375, - "learning_rate": 1.220472440944882e-05, - "loss": 0.1342, - "step": 3104 - }, - { - "epoch": 18.22, - "eval_loss": 0.5458211302757263, - "eval_runtime": 4.8823, - "eval_samples_per_second": 9.831, - "eval_steps_per_second": 1.229, - "step": 3104 - }, - { - "epoch": 18.27, - "grad_norm": 1.6015625, - "learning_rate": 1.2106299212598424e-05, - "loss": 0.1272, - "step": 3112 - }, - { - "epoch": 18.27, - "eval_loss": 0.5490582585334778, - "eval_runtime": 4.8824, - "eval_samples_per_second": 9.831, - "eval_steps_per_second": 1.229, - "step": 3112 - }, - { - "epoch": 18.31, - "grad_norm": 1.3984375, - "learning_rate": 1.2007874015748031e-05, - "loss": 0.1406, - "step": 3120 - }, - { - "epoch": 18.31, - "eval_loss": 0.5500710010528564, - "eval_runtime": 4.8802, - "eval_samples_per_second": 9.836, - "eval_steps_per_second": 1.229, - "step": 3120 - }, - { - "epoch": 18.36, - "grad_norm": 1.453125, - "learning_rate": 1.1909448818897638e-05, - "loss": 0.1358, - "step": 3128 - }, - { - "epoch": 18.36, - "eval_loss": 0.5460704565048218, - "eval_runtime": 4.8824, - "eval_samples_per_second": 9.831, - "eval_steps_per_second": 1.229, - "step": 3128 - }, - { - "epoch": 18.41, - "grad_norm": 1.5078125, - "learning_rate": 1.1811023622047245e-05, - "loss": 0.1401, - "step": 3136 - }, - { - "epoch": 18.41, - "eval_loss": 0.5513778328895569, - "eval_runtime": 4.8857, - "eval_samples_per_second": 9.825, - "eval_steps_per_second": 1.228, - "step": 3136 - }, - { - "epoch": 18.45, - "grad_norm": 1.5234375, - "learning_rate": 1.1712598425196851e-05, - "loss": 0.1353, - "step": 3144 - }, - { - "epoch": 18.45, - "eval_loss": 0.5427709817886353, - "eval_runtime": 4.8814, - "eval_samples_per_second": 9.833, - "eval_steps_per_second": 1.229, - "step": 3144 - }, - { - "epoch": 18.5, - "grad_norm": 1.5234375, - "learning_rate": 1.1614173228346458e-05, - "loss": 0.1386, - "step": 3152 - }, - { - "epoch": 18.5, - "eval_loss": 0.5440021753311157, - "eval_runtime": 4.8807, - "eval_samples_per_second": 9.835, - "eval_steps_per_second": 1.229, - "step": 3152 - }, - { - "epoch": 18.55, - "grad_norm": 1.40625, - "learning_rate": 1.1515748031496063e-05, - "loss": 0.1394, - "step": 3160 - }, - { - "epoch": 18.55, - "eval_loss": 0.548543393611908, - "eval_runtime": 4.8841, - "eval_samples_per_second": 9.828, - "eval_steps_per_second": 1.228, - "step": 3160 - }, - { - "epoch": 18.59, - "grad_norm": 1.46875, - "learning_rate": 1.141732283464567e-05, - "loss": 0.1316, - "step": 3168 - }, - { - "epoch": 18.59, - "eval_loss": 0.5458658933639526, - "eval_runtime": 4.8863, - "eval_samples_per_second": 9.823, - "eval_steps_per_second": 1.228, - "step": 3168 - }, - { - "epoch": 18.64, - "grad_norm": 1.484375, - "learning_rate": 1.1318897637795275e-05, - "loss": 0.1397, - "step": 3176 - }, - { - "epoch": 18.64, - "eval_loss": 0.5435895323753357, - "eval_runtime": 4.8847, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, - "step": 3176 - }, - { - "epoch": 18.69, - "grad_norm": 1.515625, - "learning_rate": 1.1220472440944882e-05, - "loss": 0.1248, - "step": 3184 - }, - { - "epoch": 18.69, - "eval_loss": 0.5472026467323303, - "eval_runtime": 4.8816, - "eval_samples_per_second": 9.833, - "eval_steps_per_second": 1.229, - "step": 3184 - }, - { - "epoch": 18.74, - "grad_norm": 1.3984375, - "learning_rate": 1.1122047244094488e-05, - "loss": 0.1339, - "step": 3192 - }, - { - "epoch": 18.74, - "eval_loss": 0.5473195910453796, - "eval_runtime": 4.8846, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, - "step": 3192 - }, - { - "epoch": 18.78, - "grad_norm": 1.4609375, - "learning_rate": 1.1023622047244095e-05, - "loss": 0.1374, - "step": 3200 - }, - { - "epoch": 18.78, - "eval_loss": 0.5464627146720886, - "eval_runtime": 4.8843, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, - "step": 3200 - }, - { - "epoch": 18.83, - "grad_norm": 1.640625, - "learning_rate": 1.0925196850393702e-05, - "loss": 0.1439, - "step": 3208 - }, - { - "epoch": 18.83, - "eval_loss": 0.5440012812614441, - "eval_runtime": 4.8794, - "eval_samples_per_second": 9.837, - "eval_steps_per_second": 1.23, - "step": 3208 - }, - { - "epoch": 18.88, - "grad_norm": 1.453125, - "learning_rate": 1.0826771653543307e-05, - "loss": 0.1377, - "step": 3216 - }, - { - "epoch": 18.88, - "eval_loss": 0.5485637187957764, - "eval_runtime": 4.8976, - "eval_samples_per_second": 9.801, - "eval_steps_per_second": 1.225, - "step": 3216 - }, - { - "epoch": 18.92, - "grad_norm": 1.515625, - "learning_rate": 1.0728346456692914e-05, - "loss": 0.1379, - "step": 3224 - }, - { - "epoch": 18.92, - "eval_loss": 0.5467141270637512, - "eval_runtime": 4.8886, - "eval_samples_per_second": 9.819, - "eval_steps_per_second": 1.227, - "step": 3224 - }, - { - "epoch": 18.97, - "grad_norm": 1.640625, - "learning_rate": 1.062992125984252e-05, - "loss": 0.1436, - "step": 3232 - }, - { - "epoch": 18.97, - "eval_loss": 0.547528088092804, - "eval_runtime": 4.8826, - "eval_samples_per_second": 9.831, - "eval_steps_per_second": 1.229, - "step": 3232 - }, - { - "epoch": 19.02, - "grad_norm": 1.3359375, - "learning_rate": 1.0531496062992126e-05, - "loss": 0.1378, - "step": 3240 - }, - { - "epoch": 19.02, - "eval_loss": 0.551363468170166, - "eval_runtime": 4.881, - "eval_samples_per_second": 9.834, - "eval_steps_per_second": 1.229, - "step": 3240 - }, - { - "epoch": 19.06, - "grad_norm": 1.53125, - "learning_rate": 1.0433070866141732e-05, - "loss": 0.1222, - "step": 3248 - }, - { - "epoch": 19.06, - "eval_loss": 0.5650167465209961, - "eval_runtime": 4.8822, - "eval_samples_per_second": 9.832, - "eval_steps_per_second": 1.229, - "step": 3248 - }, - { - "epoch": 19.11, - "grad_norm": 1.484375, - "learning_rate": 1.033464566929134e-05, - "loss": 0.1295, - "step": 3256 - }, - { - "epoch": 19.11, - "eval_loss": 0.5542344450950623, - "eval_runtime": 4.8865, - "eval_samples_per_second": 9.823, - "eval_steps_per_second": 1.228, - "step": 3256 - }, - { - "epoch": 19.16, - "grad_norm": 1.5, - "learning_rate": 1.0236220472440946e-05, - "loss": 0.1308, - "step": 3264 - }, - { - "epoch": 19.16, - "eval_loss": 0.5623123645782471, - "eval_runtime": 4.8834, - "eval_samples_per_second": 9.829, - "eval_steps_per_second": 1.229, - "step": 3264 - }, - { - "epoch": 19.2, - "grad_norm": 1.3515625, - "learning_rate": 1.0137795275590551e-05, - "loss": 0.1353, - "step": 3272 - }, - { - "epoch": 19.2, - "eval_loss": 0.5574969053268433, - "eval_runtime": 4.8843, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, - "step": 3272 - }, - { - "epoch": 19.25, - "grad_norm": 1.3203125, - "learning_rate": 1.0039370078740158e-05, - "loss": 0.1175, - "step": 3280 - }, - { - "epoch": 19.25, - "eval_loss": 0.5596539378166199, - "eval_runtime": 4.884, - "eval_samples_per_second": 9.828, - "eval_steps_per_second": 1.228, - "step": 3280 - }, - { - "epoch": 19.3, - "grad_norm": 1.34375, - "learning_rate": 9.940944881889765e-06, - "loss": 0.1293, - "step": 3288 - }, - { - "epoch": 19.3, - "eval_loss": 0.5600309371948242, - "eval_runtime": 4.8839, - "eval_samples_per_second": 9.828, - "eval_steps_per_second": 1.229, - "step": 3288 - }, - { - "epoch": 19.35, - "grad_norm": 1.484375, - "learning_rate": 9.842519685039371e-06, - "loss": 0.1274, - "step": 3296 - }, - { - "epoch": 19.35, - "eval_loss": 0.554668128490448, - "eval_runtime": 4.8842, - "eval_samples_per_second": 9.828, - "eval_steps_per_second": 1.228, - "step": 3296 - }, - { - "epoch": 19.39, - "grad_norm": 1.4375, - "learning_rate": 9.744094488188976e-06, - "loss": 0.1303, - "step": 3304 - }, - { - "epoch": 19.39, - "eval_loss": 0.5577480792999268, - "eval_runtime": 4.8832, - "eval_samples_per_second": 9.83, - "eval_steps_per_second": 1.229, - "step": 3304 - }, - { - "epoch": 19.44, - "grad_norm": 1.4609375, - "learning_rate": 9.645669291338583e-06, - "loss": 0.1341, - "step": 3312 - }, - { - "epoch": 19.44, - "eval_loss": 0.5581305027008057, - "eval_runtime": 4.8831, - "eval_samples_per_second": 9.83, - "eval_steps_per_second": 1.229, - "step": 3312 - }, - { - "epoch": 19.49, - "grad_norm": 1.3671875, - "learning_rate": 9.54724409448819e-06, - "loss": 0.1256, - "step": 3320 - }, - { - "epoch": 19.49, - "eval_loss": 0.5582204461097717, - "eval_runtime": 4.8865, - "eval_samples_per_second": 9.823, - "eval_steps_per_second": 1.228, - "step": 3320 - }, - { - "epoch": 19.53, - "grad_norm": 1.4453125, - "learning_rate": 9.448818897637795e-06, - "loss": 0.1267, - "step": 3328 - }, - { - "epoch": 19.53, - "eval_loss": 0.5566105842590332, - "eval_runtime": 4.8844, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, - "step": 3328 - }, - { - "epoch": 19.58, - "grad_norm": 1.4140625, - "learning_rate": 9.350393700787402e-06, - "loss": 0.126, - "step": 3336 - }, - { - "epoch": 19.58, - "eval_loss": 0.5577941536903381, - "eval_runtime": 4.8857, - "eval_samples_per_second": 9.825, - "eval_steps_per_second": 1.228, - "step": 3336 - }, - { - "epoch": 19.63, - "grad_norm": 1.390625, - "learning_rate": 9.251968503937008e-06, - "loss": 0.1335, - "step": 3344 - }, - { - "epoch": 19.63, - "eval_loss": 0.559036374092102, - "eval_runtime": 4.8877, - "eval_samples_per_second": 9.821, - "eval_steps_per_second": 1.228, - "step": 3344 - }, - { - "epoch": 19.67, - "grad_norm": 1.484375, - "learning_rate": 9.153543307086615e-06, - "loss": 0.1254, - "step": 3352 - }, - { - "epoch": 19.67, - "eval_loss": 0.5568073987960815, - "eval_runtime": 4.8832, - "eval_samples_per_second": 9.83, - "eval_steps_per_second": 1.229, - "step": 3352 - }, - { - "epoch": 19.72, - "grad_norm": 1.3515625, - "learning_rate": 9.055118110236222e-06, - "loss": 0.1251, - "step": 3360 - }, - { - "epoch": 19.72, - "eval_loss": 0.5608676671981812, - "eval_runtime": 4.8884, - "eval_samples_per_second": 9.819, - "eval_steps_per_second": 1.227, - "step": 3360 - }, - { - "epoch": 19.77, - "grad_norm": 1.5234375, - "learning_rate": 8.956692913385827e-06, - "loss": 0.1299, - "step": 3368 - }, - { - "epoch": 19.77, - "eval_loss": 0.5572808980941772, - "eval_runtime": 4.8863, - "eval_samples_per_second": 9.823, - "eval_steps_per_second": 1.228, - "step": 3368 - }, - { - "epoch": 19.82, - "grad_norm": 1.3828125, - "learning_rate": 8.858267716535434e-06, - "loss": 0.1281, - "step": 3376 - }, - { - "epoch": 19.82, - "eval_loss": 0.5595596432685852, - "eval_runtime": 4.8843, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, - "step": 3376 - }, - { - "epoch": 19.86, - "grad_norm": 1.3671875, - "learning_rate": 8.759842519685039e-06, - "loss": 0.132, - "step": 3384 - }, - { - "epoch": 19.86, - "eval_loss": 0.5580050349235535, - "eval_runtime": 4.8864, - "eval_samples_per_second": 9.823, - "eval_steps_per_second": 1.228, - "step": 3384 - }, - { - "epoch": 19.91, - "grad_norm": 1.4453125, - "learning_rate": 8.661417322834646e-06, - "loss": 0.1379, - "step": 3392 - }, - { - "epoch": 19.91, - "eval_loss": 0.5603541135787964, - "eval_runtime": 4.8836, - "eval_samples_per_second": 9.829, - "eval_steps_per_second": 1.229, - "step": 3392 - }, - { - "epoch": 19.96, - "grad_norm": 1.5078125, - "learning_rate": 8.562992125984252e-06, - "loss": 0.1282, - "step": 3400 - }, - { - "epoch": 19.96, - "eval_loss": 0.5588706135749817, - "eval_runtime": 4.883, - "eval_samples_per_second": 9.83, - "eval_steps_per_second": 1.229, - "step": 3400 - }, - { - "epoch": 20.0, - "grad_norm": 1.328125, - "learning_rate": 8.46456692913386e-06, - "loss": 0.1384, - "step": 3408 - }, - { - "epoch": 20.0, - "eval_loss": 0.5589407086372375, - "eval_runtime": 4.8904, - "eval_samples_per_second": 9.815, - "eval_steps_per_second": 1.227, - "step": 3408 - }, - { - "epoch": 20.05, - "grad_norm": 1.4765625, - "learning_rate": 8.366141732283466e-06, - "loss": 0.1241, - "step": 3416 - }, - { - "epoch": 20.05, - "eval_loss": 0.5755249857902527, - "eval_runtime": 4.8865, - "eval_samples_per_second": 9.823, - "eval_steps_per_second": 1.228, - "step": 3416 - }, - { - "epoch": 20.1, - "grad_norm": 1.3046875, - "learning_rate": 8.267716535433071e-06, - "loss": 0.1159, - "step": 3424 - }, - { - "epoch": 20.1, - "eval_loss": 0.5693344473838806, - "eval_runtime": 4.8876, - "eval_samples_per_second": 9.821, - "eval_steps_per_second": 1.228, - "step": 3424 - }, - { - "epoch": 20.14, - "grad_norm": 1.640625, - "learning_rate": 8.169291338582676e-06, - "loss": 0.1315, - "step": 3432 - }, - { - "epoch": 20.14, - "eval_loss": 0.566354513168335, - "eval_runtime": 4.8833, - "eval_samples_per_second": 9.829, - "eval_steps_per_second": 1.229, - "step": 3432 - }, - { - "epoch": 20.19, - "grad_norm": 1.5078125, - "learning_rate": 8.070866141732283e-06, - "loss": 0.122, - "step": 3440 - }, - { - "epoch": 20.19, - "eval_loss": 0.571541428565979, - "eval_runtime": 4.8845, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, - "step": 3440 - }, - { - "epoch": 20.24, - "grad_norm": 1.296875, - "learning_rate": 7.97244094488189e-06, - "loss": 0.1196, - "step": 3448 - }, - { - "epoch": 20.24, - "eval_loss": 0.5679660439491272, - "eval_runtime": 4.8818, - "eval_samples_per_second": 9.832, - "eval_steps_per_second": 1.229, - "step": 3448 - }, - { - "epoch": 20.28, - "grad_norm": 1.265625, - "learning_rate": 7.874015748031496e-06, - "loss": 0.1212, - "step": 3456 - }, - { - "epoch": 20.28, - "eval_loss": 0.5651406049728394, - "eval_runtime": 4.8802, - "eval_samples_per_second": 9.836, - "eval_steps_per_second": 1.229, - "step": 3456 - }, - { - "epoch": 20.33, - "grad_norm": 1.34375, - "learning_rate": 7.775590551181103e-06, - "loss": 0.1253, - "step": 3464 - }, - { - "epoch": 20.33, - "eval_loss": 0.5682979822158813, - "eval_runtime": 4.8807, - "eval_samples_per_second": 9.835, - "eval_steps_per_second": 1.229, - "step": 3464 - }, - { - "epoch": 20.38, - "grad_norm": 1.5234375, - "learning_rate": 7.67716535433071e-06, - "loss": 0.1247, - "step": 3472 - }, - { - "epoch": 20.38, - "eval_loss": 0.5737568736076355, - "eval_runtime": 4.8865, - "eval_samples_per_second": 9.823, - "eval_steps_per_second": 1.228, - "step": 3472 - }, - { - "epoch": 20.43, - "grad_norm": 1.5625, - "learning_rate": 7.578740157480316e-06, - "loss": 0.1237, - "step": 3480 - }, - { - "epoch": 20.43, - "eval_loss": 0.5683262944221497, - "eval_runtime": 4.8826, - "eval_samples_per_second": 9.831, - "eval_steps_per_second": 1.229, - "step": 3480 - }, - { - "epoch": 20.47, - "grad_norm": 1.40625, - "learning_rate": 7.4803149606299226e-06, - "loss": 0.1257, - "step": 3488 - }, - { - "epoch": 20.47, - "eval_loss": 0.5667000412940979, - "eval_runtime": 4.8835, - "eval_samples_per_second": 9.829, - "eval_steps_per_second": 1.229, - "step": 3488 - }, - { - "epoch": 20.52, - "grad_norm": 1.3125, - "learning_rate": 7.381889763779528e-06, - "loss": 0.1146, - "step": 3496 - }, - { - "epoch": 20.52, - "eval_loss": 0.5698798298835754, - "eval_runtime": 4.8845, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, - "step": 3496 - }, - { - "epoch": 20.57, - "grad_norm": 1.4296875, - "learning_rate": 7.2834645669291335e-06, - "loss": 0.1211, - "step": 3504 - }, - { - "epoch": 20.57, - "eval_loss": 0.5667474269866943, - "eval_runtime": 4.8839, - "eval_samples_per_second": 9.828, - "eval_steps_per_second": 1.229, - "step": 3504 - }, - { - "epoch": 20.61, - "grad_norm": 1.4375, - "learning_rate": 7.18503937007874e-06, - "loss": 0.133, - "step": 3512 - }, - { - "epoch": 20.61, - "eval_loss": 0.5643436312675476, - "eval_runtime": 4.8843, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, - "step": 3512 - }, - { - "epoch": 20.66, - "grad_norm": 1.4921875, - "learning_rate": 7.086614173228347e-06, - "loss": 0.1253, - "step": 3520 - }, - { - "epoch": 20.66, - "eval_loss": 0.5657409429550171, - "eval_runtime": 4.8829, - "eval_samples_per_second": 9.83, - "eval_steps_per_second": 1.229, - "step": 3520 - }, - { - "epoch": 20.71, - "grad_norm": 1.3203125, - "learning_rate": 6.988188976377953e-06, - "loss": 0.1217, - "step": 3528 - }, - { - "epoch": 20.71, - "eval_loss": 0.5663951635360718, - "eval_runtime": 4.8868, - "eval_samples_per_second": 9.822, - "eval_steps_per_second": 1.228, - "step": 3528 - }, - { - "epoch": 20.75, - "grad_norm": 1.4921875, - "learning_rate": 6.88976377952756e-06, - "loss": 0.1385, - "step": 3536 - }, - { - "epoch": 20.75, - "eval_loss": 0.5656896233558655, - "eval_runtime": 4.8883, - "eval_samples_per_second": 9.819, - "eval_steps_per_second": 1.227, - "step": 3536 - }, - { - "epoch": 20.8, - "grad_norm": 1.34375, - "learning_rate": 6.7913385826771665e-06, - "loss": 0.1312, - "step": 3544 - }, - { - "epoch": 20.8, - "eval_loss": 0.5687183737754822, - "eval_runtime": 4.8862, - "eval_samples_per_second": 9.824, - "eval_steps_per_second": 1.228, - "step": 3544 - }, - { - "epoch": 20.85, - "grad_norm": 1.5390625, - "learning_rate": 6.692913385826772e-06, - "loss": 0.1279, - "step": 3552 - }, - { - "epoch": 20.85, - "eval_loss": 0.5673196911811829, - "eval_runtime": 4.8874, - "eval_samples_per_second": 9.821, - "eval_steps_per_second": 1.228, - "step": 3552 - }, - { - "epoch": 20.9, - "grad_norm": 1.3984375, - "learning_rate": 6.5944881889763775e-06, - "loss": 0.1298, - "step": 3560 - }, - { - "epoch": 20.9, - "eval_loss": 0.5674422383308411, - "eval_runtime": 4.8854, - "eval_samples_per_second": 9.825, - "eval_steps_per_second": 1.228, - "step": 3560 - }, - { - "epoch": 20.94, - "grad_norm": 1.4921875, - "learning_rate": 6.496062992125984e-06, - "loss": 0.1309, - "step": 3568 - }, - { - "epoch": 20.94, - "eval_loss": 0.566185474395752, - "eval_runtime": 4.8849, - "eval_samples_per_second": 9.826, - "eval_steps_per_second": 1.228, - "step": 3568 - }, - { - "epoch": 20.99, - "grad_norm": 1.4296875, - "learning_rate": 6.397637795275591e-06, - "loss": 0.125, - "step": 3576 - }, - { - "epoch": 20.99, - "eval_loss": 0.5648412704467773, - "eval_runtime": 4.895, - "eval_samples_per_second": 9.806, - "eval_steps_per_second": 1.226, - "step": 3576 - }, - { - "epoch": 21.04, - "grad_norm": 1.3046875, - "learning_rate": 6.299212598425197e-06, - "loss": 0.1252, - "step": 3584 - }, - { - "epoch": 21.04, - "eval_loss": 0.5721383690834045, - "eval_runtime": 4.8845, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, - "step": 3584 - }, - { - "epoch": 21.08, - "grad_norm": 1.546875, - "learning_rate": 6.200787401574804e-06, - "loss": 0.1339, - "step": 3592 - }, - { - "epoch": 21.08, - "eval_loss": 0.5767750144004822, - "eval_runtime": 4.8849, - "eval_samples_per_second": 9.826, - "eval_steps_per_second": 1.228, - "step": 3592 - }, - { - "epoch": 21.13, - "grad_norm": 1.2890625, - "learning_rate": 6.10236220472441e-06, - "loss": 0.1167, - "step": 3600 - }, - { - "epoch": 21.13, - "eval_loss": 0.5756697058677673, - "eval_runtime": 4.8824, - "eval_samples_per_second": 9.831, - "eval_steps_per_second": 1.229, - "step": 3600 - }, - { - "epoch": 21.18, - "grad_norm": 1.328125, - "learning_rate": 6.0039370078740155e-06, - "loss": 0.1307, - "step": 3608 - }, - { - "epoch": 21.18, - "eval_loss": 0.5740421414375305, - "eval_runtime": 4.8862, - "eval_samples_per_second": 9.824, - "eval_steps_per_second": 1.228, - "step": 3608 - }, - { - "epoch": 21.22, - "grad_norm": 1.3984375, - "learning_rate": 5.905511811023622e-06, - "loss": 0.1229, - "step": 3616 - }, - { - "epoch": 21.22, - "eval_loss": 0.5752490162849426, - "eval_runtime": 4.8844, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, - "step": 3616 - }, - { - "epoch": 21.27, - "grad_norm": 1.3046875, - "learning_rate": 5.807086614173229e-06, - "loss": 0.1177, - "step": 3624 - }, - { - "epoch": 21.27, - "eval_loss": 0.5777534246444702, - "eval_runtime": 4.8857, - "eval_samples_per_second": 9.825, - "eval_steps_per_second": 1.228, - "step": 3624 - }, - { - "epoch": 21.32, - "grad_norm": 1.3515625, - "learning_rate": 5.708661417322835e-06, - "loss": 0.1255, - "step": 3632 - }, - { - "epoch": 21.32, - "eval_loss": 0.5778191685676575, - "eval_runtime": 4.888, - "eval_samples_per_second": 9.82, - "eval_steps_per_second": 1.228, - "step": 3632 - }, - { - "epoch": 21.36, - "grad_norm": 1.375, - "learning_rate": 5.610236220472441e-06, - "loss": 0.1263, - "step": 3640 - }, - { - "epoch": 21.36, - "eval_loss": 0.5756245255470276, - "eval_runtime": 4.8829, - "eval_samples_per_second": 9.83, - "eval_steps_per_second": 1.229, - "step": 3640 - }, - { - "epoch": 21.41, - "grad_norm": 1.390625, - "learning_rate": 5.511811023622048e-06, - "loss": 0.1201, - "step": 3648 - }, - { - "epoch": 21.41, - "eval_loss": 0.572148859500885, - "eval_runtime": 4.8807, - "eval_samples_per_second": 9.835, - "eval_steps_per_second": 1.229, - "step": 3648 - }, - { - "epoch": 21.46, - "grad_norm": 1.3828125, - "learning_rate": 5.4133858267716535e-06, - "loss": 0.1193, - "step": 3656 - }, - { - "epoch": 21.46, - "eval_loss": 0.5735480189323425, - "eval_runtime": 4.8918, - "eval_samples_per_second": 9.812, - "eval_steps_per_second": 1.227, - "step": 3656 - }, - { - "epoch": 21.51, - "grad_norm": 1.40625, - "learning_rate": 5.31496062992126e-06, - "loss": 0.1238, - "step": 3664 - }, - { - "epoch": 21.51, - "eval_loss": 0.5760764479637146, - "eval_runtime": 4.887, - "eval_samples_per_second": 9.822, - "eval_steps_per_second": 1.228, - "step": 3664 - }, - { - "epoch": 21.55, - "grad_norm": 1.4765625, - "learning_rate": 5.216535433070866e-06, - "loss": 0.1174, - "step": 3672 - }, - { - "epoch": 21.55, - "eval_loss": 0.5745072960853577, - "eval_runtime": 4.8836, - "eval_samples_per_second": 9.829, - "eval_steps_per_second": 1.229, - "step": 3672 - }, - { - "epoch": 21.6, - "grad_norm": 1.328125, - "learning_rate": 5.118110236220473e-06, - "loss": 0.1207, - "step": 3680 - }, - { - "epoch": 21.6, - "eval_loss": 0.5743126273155212, - "eval_runtime": 4.8822, - "eval_samples_per_second": 9.832, - "eval_steps_per_second": 1.229, - "step": 3680 - }, - { - "epoch": 21.65, - "grad_norm": 1.3046875, - "learning_rate": 5.019685039370079e-06, - "loss": 0.1276, - "step": 3688 - }, - { - "epoch": 21.65, - "eval_loss": 0.5735090374946594, - "eval_runtime": 4.8856, - "eval_samples_per_second": 9.825, - "eval_steps_per_second": 1.228, - "step": 3688 - }, - { - "epoch": 21.69, - "grad_norm": 1.3125, - "learning_rate": 4.921259842519686e-06, - "loss": 0.1202, - "step": 3696 - }, - { - "epoch": 21.69, - "eval_loss": 0.5732198357582092, - "eval_runtime": 4.8809, - "eval_samples_per_second": 9.834, - "eval_steps_per_second": 1.229, - "step": 3696 - }, - { - "epoch": 21.74, - "grad_norm": 1.296875, - "learning_rate": 4.8228346456692916e-06, - "loss": 0.1221, - "step": 3704 - }, - { - "epoch": 21.74, - "eval_loss": 0.5724496245384216, - "eval_runtime": 4.8876, - "eval_samples_per_second": 9.821, - "eval_steps_per_second": 1.228, - "step": 3704 - }, - { - "epoch": 21.79, - "grad_norm": 1.328125, - "learning_rate": 4.7244094488188975e-06, - "loss": 0.1243, - "step": 3712 - }, - { - "epoch": 21.79, - "eval_loss": 0.5729621648788452, - "eval_runtime": 4.8869, - "eval_samples_per_second": 9.822, - "eval_steps_per_second": 1.228, - "step": 3712 - }, - { - "epoch": 21.83, - "grad_norm": 1.3828125, - "learning_rate": 4.625984251968504e-06, - "loss": 0.1243, - "step": 3720 - }, - { - "epoch": 21.83, - "eval_loss": 0.5742912888526917, - "eval_runtime": 4.8842, - "eval_samples_per_second": 9.828, - "eval_steps_per_second": 1.228, - "step": 3720 - }, - { - "epoch": 21.88, - "grad_norm": 1.3046875, - "learning_rate": 4.527559055118111e-06, - "loss": 0.114, - "step": 3728 - }, - { - "epoch": 21.88, - "eval_loss": 0.5741546154022217, - "eval_runtime": 4.8867, - "eval_samples_per_second": 9.823, - "eval_steps_per_second": 1.228, - "step": 3728 - }, - { - "epoch": 21.93, - "grad_norm": 1.3984375, - "learning_rate": 4.429133858267717e-06, - "loss": 0.1256, - "step": 3736 - }, - { - "epoch": 21.93, - "eval_loss": 0.5746021866798401, - "eval_runtime": 4.8831, - "eval_samples_per_second": 9.83, - "eval_steps_per_second": 1.229, - "step": 3736 - }, - { - "epoch": 21.98, - "grad_norm": 1.28125, - "learning_rate": 4.330708661417323e-06, - "loss": 0.1167, - "step": 3744 - }, - { - "epoch": 21.98, - "eval_loss": 0.5760999321937561, - "eval_runtime": 4.8858, - "eval_samples_per_second": 9.824, - "eval_steps_per_second": 1.228, - "step": 3744 - }, - { - "epoch": 22.02, - "grad_norm": 1.265625, - "learning_rate": 4.23228346456693e-06, - "loss": 0.1191, - "step": 3752 - }, - { - "epoch": 22.02, - "eval_loss": 0.5751403570175171, - "eval_runtime": 4.8805, - "eval_samples_per_second": 9.835, - "eval_steps_per_second": 1.229, - "step": 3752 - }, - { - "epoch": 22.07, - "grad_norm": 1.3046875, - "learning_rate": 4.1338582677165355e-06, - "loss": 0.1213, - "step": 3760 - }, - { - "epoch": 22.07, - "eval_loss": 0.5758962631225586, - "eval_runtime": 4.8908, - "eval_samples_per_second": 9.814, - "eval_steps_per_second": 1.227, - "step": 3760 - }, - { - "epoch": 22.12, - "grad_norm": 1.3671875, - "learning_rate": 4.035433070866141e-06, - "loss": 0.117, - "step": 3768 - }, - { - "epoch": 22.12, - "eval_loss": 0.5779056549072266, - "eval_runtime": 4.8823, - "eval_samples_per_second": 9.832, - "eval_steps_per_second": 1.229, - "step": 3768 - }, - { - "epoch": 22.16, - "grad_norm": 1.53125, - "learning_rate": 3.937007874015748e-06, - "loss": 0.1283, - "step": 3776 - }, - { - "epoch": 22.16, - "eval_loss": 0.5779562592506409, - "eval_runtime": 4.8838, - "eval_samples_per_second": 9.828, - "eval_steps_per_second": 1.229, - "step": 3776 - }, - { - "epoch": 22.21, - "grad_norm": 1.296875, - "learning_rate": 3.838582677165355e-06, - "loss": 0.1231, - "step": 3784 - }, - { - "epoch": 22.21, - "eval_loss": 0.577904999256134, - "eval_runtime": 4.8859, - "eval_samples_per_second": 9.824, - "eval_steps_per_second": 1.228, - "step": 3784 - }, - { - "epoch": 22.26, - "grad_norm": 1.2890625, - "learning_rate": 3.7401574803149613e-06, - "loss": 0.1159, - "step": 3792 - }, - { - "epoch": 22.26, - "eval_loss": 0.577033519744873, - "eval_runtime": 4.8788, - "eval_samples_per_second": 9.838, - "eval_steps_per_second": 1.23, - "step": 3792 - }, - { - "epoch": 22.3, - "grad_norm": 1.4921875, - "learning_rate": 3.6417322834645668e-06, - "loss": 0.1194, - "step": 3800 - }, - { - "epoch": 22.3, - "eval_loss": 0.5773031115531921, - "eval_runtime": 4.8861, - "eval_samples_per_second": 9.824, - "eval_steps_per_second": 1.228, - "step": 3800 - }, - { - "epoch": 22.35, - "grad_norm": 1.4140625, - "learning_rate": 3.5433070866141735e-06, - "loss": 0.1258, - "step": 3808 - }, - { - "epoch": 22.35, - "eval_loss": 0.5772604942321777, - "eval_runtime": 4.8813, - "eval_samples_per_second": 9.834, - "eval_steps_per_second": 1.229, - "step": 3808 - }, - { - "epoch": 22.4, - "grad_norm": 1.203125, - "learning_rate": 3.44488188976378e-06, - "loss": 0.1194, - "step": 3816 - }, - { - "epoch": 22.4, - "eval_loss": 0.576747477054596, - "eval_runtime": 4.8816, - "eval_samples_per_second": 9.833, - "eval_steps_per_second": 1.229, - "step": 3816 - }, - { - "epoch": 22.44, - "grad_norm": 1.28125, - "learning_rate": 3.346456692913386e-06, - "loss": 0.117, - "step": 3824 - }, - { - "epoch": 22.44, - "eval_loss": 0.577796459197998, - "eval_runtime": 4.8802, - "eval_samples_per_second": 9.836, - "eval_steps_per_second": 1.229, - "step": 3824 - }, - { - "epoch": 22.49, - "grad_norm": 1.4453125, - "learning_rate": 3.248031496062992e-06, - "loss": 0.1213, - "step": 3832 - }, - { - "epoch": 22.49, - "eval_loss": 0.578058660030365, - "eval_runtime": 4.8805, - "eval_samples_per_second": 9.835, - "eval_steps_per_second": 1.229, - "step": 3832 - }, - { - "epoch": 22.54, - "grad_norm": 1.4921875, - "learning_rate": 3.1496062992125985e-06, - "loss": 0.1106, - "step": 3840 - }, - { - "epoch": 22.54, - "eval_loss": 0.5779960751533508, - "eval_runtime": 4.8851, - "eval_samples_per_second": 9.826, - "eval_steps_per_second": 1.228, - "step": 3840 - }, - { - "epoch": 22.59, - "grad_norm": 1.3515625, - "learning_rate": 3.051181102362205e-06, - "loss": 0.1225, - "step": 3848 - }, - { - "epoch": 22.59, - "eval_loss": 0.5778743624687195, - "eval_runtime": 4.8853, - "eval_samples_per_second": 9.825, - "eval_steps_per_second": 1.228, - "step": 3848 - }, - { - "epoch": 22.63, - "grad_norm": 1.265625, - "learning_rate": 2.952755905511811e-06, - "loss": 0.1201, - "step": 3856 - }, - { - "epoch": 22.63, - "eval_loss": 0.5778095126152039, - "eval_runtime": 4.8862, - "eval_samples_per_second": 9.824, - "eval_steps_per_second": 1.228, - "step": 3856 - }, - { - "epoch": 22.68, - "grad_norm": 1.3046875, - "learning_rate": 2.8543307086614175e-06, - "loss": 0.1123, - "step": 3864 - }, - { - "epoch": 22.68, - "eval_loss": 0.576666533946991, - "eval_runtime": 4.8875, - "eval_samples_per_second": 9.821, - "eval_steps_per_second": 1.228, - "step": 3864 - }, - { - "epoch": 22.73, - "grad_norm": 1.515625, - "learning_rate": 2.755905511811024e-06, - "loss": 0.1265, - "step": 3872 - }, - { - "epoch": 22.73, - "eval_loss": 0.5777731537818909, - "eval_runtime": 4.8845, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, - "step": 3872 - }, - { - "epoch": 22.77, - "grad_norm": 1.390625, - "learning_rate": 2.65748031496063e-06, - "loss": 0.121, - "step": 3880 - }, - { - "epoch": 22.77, - "eval_loss": 0.5775730609893799, - "eval_runtime": 4.8883, - "eval_samples_per_second": 9.819, - "eval_steps_per_second": 1.227, - "step": 3880 - }, - { - "epoch": 22.82, - "grad_norm": 1.4375, - "learning_rate": 2.5590551181102365e-06, - "loss": 0.1233, - "step": 3888 - }, - { - "epoch": 22.82, - "eval_loss": 0.5771867036819458, - "eval_runtime": 4.8877, - "eval_samples_per_second": 9.821, - "eval_steps_per_second": 1.228, - "step": 3888 - }, - { - "epoch": 22.87, - "grad_norm": 1.3046875, - "learning_rate": 2.460629921259843e-06, - "loss": 0.1191, - "step": 3896 - }, - { - "epoch": 22.87, - "eval_loss": 0.5772641897201538, - "eval_runtime": 4.8852, - "eval_samples_per_second": 9.826, - "eval_steps_per_second": 1.228, - "step": 3896 - }, - { - "epoch": 22.91, - "grad_norm": 1.375, - "learning_rate": 2.3622047244094487e-06, - "loss": 0.127, - "step": 3904 - }, - { - "epoch": 22.91, - "eval_loss": 0.577972412109375, - "eval_runtime": 4.8831, - "eval_samples_per_second": 9.83, - "eval_steps_per_second": 1.229, - "step": 3904 - }, - { - "epoch": 22.96, - "grad_norm": 1.2265625, - "learning_rate": 2.2637795275590555e-06, - "loss": 0.127, - "step": 3912 - }, - { - "epoch": 22.96, - "eval_loss": 0.5772601366043091, - "eval_runtime": 4.8799, - "eval_samples_per_second": 9.836, - "eval_steps_per_second": 1.23, - "step": 3912 - }, - { - "epoch": 23.01, - "grad_norm": 1.3125, - "learning_rate": 2.1653543307086614e-06, - "loss": 0.1155, - "step": 3920 - }, - { - "epoch": 23.01, - "eval_loss": 0.5777209997177124, - "eval_runtime": 4.8863, - "eval_samples_per_second": 9.823, - "eval_steps_per_second": 1.228, - "step": 3920 - }, - { - "epoch": 23.06, - "grad_norm": 1.203125, - "learning_rate": 2.0669291338582678e-06, - "loss": 0.1133, - "step": 3928 - }, - { - "epoch": 23.06, - "eval_loss": 0.5779727697372437, - "eval_runtime": 4.8894, - "eval_samples_per_second": 9.817, - "eval_steps_per_second": 1.227, - "step": 3928 - }, - { - "epoch": 23.1, - "grad_norm": 1.3515625, - "learning_rate": 1.968503937007874e-06, - "loss": 0.115, - "step": 3936 - }, - { - "epoch": 23.1, - "eval_loss": 0.5777748823165894, - "eval_runtime": 4.8872, - "eval_samples_per_second": 9.822, - "eval_steps_per_second": 1.228, - "step": 3936 - }, - { - "epoch": 23.15, - "grad_norm": 1.3203125, - "learning_rate": 1.8700787401574806e-06, - "loss": 0.1133, - "step": 3944 - }, - { - "epoch": 23.15, - "eval_loss": 0.578517735004425, - "eval_runtime": 4.8877, - "eval_samples_per_second": 9.821, - "eval_steps_per_second": 1.228, - "step": 3944 - }, - { - "epoch": 23.2, - "grad_norm": 1.3828125, - "learning_rate": 1.7716535433070868e-06, - "loss": 0.1243, - "step": 3952 - }, - { - "epoch": 23.2, - "eval_loss": 0.5775905251502991, - "eval_runtime": 4.8803, - "eval_samples_per_second": 9.835, - "eval_steps_per_second": 1.229, - "step": 3952 - }, - { - "epoch": 23.24, - "grad_norm": 1.21875, - "learning_rate": 1.673228346456693e-06, - "loss": 0.1255, - "step": 3960 - }, - { - "epoch": 23.24, - "eval_loss": 0.579028308391571, - "eval_runtime": 4.8843, - "eval_samples_per_second": 9.827, - "eval_steps_per_second": 1.228, - "step": 3960 - }, - { - "epoch": 23.29, - "grad_norm": 1.3984375, - "learning_rate": 1.5748031496062992e-06, - "loss": 0.1082, - "step": 3968 - }, - { - "epoch": 23.29, - "eval_loss": 0.5788905024528503, - "eval_runtime": 4.8872, - "eval_samples_per_second": 9.822, - "eval_steps_per_second": 1.228, - "step": 3968 - }, - { - "epoch": 23.34, - "grad_norm": 1.328125, - "learning_rate": 1.4763779527559056e-06, - "loss": 0.1122, - "step": 3976 - }, - { - "epoch": 23.34, - "eval_loss": 0.5791129469871521, - "eval_runtime": 4.8841, - "eval_samples_per_second": 9.828, - "eval_steps_per_second": 1.228, - "step": 3976 - }, - { - "epoch": 23.38, - "grad_norm": 1.3203125, - "learning_rate": 1.377952755905512e-06, - "loss": 0.1275, - "step": 3984 - }, - { - "epoch": 23.38, - "eval_loss": 0.5788193345069885, - "eval_runtime": 4.8848, - "eval_samples_per_second": 9.826, - "eval_steps_per_second": 1.228, - "step": 3984 - }, - { - "epoch": 23.43, - "grad_norm": 1.34375, - "learning_rate": 1.2795275590551182e-06, - "loss": 0.1234, - "step": 3992 - }, - { - "epoch": 23.43, - "eval_loss": 0.5793178081512451, - "eval_runtime": 4.8814, - "eval_samples_per_second": 9.833, - "eval_steps_per_second": 1.229, - "step": 3992 - }, - { - "epoch": 23.48, - "grad_norm": 1.390625, - "learning_rate": 1.1811023622047244e-06, - "loss": 0.1239, - "step": 4000 - }, - { - "epoch": 23.48, - "eval_loss": 0.5789151787757874, - "eval_runtime": 4.8797, - "eval_samples_per_second": 9.837, - "eval_steps_per_second": 1.23, - "step": 4000 - }, - { - "epoch": 23.52, - "grad_norm": 1.4140625, - "learning_rate": 1.0826771653543307e-06, - "loss": 0.1203, - "step": 4008 - }, - { - "epoch": 23.52, - "eval_loss": 0.5790886282920837, - "eval_runtime": 4.882, - "eval_samples_per_second": 9.832, - "eval_steps_per_second": 1.229, - "step": 4008 - }, - { - "epoch": 23.57, - "grad_norm": 1.3984375, - "learning_rate": 9.84251968503937e-07, - "loss": 0.1218, - "step": 4016 - }, - { - "epoch": 23.57, - "eval_loss": 0.5786433815956116, - "eval_runtime": 4.8874, - "eval_samples_per_second": 9.821, - "eval_steps_per_second": 1.228, - "step": 4016 - }, - { - "epoch": 23.62, - "grad_norm": 1.359375, - "learning_rate": 8.858267716535434e-07, - "loss": 0.13, - "step": 4024 - }, - { - "epoch": 23.62, - "eval_loss": 0.5786345601081848, - "eval_runtime": 4.8839, - "eval_samples_per_second": 9.828, - "eval_steps_per_second": 1.229, - "step": 4024 - }, - { - "epoch": 23.67, - "grad_norm": 1.3203125, - "learning_rate": 7.874015748031496e-07, - "loss": 0.124, - "step": 4032 - }, - { - "epoch": 23.67, - "eval_loss": 0.5792049765586853, - "eval_runtime": 4.8834, - "eval_samples_per_second": 9.829, - "eval_steps_per_second": 1.229, - "step": 4032 - }, - { - "epoch": 23.71, - "grad_norm": 1.4375, - "learning_rate": 6.88976377952756e-07, - "loss": 0.1141, - "step": 4040 - }, - { - "epoch": 23.71, - "eval_loss": 0.5789989829063416, - "eval_runtime": 4.886, - "eval_samples_per_second": 9.824, - "eval_steps_per_second": 1.228, - "step": 4040 - }, - { - "epoch": 23.76, - "grad_norm": 1.40625, - "learning_rate": 5.905511811023622e-07, - "loss": 0.118, - "step": 4048 - }, - { - "epoch": 23.76, - "eval_loss": 0.5784198641777039, - "eval_runtime": 4.8871, - "eval_samples_per_second": 9.822, - "eval_steps_per_second": 1.228, - "step": 4048 - }, - { - "epoch": 23.81, - "grad_norm": 1.390625, - "learning_rate": 4.921259842519685e-07, - "loss": 0.1276, - "step": 4056 - }, - { - "epoch": 23.81, - "eval_loss": 0.57962566614151, - "eval_runtime": 4.8775, - "eval_samples_per_second": 9.841, - "eval_steps_per_second": 1.23, - "step": 4056 - }, - { - "epoch": 23.85, - "grad_norm": 1.3515625, - "learning_rate": 3.937007874015748e-07, - "loss": 0.1152, - "step": 4064 - }, - { - "epoch": 23.85, - "eval_loss": 0.5792584419250488, - "eval_runtime": 4.884, - "eval_samples_per_second": 9.828, - "eval_steps_per_second": 1.228, - "step": 4064 - }, - { - "epoch": 23.9, - "grad_norm": 1.359375, - "learning_rate": 2.952755905511811e-07, - "loss": 0.1292, - "step": 4072 - }, - { - "epoch": 23.9, - "eval_loss": 0.5795792937278748, - "eval_runtime": 4.8815, - "eval_samples_per_second": 9.833, - "eval_steps_per_second": 1.229, - "step": 4072 - }, - { - "epoch": 23.95, - "grad_norm": 1.296875, - "learning_rate": 1.968503937007874e-07, - "loss": 0.1201, - "step": 4080 - }, - { - "epoch": 23.95, - "eval_loss": 0.5792078375816345, - "eval_runtime": 4.888, - "eval_samples_per_second": 9.82, - "eval_steps_per_second": 1.227, - "step": 4080 - }, - { - "epoch": 23.99, - "grad_norm": 1.4140625, - "learning_rate": 9.84251968503937e-08, - "loss": 0.1213, - "step": 4088 - }, - { - "epoch": 23.99, - "eval_loss": 0.5791333317756653, - "eval_runtime": 4.8822, - "eval_samples_per_second": 9.832, - "eval_steps_per_second": 1.229, - "step": 4088 - }, - { - "epoch": 24.04, - "grad_norm": 1.328125, - "learning_rate": 0.0, - "loss": 0.1246, - "step": 4096 - }, - { - "epoch": 24.04, - "eval_loss": 0.5793049335479736, - "eval_runtime": 4.8897, - "eval_samples_per_second": 9.817, - "eval_steps_per_second": 1.227, - "step": 4096 - }, - { - "epoch": 24.04, - "step": 4096, - "total_flos": 2.6639402315603804e+18, - "train_loss": 0.24291648308280855, - "train_runtime": 18372.1559, - "train_samples_per_second": 3.567, - "train_steps_per_second": 0.223 + "epoch": 3.0, + "step": 512, + "total_flos": 9.863515762146509e+16, + "train_loss": 0.5746448771096766, + "train_runtime": 1124.5772, + "train_samples_per_second": 7.285, + "train_steps_per_second": 0.455 } ], "logging_steps": 8, - "max_steps": 4096, + "max_steps": 512, "num_input_tokens_seen": 0, - "num_train_epochs": 25, - "save_steps": 8, - "total_flos": 2.6639402315603804e+18, - "train_batch_size": 2, + "num_train_epochs": 4, + "save_steps": 32, + "total_flos": 9.863515762146509e+16, + "train_batch_size": 4, "trial_name": null, "trial_params": null }