| { | |
| "best_metric": 0.9716312056737588, | |
| "best_model_checkpoint": "./results/checkpoint-3807", | |
| "epoch": 70.0, | |
| "eval_steps": 500, | |
| "global_step": 5670, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 6.570446014404297, | |
| "learning_rate": 1.9728395061728395e-05, | |
| "loss": 2.6389, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_accuracy": 0.475177304964539, | |
| "eval_loss": 0.7098350524902344, | |
| "eval_runtime": 0.3441, | |
| "eval_samples_per_second": 819.478, | |
| "eval_steps_per_second": 52.307, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 4.623419284820557, | |
| "learning_rate": 1.944268077601411e-05, | |
| "loss": 0.6477, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_accuracy": 0.48226950354609927, | |
| "eval_loss": 0.7516428828239441, | |
| "eval_runtime": 0.3434, | |
| "eval_samples_per_second": 821.083, | |
| "eval_steps_per_second": 52.41, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 10.926794052124023, | |
| "learning_rate": 1.9156966490299824e-05, | |
| "loss": 0.6227, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_accuracy": 0.4929078014184397, | |
| "eval_loss": 0.8317187428474426, | |
| "eval_runtime": 0.3439, | |
| "eval_samples_per_second": 820.006, | |
| "eval_steps_per_second": 52.341, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 12.648384094238281, | |
| "learning_rate": 1.887125220458554e-05, | |
| "loss": 0.5403, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_accuracy": 0.4929078014184397, | |
| "eval_loss": 1.9380121231079102, | |
| "eval_runtime": 0.3424, | |
| "eval_samples_per_second": 823.561, | |
| "eval_steps_per_second": 52.568, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 23.567258834838867, | |
| "learning_rate": 1.8585537918871256e-05, | |
| "loss": 0.5108, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_accuracy": 0.49645390070921985, | |
| "eval_loss": 2.270359754562378, | |
| "eval_runtime": 0.3437, | |
| "eval_samples_per_second": 820.538, | |
| "eval_steps_per_second": 52.375, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "grad_norm": 3.5719075202941895, | |
| "learning_rate": 1.830335097001764e-05, | |
| "loss": 0.4677, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_accuracy": 0.48936170212765956, | |
| "eval_loss": 1.6858181953430176, | |
| "eval_runtime": 0.3432, | |
| "eval_samples_per_second": 821.693, | |
| "eval_steps_per_second": 52.449, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "grad_norm": 7.08165168762207, | |
| "learning_rate": 1.8017636684303353e-05, | |
| "loss": 0.4798, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "eval_accuracy": 0.49645390070921985, | |
| "eval_loss": 1.623734712600708, | |
| "eval_runtime": 0.3436, | |
| "eval_samples_per_second": 820.682, | |
| "eval_steps_per_second": 52.384, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "grad_norm": 10.894269943237305, | |
| "learning_rate": 1.773192239858907e-05, | |
| "loss": 0.4817, | |
| "step": 648 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_accuracy": 0.5141843971631206, | |
| "eval_loss": 1.3935478925704956, | |
| "eval_runtime": 0.3435, | |
| "eval_samples_per_second": 821.029, | |
| "eval_steps_per_second": 52.406, | |
| "step": 648 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "grad_norm": 7.739453315734863, | |
| "learning_rate": 1.744620811287478e-05, | |
| "loss": 0.4668, | |
| "step": 729 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_accuracy": 0.5177304964539007, | |
| "eval_loss": 1.259345531463623, | |
| "eval_runtime": 0.343, | |
| "eval_samples_per_second": 822.053, | |
| "eval_steps_per_second": 52.471, | |
| "step": 729 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 17.012800216674805, | |
| "learning_rate": 1.7160493827160498e-05, | |
| "loss": 0.4359, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_accuracy": 0.5354609929078015, | |
| "eval_loss": 1.310729742050171, | |
| "eval_runtime": 0.3436, | |
| "eval_samples_per_second": 820.694, | |
| "eval_steps_per_second": 52.385, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "grad_norm": 1.6642764806747437, | |
| "learning_rate": 1.687477954144621e-05, | |
| "loss": 0.3956, | |
| "step": 891 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "eval_accuracy": 0.8226950354609929, | |
| "eval_loss": 0.43421775102615356, | |
| "eval_runtime": 0.3435, | |
| "eval_samples_per_second": 820.982, | |
| "eval_steps_per_second": 52.403, | |
| "step": 891 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "grad_norm": 0.3688388168811798, | |
| "learning_rate": 1.6589065255731923e-05, | |
| "loss": 0.2906, | |
| "step": 972 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "eval_accuracy": 0.9290780141843972, | |
| "eval_loss": 0.23947754502296448, | |
| "eval_runtime": 0.3442, | |
| "eval_samples_per_second": 819.25, | |
| "eval_steps_per_second": 52.293, | |
| "step": 972 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "grad_norm": 37.02349853515625, | |
| "learning_rate": 1.630335097001764e-05, | |
| "loss": 0.2146, | |
| "step": 1053 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "eval_accuracy": 0.9397163120567376, | |
| "eval_loss": 0.33284759521484375, | |
| "eval_runtime": 0.3437, | |
| "eval_samples_per_second": 820.462, | |
| "eval_steps_per_second": 52.37, | |
| "step": 1053 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "grad_norm": 0.11939908564090729, | |
| "learning_rate": 1.601763668430335e-05, | |
| "loss": 0.1462, | |
| "step": 1134 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "eval_accuracy": 0.950354609929078, | |
| "eval_loss": 0.3009294867515564, | |
| "eval_runtime": 0.3439, | |
| "eval_samples_per_second": 819.984, | |
| "eval_steps_per_second": 52.339, | |
| "step": 1134 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "grad_norm": 0.08733003586530685, | |
| "learning_rate": 1.5731922398589064e-05, | |
| "loss": 0.1062, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "eval_accuracy": 0.9290780141843972, | |
| "eval_loss": 0.21407951414585114, | |
| "eval_runtime": 0.3436, | |
| "eval_samples_per_second": 820.805, | |
| "eval_steps_per_second": 52.392, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "grad_norm": 0.21886540949344635, | |
| "learning_rate": 1.544620811287478e-05, | |
| "loss": 0.0813, | |
| "step": 1296 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "eval_accuracy": 0.9432624113475178, | |
| "eval_loss": 0.34917283058166504, | |
| "eval_runtime": 0.344, | |
| "eval_samples_per_second": 819.711, | |
| "eval_steps_per_second": 52.322, | |
| "step": 1296 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "grad_norm": 0.5847246646881104, | |
| "learning_rate": 1.5160493827160495e-05, | |
| "loss": 0.1027, | |
| "step": 1377 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "eval_accuracy": 0.9219858156028369, | |
| "eval_loss": 0.3432806432247162, | |
| "eval_runtime": 0.3446, | |
| "eval_samples_per_second": 818.425, | |
| "eval_steps_per_second": 52.24, | |
| "step": 1377 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "grad_norm": 0.6198065280914307, | |
| "learning_rate": 1.4874779541446209e-05, | |
| "loss": 0.0736, | |
| "step": 1458 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "eval_accuracy": 0.9539007092198581, | |
| "eval_loss": 0.27183273434638977, | |
| "eval_runtime": 0.3437, | |
| "eval_samples_per_second": 820.405, | |
| "eval_steps_per_second": 52.366, | |
| "step": 1458 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "grad_norm": 0.5257266163825989, | |
| "learning_rate": 1.4589065255731925e-05, | |
| "loss": 0.0684, | |
| "step": 1539 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "eval_accuracy": 0.9645390070921985, | |
| "eval_loss": 0.25684282183647156, | |
| "eval_runtime": 0.3434, | |
| "eval_samples_per_second": 821.157, | |
| "eval_steps_per_second": 52.414, | |
| "step": 1539 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "grad_norm": 0.0009818405378609896, | |
| "learning_rate": 1.4303350970017638e-05, | |
| "loss": 0.0779, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "eval_accuracy": 0.9609929078014184, | |
| "eval_loss": 0.2152564525604248, | |
| "eval_runtime": 0.3431, | |
| "eval_samples_per_second": 821.93, | |
| "eval_steps_per_second": 52.464, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 21.0, | |
| "grad_norm": 0.4532203674316406, | |
| "learning_rate": 1.4021164021164022e-05, | |
| "loss": 0.0745, | |
| "step": 1701 | |
| }, | |
| { | |
| "epoch": 21.0, | |
| "eval_accuracy": 0.9645390070921985, | |
| "eval_loss": 0.1914406418800354, | |
| "eval_runtime": 0.344, | |
| "eval_samples_per_second": 819.813, | |
| "eval_steps_per_second": 52.329, | |
| "step": 1701 | |
| }, | |
| { | |
| "epoch": 22.0, | |
| "grad_norm": 17.428327560424805, | |
| "learning_rate": 1.3735449735449738e-05, | |
| "loss": 0.1106, | |
| "step": 1782 | |
| }, | |
| { | |
| "epoch": 22.0, | |
| "eval_accuracy": 0.9574468085106383, | |
| "eval_loss": 0.2807099223136902, | |
| "eval_runtime": 0.3441, | |
| "eval_samples_per_second": 819.457, | |
| "eval_steps_per_second": 52.306, | |
| "step": 1782 | |
| }, | |
| { | |
| "epoch": 23.0, | |
| "grad_norm": 0.00047796443686820567, | |
| "learning_rate": 1.344973544973545e-05, | |
| "loss": 0.0755, | |
| "step": 1863 | |
| }, | |
| { | |
| "epoch": 23.0, | |
| "eval_accuracy": 0.9539007092198581, | |
| "eval_loss": 0.331978976726532, | |
| "eval_runtime": 0.3453, | |
| "eval_samples_per_second": 816.672, | |
| "eval_steps_per_second": 52.128, | |
| "step": 1863 | |
| }, | |
| { | |
| "epoch": 24.0, | |
| "grad_norm": 1.006925106048584, | |
| "learning_rate": 1.3164021164021166e-05, | |
| "loss": 0.0833, | |
| "step": 1944 | |
| }, | |
| { | |
| "epoch": 24.0, | |
| "eval_accuracy": 0.9539007092198581, | |
| "eval_loss": 0.34625303745269775, | |
| "eval_runtime": 0.3436, | |
| "eval_samples_per_second": 820.661, | |
| "eval_steps_per_second": 52.383, | |
| "step": 1944 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "grad_norm": 0.506279706954956, | |
| "learning_rate": 1.288183421516755e-05, | |
| "loss": 0.0754, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "eval_accuracy": 0.9432624113475178, | |
| "eval_loss": 0.34365448355674744, | |
| "eval_runtime": 0.3432, | |
| "eval_samples_per_second": 821.691, | |
| "eval_steps_per_second": 52.448, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 26.0, | |
| "grad_norm": 0.1998976171016693, | |
| "learning_rate": 1.2596119929453263e-05, | |
| "loss": 0.0772, | |
| "step": 2106 | |
| }, | |
| { | |
| "epoch": 26.0, | |
| "eval_accuracy": 0.950354609929078, | |
| "eval_loss": 0.3350883424282074, | |
| "eval_runtime": 0.3435, | |
| "eval_samples_per_second": 820.852, | |
| "eval_steps_per_second": 52.395, | |
| "step": 2106 | |
| }, | |
| { | |
| "epoch": 27.0, | |
| "grad_norm": 0.19478876888751984, | |
| "learning_rate": 1.2310405643738979e-05, | |
| "loss": 0.076, | |
| "step": 2187 | |
| }, | |
| { | |
| "epoch": 27.0, | |
| "eval_accuracy": 0.9468085106382979, | |
| "eval_loss": 0.4145265519618988, | |
| "eval_runtime": 0.3445, | |
| "eval_samples_per_second": 818.483, | |
| "eval_steps_per_second": 52.244, | |
| "step": 2187 | |
| }, | |
| { | |
| "epoch": 28.0, | |
| "grad_norm": 0.27469512820243835, | |
| "learning_rate": 1.2024691358024691e-05, | |
| "loss": 0.0625, | |
| "step": 2268 | |
| }, | |
| { | |
| "epoch": 28.0, | |
| "eval_accuracy": 0.950354609929078, | |
| "eval_loss": 0.44451093673706055, | |
| "eval_runtime": 0.3439, | |
| "eval_samples_per_second": 819.913, | |
| "eval_steps_per_second": 52.335, | |
| "step": 2268 | |
| }, | |
| { | |
| "epoch": 29.0, | |
| "grad_norm": 26.14291000366211, | |
| "learning_rate": 1.1738977072310408e-05, | |
| "loss": 0.0741, | |
| "step": 2349 | |
| }, | |
| { | |
| "epoch": 29.0, | |
| "eval_accuracy": 0.9468085106382979, | |
| "eval_loss": 0.29801085591316223, | |
| "eval_runtime": 0.3448, | |
| "eval_samples_per_second": 817.812, | |
| "eval_steps_per_second": 52.201, | |
| "step": 2349 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "grad_norm": 0.0004499799106270075, | |
| "learning_rate": 1.145326278659612e-05, | |
| "loss": 0.0649, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "eval_accuracy": 0.9574468085106383, | |
| "eval_loss": 0.28359255194664, | |
| "eval_runtime": 0.3442, | |
| "eval_samples_per_second": 819.247, | |
| "eval_steps_per_second": 52.292, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 31.0, | |
| "grad_norm": 0.0018564946949481964, | |
| "learning_rate": 1.1167548500881835e-05, | |
| "loss": 0.0688, | |
| "step": 2511 | |
| }, | |
| { | |
| "epoch": 31.0, | |
| "eval_accuracy": 0.9574468085106383, | |
| "eval_loss": 0.21793903410434723, | |
| "eval_runtime": 0.3445, | |
| "eval_samples_per_second": 818.498, | |
| "eval_steps_per_second": 52.245, | |
| "step": 2511 | |
| }, | |
| { | |
| "epoch": 32.0, | |
| "grad_norm": 0.0009469461510889232, | |
| "learning_rate": 1.088183421516755e-05, | |
| "loss": 0.0735, | |
| "step": 2592 | |
| }, | |
| { | |
| "epoch": 32.0, | |
| "eval_accuracy": 0.9539007092198581, | |
| "eval_loss": 0.22946923971176147, | |
| "eval_runtime": 0.3449, | |
| "eval_samples_per_second": 817.666, | |
| "eval_steps_per_second": 52.191, | |
| "step": 2592 | |
| }, | |
| { | |
| "epoch": 33.0, | |
| "grad_norm": 0.4778638184070587, | |
| "learning_rate": 1.0596119929453263e-05, | |
| "loss": 0.0648, | |
| "step": 2673 | |
| }, | |
| { | |
| "epoch": 33.0, | |
| "eval_accuracy": 0.9468085106382979, | |
| "eval_loss": 0.42410480976104736, | |
| "eval_runtime": 0.3433, | |
| "eval_samples_per_second": 821.406, | |
| "eval_steps_per_second": 52.43, | |
| "step": 2673 | |
| }, | |
| { | |
| "epoch": 34.0, | |
| "grad_norm": 0.21737487614154816, | |
| "learning_rate": 1.031040564373898e-05, | |
| "loss": 0.0672, | |
| "step": 2754 | |
| }, | |
| { | |
| "epoch": 34.0, | |
| "eval_accuracy": 0.9539007092198581, | |
| "eval_loss": 0.2829430401325226, | |
| "eval_runtime": 0.3447, | |
| "eval_samples_per_second": 818.124, | |
| "eval_steps_per_second": 52.221, | |
| "step": 2754 | |
| }, | |
| { | |
| "epoch": 35.0, | |
| "grad_norm": 0.08269879966974258, | |
| "learning_rate": 1.0024691358024692e-05, | |
| "loss": 0.067, | |
| "step": 2835 | |
| }, | |
| { | |
| "epoch": 35.0, | |
| "eval_accuracy": 0.9468085106382979, | |
| "eval_loss": 0.3723122179508209, | |
| "eval_runtime": 0.3448, | |
| "eval_samples_per_second": 817.778, | |
| "eval_steps_per_second": 52.199, | |
| "step": 2835 | |
| }, | |
| { | |
| "epoch": 36.0, | |
| "grad_norm": 0.3665499687194824, | |
| "learning_rate": 9.738977072310406e-06, | |
| "loss": 0.0768, | |
| "step": 2916 | |
| }, | |
| { | |
| "epoch": 36.0, | |
| "eval_accuracy": 0.9574468085106383, | |
| "eval_loss": 0.25441667437553406, | |
| "eval_runtime": 0.3447, | |
| "eval_samples_per_second": 818.182, | |
| "eval_steps_per_second": 52.224, | |
| "step": 2916 | |
| }, | |
| { | |
| "epoch": 37.0, | |
| "grad_norm": 0.11919476091861725, | |
| "learning_rate": 9.45326278659612e-06, | |
| "loss": 0.0691, | |
| "step": 2997 | |
| }, | |
| { | |
| "epoch": 37.0, | |
| "eval_accuracy": 0.9609929078014184, | |
| "eval_loss": 0.20481815934181213, | |
| "eval_runtime": 0.3445, | |
| "eval_samples_per_second": 818.558, | |
| "eval_steps_per_second": 52.248, | |
| "step": 2997 | |
| }, | |
| { | |
| "epoch": 38.0, | |
| "grad_norm": 0.0036801116075366735, | |
| "learning_rate": 9.167548500881835e-06, | |
| "loss": 0.0661, | |
| "step": 3078 | |
| }, | |
| { | |
| "epoch": 38.0, | |
| "eval_accuracy": 0.9680851063829787, | |
| "eval_loss": 0.20478524267673492, | |
| "eval_runtime": 0.3445, | |
| "eval_samples_per_second": 818.468, | |
| "eval_steps_per_second": 52.243, | |
| "step": 3078 | |
| }, | |
| { | |
| "epoch": 39.0, | |
| "grad_norm": 0.12663815915584564, | |
| "learning_rate": 8.88183421516755e-06, | |
| "loss": 0.0409, | |
| "step": 3159 | |
| }, | |
| { | |
| "epoch": 39.0, | |
| "eval_accuracy": 0.9645390070921985, | |
| "eval_loss": 0.18502239882946014, | |
| "eval_runtime": 0.3434, | |
| "eval_samples_per_second": 821.144, | |
| "eval_steps_per_second": 52.413, | |
| "step": 3159 | |
| }, | |
| { | |
| "epoch": 40.0, | |
| "grad_norm": 0.06950168311595917, | |
| "learning_rate": 8.596119929453264e-06, | |
| "loss": 0.0424, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 40.0, | |
| "eval_accuracy": 0.9645390070921985, | |
| "eval_loss": 0.20747074484825134, | |
| "eval_runtime": 0.3445, | |
| "eval_samples_per_second": 818.693, | |
| "eval_steps_per_second": 52.257, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 41.0, | |
| "grad_norm": 0.09251494705677032, | |
| "learning_rate": 8.310405643738978e-06, | |
| "loss": 0.0381, | |
| "step": 3321 | |
| }, | |
| { | |
| "epoch": 41.0, | |
| "eval_accuracy": 0.9645390070921985, | |
| "eval_loss": 0.2633875906467438, | |
| "eval_runtime": 0.3468, | |
| "eval_samples_per_second": 813.14, | |
| "eval_steps_per_second": 51.903, | |
| "step": 3321 | |
| }, | |
| { | |
| "epoch": 42.0, | |
| "grad_norm": 0.06917154043912888, | |
| "learning_rate": 8.024691358024692e-06, | |
| "loss": 0.0383, | |
| "step": 3402 | |
| }, | |
| { | |
| "epoch": 42.0, | |
| "eval_accuracy": 0.9574468085106383, | |
| "eval_loss": 0.3520617187023163, | |
| "eval_runtime": 0.3447, | |
| "eval_samples_per_second": 818.036, | |
| "eval_steps_per_second": 52.215, | |
| "step": 3402 | |
| }, | |
| { | |
| "epoch": 43.0, | |
| "grad_norm": 0.0010325413895770907, | |
| "learning_rate": 7.738977072310407e-06, | |
| "loss": 0.0288, | |
| "step": 3483 | |
| }, | |
| { | |
| "epoch": 43.0, | |
| "eval_accuracy": 0.9680851063829787, | |
| "eval_loss": 0.2726523280143738, | |
| "eval_runtime": 0.3428, | |
| "eval_samples_per_second": 822.588, | |
| "eval_steps_per_second": 52.506, | |
| "step": 3483 | |
| }, | |
| { | |
| "epoch": 44.0, | |
| "grad_norm": 0.04726780578494072, | |
| "learning_rate": 7.45326278659612e-06, | |
| "loss": 0.035, | |
| "step": 3564 | |
| }, | |
| { | |
| "epoch": 44.0, | |
| "eval_accuracy": 0.9645390070921985, | |
| "eval_loss": 0.2995310127735138, | |
| "eval_runtime": 0.3442, | |
| "eval_samples_per_second": 819.308, | |
| "eval_steps_per_second": 52.296, | |
| "step": 3564 | |
| }, | |
| { | |
| "epoch": 45.0, | |
| "grad_norm": 0.09283600747585297, | |
| "learning_rate": 7.167548500881835e-06, | |
| "loss": 0.0265, | |
| "step": 3645 | |
| }, | |
| { | |
| "epoch": 45.0, | |
| "eval_accuracy": 0.9609929078014184, | |
| "eval_loss": 0.33694958686828613, | |
| "eval_runtime": 0.3443, | |
| "eval_samples_per_second": 818.994, | |
| "eval_steps_per_second": 52.276, | |
| "step": 3645 | |
| }, | |
| { | |
| "epoch": 46.0, | |
| "grad_norm": 0.03685113787651062, | |
| "learning_rate": 6.881834215167549e-06, | |
| "loss": 0.0217, | |
| "step": 3726 | |
| }, | |
| { | |
| "epoch": 46.0, | |
| "eval_accuracy": 0.9609929078014184, | |
| "eval_loss": 0.35722091794013977, | |
| "eval_runtime": 0.3438, | |
| "eval_samples_per_second": 820.281, | |
| "eval_steps_per_second": 52.358, | |
| "step": 3726 | |
| }, | |
| { | |
| "epoch": 47.0, | |
| "grad_norm": 0.04708189144730568, | |
| "learning_rate": 6.596119929453263e-06, | |
| "loss": 0.0259, | |
| "step": 3807 | |
| }, | |
| { | |
| "epoch": 47.0, | |
| "eval_accuracy": 0.9716312056737588, | |
| "eval_loss": 0.21833930909633636, | |
| "eval_runtime": 0.3427, | |
| "eval_samples_per_second": 822.913, | |
| "eval_steps_per_second": 52.526, | |
| "step": 3807 | |
| }, | |
| { | |
| "epoch": 48.0, | |
| "grad_norm": 0.06329997628927231, | |
| "learning_rate": 6.310405643738977e-06, | |
| "loss": 0.0264, | |
| "step": 3888 | |
| }, | |
| { | |
| "epoch": 48.0, | |
| "eval_accuracy": 0.9609929078014184, | |
| "eval_loss": 0.2745024561882019, | |
| "eval_runtime": 0.3436, | |
| "eval_samples_per_second": 820.777, | |
| "eval_steps_per_second": 52.39, | |
| "step": 3888 | |
| }, | |
| { | |
| "epoch": 49.0, | |
| "grad_norm": 0.13020673394203186, | |
| "learning_rate": 6.024691358024692e-06, | |
| "loss": 0.027, | |
| "step": 3969 | |
| }, | |
| { | |
| "epoch": 49.0, | |
| "eval_accuracy": 0.9539007092198581, | |
| "eval_loss": 0.3425739109516144, | |
| "eval_runtime": 0.3449, | |
| "eval_samples_per_second": 817.548, | |
| "eval_steps_per_second": 52.184, | |
| "step": 3969 | |
| }, | |
| { | |
| "epoch": 50.0, | |
| "grad_norm": 0.04181819409132004, | |
| "learning_rate": 5.7389770723104065e-06, | |
| "loss": 0.023, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 50.0, | |
| "eval_accuracy": 0.950354609929078, | |
| "eval_loss": 0.37068530917167664, | |
| "eval_runtime": 0.3441, | |
| "eval_samples_per_second": 819.471, | |
| "eval_steps_per_second": 52.307, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 51.0, | |
| "grad_norm": 0.03754027560353279, | |
| "learning_rate": 5.453262786596121e-06, | |
| "loss": 0.0241, | |
| "step": 4131 | |
| }, | |
| { | |
| "epoch": 51.0, | |
| "eval_accuracy": 0.9645390070921985, | |
| "eval_loss": 0.3041815459728241, | |
| "eval_runtime": 0.3443, | |
| "eval_samples_per_second": 819.127, | |
| "eval_steps_per_second": 52.285, | |
| "step": 4131 | |
| }, | |
| { | |
| "epoch": 52.0, | |
| "grad_norm": 0.06724414229393005, | |
| "learning_rate": 5.167548500881835e-06, | |
| "loss": 0.0248, | |
| "step": 4212 | |
| }, | |
| { | |
| "epoch": 52.0, | |
| "eval_accuracy": 0.9609929078014184, | |
| "eval_loss": 0.3282240927219391, | |
| "eval_runtime": 0.3433, | |
| "eval_samples_per_second": 821.512, | |
| "eval_steps_per_second": 52.437, | |
| "step": 4212 | |
| }, | |
| { | |
| "epoch": 53.0, | |
| "grad_norm": 0.044111430644989014, | |
| "learning_rate": 4.881834215167549e-06, | |
| "loss": 0.0267, | |
| "step": 4293 | |
| }, | |
| { | |
| "epoch": 53.0, | |
| "eval_accuracy": 0.9680851063829787, | |
| "eval_loss": 0.2480100840330124, | |
| "eval_runtime": 0.3438, | |
| "eval_samples_per_second": 820.176, | |
| "eval_steps_per_second": 52.352, | |
| "step": 4293 | |
| }, | |
| { | |
| "epoch": 54.0, | |
| "grad_norm": 0.09385800361633301, | |
| "learning_rate": 4.596119929453263e-06, | |
| "loss": 0.019, | |
| "step": 4374 | |
| }, | |
| { | |
| "epoch": 54.0, | |
| "eval_accuracy": 0.9680851063829787, | |
| "eval_loss": 0.2954387366771698, | |
| "eval_runtime": 0.3444, | |
| "eval_samples_per_second": 818.748, | |
| "eval_steps_per_second": 52.261, | |
| "step": 4374 | |
| }, | |
| { | |
| "epoch": 55.0, | |
| "grad_norm": 0.00036285247188061476, | |
| "learning_rate": 4.3104056437389775e-06, | |
| "loss": 0.0233, | |
| "step": 4455 | |
| }, | |
| { | |
| "epoch": 55.0, | |
| "eval_accuracy": 0.9645390070921985, | |
| "eval_loss": 0.26300373673439026, | |
| "eval_runtime": 0.3483, | |
| "eval_samples_per_second": 809.563, | |
| "eval_steps_per_second": 51.674, | |
| "step": 4455 | |
| }, | |
| { | |
| "epoch": 56.0, | |
| "grad_norm": 0.03549063578248024, | |
| "learning_rate": 4.024691358024692e-06, | |
| "loss": 0.0231, | |
| "step": 4536 | |
| }, | |
| { | |
| "epoch": 56.0, | |
| "eval_accuracy": 0.9645390070921985, | |
| "eval_loss": 0.26614007353782654, | |
| "eval_runtime": 0.3434, | |
| "eval_samples_per_second": 821.294, | |
| "eval_steps_per_second": 52.423, | |
| "step": 4536 | |
| }, | |
| { | |
| "epoch": 57.0, | |
| "grad_norm": 0.0008688592351973057, | |
| "learning_rate": 3.7389770723104058e-06, | |
| "loss": 0.0188, | |
| "step": 4617 | |
| }, | |
| { | |
| "epoch": 57.0, | |
| "eval_accuracy": 0.9574468085106383, | |
| "eval_loss": 0.3676702678203583, | |
| "eval_runtime": 0.3441, | |
| "eval_samples_per_second": 819.514, | |
| "eval_steps_per_second": 52.309, | |
| "step": 4617 | |
| }, | |
| { | |
| "epoch": 58.0, | |
| "grad_norm": 0.00031407736241817474, | |
| "learning_rate": 3.4532627865961205e-06, | |
| "loss": 0.0263, | |
| "step": 4698 | |
| }, | |
| { | |
| "epoch": 58.0, | |
| "eval_accuracy": 0.9539007092198581, | |
| "eval_loss": 0.36925771832466125, | |
| "eval_runtime": 0.348, | |
| "eval_samples_per_second": 810.368, | |
| "eval_steps_per_second": 51.726, | |
| "step": 4698 | |
| }, | |
| { | |
| "epoch": 59.0, | |
| "grad_norm": 0.040128860622644424, | |
| "learning_rate": 3.1675485008818345e-06, | |
| "loss": 0.019, | |
| "step": 4779 | |
| }, | |
| { | |
| "epoch": 59.0, | |
| "eval_accuracy": 0.9574468085106383, | |
| "eval_loss": 0.35094693303108215, | |
| "eval_runtime": 0.3436, | |
| "eval_samples_per_second": 820.815, | |
| "eval_steps_per_second": 52.392, | |
| "step": 4779 | |
| }, | |
| { | |
| "epoch": 60.0, | |
| "grad_norm": 0.0004439246258698404, | |
| "learning_rate": 2.881834215167549e-06, | |
| "loss": 0.0202, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 60.0, | |
| "eval_accuracy": 0.9609929078014184, | |
| "eval_loss": 0.3040333092212677, | |
| "eval_runtime": 0.3445, | |
| "eval_samples_per_second": 818.559, | |
| "eval_steps_per_second": 52.248, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 61.0, | |
| "grad_norm": 0.07529360055923462, | |
| "learning_rate": 2.5961199294532628e-06, | |
| "loss": 0.0208, | |
| "step": 4941 | |
| }, | |
| { | |
| "epoch": 61.0, | |
| "eval_accuracy": 0.9468085106382979, | |
| "eval_loss": 0.5039365887641907, | |
| "eval_runtime": 0.3439, | |
| "eval_samples_per_second": 819.902, | |
| "eval_steps_per_second": 52.334, | |
| "step": 4941 | |
| }, | |
| { | |
| "epoch": 62.0, | |
| "grad_norm": 0.00026053638430312276, | |
| "learning_rate": 2.310405643738977e-06, | |
| "loss": 0.0242, | |
| "step": 5022 | |
| }, | |
| { | |
| "epoch": 62.0, | |
| "eval_accuracy": 0.950354609929078, | |
| "eval_loss": 0.4803861677646637, | |
| "eval_runtime": 0.3445, | |
| "eval_samples_per_second": 818.64, | |
| "eval_steps_per_second": 52.254, | |
| "step": 5022 | |
| }, | |
| { | |
| "epoch": 63.0, | |
| "grad_norm": 0.06742388755083084, | |
| "learning_rate": 2.0246913580246915e-06, | |
| "loss": 0.023, | |
| "step": 5103 | |
| }, | |
| { | |
| "epoch": 63.0, | |
| "eval_accuracy": 0.9609929078014184, | |
| "eval_loss": 0.3538144826889038, | |
| "eval_runtime": 0.3445, | |
| "eval_samples_per_second": 818.51, | |
| "eval_steps_per_second": 52.245, | |
| "step": 5103 | |
| }, | |
| { | |
| "epoch": 64.0, | |
| "grad_norm": 0.00042550539365038276, | |
| "learning_rate": 1.7389770723104056e-06, | |
| "loss": 0.0189, | |
| "step": 5184 | |
| }, | |
| { | |
| "epoch": 64.0, | |
| "eval_accuracy": 0.9574468085106383, | |
| "eval_loss": 0.37617096304893494, | |
| "eval_runtime": 0.3442, | |
| "eval_samples_per_second": 819.198, | |
| "eval_steps_per_second": 52.289, | |
| "step": 5184 | |
| }, | |
| { | |
| "epoch": 65.0, | |
| "grad_norm": 0.02407378889620304, | |
| "learning_rate": 1.45326278659612e-06, | |
| "loss": 0.0209, | |
| "step": 5265 | |
| }, | |
| { | |
| "epoch": 65.0, | |
| "eval_accuracy": 0.950354609929078, | |
| "eval_loss": 0.43608424067497253, | |
| "eval_runtime": 0.3438, | |
| "eval_samples_per_second": 820.243, | |
| "eval_steps_per_second": 52.356, | |
| "step": 5265 | |
| }, | |
| { | |
| "epoch": 66.0, | |
| "grad_norm": 0.054311446845531464, | |
| "learning_rate": 1.1675485008818344e-06, | |
| "loss": 0.0209, | |
| "step": 5346 | |
| }, | |
| { | |
| "epoch": 66.0, | |
| "eval_accuracy": 0.950354609929078, | |
| "eval_loss": 0.41794532537460327, | |
| "eval_runtime": 0.3436, | |
| "eval_samples_per_second": 820.791, | |
| "eval_steps_per_second": 52.391, | |
| "step": 5346 | |
| }, | |
| { | |
| "epoch": 67.0, | |
| "grad_norm": 0.04109662398695946, | |
| "learning_rate": 8.818342151675485e-07, | |
| "loss": 0.0198, | |
| "step": 5427 | |
| }, | |
| { | |
| "epoch": 67.0, | |
| "eval_accuracy": 0.9539007092198581, | |
| "eval_loss": 0.3815895617008209, | |
| "eval_runtime": 0.3443, | |
| "eval_samples_per_second": 819.013, | |
| "eval_steps_per_second": 52.277, | |
| "step": 5427 | |
| }, | |
| { | |
| "epoch": 68.0, | |
| "grad_norm": 0.13629287481307983, | |
| "learning_rate": 5.961199294532629e-07, | |
| "loss": 0.0197, | |
| "step": 5508 | |
| }, | |
| { | |
| "epoch": 68.0, | |
| "eval_accuracy": 0.950354609929078, | |
| "eval_loss": 0.39786896109580994, | |
| "eval_runtime": 0.3445, | |
| "eval_samples_per_second": 818.46, | |
| "eval_steps_per_second": 52.242, | |
| "step": 5508 | |
| }, | |
| { | |
| "epoch": 69.0, | |
| "grad_norm": 0.039983708411455154, | |
| "learning_rate": 3.104056437389771e-07, | |
| "loss": 0.0192, | |
| "step": 5589 | |
| }, | |
| { | |
| "epoch": 69.0, | |
| "eval_accuracy": 0.950354609929078, | |
| "eval_loss": 0.411296546459198, | |
| "eval_runtime": 0.3435, | |
| "eval_samples_per_second": 820.901, | |
| "eval_steps_per_second": 52.398, | |
| "step": 5589 | |
| }, | |
| { | |
| "epoch": 70.0, | |
| "grad_norm": 0.00027353325276635587, | |
| "learning_rate": 2.469135802469136e-08, | |
| "loss": 0.0177, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 70.0, | |
| "eval_accuracy": 0.9539007092198581, | |
| "eval_loss": 0.40772485733032227, | |
| "eval_runtime": 0.3437, | |
| "eval_samples_per_second": 820.466, | |
| "eval_steps_per_second": 52.37, | |
| "step": 5670 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 5670, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 70, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 9735501528974304.0, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |