{
  "best_metric": 0.910958904109589,
  "best_model_checkpoint": "swinv2-tiny-patch4-window8-256-finetuned-5emotions\\checkpoint-5281",
  "epoch": 24.99881656804734,
  "eval_steps": 500,
  "global_step": 5281,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.047337278106508875,
      "grad_norm": 7.075885772705078,
      "learning_rate": 6.765899864682003e-07,
      "loss": 1.691,
      "step": 10
    },
    {
      "epoch": 0.09467455621301775,
      "grad_norm": 9.890098571777344,
      "learning_rate": 1.3531799729364006e-06,
      "loss": 1.6712,
      "step": 20
    },
    {
      "epoch": 0.14201183431952663,
      "grad_norm": 7.415971755981445,
      "learning_rate": 2.029769959404601e-06,
      "loss": 1.6787,
      "step": 30
    },
    {
      "epoch": 0.1893491124260355,
      "grad_norm": 6.3063764572143555,
      "learning_rate": 2.7063599458728013e-06,
      "loss": 1.6685,
      "step": 40
    },
    {
      "epoch": 0.23668639053254437,
      "grad_norm": 8.763900756835938,
      "learning_rate": 3.3829499323410016e-06,
      "loss": 1.6143,
      "step": 50
    },
    {
      "epoch": 0.28402366863905326,
      "grad_norm": 6.661700248718262,
      "learning_rate": 4.059539918809202e-06,
      "loss": 1.5849,
      "step": 60
    },
    {
      "epoch": 0.33136094674556216,
      "grad_norm": 7.178672790527344,
      "learning_rate": 4.736129905277402e-06,
      "loss": 1.5502,
      "step": 70
    },
    {
      "epoch": 0.378698224852071,
      "grad_norm": 5.857969284057617,
      "learning_rate": 5.4127198917456026e-06,
      "loss": 1.5274,
      "step": 80
    },
    {
      "epoch": 0.4260355029585799,
      "grad_norm": 6.652136325836182,
      "learning_rate": 6.089309878213803e-06,
      "loss": 1.4915,
      "step": 90
    },
    {
      "epoch": 0.47337278106508873,
      "grad_norm": 6.222568035125732,
      "learning_rate": 6.765899864682003e-06,
      "loss": 1.4063,
      "step": 100
    },
    {
      "epoch": 0.5207100591715976,
      "grad_norm": 6.365822792053223,
      "learning_rate": 7.442489851150203e-06,
      "loss": 1.3811,
      "step": 110
    },
    {
      "epoch": 0.5680473372781065,
      "grad_norm": 7.9343414306640625,
      "learning_rate": 8.119079837618404e-06,
      "loss": 1.3026,
      "step": 120
    },
    {
      "epoch": 0.6153846153846154,
      "grad_norm": 9.204723358154297,
      "learning_rate": 8.795669824086604e-06,
      "loss": 1.2516,
      "step": 130
    },
    {
      "epoch": 0.6627218934911243,
      "grad_norm": 7.836040496826172,
      "learning_rate": 9.472259810554804e-06,
      "loss": 1.1664,
      "step": 140
    },
    {
      "epoch": 0.7100591715976331,
      "grad_norm": 10.82960319519043,
      "learning_rate": 1.0148849797023005e-05,
      "loss": 1.182,
      "step": 150
    },
    {
      "epoch": 0.757396449704142,
      "grad_norm": 13.981669425964355,
      "learning_rate": 1.0825439783491205e-05,
      "loss": 1.0992,
      "step": 160
    },
    {
      "epoch": 0.8047337278106509,
      "grad_norm": 17.63448715209961,
      "learning_rate": 1.1502029769959405e-05,
      "loss": 1.036,
      "step": 170
    },
    {
      "epoch": 0.8520710059171598,
      "grad_norm": 13.67409610748291,
      "learning_rate": 1.2178619756427606e-05,
      "loss": 1.0372,
      "step": 180
    },
    {
      "epoch": 0.8994082840236687,
      "grad_norm": 10.486079216003418,
      "learning_rate": 1.2855209742895804e-05,
      "loss": 0.9888,
      "step": 190
    },
    {
      "epoch": 0.9467455621301775,
      "grad_norm": 10.388420104980469,
      "learning_rate": 1.3531799729364006e-05,
      "loss": 0.98,
      "step": 200
    },
    {
      "epoch": 0.9940828402366864,
      "grad_norm": 11.530645370483398,
      "learning_rate": 1.4208389715832207e-05,
      "loss": 0.9665,
      "step": 210
    },
    {
      "epoch": 0.9988165680473373,
      "eval_accuracy": 0.6835616438356165,
      "eval_loss": 0.8002648949623108,
      "eval_runtime": 6.3464,
      "eval_samples_per_second": 230.051,
      "eval_steps_per_second": 28.835,
      "step": 211
    },
    {
      "epoch": 1.0414201183431953,
      "grad_norm": 14.492610931396484,
      "learning_rate": 1.4884979702300405e-05,
      "loss": 0.9431,
      "step": 220
    },
    {
      "epoch": 1.0887573964497042,
      "grad_norm": 11.852544784545898,
      "learning_rate": 1.5561569688768607e-05,
      "loss": 0.8959,
      "step": 230
    },
    {
      "epoch": 1.136094674556213,
      "grad_norm": 11.708285331726074,
      "learning_rate": 1.6238159675236808e-05,
      "loss": 0.9688,
      "step": 240
    },
    {
      "epoch": 1.183431952662722,
      "grad_norm": 14.45132827758789,
      "learning_rate": 1.6914749661705008e-05,
      "loss": 0.8888,
      "step": 250
    },
    {
      "epoch": 1.2307692307692308,
      "grad_norm": 14.281059265136719,
      "learning_rate": 1.759133964817321e-05,
      "loss": 0.846,
      "step": 260
    },
    {
      "epoch": 1.2781065088757395,
      "grad_norm": 14.860888481140137,
      "learning_rate": 1.826792963464141e-05,
      "loss": 0.8906,
      "step": 270
    },
    {
      "epoch": 1.3254437869822486,
      "grad_norm": 10.605212211608887,
      "learning_rate": 1.894451962110961e-05,
      "loss": 0.8169,
      "step": 280
    },
    {
      "epoch": 1.3727810650887573,
      "grad_norm": 12.668191909790039,
      "learning_rate": 1.962110960757781e-05,
      "loss": 0.836,
      "step": 290
    },
    {
      "epoch": 1.4201183431952662,
      "grad_norm": 10.248248100280762,
      "learning_rate": 2.029769959404601e-05,
      "loss": 0.7644,
      "step": 300
    },
    {
      "epoch": 1.467455621301775,
      "grad_norm": 9.778542518615723,
      "learning_rate": 2.097428958051421e-05,
      "loss": 0.7755,
      "step": 310
    },
    {
      "epoch": 1.514792899408284,
      "grad_norm": 9.64427661895752,
      "learning_rate": 2.165087956698241e-05,
      "loss": 0.7387,
      "step": 320
    },
    {
      "epoch": 1.5621301775147929,
      "grad_norm": 10.04445743560791,
      "learning_rate": 2.232746955345061e-05,
      "loss": 0.7605,
      "step": 330
    },
    {
      "epoch": 1.6094674556213018,
      "grad_norm": 13.125927925109863,
      "learning_rate": 2.300405953991881e-05,
      "loss": 0.6781,
      "step": 340
    },
    {
      "epoch": 1.6568047337278107,
      "grad_norm": 13.797953605651855,
      "learning_rate": 2.368064952638701e-05,
      "loss": 0.6551,
      "step": 350
    },
    {
      "epoch": 1.7041420118343196,
      "grad_norm": 14.754645347595215,
      "learning_rate": 2.435723951285521e-05,
      "loss": 0.7542,
      "step": 360
    },
    {
      "epoch": 1.7514792899408285,
      "grad_norm": 13.914559364318848,
      "learning_rate": 2.5033829499323412e-05,
      "loss": 0.8104,
      "step": 370
    },
    {
      "epoch": 1.7988165680473371,
      "grad_norm": 11.46696662902832,
      "learning_rate": 2.571041948579161e-05,
      "loss": 0.6945,
      "step": 380
    },
    {
      "epoch": 1.8461538461538463,
      "grad_norm": 10.812294960021973,
      "learning_rate": 2.638700947225981e-05,
      "loss": 0.6711,
      "step": 390
    },
    {
      "epoch": 1.893491124260355,
      "grad_norm": 15.02450180053711,
      "learning_rate": 2.7063599458728013e-05,
      "loss": 0.7345,
      "step": 400
    },
    {
      "epoch": 1.940828402366864,
      "grad_norm": 11.53946590423584,
      "learning_rate": 2.7740189445196213e-05,
      "loss": 0.723,
      "step": 410
    },
    {
      "epoch": 1.9881656804733727,
      "grad_norm": 8.337069511413574,
      "learning_rate": 2.8416779431664413e-05,
      "loss": 0.6443,
      "step": 420
    },
    {
      "epoch": 1.9976331360946746,
      "eval_accuracy": 0.8246575342465754,
      "eval_loss": 0.4562951624393463,
      "eval_runtime": 6.3186,
      "eval_samples_per_second": 231.065,
      "eval_steps_per_second": 28.962,
      "step": 422
    },
    {
      "epoch": 2.035502958579882,
      "grad_norm": 9.038360595703125,
      "learning_rate": 2.9093369418132617e-05,
      "loss": 0.6256,
      "step": 430
    },
    {
      "epoch": 2.0828402366863905,
      "grad_norm": 12.379063606262207,
      "learning_rate": 2.976995940460081e-05,
      "loss": 0.5998,
      "step": 440
    },
    {
      "epoch": 2.1301775147928996,
      "grad_norm": 12.626445770263672,
      "learning_rate": 3.044654939106901e-05,
      "loss": 0.6456,
      "step": 450
    },
    {
      "epoch": 2.1775147928994083,
      "grad_norm": 10.665410995483398,
      "learning_rate": 3.1123139377537215e-05,
      "loss": 0.6145,
      "step": 460
    },
    {
      "epoch": 2.224852071005917,
      "grad_norm": 11.917645454406738,
      "learning_rate": 3.1799729364005415e-05,
      "loss": 0.6494,
      "step": 470
    },
    {
      "epoch": 2.272189349112426,
      "grad_norm": 14.427268981933594,
      "learning_rate": 3.2476319350473615e-05,
      "loss": 0.5967,
      "step": 480
    },
    {
      "epoch": 2.3195266272189348,
      "grad_norm": 14.22167682647705,
      "learning_rate": 3.3152909336941816e-05,
      "loss": 0.6356,
      "step": 490
    },
    {
      "epoch": 2.366863905325444,
      "grad_norm": 15.034667015075684,
      "learning_rate": 3.3829499323410016e-05,
      "loss": 0.6583,
      "step": 500
    },
    {
      "epoch": 2.4142011834319526,
      "grad_norm": 11.716626167297363,
      "learning_rate": 3.4506089309878216e-05,
      "loss": 0.5703,
      "step": 510
    },
    {
      "epoch": 2.4615384615384617,
      "grad_norm": 8.812618255615234,
      "learning_rate": 3.518267929634642e-05,
      "loss": 0.5495,
      "step": 520
    },
    {
      "epoch": 2.5088757396449703,
      "grad_norm": 19.886188507080078,
      "learning_rate": 3.585926928281462e-05,
      "loss": 0.7156,
      "step": 530
    },
    {
      "epoch": 2.556213017751479,
      "grad_norm": 10.014534950256348,
      "learning_rate": 3.653585926928282e-05,
      "loss": 0.7279,
      "step": 540
    },
    {
      "epoch": 2.603550295857988,
      "grad_norm": 8.2186861038208,
      "learning_rate": 3.721244925575101e-05,
      "loss": 0.6629,
      "step": 550
    },
    {
      "epoch": 2.6508875739644973,
      "grad_norm": 11.415748596191406,
      "learning_rate": 3.788903924221922e-05,
      "loss": 0.6031,
      "step": 560
    },
    {
      "epoch": 2.698224852071006,
      "grad_norm": 6.490344047546387,
      "learning_rate": 3.856562922868742e-05,
      "loss": 0.5158,
      "step": 570
    },
    {
      "epoch": 2.7455621301775146,
      "grad_norm": 10.63316822052002,
      "learning_rate": 3.924221921515562e-05,
      "loss": 0.6533,
      "step": 580
    },
    {
      "epoch": 2.7928994082840237,
      "grad_norm": 9.291253089904785,
      "learning_rate": 3.991880920162382e-05,
      "loss": 0.5501,
      "step": 590
    },
    {
      "epoch": 2.8402366863905324,
      "grad_norm": 10.60273551940918,
      "learning_rate": 4.059539918809202e-05,
      "loss": 0.5719,
      "step": 600
    },
    {
      "epoch": 2.8875739644970415,
      "grad_norm": 10.603645324707031,
      "learning_rate": 4.127198917456021e-05,
      "loss": 0.4905,
      "step": 610
    },
    {
      "epoch": 2.93491124260355,
      "grad_norm": 17.47416877746582,
      "learning_rate": 4.194857916102842e-05,
      "loss": 0.7037,
      "step": 620
    },
    {
      "epoch": 2.9822485207100593,
      "grad_norm": 9.434072494506836,
      "learning_rate": 4.262516914749662e-05,
      "loss": 0.5815,
      "step": 630
    },
    {
      "epoch": 2.996449704142012,
      "eval_accuracy": 0.8568493150684932,
      "eval_loss": 0.3556749224662781,
      "eval_runtime": 6.2074,
      "eval_samples_per_second": 235.204,
      "eval_steps_per_second": 29.481,
      "step": 633
    },
    {
      "epoch": 3.029585798816568,
      "grad_norm": 13.81190299987793,
      "learning_rate": 4.330175913396482e-05,
      "loss": 0.5877,
      "step": 640
    },
    {
      "epoch": 3.076923076923077,
      "grad_norm": 8.872483253479004,
      "learning_rate": 4.397834912043302e-05,
      "loss": 0.55,
      "step": 650
    },
    {
      "epoch": 3.1242603550295858,
      "grad_norm": 11.748785972595215,
      "learning_rate": 4.465493910690122e-05,
      "loss": 0.6155,
      "step": 660
    },
    {
      "epoch": 3.171597633136095,
      "grad_norm": 13.621400833129883,
      "learning_rate": 4.5331529093369415e-05,
      "loss": 0.5907,
      "step": 670
    },
    {
      "epoch": 3.2189349112426036,
      "grad_norm": 10.422270774841309,
      "learning_rate": 4.600811907983762e-05,
      "loss": 0.6022,
      "step": 680
    },
    {
      "epoch": 3.2662721893491122,
      "grad_norm": 12.192015647888184,
      "learning_rate": 4.668470906630582e-05,
      "loss": 0.558,
      "step": 690
    },
    {
      "epoch": 3.3136094674556213,
      "grad_norm": 5.769958972930908,
      "learning_rate": 4.736129905277402e-05,
      "loss": 0.5257,
      "step": 700
    },
    {
      "epoch": 3.36094674556213,
      "grad_norm": 11.664800643920898,
      "learning_rate": 4.803788903924222e-05,
      "loss": 0.6242,
      "step": 710
    },
    {
      "epoch": 3.408284023668639,
      "grad_norm": 10.007041931152344,
      "learning_rate": 4.871447902571042e-05,
      "loss": 0.5789,
      "step": 720
    },
    {
      "epoch": 3.455621301775148,
      "grad_norm": 18.98644256591797,
      "learning_rate": 4.9391069012178623e-05,
      "loss": 0.4632,
      "step": 730
    },
    {
      "epoch": 3.502958579881657,
      "grad_norm": 9.949424743652344,
      "learning_rate": 4.999247667770087e-05,
      "loss": 0.5657,
      "step": 740
    },
    {
      "epoch": 3.5502958579881656,
      "grad_norm": 7.471621513366699,
      "learning_rate": 4.99172434547096e-05,
      "loss": 0.4076,
      "step": 750
    },
    {
      "epoch": 3.5976331360946747,
      "grad_norm": 9.102510452270508,
      "learning_rate": 4.9842010231718327e-05,
      "loss": 0.532,
      "step": 760
    },
    {
      "epoch": 3.6449704142011834,
      "grad_norm": 9.587445259094238,
      "learning_rate": 4.976677700872706e-05,
      "loss": 0.5685,
      "step": 770
    },
    {
      "epoch": 3.6923076923076925,
      "grad_norm": 10.277064323425293,
      "learning_rate": 4.969154378573578e-05,
      "loss": 0.5004,
      "step": 780
    },
    {
      "epoch": 3.739644970414201,
      "grad_norm": 15.665764808654785,
      "learning_rate": 4.9616310562744514e-05,
      "loss": 0.5571,
      "step": 790
    },
    {
      "epoch": 3.78698224852071,
      "grad_norm": 9.643716812133789,
      "learning_rate": 4.954107733975324e-05,
      "loss": 0.5235,
      "step": 800
    },
    {
      "epoch": 3.834319526627219,
      "grad_norm": 12.600419044494629,
      "learning_rate": 4.9465844116761964e-05,
      "loss": 0.5579,
      "step": 810
    },
    {
      "epoch": 3.8816568047337277,
      "grad_norm": 9.6210298538208,
      "learning_rate": 4.939061089377069e-05,
      "loss": 0.4711,
      "step": 820
    },
    {
      "epoch": 3.9289940828402368,
      "grad_norm": 10.485040664672852,
      "learning_rate": 4.9315377670779414e-05,
      "loss": 0.4848,
      "step": 830
    },
    {
      "epoch": 3.9763313609467454,
      "grad_norm": 7.453371524810791,
      "learning_rate": 4.9240144447788145e-05,
      "loss": 0.474,
      "step": 840
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.8726027397260274,
      "eval_loss": 0.35826006531715393,
      "eval_runtime": 6.1388,
      "eval_samples_per_second": 237.833,
      "eval_steps_per_second": 29.811,
      "step": 845
    },
    {
      "epoch": 4.023668639053255,
      "grad_norm": 8.34096908569336,
      "learning_rate": 4.916491122479687e-05,
      "loss": 0.5093,
      "step": 850
    },
    {
      "epoch": 4.071005917159764,
      "grad_norm": 7.713958263397217,
      "learning_rate": 4.90896780018056e-05,
      "loss": 0.4608,
      "step": 860
    },
    {
      "epoch": 4.118343195266272,
      "grad_norm": 9.734159469604492,
      "learning_rate": 4.9014444778814326e-05,
      "loss": 0.4247,
      "step": 870
    },
    {
      "epoch": 4.165680473372781,
      "grad_norm": 7.637202739715576,
      "learning_rate": 4.893921155582306e-05,
      "loss": 0.554,
      "step": 880
    },
    {
      "epoch": 4.21301775147929,
      "grad_norm": 12.172405242919922,
      "learning_rate": 4.886397833283178e-05,
      "loss": 0.4859,
      "step": 890
    },
    {
      "epoch": 4.260355029585799,
      "grad_norm": 9.40637493133545,
      "learning_rate": 4.878874510984051e-05,
      "loss": 0.5068,
      "step": 900
    },
    {
      "epoch": 4.3076923076923075,
      "grad_norm": 5.2307209968566895,
      "learning_rate": 4.871351188684923e-05,
      "loss": 0.4318,
      "step": 910
    },
    {
      "epoch": 4.355029585798817,
      "grad_norm": 13.809428215026855,
      "learning_rate": 4.8638278663857964e-05,
      "loss": 0.5231,
      "step": 920
    },
    {
      "epoch": 4.402366863905326,
      "grad_norm": 9.841399192810059,
      "learning_rate": 4.856304544086669e-05,
      "loss": 0.4441,
      "step": 930
    },
    {
      "epoch": 4.449704142011834,
      "grad_norm": 7.034471035003662,
      "learning_rate": 4.8487812217875414e-05,
      "loss": 0.5421,
      "step": 940
    },
    {
      "epoch": 4.497041420118343,
      "grad_norm": 6.35905122756958,
      "learning_rate": 4.8412578994884145e-05,
      "loss": 0.5084,
      "step": 950
    },
    {
      "epoch": 4.544378698224852,
      "grad_norm": 8.407711029052734,
      "learning_rate": 4.833734577189287e-05,
      "loss": 0.4067,
      "step": 960
    },
    {
      "epoch": 4.591715976331361,
      "grad_norm": 7.5561113357543945,
      "learning_rate": 4.8262112548901595e-05,
      "loss": 0.4881,
      "step": 970
    },
    {
      "epoch": 4.6390532544378695,
      "grad_norm": 7.843471050262451,
      "learning_rate": 4.818687932591032e-05,
      "loss": 0.4175,
      "step": 980
    },
    {
      "epoch": 4.686390532544379,
      "grad_norm": 11.301685333251953,
      "learning_rate": 4.811164610291905e-05,
      "loss": 0.4423,
      "step": 990
    },
    {
      "epoch": 4.733727810650888,
      "grad_norm": 7.472105503082275,
      "learning_rate": 4.8036412879927776e-05,
      "loss": 0.4525,
      "step": 1000
    },
    {
      "epoch": 4.781065088757396,
      "grad_norm": 9.092314720153809,
      "learning_rate": 4.796117965693651e-05,
      "loss": 0.5699,
      "step": 1010
    },
    {
      "epoch": 4.828402366863905,
      "grad_norm": 12.238302230834961,
      "learning_rate": 4.788594643394523e-05,
      "loss": 0.4524,
      "step": 1020
    },
    {
      "epoch": 4.875739644970414,
      "grad_norm": 5.100959777832031,
      "learning_rate": 4.7810713210953964e-05,
      "loss": 0.3866,
      "step": 1030
    },
    {
      "epoch": 4.923076923076923,
      "grad_norm": 9.616569519042969,
      "learning_rate": 4.773547998796269e-05,
      "loss": 0.3577,
      "step": 1040
    },
    {
      "epoch": 4.970414201183432,
      "grad_norm": 9.995213508605957,
      "learning_rate": 4.7660246764971413e-05,
      "loss": 0.5819,
      "step": 1050
    },
    {
      "epoch": 4.998816568047337,
      "eval_accuracy": 0.8671232876712329,
      "eval_loss": 0.34042322635650635,
      "eval_runtime": 6.4475,
      "eval_samples_per_second": 226.444,
      "eval_steps_per_second": 28.383,
      "step": 1056
    },
    {
      "epoch": 5.017751479289941,
      "grad_norm": 6.531469345092773,
      "learning_rate": 4.758501354198014e-05,
      "loss": 0.4182,
      "step": 1060
    },
    {
      "epoch": 5.06508875739645,
      "grad_norm": 11.092623710632324,
      "learning_rate": 4.750978031898887e-05,
      "loss": 0.4458,
      "step": 1070
    },
    {
      "epoch": 5.112426035502959,
      "grad_norm": 12.276275634765625,
      "learning_rate": 4.7434547095997595e-05,
      "loss": 0.5101,
      "step": 1080
    },
    {
      "epoch": 5.159763313609467,
      "grad_norm": 10.82636833190918,
      "learning_rate": 4.735931387300632e-05,
      "loss": 0.4708,
      "step": 1090
    },
    {
      "epoch": 5.207100591715976,
      "grad_norm": 9.973958015441895,
      "learning_rate": 4.728408065001505e-05,
      "loss": 0.5191,
      "step": 1100
    },
    {
      "epoch": 5.254437869822485,
      "grad_norm": 9.460865020751953,
      "learning_rate": 4.7208847427023776e-05,
      "loss": 0.4285,
      "step": 1110
    },
    {
      "epoch": 5.3017751479289945,
      "grad_norm": 15.347735404968262,
      "learning_rate": 4.713361420403251e-05,
      "loss": 0.4579,
      "step": 1120
    },
    {
      "epoch": 5.349112426035503,
      "grad_norm": 14.214599609375,
      "learning_rate": 4.7058380981041225e-05,
      "loss": 0.4787,
      "step": 1130
    },
    {
      "epoch": 5.396449704142012,
      "grad_norm": 9.042417526245117,
      "learning_rate": 4.698314775804996e-05,
      "loss": 0.4146,
      "step": 1140
    },
    {
      "epoch": 5.443786982248521,
      "grad_norm": 8.627814292907715,
      "learning_rate": 4.690791453505868e-05,
      "loss": 0.394,
      "step": 1150
    },
    {
      "epoch": 5.491124260355029,
      "grad_norm": 8.060114860534668,
      "learning_rate": 4.683268131206741e-05,
      "loss": 0.412,
      "step": 1160
    },
    {
      "epoch": 5.538461538461538,
      "grad_norm": 8.569971084594727,
      "learning_rate": 4.675744808907614e-05,
      "loss": 0.443,
      "step": 1170
    },
    {
      "epoch": 5.585798816568047,
      "grad_norm": 31.7719669342041,
      "learning_rate": 4.668221486608487e-05,
      "loss": 0.4424,
      "step": 1180
    },
    {
      "epoch": 5.633136094674557,
      "grad_norm": 10.994864463806152,
      "learning_rate": 4.6606981643093595e-05,
      "loss": 0.4072,
      "step": 1190
    },
    {
      "epoch": 5.680473372781065,
      "grad_norm": 12.489917755126953,
      "learning_rate": 4.653174842010232e-05,
      "loss": 0.4193,
      "step": 1200
    },
    {
      "epoch": 5.727810650887574,
      "grad_norm": 6.0672760009765625,
      "learning_rate": 4.6456515197111044e-05,
      "loss": 0.463,
      "step": 1210
    },
    {
      "epoch": 5.775147928994083,
      "grad_norm": 9.66230297088623,
      "learning_rate": 4.6381281974119776e-05,
      "loss": 0.3863,
      "step": 1220
    },
    {
      "epoch": 5.822485207100591,
      "grad_norm": 12.802431106567383,
      "learning_rate": 4.63060487511285e-05,
      "loss": 0.4471,
      "step": 1230
    },
    {
      "epoch": 5.8698224852071,
      "grad_norm": 10.842957496643066,
      "learning_rate": 4.6230815528137225e-05,
      "loss": 0.5186,
      "step": 1240
    },
    {
      "epoch": 5.9171597633136095,
      "grad_norm": 8.612702369689941,
      "learning_rate": 4.615558230514596e-05,
      "loss": 0.4908,
      "step": 1250
    },
    {
      "epoch": 5.964497041420119,
      "grad_norm": 8.768792152404785,
      "learning_rate": 4.608034908215468e-05,
      "loss": 0.4557,
      "step": 1260
    },
    {
      "epoch": 5.997633136094675,
      "eval_accuracy": 0.8993150684931507,
      "eval_loss": 0.2699526846408844,
      "eval_runtime": 6.396,
      "eval_samples_per_second": 228.266,
      "eval_steps_per_second": 28.611,
      "step": 1267
    },
    {
      "epoch": 6.011834319526627,
      "grad_norm": 6.778576374053955,
      "learning_rate": 4.600511585916341e-05,
      "loss": 0.4647,
      "step": 1270
    },
    {
      "epoch": 6.059171597633136,
      "grad_norm": 5.115172863006592,
      "learning_rate": 4.592988263617213e-05,
      "loss": 0.4053,
      "step": 1280
    },
    {
      "epoch": 6.106508875739645,
      "grad_norm": 7.163010120391846,
      "learning_rate": 4.585464941318086e-05,
      "loss": 0.4136,
      "step": 1290
    },
    {
      "epoch": 6.153846153846154,
      "grad_norm": 5.242615699768066,
      "learning_rate": 4.577941619018959e-05,
      "loss": 0.4233,
      "step": 1300
    },
    {
      "epoch": 6.201183431952662,
      "grad_norm": 7.148778915405273,
      "learning_rate": 4.570418296719832e-05,
      "loss": 0.3791,
      "step": 1310
    },
    {
      "epoch": 6.2485207100591715,
      "grad_norm": 6.911210060119629,
      "learning_rate": 4.5628949744207044e-05,
      "loss": 0.3933,
      "step": 1320
    },
    {
      "epoch": 6.295857988165681,
      "grad_norm": 7.753135681152344,
      "learning_rate": 4.5553716521215776e-05,
      "loss": 0.428,
      "step": 1330
    },
    {
      "epoch": 6.34319526627219,
      "grad_norm": 5.933778762817383,
      "learning_rate": 4.54784832982245e-05,
      "loss": 0.4668,
      "step": 1340
    },
    {
      "epoch": 6.390532544378698,
      "grad_norm": 7.8352556228637695,
      "learning_rate": 4.5403250075233225e-05,
      "loss": 0.3272,
      "step": 1350
    },
    {
      "epoch": 6.437869822485207,
      "grad_norm": 11.419840812683105,
      "learning_rate": 4.532801685224195e-05,
      "loss": 0.3954,
      "step": 1360
    },
    {
      "epoch": 6.485207100591716,
      "grad_norm": 9.681208610534668,
      "learning_rate": 4.5252783629250675e-05,
      "loss": 0.5153,
      "step": 1370
    },
    {
      "epoch": 6.5325443786982245,
      "grad_norm": 6.971587657928467,
      "learning_rate": 4.5177550406259406e-05,
      "loss": 0.4247,
      "step": 1380
    },
    {
      "epoch": 6.579881656804734,
      "grad_norm": 6.286644458770752,
      "learning_rate": 4.510231718326813e-05,
      "loss": 0.4618,
      "step": 1390
    },
    {
      "epoch": 6.627218934911243,
      "grad_norm": 11.171966552734375,
      "learning_rate": 4.502708396027686e-05,
      "loss": 0.4352,
      "step": 1400
    },
    {
      "epoch": 6.674556213017752,
      "grad_norm": 10.539188385009766,
      "learning_rate": 4.495185073728559e-05,
      "loss": 0.3841,
      "step": 1410
    },
    {
      "epoch": 6.72189349112426,
      "grad_norm": 5.127812385559082,
      "learning_rate": 4.487661751429432e-05,
      "loss": 0.3388,
      "step": 1420
    },
    {
      "epoch": 6.769230769230769,
      "grad_norm": 10.178089141845703,
      "learning_rate": 4.480138429130304e-05,
      "loss": 0.4024,
      "step": 1430
    },
    {
      "epoch": 6.816568047337278,
      "grad_norm": 5.93577766418457,
      "learning_rate": 4.472615106831177e-05,
      "loss": 0.4173,
      "step": 1440
    },
    {
      "epoch": 6.8639053254437865,
      "grad_norm": 5.2099609375,
      "learning_rate": 4.4650917845320493e-05,
      "loss": 0.3462,
      "step": 1450
    },
    {
      "epoch": 6.911242603550296,
      "grad_norm": 7.551539897918701,
      "learning_rate": 4.4575684622329225e-05,
      "loss": 0.4034,
      "step": 1460
    },
    {
      "epoch": 6.958579881656805,
      "grad_norm": 10.478506088256836,
      "learning_rate": 4.450045139933795e-05,
      "loss": 0.4021,
      "step": 1470
    },
    {
      "epoch": 6.9964497041420115,
      "eval_accuracy": 0.8917808219178082,
      "eval_loss": 0.3158508837223053,
      "eval_runtime": 6.1877,
      "eval_samples_per_second": 235.95,
      "eval_steps_per_second": 29.575,
      "step": 1478
    },
    {
      "epoch": 7.005917159763314,
      "grad_norm": 10.892561912536621,
      "learning_rate": 4.4425218176346675e-05,
      "loss": 0.3283,
      "step": 1480
    },
    {
      "epoch": 7.053254437869822,
      "grad_norm": 8.013442993164062,
      "learning_rate": 4.4349984953355406e-05,
      "loss": 0.4517,
      "step": 1490
    },
    {
      "epoch": 7.100591715976331,
      "grad_norm": 6.160177230834961,
      "learning_rate": 4.427475173036413e-05,
      "loss": 0.4099,
      "step": 1500
    },
    {
      "epoch": 7.14792899408284,
      "grad_norm": 8.48135757446289,
      "learning_rate": 4.4199518507372856e-05,
      "loss": 0.4019,
      "step": 1510
    },
    {
      "epoch": 7.195266272189349,
      "grad_norm": 10.302865982055664,
      "learning_rate": 4.412428528438158e-05,
      "loss": 0.3329,
      "step": 1520
    },
    {
      "epoch": 7.242603550295858,
      "grad_norm": 10.503307342529297,
      "learning_rate": 4.404905206139031e-05,
      "loss": 0.394,
      "step": 1530
    },
    {
      "epoch": 7.289940828402367,
      "grad_norm": 7.577216148376465,
      "learning_rate": 4.397381883839904e-05,
      "loss": 0.4075,
      "step": 1540
    },
    {
      "epoch": 7.337278106508876,
      "grad_norm": 12.196857452392578,
      "learning_rate": 4.389858561540777e-05,
      "loss": 0.3919,
      "step": 1550
    },
    {
      "epoch": 7.384615384615385,
      "grad_norm": 6.480340003967285,
      "learning_rate": 4.382335239241649e-05,
      "loss": 0.3562,
      "step": 1560
    },
    {
      "epoch": 7.431952662721893,
      "grad_norm": 4.814269542694092,
      "learning_rate": 4.3748119169425225e-05,
      "loss": 0.3232,
      "step": 1570
    },
    {
      "epoch": 7.479289940828402,
      "grad_norm": 8.813551902770996,
      "learning_rate": 4.367288594643394e-05,
      "loss": 0.3947,
      "step": 1580
    },
    {
      "epoch": 7.5266272189349115,
      "grad_norm": 10.225379943847656,
      "learning_rate": 4.3597652723442675e-05,
      "loss": 0.4059,
      "step": 1590
    },
    {
      "epoch": 7.57396449704142,
      "grad_norm": 9.415613174438477,
      "learning_rate": 4.35224195004514e-05,
      "loss": 0.3371,
      "step": 1600
    },
    {
      "epoch": 7.621301775147929,
      "grad_norm": 6.129647731781006,
      "learning_rate": 4.344718627746013e-05,
      "loss": 0.3652,
      "step": 1610
    },
    {
      "epoch": 7.668639053254438,
      "grad_norm": 9.19030475616455,
      "learning_rate": 4.3371953054468856e-05,
      "loss": 0.3562,
      "step": 1620
    },
    {
      "epoch": 7.715976331360947,
      "grad_norm": 12.973560333251953,
      "learning_rate": 4.329671983147758e-05,
      "loss": 0.3804,
      "step": 1630
    },
    {
      "epoch": 7.763313609467455,
      "grad_norm": 7.263617515563965,
      "learning_rate": 4.322148660848631e-05,
      "loss": 0.3808,
      "step": 1640
    },
    {
      "epoch": 7.810650887573964,
      "grad_norm": 6.532052516937256,
      "learning_rate": 4.314625338549504e-05,
      "loss": 0.468,
      "step": 1650
    },
    {
      "epoch": 7.8579881656804735,
      "grad_norm": 8.766283988952637,
      "learning_rate": 4.307102016250376e-05,
      "loss": 0.4145,
      "step": 1660
    },
    {
      "epoch": 7.905325443786982,
      "grad_norm": 5.956889629364014,
      "learning_rate": 4.2995786939512487e-05,
      "loss": 0.4047,
      "step": 1670
    },
    {
      "epoch": 7.952662721893491,
      "grad_norm": 6.531178951263428,
      "learning_rate": 4.292055371652122e-05,
      "loss": 0.3396,
      "step": 1680
    },
    {
      "epoch": 8.0,
      "grad_norm": 8.662644386291504,
      "learning_rate": 4.284532049352994e-05,
      "loss": 0.3209,
      "step": 1690
    },
    {
      "epoch": 8.0,
      "eval_accuracy": 0.8972602739726028,
      "eval_loss": 0.3082219660282135,
      "eval_runtime": 6.2922,
      "eval_samples_per_second": 232.034,
      "eval_steps_per_second": 29.084,
      "step": 1690
    },
    {
      "epoch": 8.04733727810651,
      "grad_norm": 12.477700233459473,
      "learning_rate": 4.2770087270538674e-05,
      "loss": 0.3262,
      "step": 1700
    },
    {
      "epoch": 8.094674556213018,
      "grad_norm": 6.367954730987549,
      "learning_rate": 4.26948540475474e-05,
      "loss": 0.3579,
      "step": 1710
    },
    {
      "epoch": 8.142011834319527,
      "grad_norm": 7.339391708374023,
      "learning_rate": 4.261962082455613e-05,
      "loss": 0.3993,
      "step": 1720
    },
    {
      "epoch": 8.189349112426035,
      "grad_norm": 7.060799598693848,
      "learning_rate": 4.2544387601564856e-05,
      "loss": 0.3702,
      "step": 1730
    },
    {
      "epoch": 8.236686390532544,
      "grad_norm": 7.423877239227295,
      "learning_rate": 4.246915437857358e-05,
      "loss": 0.4548,
      "step": 1740
    },
    {
      "epoch": 8.284023668639053,
      "grad_norm": 7.742123603820801,
      "learning_rate": 4.2393921155582305e-05,
      "loss": 0.3914,
      "step": 1750
    },
    {
      "epoch": 8.331360946745562,
      "grad_norm": 3.941162109375,
      "learning_rate": 4.231868793259104e-05,
      "loss": 0.3953,
      "step": 1760
    },
    {
      "epoch": 8.378698224852071,
      "grad_norm": 7.15812349319458,
      "learning_rate": 4.224345470959976e-05,
      "loss": 0.3871,
      "step": 1770
    },
    {
      "epoch": 8.42603550295858,
      "grad_norm": 11.954395294189453,
      "learning_rate": 4.2168221486608486e-05,
      "loss": 0.3919,
      "step": 1780
    },
    {
      "epoch": 8.47337278106509,
      "grad_norm": 7.049565315246582,
      "learning_rate": 4.209298826361722e-05,
      "loss": 0.3539,
      "step": 1790
    },
    {
      "epoch": 8.520710059171599,
      "grad_norm": 8.527347564697266,
| "learning_rate": 4.201775504062594e-05, | |
| "loss": 0.3883, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 8.568047337278106, | |
| "grad_norm": 9.178783416748047, | |
| "learning_rate": 4.194252181763467e-05, | |
| "loss": 0.4226, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 8.615384615384615, | |
| "grad_norm": 10.065650939941406, | |
| "learning_rate": 4.186728859464339e-05, | |
| "loss": 0.3773, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 8.662721893491124, | |
| "grad_norm": 5.588104724884033, | |
| "learning_rate": 4.1792055371652124e-05, | |
| "loss": 0.3921, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 8.710059171597633, | |
| "grad_norm": 4.505855083465576, | |
| "learning_rate": 4.171682214866085e-05, | |
| "loss": 0.3483, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 8.757396449704142, | |
| "grad_norm": 10.081398963928223, | |
| "learning_rate": 4.164158892566958e-05, | |
| "loss": 0.3312, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 8.804733727810651, | |
| "grad_norm": 7.667760848999023, | |
| "learning_rate": 4.1566355702678305e-05, | |
| "loss": 0.2838, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 8.85207100591716, | |
| "grad_norm": 11.876665115356445, | |
| "learning_rate": 4.149112247968704e-05, | |
| "loss": 0.4328, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 8.899408284023668, | |
| "grad_norm": 7.79551887512207, | |
| "learning_rate": 4.141588925669576e-05, | |
| "loss": 0.4616, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 8.946745562130177, | |
| "grad_norm": 6.006857395172119, | |
| "learning_rate": 4.1340656033704486e-05, | |
| "loss": 0.3389, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 8.994082840236686, | |
| "grad_norm": 9.194988250732422, | |
| "learning_rate": 4.126542281071321e-05, | |
| "loss": 0.3479, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 8.998816568047337, | |
| "eval_accuracy": 0.9027397260273973, | |
| "eval_loss": 0.28129294514656067, | |
| "eval_runtime": 6.217, | |
| "eval_samples_per_second": 234.84, | |
| "eval_steps_per_second": 29.435, | |
| "step": 1901 | |
| }, | |
| { | |
| "epoch": 9.041420118343195, | |
| "grad_norm": 7.2297163009643555, | |
| "learning_rate": 4.1190189587721936e-05, | |
| "loss": 0.3223, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 9.088757396449704, | |
| "grad_norm": 9.67817211151123, | |
| "learning_rate": 4.111495636473067e-05, | |
| "loss": 0.3681, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 9.136094674556213, | |
| "grad_norm": 6.748856544494629, | |
| "learning_rate": 4.103972314173939e-05, | |
| "loss": 0.351, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 9.183431952662723, | |
| "grad_norm": 3.9139935970306396, | |
| "learning_rate": 4.0964489918748124e-05, | |
| "loss": 0.39, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 9.23076923076923, | |
| "grad_norm": 5.222900390625, | |
| "learning_rate": 4.088925669575685e-05, | |
| "loss": 0.3132, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 9.278106508875739, | |
| "grad_norm": 11.637986183166504, | |
| "learning_rate": 4.081402347276558e-05, | |
| "loss": 0.3373, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 9.325443786982248, | |
| "grad_norm": 10.712813377380371, | |
| "learning_rate": 4.07387902497743e-05, | |
| "loss": 0.3424, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 9.372781065088757, | |
| "grad_norm": 7.3563947677612305, | |
| "learning_rate": 4.066355702678303e-05, | |
| "loss": 0.3709, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 9.420118343195266, | |
| "grad_norm": 8.500737190246582, | |
| "learning_rate": 4.0588323803791755e-05, | |
| "loss": 0.3398, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 9.467455621301776, | |
| "grad_norm": 10.802979469299316, | |
| "learning_rate": 4.0513090580800486e-05, | |
| "loss": 0.308, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 9.514792899408285, | |
| "grad_norm": 7.362417697906494, | |
| "learning_rate": 4.043785735780921e-05, | |
| "loss": 0.3193, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 9.562130177514792, | |
| "grad_norm": 5.569155693054199, | |
| "learning_rate": 4.0362624134817936e-05, | |
| "loss": 0.3028, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 9.609467455621301, | |
| "grad_norm": 8.995447158813477, | |
| "learning_rate": 4.028739091182667e-05, | |
| "loss": 0.4206, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 9.65680473372781, | |
| "grad_norm": 5.864706993103027, | |
| "learning_rate": 4.021215768883539e-05, | |
| "loss": 0.2987, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 9.70414201183432, | |
| "grad_norm": 8.34255313873291, | |
| "learning_rate": 4.013692446584412e-05, | |
| "loss": 0.4161, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 9.751479289940828, | |
| "grad_norm": 8.392521858215332, | |
| "learning_rate": 4.006169124285284e-05, | |
| "loss": 0.4073, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 9.798816568047338, | |
| "grad_norm": 6.388725280761719, | |
| "learning_rate": 3.998645801986157e-05, | |
| "loss": 0.3513, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 9.846153846153847, | |
| "grad_norm": 5.696859836578369, | |
| "learning_rate": 3.99112247968703e-05, | |
| "loss": 0.3219, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 9.893491124260356, | |
| "grad_norm": 8.325499534606934, | |
| "learning_rate": 3.983599157387903e-05, | |
| "loss": 0.394, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 9.940828402366863, | |
| "grad_norm": 11.819910049438477, | |
| "learning_rate": 3.9760758350887755e-05, | |
| "loss": 0.4085, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 9.988165680473372, | |
| "grad_norm": 6.419707298278809, | |
| "learning_rate": 3.9685525127896486e-05, | |
| "loss": 0.3429, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 9.997633136094674, | |
| "eval_accuracy": 0.8924657534246575, | |
| "eval_loss": 0.3318786323070526, | |
| "eval_runtime": 6.1733, | |
| "eval_samples_per_second": 236.501, | |
| "eval_steps_per_second": 29.644, | |
| "step": 2112 | |
| }, | |
| { | |
| "epoch": 10.035502958579881, | |
| "grad_norm": 4.879507064819336, | |
| "learning_rate": 3.9610291904905204e-05, | |
| "loss": 0.2607, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 10.08284023668639, | |
| "grad_norm": 10.089688301086426, | |
| "learning_rate": 3.9535058681913936e-05, | |
| "loss": 0.3887, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 10.1301775147929, | |
| "grad_norm": 6.6358819007873535, | |
| "learning_rate": 3.945982545892266e-05, | |
| "loss": 0.3926, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 10.177514792899409, | |
| "grad_norm": 4.718569755554199, | |
| "learning_rate": 3.938459223593139e-05, | |
| "loss": 0.2977, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 10.224852071005918, | |
| "grad_norm": 4.798628807067871, | |
| "learning_rate": 3.930935901294012e-05, | |
| "loss": 0.3167, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 10.272189349112425, | |
| "grad_norm": 12.527241706848145, | |
| "learning_rate": 3.923412578994884e-05, | |
| "loss": 0.3498, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 10.319526627218934, | |
| "grad_norm": 19.981807708740234, | |
| "learning_rate": 3.915889256695757e-05, | |
| "loss": 0.3791, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 10.366863905325443, | |
| "grad_norm": 5.31036901473999, | |
| "learning_rate": 3.90836593439663e-05, | |
| "loss": 0.3635, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 10.414201183431953, | |
| "grad_norm": 7.329598426818848, | |
| "learning_rate": 3.900842612097502e-05, | |
| "loss": 0.2612, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 10.461538461538462, | |
| "grad_norm": 10.241847038269043, | |
| "learning_rate": 3.893319289798375e-05, | |
| "loss": 0.3508, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 10.50887573964497, | |
| "grad_norm": 9.222640991210938, | |
| "learning_rate": 3.885795967499248e-05, | |
| "loss": 0.4113, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 10.55621301775148, | |
| "grad_norm": 5.4523115158081055, | |
| "learning_rate": 3.8782726452001204e-05, | |
| "loss": 0.312, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 10.603550295857989, | |
| "grad_norm": 25.376020431518555, | |
| "learning_rate": 3.8707493229009936e-05, | |
| "loss": 0.382, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 10.650887573964496, | |
| "grad_norm": 7.494572162628174, | |
| "learning_rate": 3.863226000601866e-05, | |
| "loss": 0.3078, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 10.698224852071005, | |
| "grad_norm": 9.24726390838623, | |
| "learning_rate": 3.855702678302739e-05, | |
| "loss": 0.3368, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 10.745562130177515, | |
| "grad_norm": 7.74558162689209, | |
| "learning_rate": 3.848179356003611e-05, | |
| "loss": 0.2912, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 10.792899408284024, | |
| "grad_norm": 7.557544708251953, | |
| "learning_rate": 3.840656033704484e-05, | |
| "loss": 0.3268, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 10.840236686390533, | |
| "grad_norm": 9.215229988098145, | |
| "learning_rate": 3.8331327114053566e-05, | |
| "loss": 0.4372, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 10.887573964497042, | |
| "grad_norm": 9.268451690673828, | |
| "learning_rate": 3.82560938910623e-05, | |
| "loss": 0.3564, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 10.934911242603551, | |
| "grad_norm": 4.07456111907959, | |
| "learning_rate": 3.818086066807102e-05, | |
| "loss": 0.3003, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 10.982248520710058, | |
| "grad_norm": 8.930679321289062, | |
| "learning_rate": 3.810562744507975e-05, | |
| "loss": 0.3341, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 10.996449704142012, | |
| "eval_accuracy": 0.8972602739726028, | |
| "eval_loss": 0.2900165021419525, | |
| "eval_runtime": 6.2027, | |
| "eval_samples_per_second": 235.379, | |
| "eval_steps_per_second": 29.503, | |
| "step": 2323 | |
| }, | |
| { | |
| "epoch": 11.029585798816568, | |
| "grad_norm": 7.869425296783447, | |
| "learning_rate": 3.803039422208848e-05, | |
| "loss": 0.3134, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 11.076923076923077, | |
| "grad_norm": 8.941612243652344, | |
| "learning_rate": 3.7955160999097204e-05, | |
| "loss": 0.3465, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 11.124260355029586, | |
| "grad_norm": 8.30190372467041, | |
| "learning_rate": 3.787992777610593e-05, | |
| "loss": 0.2489, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 11.171597633136095, | |
| "grad_norm": 8.490402221679688, | |
| "learning_rate": 3.7804694553114653e-05, | |
| "loss": 0.326, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 11.218934911242604, | |
| "grad_norm": 19.662193298339844, | |
| "learning_rate": 3.7729461330123385e-05, | |
| "loss": 0.3444, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 11.266272189349113, | |
| "grad_norm": 9.445649147033691, | |
| "learning_rate": 3.765422810713211e-05, | |
| "loss": 0.3185, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 11.31360946745562, | |
| "grad_norm": 4.701760292053223, | |
| "learning_rate": 3.757899488414084e-05, | |
| "loss": 0.3665, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 11.36094674556213, | |
| "grad_norm": 5.095606327056885, | |
| "learning_rate": 3.7503761661149566e-05, | |
| "loss": 0.2736, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 11.408284023668639, | |
| "grad_norm": 10.870713233947754, | |
| "learning_rate": 3.74285284381583e-05, | |
| "loss": 0.2966, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 11.455621301775148, | |
| "grad_norm": 6.850511074066162, | |
| "learning_rate": 3.7353295215167016e-05, | |
| "loss": 0.2624, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 11.502958579881657, | |
| "grad_norm": 10.627695083618164, | |
| "learning_rate": 3.727806199217575e-05, | |
| "loss": 0.3767, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 11.550295857988166, | |
| "grad_norm": 8.704399108886719, | |
| "learning_rate": 3.720282876918447e-05, | |
| "loss": 0.3127, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 11.597633136094675, | |
| "grad_norm": 7.4766716957092285, | |
| "learning_rate": 3.71275955461932e-05, | |
| "loss": 0.3015, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 11.644970414201183, | |
| "grad_norm": 8.510762214660645, | |
| "learning_rate": 3.705236232320193e-05, | |
| "loss": 0.3406, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 11.692307692307692, | |
| "grad_norm": 9.42719841003418, | |
| "learning_rate": 3.697712910021065e-05, | |
| "loss": 0.3085, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 11.7396449704142, | |
| "grad_norm": 6.386455535888672, | |
| "learning_rate": 3.6901895877219385e-05, | |
| "loss": 0.3426, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 11.78698224852071, | |
| "grad_norm": 7.612992286682129, | |
| "learning_rate": 3.682666265422811e-05, | |
| "loss": 0.3567, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 11.834319526627219, | |
| "grad_norm": 8.440069198608398, | |
| "learning_rate": 3.6751429431236835e-05, | |
| "loss": 0.3288, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 11.881656804733728, | |
| "grad_norm": 7.730615615844727, | |
| "learning_rate": 3.667619620824556e-05, | |
| "loss": 0.3253, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 11.928994082840237, | |
| "grad_norm": 7.29069185256958, | |
| "learning_rate": 3.660096298525429e-05, | |
| "loss": 0.306, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 11.976331360946746, | |
| "grad_norm": 8.983368873596191, | |
| "learning_rate": 3.6525729762263016e-05, | |
| "loss": 0.2937, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "eval_accuracy": 0.8993150684931507, | |
| "eval_loss": 0.3500230312347412, | |
| "eval_runtime": 6.3178, | |
| "eval_samples_per_second": 231.093, | |
| "eval_steps_per_second": 28.966, | |
| "step": 2535 | |
| }, | |
| { | |
| "epoch": 12.023668639053254, | |
| "grad_norm": 6.368637561798096, | |
| "learning_rate": 3.645049653927175e-05, | |
| "loss": 0.3998, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 12.071005917159763, | |
| "grad_norm": 14.744524955749512, | |
| "learning_rate": 3.637526331628047e-05, | |
| "loss": 0.324, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 12.118343195266272, | |
| "grad_norm": 4.304303169250488, | |
| "learning_rate": 3.63000300932892e-05, | |
| "loss": 0.3538, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 12.165680473372781, | |
| "grad_norm": 11.705492973327637, | |
| "learning_rate": 3.622479687029793e-05, | |
| "loss": 0.3422, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 12.21301775147929, | |
| "grad_norm": 9.357977867126465, | |
| "learning_rate": 3.614956364730665e-05, | |
| "loss": 0.2732, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 12.2603550295858, | |
| "grad_norm": 12.46599006652832, | |
| "learning_rate": 3.607433042431538e-05, | |
| "loss": 0.4473, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 12.307692307692308, | |
| "grad_norm": 20.074487686157227, | |
| "learning_rate": 3.59990972013241e-05, | |
| "loss": 0.2837, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 12.355029585798816, | |
| "grad_norm": 4.281162738800049, | |
| "learning_rate": 3.5923863978332834e-05, | |
| "loss": 0.3035, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 12.402366863905325, | |
| "grad_norm": 10.390352249145508, | |
| "learning_rate": 3.584863075534156e-05, | |
| "loss": 0.2636, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 12.449704142011834, | |
| "grad_norm": 3.76784348487854, | |
| "learning_rate": 3.577339753235029e-05, | |
| "loss": 0.3388, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 12.497041420118343, | |
| "grad_norm": 9.673295021057129, | |
| "learning_rate": 3.5698164309359016e-05, | |
| "loss": 0.2947, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 12.544378698224852, | |
| "grad_norm": 6.6694722175598145, | |
| "learning_rate": 3.562293108636774e-05, | |
| "loss": 0.3453, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 12.591715976331361, | |
| "grad_norm": 7.178610324859619, | |
| "learning_rate": 3.5547697863376465e-05, | |
| "loss": 0.3383, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 12.63905325443787, | |
| "grad_norm": 10.715120315551758, | |
| "learning_rate": 3.54724646403852e-05, | |
| "loss": 0.3222, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 12.68639053254438, | |
| "grad_norm": 6.3047285079956055, | |
| "learning_rate": 3.539723141739392e-05, | |
| "loss": 0.3521, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 12.733727810650887, | |
| "grad_norm": 6.073225021362305, | |
| "learning_rate": 3.532199819440265e-05, | |
| "loss": 0.2904, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 12.781065088757396, | |
| "grad_norm": 9.05847454071045, | |
| "learning_rate": 3.524676497141138e-05, | |
| "loss": 0.3764, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 12.828402366863905, | |
| "grad_norm": 6.264795303344727, | |
| "learning_rate": 3.51715317484201e-05, | |
| "loss": 0.3159, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 12.875739644970414, | |
| "grad_norm": 7.125365257263184, | |
| "learning_rate": 3.5096298525428834e-05, | |
| "loss": 0.2996, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 12.923076923076923, | |
| "grad_norm": 9.880492210388184, | |
| "learning_rate": 3.502106530243755e-05, | |
| "loss": 0.3283, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 12.970414201183432, | |
| "grad_norm": 14.802063941955566, | |
| "learning_rate": 3.4945832079446284e-05, | |
| "loss": 0.3478, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 12.998816568047337, | |
| "eval_accuracy": 0.9013698630136986, | |
| "eval_loss": 0.3168272078037262, | |
| "eval_runtime": 6.174, | |
| "eval_samples_per_second": 236.475, | |
| "eval_steps_per_second": 29.64, | |
| "step": 2746 | |
| }, | |
| { | |
| "epoch": 13.017751479289942, | |
| "grad_norm": 9.61425495147705, | |
| "learning_rate": 3.487059885645501e-05, | |
| "loss": 0.2884, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 13.065088757396449, | |
| "grad_norm": 7.737671375274658, | |
| "learning_rate": 3.479536563346374e-05, | |
| "loss": 0.3732, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 13.112426035502958, | |
| "grad_norm": 7.558273792266846, | |
| "learning_rate": 3.4720132410472465e-05, | |
| "loss": 0.2859, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 13.159763313609467, | |
| "grad_norm": 7.560544013977051, | |
| "learning_rate": 3.46448991874812e-05, | |
| "loss": 0.2986, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 13.207100591715976, | |
| "grad_norm": 7.7973480224609375, | |
| "learning_rate": 3.456966596448992e-05, | |
| "loss": 0.291, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 13.254437869822485, | |
| "grad_norm": 9.302266120910645, | |
| "learning_rate": 3.449443274149865e-05, | |
| "loss": 0.3669, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 13.301775147928995, | |
| "grad_norm": 5.183737277984619, | |
| "learning_rate": 3.441919951850737e-05, | |
| "loss": 0.293, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 13.349112426035504, | |
| "grad_norm": 6.064436912536621, | |
| "learning_rate": 3.43439662955161e-05, | |
| "loss": 0.2944, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 13.396449704142011, | |
| "grad_norm": 9.409137725830078, | |
| "learning_rate": 3.426873307252483e-05, | |
| "loss": 0.3103, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 13.44378698224852, | |
| "grad_norm": 20.371089935302734, | |
| "learning_rate": 3.419349984953356e-05, | |
| "loss": 0.2879, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 13.49112426035503, | |
| "grad_norm": 9.97218132019043, | |
| "learning_rate": 3.4118266626542284e-05, | |
| "loss": 0.2959, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 13.538461538461538, | |
| "grad_norm": 7.915639400482178, | |
| "learning_rate": 3.404303340355101e-05, | |
| "loss": 0.2929, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 13.585798816568047, | |
| "grad_norm": 6.3162641525268555, | |
| "learning_rate": 3.396780018055974e-05, | |
| "loss": 0.2773, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 13.633136094674557, | |
| "grad_norm": 7.813812732696533, | |
| "learning_rate": 3.389256695756846e-05, | |
| "loss": 0.2795, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 13.680473372781066, | |
| "grad_norm": 13.80722427368164, | |
| "learning_rate": 3.381733373457719e-05, | |
| "loss": 0.3648, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 13.727810650887575, | |
| "grad_norm": 9.83273696899414, | |
| "learning_rate": 3.3742100511585915e-05, | |
| "loss": 0.2952, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 13.775147928994082, | |
| "grad_norm": 10.903112411499023, | |
| "learning_rate": 3.3666867288594646e-05, | |
| "loss": 0.269, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 13.822485207100591, | |
| "grad_norm": 4.992847919464111, | |
| "learning_rate": 3.359163406560337e-05, | |
| "loss": 0.3689, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 13.8698224852071, | |
| "grad_norm": 7.029762268066406, | |
| "learning_rate": 3.35164008426121e-05, | |
| "loss": 0.3296, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 13.91715976331361, | |
| "grad_norm": 15.533370018005371, | |
| "learning_rate": 3.344116761962083e-05, | |
| "loss": 0.2764, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 13.964497041420119, | |
| "grad_norm": 14.553123474121094, | |
| "learning_rate": 3.336593439662956e-05, | |
| "loss": 0.3148, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 13.997633136094674, | |
| "eval_accuracy": 0.9054794520547945, | |
| "eval_loss": 0.3071611225605011, | |
| "eval_runtime": 6.0563, | |
| "eval_samples_per_second": 241.071, | |
| "eval_steps_per_second": 30.216, | |
| "step": 2957 | |
| }, | |
| { | |
| "epoch": 14.011834319526628, | |
| "grad_norm": 6.9089035987854, | |
| "learning_rate": 3.329070117363828e-05, | |
| "loss": 0.3318, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 14.059171597633137, | |
| "grad_norm": 7.897435665130615, | |
| "learning_rate": 3.321546795064701e-05, | |
| "loss": 0.2537, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 14.106508875739644, | |
| "grad_norm": 12.082826614379883, | |
| "learning_rate": 3.314023472765573e-05, | |
| "loss": 0.2685, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 14.153846153846153, | |
| "grad_norm": 8.465901374816895, | |
| "learning_rate": 3.306500150466446e-05, | |
| "loss": 0.3849, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 14.201183431952662, | |
| "grad_norm": 9.606731414794922, | |
| "learning_rate": 3.298976828167319e-05, | |
| "loss": 0.3219, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 14.248520710059172, | |
| "grad_norm": 5.763510704040527, | |
| "learning_rate": 3.2914535058681914e-05, | |
| "loss": 0.2798, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 14.29585798816568, | |
| "grad_norm": 7.898010730743408, | |
| "learning_rate": 3.2839301835690646e-05, | |
| "loss": 0.353, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 14.34319526627219, | |
| "grad_norm": 4.139184951782227, | |
| "learning_rate": 3.276406861269937e-05, | |
| "loss": 0.3145, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 14.390532544378699, | |
| "grad_norm": 10.472068786621094, | |
| "learning_rate": 3.2688835389708096e-05, | |
| "loss": 0.2997, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 14.437869822485208, | |
| "grad_norm": 6.952048301696777, | |
| "learning_rate": 3.261360216671682e-05, | |
| "loss": 0.2931, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 14.485207100591715, | |
| "grad_norm": 11.008207321166992, | |
| "learning_rate": 3.253836894372555e-05, | |
| "loss": 0.2891, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 14.532544378698224, | |
| "grad_norm": 4.314377784729004, | |
| "learning_rate": 3.246313572073428e-05, | |
| "loss": 0.2922, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 14.579881656804734, | |
| "grad_norm": 6.738071441650391, | |
| "learning_rate": 3.238790249774301e-05, | |
| "loss": 0.2226, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 14.627218934911243, | |
| "grad_norm": 5.609333038330078, | |
| "learning_rate": 3.231266927475173e-05, | |
| "loss": 0.2366, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 14.674556213017752, | |
| "grad_norm": 5.399454116821289, | |
| "learning_rate": 3.223743605176046e-05, | |
| "loss": 0.32, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 14.721893491124261, | |
| "grad_norm": 13.962152481079102, | |
| "learning_rate": 3.216220282876918e-05, | |
| "loss": 0.3652, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 14.76923076923077, | |
| "grad_norm": 8.14931869506836, | |
| "learning_rate": 3.2086969605777914e-05, | |
| "loss": 0.2513, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 14.816568047337277, | |
| "grad_norm": 6.72014045715332, | |
| "learning_rate": 3.201173638278664e-05, | |
| "loss": 0.3068, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 14.863905325443787, | |
| "grad_norm": 9.025717735290527, | |
| "learning_rate": 3.1936503159795364e-05, | |
| "loss": 0.2845, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 14.911242603550296, | |
| "grad_norm": 3.6108787059783936, | |
| "learning_rate": 3.1861269936804096e-05, | |
| "loss": 0.2868, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 14.958579881656805, | |
| "grad_norm": 12.648404121398926, | |
| "learning_rate": 3.178603671381282e-05, | |
| "loss": 0.2896, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 14.996449704142012, | |
| "eval_accuracy": 0.9061643835616439, | |
| "eval_loss": 0.30652791261672974, | |
| "eval_runtime": 6.136, | |
| "eval_samples_per_second": 237.938, | |
| "eval_steps_per_second": 29.824, | |
| "step": 3168 | |
| }, | |
| { | |
| "epoch": 15.005917159763314, | |
| "grad_norm": 5.476109027862549, | |
| "learning_rate": 3.171080349082155e-05, | |
| "loss": 0.3452, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 15.053254437869823, | |
| "grad_norm": 8.330878257751465, | |
| "learning_rate": 3.163557026783028e-05, | |
| "loss": 0.2456, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 15.100591715976332, | |
| "grad_norm": 8.56313705444336, | |
| "learning_rate": 3.1560337044839e-05, | |
| "loss": 0.2296, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 15.14792899408284, | |
| "grad_norm": 10.402885437011719, | |
| "learning_rate": 3.1485103821847726e-05, | |
| "loss": 0.2862, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 15.195266272189349, | |
| "grad_norm": 7.497808933258057, | |
| "learning_rate": 3.140987059885646e-05, | |
| "loss": 0.3389, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 15.242603550295858, | |
| "grad_norm": 7.207127094268799, | |
| "learning_rate": 3.133463737586518e-05, | |
| "loss": 0.2575, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 15.289940828402367, | |
| "grad_norm": 4.729502201080322, | |
| "learning_rate": 3.1259404152873914e-05, | |
| "loss": 0.2308, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 15.337278106508876, | |
| "grad_norm": 10.251791954040527, | |
| "learning_rate": 3.118417092988264e-05, | |
| "loss": 0.2549, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 15.384615384615385, | |
| "grad_norm": 4.962519645690918, | |
| "learning_rate": 3.1108937706891364e-05, | |
| "loss": 0.2448, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 15.431952662721894, | |
| "grad_norm": 8.956313133239746, | |
| "learning_rate": 3.103370448390009e-05, | |
| "loss": 0.2278, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 15.479289940828401, | |
| "grad_norm": 5.445577144622803, | |
| "learning_rate": 3.0958471260908813e-05, | |
| "loss": 0.3195, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 15.52662721893491, | |
| "grad_norm": 8.691884994506836, | |
| "learning_rate": 3.0883238037917545e-05, | |
| "loss": 0.2816, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 15.57396449704142, | |
| "grad_norm": 4.890760898590088, | |
| "learning_rate": 3.080800481492627e-05, | |
| "loss": 0.2479, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 15.621301775147929, | |
| "grad_norm": 10.502642631530762, | |
| "learning_rate": 3.0732771591935e-05, | |
| "loss": 0.2368, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 15.668639053254438, | |
| "grad_norm": 11.197770118713379, | |
| "learning_rate": 3.0657538368943726e-05, | |
| "loss": 0.396, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 15.715976331360947, | |
| "grad_norm": 7.301953315734863, | |
| "learning_rate": 3.058230514595246e-05, | |
| "loss": 0.2605, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 15.763313609467456, | |
| "grad_norm": 9.391778945922852, | |
| "learning_rate": 3.0507071922961183e-05, | |
| "loss": 0.2318, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 15.810650887573965, | |
| "grad_norm": 11.96308708190918, | |
| "learning_rate": 3.0431838699969904e-05, | |
| "loss": 0.3574, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 15.857988165680473, | |
| "grad_norm": 6.631661415100098, | |
| "learning_rate": 3.0356605476978632e-05, | |
| "loss": 0.2773, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 15.905325443786982, | |
| "grad_norm": 7.179072380065918, | |
| "learning_rate": 3.028137225398736e-05, | |
| "loss": 0.3573, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 15.95266272189349, | |
| "grad_norm": 9.855470657348633, | |
| "learning_rate": 3.020613903099609e-05, | |
| "loss": 0.3077, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "grad_norm": 4.808469772338867, | |
| "learning_rate": 3.0130905808004817e-05, | |
| "loss": 0.3149, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "eval_accuracy": 0.9082191780821918, | |
| "eval_loss": 0.2928474545478821, | |
| "eval_runtime": 6.1031, | |
| "eval_samples_per_second": 239.221, | |
| "eval_steps_per_second": 29.985, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 16.047337278106507, | |
| "grad_norm": 5.927903175354004, | |
| "learning_rate": 3.0055672585013545e-05, | |
| "loss": 0.2194, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 16.09467455621302, | |
| "grad_norm": 9.440893173217773, | |
| "learning_rate": 2.9980439362022273e-05, | |
| "loss": 0.2311, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 16.142011834319526, | |
| "grad_norm": 10.132343292236328, | |
| "learning_rate": 2.9905206139031e-05, | |
| "loss": 0.2608, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 16.189349112426036, | |
| "grad_norm": 9.294024467468262, | |
| "learning_rate": 2.9829972916039723e-05, | |
| "loss": 0.3056, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 16.236686390532544, | |
| "grad_norm": 6.507917404174805, | |
| "learning_rate": 2.975473969304845e-05, | |
| "loss": 0.1905, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 16.284023668639055, | |
| "grad_norm": 8.411003112792969, | |
| "learning_rate": 2.967950647005718e-05, | |
| "loss": 0.3232, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 16.331360946745562, | |
| "grad_norm": 5.495641708374023, | |
| "learning_rate": 2.9604273247065907e-05, | |
| "loss": 0.2718, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 16.37869822485207, | |
| "grad_norm": 9.734967231750488, | |
| "learning_rate": 2.9529040024074632e-05, | |
| "loss": 0.2966, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 16.42603550295858, | |
| "grad_norm": 3.004697799682617, | |
| "learning_rate": 2.945380680108336e-05, | |
| "loss": 0.2234, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 16.473372781065088, | |
| "grad_norm": 13.730050086975098, | |
| "learning_rate": 2.937857357809209e-05, | |
| "loss": 0.3209, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 16.5207100591716, | |
| "grad_norm": 5.133395195007324, | |
| "learning_rate": 2.930334035510081e-05, | |
| "loss": 0.2561, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 16.568047337278106, | |
| "grad_norm": 5.885538101196289, | |
| "learning_rate": 2.9228107132109538e-05, | |
| "loss": 0.3166, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 16.615384615384617, | |
| "grad_norm": 8.295323371887207, | |
| "learning_rate": 2.9152873909118266e-05, | |
| "loss": 0.2634, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 16.662721893491124, | |
| "grad_norm": 8.664441108703613, | |
| "learning_rate": 2.9077640686126994e-05, | |
| "loss": 0.2488, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 16.71005917159763, | |
| "grad_norm": 13.536978721618652, | |
| "learning_rate": 2.9002407463135723e-05, | |
| "loss": 0.2616, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 16.757396449704142, | |
| "grad_norm": 8.778542518615723, | |
| "learning_rate": 2.892717424014445e-05, | |
| "loss": 0.3111, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 16.80473372781065, | |
| "grad_norm": 4.70704460144043, | |
| "learning_rate": 2.885194101715318e-05, | |
| "loss": 0.2381, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 16.85207100591716, | |
| "grad_norm": 13.269988059997559, | |
| "learning_rate": 2.8776707794161907e-05, | |
| "loss": 0.2824, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 16.899408284023668, | |
| "grad_norm": 3.4718408584594727, | |
| "learning_rate": 2.870147457117063e-05, | |
| "loss": 0.2517, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 16.94674556213018, | |
| "grad_norm": 4.911701679229736, | |
| "learning_rate": 2.8626241348179357e-05, | |
| "loss": 0.2842, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 16.994082840236686, | |
| "grad_norm": 14.3350248336792, | |
| "learning_rate": 2.8551008125188085e-05, | |
| "loss": 0.2734, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 16.99881656804734, | |
| "eval_accuracy": 0.9095890410958904, | |
| "eval_loss": 0.2769572138786316, | |
| "eval_runtime": 6.3128, | |
| "eval_samples_per_second": 231.275, | |
| "eval_steps_per_second": 28.989, | |
| "step": 3591 | |
| }, | |
| { | |
| "epoch": 17.041420118343197, | |
| "grad_norm": 9.578266143798828, | |
| "learning_rate": 2.847577490219681e-05, | |
| "loss": 0.2216, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 17.088757396449704, | |
| "grad_norm": 10.65328311920166, | |
| "learning_rate": 2.8400541679205538e-05, | |
| "loss": 0.2713, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 17.13609467455621, | |
| "grad_norm": 13.547807693481445, | |
| "learning_rate": 2.8325308456214266e-05, | |
| "loss": 0.2578, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 17.183431952662723, | |
| "grad_norm": 5.553393363952637, | |
| "learning_rate": 2.8250075233222994e-05, | |
| "loss": 0.3016, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 17.23076923076923, | |
| "grad_norm": 9.82513427734375, | |
| "learning_rate": 2.8174842010231723e-05, | |
| "loss": 0.281, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 17.27810650887574, | |
| "grad_norm": 3.8038620948791504, | |
| "learning_rate": 2.8099608787240444e-05, | |
| "loss": 0.2876, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 17.325443786982248, | |
| "grad_norm": 4.463418006896973, | |
| "learning_rate": 2.8024375564249172e-05, | |
| "loss": 0.2434, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 17.37278106508876, | |
| "grad_norm": 4.446181297302246, | |
| "learning_rate": 2.79491423412579e-05, | |
| "loss": 0.2434, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 17.420118343195266, | |
| "grad_norm": 12.428364753723145, | |
| "learning_rate": 2.787390911826663e-05, | |
| "loss": 0.2706, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 17.467455621301774, | |
| "grad_norm": 9.818281173706055, | |
| "learning_rate": 2.7798675895275357e-05, | |
| "loss": 0.232, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 17.514792899408285, | |
| "grad_norm": 19.56150245666504, | |
| "learning_rate": 2.7723442672284085e-05, | |
| "loss": 0.2981, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 17.562130177514792, | |
| "grad_norm": 8.730667114257812, | |
| "learning_rate": 2.764820944929281e-05, | |
| "loss": 0.2427, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 17.609467455621303, | |
| "grad_norm": 11.973594665527344, | |
| "learning_rate": 2.7572976226301534e-05, | |
| "loss": 0.2359, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 17.65680473372781, | |
| "grad_norm": 2.578996419906616, | |
| "learning_rate": 2.7497743003310263e-05, | |
| "loss": 0.2783, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 17.70414201183432, | |
| "grad_norm": 9.876580238342285, | |
| "learning_rate": 2.7422509780318987e-05, | |
| "loss": 0.2268, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 17.75147928994083, | |
| "grad_norm": 5.562457084655762, | |
| "learning_rate": 2.7347276557327716e-05, | |
| "loss": 0.2296, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 17.798816568047336, | |
| "grad_norm": 6.533483505249023, | |
| "learning_rate": 2.7272043334336444e-05, | |
| "loss": 0.2818, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 17.846153846153847, | |
| "grad_norm": 7.880773544311523, | |
| "learning_rate": 2.7196810111345172e-05, | |
| "loss": 0.2865, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 17.893491124260354, | |
| "grad_norm": 13.510115623474121, | |
| "learning_rate": 2.71215768883539e-05, | |
| "loss": 0.3133, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 17.940828402366865, | |
| "grad_norm": 6.314772605895996, | |
| "learning_rate": 2.704634366536263e-05, | |
| "loss": 0.2102, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 17.988165680473372, | |
| "grad_norm": 4.932859420776367, | |
| "learning_rate": 2.697111044237135e-05, | |
| "loss": 0.2344, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 17.997633136094674, | |
| "eval_accuracy": 0.8952054794520548, | |
| "eval_loss": 0.3737930953502655, | |
| "eval_runtime": 6.2965, | |
| "eval_samples_per_second": 231.875, | |
| "eval_steps_per_second": 29.064, | |
| "step": 3802 | |
| }, | |
| { | |
| "epoch": 18.035502958579883, | |
| "grad_norm": 8.163798332214355, | |
| "learning_rate": 2.6895877219380078e-05, | |
| "loss": 0.349, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 18.08284023668639, | |
| "grad_norm": 8.841765403747559, | |
| "learning_rate": 2.6820643996388806e-05, | |
| "loss": 0.2864, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 18.130177514792898, | |
| "grad_norm": 5.997651100158691, | |
| "learning_rate": 2.6745410773397534e-05, | |
| "loss": 0.2941, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 18.17751479289941, | |
| "grad_norm": 5.4760332107543945, | |
| "learning_rate": 2.6670177550406263e-05, | |
| "loss": 0.2216, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 18.224852071005916, | |
| "grad_norm": 6.478240489959717, | |
| "learning_rate": 2.6594944327414987e-05, | |
| "loss": 0.2874, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 18.272189349112427, | |
| "grad_norm": 12.63205623626709, | |
| "learning_rate": 2.6519711104423716e-05, | |
| "loss": 0.2338, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 18.319526627218934, | |
| "grad_norm": 9.010831832885742, | |
| "learning_rate": 2.6444477881432444e-05, | |
| "loss": 0.3293, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 18.366863905325445, | |
| "grad_norm": 6.102337837219238, | |
| "learning_rate": 2.6369244658441165e-05, | |
| "loss": 0.3229, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 18.414201183431953, | |
| "grad_norm": 9.948938369750977, | |
| "learning_rate": 2.6294011435449893e-05, | |
| "loss": 0.2604, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 18.46153846153846, | |
| "grad_norm": 8.575167655944824, | |
| "learning_rate": 2.621877821245862e-05, | |
| "loss": 0.2205, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 18.50887573964497, | |
| "grad_norm": 7.808337688446045, | |
| "learning_rate": 2.614354498946735e-05, | |
| "loss": 0.1802, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 18.556213017751478, | |
| "grad_norm": 11.38652515411377, | |
| "learning_rate": 2.6068311766476078e-05, | |
| "loss": 0.2161, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 18.60355029585799, | |
| "grad_norm": 7.173455715179443, | |
| "learning_rate": 2.5993078543484806e-05, | |
| "loss": 0.2973, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 18.650887573964496, | |
| "grad_norm": 10.973929405212402, | |
| "learning_rate": 2.5917845320493534e-05, | |
| "loss": 0.2557, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 18.698224852071007, | |
| "grad_norm": 6.697062015533447, | |
| "learning_rate": 2.5842612097502256e-05, | |
| "loss": 0.2371, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 18.745562130177515, | |
| "grad_norm": 11.82797908782959, | |
| "learning_rate": 2.5767378874510984e-05, | |
| "loss": 0.2639, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 18.792899408284022, | |
| "grad_norm": 4.322720050811768, | |
| "learning_rate": 2.5692145651519712e-05, | |
| "loss": 0.2212, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 18.840236686390533, | |
| "grad_norm": 5.201810836791992, | |
| "learning_rate": 2.561691242852844e-05, | |
| "loss": 0.2003, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 18.88757396449704, | |
| "grad_norm": 7.236006736755371, | |
| "learning_rate": 2.554167920553717e-05, | |
| "loss": 0.3897, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 18.93491124260355, | |
| "grad_norm": 7.327210426330566, | |
| "learning_rate": 2.5466445982545893e-05, | |
| "loss": 0.1939, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 18.98224852071006, | |
| "grad_norm": 16.192811965942383, | |
| "learning_rate": 2.539121275955462e-05, | |
| "loss": 0.2872, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 18.996449704142012, | |
| "eval_accuracy": 0.9061643835616439, | |
| "eval_loss": 0.3222917914390564, | |
| "eval_runtime": 6.2252, | |
| "eval_samples_per_second": 234.532, | |
| "eval_steps_per_second": 29.397, | |
| "step": 4013 | |
| }, | |
| { | |
| "epoch": 19.02958579881657, | |
| "grad_norm": 14.001523971557617, | |
| "learning_rate": 2.531597953656335e-05, | |
| "loss": 0.2899, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 19.076923076923077, | |
| "grad_norm": 12.866436004638672, | |
| "learning_rate": 2.524074631357207e-05, | |
| "loss": 0.2249, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 19.124260355029588, | |
| "grad_norm": 12.653215408325195, | |
| "learning_rate": 2.51655130905808e-05, | |
| "loss": 0.2008, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 19.171597633136095, | |
| "grad_norm": 6.0526604652404785, | |
| "learning_rate": 2.5090279867589527e-05, | |
| "loss": 0.2264, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 19.218934911242602, | |
| "grad_norm": 7.189617156982422, | |
| "learning_rate": 2.5015046644598256e-05, | |
| "loss": 0.2785, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 19.266272189349113, | |
| "grad_norm": 6.08707332611084, | |
| "learning_rate": 2.4939813421606984e-05, | |
| "loss": 0.2666, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 19.31360946745562, | |
| "grad_norm": 8.810041427612305, | |
| "learning_rate": 2.4864580198615712e-05, | |
| "loss": 0.2561, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 19.36094674556213, | |
| "grad_norm": 5.877760410308838, | |
| "learning_rate": 2.4789346975624437e-05, | |
| "loss": 0.1829, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 19.40828402366864, | |
| "grad_norm": 4.540722846984863, | |
| "learning_rate": 2.4714113752633165e-05, | |
| "loss": 0.2082, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 19.45562130177515, | |
| "grad_norm": 10.91895866394043, | |
| "learning_rate": 2.4638880529641893e-05, | |
| "loss": 0.2264, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 19.502958579881657, | |
| "grad_norm": 18.722084045410156, | |
| "learning_rate": 2.4563647306650618e-05, | |
| "loss": 0.2649, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 19.550295857988164, | |
| "grad_norm": 5.907430648803711, | |
| "learning_rate": 2.4488414083659346e-05, | |
| "loss": 0.1769, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 19.597633136094675, | |
| "grad_norm": 12.51977825164795, | |
| "learning_rate": 2.441318086066807e-05, | |
| "loss": 0.2895, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 19.644970414201183, | |
| "grad_norm": 9.822182655334473, | |
| "learning_rate": 2.43379476376768e-05, | |
| "loss": 0.2349, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 19.692307692307693, | |
| "grad_norm": 6.536006450653076, | |
| "learning_rate": 2.4262714414685524e-05, | |
| "loss": 0.2949, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 19.7396449704142, | |
| "grad_norm": 6.116447448730469, | |
| "learning_rate": 2.4187481191694252e-05, | |
| "loss": 0.2438, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 19.78698224852071, | |
| "grad_norm": 8.528430938720703, | |
| "learning_rate": 2.411224796870298e-05, | |
| "loss": 0.274, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 19.83431952662722, | |
| "grad_norm": 9.427675247192383, | |
| "learning_rate": 2.403701474571171e-05, | |
| "loss": 0.2848, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 19.881656804733726, | |
| "grad_norm": 5.054657459259033, | |
| "learning_rate": 2.3961781522720433e-05, | |
| "loss": 0.24, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 19.928994082840237, | |
| "grad_norm": 12.677891731262207, | |
| "learning_rate": 2.388654829972916e-05, | |
| "loss": 0.2593, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 19.976331360946745, | |
| "grad_norm": 5.84495735168457, | |
| "learning_rate": 2.381131507673789e-05, | |
| "loss": 0.2486, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "eval_accuracy": 0.9068493150684932, | |
| "eval_loss": 0.32860177755355835, | |
| "eval_runtime": 6.1011, | |
| "eval_samples_per_second": 239.302, | |
| "eval_steps_per_second": 29.995, | |
| "step": 4225 | |
| }, | |
| { | |
| "epoch": 20.023668639053255, | |
| "grad_norm": 6.881824970245361, | |
| "learning_rate": 2.3736081853746618e-05, | |
| "loss": 0.2215, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 20.071005917159763, | |
| "grad_norm": 10.07770824432373, | |
| "learning_rate": 2.3660848630755343e-05, | |
| "loss": 0.244, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 20.118343195266274, | |
| "grad_norm": 4.6197919845581055, | |
| "learning_rate": 2.358561540776407e-05, | |
| "loss": 0.2289, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 20.16568047337278, | |
| "grad_norm": 8.33582592010498, | |
| "learning_rate": 2.35103821847728e-05, | |
| "loss": 0.1889, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 20.21301775147929, | |
| "grad_norm": 8.195116996765137, | |
| "learning_rate": 2.3435148961781524e-05, | |
| "loss": 0.2015, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 20.2603550295858, | |
| "grad_norm": 6.473872661590576, | |
| "learning_rate": 2.335991573879025e-05, | |
| "loss": 0.2306, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 20.307692307692307, | |
| "grad_norm": 4.936031341552734, | |
| "learning_rate": 2.3284682515798977e-05, | |
| "loss": 0.2311, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 20.355029585798817, | |
| "grad_norm": 16.449352264404297, | |
| "learning_rate": 2.3209449292807705e-05, | |
| "loss": 0.2129, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 20.402366863905325, | |
| "grad_norm": 7.029664516448975, | |
| "learning_rate": 2.3134216069816433e-05, | |
| "loss": 0.2211, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 20.449704142011836, | |
| "grad_norm": 7.797490119934082, | |
| "learning_rate": 2.3058982846825158e-05, | |
| "loss": 0.2305, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 20.497041420118343, | |
| "grad_norm": 13.063493728637695, | |
| "learning_rate": 2.2983749623833886e-05, | |
| "loss": 0.2916, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 20.54437869822485, | |
| "grad_norm": 9.06458568572998, | |
| "learning_rate": 2.2908516400842614e-05, | |
| "loss": 0.2342, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 20.59171597633136, | |
| "grad_norm": 7.881487846374512, | |
| "learning_rate": 2.283328317785134e-05, | |
| "loss": 0.2041, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 20.63905325443787, | |
| "grad_norm": 10.349453926086426, | |
| "learning_rate": 2.2758049954860067e-05, | |
| "loss": 0.2949, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 20.68639053254438, | |
| "grad_norm": 12.278468132019043, | |
| "learning_rate": 2.2682816731868795e-05, | |
| "loss": 0.2607, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 20.733727810650887, | |
| "grad_norm": 11.949197769165039, | |
| "learning_rate": 2.2607583508877524e-05, | |
| "loss": 0.2741, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 20.781065088757398, | |
| "grad_norm": 13.006739616394043, | |
| "learning_rate": 2.253235028588625e-05, | |
| "loss": 0.2845, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 20.828402366863905, | |
| "grad_norm": 6.179040908813477, | |
| "learning_rate": 2.2457117062894977e-05, | |
| "loss": 0.2518, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 20.875739644970416, | |
| "grad_norm": 8.708568572998047, | |
| "learning_rate": 2.23818838399037e-05, | |
| "loss": 0.254, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 20.923076923076923, | |
| "grad_norm": 8.595051765441895, | |
| "learning_rate": 2.230665061691243e-05, | |
| "loss": 0.2462, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 20.97041420118343, | |
| "grad_norm": 8.650654792785645, | |
| "learning_rate": 2.2231417393921154e-05, | |
| "loss": 0.2818, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 20.99881656804734, | |
| "eval_accuracy": 0.8938356164383562, | |
| "eval_loss": 0.3853361904621124, | |
| "eval_runtime": 6.105, | |
| "eval_samples_per_second": 239.147, | |
| "eval_steps_per_second": 29.975, | |
| "step": 4436 | |
| }, | |
| { | |
| "epoch": 21.01775147928994, | |
| "grad_norm": 7.857712268829346, | |
| "learning_rate": 2.2156184170929883e-05, | |
| "loss": 0.2664, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 21.06508875739645, | |
| "grad_norm": 7.22745943069458, | |
| "learning_rate": 2.208095094793861e-05, | |
| "loss": 0.2062, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 21.11242603550296, | |
| "grad_norm": 2.6673853397369385, | |
| "learning_rate": 2.200571772494734e-05, | |
| "loss": 0.2239, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 21.159763313609467, | |
| "grad_norm": 4.8849005699157715, | |
| "learning_rate": 2.1930484501956064e-05, | |
| "loss": 0.1985, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 21.207100591715978, | |
| "grad_norm": 22.471643447875977, | |
| "learning_rate": 2.1855251278964792e-05, | |
| "loss": 0.2331, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 21.254437869822485, | |
| "grad_norm": 12.047694206237793, | |
| "learning_rate": 2.178001805597352e-05, | |
| "loss": 0.193, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 21.301775147928993, | |
| "grad_norm": 8.459744453430176, | |
| "learning_rate": 2.170478483298225e-05, | |
| "loss": 0.2698, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 21.349112426035504, | |
| "grad_norm": 5.106344699859619, | |
| "learning_rate": 2.1629551609990973e-05, | |
| "loss": 0.2626, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 21.39644970414201, | |
| "grad_norm": 8.469663619995117, | |
| "learning_rate": 2.15543183869997e-05, | |
| "loss": 0.208, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 21.443786982248522, | |
| "grad_norm": 4.838006496429443, | |
| "learning_rate": 2.147908516400843e-05, | |
| "loss": 0.2903, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 21.49112426035503, | |
| "grad_norm": 5.432097911834717, | |
| "learning_rate": 2.1403851941017154e-05, | |
| "loss": 0.2337, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 21.53846153846154, | |
| "grad_norm": 6.889484882354736, | |
| "learning_rate": 2.132861871802588e-05, | |
| "loss": 0.2269, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 21.585798816568047, | |
| "grad_norm": 8.73716926574707, | |
| "learning_rate": 2.1253385495034607e-05, | |
| "loss": 0.2631, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 21.633136094674555, | |
| "grad_norm": 2.4893133640289307, | |
| "learning_rate": 2.1178152272043336e-05, | |
| "loss": 0.2186, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 21.680473372781066, | |
| "grad_norm": 7.44368839263916, | |
| "learning_rate": 2.110291904905206e-05, | |
| "loss": 0.2052, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 21.727810650887573, | |
| "grad_norm": 12.204940795898438, | |
| "learning_rate": 2.102768582606079e-05, | |
| "loss": 0.2862, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 21.775147928994084, | |
| "grad_norm": 7.419914722442627, | |
| "learning_rate": 2.0952452603069517e-05, | |
| "loss": 0.2568, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 21.82248520710059, | |
| "grad_norm": 7.833005905151367, | |
| "learning_rate": 2.0877219380078245e-05, | |
| "loss": 0.2727, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 21.869822485207102, | |
| "grad_norm": 3.8460819721221924, | |
| "learning_rate": 2.080198615708697e-05, | |
| "loss": 0.2306, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 21.91715976331361, | |
| "grad_norm": 12.018167495727539, | |
| "learning_rate": 2.0726752934095698e-05, | |
| "loss": 0.2729, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 21.964497041420117, | |
| "grad_norm": 7.023700714111328, | |
| "learning_rate": 2.0651519711104426e-05, | |
| "loss": 0.2845, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 21.997633136094674, | |
| "eval_accuracy": 0.8938356164383562, | |
| "eval_loss": 0.3902602195739746, | |
| "eval_runtime": 6.3485, | |
| "eval_samples_per_second": 229.975, | |
| "eval_steps_per_second": 28.826, | |
| "step": 4647 | |
| }, | |
| { | |
| "epoch": 22.011834319526628, | |
| "grad_norm": 11.811697006225586, | |
| "learning_rate": 2.0576286488113154e-05, | |
| "loss": 0.3063, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 22.059171597633135, | |
| "grad_norm": 4.944943428039551, | |
| "learning_rate": 2.050105326512188e-05, | |
| "loss": 0.2157, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 22.106508875739646, | |
| "grad_norm": 5.4949517250061035, | |
| "learning_rate": 2.0425820042130607e-05, | |
| "loss": 0.2373, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 22.153846153846153, | |
| "grad_norm": 6.9762163162231445, | |
| "learning_rate": 2.0350586819139332e-05, | |
| "loss": 0.2378, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 22.201183431952664, | |
| "grad_norm": 6.753002643585205, | |
| "learning_rate": 2.027535359614806e-05, | |
| "loss": 0.1939, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 22.24852071005917, | |
| "grad_norm": 4.161319732666016, | |
| "learning_rate": 2.0200120373156785e-05, | |
| "loss": 0.242, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 22.29585798816568, | |
| "grad_norm": 5.062042713165283, | |
| "learning_rate": 2.0124887150165513e-05, | |
| "loss": 0.2782, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 22.34319526627219, | |
| "grad_norm": 9.755287170410156, | |
| "learning_rate": 2.004965392717424e-05, | |
| "loss": 0.2009, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 22.390532544378697, | |
| "grad_norm": 6.668210506439209, | |
| "learning_rate": 1.997442070418297e-05, | |
| "loss": 0.1617, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 22.437869822485208, | |
| "grad_norm": 3.9158642292022705, | |
| "learning_rate": 1.9899187481191694e-05, | |
| "loss": 0.2013, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 22.485207100591715, | |
| "grad_norm": 7.47080659866333, | |
| "learning_rate": 1.9823954258200423e-05, | |
| "loss": 0.2057, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 22.532544378698226, | |
| "grad_norm": 17.479690551757812, | |
| "learning_rate": 1.974872103520915e-05, | |
| "loss": 0.2964, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 22.579881656804734, | |
| "grad_norm": 9.807324409484863, | |
| "learning_rate": 1.9673487812217876e-05, | |
| "loss": 0.2519, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 22.62721893491124, | |
| "grad_norm": 8.961894035339355, | |
| "learning_rate": 1.9598254589226604e-05, | |
| "loss": 0.2724, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 22.674556213017752, | |
| "grad_norm": 3.2384064197540283, | |
| "learning_rate": 1.9523021366235332e-05, | |
| "loss": 0.187, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 22.72189349112426, | |
| "grad_norm": 5.056863307952881, | |
| "learning_rate": 1.944778814324406e-05, | |
| "loss": 0.2512, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 22.76923076923077, | |
| "grad_norm": 9.88666820526123, | |
| "learning_rate": 1.9372554920252785e-05, | |
| "loss": 0.2003, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 22.816568047337277, | |
| "grad_norm": 15.032508850097656, | |
| "learning_rate": 1.929732169726151e-05, | |
| "loss": 0.2665, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 22.86390532544379, | |
| "grad_norm": 6.520040035247803, | |
| "learning_rate": 1.9222088474270238e-05, | |
| "loss": 0.2592, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 22.911242603550296, | |
| "grad_norm": 5.046426296234131, | |
| "learning_rate": 1.9146855251278966e-05, | |
| "loss": 0.2219, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 22.958579881656803, | |
| "grad_norm": 11.43876838684082, | |
| "learning_rate": 1.907162202828769e-05, | |
| "loss": 0.227, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 22.996449704142012, | |
| "eval_accuracy": 0.9041095890410958, | |
| "eval_loss": 0.3559742569923401, | |
| "eval_runtime": 6.1037, | |
| "eval_samples_per_second": 239.198, | |
| "eval_steps_per_second": 29.982, | |
| "step": 4858 | |
| }, | |
| { | |
| "epoch": 23.005917159763314, | |
| "grad_norm": 5.062148571014404, | |
| "learning_rate": 1.899638880529642e-05, | |
| "loss": 0.1716, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 23.05325443786982, | |
| "grad_norm": 13.177910804748535, | |
| "learning_rate": 1.8921155582305147e-05, | |
| "loss": 0.2274, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 23.100591715976332, | |
| "grad_norm": 10.63724136352539, | |
| "learning_rate": 1.8845922359313875e-05, | |
| "loss": 0.2376, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 23.14792899408284, | |
| "grad_norm": 11.315512657165527, | |
| "learning_rate": 1.87706891363226e-05, | |
| "loss": 0.2451, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 23.19526627218935, | |
| "grad_norm": 9.915947914123535, | |
| "learning_rate": 1.869545591333133e-05, | |
| "loss": 0.265, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 23.242603550295858, | |
| "grad_norm": 7.371302604675293, | |
| "learning_rate": 1.8620222690340057e-05, | |
| "loss": 0.203, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 23.28994082840237, | |
| "grad_norm": 10.347346305847168, | |
| "learning_rate": 1.8544989467348785e-05, | |
| "loss": 0.2058, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 23.337278106508876, | |
| "grad_norm": 7.930377006530762, | |
| "learning_rate": 1.846975624435751e-05, | |
| "loss": 0.1881, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 23.384615384615383, | |
| "grad_norm": 7.690789699554443, | |
| "learning_rate": 1.8394523021366238e-05, | |
| "loss": 0.2058, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 23.431952662721894, | |
| "grad_norm": 9.262539863586426, | |
| "learning_rate": 1.8319289798374963e-05, | |
| "loss": 0.2432, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 23.4792899408284, | |
| "grad_norm": 6.507819652557373, | |
| "learning_rate": 1.8244056575383687e-05, | |
| "loss": 0.1951, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 23.526627218934912, | |
| "grad_norm": 5.187134742736816, | |
| "learning_rate": 1.8168823352392416e-05, | |
| "loss": 0.1948, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 23.57396449704142, | |
| "grad_norm": 5.986237525939941, | |
| "learning_rate": 1.8093590129401144e-05, | |
| "loss": 0.1896, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 23.62130177514793, | |
| "grad_norm": 3.2465999126434326, | |
| "learning_rate": 1.8018356906409872e-05, | |
| "loss": 0.2521, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 23.668639053254438, | |
| "grad_norm": 6.972270488739014, | |
| "learning_rate": 1.7943123683418597e-05, | |
| "loss": 0.2162, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 23.715976331360945, | |
| "grad_norm": 10.68996524810791, | |
| "learning_rate": 1.7867890460427325e-05, | |
| "loss": 0.228, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 23.763313609467456, | |
| "grad_norm": 13.406333923339844, | |
| "learning_rate": 1.7792657237436053e-05, | |
| "loss": 0.2184, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 23.810650887573964, | |
| "grad_norm": 10.20108699798584, | |
| "learning_rate": 1.771742401444478e-05, | |
| "loss": 0.2286, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 23.857988165680474, | |
| "grad_norm": 4.646299839019775, | |
| "learning_rate": 1.7642190791453506e-05, | |
| "loss": 0.177, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 23.90532544378698, | |
| "grad_norm": 11.070876121520996, | |
| "learning_rate": 1.7566957568462234e-05, | |
| "loss": 0.232, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 23.952662721893493, | |
| "grad_norm": 9.572555541992188, | |
| "learning_rate": 1.7491724345470962e-05, | |
| "loss": 0.2007, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 24.0, | |
| "grad_norm": 5.681567668914795, | |
| "learning_rate": 1.7416491122479687e-05, | |
| "loss": 0.1909, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 24.0, | |
| "eval_accuracy": 0.9054794520547945, | |
| "eval_loss": 0.3623672127723694, | |
| "eval_runtime": 6.2394, | |
| "eval_samples_per_second": 233.996, | |
| "eval_steps_per_second": 29.33, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 24.047337278106507, | |
| "grad_norm": 9.697016716003418, | |
| "learning_rate": 1.7341257899488415e-05, | |
| "loss": 0.2439, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 24.09467455621302, | |
| "grad_norm": 10.163914680480957, | |
| "learning_rate": 1.726602467649714e-05, | |
| "loss": 0.1802, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 24.142011834319526, | |
| "grad_norm": 11.584846496582031, | |
| "learning_rate": 1.719079145350587e-05, | |
| "loss": 0.2823, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 24.189349112426036, | |
| "grad_norm": 4.525638103485107, | |
| "learning_rate": 1.7115558230514597e-05, | |
| "loss": 0.1716, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 24.236686390532544, | |
| "grad_norm": 7.053022861480713, | |
| "learning_rate": 1.704032500752332e-05, | |
| "loss": 0.218, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 24.284023668639055, | |
| "grad_norm": 2.7923426628112793, | |
| "learning_rate": 1.696509178453205e-05, | |
| "loss": 0.185, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 24.331360946745562, | |
| "grad_norm": 3.2291653156280518, | |
| "learning_rate": 1.6889858561540778e-05, | |
| "loss": 0.2196, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 24.37869822485207, | |
| "grad_norm": 11.007999420166016, | |
| "learning_rate": 1.6814625338549506e-05, | |
| "loss": 0.2367, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 24.42603550295858, | |
| "grad_norm": 10.4671049118042, | |
| "learning_rate": 1.673939211555823e-05, | |
| "loss": 0.2754, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 24.473372781065088, | |
| "grad_norm": 11.023184776306152, | |
| "learning_rate": 1.666415889256696e-05, | |
| "loss": 0.2092, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 24.5207100591716, | |
| "grad_norm": 7.405954360961914, | |
| "learning_rate": 1.6588925669575687e-05, | |
| "loss": 0.2387, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 24.568047337278106, | |
| "grad_norm": 2.6797077655792236, | |
| "learning_rate": 1.6513692446584412e-05, | |
| "loss": 0.2842, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 24.615384615384617, | |
| "grad_norm": 2.8351101875305176, | |
| "learning_rate": 1.643845922359314e-05, | |
| "loss": 0.2395, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 24.662721893491124, | |
| "grad_norm": 5.248380661010742, | |
| "learning_rate": 1.636322600060187e-05, | |
| "loss": 0.2047, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 24.71005917159763, | |
| "grad_norm": 11.657218933105469, | |
| "learning_rate": 1.6287992777610593e-05, | |
| "loss": 0.2442, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 24.757396449704142, | |
| "grad_norm": 8.078208923339844, | |
| "learning_rate": 1.6212759554619318e-05, | |
| "loss": 0.1842, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 24.80473372781065, | |
| "grad_norm": 7.111977577209473, | |
| "learning_rate": 1.6137526331628046e-05, | |
| "loss": 0.2838, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 24.85207100591716, | |
| "grad_norm": 5.9829535484313965, | |
| "learning_rate": 1.6062293108636774e-05, | |
| "loss": 0.2379, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 24.899408284023668, | |
| "grad_norm": 7.217136859893799, | |
| "learning_rate": 1.5987059885645502e-05, | |
| "loss": 0.2162, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 24.94674556213018, | |
| "grad_norm": 3.860224485397339, | |
| "learning_rate": 1.5911826662654227e-05, | |
| "loss": 0.2123, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 24.994082840236686, | |
| "grad_norm": 7.911783695220947, | |
| "learning_rate": 1.5836593439662955e-05, | |
| "loss": 0.1972, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 24.99881656804734, | |
| "eval_accuracy": 0.910958904109589, | |
| "eval_loss": 0.3805873990058899, | |
| "eval_runtime": 6.2415, | |
| "eval_samples_per_second": 233.919, | |
| "eval_steps_per_second": 29.32, | |
| "step": 5281 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 7385, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 35, | |
| "save_steps": 500, | |
| "total_flos": 5.493880885130035e+18, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |