| { | |
| "best_global_step": 2470, | |
| "best_metric": 0.00025587898562662303, | |
| "best_model_checkpoint": "./beans_outputs/checkpoint-2470", | |
| "epoch": 50.0, | |
| "eval_steps": 500, | |
| "global_step": 6500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.07692307692307693, | |
| "grad_norm": 28.160919189453125, | |
| "learning_rate": 1.9972307692307693e-05, | |
| "loss": 0.9067, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.15384615384615385, | |
| "grad_norm": 13.613167762756348, | |
| "learning_rate": 1.9941538461538464e-05, | |
| "loss": 0.5139, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.23076923076923078, | |
| "grad_norm": 10.76201057434082, | |
| "learning_rate": 1.9910769230769232e-05, | |
| "loss": 0.2543, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.3076923076923077, | |
| "grad_norm": 44.93894958496094, | |
| "learning_rate": 1.9880000000000003e-05, | |
| "loss": 0.5313, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.38461538461538464, | |
| "grad_norm": 25.037395477294922, | |
| "learning_rate": 1.984923076923077e-05, | |
| "loss": 0.2034, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.46153846153846156, | |
| "grad_norm": 40.23418045043945, | |
| "learning_rate": 1.9818461538461538e-05, | |
| "loss": 0.2249, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.5384615384615384, | |
| "grad_norm": 26.893648147583008, | |
| "learning_rate": 1.978769230769231e-05, | |
| "loss": 0.3337, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.6153846153846154, | |
| "grad_norm": 0.18680469691753387, | |
| "learning_rate": 1.9756923076923077e-05, | |
| "loss": 0.0313, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.6923076923076923, | |
| "grad_norm": 0.3512158691883087, | |
| "learning_rate": 1.9726153846153848e-05, | |
| "loss": 0.088, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.7692307692307693, | |
| "grad_norm": 26.937734603881836, | |
| "learning_rate": 1.9695384615384616e-05, | |
| "loss": 0.2438, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.8461538461538461, | |
| "grad_norm": 37.74310302734375, | |
| "learning_rate": 1.9664615384615387e-05, | |
| "loss": 0.3817, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.9230769230769231, | |
| "grad_norm": 11.974236488342285, | |
| "learning_rate": 1.9633846153846155e-05, | |
| "loss": 0.0529, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.7709632515907288, | |
| "learning_rate": 1.9603076923076926e-05, | |
| "loss": 0.0435, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_accuracy": 0.9624060150375939, | |
| "eval_loss": 0.11930900067090988, | |
| "eval_runtime": 0.5873, | |
| "eval_samples_per_second": 226.46, | |
| "eval_steps_per_second": 28.946, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.0769230769230769, | |
| "grad_norm": 0.0285137090831995, | |
| "learning_rate": 1.9572307692307693e-05, | |
| "loss": 0.1304, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.1538461538461537, | |
| "grad_norm": 54.899940490722656, | |
| "learning_rate": 1.9541538461538464e-05, | |
| "loss": 0.1374, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.2307692307692308, | |
| "grad_norm": 0.06680646538734436, | |
| "learning_rate": 1.9510769230769232e-05, | |
| "loss": 0.2245, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.3076923076923077, | |
| "grad_norm": 80.54553985595703, | |
| "learning_rate": 1.948e-05, | |
| "loss": 0.1663, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.3846153846153846, | |
| "grad_norm": 41.55508041381836, | |
| "learning_rate": 1.944923076923077e-05, | |
| "loss": 0.1449, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.4615384615384617, | |
| "grad_norm": 0.04248078912496567, | |
| "learning_rate": 1.941846153846154e-05, | |
| "loss": 0.2905, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.5384615384615383, | |
| "grad_norm": 0.199375182390213, | |
| "learning_rate": 1.938769230769231e-05, | |
| "loss": 0.3444, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.6153846153846154, | |
| "grad_norm": 1.8903758525848389, | |
| "learning_rate": 1.9356923076923077e-05, | |
| "loss": 0.0755, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.6923076923076923, | |
| "grad_norm": 0.1028146967291832, | |
| "learning_rate": 1.932615384615385e-05, | |
| "loss": 0.3268, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.7692307692307692, | |
| "grad_norm": 7.126160621643066, | |
| "learning_rate": 1.929538461538462e-05, | |
| "loss": 0.3276, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.8461538461538463, | |
| "grad_norm": 79.28353881835938, | |
| "learning_rate": 1.9264615384615387e-05, | |
| "loss": 0.2051, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.9230769230769231, | |
| "grad_norm": 1.5293940305709839, | |
| "learning_rate": 1.9233846153846155e-05, | |
| "loss": 0.1562, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 5.852114677429199, | |
| "learning_rate": 1.9203076923076923e-05, | |
| "loss": 0.1536, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_accuracy": 1.0, | |
| "eval_loss": 0.0022762541193515062, | |
| "eval_runtime": 0.5666, | |
| "eval_samples_per_second": 234.733, | |
| "eval_steps_per_second": 30.004, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 2.076923076923077, | |
| "grad_norm": 62.74606704711914, | |
| "learning_rate": 1.9172307692307694e-05, | |
| "loss": 0.1537, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 2.1538461538461537, | |
| "grad_norm": 0.01672833040356636, | |
| "learning_rate": 1.914153846153846e-05, | |
| "loss": 0.3, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 2.230769230769231, | |
| "grad_norm": 0.7310534119606018, | |
| "learning_rate": 1.9110769230769233e-05, | |
| "loss": 0.1279, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 2.3076923076923075, | |
| "grad_norm": 6.83922004699707, | |
| "learning_rate": 1.908e-05, | |
| "loss": 0.0137, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 2.3846153846153846, | |
| "grad_norm": 39.29887771606445, | |
| "learning_rate": 1.904923076923077e-05, | |
| "loss": 0.2474, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 2.4615384615384617, | |
| "grad_norm": 61.3380012512207, | |
| "learning_rate": 1.901846153846154e-05, | |
| "loss": 0.2451, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 2.5384615384615383, | |
| "grad_norm": 0.08102953433990479, | |
| "learning_rate": 1.898769230769231e-05, | |
| "loss": 0.0788, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 2.6153846153846154, | |
| "grad_norm": 0.01904252916574478, | |
| "learning_rate": 1.8956923076923078e-05, | |
| "loss": 0.0667, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 2.6923076923076925, | |
| "grad_norm": 0.021739983931183815, | |
| "learning_rate": 1.892615384615385e-05, | |
| "loss": 0.2218, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 2.769230769230769, | |
| "grad_norm": 65.27315521240234, | |
| "learning_rate": 1.8895384615384617e-05, | |
| "loss": 0.348, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 2.8461538461538463, | |
| "grad_norm": 5.799041271209717, | |
| "learning_rate": 1.8864615384615384e-05, | |
| "loss": 0.213, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 2.9230769230769234, | |
| "grad_norm": 29.148353576660156, | |
| "learning_rate": 1.8833846153846155e-05, | |
| "loss": 0.0291, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.01673489809036255, | |
| "learning_rate": 1.8803076923076923e-05, | |
| "loss": 0.183, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_accuracy": 1.0, | |
| "eval_loss": 0.001464208704419434, | |
| "eval_runtime": 0.7345, | |
| "eval_samples_per_second": 181.068, | |
| "eval_steps_per_second": 23.144, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 3.076923076923077, | |
| "grad_norm": 0.16125129163265228, | |
| "learning_rate": 1.8772307692307694e-05, | |
| "loss": 0.0953, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 3.1538461538461537, | |
| "grad_norm": 1.255500316619873, | |
| "learning_rate": 1.8741538461538462e-05, | |
| "loss": 0.0718, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 3.230769230769231, | |
| "grad_norm": 0.10942380875349045, | |
| "learning_rate": 1.8710769230769233e-05, | |
| "loss": 0.2385, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 3.3076923076923075, | |
| "grad_norm": 0.38341224193573, | |
| "learning_rate": 1.8680000000000004e-05, | |
| "loss": 0.1585, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 3.3846153846153846, | |
| "grad_norm": 5.892263412475586, | |
| "learning_rate": 1.8649230769230772e-05, | |
| "loss": 0.3054, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 3.4615384615384617, | |
| "grad_norm": 90.24942016601562, | |
| "learning_rate": 1.861846153846154e-05, | |
| "loss": 0.1194, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 3.5384615384615383, | |
| "grad_norm": 0.43507635593414307, | |
| "learning_rate": 1.8587692307692307e-05, | |
| "loss": 0.0827, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 3.6153846153846154, | |
| "grad_norm": 14.393282890319824, | |
| "learning_rate": 1.8556923076923078e-05, | |
| "loss": 0.0025, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 3.6923076923076925, | |
| "grad_norm": 0.07155368477106094, | |
| "learning_rate": 1.8526153846153846e-05, | |
| "loss": 0.0185, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 3.769230769230769, | |
| "grad_norm": 0.0020313099958002567, | |
| "learning_rate": 1.8495384615384617e-05, | |
| "loss": 0.0582, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 3.8461538461538463, | |
| "grad_norm": 21.025943756103516, | |
| "learning_rate": 1.8464615384615385e-05, | |
| "loss": 0.1224, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 3.9230769230769234, | |
| "grad_norm": 33.07258224487305, | |
| "learning_rate": 1.8433846153846156e-05, | |
| "loss": 0.0161, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 0.0024726390838623047, | |
| "learning_rate": 1.8403076923076924e-05, | |
| "loss": 0.2256, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_accuracy": 0.9849624060150376, | |
| "eval_loss": 0.038613125681877136, | |
| "eval_runtime": 0.5587, | |
| "eval_samples_per_second": 238.037, | |
| "eval_steps_per_second": 30.426, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 4.076923076923077, | |
| "grad_norm": 112.0116195678711, | |
| "learning_rate": 1.8372307692307695e-05, | |
| "loss": 0.1838, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 4.153846153846154, | |
| "grad_norm": 0.09052282571792603, | |
| "learning_rate": 1.8341538461538462e-05, | |
| "loss": 0.1015, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 4.230769230769231, | |
| "grad_norm": 0.09375716745853424, | |
| "learning_rate": 1.8310769230769233e-05, | |
| "loss": 0.1176, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 4.3076923076923075, | |
| "grad_norm": 101.64923095703125, | |
| "learning_rate": 1.828e-05, | |
| "loss": 0.1395, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 4.384615384615385, | |
| "grad_norm": 0.0043612755835056305, | |
| "learning_rate": 1.824923076923077e-05, | |
| "loss": 0.1515, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 4.461538461538462, | |
| "grad_norm": 9.888139724731445, | |
| "learning_rate": 1.821846153846154e-05, | |
| "loss": 0.2349, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 4.538461538461538, | |
| "grad_norm": 59.99452209472656, | |
| "learning_rate": 1.8187692307692308e-05, | |
| "loss": 0.134, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 4.615384615384615, | |
| "grad_norm": 42.51639938354492, | |
| "learning_rate": 1.815692307692308e-05, | |
| "loss": 0.3639, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 4.6923076923076925, | |
| "grad_norm": 44.57368087768555, | |
| "learning_rate": 1.8126153846153846e-05, | |
| "loss": 0.2551, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 4.769230769230769, | |
| "grad_norm": 9.872251510620117, | |
| "learning_rate": 1.8095384615384618e-05, | |
| "loss": 0.2064, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 4.846153846153846, | |
| "grad_norm": 79.4139404296875, | |
| "learning_rate": 1.806461538461539e-05, | |
| "loss": 0.2031, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 4.923076923076923, | |
| "grad_norm": 0.005071394145488739, | |
| "learning_rate": 1.8033846153846156e-05, | |
| "loss": 0.0281, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 0.024555927142500877, | |
| "learning_rate": 1.8003076923076924e-05, | |
| "loss": 0.0555, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_accuracy": 0.9849624060150376, | |
| "eval_loss": 0.03397071361541748, | |
| "eval_runtime": 0.5652, | |
| "eval_samples_per_second": 235.305, | |
| "eval_steps_per_second": 30.077, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 5.076923076923077, | |
| "grad_norm": 0.2948272228240967, | |
| "learning_rate": 1.7972307692307692e-05, | |
| "loss": 0.1079, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 5.153846153846154, | |
| "grad_norm": 0.010563166812062263, | |
| "learning_rate": 1.7941538461538463e-05, | |
| "loss": 0.062, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 5.230769230769231, | |
| "grad_norm": 0.1277681589126587, | |
| "learning_rate": 1.791076923076923e-05, | |
| "loss": 0.5324, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 5.3076923076923075, | |
| "grad_norm": 18.837158203125, | |
| "learning_rate": 1.788e-05, | |
| "loss": 0.0799, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 5.384615384615385, | |
| "grad_norm": 0.03636413440108299, | |
| "learning_rate": 1.784923076923077e-05, | |
| "loss": 0.1586, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 5.461538461538462, | |
| "grad_norm": 0.7668882012367249, | |
| "learning_rate": 1.781846153846154e-05, | |
| "loss": 0.2337, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 5.538461538461538, | |
| "grad_norm": 15.153043746948242, | |
| "learning_rate": 1.778769230769231e-05, | |
| "loss": 0.1419, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 5.615384615384615, | |
| "grad_norm": 0.12611930072307587, | |
| "learning_rate": 1.775692307692308e-05, | |
| "loss": 0.1464, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 5.6923076923076925, | |
| "grad_norm": 31.0040225982666, | |
| "learning_rate": 1.7726153846153847e-05, | |
| "loss": 0.1024, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 5.769230769230769, | |
| "grad_norm": 0.11015983670949936, | |
| "learning_rate": 1.7695384615384618e-05, | |
| "loss": 0.1542, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 5.846153846153846, | |
| "grad_norm": 0.1276661902666092, | |
| "learning_rate": 1.7664615384615386e-05, | |
| "loss": 0.0896, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 5.923076923076923, | |
| "grad_norm": 23.176246643066406, | |
| "learning_rate": 1.7633846153846153e-05, | |
| "loss": 0.2576, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "grad_norm": 0.016006305813789368, | |
| "learning_rate": 1.7603076923076924e-05, | |
| "loss": 0.0713, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_accuracy": 0.9924812030075187, | |
| "eval_loss": 0.07276736199855804, | |
| "eval_runtime": 0.6348, | |
| "eval_samples_per_second": 209.524, | |
| "eval_steps_per_second": 26.781, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 6.076923076923077, | |
| "grad_norm": 0.00783840287476778, | |
| "learning_rate": 1.7572307692307692e-05, | |
| "loss": 0.1411, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 6.153846153846154, | |
| "grad_norm": 43.413272857666016, | |
| "learning_rate": 1.7541538461538463e-05, | |
| "loss": 0.1076, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 6.230769230769231, | |
| "grad_norm": 1.725425362586975, | |
| "learning_rate": 1.751076923076923e-05, | |
| "loss": 0.2663, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 6.3076923076923075, | |
| "grad_norm": 0.03434762358665466, | |
| "learning_rate": 1.7480000000000002e-05, | |
| "loss": 0.0053, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 6.384615384615385, | |
| "grad_norm": 0.030778298154473305, | |
| "learning_rate": 1.7449230769230773e-05, | |
| "loss": 0.0819, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 6.461538461538462, | |
| "grad_norm": 0.064858078956604, | |
| "learning_rate": 1.741846153846154e-05, | |
| "loss": 0.0463, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 6.538461538461538, | |
| "grad_norm": 10.116390228271484, | |
| "learning_rate": 1.738769230769231e-05, | |
| "loss": 0.4003, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 6.615384615384615, | |
| "grad_norm": 53.98569107055664, | |
| "learning_rate": 1.7356923076923076e-05, | |
| "loss": 0.3175, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 6.6923076923076925, | |
| "grad_norm": 9.284001350402832, | |
| "learning_rate": 1.7326153846153847e-05, | |
| "loss": 0.2402, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 6.769230769230769, | |
| "grad_norm": 0.22148916125297546, | |
| "learning_rate": 1.7295384615384615e-05, | |
| "loss": 0.0054, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 6.846153846153846, | |
| "grad_norm": 0.05801761895418167, | |
| "learning_rate": 1.7264615384615386e-05, | |
| "loss": 0.0524, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 6.923076923076923, | |
| "grad_norm": 159.55609130859375, | |
| "learning_rate": 1.7233846153846154e-05, | |
| "loss": 0.178, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "grad_norm": 0.6641747951507568, | |
| "learning_rate": 1.7203076923076925e-05, | |
| "loss": 0.0082, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "eval_accuracy": 0.9924812030075187, | |
| "eval_loss": 0.04108954966068268, | |
| "eval_runtime": 0.5705, | |
| "eval_samples_per_second": 233.148, | |
| "eval_steps_per_second": 29.801, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 7.076923076923077, | |
| "grad_norm": 23.453298568725586, | |
| "learning_rate": 1.7172307692307696e-05, | |
| "loss": 0.3437, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 7.153846153846154, | |
| "grad_norm": 0.03930990770459175, | |
| "learning_rate": 1.7141538461538464e-05, | |
| "loss": 0.2261, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 7.230769230769231, | |
| "grad_norm": 20.72200584411621, | |
| "learning_rate": 1.711076923076923e-05, | |
| "loss": 0.1309, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 7.3076923076923075, | |
| "grad_norm": 39.58359909057617, | |
| "learning_rate": 1.7080000000000002e-05, | |
| "loss": 0.1134, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 7.384615384615385, | |
| "grad_norm": 1.5824700593948364, | |
| "learning_rate": 1.704923076923077e-05, | |
| "loss": 0.0574, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 7.461538461538462, | |
| "grad_norm": 22.621976852416992, | |
| "learning_rate": 1.7018461538461538e-05, | |
| "loss": 0.1681, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 7.538461538461538, | |
| "grad_norm": 16.806806564331055, | |
| "learning_rate": 1.698769230769231e-05, | |
| "loss": 0.1348, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 7.615384615384615, | |
| "grad_norm": 15.27640151977539, | |
| "learning_rate": 1.6956923076923077e-05, | |
| "loss": 0.1698, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 7.6923076923076925, | |
| "grad_norm": 39.200557708740234, | |
| "learning_rate": 1.6926153846153848e-05, | |
| "loss": 0.0372, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 7.769230769230769, | |
| "grad_norm": 2.748015880584717, | |
| "learning_rate": 1.6895384615384615e-05, | |
| "loss": 0.0288, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 7.846153846153846, | |
| "grad_norm": 15.786018371582031, | |
| "learning_rate": 1.6864615384615387e-05, | |
| "loss": 0.252, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 7.923076923076923, | |
| "grad_norm": 0.021507665514945984, | |
| "learning_rate": 1.6833846153846158e-05, | |
| "loss": 0.0895, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "grad_norm": 0.04156835377216339, | |
| "learning_rate": 1.6803076923076925e-05, | |
| "loss": 0.0085, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_accuracy": 0.9849624060150376, | |
| "eval_loss": 0.10019110888242722, | |
| "eval_runtime": 0.5783, | |
| "eval_samples_per_second": 229.992, | |
| "eval_steps_per_second": 29.397, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 8.076923076923077, | |
| "grad_norm": 0.013579009100794792, | |
| "learning_rate": 1.6772307692307693e-05, | |
| "loss": 0.1722, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 8.153846153846153, | |
| "grad_norm": 0.0050528221763670444, | |
| "learning_rate": 1.674153846153846e-05, | |
| "loss": 0.0511, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 8.23076923076923, | |
| "grad_norm": 0.015883196145296097, | |
| "learning_rate": 1.6710769230769232e-05, | |
| "loss": 0.134, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 8.307692307692308, | |
| "grad_norm": 0.1903264969587326, | |
| "learning_rate": 1.668e-05, | |
| "loss": 0.0889, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 8.384615384615385, | |
| "grad_norm": 0.02674732729792595, | |
| "learning_rate": 1.664923076923077e-05, | |
| "loss": 0.0728, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 8.461538461538462, | |
| "grad_norm": 0.04318476840853691, | |
| "learning_rate": 1.661846153846154e-05, | |
| "loss": 0.1575, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 8.538461538461538, | |
| "grad_norm": 0.01189767848700285, | |
| "learning_rate": 1.658769230769231e-05, | |
| "loss": 0.0308, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 8.615384615384615, | |
| "grad_norm": 0.1701788455247879, | |
| "learning_rate": 1.655692307692308e-05, | |
| "loss": 0.1881, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 8.692307692307692, | |
| "grad_norm": 4.718733787536621, | |
| "learning_rate": 1.6526153846153848e-05, | |
| "loss": 0.0678, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 8.76923076923077, | |
| "grad_norm": 0.16493193805217743, | |
| "learning_rate": 1.6495384615384616e-05, | |
| "loss": 0.172, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 8.846153846153847, | |
| "grad_norm": 0.013741947710514069, | |
| "learning_rate": 1.6464615384615387e-05, | |
| "loss": 0.1544, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 8.923076923076923, | |
| "grad_norm": 0.008533033542335033, | |
| "learning_rate": 1.6433846153846155e-05, | |
| "loss": 0.0382, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "grad_norm": 0.01939828135073185, | |
| "learning_rate": 1.6403076923076922e-05, | |
| "loss": 0.0733, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_accuracy": 1.0, | |
| "eval_loss": 0.00041508162394165993, | |
| "eval_runtime": 0.5615, | |
| "eval_samples_per_second": 236.857, | |
| "eval_steps_per_second": 30.275, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 9.076923076923077, | |
| "grad_norm": 0.029882851988077164, | |
| "learning_rate": 1.6372307692307693e-05, | |
| "loss": 0.1645, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 9.153846153846153, | |
| "grad_norm": 0.02415158972144127, | |
| "learning_rate": 1.634153846153846e-05, | |
| "loss": 0.2324, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 9.23076923076923, | |
| "grad_norm": 0.011508429422974586, | |
| "learning_rate": 1.6310769230769232e-05, | |
| "loss": 0.0056, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 9.307692307692308, | |
| "grad_norm": 0.49797359108924866, | |
| "learning_rate": 1.628e-05, | |
| "loss": 0.0653, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 9.384615384615385, | |
| "grad_norm": 0.024000057950615883, | |
| "learning_rate": 1.624923076923077e-05, | |
| "loss": 0.0544, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 9.461538461538462, | |
| "grad_norm": 0.04410143569111824, | |
| "learning_rate": 1.6218461538461542e-05, | |
| "loss": 0.1128, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 9.538461538461538, | |
| "grad_norm": 0.10039117932319641, | |
| "learning_rate": 1.618769230769231e-05, | |
| "loss": 0.0244, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 9.615384615384615, | |
| "grad_norm": 0.4355177879333496, | |
| "learning_rate": 1.6156923076923078e-05, | |
| "loss": 0.1553, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 9.692307692307692, | |
| "grad_norm": 0.01548450905829668, | |
| "learning_rate": 1.6126153846153845e-05, | |
| "loss": 0.0277, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 9.76923076923077, | |
| "grad_norm": 0.010753784328699112, | |
| "learning_rate": 1.6095384615384616e-05, | |
| "loss": 0.1313, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 9.846153846153847, | |
| "grad_norm": 4.099564552307129, | |
| "learning_rate": 1.6064615384615384e-05, | |
| "loss": 0.028, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 9.923076923076923, | |
| "grad_norm": 0.0062192706391215324, | |
| "learning_rate": 1.6033846153846155e-05, | |
| "loss": 0.0388, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 0.002432856010273099, | |
| "learning_rate": 1.6003076923076923e-05, | |
| "loss": 0.0215, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_accuracy": 1.0, | |
| "eval_loss": 0.0003331384214106947, | |
| "eval_runtime": 0.5589, | |
| "eval_samples_per_second": 237.973, | |
| "eval_steps_per_second": 30.418, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 10.076923076923077, | |
| "grad_norm": 40.07775115966797, | |
| "learning_rate": 1.5972307692307694e-05, | |
| "loss": 0.1465, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 10.153846153846153, | |
| "grad_norm": 0.03718170151114464, | |
| "learning_rate": 1.5941538461538465e-05, | |
| "loss": 0.0022, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 10.23076923076923, | |
| "grad_norm": 0.006770229432731867, | |
| "learning_rate": 1.5910769230769233e-05, | |
| "loss": 0.1207, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 10.307692307692308, | |
| "grad_norm": 1.803260326385498, | |
| "learning_rate": 1.588e-05, | |
| "loss": 0.1151, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 10.384615384615385, | |
| "grad_norm": 34.21731185913086, | |
| "learning_rate": 1.584923076923077e-05, | |
| "loss": 0.1446, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 10.461538461538462, | |
| "grad_norm": 0.011409844271838665, | |
| "learning_rate": 1.581846153846154e-05, | |
| "loss": 0.0283, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 10.538461538461538, | |
| "grad_norm": 0.010058027692139149, | |
| "learning_rate": 1.5787692307692307e-05, | |
| "loss": 0.133, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 10.615384615384615, | |
| "grad_norm": 0.9373985528945923, | |
| "learning_rate": 1.5756923076923078e-05, | |
| "loss": 0.0455, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 10.692307692307692, | |
| "grad_norm": 0.008180659264326096, | |
| "learning_rate": 1.5726153846153846e-05, | |
| "loss": 0.0249, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 10.76923076923077, | |
| "grad_norm": 66.53052520751953, | |
| "learning_rate": 1.5695384615384617e-05, | |
| "loss": 0.1623, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 10.846153846153847, | |
| "grad_norm": 0.008386868052184582, | |
| "learning_rate": 1.5664615384615388e-05, | |
| "loss": 0.0183, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 10.923076923076923, | |
| "grad_norm": 6.9249043464660645, | |
| "learning_rate": 1.5633846153846156e-05, | |
| "loss": 0.0448, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "grad_norm": 0.0051022847183048725, | |
| "learning_rate": 1.5603076923076927e-05, | |
| "loss": 0.0501, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "eval_accuracy": 0.9774436090225563, | |
| "eval_loss": 0.0634198933839798, | |
| "eval_runtime": 0.56, | |
| "eval_samples_per_second": 237.505, | |
| "eval_steps_per_second": 30.358, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 11.076923076923077, | |
| "grad_norm": 51.06268310546875, | |
| "learning_rate": 1.5572307692307694e-05, | |
| "loss": 0.1581, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 11.153846153846153, | |
| "grad_norm": 22.144161224365234, | |
| "learning_rate": 1.5541538461538462e-05, | |
| "loss": 0.0681, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 11.23076923076923, | |
| "grad_norm": 2.6330926418304443, | |
| "learning_rate": 1.551076923076923e-05, | |
| "loss": 0.0616, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 11.307692307692308, | |
| "grad_norm": 0.04433123394846916, | |
| "learning_rate": 1.548e-05, | |
| "loss": 0.1252, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 11.384615384615385, | |
| "grad_norm": 23.456703186035156, | |
| "learning_rate": 1.544923076923077e-05, | |
| "loss": 0.0639, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 11.461538461538462, | |
| "grad_norm": 1.3539597988128662, | |
| "learning_rate": 1.541846153846154e-05, | |
| "loss": 0.191, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 11.538461538461538, | |
| "grad_norm": 0.002958856290206313, | |
| "learning_rate": 1.5387692307692307e-05, | |
| "loss": 0.0002, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 11.615384615384615, | |
| "grad_norm": 0.001320499461144209, | |
| "learning_rate": 1.535692307692308e-05, | |
| "loss": 0.1247, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 11.692307692307692, | |
| "grad_norm": 26.650876998901367, | |
| "learning_rate": 1.532615384615385e-05, | |
| "loss": 0.2736, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 11.76923076923077, | |
| "grad_norm": 0.6191799640655518, | |
| "learning_rate": 1.5295384615384617e-05, | |
| "loss": 0.0123, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 11.846153846153847, | |
| "grad_norm": 0.16651397943496704, | |
| "learning_rate": 1.5264615384615385e-05, | |
| "loss": 0.071, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 11.923076923076923, | |
| "grad_norm": 0.5216618180274963, | |
| "learning_rate": 1.5233846153846154e-05, | |
| "loss": 0.1478, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "grad_norm": 0.0047162421979010105, | |
| "learning_rate": 1.5203076923076925e-05, | |
| "loss": 0.0338, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "eval_accuracy": 0.9924812030075187, | |
| "eval_loss": 0.024751892313361168, | |
| "eval_runtime": 0.5573, | |
| "eval_samples_per_second": 238.65, | |
| "eval_steps_per_second": 30.504, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 12.076923076923077, | |
| "grad_norm": 0.01325373724102974, | |
| "learning_rate": 1.5172307692307693e-05, | |
| "loss": 0.1432, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 12.153846153846153, | |
| "grad_norm": 24.465373992919922, | |
| "learning_rate": 1.5141538461538463e-05, | |
| "loss": 0.0911, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 12.23076923076923, | |
| "grad_norm": 0.01066196896135807, | |
| "learning_rate": 1.5110769230769232e-05, | |
| "loss": 0.0654, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 12.307692307692308, | |
| "grad_norm": 30.326496124267578, | |
| "learning_rate": 1.5080000000000001e-05, | |
| "loss": 0.0937, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 12.384615384615385, | |
| "grad_norm": 15.133655548095703, | |
| "learning_rate": 1.504923076923077e-05, | |
| "loss": 0.1265, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 12.461538461538462, | |
| "grad_norm": 0.009278634563088417, | |
| "learning_rate": 1.501846153846154e-05, | |
| "loss": 0.0028, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 12.538461538461538, | |
| "grad_norm": 34.179298400878906, | |
| "learning_rate": 1.498769230769231e-05, | |
| "loss": 0.0711, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 12.615384615384615, | |
| "grad_norm": 0.539430558681488, | |
| "learning_rate": 1.4956923076923077e-05, | |
| "loss": 0.0683, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 12.692307692307692, | |
| "grad_norm": 0.504645586013794, | |
| "learning_rate": 1.4926153846153848e-05, | |
| "loss": 0.1046, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 12.76923076923077, | |
| "grad_norm": 36.87862777709961, | |
| "learning_rate": 1.4895384615384616e-05, | |
| "loss": 0.0515, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 12.846153846153847, | |
| "grad_norm": 0.005421087611466646, | |
| "learning_rate": 1.4864615384615385e-05, | |
| "loss": 0.0943, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 12.923076923076923, | |
| "grad_norm": 54.710540771484375, | |
| "learning_rate": 1.4833846153846155e-05, | |
| "loss": 0.2085, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "grad_norm": 0.004657375160604715, | |
| "learning_rate": 1.4803076923076924e-05, | |
| "loss": 0.0045, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "eval_accuracy": 0.9849624060150376, | |
| "eval_loss": 0.09390003234148026, | |
| "eval_runtime": 0.5702, | |
| "eval_samples_per_second": 233.258, | |
| "eval_steps_per_second": 29.815, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 13.076923076923077, | |
| "grad_norm": 0.014458782970905304, | |
| "learning_rate": 1.4772307692307692e-05, | |
| "loss": 0.0008, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 13.153846153846153, | |
| "grad_norm": 4.515313625335693, | |
| "learning_rate": 1.4741538461538463e-05, | |
| "loss": 0.0232, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 13.23076923076923, | |
| "grad_norm": 20.115612030029297, | |
| "learning_rate": 1.4710769230769232e-05, | |
| "loss": 0.0343, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 13.307692307692308, | |
| "grad_norm": 0.0025744156446307898, | |
| "learning_rate": 1.4680000000000002e-05, | |
| "loss": 0.2143, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 13.384615384615385, | |
| "grad_norm": 0.005626370664685965, | |
| "learning_rate": 1.4649230769230771e-05, | |
| "loss": 0.1297, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 13.461538461538462, | |
| "grad_norm": 1.902580738067627, | |
| "learning_rate": 1.4618461538461539e-05, | |
| "loss": 0.0052, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 13.538461538461538, | |
| "grad_norm": 0.0006718473159708083, | |
| "learning_rate": 1.458769230769231e-05, | |
| "loss": 0.1279, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 13.615384615384615, | |
| "grad_norm": 0.40659937262535095, | |
| "learning_rate": 1.4556923076923078e-05, | |
| "loss": 0.0005, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 13.692307692307692, | |
| "grad_norm": 0.0035146658774465322, | |
| "learning_rate": 1.4526153846153847e-05, | |
| "loss": 0.0778, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 13.76923076923077, | |
| "grad_norm": 0.05003712326288223, | |
| "learning_rate": 1.4495384615384616e-05, | |
| "loss": 0.002, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 13.846153846153847, | |
| "grad_norm": 0.0015587609959766269, | |
| "learning_rate": 1.4464615384615386e-05, | |
| "loss": 0.1025, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 13.923076923076923, | |
| "grad_norm": 0.011278959922492504, | |
| "learning_rate": 1.4433846153846155e-05, | |
| "loss": 0.141, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "grad_norm": 0.002003464847803116, | |
| "learning_rate": 1.4403076923076925e-05, | |
| "loss": 0.0013, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "eval_accuracy": 0.9849624060150376, | |
| "eval_loss": 0.03729831054806709, | |
| "eval_runtime": 0.5723, | |
| "eval_samples_per_second": 232.408, | |
| "eval_steps_per_second": 29.706, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 14.076923076923077, | |
| "grad_norm": 0.023831067606806755, | |
| "learning_rate": 1.4372307692307694e-05, | |
| "loss": 0.0016, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 14.153846153846153, | |
| "grad_norm": 23.71483612060547, | |
| "learning_rate": 1.4341538461538462e-05, | |
| "loss": 0.1814, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 14.23076923076923, | |
| "grad_norm": 0.0006417655386030674, | |
| "learning_rate": 1.4310769230769233e-05, | |
| "loss": 0.1044, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 14.307692307692308, | |
| "grad_norm": 0.003414665814489126, | |
| "learning_rate": 1.428e-05, | |
| "loss": 0.1385, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 14.384615384615385, | |
| "grad_norm": 0.9071229100227356, | |
| "learning_rate": 1.4249230769230772e-05, | |
| "loss": 0.1259, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 14.461538461538462, | |
| "grad_norm": 15.944780349731445, | |
| "learning_rate": 1.421846153846154e-05, | |
| "loss": 0.1491, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 14.538461538461538, | |
| "grad_norm": 0.040743470191955566, | |
| "learning_rate": 1.4187692307692309e-05, | |
| "loss": 0.0304, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 14.615384615384615, | |
| "grad_norm": 0.09457922726869583, | |
| "learning_rate": 1.4156923076923076e-05, | |
| "loss": 0.1211, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 14.692307692307692, | |
| "grad_norm": 0.19244757294654846, | |
| "learning_rate": 1.4126153846153847e-05, | |
| "loss": 0.1062, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 14.76923076923077, | |
| "grad_norm": 1.560502529144287, | |
| "learning_rate": 1.4095384615384617e-05, | |
| "loss": 0.1089, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 14.846153846153847, | |
| "grad_norm": 0.030071793124079704, | |
| "learning_rate": 1.4064615384615386e-05, | |
| "loss": 0.0024, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 14.923076923076923, | |
| "grad_norm": 0.0013134493492543697, | |
| "learning_rate": 1.4033846153846156e-05, | |
| "loss": 0.0012, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "grad_norm": 0.0009961730102077127, | |
| "learning_rate": 1.4003076923076923e-05, | |
| "loss": 0.0002, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "eval_accuracy": 0.9924812030075187, | |
| "eval_loss": 0.0515468567609787, | |
| "eval_runtime": 0.5584, | |
| "eval_samples_per_second": 238.17, | |
| "eval_steps_per_second": 30.443, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 15.076923076923077, | |
| "grad_norm": 0.0006362455897033215, | |
| "learning_rate": 1.3972307692307694e-05, | |
| "loss": 0.0451, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 15.153846153846153, | |
| "grad_norm": 0.0071708387695252895, | |
| "learning_rate": 1.3941538461538462e-05, | |
| "loss": 0.0534, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 15.23076923076923, | |
| "grad_norm": 0.006367762107402086, | |
| "learning_rate": 1.3910769230769232e-05, | |
| "loss": 0.0013, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 15.307692307692308, | |
| "grad_norm": 1.7080107927322388, | |
| "learning_rate": 1.3880000000000001e-05, | |
| "loss": 0.0584, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 15.384615384615385, | |
| "grad_norm": 48.49470901489258, | |
| "learning_rate": 1.384923076923077e-05, | |
| "loss": 0.0883, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 15.461538461538462, | |
| "grad_norm": 0.0020162356086075306, | |
| "learning_rate": 1.3818461538461541e-05, | |
| "loss": 0.0147, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 15.538461538461538, | |
| "grad_norm": 0.11068695038557053, | |
| "learning_rate": 1.3787692307692309e-05, | |
| "loss": 0.0926, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 15.615384615384615, | |
| "grad_norm": 0.001251698238775134, | |
| "learning_rate": 1.3756923076923079e-05, | |
| "loss": 0.011, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 15.692307692307692, | |
| "grad_norm": 0.0013289551716297865, | |
| "learning_rate": 1.3726153846153846e-05, | |
| "loss": 0.0048, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 15.76923076923077, | |
| "grad_norm": 87.35831451416016, | |
| "learning_rate": 1.3695384615384617e-05, | |
| "loss": 0.0228, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 15.846153846153847, | |
| "grad_norm": 0.009976107627153397, | |
| "learning_rate": 1.3664615384615385e-05, | |
| "loss": 0.2276, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 15.923076923076923, | |
| "grad_norm": 0.00704730162397027, | |
| "learning_rate": 1.3633846153846156e-05, | |
| "loss": 0.248, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "grad_norm": 0.0022731584031134844, | |
| "learning_rate": 1.3603076923076924e-05, | |
| "loss": 0.0074, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "eval_accuracy": 1.0, | |
| "eval_loss": 0.0016808852087706327, | |
| "eval_runtime": 0.5593, | |
| "eval_samples_per_second": 237.796, | |
| "eval_steps_per_second": 30.395, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 16.076923076923077, | |
| "grad_norm": 4.551645278930664, | |
| "learning_rate": 1.3572307692307693e-05, | |
| "loss": 0.0013, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 16.153846153846153, | |
| "grad_norm": 0.004595694597810507, | |
| "learning_rate": 1.3541538461538464e-05, | |
| "loss": 0.0017, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 16.23076923076923, | |
| "grad_norm": 0.027721809223294258, | |
| "learning_rate": 1.3510769230769232e-05, | |
| "loss": 0.0054, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 16.307692307692307, | |
| "grad_norm": 159.5991668701172, | |
| "learning_rate": 1.3480000000000001e-05, | |
| "loss": 0.2422, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 16.384615384615383, | |
| "grad_norm": 0.0032202787697315216, | |
| "learning_rate": 1.344923076923077e-05, | |
| "loss": 0.0059, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 16.46153846153846, | |
| "grad_norm": 0.004118644632399082, | |
| "learning_rate": 1.341846153846154e-05, | |
| "loss": 0.0003, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 16.53846153846154, | |
| "grad_norm": 0.796428382396698, | |
| "learning_rate": 1.3387692307692308e-05, | |
| "loss": 0.0837, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 16.615384615384617, | |
| "grad_norm": 0.001536823227070272, | |
| "learning_rate": 1.3356923076923079e-05, | |
| "loss": 0.0004, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 16.692307692307693, | |
| "grad_norm": 0.0012222144287079573, | |
| "learning_rate": 1.3326153846153847e-05, | |
| "loss": 0.1912, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 16.76923076923077, | |
| "grad_norm": 0.001332849613390863, | |
| "learning_rate": 1.3295384615384616e-05, | |
| "loss": 0.0007, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 16.846153846153847, | |
| "grad_norm": 5.964162826538086, | |
| "learning_rate": 1.3264615384615385e-05, | |
| "loss": 0.2174, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 16.923076923076923, | |
| "grad_norm": 0.010293848812580109, | |
| "learning_rate": 1.3233846153846155e-05, | |
| "loss": 0.0657, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "grad_norm": 0.005218516103923321, | |
| "learning_rate": 1.3203076923076926e-05, | |
| "loss": 0.0005, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "eval_accuracy": 0.9924812030075187, | |
| "eval_loss": 0.058793652802705765, | |
| "eval_runtime": 0.5588, | |
| "eval_samples_per_second": 238.01, | |
| "eval_steps_per_second": 30.422, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 17.076923076923077, | |
| "grad_norm": 0.007321541663259268, | |
| "learning_rate": 1.3172307692307694e-05, | |
| "loss": 0.0652, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 17.153846153846153, | |
| "grad_norm": 0.24121706187725067, | |
| "learning_rate": 1.3141538461538463e-05, | |
| "loss": 0.1264, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 17.23076923076923, | |
| "grad_norm": 0.014303168281912804, | |
| "learning_rate": 1.311076923076923e-05, | |
| "loss": 0.0746, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 17.307692307692307, | |
| "grad_norm": 0.004994900431483984, | |
| "learning_rate": 1.3080000000000002e-05, | |
| "loss": 0.0033, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 17.384615384615383, | |
| "grad_norm": 0.0024373421911150217, | |
| "learning_rate": 1.304923076923077e-05, | |
| "loss": 0.0008, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 17.46153846153846, | |
| "grad_norm": 0.0018968008225783706, | |
| "learning_rate": 1.301846153846154e-05, | |
| "loss": 0.0121, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 17.53846153846154, | |
| "grad_norm": 0.004607335664331913, | |
| "learning_rate": 1.2987692307692308e-05, | |
| "loss": 0.0386, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 17.615384615384617, | |
| "grad_norm": 0.0010372723918408155, | |
| "learning_rate": 1.2956923076923078e-05, | |
| "loss": 0.0001, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 17.692307692307693, | |
| "grad_norm": 0.1868879795074463, | |
| "learning_rate": 1.2926153846153849e-05, | |
| "loss": 0.1025, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 17.76923076923077, | |
| "grad_norm": 0.001327532809227705, | |
| "learning_rate": 1.2895384615384616e-05, | |
| "loss": 0.1086, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 17.846153846153847, | |
| "grad_norm": 0.002292274497449398, | |
| "learning_rate": 1.2864615384615386e-05, | |
| "loss": 0.1219, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 17.923076923076923, | |
| "grad_norm": 0.0024085345212370157, | |
| "learning_rate": 1.2833846153846155e-05, | |
| "loss": 0.2003, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "grad_norm": 0.01654043421149254, | |
| "learning_rate": 1.2803076923076925e-05, | |
| "loss": 0.0046, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "eval_accuracy": 0.9849624060150376, | |
| "eval_loss": 0.07149150967597961, | |
| "eval_runtime": 0.6008, | |
| "eval_samples_per_second": 221.365, | |
| "eval_steps_per_second": 28.295, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 18.076923076923077, | |
| "grad_norm": 0.0006843829760327935, | |
| "learning_rate": 1.2772307692307692e-05, | |
| "loss": 0.0572, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 18.153846153846153, | |
| "grad_norm": 0.00446607219055295, | |
| "learning_rate": 1.2741538461538463e-05, | |
| "loss": 0.1223, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 18.23076923076923, | |
| "grad_norm": 60.59695053100586, | |
| "learning_rate": 1.2710769230769231e-05, | |
| "loss": 0.155, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 18.307692307692307, | |
| "grad_norm": 0.001972057158127427, | |
| "learning_rate": 1.268e-05, | |
| "loss": 0.0193, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 18.384615384615383, | |
| "grad_norm": 36.57919692993164, | |
| "learning_rate": 1.264923076923077e-05, | |
| "loss": 0.276, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 18.46153846153846, | |
| "grad_norm": 0.3979909420013428, | |
| "learning_rate": 1.261846153846154e-05, | |
| "loss": 0.0329, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 18.53846153846154, | |
| "grad_norm": 0.05160733684897423, | |
| "learning_rate": 1.258769230769231e-05, | |
| "loss": 0.0179, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 18.615384615384617, | |
| "grad_norm": 51.901119232177734, | |
| "learning_rate": 1.2556923076923078e-05, | |
| "loss": 0.0916, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 18.692307692307693, | |
| "grad_norm": 0.018259704113006592, | |
| "learning_rate": 1.2526153846153848e-05, | |
| "loss": 0.0002, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 18.76923076923077, | |
| "grad_norm": 0.010010771453380585, | |
| "learning_rate": 1.2495384615384615e-05, | |
| "loss": 0.1238, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 18.846153846153847, | |
| "grad_norm": 0.026239359751343727, | |
| "learning_rate": 1.2464615384615386e-05, | |
| "loss": 0.0222, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 18.923076923076923, | |
| "grad_norm": 5.157707214355469, | |
| "learning_rate": 1.2433846153846154e-05, | |
| "loss": 0.0424, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "grad_norm": 0.02478371374309063, | |
| "learning_rate": 1.2403076923076925e-05, | |
| "loss": 0.0618, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "eval_accuracy": 1.0, | |
| "eval_loss": 0.00025587898562662303, | |
| "eval_runtime": 0.5583, | |
| "eval_samples_per_second": 238.234, | |
| "eval_steps_per_second": 30.451, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 19.076923076923077, | |
| "grad_norm": 0.034582603722810745, | |
| "learning_rate": 1.2372307692307693e-05, | |
| "loss": 0.0451, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 19.153846153846153, | |
| "grad_norm": 0.0011788305127993226, | |
| "learning_rate": 1.2341538461538462e-05, | |
| "loss": 0.1372, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 19.23076923076923, | |
| "grad_norm": 0.00216018408536911, | |
| "learning_rate": 1.2310769230769233e-05, | |
| "loss": 0.0001, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 19.307692307692307, | |
| "grad_norm": 0.0017174163367599249, | |
| "learning_rate": 1.2280000000000001e-05, | |
| "loss": 0.0522, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 19.384615384615383, | |
| "grad_norm": 73.49071502685547, | |
| "learning_rate": 1.224923076923077e-05, | |
| "loss": 0.0448, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 19.46153846153846, | |
| "grad_norm": 109.05286407470703, | |
| "learning_rate": 1.221846153846154e-05, | |
| "loss": 0.1094, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 19.53846153846154, | |
| "grad_norm": 46.95461654663086, | |
| "learning_rate": 1.218769230769231e-05, | |
| "loss": 0.0483, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 19.615384615384617, | |
| "grad_norm": 0.001702558365650475, | |
| "learning_rate": 1.2156923076923077e-05, | |
| "loss": 0.0478, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 19.692307692307693, | |
| "grad_norm": 0.016203153878450394, | |
| "learning_rate": 1.2126153846153848e-05, | |
| "loss": 0.0006, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 19.76923076923077, | |
| "grad_norm": 0.001535124727524817, | |
| "learning_rate": 1.2095384615384616e-05, | |
| "loss": 0.0169, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 19.846153846153847, | |
| "grad_norm": 0.0012106726644560695, | |
| "learning_rate": 1.2064615384615385e-05, | |
| "loss": 0.0004, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 19.923076923076923, | |
| "grad_norm": 17.061466217041016, | |
| "learning_rate": 1.2033846153846154e-05, | |
| "loss": 0.1205, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "grad_norm": 0.009339146316051483, | |
| "learning_rate": 1.2003076923076924e-05, | |
| "loss": 0.0007, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "eval_accuracy": 0.9849624060150376, | |
| "eval_loss": 0.06972503662109375, | |
| "eval_runtime": 0.5563, | |
| "eval_samples_per_second": 239.076, | |
| "eval_steps_per_second": 30.559, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 20.076923076923077, | |
| "grad_norm": 0.009359082207083702, | |
| "learning_rate": 1.1972307692307695e-05, | |
| "loss": 0.1452, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 20.153846153846153, | |
| "grad_norm": 0.06399708241224289, | |
| "learning_rate": 1.1941538461538463e-05, | |
| "loss": 0.014, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 20.23076923076923, | |
| "grad_norm": 0.002059635240584612, | |
| "learning_rate": 1.1910769230769232e-05, | |
| "loss": 0.0001, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 20.307692307692307, | |
| "grad_norm": 31.44277572631836, | |
| "learning_rate": 1.188e-05, | |
| "loss": 0.1186, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 20.384615384615383, | |
| "grad_norm": 0.004387051798403263, | |
| "learning_rate": 1.1849230769230771e-05, | |
| "loss": 0.0004, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 20.46153846153846, | |
| "grad_norm": 0.0017380157951265574, | |
| "learning_rate": 1.1818461538461539e-05, | |
| "loss": 0.1127, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 20.53846153846154, | |
| "grad_norm": 0.01089541893452406, | |
| "learning_rate": 1.178769230769231e-05, | |
| "loss": 0.0869, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 20.615384615384617, | |
| "grad_norm": 0.0011546051828190684, | |
| "learning_rate": 1.1756923076923077e-05, | |
| "loss": 0.2089, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 20.692307692307693, | |
| "grad_norm": 0.003806912340223789, | |
| "learning_rate": 1.1726153846153847e-05, | |
| "loss": 0.1001, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 20.76923076923077, | |
| "grad_norm": 0.000958717311732471, | |
| "learning_rate": 1.1695384615384618e-05, | |
| "loss": 0.183, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 20.846153846153847, | |
| "grad_norm": 0.18556897342205048, | |
| "learning_rate": 1.1664615384615386e-05, | |
| "loss": 0.0003, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 20.923076923076923, | |
| "grad_norm": 0.0016327186021953821, | |
| "learning_rate": 1.1633846153846155e-05, | |
| "loss": 0.1264, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 21.0, | |
| "grad_norm": 0.002433057175949216, | |
| "learning_rate": 1.1603076923076924e-05, | |
| "loss": 0.0001, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 21.0, | |
| "eval_accuracy": 0.9774436090225563, | |
| "eval_loss": 0.11052080243825912, | |
| "eval_runtime": 0.5588, | |
| "eval_samples_per_second": 238.023, | |
| "eval_steps_per_second": 30.424, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 21.076923076923077, | |
| "grad_norm": 0.004819987341761589, | |
| "learning_rate": 1.1572307692307694e-05, | |
| "loss": 0.0602, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 21.153846153846153, | |
| "grad_norm": 0.0032386251259595156, | |
| "learning_rate": 1.1541538461538461e-05, | |
| "loss": 0.0086, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 21.23076923076923, | |
| "grad_norm": 0.0015244975220412016, | |
| "learning_rate": 1.1510769230769232e-05, | |
| "loss": 0.0002, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 21.307692307692307, | |
| "grad_norm": 0.0007208559545688331, | |
| "learning_rate": 1.148e-05, | |
| "loss": 0.0108, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 21.384615384615383, | |
| "grad_norm": 2.059755325317383, | |
| "learning_rate": 1.144923076923077e-05, | |
| "loss": 0.0232, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 21.46153846153846, | |
| "grad_norm": 0.0012232252629473805, | |
| "learning_rate": 1.141846153846154e-05, | |
| "loss": 0.0, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 21.53846153846154, | |
| "grad_norm": 0.07696986198425293, | |
| "learning_rate": 1.1387692307692308e-05, | |
| "loss": 0.2859, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 21.615384615384617, | |
| "grad_norm": 61.04636001586914, | |
| "learning_rate": 1.135692307692308e-05, | |
| "loss": 0.1596, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 21.692307692307693, | |
| "grad_norm": 0.001321778865531087, | |
| "learning_rate": 1.1326153846153847e-05, | |
| "loss": 0.134, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 21.76923076923077, | |
| "grad_norm": 0.08154579252004623, | |
| "learning_rate": 1.1295384615384617e-05, | |
| "loss": 0.0001, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 21.846153846153847, | |
| "grad_norm": 0.007910426706075668, | |
| "learning_rate": 1.1264615384615384e-05, | |
| "loss": 0.1725, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 21.923076923076923, | |
| "grad_norm": 0.010755512863397598, | |
| "learning_rate": 1.1233846153846155e-05, | |
| "loss": 0.0768, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 22.0, | |
| "grad_norm": 12.361237525939941, | |
| "learning_rate": 1.1203076923076923e-05, | |
| "loss": 0.0214, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 22.0, | |
| "eval_accuracy": 0.9849624060150376, | |
| "eval_loss": 0.09297900646924973, | |
| "eval_runtime": 0.5599, | |
| "eval_samples_per_second": 237.551, | |
| "eval_steps_per_second": 30.364, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 22.076923076923077, | |
| "grad_norm": 10.506192207336426, | |
| "learning_rate": 1.1172307692307694e-05, | |
| "loss": 0.008, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 22.153846153846153, | |
| "grad_norm": 0.0015517654828727245, | |
| "learning_rate": 1.1141538461538462e-05, | |
| "loss": 0.0735, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 22.23076923076923, | |
| "grad_norm": 42.15374755859375, | |
| "learning_rate": 1.1110769230769231e-05, | |
| "loss": 0.0698, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 22.307692307692307, | |
| "grad_norm": 0.003171928459778428, | |
| "learning_rate": 1.1080000000000002e-05, | |
| "loss": 0.0475, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 22.384615384615383, | |
| "grad_norm": 0.0019311098149046302, | |
| "learning_rate": 1.104923076923077e-05, | |
| "loss": 0.0003, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 22.46153846153846, | |
| "grad_norm": 0.0021256860345602036, | |
| "learning_rate": 1.101846153846154e-05, | |
| "loss": 0.0308, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 22.53846153846154, | |
| "grad_norm": 0.01151897944509983, | |
| "learning_rate": 1.0987692307692309e-05, | |
| "loss": 0.123, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 22.615384615384617, | |
| "grad_norm": 0.009536568075418472, | |
| "learning_rate": 1.0956923076923078e-05, | |
| "loss": 0.0018, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 22.692307692307693, | |
| "grad_norm": 0.04015596956014633, | |
| "learning_rate": 1.0926153846153846e-05, | |
| "loss": 0.0961, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 22.76923076923077, | |
| "grad_norm": 0.009416976943612099, | |
| "learning_rate": 1.0895384615384617e-05, | |
| "loss": 0.1117, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 22.846153846153847, | |
| "grad_norm": 0.0018806488951668143, | |
| "learning_rate": 1.0864615384615385e-05, | |
| "loss": 0.0012, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 22.923076923076923, | |
| "grad_norm": 0.006934476085007191, | |
| "learning_rate": 1.0833846153846154e-05, | |
| "loss": 0.0016, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 23.0, | |
| "grad_norm": 0.0018909511854872108, | |
| "learning_rate": 1.0803076923076925e-05, | |
| "loss": 0.0004, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 23.0, | |
| "eval_accuracy": 0.9924812030075187, | |
| "eval_loss": 0.027222247794270515, | |
| "eval_runtime": 0.5445, | |
| "eval_samples_per_second": 244.243, | |
| "eval_steps_per_second": 31.219, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 23.076923076923077, | |
| "grad_norm": 0.0007299839635379612, | |
| "learning_rate": 1.0772307692307693e-05, | |
| "loss": 0.0456, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 23.153846153846153, | |
| "grad_norm": 0.0018170330440625548, | |
| "learning_rate": 1.0741538461538464e-05, | |
| "loss": 0.0001, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 23.23076923076923, | |
| "grad_norm": 0.7225848436355591, | |
| "learning_rate": 1.0710769230769232e-05, | |
| "loss": 0.0004, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 23.307692307692307, | |
| "grad_norm": 0.0020151655189692974, | |
| "learning_rate": 1.0680000000000001e-05, | |
| "loss": 0.0014, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 23.384615384615383, | |
| "grad_norm": 0.00038547179428860545, | |
| "learning_rate": 1.0649230769230769e-05, | |
| "loss": 0.0051, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 23.46153846153846, | |
| "grad_norm": 0.00043245829874649644, | |
| "learning_rate": 1.061846153846154e-05, | |
| "loss": 0.1965, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 23.53846153846154, | |
| "grad_norm": 0.46839669346809387, | |
| "learning_rate": 1.0587692307692308e-05, | |
| "loss": 0.0002, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 23.615384615384617, | |
| "grad_norm": 0.008953952230513096, | |
| "learning_rate": 1.0556923076923079e-05, | |
| "loss": 0.1007, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 23.692307692307693, | |
| "grad_norm": 0.05692081153392792, | |
| "learning_rate": 1.0526153846153846e-05, | |
| "loss": 0.0562, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 23.76923076923077, | |
| "grad_norm": 0.004630447365343571, | |
| "learning_rate": 1.0495384615384616e-05, | |
| "loss": 0.0013, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 23.846153846153847, | |
| "grad_norm": 0.002413865178823471, | |
| "learning_rate": 1.0464615384615387e-05, | |
| "loss": 0.0804, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 23.923076923076923, | |
| "grad_norm": 0.002768196165561676, | |
| "learning_rate": 1.0433846153846155e-05, | |
| "loss": 0.0004, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 24.0, | |
| "grad_norm": 0.00019644868734758347, | |
| "learning_rate": 1.0403076923076924e-05, | |
| "loss": 0.1619, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 24.0, | |
| "eval_accuracy": 1.0, | |
| "eval_loss": 0.0024267190601676702, | |
| "eval_runtime": 0.6209, | |
| "eval_samples_per_second": 214.222, | |
| "eval_steps_per_second": 27.382, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 24.076923076923077, | |
| "grad_norm": 34.10586166381836, | |
| "learning_rate": 1.0372307692307693e-05, | |
| "loss": 0.1111, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 24.153846153846153, | |
| "grad_norm": 0.07522152364253998, | |
| "learning_rate": 1.0341538461538463e-05, | |
| "loss": 0.0008, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 24.23076923076923, | |
| "grad_norm": 92.14463806152344, | |
| "learning_rate": 1.031076923076923e-05, | |
| "loss": 0.121, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 24.307692307692307, | |
| "grad_norm": 0.0009277680655941367, | |
| "learning_rate": 1.0280000000000002e-05, | |
| "loss": 0.1377, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 24.384615384615383, | |
| "grad_norm": 0.006104259751737118, | |
| "learning_rate": 1.024923076923077e-05, | |
| "loss": 0.0001, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 24.46153846153846, | |
| "grad_norm": 53.126731872558594, | |
| "learning_rate": 1.0218461538461539e-05, | |
| "loss": 0.0853, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 24.53846153846154, | |
| "grad_norm": 0.0030978797003626823, | |
| "learning_rate": 1.018769230769231e-05, | |
| "loss": 0.181, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 24.615384615384617, | |
| "grad_norm": 0.6085143685340881, | |
| "learning_rate": 1.0156923076923077e-05, | |
| "loss": 0.0002, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 24.692307692307693, | |
| "grad_norm": 0.015882331877946854, | |
| "learning_rate": 1.0126153846153849e-05, | |
| "loss": 0.0001, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 24.76923076923077, | |
| "grad_norm": 0.000975192931946367, | |
| "learning_rate": 1.0095384615384616e-05, | |
| "loss": 0.0003, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 24.846153846153847, | |
| "grad_norm": 0.0011907311854884028, | |
| "learning_rate": 1.0064615384615386e-05, | |
| "loss": 0.0001, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 24.923076923076923, | |
| "grad_norm": 20.971738815307617, | |
| "learning_rate": 1.0033846153846153e-05, | |
| "loss": 0.2057, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "grad_norm": 0.0007839695899747312, | |
| "learning_rate": 1.0003076923076924e-05, | |
| "loss": 0.0015, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "eval_accuracy": 1.0, | |
| "eval_loss": 0.000277143029961735, | |
| "eval_runtime": 0.5589, | |
| "eval_samples_per_second": 237.951, | |
| "eval_steps_per_second": 30.415, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 25.076923076923077, | |
| "grad_norm": 0.0034783727023750544, | |
| "learning_rate": 9.972307692307694e-06, | |
| "loss": 0.059, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 25.153846153846153, | |
| "grad_norm": 0.08948632329702377, | |
| "learning_rate": 9.941538461538463e-06, | |
| "loss": 0.0159, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 25.23076923076923, | |
| "grad_norm": 3.0690531730651855, | |
| "learning_rate": 9.910769230769231e-06, | |
| "loss": 0.0012, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 25.307692307692307, | |
| "grad_norm": 0.02405349537730217, | |
| "learning_rate": 9.88e-06, | |
| "loss": 0.1706, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 25.384615384615383, | |
| "grad_norm": 0.003044095588847995, | |
| "learning_rate": 9.84923076923077e-06, | |
| "loss": 0.0011, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 25.46153846153846, | |
| "grad_norm": 0.0008050467586144805, | |
| "learning_rate": 9.818461538461539e-06, | |
| "loss": 0.0006, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 25.53846153846154, | |
| "grad_norm": 0.0011971103958785534, | |
| "learning_rate": 9.787692307692308e-06, | |
| "loss": 0.0533, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 25.615384615384617, | |
| "grad_norm": 0.0010218324605375528, | |
| "learning_rate": 9.756923076923078e-06, | |
| "loss": 0.0695, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 25.692307692307693, | |
| "grad_norm": 0.0007865706575103104, | |
| "learning_rate": 9.726153846153847e-06, | |
| "loss": 0.0096, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 25.76923076923077, | |
| "grad_norm": 70.63385772705078, | |
| "learning_rate": 9.695384615384617e-06, | |
| "loss": 0.1136, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 25.846153846153847, | |
| "grad_norm": 0.0011000982485711575, | |
| "learning_rate": 9.664615384615386e-06, | |
| "loss": 0.1563, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 25.923076923076923, | |
| "grad_norm": 0.06354475021362305, | |
| "learning_rate": 9.633846153846155e-06, | |
| "loss": 0.1292, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 26.0, | |
| "grad_norm": 0.0007121621747501194, | |
| "learning_rate": 9.603076923076923e-06, | |
| "loss": 0.0148, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 26.0, | |
| "eval_accuracy": 0.9774436090225563, | |
| "eval_loss": 0.13121125102043152, | |
| "eval_runtime": 0.5643, | |
| "eval_samples_per_second": 235.687, | |
| "eval_steps_per_second": 30.125, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 26.076923076923077, | |
| "grad_norm": 0.6681668162345886, | |
| "learning_rate": 9.572307692307693e-06, | |
| "loss": 0.1477, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 26.153846153846153, | |
| "grad_norm": 0.6779555678367615, | |
| "learning_rate": 9.541538461538462e-06, | |
| "loss": 0.0002, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 26.23076923076923, | |
| "grad_norm": 0.004583708010613918, | |
| "learning_rate": 9.510769230769231e-06, | |
| "loss": 0.1163, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 26.307692307692307, | |
| "grad_norm": 0.5304880738258362, | |
| "learning_rate": 9.48e-06, | |
| "loss": 0.0515, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 26.384615384615383, | |
| "grad_norm": 7.967146396636963, | |
| "learning_rate": 9.44923076923077e-06, | |
| "loss": 0.0007, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 26.46153846153846, | |
| "grad_norm": 18.322139739990234, | |
| "learning_rate": 9.41846153846154e-06, | |
| "loss": 0.0199, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 26.53846153846154, | |
| "grad_norm": 0.0012510762317106128, | |
| "learning_rate": 9.387692307692309e-06, | |
| "loss": 0.1079, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 26.615384615384617, | |
| "grad_norm": 0.0020101340487599373, | |
| "learning_rate": 9.356923076923078e-06, | |
| "loss": 0.0001, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 26.692307692307693, | |
| "grad_norm": 0.02414599433541298, | |
| "learning_rate": 9.326153846153848e-06, | |
| "loss": 0.1481, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 26.76923076923077, | |
| "grad_norm": 0.016970224678516388, | |
| "learning_rate": 9.295384615384615e-06, | |
| "loss": 0.0005, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 26.846153846153847, | |
| "grad_norm": 0.1080080196261406, | |
| "learning_rate": 9.264615384615385e-06, | |
| "loss": 0.0005, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 26.923076923076923, | |
| "grad_norm": 1.719024658203125, | |
| "learning_rate": 9.233846153846154e-06, | |
| "loss": 0.0003, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 27.0, | |
| "grad_norm": 0.0033194362185895443, | |
| "learning_rate": 9.203076923076924e-06, | |
| "loss": 0.0482, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 27.0, | |
| "eval_accuracy": 0.9849624060150376, | |
| "eval_loss": 0.08730395138263702, | |
| "eval_runtime": 0.5625, | |
| "eval_samples_per_second": 236.442, | |
| "eval_steps_per_second": 30.222, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 27.076923076923077, | |
| "grad_norm": 0.0017070352332666516, | |
| "learning_rate": 9.172307692307693e-06, | |
| "loss": 0.0003, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 27.153846153846153, | |
| "grad_norm": 0.0074979993514716625, | |
| "learning_rate": 9.141538461538462e-06, | |
| "loss": 0.0868, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 27.23076923076923, | |
| "grad_norm": 0.001379862311296165, | |
| "learning_rate": 9.110769230769232e-06, | |
| "loss": 0.1036, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 27.307692307692307, | |
| "grad_norm": 0.02854689210653305, | |
| "learning_rate": 9.080000000000001e-06, | |
| "loss": 0.0552, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 27.384615384615383, | |
| "grad_norm": 0.012071643024682999, | |
| "learning_rate": 9.04923076923077e-06, | |
| "loss": 0.005, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 27.46153846153846, | |
| "grad_norm": 0.0024255982134491205, | |
| "learning_rate": 9.01846153846154e-06, | |
| "loss": 0.0001, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 27.53846153846154, | |
| "grad_norm": 0.03879780322313309, | |
| "learning_rate": 8.987692307692308e-06, | |
| "loss": 0.0879, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 27.615384615384617, | |
| "grad_norm": 54.51979064941406, | |
| "learning_rate": 8.956923076923077e-06, | |
| "loss": 0.1568, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 27.692307692307693, | |
| "grad_norm": 0.0009031574008986354, | |
| "learning_rate": 8.926153846153846e-06, | |
| "loss": 0.1377, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 27.76923076923077, | |
| "grad_norm": 68.2950439453125, | |
| "learning_rate": 8.895384615384616e-06, | |
| "loss": 0.2338, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 27.846153846153847, | |
| "grad_norm": 33.0212287902832, | |
| "learning_rate": 8.864615384615385e-06, | |
| "loss": 0.0164, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 27.923076923076923, | |
| "grad_norm": 0.076235830783844, | |
| "learning_rate": 8.833846153846155e-06, | |
| "loss": 0.0296, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 28.0, | |
| "grad_norm": 0.014614281244575977, | |
| "learning_rate": 8.803076923076924e-06, | |
| "loss": 0.0001, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 28.0, | |
| "eval_accuracy": 0.9849624060150376, | |
| "eval_loss": 0.07213037461042404, | |
| "eval_runtime": 0.5725, | |
| "eval_samples_per_second": 232.306, | |
| "eval_steps_per_second": 29.693, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 28.076923076923077, | |
| "grad_norm": 24.32257843017578, | |
| "learning_rate": 8.772307692307693e-06, | |
| "loss": 0.1775, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 28.153846153846153, | |
| "grad_norm": 0.010675832629203796, | |
| "learning_rate": 8.741538461538463e-06, | |
| "loss": 0.0307, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 28.23076923076923, | |
| "grad_norm": 0.009055128321051598, | |
| "learning_rate": 8.710769230769232e-06, | |
| "loss": 0.0192, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 28.307692307692307, | |
| "grad_norm": 0.005634867586195469, | |
| "learning_rate": 8.68e-06, | |
| "loss": 0.0775, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 28.384615384615383, | |
| "grad_norm": 16.309144973754883, | |
| "learning_rate": 8.64923076923077e-06, | |
| "loss": 0.0635, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 28.46153846153846, | |
| "grad_norm": 4.859419345855713, | |
| "learning_rate": 8.618461538461539e-06, | |
| "loss": 0.0248, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 28.53846153846154, | |
| "grad_norm": 0.04865121841430664, | |
| "learning_rate": 8.587692307692308e-06, | |
| "loss": 0.0002, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 28.615384615384617, | |
| "grad_norm": 0.009109235368669033, | |
| "learning_rate": 8.556923076923077e-06, | |
| "loss": 0.0211, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 28.692307692307693, | |
| "grad_norm": 0.005094151012599468, | |
| "learning_rate": 8.526153846153847e-06, | |
| "loss": 0.0187, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 28.76923076923077, | |
| "grad_norm": 95.44650268554688, | |
| "learning_rate": 8.495384615384616e-06, | |
| "loss": 0.1308, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 28.846153846153847, | |
| "grad_norm": 0.008243425749242306, | |
| "learning_rate": 8.464615384615386e-06, | |
| "loss": 0.0006, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 28.923076923076923, | |
| "grad_norm": 0.008599749766290188, | |
| "learning_rate": 8.433846153846155e-06, | |
| "loss": 0.0206, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 29.0, | |
| "grad_norm": 0.0015361636178568006, | |
| "learning_rate": 8.403076923076924e-06, | |
| "loss": 0.0954, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 29.0, | |
| "eval_accuracy": 0.9924812030075187, | |
| "eval_loss": 0.014307127334177494, | |
| "eval_runtime": 0.6444, | |
| "eval_samples_per_second": 206.4, | |
| "eval_steps_per_second": 26.382, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 29.076923076923077, | |
| "grad_norm": 0.003046586876735091, | |
| "learning_rate": 8.372307692307692e-06, | |
| "loss": 0.0145, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 29.153846153846153, | |
| "grad_norm": 0.0056587024591863155, | |
| "learning_rate": 8.341538461538462e-06, | |
| "loss": 0.0025, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 29.23076923076923, | |
| "grad_norm": 0.0009685749537311494, | |
| "learning_rate": 8.310769230769231e-06, | |
| "loss": 0.0664, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 29.307692307692307, | |
| "grad_norm": 0.0018060127040371299, | |
| "learning_rate": 8.28e-06, | |
| "loss": 0.114, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 29.384615384615383, | |
| "grad_norm": 0.007853769697248936, | |
| "learning_rate": 8.24923076923077e-06, | |
| "loss": 0.0595, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 29.46153846153846, | |
| "grad_norm": 0.0192034263163805, | |
| "learning_rate": 8.218461538461539e-06, | |
| "loss": 0.0002, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 29.53846153846154, | |
| "grad_norm": 0.0010173041373491287, | |
| "learning_rate": 8.187692307692309e-06, | |
| "loss": 0.0342, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 29.615384615384617, | |
| "grad_norm": 0.0016086545074358582, | |
| "learning_rate": 8.156923076923078e-06, | |
| "loss": 0.0524, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 29.692307692307693, | |
| "grad_norm": 3.9697511196136475, | |
| "learning_rate": 8.126153846153847e-06, | |
| "loss": 0.0973, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 29.76923076923077, | |
| "grad_norm": 0.005043383222073317, | |
| "learning_rate": 8.095384615384617e-06, | |
| "loss": 0.001, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 29.846153846153847, | |
| "grad_norm": 0.004252856131643057, | |
| "learning_rate": 8.064615384615384e-06, | |
| "loss": 0.0, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 29.923076923076923, | |
| "grad_norm": 0.001231856644153595, | |
| "learning_rate": 8.033846153846154e-06, | |
| "loss": 0.0011, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "grad_norm": 0.01130798552185297, | |
| "learning_rate": 8.003076923076923e-06, | |
| "loss": 0.1373, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "eval_accuracy": 0.9924812030075187, | |
| "eval_loss": 0.04489622637629509, | |
| "eval_runtime": 0.5598, | |
| "eval_samples_per_second": 237.587, | |
| "eval_steps_per_second": 30.368, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 30.076923076923077, | |
| "grad_norm": 0.9912404417991638, | |
| "learning_rate": 7.972307692307693e-06, | |
| "loss": 0.0347, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 30.153846153846153, | |
| "grad_norm": 0.0017375171883031726, | |
| "learning_rate": 7.941538461538462e-06, | |
| "loss": 0.0016, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 30.23076923076923, | |
| "grad_norm": 124.9094009399414, | |
| "learning_rate": 7.910769230769231e-06, | |
| "loss": 0.0308, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 30.307692307692307, | |
| "grad_norm": 0.012814003974199295, | |
| "learning_rate": 7.88e-06, | |
| "loss": 0.0045, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 30.384615384615383, | |
| "grad_norm": 0.0006594728329218924, | |
| "learning_rate": 7.84923076923077e-06, | |
| "loss": 0.0, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 30.46153846153846, | |
| "grad_norm": 0.002067841589450836, | |
| "learning_rate": 7.81846153846154e-06, | |
| "loss": 0.0003, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 30.53846153846154, | |
| "grad_norm": 0.007349399384111166, | |
| "learning_rate": 7.787692307692309e-06, | |
| "loss": 0.0139, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 30.615384615384617, | |
| "grad_norm": 0.021420804783701897, | |
| "learning_rate": 7.756923076923077e-06, | |
| "loss": 0.1997, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 30.692307692307693, | |
| "grad_norm": 0.001424902817234397, | |
| "learning_rate": 7.726153846153846e-06, | |
| "loss": 0.032, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 30.76923076923077, | |
| "grad_norm": 2.7047202587127686, | |
| "learning_rate": 7.695384615384615e-06, | |
| "loss": 0.0031, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 30.846153846153847, | |
| "grad_norm": 0.0034344890154898167, | |
| "learning_rate": 7.664615384615385e-06, | |
| "loss": 0.0517, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 30.923076923076923, | |
| "grad_norm": 0.020475352182984352, | |
| "learning_rate": 7.633846153846154e-06, | |
| "loss": 0.0082, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 31.0, | |
| "grad_norm": 0.0006407782784663141, | |
| "learning_rate": 7.6030769230769245e-06, | |
| "loss": 0.0076, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 31.0, | |
| "eval_accuracy": 0.9924812030075187, | |
| "eval_loss": 0.043454647064208984, | |
| "eval_runtime": 0.5607, | |
| "eval_samples_per_second": 237.216, | |
| "eval_steps_per_second": 30.321, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 31.076923076923077, | |
| "grad_norm": 0.0006706058629788458, | |
| "learning_rate": 7.572307692307693e-06, | |
| "loss": 0.0174, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 31.153846153846153, | |
| "grad_norm": 0.002210884587839246, | |
| "learning_rate": 7.5415384615384624e-06, | |
| "loss": 0.2683, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 31.23076923076923, | |
| "grad_norm": 0.006247711833566427, | |
| "learning_rate": 7.510769230769232e-06, | |
| "loss": 0.1214, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 31.307692307692307, | |
| "grad_norm": 0.0019206746947020292, | |
| "learning_rate": 7.48e-06, | |
| "loss": 0.0002, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 31.384615384615383, | |
| "grad_norm": 0.0009853794472292066, | |
| "learning_rate": 7.44923076923077e-06, | |
| "loss": 0.1271, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 31.46153846153846, | |
| "grad_norm": 0.01296025887131691, | |
| "learning_rate": 7.418461538461539e-06, | |
| "loss": 0.0008, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 31.53846153846154, | |
| "grad_norm": 0.009520775638520718, | |
| "learning_rate": 7.387692307692308e-06, | |
| "loss": 0.008, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 31.615384615384617, | |
| "grad_norm": 11.869040489196777, | |
| "learning_rate": 7.356923076923077e-06, | |
| "loss": 0.0863, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 31.692307692307693, | |
| "grad_norm": 0.016930609941482544, | |
| "learning_rate": 7.326153846153847e-06, | |
| "loss": 0.0001, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 31.76923076923077, | |
| "grad_norm": 0.005094442516565323, | |
| "learning_rate": 7.295384615384617e-06, | |
| "loss": 0.133, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 31.846153846153847, | |
| "grad_norm": 0.0016887628007680178, | |
| "learning_rate": 7.264615384615385e-06, | |
| "loss": 0.0194, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 31.923076923076923, | |
| "grad_norm": 0.004818159621208906, | |
| "learning_rate": 7.233846153846155e-06, | |
| "loss": 0.0, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 32.0, | |
| "grad_norm": 0.0007881343481130898, | |
| "learning_rate": 7.203076923076924e-06, | |
| "loss": 0.0028, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 32.0, | |
| "eval_accuracy": 0.9924812030075187, | |
| "eval_loss": 0.01007868628948927, | |
| "eval_runtime": 0.592, | |
| "eval_samples_per_second": 224.646, | |
| "eval_steps_per_second": 28.714, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 32.07692307692308, | |
| "grad_norm": 0.0010091810254380107, | |
| "learning_rate": 7.172307692307693e-06, | |
| "loss": 0.0012, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 32.15384615384615, | |
| "grad_norm": 97.89594268798828, | |
| "learning_rate": 7.141538461538462e-06, | |
| "loss": 0.1107, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 32.23076923076923, | |
| "grad_norm": 47.61814498901367, | |
| "learning_rate": 7.1107692307692314e-06, | |
| "loss": 0.0909, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 32.30769230769231, | |
| "grad_norm": 0.0031400700099766254, | |
| "learning_rate": 7.08e-06, | |
| "loss": 0.001, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 32.38461538461539, | |
| "grad_norm": 0.08583249896764755, | |
| "learning_rate": 7.049230769230769e-06, | |
| "loss": 0.0001, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 32.46153846153846, | |
| "grad_norm": 0.001287147868424654, | |
| "learning_rate": 7.01846153846154e-06, | |
| "loss": 0.0694, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 32.53846153846154, | |
| "grad_norm": 0.0018081064336001873, | |
| "learning_rate": 6.987692307692309e-06, | |
| "loss": 0.0042, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 32.61538461538461, | |
| "grad_norm": 0.0005175307160243392, | |
| "learning_rate": 6.9569230769230776e-06, | |
| "loss": 0.0209, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 32.69230769230769, | |
| "grad_norm": 0.0015608895337209105, | |
| "learning_rate": 6.926153846153847e-06, | |
| "loss": 0.0, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 32.76923076923077, | |
| "grad_norm": 0.0004342070023994893, | |
| "learning_rate": 6.895384615384616e-06, | |
| "loss": 0.0, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 32.84615384615385, | |
| "grad_norm": 0.12884479761123657, | |
| "learning_rate": 6.864615384615385e-06, | |
| "loss": 0.0001, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 32.92307692307692, | |
| "grad_norm": 0.0009400132112205029, | |
| "learning_rate": 6.833846153846154e-06, | |
| "loss": 0.0045, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 33.0, | |
| "grad_norm": 0.0006513062980957329, | |
| "learning_rate": 6.803076923076924e-06, | |
| "loss": 0.0001, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 33.0, | |
| "eval_accuracy": 0.9849624060150376, | |
| "eval_loss": 0.04143361374735832, | |
| "eval_runtime": 0.5612, | |
| "eval_samples_per_second": 237.01, | |
| "eval_steps_per_second": 30.294, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 33.07692307692308, | |
| "grad_norm": 0.0024744998663663864, | |
| "learning_rate": 6.772307692307692e-06, | |
| "loss": 0.0893, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 33.15384615384615, | |
| "grad_norm": 0.002017271937802434, | |
| "learning_rate": 6.741538461538462e-06, | |
| "loss": 0.0002, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 33.23076923076923, | |
| "grad_norm": 0.03144347295165062, | |
| "learning_rate": 6.710769230769232e-06, | |
| "loss": 0.0002, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 33.30769230769231, | |
| "grad_norm": 0.014736099168658257, | |
| "learning_rate": 6.680000000000001e-06, | |
| "loss": 0.0, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 33.38461538461539, | |
| "grad_norm": 0.0004188344464637339, | |
| "learning_rate": 6.64923076923077e-06, | |
| "loss": 0.0, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 33.46153846153846, | |
| "grad_norm": 0.08335867524147034, | |
| "learning_rate": 6.618461538461539e-06, | |
| "loss": 0.0001, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 33.53846153846154, | |
| "grad_norm": 0.0008053340134210885, | |
| "learning_rate": 6.587692307692309e-06, | |
| "loss": 0.0003, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 33.61538461538461, | |
| "grad_norm": 0.0012076394632458687, | |
| "learning_rate": 6.556923076923077e-06, | |
| "loss": 0.0001, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 33.69230769230769, | |
| "grad_norm": 0.0048568369820714, | |
| "learning_rate": 6.5261538461538465e-06, | |
| "loss": 0.0, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 33.76923076923077, | |
| "grad_norm": 0.0008632199605926871, | |
| "learning_rate": 6.495384615384616e-06, | |
| "loss": 0.0002, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 33.84615384615385, | |
| "grad_norm": 0.049176134169101715, | |
| "learning_rate": 6.4646153846153845e-06, | |
| "loss": 0.0001, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 33.92307692307692, | |
| "grad_norm": 0.00043606868712231517, | |
| "learning_rate": 6.433846153846154e-06, | |
| "loss": 0.0682, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 34.0, | |
| "grad_norm": 0.0006029644282534719, | |
| "learning_rate": 6.403076923076924e-06, | |
| "loss": 0.001, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 34.0, | |
| "eval_accuracy": 1.0, | |
| "eval_loss": 0.0017387220868840814, | |
| "eval_runtime": 0.5604, | |
| "eval_samples_per_second": 237.329, | |
| "eval_steps_per_second": 30.335, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 34.07692307692308, | |
| "grad_norm": 0.0005392258754000068, | |
| "learning_rate": 6.3723076923076935e-06, | |
| "loss": 0.0178, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 34.15384615384615, | |
| "grad_norm": 0.0017064090352505445, | |
| "learning_rate": 6.341538461538462e-06, | |
| "loss": 0.007, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 34.23076923076923, | |
| "grad_norm": 0.0033292374573647976, | |
| "learning_rate": 6.3107692307692315e-06, | |
| "loss": 0.007, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 34.30769230769231, | |
| "grad_norm": 31.601030349731445, | |
| "learning_rate": 6.280000000000001e-06, | |
| "loss": 0.0654, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 34.38461538461539, | |
| "grad_norm": 0.007135252468287945, | |
| "learning_rate": 6.249230769230769e-06, | |
| "loss": 0.0, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 34.46153846153846, | |
| "grad_norm": 81.1984634399414, | |
| "learning_rate": 6.218461538461539e-06, | |
| "loss": 0.072, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 34.53846153846154, | |
| "grad_norm": 0.06208951398730278, | |
| "learning_rate": 6.187692307692308e-06, | |
| "loss": 0.0, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 34.61538461538461, | |
| "grad_norm": 0.0007352761458605528, | |
| "learning_rate": 6.156923076923077e-06, | |
| "loss": 0.1805, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 34.69230769230769, | |
| "grad_norm": 0.00034046368091367185, | |
| "learning_rate": 6.126153846153846e-06, | |
| "loss": 0.0092, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 34.76923076923077, | |
| "grad_norm": 0.0068006995134055614, | |
| "learning_rate": 6.095384615384616e-06, | |
| "loss": 0.0135, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 34.84615384615385, | |
| "grad_norm": 0.009285767562687397, | |
| "learning_rate": 6.064615384615386e-06, | |
| "loss": 0.0443, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 34.92307692307692, | |
| "grad_norm": 0.0005047802696935833, | |
| "learning_rate": 6.033846153846154e-06, | |
| "loss": 0.0003, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 35.0, | |
| "grad_norm": 0.0017190852668136358, | |
| "learning_rate": 6.003076923076924e-06, | |
| "loss": 0.0055, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 35.0, | |
| "eval_accuracy": 0.9924812030075187, | |
| "eval_loss": 0.07332061976194382, | |
| "eval_runtime": 0.5797, | |
| "eval_samples_per_second": 229.424, | |
| "eval_steps_per_second": 29.325, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 35.07692307692308, | |
| "grad_norm": 0.014129632152616978, | |
| "learning_rate": 5.972307692307693e-06, | |
| "loss": 0.0044, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 35.15384615384615, | |
| "grad_norm": 0.051782090216875076, | |
| "learning_rate": 5.941538461538462e-06, | |
| "loss": 0.1579, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 35.23076923076923, | |
| "grad_norm": 0.0010272455401718616, | |
| "learning_rate": 5.910769230769231e-06, | |
| "loss": 0.0, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 35.30769230769231, | |
| "grad_norm": 0.0008460541139356792, | |
| "learning_rate": 5.8800000000000005e-06, | |
| "loss": 0.0799, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 35.38461538461539, | |
| "grad_norm": 0.0015528828371316195, | |
| "learning_rate": 5.849230769230769e-06, | |
| "loss": 0.0585, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 35.46153846153846, | |
| "grad_norm": 0.0016128149582073092, | |
| "learning_rate": 5.818461538461538e-06, | |
| "loss": 0.0001, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 35.53846153846154, | |
| "grad_norm": 0.004083054140210152, | |
| "learning_rate": 5.787692307692309e-06, | |
| "loss": 0.0001, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 35.61538461538461, | |
| "grad_norm": 0.0318060964345932, | |
| "learning_rate": 5.756923076923078e-06, | |
| "loss": 0.0001, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 35.69230769230769, | |
| "grad_norm": 1.8864119052886963, | |
| "learning_rate": 5.726153846153847e-06, | |
| "loss": 0.0008, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 35.76923076923077, | |
| "grad_norm": 0.0009333742200396955, | |
| "learning_rate": 5.695384615384616e-06, | |
| "loss": 0.0028, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 35.84615384615385, | |
| "grad_norm": 0.0018823591526597738, | |
| "learning_rate": 5.664615384615385e-06, | |
| "loss": 0.0, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 35.92307692307692, | |
| "grad_norm": 0.014191100373864174, | |
| "learning_rate": 5.633846153846154e-06, | |
| "loss": 0.0064, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 36.0, | |
| "grad_norm": 0.001580247189849615, | |
| "learning_rate": 5.603076923076923e-06, | |
| "loss": 0.1471, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 36.0, | |
| "eval_accuracy": 0.9774436090225563, | |
| "eval_loss": 0.12211022526025772, | |
| "eval_runtime": 0.56, | |
| "eval_samples_per_second": 237.504, | |
| "eval_steps_per_second": 30.358, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 36.07692307692308, | |
| "grad_norm": 0.0018762449035421014, | |
| "learning_rate": 5.572307692307693e-06, | |
| "loss": 0.0, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 36.15384615384615, | |
| "grad_norm": 0.000685960752889514, | |
| "learning_rate": 5.541538461538461e-06, | |
| "loss": 0.0867, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 36.23076923076923, | |
| "grad_norm": 0.001832542009651661, | |
| "learning_rate": 5.5107692307692315e-06, | |
| "loss": 0.0831, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 36.30769230769231, | |
| "grad_norm": 1.0698580741882324, | |
| "learning_rate": 5.480000000000001e-06, | |
| "loss": 0.0027, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 36.38461538461539, | |
| "grad_norm": 0.002061473438516259, | |
| "learning_rate": 5.44923076923077e-06, | |
| "loss": 0.0, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 36.46153846153846, | |
| "grad_norm": 0.0007437376189045608, | |
| "learning_rate": 5.418461538461539e-06, | |
| "loss": 0.0002, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 36.53846153846154, | |
| "grad_norm": 0.00041431974386796355, | |
| "learning_rate": 5.387692307692308e-06, | |
| "loss": 0.0002, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 36.61538461538461, | |
| "grad_norm": 1.0955127477645874, | |
| "learning_rate": 5.356923076923078e-06, | |
| "loss": 0.0002, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 36.69230769230769, | |
| "grad_norm": 0.15827512741088867, | |
| "learning_rate": 5.326153846153846e-06, | |
| "loss": 0.0002, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 36.76923076923077, | |
| "grad_norm": 0.0007032826542854309, | |
| "learning_rate": 5.2953846153846156e-06, | |
| "loss": 0.0033, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 36.84615384615385, | |
| "grad_norm": 0.00039768809801898897, | |
| "learning_rate": 5.264615384615385e-06, | |
| "loss": 0.0028, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 36.92307692307692, | |
| "grad_norm": 0.002939037047326565, | |
| "learning_rate": 5.2338461538461535e-06, | |
| "loss": 0.0938, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 37.0, | |
| "grad_norm": 0.06541663408279419, | |
| "learning_rate": 5.203076923076924e-06, | |
| "loss": 0.0484, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 37.0, | |
| "eval_accuracy": 0.9849624060150376, | |
| "eval_loss": 0.14731426537036896, | |
| "eval_runtime": 0.5828, | |
| "eval_samples_per_second": 228.19, | |
| "eval_steps_per_second": 29.167, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 37.07692307692308, | |
| "grad_norm": 0.9744657874107361, | |
| "learning_rate": 5.172307692307693e-06, | |
| "loss": 0.0206, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 37.15384615384615, | |
| "grad_norm": 27.0574893951416, | |
| "learning_rate": 5.1415384615384625e-06, | |
| "loss": 0.0719, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 37.23076923076923, | |
| "grad_norm": 0.029181215912103653, | |
| "learning_rate": 5.110769230769231e-06, | |
| "loss": 0.0002, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 37.30769230769231, | |
| "grad_norm": 37.923431396484375, | |
| "learning_rate": 5.0800000000000005e-06, | |
| "loss": 0.0878, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 37.38461538461539, | |
| "grad_norm": 0.0028850040398538113, | |
| "learning_rate": 5.04923076923077e-06, | |
| "loss": 0.0123, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 37.46153846153846, | |
| "grad_norm": 0.0006236844346858561, | |
| "learning_rate": 5.0184615384615384e-06, | |
| "loss": 0.082, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 37.53846153846154, | |
| "grad_norm": 0.0010073287412524223, | |
| "learning_rate": 4.987692307692308e-06, | |
| "loss": 0.0027, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 37.61538461538461, | |
| "grad_norm": 0.0008756616152822971, | |
| "learning_rate": 4.956923076923077e-06, | |
| "loss": 0.0001, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 37.69230769230769, | |
| "grad_norm": 0.0012337320949882269, | |
| "learning_rate": 4.926153846153847e-06, | |
| "loss": 0.0003, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 37.76923076923077, | |
| "grad_norm": 26.195831298828125, | |
| "learning_rate": 4.895384615384616e-06, | |
| "loss": 0.0488, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 37.84615384615385, | |
| "grad_norm": 0.002201149705797434, | |
| "learning_rate": 4.8646153846153846e-06, | |
| "loss": 0.0, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 37.92307692307692, | |
| "grad_norm": 0.0031797313131392, | |
| "learning_rate": 4.833846153846154e-06, | |
| "loss": 0.0, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 38.0, | |
| "grad_norm": 0.001882484881207347, | |
| "learning_rate": 4.803076923076923e-06, | |
| "loss": 0.0014, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 38.0, | |
| "eval_accuracy": 0.9924812030075187, | |
| "eval_loss": 0.07482893019914627, | |
| "eval_runtime": 0.5718, | |
| "eval_samples_per_second": 232.616, | |
| "eval_steps_per_second": 29.733, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 38.07692307692308, | |
| "grad_norm": 0.008910280652344227, | |
| "learning_rate": 4.772307692307693e-06, | |
| "loss": 0.0, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 38.15384615384615, | |
| "grad_norm": 0.0004342030733823776, | |
| "learning_rate": 4.741538461538462e-06, | |
| "loss": 0.0995, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 38.23076923076923, | |
| "grad_norm": 0.000522003450896591, | |
| "learning_rate": 4.710769230769231e-06, | |
| "loss": 0.0, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 38.30769230769231, | |
| "grad_norm": 0.4271712303161621, | |
| "learning_rate": 4.680000000000001e-06, | |
| "loss": 0.049, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 38.38461538461539, | |
| "grad_norm": 0.0009867213666439056, | |
| "learning_rate": 4.6492307692307695e-06, | |
| "loss": 0.0769, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 38.46153846153846, | |
| "grad_norm": 0.0015043043531477451, | |
| "learning_rate": 4.618461538461539e-06, | |
| "loss": 0.0, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 38.53846153846154, | |
| "grad_norm": 0.000573535158764571, | |
| "learning_rate": 4.587692307692308e-06, | |
| "loss": 0.0, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 38.61538461538461, | |
| "grad_norm": 0.007249703165143728, | |
| "learning_rate": 4.556923076923077e-06, | |
| "loss": 0.0807, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 38.69230769230769, | |
| "grad_norm": 1.4994910955429077, | |
| "learning_rate": 4.526153846153847e-06, | |
| "loss": 0.0056, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 38.76923076923077, | |
| "grad_norm": 0.0006923425826244056, | |
| "learning_rate": 4.495384615384616e-06, | |
| "loss": 0.0001, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 38.84615384615385, | |
| "grad_norm": 0.010082672350108624, | |
| "learning_rate": 4.464615384615385e-06, | |
| "loss": 0.0002, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 38.92307692307692, | |
| "grad_norm": 0.0003873300738632679, | |
| "learning_rate": 4.433846153846154e-06, | |
| "loss": 0.0123, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 39.0, | |
| "grad_norm": 387.8450622558594, | |
| "learning_rate": 4.403076923076923e-06, | |
| "loss": 0.1825, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 39.0, | |
| "eval_accuracy": 0.9849624060150376, | |
| "eval_loss": 0.10720350593328476, | |
| "eval_runtime": 0.558, | |
| "eval_samples_per_second": 238.346, | |
| "eval_steps_per_second": 30.465, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 39.07692307692308, | |
| "grad_norm": 0.0003139421751257032, | |
| "learning_rate": 4.372307692307693e-06, | |
| "loss": 0.0002, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 39.15384615384615, | |
| "grad_norm": 0.0030731302686035633, | |
| "learning_rate": 4.341538461538462e-06, | |
| "loss": 0.0005, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 39.23076923076923, | |
| "grad_norm": 95.7864990234375, | |
| "learning_rate": 4.310769230769231e-06, | |
| "loss": 0.1588, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 39.30769230769231, | |
| "grad_norm": 0.045532066375017166, | |
| "learning_rate": 4.2800000000000005e-06, | |
| "loss": 0.0003, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 39.38461538461539, | |
| "grad_norm": 5.153738975524902, | |
| "learning_rate": 4.249230769230769e-06, | |
| "loss": 0.0012, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 39.46153846153846, | |
| "grad_norm": 0.0010733181843534112, | |
| "learning_rate": 4.218461538461539e-06, | |
| "loss": 0.1359, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 39.53846153846154, | |
| "grad_norm": 0.00048660460743121803, | |
| "learning_rate": 4.187692307692308e-06, | |
| "loss": 0.0001, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 39.61538461538461, | |
| "grad_norm": 0.13067100942134857, | |
| "learning_rate": 4.156923076923077e-06, | |
| "loss": 0.0384, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 39.69230769230769, | |
| "grad_norm": 0.0006858094711787999, | |
| "learning_rate": 4.126153846153847e-06, | |
| "loss": 0.0008, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 39.76923076923077, | |
| "grad_norm": 5.363431453704834, | |
| "learning_rate": 4.095384615384615e-06, | |
| "loss": 0.0007, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 39.84615384615385, | |
| "grad_norm": 0.0011726649245247245, | |
| "learning_rate": 4.0646153846153854e-06, | |
| "loss": 0.0018, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 39.92307692307692, | |
| "grad_norm": 0.21595709025859833, | |
| "learning_rate": 4.033846153846154e-06, | |
| "loss": 0.0002, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 40.0, | |
| "grad_norm": 0.0002660917234607041, | |
| "learning_rate": 4.003076923076923e-06, | |
| "loss": 0.0, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 40.0, | |
| "eval_accuracy": 0.9924812030075187, | |
| "eval_loss": 0.06868916749954224, | |
| "eval_runtime": 0.5588, | |
| "eval_samples_per_second": 238.025, | |
| "eval_steps_per_second": 30.424, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 40.07692307692308, | |
| "grad_norm": 0.00026413268642500043, | |
| "learning_rate": 3.972307692307693e-06, | |
| "loss": 0.0001, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 40.15384615384615, | |
| "grad_norm": 0.00025818985886871815, | |
| "learning_rate": 3.941538461538461e-06, | |
| "loss": 0.0, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 40.23076923076923, | |
| "grad_norm": 0.0034207049757242203, | |
| "learning_rate": 3.9107692307692316e-06, | |
| "loss": 0.2677, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 40.30769230769231, | |
| "grad_norm": 22.716407775878906, | |
| "learning_rate": 3.88e-06, | |
| "loss": 0.099, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 40.38461538461539, | |
| "grad_norm": 0.001969344448298216, | |
| "learning_rate": 3.8492307692307695e-06, | |
| "loss": 0.0, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 40.46153846153846, | |
| "grad_norm": 0.0004705400497186929, | |
| "learning_rate": 3.818461538461539e-06, | |
| "loss": 0.0301, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 40.53846153846154, | |
| "grad_norm": 0.0022966957185417414, | |
| "learning_rate": 3.787692307692308e-06, | |
| "loss": 0.1202, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 40.61538461538461, | |
| "grad_norm": 0.0005464573623612523, | |
| "learning_rate": 3.7569230769230773e-06, | |
| "loss": 0.0, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 40.69230769230769, | |
| "grad_norm": 0.00032725094933994114, | |
| "learning_rate": 3.7261538461538467e-06, | |
| "loss": 0.1325, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 40.76923076923077, | |
| "grad_norm": 1.188474416732788, | |
| "learning_rate": 3.6953846153846156e-06, | |
| "loss": 0.0002, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 40.84615384615385, | |
| "grad_norm": 0.0010689280461519957, | |
| "learning_rate": 3.6646153846153846e-06, | |
| "loss": 0.0522, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 40.92307692307692, | |
| "grad_norm": 0.00040832936065271497, | |
| "learning_rate": 3.633846153846154e-06, | |
| "loss": 0.0264, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 41.0, | |
| "grad_norm": 0.0008318335167132318, | |
| "learning_rate": 3.6030769230769234e-06, | |
| "loss": 0.0081, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 41.0, | |
| "eval_accuracy": 0.9849624060150376, | |
| "eval_loss": 0.1147400438785553, | |
| "eval_runtime": 0.5673, | |
| "eval_samples_per_second": 234.428, | |
| "eval_steps_per_second": 29.964, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 41.07692307692308, | |
| "grad_norm": 0.010249706916511059, | |
| "learning_rate": 3.572307692307693e-06, | |
| "loss": 0.0003, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 41.15384615384615, | |
| "grad_norm": 0.0003951251273974776, | |
| "learning_rate": 3.5415384615384618e-06, | |
| "loss": 0.0082, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 41.23076923076923, | |
| "grad_norm": 0.008920038118958473, | |
| "learning_rate": 3.5107692307692307e-06, | |
| "loss": 0.0397, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 41.30769230769231, | |
| "grad_norm": 0.0018836874514818192, | |
| "learning_rate": 3.48e-06, | |
| "loss": 0.0, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 41.38461538461539, | |
| "grad_norm": 0.004664571024477482, | |
| "learning_rate": 3.4492307692307695e-06, | |
| "loss": 0.1514, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 41.46153846153846, | |
| "grad_norm": 0.0018832337809726596, | |
| "learning_rate": 3.418461538461539e-06, | |
| "loss": 0.0476, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 41.53846153846154, | |
| "grad_norm": 0.004951864946633577, | |
| "learning_rate": 3.387692307692308e-06, | |
| "loss": 0.0, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 41.61538461538461, | |
| "grad_norm": 0.0004485521058086306, | |
| "learning_rate": 3.356923076923077e-06, | |
| "loss": 0.004, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 41.69230769230769, | |
| "grad_norm": 0.0011319700861349702, | |
| "learning_rate": 3.3261538461538463e-06, | |
| "loss": 0.0002, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 41.76923076923077, | |
| "grad_norm": 0.1554967314004898, | |
| "learning_rate": 3.2953846153846157e-06, | |
| "loss": 0.0001, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 41.84615384615385, | |
| "grad_norm": 0.0006219987408258021, | |
| "learning_rate": 3.264615384615385e-06, | |
| "loss": 0.0004, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 41.92307692307692, | |
| "grad_norm": 0.007299738470464945, | |
| "learning_rate": 3.233846153846154e-06, | |
| "loss": 0.0001, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 42.0, | |
| "grad_norm": 0.0005407740827649832, | |
| "learning_rate": 3.203076923076923e-06, | |
| "loss": 0.0557, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 42.0, | |
| "eval_accuracy": 0.9849624060150376, | |
| "eval_loss": 0.06299061328172684, | |
| "eval_runtime": 0.5719, | |
| "eval_samples_per_second": 232.55, | |
| "eval_steps_per_second": 29.724, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 42.07692307692308, | |
| "grad_norm": 0.000429264095146209, | |
| "learning_rate": 3.1723076923076924e-06, | |
| "loss": 0.0011, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 42.15384615384615, | |
| "grad_norm": 0.0012445776956155896, | |
| "learning_rate": 3.141538461538462e-06, | |
| "loss": 0.0215, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 42.23076923076923, | |
| "grad_norm": 0.017115270718932152, | |
| "learning_rate": 3.110769230769231e-06, | |
| "loss": 0.0005, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 42.30769230769231, | |
| "grad_norm": 0.0003503753978293389, | |
| "learning_rate": 3.08e-06, | |
| "loss": 0.0469, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 42.38461538461539, | |
| "grad_norm": 0.0008315156446769834, | |
| "learning_rate": 3.049230769230769e-06, | |
| "loss": 0.0082, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 42.46153846153846, | |
| "grad_norm": 0.0038778912276029587, | |
| "learning_rate": 3.0184615384615385e-06, | |
| "loss": 0.0115, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 42.53846153846154, | |
| "grad_norm": 0.0003100531466770917, | |
| "learning_rate": 2.987692307692308e-06, | |
| "loss": 0.0091, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 42.61538461538461, | |
| "grad_norm": 0.0004450716369319707, | |
| "learning_rate": 2.9569230769230773e-06, | |
| "loss": 0.0042, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 42.69230769230769, | |
| "grad_norm": 0.0008668462396599352, | |
| "learning_rate": 2.9261538461538463e-06, | |
| "loss": 0.0002, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 42.76923076923077, | |
| "grad_norm": 0.0007650844636373222, | |
| "learning_rate": 2.8953846153846153e-06, | |
| "loss": 0.0019, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 42.84615384615385, | |
| "grad_norm": 0.0027373021002858877, | |
| "learning_rate": 2.8646153846153847e-06, | |
| "loss": 0.039, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 42.92307692307692, | |
| "grad_norm": 0.0016651992918923497, | |
| "learning_rate": 2.833846153846154e-06, | |
| "loss": 0.0, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 43.0, | |
| "grad_norm": 0.0006611489807255566, | |
| "learning_rate": 2.8030769230769234e-06, | |
| "loss": 0.0, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 43.0, | |
| "eval_accuracy": 0.9924812030075187, | |
| "eval_loss": 0.016152488067746162, | |
| "eval_runtime": 0.5602, | |
| "eval_samples_per_second": 237.419, | |
| "eval_steps_per_second": 30.347, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 43.07692307692308, | |
| "grad_norm": 0.0031338355038315058, | |
| "learning_rate": 2.7723076923076924e-06, | |
| "loss": 0.0724, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 43.15384615384615, | |
| "grad_norm": 0.04154682531952858, | |
| "learning_rate": 2.7415384615384614e-06, | |
| "loss": 0.0185, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 43.23076923076923, | |
| "grad_norm": 0.0005495785153470933, | |
| "learning_rate": 2.710769230769231e-06, | |
| "loss": 0.0, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 43.30769230769231, | |
| "grad_norm": 0.0006096691940911114, | |
| "learning_rate": 2.68e-06, | |
| "loss": 0.1738, | |
| "step": 5630 | |
| }, | |
| { | |
| "epoch": 43.38461538461539, | |
| "grad_norm": 0.0008338314946740866, | |
| "learning_rate": 2.6492307692307696e-06, | |
| "loss": 0.0, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 43.46153846153846, | |
| "grad_norm": 0.004199822433292866, | |
| "learning_rate": 2.6184615384615385e-06, | |
| "loss": 0.0914, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 43.53846153846154, | |
| "grad_norm": 0.000944766215980053, | |
| "learning_rate": 2.587692307692308e-06, | |
| "loss": 0.1093, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 43.61538461538461, | |
| "grad_norm": 48.179908752441406, | |
| "learning_rate": 2.5569230769230773e-06, | |
| "loss": 0.0042, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 43.69230769230769, | |
| "grad_norm": 0.006739782635122538, | |
| "learning_rate": 2.5261538461538463e-06, | |
| "loss": 0.0, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 43.76923076923077, | |
| "grad_norm": 0.0064003062434494495, | |
| "learning_rate": 2.4953846153846157e-06, | |
| "loss": 0.0, | |
| "step": 5690 | |
| }, | |
| { | |
| "epoch": 43.84615384615385, | |
| "grad_norm": 0.0006233023013919592, | |
| "learning_rate": 2.4646153846153847e-06, | |
| "loss": 0.0237, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 43.92307692307692, | |
| "grad_norm": 0.0021808522287756205, | |
| "learning_rate": 2.433846153846154e-06, | |
| "loss": 0.0515, | |
| "step": 5710 | |
| }, | |
| { | |
| "epoch": 44.0, | |
| "grad_norm": 0.002309564733877778, | |
| "learning_rate": 2.4030769230769235e-06, | |
| "loss": 0.0, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 44.0, | |
| "eval_accuracy": 0.9924812030075187, | |
| "eval_loss": 0.04631290212273598, | |
| "eval_runtime": 0.5655, | |
| "eval_samples_per_second": 235.178, | |
| "eval_steps_per_second": 30.06, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 44.07692307692308, | |
| "grad_norm": 0.0009078089497052133, | |
| "learning_rate": 2.3723076923076924e-06, | |
| "loss": 0.0, | |
| "step": 5730 | |
| }, | |
| { | |
| "epoch": 44.15384615384615, | |
| "grad_norm": 0.0005848463624715805, | |
| "learning_rate": 2.341538461538462e-06, | |
| "loss": 0.0001, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 44.23076923076923, | |
| "grad_norm": 0.0006252095336094499, | |
| "learning_rate": 2.310769230769231e-06, | |
| "loss": 0.0003, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 44.30769230769231, | |
| "grad_norm": 0.0007332940585911274, | |
| "learning_rate": 2.28e-06, | |
| "loss": 0.1749, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 44.38461538461539, | |
| "grad_norm": 0.002333400072529912, | |
| "learning_rate": 2.2492307692307696e-06, | |
| "loss": 0.0, | |
| "step": 5770 | |
| }, | |
| { | |
| "epoch": 44.46153846153846, | |
| "grad_norm": 0.01608298532664776, | |
| "learning_rate": 2.218461538461539e-06, | |
| "loss": 0.0, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 44.53846153846154, | |
| "grad_norm": 0.0004494412860367447, | |
| "learning_rate": 2.187692307692308e-06, | |
| "loss": 0.0037, | |
| "step": 5790 | |
| }, | |
| { | |
| "epoch": 44.61538461538461, | |
| "grad_norm": 0.0018300774972885847, | |
| "learning_rate": 2.156923076923077e-06, | |
| "loss": 0.103, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 44.69230769230769, | |
| "grad_norm": 0.0017312734853476286, | |
| "learning_rate": 2.1261538461538463e-06, | |
| "loss": 0.0001, | |
| "step": 5810 | |
| }, | |
| { | |
| "epoch": 44.76923076923077, | |
| "grad_norm": 0.0011130099883303046, | |
| "learning_rate": 2.0953846153846157e-06, | |
| "loss": 0.0001, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 44.84615384615385, | |
| "grad_norm": 0.0020217066630721092, | |
| "learning_rate": 2.064615384615385e-06, | |
| "loss": 0.0324, | |
| "step": 5830 | |
| }, | |
| { | |
| "epoch": 44.92307692307692, | |
| "grad_norm": 0.0037438569124788046, | |
| "learning_rate": 2.033846153846154e-06, | |
| "loss": 0.0038, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 45.0, | |
| "grad_norm": 0.01541079394519329, | |
| "learning_rate": 2.003076923076923e-06, | |
| "loss": 0.0197, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 45.0, | |
| "eval_accuracy": 0.9849624060150376, | |
| "eval_loss": 0.07567540556192398, | |
| "eval_runtime": 0.5602, | |
| "eval_samples_per_second": 237.419, | |
| "eval_steps_per_second": 30.347, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 45.07692307692308, | |
| "grad_norm": 0.0009596956660971045, | |
| "learning_rate": 1.9723076923076924e-06, | |
| "loss": 0.0751, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 45.15384615384615, | |
| "grad_norm": 0.0015320915263146162, | |
| "learning_rate": 1.941538461538462e-06, | |
| "loss": 0.0, | |
| "step": 5870 | |
| }, | |
| { | |
| "epoch": 45.23076923076923, | |
| "grad_norm": 8.862515449523926, | |
| "learning_rate": 1.9107692307692312e-06, | |
| "loss": 0.0028, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 45.30769230769231, | |
| "grad_norm": 0.00041210712515749037, | |
| "learning_rate": 1.8800000000000002e-06, | |
| "loss": 0.0004, | |
| "step": 5890 | |
| }, | |
| { | |
| "epoch": 45.38461538461539, | |
| "grad_norm": 0.000413590605603531, | |
| "learning_rate": 1.8492307692307692e-06, | |
| "loss": 0.0344, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 45.46153846153846, | |
| "grad_norm": 14.184650421142578, | |
| "learning_rate": 1.8184615384615386e-06, | |
| "loss": 0.0512, | |
| "step": 5910 | |
| }, | |
| { | |
| "epoch": 45.53846153846154, | |
| "grad_norm": 0.025255808606743813, | |
| "learning_rate": 1.7876923076923078e-06, | |
| "loss": 0.176, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 45.61538461538461, | |
| "grad_norm": 0.021409081295132637, | |
| "learning_rate": 1.7569230769230772e-06, | |
| "loss": 0.0001, | |
| "step": 5930 | |
| }, | |
| { | |
| "epoch": 45.69230769230769, | |
| "grad_norm": 0.0009863406885415316, | |
| "learning_rate": 1.7261538461538463e-06, | |
| "loss": 0.0001, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 45.76923076923077, | |
| "grad_norm": 4.161138534545898, | |
| "learning_rate": 1.6953846153846153e-06, | |
| "loss": 0.0021, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 45.84615384615385, | |
| "grad_norm": 0.0008297639433294535, | |
| "learning_rate": 1.6646153846153847e-06, | |
| "loss": 0.0001, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 45.92307692307692, | |
| "grad_norm": 0.0011634193360805511, | |
| "learning_rate": 1.6338461538461539e-06, | |
| "loss": 0.0, | |
| "step": 5970 | |
| }, | |
| { | |
| "epoch": 46.0, | |
| "grad_norm": 0.002662894781678915, | |
| "learning_rate": 1.6030769230769233e-06, | |
| "loss": 0.1442, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 46.0, | |
| "eval_accuracy": 0.9849624060150376, | |
| "eval_loss": 0.09411457926034927, | |
| "eval_runtime": 0.5682, | |
| "eval_samples_per_second": 234.062, | |
| "eval_steps_per_second": 29.918, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 46.07692307692308, | |
| "grad_norm": 0.0025213544722646475, | |
| "learning_rate": 1.5723076923076925e-06, | |
| "loss": 0.0, | |
| "step": 5990 | |
| }, | |
| { | |
| "epoch": 46.15384615384615, | |
| "grad_norm": 0.0015282627427950501, | |
| "learning_rate": 1.5415384615384614e-06, | |
| "loss": 0.0, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 46.23076923076923, | |
| "grad_norm": 0.017315922304987907, | |
| "learning_rate": 1.5107692307692308e-06, | |
| "loss": 0.0003, | |
| "step": 6010 | |
| }, | |
| { | |
| "epoch": 46.30769230769231, | |
| "grad_norm": 0.0011600840371102095, | |
| "learning_rate": 1.48e-06, | |
| "loss": 0.0002, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 46.38461538461539, | |
| "grad_norm": 0.10278120636940002, | |
| "learning_rate": 1.4492307692307694e-06, | |
| "loss": 0.0801, | |
| "step": 6030 | |
| }, | |
| { | |
| "epoch": 46.46153846153846, | |
| "grad_norm": 0.006299668923020363, | |
| "learning_rate": 1.4184615384615386e-06, | |
| "loss": 0.109, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 46.53846153846154, | |
| "grad_norm": 0.00047507460112683475, | |
| "learning_rate": 1.3876923076923076e-06, | |
| "loss": 0.018, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 46.61538461538461, | |
| "grad_norm": 0.0016794758848845959, | |
| "learning_rate": 1.356923076923077e-06, | |
| "loss": 0.0001, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 46.69230769230769, | |
| "grad_norm": 0.38092464208602905, | |
| "learning_rate": 1.3261538461538461e-06, | |
| "loss": 0.0451, | |
| "step": 6070 | |
| }, | |
| { | |
| "epoch": 46.76923076923077, | |
| "grad_norm": 0.0006377891404554248, | |
| "learning_rate": 1.2953846153846155e-06, | |
| "loss": 0.0022, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 46.84615384615385, | |
| "grad_norm": 3.781010389328003, | |
| "learning_rate": 1.2646153846153847e-06, | |
| "loss": 0.0008, | |
| "step": 6090 | |
| }, | |
| { | |
| "epoch": 46.92307692307692, | |
| "grad_norm": 16.57982063293457, | |
| "learning_rate": 1.233846153846154e-06, | |
| "loss": 0.0793, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 47.0, | |
| "grad_norm": 0.0005190350348129869, | |
| "learning_rate": 1.2030769230769233e-06, | |
| "loss": 0.0019, | |
| "step": 6110 | |
| }, | |
| { | |
| "epoch": 47.0, | |
| "eval_accuracy": 0.9849624060150376, | |
| "eval_loss": 0.07604598999023438, | |
| "eval_runtime": 0.5685, | |
| "eval_samples_per_second": 233.963, | |
| "eval_steps_per_second": 29.905, | |
| "step": 6110 | |
| }, | |
| { | |
| "epoch": 47.07692307692308, | |
| "grad_norm": 0.0002213361149188131, | |
| "learning_rate": 1.1723076923076925e-06, | |
| "loss": 0.0, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 47.15384615384615, | |
| "grad_norm": 0.003321431577205658, | |
| "learning_rate": 1.1415384615384617e-06, | |
| "loss": 0.0015, | |
| "step": 6130 | |
| }, | |
| { | |
| "epoch": 47.23076923076923, | |
| "grad_norm": 0.005034551955759525, | |
| "learning_rate": 1.1107692307692309e-06, | |
| "loss": 0.0178, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 47.30769230769231, | |
| "grad_norm": 0.002936753910034895, | |
| "learning_rate": 1.08e-06, | |
| "loss": 0.0664, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 47.38461538461539, | |
| "grad_norm": 0.0007592691108584404, | |
| "learning_rate": 1.0492307692307694e-06, | |
| "loss": 0.0001, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 47.46153846153846, | |
| "grad_norm": 0.0010430924594402313, | |
| "learning_rate": 1.0184615384615386e-06, | |
| "loss": 0.0709, | |
| "step": 6170 | |
| }, | |
| { | |
| "epoch": 47.53846153846154, | |
| "grad_norm": 0.00222417083568871, | |
| "learning_rate": 9.876923076923078e-07, | |
| "loss": 0.0001, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 47.61538461538461, | |
| "grad_norm": 0.0008099984261207283, | |
| "learning_rate": 9.56923076923077e-07, | |
| "loss": 0.0, | |
| "step": 6190 | |
| }, | |
| { | |
| "epoch": 47.69230769230769, | |
| "grad_norm": 0.005475889425724745, | |
| "learning_rate": 9.261538461538462e-07, | |
| "loss": 0.0001, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 47.76923076923077, | |
| "grad_norm": 0.0015080670127645135, | |
| "learning_rate": 8.953846153846155e-07, | |
| "loss": 0.0, | |
| "step": 6210 | |
| }, | |
| { | |
| "epoch": 47.84615384615385, | |
| "grad_norm": 0.0009012154769152403, | |
| "learning_rate": 8.646153846153847e-07, | |
| "loss": 0.0013, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 47.92307692307692, | |
| "grad_norm": 0.0014631409430876374, | |
| "learning_rate": 8.338461538461539e-07, | |
| "loss": 0.1119, | |
| "step": 6230 | |
| }, | |
| { | |
| "epoch": 48.0, | |
| "grad_norm": 0.0026774315629154444, | |
| "learning_rate": 8.030769230769231e-07, | |
| "loss": 0.0001, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 48.0, | |
| "eval_accuracy": 0.9849624060150376, | |
| "eval_loss": 0.08850181102752686, | |
| "eval_runtime": 0.559, | |
| "eval_samples_per_second": 237.905, | |
| "eval_steps_per_second": 30.409, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 48.07692307692308, | |
| "grad_norm": 0.005435484927147627, | |
| "learning_rate": 7.723076923076923e-07, | |
| "loss": 0.0017, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 48.15384615384615, | |
| "grad_norm": 0.0006916754064150155, | |
| "learning_rate": 7.415384615384616e-07, | |
| "loss": 0.0001, | |
| "step": 6260 | |
| }, | |
| { | |
| "epoch": 48.23076923076923, | |
| "grad_norm": 0.008791380561888218, | |
| "learning_rate": 7.107692307692309e-07, | |
| "loss": 0.0, | |
| "step": 6270 | |
| }, | |
| { | |
| "epoch": 48.30769230769231, | |
| "grad_norm": 0.0013945720857009292, | |
| "learning_rate": 6.800000000000001e-07, | |
| "loss": 0.0, | |
| "step": 6280 | |
| }, | |
| { | |
| "epoch": 48.38461538461539, | |
| "grad_norm": 0.0008068948518484831, | |
| "learning_rate": 6.492307692307692e-07, | |
| "loss": 0.0189, | |
| "step": 6290 | |
| }, | |
| { | |
| "epoch": 48.46153846153846, | |
| "grad_norm": 0.0035911358427256346, | |
| "learning_rate": 6.184615384615385e-07, | |
| "loss": 0.0001, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 48.53846153846154, | |
| "grad_norm": 0.02670697495341301, | |
| "learning_rate": 5.876923076923077e-07, | |
| "loss": 0.0615, | |
| "step": 6310 | |
| }, | |
| { | |
| "epoch": 48.61538461538461, | |
| "grad_norm": 0.0006088964291848242, | |
| "learning_rate": 5.56923076923077e-07, | |
| "loss": 0.0069, | |
| "step": 6320 | |
| }, | |
| { | |
| "epoch": 48.69230769230769, | |
| "grad_norm": 26.212440490722656, | |
| "learning_rate": 5.261538461538462e-07, | |
| "loss": 0.0589, | |
| "step": 6330 | |
| }, | |
| { | |
| "epoch": 48.76923076923077, | |
| "grad_norm": 0.0010863661300390959, | |
| "learning_rate": 4.953846153846155e-07, | |
| "loss": 0.0001, | |
| "step": 6340 | |
| }, | |
| { | |
| "epoch": 48.84615384615385, | |
| "grad_norm": 0.0006395662785507739, | |
| "learning_rate": 4.6461538461538465e-07, | |
| "loss": 0.0025, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 48.92307692307692, | |
| "grad_norm": 0.000708853651303798, | |
| "learning_rate": 4.3384615384615384e-07, | |
| "loss": 0.0001, | |
| "step": 6360 | |
| }, | |
| { | |
| "epoch": 49.0, | |
| "grad_norm": 0.00027008148026652634, | |
| "learning_rate": 4.0307692307692313e-07, | |
| "loss": 0.0854, | |
| "step": 6370 | |
| }, | |
| { | |
| "epoch": 49.0, | |
| "eval_accuracy": 0.9849624060150376, | |
| "eval_loss": 0.0787627175450325, | |
| "eval_runtime": 0.5704, | |
| "eval_samples_per_second": 233.186, | |
| "eval_steps_per_second": 29.806, | |
| "step": 6370 | |
| }, | |
| { | |
| "epoch": 49.07692307692308, | |
| "grad_norm": 0.0025157821364700794, | |
| "learning_rate": 3.7230769230769236e-07, | |
| "loss": 0.0677, | |
| "step": 6380 | |
| }, | |
| { | |
| "epoch": 49.15384615384615, | |
| "grad_norm": 0.002912031952291727, | |
| "learning_rate": 3.4153846153846155e-07, | |
| "loss": 0.0001, | |
| "step": 6390 | |
| }, | |
| { | |
| "epoch": 49.23076923076923, | |
| "grad_norm": 0.000612029223702848, | |
| "learning_rate": 3.107692307692308e-07, | |
| "loss": 0.0003, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 49.30769230769231, | |
| "grad_norm": 0.001225666725076735, | |
| "learning_rate": 2.8e-07, | |
| "loss": 0.0003, | |
| "step": 6410 | |
| }, | |
| { | |
| "epoch": 49.38461538461539, | |
| "grad_norm": 0.884882390499115, | |
| "learning_rate": 2.4923076923076926e-07, | |
| "loss": 0.0231, | |
| "step": 6420 | |
| }, | |
| { | |
| "epoch": 49.46153846153846, | |
| "grad_norm": 0.0010143619729205966, | |
| "learning_rate": 2.1846153846153847e-07, | |
| "loss": 0.0, | |
| "step": 6430 | |
| }, | |
| { | |
| "epoch": 49.53846153846154, | |
| "grad_norm": 0.028597572818398476, | |
| "learning_rate": 1.8769230769230773e-07, | |
| "loss": 0.0004, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 49.61538461538461, | |
| "grad_norm": 0.0006769668543711305, | |
| "learning_rate": 1.5692307692307694e-07, | |
| "loss": 0.0014, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 49.69230769230769, | |
| "grad_norm": 0.0014201749581843615, | |
| "learning_rate": 1.2615384615384617e-07, | |
| "loss": 0.0, | |
| "step": 6460 | |
| }, | |
| { | |
| "epoch": 49.76923076923077, | |
| "grad_norm": 0.010460024699568748, | |
| "learning_rate": 9.53846153846154e-08, | |
| "loss": 0.1236, | |
| "step": 6470 | |
| }, | |
| { | |
| "epoch": 49.84615384615385, | |
| "grad_norm": 0.036062873899936676, | |
| "learning_rate": 6.461538461538462e-08, | |
| "loss": 0.0367, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 49.92307692307692, | |
| "grad_norm": 0.0007868582615628839, | |
| "learning_rate": 3.384615384615385e-08, | |
| "loss": 0.0051, | |
| "step": 6490 | |
| }, | |
| { | |
| "epoch": 50.0, | |
| "grad_norm": 0.04576790705323219, | |
| "learning_rate": 3.0769230769230774e-09, | |
| "loss": 0.0005, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 50.0, | |
| "eval_accuracy": 0.9849624060150376, | |
| "eval_loss": 0.07072408497333527, | |
| "eval_runtime": 0.5576, | |
| "eval_samples_per_second": 238.518, | |
| "eval_steps_per_second": 30.487, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 50.0, | |
| "step": 6500, | |
| "total_flos": 4.0046594160900096e+18, | |
| "train_loss": 0.0702138227691556, | |
| "train_runtime": 508.0972, | |
| "train_samples_per_second": 101.752, | |
| "train_steps_per_second": 12.793 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 6500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 50, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4.0046594160900096e+18, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |