| { | |
| "best_metric": 1.0, | |
| "best_model_checkpoint": "/content/drive/MyDrive/Colab Notebooks/16_label_check_point/checkpoint-563", | |
| "epoch": 4.997333333333334, | |
| "eval_steps": 500, | |
| "global_step": 2810, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.017777777777777778, | |
| "grad_norm": 10.457254409790039, | |
| "learning_rate": 1.7793594306049826e-06, | |
| "loss": 11.2334, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.035555555555555556, | |
| "grad_norm": 6.316349506378174, | |
| "learning_rate": 3.558718861209965e-06, | |
| "loss": 11.1807, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.05333333333333334, | |
| "grad_norm": 5.264202117919922, | |
| "learning_rate": 5.338078291814947e-06, | |
| "loss": 11.1463, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.07111111111111111, | |
| "grad_norm": 5.205317497253418, | |
| "learning_rate": 7.11743772241993e-06, | |
| "loss": 11.0929, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.08888888888888889, | |
| "grad_norm": 4.696351528167725, | |
| "learning_rate": 8.896797153024912e-06, | |
| "loss": 11.1015, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.10666666666666667, | |
| "grad_norm": 5.9699320793151855, | |
| "learning_rate": 1.0676156583629894e-05, | |
| "loss": 11.0795, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.12444444444444444, | |
| "grad_norm": 7.235191822052002, | |
| "learning_rate": 1.2455516014234877e-05, | |
| "loss": 11.046, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.14222222222222222, | |
| "grad_norm": 14.865583419799805, | |
| "learning_rate": 1.423487544483986e-05, | |
| "loss": 11.0113, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 17.412696838378906, | |
| "learning_rate": 1.601423487544484e-05, | |
| "loss": 10.8502, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.17777777777777778, | |
| "grad_norm": 20.41786003112793, | |
| "learning_rate": 1.7793594306049825e-05, | |
| "loss": 10.5185, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.19555555555555557, | |
| "grad_norm": 24.58249282836914, | |
| "learning_rate": 1.9572953736654805e-05, | |
| "loss": 9.8301, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.21333333333333335, | |
| "grad_norm": 43.27064895629883, | |
| "learning_rate": 2.135231316725979e-05, | |
| "loss": 8.8636, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.2311111111111111, | |
| "grad_norm": 46.2359733581543, | |
| "learning_rate": 2.313167259786477e-05, | |
| "loss": 7.4813, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.24888888888888888, | |
| "grad_norm": 71.36530303955078, | |
| "learning_rate": 2.4911032028469753e-05, | |
| "loss": 6.0609, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.26666666666666666, | |
| "grad_norm": 61.219024658203125, | |
| "learning_rate": 2.669039145907473e-05, | |
| "loss": 4.5934, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.28444444444444444, | |
| "grad_norm": 52.64271545410156, | |
| "learning_rate": 2.846975088967972e-05, | |
| "loss": 3.3049, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.3022222222222222, | |
| "grad_norm": 40.045623779296875, | |
| "learning_rate": 3.02491103202847e-05, | |
| "loss": 2.0759, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 32.63826370239258, | |
| "learning_rate": 3.202846975088968e-05, | |
| "loss": 1.2791, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.3377777777777778, | |
| "grad_norm": 39.16119384765625, | |
| "learning_rate": 3.380782918149467e-05, | |
| "loss": 0.9052, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.35555555555555557, | |
| "grad_norm": 36.06990051269531, | |
| "learning_rate": 3.558718861209965e-05, | |
| "loss": 0.6508, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.37333333333333335, | |
| "grad_norm": 22.00220489501953, | |
| "learning_rate": 3.736654804270463e-05, | |
| "loss": 0.6293, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.39111111111111113, | |
| "grad_norm": 27.334341049194336, | |
| "learning_rate": 3.914590747330961e-05, | |
| "loss": 0.5774, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.4088888888888889, | |
| "grad_norm": 21.130746841430664, | |
| "learning_rate": 4.09252669039146e-05, | |
| "loss": 0.4318, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.4266666666666667, | |
| "grad_norm": 21.37102508544922, | |
| "learning_rate": 4.270462633451958e-05, | |
| "loss": 0.414, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.4444444444444444, | |
| "grad_norm": 86.8355712890625, | |
| "learning_rate": 4.448398576512456e-05, | |
| "loss": 0.4338, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.4622222222222222, | |
| "grad_norm": 9.731348037719727, | |
| "learning_rate": 4.626334519572954e-05, | |
| "loss": 0.3978, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 23.59229278564453, | |
| "learning_rate": 4.8042704626334526e-05, | |
| "loss": 0.2971, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.49777777777777776, | |
| "grad_norm": 10.458292961120605, | |
| "learning_rate": 4.9822064056939506e-05, | |
| "loss": 0.3015, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.5155555555555555, | |
| "grad_norm": 13.044571876525879, | |
| "learning_rate": 4.9822064056939506e-05, | |
| "loss": 0.2013, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.5333333333333333, | |
| "grad_norm": 30.592700958251953, | |
| "learning_rate": 4.962435745353895e-05, | |
| "loss": 0.182, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.5511111111111111, | |
| "grad_norm": 26.582555770874023, | |
| "learning_rate": 4.9426650850138396e-05, | |
| "loss": 0.2276, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.5688888888888889, | |
| "grad_norm": 20.704526901245117, | |
| "learning_rate": 4.9228944246737844e-05, | |
| "loss": 0.1836, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.5866666666666667, | |
| "grad_norm": 29.400476455688477, | |
| "learning_rate": 4.903123764333729e-05, | |
| "loss": 0.2921, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.6044444444444445, | |
| "grad_norm": 23.031789779663086, | |
| "learning_rate": 4.8833531039936733e-05, | |
| "loss": 0.0848, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.6222222222222222, | |
| "grad_norm": 0.9449447393417358, | |
| "learning_rate": 4.863582443653618e-05, | |
| "loss": 0.0741, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 20.334348678588867, | |
| "learning_rate": 4.843811783313563e-05, | |
| "loss": 0.1798, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.6577777777777778, | |
| "grad_norm": 13.533489227294922, | |
| "learning_rate": 4.824041122973508e-05, | |
| "loss": 0.1007, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.6755555555555556, | |
| "grad_norm": 1.5982263088226318, | |
| "learning_rate": 4.8042704626334526e-05, | |
| "loss": 0.1016, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.6933333333333334, | |
| "grad_norm": 2.323336362838745, | |
| "learning_rate": 4.784499802293397e-05, | |
| "loss": 0.1131, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.7111111111111111, | |
| "grad_norm": 24.458276748657227, | |
| "learning_rate": 4.7647291419533415e-05, | |
| "loss": 0.069, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.7288888888888889, | |
| "grad_norm": 29.530794143676758, | |
| "learning_rate": 4.7449584816132864e-05, | |
| "loss": 0.2415, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.7466666666666667, | |
| "grad_norm": 2.0636749267578125, | |
| "learning_rate": 4.725187821273231e-05, | |
| "loss": 0.1173, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.7644444444444445, | |
| "grad_norm": 2.225900888442993, | |
| "learning_rate": 4.705417160933175e-05, | |
| "loss": 0.0918, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.7822222222222223, | |
| "grad_norm": 7.136375904083252, | |
| "learning_rate": 4.68564650059312e-05, | |
| "loss": 0.1589, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 2.6402971744537354, | |
| "learning_rate": 4.665875840253064e-05, | |
| "loss": 0.2547, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.8177777777777778, | |
| "grad_norm": 12.275321960449219, | |
| "learning_rate": 4.64610517991301e-05, | |
| "loss": 0.1082, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.8355555555555556, | |
| "grad_norm": 0.6636475920677185, | |
| "learning_rate": 4.626334519572954e-05, | |
| "loss": 0.0986, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.8533333333333334, | |
| "grad_norm": 27.784332275390625, | |
| "learning_rate": 4.606563859232899e-05, | |
| "loss": 0.1056, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.8711111111111111, | |
| "grad_norm": 22.047527313232422, | |
| "learning_rate": 4.586793198892843e-05, | |
| "loss": 0.1654, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.8888888888888888, | |
| "grad_norm": 15.613493919372559, | |
| "learning_rate": 4.5670225385527876e-05, | |
| "loss": 0.1517, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.9066666666666666, | |
| "grad_norm": 20.039813995361328, | |
| "learning_rate": 4.5472518782127324e-05, | |
| "loss": 0.1283, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.9244444444444444, | |
| "grad_norm": 21.469423294067383, | |
| "learning_rate": 4.527481217872677e-05, | |
| "loss": 0.0707, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.9422222222222222, | |
| "grad_norm": 38.251953125, | |
| "learning_rate": 4.5077105575326214e-05, | |
| "loss": 0.1116, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 12.317658424377441, | |
| "learning_rate": 4.487939897192566e-05, | |
| "loss": 0.138, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.9777777777777777, | |
| "grad_norm": 5.918329238891602, | |
| "learning_rate": 4.468169236852511e-05, | |
| "loss": 0.1053, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.9955555555555555, | |
| "grad_norm": 12.31584358215332, | |
| "learning_rate": 4.448398576512456e-05, | |
| "loss": 0.1354, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_accuracy": 1.0, | |
| "eval_loss": 8.097552927210927e-05, | |
| "eval_runtime": 74.3955, | |
| "eval_samples_per_second": 107.533, | |
| "eval_steps_per_second": 3.36, | |
| "step": 563 | |
| }, | |
| { | |
| "epoch": 1.0124444444444445, | |
| "grad_norm": 10.894996643066406, | |
| "learning_rate": 4.4286279161724006e-05, | |
| "loss": 0.1134, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.0302222222222222, | |
| "grad_norm": 16.037940979003906, | |
| "learning_rate": 4.408857255832345e-05, | |
| "loss": 0.0917, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.048, | |
| "grad_norm": 0.164012148976326, | |
| "learning_rate": 4.3890865954922896e-05, | |
| "loss": 0.0816, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.0657777777777777, | |
| "grad_norm": 0.05601898953318596, | |
| "learning_rate": 4.3693159351522344e-05, | |
| "loss": 0.0778, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.0835555555555556, | |
| "grad_norm": 9.240290641784668, | |
| "learning_rate": 4.349545274812179e-05, | |
| "loss": 0.0893, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.1013333333333333, | |
| "grad_norm": 8.448566436767578, | |
| "learning_rate": 4.3297746144721233e-05, | |
| "loss": 0.0905, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.1191111111111112, | |
| "grad_norm": 1.9537012577056885, | |
| "learning_rate": 4.310003954132068e-05, | |
| "loss": 0.1561, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.1368888888888888, | |
| "grad_norm": 25.13317108154297, | |
| "learning_rate": 4.290233293792013e-05, | |
| "loss": 0.0856, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.1546666666666667, | |
| "grad_norm": 1.3837047815322876, | |
| "learning_rate": 4.270462633451958e-05, | |
| "loss": 0.0962, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.1724444444444444, | |
| "grad_norm": 46.560367584228516, | |
| "learning_rate": 4.250691973111902e-05, | |
| "loss": 0.1009, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.1902222222222223, | |
| "grad_norm": 42.40678787231445, | |
| "learning_rate": 4.230921312771847e-05, | |
| "loss": 0.1027, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.208, | |
| "grad_norm": 1.2823681831359863, | |
| "learning_rate": 4.211150652431791e-05, | |
| "loss": 0.0609, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.2257777777777779, | |
| "grad_norm": 3.9979896545410156, | |
| "learning_rate": 4.1913799920917364e-05, | |
| "loss": 0.0884, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.2435555555555555, | |
| "grad_norm": 23.4843692779541, | |
| "learning_rate": 4.1716093317516805e-05, | |
| "loss": 0.049, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.2613333333333334, | |
| "grad_norm": 0.9539620876312256, | |
| "learning_rate": 4.151838671411625e-05, | |
| "loss": 0.0481, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.279111111111111, | |
| "grad_norm": 0.23285511136054993, | |
| "learning_rate": 4.13206801107157e-05, | |
| "loss": 0.0388, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.2968888888888888, | |
| "grad_norm": 24.404285430908203, | |
| "learning_rate": 4.112297350731515e-05, | |
| "loss": 0.0425, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.3146666666666667, | |
| "grad_norm": 3.143155813217163, | |
| "learning_rate": 4.09252669039146e-05, | |
| "loss": 0.0313, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.3324444444444445, | |
| "grad_norm": 0.3580031991004944, | |
| "learning_rate": 4.072756030051404e-05, | |
| "loss": 0.0466, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.3502222222222222, | |
| "grad_norm": 39.96261978149414, | |
| "learning_rate": 4.052985369711349e-05, | |
| "loss": 0.0706, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.3679999999999999, | |
| "grad_norm": 22.012971878051758, | |
| "learning_rate": 4.033214709371293e-05, | |
| "loss": 0.1249, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.3857777777777778, | |
| "grad_norm": 0.20229700207710266, | |
| "learning_rate": 4.013444049031238e-05, | |
| "loss": 0.1089, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.4035555555555557, | |
| "grad_norm": 36.518348693847656, | |
| "learning_rate": 3.9936733886911825e-05, | |
| "loss": 0.0514, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.4213333333333333, | |
| "grad_norm": 0.011868222616612911, | |
| "learning_rate": 3.973902728351127e-05, | |
| "loss": 0.0298, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.439111111111111, | |
| "grad_norm": 23.455787658691406, | |
| "learning_rate": 3.9541320680110714e-05, | |
| "loss": 0.0365, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.456888888888889, | |
| "grad_norm": 0.5454981923103333, | |
| "learning_rate": 3.934361407671016e-05, | |
| "loss": 0.0603, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.4746666666666668, | |
| "grad_norm": 0.2658223509788513, | |
| "learning_rate": 3.914590747330961e-05, | |
| "loss": 0.0364, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.4924444444444445, | |
| "grad_norm": 32.6451301574707, | |
| "learning_rate": 3.894820086990906e-05, | |
| "loss": 0.1105, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.5102222222222221, | |
| "grad_norm": 0.37322714924812317, | |
| "learning_rate": 3.87504942665085e-05, | |
| "loss": 0.0102, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.528, | |
| "grad_norm": 1.2302775382995605, | |
| "learning_rate": 3.855278766310795e-05, | |
| "loss": 0.093, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.545777777777778, | |
| "grad_norm": 0.11981203407049179, | |
| "learning_rate": 3.8355081059707396e-05, | |
| "loss": 0.0721, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.5635555555555556, | |
| "grad_norm": 0.09180541336536407, | |
| "learning_rate": 3.8157374456306844e-05, | |
| "loss": 0.0408, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.5813333333333333, | |
| "grad_norm": 29.872051239013672, | |
| "learning_rate": 3.7959667852906285e-05, | |
| "loss": 0.0393, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.5991111111111111, | |
| "grad_norm": 1.0710923671722412, | |
| "learning_rate": 3.7761961249505734e-05, | |
| "loss": 0.0023, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.616888888888889, | |
| "grad_norm": 20.386173248291016, | |
| "learning_rate": 3.756425464610518e-05, | |
| "loss": 0.0605, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.6346666666666667, | |
| "grad_norm": 0.03605956584215164, | |
| "learning_rate": 3.736654804270463e-05, | |
| "loss": 0.0059, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.6524444444444444, | |
| "grad_norm": 0.38812369108200073, | |
| "learning_rate": 3.716884143930408e-05, | |
| "loss": 0.0819, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.6702222222222223, | |
| "grad_norm": 27.319766998291016, | |
| "learning_rate": 3.697113483590352e-05, | |
| "loss": 0.0201, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.688, | |
| "grad_norm": 20.58792495727539, | |
| "learning_rate": 3.677342823250297e-05, | |
| "loss": 0.0678, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.7057777777777776, | |
| "grad_norm": 0.33605310320854187, | |
| "learning_rate": 3.6575721629102416e-05, | |
| "loss": 0.0027, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.7235555555555555, | |
| "grad_norm": 0.13025720417499542, | |
| "learning_rate": 3.6378015025701864e-05, | |
| "loss": 0.0029, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.7413333333333334, | |
| "grad_norm": 31.07040023803711, | |
| "learning_rate": 3.6180308422301305e-05, | |
| "loss": 0.0626, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.759111111111111, | |
| "grad_norm": 0.055677346885204315, | |
| "learning_rate": 3.598260181890075e-05, | |
| "loss": 0.0079, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.7768888888888887, | |
| "grad_norm": 0.012918527238070965, | |
| "learning_rate": 3.5784895215500194e-05, | |
| "loss": 0.0308, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.7946666666666666, | |
| "grad_norm": 12.896405220031738, | |
| "learning_rate": 3.558718861209965e-05, | |
| "loss": 0.0551, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 1.8124444444444445, | |
| "grad_norm": 0.0037423851899802685, | |
| "learning_rate": 3.538948200869909e-05, | |
| "loss": 0.0437, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 1.8302222222222222, | |
| "grad_norm": 0.2329370528459549, | |
| "learning_rate": 3.519177540529854e-05, | |
| "loss": 0.0281, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 1.8479999999999999, | |
| "grad_norm": 0.027233602479100227, | |
| "learning_rate": 3.499406880189798e-05, | |
| "loss": 0.0136, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.8657777777777778, | |
| "grad_norm": 0.1924924999475479, | |
| "learning_rate": 3.4796362198497435e-05, | |
| "loss": 0.056, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.8835555555555556, | |
| "grad_norm": 0.02651727944612503, | |
| "learning_rate": 3.4598655595096876e-05, | |
| "loss": 0.0647, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 1.9013333333333333, | |
| "grad_norm": 38.13447570800781, | |
| "learning_rate": 3.4400948991696325e-05, | |
| "loss": 0.0216, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 1.919111111111111, | |
| "grad_norm": 11.278813362121582, | |
| "learning_rate": 3.420324238829577e-05, | |
| "loss": 0.0395, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.9368888888888889, | |
| "grad_norm": 0.5866456031799316, | |
| "learning_rate": 3.4005535784895214e-05, | |
| "loss": 0.0433, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 1.9546666666666668, | |
| "grad_norm": 2.6308796405792236, | |
| "learning_rate": 3.380782918149467e-05, | |
| "loss": 0.0118, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.9724444444444444, | |
| "grad_norm": 0.1729055494070053, | |
| "learning_rate": 3.361012257809411e-05, | |
| "loss": 0.0251, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.9902222222222221, | |
| "grad_norm": 10.363531112670898, | |
| "learning_rate": 3.341241597469356e-05, | |
| "loss": 0.0656, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_accuracy": 1.0, | |
| "eval_loss": 2.5387274945387617e-05, | |
| "eval_runtime": 75.7395, | |
| "eval_samples_per_second": 105.625, | |
| "eval_steps_per_second": 3.301, | |
| "step": 1126 | |
| }, | |
| { | |
| "epoch": 2.010666666666667, | |
| "grad_norm": 29.318574905395508, | |
| "learning_rate": 3.3214709371293e-05, | |
| "loss": 0.2308, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 2.0284444444444443, | |
| "grad_norm": 67.69547271728516, | |
| "learning_rate": 3.301700276789245e-05, | |
| "loss": 0.0604, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 2.046222222222222, | |
| "grad_norm": 0.18736213445663452, | |
| "learning_rate": 3.2819296164491896e-05, | |
| "loss": 0.0546, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 2.064, | |
| "grad_norm": 15.664798736572266, | |
| "learning_rate": 3.2621589561091344e-05, | |
| "loss": 0.0304, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 2.081777777777778, | |
| "grad_norm": 0.13788799941539764, | |
| "learning_rate": 3.2423882957690785e-05, | |
| "loss": 0.0029, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 2.0995555555555554, | |
| "grad_norm": 4.969343185424805, | |
| "learning_rate": 3.2226176354290234e-05, | |
| "loss": 0.0099, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 2.1173333333333333, | |
| "grad_norm": 1.4300692081451416, | |
| "learning_rate": 3.202846975088968e-05, | |
| "loss": 0.0181, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 2.135111111111111, | |
| "grad_norm": 0.019833851605653763, | |
| "learning_rate": 3.183076314748913e-05, | |
| "loss": 0.0107, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 2.152888888888889, | |
| "grad_norm": 0.01474601961672306, | |
| "learning_rate": 3.163305654408857e-05, | |
| "loss": 0.0169, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 2.1706666666666665, | |
| "grad_norm": 0.004971742630004883, | |
| "learning_rate": 3.143534994068802e-05, | |
| "loss": 0.0254, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 2.1884444444444444, | |
| "grad_norm": 0.42502257227897644, | |
| "learning_rate": 3.123764333728747e-05, | |
| "loss": 0.0623, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 2.2062222222222223, | |
| "grad_norm": 30.118955612182617, | |
| "learning_rate": 3.1039936733886916e-05, | |
| "loss": 0.0456, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 2.224, | |
| "grad_norm": 7.1990132331848145, | |
| "learning_rate": 3.0842230130486364e-05, | |
| "loss": 0.0468, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 2.2417777777777776, | |
| "grad_norm": 0.021625256165862083, | |
| "learning_rate": 3.0644523527085805e-05, | |
| "loss": 0.0375, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 2.2595555555555555, | |
| "grad_norm": 0.08051316440105438, | |
| "learning_rate": 3.044681692368525e-05, | |
| "loss": 0.0097, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 2.2773333333333334, | |
| "grad_norm": 38.26923751831055, | |
| "learning_rate": 3.02491103202847e-05, | |
| "loss": 0.0247, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 2.295111111111111, | |
| "grad_norm": 1.5090163946151733, | |
| "learning_rate": 3.0051403716884146e-05, | |
| "loss": 0.0207, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 2.3128888888888888, | |
| "grad_norm": 14.290279388427734, | |
| "learning_rate": 2.985369711348359e-05, | |
| "loss": 0.0041, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.3306666666666667, | |
| "grad_norm": 0.025663571432232857, | |
| "learning_rate": 2.9655990510083035e-05, | |
| "loss": 0.0013, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 2.3484444444444446, | |
| "grad_norm": 13.073821067810059, | |
| "learning_rate": 2.9458283906682484e-05, | |
| "loss": 0.006, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 2.3662222222222224, | |
| "grad_norm": 0.03257250785827637, | |
| "learning_rate": 2.9260577303281932e-05, | |
| "loss": 0.0383, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 2.384, | |
| "grad_norm": 2.0085232257843018, | |
| "learning_rate": 2.906287069988138e-05, | |
| "loss": 0.0552, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 2.401777777777778, | |
| "grad_norm": 19.939329147338867, | |
| "learning_rate": 2.8865164096480825e-05, | |
| "loss": 0.0714, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 2.4195555555555557, | |
| "grad_norm": 0.028250480070710182, | |
| "learning_rate": 2.866745749308027e-05, | |
| "loss": 0.0227, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 2.437333333333333, | |
| "grad_norm": 0.6903110146522522, | |
| "learning_rate": 2.846975088967972e-05, | |
| "loss": 0.0085, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 2.455111111111111, | |
| "grad_norm": 0.22341494262218475, | |
| "learning_rate": 2.8272044286279166e-05, | |
| "loss": 0.0093, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 2.472888888888889, | |
| "grad_norm": 0.015194721519947052, | |
| "learning_rate": 2.807433768287861e-05, | |
| "loss": 0.0413, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 2.490666666666667, | |
| "grad_norm": 0.035975806415081024, | |
| "learning_rate": 2.7876631079478055e-05, | |
| "loss": 0.0013, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.5084444444444447, | |
| "grad_norm": 0.32504796981811523, | |
| "learning_rate": 2.76789244760775e-05, | |
| "loss": 0.0547, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 2.526222222222222, | |
| "grad_norm": 0.34928593039512634, | |
| "learning_rate": 2.748121787267695e-05, | |
| "loss": 0.0556, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 2.544, | |
| "grad_norm": 0.045603252947330475, | |
| "learning_rate": 2.7283511269276396e-05, | |
| "loss": 0.0156, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 2.561777777777778, | |
| "grad_norm": 11.350424766540527, | |
| "learning_rate": 2.708580466587584e-05, | |
| "loss": 0.011, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 2.5795555555555554, | |
| "grad_norm": 0.07788264751434326, | |
| "learning_rate": 2.6888098062475286e-05, | |
| "loss": 0.0607, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 2.5973333333333333, | |
| "grad_norm": 0.06617221236228943, | |
| "learning_rate": 2.669039145907473e-05, | |
| "loss": 0.0102, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 2.615111111111111, | |
| "grad_norm": 34.64754867553711, | |
| "learning_rate": 2.6492684855674182e-05, | |
| "loss": 0.0268, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 2.632888888888889, | |
| "grad_norm": 9.72877311706543, | |
| "learning_rate": 2.6294978252273626e-05, | |
| "loss": 0.0051, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 2.6506666666666665, | |
| "grad_norm": 21.619274139404297, | |
| "learning_rate": 2.609727164887307e-05, | |
| "loss": 0.0092, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 2.6684444444444444, | |
| "grad_norm": 4.081634521484375, | |
| "learning_rate": 2.589956504547252e-05, | |
| "loss": 0.0369, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.6862222222222223, | |
| "grad_norm": 0.009345272555947304, | |
| "learning_rate": 2.5701858442071967e-05, | |
| "loss": 0.018, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 2.7039999999999997, | |
| "grad_norm": 73.03565979003906, | |
| "learning_rate": 2.5504151838671416e-05, | |
| "loss": 0.0945, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 2.7217777777777776, | |
| "grad_norm": 1.0712828636169434, | |
| "learning_rate": 2.530644523527086e-05, | |
| "loss": 0.0156, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 2.7395555555555555, | |
| "grad_norm": 0.023015221580863, | |
| "learning_rate": 2.5108738631870305e-05, | |
| "loss": 0.0408, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 2.7573333333333334, | |
| "grad_norm": 3.0739543437957764, | |
| "learning_rate": 2.4911032028469753e-05, | |
| "loss": 0.0384, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 2.7751111111111113, | |
| "grad_norm": 0.017695285379886627, | |
| "learning_rate": 2.4713325425069198e-05, | |
| "loss": 0.034, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 2.7928888888888888, | |
| "grad_norm": 0.013055549003183842, | |
| "learning_rate": 2.4515618821668646e-05, | |
| "loss": 0.074, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 2.8106666666666666, | |
| "grad_norm": 6.4839582443237305, | |
| "learning_rate": 2.431791221826809e-05, | |
| "loss": 0.0016, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 2.8284444444444445, | |
| "grad_norm": 0.17747992277145386, | |
| "learning_rate": 2.412020561486754e-05, | |
| "loss": 0.0579, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 2.846222222222222, | |
| "grad_norm": 0.07140109688043594, | |
| "learning_rate": 2.3922499011466984e-05, | |
| "loss": 0.0303, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.864, | |
| "grad_norm": 0.0027039784472435713, | |
| "learning_rate": 2.3724792408066432e-05, | |
| "loss": 0.0337, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 2.8817777777777778, | |
| "grad_norm": 0.015552740544080734, | |
| "learning_rate": 2.3527085804665877e-05, | |
| "loss": 0.0131, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 2.8995555555555557, | |
| "grad_norm": 0.014052975922822952, | |
| "learning_rate": 2.332937920126532e-05, | |
| "loss": 0.0434, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 2.9173333333333336, | |
| "grad_norm": 0.7165421843528748, | |
| "learning_rate": 2.313167259786477e-05, | |
| "loss": 0.0069, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 2.935111111111111, | |
| "grad_norm": 0.020382430404424667, | |
| "learning_rate": 2.2933965994464214e-05, | |
| "loss": 0.0419, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 2.952888888888889, | |
| "grad_norm": 0.07127852737903595, | |
| "learning_rate": 2.2736259391063662e-05, | |
| "loss": 0.0303, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 2.970666666666667, | |
| "grad_norm": 2.5529978275299072, | |
| "learning_rate": 2.2538552787663107e-05, | |
| "loss": 0.0063, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 2.9884444444444442, | |
| "grad_norm": 0.0018354392377659678, | |
| "learning_rate": 2.2340846184262555e-05, | |
| "loss": 0.0323, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 2.999111111111111, | |
| "eval_accuracy": 1.0, | |
| "eval_loss": 6.3951174524845555e-06, | |
| "eval_runtime": 39.8564, | |
| "eval_samples_per_second": 200.721, | |
| "eval_steps_per_second": 6.273, | |
| "step": 1686 | |
| }, | |
| { | |
| "epoch": 3.007111111111111, | |
| "grad_norm": 0.0571288987994194, | |
| "learning_rate": 2.2143139580862003e-05, | |
| "loss": 0.0435, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 3.024888888888889, | |
| "grad_norm": 0.015790535137057304, | |
| "learning_rate": 2.1945432977461448e-05, | |
| "loss": 0.0058, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 3.042666666666667, | |
| "grad_norm": 0.9516779184341431, | |
| "learning_rate": 2.1747726374060896e-05, | |
| "loss": 0.0109, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 3.0604444444444443, | |
| "grad_norm": 0.05595998093485832, | |
| "learning_rate": 2.155001977066034e-05, | |
| "loss": 0.01, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 3.078222222222222, | |
| "grad_norm": 0.07043913751840591, | |
| "learning_rate": 2.135231316725979e-05, | |
| "loss": 0.0017, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 3.096, | |
| "grad_norm": 0.00207032123580575, | |
| "learning_rate": 2.1154606563859234e-05, | |
| "loss": 0.0026, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 3.113777777777778, | |
| "grad_norm": 0.0049515170976519585, | |
| "learning_rate": 2.0956899960458682e-05, | |
| "loss": 0.0049, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 3.1315555555555554, | |
| "grad_norm": 0.08053428679704666, | |
| "learning_rate": 2.0759193357058127e-05, | |
| "loss": 0.0169, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 3.1493333333333333, | |
| "grad_norm": 0.0009357993258163333, | |
| "learning_rate": 2.0561486753657575e-05, | |
| "loss": 0.0124, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 3.167111111111111, | |
| "grad_norm": 0.014298039488494396, | |
| "learning_rate": 2.036378015025702e-05, | |
| "loss": 0.0007, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 3.1848888888888887, | |
| "grad_norm": 1.3868434429168701, | |
| "learning_rate": 2.0166073546856464e-05, | |
| "loss": 0.0122, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 3.2026666666666666, | |
| "grad_norm": 0.0033609354868531227, | |
| "learning_rate": 1.9968366943455912e-05, | |
| "loss": 0.0097, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 3.2204444444444444, | |
| "grad_norm": 10.693577766418457, | |
| "learning_rate": 1.9770660340055357e-05, | |
| "loss": 0.0637, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 3.2382222222222223, | |
| "grad_norm": 0.006601002067327499, | |
| "learning_rate": 1.9572953736654805e-05, | |
| "loss": 0.0718, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 3.2560000000000002, | |
| "grad_norm": 0.03210904076695442, | |
| "learning_rate": 1.937524713325425e-05, | |
| "loss": 0.0014, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 3.2737777777777777, | |
| "grad_norm": 0.007109949365258217, | |
| "learning_rate": 1.9177540529853698e-05, | |
| "loss": 0.0065, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 3.2915555555555556, | |
| "grad_norm": 0.0019361014710739255, | |
| "learning_rate": 1.8979833926453143e-05, | |
| "loss": 0.0008, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 3.3093333333333335, | |
| "grad_norm": 0.08277800679206848, | |
| "learning_rate": 1.878212732305259e-05, | |
| "loss": 0.0004, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 3.327111111111111, | |
| "grad_norm": 0.003757915459573269, | |
| "learning_rate": 1.858442071965204e-05, | |
| "loss": 0.0074, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 3.344888888888889, | |
| "grad_norm": 13.0957670211792, | |
| "learning_rate": 1.8386714116251484e-05, | |
| "loss": 0.0177, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 3.3626666666666667, | |
| "grad_norm": 0.07693179696798325, | |
| "learning_rate": 1.8189007512850932e-05, | |
| "loss": 0.0093, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 3.3804444444444446, | |
| "grad_norm": 0.007269900757819414, | |
| "learning_rate": 1.7991300909450377e-05, | |
| "loss": 0.0004, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 3.398222222222222, | |
| "grad_norm": 0.149241641163826, | |
| "learning_rate": 1.7793594306049825e-05, | |
| "loss": 0.0008, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 3.416, | |
| "grad_norm": 0.13814906775951385, | |
| "learning_rate": 1.759588770264927e-05, | |
| "loss": 0.0228, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 3.433777777777778, | |
| "grad_norm": 0.012578528374433517, | |
| "learning_rate": 1.7398181099248718e-05, | |
| "loss": 0.0031, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 3.4515555555555557, | |
| "grad_norm": 0.002241666428744793, | |
| "learning_rate": 1.7200474495848162e-05, | |
| "loss": 0.0018, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 3.469333333333333, | |
| "grad_norm": 0.005206266883760691, | |
| "learning_rate": 1.7002767892447607e-05, | |
| "loss": 0.0469, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 3.487111111111111, | |
| "grad_norm": 11.583967208862305, | |
| "learning_rate": 1.6805061289047055e-05, | |
| "loss": 0.0691, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 3.504888888888889, | |
| "grad_norm": 0.006195690017193556, | |
| "learning_rate": 1.66073546856465e-05, | |
| "loss": 0.0199, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 3.522666666666667, | |
| "grad_norm": 8.7911958694458, | |
| "learning_rate": 1.6409648082245948e-05, | |
| "loss": 0.0023, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 3.5404444444444443, | |
| "grad_norm": 0.002020699204877019, | |
| "learning_rate": 1.6211941478845393e-05, | |
| "loss": 0.0022, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 3.558222222222222, | |
| "grad_norm": 0.006164974998682737, | |
| "learning_rate": 1.601423487544484e-05, | |
| "loss": 0.0004, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 3.576, | |
| "grad_norm": 0.004664299543946981, | |
| "learning_rate": 1.5816528272044286e-05, | |
| "loss": 0.0013, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 3.5937777777777775, | |
| "grad_norm": 0.47931966185569763, | |
| "learning_rate": 1.5618821668643734e-05, | |
| "loss": 0.0003, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 3.6115555555555554, | |
| "grad_norm": 43.603816986083984, | |
| "learning_rate": 1.5421115065243182e-05, | |
| "loss": 0.0085, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 3.6293333333333333, | |
| "grad_norm": 0.0044282907620072365, | |
| "learning_rate": 1.5223408461842625e-05, | |
| "loss": 0.0589, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 3.647111111111111, | |
| "grad_norm": 0.001094396342523396, | |
| "learning_rate": 1.5025701858442073e-05, | |
| "loss": 0.0006, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 3.664888888888889, | |
| "grad_norm": 0.008125217631459236, | |
| "learning_rate": 1.4827995255041518e-05, | |
| "loss": 0.0002, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 3.6826666666666665, | |
| "grad_norm": 19.899005889892578, | |
| "learning_rate": 1.4630288651640966e-05, | |
| "loss": 0.0241, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 3.7004444444444444, | |
| "grad_norm": 0.19454504549503326, | |
| "learning_rate": 1.4432582048240412e-05, | |
| "loss": 0.0005, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 3.7182222222222223, | |
| "grad_norm": 0.003989567514508963, | |
| "learning_rate": 1.423487544483986e-05, | |
| "loss": 0.0045, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 3.7359999999999998, | |
| "grad_norm": 0.0012507745996117592, | |
| "learning_rate": 1.4037168841439305e-05, | |
| "loss": 0.0011, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 3.7537777777777777, | |
| "grad_norm": 0.033174123615026474, | |
| "learning_rate": 1.383946223803875e-05, | |
| "loss": 0.0005, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 3.7715555555555556, | |
| "grad_norm": 0.04268620163202286, | |
| "learning_rate": 1.3641755634638198e-05, | |
| "loss": 0.0111, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 3.7893333333333334, | |
| "grad_norm": 0.041019320487976074, | |
| "learning_rate": 1.3444049031237643e-05, | |
| "loss": 0.0018, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 3.8071111111111113, | |
| "grad_norm": 0.1453588902950287, | |
| "learning_rate": 1.3246342427837091e-05, | |
| "loss": 0.0276, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 3.824888888888889, | |
| "grad_norm": 0.0027844184078276157, | |
| "learning_rate": 1.3048635824436536e-05, | |
| "loss": 0.0018, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 3.8426666666666667, | |
| "grad_norm": 0.0006637629121541977, | |
| "learning_rate": 1.2850929221035984e-05, | |
| "loss": 0.015, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 3.8604444444444446, | |
| "grad_norm": 0.00038110537570901215, | |
| "learning_rate": 1.265322261763543e-05, | |
| "loss": 0.0017, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 3.878222222222222, | |
| "grad_norm": 0.0005764598026871681, | |
| "learning_rate": 1.2455516014234877e-05, | |
| "loss": 0.0443, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 3.896, | |
| "grad_norm": 6.875983238220215, | |
| "learning_rate": 1.2257809410834323e-05, | |
| "loss": 0.0357, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 3.913777777777778, | |
| "grad_norm": 51.81392288208008, | |
| "learning_rate": 1.206010280743377e-05, | |
| "loss": 0.074, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 3.9315555555555557, | |
| "grad_norm": 0.0016795358387753367, | |
| "learning_rate": 1.1862396204033216e-05, | |
| "loss": 0.0046, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 3.9493333333333336, | |
| "grad_norm": 1.101694107055664, | |
| "learning_rate": 1.166468960063266e-05, | |
| "loss": 0.0004, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 3.967111111111111, | |
| "grad_norm": 0.03407168760895729, | |
| "learning_rate": 1.1466982997232107e-05, | |
| "loss": 0.0004, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 3.984888888888889, | |
| "grad_norm": 0.012767530046403408, | |
| "learning_rate": 1.1269276393831553e-05, | |
| "loss": 0.0426, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_accuracy": 1.0, | |
| "eval_loss": 7.458427717210725e-06, | |
| "eval_runtime": 41.6071, | |
| "eval_samples_per_second": 192.275, | |
| "eval_steps_per_second": 6.009, | |
| "step": 2249 | |
| }, | |
| { | |
| "epoch": 4.001777777777778, | |
| "grad_norm": 0.0036902178544551134, | |
| "learning_rate": 1.1071569790431002e-05, | |
| "loss": 0.0003, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 4.019555555555556, | |
| "grad_norm": 0.00998405460268259, | |
| "learning_rate": 1.0873863187030448e-05, | |
| "loss": 0.002, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 4.037333333333334, | |
| "grad_norm": 23.953229904174805, | |
| "learning_rate": 1.0676156583629894e-05, | |
| "loss": 0.0187, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 4.0551111111111116, | |
| "grad_norm": 0.008150537498295307, | |
| "learning_rate": 1.0478449980229341e-05, | |
| "loss": 0.0153, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 4.072888888888889, | |
| "grad_norm": 0.5894471406936646, | |
| "learning_rate": 1.0280743376828787e-05, | |
| "loss": 0.0002, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 4.0906666666666665, | |
| "grad_norm": 0.07007890194654465, | |
| "learning_rate": 1.0083036773428232e-05, | |
| "loss": 0.0367, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 4.108444444444444, | |
| "grad_norm": 0.07020383328199387, | |
| "learning_rate": 9.885330170027678e-06, | |
| "loss": 0.0017, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 4.126222222222222, | |
| "grad_norm": 0.0013433824060484767, | |
| "learning_rate": 9.687623566627125e-06, | |
| "loss": 0.0176, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 4.144, | |
| "grad_norm": 0.0036678831093013287, | |
| "learning_rate": 9.489916963226571e-06, | |
| "loss": 0.001, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 4.161777777777778, | |
| "grad_norm": 20.646207809448242, | |
| "learning_rate": 9.29221035982602e-06, | |
| "loss": 0.0071, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 4.179555555555556, | |
| "grad_norm": 0.004499041475355625, | |
| "learning_rate": 9.094503756425466e-06, | |
| "loss": 0.0004, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 4.197333333333333, | |
| "grad_norm": 0.0007168107549659908, | |
| "learning_rate": 8.896797153024912e-06, | |
| "loss": 0.0025, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 4.215111111111111, | |
| "grad_norm": 0.015021364204585552, | |
| "learning_rate": 8.699090549624359e-06, | |
| "loss": 0.0277, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 4.232888888888889, | |
| "grad_norm": 0.006119410507380962, | |
| "learning_rate": 8.501383946223804e-06, | |
| "loss": 0.0009, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 4.250666666666667, | |
| "grad_norm": 0.0018322835676372051, | |
| "learning_rate": 8.30367734282325e-06, | |
| "loss": 0.0009, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 4.2684444444444445, | |
| "grad_norm": 0.0025883447378873825, | |
| "learning_rate": 8.105970739422696e-06, | |
| "loss": 0.0179, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 4.286222222222222, | |
| "grad_norm": 0.010295086540281773, | |
| "learning_rate": 7.908264136022143e-06, | |
| "loss": 0.0002, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 4.304, | |
| "grad_norm": 0.27159199118614197, | |
| "learning_rate": 7.710557532621591e-06, | |
| "loss": 0.0003, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 4.321777777777778, | |
| "grad_norm": 0.014537914656102657, | |
| "learning_rate": 7.5128509292210365e-06, | |
| "loss": 0.0108, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 4.339555555555555, | |
| "grad_norm": 0.001482433988712728, | |
| "learning_rate": 7.315144325820483e-06, | |
| "loss": 0.0003, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 4.357333333333333, | |
| "grad_norm": 0.0015277402708306909, | |
| "learning_rate": 7.11743772241993e-06, | |
| "loss": 0.0496, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 4.375111111111111, | |
| "grad_norm": 0.005141290370374918, | |
| "learning_rate": 6.919731119019375e-06, | |
| "loss": 0.0001, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 4.392888888888889, | |
| "grad_norm": 27.9423770904541, | |
| "learning_rate": 6.722024515618821e-06, | |
| "loss": 0.0393, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 4.410666666666667, | |
| "grad_norm": 0.010187560692429543, | |
| "learning_rate": 6.524317912218268e-06, | |
| "loss": 0.0001, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 4.428444444444445, | |
| "grad_norm": 0.002554529346525669, | |
| "learning_rate": 6.326611308817715e-06, | |
| "loss": 0.0031, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 4.4462222222222225, | |
| "grad_norm": 4.20240592956543, | |
| "learning_rate": 6.1289047054171615e-06, | |
| "loss": 0.0016, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 4.464, | |
| "grad_norm": 0.013741197995841503, | |
| "learning_rate": 5.931198102016608e-06, | |
| "loss": 0.0003, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 4.481777777777777, | |
| "grad_norm": 0.043951794505119324, | |
| "learning_rate": 5.7334914986160535e-06, | |
| "loss": 0.0125, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 4.499555555555555, | |
| "grad_norm": 0.11376281827688217, | |
| "learning_rate": 5.535784895215501e-06, | |
| "loss": 0.003, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 4.517333333333333, | |
| "grad_norm": 0.05734412372112274, | |
| "learning_rate": 5.338078291814947e-06, | |
| "loss": 0.0009, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 4.535111111111111, | |
| "grad_norm": 0.0010632964549586177, | |
| "learning_rate": 5.140371688414394e-06, | |
| "loss": 0.0196, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 4.552888888888889, | |
| "grad_norm": 0.0036729658022522926, | |
| "learning_rate": 4.942665085013839e-06, | |
| "loss": 0.0071, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 4.570666666666667, | |
| "grad_norm": 5.985267162322998, | |
| "learning_rate": 4.744958481613286e-06, | |
| "loss": 0.0311, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 4.588444444444445, | |
| "grad_norm": 0.996809720993042, | |
| "learning_rate": 4.547251878212733e-06, | |
| "loss": 0.0012, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 4.606222222222222, | |
| "grad_norm": 0.11869648844003677, | |
| "learning_rate": 4.349545274812179e-06, | |
| "loss": 0.0003, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 4.624, | |
| "grad_norm": 0.006143218372017145, | |
| "learning_rate": 4.151838671411625e-06, | |
| "loss": 0.0013, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 4.641777777777778, | |
| "grad_norm": 0.024631284177303314, | |
| "learning_rate": 3.954132068011071e-06, | |
| "loss": 0.0388, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 4.6595555555555555, | |
| "grad_norm": 0.0017836794722825289, | |
| "learning_rate": 3.7564254646105183e-06, | |
| "loss": 0.0015, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 4.677333333333333, | |
| "grad_norm": 0.003801500890403986, | |
| "learning_rate": 3.558718861209965e-06, | |
| "loss": 0.0003, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 4.695111111111111, | |
| "grad_norm": 0.004178278613835573, | |
| "learning_rate": 3.3610122578094107e-06, | |
| "loss": 0.001, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 4.712888888888889, | |
| "grad_norm": 0.0044832993298769, | |
| "learning_rate": 3.1633056544088575e-06, | |
| "loss": 0.0194, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 4.730666666666667, | |
| "grad_norm": 0.0029173328075557947, | |
| "learning_rate": 2.965599051008304e-06, | |
| "loss": 0.0302, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 4.748444444444445, | |
| "grad_norm": 0.0005038917297497392, | |
| "learning_rate": 2.7678924476077504e-06, | |
| "loss": 0.011, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 4.766222222222222, | |
| "grad_norm": 0.01968969963490963, | |
| "learning_rate": 2.570185844207197e-06, | |
| "loss": 0.0002, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 4.784, | |
| "grad_norm": 0.02507755346596241, | |
| "learning_rate": 2.372479240806643e-06, | |
| "loss": 0.0012, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 4.801777777777778, | |
| "grad_norm": 6.288967609405518, | |
| "learning_rate": 2.1747726374060897e-06, | |
| "loss": 0.0195, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 4.819555555555556, | |
| "grad_norm": 0.19547367095947266, | |
| "learning_rate": 1.9770660340055357e-06, | |
| "loss": 0.0034, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 4.8373333333333335, | |
| "grad_norm": 0.007160472217947245, | |
| "learning_rate": 1.7793594306049826e-06, | |
| "loss": 0.0008, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 4.855111111111111, | |
| "grad_norm": 0.1027381643652916, | |
| "learning_rate": 1.5816528272044288e-06, | |
| "loss": 0.0006, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 4.872888888888889, | |
| "grad_norm": 0.6163949966430664, | |
| "learning_rate": 1.3839462238038752e-06, | |
| "loss": 0.0001, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 4.890666666666666, | |
| "grad_norm": 0.0065285759046673775, | |
| "learning_rate": 1.1862396204033214e-06, | |
| "loss": 0.0, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 4.908444444444444, | |
| "grad_norm": 2.0664429664611816, | |
| "learning_rate": 9.885330170027678e-07, | |
| "loss": 0.0033, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 4.926222222222222, | |
| "grad_norm": 0.04097575694322586, | |
| "learning_rate": 7.908264136022144e-07, | |
| "loss": 0.001, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 4.944, | |
| "grad_norm": 0.0015862607397139072, | |
| "learning_rate": 5.931198102016607e-07, | |
| "loss": 0.0001, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 4.961777777777778, | |
| "grad_norm": 0.0021847274620085955, | |
| "learning_rate": 3.954132068011072e-07, | |
| "loss": 0.0006, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 4.979555555555556, | |
| "grad_norm": 0.00527564063668251, | |
| "learning_rate": 1.977066034005536e-07, | |
| "loss": 0.0243, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 4.997333333333334, | |
| "grad_norm": 0.0007434898870997131, | |
| "learning_rate": 0.0, | |
| "loss": 0.0525, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 4.997333333333334, | |
| "eval_accuracy": 1.0, | |
| "eval_loss": 2.338912281629746e-06, | |
| "eval_runtime": 40.6863, | |
| "eval_samples_per_second": 196.626, | |
| "eval_steps_per_second": 6.145, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 4.997333333333334, | |
| "step": 2810, | |
| "total_flos": 8.94051665811918e+18, | |
| "train_loss": 0.011076973860487225, | |
| "train_runtime": 5179.8163, | |
| "train_samples_per_second": 69.501, | |
| "train_steps_per_second": 0.542 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 2810, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 8.94051665811918e+18, | |
| "train_batch_size": 32, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |