| { | |
| "best_global_step": 2000, | |
| "best_metric": 2.3495261116052184, | |
| "best_model_checkpoint": "./SALAMA_NEW7/checkpoint-2000", | |
| "epoch": 0.798881565807869, | |
| "eval_steps": 2000, | |
| "global_step": 2000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.003994407829039345, | |
| "grad_norm": 3.857668399810791, | |
| "learning_rate": 1.8e-07, | |
| "loss": 0.0481, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.00798881565807869, | |
| "grad_norm": 2.135991096496582, | |
| "learning_rate": 3.8e-07, | |
| "loss": 0.0317, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.011983223487118035, | |
| "grad_norm": 2.663736343383789, | |
| "learning_rate": 5.800000000000001e-07, | |
| "loss": 0.0313, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.01597763131615738, | |
| "grad_norm": 6.300492286682129, | |
| "learning_rate": 7.8e-07, | |
| "loss": 0.0442, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.019972039145196723, | |
| "grad_norm": 3.308899164199829, | |
| "learning_rate": 9.800000000000001e-07, | |
| "loss": 0.0378, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.02396644697423607, | |
| "grad_norm": 3.85744309425354, | |
| "learning_rate": 1.1800000000000001e-06, | |
| "loss": 0.0387, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.027960854803275415, | |
| "grad_norm": 2.7074952125549316, | |
| "learning_rate": 1.3800000000000001e-06, | |
| "loss": 0.0411, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.03195526263231476, | |
| "grad_norm": 1.499977707862854, | |
| "learning_rate": 1.5800000000000001e-06, | |
| "loss": 0.0373, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.035949670461354104, | |
| "grad_norm": 3.291985273361206, | |
| "learning_rate": 1.7800000000000001e-06, | |
| "loss": 0.0291, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.039944078290393446, | |
| "grad_norm": 5.130978107452393, | |
| "learning_rate": 1.98e-06, | |
| "loss": 0.035, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.043938486119432796, | |
| "grad_norm": 3.7789242267608643, | |
| "learning_rate": 2.1800000000000003e-06, | |
| "loss": 0.0373, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.04793289394847214, | |
| "grad_norm": 1.6493456363677979, | |
| "learning_rate": 2.38e-06, | |
| "loss": 0.0375, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.05192730177751148, | |
| "grad_norm": 3.3285486698150635, | |
| "learning_rate": 2.5800000000000003e-06, | |
| "loss": 0.0358, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.05592170960655083, | |
| "grad_norm": 2.768843650817871, | |
| "learning_rate": 2.7800000000000005e-06, | |
| "loss": 0.0447, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.05991611743559017, | |
| "grad_norm": 4.878016471862793, | |
| "learning_rate": 2.9800000000000003e-06, | |
| "loss": 0.0388, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.06391052526462952, | |
| "grad_norm": 4.028059959411621, | |
| "learning_rate": 3.1800000000000005e-06, | |
| "loss": 0.0398, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.06790493309366886, | |
| "grad_norm": 3.255234956741333, | |
| "learning_rate": 3.3800000000000007e-06, | |
| "loss": 0.0355, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.07189934092270821, | |
| "grad_norm": 3.8220138549804688, | |
| "learning_rate": 3.58e-06, | |
| "loss": 0.0403, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.07589374875174755, | |
| "grad_norm": 1.4640872478485107, | |
| "learning_rate": 3.7800000000000002e-06, | |
| "loss": 0.0385, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.07988815658078689, | |
| "grad_norm": 3.634939432144165, | |
| "learning_rate": 3.980000000000001e-06, | |
| "loss": 0.0468, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.08388256440982625, | |
| "grad_norm": 3.357348918914795, | |
| "learning_rate": 4.18e-06, | |
| "loss": 0.0337, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.08787697223886559, | |
| "grad_norm": 3.1094183921813965, | |
| "learning_rate": 4.38e-06, | |
| "loss": 0.0359, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.09187138006790493, | |
| "grad_norm": 5.401888847351074, | |
| "learning_rate": 4.58e-06, | |
| "loss": 0.0628, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.09586578789694428, | |
| "grad_norm": 3.3238024711608887, | |
| "learning_rate": 4.78e-06, | |
| "loss": 0.059, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.09986019572598362, | |
| "grad_norm": 2.741633892059326, | |
| "learning_rate": 4.980000000000001e-06, | |
| "loss": 0.0408, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.10385460355502296, | |
| "grad_norm": 2.8601815700531006, | |
| "learning_rate": 5.18e-06, | |
| "loss": 0.0328, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.10784901138406232, | |
| "grad_norm": 2.5481112003326416, | |
| "learning_rate": 5.380000000000001e-06, | |
| "loss": 0.034, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.11184341921310166, | |
| "grad_norm": 2.9293053150177, | |
| "learning_rate": 5.580000000000001e-06, | |
| "loss": 0.0345, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.115837827042141, | |
| "grad_norm": 3.529905319213867, | |
| "learning_rate": 5.78e-06, | |
| "loss": 0.0361, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.11983223487118035, | |
| "grad_norm": 3.364583969116211, | |
| "learning_rate": 5.98e-06, | |
| "loss": 0.0389, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.12382664270021969, | |
| "grad_norm": 4.122386455535889, | |
| "learning_rate": 6.18e-06, | |
| "loss": 0.0588, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.12782105052925904, | |
| "grad_norm": 4.145680904388428, | |
| "learning_rate": 6.380000000000001e-06, | |
| "loss": 0.0495, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.1318154583582984, | |
| "grad_norm": 4.538552761077881, | |
| "learning_rate": 6.5800000000000005e-06, | |
| "loss": 0.0418, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.13580986618733773, | |
| "grad_norm": 3.27231502532959, | |
| "learning_rate": 6.780000000000001e-06, | |
| "loss": 0.037, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.13980427401637707, | |
| "grad_norm": 2.733621597290039, | |
| "learning_rate": 6.98e-06, | |
| "loss": 0.0401, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.14379868184541642, | |
| "grad_norm": 3.7901926040649414, | |
| "learning_rate": 7.180000000000001e-06, | |
| "loss": 0.0414, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.14779308967445576, | |
| "grad_norm": 3.2932889461517334, | |
| "learning_rate": 7.3800000000000005e-06, | |
| "loss": 0.04, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.1517874975034951, | |
| "grad_norm": 2.3562254905700684, | |
| "learning_rate": 7.58e-06, | |
| "loss": 0.0424, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.15578190533253444, | |
| "grad_norm": 3.2661616802215576, | |
| "learning_rate": 7.78e-06, | |
| "loss": 0.035, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.15977631316157379, | |
| "grad_norm": 2.809990167617798, | |
| "learning_rate": 7.980000000000002e-06, | |
| "loss": 0.0404, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.16377072099061313, | |
| "grad_norm": 3.149474620819092, | |
| "learning_rate": 8.18e-06, | |
| "loss": 0.0488, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.1677651288196525, | |
| "grad_norm": 3.6923530101776123, | |
| "learning_rate": 8.380000000000001e-06, | |
| "loss": 0.0515, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.17175953664869184, | |
| "grad_norm": 2.9962356090545654, | |
| "learning_rate": 8.580000000000001e-06, | |
| "loss": 0.0416, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.17575394447773118, | |
| "grad_norm": 4.252007007598877, | |
| "learning_rate": 8.78e-06, | |
| "loss": 0.0448, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.17974835230677053, | |
| "grad_norm": 2.9716057777404785, | |
| "learning_rate": 8.98e-06, | |
| "loss": 0.0368, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.18374276013580987, | |
| "grad_norm": 3.339590072631836, | |
| "learning_rate": 9.180000000000002e-06, | |
| "loss": 0.041, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.1877371679648492, | |
| "grad_norm": 4.591104507446289, | |
| "learning_rate": 9.38e-06, | |
| "loss": 0.0469, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.19173157579388855, | |
| "grad_norm": 3.201215982437134, | |
| "learning_rate": 9.58e-06, | |
| "loss": 0.0403, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.1957259836229279, | |
| "grad_norm": 3.1253278255462646, | |
| "learning_rate": 9.780000000000001e-06, | |
| "loss": 0.0428, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.19972039145196724, | |
| "grad_norm": 2.59318208694458, | |
| "learning_rate": 9.980000000000001e-06, | |
| "loss": 0.0397, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.20371479928100658, | |
| "grad_norm": 3.2373461723327637, | |
| "learning_rate": 9.98716486023959e-06, | |
| "loss": 0.0515, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.20770920711004592, | |
| "grad_norm": 3.1935927867889404, | |
| "learning_rate": 9.972903593839133e-06, | |
| "loss": 0.0466, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.2117036149390853, | |
| "grad_norm": 3.210402011871338, | |
| "learning_rate": 9.958642327438678e-06, | |
| "loss": 0.0441, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.21569802276812464, | |
| "grad_norm": 2.792036533355713, | |
| "learning_rate": 9.944381061038221e-06, | |
| "loss": 0.0317, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.21969243059716398, | |
| "grad_norm": 1.8875095844268799, | |
| "learning_rate": 9.930119794637765e-06, | |
| "loss": 0.035, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.22368683842620332, | |
| "grad_norm": 4.238950252532959, | |
| "learning_rate": 9.91585852823731e-06, | |
| "loss": 0.0656, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.22768124625524266, | |
| "grad_norm": 3.023404359817505, | |
| "learning_rate": 9.901597261836851e-06, | |
| "loss": 0.0511, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.231675654084282, | |
| "grad_norm": 3.4290168285369873, | |
| "learning_rate": 9.887335995436396e-06, | |
| "loss": 0.0496, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.23567006191332135, | |
| "grad_norm": 4.026778697967529, | |
| "learning_rate": 9.87307472903594e-06, | |
| "loss": 0.0394, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.2396644697423607, | |
| "grad_norm": 3.315932512283325, | |
| "learning_rate": 9.858813462635483e-06, | |
| "loss": 0.044, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.24365887757140003, | |
| "grad_norm": 2.7956554889678955, | |
| "learning_rate": 9.844552196235026e-06, | |
| "loss": 0.0414, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.24765328540043938, | |
| "grad_norm": 4.850486755371094, | |
| "learning_rate": 9.83029092983457e-06, | |
| "loss": 0.0522, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.2516476932294787, | |
| "grad_norm": 3.1265628337860107, | |
| "learning_rate": 9.816029663434114e-06, | |
| "loss": 0.0358, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.2556421010585181, | |
| "grad_norm": 2.609273910522461, | |
| "learning_rate": 9.801768397033657e-06, | |
| "loss": 0.0397, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.2596365088875574, | |
| "grad_norm": 4.073995590209961, | |
| "learning_rate": 9.787507130633202e-06, | |
| "loss": 0.0491, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.2636309167165968, | |
| "grad_norm": 2.9083051681518555, | |
| "learning_rate": 9.773245864232744e-06, | |
| "loss": 0.034, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.2676253245456361, | |
| "grad_norm": 4.328786849975586, | |
| "learning_rate": 9.758984597832289e-06, | |
| "loss": 0.0498, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.27161973237467546, | |
| "grad_norm": 3.5342602729797363, | |
| "learning_rate": 9.744723331431832e-06, | |
| "loss": 0.0361, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.2756141402037148, | |
| "grad_norm": 1.7005605697631836, | |
| "learning_rate": 9.730462065031375e-06, | |
| "loss": 0.0512, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.27960854803275415, | |
| "grad_norm": 2.4722962379455566, | |
| "learning_rate": 9.71620079863092e-06, | |
| "loss": 0.0564, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.2836029558617935, | |
| "grad_norm": 4.595683574676514, | |
| "learning_rate": 9.701939532230463e-06, | |
| "loss": 0.0394, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.28759736369083283, | |
| "grad_norm": 4.185388088226318, | |
| "learning_rate": 9.687678265830007e-06, | |
| "loss": 0.0537, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.2915917715198722, | |
| "grad_norm": 2.412919044494629, | |
| "learning_rate": 9.67341699942955e-06, | |
| "loss": 0.0479, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.2955861793489115, | |
| "grad_norm": 3.0715839862823486, | |
| "learning_rate": 9.659155733029095e-06, | |
| "loss": 0.0524, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.2995805871779509, | |
| "grad_norm": 3.375519275665283, | |
| "learning_rate": 9.644894466628636e-06, | |
| "loss": 0.0555, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.3035749950069902, | |
| "grad_norm": 4.457313060760498, | |
| "learning_rate": 9.630633200228181e-06, | |
| "loss": 0.0502, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.30756940283602957, | |
| "grad_norm": 2.67417573928833, | |
| "learning_rate": 9.616371933827725e-06, | |
| "loss": 0.0367, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.3115638106650689, | |
| "grad_norm": 4.992038726806641, | |
| "learning_rate": 9.602110667427268e-06, | |
| "loss": 0.0459, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.31555821849410826, | |
| "grad_norm": 3.235741138458252, | |
| "learning_rate": 9.587849401026813e-06, | |
| "loss": 0.0363, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.31955262632314757, | |
| "grad_norm": 3.169708490371704, | |
| "learning_rate": 9.573588134626356e-06, | |
| "loss": 0.0441, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.32354703415218694, | |
| "grad_norm": 3.8259077072143555, | |
| "learning_rate": 9.5593268682259e-06, | |
| "loss": 0.0449, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.32754144198122626, | |
| "grad_norm": 2.690516948699951, | |
| "learning_rate": 9.545065601825442e-06, | |
| "loss": 0.0529, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.3315358498102656, | |
| "grad_norm": 3.342012643814087, | |
| "learning_rate": 9.530804335424987e-06, | |
| "loss": 0.0381, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.335530257639305, | |
| "grad_norm": 4.141724109649658, | |
| "learning_rate": 9.516543069024529e-06, | |
| "loss": 0.0505, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.3395246654683443, | |
| "grad_norm": 6.260738372802734, | |
| "learning_rate": 9.502281802624074e-06, | |
| "loss": 0.0466, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.3435190732973837, | |
| "grad_norm": 3.450930118560791, | |
| "learning_rate": 9.488020536223617e-06, | |
| "loss": 0.0499, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.347513481126423, | |
| "grad_norm": 3.835606813430786, | |
| "learning_rate": 9.47375926982316e-06, | |
| "loss": 0.0698, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.35150788895546237, | |
| "grad_norm": 5.289712429046631, | |
| "learning_rate": 9.459498003422705e-06, | |
| "loss": 0.0545, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.3555022967845017, | |
| "grad_norm": 3.3518259525299072, | |
| "learning_rate": 9.445236737022249e-06, | |
| "loss": 0.05, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.35949670461354105, | |
| "grad_norm": 2.9629595279693604, | |
| "learning_rate": 9.430975470621792e-06, | |
| "loss": 0.0458, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.36349111244258037, | |
| "grad_norm": 4.516363620758057, | |
| "learning_rate": 9.416714204221335e-06, | |
| "loss": 0.0505, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.36748552027161974, | |
| "grad_norm": 3.87882661819458, | |
| "learning_rate": 9.40245293782088e-06, | |
| "loss": 0.063, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.37147992810065905, | |
| "grad_norm": 5.499217987060547, | |
| "learning_rate": 9.388191671420423e-06, | |
| "loss": 0.0516, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.3754743359296984, | |
| "grad_norm": 2.694190740585327, | |
| "learning_rate": 9.373930405019966e-06, | |
| "loss": 0.0463, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.3794687437587378, | |
| "grad_norm": 3.5198814868927, | |
| "learning_rate": 9.35966913861951e-06, | |
| "loss": 0.0504, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.3834631515877771, | |
| "grad_norm": 3.1238009929656982, | |
| "learning_rate": 9.345407872219053e-06, | |
| "loss": 0.0371, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.3874575594168165, | |
| "grad_norm": 4.268932819366455, | |
| "learning_rate": 9.331146605818598e-06, | |
| "loss": 0.0528, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.3914519672458558, | |
| "grad_norm": 3.029754161834717, | |
| "learning_rate": 9.316885339418141e-06, | |
| "loss": 0.035, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.39544637507489516, | |
| "grad_norm": 4.07280158996582, | |
| "learning_rate": 9.302624073017684e-06, | |
| "loss": 0.0372, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.3994407829039345, | |
| "grad_norm": 2.339977741241455, | |
| "learning_rate": 9.288362806617228e-06, | |
| "loss": 0.0397, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.40343519073297385, | |
| "grad_norm": 4.685292720794678, | |
| "learning_rate": 9.274101540216773e-06, | |
| "loss": 0.0498, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.40742959856201316, | |
| "grad_norm": 3.811983346939087, | |
| "learning_rate": 9.259840273816316e-06, | |
| "loss": 0.042, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.41142400639105253, | |
| "grad_norm": 4.947136878967285, | |
| "learning_rate": 9.245579007415859e-06, | |
| "loss": 0.0509, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.41541841422009185, | |
| "grad_norm": 3.673635482788086, | |
| "learning_rate": 9.231317741015402e-06, | |
| "loss": 0.0465, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.4194128220491312, | |
| "grad_norm": 4.268950462341309, | |
| "learning_rate": 9.217056474614946e-06, | |
| "loss": 0.0461, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.4234072298781706, | |
| "grad_norm": 5.4644927978515625, | |
| "learning_rate": 9.20279520821449e-06, | |
| "loss": 0.0585, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.4274016377072099, | |
| "grad_norm": 2.472513437271118, | |
| "learning_rate": 9.188533941814034e-06, | |
| "loss": 0.0397, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.4313960455362493, | |
| "grad_norm": 5.088723659515381, | |
| "learning_rate": 9.174272675413579e-06, | |
| "loss": 0.0578, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.4353904533652886, | |
| "grad_norm": 5.186155796051025, | |
| "learning_rate": 9.16001140901312e-06, | |
| "loss": 0.0479, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.43938486119432796, | |
| "grad_norm": 2.81628155708313, | |
| "learning_rate": 9.145750142612665e-06, | |
| "loss": 0.0458, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.4433792690233673, | |
| "grad_norm": 2.2827396392822266, | |
| "learning_rate": 9.131488876212208e-06, | |
| "loss": 0.0395, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.44737367685240664, | |
| "grad_norm": 4.735171318054199, | |
| "learning_rate": 9.117227609811752e-06, | |
| "loss": 0.0512, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.45136808468144596, | |
| "grad_norm": 2.573768138885498, | |
| "learning_rate": 9.102966343411297e-06, | |
| "loss": 0.04, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.45536249251048533, | |
| "grad_norm": 3.1118862628936768, | |
| "learning_rate": 9.08870507701084e-06, | |
| "loss": 0.0662, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.45935690033952464, | |
| "grad_norm": 4.575135231018066, | |
| "learning_rate": 9.074443810610383e-06, | |
| "loss": 0.0412, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.463351308168564, | |
| "grad_norm": 3.333889961242676, | |
| "learning_rate": 9.060182544209926e-06, | |
| "loss": 0.0422, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.46734571599760333, | |
| "grad_norm": 4.380234241485596, | |
| "learning_rate": 9.045921277809471e-06, | |
| "loss": 0.0678, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.4713401238266427, | |
| "grad_norm": 3.1090171337127686, | |
| "learning_rate": 9.031660011409013e-06, | |
| "loss": 0.0419, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.47533453165568207, | |
| "grad_norm": 3.1724114418029785, | |
| "learning_rate": 9.017398745008558e-06, | |
| "loss": 0.0471, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.4793289394847214, | |
| "grad_norm": 3.701120138168335, | |
| "learning_rate": 9.003137478608101e-06, | |
| "loss": 0.0431, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.48332334731376075, | |
| "grad_norm": 1.9329041242599487, | |
| "learning_rate": 8.988876212207644e-06, | |
| "loss": 0.032, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.48731775514280007, | |
| "grad_norm": 4.282286167144775, | |
| "learning_rate": 8.97461494580719e-06, | |
| "loss": 0.0469, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.49131216297183944, | |
| "grad_norm": 2.6785926818847656, | |
| "learning_rate": 8.960353679406733e-06, | |
| "loss": 0.0643, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.49530657080087875, | |
| "grad_norm": 5.211093425750732, | |
| "learning_rate": 8.946092413006276e-06, | |
| "loss": 0.0579, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.4993009786299181, | |
| "grad_norm": 4.612974643707275, | |
| "learning_rate": 8.931831146605819e-06, | |
| "loss": 0.0618, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.5032953864589574, | |
| "grad_norm": 4.7832441329956055, | |
| "learning_rate": 8.917569880205364e-06, | |
| "loss": 0.0567, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.5072897942879968, | |
| "grad_norm": 3.8425779342651367, | |
| "learning_rate": 8.903308613804906e-06, | |
| "loss": 0.0489, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.5112842021170362, | |
| "grad_norm": 2.750922679901123, | |
| "learning_rate": 8.88904734740445e-06, | |
| "loss": 0.0438, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.5152786099460755, | |
| "grad_norm": 3.5208797454833984, | |
| "learning_rate": 8.874786081003994e-06, | |
| "loss": 0.0472, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.5192730177751148, | |
| "grad_norm": 5.142436981201172, | |
| "learning_rate": 8.860524814603537e-06, | |
| "loss": 0.0622, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.5232674256041542, | |
| "grad_norm": 3.287731885910034, | |
| "learning_rate": 8.846263548203082e-06, | |
| "loss": 0.0476, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.5272618334331935, | |
| "grad_norm": 1.9734798669815063, | |
| "learning_rate": 8.832002281802625e-06, | |
| "loss": 0.0485, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.5312562412622329, | |
| "grad_norm": 4.245563507080078, | |
| "learning_rate": 8.817741015402168e-06, | |
| "loss": 0.0572, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.5352506490912722, | |
| "grad_norm": 3.6561875343322754, | |
| "learning_rate": 8.803479749001712e-06, | |
| "loss": 0.05, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.5392450569203115, | |
| "grad_norm": 3.4770493507385254, | |
| "learning_rate": 8.789218482601257e-06, | |
| "loss": 0.0455, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.5432394647493509, | |
| "grad_norm": 2.5556652545928955, | |
| "learning_rate": 8.7749572162008e-06, | |
| "loss": 0.0396, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.5472338725783903, | |
| "grad_norm": 3.5269246101379395, | |
| "learning_rate": 8.760695949800343e-06, | |
| "loss": 0.0435, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.5512282804074295, | |
| "grad_norm": 3.7632479667663574, | |
| "learning_rate": 8.746434683399886e-06, | |
| "loss": 0.0458, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.5552226882364689, | |
| "grad_norm": 4.281711578369141, | |
| "learning_rate": 8.73217341699943e-06, | |
| "loss": 0.0511, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.5592170960655083, | |
| "grad_norm": 3.2763116359710693, | |
| "learning_rate": 8.717912150598975e-06, | |
| "loss": 0.0405, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.5632115038945477, | |
| "grad_norm": 3.267866849899292, | |
| "learning_rate": 8.703650884198518e-06, | |
| "loss": 0.0393, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.567205911723587, | |
| "grad_norm": 3.6679697036743164, | |
| "learning_rate": 8.689389617798061e-06, | |
| "loss": 0.06, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.5712003195526263, | |
| "grad_norm": 2.4105310440063477, | |
| "learning_rate": 8.675128351397604e-06, | |
| "loss": 0.0406, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.5751947273816657, | |
| "grad_norm": 4.512986183166504, | |
| "learning_rate": 8.66086708499715e-06, | |
| "loss": 0.0522, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.579189135210705, | |
| "grad_norm": 2.781480550765991, | |
| "learning_rate": 8.646605818596692e-06, | |
| "loss": 0.0575, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.5831835430397444, | |
| "grad_norm": 2.455273151397705, | |
| "learning_rate": 8.632344552196236e-06, | |
| "loss": 0.0515, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.5871779508687837, | |
| "grad_norm": 3.5020835399627686, | |
| "learning_rate": 8.618083285795779e-06, | |
| "loss": 0.0493, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.591172358697823, | |
| "grad_norm": 2.549257278442383, | |
| "learning_rate": 8.603822019395322e-06, | |
| "loss": 0.0305, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.5951667665268624, | |
| "grad_norm": 3.308424949645996, | |
| "learning_rate": 8.589560752994867e-06, | |
| "loss": 0.0394, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.5991611743559018, | |
| "grad_norm": 4.140067100524902, | |
| "learning_rate": 8.57529948659441e-06, | |
| "loss": 0.0452, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.603155582184941, | |
| "grad_norm": 4.411773204803467, | |
| "learning_rate": 8.561038220193954e-06, | |
| "loss": 0.0564, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.6071499900139804, | |
| "grad_norm": 2.6754167079925537, | |
| "learning_rate": 8.546776953793497e-06, | |
| "loss": 0.0398, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.6111443978430198, | |
| "grad_norm": 2.152740001678467, | |
| "learning_rate": 8.532515687393042e-06, | |
| "loss": 0.0417, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.6151388056720591, | |
| "grad_norm": 2.487994909286499, | |
| "learning_rate": 8.518254420992585e-06, | |
| "loss": 0.0475, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.6191332135010985, | |
| "grad_norm": 1.9966262578964233, | |
| "learning_rate": 8.503993154592128e-06, | |
| "loss": 0.0456, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.6231276213301378, | |
| "grad_norm": 2.2700722217559814, | |
| "learning_rate": 8.489731888191672e-06, | |
| "loss": 0.0498, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.6271220291591771, | |
| "grad_norm": 4.674302101135254, | |
| "learning_rate": 8.475470621791215e-06, | |
| "loss": 0.0531, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.6311164369882165, | |
| "grad_norm": 3.2880172729492188, | |
| "learning_rate": 8.46120935539076e-06, | |
| "loss": 0.0418, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.6351108448172559, | |
| "grad_norm": 4.65671968460083, | |
| "learning_rate": 8.446948088990303e-06, | |
| "loss": 0.0429, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.6391052526462951, | |
| "grad_norm": 2.744880199432373, | |
| "learning_rate": 8.432686822589846e-06, | |
| "loss": 0.038, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.6430996604753345, | |
| "grad_norm": 3.2633752822875977, | |
| "learning_rate": 8.41842555618939e-06, | |
| "loss": 0.0508, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.6470940683043739, | |
| "grad_norm": 4.094485759735107, | |
| "learning_rate": 8.404164289788934e-06, | |
| "loss": 0.0482, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.6510884761334133, | |
| "grad_norm": 3.6843981742858887, | |
| "learning_rate": 8.389903023388478e-06, | |
| "loss": 0.0426, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.6550828839624525, | |
| "grad_norm": 2.887274742126465, | |
| "learning_rate": 8.375641756988021e-06, | |
| "loss": 0.051, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.6590772917914919, | |
| "grad_norm": 3.3279778957366943, | |
| "learning_rate": 8.361380490587566e-06, | |
| "loss": 0.0378, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.6630716996205313, | |
| "grad_norm": 3.5682291984558105, | |
| "learning_rate": 8.347119224187107e-06, | |
| "loss": 0.054, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.6670661074495706, | |
| "grad_norm": 4.352251052856445, | |
| "learning_rate": 8.332857957786652e-06, | |
| "loss": 0.0431, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.67106051527861, | |
| "grad_norm": 2.8711681365966797, | |
| "learning_rate": 8.318596691386196e-06, | |
| "loss": 0.0351, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.6750549231076493, | |
| "grad_norm": 3.0659475326538086, | |
| "learning_rate": 8.304335424985739e-06, | |
| "loss": 0.0413, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.6790493309366886, | |
| "grad_norm": 5.653151988983154, | |
| "learning_rate": 8.290074158585282e-06, | |
| "loss": 0.0526, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.683043738765728, | |
| "grad_norm": 7.713840007781982, | |
| "learning_rate": 8.275812892184827e-06, | |
| "loss": 0.0495, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.6870381465947674, | |
| "grad_norm": 2.798320770263672, | |
| "learning_rate": 8.26155162578437e-06, | |
| "loss": 0.0436, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.6910325544238066, | |
| "grad_norm": 3.696972131729126, | |
| "learning_rate": 8.247290359383914e-06, | |
| "loss": 0.0413, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.695026962252846, | |
| "grad_norm": 2.2935080528259277, | |
| "learning_rate": 8.233029092983458e-06, | |
| "loss": 0.0517, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.6990213700818854, | |
| "grad_norm": 4.169670581817627, | |
| "learning_rate": 8.218767826583002e-06, | |
| "loss": 0.0499, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.7030157779109247, | |
| "grad_norm": 4.221024036407471, | |
| "learning_rate": 8.204506560182545e-06, | |
| "loss": 0.041, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.7070101857399641, | |
| "grad_norm": 4.128349781036377, | |
| "learning_rate": 8.190245293782088e-06, | |
| "loss": 0.0598, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.7110045935690034, | |
| "grad_norm": 3.2227325439453125, | |
| "learning_rate": 8.175984027381633e-06, | |
| "loss": 0.0468, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.7149990013980427, | |
| "grad_norm": 2.807412624359131, | |
| "learning_rate": 8.161722760981175e-06, | |
| "loss": 0.0419, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.7189934092270821, | |
| "grad_norm": 3.685307741165161, | |
| "learning_rate": 8.14746149458072e-06, | |
| "loss": 0.05, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.7229878170561215, | |
| "grad_norm": 3.1187121868133545, | |
| "learning_rate": 8.133200228180263e-06, | |
| "loss": 0.0483, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.7269822248851607, | |
| "grad_norm": 3.4114596843719482, | |
| "learning_rate": 8.118938961779806e-06, | |
| "loss": 0.0494, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.7309766327142001, | |
| "grad_norm": 1.188926100730896, | |
| "learning_rate": 8.104677695379351e-06, | |
| "loss": 0.0366, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.7349710405432395, | |
| "grad_norm": 3.5981297492980957, | |
| "learning_rate": 8.090416428978894e-06, | |
| "loss": 0.0503, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.7389654483722788, | |
| "grad_norm": 3.0543458461761475, | |
| "learning_rate": 8.076155162578438e-06, | |
| "loss": 0.0495, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.7429598562013181, | |
| "grad_norm": 3.7244555950164795, | |
| "learning_rate": 8.06189389617798e-06, | |
| "loss": 0.0496, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.7469542640303575, | |
| "grad_norm": 2.367532968521118, | |
| "learning_rate": 8.047632629777526e-06, | |
| "loss": 0.0418, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.7509486718593968, | |
| "grad_norm": 2.6962149143218994, | |
| "learning_rate": 8.033371363377069e-06, | |
| "loss": 0.0379, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.7549430796884362, | |
| "grad_norm": 3.7440872192382812, | |
| "learning_rate": 8.019110096976612e-06, | |
| "loss": 0.0411, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.7589374875174756, | |
| "grad_norm": 3.3427021503448486, | |
| "learning_rate": 8.004848830576156e-06, | |
| "loss": 0.0367, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.7629318953465148, | |
| "grad_norm": 3.3870105743408203, | |
| "learning_rate": 7.990587564175699e-06, | |
| "loss": 0.0575, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.7669263031755542, | |
| "grad_norm": 3.3747177124023438, | |
| "learning_rate": 7.976326297775244e-06, | |
| "loss": 0.0455, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.7709207110045936, | |
| "grad_norm": 2.9369425773620605, | |
| "learning_rate": 7.962065031374787e-06, | |
| "loss": 0.0474, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.774915118833633, | |
| "grad_norm": 2.0219178199768066, | |
| "learning_rate": 7.94780376497433e-06, | |
| "loss": 0.0496, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.7789095266626722, | |
| "grad_norm": 3.041201591491699, | |
| "learning_rate": 7.933542498573873e-06, | |
| "loss": 0.0386, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.7829039344917116, | |
| "grad_norm": 3.874758243560791, | |
| "learning_rate": 7.919281232173418e-06, | |
| "loss": 0.0331, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.786898342320751, | |
| "grad_norm": 2.3752293586730957, | |
| "learning_rate": 7.905019965772962e-06, | |
| "loss": 0.0477, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.7908927501497903, | |
| "grad_norm": 4.09476900100708, | |
| "learning_rate": 7.890758699372505e-06, | |
| "loss": 0.051, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.7948871579788296, | |
| "grad_norm": 1.380469799041748, | |
| "learning_rate": 7.876497432972048e-06, | |
| "loss": 0.0313, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.798881565807869, | |
| "grad_norm": 2.8417437076568604, | |
| "learning_rate": 7.862236166571591e-06, | |
| "loss": 0.0444, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.798881565807869, | |
| "eval_loss": 0.02616371586918831, | |
| "eval_runtime": 9253.1401, | |
| "eval_samples_per_second": 2.164, | |
| "eval_steps_per_second": 0.271, | |
| "eval_wer": 2.3495261116052184, | |
| "step": 2000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 7512, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 2000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.846946562048e+19, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |