| { | |
| "best_global_step": 9908, | |
| "best_metric": 1.7698270082473755, | |
| "best_model_checkpoint": "./mcqa_qwen3_letter_best/checkpoint-9908", | |
| "epoch": 2.0, | |
| "eval_steps": 500, | |
| "global_step": 9908, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.010092854259184497, | |
| "grad_norm": 48.55782699584961, | |
| "learning_rate": 8.879919273461152e-07, | |
| "loss": 2.3985, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.020185708518368994, | |
| "grad_norm": 25.696617126464844, | |
| "learning_rate": 1.8970736629667005e-06, | |
| "loss": 2.053, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.030278562777553492, | |
| "grad_norm": 27.860021591186523, | |
| "learning_rate": 2.906155398587286e-06, | |
| "loss": 1.9305, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.04037141703673799, | |
| "grad_norm": 17.68500518798828, | |
| "learning_rate": 3.915237134207871e-06, | |
| "loss": 1.9294, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.050464271295922486, | |
| "grad_norm": 26.112218856811523, | |
| "learning_rate": 4.924318869828457e-06, | |
| "loss": 1.8834, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.060557125555106985, | |
| "grad_norm": 25.835376739501953, | |
| "learning_rate": 5.933400605449042e-06, | |
| "loss": 1.8517, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.07064997981429148, | |
| "grad_norm": 22.44589614868164, | |
| "learning_rate": 6.942482341069627e-06, | |
| "loss": 1.8978, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.08074283407347597, | |
| "grad_norm": 32.82951354980469, | |
| "learning_rate": 7.951564076690212e-06, | |
| "loss": 1.8867, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.09083568833266048, | |
| "grad_norm": 35.665794372558594, | |
| "learning_rate": 8.960645812310798e-06, | |
| "loss": 1.9055, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.10092854259184497, | |
| "grad_norm": 22.500865936279297, | |
| "learning_rate": 9.969727547931384e-06, | |
| "loss": 1.8755, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.11102139685102948, | |
| "grad_norm": 40.59410095214844, | |
| "learning_rate": 1.0978809283551967e-05, | |
| "loss": 1.8881, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.12111425111021397, | |
| "grad_norm": 28.769454956054688, | |
| "learning_rate": 1.1987891019172555e-05, | |
| "loss": 1.8713, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.13120710536939847, | |
| "grad_norm": 17.596820831298828, | |
| "learning_rate": 1.299697275479314e-05, | |
| "loss": 1.8694, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.14129995962858297, | |
| "grad_norm": 17.149999618530273, | |
| "learning_rate": 1.4006054490413725e-05, | |
| "loss": 1.8809, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.15139281388776746, | |
| "grad_norm": 19.181955337524414, | |
| "learning_rate": 1.5015136226034311e-05, | |
| "loss": 1.8697, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.16148566814695195, | |
| "grad_norm": 24.227073669433594, | |
| "learning_rate": 1.6024217961654894e-05, | |
| "loss": 1.9201, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.17157852240613647, | |
| "grad_norm": 18.42403221130371, | |
| "learning_rate": 1.703329969727548e-05, | |
| "loss": 1.8876, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.18167137666532096, | |
| "grad_norm": 21.015230178833008, | |
| "learning_rate": 1.8042381432896066e-05, | |
| "loss": 1.8697, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.19176423092450545, | |
| "grad_norm": 16.02488899230957, | |
| "learning_rate": 1.905146316851665e-05, | |
| "loss": 1.9102, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.20185708518368994, | |
| "grad_norm": 25.045923233032227, | |
| "learning_rate": 1.9993271279578333e-05, | |
| "loss": 1.9121, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.21194993944287444, | |
| "grad_norm": 17.414430618286133, | |
| "learning_rate": 1.9881125939217227e-05, | |
| "loss": 1.9449, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.22204279370205895, | |
| "grad_norm": 15.37423324584961, | |
| "learning_rate": 1.976898059885612e-05, | |
| "loss": 1.9139, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.23213564796124345, | |
| "grad_norm": 20.543489456176758, | |
| "learning_rate": 1.965683525849501e-05, | |
| "loss": 1.92, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.24222850222042794, | |
| "grad_norm": 12.01870346069336, | |
| "learning_rate": 1.9544689918133902e-05, | |
| "loss": 1.8962, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.25232135647961246, | |
| "grad_norm": 15.475773811340332, | |
| "learning_rate": 1.9432544577772796e-05, | |
| "loss": 1.9483, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.26241421073879695, | |
| "grad_norm": 11.753213882446289, | |
| "learning_rate": 1.9320399237411686e-05, | |
| "loss": 1.919, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.27250706499798144, | |
| "grad_norm": 14.90489673614502, | |
| "learning_rate": 1.920825389705058e-05, | |
| "loss": 1.8742, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.28259991925716593, | |
| "grad_norm": 12.925189971923828, | |
| "learning_rate": 1.909610855668947e-05, | |
| "loss": 1.8822, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.2926927735163504, | |
| "grad_norm": 17.215579986572266, | |
| "learning_rate": 1.898396321632836e-05, | |
| "loss": 1.8796, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.3027856277755349, | |
| "grad_norm": 16.483861923217773, | |
| "learning_rate": 1.8871817875967255e-05, | |
| "loss": 1.8442, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.3128784820347194, | |
| "grad_norm": 18.10808753967285, | |
| "learning_rate": 1.875967253560615e-05, | |
| "loss": 1.9131, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.3229713362939039, | |
| "grad_norm": 14.261265754699707, | |
| "learning_rate": 1.864752719524504e-05, | |
| "loss": 1.7602, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.3330641905530884, | |
| "grad_norm": 16.223392486572266, | |
| "learning_rate": 1.8535381854883933e-05, | |
| "loss": 1.8392, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.34315704481227294, | |
| "grad_norm": 14.012106895446777, | |
| "learning_rate": 1.8423236514522824e-05, | |
| "loss": 1.8335, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.35324989907145743, | |
| "grad_norm": 13.234374046325684, | |
| "learning_rate": 1.8311091174161714e-05, | |
| "loss": 1.8501, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.3633427533306419, | |
| "grad_norm": 11.787166595458984, | |
| "learning_rate": 1.8198945833800608e-05, | |
| "loss": 1.8704, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.3734356075898264, | |
| "grad_norm": 15.64974308013916, | |
| "learning_rate": 1.80868004934395e-05, | |
| "loss": 1.85, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.3835284618490109, | |
| "grad_norm": 13.893998146057129, | |
| "learning_rate": 1.7974655153078392e-05, | |
| "loss": 1.8807, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.3936213161081954, | |
| "grad_norm": 15.42603588104248, | |
| "learning_rate": 1.7862509812717283e-05, | |
| "loss": 1.8124, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.4037141703673799, | |
| "grad_norm": 12.293023109436035, | |
| "learning_rate": 1.7750364472356173e-05, | |
| "loss": 1.8112, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.4138070246265644, | |
| "grad_norm": 17.576618194580078, | |
| "learning_rate": 1.7638219131995067e-05, | |
| "loss": 1.8468, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.42389987888574887, | |
| "grad_norm": 36.62916946411133, | |
| "learning_rate": 1.752607379163396e-05, | |
| "loss": 1.8563, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.43399273314493336, | |
| "grad_norm": 12.232354164123535, | |
| "learning_rate": 1.741392845127285e-05, | |
| "loss": 1.8643, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.4440855874041179, | |
| "grad_norm": 9.772968292236328, | |
| "learning_rate": 1.7301783110911742e-05, | |
| "loss": 1.8686, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.4541784416633024, | |
| "grad_norm": 13.78654956817627, | |
| "learning_rate": 1.7189637770550636e-05, | |
| "loss": 1.8477, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.4642712959224869, | |
| "grad_norm": 14.448091506958008, | |
| "learning_rate": 1.7077492430189526e-05, | |
| "loss": 1.828, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.4743641501816714, | |
| "grad_norm": 10.872529983520508, | |
| "learning_rate": 1.696534708982842e-05, | |
| "loss": 1.7916, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.4844570044408559, | |
| "grad_norm": 14.716806411743164, | |
| "learning_rate": 1.685320174946731e-05, | |
| "loss": 1.7982, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.49454985870004037, | |
| "grad_norm": 15.155656814575195, | |
| "learning_rate": 1.67410564091062e-05, | |
| "loss": 1.8422, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.5046427129592249, | |
| "grad_norm": 11.369612693786621, | |
| "learning_rate": 1.6628911068745095e-05, | |
| "loss": 1.8217, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.5147355672184094, | |
| "grad_norm": 15.491066932678223, | |
| "learning_rate": 1.651676572838399e-05, | |
| "loss": 1.8487, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.5248284214775939, | |
| "grad_norm": 12.249984741210938, | |
| "learning_rate": 1.640462038802288e-05, | |
| "loss": 1.7951, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.5349212757367784, | |
| "grad_norm": 14.075465202331543, | |
| "learning_rate": 1.629247504766177e-05, | |
| "loss": 1.8115, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.5450141299959629, | |
| "grad_norm": 9.785154342651367, | |
| "learning_rate": 1.6180329707300664e-05, | |
| "loss": 1.8576, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.5551069842551474, | |
| "grad_norm": 14.559487342834473, | |
| "learning_rate": 1.6068184366939554e-05, | |
| "loss": 1.8263, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.5651998385143319, | |
| "grad_norm": 15.150165557861328, | |
| "learning_rate": 1.5956039026578448e-05, | |
| "loss": 1.8029, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.5752926927735164, | |
| "grad_norm": 13.863632202148438, | |
| "learning_rate": 1.584389368621734e-05, | |
| "loss": 1.7863, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.5853855470327008, | |
| "grad_norm": 9.358270645141602, | |
| "learning_rate": 1.573174834585623e-05, | |
| "loss": 1.806, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.5954784012918853, | |
| "grad_norm": 12.770975112915039, | |
| "learning_rate": 1.5619603005495123e-05, | |
| "loss": 1.7417, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.6055712555510698, | |
| "grad_norm": 12.026569366455078, | |
| "learning_rate": 1.5507457665134017e-05, | |
| "loss": 1.7623, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.6156641098102543, | |
| "grad_norm": 9.8405122756958, | |
| "learning_rate": 1.5395312324772907e-05, | |
| "loss": 1.7941, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.6257569640694388, | |
| "grad_norm": 13.649519920349121, | |
| "learning_rate": 1.5283166984411798e-05, | |
| "loss": 1.7499, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.6358498183286233, | |
| "grad_norm": 13.303316116333008, | |
| "learning_rate": 1.5171021644050692e-05, | |
| "loss": 1.7821, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.6459426725878078, | |
| "grad_norm": 14.893158912658691, | |
| "learning_rate": 1.5058876303689582e-05, | |
| "loss": 1.8423, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.6560355268469923, | |
| "grad_norm": 14.434380531311035, | |
| "learning_rate": 1.4946730963328474e-05, | |
| "loss": 1.8138, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.6661283811061768, | |
| "grad_norm": 9.59044075012207, | |
| "learning_rate": 1.4834585622967368e-05, | |
| "loss": 1.7734, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.6762212353653613, | |
| "grad_norm": 12.524561882019043, | |
| "learning_rate": 1.4722440282606259e-05, | |
| "loss": 1.8246, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.6863140896245459, | |
| "grad_norm": 13.521296501159668, | |
| "learning_rate": 1.4610294942245151e-05, | |
| "loss": 1.7847, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.6964069438837304, | |
| "grad_norm": 10.999866485595703, | |
| "learning_rate": 1.4498149601884043e-05, | |
| "loss": 1.8027, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.7064997981429149, | |
| "grad_norm": 15.364250183105469, | |
| "learning_rate": 1.4386004261522934e-05, | |
| "loss": 1.7802, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.7165926524020994, | |
| "grad_norm": 13.141353607177734, | |
| "learning_rate": 1.4273858921161828e-05, | |
| "loss": 1.7464, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.7266855066612838, | |
| "grad_norm": 9.018637657165527, | |
| "learning_rate": 1.4161713580800718e-05, | |
| "loss": 1.7553, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.7367783609204683, | |
| "grad_norm": 11.081124305725098, | |
| "learning_rate": 1.404956824043961e-05, | |
| "loss": 1.7922, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.7468712151796528, | |
| "grad_norm": 10.0188627243042, | |
| "learning_rate": 1.3937422900078504e-05, | |
| "loss": 1.7769, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.7569640694388373, | |
| "grad_norm": 10.286458015441895, | |
| "learning_rate": 1.3825277559717395e-05, | |
| "loss": 1.7696, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.7670569236980218, | |
| "grad_norm": 11.746405601501465, | |
| "learning_rate": 1.3713132219356287e-05, | |
| "loss": 1.7188, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.7771497779572063, | |
| "grad_norm": 11.215723991394043, | |
| "learning_rate": 1.3600986878995179e-05, | |
| "loss": 1.6803, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.7872426322163908, | |
| "grad_norm": 8.982596397399902, | |
| "learning_rate": 1.348884153863407e-05, | |
| "loss": 1.7696, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.7973354864755753, | |
| "grad_norm": 12.450457572937012, | |
| "learning_rate": 1.3376696198272963e-05, | |
| "loss": 1.8021, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.8074283407347598, | |
| "grad_norm": 10.87128734588623, | |
| "learning_rate": 1.3264550857911855e-05, | |
| "loss": 1.7492, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.8175211949939443, | |
| "grad_norm": 11.78647518157959, | |
| "learning_rate": 1.3152405517550746e-05, | |
| "loss": 1.7883, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.8276140492531288, | |
| "grad_norm": 12.425263404846191, | |
| "learning_rate": 1.3040260177189638e-05, | |
| "loss": 1.7546, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.8377069035123133, | |
| "grad_norm": 11.663323402404785, | |
| "learning_rate": 1.2928114836828532e-05, | |
| "loss": 1.8018, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.8477997577714977, | |
| "grad_norm": 17.913087844848633, | |
| "learning_rate": 1.2815969496467423e-05, | |
| "loss": 1.7827, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.8578926120306822, | |
| "grad_norm": 9.219327926635742, | |
| "learning_rate": 1.2703824156106315e-05, | |
| "loss": 1.7245, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.8679854662898667, | |
| "grad_norm": 11.107460021972656, | |
| "learning_rate": 1.2591678815745207e-05, | |
| "loss": 1.7264, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.8780783205490512, | |
| "grad_norm": 10.487607955932617, | |
| "learning_rate": 1.2479533475384097e-05, | |
| "loss": 1.753, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.8881711748082358, | |
| "grad_norm": 13.2865571975708, | |
| "learning_rate": 1.2367388135022991e-05, | |
| "loss": 1.7317, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.8982640290674203, | |
| "grad_norm": 10.927115440368652, | |
| "learning_rate": 1.2255242794661883e-05, | |
| "loss": 1.7651, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.9083568833266048, | |
| "grad_norm": 10.536073684692383, | |
| "learning_rate": 1.2143097454300774e-05, | |
| "loss": 1.7578, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.9184497375857893, | |
| "grad_norm": 13.544109344482422, | |
| "learning_rate": 1.2030952113939666e-05, | |
| "loss": 1.7505, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.9285425918449738, | |
| "grad_norm": 9.343710899353027, | |
| "learning_rate": 1.1921049680385782e-05, | |
| "loss": 1.6865, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.9386354461041583, | |
| "grad_norm": 11.518623352050781, | |
| "learning_rate": 1.1808904340024674e-05, | |
| "loss": 1.7203, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.9487283003633428, | |
| "grad_norm": 7.897172927856445, | |
| "learning_rate": 1.1696758999663564e-05, | |
| "loss": 1.7201, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.9588211546225273, | |
| "grad_norm": 11.530837059020996, | |
| "learning_rate": 1.1584613659302457e-05, | |
| "loss": 1.8117, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.9689140088817118, | |
| "grad_norm": 11.721019744873047, | |
| "learning_rate": 1.147246831894135e-05, | |
| "loss": 1.7663, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.9790068631408962, | |
| "grad_norm": 11.470191955566406, | |
| "learning_rate": 1.1360322978580241e-05, | |
| "loss": 1.7655, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.9890997174000807, | |
| "grad_norm": 12.892107009887695, | |
| "learning_rate": 1.1248177638219133e-05, | |
| "loss": 1.759, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.9991925716592652, | |
| "grad_norm": 13.869138717651367, | |
| "learning_rate": 1.1136032297858025e-05, | |
| "loss": 1.7831, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 1.776762843132019, | |
| "eval_runtime": 226.5804, | |
| "eval_samples_per_second": 16.396, | |
| "eval_steps_per_second": 2.052, | |
| "step": 4954 | |
| }, | |
| { | |
| "epoch": 1.0092854259184498, | |
| "grad_norm": 17.121496200561523, | |
| "learning_rate": 1.1023886957496916e-05, | |
| "loss": 1.3545, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.0193782801776343, | |
| "grad_norm": 14.082002639770508, | |
| "learning_rate": 1.091174161713581e-05, | |
| "loss": 1.2965, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 1.0294711344368188, | |
| "grad_norm": 12.042837142944336, | |
| "learning_rate": 1.07995962767747e-05, | |
| "loss": 1.3614, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.0395639886960033, | |
| "grad_norm": 14.636174201965332, | |
| "learning_rate": 1.0687450936413592e-05, | |
| "loss": 1.3486, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 1.0496568429551878, | |
| "grad_norm": 8.166929244995117, | |
| "learning_rate": 1.0575305596052484e-05, | |
| "loss": 1.2801, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.0597496972143723, | |
| "grad_norm": 12.425227165222168, | |
| "learning_rate": 1.0463160255691377e-05, | |
| "loss": 1.3283, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 1.0698425514735568, | |
| "grad_norm": 10.232943534851074, | |
| "learning_rate": 1.0351014915330269e-05, | |
| "loss": 1.3272, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.0799354057327413, | |
| "grad_norm": 10.95102310180664, | |
| "learning_rate": 1.0238869574969161e-05, | |
| "loss": 1.3399, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 1.0900282599919258, | |
| "grad_norm": 12.011204719543457, | |
| "learning_rate": 1.0126724234608051e-05, | |
| "loss": 1.3378, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.1001211142511103, | |
| "grad_norm": 11.14439582824707, | |
| "learning_rate": 1.0014578894246945e-05, | |
| "loss": 1.3115, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 1.1102139685102947, | |
| "grad_norm": 12.321435928344727, | |
| "learning_rate": 9.902433553885838e-06, | |
| "loss": 1.3166, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.1203068227694792, | |
| "grad_norm": 12.903867721557617, | |
| "learning_rate": 9.79028821352473e-06, | |
| "loss": 1.322, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 1.1303996770286637, | |
| "grad_norm": 15.811857223510742, | |
| "learning_rate": 9.67814287316362e-06, | |
| "loss": 1.2703, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.1404925312878482, | |
| "grad_norm": 12.508252143859863, | |
| "learning_rate": 9.565997532802512e-06, | |
| "loss": 1.3569, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 1.1505853855470327, | |
| "grad_norm": 11.315281867980957, | |
| "learning_rate": 9.453852192441405e-06, | |
| "loss": 1.3082, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.1606782398062172, | |
| "grad_norm": 10.650445938110352, | |
| "learning_rate": 9.341706852080297e-06, | |
| "loss": 1.3015, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 1.1707710940654017, | |
| "grad_norm": 11.240402221679688, | |
| "learning_rate": 9.229561511719189e-06, | |
| "loss": 1.2776, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 1.1808639483245862, | |
| "grad_norm": 13.537579536437988, | |
| "learning_rate": 9.117416171358081e-06, | |
| "loss": 1.3222, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 1.1909568025837707, | |
| "grad_norm": 12.026103973388672, | |
| "learning_rate": 9.005270830996973e-06, | |
| "loss": 1.3167, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 1.2010496568429552, | |
| "grad_norm": 9.895818710327148, | |
| "learning_rate": 8.893125490635864e-06, | |
| "loss": 1.3288, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 1.2111425111021397, | |
| "grad_norm": 12.40115737915039, | |
| "learning_rate": 8.780980150274758e-06, | |
| "loss": 1.3048, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.2212353653613242, | |
| "grad_norm": 11.0924654006958, | |
| "learning_rate": 8.668834809913648e-06, | |
| "loss": 1.3203, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 1.2313282196205086, | |
| "grad_norm": 8.749198913574219, | |
| "learning_rate": 8.55668946955254e-06, | |
| "loss": 1.3776, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 1.2414210738796931, | |
| "grad_norm": 12.793172836303711, | |
| "learning_rate": 8.444544129191433e-06, | |
| "loss": 1.328, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 1.2515139281388776, | |
| "grad_norm": 15.980279922485352, | |
| "learning_rate": 8.332398788830325e-06, | |
| "loss": 1.2843, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 1.2616067823980621, | |
| "grad_norm": 18.131574630737305, | |
| "learning_rate": 8.220253448469217e-06, | |
| "loss": 1.3044, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 1.2716996366572466, | |
| "grad_norm": 12.027210235595703, | |
| "learning_rate": 8.108108108108109e-06, | |
| "loss": 1.3458, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 1.281792490916431, | |
| "grad_norm": 10.164145469665527, | |
| "learning_rate": 7.995962767747001e-06, | |
| "loss": 1.3379, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 1.2918853451756156, | |
| "grad_norm": 16.162031173706055, | |
| "learning_rate": 7.883817427385892e-06, | |
| "loss": 1.3204, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 1.3019781994348, | |
| "grad_norm": 9.44632625579834, | |
| "learning_rate": 7.771672087024786e-06, | |
| "loss": 1.3242, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 1.3120710536939846, | |
| "grad_norm": 12.877717971801758, | |
| "learning_rate": 7.659526746663676e-06, | |
| "loss": 1.3335, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.3221639079531693, | |
| "grad_norm": 15.49012565612793, | |
| "learning_rate": 7.547381406302568e-06, | |
| "loss": 1.3426, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 1.3322567622123538, | |
| "grad_norm": 13.459305763244629, | |
| "learning_rate": 7.4352360659414604e-06, | |
| "loss": 1.3061, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 1.3423496164715383, | |
| "grad_norm": 11.868767738342285, | |
| "learning_rate": 7.325333632387574e-06, | |
| "loss": 1.3071, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 1.3524424707307228, | |
| "grad_norm": 8.798039436340332, | |
| "learning_rate": 7.213188292026467e-06, | |
| "loss": 1.2868, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 1.3625353249899073, | |
| "grad_norm": 11.275954246520996, | |
| "learning_rate": 7.101042951665359e-06, | |
| "loss": 1.3084, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 1.3726281792490918, | |
| "grad_norm": 14.668099403381348, | |
| "learning_rate": 6.988897611304251e-06, | |
| "loss": 1.3411, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 1.3827210335082762, | |
| "grad_norm": 14.613497734069824, | |
| "learning_rate": 6.876752270943143e-06, | |
| "loss": 1.2724, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 1.3928138877674607, | |
| "grad_norm": 9.993393898010254, | |
| "learning_rate": 6.764606930582035e-06, | |
| "loss": 1.3055, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 1.4029067420266452, | |
| "grad_norm": 11.838705062866211, | |
| "learning_rate": 6.6524615902209266e-06, | |
| "loss": 1.2698, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 1.4129995962858297, | |
| "grad_norm": 14.627535820007324, | |
| "learning_rate": 6.54031624985982e-06, | |
| "loss": 1.3263, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.4230924505450142, | |
| "grad_norm": 11.87954330444336, | |
| "learning_rate": 6.428170909498711e-06, | |
| "loss": 1.2705, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 1.4331853048041987, | |
| "grad_norm": 13.008570671081543, | |
| "learning_rate": 6.316025569137602e-06, | |
| "loss": 1.3362, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 1.4432781590633832, | |
| "grad_norm": 11.822639465332031, | |
| "learning_rate": 6.2038802287764944e-06, | |
| "loss": 1.2569, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 1.4533710133225677, | |
| "grad_norm": 9.144119262695312, | |
| "learning_rate": 6.0917348884153875e-06, | |
| "loss": 1.311, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 1.4634638675817522, | |
| "grad_norm": 13.30654525756836, | |
| "learning_rate": 5.979589548054279e-06, | |
| "loss": 1.349, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 1.4735567218409367, | |
| "grad_norm": 10.533941268920898, | |
| "learning_rate": 5.86744420769317e-06, | |
| "loss": 1.3082, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 1.4836495761001212, | |
| "grad_norm": 11.791468620300293, | |
| "learning_rate": 5.755298867332063e-06, | |
| "loss": 1.3181, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 1.4937424303593057, | |
| "grad_norm": 11.65937328338623, | |
| "learning_rate": 5.6431535269709545e-06, | |
| "loss": 1.2978, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 1.5038352846184901, | |
| "grad_norm": 12.603124618530273, | |
| "learning_rate": 5.531008186609847e-06, | |
| "loss": 1.3095, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 1.5139281388776746, | |
| "grad_norm": 11.710714340209961, | |
| "learning_rate": 5.418862846248739e-06, | |
| "loss": 1.2666, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 1.5240209931368591, | |
| "grad_norm": 12.024763107299805, | |
| "learning_rate": 5.306717505887631e-06, | |
| "loss": 1.3246, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 1.5341138473960436, | |
| "grad_norm": 13.552536964416504, | |
| "learning_rate": 5.194572165526522e-06, | |
| "loss": 1.2591, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 1.544206701655228, | |
| "grad_norm": 14.486330032348633, | |
| "learning_rate": 5.082426825165415e-06, | |
| "loss": 1.3162, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 1.5542995559144126, | |
| "grad_norm": 14.879488945007324, | |
| "learning_rate": 4.970281484804307e-06, | |
| "loss": 1.2642, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 1.564392410173597, | |
| "grad_norm": 18.098791122436523, | |
| "learning_rate": 4.858136144443199e-06, | |
| "loss": 1.3251, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 1.5744852644327816, | |
| "grad_norm": 12.657069206237793, | |
| "learning_rate": 4.74599080408209e-06, | |
| "loss": 1.2953, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 1.584578118691966, | |
| "grad_norm": 9.791812896728516, | |
| "learning_rate": 4.633845463720983e-06, | |
| "loss": 1.2816, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 1.5946709729511506, | |
| "grad_norm": 14.392789840698242, | |
| "learning_rate": 4.521700123359875e-06, | |
| "loss": 1.2961, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 1.604763827210335, | |
| "grad_norm": 13.632621765136719, | |
| "learning_rate": 4.409554782998767e-06, | |
| "loss": 1.3357, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 1.6148566814695196, | |
| "grad_norm": 12.285110473632812, | |
| "learning_rate": 4.297409442637659e-06, | |
| "loss": 1.319, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.624949535728704, | |
| "grad_norm": 13.019585609436035, | |
| "learning_rate": 4.18526410227655e-06, | |
| "loss": 1.285, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 1.6350423899878885, | |
| "grad_norm": 16.58565330505371, | |
| "learning_rate": 4.0731187619154425e-06, | |
| "loss": 1.2839, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 1.645135244247073, | |
| "grad_norm": 10.878052711486816, | |
| "learning_rate": 3.960973421554335e-06, | |
| "loss": 1.2821, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 1.6552280985062575, | |
| "grad_norm": 15.922154426574707, | |
| "learning_rate": 3.848828081193227e-06, | |
| "loss": 1.2763, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 1.665320952765442, | |
| "grad_norm": 13.756718635559082, | |
| "learning_rate": 3.7366827408321186e-06, | |
| "loss": 1.2714, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 1.6754138070246265, | |
| "grad_norm": 9.965152740478516, | |
| "learning_rate": 3.624537400471011e-06, | |
| "loss": 1.288, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 1.685506661283811, | |
| "grad_norm": 12.36296558380127, | |
| "learning_rate": 3.5123920601099026e-06, | |
| "loss": 1.2817, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 1.6955995155429955, | |
| "grad_norm": 13.448798179626465, | |
| "learning_rate": 3.4002467197487947e-06, | |
| "loss": 1.2894, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 1.70569236980218, | |
| "grad_norm": 9.591205596923828, | |
| "learning_rate": 3.2881013793876865e-06, | |
| "loss": 1.2892, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 1.7157852240613645, | |
| "grad_norm": 15.324070930480957, | |
| "learning_rate": 3.1759560390265787e-06, | |
| "loss": 1.2976, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 1.725878078320549, | |
| "grad_norm": 16.99766731262207, | |
| "learning_rate": 3.063810698665471e-06, | |
| "loss": 1.2416, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 1.7359709325797335, | |
| "grad_norm": 11.426822662353516, | |
| "learning_rate": 2.9516653583043626e-06, | |
| "loss": 1.2759, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 1.746063786838918, | |
| "grad_norm": 12.615915298461914, | |
| "learning_rate": 2.839520017943255e-06, | |
| "loss": 1.2956, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 1.7561566410981024, | |
| "grad_norm": 10.838130950927734, | |
| "learning_rate": 2.727374677582147e-06, | |
| "loss": 1.2532, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 1.766249495357287, | |
| "grad_norm": 13.932589530944824, | |
| "learning_rate": 2.6152293372210387e-06, | |
| "loss": 1.3155, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 1.7763423496164714, | |
| "grad_norm": 11.161537170410156, | |
| "learning_rate": 2.503083996859931e-06, | |
| "loss": 1.2575, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 1.786435203875656, | |
| "grad_norm": 13.190702438354492, | |
| "learning_rate": 2.3909386564988227e-06, | |
| "loss": 1.2629, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 1.7965280581348404, | |
| "grad_norm": 17.810091018676758, | |
| "learning_rate": 2.2787933161377144e-06, | |
| "loss": 1.277, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 1.806620912394025, | |
| "grad_norm": 12.80745792388916, | |
| "learning_rate": 2.1666479757766066e-06, | |
| "loss": 1.2732, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 1.8167137666532094, | |
| "grad_norm": 14.509129524230957, | |
| "learning_rate": 2.0545026354154988e-06, | |
| "loss": 1.3017, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.8268066209123939, | |
| "grad_norm": 16.78874397277832, | |
| "learning_rate": 1.9423572950543905e-06, | |
| "loss": 1.2932, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 1.8368994751715784, | |
| "grad_norm": 12.09203815460205, | |
| "learning_rate": 1.8302119546932825e-06, | |
| "loss": 1.2647, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 1.8469923294307629, | |
| "grad_norm": 17.09573745727539, | |
| "learning_rate": 1.7180666143321747e-06, | |
| "loss": 1.2795, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 1.8570851836899476, | |
| "grad_norm": 17.978240966796875, | |
| "learning_rate": 1.6059212739710667e-06, | |
| "loss": 1.2694, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 1.867178037949132, | |
| "grad_norm": 10.284587860107422, | |
| "learning_rate": 1.4937759336099586e-06, | |
| "loss": 1.2948, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 1.8772708922083166, | |
| "grad_norm": 22.253034591674805, | |
| "learning_rate": 1.3816305932488506e-06, | |
| "loss": 1.2731, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 1.887363746467501, | |
| "grad_norm": 10.942328453063965, | |
| "learning_rate": 1.2694852528877428e-06, | |
| "loss": 1.2967, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 1.8974566007266855, | |
| "grad_norm": 16.947174072265625, | |
| "learning_rate": 1.1573399125266345e-06, | |
| "loss": 1.2665, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 1.90754945498587, | |
| "grad_norm": 11.684677124023438, | |
| "learning_rate": 1.0451945721655265e-06, | |
| "loss": 1.2699, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 1.9176423092450545, | |
| "grad_norm": 13.600410461425781, | |
| "learning_rate": 9.330492318044186e-07, | |
| "loss": 1.2625, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 1.927735163504239, | |
| "grad_norm": 16.593090057373047, | |
| "learning_rate": 8.209038914433106e-07, | |
| "loss": 1.2491, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 1.9378280177634235, | |
| "grad_norm": 11.868796348571777, | |
| "learning_rate": 7.087585510822026e-07, | |
| "loss": 1.2725, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 1.947920872022608, | |
| "grad_norm": 11.491511344909668, | |
| "learning_rate": 5.966132107210946e-07, | |
| "loss": 1.2639, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 1.9580137262817925, | |
| "grad_norm": 14.748616218566895, | |
| "learning_rate": 4.844678703599866e-07, | |
| "loss": 1.3293, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 1.968106580540977, | |
| "grad_norm": 13.854447364807129, | |
| "learning_rate": 3.723225299988786e-07, | |
| "loss": 1.2409, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 1.9781994348001615, | |
| "grad_norm": 15.181270599365234, | |
| "learning_rate": 2.6017718963777056e-07, | |
| "loss": 1.2347, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 1.988292289059346, | |
| "grad_norm": 16.94820785522461, | |
| "learning_rate": 1.4803184927666255e-07, | |
| "loss": 1.2992, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 1.9983851433185305, | |
| "grad_norm": 12.58464527130127, | |
| "learning_rate": 3.812941572277672e-08, | |
| "loss": 1.2962, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 1.7698270082473755, | |
| "eval_runtime": 226.7653, | |
| "eval_samples_per_second": 16.383, | |
| "eval_steps_per_second": 2.051, | |
| "step": 9908 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 9908, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.362669429614182e+16, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |