{ "best_global_step": 9908, "best_metric": 1.7698270082473755, "best_model_checkpoint": "./mcqa_qwen3_letter_best/checkpoint-9908", "epoch": 2.0, "eval_steps": 500, "global_step": 9908, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010092854259184497, "grad_norm": 48.55782699584961, "learning_rate": 8.879919273461152e-07, "loss": 2.3985, "step": 50 }, { "epoch": 0.020185708518368994, "grad_norm": 25.696617126464844, "learning_rate": 1.8970736629667005e-06, "loss": 2.053, "step": 100 }, { "epoch": 0.030278562777553492, "grad_norm": 27.860021591186523, "learning_rate": 2.906155398587286e-06, "loss": 1.9305, "step": 150 }, { "epoch": 0.04037141703673799, "grad_norm": 17.68500518798828, "learning_rate": 3.915237134207871e-06, "loss": 1.9294, "step": 200 }, { "epoch": 0.050464271295922486, "grad_norm": 26.112218856811523, "learning_rate": 4.924318869828457e-06, "loss": 1.8834, "step": 250 }, { "epoch": 0.060557125555106985, "grad_norm": 25.835376739501953, "learning_rate": 5.933400605449042e-06, "loss": 1.8517, "step": 300 }, { "epoch": 0.07064997981429148, "grad_norm": 22.44589614868164, "learning_rate": 6.942482341069627e-06, "loss": 1.8978, "step": 350 }, { "epoch": 0.08074283407347597, "grad_norm": 32.82951354980469, "learning_rate": 7.951564076690212e-06, "loss": 1.8867, "step": 400 }, { "epoch": 0.09083568833266048, "grad_norm": 35.665794372558594, "learning_rate": 8.960645812310798e-06, "loss": 1.9055, "step": 450 }, { "epoch": 0.10092854259184497, "grad_norm": 22.500865936279297, "learning_rate": 9.969727547931384e-06, "loss": 1.8755, "step": 500 }, { "epoch": 0.11102139685102948, "grad_norm": 40.59410095214844, "learning_rate": 1.0978809283551967e-05, "loss": 1.8881, "step": 550 }, { "epoch": 0.12111425111021397, "grad_norm": 28.769454956054688, "learning_rate": 1.1987891019172555e-05, "loss": 1.8713, "step": 600 }, { "epoch": 0.13120710536939847, "grad_norm": 17.596820831298828, "learning_rate": 1.299697275479314e-05, "loss": 1.8694, "step": 650 }, { "epoch": 0.14129995962858297, "grad_norm": 17.149999618530273, "learning_rate": 1.4006054490413725e-05, "loss": 1.8809, "step": 700 }, { "epoch": 0.15139281388776746, "grad_norm": 19.181955337524414, "learning_rate": 1.5015136226034311e-05, "loss": 1.8697, "step": 750 }, { "epoch": 0.16148566814695195, "grad_norm": 24.227073669433594, "learning_rate": 1.6024217961654894e-05, "loss": 1.9201, "step": 800 }, { "epoch": 0.17157852240613647, "grad_norm": 18.42403221130371, "learning_rate": 1.703329969727548e-05, "loss": 1.8876, "step": 850 }, { "epoch": 0.18167137666532096, "grad_norm": 21.015230178833008, "learning_rate": 1.8042381432896066e-05, "loss": 1.8697, "step": 900 }, { "epoch": 0.19176423092450545, "grad_norm": 16.02488899230957, "learning_rate": 1.905146316851665e-05, "loss": 1.9102, "step": 950 }, { "epoch": 0.20185708518368994, "grad_norm": 25.045923233032227, "learning_rate": 1.9993271279578333e-05, "loss": 1.9121, "step": 1000 }, { "epoch": 0.21194993944287444, "grad_norm": 17.414430618286133, "learning_rate": 1.9881125939217227e-05, "loss": 1.9449, "step": 1050 }, { "epoch": 0.22204279370205895, "grad_norm": 15.37423324584961, "learning_rate": 1.976898059885612e-05, "loss": 1.9139, "step": 1100 }, { "epoch": 0.23213564796124345, "grad_norm": 20.543489456176758, "learning_rate": 1.965683525849501e-05, "loss": 1.92, "step": 1150 }, { "epoch": 0.24222850222042794, "grad_norm": 12.01870346069336, "learning_rate": 1.9544689918133902e-05, "loss": 1.8962, "step": 1200 }, { "epoch": 0.25232135647961246, "grad_norm": 15.475773811340332, "learning_rate": 1.9432544577772796e-05, "loss": 1.9483, "step": 1250 }, { "epoch": 0.26241421073879695, "grad_norm": 11.753213882446289, "learning_rate": 1.9320399237411686e-05, "loss": 1.919, "step": 1300 }, { "epoch": 0.27250706499798144, "grad_norm": 14.90489673614502, "learning_rate": 1.920825389705058e-05, "loss": 1.8742, "step": 1350 }, { "epoch": 0.28259991925716593, "grad_norm": 12.925189971923828, "learning_rate": 1.909610855668947e-05, "loss": 1.8822, "step": 1400 }, { "epoch": 0.2926927735163504, "grad_norm": 17.215579986572266, "learning_rate": 1.898396321632836e-05, "loss": 1.8796, "step": 1450 }, { "epoch": 0.3027856277755349, "grad_norm": 16.483861923217773, "learning_rate": 1.8871817875967255e-05, "loss": 1.8442, "step": 1500 }, { "epoch": 0.3128784820347194, "grad_norm": 18.10808753967285, "learning_rate": 1.875967253560615e-05, "loss": 1.9131, "step": 1550 }, { "epoch": 0.3229713362939039, "grad_norm": 14.261265754699707, "learning_rate": 1.864752719524504e-05, "loss": 1.7602, "step": 1600 }, { "epoch": 0.3330641905530884, "grad_norm": 16.223392486572266, "learning_rate": 1.8535381854883933e-05, "loss": 1.8392, "step": 1650 }, { "epoch": 0.34315704481227294, "grad_norm": 14.012106895446777, "learning_rate": 1.8423236514522824e-05, "loss": 1.8335, "step": 1700 }, { "epoch": 0.35324989907145743, "grad_norm": 13.234374046325684, "learning_rate": 1.8311091174161714e-05, "loss": 1.8501, "step": 1750 }, { "epoch": 0.3633427533306419, "grad_norm": 11.787166595458984, "learning_rate": 1.8198945833800608e-05, "loss": 1.8704, "step": 1800 }, { "epoch": 0.3734356075898264, "grad_norm": 15.64974308013916, "learning_rate": 1.80868004934395e-05, "loss": 1.85, "step": 1850 }, { "epoch": 0.3835284618490109, "grad_norm": 13.893998146057129, "learning_rate": 1.7974655153078392e-05, "loss": 1.8807, "step": 1900 }, { "epoch": 0.3936213161081954, "grad_norm": 15.42603588104248, "learning_rate": 1.7862509812717283e-05, "loss": 1.8124, "step": 1950 }, { "epoch": 0.4037141703673799, "grad_norm": 12.293023109436035, "learning_rate": 1.7750364472356173e-05, "loss": 1.8112, "step": 2000 }, { "epoch": 0.4138070246265644, "grad_norm": 17.576618194580078, "learning_rate": 1.7638219131995067e-05, "loss": 1.8468, "step": 2050 }, { "epoch": 0.42389987888574887, "grad_norm": 36.62916946411133, "learning_rate": 1.752607379163396e-05, "loss": 1.8563, "step": 2100 }, { "epoch": 0.43399273314493336, "grad_norm": 12.232354164123535, "learning_rate": 1.741392845127285e-05, "loss": 1.8643, "step": 2150 }, { "epoch": 0.4440855874041179, "grad_norm": 9.772968292236328, "learning_rate": 1.7301783110911742e-05, "loss": 1.8686, "step": 2200 }, { "epoch": 0.4541784416633024, "grad_norm": 13.78654956817627, "learning_rate": 1.7189637770550636e-05, "loss": 1.8477, "step": 2250 }, { "epoch": 0.4642712959224869, "grad_norm": 14.448091506958008, "learning_rate": 1.7077492430189526e-05, "loss": 1.828, "step": 2300 }, { "epoch": 0.4743641501816714, "grad_norm": 10.872529983520508, "learning_rate": 1.696534708982842e-05, "loss": 1.7916, "step": 2350 }, { "epoch": 0.4844570044408559, "grad_norm": 14.716806411743164, "learning_rate": 1.685320174946731e-05, "loss": 1.7982, "step": 2400 }, { "epoch": 0.49454985870004037, "grad_norm": 15.155656814575195, "learning_rate": 1.67410564091062e-05, "loss": 1.8422, "step": 2450 }, { "epoch": 0.5046427129592249, "grad_norm": 11.369612693786621, "learning_rate": 1.6628911068745095e-05, "loss": 1.8217, "step": 2500 }, { "epoch": 0.5147355672184094, "grad_norm": 15.491066932678223, "learning_rate": 1.651676572838399e-05, "loss": 1.8487, "step": 2550 }, { "epoch": 0.5248284214775939, "grad_norm": 12.249984741210938, "learning_rate": 1.640462038802288e-05, "loss": 1.7951, "step": 2600 }, { "epoch": 0.5349212757367784, "grad_norm": 14.075465202331543, "learning_rate": 1.629247504766177e-05, "loss": 1.8115, "step": 2650 }, { "epoch": 0.5450141299959629, "grad_norm": 9.785154342651367, "learning_rate": 1.6180329707300664e-05, "loss": 1.8576, "step": 2700 }, { "epoch": 0.5551069842551474, "grad_norm": 14.559487342834473, "learning_rate": 1.6068184366939554e-05, "loss": 1.8263, "step": 2750 }, { "epoch": 0.5651998385143319, "grad_norm": 15.150165557861328, "learning_rate": 1.5956039026578448e-05, "loss": 1.8029, "step": 2800 }, { "epoch": 0.5752926927735164, "grad_norm": 13.863632202148438, "learning_rate": 1.584389368621734e-05, "loss": 1.7863, "step": 2850 }, { "epoch": 0.5853855470327008, "grad_norm": 9.358270645141602, "learning_rate": 1.573174834585623e-05, "loss": 1.806, "step": 2900 }, { "epoch": 0.5954784012918853, "grad_norm": 12.770975112915039, "learning_rate": 1.5619603005495123e-05, "loss": 1.7417, "step": 2950 }, { "epoch": 0.6055712555510698, "grad_norm": 12.026569366455078, "learning_rate": 1.5507457665134017e-05, "loss": 1.7623, "step": 3000 }, { "epoch": 0.6156641098102543, "grad_norm": 9.8405122756958, "learning_rate": 1.5395312324772907e-05, "loss": 1.7941, "step": 3050 }, { "epoch": 0.6257569640694388, "grad_norm": 13.649519920349121, "learning_rate": 1.5283166984411798e-05, "loss": 1.7499, "step": 3100 }, { "epoch": 0.6358498183286233, "grad_norm": 13.303316116333008, "learning_rate": 1.5171021644050692e-05, "loss": 1.7821, "step": 3150 }, { "epoch": 0.6459426725878078, "grad_norm": 14.893158912658691, "learning_rate": 1.5058876303689582e-05, "loss": 1.8423, "step": 3200 }, { "epoch": 0.6560355268469923, "grad_norm": 14.434380531311035, "learning_rate": 1.4946730963328474e-05, "loss": 1.8138, "step": 3250 }, { "epoch": 0.6661283811061768, "grad_norm": 9.59044075012207, "learning_rate": 1.4834585622967368e-05, "loss": 1.7734, "step": 3300 }, { "epoch": 0.6762212353653613, "grad_norm": 12.524561882019043, "learning_rate": 1.4722440282606259e-05, "loss": 1.8246, "step": 3350 }, { "epoch": 0.6863140896245459, "grad_norm": 13.521296501159668, "learning_rate": 1.4610294942245151e-05, "loss": 1.7847, "step": 3400 }, { "epoch": 0.6964069438837304, "grad_norm": 10.999866485595703, "learning_rate": 1.4498149601884043e-05, "loss": 1.8027, "step": 3450 }, { "epoch": 0.7064997981429149, "grad_norm": 15.364250183105469, "learning_rate": 1.4386004261522934e-05, "loss": 1.7802, "step": 3500 }, { "epoch": 0.7165926524020994, "grad_norm": 13.141353607177734, "learning_rate": 1.4273858921161828e-05, "loss": 1.7464, "step": 3550 }, { "epoch": 0.7266855066612838, "grad_norm": 9.018637657165527, "learning_rate": 1.4161713580800718e-05, "loss": 1.7553, "step": 3600 }, { "epoch": 0.7367783609204683, "grad_norm": 11.081124305725098, "learning_rate": 1.404956824043961e-05, "loss": 1.7922, "step": 3650 }, { "epoch": 0.7468712151796528, "grad_norm": 10.0188627243042, "learning_rate": 1.3937422900078504e-05, "loss": 1.7769, "step": 3700 }, { "epoch": 0.7569640694388373, "grad_norm": 10.286458015441895, "learning_rate": 1.3825277559717395e-05, "loss": 1.7696, "step": 3750 }, { "epoch": 0.7670569236980218, "grad_norm": 11.746405601501465, "learning_rate": 1.3713132219356287e-05, "loss": 1.7188, "step": 3800 }, { "epoch": 0.7771497779572063, "grad_norm": 11.215723991394043, "learning_rate": 1.3600986878995179e-05, "loss": 1.6803, "step": 3850 }, { "epoch": 0.7872426322163908, "grad_norm": 8.982596397399902, "learning_rate": 1.348884153863407e-05, "loss": 1.7696, "step": 3900 }, { "epoch": 0.7973354864755753, "grad_norm": 12.450457572937012, "learning_rate": 1.3376696198272963e-05, "loss": 1.8021, "step": 3950 }, { "epoch": 0.8074283407347598, "grad_norm": 10.87128734588623, "learning_rate": 1.3264550857911855e-05, "loss": 1.7492, "step": 4000 }, { "epoch": 0.8175211949939443, "grad_norm": 11.78647518157959, "learning_rate": 1.3152405517550746e-05, "loss": 1.7883, "step": 4050 }, { "epoch": 0.8276140492531288, "grad_norm": 12.425263404846191, "learning_rate": 1.3040260177189638e-05, "loss": 1.7546, "step": 4100 }, { "epoch": 0.8377069035123133, "grad_norm": 11.663323402404785, "learning_rate": 1.2928114836828532e-05, "loss": 1.8018, "step": 4150 }, { "epoch": 0.8477997577714977, "grad_norm": 17.913087844848633, "learning_rate": 1.2815969496467423e-05, "loss": 1.7827, "step": 4200 }, { "epoch": 0.8578926120306822, "grad_norm": 9.219327926635742, "learning_rate": 1.2703824156106315e-05, "loss": 1.7245, "step": 4250 }, { "epoch": 0.8679854662898667, "grad_norm": 11.107460021972656, "learning_rate": 1.2591678815745207e-05, "loss": 1.7264, "step": 4300 }, { "epoch": 0.8780783205490512, "grad_norm": 10.487607955932617, "learning_rate": 1.2479533475384097e-05, "loss": 1.753, "step": 4350 }, { "epoch": 0.8881711748082358, "grad_norm": 13.2865571975708, "learning_rate": 1.2367388135022991e-05, "loss": 1.7317, "step": 4400 }, { "epoch": 0.8982640290674203, "grad_norm": 10.927115440368652, "learning_rate": 1.2255242794661883e-05, "loss": 1.7651, "step": 4450 }, { "epoch": 0.9083568833266048, "grad_norm": 10.536073684692383, "learning_rate": 1.2143097454300774e-05, "loss": 1.7578, "step": 4500 }, { "epoch": 0.9184497375857893, "grad_norm": 13.544109344482422, "learning_rate": 1.2030952113939666e-05, "loss": 1.7505, "step": 4550 }, { "epoch": 0.9285425918449738, "grad_norm": 9.343710899353027, "learning_rate": 1.1921049680385782e-05, "loss": 1.6865, "step": 4600 }, { "epoch": 0.9386354461041583, "grad_norm": 11.518623352050781, "learning_rate": 1.1808904340024674e-05, "loss": 1.7203, "step": 4650 }, { "epoch": 0.9487283003633428, "grad_norm": 7.897172927856445, "learning_rate": 1.1696758999663564e-05, "loss": 1.7201, "step": 4700 }, { "epoch": 0.9588211546225273, "grad_norm": 11.530837059020996, "learning_rate": 1.1584613659302457e-05, "loss": 1.8117, "step": 4750 }, { "epoch": 0.9689140088817118, "grad_norm": 11.721019744873047, "learning_rate": 1.147246831894135e-05, "loss": 1.7663, "step": 4800 }, { "epoch": 0.9790068631408962, "grad_norm": 11.470191955566406, "learning_rate": 1.1360322978580241e-05, "loss": 1.7655, "step": 4850 }, { "epoch": 0.9890997174000807, "grad_norm": 12.892107009887695, "learning_rate": 1.1248177638219133e-05, "loss": 1.759, "step": 4900 }, { "epoch": 0.9991925716592652, "grad_norm": 13.869138717651367, "learning_rate": 1.1136032297858025e-05, "loss": 1.7831, "step": 4950 }, { "epoch": 1.0, "eval_loss": 1.776762843132019, "eval_runtime": 226.5804, "eval_samples_per_second": 16.396, "eval_steps_per_second": 2.052, "step": 4954 }, { "epoch": 1.0092854259184498, "grad_norm": 17.121496200561523, "learning_rate": 1.1023886957496916e-05, "loss": 1.3545, "step": 5000 }, { "epoch": 1.0193782801776343, "grad_norm": 14.082002639770508, "learning_rate": 1.091174161713581e-05, "loss": 1.2965, "step": 5050 }, { "epoch": 1.0294711344368188, "grad_norm": 12.042837142944336, "learning_rate": 1.07995962767747e-05, "loss": 1.3614, "step": 5100 }, { "epoch": 1.0395639886960033, "grad_norm": 14.636174201965332, "learning_rate": 1.0687450936413592e-05, "loss": 1.3486, "step": 5150 }, { "epoch": 1.0496568429551878, "grad_norm": 8.166929244995117, "learning_rate": 1.0575305596052484e-05, "loss": 1.2801, "step": 5200 }, { "epoch": 1.0597496972143723, "grad_norm": 12.425227165222168, "learning_rate": 1.0463160255691377e-05, "loss": 1.3283, "step": 5250 }, { "epoch": 1.0698425514735568, "grad_norm": 10.232943534851074, "learning_rate": 1.0351014915330269e-05, "loss": 1.3272, "step": 5300 }, { "epoch": 1.0799354057327413, "grad_norm": 10.95102310180664, "learning_rate": 1.0238869574969161e-05, "loss": 1.3399, "step": 5350 }, { "epoch": 1.0900282599919258, "grad_norm": 12.011204719543457, "learning_rate": 1.0126724234608051e-05, "loss": 1.3378, "step": 5400 }, { "epoch": 1.1001211142511103, "grad_norm": 11.14439582824707, "learning_rate": 1.0014578894246945e-05, "loss": 1.3115, "step": 5450 }, { "epoch": 1.1102139685102947, "grad_norm": 12.321435928344727, "learning_rate": 9.902433553885838e-06, "loss": 1.3166, "step": 5500 }, { "epoch": 1.1203068227694792, "grad_norm": 12.903867721557617, "learning_rate": 9.79028821352473e-06, "loss": 1.322, "step": 5550 }, { "epoch": 1.1303996770286637, "grad_norm": 15.811857223510742, "learning_rate": 9.67814287316362e-06, "loss": 1.2703, "step": 5600 }, { "epoch": 1.1404925312878482, "grad_norm": 12.508252143859863, "learning_rate": 9.565997532802512e-06, "loss": 1.3569, "step": 5650 }, { "epoch": 1.1505853855470327, "grad_norm": 11.315281867980957, "learning_rate": 9.453852192441405e-06, "loss": 1.3082, "step": 5700 }, { "epoch": 1.1606782398062172, "grad_norm": 10.650445938110352, "learning_rate": 9.341706852080297e-06, "loss": 1.3015, "step": 5750 }, { "epoch": 1.1707710940654017, "grad_norm": 11.240402221679688, "learning_rate": 9.229561511719189e-06, "loss": 1.2776, "step": 5800 }, { "epoch": 1.1808639483245862, "grad_norm": 13.537579536437988, "learning_rate": 9.117416171358081e-06, "loss": 1.3222, "step": 5850 }, { "epoch": 1.1909568025837707, "grad_norm": 12.026103973388672, "learning_rate": 9.005270830996973e-06, "loss": 1.3167, "step": 5900 }, { "epoch": 1.2010496568429552, "grad_norm": 9.895818710327148, "learning_rate": 8.893125490635864e-06, "loss": 1.3288, "step": 5950 }, { "epoch": 1.2111425111021397, "grad_norm": 12.40115737915039, "learning_rate": 8.780980150274758e-06, "loss": 1.3048, "step": 6000 }, { "epoch": 1.2212353653613242, "grad_norm": 11.0924654006958, "learning_rate": 8.668834809913648e-06, "loss": 1.3203, "step": 6050 }, { "epoch": 1.2313282196205086, "grad_norm": 8.749198913574219, "learning_rate": 8.55668946955254e-06, "loss": 1.3776, "step": 6100 }, { "epoch": 1.2414210738796931, "grad_norm": 12.793172836303711, "learning_rate": 8.444544129191433e-06, "loss": 1.328, "step": 6150 }, { "epoch": 1.2515139281388776, "grad_norm": 15.980279922485352, "learning_rate": 8.332398788830325e-06, "loss": 1.2843, "step": 6200 }, { "epoch": 1.2616067823980621, "grad_norm": 18.131574630737305, "learning_rate": 8.220253448469217e-06, "loss": 1.3044, "step": 6250 }, { "epoch": 1.2716996366572466, "grad_norm": 12.027210235595703, "learning_rate": 8.108108108108109e-06, "loss": 1.3458, "step": 6300 }, { "epoch": 1.281792490916431, "grad_norm": 10.164145469665527, "learning_rate": 7.995962767747001e-06, "loss": 1.3379, "step": 6350 }, { "epoch": 1.2918853451756156, "grad_norm": 16.162031173706055, "learning_rate": 7.883817427385892e-06, "loss": 1.3204, "step": 6400 }, { "epoch": 1.3019781994348, "grad_norm": 9.44632625579834, "learning_rate": 7.771672087024786e-06, "loss": 1.3242, "step": 6450 }, { "epoch": 1.3120710536939846, "grad_norm": 12.877717971801758, "learning_rate": 7.659526746663676e-06, "loss": 1.3335, "step": 6500 }, { "epoch": 1.3221639079531693, "grad_norm": 15.49012565612793, "learning_rate": 7.547381406302568e-06, "loss": 1.3426, "step": 6550 }, { "epoch": 1.3322567622123538, "grad_norm": 13.459305763244629, "learning_rate": 7.4352360659414604e-06, "loss": 1.3061, "step": 6600 }, { "epoch": 1.3423496164715383, "grad_norm": 11.868767738342285, "learning_rate": 7.325333632387574e-06, "loss": 1.3071, "step": 6650 }, { "epoch": 1.3524424707307228, "grad_norm": 8.798039436340332, "learning_rate": 7.213188292026467e-06, "loss": 1.2868, "step": 6700 }, { "epoch": 1.3625353249899073, "grad_norm": 11.275954246520996, "learning_rate": 7.101042951665359e-06, "loss": 1.3084, "step": 6750 }, { "epoch": 1.3726281792490918, "grad_norm": 14.668099403381348, "learning_rate": 6.988897611304251e-06, "loss": 1.3411, "step": 6800 }, { "epoch": 1.3827210335082762, "grad_norm": 14.613497734069824, "learning_rate": 6.876752270943143e-06, "loss": 1.2724, "step": 6850 }, { "epoch": 1.3928138877674607, "grad_norm": 9.993393898010254, "learning_rate": 6.764606930582035e-06, "loss": 1.3055, "step": 6900 }, { "epoch": 1.4029067420266452, "grad_norm": 11.838705062866211, "learning_rate": 6.6524615902209266e-06, "loss": 1.2698, "step": 6950 }, { "epoch": 1.4129995962858297, "grad_norm": 14.627535820007324, "learning_rate": 6.54031624985982e-06, "loss": 1.3263, "step": 7000 }, { "epoch": 1.4230924505450142, "grad_norm": 11.87954330444336, "learning_rate": 6.428170909498711e-06, "loss": 1.2705, "step": 7050 }, { "epoch": 1.4331853048041987, "grad_norm": 13.008570671081543, "learning_rate": 6.316025569137602e-06, "loss": 1.3362, "step": 7100 }, { "epoch": 1.4432781590633832, "grad_norm": 11.822639465332031, "learning_rate": 6.2038802287764944e-06, "loss": 1.2569, "step": 7150 }, { "epoch": 1.4533710133225677, "grad_norm": 9.144119262695312, "learning_rate": 6.0917348884153875e-06, "loss": 1.311, "step": 7200 }, { "epoch": 1.4634638675817522, "grad_norm": 13.30654525756836, "learning_rate": 5.979589548054279e-06, "loss": 1.349, "step": 7250 }, { "epoch": 1.4735567218409367, "grad_norm": 10.533941268920898, "learning_rate": 5.86744420769317e-06, "loss": 1.3082, "step": 7300 }, { "epoch": 1.4836495761001212, "grad_norm": 11.791468620300293, "learning_rate": 5.755298867332063e-06, "loss": 1.3181, "step": 7350 }, { "epoch": 1.4937424303593057, "grad_norm": 11.65937328338623, "learning_rate": 5.6431535269709545e-06, "loss": 1.2978, "step": 7400 }, { "epoch": 1.5038352846184901, "grad_norm": 12.603124618530273, "learning_rate": 5.531008186609847e-06, "loss": 1.3095, "step": 7450 }, { "epoch": 1.5139281388776746, "grad_norm": 11.710714340209961, "learning_rate": 5.418862846248739e-06, "loss": 1.2666, "step": 7500 }, { "epoch": 1.5240209931368591, "grad_norm": 12.024763107299805, "learning_rate": 5.306717505887631e-06, "loss": 1.3246, "step": 7550 }, { "epoch": 1.5341138473960436, "grad_norm": 13.552536964416504, "learning_rate": 5.194572165526522e-06, "loss": 1.2591, "step": 7600 }, { "epoch": 1.544206701655228, "grad_norm": 14.486330032348633, "learning_rate": 5.082426825165415e-06, "loss": 1.3162, "step": 7650 }, { "epoch": 1.5542995559144126, "grad_norm": 14.879488945007324, "learning_rate": 4.970281484804307e-06, "loss": 1.2642, "step": 7700 }, { "epoch": 1.564392410173597, "grad_norm": 18.098791122436523, "learning_rate": 4.858136144443199e-06, "loss": 1.3251, "step": 7750 }, { "epoch": 1.5744852644327816, "grad_norm": 12.657069206237793, "learning_rate": 4.74599080408209e-06, "loss": 1.2953, "step": 7800 }, { "epoch": 1.584578118691966, "grad_norm": 9.791812896728516, "learning_rate": 4.633845463720983e-06, "loss": 1.2816, "step": 7850 }, { "epoch": 1.5946709729511506, "grad_norm": 14.392789840698242, "learning_rate": 4.521700123359875e-06, "loss": 1.2961, "step": 7900 }, { "epoch": 1.604763827210335, "grad_norm": 13.632621765136719, "learning_rate": 4.409554782998767e-06, "loss": 1.3357, "step": 7950 }, { "epoch": 1.6148566814695196, "grad_norm": 12.285110473632812, "learning_rate": 4.297409442637659e-06, "loss": 1.319, "step": 8000 }, { "epoch": 1.624949535728704, "grad_norm": 13.019585609436035, "learning_rate": 4.18526410227655e-06, "loss": 1.285, "step": 8050 }, { "epoch": 1.6350423899878885, "grad_norm": 16.58565330505371, "learning_rate": 4.0731187619154425e-06, "loss": 1.2839, "step": 8100 }, { "epoch": 1.645135244247073, "grad_norm": 10.878052711486816, "learning_rate": 3.960973421554335e-06, "loss": 1.2821, "step": 8150 }, { "epoch": 1.6552280985062575, "grad_norm": 15.922154426574707, "learning_rate": 3.848828081193227e-06, "loss": 1.2763, "step": 8200 }, { "epoch": 1.665320952765442, "grad_norm": 13.756718635559082, "learning_rate": 3.7366827408321186e-06, "loss": 1.2714, "step": 8250 }, { "epoch": 1.6754138070246265, "grad_norm": 9.965152740478516, "learning_rate": 3.624537400471011e-06, "loss": 1.288, "step": 8300 }, { "epoch": 1.685506661283811, "grad_norm": 12.36296558380127, "learning_rate": 3.5123920601099026e-06, "loss": 1.2817, "step": 8350 }, { "epoch": 1.6955995155429955, "grad_norm": 13.448798179626465, "learning_rate": 3.4002467197487947e-06, "loss": 1.2894, "step": 8400 }, { "epoch": 1.70569236980218, "grad_norm": 9.591205596923828, "learning_rate": 3.2881013793876865e-06, "loss": 1.2892, "step": 8450 }, { "epoch": 1.7157852240613645, "grad_norm": 15.324070930480957, "learning_rate": 3.1759560390265787e-06, "loss": 1.2976, "step": 8500 }, { "epoch": 1.725878078320549, "grad_norm": 16.99766731262207, "learning_rate": 3.063810698665471e-06, "loss": 1.2416, "step": 8550 }, { "epoch": 1.7359709325797335, "grad_norm": 11.426822662353516, "learning_rate": 2.9516653583043626e-06, "loss": 1.2759, "step": 8600 }, { "epoch": 1.746063786838918, "grad_norm": 12.615915298461914, "learning_rate": 2.839520017943255e-06, "loss": 1.2956, "step": 8650 }, { "epoch": 1.7561566410981024, "grad_norm": 10.838130950927734, "learning_rate": 2.727374677582147e-06, "loss": 1.2532, "step": 8700 }, { "epoch": 1.766249495357287, "grad_norm": 13.932589530944824, "learning_rate": 2.6152293372210387e-06, "loss": 1.3155, "step": 8750 }, { "epoch": 1.7763423496164714, "grad_norm": 11.161537170410156, "learning_rate": 2.503083996859931e-06, "loss": 1.2575, "step": 8800 }, { "epoch": 1.786435203875656, "grad_norm": 13.190702438354492, "learning_rate": 2.3909386564988227e-06, "loss": 1.2629, "step": 8850 }, { "epoch": 1.7965280581348404, "grad_norm": 17.810091018676758, "learning_rate": 2.2787933161377144e-06, "loss": 1.277, "step": 8900 }, { "epoch": 1.806620912394025, "grad_norm": 12.80745792388916, "learning_rate": 2.1666479757766066e-06, "loss": 1.2732, "step": 8950 }, { "epoch": 1.8167137666532094, "grad_norm": 14.509129524230957, "learning_rate": 2.0545026354154988e-06, "loss": 1.3017, "step": 9000 }, { "epoch": 1.8268066209123939, "grad_norm": 16.78874397277832, "learning_rate": 1.9423572950543905e-06, "loss": 1.2932, "step": 9050 }, { "epoch": 1.8368994751715784, "grad_norm": 12.09203815460205, "learning_rate": 1.8302119546932825e-06, "loss": 1.2647, "step": 9100 }, { "epoch": 1.8469923294307629, "grad_norm": 17.09573745727539, "learning_rate": 1.7180666143321747e-06, "loss": 1.2795, "step": 9150 }, { "epoch": 1.8570851836899476, "grad_norm": 17.978240966796875, "learning_rate": 1.6059212739710667e-06, "loss": 1.2694, "step": 9200 }, { "epoch": 1.867178037949132, "grad_norm": 10.284587860107422, "learning_rate": 1.4937759336099586e-06, "loss": 1.2948, "step": 9250 }, { "epoch": 1.8772708922083166, "grad_norm": 22.253034591674805, "learning_rate": 1.3816305932488506e-06, "loss": 1.2731, "step": 9300 }, { "epoch": 1.887363746467501, "grad_norm": 10.942328453063965, "learning_rate": 1.2694852528877428e-06, "loss": 1.2967, "step": 9350 }, { "epoch": 1.8974566007266855, "grad_norm": 16.947174072265625, "learning_rate": 1.1573399125266345e-06, "loss": 1.2665, "step": 9400 }, { "epoch": 1.90754945498587, "grad_norm": 11.684677124023438, "learning_rate": 1.0451945721655265e-06, "loss": 1.2699, "step": 9450 }, { "epoch": 1.9176423092450545, "grad_norm": 13.600410461425781, "learning_rate": 9.330492318044186e-07, "loss": 1.2625, "step": 9500 }, { "epoch": 1.927735163504239, "grad_norm": 16.593090057373047, "learning_rate": 8.209038914433106e-07, "loss": 1.2491, "step": 9550 }, { "epoch": 1.9378280177634235, "grad_norm": 11.868796348571777, "learning_rate": 7.087585510822026e-07, "loss": 1.2725, "step": 9600 }, { "epoch": 1.947920872022608, "grad_norm": 11.491511344909668, "learning_rate": 5.966132107210946e-07, "loss": 1.2639, "step": 9650 }, { "epoch": 1.9580137262817925, "grad_norm": 14.748616218566895, "learning_rate": 4.844678703599866e-07, "loss": 1.3293, "step": 9700 }, { "epoch": 1.968106580540977, "grad_norm": 13.854447364807129, "learning_rate": 3.723225299988786e-07, "loss": 1.2409, "step": 9750 }, { "epoch": 1.9781994348001615, "grad_norm": 15.181270599365234, "learning_rate": 2.6017718963777056e-07, "loss": 1.2347, "step": 9800 }, { "epoch": 1.988292289059346, "grad_norm": 16.94820785522461, "learning_rate": 1.4803184927666255e-07, "loss": 1.2992, "step": 9850 }, { "epoch": 1.9983851433185305, "grad_norm": 12.58464527130127, "learning_rate": 3.812941572277672e-08, "loss": 1.2962, "step": 9900 }, { "epoch": 2.0, "eval_loss": 1.7698270082473755, "eval_runtime": 226.7653, "eval_samples_per_second": 16.383, "eval_steps_per_second": 2.051, "step": 9908 } ], "logging_steps": 50, "max_steps": 9908, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.362669429614182e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }