{ "best_global_step": 3300, "best_metric": 0.32621017, "best_model_checkpoint": "/mnt/shared-storage-user/mineru4s/jcwang/VPLT/outputs/checkpoints/29_lr2e-5_bs128_e1_VLT_TT_vp_ib09_1m_full/v0-20251204-195443/checkpoint-3300", "epoch": 1.0, "eval_steps": 100, "global_step": 7806, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00012810658467845247, "grad_norm": 61.75, "learning_rate": 5.115089514066497e-08, "loss": 1.7527258396148682, "step": 1, "token_acc": 0.6929900475984422 }, { "epoch": 0.0006405329233922624, "grad_norm": 73.0, "learning_rate": 2.5575447570332484e-07, "loss": 1.7571117877960205, "step": 5, "token_acc": 0.6861388813834098 }, { "epoch": 0.0012810658467845247, "grad_norm": 58.5, "learning_rate": 5.115089514066497e-07, "loss": 1.7617622375488282, "step": 10, "token_acc": 0.6896253352946267 }, { "epoch": 0.001921598770176787, "grad_norm": 71.0, "learning_rate": 7.672634271099745e-07, "loss": 1.7400665283203125, "step": 15, "token_acc": 0.6863151530854601 }, { "epoch": 0.0025621316935690495, "grad_norm": 53.5, "learning_rate": 1.0230179028132994e-06, "loss": 1.7050804138183593, "step": 20, "token_acc": 0.6901353798396137 }, { "epoch": 0.003202664616961312, "grad_norm": 54.25, "learning_rate": 1.2787723785166241e-06, "loss": 1.6199134826660155, "step": 25, "token_acc": 0.7034818581401362 }, { "epoch": 0.003843197540353574, "grad_norm": 47.25, "learning_rate": 1.534526854219949e-06, "loss": 1.4921659469604491, "step": 30, "token_acc": 0.7096607764736231 }, { "epoch": 0.004483730463745837, "grad_norm": 43.25, "learning_rate": 1.7902813299232737e-06, "loss": 1.3593175888061524, "step": 35, "token_acc": 0.71506187745246 }, { "epoch": 0.005124263387138099, "grad_norm": 30.25, "learning_rate": 2.0460358056265987e-06, "loss": 1.1928886413574218, "step": 40, "token_acc": 0.7095462606514122 }, { "epoch": 0.005764796310530361, "grad_norm": 61.0, "learning_rate": 2.3017902813299235e-06, "loss": 1.000108528137207, "step": 45, "token_acc": 0.6974425102084677 }, { "epoch": 0.006405329233922624, "grad_norm": 47.75, "learning_rate": 2.5575447570332483e-06, "loss": 0.8234397888183593, "step": 50, "token_acc": 0.7172779381976468 }, { "epoch": 0.007045862157314886, "grad_norm": 15.0625, "learning_rate": 2.813299232736573e-06, "loss": 0.6950876235961914, "step": 55, "token_acc": 0.7412205198829402 }, { "epoch": 0.007686395080707148, "grad_norm": 9.6875, "learning_rate": 3.069053708439898e-06, "loss": 0.6445055961608886, "step": 60, "token_acc": 0.7602104627593048 }, { "epoch": 0.00832692800409941, "grad_norm": 4.46875, "learning_rate": 3.3248081841432226e-06, "loss": 0.6182816982269287, "step": 65, "token_acc": 0.7633587786259542 }, { "epoch": 0.008967460927491674, "grad_norm": 4.21875, "learning_rate": 3.5805626598465474e-06, "loss": 0.6172842979431152, "step": 70, "token_acc": 0.7656499417576255 }, { "epoch": 0.009607993850883935, "grad_norm": 3.359375, "learning_rate": 3.836317135549873e-06, "loss": 0.6099654197692871, "step": 75, "token_acc": 0.7649229712912501 }, { "epoch": 0.010248526774276198, "grad_norm": 3.265625, "learning_rate": 4.092071611253197e-06, "loss": 0.6063261985778808, "step": 80, "token_acc": 0.7644379511859746 }, { "epoch": 0.01088905969766846, "grad_norm": 6.34375, "learning_rate": 4.347826086956522e-06, "loss": 0.5800480842590332, "step": 85, "token_acc": 0.7750280729031701 }, { "epoch": 0.011529592621060722, "grad_norm": 4.03125, "learning_rate": 4.603580562659847e-06, "loss": 0.5782370567321777, "step": 90, "token_acc": 0.7780650721827193 }, { "epoch": 0.012170125544452985, "grad_norm": 21.125, "learning_rate": 4.859335038363172e-06, "loss": 0.5692886352539063, "step": 95, "token_acc": 0.7785625080745877 }, { "epoch": 0.012810658467845248, "grad_norm": 4.03125, "learning_rate": 5.1150895140664966e-06, "loss": 0.5636235237121582, "step": 100, "token_acc": 0.7817540539378037 }, { "epoch": 0.012810658467845248, "eval_loss": 0.5616942644119263, "eval_runtime": 109.4288, "eval_samples_per_second": 91.384, "eval_steps_per_second": 11.423, "eval_token_acc": 0.782156125595894, "step": 100 }, { "epoch": 0.013451191391237509, "grad_norm": 4.34375, "learning_rate": 5.370843989769821e-06, "loss": 0.5511586189270019, "step": 105, "token_acc": 0.7842739323805 }, { "epoch": 0.014091724314629772, "grad_norm": 7.34375, "learning_rate": 5.626598465473146e-06, "loss": 0.5501980781555176, "step": 110, "token_acc": 0.7883570504527814 }, { "epoch": 0.014732257238022035, "grad_norm": 4.0, "learning_rate": 5.882352941176471e-06, "loss": 0.5424150466918946, "step": 115, "token_acc": 0.7874757908327954 }, { "epoch": 0.015372790161414296, "grad_norm": 4.9375, "learning_rate": 6.138107416879796e-06, "loss": 0.5345050811767578, "step": 120, "token_acc": 0.7919321508524195 }, { "epoch": 0.01601332308480656, "grad_norm": 3.9375, "learning_rate": 6.3938618925831205e-06, "loss": 0.5287456512451172, "step": 125, "token_acc": 0.7929382311045884 }, { "epoch": 0.01665385600819882, "grad_norm": 4.1875, "learning_rate": 6.649616368286445e-06, "loss": 0.5286868572235107, "step": 130, "token_acc": 0.7934768540489235 }, { "epoch": 0.017294388931591083, "grad_norm": 6.25, "learning_rate": 6.90537084398977e-06, "loss": 0.5210060119628906, "step": 135, "token_acc": 0.7963354171157577 }, { "epoch": 0.017934921854983348, "grad_norm": 5.0625, "learning_rate": 7.161125319693095e-06, "loss": 0.5186363697052002, "step": 140, "token_acc": 0.7987665502221072 }, { "epoch": 0.01857545477837561, "grad_norm": 6.09375, "learning_rate": 7.41687979539642e-06, "loss": 0.5118862152099609, "step": 145, "token_acc": 0.7989200863930885 }, { "epoch": 0.01921598770176787, "grad_norm": 6.15625, "learning_rate": 7.672634271099745e-06, "loss": 0.5256869316101074, "step": 150, "token_acc": 0.795885056483828 }, { "epoch": 0.019856520625160134, "grad_norm": 4.25, "learning_rate": 7.92838874680307e-06, "loss": 0.5057379722595214, "step": 155, "token_acc": 0.8006816514948876 }, { "epoch": 0.020497053548552396, "grad_norm": 9.9375, "learning_rate": 8.184143222506395e-06, "loss": 0.49903292655944825, "step": 160, "token_acc": 0.8046234796860174 }, { "epoch": 0.021137586471944657, "grad_norm": 6.78125, "learning_rate": 8.43989769820972e-06, "loss": 0.5005066871643067, "step": 165, "token_acc": 0.8053670973596647 }, { "epoch": 0.02177811939533692, "grad_norm": 5.46875, "learning_rate": 8.695652173913044e-06, "loss": 0.4884012222290039, "step": 170, "token_acc": 0.8094040079812613 }, { "epoch": 0.022418652318729183, "grad_norm": 7.90625, "learning_rate": 8.95140664961637e-06, "loss": 0.4938325881958008, "step": 175, "token_acc": 0.806672997237569 }, { "epoch": 0.023059185242121444, "grad_norm": 9.9375, "learning_rate": 9.207161125319694e-06, "loss": 0.5015275478363037, "step": 180, "token_acc": 0.8040328474998926 }, { "epoch": 0.02369971816551371, "grad_norm": 6.59375, "learning_rate": 9.462915601023019e-06, "loss": 0.4769923686981201, "step": 185, "token_acc": 0.8167363295557375 }, { "epoch": 0.02434025108890597, "grad_norm": 8.875, "learning_rate": 9.718670076726344e-06, "loss": 0.48226518630981446, "step": 190, "token_acc": 0.8120698554714384 }, { "epoch": 0.02498078401229823, "grad_norm": 5.875, "learning_rate": 9.974424552429668e-06, "loss": 0.48815107345581055, "step": 195, "token_acc": 0.8090380890897353 }, { "epoch": 0.025621316935690495, "grad_norm": 8.1875, "learning_rate": 1.0230179028132993e-05, "loss": 0.4772751808166504, "step": 200, "token_acc": 0.816347690845466 }, { "epoch": 0.025621316935690495, "eval_loss": 0.47741714119911194, "eval_runtime": 103.9123, "eval_samples_per_second": 96.235, "eval_steps_per_second": 12.029, "eval_token_acc": 0.8152050539557392, "step": 200 }, { "epoch": 0.026261849859082757, "grad_norm": 8.9375, "learning_rate": 1.0485933503836318e-05, "loss": 0.47005443572998046, "step": 205, "token_acc": 0.8176898432764742 }, { "epoch": 0.026902382782475018, "grad_norm": 10.0625, "learning_rate": 1.0741687979539643e-05, "loss": 0.466142463684082, "step": 210, "token_acc": 0.818608860541286 }, { "epoch": 0.027542915705867282, "grad_norm": 7.875, "learning_rate": 1.0997442455242967e-05, "loss": 0.4647233963012695, "step": 215, "token_acc": 0.8217214883881551 }, { "epoch": 0.028183448629259544, "grad_norm": 23.125, "learning_rate": 1.1253196930946292e-05, "loss": 0.46241116523742676, "step": 220, "token_acc": 0.8234608913240433 }, { "epoch": 0.028823981552651805, "grad_norm": 10.9375, "learning_rate": 1.1508951406649617e-05, "loss": 0.4518951416015625, "step": 225, "token_acc": 0.8269572375546986 }, { "epoch": 0.02946451447604407, "grad_norm": 16.875, "learning_rate": 1.1764705882352942e-05, "loss": 0.44137048721313477, "step": 230, "token_acc": 0.8298306556665219 }, { "epoch": 0.03010504739943633, "grad_norm": 12.4375, "learning_rate": 1.2020460358056267e-05, "loss": 0.453232479095459, "step": 235, "token_acc": 0.8256595964821521 }, { "epoch": 0.030745580322828592, "grad_norm": 8.0, "learning_rate": 1.2276214833759591e-05, "loss": 0.4504352569580078, "step": 240, "token_acc": 0.8276514337302782 }, { "epoch": 0.031386113246220856, "grad_norm": 16.25, "learning_rate": 1.2531969309462916e-05, "loss": 0.43747830390930176, "step": 245, "token_acc": 0.829782636878268 }, { "epoch": 0.03202664616961312, "grad_norm": 6.96875, "learning_rate": 1.2787723785166241e-05, "loss": 0.4497882843017578, "step": 250, "token_acc": 0.8272813524236674 }, { "epoch": 0.03266717909300538, "grad_norm": 7.25, "learning_rate": 1.3043478260869566e-05, "loss": 0.4439809322357178, "step": 255, "token_acc": 0.8280041258380608 }, { "epoch": 0.03330771201639764, "grad_norm": 12.375, "learning_rate": 1.329923273657289e-05, "loss": 0.4415604591369629, "step": 260, "token_acc": 0.8288237828522189 }, { "epoch": 0.03394824493978991, "grad_norm": 9.1875, "learning_rate": 1.3554987212276215e-05, "loss": 0.4370439529418945, "step": 265, "token_acc": 0.8311521132804119 }, { "epoch": 0.034588777863182166, "grad_norm": 8.5625, "learning_rate": 1.381074168797954e-05, "loss": 0.4249903678894043, "step": 270, "token_acc": 0.8351054633471646 }, { "epoch": 0.03522931078657443, "grad_norm": 7.65625, "learning_rate": 1.4066496163682865e-05, "loss": 0.42871723175048826, "step": 275, "token_acc": 0.8337790045717243 }, { "epoch": 0.035869843709966695, "grad_norm": 17.25, "learning_rate": 1.432225063938619e-05, "loss": 0.42778358459472654, "step": 280, "token_acc": 0.8345190359160092 }, { "epoch": 0.03651037663335895, "grad_norm": 5.40625, "learning_rate": 1.4578005115089514e-05, "loss": 0.42468814849853515, "step": 285, "token_acc": 0.8370123979437557 }, { "epoch": 0.03715090955675122, "grad_norm": 9.3125, "learning_rate": 1.483375959079284e-05, "loss": 0.42493228912353515, "step": 290, "token_acc": 0.8359849954727719 }, { "epoch": 0.03779144248014348, "grad_norm": 52.25, "learning_rate": 1.5089514066496164e-05, "loss": 0.42238712310791016, "step": 295, "token_acc": 0.8344741486934435 }, { "epoch": 0.03843197540353574, "grad_norm": 12.4375, "learning_rate": 1.534526854219949e-05, "loss": 0.4189589023590088, "step": 300, "token_acc": 0.8357866481946489 }, { "epoch": 0.03843197540353574, "eval_loss": 0.42732954025268555, "eval_runtime": 102.1151, "eval_samples_per_second": 97.929, "eval_steps_per_second": 12.241, "eval_token_acc": 0.8347027589681691, "step": 300 }, { "epoch": 0.039072508326928004, "grad_norm": 8.375, "learning_rate": 1.5601023017902815e-05, "loss": 0.4307071685791016, "step": 305, "token_acc": 0.8350831713112984 }, { "epoch": 0.03971304125032027, "grad_norm": 9.25, "learning_rate": 1.585677749360614e-05, "loss": 0.41645016670227053, "step": 310, "token_acc": 0.8390616240458838 }, { "epoch": 0.04035357417371253, "grad_norm": 239.0, "learning_rate": 1.6112531969309465e-05, "loss": 0.42703800201416015, "step": 315, "token_acc": 0.8354931760451199 }, { "epoch": 0.04099410709710479, "grad_norm": 10.6875, "learning_rate": 1.636828644501279e-05, "loss": 0.41779098510742185, "step": 320, "token_acc": 0.8380400467067423 }, { "epoch": 0.041634640020497056, "grad_norm": 7.03125, "learning_rate": 1.6624040920716114e-05, "loss": 0.418929386138916, "step": 325, "token_acc": 0.8373385012919896 }, { "epoch": 0.042275172943889314, "grad_norm": 10.0, "learning_rate": 1.687979539641944e-05, "loss": 0.4039362907409668, "step": 330, "token_acc": 0.8435945139099208 }, { "epoch": 0.04291570586728158, "grad_norm": 8.375, "learning_rate": 1.7135549872122764e-05, "loss": 0.4099921226501465, "step": 335, "token_acc": 0.8431601226728866 }, { "epoch": 0.04355623879067384, "grad_norm": 13.5, "learning_rate": 1.739130434782609e-05, "loss": 0.3967348575592041, "step": 340, "token_acc": 0.8471802400898177 }, { "epoch": 0.0441967717140661, "grad_norm": 26.0, "learning_rate": 1.7647058823529414e-05, "loss": 0.4075979709625244, "step": 345, "token_acc": 0.8455312553814363 }, { "epoch": 0.044837304637458365, "grad_norm": 11.4375, "learning_rate": 1.790281329923274e-05, "loss": 0.4026969909667969, "step": 350, "token_acc": 0.8446702255898054 }, { "epoch": 0.04547783756085063, "grad_norm": 7.28125, "learning_rate": 1.8158567774936063e-05, "loss": 0.4052872657775879, "step": 355, "token_acc": 0.8408119196717772 }, { "epoch": 0.04611837048424289, "grad_norm": 7.0, "learning_rate": 1.8414322250639388e-05, "loss": 0.3986091136932373, "step": 360, "token_acc": 0.8454004142216086 }, { "epoch": 0.04675890340763515, "grad_norm": 7.0625, "learning_rate": 1.8670076726342713e-05, "loss": 0.4026648044586182, "step": 365, "token_acc": 0.844288421778084 }, { "epoch": 0.04739943633102742, "grad_norm": 8.9375, "learning_rate": 1.8925831202046038e-05, "loss": 0.38652160167694094, "step": 370, "token_acc": 0.851685393258427 }, { "epoch": 0.048039969254419675, "grad_norm": 35.0, "learning_rate": 1.9181585677749362e-05, "loss": 0.3907599687576294, "step": 375, "token_acc": 0.8490174010908147 }, { "epoch": 0.04868050217781194, "grad_norm": 8.25, "learning_rate": 1.9437340153452687e-05, "loss": 0.39287233352661133, "step": 380, "token_acc": 0.8511720096518441 }, { "epoch": 0.049321035101204204, "grad_norm": 12.25, "learning_rate": 1.9693094629156012e-05, "loss": 0.3889561653137207, "step": 385, "token_acc": 0.848631170510886 }, { "epoch": 0.04996156802459646, "grad_norm": 8.9375, "learning_rate": 1.9948849104859337e-05, "loss": 0.38450467586517334, "step": 390, "token_acc": 0.8524625544956188 }, { "epoch": 0.050602100947988726, "grad_norm": 11.3125, "learning_rate": 1.999998563957419e-05, "loss": 0.39484443664550783, "step": 395, "token_acc": 0.8477848646785037 }, { "epoch": 0.05124263387138099, "grad_norm": 14.5, "learning_rate": 1.9999927300415016e-05, "loss": 0.3870053768157959, "step": 400, "token_acc": 0.8513297986982198 }, { "epoch": 0.05124263387138099, "eval_loss": 0.40171390771865845, "eval_runtime": 107.3275, "eval_samples_per_second": 93.173, "eval_steps_per_second": 11.647, "eval_token_acc": 0.8490457391853209, "step": 400 }, { "epoch": 0.05188316679477325, "grad_norm": 7.28125, "learning_rate": 1.9999824085257465e-05, "loss": 0.3954286575317383, "step": 405, "token_acc": 0.8484640466792518 }, { "epoch": 0.05252369971816551, "grad_norm": 8.9375, "learning_rate": 1.9999675994564737e-05, "loss": 0.3951833248138428, "step": 410, "token_acc": 0.8472842901340667 }, { "epoch": 0.05316423264155778, "grad_norm": 8.5625, "learning_rate": 1.99994830290014e-05, "loss": 0.3877572536468506, "step": 415, "token_acc": 0.8527802903045183 }, { "epoch": 0.053804765564950036, "grad_norm": 6.3125, "learning_rate": 1.999924518943342e-05, "loss": 0.3790754318237305, "step": 420, "token_acc": 0.8536078906385188 }, { "epoch": 0.0544452984883423, "grad_norm": 7.21875, "learning_rate": 1.999896247692813e-05, "loss": 0.37895793914794923, "step": 425, "token_acc": 0.8564549713690786 }, { "epoch": 0.055085831411734565, "grad_norm": 7.625, "learning_rate": 1.999863489275424e-05, "loss": 0.3699763774871826, "step": 430, "token_acc": 0.8588138812154696 }, { "epoch": 0.05572636433512682, "grad_norm": 6.1875, "learning_rate": 1.9998262438381828e-05, "loss": 0.3807647228240967, "step": 435, "token_acc": 0.8515137160329013 }, { "epoch": 0.05636689725851909, "grad_norm": 4.875, "learning_rate": 1.9997845115482334e-05, "loss": 0.37743220329284666, "step": 440, "token_acc": 0.853497694064911 }, { "epoch": 0.05700743018191135, "grad_norm": 11.0, "learning_rate": 1.9997382925928544e-05, "loss": 0.36346435546875, "step": 445, "token_acc": 0.8598952244880288 }, { "epoch": 0.05764796310530361, "grad_norm": 6.0625, "learning_rate": 1.99968758717946e-05, "loss": 0.36632614135742186, "step": 450, "token_acc": 0.8593689131281652 }, { "epoch": 0.058288496028695874, "grad_norm": 22.375, "learning_rate": 1.9996323955355972e-05, "loss": 0.38116629123687745, "step": 455, "token_acc": 0.8530322580645161 }, { "epoch": 0.05892902895208814, "grad_norm": 8.9375, "learning_rate": 1.9995727179089463e-05, "loss": 0.3787653684616089, "step": 460, "token_acc": 0.8553301683211049 }, { "epoch": 0.0595695618754804, "grad_norm": 5.71875, "learning_rate": 1.9995085545673177e-05, "loss": 0.37586026191711425, "step": 465, "token_acc": 0.8558023415977961 }, { "epoch": 0.06021009479887266, "grad_norm": 8.5625, "learning_rate": 1.9994399057986537e-05, "loss": 0.36600193977355955, "step": 470, "token_acc": 0.8605023127134397 }, { "epoch": 0.060850627722264926, "grad_norm": 21.25, "learning_rate": 1.9993667719110245e-05, "loss": 0.37952864170074463, "step": 475, "token_acc": 0.8555727954486683 }, { "epoch": 0.061491160645657184, "grad_norm": 16.0, "learning_rate": 1.9992891532326277e-05, "loss": 0.379518985748291, "step": 480, "token_acc": 0.8553931082071851 }, { "epoch": 0.06213169356904945, "grad_norm": 7.0, "learning_rate": 1.9992070501117877e-05, "loss": 0.3733321189880371, "step": 485, "token_acc": 0.8571675153188919 }, { "epoch": 0.06277222649244171, "grad_norm": 11.5, "learning_rate": 1.9991204629169534e-05, "loss": 0.36601009368896487, "step": 490, "token_acc": 0.8613763013521103 }, { "epoch": 0.06341275941583398, "grad_norm": 6.59375, "learning_rate": 1.9990293920366957e-05, "loss": 0.3734764814376831, "step": 495, "token_acc": 0.8549542551355084 }, { "epoch": 0.06405329233922624, "grad_norm": 446.0, "learning_rate": 1.998933837879708e-05, "loss": 0.3742378234863281, "step": 500, "token_acc": 0.8572477856988551 }, { "epoch": 0.06405329233922624, "eval_loss": 0.3812016248703003, "eval_runtime": 102.75, "eval_samples_per_second": 97.324, "eval_steps_per_second": 12.165, "eval_token_acc": 0.8565535875445017, "step": 500 }, { "epoch": 0.06469382526261849, "grad_norm": 9.875, "learning_rate": 1.998833800874802e-05, "loss": 0.377778148651123, "step": 505, "token_acc": 0.8549496860755139 }, { "epoch": 0.06533435818601076, "grad_norm": 8.5, "learning_rate": 1.9987292814709064e-05, "loss": 0.36702466011047363, "step": 510, "token_acc": 0.8602703052808843 }, { "epoch": 0.06597489110940302, "grad_norm": 8.3125, "learning_rate": 1.9986202801370665e-05, "loss": 0.3668750047683716, "step": 515, "token_acc": 0.8610121474868009 }, { "epoch": 0.06661542403279529, "grad_norm": 5.5, "learning_rate": 1.9985067973624402e-05, "loss": 0.368256139755249, "step": 520, "token_acc": 0.8598641210870313 }, { "epoch": 0.06725595695618755, "grad_norm": 6.1875, "learning_rate": 1.9983888336562962e-05, "loss": 0.3637028694152832, "step": 525, "token_acc": 0.8616238543952498 }, { "epoch": 0.06789648987957982, "grad_norm": 8.8125, "learning_rate": 1.9982663895480125e-05, "loss": 0.3575170040130615, "step": 530, "token_acc": 0.8624665342430262 }, { "epoch": 0.06853702280297207, "grad_norm": 4.46875, "learning_rate": 1.9981394655870728e-05, "loss": 0.3676267147064209, "step": 535, "token_acc": 0.8586340206185566 }, { "epoch": 0.06917755572636433, "grad_norm": 5.28125, "learning_rate": 1.998008062343066e-05, "loss": 0.3628795385360718, "step": 540, "token_acc": 0.8599716507022894 }, { "epoch": 0.0698180886497566, "grad_norm": 4.40625, "learning_rate": 1.9978721804056806e-05, "loss": 0.351765513420105, "step": 545, "token_acc": 0.8640237603305785 }, { "epoch": 0.07045862157314886, "grad_norm": 7.4375, "learning_rate": 1.9977318203847056e-05, "loss": 0.35065426826477053, "step": 550, "token_acc": 0.864151596435061 }, { "epoch": 0.07109915449654113, "grad_norm": 5.53125, "learning_rate": 1.9975869829100248e-05, "loss": 0.3636244535446167, "step": 555, "token_acc": 0.8597329888027563 }, { "epoch": 0.07173968741993339, "grad_norm": 5.34375, "learning_rate": 1.9974376686316158e-05, "loss": 0.3594621181488037, "step": 560, "token_acc": 0.8631669907107367 }, { "epoch": 0.07238022034332564, "grad_norm": 4.78125, "learning_rate": 1.9972838782195455e-05, "loss": 0.36011404991149903, "step": 565, "token_acc": 0.8637501078609026 }, { "epoch": 0.0730207532667179, "grad_norm": 4.90625, "learning_rate": 1.99712561236397e-05, "loss": 0.36135361194610593, "step": 570, "token_acc": 0.8631270470608515 }, { "epoch": 0.07366128619011017, "grad_norm": 13.3125, "learning_rate": 1.9969628717751267e-05, "loss": 0.3561633825302124, "step": 575, "token_acc": 0.8632323755285184 }, { "epoch": 0.07430181911350243, "grad_norm": 4.59375, "learning_rate": 1.9967956571833375e-05, "loss": 0.347505521774292, "step": 580, "token_acc": 0.8662591581046517 }, { "epoch": 0.0749423520368947, "grad_norm": 5.90625, "learning_rate": 1.9966239693389982e-05, "loss": 0.3540546417236328, "step": 585, "token_acc": 0.8638228055783429 }, { "epoch": 0.07558288496028696, "grad_norm": 6.28125, "learning_rate": 1.9964478090125815e-05, "loss": 0.33905773162841796, "step": 590, "token_acc": 0.8724406047516199 }, { "epoch": 0.07622341788367921, "grad_norm": 5.125, "learning_rate": 1.9962671769946303e-05, "loss": 0.3554720401763916, "step": 595, "token_acc": 0.8650370115338268 }, { "epoch": 0.07686395080707148, "grad_norm": 16.25, "learning_rate": 1.9960820740957546e-05, "loss": 0.3572436094284058, "step": 600, "token_acc": 0.8659878327652414 }, { "epoch": 0.07686395080707148, "eval_loss": 0.36843228340148926, "eval_runtime": 102.1933, "eval_samples_per_second": 97.854, "eval_steps_per_second": 12.232, "eval_token_acc": 0.8618771835602482, "step": 600 }, { "epoch": 0.07750448373046374, "grad_norm": 5.59375, "learning_rate": 1.9958925011466283e-05, "loss": 0.3591419458389282, "step": 605, "token_acc": 0.8607949646490775 }, { "epoch": 0.07814501665385601, "grad_norm": 7.90625, "learning_rate": 1.9956984589979846e-05, "loss": 0.3471505165100098, "step": 610, "token_acc": 0.8661698828394211 }, { "epoch": 0.07878554957724827, "grad_norm": 6.96875, "learning_rate": 1.9954999485206143e-05, "loss": 0.34771528244018557, "step": 615, "token_acc": 0.8679774069762428 }, { "epoch": 0.07942608250064054, "grad_norm": 15.4375, "learning_rate": 1.9952969706053585e-05, "loss": 0.35360991954803467, "step": 620, "token_acc": 0.8646970023722235 }, { "epoch": 0.08006661542403279, "grad_norm": 5.59375, "learning_rate": 1.995089526163108e-05, "loss": 0.3377622127532959, "step": 625, "token_acc": 0.8723486808070356 }, { "epoch": 0.08070714834742505, "grad_norm": 4.25, "learning_rate": 1.994877616124797e-05, "loss": 0.3453701019287109, "step": 630, "token_acc": 0.8684722042244396 }, { "epoch": 0.08134768127081732, "grad_norm": 10.5625, "learning_rate": 1.9946612414414003e-05, "loss": 0.35302703380584716, "step": 635, "token_acc": 0.8653087478559177 }, { "epoch": 0.08198821419420958, "grad_norm": 18.875, "learning_rate": 1.9944404030839273e-05, "loss": 0.3411895513534546, "step": 640, "token_acc": 0.8682500752979648 }, { "epoch": 0.08262874711760185, "grad_norm": 7.09375, "learning_rate": 1.99421510204342e-05, "loss": 0.34150052070617676, "step": 645, "token_acc": 0.8713020295837633 }, { "epoch": 0.08326928004099411, "grad_norm": 4.09375, "learning_rate": 1.993985339330946e-05, "loss": 0.3460074424743652, "step": 650, "token_acc": 0.8699866454142076 }, { "epoch": 0.08390981296438636, "grad_norm": 6.5, "learning_rate": 1.993751115977596e-05, "loss": 0.338161039352417, "step": 655, "token_acc": 0.8716904276985743 }, { "epoch": 0.08455034588777863, "grad_norm": 4.59375, "learning_rate": 1.993512433034479e-05, "loss": 0.34201898574829104, "step": 660, "token_acc": 0.8709761050857711 }, { "epoch": 0.08519087881117089, "grad_norm": 4.53125, "learning_rate": 1.993269291572716e-05, "loss": 0.33865838050842284, "step": 665, "token_acc": 0.8712311015118791 }, { "epoch": 0.08583141173456316, "grad_norm": 5.40625, "learning_rate": 1.9930216926834366e-05, "loss": 0.3336113691329956, "step": 670, "token_acc": 0.8717793270688088 }, { "epoch": 0.08647194465795542, "grad_norm": 6.3125, "learning_rate": 1.992769637477773e-05, "loss": 0.3334836959838867, "step": 675, "token_acc": 0.8705119896305897 }, { "epoch": 0.08711247758134769, "grad_norm": 6.21875, "learning_rate": 1.9925131270868568e-05, "loss": 0.34505319595336914, "step": 680, "token_acc": 0.8677537009225488 }, { "epoch": 0.08775301050473995, "grad_norm": 8.375, "learning_rate": 1.9922521626618127e-05, "loss": 0.34624500274658204, "step": 685, "token_acc": 0.8689443941158708 }, { "epoch": 0.0883935434281322, "grad_norm": 7.125, "learning_rate": 1.9919867453737524e-05, "loss": 0.34455955028533936, "step": 690, "token_acc": 0.8656286291883521 }, { "epoch": 0.08903407635152447, "grad_norm": 6.0, "learning_rate": 1.9917168764137718e-05, "loss": 0.3470313549041748, "step": 695, "token_acc": 0.8690256366212908 }, { "epoch": 0.08967460927491673, "grad_norm": 4.6875, "learning_rate": 1.991442556992943e-05, "loss": 0.3429619312286377, "step": 700, "token_acc": 0.8695521102497846 }, { "epoch": 0.08967460927491673, "eval_loss": 0.35823777318000793, "eval_runtime": 102.8638, "eval_samples_per_second": 97.216, "eval_steps_per_second": 12.152, "eval_token_acc": 0.8654677732806972, "step": 700 }, { "epoch": 0.090315142198309, "grad_norm": 5.3125, "learning_rate": 1.9911637883423115e-05, "loss": 0.3434779167175293, "step": 705, "token_acc": 0.8690055962117951 }, { "epoch": 0.09095567512170126, "grad_norm": 4.84375, "learning_rate": 1.9908805717128876e-05, "loss": 0.3410910129547119, "step": 710, "token_acc": 0.8684765305683432 }, { "epoch": 0.09159620804509352, "grad_norm": 4.25, "learning_rate": 1.9905929083756442e-05, "loss": 0.34179034233093264, "step": 715, "token_acc": 0.8675887624956912 }, { "epoch": 0.09223674096848578, "grad_norm": 10.5, "learning_rate": 1.990300799621508e-05, "loss": 0.34283981323242185, "step": 720, "token_acc": 0.8702763191873978 }, { "epoch": 0.09287727389187804, "grad_norm": 5.5625, "learning_rate": 1.9900042467613562e-05, "loss": 0.34240546226501467, "step": 725, "token_acc": 0.8690830636461705 }, { "epoch": 0.0935178068152703, "grad_norm": 5.34375, "learning_rate": 1.9897032511260092e-05, "loss": 0.34098148345947266, "step": 730, "token_acc": 0.8700211452984077 }, { "epoch": 0.09415833973866257, "grad_norm": 3.890625, "learning_rate": 1.989397814066224e-05, "loss": 0.32979321479797363, "step": 735, "token_acc": 0.8732680105322226 }, { "epoch": 0.09479887266205483, "grad_norm": 3.6875, "learning_rate": 1.9890879369526907e-05, "loss": 0.33590106964111327, "step": 740, "token_acc": 0.868538938662991 }, { "epoch": 0.0954394055854471, "grad_norm": 4.3125, "learning_rate": 1.9887736211760237e-05, "loss": 0.33802223205566406, "step": 745, "token_acc": 0.8701846511427711 }, { "epoch": 0.09607993850883935, "grad_norm": 8.125, "learning_rate": 1.9884548681467565e-05, "loss": 0.3298491477966309, "step": 750, "token_acc": 0.8728583142721505 }, { "epoch": 0.09672047143223161, "grad_norm": 8.1875, "learning_rate": 1.9881316792953352e-05, "loss": 0.34202146530151367, "step": 755, "token_acc": 0.8698966408268733 }, { "epoch": 0.09736100435562388, "grad_norm": 4.6875, "learning_rate": 1.987804056072113e-05, "loss": 0.3250537872314453, "step": 760, "token_acc": 0.8748375920311823 }, { "epoch": 0.09800153727901614, "grad_norm": 6.0, "learning_rate": 1.987471999947343e-05, "loss": 0.34002318382263186, "step": 765, "token_acc": 0.8696007571846498 }, { "epoch": 0.09864207020240841, "grad_norm": 6.71875, "learning_rate": 1.9871355124111704e-05, "loss": 0.3327933311462402, "step": 770, "token_acc": 0.8724304715840387 }, { "epoch": 0.09928260312580067, "grad_norm": 5.125, "learning_rate": 1.986794594973627e-05, "loss": 0.334125280380249, "step": 775, "token_acc": 0.8736641716782763 }, { "epoch": 0.09992313604919292, "grad_norm": 5.375, "learning_rate": 1.986449249164626e-05, "loss": 0.33797569274902345, "step": 780, "token_acc": 0.870381508850318 }, { "epoch": 0.10056366897258519, "grad_norm": 5.875, "learning_rate": 1.986099476533953e-05, "loss": 0.337173318862915, "step": 785, "token_acc": 0.8695614640883977 }, { "epoch": 0.10120420189597745, "grad_norm": 4.1875, "learning_rate": 1.9857452786512575e-05, "loss": 0.31865544319152833, "step": 790, "token_acc": 0.8768464370803553 }, { "epoch": 0.10184473481936972, "grad_norm": 3.640625, "learning_rate": 1.98538665710605e-05, "loss": 0.3357419013977051, "step": 795, "token_acc": 0.8707342295760083 }, { "epoch": 0.10248526774276198, "grad_norm": 8.375, "learning_rate": 1.985023613507692e-05, "loss": 0.3254246711730957, "step": 800, "token_acc": 0.8745312702038706 }, { "epoch": 0.10248526774276198, "eval_loss": 0.3556683361530304, "eval_runtime": 102.7565, "eval_samples_per_second": 97.317, "eval_steps_per_second": 12.165, "eval_token_acc": 0.866935015032307, "step": 800 }, { "epoch": 0.10312580066615425, "grad_norm": 5.09375, "learning_rate": 1.9846561494853904e-05, "loss": 0.33404462337493895, "step": 805, "token_acc": 0.8705380237972065 }, { "epoch": 0.1037663335895465, "grad_norm": 3.53125, "learning_rate": 1.9842842666881885e-05, "loss": 0.3421541690826416, "step": 810, "token_acc": 0.8684289705566302 }, { "epoch": 0.10440686651293876, "grad_norm": 4.59375, "learning_rate": 1.983907966784959e-05, "loss": 0.3375978469848633, "step": 815, "token_acc": 0.869295677630446 }, { "epoch": 0.10504739943633103, "grad_norm": 9.0625, "learning_rate": 1.9835272514643978e-05, "loss": 0.3273109674453735, "step": 820, "token_acc": 0.8760344827586207 }, { "epoch": 0.10568793235972329, "grad_norm": 6.875, "learning_rate": 1.9831421224350156e-05, "loss": 0.3292600154876709, "step": 825, "token_acc": 0.874255631310952 }, { "epoch": 0.10632846528311556, "grad_norm": 11.5, "learning_rate": 1.98275258142513e-05, "loss": 0.32775206565856935, "step": 830, "token_acc": 0.8719834817395793 }, { "epoch": 0.10696899820650782, "grad_norm": 4.0, "learning_rate": 1.9823586301828572e-05, "loss": 0.3248668909072876, "step": 835, "token_acc": 0.876129143795652 }, { "epoch": 0.10760953112990007, "grad_norm": 16.25, "learning_rate": 1.9819602704761066e-05, "loss": 0.3292513132095337, "step": 840, "token_acc": 0.8749297722459917 }, { "epoch": 0.10825006405329234, "grad_norm": 8.0, "learning_rate": 1.9815575040925693e-05, "loss": 0.3171013116836548, "step": 845, "token_acc": 0.8782070696145027 }, { "epoch": 0.1088905969766846, "grad_norm": 2.96875, "learning_rate": 1.9811503328397133e-05, "loss": 0.319035267829895, "step": 850, "token_acc": 0.8752912747044101 }, { "epoch": 0.10953112990007687, "grad_norm": 5.9375, "learning_rate": 1.9807387585447734e-05, "loss": 0.32436022758483884, "step": 855, "token_acc": 0.876150555291474 }, { "epoch": 0.11017166282346913, "grad_norm": 4.5625, "learning_rate": 1.9803227830547437e-05, "loss": 0.33043532371520995, "step": 860, "token_acc": 0.8730488173995763 }, { "epoch": 0.1108121957468614, "grad_norm": 4.71875, "learning_rate": 1.9799024082363692e-05, "loss": 0.3189000129699707, "step": 865, "token_acc": 0.8785994905668523 }, { "epoch": 0.11145272867025365, "grad_norm": 5.65625, "learning_rate": 1.9794776359761378e-05, "loss": 0.32372350692749025, "step": 870, "token_acc": 0.8751831107281344 }, { "epoch": 0.11209326159364591, "grad_norm": 4.125, "learning_rate": 1.9790484681802707e-05, "loss": 0.3230480670928955, "step": 875, "token_acc": 0.8766306695464363 }, { "epoch": 0.11273379451703817, "grad_norm": 4.8125, "learning_rate": 1.9786149067747163e-05, "loss": 0.32105169296264646, "step": 880, "token_acc": 0.8791981030394481 }, { "epoch": 0.11337432744043044, "grad_norm": 5.0625, "learning_rate": 1.9781769537051384e-05, "loss": 0.3278522968292236, "step": 885, "token_acc": 0.8761810259286423 }, { "epoch": 0.1140148603638227, "grad_norm": 31.375, "learning_rate": 1.9777346109369088e-05, "loss": 0.3238049030303955, "step": 890, "token_acc": 0.8749892120479849 }, { "epoch": 0.11465539328721497, "grad_norm": 4.9375, "learning_rate": 1.9772878804551e-05, "loss": 0.33077249526977537, "step": 895, "token_acc": 0.8735443802294488 }, { "epoch": 0.11529592621060722, "grad_norm": 3.703125, "learning_rate": 1.9768367642644742e-05, "loss": 0.32166156768798826, "step": 900, "token_acc": 0.8777322698857513 }, { "epoch": 0.11529592621060722, "eval_loss": 0.34898844361305237, "eval_runtime": 102.6281, "eval_samples_per_second": 97.439, "eval_steps_per_second": 12.18, "eval_token_acc": 0.8700798954659461, "step": 900 }, { "epoch": 0.11593645913399948, "grad_norm": 5.3125, "learning_rate": 1.9763812643894743e-05, "loss": 0.32224602699279786, "step": 905, "token_acc": 0.8758474759252062 }, { "epoch": 0.11657699205739175, "grad_norm": 4.3125, "learning_rate": 1.975921382874217e-05, "loss": 0.32654175758361814, "step": 910, "token_acc": 0.8768330968047133 }, { "epoch": 0.11721752498078401, "grad_norm": 3.375, "learning_rate": 1.9754571217824815e-05, "loss": 0.3332622528076172, "step": 915, "token_acc": 0.8733866804336603 }, { "epoch": 0.11785805790417628, "grad_norm": 3.53125, "learning_rate": 1.974988483197701e-05, "loss": 0.3266796112060547, "step": 920, "token_acc": 0.873924638678596 }, { "epoch": 0.11849859082756854, "grad_norm": 2.890625, "learning_rate": 1.9745154692229524e-05, "loss": 0.3260995388031006, "step": 925, "token_acc": 0.8767960079153403 }, { "epoch": 0.1191391237509608, "grad_norm": 5.4375, "learning_rate": 1.9740380819809498e-05, "loss": 0.3234872817993164, "step": 930, "token_acc": 0.8770484733482836 }, { "epoch": 0.11977965667435306, "grad_norm": 4.34375, "learning_rate": 1.9735563236140307e-05, "loss": 0.3268592119216919, "step": 935, "token_acc": 0.8750322747224374 }, { "epoch": 0.12042018959774532, "grad_norm": 5.71875, "learning_rate": 1.9730701962841504e-05, "loss": 0.3228474140167236, "step": 940, "token_acc": 0.8774657593246619 }, { "epoch": 0.12106072252113759, "grad_norm": 10.625, "learning_rate": 1.9725797021728687e-05, "loss": 0.32127084732055666, "step": 945, "token_acc": 0.8768540876164195 }, { "epoch": 0.12170125544452985, "grad_norm": 4.3125, "learning_rate": 1.9720848434813437e-05, "loss": 0.3093282222747803, "step": 950, "token_acc": 0.8818652849740932 }, { "epoch": 0.12234178836792212, "grad_norm": 3.640625, "learning_rate": 1.9715856224303193e-05, "loss": 0.3240875244140625, "step": 955, "token_acc": 0.8766339869281046 }, { "epoch": 0.12298232129131437, "grad_norm": 10.125, "learning_rate": 1.9710820412601156e-05, "loss": 0.31369385719299314, "step": 960, "token_acc": 0.8786259212964959 }, { "epoch": 0.12362285421470663, "grad_norm": 4.5, "learning_rate": 1.97057410223062e-05, "loss": 0.3160251617431641, "step": 965, "token_acc": 0.8773067116124292 }, { "epoch": 0.1242633871380989, "grad_norm": 6.1875, "learning_rate": 1.9700618076212767e-05, "loss": 0.32041115760803224, "step": 970, "token_acc": 0.876734235207676 }, { "epoch": 0.12490392006149116, "grad_norm": 6.8125, "learning_rate": 1.969545159731075e-05, "loss": 0.3188473224639893, "step": 975, "token_acc": 0.8775827114696113 }, { "epoch": 0.12554445298488343, "grad_norm": 5.75, "learning_rate": 1.9690241608785404e-05, "loss": 0.31864352226257325, "step": 980, "token_acc": 0.8768779140044898 }, { "epoch": 0.12618498590827568, "grad_norm": 6.03125, "learning_rate": 1.9684988134017254e-05, "loss": 0.32373876571655275, "step": 985, "token_acc": 0.8762113968212948 }, { "epoch": 0.12682551883166795, "grad_norm": 7.625, "learning_rate": 1.9679691196581957e-05, "loss": 0.3241652727127075, "step": 990, "token_acc": 0.8755324183625177 }, { "epoch": 0.1274660517550602, "grad_norm": 5.0625, "learning_rate": 1.9674350820250222e-05, "loss": 0.31421942710876466, "step": 995, "token_acc": 0.8811829816672432 }, { "epoch": 0.12810658467845248, "grad_norm": 4.25, "learning_rate": 1.9668967028987694e-05, "loss": 0.3193212985992432, "step": 1000, "token_acc": 0.8778954619822612 }, { "epoch": 0.12810658467845248, "eval_loss": 0.34447741508483887, "eval_runtime": 102.4624, "eval_samples_per_second": 97.597, "eval_steps_per_second": 12.2, "eval_token_acc": 0.8711014279307462, "step": 1000 }, { "epoch": 0.12874711760184473, "grad_norm": 8.5, "learning_rate": 1.966353984695485e-05, "loss": 0.3204664707183838, "step": 1005, "token_acc": 0.8778589649357948 }, { "epoch": 0.12938765052523699, "grad_norm": 3.140625, "learning_rate": 1.965806929850689e-05, "loss": 0.3205430030822754, "step": 1010, "token_acc": 0.877480437508106 }, { "epoch": 0.13002818344862926, "grad_norm": 5.15625, "learning_rate": 1.9652555408193623e-05, "loss": 0.31981477737426756, "step": 1015, "token_acc": 0.8765867722363269 }, { "epoch": 0.13066871637202152, "grad_norm": 5.5, "learning_rate": 1.9646998200759366e-05, "loss": 0.31712310314178466, "step": 1020, "token_acc": 0.8808243342081487 }, { "epoch": 0.1313092492954138, "grad_norm": 5.21875, "learning_rate": 1.9641397701142818e-05, "loss": 0.3185598850250244, "step": 1025, "token_acc": 0.8790852498703096 }, { "epoch": 0.13194978221880604, "grad_norm": 4.625, "learning_rate": 1.9635753934476963e-05, "loss": 0.31679530143737794, "step": 1030, "token_acc": 0.8782927355033412 }, { "epoch": 0.13259031514219832, "grad_norm": 4.15625, "learning_rate": 1.963006692608896e-05, "loss": 0.3205821990966797, "step": 1035, "token_acc": 0.8770858420781008 }, { "epoch": 0.13323084806559057, "grad_norm": 2.640625, "learning_rate": 1.9624336701500005e-05, "loss": 0.3191715717315674, "step": 1040, "token_acc": 0.8761724464331813 }, { "epoch": 0.13387138098898282, "grad_norm": 5.78125, "learning_rate": 1.9618563286425236e-05, "loss": 0.3229659080505371, "step": 1045, "token_acc": 0.8782109398609852 }, { "epoch": 0.1345119139123751, "grad_norm": 3.796875, "learning_rate": 1.9612746706773627e-05, "loss": 0.3189516067504883, "step": 1050, "token_acc": 0.8770367809136881 }, { "epoch": 0.13515244683576735, "grad_norm": 3.453125, "learning_rate": 1.9606886988647846e-05, "loss": 0.31815266609191895, "step": 1055, "token_acc": 0.8782769920662298 }, { "epoch": 0.13579297975915963, "grad_norm": 2.75, "learning_rate": 1.9600984158344153e-05, "loss": 0.3152862548828125, "step": 1060, "token_acc": 0.8782785291448818 }, { "epoch": 0.13643351268255188, "grad_norm": 8.625, "learning_rate": 1.9595038242352283e-05, "loss": 0.31676223278045657, "step": 1065, "token_acc": 0.8806634129486459 }, { "epoch": 0.13707404560594413, "grad_norm": 12.3125, "learning_rate": 1.958904926735532e-05, "loss": 0.31054699420928955, "step": 1070, "token_acc": 0.8809544356230583 }, { "epoch": 0.1377145785293364, "grad_norm": 7.09375, "learning_rate": 1.958301726022958e-05, "loss": 0.30883467197418213, "step": 1075, "token_acc": 0.8819924033149171 }, { "epoch": 0.13835511145272866, "grad_norm": 4.46875, "learning_rate": 1.9576942248044505e-05, "loss": 0.31630141735076905, "step": 1080, "token_acc": 0.8776055124892335 }, { "epoch": 0.13899564437612094, "grad_norm": 4.65625, "learning_rate": 1.95708242580625e-05, "loss": 0.30964021682739257, "step": 1085, "token_acc": 0.8792420327304048 }, { "epoch": 0.1396361772995132, "grad_norm": 4.65625, "learning_rate": 1.956466331773887e-05, "loss": 0.3171691417694092, "step": 1090, "token_acc": 0.8775633293124246 }, { "epoch": 0.14027671022290547, "grad_norm": 3.5625, "learning_rate": 1.9558459454721642e-05, "loss": 0.31899094581604004, "step": 1095, "token_acc": 0.878079188341 }, { "epoch": 0.14091724314629772, "grad_norm": 3.234375, "learning_rate": 1.955221269685148e-05, "loss": 0.31024134159088135, "step": 1100, "token_acc": 0.8816972001382648 }, { "epoch": 0.14091724314629772, "eval_loss": 0.3438940942287445, "eval_runtime": 103.1967, "eval_samples_per_second": 96.902, "eval_steps_per_second": 12.113, "eval_token_acc": 0.8718599642325219, "step": 1100 }, { "epoch": 0.14155777606968997, "grad_norm": 4.5, "learning_rate": 1.9545923072161534e-05, "loss": 0.31498963832855226, "step": 1105, "token_acc": 0.8790587219343696 }, { "epoch": 0.14219830899308225, "grad_norm": 3.53125, "learning_rate": 1.9539590608877326e-05, "loss": 0.3086799144744873, "step": 1110, "token_acc": 0.8817727625118442 }, { "epoch": 0.1428388419164745, "grad_norm": 6.46875, "learning_rate": 1.9533215335416623e-05, "loss": 0.3052536487579346, "step": 1115, "token_acc": 0.8838970651519064 }, { "epoch": 0.14347937483986678, "grad_norm": 4.125, "learning_rate": 1.9526797280389314e-05, "loss": 0.3200625658035278, "step": 1120, "token_acc": 0.8772527377770113 }, { "epoch": 0.14411990776325903, "grad_norm": 3.5625, "learning_rate": 1.952033647259727e-05, "loss": 0.3077129364013672, "step": 1125, "token_acc": 0.8831737581039887 }, { "epoch": 0.14476044068665128, "grad_norm": 10.125, "learning_rate": 1.951383294103422e-05, "loss": 0.31737732887268066, "step": 1130, "token_acc": 0.8796427497960411 }, { "epoch": 0.14540097361004356, "grad_norm": 4.40625, "learning_rate": 1.9507286714885623e-05, "loss": 0.31585164070129396, "step": 1135, "token_acc": 0.8788506342221072 }, { "epoch": 0.1460415065334358, "grad_norm": 3.34375, "learning_rate": 1.9500697823528538e-05, "loss": 0.32686147689819334, "step": 1140, "token_acc": 0.8751074806534824 }, { "epoch": 0.1466820394568281, "grad_norm": 4.9375, "learning_rate": 1.9494066296531484e-05, "loss": 0.3137520790100098, "step": 1145, "token_acc": 0.8789589348041539 }, { "epoch": 0.14732257238022034, "grad_norm": 4.59375, "learning_rate": 1.948739216365432e-05, "loss": 0.30913615226745605, "step": 1150, "token_acc": 0.8794063079777366 }, { "epoch": 0.14796310530361262, "grad_norm": 3.5625, "learning_rate": 1.9480675454848103e-05, "loss": 0.3166754722595215, "step": 1155, "token_acc": 0.8766857684518936 }, { "epoch": 0.14860363822700487, "grad_norm": 3.46875, "learning_rate": 1.947391620025495e-05, "loss": 0.3093476057052612, "step": 1160, "token_acc": 0.8816090465708489 }, { "epoch": 0.14924417115039712, "grad_norm": 3.453125, "learning_rate": 1.9467114430207916e-05, "loss": 0.30673789978027344, "step": 1165, "token_acc": 0.8819172300934499 }, { "epoch": 0.1498847040737894, "grad_norm": 4.78125, "learning_rate": 1.9460270175230834e-05, "loss": 0.314839768409729, "step": 1170, "token_acc": 0.8800895239734872 }, { "epoch": 0.15052523699718165, "grad_norm": 5.21875, "learning_rate": 1.9453383466038218e-05, "loss": 0.3102754592895508, "step": 1175, "token_acc": 0.8781979498664829 }, { "epoch": 0.15116576992057393, "grad_norm": 2.515625, "learning_rate": 1.944645433353508e-05, "loss": 0.30159687995910645, "step": 1180, "token_acc": 0.8842528536838464 }, { "epoch": 0.15180630284396618, "grad_norm": 5.96875, "learning_rate": 1.9439482808816823e-05, "loss": 0.31150016784667967, "step": 1185, "token_acc": 0.8806630308755958 }, { "epoch": 0.15244683576735843, "grad_norm": 2.828125, "learning_rate": 1.9432468923169086e-05, "loss": 0.3075159311294556, "step": 1190, "token_acc": 0.8814693313765269 }, { "epoch": 0.1530873686907507, "grad_norm": 6.21875, "learning_rate": 1.9425412708067612e-05, "loss": 0.3062115669250488, "step": 1195, "token_acc": 0.8820224719101124 }, { "epoch": 0.15372790161414296, "grad_norm": 4.28125, "learning_rate": 1.94183141951781e-05, "loss": 0.305180287361145, "step": 1200, "token_acc": 0.8842530282637954 }, { "epoch": 0.15372790161414296, "eval_loss": 0.3408718407154083, "eval_runtime": 103.7033, "eval_samples_per_second": 96.429, "eval_steps_per_second": 12.054, "eval_token_acc": 0.8721810963894779, "step": 1200 }, { "epoch": 0.15436843453753524, "grad_norm": 3.9375, "learning_rate": 1.9411173416356065e-05, "loss": 0.30832886695861816, "step": 1205, "token_acc": 0.8802281368821293 }, { "epoch": 0.1550089674609275, "grad_norm": 5.15625, "learning_rate": 1.9403990403646702e-05, "loss": 0.3051230192184448, "step": 1210, "token_acc": 0.8818798910458732 }, { "epoch": 0.15564950038431977, "grad_norm": 4.40625, "learning_rate": 1.9396765189284726e-05, "loss": 0.3141745090484619, "step": 1215, "token_acc": 0.8782784418264683 }, { "epoch": 0.15629003330771202, "grad_norm": 4.34375, "learning_rate": 1.938949780569425e-05, "loss": 0.3076632976531982, "step": 1220, "token_acc": 0.8809513532179393 }, { "epoch": 0.15693056623110427, "grad_norm": 4.1875, "learning_rate": 1.9382188285488612e-05, "loss": 0.30438895225524903, "step": 1225, "token_acc": 0.8833096682586807 }, { "epoch": 0.15757109915449655, "grad_norm": 4.59375, "learning_rate": 1.9374836661470263e-05, "loss": 0.30989761352539064, "step": 1230, "token_acc": 0.8809441615603694 }, { "epoch": 0.1582116320778888, "grad_norm": 4.34375, "learning_rate": 1.9367442966630583e-05, "loss": 0.3067667484283447, "step": 1235, "token_acc": 0.8825937096079276 }, { "epoch": 0.15885216500128108, "grad_norm": 7.46875, "learning_rate": 1.9360007234149756e-05, "loss": 0.29884748458862304, "step": 1240, "token_acc": 0.8846866250269223 }, { "epoch": 0.15949269792467333, "grad_norm": 6.75, "learning_rate": 1.9352529497396623e-05, "loss": 0.3064408779144287, "step": 1245, "token_acc": 0.8819226300615345 }, { "epoch": 0.16013323084806558, "grad_norm": 8.625, "learning_rate": 1.9345009789928507e-05, "loss": 0.3079418182373047, "step": 1250, "token_acc": 0.8822084303077321 }, { "epoch": 0.16077376377145786, "grad_norm": 6.25, "learning_rate": 1.9337448145491106e-05, "loss": 0.3048593044281006, "step": 1255, "token_acc": 0.8844390623648474 }, { "epoch": 0.1614142966948501, "grad_norm": 4.59375, "learning_rate": 1.9329844598018288e-05, "loss": 0.31249561309814455, "step": 1260, "token_acc": 0.8813588549749957 }, { "epoch": 0.16205482961824239, "grad_norm": 10.125, "learning_rate": 1.9322199181631985e-05, "loss": 0.30511524677276614, "step": 1265, "token_acc": 0.8825485961123111 }, { "epoch": 0.16269536254163464, "grad_norm": 4.3125, "learning_rate": 1.9314511930642017e-05, "loss": 0.30724005699157714, "step": 1270, "token_acc": 0.8831095955453878 }, { "epoch": 0.16333589546502691, "grad_norm": 14.375, "learning_rate": 1.930678287954594e-05, "loss": 0.30521693229675295, "step": 1275, "token_acc": 0.8835102252135646 }, { "epoch": 0.16397642838841917, "grad_norm": 4.6875, "learning_rate": 1.9299012063028893e-05, "loss": 0.2963773250579834, "step": 1280, "token_acc": 0.8844027981690993 }, { "epoch": 0.16461696131181142, "grad_norm": 2.796875, "learning_rate": 1.9291199515963445e-05, "loss": 0.30706090927124025, "step": 1285, "token_acc": 0.8825376344086021 }, { "epoch": 0.1652574942352037, "grad_norm": 4.3125, "learning_rate": 1.9283345273409434e-05, "loss": 0.2986742496490479, "step": 1290, "token_acc": 0.8834094237229736 }, { "epoch": 0.16589802715859595, "grad_norm": 26.875, "learning_rate": 1.927544937061382e-05, "loss": 0.30177807807922363, "step": 1295, "token_acc": 0.8840617188173433 }, { "epoch": 0.16653856008198822, "grad_norm": 5.25, "learning_rate": 1.9267511843010508e-05, "loss": 0.3020944356918335, "step": 1300, "token_acc": 0.8823529411764706 }, { "epoch": 0.16653856008198822, "eval_loss": 0.34260889887809753, "eval_runtime": 102.5088, "eval_samples_per_second": 97.553, "eval_steps_per_second": 12.194, "eval_token_acc": 0.8718101333805803, "step": 1300 }, { "epoch": 0.16717909300538047, "grad_norm": 3.734375, "learning_rate": 1.925953272622021e-05, "loss": 0.30364112854003905, "step": 1305, "token_acc": 0.8823174931129476 }, { "epoch": 0.16781962592877273, "grad_norm": 3.609375, "learning_rate": 1.9251512056050257e-05, "loss": 0.3062715768814087, "step": 1310, "token_acc": 0.8822084303077321 }, { "epoch": 0.168460158852165, "grad_norm": 6.09375, "learning_rate": 1.9243449868494482e-05, "loss": 0.3047629356384277, "step": 1315, "token_acc": 0.8817324665260258 }, { "epoch": 0.16910069177555725, "grad_norm": 4.34375, "learning_rate": 1.9235346199733013e-05, "loss": 0.30484819412231445, "step": 1320, "token_acc": 0.8817362039953403 }, { "epoch": 0.16974122469894953, "grad_norm": 3.234375, "learning_rate": 1.9227201086132138e-05, "loss": 0.29434518814086913, "step": 1325, "token_acc": 0.8883073063113726 }, { "epoch": 0.17038175762234178, "grad_norm": 3.25, "learning_rate": 1.9219014564244135e-05, "loss": 0.30238900184631345, "step": 1330, "token_acc": 0.8828684914946896 }, { "epoch": 0.17102229054573406, "grad_norm": 5.9375, "learning_rate": 1.9210786670807103e-05, "loss": 0.30103113651275637, "step": 1335, "token_acc": 0.8840491860062348 }, { "epoch": 0.1716628234691263, "grad_norm": 2.84375, "learning_rate": 1.9202517442744804e-05, "loss": 0.3020737409591675, "step": 1340, "token_acc": 0.8843631342768381 }, { "epoch": 0.17230335639251856, "grad_norm": 3.828125, "learning_rate": 1.9194206917166496e-05, "loss": 0.30103378295898436, "step": 1345, "token_acc": 0.8852183650615901 }, { "epoch": 0.17294388931591084, "grad_norm": 13.9375, "learning_rate": 1.9185855131366762e-05, "loss": 0.3041229248046875, "step": 1350, "token_acc": 0.884064226519337 }, { "epoch": 0.1735844222393031, "grad_norm": 20.25, "learning_rate": 1.9177462122825344e-05, "loss": 0.308376407623291, "step": 1355, "token_acc": 0.8811586706323549 }, { "epoch": 0.17422495516269537, "grad_norm": 3.390625, "learning_rate": 1.9169027929206987e-05, "loss": 0.3022352695465088, "step": 1360, "token_acc": 0.8822768434670116 }, { "epoch": 0.17486548808608762, "grad_norm": 6.5625, "learning_rate": 1.916055258836125e-05, "loss": 0.3043084621429443, "step": 1365, "token_acc": 0.8829251495717299 }, { "epoch": 0.1755060210094799, "grad_norm": 3.3125, "learning_rate": 1.9152036138322345e-05, "loss": 0.30508239269256593, "step": 1370, "token_acc": 0.882540092007395 }, { "epoch": 0.17614655393287215, "grad_norm": 4.1875, "learning_rate": 1.9143478617308966e-05, "loss": 0.3004749059677124, "step": 1375, "token_acc": 0.8839621418384546 }, { "epoch": 0.1767870868562644, "grad_norm": 6.125, "learning_rate": 1.913488006372413e-05, "loss": 0.3041959762573242, "step": 1380, "token_acc": 0.8817213611568101 }, { "epoch": 0.17742761977965668, "grad_norm": 11.625, "learning_rate": 1.912624051615498e-05, "loss": 0.3068222522735596, "step": 1385, "token_acc": 0.8823149463893554 }, { "epoch": 0.17806815270304893, "grad_norm": 6.84375, "learning_rate": 1.9117560013372633e-05, "loss": 0.29890620708465576, "step": 1390, "token_acc": 0.885037126575721 }, { "epoch": 0.1787086856264412, "grad_norm": 3.359375, "learning_rate": 1.9108838594331997e-05, "loss": 0.308072566986084, "step": 1395, "token_acc": 0.8794112583921501 }, { "epoch": 0.17934921854983346, "grad_norm": 6.3125, "learning_rate": 1.9100076298171587e-05, "loss": 0.29462456703186035, "step": 1400, "token_acc": 0.885492563052382 }, { "epoch": 0.17934921854983346, "eval_loss": 0.3468918800354004, "eval_runtime": 107.9248, "eval_samples_per_second": 92.657, "eval_steps_per_second": 11.582, "eval_token_acc": 0.8731887869509609, "step": 1400 }, { "epoch": 0.1799897514732257, "grad_norm": 7.59375, "learning_rate": 1.9091273164213374e-05, "loss": 0.29882164001464845, "step": 1405, "token_acc": 0.8843217597584645 }, { "epoch": 0.180630284396618, "grad_norm": 3.078125, "learning_rate": 1.9082429231962586e-05, "loss": 0.29425759315490724, "step": 1410, "token_acc": 0.8862321968062149 }, { "epoch": 0.18127081732001024, "grad_norm": 6.0625, "learning_rate": 1.9073544541107544e-05, "loss": 0.2935910701751709, "step": 1415, "token_acc": 0.8873945945945946 }, { "epoch": 0.18191135024340252, "grad_norm": 3.3125, "learning_rate": 1.906461913151947e-05, "loss": 0.29699931144714353, "step": 1420, "token_acc": 0.8851322249978465 }, { "epoch": 0.18255188316679477, "grad_norm": 3.515625, "learning_rate": 1.9055653043252324e-05, "loss": 0.2873663902282715, "step": 1425, "token_acc": 0.8880069025021571 }, { "epoch": 0.18319241609018705, "grad_norm": 4.3125, "learning_rate": 1.9046646316542613e-05, "loss": 0.3060638904571533, "step": 1430, "token_acc": 0.8829251495717299 }, { "epoch": 0.1838329490135793, "grad_norm": 107.5, "learning_rate": 1.9037598991809225e-05, "loss": 0.3029170513153076, "step": 1435, "token_acc": 0.8842598563996732 }, { "epoch": 0.18447348193697155, "grad_norm": 3.984375, "learning_rate": 1.9028511109653212e-05, "loss": 0.29811413288116456, "step": 1440, "token_acc": 0.884185544768069 }, { "epoch": 0.18511401486036383, "grad_norm": 3.671875, "learning_rate": 1.9019382710857663e-05, "loss": 0.291642951965332, "step": 1445, "token_acc": 0.888984918542846 }, { "epoch": 0.18575454778375608, "grad_norm": 5.84375, "learning_rate": 1.901021383638747e-05, "loss": 0.29584593772888185, "step": 1450, "token_acc": 0.8839431769263882 }, { "epoch": 0.18639508070714836, "grad_norm": 5.125, "learning_rate": 1.900100452738917e-05, "loss": 0.29843716621398925, "step": 1455, "token_acc": 0.8849805783340526 }, { "epoch": 0.1870356136305406, "grad_norm": 4.28125, "learning_rate": 1.899175482519077e-05, "loss": 0.3069281578063965, "step": 1460, "token_acc": 0.8841452917886039 }, { "epoch": 0.18767614655393286, "grad_norm": 3.875, "learning_rate": 1.898246477130152e-05, "loss": 0.304925274848938, "step": 1465, "token_acc": 0.8836258819480296 }, { "epoch": 0.18831667947732514, "grad_norm": 3.21875, "learning_rate": 1.8973134407411768e-05, "loss": 0.29193341732025146, "step": 1470, "token_acc": 0.8880477570619025 }, { "epoch": 0.1889572124007174, "grad_norm": 3.84375, "learning_rate": 1.8963763775392766e-05, "loss": 0.2908176898956299, "step": 1475, "token_acc": 0.8886537381764782 }, { "epoch": 0.18959774532410967, "grad_norm": 5.75, "learning_rate": 1.895435291729646e-05, "loss": 0.2873049259185791, "step": 1480, "token_acc": 0.8900970369086814 }, { "epoch": 0.19023827824750192, "grad_norm": 13.0625, "learning_rate": 1.8944901875355325e-05, "loss": 0.29516000747680665, "step": 1485, "token_acc": 0.8862587849782262 }, { "epoch": 0.1908788111708942, "grad_norm": 4.71875, "learning_rate": 1.8935410691982163e-05, "loss": 0.2935316562652588, "step": 1490, "token_acc": 0.8873926367128491 }, { "epoch": 0.19151934409428645, "grad_norm": 4.625, "learning_rate": 1.8925879409769915e-05, "loss": 0.293272590637207, "step": 1495, "token_acc": 0.8866402002243894 }, { "epoch": 0.1921598770176787, "grad_norm": 7.3125, "learning_rate": 1.8916308071491474e-05, "loss": 0.28766617774963377, "step": 1500, "token_acc": 0.8866454279318065 }, { "epoch": 0.1921598770176787, "eval_loss": 0.33948463201522827, "eval_runtime": 105.2784, "eval_samples_per_second": 94.986, "eval_steps_per_second": 11.873, "eval_token_acc": 0.8743791906362293, "step": 1500 }, { "epoch": 0.19280040994107098, "grad_norm": 6.8125, "learning_rate": 1.8906696720099492e-05, "loss": 0.2923029899597168, "step": 1505, "token_acc": 0.8879117089153302 }, { "epoch": 0.19344094286446323, "grad_norm": 3.578125, "learning_rate": 1.8897045398726176e-05, "loss": 0.29394724369049074, "step": 1510, "token_acc": 0.8863872082973207 }, { "epoch": 0.1940814757878555, "grad_norm": 4.28125, "learning_rate": 1.8887354150683108e-05, "loss": 0.2944790840148926, "step": 1515, "token_acc": 0.8862324459377962 }, { "epoch": 0.19472200871124776, "grad_norm": 3.296875, "learning_rate": 1.8877623019461053e-05, "loss": 0.292703914642334, "step": 1520, "token_acc": 0.8874542715730579 }, { "epoch": 0.19536254163464, "grad_norm": 5.34375, "learning_rate": 1.886785204872975e-05, "loss": 0.28728461265563965, "step": 1525, "token_acc": 0.8864913949667041 }, { "epoch": 0.1960030745580323, "grad_norm": 11.875, "learning_rate": 1.885804128233772e-05, "loss": 0.29347355365753175, "step": 1530, "token_acc": 0.8853349426674714 }, { "epoch": 0.19664360748142454, "grad_norm": 38.0, "learning_rate": 1.884819076431208e-05, "loss": 0.2958747148513794, "step": 1535, "token_acc": 0.88622210690192 }, { "epoch": 0.19728414040481682, "grad_norm": 4.1875, "learning_rate": 1.8838300538858338e-05, "loss": 0.29049015045166016, "step": 1540, "token_acc": 0.887807818150508 }, { "epoch": 0.19792467332820907, "grad_norm": 5.125, "learning_rate": 1.8828370650360183e-05, "loss": 0.29225118160247804, "step": 1545, "token_acc": 0.8890085278663106 }, { "epoch": 0.19856520625160134, "grad_norm": 5.71875, "learning_rate": 1.8818401143379312e-05, "loss": 0.2903005599975586, "step": 1550, "token_acc": 0.8862319467266281 }, { "epoch": 0.1992057391749936, "grad_norm": 4.6875, "learning_rate": 1.8808392062655206e-05, "loss": 0.2934314966201782, "step": 1555, "token_acc": 0.8883467898622684 }, { "epoch": 0.19984627209838585, "grad_norm": 5.3125, "learning_rate": 1.8798343453104937e-05, "loss": 0.2941242218017578, "step": 1560, "token_acc": 0.887303361127826 }, { "epoch": 0.20048680502177813, "grad_norm": 7.1875, "learning_rate": 1.8788255359822975e-05, "loss": 0.30154945850372317, "step": 1565, "token_acc": 0.8849016823716708 }, { "epoch": 0.20112733794517038, "grad_norm": 5.1875, "learning_rate": 1.8778127828080978e-05, "loss": 0.3002612590789795, "step": 1570, "token_acc": 0.8851415297255442 }, { "epoch": 0.20176787086856265, "grad_norm": 3.484375, "learning_rate": 1.8767960903327575e-05, "loss": 0.28886966705322265, "step": 1575, "token_acc": 0.8898769695661558 }, { "epoch": 0.2024084037919549, "grad_norm": 4.125, "learning_rate": 1.87577546311882e-05, "loss": 0.29037837982177733, "step": 1580, "token_acc": 0.8892447522181346 }, { "epoch": 0.20304893671534716, "grad_norm": 3.8125, "learning_rate": 1.8747509057464844e-05, "loss": 0.2931870937347412, "step": 1585, "token_acc": 0.8877136936625799 }, { "epoch": 0.20368946963873943, "grad_norm": 6.96875, "learning_rate": 1.8737224228135883e-05, "loss": 0.29495954513549805, "step": 1590, "token_acc": 0.8842078167231542 }, { "epoch": 0.20433000256213169, "grad_norm": 3.5625, "learning_rate": 1.872690018935584e-05, "loss": 0.30391521453857423, "step": 1595, "token_acc": 0.882081572820557 }, { "epoch": 0.20497053548552396, "grad_norm": 3.265625, "learning_rate": 1.8716536987455216e-05, "loss": 0.292464280128479, "step": 1600, "token_acc": 0.8882241542566928 }, { "epoch": 0.20497053548552396, "eval_loss": 0.3390945494174957, "eval_runtime": 103.2443, "eval_samples_per_second": 96.858, "eval_steps_per_second": 12.107, "eval_token_acc": 0.8747529220257902, "step": 1600 }, { "epoch": 0.20561106840891621, "grad_norm": 4.25, "learning_rate": 1.870613466894025e-05, "loss": 0.29375975131988524, "step": 1605, "token_acc": 0.8863264779278892 }, { "epoch": 0.2062516013323085, "grad_norm": 6.8125, "learning_rate": 1.8695693280492723e-05, "loss": 0.29436321258544923, "step": 1610, "token_acc": 0.8866014901589215 }, { "epoch": 0.20689213425570074, "grad_norm": 3.875, "learning_rate": 1.8685212868969747e-05, "loss": 0.2911177635192871, "step": 1615, "token_acc": 0.8861411643482741 }, { "epoch": 0.207532667179093, "grad_norm": 3.125, "learning_rate": 1.867469348140356e-05, "loss": 0.2982187747955322, "step": 1620, "token_acc": 0.8837629642380685 }, { "epoch": 0.20817320010248527, "grad_norm": 3.25, "learning_rate": 1.8664135165001305e-05, "loss": 0.28237018585205076, "step": 1625, "token_acc": 0.8903340102838871 }, { "epoch": 0.20881373302587752, "grad_norm": 4.65625, "learning_rate": 1.865353796714483e-05, "loss": 0.2914335012435913, "step": 1630, "token_acc": 0.8859261186264308 }, { "epoch": 0.2094542659492698, "grad_norm": 3.953125, "learning_rate": 1.8642901935390457e-05, "loss": 0.2944057464599609, "step": 1635, "token_acc": 0.8854714864981451 }, { "epoch": 0.21009479887266205, "grad_norm": 7.78125, "learning_rate": 1.8632227117468794e-05, "loss": 0.2919133186340332, "step": 1640, "token_acc": 0.8885867219200553 }, { "epoch": 0.2107353317960543, "grad_norm": 5.3125, "learning_rate": 1.86215135612845e-05, "loss": 0.29367570877075194, "step": 1645, "token_acc": 0.8866028091576822 }, { "epoch": 0.21137586471944658, "grad_norm": 2.65625, "learning_rate": 1.8610761314916067e-05, "loss": 0.29374768733978274, "step": 1650, "token_acc": 0.8863107047356164 }, { "epoch": 0.21201639764283883, "grad_norm": 4.4375, "learning_rate": 1.859997042661564e-05, "loss": 0.286625599861145, "step": 1655, "token_acc": 0.8884670147128619 }, { "epoch": 0.2126569305662311, "grad_norm": 2.703125, "learning_rate": 1.858914094480875e-05, "loss": 0.2876077651977539, "step": 1660, "token_acc": 0.8886684208256049 }, { "epoch": 0.21329746348962336, "grad_norm": 5.5625, "learning_rate": 1.8578272918094134e-05, "loss": 0.2962442398071289, "step": 1665, "token_acc": 0.885100138121547 }, { "epoch": 0.21393799641301564, "grad_norm": 10.125, "learning_rate": 1.85673663952435e-05, "loss": 0.2934115886688232, "step": 1670, "token_acc": 0.8867126833477136 }, { "epoch": 0.2145785293364079, "grad_norm": 18.25, "learning_rate": 1.855642142520132e-05, "loss": 0.2942723274230957, "step": 1675, "token_acc": 0.8857585939519256 }, { "epoch": 0.21521906225980014, "grad_norm": 4.78125, "learning_rate": 1.8545438057084587e-05, "loss": 0.29166316986083984, "step": 1680, "token_acc": 0.8873196855834845 }, { "epoch": 0.21585959518319242, "grad_norm": 5.5625, "learning_rate": 1.8534416340182625e-05, "loss": 0.29405913352966306, "step": 1685, "token_acc": 0.8858929269245799 }, { "epoch": 0.21650012810658467, "grad_norm": 3.734375, "learning_rate": 1.852335632395685e-05, "loss": 0.2922976493835449, "step": 1690, "token_acc": 0.8865123084868308 }, { "epoch": 0.21714066102997695, "grad_norm": 3.765625, "learning_rate": 1.851225805804055e-05, "loss": 0.2907034158706665, "step": 1695, "token_acc": 0.8852600656644203 }, { "epoch": 0.2177811939533692, "grad_norm": 2.84375, "learning_rate": 1.850112159223866e-05, "loss": 0.29575324058532715, "step": 1700, "token_acc": 0.8855988654432937 }, { "epoch": 0.2177811939533692, "eval_loss": 0.3360104560852051, "eval_runtime": 105.7607, "eval_samples_per_second": 94.553, "eval_steps_per_second": 11.819, "eval_token_acc": 0.8750740541827463, "step": 1700 }, { "epoch": 0.21842172687676145, "grad_norm": 4.46875, "learning_rate": 1.848994697652755e-05, "loss": 0.2958073139190674, "step": 1705, "token_acc": 0.884866163349348 }, { "epoch": 0.21906225980015373, "grad_norm": 6.5, "learning_rate": 1.8478734261054785e-05, "loss": 0.29183714389801024, "step": 1710, "token_acc": 0.8865205384880911 }, { "epoch": 0.21970279272354598, "grad_norm": 3.109375, "learning_rate": 1.8467483496138913e-05, "loss": 0.29586522579193114, "step": 1715, "token_acc": 0.8874040865591861 }, { "epoch": 0.22034332564693826, "grad_norm": 3.03125, "learning_rate": 1.8456194732269227e-05, "loss": 0.298976993560791, "step": 1720, "token_acc": 0.8849873210985516 }, { "epoch": 0.2209838585703305, "grad_norm": 5.25, "learning_rate": 1.8444868020105556e-05, "loss": 0.28900148868560793, "step": 1725, "token_acc": 0.8885448916408669 }, { "epoch": 0.2216243914937228, "grad_norm": 100.5, "learning_rate": 1.8433503410478018e-05, "loss": 0.2942624092102051, "step": 1730, "token_acc": 0.8865779615036817 }, { "epoch": 0.22226492441711504, "grad_norm": 3.40625, "learning_rate": 1.8422100954386805e-05, "loss": 0.2904630184173584, "step": 1735, "token_acc": 0.8851269649334945 }, { "epoch": 0.2229054573405073, "grad_norm": 3.75, "learning_rate": 1.841066070300195e-05, "loss": 0.2874864101409912, "step": 1740, "token_acc": 0.8891145585756882 }, { "epoch": 0.22354599026389957, "grad_norm": 5.75, "learning_rate": 1.8399182707663097e-05, "loss": 0.28712892532348633, "step": 1745, "token_acc": 0.8877060885369337 }, { "epoch": 0.22418652318729182, "grad_norm": 4.71875, "learning_rate": 1.8387667019879267e-05, "loss": 0.29011356830596924, "step": 1750, "token_acc": 0.8868864532339817 }, { "epoch": 0.2248270561106841, "grad_norm": 4.0, "learning_rate": 1.8376113691328638e-05, "loss": 0.2822575569152832, "step": 1755, "token_acc": 0.8909889352908253 }, { "epoch": 0.22546758903407635, "grad_norm": 2.828125, "learning_rate": 1.83645227738583e-05, "loss": 0.28959126472473146, "step": 1760, "token_acc": 0.8877797943133696 }, { "epoch": 0.22610812195746863, "grad_norm": 6.1875, "learning_rate": 1.8352894319484028e-05, "loss": 0.29406278133392333, "step": 1765, "token_acc": 0.8860137145814465 }, { "epoch": 0.22674865488086088, "grad_norm": 5.1875, "learning_rate": 1.834122838039006e-05, "loss": 0.293654203414917, "step": 1770, "token_acc": 0.8864692718195903 }, { "epoch": 0.22738918780425313, "grad_norm": 6.0, "learning_rate": 1.8329525008928835e-05, "loss": 0.28885598182678224, "step": 1775, "token_acc": 0.8875652811946998 }, { "epoch": 0.2280297207276454, "grad_norm": 3.140625, "learning_rate": 1.8317784257620784e-05, "loss": 0.286731481552124, "step": 1780, "token_acc": 0.8878766945859597 }, { "epoch": 0.22867025365103766, "grad_norm": 2.875, "learning_rate": 1.830600617915409e-05, "loss": 0.2842278003692627, "step": 1785, "token_acc": 0.8887885045603167 }, { "epoch": 0.22931078657442994, "grad_norm": 5.53125, "learning_rate": 1.829419082638443e-05, "loss": 0.2927645206451416, "step": 1790, "token_acc": 0.8868731372294933 }, { "epoch": 0.2299513194978222, "grad_norm": 2.453125, "learning_rate": 1.828233825233477e-05, "loss": 0.28806371688842775, "step": 1795, "token_acc": 0.8892287806979751 }, { "epoch": 0.23059185242121444, "grad_norm": 7.71875, "learning_rate": 1.827044851019511e-05, "loss": 0.27867727279663085, "step": 1800, "token_acc": 0.8907344926031664 }, { "epoch": 0.23059185242121444, "eval_loss": 0.3403998911380768, "eval_runtime": 102.8463, "eval_samples_per_second": 97.232, "eval_steps_per_second": 12.154, "eval_token_acc": 0.8755059215662391, "step": 1800 }, { "epoch": 0.23123238534460672, "grad_norm": 3.0625, "learning_rate": 1.8258521653322234e-05, "loss": 0.29278562068939207, "step": 1805, "token_acc": 0.886281276962899 }, { "epoch": 0.23187291826799897, "grad_norm": 13.375, "learning_rate": 1.8246557735239497e-05, "loss": 0.28790295124053955, "step": 1810, "token_acc": 0.889273356401384 }, { "epoch": 0.23251345119139125, "grad_norm": 19.125, "learning_rate": 1.8234556809636567e-05, "loss": 0.2872922897338867, "step": 1815, "token_acc": 0.890285369947919 }, { "epoch": 0.2331539841147835, "grad_norm": 8.375, "learning_rate": 1.8222518930369188e-05, "loss": 0.29638094902038575, "step": 1820, "token_acc": 0.8846751229614289 }, { "epoch": 0.23379451703817578, "grad_norm": 6.46875, "learning_rate": 1.8210444151458935e-05, "loss": 0.2879481792449951, "step": 1825, "token_acc": 0.8891678933240973 }, { "epoch": 0.23443504996156803, "grad_norm": 3.734375, "learning_rate": 1.819833252709298e-05, "loss": 0.2846549034118652, "step": 1830, "token_acc": 0.8897593732512591 }, { "epoch": 0.23507558288496028, "grad_norm": 2.859375, "learning_rate": 1.818618411162384e-05, "loss": 0.2873443841934204, "step": 1835, "token_acc": 0.8899616395845007 }, { "epoch": 0.23571611580835256, "grad_norm": 6.125, "learning_rate": 1.817399895956914e-05, "loss": 0.2880409240722656, "step": 1840, "token_acc": 0.889992689115383 }, { "epoch": 0.2363566487317448, "grad_norm": 2.90625, "learning_rate": 1.816177712561136e-05, "loss": 0.28801445960998534, "step": 1845, "token_acc": 0.8874516544907607 }, { "epoch": 0.23699718165513708, "grad_norm": 3.625, "learning_rate": 1.8149518664597604e-05, "loss": 0.2893885850906372, "step": 1850, "token_acc": 0.887526974536038 }, { "epoch": 0.23763771457852934, "grad_norm": 3.421875, "learning_rate": 1.8137223631539335e-05, "loss": 0.28786296844482423, "step": 1855, "token_acc": 0.8883566373209045 }, { "epoch": 0.2382782475019216, "grad_norm": 3.59375, "learning_rate": 1.8124892081612148e-05, "loss": 0.2903712511062622, "step": 1860, "token_acc": 0.8874086807047701 }, { "epoch": 0.23891878042531386, "grad_norm": 5.625, "learning_rate": 1.8112524070155503e-05, "loss": 0.2792266607284546, "step": 1865, "token_acc": 0.8924651889125059 }, { "epoch": 0.23955931334870612, "grad_norm": 6.5625, "learning_rate": 1.8100119652672488e-05, "loss": 0.28893446922302246, "step": 1870, "token_acc": 0.8882462122847153 }, { "epoch": 0.2401998462720984, "grad_norm": 4.46875, "learning_rate": 1.8087678884829573e-05, "loss": 0.28021440505981443, "step": 1875, "token_acc": 0.8914177335229967 }, { "epoch": 0.24084037919549064, "grad_norm": 7.53125, "learning_rate": 1.8075201822456353e-05, "loss": 0.287343430519104, "step": 1880, "token_acc": 0.8879555440682347 }, { "epoch": 0.24148091211888292, "grad_norm": 3.875, "learning_rate": 1.8062688521545294e-05, "loss": 0.2859031677246094, "step": 1885, "token_acc": 0.8882400927396849 }, { "epoch": 0.24212144504227517, "grad_norm": 3.765625, "learning_rate": 1.805013903825149e-05, "loss": 0.2897067070007324, "step": 1890, "token_acc": 0.8861764071598016 }, { "epoch": 0.24276197796566742, "grad_norm": 3.3125, "learning_rate": 1.803755342889242e-05, "loss": 0.28570735454559326, "step": 1895, "token_acc": 0.8883235598482235 }, { "epoch": 0.2434025108890597, "grad_norm": 3.546875, "learning_rate": 1.802493174994766e-05, "loss": 0.2846654176712036, "step": 1900, "token_acc": 0.8901407234740568 }, { "epoch": 0.2434025108890597, "eval_loss": 0.3387065827846527, "eval_runtime": 103.0506, "eval_samples_per_second": 97.04, "eval_steps_per_second": 12.13, "eval_token_acc": 0.8753702709137318, "step": 1900 }, { "epoch": 0.24404304381245195, "grad_norm": 2.6875, "learning_rate": 1.8012274058058673e-05, "loss": 0.276248574256897, "step": 1905, "token_acc": 0.8937481149554052 }, { "epoch": 0.24468357673584423, "grad_norm": 4.96875, "learning_rate": 1.799958041002853e-05, "loss": 0.2819145679473877, "step": 1910, "token_acc": 0.8917169974115617 }, { "epoch": 0.24532410965923648, "grad_norm": 5.15625, "learning_rate": 1.7986850862821654e-05, "loss": 0.28824849128723146, "step": 1915, "token_acc": 0.8893915891072044 }, { "epoch": 0.24596464258262873, "grad_norm": 3.375, "learning_rate": 1.797408547356357e-05, "loss": 0.28600053787231444, "step": 1920, "token_acc": 0.8900150959672202 }, { "epoch": 0.246605175506021, "grad_norm": 16.875, "learning_rate": 1.7961284299540666e-05, "loss": 0.2812356948852539, "step": 1925, "token_acc": 0.8895660442600276 }, { "epoch": 0.24724570842941326, "grad_norm": 4.09375, "learning_rate": 1.7948447398199893e-05, "loss": 0.2775670051574707, "step": 1930, "token_acc": 0.892983822129942 }, { "epoch": 0.24788624135280554, "grad_norm": 3.21875, "learning_rate": 1.7935574827148554e-05, "loss": 0.28611729145050047, "step": 1935, "token_acc": 0.8871710951294087 }, { "epoch": 0.2485267742761978, "grad_norm": 4.09375, "learning_rate": 1.7922666644154015e-05, "loss": 0.2785792827606201, "step": 1940, "token_acc": 0.8903730601305494 }, { "epoch": 0.24916730719959007, "grad_norm": 2.78125, "learning_rate": 1.7909722907143456e-05, "loss": 0.28144145011901855, "step": 1945, "token_acc": 0.8897014540135937 }, { "epoch": 0.24980784012298232, "grad_norm": 2.859375, "learning_rate": 1.789674367420361e-05, "loss": 0.2729172706604004, "step": 1950, "token_acc": 0.892122991881154 }, { "epoch": 0.2504483730463746, "grad_norm": 3.9375, "learning_rate": 1.788372900358051e-05, "loss": 0.28403098583221437, "step": 1955, "token_acc": 0.8906849433165223 }, { "epoch": 0.25108890596976685, "grad_norm": 4.0, "learning_rate": 1.7870678953679208e-05, "loss": 0.28090338706970214, "step": 1960, "token_acc": 0.8915574335977924 }, { "epoch": 0.25172943889315913, "grad_norm": 3.015625, "learning_rate": 1.7857593583063533e-05, "loss": 0.2826396942138672, "step": 1965, "token_acc": 0.8903665961397297 }, { "epoch": 0.25236997181655135, "grad_norm": 6.8125, "learning_rate": 1.784447295045582e-05, "loss": 0.28316607475280764, "step": 1970, "token_acc": 0.8914384300194091 }, { "epoch": 0.25301050473994363, "grad_norm": 2.96875, "learning_rate": 1.7831317114736647e-05, "loss": 0.27657251358032225, "step": 1975, "token_acc": 0.8923183631003988 }, { "epoch": 0.2536510376633359, "grad_norm": 3.265625, "learning_rate": 1.7818126134944565e-05, "loss": 0.2740725040435791, "step": 1980, "token_acc": 0.8953624065349872 }, { "epoch": 0.25429157058672813, "grad_norm": 2.890625, "learning_rate": 1.7804900070275853e-05, "loss": 0.2742879867553711, "step": 1985, "token_acc": 0.8926875593542261 }, { "epoch": 0.2549321035101204, "grad_norm": 3.625, "learning_rate": 1.7791638980084217e-05, "loss": 0.2816567897796631, "step": 1990, "token_acc": 0.8898188093183779 }, { "epoch": 0.2555726364335127, "grad_norm": 4.71875, "learning_rate": 1.777834292388056e-05, "loss": 0.28623175621032715, "step": 1995, "token_acc": 0.8866232702504634 }, { "epoch": 0.25621316935690497, "grad_norm": 3.46875, "learning_rate": 1.7765011961332695e-05, "loss": 0.287227988243103, "step": 2000, "token_acc": 0.8884247696150203 }, { "epoch": 0.25621316935690497, "eval_loss": 0.3332171142101288, "eval_runtime": 103.5637, "eval_samples_per_second": 96.559, "eval_steps_per_second": 12.07, "eval_token_acc": 0.8768624280913123, "step": 2000 }, { "epoch": 0.2568537022802972, "grad_norm": 7.625, "learning_rate": 1.7751646152265086e-05, "loss": 0.2728090763092041, "step": 2005, "token_acc": 0.8931116389548693 }, { "epoch": 0.25749423520368947, "grad_norm": 3.453125, "learning_rate": 1.7738245556658566e-05, "loss": 0.28285210132598876, "step": 2010, "token_acc": 0.8916788698423637 }, { "epoch": 0.25813476812708175, "grad_norm": 5.4375, "learning_rate": 1.7724810234650086e-05, "loss": 0.2836940050125122, "step": 2015, "token_acc": 0.8885819123677963 }, { "epoch": 0.25877530105047397, "grad_norm": 4.5, "learning_rate": 1.7711340246532433e-05, "loss": 0.28231005668640136, "step": 2020, "token_acc": 0.8901254726710209 }, { "epoch": 0.25941583397386625, "grad_norm": 4.34375, "learning_rate": 1.769783565275396e-05, "loss": 0.27600274085998533, "step": 2025, "token_acc": 0.8926168707952389 }, { "epoch": 0.26005636689725853, "grad_norm": 2.875, "learning_rate": 1.768429651391833e-05, "loss": 0.28248867988586424, "step": 2030, "token_acc": 0.8890473720608575 }, { "epoch": 0.2606968998206508, "grad_norm": 3.8125, "learning_rate": 1.767072289078421e-05, "loss": 0.2763254642486572, "step": 2035, "token_acc": 0.8923250173250173 }, { "epoch": 0.26133743274404303, "grad_norm": 27.5, "learning_rate": 1.7657114844265036e-05, "loss": 0.2861664056777954, "step": 2040, "token_acc": 0.8899861997584958 }, { "epoch": 0.2619779656674353, "grad_norm": 5.46875, "learning_rate": 1.764347243542872e-05, "loss": 0.278385591506958, "step": 2045, "token_acc": 0.8922844175491679 }, { "epoch": 0.2626184985908276, "grad_norm": 7.46875, "learning_rate": 1.7629795725497382e-05, "loss": 0.28106253147125243, "step": 2050, "token_acc": 0.890844918865407 }, { "epoch": 0.2632590315142198, "grad_norm": 3.34375, "learning_rate": 1.7616084775847064e-05, "loss": 0.2838444709777832, "step": 2055, "token_acc": 0.8908127665073007 }, { "epoch": 0.2638995644376121, "grad_norm": 3.484375, "learning_rate": 1.760233964800747e-05, "loss": 0.27664880752563475, "step": 2060, "token_acc": 0.8921712169494445 }, { "epoch": 0.26454009736100437, "grad_norm": 4.03125, "learning_rate": 1.7588560403661686e-05, "loss": 0.2756629228591919, "step": 2065, "token_acc": 0.8935702272629263 }, { "epoch": 0.26518063028439665, "grad_norm": 25.5, "learning_rate": 1.7574747104645894e-05, "loss": 0.28539879322052003, "step": 2070, "token_acc": 0.890754132231405 }, { "epoch": 0.26582116320778887, "grad_norm": 3.3125, "learning_rate": 1.7560899812949097e-05, "loss": 0.28184425830841064, "step": 2075, "token_acc": 0.8913267940113577 }, { "epoch": 0.26646169613118115, "grad_norm": 3.375, "learning_rate": 1.7547018590712862e-05, "loss": 0.2689033508300781, "step": 2080, "token_acc": 0.895397489539749 }, { "epoch": 0.2671022290545734, "grad_norm": 4.59375, "learning_rate": 1.7533103500231002e-05, "loss": 0.2777507543563843, "step": 2085, "token_acc": 0.892097198843282 }, { "epoch": 0.26774276197796565, "grad_norm": 4.28125, "learning_rate": 1.7519154603949332e-05, "loss": 0.2816345691680908, "step": 2090, "token_acc": 0.889937781591933 }, { "epoch": 0.2683832949013579, "grad_norm": 4.9375, "learning_rate": 1.750517196446538e-05, "loss": 0.27451438903808595, "step": 2095, "token_acc": 0.8933247200689061 }, { "epoch": 0.2690238278247502, "grad_norm": 3.828125, "learning_rate": 1.749115564452808e-05, "loss": 0.28593323230743406, "step": 2100, "token_acc": 0.8909067435555365 }, { "epoch": 0.2690238278247502, "eval_loss": 0.3338375389575958, "eval_runtime": 104.1317, "eval_samples_per_second": 96.032, "eval_steps_per_second": 12.004, "eval_token_acc": 0.8768458178073317, "step": 2100 }, { "epoch": 0.26966436074814243, "grad_norm": 2.921875, "learning_rate": 1.747710570703753e-05, "loss": 0.28068857192993163, "step": 2105, "token_acc": 0.8920412834132228 }, { "epoch": 0.2703048936715347, "grad_norm": 3.734375, "learning_rate": 1.7463022215044686e-05, "loss": 0.2719306945800781, "step": 2110, "token_acc": 0.8959989630141721 }, { "epoch": 0.270945426594927, "grad_norm": 3.609375, "learning_rate": 1.7448905231751086e-05, "loss": 0.27764501571655276, "step": 2115, "token_acc": 0.8928232144399344 }, { "epoch": 0.27158595951831926, "grad_norm": 2.59375, "learning_rate": 1.743475482050856e-05, "loss": 0.27620573043823243, "step": 2120, "token_acc": 0.8925562707910313 }, { "epoch": 0.2722264924417115, "grad_norm": 8.0, "learning_rate": 1.7420571044818954e-05, "loss": 0.27559990882873536, "step": 2125, "token_acc": 0.8930252645217016 }, { "epoch": 0.27286702536510377, "grad_norm": 2.71875, "learning_rate": 1.7406353968333837e-05, "loss": 0.2709467887878418, "step": 2130, "token_acc": 0.8944920546057448 }, { "epoch": 0.27350755828849604, "grad_norm": 3.796875, "learning_rate": 1.7392103654854223e-05, "loss": 0.27666122913360597, "step": 2135, "token_acc": 0.8931367037149159 }, { "epoch": 0.27414809121188827, "grad_norm": 12.5, "learning_rate": 1.7377820168330285e-05, "loss": 0.28263001441955565, "step": 2140, "token_acc": 0.8904274533413606 }, { "epoch": 0.27478862413528055, "grad_norm": 2.40625, "learning_rate": 1.7363503572861066e-05, "loss": 0.2721690654754639, "step": 2145, "token_acc": 0.8954508143603923 }, { "epoch": 0.2754291570586728, "grad_norm": 3.265625, "learning_rate": 1.734915393269417e-05, "loss": 0.28317282199859617, "step": 2150, "token_acc": 0.8910895342842413 }, { "epoch": 0.2760696899820651, "grad_norm": 16.75, "learning_rate": 1.733477131222552e-05, "loss": 0.27765982151031493, "step": 2155, "token_acc": 0.8932398123843539 }, { "epoch": 0.2767102229054573, "grad_norm": 4.90625, "learning_rate": 1.7320355775999024e-05, "loss": 0.2786709785461426, "step": 2160, "token_acc": 0.8914231613375221 }, { "epoch": 0.2773507558288496, "grad_norm": 6.21875, "learning_rate": 1.7305907388706312e-05, "loss": 0.28313846588134767, "step": 2165, "token_acc": 0.8894102453723634 }, { "epoch": 0.2779912887522419, "grad_norm": 4.96875, "learning_rate": 1.7291426215186436e-05, "loss": 0.27286443710327146, "step": 2170, "token_acc": 0.892789455547898 }, { "epoch": 0.2786318216756341, "grad_norm": 3.828125, "learning_rate": 1.7276912320425584e-05, "loss": 0.270449161529541, "step": 2175, "token_acc": 0.8942574600971548 }, { "epoch": 0.2792723545990264, "grad_norm": 3.84375, "learning_rate": 1.726236576955678e-05, "loss": 0.26513283252716063, "step": 2180, "token_acc": 0.8964592970472526 }, { "epoch": 0.27991288752241866, "grad_norm": 4.4375, "learning_rate": 1.7247786627859594e-05, "loss": 0.2790388822555542, "step": 2185, "token_acc": 0.8934532002752925 }, { "epoch": 0.28055342044581094, "grad_norm": 4.5625, "learning_rate": 1.7233174960759855e-05, "loss": 0.2737919807434082, "step": 2190, "token_acc": 0.8916871152438762 }, { "epoch": 0.28119395336920316, "grad_norm": 4.03125, "learning_rate": 1.721853083382936e-05, "loss": 0.2736166715621948, "step": 2195, "token_acc": 0.8931646005509641 }, { "epoch": 0.28183448629259544, "grad_norm": 2.703125, "learning_rate": 1.7203854312785565e-05, "loss": 0.26971442699432374, "step": 2200, "token_acc": 0.8943965517241379 }, { "epoch": 0.28183448629259544, "eval_loss": 0.33342963457107544, "eval_runtime": 105.1061, "eval_samples_per_second": 95.142, "eval_steps_per_second": 11.893, "eval_token_acc": 0.8773164425201123, "step": 2200 }, { "epoch": 0.2824750192159877, "grad_norm": 4.46875, "learning_rate": 1.7189145463491303e-05, "loss": 0.271907377243042, "step": 2205, "token_acc": 0.8919140136992203 }, { "epoch": 0.28311555213937994, "grad_norm": 3.109375, "learning_rate": 1.7174404351954485e-05, "loss": 0.2717395782470703, "step": 2210, "token_acc": 0.8932424268576853 }, { "epoch": 0.2837560850627722, "grad_norm": 4.375, "learning_rate": 1.7159631044327798e-05, "loss": 0.26909971237182617, "step": 2215, "token_acc": 0.8958585509251253 }, { "epoch": 0.2843966179861645, "grad_norm": 9.0, "learning_rate": 1.714482560690842e-05, "loss": 0.2807865858078003, "step": 2220, "token_acc": 0.8910201273008773 }, { "epoch": 0.2850371509095567, "grad_norm": 4.71875, "learning_rate": 1.7129988106137715e-05, "loss": 0.2830962657928467, "step": 2225, "token_acc": 0.8901977644024076 }, { "epoch": 0.285677683832949, "grad_norm": 5.28125, "learning_rate": 1.7115118608600925e-05, "loss": 0.2666552782058716, "step": 2230, "token_acc": 0.8958189058171745 }, { "epoch": 0.2863182167563413, "grad_norm": 6.09375, "learning_rate": 1.7100217181026898e-05, "loss": 0.2754360198974609, "step": 2235, "token_acc": 0.8915355351893481 }, { "epoch": 0.28695874967973356, "grad_norm": 3.921875, "learning_rate": 1.708528389028776e-05, "loss": 0.2791600227355957, "step": 2240, "token_acc": 0.8892330193143201 }, { "epoch": 0.2875992826031258, "grad_norm": 5.71875, "learning_rate": 1.707031880339863e-05, "loss": 0.27956390380859375, "step": 2245, "token_acc": 0.8920872595843552 }, { "epoch": 0.28823981552651806, "grad_norm": 3.328125, "learning_rate": 1.705532198751732e-05, "loss": 0.27212765216827395, "step": 2250, "token_acc": 0.8932557638439992 }, { "epoch": 0.28888034844991034, "grad_norm": 8.1875, "learning_rate": 1.7040293509944027e-05, "loss": 0.27141647338867186, "step": 2255, "token_acc": 0.8947846000950611 }, { "epoch": 0.28952088137330256, "grad_norm": 3.5625, "learning_rate": 1.7025233438121037e-05, "loss": 0.27087936401367185, "step": 2260, "token_acc": 0.8955649693092418 }, { "epoch": 0.29016141429669484, "grad_norm": 3.59375, "learning_rate": 1.7010141839632417e-05, "loss": 0.27354631423950193, "step": 2265, "token_acc": 0.8951717573764818 }, { "epoch": 0.2908019472200871, "grad_norm": 3.0625, "learning_rate": 1.699501878220371e-05, "loss": 0.27661924362182616, "step": 2270, "token_acc": 0.8932264736297828 }, { "epoch": 0.2914424801434794, "grad_norm": 2.46875, "learning_rate": 1.6979864333701645e-05, "loss": 0.271943473815918, "step": 2275, "token_acc": 0.8923408845738943 }, { "epoch": 0.2920830130668716, "grad_norm": 9.75, "learning_rate": 1.6964678562133815e-05, "loss": 0.27072222232818605, "step": 2280, "token_acc": 0.8939870012482245 }, { "epoch": 0.2927235459902639, "grad_norm": 2.90625, "learning_rate": 1.6949461535648377e-05, "loss": 0.26898555755615233, "step": 2285, "token_acc": 0.8956255128039038 }, { "epoch": 0.2933640789136562, "grad_norm": 3.390625, "learning_rate": 1.6934213322533758e-05, "loss": 0.27256574630737307, "step": 2290, "token_acc": 0.8912772451743262 }, { "epoch": 0.2940046118370484, "grad_norm": 28.0, "learning_rate": 1.6918933991218333e-05, "loss": 0.28531837463378906, "step": 2295, "token_acc": 0.8895293813989503 }, { "epoch": 0.2946451447604407, "grad_norm": 8.8125, "learning_rate": 1.6903623610270127e-05, "loss": 0.28380842208862306, "step": 2300, "token_acc": 0.8899965475573969 }, { "epoch": 0.2946451447604407, "eval_loss": 0.32995760440826416, "eval_runtime": 105.2266, "eval_samples_per_second": 95.033, "eval_steps_per_second": 11.879, "eval_token_acc": 0.8770423728344342, "step": 2300 }, { "epoch": 0.29528567768383296, "grad_norm": 33.5, "learning_rate": 1.6888282248396498e-05, "loss": 0.2725163459777832, "step": 2305, "token_acc": 0.8936618507051943 }, { "epoch": 0.29592621060722524, "grad_norm": 3.609375, "learning_rate": 1.6872909974443847e-05, "loss": 0.2721263885498047, "step": 2310, "token_acc": 0.8947889750215332 }, { "epoch": 0.29656674353061746, "grad_norm": 4.4375, "learning_rate": 1.685750685739728e-05, "loss": 0.27622146606445314, "step": 2315, "token_acc": 0.8931007685001295 }, { "epoch": 0.29720727645400974, "grad_norm": 2.859375, "learning_rate": 1.6842072966380333e-05, "loss": 0.274534273147583, "step": 2320, "token_acc": 0.8931854473263994 }, { "epoch": 0.297847809377402, "grad_norm": 3.375, "learning_rate": 1.682660837065463e-05, "loss": 0.2757120132446289, "step": 2325, "token_acc": 0.8937096079276174 }, { "epoch": 0.29848834230079424, "grad_norm": 3.1875, "learning_rate": 1.6811113139619596e-05, "loss": 0.276756739616394, "step": 2330, "token_acc": 0.8923500559332244 }, { "epoch": 0.2991288752241865, "grad_norm": 3.125, "learning_rate": 1.6795587342812137e-05, "loss": 0.2754298448562622, "step": 2335, "token_acc": 0.8925148925148925 }, { "epoch": 0.2997694081475788, "grad_norm": 3.484375, "learning_rate": 1.6780031049906317e-05, "loss": 0.2664804935455322, "step": 2340, "token_acc": 0.8968724315438854 }, { "epoch": 0.300409941070971, "grad_norm": 2.640625, "learning_rate": 1.6764444330713062e-05, "loss": 0.2691181182861328, "step": 2345, "token_acc": 0.8944920546057448 }, { "epoch": 0.3010504739943633, "grad_norm": 3.65625, "learning_rate": 1.674882725517984e-05, "loss": 0.27036800384521487, "step": 2350, "token_acc": 0.8949255020513928 }, { "epoch": 0.3016910069177556, "grad_norm": 2.578125, "learning_rate": 1.6733179893390342e-05, "loss": 0.2797673463821411, "step": 2355, "token_acc": 0.8923209292320929 }, { "epoch": 0.30233153984114786, "grad_norm": 13.1875, "learning_rate": 1.671750231556419e-05, "loss": 0.2723313093185425, "step": 2360, "token_acc": 0.894600767009954 }, { "epoch": 0.3029720727645401, "grad_norm": 5.21875, "learning_rate": 1.6701794592056572e-05, "loss": 0.26928038597106935, "step": 2365, "token_acc": 0.8950982509177283 }, { "epoch": 0.30361260568793236, "grad_norm": 2.703125, "learning_rate": 1.6686056793357993e-05, "loss": 0.27132067680358884, "step": 2370, "token_acc": 0.8939844120053395 }, { "epoch": 0.30425313861132464, "grad_norm": 3.28125, "learning_rate": 1.6670288990093904e-05, "loss": 0.2636139392852783, "step": 2375, "token_acc": 0.8964771990490599 }, { "epoch": 0.30489367153471686, "grad_norm": 2.375, "learning_rate": 1.665449125302441e-05, "loss": 0.2698176860809326, "step": 2380, "token_acc": 0.8951257453979777 }, { "epoch": 0.30553420445810914, "grad_norm": 4.96875, "learning_rate": 1.663866365304395e-05, "loss": 0.27487883567810056, "step": 2385, "token_acc": 0.8932954398656504 }, { "epoch": 0.3061747373815014, "grad_norm": 2.6875, "learning_rate": 1.6622806261180975e-05, "loss": 0.27799344062805176, "step": 2390, "token_acc": 0.8914015477214101 }, { "epoch": 0.3068152703048937, "grad_norm": 6.71875, "learning_rate": 1.660691914859763e-05, "loss": 0.2766709566116333, "step": 2395, "token_acc": 0.8933596431022649 }, { "epoch": 0.3074558032282859, "grad_norm": 3.03125, "learning_rate": 1.659100238658944e-05, "loss": 0.2766282081604004, "step": 2400, "token_acc": 0.8921505237294711 }, { "epoch": 0.3074558032282859, "eval_loss": 0.33286821842193604, "eval_runtime": 102.9833, "eval_samples_per_second": 97.103, "eval_steps_per_second": 12.138, "eval_token_acc": 0.8770562147377513, "step": 2400 }, { "epoch": 0.3080963361516782, "grad_norm": 2.765625, "learning_rate": 1.6575056046584982e-05, "loss": 0.2664001703262329, "step": 2405, "token_acc": 0.8966157299490632 }, { "epoch": 0.3087368690750705, "grad_norm": 2.921875, "learning_rate": 1.6559080200145565e-05, "loss": 0.2731971740722656, "step": 2410, "token_acc": 0.8921074184232078 }, { "epoch": 0.3093774019984627, "grad_norm": 3.796875, "learning_rate": 1.6543074918964923e-05, "loss": 0.27004868984222413, "step": 2415, "token_acc": 0.8929048954065129 }, { "epoch": 0.310017934921855, "grad_norm": 2.71875, "learning_rate": 1.652704027486887e-05, "loss": 0.27138233184814453, "step": 2420, "token_acc": 0.8945260347129506 }, { "epoch": 0.31065846784524725, "grad_norm": 3.078125, "learning_rate": 1.6510976339814998e-05, "loss": 0.27827138900756837, "step": 2425, "token_acc": 0.8920770324415148 }, { "epoch": 0.31129900076863953, "grad_norm": 3.578125, "learning_rate": 1.6494883185892345e-05, "loss": 0.268726110458374, "step": 2430, "token_acc": 0.8919771764502464 }, { "epoch": 0.31193953369203176, "grad_norm": 6.4375, "learning_rate": 1.647876088532107e-05, "loss": 0.27257063388824465, "step": 2435, "token_acc": 0.8935447825339522 }, { "epoch": 0.31258006661542403, "grad_norm": 3.578125, "learning_rate": 1.6462609510452126e-05, "loss": 0.27083382606506345, "step": 2440, "token_acc": 0.8954088212535466 }, { "epoch": 0.3132205995388163, "grad_norm": 3.078125, "learning_rate": 1.6446429133766955e-05, "loss": 0.2705575942993164, "step": 2445, "token_acc": 0.8939740326963723 }, { "epoch": 0.31386113246220854, "grad_norm": 2.6875, "learning_rate": 1.6430219827877137e-05, "loss": 0.27445831298828127, "step": 2450, "token_acc": 0.892545649838883 }, { "epoch": 0.3145016653856008, "grad_norm": 2.78125, "learning_rate": 1.641398166552408e-05, "loss": 0.26441996097564696, "step": 2455, "token_acc": 0.8965159505489755 }, { "epoch": 0.3151421983089931, "grad_norm": 4.3125, "learning_rate": 1.6397714719578692e-05, "loss": 0.2621718406677246, "step": 2460, "token_acc": 0.8974180950314602 }, { "epoch": 0.31578273123238537, "grad_norm": 4.96875, "learning_rate": 1.6381419063041044e-05, "loss": 0.2664108991622925, "step": 2465, "token_acc": 0.8976900534390623 }, { "epoch": 0.3164232641557776, "grad_norm": 3.171875, "learning_rate": 1.636509476904005e-05, "loss": 0.26438174247741697, "step": 2470, "token_acc": 0.8964934333145508 }, { "epoch": 0.3170637970791699, "grad_norm": 3.390625, "learning_rate": 1.634874191083315e-05, "loss": 0.2664673328399658, "step": 2475, "token_acc": 0.8969490355154706 }, { "epoch": 0.31770433000256215, "grad_norm": 7.3125, "learning_rate": 1.6332360561805953e-05, "loss": 0.2602536678314209, "step": 2480, "token_acc": 0.8977645434144658 }, { "epoch": 0.3183448629259544, "grad_norm": 3.0625, "learning_rate": 1.631595079547194e-05, "loss": 0.26571226119995117, "step": 2485, "token_acc": 0.8940942154485625 }, { "epoch": 0.31898539584934665, "grad_norm": 2.953125, "learning_rate": 1.6299512685472104e-05, "loss": 0.2715281009674072, "step": 2490, "token_acc": 0.8944671689989235 }, { "epoch": 0.31962592877273893, "grad_norm": 2.375, "learning_rate": 1.6283046305574646e-05, "loss": 0.26947875022888185, "step": 2495, "token_acc": 0.8938190607734806 }, { "epoch": 0.32026646169613116, "grad_norm": 2.65625, "learning_rate": 1.6266551729674625e-05, "loss": 0.26917757987976076, "step": 2500, "token_acc": 0.8948387096774194 }, { "epoch": 0.32026646169613116, "eval_loss": 0.33583664894104004, "eval_runtime": 101.9417, "eval_samples_per_second": 98.095, "eval_steps_per_second": 12.262, "eval_token_acc": 0.8780445266345903, "step": 2500 }, { "epoch": 0.32090699461952343, "grad_norm": 2.8125, "learning_rate": 1.6250029031793637e-05, "loss": 0.26485161781311034, "step": 2505, "token_acc": 0.8968752690023242 }, { "epoch": 0.3215475275429157, "grad_norm": 3.515625, "learning_rate": 1.623347828607948e-05, "loss": 0.27197585105895994, "step": 2510, "token_acc": 0.8942249763318703 }, { "epoch": 0.322188060466308, "grad_norm": 3.53125, "learning_rate": 1.621689956680581e-05, "loss": 0.26804704666137696, "step": 2515, "token_acc": 0.8950142573230796 }, { "epoch": 0.3228285933897002, "grad_norm": 3.390625, "learning_rate": 1.6200292948371826e-05, "loss": 0.27621660232543943, "step": 2520, "token_acc": 0.8921281543364051 }, { "epoch": 0.3234691263130925, "grad_norm": 7.625, "learning_rate": 1.6183658505301937e-05, "loss": 0.270648455619812, "step": 2525, "token_acc": 0.8940274727640701 }, { "epoch": 0.32410965923648477, "grad_norm": 3.9375, "learning_rate": 1.6166996312245403e-05, "loss": 0.2624387502670288, "step": 2530, "token_acc": 0.8973961027763407 }, { "epoch": 0.324750192159877, "grad_norm": 18.0, "learning_rate": 1.6150306443976026e-05, "loss": 0.270206356048584, "step": 2535, "token_acc": 0.8947050707140394 }, { "epoch": 0.32539072508326927, "grad_norm": 7.5625, "learning_rate": 1.6133588975391793e-05, "loss": 0.26768012046813966, "step": 2540, "token_acc": 0.8949362728212195 }, { "epoch": 0.32603125800666155, "grad_norm": 2.53125, "learning_rate": 1.6116843981514568e-05, "loss": 0.265167760848999, "step": 2545, "token_acc": 0.895794614686433 }, { "epoch": 0.32667179093005383, "grad_norm": 2.34375, "learning_rate": 1.6100071537489726e-05, "loss": 0.2654293060302734, "step": 2550, "token_acc": 0.8955745341614907 }, { "epoch": 0.32731232385344605, "grad_norm": 4.71875, "learning_rate": 1.6083271718585828e-05, "loss": 0.2678376197814941, "step": 2555, "token_acc": 0.8955519229114687 }, { "epoch": 0.32795285677683833, "grad_norm": 4.8125, "learning_rate": 1.606644460019429e-05, "loss": 0.2697244644165039, "step": 2560, "token_acc": 0.8963868911760906 }, { "epoch": 0.3285933897002306, "grad_norm": 3.390625, "learning_rate": 1.604959025782904e-05, "loss": 0.2643167972564697, "step": 2565, "token_acc": 0.8960286936606024 }, { "epoch": 0.32923392262362283, "grad_norm": 4.0625, "learning_rate": 1.6032708767126158e-05, "loss": 0.2669541835784912, "step": 2570, "token_acc": 0.8956862407439298 }, { "epoch": 0.3298744555470151, "grad_norm": 7.125, "learning_rate": 1.601580020384358e-05, "loss": 0.27081780433654784, "step": 2575, "token_acc": 0.8950130095403296 }, { "epoch": 0.3305149884704074, "grad_norm": 6.9375, "learning_rate": 1.5998864643860723e-05, "loss": 0.25800356864929197, "step": 2580, "token_acc": 0.8975890576981345 }, { "epoch": 0.33115552139379967, "grad_norm": 3.078125, "learning_rate": 1.5981902163178152e-05, "loss": 0.26956448554992674, "step": 2585, "token_acc": 0.8931310867878997 }, { "epoch": 0.3317960543171919, "grad_norm": 2.34375, "learning_rate": 1.596491283791725e-05, "loss": 0.26112003326416017, "step": 2590, "token_acc": 0.8994970516822755 }, { "epoch": 0.33243658724058417, "grad_norm": 3.265625, "learning_rate": 1.594789674431986e-05, "loss": 0.27035064697265626, "step": 2595, "token_acc": 0.8960434445306439 }, { "epoch": 0.33307712016397645, "grad_norm": 3.140625, "learning_rate": 1.593085395874796e-05, "loss": 0.27758283615112306, "step": 2600, "token_acc": 0.8915802607236587 }, { "epoch": 0.33307712016397645, "eval_loss": 0.3308471143245697, "eval_runtime": 103.8949, "eval_samples_per_second": 96.251, "eval_steps_per_second": 12.031, "eval_token_acc": 0.8782355449003659, "step": 2600 }, { "epoch": 0.33371765308736867, "grad_norm": 3.203125, "learning_rate": 1.5913784557683304e-05, "loss": 0.2707799196243286, "step": 2605, "token_acc": 0.8923612603705455 }, { "epoch": 0.33435818601076095, "grad_norm": 2.609375, "learning_rate": 1.5896688617727095e-05, "loss": 0.2663607120513916, "step": 2610, "token_acc": 0.8958297432362571 }, { "epoch": 0.33499871893415323, "grad_norm": 2.859375, "learning_rate": 1.5879566215599623e-05, "loss": 0.2679924488067627, "step": 2615, "token_acc": 0.8955616958811847 }, { "epoch": 0.33563925185754545, "grad_norm": 3.046875, "learning_rate": 1.5862417428139938e-05, "loss": 0.268009090423584, "step": 2620, "token_acc": 0.8962800309997416 }, { "epoch": 0.33627978478093773, "grad_norm": 3.234375, "learning_rate": 1.5845242332305496e-05, "loss": 0.257326078414917, "step": 2625, "token_acc": 0.8986603284356093 }, { "epoch": 0.33692031770433, "grad_norm": 3.078125, "learning_rate": 1.5828041005171818e-05, "loss": 0.2634852647781372, "step": 2630, "token_acc": 0.8965665605369589 }, { "epoch": 0.3375608506277223, "grad_norm": 3.453125, "learning_rate": 1.581081352393213e-05, "loss": 0.2582373857498169, "step": 2635, "token_acc": 0.8991817398794143 }, { "epoch": 0.3382013835511145, "grad_norm": 3.6875, "learning_rate": 1.5793559965897042e-05, "loss": 0.27222495079040526, "step": 2640, "token_acc": 0.891765924391507 }, { "epoch": 0.3388419164745068, "grad_norm": 2.703125, "learning_rate": 1.577628040849418e-05, "loss": 0.2661598205566406, "step": 2645, "token_acc": 0.8968578940562907 }, { "epoch": 0.33948244939789907, "grad_norm": 5.53125, "learning_rate": 1.5758974929267844e-05, "loss": 0.2645248889923096, "step": 2650, "token_acc": 0.899624563069089 }, { "epoch": 0.3401229823212913, "grad_norm": 3.359375, "learning_rate": 1.574164360587867e-05, "loss": 0.2611443281173706, "step": 2655, "token_acc": 0.8988632925616977 }, { "epoch": 0.34076351524468357, "grad_norm": 2.328125, "learning_rate": 1.572428651610326e-05, "loss": 0.27028732299804686, "step": 2660, "token_acc": 0.8950970685721665 }, { "epoch": 0.34140404816807585, "grad_norm": 2.609375, "learning_rate": 1.570690373783386e-05, "loss": 0.2680711269378662, "step": 2665, "token_acc": 0.8941867495897037 }, { "epoch": 0.3420445810914681, "grad_norm": 2.796875, "learning_rate": 1.5689495349077984e-05, "loss": 0.2609850406646729, "step": 2670, "token_acc": 0.8973326405126872 }, { "epoch": 0.34268511401486035, "grad_norm": 2.984375, "learning_rate": 1.5672061427958086e-05, "loss": 0.26308517456054686, "step": 2675, "token_acc": 0.896771416272062 }, { "epoch": 0.3433256469382526, "grad_norm": 2.875, "learning_rate": 1.5654602052711202e-05, "loss": 0.27320644855499265, "step": 2680, "token_acc": 0.8943637769567833 }, { "epoch": 0.3439661798616449, "grad_norm": 2.515625, "learning_rate": 1.563711730168858e-05, "loss": 0.26294333934783937, "step": 2685, "token_acc": 0.8980288752485519 }, { "epoch": 0.34460671278503713, "grad_norm": 2.5, "learning_rate": 1.5619607253355365e-05, "loss": 0.2679460048675537, "step": 2690, "token_acc": 0.8959211771792445 }, { "epoch": 0.3452472457084294, "grad_norm": 2.625, "learning_rate": 1.5602071986290214e-05, "loss": 0.2540433883666992, "step": 2695, "token_acc": 0.8991190188288133 }, { "epoch": 0.3458877786318217, "grad_norm": 3.734375, "learning_rate": 1.558451157918496e-05, "loss": 0.26918482780456543, "step": 2700, "token_acc": 0.8953965852604279 }, { "epoch": 0.3458877786318217, "eval_loss": 0.33141687512397766, "eval_runtime": 102.8827, "eval_samples_per_second": 97.198, "eval_steps_per_second": 12.15, "eval_token_acc": 0.8783047544169513, "step": 2700 }, { "epoch": 0.34652831155521396, "grad_norm": 2.8125, "learning_rate": 1.556692611084426e-05, "loss": 0.2630035400390625, "step": 2705, "token_acc": 0.8983968711049986 }, { "epoch": 0.3471688444786062, "grad_norm": 5.09375, "learning_rate": 1.554931566018523e-05, "loss": 0.26360278129577636, "step": 2710, "token_acc": 0.8968615649183147 }, { "epoch": 0.34780937740199847, "grad_norm": 2.671875, "learning_rate": 1.55316803062371e-05, "loss": 0.25818705558776855, "step": 2715, "token_acc": 0.8981868297514967 }, { "epoch": 0.34844991032539074, "grad_norm": 2.921875, "learning_rate": 1.5514020128140854e-05, "loss": 0.26247010231018064, "step": 2720, "token_acc": 0.8978712401965009 }, { "epoch": 0.34909044324878297, "grad_norm": 3.3125, "learning_rate": 1.5496335205148888e-05, "loss": 0.26362130641937254, "step": 2725, "token_acc": 0.8962732651034244 }, { "epoch": 0.34973097617217525, "grad_norm": 2.328125, "learning_rate": 1.547862561662463e-05, "loss": 0.26531424522399905, "step": 2730, "token_acc": 0.8975365632684758 }, { "epoch": 0.3503715090955675, "grad_norm": 5.3125, "learning_rate": 1.546089144204221e-05, "loss": 0.2550010919570923, "step": 2735, "token_acc": 0.9006299620296859 }, { "epoch": 0.3510120420189598, "grad_norm": 2.890625, "learning_rate": 1.5443132760986077e-05, "loss": 0.25297343730926514, "step": 2740, "token_acc": 0.9003972023141352 }, { "epoch": 0.351652574942352, "grad_norm": 3.421875, "learning_rate": 1.5425349653150674e-05, "loss": 0.2688558578491211, "step": 2745, "token_acc": 0.8946644309729567 }, { "epoch": 0.3522931078657443, "grad_norm": 3.765625, "learning_rate": 1.5407542198340045e-05, "loss": 0.25696539878845215, "step": 2750, "token_acc": 0.8994601597927013 }, { "epoch": 0.3529336407891366, "grad_norm": 2.671875, "learning_rate": 1.538971047646751e-05, "loss": 0.2645355224609375, "step": 2755, "token_acc": 0.8972590932597828 }, { "epoch": 0.3535741737125288, "grad_norm": 3.09375, "learning_rate": 1.537185456755528e-05, "loss": 0.2609572410583496, "step": 2760, "token_acc": 0.8978304088512404 }, { "epoch": 0.3542147066359211, "grad_norm": 3.28125, "learning_rate": 1.5353974551734102e-05, "loss": 0.25736873149871825, "step": 2765, "token_acc": 0.9001164445594514 }, { "epoch": 0.35485523955931336, "grad_norm": 2.890625, "learning_rate": 1.533607050924293e-05, "loss": 0.2622791290283203, "step": 2770, "token_acc": 0.8979319258940112 }, { "epoch": 0.3554957724827056, "grad_norm": 3.375, "learning_rate": 1.531814252042852e-05, "loss": 0.2560434818267822, "step": 2775, "token_acc": 0.9002417335750669 }, { "epoch": 0.35613630540609786, "grad_norm": 3.109375, "learning_rate": 1.5300190665745097e-05, "loss": 0.26474769115448, "step": 2780, "token_acc": 0.8953059298034841 }, { "epoch": 0.35677683832949014, "grad_norm": 7.5625, "learning_rate": 1.5282215025753984e-05, "loss": 0.2650959014892578, "step": 2785, "token_acc": 0.8951942520328701 }, { "epoch": 0.3574173712528824, "grad_norm": 2.71875, "learning_rate": 1.526421568112325e-05, "loss": 0.26280429363250735, "step": 2790, "token_acc": 0.8963790945578525 }, { "epoch": 0.35805790417627464, "grad_norm": 8.0625, "learning_rate": 1.5246192712627341e-05, "loss": 0.2684659957885742, "step": 2795, "token_acc": 0.8949045957703406 }, { "epoch": 0.3586984370996669, "grad_norm": 3.390625, "learning_rate": 1.522814620114671e-05, "loss": 0.2673259019851685, "step": 2800, "token_acc": 0.8951397849462366 }, { "epoch": 0.3586984370996669, "eval_loss": 0.3319157361984253, "eval_runtime": 102.1149, "eval_samples_per_second": 97.929, "eval_steps_per_second": 12.241, "eval_token_acc": 0.8786978644711563, "step": 2800 }, { "epoch": 0.3593389700230592, "grad_norm": 2.734375, "learning_rate": 1.5210076227667467e-05, "loss": 0.26007418632507323, "step": 2805, "token_acc": 0.898749460974558 }, { "epoch": 0.3599795029464514, "grad_norm": 3.328125, "learning_rate": 1.5191982873281016e-05, "loss": 0.2620399951934814, "step": 2810, "token_acc": 0.8979195441988951 }, { "epoch": 0.3606200358698437, "grad_norm": 3.203125, "learning_rate": 1.5173866219183681e-05, "loss": 0.2614466667175293, "step": 2815, "token_acc": 0.8992734192543897 }, { "epoch": 0.361260568793236, "grad_norm": 2.5, "learning_rate": 1.5155726346676342e-05, "loss": 0.2509075880050659, "step": 2820, "token_acc": 0.9027616216449097 }, { "epoch": 0.36190110171662826, "grad_norm": 2.78125, "learning_rate": 1.5137563337164088e-05, "loss": 0.26183514595031737, "step": 2825, "token_acc": 0.8969877438287589 }, { "epoch": 0.3625416346400205, "grad_norm": 2.84375, "learning_rate": 1.5119377272155821e-05, "loss": 0.2658205032348633, "step": 2830, "token_acc": 0.8951929295106704 }, { "epoch": 0.36318216756341276, "grad_norm": 3.359375, "learning_rate": 1.5101168233263925e-05, "loss": 0.25493884086608887, "step": 2835, "token_acc": 0.9017814778070138 }, { "epoch": 0.36382270048680504, "grad_norm": 4.125, "learning_rate": 1.508293630220387e-05, "loss": 0.2533620119094849, "step": 2840, "token_acc": 0.8996077417130048 }, { "epoch": 0.36446323341019726, "grad_norm": 3.5, "learning_rate": 1.506468156079386e-05, "loss": 0.2602185010910034, "step": 2845, "token_acc": 0.8991640093079376 }, { "epoch": 0.36510376633358954, "grad_norm": 2.90625, "learning_rate": 1.5046404090954467e-05, "loss": 0.26317653656005857, "step": 2850, "token_acc": 0.8983934186156696 }, { "epoch": 0.3657442992569818, "grad_norm": 3.9375, "learning_rate": 1.5028103974708259e-05, "loss": 0.2617523670196533, "step": 2855, "token_acc": 0.8986605796976614 }, { "epoch": 0.3663848321803741, "grad_norm": 3.609375, "learning_rate": 1.5009781294179431e-05, "loss": 0.2595290899276733, "step": 2860, "token_acc": 0.8996727523251808 }, { "epoch": 0.3670253651037663, "grad_norm": 7.65625, "learning_rate": 1.4991436131593438e-05, "loss": 0.2566396236419678, "step": 2865, "token_acc": 0.8992037873897138 }, { "epoch": 0.3676658980271586, "grad_norm": 3.953125, "learning_rate": 1.4973068569276627e-05, "loss": 0.2593822479248047, "step": 2870, "token_acc": 0.8983920334526017 }, { "epoch": 0.3683064309505509, "grad_norm": 3.71875, "learning_rate": 1.495467868965587e-05, "loss": 0.25176091194152833, "step": 2875, "token_acc": 0.8993063035891249 }, { "epoch": 0.3689469638739431, "grad_norm": 5.6875, "learning_rate": 1.4936266575258184e-05, "loss": 0.26164243221282957, "step": 2880, "token_acc": 0.8975323047668439 }, { "epoch": 0.3695874967973354, "grad_norm": 8.1875, "learning_rate": 1.4917832308710374e-05, "loss": 0.2630914211273193, "step": 2885, "token_acc": 0.897822806639362 }, { "epoch": 0.37022802972072766, "grad_norm": 2.890625, "learning_rate": 1.489937597273865e-05, "loss": 0.26312851905822754, "step": 2890, "token_acc": 0.8980260322386001 }, { "epoch": 0.3708685626441199, "grad_norm": 2.71875, "learning_rate": 1.4880897650168269e-05, "loss": 0.26306843757629395, "step": 2895, "token_acc": 0.8972188633615478 }, { "epoch": 0.37150909556751216, "grad_norm": 6.25, "learning_rate": 1.4862397423923148e-05, "loss": 0.2542487382888794, "step": 2900, "token_acc": 0.9007903943333477 }, { "epoch": 0.37150909556751216, "eval_loss": 0.3293861448764801, "eval_runtime": 103.439, "eval_samples_per_second": 96.675, "eval_steps_per_second": 12.084, "eval_token_acc": 0.8789276400662197, "step": 2900 }, { "epoch": 0.37214962849090444, "grad_norm": 5.3125, "learning_rate": 1.48438753770255e-05, "loss": 0.2586365222930908, "step": 2905, "token_acc": 0.8990952175786299 }, { "epoch": 0.3727901614142967, "grad_norm": 3.40625, "learning_rate": 1.4825331592595471e-05, "loss": 0.25507054328918455, "step": 2910, "token_acc": 0.8996947418203706 }, { "epoch": 0.37343069433768894, "grad_norm": 3.796875, "learning_rate": 1.480676615385074e-05, "loss": 0.25874695777893064, "step": 2915, "token_acc": 0.8979389016117184 }, { "epoch": 0.3740712272610812, "grad_norm": 3.03125, "learning_rate": 1.4788179144106187e-05, "loss": 0.2610326766967773, "step": 2920, "token_acc": 0.8965872504829363 }, { "epoch": 0.3747117601844735, "grad_norm": 2.96875, "learning_rate": 1.4769570646773469e-05, "loss": 0.25159320831298826, "step": 2925, "token_acc": 0.9015691868758916 }, { "epoch": 0.3753522931078657, "grad_norm": 3.0, "learning_rate": 1.4750940745360683e-05, "loss": 0.2555972099304199, "step": 2930, "token_acc": 0.9030378872874774 }, { "epoch": 0.375992826031258, "grad_norm": 3.359375, "learning_rate": 1.4732289523471983e-05, "loss": 0.25429458618164064, "step": 2935, "token_acc": 0.901710690739863 }, { "epoch": 0.3766333589546503, "grad_norm": 3.015625, "learning_rate": 1.47136170648072e-05, "loss": 0.260566258430481, "step": 2940, "token_acc": 0.8977762454749181 }, { "epoch": 0.37727389187804256, "grad_norm": 4.4375, "learning_rate": 1.469492345316146e-05, "loss": 0.2575147390365601, "step": 2945, "token_acc": 0.9003219575016098 }, { "epoch": 0.3779144248014348, "grad_norm": 3.078125, "learning_rate": 1.4676208772424825e-05, "loss": 0.26031718254089353, "step": 2950, "token_acc": 0.8967789165446559 }, { "epoch": 0.37855495772482706, "grad_norm": 3.9375, "learning_rate": 1.4657473106581903e-05, "loss": 0.2566239356994629, "step": 2955, "token_acc": 0.900069096562446 }, { "epoch": 0.37919549064821934, "grad_norm": 24.5, "learning_rate": 1.4638716539711477e-05, "loss": 0.26539459228515627, "step": 2960, "token_acc": 0.8971825516676015 }, { "epoch": 0.37983602357161156, "grad_norm": 2.796875, "learning_rate": 1.4619939155986122e-05, "loss": 0.2547321081161499, "step": 2965, "token_acc": 0.9001380977041257 }, { "epoch": 0.38047655649500384, "grad_norm": 3.53125, "learning_rate": 1.4601141039671837e-05, "loss": 0.26095755100250245, "step": 2970, "token_acc": 0.8989492800622649 }, { "epoch": 0.3811170894183961, "grad_norm": 2.65625, "learning_rate": 1.4582322275127663e-05, "loss": 0.2595865726470947, "step": 2975, "token_acc": 0.8997066689673022 }, { "epoch": 0.3817576223417884, "grad_norm": 3.0625, "learning_rate": 1.4563482946805291e-05, "loss": 0.2566410541534424, "step": 2980, "token_acc": 0.8992691315563198 }, { "epoch": 0.3823981552651806, "grad_norm": 3.734375, "learning_rate": 1.4544623139248707e-05, "loss": 0.26386113166809083, "step": 2985, "token_acc": 0.8982700748773561 }, { "epoch": 0.3830386881885729, "grad_norm": 4.15625, "learning_rate": 1.4525742937093797e-05, "loss": 0.2548778533935547, "step": 2990, "token_acc": 0.8996550237171195 }, { "epoch": 0.3836792211119652, "grad_norm": 2.84375, "learning_rate": 1.4506842425067963e-05, "loss": 0.2560065746307373, "step": 2995, "token_acc": 0.8996068944662836 }, { "epoch": 0.3843197540353574, "grad_norm": 3.84375, "learning_rate": 1.4487921687989763e-05, "loss": 0.2564894676208496, "step": 3000, "token_acc": 0.8991150442477877 }, { "epoch": 0.3843197540353574, "eval_loss": 0.33271661400794983, "eval_runtime": 102.5035, "eval_samples_per_second": 97.558, "eval_steps_per_second": 12.195, "eval_token_acc": 0.879677871226005, "step": 3000 }, { "epoch": 0.3849602869587497, "grad_norm": 3.09375, "learning_rate": 1.4468980810768507e-05, "loss": 0.2549588203430176, "step": 3005, "token_acc": 0.9006631071305546 }, { "epoch": 0.38560081988214195, "grad_norm": 2.296875, "learning_rate": 1.4450019878403894e-05, "loss": 0.256690239906311, "step": 3010, "token_acc": 0.9002636014001123 }, { "epoch": 0.3862413528055342, "grad_norm": 6.46875, "learning_rate": 1.4431038975985616e-05, "loss": 0.2593832969665527, "step": 3015, "token_acc": 0.8996518973741888 }, { "epoch": 0.38688188572892646, "grad_norm": 5.46875, "learning_rate": 1.441203818869299e-05, "loss": 0.26778130531311034, "step": 3020, "token_acc": 0.8993683667769519 }, { "epoch": 0.38752241865231873, "grad_norm": 3.15625, "learning_rate": 1.4393017601794558e-05, "loss": 0.25722360610961914, "step": 3025, "token_acc": 0.8998792687133494 }, { "epoch": 0.388162951575711, "grad_norm": 3.671875, "learning_rate": 1.4373977300647735e-05, "loss": 0.25923748016357423, "step": 3030, "token_acc": 0.8972777082704172 }, { "epoch": 0.38880348449910324, "grad_norm": 3.234375, "learning_rate": 1.4354917370698388e-05, "loss": 0.24125347137451172, "step": 3035, "token_acc": 0.9047619047619048 }, { "epoch": 0.3894440174224955, "grad_norm": 2.59375, "learning_rate": 1.4335837897480475e-05, "loss": 0.26301088333129885, "step": 3040, "token_acc": 0.89577136945558 }, { "epoch": 0.3900845503458878, "grad_norm": 5.1875, "learning_rate": 1.4316738966615665e-05, "loss": 0.25510516166687014, "step": 3045, "token_acc": 0.9006117525417887 }, { "epoch": 0.39072508326928, "grad_norm": 4.0, "learning_rate": 1.4297620663812934e-05, "loss": 0.26404881477355957, "step": 3050, "token_acc": 0.8973620897061351 }, { "epoch": 0.3913656161926723, "grad_norm": 2.921875, "learning_rate": 1.4278483074868206e-05, "loss": 0.2587254524230957, "step": 3055, "token_acc": 0.8988434317279476 }, { "epoch": 0.3920061491160646, "grad_norm": 2.625, "learning_rate": 1.4259326285663942e-05, "loss": 0.2552812576293945, "step": 3060, "token_acc": 0.9003674086881348 }, { "epoch": 0.39264668203945685, "grad_norm": 2.859375, "learning_rate": 1.4240150382168766e-05, "loss": 0.2574739933013916, "step": 3065, "token_acc": 0.9006264852019875 }, { "epoch": 0.3932872149628491, "grad_norm": 3.109375, "learning_rate": 1.4220955450437097e-05, "loss": 0.2653143644332886, "step": 3070, "token_acc": 0.8960445153776474 }, { "epoch": 0.39392774788624135, "grad_norm": 3.125, "learning_rate": 1.4201741576608724e-05, "loss": 0.2522631883621216, "step": 3075, "token_acc": 0.9006473888649115 }, { "epoch": 0.39456828080963363, "grad_norm": 2.75, "learning_rate": 1.4182508846908456e-05, "loss": 0.25041637420654295, "step": 3080, "token_acc": 0.9022227988237329 }, { "epoch": 0.39520881373302585, "grad_norm": 2.359375, "learning_rate": 1.4163257347645711e-05, "loss": 0.25646038055419923, "step": 3085, "token_acc": 0.8983489244298831 }, { "epoch": 0.39584934665641813, "grad_norm": 3.46875, "learning_rate": 1.4143987165214146e-05, "loss": 0.2523691654205322, "step": 3090, "token_acc": 0.9009485036164407 }, { "epoch": 0.3964898795798104, "grad_norm": 3.40625, "learning_rate": 1.4124698386091256e-05, "loss": 0.2536661148071289, "step": 3095, "token_acc": 0.8998623537508603 }, { "epoch": 0.3971304125032027, "grad_norm": 3.046875, "learning_rate": 1.4105391096837988e-05, "loss": 0.25694756507873534, "step": 3100, "token_acc": 0.8986861942709455 }, { "epoch": 0.3971304125032027, "eval_loss": 0.3274412453174591, "eval_runtime": 105.1272, "eval_samples_per_second": 95.123, "eval_steps_per_second": 11.89, "eval_token_acc": 0.880165106222766, "step": 3100 }, { "epoch": 0.3977709454265949, "grad_norm": 2.90625, "learning_rate": 1.4086065384098367e-05, "loss": 0.2536616802215576, "step": 3105, "token_acc": 0.9024840983324738 }, { "epoch": 0.3984114783499872, "grad_norm": 2.65625, "learning_rate": 1.4066721334599084e-05, "loss": 0.2547293663024902, "step": 3110, "token_acc": 0.8989598169968492 }, { "epoch": 0.39905201127337947, "grad_norm": 2.4375, "learning_rate": 1.4047359035149126e-05, "loss": 0.24942498207092284, "step": 3115, "token_acc": 0.9021344624956792 }, { "epoch": 0.3996925441967717, "grad_norm": 2.953125, "learning_rate": 1.4027978572639375e-05, "loss": 0.25708999633789065, "step": 3120, "token_acc": 0.900335801618736 }, { "epoch": 0.40033307712016397, "grad_norm": 3.25, "learning_rate": 1.4008580034042226e-05, "loss": 0.254312539100647, "step": 3125, "token_acc": 0.9001466528640442 }, { "epoch": 0.40097361004355625, "grad_norm": 4.21875, "learning_rate": 1.3989163506411187e-05, "loss": 0.25107884407043457, "step": 3130, "token_acc": 0.902516670251667 }, { "epoch": 0.40161414296694853, "grad_norm": 2.953125, "learning_rate": 1.39697290768805e-05, "loss": 0.24998018741607667, "step": 3135, "token_acc": 0.9032967981358418 }, { "epoch": 0.40225467589034075, "grad_norm": 2.75, "learning_rate": 1.3950276832664745e-05, "loss": 0.2500455856323242, "step": 3140, "token_acc": 0.9015105740181268 }, { "epoch": 0.40289520881373303, "grad_norm": 4.25, "learning_rate": 1.3930806861058438e-05, "loss": 0.25563080310821534, "step": 3145, "token_acc": 0.8991560454702032 }, { "epoch": 0.4035357417371253, "grad_norm": 3.0, "learning_rate": 1.3911319249435657e-05, "loss": 0.25497581958770754, "step": 3150, "token_acc": 0.8996980155306299 }, { "epoch": 0.40417627466051753, "grad_norm": 2.796875, "learning_rate": 1.3891814085249644e-05, "loss": 0.25732955932617185, "step": 3155, "token_acc": 0.8988880268942333 }, { "epoch": 0.4048168075839098, "grad_norm": 8.375, "learning_rate": 1.3872291456032405e-05, "loss": 0.2536874294281006, "step": 3160, "token_acc": 0.9003486720330592 }, { "epoch": 0.4054573405073021, "grad_norm": 2.6875, "learning_rate": 1.3852751449394324e-05, "loss": 0.2530160427093506, "step": 3165, "token_acc": 0.9021668892430965 }, { "epoch": 0.4060978734306943, "grad_norm": 3.109375, "learning_rate": 1.383319415302377e-05, "loss": 0.2553149938583374, "step": 3170, "token_acc": 0.8996287342427906 }, { "epoch": 0.4067384063540866, "grad_norm": 4.40625, "learning_rate": 1.3813619654686703e-05, "loss": 0.25406613349914553, "step": 3175, "token_acc": 0.9016216216216216 }, { "epoch": 0.40737893927747887, "grad_norm": 2.859375, "learning_rate": 1.3794028042226273e-05, "loss": 0.2548455476760864, "step": 3180, "token_acc": 0.9005142832447384 }, { "epoch": 0.40801947220087115, "grad_norm": 3.15625, "learning_rate": 1.3774419403562437e-05, "loss": 0.2509315013885498, "step": 3185, "token_acc": 0.9003025064822817 }, { "epoch": 0.40866000512426337, "grad_norm": 2.609375, "learning_rate": 1.3754793826691565e-05, "loss": 0.2544880390167236, "step": 3190, "token_acc": 0.8993057052913019 }, { "epoch": 0.40930053804765565, "grad_norm": 2.546875, "learning_rate": 1.3735151399686024e-05, "loss": 0.25415782928466796, "step": 3195, "token_acc": 0.9004563850856798 }, { "epoch": 0.4099410709710479, "grad_norm": 2.734375, "learning_rate": 1.371549221069381e-05, "loss": 0.252706241607666, "step": 3200, "token_acc": 0.9001977984176126 }, { "epoch": 0.4099410709710479, "eval_loss": 0.3304235339164734, "eval_runtime": 103.4877, "eval_samples_per_second": 96.63, "eval_steps_per_second": 12.079, "eval_token_acc": 0.8798495108271368, "step": 3200 }, { "epoch": 0.41058160389444015, "grad_norm": 3.171875, "learning_rate": 1.369581634793814e-05, "loss": 0.24554102420806884, "step": 3205, "token_acc": 0.9046261500583128 }, { "epoch": 0.41122213681783243, "grad_norm": 3.15625, "learning_rate": 1.367612389971705e-05, "loss": 0.25843195915222167, "step": 3210, "token_acc": 0.8992231333621061 }, { "epoch": 0.4118626697412247, "grad_norm": 2.703125, "learning_rate": 1.3656414954403015e-05, "loss": 0.2526721477508545, "step": 3215, "token_acc": 0.9009551673694174 }, { "epoch": 0.412503202664617, "grad_norm": 4.03125, "learning_rate": 1.3636689600442535e-05, "loss": 0.2488550662994385, "step": 3220, "token_acc": 0.9032662773091067 }, { "epoch": 0.4131437355880092, "grad_norm": 8.6875, "learning_rate": 1.3616947926355748e-05, "loss": 0.2410456657409668, "step": 3225, "token_acc": 0.9047186932849365 }, { "epoch": 0.4137842685114015, "grad_norm": 3.6875, "learning_rate": 1.3597190020736032e-05, "loss": 0.25398988723754884, "step": 3230, "token_acc": 0.9014248202832422 }, { "epoch": 0.41442480143479377, "grad_norm": 4.0625, "learning_rate": 1.3577415972249608e-05, "loss": 0.24551260471343994, "step": 3235, "token_acc": 0.9037839237174785 }, { "epoch": 0.415065334358186, "grad_norm": 4.125, "learning_rate": 1.3557625869635136e-05, "loss": 0.2562254905700684, "step": 3240, "token_acc": 0.8990086206896551 }, { "epoch": 0.41570586728157827, "grad_norm": 2.890625, "learning_rate": 1.3537819801703323e-05, "loss": 0.2528964996337891, "step": 3245, "token_acc": 0.9020725388601036 }, { "epoch": 0.41634640020497055, "grad_norm": 70.0, "learning_rate": 1.3517997857336522e-05, "loss": 0.2532426595687866, "step": 3250, "token_acc": 0.8992752998015702 }, { "epoch": 0.4169869331283628, "grad_norm": 3.03125, "learning_rate": 1.3498160125488336e-05, "loss": 0.248179292678833, "step": 3255, "token_acc": 0.9034928848641656 }, { "epoch": 0.41762746605175505, "grad_norm": 2.703125, "learning_rate": 1.3478306695183212e-05, "loss": 0.25196003913879395, "step": 3260, "token_acc": 0.9024979507312654 }, { "epoch": 0.4182679989751473, "grad_norm": 3.75, "learning_rate": 1.3458437655516048e-05, "loss": 0.2540182590484619, "step": 3265, "token_acc": 0.9011801322785631 }, { "epoch": 0.4189085318985396, "grad_norm": 4.6875, "learning_rate": 1.3438553095651794e-05, "loss": 0.24988923072814942, "step": 3270, "token_acc": 0.9028460543337645 }, { "epoch": 0.4195490648219318, "grad_norm": 3.3125, "learning_rate": 1.3418653104825044e-05, "loss": 0.25744991302490233, "step": 3275, "token_acc": 0.8989907702924178 }, { "epoch": 0.4201895977453241, "grad_norm": 2.515625, "learning_rate": 1.3398737772339643e-05, "loss": 0.25907082557678224, "step": 3280, "token_acc": 0.8988532405617833 }, { "epoch": 0.4208301306687164, "grad_norm": 2.59375, "learning_rate": 1.3378807187568288e-05, "loss": 0.2617329597473145, "step": 3285, "token_acc": 0.8974062165058949 }, { "epoch": 0.4214706635921086, "grad_norm": 2.34375, "learning_rate": 1.335886143995211e-05, "loss": 0.25168476104736326, "step": 3290, "token_acc": 0.9027303015879884 }, { "epoch": 0.4221111965155009, "grad_norm": 3.484375, "learning_rate": 1.3338900619000299e-05, "loss": 0.25457475185394285, "step": 3295, "token_acc": 0.9008958566629339 }, { "epoch": 0.42275172943889316, "grad_norm": 3.25, "learning_rate": 1.3318924814289682e-05, "loss": 0.25605058670043945, "step": 3300, "token_acc": 0.9003745640849012 }, { "epoch": 0.42275172943889316, "eval_loss": 0.3262101709842682, "eval_runtime": 103.038, "eval_samples_per_second": 97.052, "eval_steps_per_second": 12.131, "eval_token_acc": 0.8801623378421026, "step": 3300 }, { "epoch": 0.42339226236228544, "grad_norm": 2.9375, "learning_rate": 1.329893411546433e-05, "loss": 0.2533790826797485, "step": 3305, "token_acc": 0.9021087584630644 }, { "epoch": 0.42403279528567767, "grad_norm": 3.109375, "learning_rate": 1.327892861223515e-05, "loss": 0.26516075134277345, "step": 3310, "token_acc": 0.8975274134594711 }, { "epoch": 0.42467332820906994, "grad_norm": 2.828125, "learning_rate": 1.3258908394379492e-05, "loss": 0.24489293098449708, "step": 3315, "token_acc": 0.9024179620034543 }, { "epoch": 0.4253138611324622, "grad_norm": 2.671875, "learning_rate": 1.323887355174073e-05, "loss": 0.2507158279418945, "step": 3320, "token_acc": 0.9005565382458259 }, { "epoch": 0.42595439405585445, "grad_norm": 2.859375, "learning_rate": 1.3218824174227876e-05, "loss": 0.2552894353866577, "step": 3325, "token_acc": 0.899343072002766 }, { "epoch": 0.4265949269792467, "grad_norm": 4.6875, "learning_rate": 1.3198760351815165e-05, "loss": 0.25093369483947753, "step": 3330, "token_acc": 0.901463793773479 }, { "epoch": 0.427235459902639, "grad_norm": 6.78125, "learning_rate": 1.3178682174541664e-05, "loss": 0.25160994529724123, "step": 3335, "token_acc": 0.9026560138199093 }, { "epoch": 0.4278759928260313, "grad_norm": 3.1875, "learning_rate": 1.3158589732510847e-05, "loss": 0.25160062313079834, "step": 3340, "token_acc": 0.9015646492434664 }, { "epoch": 0.4285165257494235, "grad_norm": 3.015625, "learning_rate": 1.3138483115890214e-05, "loss": 0.24968068599700927, "step": 3345, "token_acc": 0.9020596346087556 }, { "epoch": 0.4291570586728158, "grad_norm": 2.40625, "learning_rate": 1.3118362414910869e-05, "loss": 0.25055222511291503, "step": 3350, "token_acc": 0.902113891285591 }, { "epoch": 0.42979759159620806, "grad_norm": 3.21875, "learning_rate": 1.3098227719867117e-05, "loss": 0.23631854057312013, "step": 3355, "token_acc": 0.9082366187424216 }, { "epoch": 0.4304381245196003, "grad_norm": 3.125, "learning_rate": 1.3078079121116074e-05, "loss": 0.2557328939437866, "step": 3360, "token_acc": 0.9006379860332787 }, { "epoch": 0.43107865744299256, "grad_norm": 2.828125, "learning_rate": 1.305791670907725e-05, "loss": 0.2555293083190918, "step": 3365, "token_acc": 0.9005658502872446 }, { "epoch": 0.43171919036638484, "grad_norm": 2.5625, "learning_rate": 1.3037740574232134e-05, "loss": 0.25120766162872316, "step": 3370, "token_acc": 0.9023821853961678 }, { "epoch": 0.4323597232897771, "grad_norm": 3.09375, "learning_rate": 1.3017550807123806e-05, "loss": 0.2534923553466797, "step": 3375, "token_acc": 0.9000387780602352 }, { "epoch": 0.43300025621316934, "grad_norm": 3.109375, "learning_rate": 1.2997347498356519e-05, "loss": 0.24217534065246582, "step": 3380, "token_acc": 0.9059788473990935 }, { "epoch": 0.4336407891365616, "grad_norm": 2.296875, "learning_rate": 1.2977130738595298e-05, "loss": 0.2505367279052734, "step": 3385, "token_acc": 0.9020945800043187 }, { "epoch": 0.4342813220599539, "grad_norm": 2.65625, "learning_rate": 1.2956900618565532e-05, "loss": 0.24520423412322997, "step": 3390, "token_acc": 0.9031252705393472 }, { "epoch": 0.4349218549833461, "grad_norm": 3.640625, "learning_rate": 1.293665722905256e-05, "loss": 0.2532040596008301, "step": 3395, "token_acc": 0.9023980712102295 }, { "epoch": 0.4355623879067384, "grad_norm": 3.03125, "learning_rate": 1.2916400660901276e-05, "loss": 0.24737958908081054, "step": 3400, "token_acc": 0.9031688624817016 }, { "epoch": 0.4355623879067384, "eval_loss": 0.3326387107372284, "eval_runtime": 103.1507, "eval_samples_per_second": 96.945, "eval_steps_per_second": 12.118, "eval_token_acc": 0.8803395142045611, "step": 3400 }, { "epoch": 0.4362029208301307, "grad_norm": 2.265625, "learning_rate": 1.2896131005015717e-05, "loss": 0.25047874450683594, "step": 3405, "token_acc": 0.9031589849818746 }, { "epoch": 0.4368434537535229, "grad_norm": 2.515625, "learning_rate": 1.2875848352358644e-05, "loss": 0.25389971733093264, "step": 3410, "token_acc": 0.9008509541000516 }, { "epoch": 0.4374839866769152, "grad_norm": 3.875, "learning_rate": 1.2855552793951146e-05, "loss": 0.2464221954345703, "step": 3415, "token_acc": 0.9023832138848114 }, { "epoch": 0.43812451960030746, "grad_norm": 3.6875, "learning_rate": 1.2835244420872232e-05, "loss": 0.25347232818603516, "step": 3420, "token_acc": 0.900116044182748 }, { "epoch": 0.43876505252369974, "grad_norm": 2.546875, "learning_rate": 1.2814923324258416e-05, "loss": 0.2549131393432617, "step": 3425, "token_acc": 0.9009849886016603 }, { "epoch": 0.43940558544709196, "grad_norm": 2.9375, "learning_rate": 1.2794589595303316e-05, "loss": 0.24712648391723632, "step": 3430, "token_acc": 0.9032174364296834 }, { "epoch": 0.44004611837048424, "grad_norm": 3.140625, "learning_rate": 1.277424332525723e-05, "loss": 0.24843959808349608, "step": 3435, "token_acc": 0.9036415534988322 }, { "epoch": 0.4406866512938765, "grad_norm": 9.1875, "learning_rate": 1.2753884605426736e-05, "loss": 0.24894342422485352, "step": 3440, "token_acc": 0.9017941861468127 }, { "epoch": 0.44132718421726874, "grad_norm": 2.953125, "learning_rate": 1.273351352717429e-05, "loss": 0.24595353603363038, "step": 3445, "token_acc": 0.9049160516207001 }, { "epoch": 0.441967717140661, "grad_norm": 2.921875, "learning_rate": 1.2713130181917806e-05, "loss": 0.25805752277374266, "step": 3450, "token_acc": 0.8997292302402544 }, { "epoch": 0.4426082500640533, "grad_norm": 3.546875, "learning_rate": 1.269273466113024e-05, "loss": 0.2535008430480957, "step": 3455, "token_acc": 0.9011762685670129 }, { "epoch": 0.4432487829874456, "grad_norm": 3.671875, "learning_rate": 1.2672327056339198e-05, "loss": 0.24500885009765624, "step": 3460, "token_acc": 0.905705264068515 }, { "epoch": 0.4438893159108378, "grad_norm": 4.28125, "learning_rate": 1.2651907459126512e-05, "loss": 0.25068912506103513, "step": 3465, "token_acc": 0.9028803385585352 }, { "epoch": 0.4445298488342301, "grad_norm": 2.828125, "learning_rate": 1.2631475961127822e-05, "loss": 0.2502088785171509, "step": 3470, "token_acc": 0.9028352292312996 }, { "epoch": 0.44517038175762236, "grad_norm": 4.375, "learning_rate": 1.2611032654032185e-05, "loss": 0.2501903295516968, "step": 3475, "token_acc": 0.901529554096094 }, { "epoch": 0.4458109146810146, "grad_norm": 4.125, "learning_rate": 1.2590577629581648e-05, "loss": 0.25160870552062986, "step": 3480, "token_acc": 0.9006165653429914 }, { "epoch": 0.44645144760440686, "grad_norm": 10.75, "learning_rate": 1.2570110979570846e-05, "loss": 0.2540600299835205, "step": 3485, "token_acc": 0.9013576215844646 }, { "epoch": 0.44709198052779914, "grad_norm": 2.625, "learning_rate": 1.2549632795846582e-05, "loss": 0.25437102317810056, "step": 3490, "token_acc": 0.9012979172955026 }, { "epoch": 0.4477325134511914, "grad_norm": 2.609375, "learning_rate": 1.2529143170307418e-05, "loss": 0.25037708282470705, "step": 3495, "token_acc": 0.90243692783771 }, { "epoch": 0.44837304637458364, "grad_norm": 2.984375, "learning_rate": 1.250864219490326e-05, "loss": 0.2490053653717041, "step": 3500, "token_acc": 0.9017818527809013 }, { "epoch": 0.44837304637458364, "eval_loss": 0.3326828181743622, "eval_runtime": 103.6038, "eval_samples_per_second": 96.522, "eval_steps_per_second": 12.065, "eval_token_acc": 0.8804225656244636, "step": 3500 }, { "epoch": 0.4490135792979759, "grad_norm": 3.390625, "learning_rate": 1.248812996163495e-05, "loss": 0.24862072467803956, "step": 3505, "token_acc": 0.9011164274322169 }, { "epoch": 0.4496541122213682, "grad_norm": 2.71875, "learning_rate": 1.2467606562553858e-05, "loss": 0.25421929359436035, "step": 3510, "token_acc": 0.9018134715025907 }, { "epoch": 0.4502946451447604, "grad_norm": 5.625, "learning_rate": 1.244707208976145e-05, "loss": 0.24651005268096923, "step": 3515, "token_acc": 0.903206343733839 }, { "epoch": 0.4509351780681527, "grad_norm": 2.859375, "learning_rate": 1.2426526635408896e-05, "loss": 0.24950928688049318, "step": 3520, "token_acc": 0.9035152037955575 }, { "epoch": 0.451575710991545, "grad_norm": 4.875, "learning_rate": 1.240597029169664e-05, "loss": 0.25514960289001465, "step": 3525, "token_acc": 0.8999569336778639 }, { "epoch": 0.45221624391493725, "grad_norm": 2.484375, "learning_rate": 1.2385403150874003e-05, "loss": 0.24595193862915038, "step": 3530, "token_acc": 0.904110774556469 }, { "epoch": 0.4528567768383295, "grad_norm": 2.265625, "learning_rate": 1.2364825305238748e-05, "loss": 0.24859437942504883, "step": 3535, "token_acc": 0.9019379844961241 }, { "epoch": 0.45349730976172176, "grad_norm": 3.328125, "learning_rate": 1.2344236847136683e-05, "loss": 0.25172064304351804, "step": 3540, "token_acc": 0.9005772378736969 }, { "epoch": 0.45413784268511403, "grad_norm": 3.140625, "learning_rate": 1.2323637868961247e-05, "loss": 0.2530811309814453, "step": 3545, "token_acc": 0.900356943190126 }, { "epoch": 0.45477837560850626, "grad_norm": 2.78125, "learning_rate": 1.2303028463153081e-05, "loss": 0.25023765563964845, "step": 3550, "token_acc": 0.9036663650855198 }, { "epoch": 0.45541890853189854, "grad_norm": 3.375, "learning_rate": 1.2282408722199623e-05, "loss": 0.2615813732147217, "step": 3555, "token_acc": 0.8973312132021144 }, { "epoch": 0.4560594414552908, "grad_norm": 4.625, "learning_rate": 1.2261778738634688e-05, "loss": 0.24770092964172363, "step": 3560, "token_acc": 0.9046594673665189 }, { "epoch": 0.45669997437868304, "grad_norm": 3.09375, "learning_rate": 1.2241138605038065e-05, "loss": 0.2506240367889404, "step": 3565, "token_acc": 0.9026666666666666 }, { "epoch": 0.4573405073020753, "grad_norm": 5.75, "learning_rate": 1.2220488414035088e-05, "loss": 0.24530596733093263, "step": 3570, "token_acc": 0.9045893719806763 }, { "epoch": 0.4579810402254676, "grad_norm": 6.125, "learning_rate": 1.2199828258296219e-05, "loss": 0.24243788719177245, "step": 3575, "token_acc": 0.9063712388999051 }, { "epoch": 0.4586215731488599, "grad_norm": 3.359375, "learning_rate": 1.2179158230536648e-05, "loss": 0.25044434070587157, "step": 3580, "token_acc": 0.9025926405245676 }, { "epoch": 0.4592621060722521, "grad_norm": 2.734375, "learning_rate": 1.215847842351586e-05, "loss": 0.23762269020080568, "step": 3585, "token_acc": 0.907480400225235 }, { "epoch": 0.4599026389956444, "grad_norm": 4.25, "learning_rate": 1.213778893003723e-05, "loss": 0.24655213356018066, "step": 3590, "token_acc": 0.9028251024369204 }, { "epoch": 0.46054317191903665, "grad_norm": 2.984375, "learning_rate": 1.2117089842947602e-05, "loss": 0.2541653633117676, "step": 3595, "token_acc": 0.9008563191187229 }, { "epoch": 0.4611837048424289, "grad_norm": 4.25, "learning_rate": 1.2096381255136869e-05, "loss": 0.25534510612487793, "step": 3600, "token_acc": 0.9020348587619129 }, { "epoch": 0.4611837048424289, "eval_loss": 0.33049342036247253, "eval_runtime": 103.5925, "eval_samples_per_second": 96.532, "eval_steps_per_second": 12.067, "eval_token_acc": 0.881139576216288, "step": 3600 }, { "epoch": 0.46182423776582116, "grad_norm": 2.90625, "learning_rate": 1.207566325953756e-05, "loss": 0.2503223896026611, "step": 3605, "token_acc": 0.9033789323781988 }, { "epoch": 0.46246477068921343, "grad_norm": 4.21875, "learning_rate": 1.2054935949124429e-05, "loss": 0.2458160400390625, "step": 3610, "token_acc": 0.9040593589577671 }, { "epoch": 0.4631053036126057, "grad_norm": 3.625, "learning_rate": 1.2034199416914026e-05, "loss": 0.2477043390274048, "step": 3615, "token_acc": 0.9040510807196169 }, { "epoch": 0.46374583653599794, "grad_norm": 3.21875, "learning_rate": 1.2013453755964282e-05, "loss": 0.24677414894104005, "step": 3620, "token_acc": 0.9042521044679473 }, { "epoch": 0.4643863694593902, "grad_norm": 2.890625, "learning_rate": 1.1992699059374103e-05, "loss": 0.24625577926635742, "step": 3625, "token_acc": 0.9028618364170845 }, { "epoch": 0.4650269023827825, "grad_norm": 3.0, "learning_rate": 1.197193542028294e-05, "loss": 0.2436453342437744, "step": 3630, "token_acc": 0.9060716139076285 }, { "epoch": 0.4656674353061747, "grad_norm": 3.25, "learning_rate": 1.1951162931870367e-05, "loss": 0.24116950035095214, "step": 3635, "token_acc": 0.9068125053828267 }, { "epoch": 0.466307968229567, "grad_norm": 5.09375, "learning_rate": 1.1930381687355685e-05, "loss": 0.25627937316894533, "step": 3640, "token_acc": 0.899607910724288 }, { "epoch": 0.46694850115295927, "grad_norm": 4.34375, "learning_rate": 1.190959177999748e-05, "loss": 0.23881065845489502, "step": 3645, "token_acc": 0.9060243048047398 }, { "epoch": 0.46758903407635155, "grad_norm": 2.59375, "learning_rate": 1.1888793303093211e-05, "loss": 0.24708976745605468, "step": 3650, "token_acc": 0.9032951905704207 }, { "epoch": 0.4682295669997438, "grad_norm": 3.53125, "learning_rate": 1.18679863499788e-05, "loss": 0.2470097541809082, "step": 3655, "token_acc": 0.9035228009509402 }, { "epoch": 0.46887009992313605, "grad_norm": 2.5625, "learning_rate": 1.1847171014028207e-05, "loss": 0.24061377048492433, "step": 3660, "token_acc": 0.9055094274346999 }, { "epoch": 0.46951063284652833, "grad_norm": 2.421875, "learning_rate": 1.1826347388653005e-05, "loss": 0.24855940341949462, "step": 3665, "token_acc": 0.9045786674737343 }, { "epoch": 0.47015116576992055, "grad_norm": 2.984375, "learning_rate": 1.180551556730198e-05, "loss": 0.2487732172012329, "step": 3670, "token_acc": 0.9030577088716624 }, { "epoch": 0.47079169869331283, "grad_norm": 2.453125, "learning_rate": 1.1784675643460682e-05, "loss": 0.24545960426330565, "step": 3675, "token_acc": 0.9027664242997891 }, { "epoch": 0.4714322316167051, "grad_norm": 3.0625, "learning_rate": 1.176382771065103e-05, "loss": 0.23899221420288086, "step": 3680, "token_acc": 0.906933437269932 }, { "epoch": 0.47207276454009733, "grad_norm": 6.9375, "learning_rate": 1.1742971862430888e-05, "loss": 0.25500404834747314, "step": 3685, "token_acc": 0.9016767964136385 }, { "epoch": 0.4727132974634896, "grad_norm": 4.53125, "learning_rate": 1.1722108192393635e-05, "loss": 0.24634737968444825, "step": 3690, "token_acc": 0.9017565007749269 }, { "epoch": 0.4733538303868819, "grad_norm": 2.796875, "learning_rate": 1.1701236794167753e-05, "loss": 0.2485578775405884, "step": 3695, "token_acc": 0.9019531419167641 }, { "epoch": 0.47399436331027417, "grad_norm": 3.265625, "learning_rate": 1.168035776141641e-05, "loss": 0.24262137413024903, "step": 3700, "token_acc": 0.9076976212062341 }, { "epoch": 0.47399436331027417, "eval_loss": 0.3315788805484772, "eval_runtime": 103.5836, "eval_samples_per_second": 96.54, "eval_steps_per_second": 12.068, "eval_token_acc": 0.8810066939444441, "step": 3700 }, { "epoch": 0.4746348962336664, "grad_norm": 4.34375, "learning_rate": 1.165947118783703e-05, "loss": 0.24236671924591063, "step": 3705, "token_acc": 0.906829352418729 }, { "epoch": 0.47527542915705867, "grad_norm": 3.375, "learning_rate": 1.1638577167160874e-05, "loss": 0.24883639812469482, "step": 3710, "token_acc": 0.9045120220224526 }, { "epoch": 0.47591596208045095, "grad_norm": 3.375, "learning_rate": 1.1617675793152631e-05, "loss": 0.2473759651184082, "step": 3715, "token_acc": 0.9040358358099668 }, { "epoch": 0.4765564950038432, "grad_norm": 4.34375, "learning_rate": 1.1596767159609988e-05, "loss": 0.2524222135543823, "step": 3720, "token_acc": 0.9003019844693702 }, { "epoch": 0.47719702792723545, "grad_norm": 2.796875, "learning_rate": 1.1575851360363201e-05, "loss": 0.2499473810195923, "step": 3725, "token_acc": 0.9026130307718755 }, { "epoch": 0.47783756085062773, "grad_norm": 3.8125, "learning_rate": 1.1554928489274697e-05, "loss": 0.24831125736236573, "step": 3730, "token_acc": 0.9022294772112204 }, { "epoch": 0.47847809377402, "grad_norm": 2.796875, "learning_rate": 1.1533998640238626e-05, "loss": 0.2451251983642578, "step": 3735, "token_acc": 0.9032967032967033 }, { "epoch": 0.47911862669741223, "grad_norm": 2.90625, "learning_rate": 1.1513061907180462e-05, "loss": 0.2445608615875244, "step": 3740, "token_acc": 0.9054381711118809 }, { "epoch": 0.4797591596208045, "grad_norm": 2.65625, "learning_rate": 1.1492118384056565e-05, "loss": 0.2464083194732666, "step": 3745, "token_acc": 0.9023748976337227 }, { "epoch": 0.4803996925441968, "grad_norm": 3.5625, "learning_rate": 1.1471168164853769e-05, "loss": 0.24423737525939943, "step": 3750, "token_acc": 0.9049323100801931 }, { "epoch": 0.481040225467589, "grad_norm": 4.34375, "learning_rate": 1.1450211343588962e-05, "loss": 0.24666328430175782, "step": 3755, "token_acc": 0.905542815109688 }, { "epoch": 0.4816807583909813, "grad_norm": 2.28125, "learning_rate": 1.142924801430865e-05, "loss": 0.24257378578186034, "step": 3760, "token_acc": 0.9064537017051586 }, { "epoch": 0.48232129131437357, "grad_norm": 4.25, "learning_rate": 1.1408278271088555e-05, "loss": 0.24482569694519044, "step": 3765, "token_acc": 0.9051044583243593 }, { "epoch": 0.48296182423776585, "grad_norm": 4.75, "learning_rate": 1.1387302208033173e-05, "loss": 0.24971480369567872, "step": 3770, "token_acc": 0.9026384883570783 }, { "epoch": 0.48360235716115807, "grad_norm": 4.09375, "learning_rate": 1.1366319919275368e-05, "loss": 0.24797563552856444, "step": 3775, "token_acc": 0.9017930438539642 }, { "epoch": 0.48424289008455035, "grad_norm": 3.328125, "learning_rate": 1.1345331498975938e-05, "loss": 0.24426445960998536, "step": 3780, "token_acc": 0.9049935428325441 }, { "epoch": 0.4848834230079426, "grad_norm": 3.765625, "learning_rate": 1.1324337041323204e-05, "loss": 0.25280845165252686, "step": 3785, "token_acc": 0.9023151734228418 }, { "epoch": 0.48552395593133485, "grad_norm": 2.71875, "learning_rate": 1.1303336640532567e-05, "loss": 0.24581615924835204, "step": 3790, "token_acc": 0.9035905403072674 }, { "epoch": 0.48616448885472713, "grad_norm": 2.4375, "learning_rate": 1.1282330390846117e-05, "loss": 0.24577610492706298, "step": 3795, "token_acc": 0.9040096680909836 }, { "epoch": 0.4868050217781194, "grad_norm": 2.921875, "learning_rate": 1.1261318386532177e-05, "loss": 0.2388829231262207, "step": 3800, "token_acc": 0.9051091740743937 }, { "epoch": 0.4868050217781194, "eval_loss": 0.3326900601387024, "eval_runtime": 103.7176, "eval_samples_per_second": 96.416, "eval_steps_per_second": 12.052, "eval_token_acc": 0.8808295175819856, "step": 3800 }, { "epoch": 0.4874455547015117, "grad_norm": 3.0625, "learning_rate": 1.12403007218849e-05, "loss": 0.2488114833831787, "step": 3805, "token_acc": 0.9019819043515725 }, { "epoch": 0.4880860876249039, "grad_norm": 3.546875, "learning_rate": 1.121927749122384e-05, "loss": 0.24407386779785156, "step": 3810, "token_acc": 0.9046487514041303 }, { "epoch": 0.4887266205482962, "grad_norm": 3.4375, "learning_rate": 1.1198248788893531e-05, "loss": 0.23504881858825682, "step": 3815, "token_acc": 0.908643467748899 }, { "epoch": 0.48936715347168847, "grad_norm": 2.890625, "learning_rate": 1.117721470926306e-05, "loss": 0.240411376953125, "step": 3820, "token_acc": 0.9068462401795735 }, { "epoch": 0.4900076863950807, "grad_norm": 6.03125, "learning_rate": 1.1156175346725644e-05, "loss": 0.23961906433105468, "step": 3825, "token_acc": 0.9053354053354054 }, { "epoch": 0.49064821931847297, "grad_norm": 3.453125, "learning_rate": 1.113513079569821e-05, "loss": 0.24782803058624267, "step": 3830, "token_acc": 0.9041679212009118 }, { "epoch": 0.49128875224186525, "grad_norm": 2.625, "learning_rate": 1.1114081150620968e-05, "loss": 0.24206724166870117, "step": 3835, "token_acc": 0.9066114275850164 }, { "epoch": 0.49192928516525747, "grad_norm": 3.03125, "learning_rate": 1.1093026505956989e-05, "loss": 0.24786317348480225, "step": 3840, "token_acc": 0.9004302925989673 }, { "epoch": 0.49256981808864975, "grad_norm": 3.421875, "learning_rate": 1.107196695619178e-05, "loss": 0.2436119556427002, "step": 3845, "token_acc": 0.9046715265496269 }, { "epoch": 0.493210351012042, "grad_norm": 3.328125, "learning_rate": 1.105090259583286e-05, "loss": 0.25207488536834716, "step": 3850, "token_acc": 0.901183050118305 }, { "epoch": 0.4938508839354343, "grad_norm": 2.65625, "learning_rate": 1.1029833519409337e-05, "loss": 0.24722940921783448, "step": 3855, "token_acc": 0.9046530682651257 }, { "epoch": 0.4944914168588265, "grad_norm": 2.6875, "learning_rate": 1.100875982147148e-05, "loss": 0.24187374114990234, "step": 3860, "token_acc": 0.906386286109072 }, { "epoch": 0.4951319497822188, "grad_norm": 7.78125, "learning_rate": 1.09876815965903e-05, "loss": 0.25092015266418455, "step": 3865, "token_acc": 0.9019010427841909 }, { "epoch": 0.4957724827056111, "grad_norm": 2.71875, "learning_rate": 1.096659893935713e-05, "loss": 0.2399623155593872, "step": 3870, "token_acc": 0.9047742492979045 }, { "epoch": 0.4964130156290033, "grad_norm": 2.734375, "learning_rate": 1.0945511944383178e-05, "loss": 0.24817066192626952, "step": 3875, "token_acc": 0.9023097474791002 }, { "epoch": 0.4970535485523956, "grad_norm": 32.75, "learning_rate": 1.0924420706299131e-05, "loss": 0.23887484073638915, "step": 3880, "token_acc": 0.9074281287657083 }, { "epoch": 0.49769408147578786, "grad_norm": 3.078125, "learning_rate": 1.0903325319754717e-05, "loss": 0.24414317607879638, "step": 3885, "token_acc": 0.9059984419631265 }, { "epoch": 0.49833461439918014, "grad_norm": 6.90625, "learning_rate": 1.0882225879418272e-05, "loss": 0.2399660110473633, "step": 3890, "token_acc": 0.9039769303606783 }, { "epoch": 0.49897514732257237, "grad_norm": 6.09375, "learning_rate": 1.086112247997633e-05, "loss": 0.24311909675598145, "step": 3895, "token_acc": 0.9053466029037956 }, { "epoch": 0.49961568024596464, "grad_norm": 2.84375, "learning_rate": 1.0840015216133195e-05, "loss": 0.24150404930114747, "step": 3900, "token_acc": 0.9062136674848211 }, { "epoch": 0.49961568024596464, "eval_loss": 0.3318501114845276, "eval_runtime": 103.7178, "eval_samples_per_second": 96.415, "eval_steps_per_second": 12.052, "eval_token_acc": 0.880832285962649, "step": 3900 }, { "epoch": 0.5002562131693569, "grad_norm": 3.078125, "learning_rate": 1.0818904182610505e-05, "loss": 0.23810501098632814, "step": 3905, "token_acc": 0.9054380664652568 }, { "epoch": 0.5008967460927491, "grad_norm": 3.03125, "learning_rate": 1.0797789474146825e-05, "loss": 0.24517326354980468, "step": 3910, "token_acc": 0.9039823773324119 }, { "epoch": 0.5015372790161414, "grad_norm": 2.828125, "learning_rate": 1.07766711854972e-05, "loss": 0.24150619506835938, "step": 3915, "token_acc": 0.9048644922228446 }, { "epoch": 0.5021778119395337, "grad_norm": 3.671875, "learning_rate": 1.0755549411432754e-05, "loss": 0.24119091033935547, "step": 3920, "token_acc": 0.9053400155534433 }, { "epoch": 0.502818344862926, "grad_norm": 3.578125, "learning_rate": 1.0734424246740238e-05, "loss": 0.24077696800231935, "step": 3925, "token_acc": 0.9054638194864701 }, { "epoch": 0.5034588777863183, "grad_norm": 4.03125, "learning_rate": 1.0713295786221634e-05, "loss": 0.24392437934875488, "step": 3930, "token_acc": 0.9047639614736751 }, { "epoch": 0.5040994107097104, "grad_norm": 3.328125, "learning_rate": 1.0692164124693703e-05, "loss": 0.23980698585510254, "step": 3935, "token_acc": 0.905363456066224 }, { "epoch": 0.5047399436331027, "grad_norm": 2.765625, "learning_rate": 1.067102935698758e-05, "loss": 0.23611803054809571, "step": 3940, "token_acc": 0.9073914169760815 }, { "epoch": 0.505380476556495, "grad_norm": 3.40625, "learning_rate": 1.064989157794833e-05, "loss": 0.2380206823348999, "step": 3945, "token_acc": 0.9074281287657083 }, { "epoch": 0.5060210094798873, "grad_norm": 3.6875, "learning_rate": 1.0628750882434537e-05, "loss": 0.2411219596862793, "step": 3950, "token_acc": 0.9061099620820406 }, { "epoch": 0.5066615424032795, "grad_norm": 3.171875, "learning_rate": 1.0607607365317874e-05, "loss": 0.23887009620666505, "step": 3955, "token_acc": 0.906588215083319 }, { "epoch": 0.5073020753266718, "grad_norm": 4.625, "learning_rate": 1.0586461121482672e-05, "loss": 0.2420198917388916, "step": 3960, "token_acc": 0.9068667497957604 }, { "epoch": 0.5079426082500641, "grad_norm": 2.65625, "learning_rate": 1.0565312245825505e-05, "loss": 0.2432565689086914, "step": 3965, "token_acc": 0.905863065706027 }, { "epoch": 0.5085831411734563, "grad_norm": 2.8125, "learning_rate": 1.0544160833254752e-05, "loss": 0.2371816873550415, "step": 3970, "token_acc": 0.9089968976215098 }, { "epoch": 0.5092236740968485, "grad_norm": 3.6875, "learning_rate": 1.052300697869018e-05, "loss": 0.2434596061706543, "step": 3975, "token_acc": 0.9051765010128874 }, { "epoch": 0.5098642070202408, "grad_norm": 25.25, "learning_rate": 1.0501850777062512e-05, "loss": 0.24385199546813965, "step": 3980, "token_acc": 0.9046000258186669 }, { "epoch": 0.5105047399436331, "grad_norm": 3.578125, "learning_rate": 1.0480692323313007e-05, "loss": 0.23917775154113768, "step": 3985, "token_acc": 0.9061435654235827 }, { "epoch": 0.5111452728670254, "grad_norm": 3.546875, "learning_rate": 1.0459531712393025e-05, "loss": 0.2387022018432617, "step": 3990, "token_acc": 0.9047331145275522 }, { "epoch": 0.5117858057904177, "grad_norm": 2.515625, "learning_rate": 1.0438369039263614e-05, "loss": 0.24243345260620117, "step": 3995, "token_acc": 0.9045505472722571 }, { "epoch": 0.5124263387138099, "grad_norm": 3.078125, "learning_rate": 1.0417204398895072e-05, "loss": 0.23408794403076172, "step": 4000, "token_acc": 0.9087918271936279 }, { "epoch": 0.5124263387138099, "eval_loss": 0.3293675482273102, "eval_runtime": 102.8807, "eval_samples_per_second": 97.2, "eval_steps_per_second": 12.15, "eval_token_acc": 0.8810675983190392, "step": 4000 }, { "epoch": 0.5130668716372021, "grad_norm": 6.1875, "learning_rate": 1.039603788626653e-05, "loss": 0.2410355567932129, "step": 4005, "token_acc": 0.9065908013276435 }, { "epoch": 0.5137074045605944, "grad_norm": 3.0, "learning_rate": 1.0374869596365508e-05, "loss": 0.2497018337249756, "step": 4010, "token_acc": 0.900746973469563 }, { "epoch": 0.5143479374839867, "grad_norm": 6.78125, "learning_rate": 1.035369962418752e-05, "loss": 0.24223339557647705, "step": 4015, "token_acc": 0.9042599153201417 }, { "epoch": 0.5149884704073789, "grad_norm": 4.375, "learning_rate": 1.0332528064735614e-05, "loss": 0.24308998584747316, "step": 4020, "token_acc": 0.905254091300603 }, { "epoch": 0.5156290033307712, "grad_norm": 2.78125, "learning_rate": 1.031135501301997e-05, "loss": 0.24722557067871093, "step": 4025, "token_acc": 0.902273805928291 }, { "epoch": 0.5162695362541635, "grad_norm": 3.4375, "learning_rate": 1.0290180564057461e-05, "loss": 0.23717832565307617, "step": 4030, "token_acc": 0.9058447172747709 }, { "epoch": 0.5169100691775558, "grad_norm": 4.03125, "learning_rate": 1.0269004812871236e-05, "loss": 0.23974413871765138, "step": 4035, "token_acc": 0.906494960806271 }, { "epoch": 0.5175506021009479, "grad_norm": 2.921875, "learning_rate": 1.024782785449028e-05, "loss": 0.25091626644134524, "step": 4040, "token_acc": 0.9013481980814104 }, { "epoch": 0.5181911350243402, "grad_norm": 3.0, "learning_rate": 1.0226649783948997e-05, "loss": 0.2415644645690918, "step": 4045, "token_acc": 0.9057377049180327 }, { "epoch": 0.5188316679477325, "grad_norm": 4.875, "learning_rate": 1.0205470696286787e-05, "loss": 0.24452197551727295, "step": 4050, "token_acc": 0.9055036791600327 }, { "epoch": 0.5194722008711248, "grad_norm": 4.25, "learning_rate": 1.0184290686547611e-05, "loss": 0.23984365463256835, "step": 4055, "token_acc": 0.9044206527370057 }, { "epoch": 0.5201127337945171, "grad_norm": 6.65625, "learning_rate": 1.0163109849779567e-05, "loss": 0.24106016159057617, "step": 4060, "token_acc": 0.9063618718999353 }, { "epoch": 0.5207532667179093, "grad_norm": 3.46875, "learning_rate": 1.0141928281034468e-05, "loss": 0.2418668746948242, "step": 4065, "token_acc": 0.9044542086671836 }, { "epoch": 0.5213937996413016, "grad_norm": 2.9375, "learning_rate": 1.0120746075367406e-05, "loss": 0.2402285099029541, "step": 4070, "token_acc": 0.9048873154304464 }, { "epoch": 0.5220343325646938, "grad_norm": 2.90625, "learning_rate": 1.0099563327836338e-05, "loss": 0.23992910385131835, "step": 4075, "token_acc": 0.906600034464932 }, { "epoch": 0.5226748654880861, "grad_norm": 3.515625, "learning_rate": 1.0078380133501646e-05, "loss": 0.24107756614685058, "step": 4080, "token_acc": 0.9048888506686159 }, { "epoch": 0.5233153984114783, "grad_norm": 3.078125, "learning_rate": 1.0057196587425721e-05, "loss": 0.24715356826782225, "step": 4085, "token_acc": 0.904106634457769 }, { "epoch": 0.5239559313348706, "grad_norm": 3.703125, "learning_rate": 1.0036012784672538e-05, "loss": 0.24057602882385254, "step": 4090, "token_acc": 0.9058342303552207 }, { "epoch": 0.5245964642582629, "grad_norm": 6.59375, "learning_rate": 1.001482882030721e-05, "loss": 0.2389677047729492, "step": 4095, "token_acc": 0.9081055404413352 }, { "epoch": 0.5252369971816552, "grad_norm": 2.53125, "learning_rate": 9.99364478939559e-06, "loss": 0.24011881351470948, "step": 4100, "token_acc": 0.9068454177084231 }, { "epoch": 0.5252369971816552, "eval_loss": 0.33330872654914856, "eval_runtime": 102.9347, "eval_samples_per_second": 97.149, "eval_steps_per_second": 12.144, "eval_token_acc": 0.8811091240289904, "step": 4100 }, { "epoch": 0.5258775301050475, "grad_norm": 9.6875, "learning_rate": 9.972460787003814e-06, "loss": 0.24392313957214357, "step": 4105, "token_acc": 0.9049197262514527 }, { "epoch": 0.5265180630284396, "grad_norm": 3.078125, "learning_rate": 9.95127690819791e-06, "loss": 0.24266266822814941, "step": 4110, "token_acc": 0.9032661151327128 }, { "epoch": 0.5271585959518319, "grad_norm": 2.75, "learning_rate": 9.93009324804333e-06, "loss": 0.24082543849945068, "step": 4115, "token_acc": 0.9040199707325471 }, { "epoch": 0.5277991288752242, "grad_norm": 3.921875, "learning_rate": 9.908909901604563e-06, "loss": 0.24310965538024903, "step": 4120, "token_acc": 0.9044476079547905 }, { "epoch": 0.5284396617986165, "grad_norm": 2.453125, "learning_rate": 9.887726963944676e-06, "loss": 0.2375312328338623, "step": 4125, "token_acc": 0.9068424681144432 }, { "epoch": 0.5290801947220087, "grad_norm": 2.90625, "learning_rate": 9.86654453012491e-06, "loss": 0.23663816452026368, "step": 4130, "token_acc": 0.9086728274545534 }, { "epoch": 0.529720727645401, "grad_norm": 2.65625, "learning_rate": 9.845362695204245e-06, "loss": 0.24443821907043456, "step": 4135, "token_acc": 0.9042672413793104 }, { "epoch": 0.5303612605687933, "grad_norm": 4.65625, "learning_rate": 9.824181554238965e-06, "loss": 0.23482506275177, "step": 4140, "token_acc": 0.9089650996842971 }, { "epoch": 0.5310017934921855, "grad_norm": 17.5, "learning_rate": 9.803001202282254e-06, "loss": 0.24258599281311036, "step": 4145, "token_acc": 0.9058409510321446 }, { "epoch": 0.5316423264155777, "grad_norm": 2.890625, "learning_rate": 9.781821734383741e-06, "loss": 0.2373753547668457, "step": 4150, "token_acc": 0.9077843280691941 }, { "epoch": 0.53228285933897, "grad_norm": 3.46875, "learning_rate": 9.760643245589096e-06, "loss": 0.23887972831726073, "step": 4155, "token_acc": 0.9065009065009065 }, { "epoch": 0.5329233922623623, "grad_norm": 3.875, "learning_rate": 9.73946583093959e-06, "loss": 0.2412872791290283, "step": 4160, "token_acc": 0.9053228996474332 }, { "epoch": 0.5335639251857546, "grad_norm": 2.90625, "learning_rate": 9.718289585471683e-06, "loss": 0.23278658390045165, "step": 4165, "token_acc": 0.9080731969860064 }, { "epoch": 0.5342044581091469, "grad_norm": 3.015625, "learning_rate": 9.697114604216573e-06, "loss": 0.24164493083953859, "step": 4170, "token_acc": 0.9067511639937921 }, { "epoch": 0.534844991032539, "grad_norm": 3.109375, "learning_rate": 9.6759409821998e-06, "loss": 0.23885555267333985, "step": 4175, "token_acc": 0.9060220159723721 }, { "epoch": 0.5354855239559313, "grad_norm": 2.875, "learning_rate": 9.65476881444079e-06, "loss": 0.23392415046691895, "step": 4180, "token_acc": 0.9085397540273688 }, { "epoch": 0.5361260568793236, "grad_norm": 3.40625, "learning_rate": 9.633598195952461e-06, "loss": 0.23441019058227539, "step": 4185, "token_acc": 0.908675799086758 }, { "epoch": 0.5367665898027159, "grad_norm": 2.84375, "learning_rate": 9.612429221740761e-06, "loss": 0.23697328567504883, "step": 4190, "token_acc": 0.9047413793103448 }, { "epoch": 0.5374071227261081, "grad_norm": 3.15625, "learning_rate": 9.591261986804264e-06, "loss": 0.24399030208587646, "step": 4195, "token_acc": 0.905359667537809 }, { "epoch": 0.5380476556495004, "grad_norm": 3.25, "learning_rate": 9.570096586133748e-06, "loss": 0.23835985660552977, "step": 4200, "token_acc": 0.9053366669540478 }, { "epoch": 0.5380476556495004, "eval_loss": 0.32872986793518066, "eval_runtime": 104.0894, "eval_samples_per_second": 96.071, "eval_steps_per_second": 12.009, "eval_token_acc": 0.8815603700771271, "step": 4200 }, { "epoch": 0.5386881885728927, "grad_norm": 2.625, "learning_rate": 9.548933114711742e-06, "loss": 0.2371370553970337, "step": 4205, "token_acc": 0.908296379213846 }, { "epoch": 0.5393287214962849, "grad_norm": 3.03125, "learning_rate": 9.527771667512138e-06, "loss": 0.2396193265914917, "step": 4210, "token_acc": 0.9050984786450028 }, { "epoch": 0.5399692544196771, "grad_norm": 3.171875, "learning_rate": 9.506612339499725e-06, "loss": 0.23920049667358398, "step": 4215, "token_acc": 0.9055294573977503 }, { "epoch": 0.5406097873430694, "grad_norm": 2.90625, "learning_rate": 9.485455225629798e-06, "loss": 0.23707218170166017, "step": 4220, "token_acc": 0.9071637426900585 }, { "epoch": 0.5412503202664617, "grad_norm": 3.046875, "learning_rate": 9.464300420847698e-06, "loss": 0.2316804885864258, "step": 4225, "token_acc": 0.9091498185588388 }, { "epoch": 0.541890853189854, "grad_norm": 3.28125, "learning_rate": 9.443148020088426e-06, "loss": 0.24402837753295897, "step": 4230, "token_acc": 0.9042589878437797 }, { "epoch": 0.5425313861132463, "grad_norm": 2.625, "learning_rate": 9.421998118276169e-06, "loss": 0.2375174045562744, "step": 4235, "token_acc": 0.9062986675895484 }, { "epoch": 0.5431719190366385, "grad_norm": 4.625, "learning_rate": 9.400850810323925e-06, "loss": 0.24248833656311036, "step": 4240, "token_acc": 0.9057444415606887 }, { "epoch": 0.5438124519600307, "grad_norm": 3.875, "learning_rate": 9.379706191133033e-06, "loss": 0.24248261451721193, "step": 4245, "token_acc": 0.9062149331031506 }, { "epoch": 0.544452984883423, "grad_norm": 3.96875, "learning_rate": 9.358564355592775e-06, "loss": 0.23543434143066405, "step": 4250, "token_acc": 0.9084744298548721 }, { "epoch": 0.5450935178068153, "grad_norm": 3.203125, "learning_rate": 9.337425398579932e-06, "loss": 0.23771212100982667, "step": 4255, "token_acc": 0.9078099493083598 }, { "epoch": 0.5457340507302075, "grad_norm": 3.515625, "learning_rate": 9.316289414958379e-06, "loss": 0.2383446216583252, "step": 4260, "token_acc": 0.9065444799861675 }, { "epoch": 0.5463745836535998, "grad_norm": 4.625, "learning_rate": 9.295156499578647e-06, "loss": 0.24645309448242186, "step": 4265, "token_acc": 0.9030008180135187 }, { "epoch": 0.5470151165769921, "grad_norm": 3.03125, "learning_rate": 9.274026747277487e-06, "loss": 0.23401763439178466, "step": 4270, "token_acc": 0.9084355033672941 }, { "epoch": 0.5476556495003844, "grad_norm": 3.15625, "learning_rate": 9.252900252877464e-06, "loss": 0.24168498516082765, "step": 4275, "token_acc": 0.9051816813588476 }, { "epoch": 0.5482961824237765, "grad_norm": 3.296875, "learning_rate": 9.231777111186514e-06, "loss": 0.23365185260772706, "step": 4280, "token_acc": 0.9065501055079453 }, { "epoch": 0.5489367153471688, "grad_norm": 3.140625, "learning_rate": 9.210657416997543e-06, "loss": 0.2374626636505127, "step": 4285, "token_acc": 0.9064794816414686 }, { "epoch": 0.5495772482705611, "grad_norm": 3.296875, "learning_rate": 9.189541265087966e-06, "loss": 0.23222618103027343, "step": 4290, "token_acc": 0.9083916688272405 }, { "epoch": 0.5502177811939534, "grad_norm": 3.671875, "learning_rate": 9.168428750219323e-06, "loss": 0.24052739143371582, "step": 4295, "token_acc": 0.9065121508259102 }, { "epoch": 0.5508583141173456, "grad_norm": 4.28125, "learning_rate": 9.14731996713681e-06, "loss": 0.2399202346801758, "step": 4300, "token_acc": 0.9062012818292049 }, { "epoch": 0.5508583141173456, "eval_loss": 0.32973331212997437, "eval_runtime": 104.5664, "eval_samples_per_second": 95.633, "eval_steps_per_second": 11.954, "eval_token_acc": 0.8813555099080344, "step": 4300 }, { "epoch": 0.5514988470407379, "grad_norm": 4.5625, "learning_rate": 9.126215010568896e-06, "loss": 0.2345888137817383, "step": 4305, "token_acc": 0.9077134986225895 }, { "epoch": 0.5521393799641302, "grad_norm": 3.6875, "learning_rate": 9.105113975226865e-06, "loss": 0.24162061214447023, "step": 4310, "token_acc": 0.9073946393174179 }, { "epoch": 0.5527799128875224, "grad_norm": 2.5625, "learning_rate": 9.08401695580441e-06, "loss": 0.2340158700942993, "step": 4315, "token_acc": 0.9100293711126468 }, { "epoch": 0.5534204458109147, "grad_norm": 3.546875, "learning_rate": 9.062924046977194e-06, "loss": 0.23286752700805663, "step": 4320, "token_acc": 0.9091850828729282 }, { "epoch": 0.5540609787343069, "grad_norm": 3.515625, "learning_rate": 9.041835343402445e-06, "loss": 0.23487985134124756, "step": 4325, "token_acc": 0.9075176937683411 }, { "epoch": 0.5547015116576992, "grad_norm": 2.734375, "learning_rate": 9.020750939718518e-06, "loss": 0.2381136178970337, "step": 4330, "token_acc": 0.9072947672662957 }, { "epoch": 0.5553420445810915, "grad_norm": 3.234375, "learning_rate": 8.999670930544459e-06, "loss": 0.2352077007293701, "step": 4335, "token_acc": 0.9067487855655795 }, { "epoch": 0.5559825775044838, "grad_norm": 3.625, "learning_rate": 8.978595410479609e-06, "loss": 0.2357017993927002, "step": 4340, "token_acc": 0.9078482104355325 }, { "epoch": 0.556623110427876, "grad_norm": 2.65625, "learning_rate": 8.957524474103146e-06, "loss": 0.2459559679031372, "step": 4345, "token_acc": 0.9045916609235011 }, { "epoch": 0.5572636433512682, "grad_norm": 3.21875, "learning_rate": 8.936458215973698e-06, "loss": 0.23383736610412598, "step": 4350, "token_acc": 0.9094742276912486 }, { "epoch": 0.5579041762746605, "grad_norm": 2.734375, "learning_rate": 8.915396730628882e-06, "loss": 0.24825828075408934, "step": 4355, "token_acc": 0.9029686759446767 }, { "epoch": 0.5585447091980528, "grad_norm": 3.25, "learning_rate": 8.894340112584909e-06, "loss": 0.23654026985168458, "step": 4360, "token_acc": 0.9066741350338231 }, { "epoch": 0.559185242121445, "grad_norm": 3.5, "learning_rate": 8.873288456336138e-06, "loss": 0.23700532913208008, "step": 4365, "token_acc": 0.9077705469120373 }, { "epoch": 0.5598257750448373, "grad_norm": 3.765625, "learning_rate": 8.852241856354669e-06, "loss": 0.23611578941345215, "step": 4370, "token_acc": 0.9087608592298051 }, { "epoch": 0.5604663079682296, "grad_norm": 4.5625, "learning_rate": 8.831200407089897e-06, "loss": 0.24070956707000732, "step": 4375, "token_acc": 0.9068157385508493 }, { "epoch": 0.5611068408916219, "grad_norm": 2.453125, "learning_rate": 8.810164202968123e-06, "loss": 0.2372671604156494, "step": 4380, "token_acc": 0.9063080980587142 }, { "epoch": 0.561747373815014, "grad_norm": 2.875, "learning_rate": 8.789133338392099e-06, "loss": 0.2328266382217407, "step": 4385, "token_acc": 0.9102890972732379 }, { "epoch": 0.5623879067384063, "grad_norm": 2.984375, "learning_rate": 8.76810790774061e-06, "loss": 0.2348611831665039, "step": 4390, "token_acc": 0.9081421424874936 }, { "epoch": 0.5630284396617986, "grad_norm": 3.59375, "learning_rate": 8.747088005368068e-06, "loss": 0.2405010223388672, "step": 4395, "token_acc": 0.9055416702576919 }, { "epoch": 0.5636689725851909, "grad_norm": 2.921875, "learning_rate": 8.726073725604061e-06, "loss": 0.2389441728591919, "step": 4400, "token_acc": 0.9059644888812274 }, { "epoch": 0.5636689725851909, "eval_loss": 0.328523188829422, "eval_runtime": 102.5856, "eval_samples_per_second": 97.48, "eval_steps_per_second": 12.185, "eval_token_acc": 0.8815963590257515, "step": 4400 }, { "epoch": 0.5643095055085832, "grad_norm": 2.90625, "learning_rate": 8.705065162752961e-06, "loss": 0.24200544357299805, "step": 4405, "token_acc": 0.9059365448361961 }, { "epoch": 0.5649500384319754, "grad_norm": 5.03125, "learning_rate": 8.68406241109347e-06, "loss": 0.24370207786560058, "step": 4410, "token_acc": 0.904322319040635 }, { "epoch": 0.5655905713553676, "grad_norm": 2.65625, "learning_rate": 8.663065564878223e-06, "loss": 0.2380732536315918, "step": 4415, "token_acc": 0.9064141196728368 }, { "epoch": 0.5662311042787599, "grad_norm": 3.421875, "learning_rate": 8.642074718333345e-06, "loss": 0.2384279727935791, "step": 4420, "token_acc": 0.9063185513355413 }, { "epoch": 0.5668716372021522, "grad_norm": 4.1875, "learning_rate": 8.621089965658046e-06, "loss": 0.23173861503601073, "step": 4425, "token_acc": 0.9093538222471619 }, { "epoch": 0.5675121701255444, "grad_norm": 2.859375, "learning_rate": 8.600111401024177e-06, "loss": 0.2245471954345703, "step": 4430, "token_acc": 0.9126788570440496 }, { "epoch": 0.5681527030489367, "grad_norm": 2.75, "learning_rate": 8.57913911857583e-06, "loss": 0.23515353202819825, "step": 4435, "token_acc": 0.9106183959812922 }, { "epoch": 0.568793235972329, "grad_norm": 2.75, "learning_rate": 8.558173212428895e-06, "loss": 0.23450264930725098, "step": 4440, "token_acc": 0.9078670050324745 }, { "epoch": 0.5694337688957213, "grad_norm": 12.875, "learning_rate": 8.537213776670656e-06, "loss": 0.23401873111724852, "step": 4445, "token_acc": 0.9095069510404974 }, { "epoch": 0.5700743018191134, "grad_norm": 5.25, "learning_rate": 8.516260905359364e-06, "loss": 0.23944463729858398, "step": 4450, "token_acc": 0.9062594106259411 }, { "epoch": 0.5707148347425057, "grad_norm": 3.34375, "learning_rate": 8.495314692523795e-06, "loss": 0.23881077766418457, "step": 4455, "token_acc": 0.90836533068726 }, { "epoch": 0.571355367665898, "grad_norm": 2.609375, "learning_rate": 8.47437523216286e-06, "loss": 0.23443114757537842, "step": 4460, "token_acc": 0.9087260486794407 }, { "epoch": 0.5719959005892903, "grad_norm": 6.125, "learning_rate": 8.453442618245155e-06, "loss": 0.24273183345794677, "step": 4465, "token_acc": 0.9040428010527678 }, { "epoch": 0.5726364335126826, "grad_norm": 2.8125, "learning_rate": 8.432516944708565e-06, "loss": 0.23376893997192383, "step": 4470, "token_acc": 0.9095967220185465 }, { "epoch": 0.5732769664360748, "grad_norm": 2.953125, "learning_rate": 8.411598305459812e-06, "loss": 0.23575949668884277, "step": 4475, "token_acc": 0.9096040329182644 }, { "epoch": 0.5739174993594671, "grad_norm": 3.625, "learning_rate": 8.390686794374072e-06, "loss": 0.24351611137390136, "step": 4480, "token_acc": 0.905288150226635 }, { "epoch": 0.5745580322828593, "grad_norm": 4.84375, "learning_rate": 8.369782505294511e-06, "loss": 0.2270632266998291, "step": 4485, "token_acc": 0.9119619294830197 }, { "epoch": 0.5751985652062516, "grad_norm": 3.15625, "learning_rate": 8.348885532031904e-06, "loss": 0.23725566864013672, "step": 4490, "token_acc": 0.9062796515138446 }, { "epoch": 0.5758390981296438, "grad_norm": 2.9375, "learning_rate": 8.327995968364178e-06, "loss": 0.23767762184143065, "step": 4495, "token_acc": 0.907425097698654 }, { "epoch": 0.5764796310530361, "grad_norm": 3.03125, "learning_rate": 8.307113908036024e-06, "loss": 0.24003219604492188, "step": 4500, "token_acc": 0.9065685894954187 }, { "epoch": 0.5764796310530361, "eval_loss": 0.32945460081100464, "eval_runtime": 103.1647, "eval_samples_per_second": 96.932, "eval_steps_per_second": 12.117, "eval_token_acc": 0.881673873684327, "step": 4500 }, { "epoch": 0.5771201639764284, "grad_norm": 2.921875, "learning_rate": 8.286239444758448e-06, "loss": 0.225927734375, "step": 4505, "token_acc": 0.9105997573236263 }, { "epoch": 0.5777606968998207, "grad_norm": 2.53125, "learning_rate": 8.265372672208375e-06, "loss": 0.24204869270324708, "step": 4510, "token_acc": 0.907190275023709 }, { "epoch": 0.578401229823213, "grad_norm": 3.109375, "learning_rate": 8.244513684028208e-06, "loss": 0.23642313480377197, "step": 4515, "token_acc": 0.9095570492933471 }, { "epoch": 0.5790417627466051, "grad_norm": 5.25, "learning_rate": 8.223662573825418e-06, "loss": 0.23264212608337403, "step": 4520, "token_acc": 0.9079717630853994 }, { "epoch": 0.5796822956699974, "grad_norm": 2.640625, "learning_rate": 8.202819435172129e-06, "loss": 0.22397842407226562, "step": 4525, "token_acc": 0.9115804932832275 }, { "epoch": 0.5803228285933897, "grad_norm": 3.8125, "learning_rate": 8.181984361604677e-06, "loss": 0.24578235149383545, "step": 4530, "token_acc": 0.9068775316728432 }, { "epoch": 0.580963361516782, "grad_norm": 3.234375, "learning_rate": 8.161157446623227e-06, "loss": 0.23125510215759276, "step": 4535, "token_acc": 0.9093144656801415 }, { "epoch": 0.5816038944401742, "grad_norm": 3.125, "learning_rate": 8.140338783691308e-06, "loss": 0.2348803997039795, "step": 4540, "token_acc": 0.9077382239716251 }, { "epoch": 0.5822444273635665, "grad_norm": 3.125, "learning_rate": 8.119528466235434e-06, "loss": 0.22919659614562987, "step": 4545, "token_acc": 0.9098924731182796 }, { "epoch": 0.5828849602869588, "grad_norm": 18.5, "learning_rate": 8.098726587644659e-06, "loss": 0.23590612411499023, "step": 4550, "token_acc": 0.9070518339934561 }, { "epoch": 0.583525493210351, "grad_norm": 3.359375, "learning_rate": 8.07793324127017e-06, "loss": 0.23590869903564454, "step": 4555, "token_acc": 0.9086673281849951 }, { "epoch": 0.5841660261337432, "grad_norm": 12.1875, "learning_rate": 8.05714852042486e-06, "loss": 0.23389995098114014, "step": 4560, "token_acc": 0.9086344946981173 }, { "epoch": 0.5848065590571355, "grad_norm": 3.53125, "learning_rate": 8.036372518382922e-06, "loss": 0.2384809970855713, "step": 4565, "token_acc": 0.9059391015978293 }, { "epoch": 0.5854470919805278, "grad_norm": 3.84375, "learning_rate": 8.015605328379407e-06, "loss": 0.23714299201965333, "step": 4570, "token_acc": 0.9076115033580162 }, { "epoch": 0.5860876249039201, "grad_norm": 2.609375, "learning_rate": 7.994847043609844e-06, "loss": 0.23302805423736572, "step": 4575, "token_acc": 0.9086178721940311 }, { "epoch": 0.5867281578273124, "grad_norm": 4.59375, "learning_rate": 7.974097757229781e-06, "loss": 0.23717694282531737, "step": 4580, "token_acc": 0.9076014314663907 }, { "epoch": 0.5873686907507046, "grad_norm": 3.0, "learning_rate": 7.953357562354384e-06, "loss": 0.23976330757141112, "step": 4585, "token_acc": 0.9052459298819882 }, { "epoch": 0.5880092236740968, "grad_norm": 5.625, "learning_rate": 7.932626552058032e-06, "loss": 0.23990461826324463, "step": 4590, "token_acc": 0.9076478454039598 }, { "epoch": 0.5886497565974891, "grad_norm": 2.78125, "learning_rate": 7.911904819373873e-06, "loss": 0.23198351860046387, "step": 4595, "token_acc": 0.9084361252479944 }, { "epoch": 0.5892902895208814, "grad_norm": 3.515625, "learning_rate": 7.891192457293433e-06, "loss": 0.2373666524887085, "step": 4600, "token_acc": 0.9076724137931035 }, { "epoch": 0.5892902895208814, "eval_loss": 0.3317316174507141, "eval_runtime": 102.7383, "eval_samples_per_second": 97.335, "eval_steps_per_second": 12.167, "eval_token_acc": 0.8819451749893418, "step": 4600 }, { "epoch": 0.5899308224442736, "grad_norm": 2.734375, "learning_rate": 7.870489558766178e-06, "loss": 0.23856124877929688, "step": 4605, "token_acc": 0.9061164587559891 }, { "epoch": 0.5905713553676659, "grad_norm": 5.84375, "learning_rate": 7.84979621669911e-06, "loss": 0.23322293758392335, "step": 4610, "token_acc": 0.9075289241927128 }, { "epoch": 0.5912118882910582, "grad_norm": 2.546875, "learning_rate": 7.829112523956335e-06, "loss": 0.23959455490112305, "step": 4615, "token_acc": 0.9058361730578441 }, { "epoch": 0.5918524212144505, "grad_norm": 3.0, "learning_rate": 7.808438573358674e-06, "loss": 0.2323786735534668, "step": 4620, "token_acc": 0.9092474599621146 }, { "epoch": 0.5924929541378426, "grad_norm": 2.890625, "learning_rate": 7.787774457683209e-06, "loss": 0.23595137596130372, "step": 4625, "token_acc": 0.9091104889080336 }, { "epoch": 0.5931334870612349, "grad_norm": 4.25, "learning_rate": 7.767120269662905e-06, "loss": 0.2342782974243164, "step": 4630, "token_acc": 0.9082426127527217 }, { "epoch": 0.5937740199846272, "grad_norm": 3.59375, "learning_rate": 7.746476101986164e-06, "loss": 0.2340677261352539, "step": 4635, "token_acc": 0.9087346024636058 }, { "epoch": 0.5944145529080195, "grad_norm": 3.0625, "learning_rate": 7.725842047296419e-06, "loss": 0.23336553573608398, "step": 4640, "token_acc": 0.9081429560401523 }, { "epoch": 0.5950550858314118, "grad_norm": 3.203125, "learning_rate": 7.70521819819173e-06, "loss": 0.24391114711761475, "step": 4645, "token_acc": 0.9032941379906623 }, { "epoch": 0.595695618754804, "grad_norm": 4.625, "learning_rate": 7.684604647224345e-06, "loss": 0.23319551944732667, "step": 4650, "token_acc": 0.9080335989661856 }, { "epoch": 0.5963361516781963, "grad_norm": 2.9375, "learning_rate": 7.66400148690031e-06, "loss": 0.22878189086914064, "step": 4655, "token_acc": 0.9093414875748309 }, { "epoch": 0.5969766846015885, "grad_norm": 2.859375, "learning_rate": 7.643408809679034e-06, "loss": 0.2268078327178955, "step": 4660, "token_acc": 0.9114099182844049 }, { "epoch": 0.5976172175249808, "grad_norm": 3.25, "learning_rate": 7.622826707972883e-06, "loss": 0.23129682540893554, "step": 4665, "token_acc": 0.9086430423509075 }, { "epoch": 0.598257750448373, "grad_norm": 2.703125, "learning_rate": 7.602255274146767e-06, "loss": 0.2353008508682251, "step": 4670, "token_acc": 0.9068710222106767 }, { "epoch": 0.5988982833717653, "grad_norm": 2.875, "learning_rate": 7.58169460051772e-06, "loss": 0.2389591693878174, "step": 4675, "token_acc": 0.9058991190188288 }, { "epoch": 0.5995388162951576, "grad_norm": 3.078125, "learning_rate": 7.561144779354483e-06, "loss": 0.23087067604064943, "step": 4680, "token_acc": 0.9091809064692463 }, { "epoch": 0.6001793492185499, "grad_norm": 5.78125, "learning_rate": 7.540605902877108e-06, "loss": 0.2390049457550049, "step": 4685, "token_acc": 0.9069245380763253 }, { "epoch": 0.600819882141942, "grad_norm": 2.9375, "learning_rate": 7.520078063256517e-06, "loss": 0.23379735946655272, "step": 4690, "token_acc": 0.9081875135018362 }, { "epoch": 0.6014604150653343, "grad_norm": 3.5, "learning_rate": 7.4995613526141156e-06, "loss": 0.2288158893585205, "step": 4695, "token_acc": 0.9082493403123243 }, { "epoch": 0.6021009479887266, "grad_norm": 3.515625, "learning_rate": 7.47905586302136e-06, "loss": 0.23635220527648926, "step": 4700, "token_acc": 0.908278174159718 }, { "epoch": 0.6021009479887266, "eval_loss": 0.33331820368766785, "eval_runtime": 103.2322, "eval_samples_per_second": 96.869, "eval_steps_per_second": 12.109, "eval_token_acc": 0.8817569251042295, "step": 4700 }, { "epoch": 0.6027414809121189, "grad_norm": 3.03125, "learning_rate": 7.458561686499345e-06, "loss": 0.22352910041809082, "step": 4705, "token_acc": 0.9113029146426093 }, { "epoch": 0.6033820138355112, "grad_norm": 18.75, "learning_rate": 7.438078915018409e-06, "loss": 0.22866015434265136, "step": 4710, "token_acc": 0.9103150625809237 }, { "epoch": 0.6040225467589034, "grad_norm": 2.953125, "learning_rate": 7.417607640497697e-06, "loss": 0.23653111457824708, "step": 4715, "token_acc": 0.9067353067353068 }, { "epoch": 0.6046630796822957, "grad_norm": 4.40625, "learning_rate": 7.397147954804771e-06, "loss": 0.23970022201538085, "step": 4720, "token_acc": 0.9069306076680899 }, { "epoch": 0.6053036126056879, "grad_norm": 5.3125, "learning_rate": 7.376699949755176e-06, "loss": 0.2359128475189209, "step": 4725, "token_acc": 0.9068213176957571 }, { "epoch": 0.6059441455290802, "grad_norm": 3.3125, "learning_rate": 7.356263717112047e-06, "loss": 0.23450722694396972, "step": 4730, "token_acc": 0.9093179469514295 }, { "epoch": 0.6065846784524724, "grad_norm": 3.03125, "learning_rate": 7.335839348585676e-06, "loss": 0.23415303230285645, "step": 4735, "token_acc": 0.9075492812257898 }, { "epoch": 0.6072252113758647, "grad_norm": 2.921875, "learning_rate": 7.315426935833135e-06, "loss": 0.22811522483825683, "step": 4740, "token_acc": 0.9106996417627001 }, { "epoch": 0.607865744299257, "grad_norm": 3.453125, "learning_rate": 7.29502657045782e-06, "loss": 0.23572731018066406, "step": 4745, "token_acc": 0.9084832017941088 }, { "epoch": 0.6085062772226493, "grad_norm": 2.921875, "learning_rate": 7.274638344009079e-06, "loss": 0.22873611450195314, "step": 4750, "token_acc": 0.9101705895055063 }, { "epoch": 0.6091468101460416, "grad_norm": 2.953125, "learning_rate": 7.254262347981777e-06, "loss": 0.23314647674560546, "step": 4755, "token_acc": 0.9090830933241628 }, { "epoch": 0.6097873430694337, "grad_norm": 3.078125, "learning_rate": 7.233898673815891e-06, "loss": 0.2401879072189331, "step": 4760, "token_acc": 0.9065476960213802 }, { "epoch": 0.610427875992826, "grad_norm": 4.625, "learning_rate": 7.213547412896116e-06, "loss": 0.23366336822509765, "step": 4765, "token_acc": 0.9075221619760737 }, { "epoch": 0.6110684089162183, "grad_norm": 2.921875, "learning_rate": 7.193208656551419e-06, "loss": 0.22110800743103026, "step": 4770, "token_acc": 0.9112323547241707 }, { "epoch": 0.6117089418396106, "grad_norm": 3.328125, "learning_rate": 7.172882496054675e-06, "loss": 0.22980756759643556, "step": 4775, "token_acc": 0.9108833830587625 }, { "epoch": 0.6123494747630028, "grad_norm": 8.6875, "learning_rate": 7.152569022622213e-06, "loss": 0.238081693649292, "step": 4780, "token_acc": 0.9061473283762753 }, { "epoch": 0.6129900076863951, "grad_norm": 3.03125, "learning_rate": 7.1322683274134405e-06, "loss": 0.23080739974975586, "step": 4785, "token_acc": 0.9094276239286792 }, { "epoch": 0.6136305406097874, "grad_norm": 3.65625, "learning_rate": 7.111980501530413e-06, "loss": 0.23122069835662842, "step": 4790, "token_acc": 0.9078907501190322 }, { "epoch": 0.6142710735331796, "grad_norm": 3.03125, "learning_rate": 7.091705636017443e-06, "loss": 0.23598337173461914, "step": 4795, "token_acc": 0.9072490063936409 }, { "epoch": 0.6149116064565718, "grad_norm": 5.1875, "learning_rate": 7.071443821860664e-06, "loss": 0.23058700561523438, "step": 4800, "token_acc": 0.9092871637666767 }, { "epoch": 0.6149116064565718, "eval_loss": 0.33330273628234863, "eval_runtime": 103.0939, "eval_samples_per_second": 96.999, "eval_steps_per_second": 12.125, "eval_token_acc": 0.882147266777771, "step": 4800 }, { "epoch": 0.6155521393799641, "grad_norm": 3.4375, "learning_rate": 7.051195149987662e-06, "loss": 0.23541276454925536, "step": 4805, "token_acc": 0.9070878340577527 }, { "epoch": 0.6161926723033564, "grad_norm": 3.578125, "learning_rate": 7.030959711267026e-06, "loss": 0.24111108779907225, "step": 4810, "token_acc": 0.9057868281995103 }, { "epoch": 0.6168332052267487, "grad_norm": 2.75, "learning_rate": 7.010737596507975e-06, "loss": 0.2280668020248413, "step": 4815, "token_acc": 0.9101400414937759 }, { "epoch": 0.617473738150141, "grad_norm": 3.171875, "learning_rate": 6.990528896459922e-06, "loss": 0.23039546012878417, "step": 4820, "token_acc": 0.9103489771359807 }, { "epoch": 0.6181142710735332, "grad_norm": 3.328125, "learning_rate": 6.9703337018120845e-06, "loss": 0.233514666557312, "step": 4825, "token_acc": 0.9083129058616093 }, { "epoch": 0.6187548039969254, "grad_norm": 4.4375, "learning_rate": 6.9501521031930816e-06, "loss": 0.23697190284729003, "step": 4830, "token_acc": 0.9069807427785419 }, { "epoch": 0.6193953369203177, "grad_norm": 2.953125, "learning_rate": 6.9299841911705e-06, "loss": 0.23227353096008302, "step": 4835, "token_acc": 0.9092987147416545 }, { "epoch": 0.62003586984371, "grad_norm": 3.5625, "learning_rate": 6.909830056250527e-06, "loss": 0.23467817306518554, "step": 4840, "token_acc": 0.909126180109497 }, { "epoch": 0.6206764027671022, "grad_norm": 3.125, "learning_rate": 6.889689788877505e-06, "loss": 0.22795021533966064, "step": 4845, "token_acc": 0.9109432333261386 }, { "epoch": 0.6213169356904945, "grad_norm": 2.609375, "learning_rate": 6.869563479433555e-06, "loss": 0.23201088905334472, "step": 4850, "token_acc": 0.9089618990281242 }, { "epoch": 0.6219574686138868, "grad_norm": 2.875, "learning_rate": 6.849451218238152e-06, "loss": 0.23549177646636962, "step": 4855, "token_acc": 0.9081416921948483 }, { "epoch": 0.6225980015372791, "grad_norm": 2.75, "learning_rate": 6.82935309554774e-06, "loss": 0.22994422912597656, "step": 4860, "token_acc": 0.9110630942091617 }, { "epoch": 0.6232385344606712, "grad_norm": 3.03125, "learning_rate": 6.8092692015552984e-06, "loss": 0.22758188247680664, "step": 4865, "token_acc": 0.9083592938733126 }, { "epoch": 0.6238790673840635, "grad_norm": 3.375, "learning_rate": 6.789199626389971e-06, "loss": 0.22297306060791017, "step": 4870, "token_acc": 0.913397067093481 }, { "epoch": 0.6245196003074558, "grad_norm": 3.5, "learning_rate": 6.7691444601166255e-06, "loss": 0.2313997268676758, "step": 4875, "token_acc": 0.9092045160734293 }, { "epoch": 0.6251601332308481, "grad_norm": 4.125, "learning_rate": 6.749103792735481e-06, "loss": 0.236191987991333, "step": 4880, "token_acc": 0.9096788100883811 }, { "epoch": 0.6258006661542403, "grad_norm": 2.984375, "learning_rate": 6.729077714181692e-06, "loss": 0.2335993766784668, "step": 4885, "token_acc": 0.9090713486530683 }, { "epoch": 0.6264411990776326, "grad_norm": 2.671875, "learning_rate": 6.709066314324929e-06, "loss": 0.23459205627441407, "step": 4890, "token_acc": 0.9073938032064301 }, { "epoch": 0.6270817320010249, "grad_norm": 3.359375, "learning_rate": 6.689069682969009e-06, "loss": 0.2288151502609253, "step": 4895, "token_acc": 0.9099611901681759 }, { "epoch": 0.6277222649244171, "grad_norm": 3.203125, "learning_rate": 6.669087909851459e-06, "loss": 0.23342595100402833, "step": 4900, "token_acc": 0.9083329752030599 }, { "epoch": 0.6277222649244171, "eval_loss": 0.33611026406288147, "eval_runtime": 102.7976, "eval_samples_per_second": 97.278, "eval_steps_per_second": 12.16, "eval_token_acc": 0.8819313330860247, "step": 4900 }, { "epoch": 0.6283627978478094, "grad_norm": 9.75, "learning_rate": 6.649121084643133e-06, "loss": 0.2269826889038086, "step": 4905, "token_acc": 0.9081998359169221 }, { "epoch": 0.6290033307712016, "grad_norm": 3.265625, "learning_rate": 6.629169296947804e-06, "loss": 0.2403498649597168, "step": 4910, "token_acc": 0.9052165312002752 }, { "epoch": 0.6296438636945939, "grad_norm": 3.234375, "learning_rate": 6.6092326363017635e-06, "loss": 0.23246257305145263, "step": 4915, "token_acc": 0.9084171289875174 }, { "epoch": 0.6302843966179862, "grad_norm": 3.109375, "learning_rate": 6.589311192173414e-06, "loss": 0.228167724609375, "step": 4920, "token_acc": 0.910772955213557 }, { "epoch": 0.6309249295413785, "grad_norm": 9.5, "learning_rate": 6.5694050539628805e-06, "loss": 0.2342754125595093, "step": 4925, "token_acc": 0.9082616179001721 }, { "epoch": 0.6315654624647707, "grad_norm": 3.15625, "learning_rate": 6.549514311001587e-06, "loss": 0.23288652896881104, "step": 4930, "token_acc": 0.9084992673045427 }, { "epoch": 0.6322059953881629, "grad_norm": 7.25, "learning_rate": 6.529639052551886e-06, "loss": 0.23185653686523439, "step": 4935, "token_acc": 0.911041091160221 }, { "epoch": 0.6328465283115552, "grad_norm": 3.1875, "learning_rate": 6.509779367806625e-06, "loss": 0.23133904933929444, "step": 4940, "token_acc": 0.9111350884764782 }, { "epoch": 0.6334870612349475, "grad_norm": 4.3125, "learning_rate": 6.489935345888774e-06, "loss": 0.22587313652038574, "step": 4945, "token_acc": 0.9098948272161408 }, { "epoch": 0.6341275941583397, "grad_norm": 3.109375, "learning_rate": 6.470107075851011e-06, "loss": 0.2315220832824707, "step": 4950, "token_acc": 0.9086874084288546 }, { "epoch": 0.634768127081732, "grad_norm": 2.640625, "learning_rate": 6.450294646675319e-06, "loss": 0.22459986209869384, "step": 4955, "token_acc": 0.9098201578470695 }, { "epoch": 0.6354086600051243, "grad_norm": 2.796875, "learning_rate": 6.430498147272607e-06, "loss": 0.2365894317626953, "step": 4960, "token_acc": 0.9069646344109351 }, { "epoch": 0.6360491929285165, "grad_norm": 7.15625, "learning_rate": 6.41071766648228e-06, "loss": 0.2363147735595703, "step": 4965, "token_acc": 0.9058361730578441 }, { "epoch": 0.6366897258519087, "grad_norm": 2.875, "learning_rate": 6.390953293071871e-06, "loss": 0.22636122703552247, "step": 4970, "token_acc": 0.9104109944249967 }, { "epoch": 0.637330258775301, "grad_norm": 3.265625, "learning_rate": 6.371205115736618e-06, "loss": 0.22853505611419678, "step": 4975, "token_acc": 0.9105326667815894 }, { "epoch": 0.6379707916986933, "grad_norm": 4.125, "learning_rate": 6.351473223099089e-06, "loss": 0.23797154426574707, "step": 4980, "token_acc": 0.9067096774193548 }, { "epoch": 0.6386113246220856, "grad_norm": 2.78125, "learning_rate": 6.33175770370876e-06, "loss": 0.23546228408813477, "step": 4985, "token_acc": 0.9074697754749568 }, { "epoch": 0.6392518575454779, "grad_norm": 5.90625, "learning_rate": 6.3120586460416454e-06, "loss": 0.22152302265167237, "step": 4990, "token_acc": 0.9129477772982305 }, { "epoch": 0.6398923904688701, "grad_norm": 3.0625, "learning_rate": 6.292376138499865e-06, "loss": 0.23244686126708985, "step": 4995, "token_acc": 0.9084896688856229 }, { "epoch": 0.6405329233922623, "grad_norm": 4.4375, "learning_rate": 6.272710269411286e-06, "loss": 0.2383200168609619, "step": 5000, "token_acc": 0.9069365908404196 }, { "epoch": 0.6405329233922623, "eval_loss": 0.3339642584323883, "eval_runtime": 102.7379, "eval_samples_per_second": 97.335, "eval_steps_per_second": 12.167, "eval_token_acc": 0.8819368698473515, "step": 5000 }, { "epoch": 0.6411734563156546, "grad_norm": 3.203125, "learning_rate": 6.2530611270290935e-06, "loss": 0.22576665878295898, "step": 5005, "token_acc": 0.9115372986048119 }, { "epoch": 0.6418139892390469, "grad_norm": 3.078125, "learning_rate": 6.23342879953142e-06, "loss": 0.23447873592376708, "step": 5010, "token_acc": 0.9083354860931715 }, { "epoch": 0.6424545221624391, "grad_norm": 3.375, "learning_rate": 6.2138133750209425e-06, "loss": 0.223459792137146, "step": 5015, "token_acc": 0.9112359550561798 }, { "epoch": 0.6430950550858314, "grad_norm": 2.828125, "learning_rate": 6.19421494152447e-06, "loss": 0.22827987670898436, "step": 5020, "token_acc": 0.9120494424755813 }, { "epoch": 0.6437355880092237, "grad_norm": 3.40625, "learning_rate": 6.174633586992569e-06, "loss": 0.22968311309814454, "step": 5025, "token_acc": 0.9102702236779727 }, { "epoch": 0.644376120932616, "grad_norm": 3.859375, "learning_rate": 6.155069399299163e-06, "loss": 0.23479781150817872, "step": 5030, "token_acc": 0.9084179721122396 }, { "epoch": 0.6450166538560081, "grad_norm": 2.890625, "learning_rate": 6.1355224662411375e-06, "loss": 0.2318052291870117, "step": 5035, "token_acc": 0.909961603175288 }, { "epoch": 0.6456571867794004, "grad_norm": 3.703125, "learning_rate": 6.115992875537937e-06, "loss": 0.23980298042297363, "step": 5040, "token_acc": 0.9071697134707637 }, { "epoch": 0.6462977197027927, "grad_norm": 3.40625, "learning_rate": 6.096480714831197e-06, "loss": 0.22896120548248292, "step": 5045, "token_acc": 0.9098480925254617 }, { "epoch": 0.646938252626185, "grad_norm": 3.015625, "learning_rate": 6.076986071684313e-06, "loss": 0.22948775291442872, "step": 5050, "token_acc": 0.9118307426597582 }, { "epoch": 0.6475787855495773, "grad_norm": 2.9375, "learning_rate": 6.057509033582087e-06, "loss": 0.23411431312561035, "step": 5055, "token_acc": 0.9089658138034831 }, { "epoch": 0.6482193184729695, "grad_norm": 3.234375, "learning_rate": 6.038049687930303e-06, "loss": 0.22734377384185792, "step": 5060, "token_acc": 0.9120784583620096 }, { "epoch": 0.6488598513963618, "grad_norm": 3.078125, "learning_rate": 6.018608122055352e-06, "loss": 0.21841344833374024, "step": 5065, "token_acc": 0.9142105036033314 }, { "epoch": 0.649500384319754, "grad_norm": 3.21875, "learning_rate": 5.9991844232038385e-06, "loss": 0.23631734848022462, "step": 5070, "token_acc": 0.9079987900263602 }, { "epoch": 0.6501409172431463, "grad_norm": 5.9375, "learning_rate": 5.9797786785421806e-06, "loss": 0.22841448783874513, "step": 5075, "token_acc": 0.9127186352839559 }, { "epoch": 0.6507814501665385, "grad_norm": 3.265625, "learning_rate": 5.960390975156234e-06, "loss": 0.2350531816482544, "step": 5080, "token_acc": 0.9096751930293749 }, { "epoch": 0.6514219830899308, "grad_norm": 3.15625, "learning_rate": 5.94102140005088e-06, "loss": 0.23367114067077638, "step": 5085, "token_acc": 0.9095145631067961 }, { "epoch": 0.6520625160133231, "grad_norm": 3.234375, "learning_rate": 5.921670040149655e-06, "loss": 0.2327101230621338, "step": 5090, "token_acc": 0.9080370942812983 }, { "epoch": 0.6527030489367154, "grad_norm": 4.0, "learning_rate": 5.902336982294346e-06, "loss": 0.22089247703552245, "step": 5095, "token_acc": 0.9123602296766394 }, { "epoch": 0.6533435818601077, "grad_norm": 5.75, "learning_rate": 5.88302231324462e-06, "loss": 0.23006696701049806, "step": 5100, "token_acc": 0.9100451710045171 }, { "epoch": 0.6533435818601077, "eval_loss": 0.3327247202396393, "eval_runtime": 103.2522, "eval_samples_per_second": 96.85, "eval_steps_per_second": 12.106, "eval_token_acc": 0.8824545570314101, "step": 5100 }, { "epoch": 0.6539841147834998, "grad_norm": 3.515625, "learning_rate": 5.863726119677602e-06, "loss": 0.23648326396942138, "step": 5105, "token_acc": 0.9088638125592213 }, { "epoch": 0.6546246477068921, "grad_norm": 2.65625, "learning_rate": 5.844448488187526e-06, "loss": 0.22581000328063966, "step": 5110, "token_acc": 0.9106820331985895 }, { "epoch": 0.6552651806302844, "grad_norm": 2.9375, "learning_rate": 5.825189505285308e-06, "loss": 0.2255998134613037, "step": 5115, "token_acc": 0.9123876210235131 }, { "epoch": 0.6559057135536767, "grad_norm": 5.0, "learning_rate": 5.805949257398195e-06, "loss": 0.23895587921142578, "step": 5120, "token_acc": 0.9072859041982932 }, { "epoch": 0.6565462464770689, "grad_norm": 2.9375, "learning_rate": 5.786727830869337e-06, "loss": 0.2289639711380005, "step": 5125, "token_acc": 0.9104838361603868 }, { "epoch": 0.6571867794004612, "grad_norm": 16.75, "learning_rate": 5.767525311957441e-06, "loss": 0.22975871562957764, "step": 5130, "token_acc": 0.9101954341058457 }, { "epoch": 0.6578273123238535, "grad_norm": 12.4375, "learning_rate": 5.748341786836353e-06, "loss": 0.23110666275024414, "step": 5135, "token_acc": 0.910392569978931 }, { "epoch": 0.6584678452472457, "grad_norm": 2.65625, "learning_rate": 5.729177341594674e-06, "loss": 0.23442704677581788, "step": 5140, "token_acc": 0.9096712966957122 }, { "epoch": 0.6591083781706379, "grad_norm": 2.828125, "learning_rate": 5.710032062235404e-06, "loss": 0.23014814853668214, "step": 5145, "token_acc": 0.9096832657288341 }, { "epoch": 0.6597489110940302, "grad_norm": 3.03125, "learning_rate": 5.690906034675505e-06, "loss": 0.2316150188446045, "step": 5150, "token_acc": 0.9095728632386535 }, { "epoch": 0.6603894440174225, "grad_norm": 2.796875, "learning_rate": 5.671799344745577e-06, "loss": 0.22539763450622557, "step": 5155, "token_acc": 0.9121212121212121 }, { "epoch": 0.6610299769408148, "grad_norm": 2.484375, "learning_rate": 5.652712078189408e-06, "loss": 0.23087406158447266, "step": 5160, "token_acc": 0.9089225734217552 }, { "epoch": 0.6616705098642071, "grad_norm": 3.453125, "learning_rate": 5.633644320663638e-06, "loss": 0.2334925651550293, "step": 5165, "token_acc": 0.9076453650780008 }, { "epoch": 0.6623110427875993, "grad_norm": 2.96875, "learning_rate": 5.614596157737357e-06, "loss": 0.22363200187683105, "step": 5170, "token_acc": 0.9122427805637212 }, { "epoch": 0.6629515757109915, "grad_norm": 3.640625, "learning_rate": 5.5955676748917195e-06, "loss": 0.2343050003051758, "step": 5175, "token_acc": 0.9070922598479613 }, { "epoch": 0.6635921086343838, "grad_norm": 3.484375, "learning_rate": 5.57655895751956e-06, "loss": 0.23191659450531005, "step": 5180, "token_acc": 0.9086604683195593 }, { "epoch": 0.6642326415577761, "grad_norm": 2.9375, "learning_rate": 5.557570090925019e-06, "loss": 0.22515347003936767, "step": 5185, "token_acc": 0.9112387698686939 }, { "epoch": 0.6648731744811683, "grad_norm": 3.828125, "learning_rate": 5.538601160323147e-06, "loss": 0.23082191944122316, "step": 5190, "token_acc": 0.9089143103820418 }, { "epoch": 0.6655137074045606, "grad_norm": 3.8125, "learning_rate": 5.519652250839537e-06, "loss": 0.22431740760803223, "step": 5195, "token_acc": 0.912159537272845 }, { "epoch": 0.6661542403279529, "grad_norm": 3.234375, "learning_rate": 5.500723447509925e-06, "loss": 0.23847784996032714, "step": 5200, "token_acc": 0.9073309241094476 }, { "epoch": 0.6661542403279529, "eval_loss": 0.3346344828605652, "eval_runtime": 103.736, "eval_samples_per_second": 96.399, "eval_steps_per_second": 12.05, "eval_token_acc": 0.8817237045362686, "step": 5200 }, { "epoch": 0.6667947732513452, "grad_norm": 3.265625, "learning_rate": 5.48181483527983e-06, "loss": 0.24246997833251954, "step": 5205, "token_acc": 0.9048132493746226 }, { "epoch": 0.6674353061747373, "grad_norm": 2.96875, "learning_rate": 5.462926499004148e-06, "loss": 0.23247838020324707, "step": 5210, "token_acc": 0.9081055404413352 }, { "epoch": 0.6680758390981296, "grad_norm": 4.0, "learning_rate": 5.4440585234467935e-06, "loss": 0.2290191411972046, "step": 5215, "token_acc": 0.9098120365580272 }, { "epoch": 0.6687163720215219, "grad_norm": 2.625, "learning_rate": 5.425210993280306e-06, "loss": 0.22439243793487548, "step": 5220, "token_acc": 0.9100359509680773 }, { "epoch": 0.6693569049449142, "grad_norm": 5.3125, "learning_rate": 5.406383993085471e-06, "loss": 0.22781476974487305, "step": 5225, "token_acc": 0.910941475826972 }, { "epoch": 0.6699974378683065, "grad_norm": 2.828125, "learning_rate": 5.387577607350951e-06, "loss": 0.2305924892425537, "step": 5230, "token_acc": 0.9094285837688421 }, { "epoch": 0.6706379707916987, "grad_norm": 4.125, "learning_rate": 5.368791920472884e-06, "loss": 0.2318443775177002, "step": 5235, "token_acc": 0.9093386392144989 }, { "epoch": 0.6712785037150909, "grad_norm": 3.84375, "learning_rate": 5.35002701675454e-06, "loss": 0.2296751022338867, "step": 5240, "token_acc": 0.9120812882114872 }, { "epoch": 0.6719190366384832, "grad_norm": 2.734375, "learning_rate": 5.331282980405896e-06, "loss": 0.2311159610748291, "step": 5245, "token_acc": 0.9103889922547704 }, { "epoch": 0.6725595695618755, "grad_norm": 4.03125, "learning_rate": 5.3125598955433145e-06, "loss": 0.23089895248413086, "step": 5250, "token_acc": 0.909507544640927 }, { "epoch": 0.6732001024852677, "grad_norm": 2.84375, "learning_rate": 5.293857846189108e-06, "loss": 0.23441662788391113, "step": 5255, "token_acc": 0.9084364357460016 }, { "epoch": 0.67384063540866, "grad_norm": 2.96875, "learning_rate": 5.275176916271197e-06, "loss": 0.2311511754989624, "step": 5260, "token_acc": 0.9103763417683322 }, { "epoch": 0.6744811683320523, "grad_norm": 2.734375, "learning_rate": 5.256517189622742e-06, "loss": 0.23376543521881105, "step": 5265, "token_acc": 0.9086750107898144 }, { "epoch": 0.6751217012554446, "grad_norm": 2.609375, "learning_rate": 5.237878749981724e-06, "loss": 0.22374820709228516, "step": 5270, "token_acc": 0.912248865845755 }, { "epoch": 0.6757622341788367, "grad_norm": 3.859375, "learning_rate": 5.219261680990624e-06, "loss": 0.22372374534606934, "step": 5275, "token_acc": 0.9098982583204 }, { "epoch": 0.676402767102229, "grad_norm": 3.84375, "learning_rate": 5.200666066195993e-06, "loss": 0.22683911323547362, "step": 5280, "token_acc": 0.9123762590239053 }, { "epoch": 0.6770433000256213, "grad_norm": 3.046875, "learning_rate": 5.182091989048121e-06, "loss": 0.22960472106933594, "step": 5285, "token_acc": 0.9087181700474752 }, { "epoch": 0.6776838329490136, "grad_norm": 3.21875, "learning_rate": 5.163539532900639e-06, "loss": 0.23558075428009034, "step": 5290, "token_acc": 0.9076750989502668 }, { "epoch": 0.6783243658724059, "grad_norm": 9.5, "learning_rate": 5.14500878101015e-06, "loss": 0.23191981315612792, "step": 5295, "token_acc": 0.9099460625674218 }, { "epoch": 0.6789648987957981, "grad_norm": 2.765625, "learning_rate": 5.126499816535861e-06, "loss": 0.22278881072998047, "step": 5300, "token_acc": 0.9129082426127527 }, { "epoch": 0.6789648987957981, "eval_loss": 0.3326459527015686, "eval_runtime": 102.5695, "eval_samples_per_second": 97.495, "eval_steps_per_second": 12.187, "eval_token_acc": 0.8820392999318979, "step": 5300 }, { "epoch": 0.6796054317191904, "grad_norm": 5.0, "learning_rate": 5.108012722539199e-06, "loss": 0.22774300575256348, "step": 5305, "token_acc": 0.910229284511421 }, { "epoch": 0.6802459646425826, "grad_norm": 4.5, "learning_rate": 5.0895475819834474e-06, "loss": 0.23403663635253907, "step": 5310, "token_acc": 0.9082355973707952 }, { "epoch": 0.6808864975659749, "grad_norm": 4.25, "learning_rate": 5.071104477733372e-06, "loss": 0.23252689838409424, "step": 5315, "token_acc": 0.9085381630012936 }, { "epoch": 0.6815270304893671, "grad_norm": 2.96875, "learning_rate": 5.052683492554844e-06, "loss": 0.23012104034423828, "step": 5320, "token_acc": 0.9094350987394054 }, { "epoch": 0.6821675634127594, "grad_norm": 5.03125, "learning_rate": 5.034284709114476e-06, "loss": 0.2321260929107666, "step": 5325, "token_acc": 0.9089814695386732 }, { "epoch": 0.6828080963361517, "grad_norm": 7.90625, "learning_rate": 5.0159082099792465e-06, "loss": 0.22481832504272461, "step": 5330, "token_acc": 0.9132952973720608 }, { "epoch": 0.683448629259544, "grad_norm": 2.890625, "learning_rate": 4.997554077616128e-06, "loss": 0.2297644853591919, "step": 5335, "token_acc": 0.9089265731255918 }, { "epoch": 0.6840891621829363, "grad_norm": 3.34375, "learning_rate": 4.979222394391721e-06, "loss": 0.22588052749633789, "step": 5340, "token_acc": 0.911449325492909 }, { "epoch": 0.6847296951063284, "grad_norm": 3.515625, "learning_rate": 4.960913242571882e-06, "loss": 0.22864861488342286, "step": 5345, "token_acc": 0.9100142014890046 }, { "epoch": 0.6853702280297207, "grad_norm": 2.953125, "learning_rate": 4.9426267043213594e-06, "loss": 0.23536896705627441, "step": 5350, "token_acc": 0.9080583865952668 }, { "epoch": 0.686010760953113, "grad_norm": 2.96875, "learning_rate": 4.924362861703405e-06, "loss": 0.22937750816345215, "step": 5355, "token_acc": 0.9104786545924968 }, { "epoch": 0.6866512938765053, "grad_norm": 2.953125, "learning_rate": 4.906121796679445e-06, "loss": 0.2339865207672119, "step": 5360, "token_acc": 0.9082639996551278 }, { "epoch": 0.6872918267998975, "grad_norm": 3.046875, "learning_rate": 4.887903591108663e-06, "loss": 0.23555207252502441, "step": 5365, "token_acc": 0.9080790717662226 }, { "epoch": 0.6879323597232898, "grad_norm": 3.546875, "learning_rate": 4.869708326747681e-06, "loss": 0.2278905391693115, "step": 5370, "token_acc": 0.9106388481765669 }, { "epoch": 0.6885728926466821, "grad_norm": 3.671875, "learning_rate": 4.8515360852501496e-06, "loss": 0.22571067810058593, "step": 5375, "token_acc": 0.9102165846923808 }, { "epoch": 0.6892134255700743, "grad_norm": 6.8125, "learning_rate": 4.833386948166409e-06, "loss": 0.23547790050506592, "step": 5380, "token_acc": 0.9070218543902755 }, { "epoch": 0.6898539584934665, "grad_norm": 6.1875, "learning_rate": 4.815260996943126e-06, "loss": 0.23141322135925294, "step": 5385, "token_acc": 0.9082988267770876 }, { "epoch": 0.6904944914168588, "grad_norm": 4.125, "learning_rate": 4.797158312922895e-06, "loss": 0.2272815227508545, "step": 5390, "token_acc": 0.9105712070302404 }, { "epoch": 0.6911350243402511, "grad_norm": 3.671875, "learning_rate": 4.779078977343922e-06, "loss": 0.22905888557434081, "step": 5395, "token_acc": 0.9104548394050442 }, { "epoch": 0.6917755572636434, "grad_norm": 3.46875, "learning_rate": 4.761023071339608e-06, "loss": 0.22437114715576173, "step": 5400, "token_acc": 0.9122693567856527 }, { "epoch": 0.6917755572636434, "eval_loss": 0.3339126706123352, "eval_runtime": 103.3301, "eval_samples_per_second": 96.777, "eval_steps_per_second": 12.097, "eval_token_acc": 0.8821528035390979, "step": 5400 }, { "epoch": 0.6924160901870356, "grad_norm": 7.34375, "learning_rate": 4.742990675938228e-06, "loss": 0.22792973518371581, "step": 5405, "token_acc": 0.9097371822490306 }, { "epoch": 0.6930566231104279, "grad_norm": 3.109375, "learning_rate": 4.724981872062545e-06, "loss": 0.22467894554138185, "step": 5410, "token_acc": 0.9115411195577056 }, { "epoch": 0.6936971560338201, "grad_norm": 4.875, "learning_rate": 4.706996740529453e-06, "loss": 0.22711763381958008, "step": 5415, "token_acc": 0.9103552206673843 }, { "epoch": 0.6943376889572124, "grad_norm": 3.546875, "learning_rate": 4.689035362049609e-06, "loss": 0.22871413230895996, "step": 5420, "token_acc": 0.9115798536375377 }, { "epoch": 0.6949782218806047, "grad_norm": 3.203125, "learning_rate": 4.6710978172270794e-06, "loss": 0.22836050987243653, "step": 5425, "token_acc": 0.9107814729922588 }, { "epoch": 0.6956187548039969, "grad_norm": 3.265625, "learning_rate": 4.653184186558975e-06, "loss": 0.22787034511566162, "step": 5430, "token_acc": 0.9093102408340873 }, { "epoch": 0.6962592877273892, "grad_norm": 5.4375, "learning_rate": 4.635294550435086e-06, "loss": 0.21838183403015138, "step": 5435, "token_acc": 0.9151326592342927 }, { "epoch": 0.6968998206507815, "grad_norm": 3.875, "learning_rate": 4.617428989137517e-06, "loss": 0.2287057876586914, "step": 5440, "token_acc": 0.9102171191781413 }, { "epoch": 0.6975403535741738, "grad_norm": 4.0625, "learning_rate": 4.599587582840349e-06, "loss": 0.23020198345184326, "step": 5445, "token_acc": 0.9096006210644354 }, { "epoch": 0.6981808864975659, "grad_norm": 2.8125, "learning_rate": 4.581770411609254e-06, "loss": 0.22472758293151857, "step": 5450, "token_acc": 0.9107643229727982 }, { "epoch": 0.6988214194209582, "grad_norm": 3.546875, "learning_rate": 4.563977555401148e-06, "loss": 0.22312564849853517, "step": 5455, "token_acc": 0.9104580812445981 }, { "epoch": 0.6994619523443505, "grad_norm": 4.21875, "learning_rate": 4.546209094063829e-06, "loss": 0.23030247688293456, "step": 5460, "token_acc": 0.9101837837837837 }, { "epoch": 0.7001024852677428, "grad_norm": 3.359375, "learning_rate": 4.528465107335621e-06, "loss": 0.22946977615356445, "step": 5465, "token_acc": 0.9094075156935248 }, { "epoch": 0.700743018191135, "grad_norm": 4.90625, "learning_rate": 4.5107456748450206e-06, "loss": 0.23560161590576173, "step": 5470, "token_acc": 0.9065336143490043 }, { "epoch": 0.7013835511145273, "grad_norm": 3.3125, "learning_rate": 4.4930508761103145e-06, "loss": 0.23189268112182618, "step": 5475, "token_acc": 0.909141750914175 }, { "epoch": 0.7020240840379196, "grad_norm": 5.46875, "learning_rate": 4.475380790539272e-06, "loss": 0.2286592483520508, "step": 5480, "token_acc": 0.9110030970406057 }, { "epoch": 0.7026646169613118, "grad_norm": 3.328125, "learning_rate": 4.457735497428728e-06, "loss": 0.22808377742767333, "step": 5485, "token_acc": 0.9114810810810811 }, { "epoch": 0.703305149884704, "grad_norm": 3.484375, "learning_rate": 4.4401150759642875e-06, "loss": 0.22788479328155517, "step": 5490, "token_acc": 0.9103065034297126 }, { "epoch": 0.7039456828080963, "grad_norm": 3.71875, "learning_rate": 4.422519605219914e-06, "loss": 0.23326406478881836, "step": 5495, "token_acc": 0.9091496232508073 }, { "epoch": 0.7045862157314886, "grad_norm": 3.828125, "learning_rate": 4.404949164157617e-06, "loss": 0.23126420974731446, "step": 5500, "token_acc": 0.9086606720302887 }, { "epoch": 0.7045862157314886, "eval_loss": 0.3346463441848755, "eval_runtime": 103.293, "eval_samples_per_second": 96.812, "eval_steps_per_second": 12.101, "eval_token_acc": 0.8824379467474296, "step": 5500 }, { "epoch": 0.7052267486548809, "grad_norm": 3.03125, "learning_rate": 4.387403831627079e-06, "loss": 0.22369828224182128, "step": 5505, "token_acc": 0.9125932956555503 }, { "epoch": 0.7058672815782732, "grad_norm": 3.46875, "learning_rate": 4.3698836863653005e-06, "loss": 0.23041157722473143, "step": 5510, "token_acc": 0.9091222179172586 }, { "epoch": 0.7065078145016653, "grad_norm": 14.625, "learning_rate": 4.352388806996263e-06, "loss": 0.2362978458404541, "step": 5515, "token_acc": 0.9075854931217388 }, { "epoch": 0.7071483474250576, "grad_norm": 2.578125, "learning_rate": 4.334919272030547e-06, "loss": 0.23041419982910155, "step": 5520, "token_acc": 0.909314147854558 }, { "epoch": 0.7077888803484499, "grad_norm": 2.96875, "learning_rate": 4.317475159865005e-06, "loss": 0.22999229431152343, "step": 5525, "token_acc": 0.9102149842746974 }, { "epoch": 0.7084294132718422, "grad_norm": 3.0625, "learning_rate": 4.300056548782404e-06, "loss": 0.22720894813537598, "step": 5530, "token_acc": 0.9110218883564305 }, { "epoch": 0.7090699461952344, "grad_norm": 2.96875, "learning_rate": 4.282663516951068e-06, "loss": 0.23367710113525392, "step": 5535, "token_acc": 0.9050570260383043 }, { "epoch": 0.7097104791186267, "grad_norm": 3.0, "learning_rate": 4.265296142424529e-06, "loss": 0.22929010391235352, "step": 5540, "token_acc": 0.9078120967048094 }, { "epoch": 0.710351012042019, "grad_norm": 4.03125, "learning_rate": 4.247954503141183e-06, "loss": 0.2340301513671875, "step": 5545, "token_acc": 0.9078794652452392 }, { "epoch": 0.7109915449654112, "grad_norm": 5.09375, "learning_rate": 4.230638676923932e-06, "loss": 0.2315293788909912, "step": 5550, "token_acc": 0.9097634408602151 }, { "epoch": 0.7116320778888034, "grad_norm": 3.5625, "learning_rate": 4.213348741479847e-06, "loss": 0.22180113792419434, "step": 5555, "token_acc": 0.9152183311716385 }, { "epoch": 0.7122726108121957, "grad_norm": 4.53125, "learning_rate": 4.196084774399788e-06, "loss": 0.2180586576461792, "step": 5560, "token_acc": 0.913914992671782 }, { "epoch": 0.712913143735588, "grad_norm": 11.0, "learning_rate": 4.1788468531581065e-06, "loss": 0.23073256015777588, "step": 5565, "token_acc": 0.9099750408813151 }, { "epoch": 0.7135536766589803, "grad_norm": 5.15625, "learning_rate": 4.161635055112254e-06, "loss": 0.2295978307723999, "step": 5570, "token_acc": 0.9100305784056161 }, { "epoch": 0.7141942095823726, "grad_norm": 2.84375, "learning_rate": 4.1444494575024555e-06, "loss": 0.23021929264068602, "step": 5575, "token_acc": 0.9086412318809411 }, { "epoch": 0.7148347425057648, "grad_norm": 3.078125, "learning_rate": 4.1272901374513555e-06, "loss": 0.23160245418548583, "step": 5580, "token_acc": 0.9093453919035315 }, { "epoch": 0.715475275429157, "grad_norm": 3.34375, "learning_rate": 4.110157171963674e-06, "loss": 0.22630250453948975, "step": 5585, "token_acc": 0.9112262521588946 }, { "epoch": 0.7161158083525493, "grad_norm": 5.09375, "learning_rate": 4.093050637925871e-06, "loss": 0.22265501022338868, "step": 5590, "token_acc": 0.9139391854113802 }, { "epoch": 0.7167563412759416, "grad_norm": 3.546875, "learning_rate": 4.07597061210577e-06, "loss": 0.22470180988311766, "step": 5595, "token_acc": 0.911774876804703 }, { "epoch": 0.7173968741993338, "grad_norm": 2.953125, "learning_rate": 4.0589171711522626e-06, "loss": 0.238523530960083, "step": 5600, "token_acc": 0.9068379005240099 }, { "epoch": 0.7173968741993338, "eval_loss": 0.33452367782592773, "eval_runtime": 102.4933, "eval_samples_per_second": 97.567, "eval_steps_per_second": 12.196, "eval_token_acc": 0.882180487345732, "step": 5600 }, { "epoch": 0.7180374071227261, "grad_norm": 2.859375, "learning_rate": 4.0418903915949125e-06, "loss": 0.22467451095581054, "step": 5605, "token_acc": 0.9131561892417369 }, { "epoch": 0.7186779400461184, "grad_norm": 4.03125, "learning_rate": 4.0248903498436624e-06, "loss": 0.22909164428710938, "step": 5610, "token_acc": 0.9093295464325674 }, { "epoch": 0.7193184729695107, "grad_norm": 3.171875, "learning_rate": 4.007917122188438e-06, "loss": 0.22771682739257812, "step": 5615, "token_acc": 0.9116303129580137 }, { "epoch": 0.7199590058929028, "grad_norm": 2.921875, "learning_rate": 3.990970784798854e-06, "loss": 0.23022587299346925, "step": 5620, "token_acc": 0.9101649768001375 }, { "epoch": 0.7205995388162951, "grad_norm": 3.28125, "learning_rate": 3.974051413723842e-06, "loss": 0.23161954879760743, "step": 5625, "token_acc": 0.9094234079173839 }, { "epoch": 0.7212400717396874, "grad_norm": 4.90625, "learning_rate": 3.957159084891318e-06, "loss": 0.23545317649841307, "step": 5630, "token_acc": 0.908342315154128 }, { "epoch": 0.7218806046630797, "grad_norm": 3.890625, "learning_rate": 3.940293874107854e-06, "loss": 0.2253598690032959, "step": 5635, "token_acc": 0.9121528376746593 }, { "epoch": 0.722521137586472, "grad_norm": 3.546875, "learning_rate": 3.923455857058311e-06, "loss": 0.2275296449661255, "step": 5640, "token_acc": 0.9096149709614971 }, { "epoch": 0.7231616705098642, "grad_norm": 2.828125, "learning_rate": 3.906645109305521e-06, "loss": 0.23534011840820312, "step": 5645, "token_acc": 0.908305245873304 }, { "epoch": 0.7238022034332565, "grad_norm": 3.0625, "learning_rate": 3.88986170628994e-06, "loss": 0.23898892402648925, "step": 5650, "token_acc": 0.9062513444908145 }, { "epoch": 0.7244427363566487, "grad_norm": 3.109375, "learning_rate": 3.873105723329317e-06, "loss": 0.23146333694458007, "step": 5655, "token_acc": 0.908890330953926 }, { "epoch": 0.725083269280041, "grad_norm": 5.375, "learning_rate": 3.856377235618341e-06, "loss": 0.24037771224975585, "step": 5660, "token_acc": 0.9058757646247954 }, { "epoch": 0.7257238022034332, "grad_norm": 5.6875, "learning_rate": 3.839676318228319e-06, "loss": 0.2313528299331665, "step": 5665, "token_acc": 0.9094474614257392 }, { "epoch": 0.7263643351268255, "grad_norm": 3.90625, "learning_rate": 3.823003046106828e-06, "loss": 0.23002188205718993, "step": 5670, "token_acc": 0.9098721713594748 }, { "epoch": 0.7270048680502178, "grad_norm": 3.421875, "learning_rate": 3.8063574940773907e-06, "loss": 0.2305138111114502, "step": 5675, "token_acc": 0.9112459129237652 }, { "epoch": 0.7276454009736101, "grad_norm": 7.6875, "learning_rate": 3.789739736839114e-06, "loss": 0.2200489044189453, "step": 5680, "token_acc": 0.9114310270734852 }, { "epoch": 0.7282859338970024, "grad_norm": 3.109375, "learning_rate": 3.773149848966401e-06, "loss": 0.22987012863159179, "step": 5685, "token_acc": 0.9078311172509066 }, { "epoch": 0.7289264668203945, "grad_norm": 3.15625, "learning_rate": 3.7565879049085562e-06, "loss": 0.22706859111785888, "step": 5690, "token_acc": 0.9124580573001807 }, { "epoch": 0.7295669997437868, "grad_norm": 3.859375, "learning_rate": 3.7400539789895074e-06, "loss": 0.23126349449157715, "step": 5695, "token_acc": 0.9095687389599759 }, { "epoch": 0.7302075326671791, "grad_norm": 3.125, "learning_rate": 3.7235481454074373e-06, "loss": 0.2237870693206787, "step": 5700, "token_acc": 0.9137209201950882 }, { "epoch": 0.7302075326671791, "eval_loss": 0.33403199911117554, "eval_runtime": 103.444, "eval_samples_per_second": 96.671, "eval_steps_per_second": 12.084, "eval_token_acc": 0.8826123547292247, "step": 5700 }, { "epoch": 0.7308480655905714, "grad_norm": 3.34375, "learning_rate": 3.70707047823445e-06, "loss": 0.22239408493041993, "step": 5705, "token_acc": 0.9109814094249892 }, { "epoch": 0.7314885985139636, "grad_norm": 3.09375, "learning_rate": 3.6906210514162744e-06, "loss": 0.23225040435791017, "step": 5710, "token_acc": 0.9099184606756116 }, { "epoch": 0.7321291314373559, "grad_norm": 3.671875, "learning_rate": 3.6741999387718773e-06, "loss": 0.2249077320098877, "step": 5715, "token_acc": 0.9111187815506753 }, { "epoch": 0.7327696643607482, "grad_norm": 3.953125, "learning_rate": 3.657807213993192e-06, "loss": 0.2272716522216797, "step": 5720, "token_acc": 0.9099200345796412 }, { "epoch": 0.7334101972841404, "grad_norm": 2.875, "learning_rate": 3.641442950644728e-06, "loss": 0.22431583404541017, "step": 5725, "token_acc": 0.910762060930353 }, { "epoch": 0.7340507302075326, "grad_norm": 2.875, "learning_rate": 3.6251072221632978e-06, "loss": 0.2208378553390503, "step": 5730, "token_acc": 0.9123809523809524 }, { "epoch": 0.7346912631309249, "grad_norm": 4.09375, "learning_rate": 3.608800101857637e-06, "loss": 0.22057173252105713, "step": 5735, "token_acc": 0.9117215168005528 }, { "epoch": 0.7353317960543172, "grad_norm": 9.25, "learning_rate": 3.5925216629081116e-06, "loss": 0.2260368824005127, "step": 5740, "token_acc": 0.910849706997587 }, { "epoch": 0.7359723289777095, "grad_norm": 3.015625, "learning_rate": 3.5762719783663724e-06, "loss": 0.22467224597930907, "step": 5745, "token_acc": 0.9124238891048063 }, { "epoch": 0.7366128619011018, "grad_norm": 3.015625, "learning_rate": 3.5600511211550283e-06, "loss": 0.2277822256088257, "step": 5750, "token_acc": 0.9120300427331981 }, { "epoch": 0.7372533948244939, "grad_norm": 2.6875, "learning_rate": 3.5438591640673346e-06, "loss": 0.21924290657043458, "step": 5755, "token_acc": 0.9147353856796956 }, { "epoch": 0.7378939277478862, "grad_norm": 2.8125, "learning_rate": 3.527696179766833e-06, "loss": 0.229719877243042, "step": 5760, "token_acc": 0.9090753057283845 }, { "epoch": 0.7385344606712785, "grad_norm": 4.75, "learning_rate": 3.5115622407870607e-06, "loss": 0.22485427856445311, "step": 5765, "token_acc": 0.9123470045093408 }, { "epoch": 0.7391749935946708, "grad_norm": 3.6875, "learning_rate": 3.495457419531206e-06, "loss": 0.2279944896697998, "step": 5770, "token_acc": 0.9102829537612146 }, { "epoch": 0.739815526518063, "grad_norm": 3.53125, "learning_rate": 3.4793817882717863e-06, "loss": 0.22675998210906984, "step": 5775, "token_acc": 0.9114967836636014 }, { "epoch": 0.7404560594414553, "grad_norm": 2.90625, "learning_rate": 3.463335419150328e-06, "loss": 0.23147711753845215, "step": 5780, "token_acc": 0.908021712907117 }, { "epoch": 0.7410965923648476, "grad_norm": 4.8125, "learning_rate": 3.4473183841770364e-06, "loss": 0.22812228202819823, "step": 5785, "token_acc": 0.9115860226636219 }, { "epoch": 0.7417371252882398, "grad_norm": 3.421875, "learning_rate": 3.4313307552304785e-06, "loss": 0.22540197372436524, "step": 5790, "token_acc": 0.9123684664481628 }, { "epoch": 0.742377658211632, "grad_norm": 4.25, "learning_rate": 3.4153726040572612e-06, "loss": 0.23054356575012208, "step": 5795, "token_acc": 0.90987696808053 }, { "epoch": 0.7430181911350243, "grad_norm": 3.125, "learning_rate": 3.3994440022716902e-06, "loss": 0.2308722972869873, "step": 5800, "token_acc": 0.9087033288833384 }, { "epoch": 0.7430181911350243, "eval_loss": 0.33602866530418396, "eval_runtime": 106.6156, "eval_samples_per_second": 93.795, "eval_steps_per_second": 11.724, "eval_token_acc": 0.8821970976297124, "step": 5800 }, { "epoch": 0.7436587240584166, "grad_norm": 2.765625, "learning_rate": 3.3835450213554887e-06, "loss": 0.23508167266845703, "step": 5805, "token_acc": 0.9071099435855475 }, { "epoch": 0.7442992569818089, "grad_norm": 2.6875, "learning_rate": 3.3676757326574293e-06, "loss": 0.2318406581878662, "step": 5810, "token_acc": 0.9082233589820745 }, { "epoch": 0.7449397899052012, "grad_norm": 3.421875, "learning_rate": 3.351836207393054e-06, "loss": 0.2296595573425293, "step": 5815, "token_acc": 0.9098555100280353 }, { "epoch": 0.7455803228285934, "grad_norm": 3.453125, "learning_rate": 3.3360265166443316e-06, "loss": 0.2280057430267334, "step": 5820, "token_acc": 0.9113049486138699 }, { "epoch": 0.7462208557519856, "grad_norm": 4.875, "learning_rate": 3.3202467313593345e-06, "loss": 0.22925915718078613, "step": 5825, "token_acc": 0.9111722005068511 }, { "epoch": 0.7468613886753779, "grad_norm": 3.0, "learning_rate": 3.304496922351952e-06, "loss": 0.22095022201538086, "step": 5830, "token_acc": 0.9120665861652579 }, { "epoch": 0.7475019215987702, "grad_norm": 3.0625, "learning_rate": 3.2887771603015237e-06, "loss": 0.22771050930023193, "step": 5835, "token_acc": 0.9096476473886229 }, { "epoch": 0.7481424545221624, "grad_norm": 10.0625, "learning_rate": 3.273087515752579e-06, "loss": 0.23041772842407227, "step": 5840, "token_acc": 0.9116174693595719 }, { "epoch": 0.7487829874455547, "grad_norm": 3.75, "learning_rate": 3.2574280591144623e-06, "loss": 0.22076497077941895, "step": 5845, "token_acc": 0.9142450633784599 }, { "epoch": 0.749423520368947, "grad_norm": 3.953125, "learning_rate": 3.2417988606610738e-06, "loss": 0.2274242639541626, "step": 5850, "token_acc": 0.9110910575394268 }, { "epoch": 0.7500640532923393, "grad_norm": 3.71875, "learning_rate": 3.2261999905304996e-06, "loss": 0.23234589099884034, "step": 5855, "token_acc": 0.9092159559834938 }, { "epoch": 0.7507045862157314, "grad_norm": 2.90625, "learning_rate": 3.2106315187247417e-06, "loss": 0.2272249221801758, "step": 5860, "token_acc": 0.9115452624315349 }, { "epoch": 0.7513451191391237, "grad_norm": 3.421875, "learning_rate": 3.1950935151093778e-06, "loss": 0.23643298149108888, "step": 5865, "token_acc": 0.9063857235003225 }, { "epoch": 0.751985652062516, "grad_norm": 3.34375, "learning_rate": 3.179586049413257e-06, "loss": 0.23007550239562988, "step": 5870, "token_acc": 0.9103555536354603 }, { "epoch": 0.7526261849859083, "grad_norm": 3.46875, "learning_rate": 3.164109191228187e-06, "loss": 0.22181496620178223, "step": 5875, "token_acc": 0.912943921195887 }, { "epoch": 0.7532667179093006, "grad_norm": 2.703125, "learning_rate": 3.148663010008618e-06, "loss": 0.22368183135986328, "step": 5880, "token_acc": 0.9129305868097628 }, { "epoch": 0.7539072508326928, "grad_norm": 3.046875, "learning_rate": 3.1332475750713352e-06, "loss": 0.23119454383850097, "step": 5885, "token_acc": 0.9087276008766275 }, { "epoch": 0.7545477837560851, "grad_norm": 3.859375, "learning_rate": 3.1178629555951446e-06, "loss": 0.2248836040496826, "step": 5890, "token_acc": 0.9119996545619413 }, { "epoch": 0.7551883166794773, "grad_norm": 3.671875, "learning_rate": 3.1025092206205642e-06, "loss": 0.22220723628997802, "step": 5895, "token_acc": 0.9144963780614005 }, { "epoch": 0.7558288496028696, "grad_norm": 5.96875, "learning_rate": 3.087186439049512e-06, "loss": 0.23192427158355713, "step": 5900, "token_acc": 0.9082466248172671 }, { "epoch": 0.7558288496028696, "eval_loss": 0.3338736891746521, "eval_runtime": 103.6094, "eval_samples_per_second": 96.516, "eval_steps_per_second": 12.065, "eval_token_acc": 0.8824739356960539, "step": 5900 }, { "epoch": 0.7564693825262618, "grad_norm": 5.3125, "learning_rate": 3.0718946796450012e-06, "loss": 0.23041715621948242, "step": 5905, "token_acc": 0.9090869902577808 }, { "epoch": 0.7571099154496541, "grad_norm": 2.6875, "learning_rate": 3.056634011030828e-06, "loss": 0.23436269760131836, "step": 5910, "token_acc": 0.9085108217642494 }, { "epoch": 0.7577504483730464, "grad_norm": 2.96875, "learning_rate": 3.0414045016912673e-06, "loss": 0.22445986270904542, "step": 5915, "token_acc": 0.9102779573367809 }, { "epoch": 0.7583909812964387, "grad_norm": 2.90625, "learning_rate": 3.0262062199707486e-06, "loss": 0.22754263877868652, "step": 5920, "token_acc": 0.910606582801999 }, { "epoch": 0.759031514219831, "grad_norm": 3.515625, "learning_rate": 3.0110392340735892e-06, "loss": 0.2298940658569336, "step": 5925, "token_acc": 0.9089810539035864 }, { "epoch": 0.7596720471432231, "grad_norm": 3.875, "learning_rate": 2.995903612063634e-06, "loss": 0.22265000343322755, "step": 5930, "token_acc": 0.9121761658031088 }, { "epoch": 0.7603125800666154, "grad_norm": 3.09375, "learning_rate": 2.9807994218640035e-06, "loss": 0.22582578659057617, "step": 5935, "token_acc": 0.9107952827335954 }, { "epoch": 0.7609531129900077, "grad_norm": 2.859375, "learning_rate": 2.965726731256743e-06, "loss": 0.23047933578491211, "step": 5940, "token_acc": 0.9101230304338441 }, { "epoch": 0.7615936459134, "grad_norm": 3.15625, "learning_rate": 2.9506856078825473e-06, "loss": 0.22990131378173828, "step": 5945, "token_acc": 0.9090360926867086 }, { "epoch": 0.7622341788367922, "grad_norm": 2.6875, "learning_rate": 2.9356761192404616e-06, "loss": 0.23607187271118163, "step": 5950, "token_acc": 0.905852417302799 }, { "epoch": 0.7628747117601845, "grad_norm": 3.515625, "learning_rate": 2.9206983326875393e-06, "loss": 0.22556428909301757, "step": 5955, "token_acc": 0.9103385965667082 }, { "epoch": 0.7635152446835768, "grad_norm": 2.671875, "learning_rate": 2.905752315438596e-06, "loss": 0.22193589210510253, "step": 5960, "token_acc": 0.9134333505776858 }, { "epoch": 0.764155777606969, "grad_norm": 8.5625, "learning_rate": 2.8908381345658497e-06, "loss": 0.22921185493469237, "step": 5965, "token_acc": 0.9122292224044187 }, { "epoch": 0.7647963105303612, "grad_norm": 3.03125, "learning_rate": 2.875955856998677e-06, "loss": 0.2280503749847412, "step": 5970, "token_acc": 0.9099413692015865 }, { "epoch": 0.7654368434537535, "grad_norm": 3.3125, "learning_rate": 2.8611055495232585e-06, "loss": 0.2285156488418579, "step": 5975, "token_acc": 0.9090477833362084 }, { "epoch": 0.7660773763771458, "grad_norm": 3.328125, "learning_rate": 2.8462872787823213e-06, "loss": 0.22320642471313476, "step": 5980, "token_acc": 0.9124087591240876 }, { "epoch": 0.7667179093005381, "grad_norm": 2.65625, "learning_rate": 2.831501111274816e-06, "loss": 0.23166375160217284, "step": 5985, "token_acc": 0.9094241966788872 }, { "epoch": 0.7673584422239303, "grad_norm": 2.6875, "learning_rate": 2.81674711335563e-06, "loss": 0.22401225566864014, "step": 5990, "token_acc": 0.9130397385171168 }, { "epoch": 0.7679989751473226, "grad_norm": 3.671875, "learning_rate": 2.8020253512352814e-06, "loss": 0.23468830585479736, "step": 5995, "token_acc": 0.9090440165061898 }, { "epoch": 0.7686395080707148, "grad_norm": 4.40625, "learning_rate": 2.7873358909796287e-06, "loss": 0.2302248954772949, "step": 6000, "token_acc": 0.9099036841877942 }, { "epoch": 0.7686395080707148, "eval_loss": 0.334545373916626, "eval_runtime": 103.0056, "eval_samples_per_second": 97.082, "eval_steps_per_second": 12.135, "eval_token_acc": 0.8822773806689514, "step": 6000 }, { "epoch": 0.7692800409941071, "grad_norm": 2.984375, "learning_rate": 2.7726787985095717e-06, "loss": 0.23136000633239745, "step": 6005, "token_acc": 0.9077314256162731 }, { "epoch": 0.7699205739174994, "grad_norm": 3.890625, "learning_rate": 2.7580541396007523e-06, "loss": 0.22109587192535402, "step": 6010, "token_acc": 0.9138497449641221 }, { "epoch": 0.7705611068408916, "grad_norm": 3.125, "learning_rate": 2.743461979883265e-06, "loss": 0.2210922956466675, "step": 6015, "token_acc": 0.913861557051614 }, { "epoch": 0.7712016397642839, "grad_norm": 3.328125, "learning_rate": 2.728902384841361e-06, "loss": 0.22745194435119628, "step": 6020, "token_acc": 0.9132437785240679 }, { "epoch": 0.7718421726876762, "grad_norm": 3.328125, "learning_rate": 2.71437541981315e-06, "loss": 0.2273806095123291, "step": 6025, "token_acc": 0.9103077254142458 }, { "epoch": 0.7724827056110684, "grad_norm": 3.546875, "learning_rate": 2.699881149990313e-06, "loss": 0.2318946361541748, "step": 6030, "token_acc": 0.9076658793214516 }, { "epoch": 0.7731232385344606, "grad_norm": 2.8125, "learning_rate": 2.6854196404178077e-06, "loss": 0.22452447414398194, "step": 6035, "token_acc": 0.9115502437761573 }, { "epoch": 0.7737637714578529, "grad_norm": 10.4375, "learning_rate": 2.6709909559935652e-06, "loss": 0.23456428050994874, "step": 6040, "token_acc": 0.9073548387096774 }, { "epoch": 0.7744043043812452, "grad_norm": 4.3125, "learning_rate": 2.6565951614682316e-06, "loss": 0.22777628898620605, "step": 6045, "token_acc": 0.9110881364693719 }, { "epoch": 0.7750448373046375, "grad_norm": 13.1875, "learning_rate": 2.6422323214448275e-06, "loss": 0.2248152017593384, "step": 6050, "token_acc": 0.9121726395589249 }, { "epoch": 0.7756853702280297, "grad_norm": 2.953125, "learning_rate": 2.6279025003785132e-06, "loss": 0.2368108034133911, "step": 6055, "token_acc": 0.9071379369726192 }, { "epoch": 0.776325903151422, "grad_norm": 4.0625, "learning_rate": 2.6136057625762503e-06, "loss": 0.22743830680847169, "step": 6060, "token_acc": 0.9096523429064997 }, { "epoch": 0.7769664360748142, "grad_norm": 3.09375, "learning_rate": 2.5993421721965416e-06, "loss": 0.22994532585144042, "step": 6065, "token_acc": 0.9099663183349167 }, { "epoch": 0.7776069689982065, "grad_norm": 2.90625, "learning_rate": 2.58511179324915e-06, "loss": 0.22852482795715331, "step": 6070, "token_acc": 0.9125949585635359 }, { "epoch": 0.7782475019215987, "grad_norm": 2.703125, "learning_rate": 2.5709146895947713e-06, "loss": 0.23030381202697753, "step": 6075, "token_acc": 0.9101974108640488 }, { "epoch": 0.778888034844991, "grad_norm": 2.96875, "learning_rate": 2.556750924944802e-06, "loss": 0.22189459800720215, "step": 6080, "token_acc": 0.9149477863122465 }, { "epoch": 0.7795285677683833, "grad_norm": 3.5625, "learning_rate": 2.5426205628610046e-06, "loss": 0.22595663070678712, "step": 6085, "token_acc": 0.911052608864529 }, { "epoch": 0.7801691006917756, "grad_norm": 3.734375, "learning_rate": 2.5285236667552503e-06, "loss": 0.22210302352905273, "step": 6090, "token_acc": 0.9138005344366865 }, { "epoch": 0.7808096336151679, "grad_norm": 3.1875, "learning_rate": 2.5144602998892308e-06, "loss": 0.22484986782073973, "step": 6095, "token_acc": 0.9116413781178403 }, { "epoch": 0.78145016653856, "grad_norm": 3.578125, "learning_rate": 2.500430525374167e-06, "loss": 0.2381572961807251, "step": 6100, "token_acc": 0.9064785339413233 }, { "epoch": 0.78145016653856, "eval_loss": 0.3342524766921997, "eval_runtime": 102.5935, "eval_samples_per_second": 97.472, "eval_steps_per_second": 12.184, "eval_token_acc": 0.8827729208077028, "step": 6100 }, { "epoch": 0.7820906994619523, "grad_norm": 4.0625, "learning_rate": 2.486434406170529e-06, "loss": 0.23040971755981446, "step": 6105, "token_acc": 0.9120883863450002 }, { "epoch": 0.7827312323853446, "grad_norm": 4.15625, "learning_rate": 2.472472005087758e-06, "loss": 0.23743114471435547, "step": 6110, "token_acc": 0.9058546000428909 }, { "epoch": 0.7833717653087369, "grad_norm": 2.515625, "learning_rate": 2.4585433847839757e-06, "loss": 0.2203622817993164, "step": 6115, "token_acc": 0.913583977208961 }, { "epoch": 0.7840122982321291, "grad_norm": 3.28125, "learning_rate": 2.444648607765713e-06, "loss": 0.2203676223754883, "step": 6120, "token_acc": 0.9142634112494037 }, { "epoch": 0.7846528311555214, "grad_norm": 3.703125, "learning_rate": 2.430787736387621e-06, "loss": 0.2319796562194824, "step": 6125, "token_acc": 0.9104683790200734 }, { "epoch": 0.7852933640789137, "grad_norm": 3.578125, "learning_rate": 2.4169608328521966e-06, "loss": 0.22085697650909425, "step": 6130, "token_acc": 0.9123194047928022 }, { "epoch": 0.7859338970023059, "grad_norm": 3.203125, "learning_rate": 2.4031679592095014e-06, "loss": 0.22805500030517578, "step": 6135, "token_acc": 0.911108238538435 }, { "epoch": 0.7865744299256981, "grad_norm": 3.046875, "learning_rate": 2.3894091773568818e-06, "loss": 0.22629399299621583, "step": 6140, "token_acc": 0.9122247597707588 }, { "epoch": 0.7872149628490904, "grad_norm": 3.6875, "learning_rate": 2.3756845490386947e-06, "loss": 0.22798571586608887, "step": 6145, "token_acc": 0.9104625171939478 }, { "epoch": 0.7878554957724827, "grad_norm": 4.15625, "learning_rate": 2.3619941358460263e-06, "loss": 0.23149216175079346, "step": 6150, "token_acc": 0.9089227327482361 }, { "epoch": 0.788496028695875, "grad_norm": 3.296875, "learning_rate": 2.3483379992164245e-06, "loss": 0.23463683128356932, "step": 6155, "token_acc": 0.9076479697178252 }, { "epoch": 0.7891365616192673, "grad_norm": 2.953125, "learning_rate": 2.334716200433601e-06, "loss": 0.2272404193878174, "step": 6160, "token_acc": 0.9092513668259503 }, { "epoch": 0.7897770945426595, "grad_norm": 3.546875, "learning_rate": 2.3211288006271936e-06, "loss": 0.22353928089141845, "step": 6165, "token_acc": 0.9137633666781649 }, { "epoch": 0.7904176274660517, "grad_norm": 2.921875, "learning_rate": 2.3075758607724486e-06, "loss": 0.22103281021118165, "step": 6170, "token_acc": 0.9112282824790389 }, { "epoch": 0.791058160389444, "grad_norm": 3.046875, "learning_rate": 2.2940574416899895e-06, "loss": 0.22877752780914307, "step": 6175, "token_acc": 0.9084078248477782 }, { "epoch": 0.7916986933128363, "grad_norm": 2.875, "learning_rate": 2.280573604045504e-06, "loss": 0.229004168510437, "step": 6180, "token_acc": 0.9116313220748931 }, { "epoch": 0.7923392262362285, "grad_norm": 2.984375, "learning_rate": 2.2671244083495026e-06, "loss": 0.22659940719604493, "step": 6185, "token_acc": 0.9117138908085695 }, { "epoch": 0.7929797591596208, "grad_norm": 3.203125, "learning_rate": 2.253709914957032e-06, "loss": 0.2304908275604248, "step": 6190, "token_acc": 0.9093925032313658 }, { "epoch": 0.7936202920830131, "grad_norm": 3.75, "learning_rate": 2.2403301840674062e-06, "loss": 0.23479413986206055, "step": 6195, "token_acc": 0.9068654915312675 }, { "epoch": 0.7942608250064054, "grad_norm": 3.9375, "learning_rate": 2.2269852757239473e-06, "loss": 0.22974464893341065, "step": 6200, "token_acc": 0.9106674125392659 }, { "epoch": 0.7942608250064054, "eval_loss": 0.33391064405441284, "eval_runtime": 103.739, "eval_samples_per_second": 96.396, "eval_steps_per_second": 12.049, "eval_token_acc": 0.8825237665479955, "step": 6200 }, { "epoch": 0.7949013579297975, "grad_norm": 5.34375, "learning_rate": 2.2136752498136924e-06, "loss": 0.232399320602417, "step": 6205, "token_acc": 0.9098353590207741 }, { "epoch": 0.7955418908531898, "grad_norm": 7.78125, "learning_rate": 2.200400166067147e-06, "loss": 0.22328581809997558, "step": 6210, "token_acc": 0.9126268076840061 }, { "epoch": 0.7961824237765821, "grad_norm": 2.78125, "learning_rate": 2.1871600840580087e-06, "loss": 0.22782430648803711, "step": 6215, "token_acc": 0.9107918620155706 }, { "epoch": 0.7968229566999744, "grad_norm": 7.3125, "learning_rate": 2.1739550632028995e-06, "loss": 0.22463743686676024, "step": 6220, "token_acc": 0.9123086872170727 }, { "epoch": 0.7974634896233667, "grad_norm": 3.171875, "learning_rate": 2.160785162761099e-06, "loss": 0.22946014404296874, "step": 6225, "token_acc": 0.9104503339797457 }, { "epoch": 0.7981040225467589, "grad_norm": 3.234375, "learning_rate": 2.1476504418342803e-06, "loss": 0.22696642875671386, "step": 6230, "token_acc": 0.9110344827586206 }, { "epoch": 0.7987445554701512, "grad_norm": 3.359375, "learning_rate": 2.1345509593662426e-06, "loss": 0.2333219289779663, "step": 6235, "token_acc": 0.9078953042128411 }, { "epoch": 0.7993850883935434, "grad_norm": 4.6875, "learning_rate": 2.1214867741426505e-06, "loss": 0.2281118631362915, "step": 6240, "token_acc": 0.9095139607032058 }, { "epoch": 0.8000256213169357, "grad_norm": 11.375, "learning_rate": 2.108457944790764e-06, "loss": 0.22590672969818115, "step": 6245, "token_acc": 0.9117824773413897 }, { "epoch": 0.8006661542403279, "grad_norm": 2.90625, "learning_rate": 2.095464529779182e-06, "loss": 0.22068183422088622, "step": 6250, "token_acc": 0.9125511302475781 }, { "epoch": 0.8013066871637202, "grad_norm": 3.3125, "learning_rate": 2.0825065874175744e-06, "loss": 0.2325758457183838, "step": 6255, "token_acc": 0.9100626770842277 }, { "epoch": 0.8019472200871125, "grad_norm": 4.125, "learning_rate": 2.069584175856424e-06, "loss": 0.22739195823669434, "step": 6260, "token_acc": 0.9109864018994173 }, { "epoch": 0.8025877530105048, "grad_norm": 4.09375, "learning_rate": 2.056697353086765e-06, "loss": 0.22868261337280274, "step": 6265, "token_acc": 0.9094489893087477 }, { "epoch": 0.8032282859338971, "grad_norm": 3.546875, "learning_rate": 2.0438461769399207e-06, "loss": 0.23165996074676515, "step": 6270, "token_acc": 0.908895110919664 }, { "epoch": 0.8038688188572892, "grad_norm": 3.015625, "learning_rate": 2.031030705087251e-06, "loss": 0.2177964687347412, "step": 6275, "token_acc": 0.9145767686795874 }, { "epoch": 0.8045093517806815, "grad_norm": 3.328125, "learning_rate": 2.0182509950398732e-06, "loss": 0.2247143268585205, "step": 6280, "token_acc": 0.9119068162208801 }, { "epoch": 0.8051498847040738, "grad_norm": 10.4375, "learning_rate": 2.005507104148441e-06, "loss": 0.22496967315673827, "step": 6285, "token_acc": 0.9104271735850683 }, { "epoch": 0.8057904176274661, "grad_norm": 2.984375, "learning_rate": 1.9927990896028416e-06, "loss": 0.22278683185577391, "step": 6290, "token_acc": 0.9130923555863023 }, { "epoch": 0.8064309505508583, "grad_norm": 3.890625, "learning_rate": 1.9801270084319847e-06, "loss": 0.22296977043151855, "step": 6295, "token_acc": 0.9139506811519228 }, { "epoch": 0.8070714834742506, "grad_norm": 5.84375, "learning_rate": 1.967490917503504e-06, "loss": 0.2246922492980957, "step": 6300, "token_acc": 0.9113056226284926 }, { "epoch": 0.8070714834742506, "eval_loss": 0.33457258343696594, "eval_runtime": 102.1762, "eval_samples_per_second": 97.87, "eval_steps_per_second": 12.234, "eval_token_acc": 0.8825071562640149, "step": 6300 }, { "epoch": 0.8077120163976428, "grad_norm": 2.59375, "learning_rate": 1.954890873523535e-06, "loss": 0.22967491149902344, "step": 6305, "token_acc": 0.9100146387668991 }, { "epoch": 0.8083525493210351, "grad_norm": 3.421875, "learning_rate": 1.9423269330364446e-06, "loss": 0.23272688388824464, "step": 6310, "token_acc": 0.9100631361937894 }, { "epoch": 0.8089930822444273, "grad_norm": 3.734375, "learning_rate": 1.929799152424576e-06, "loss": 0.22082395553588868, "step": 6315, "token_acc": 0.9147945323616921 }, { "epoch": 0.8096336151678196, "grad_norm": 2.96875, "learning_rate": 1.917307587908013e-06, "loss": 0.22631459236145018, "step": 6320, "token_acc": 0.910735097336729 }, { "epoch": 0.8102741480912119, "grad_norm": 3.578125, "learning_rate": 1.9048522955442973e-06, "loss": 0.22592225074768066, "step": 6325, "token_acc": 0.9107575233483224 }, { "epoch": 0.8109146810146042, "grad_norm": 2.875, "learning_rate": 1.8924333312282072e-06, "loss": 0.22494149208068848, "step": 6330, "token_acc": 0.9138549272043893 }, { "epoch": 0.8115552139379965, "grad_norm": 4.09375, "learning_rate": 1.880050750691489e-06, "loss": 0.23039345741271972, "step": 6335, "token_acc": 0.9105968858131488 }, { "epoch": 0.8121957468613886, "grad_norm": 3.109375, "learning_rate": 1.867704609502613e-06, "loss": 0.22507119178771973, "step": 6340, "token_acc": 0.9115109155233411 }, { "epoch": 0.8128362797847809, "grad_norm": 3.4375, "learning_rate": 1.8553949630665246e-06, "loss": 0.23071153163909913, "step": 6345, "token_acc": 0.9095776837378012 }, { "epoch": 0.8134768127081732, "grad_norm": 5.34375, "learning_rate": 1.843121866624391e-06, "loss": 0.22440800666809083, "step": 6350, "token_acc": 0.9121522693997072 }, { "epoch": 0.8141173456315655, "grad_norm": 3.796875, "learning_rate": 1.8308853752533595e-06, "loss": 0.22544093132019044, "step": 6355, "token_acc": 0.9106696543997242 }, { "epoch": 0.8147578785549577, "grad_norm": 2.34375, "learning_rate": 1.8186855438663042e-06, "loss": 0.2227323532104492, "step": 6360, "token_acc": 0.9120110525861325 }, { "epoch": 0.81539841147835, "grad_norm": 2.78125, "learning_rate": 1.8065224272115866e-06, "loss": 0.22800102233886718, "step": 6365, "token_acc": 0.9097010109701011 }, { "epoch": 0.8160389444017423, "grad_norm": 3.734375, "learning_rate": 1.7943960798728056e-06, "loss": 0.22401859760284423, "step": 6370, "token_acc": 0.9109385113268609 }, { "epoch": 0.8166794773251345, "grad_norm": 3.328125, "learning_rate": 1.7823065562685437e-06, "loss": 0.23256373405456543, "step": 6375, "token_acc": 0.9090204520990313 }, { "epoch": 0.8173200102485267, "grad_norm": 2.9375, "learning_rate": 1.7702539106521467e-06, "loss": 0.22349081039428711, "step": 6380, "token_acc": 0.9125831820931639 }, { "epoch": 0.817960543171919, "grad_norm": 3.265625, "learning_rate": 1.7582381971114548e-06, "loss": 0.23039307594299316, "step": 6385, "token_acc": 0.9086009915930158 }, { "epoch": 0.8186010760953113, "grad_norm": 3.703125, "learning_rate": 1.7462594695685763e-06, "loss": 0.22513654232025146, "step": 6390, "token_acc": 0.9117127975549911 }, { "epoch": 0.8192416090187036, "grad_norm": 3.8125, "learning_rate": 1.7343177817796397e-06, "loss": 0.2271491050720215, "step": 6395, "token_acc": 0.9126460569999569 }, { "epoch": 0.8198821419420959, "grad_norm": 5.0, "learning_rate": 1.7224131873345417e-06, "loss": 0.2326582908630371, "step": 6400, "token_acc": 0.9083699681061977 }, { "epoch": 0.8198821419420959, "eval_loss": 0.334193617105484, "eval_runtime": 106.2327, "eval_samples_per_second": 94.133, "eval_steps_per_second": 11.767, "eval_token_acc": 0.8823493585662003, "step": 6400 }, { "epoch": 0.8205226748654881, "grad_norm": 3.75, "learning_rate": 1.7105457396567383e-06, "loss": 0.2375797748565674, "step": 6405, "token_acc": 0.9064124038998411 }, { "epoch": 0.8211632077888803, "grad_norm": 3.9375, "learning_rate": 1.6987154920029625e-06, "loss": 0.22246260643005372, "step": 6410, "token_acc": 0.9119291304721768 }, { "epoch": 0.8218037407122726, "grad_norm": 3.203125, "learning_rate": 1.6869224974630283e-06, "loss": 0.23738515377044678, "step": 6415, "token_acc": 0.9083624143724958 }, { "epoch": 0.8224442736356649, "grad_norm": 3.765625, "learning_rate": 1.675166808959552e-06, "loss": 0.23724117279052734, "step": 6420, "token_acc": 0.9078502673796791 }, { "epoch": 0.8230848065590571, "grad_norm": 3.203125, "learning_rate": 1.6634484792477468e-06, "loss": 0.23424534797668456, "step": 6425, "token_acc": 0.910155913515376 }, { "epoch": 0.8237253394824494, "grad_norm": 3.578125, "learning_rate": 1.6517675609151683e-06, "loss": 0.23035151958465577, "step": 6430, "token_acc": 0.9098056155507559 }, { "epoch": 0.8243658724058417, "grad_norm": 2.796875, "learning_rate": 1.6401241063814854e-06, "loss": 0.22955503463745117, "step": 6435, "token_acc": 0.9094276239286792 }, { "epoch": 0.825006405329234, "grad_norm": 3.125, "learning_rate": 1.6285181678982432e-06, "loss": 0.2227609395980835, "step": 6440, "token_acc": 0.9112541026083952 }, { "epoch": 0.8256469382526261, "grad_norm": 2.6875, "learning_rate": 1.6169497975486282e-06, "loss": 0.22880702018737792, "step": 6445, "token_acc": 0.9112530754953166 }, { "epoch": 0.8262874711760184, "grad_norm": 2.96875, "learning_rate": 1.605419047247232e-06, "loss": 0.2208636999130249, "step": 6450, "token_acc": 0.9133209711501142 }, { "epoch": 0.8269280040994107, "grad_norm": 2.875, "learning_rate": 1.5939259687398279e-06, "loss": 0.22008955478668213, "step": 6455, "token_acc": 0.9133350640359986 }, { "epoch": 0.827568537022803, "grad_norm": 2.90625, "learning_rate": 1.5824706136031255e-06, "loss": 0.22201809883117676, "step": 6460, "token_acc": 0.9131427094996124 }, { "epoch": 0.8282090699461953, "grad_norm": 7.78125, "learning_rate": 1.5710530332445484e-06, "loss": 0.22498104572296143, "step": 6465, "token_acc": 0.9109814094249892 }, { "epoch": 0.8288496028695875, "grad_norm": 5.0625, "learning_rate": 1.559673278902002e-06, "loss": 0.23075518608093262, "step": 6470, "token_acc": 0.909899408539481 }, { "epoch": 0.8294901357929798, "grad_norm": 2.734375, "learning_rate": 1.5483314016436402e-06, "loss": 0.23160152435302733, "step": 6475, "token_acc": 0.9085847468600284 }, { "epoch": 0.830130668716372, "grad_norm": 2.78125, "learning_rate": 1.537027452367641e-06, "loss": 0.2284604549407959, "step": 6480, "token_acc": 0.9090869865377977 }, { "epoch": 0.8307712016397643, "grad_norm": 2.84375, "learning_rate": 1.5257614818019716e-06, "loss": 0.22905595302581788, "step": 6485, "token_acc": 0.9103867022650934 }, { "epoch": 0.8314117345631565, "grad_norm": 4.53125, "learning_rate": 1.5145335405041728e-06, "loss": 0.23354558944702147, "step": 6490, "token_acc": 0.9073544698544699 }, { "epoch": 0.8320522674865488, "grad_norm": 4.25, "learning_rate": 1.50334367886111e-06, "loss": 0.2220928192138672, "step": 6495, "token_acc": 0.9137692440754195 }, { "epoch": 0.8326928004099411, "grad_norm": 2.671875, "learning_rate": 1.4921919470887758e-06, "loss": 0.22195751667022706, "step": 6500, "token_acc": 0.9108550636749545 }, { "epoch": 0.8326928004099411, "eval_loss": 0.3345060646533966, "eval_runtime": 102.629, "eval_samples_per_second": 97.438, "eval_steps_per_second": 12.18, "eval_token_acc": 0.8823133696175759, "step": 6500 }, { "epoch": 0.8333333333333334, "grad_norm": 3.21875, "learning_rate": 1.4810783952320417e-06, "loss": 0.2198798656463623, "step": 6505, "token_acc": 0.91326310335895 }, { "epoch": 0.8339738662567256, "grad_norm": 2.96875, "learning_rate": 1.4700030731644444e-06, "loss": 0.22199637889862062, "step": 6510, "token_acc": 0.9108222490931076 }, { "epoch": 0.8346143991801178, "grad_norm": 3.59375, "learning_rate": 1.4589660305879615e-06, "loss": 0.22134122848510743, "step": 6515, "token_acc": 0.9125576981148354 }, { "epoch": 0.8352549321035101, "grad_norm": 3.203125, "learning_rate": 1.4479673170327745e-06, "loss": 0.22954387664794923, "step": 6520, "token_acc": 0.9111398405516052 }, { "epoch": 0.8358954650269024, "grad_norm": 4.3125, "learning_rate": 1.4370069818570787e-06, "loss": 0.22780919075012207, "step": 6525, "token_acc": 0.9098325276243094 }, { "epoch": 0.8365359979502947, "grad_norm": 3.453125, "learning_rate": 1.4260850742468202e-06, "loss": 0.22985472679138183, "step": 6530, "token_acc": 0.909024211298606 }, { "epoch": 0.8371765308736869, "grad_norm": 3.3125, "learning_rate": 1.4152016432155158e-06, "loss": 0.22617745399475098, "step": 6535, "token_acc": 0.9123404622283546 }, { "epoch": 0.8378170637970792, "grad_norm": 2.9375, "learning_rate": 1.4043567376039956e-06, "loss": 0.22737021446228028, "step": 6540, "token_acc": 0.910641053313188 }, { "epoch": 0.8384575967204715, "grad_norm": 3.515625, "learning_rate": 1.393550406080213e-06, "loss": 0.22855916023254394, "step": 6545, "token_acc": 0.9108817204301075 }, { "epoch": 0.8390981296438637, "grad_norm": 4.625, "learning_rate": 1.3827826971390135e-06, "loss": 0.21400003433227538, "step": 6550, "token_acc": 0.9176013805004314 }, { "epoch": 0.8397386625672559, "grad_norm": 5.09375, "learning_rate": 1.372053659101915e-06, "loss": 0.22439954280853272, "step": 6555, "token_acc": 0.9112418357195381 }, { "epoch": 0.8403791954906482, "grad_norm": 3.515625, "learning_rate": 1.361363340116899e-06, "loss": 0.22323524951934814, "step": 6560, "token_acc": 0.9125355634106388 }, { "epoch": 0.8410197284140405, "grad_norm": 3.0625, "learning_rate": 1.3507117881581866e-06, "loss": 0.2269625186920166, "step": 6565, "token_acc": 0.9102128574500108 }, { "epoch": 0.8416602613374328, "grad_norm": 3.125, "learning_rate": 1.3400990510260282e-06, "loss": 0.21720943450927735, "step": 6570, "token_acc": 0.9142647249470637 }, { "epoch": 0.842300794260825, "grad_norm": 3.015625, "learning_rate": 1.3295251763464877e-06, "loss": 0.22070887088775634, "step": 6575, "token_acc": 0.91288746703558 }, { "epoch": 0.8429413271842172, "grad_norm": 3.109375, "learning_rate": 1.3189902115712294e-06, "loss": 0.23354511260986327, "step": 6580, "token_acc": 0.9081190159288995 }, { "epoch": 0.8435818601076095, "grad_norm": 2.953125, "learning_rate": 1.3084942039773018e-06, "loss": 0.22521576881408692, "step": 6585, "token_acc": 0.9107534747622531 }, { "epoch": 0.8442223930310018, "grad_norm": 3.0625, "learning_rate": 1.2980372006669296e-06, "loss": 0.2297739267349243, "step": 6590, "token_acc": 0.9092908902691511 }, { "epoch": 0.844862925954394, "grad_norm": 4.15625, "learning_rate": 1.287619248567301e-06, "loss": 0.22501018047332763, "step": 6595, "token_acc": 0.9111034393475165 }, { "epoch": 0.8455034588777863, "grad_norm": 4.0625, "learning_rate": 1.2772403944303556e-06, "loss": 0.23542351722717286, "step": 6600, "token_acc": 0.9083812301621343 }, { "epoch": 0.8455034588777863, "eval_loss": 0.33506593108177185, "eval_runtime": 103.0867, "eval_samples_per_second": 97.006, "eval_steps_per_second": 12.126, "eval_token_acc": 0.882360432088854, "step": 6600 }, { "epoch": 0.8461439918011786, "grad_norm": 2.53125, "learning_rate": 1.266900684832576e-06, "loss": 0.22258315086364747, "step": 6605, "token_acc": 0.9125866597769453 }, { "epoch": 0.8467845247245709, "grad_norm": 3.3125, "learning_rate": 1.2566001661747807e-06, "loss": 0.22833826541900634, "step": 6610, "token_acc": 0.9116735537190083 }, { "epoch": 0.847425057647963, "grad_norm": 3.265625, "learning_rate": 1.2463388846819058e-06, "loss": 0.23099522590637206, "step": 6615, "token_acc": 0.91005291005291 }, { "epoch": 0.8480655905713553, "grad_norm": 2.90625, "learning_rate": 1.2361168864028183e-06, "loss": 0.2343848466873169, "step": 6620, "token_acc": 0.9084549356223176 }, { "epoch": 0.8487061234947476, "grad_norm": 2.890625, "learning_rate": 1.225934217210083e-06, "loss": 0.22270684242248534, "step": 6625, "token_acc": 0.9118066047917116 }, { "epoch": 0.8493466564181399, "grad_norm": 4.0, "learning_rate": 1.2157909227997822e-06, "loss": 0.22519948482513427, "step": 6630, "token_acc": 0.9111332783970161 }, { "epoch": 0.8499871893415322, "grad_norm": 2.71875, "learning_rate": 1.205687048691293e-06, "loss": 0.2298964500427246, "step": 6635, "token_acc": 0.9109686303197212 }, { "epoch": 0.8506277222649244, "grad_norm": 6.0625, "learning_rate": 1.1956226402270821e-06, "loss": 0.22732067108154297, "step": 6640, "token_acc": 0.9125354411891056 }, { "epoch": 0.8512682551883167, "grad_norm": 3.953125, "learning_rate": 1.1855977425725252e-06, "loss": 0.23059117794036865, "step": 6645, "token_acc": 0.9109086197961651 }, { "epoch": 0.8519087881117089, "grad_norm": 3.40625, "learning_rate": 1.1756124007156699e-06, "loss": 0.23375325202941893, "step": 6650, "token_acc": 0.9093841389987958 }, { "epoch": 0.8525493210351012, "grad_norm": 3.609375, "learning_rate": 1.1656666594670673e-06, "loss": 0.22103147506713866, "step": 6655, "token_acc": 0.9112014180104622 }, { "epoch": 0.8531898539584934, "grad_norm": 37.75, "learning_rate": 1.1557605634595437e-06, "loss": 0.2286379814147949, "step": 6660, "token_acc": 0.9100968783638321 }, { "epoch": 0.8538303868818857, "grad_norm": 3.1875, "learning_rate": 1.1458941571480198e-06, "loss": 0.22343990802764893, "step": 6665, "token_acc": 0.911326860841424 }, { "epoch": 0.854470919805278, "grad_norm": 2.921875, "learning_rate": 1.136067484809299e-06, "loss": 0.22610747814178467, "step": 6670, "token_acc": 0.9108187134502924 }, { "epoch": 0.8551114527286703, "grad_norm": 3.1875, "learning_rate": 1.126280590541876e-06, "loss": 0.2264204740524292, "step": 6675, "token_acc": 0.9111684958037444 }, { "epoch": 0.8557519856520626, "grad_norm": 4.15625, "learning_rate": 1.1165335182657365e-06, "loss": 0.23050973415374756, "step": 6680, "token_acc": 0.9092639868460906 }, { "epoch": 0.8563925185754547, "grad_norm": 2.671875, "learning_rate": 1.1068263117221568e-06, "loss": 0.2229710578918457, "step": 6685, "token_acc": 0.9126929378287488 }, { "epoch": 0.857033051498847, "grad_norm": 4.09375, "learning_rate": 1.0971590144735122e-06, "loss": 0.22901148796081544, "step": 6690, "token_acc": 0.9086844368013758 }, { "epoch": 0.8576735844222393, "grad_norm": 3.09375, "learning_rate": 1.0875316699030802e-06, "loss": 0.22709619998931885, "step": 6695, "token_acc": 0.9104509880226574 }, { "epoch": 0.8583141173456316, "grad_norm": 3.171875, "learning_rate": 1.0779443212148444e-06, "loss": 0.2268310546875, "step": 6700, "token_acc": 0.9107427341227126 }, { "epoch": 0.8583141173456316, "eval_loss": 0.3350731432437897, "eval_runtime": 104.1893, "eval_samples_per_second": 95.979, "eval_steps_per_second": 11.997, "eval_token_acc": 0.8825652922579467, "step": 6700 }, { "epoch": 0.8589546502690238, "grad_norm": 3.1875, "learning_rate": 1.0683970114333032e-06, "loss": 0.22931032180786132, "step": 6705, "token_acc": 0.9111389236545682 }, { "epoch": 0.8595951831924161, "grad_norm": 5.25, "learning_rate": 1.0588897834032718e-06, "loss": 0.2266333818435669, "step": 6710, "token_acc": 0.9122988654501532 }, { "epoch": 0.8602357161158084, "grad_norm": 4.28125, "learning_rate": 1.0494226797896978e-06, "loss": 0.22155840396881105, "step": 6715, "token_acc": 0.9117913343392973 }, { "epoch": 0.8608762490392006, "grad_norm": 2.859375, "learning_rate": 1.0399957430774598e-06, "loss": 0.23419654369354248, "step": 6720, "token_acc": 0.9080201906898485 }, { "epoch": 0.8615167819625928, "grad_norm": 3.875, "learning_rate": 1.030609015571188e-06, "loss": 0.23095030784606935, "step": 6725, "token_acc": 0.9089889579020014 }, { "epoch": 0.8621573148859851, "grad_norm": 3.265625, "learning_rate": 1.021262539395066e-06, "loss": 0.2203512191772461, "step": 6730, "token_acc": 0.9137580554474287 }, { "epoch": 0.8627978478093774, "grad_norm": 3.0, "learning_rate": 1.0119563564926372e-06, "loss": 0.22832462787628174, "step": 6735, "token_acc": 0.9118601531738133 }, { "epoch": 0.8634383807327697, "grad_norm": 4.90625, "learning_rate": 1.0026905086266392e-06, "loss": 0.22600264549255372, "step": 6740, "token_acc": 0.9125167076273013 }, { "epoch": 0.864078913656162, "grad_norm": 3.046875, "learning_rate": 9.934650373787823e-07, "loss": 0.22522459030151368, "step": 6745, "token_acc": 0.9116609294320138 }, { "epoch": 0.8647194465795542, "grad_norm": 3.421875, "learning_rate": 9.842799841495986e-07, "loss": 0.22795772552490234, "step": 6750, "token_acc": 0.9116564948275115 }, { "epoch": 0.8653599795029464, "grad_norm": 4.75, "learning_rate": 9.751353901582294e-07, "loss": 0.22496397495269777, "step": 6755, "token_acc": 0.9126722718210973 }, { "epoch": 0.8660005124263387, "grad_norm": 2.765625, "learning_rate": 9.660312964422469e-07, "loss": 0.2258981943130493, "step": 6760, "token_acc": 0.9107835531419706 }, { "epoch": 0.866641045349731, "grad_norm": 3.734375, "learning_rate": 9.569677438574842e-07, "loss": 0.22349743843078612, "step": 6765, "token_acc": 0.912551306977749 }, { "epoch": 0.8672815782731232, "grad_norm": 5.46875, "learning_rate": 9.479447730778268e-07, "loss": 0.22322914600372315, "step": 6770, "token_acc": 0.911628910463862 }, { "epoch": 0.8679221111965155, "grad_norm": 2.984375, "learning_rate": 9.389624245950601e-07, "loss": 0.217413330078125, "step": 6775, "token_acc": 0.9126826316244488 }, { "epoch": 0.8685626441199078, "grad_norm": 3.15625, "learning_rate": 9.300207387186555e-07, "loss": 0.237738037109375, "step": 6780, "token_acc": 0.9056814760655456 }, { "epoch": 0.8692031770433001, "grad_norm": 3.96875, "learning_rate": 9.211197555756157e-07, "loss": 0.22690942287445068, "step": 6785, "token_acc": 0.9100542775911088 }, { "epoch": 0.8698437099666922, "grad_norm": 4.40625, "learning_rate": 9.122595151102809e-07, "loss": 0.23275787830352784, "step": 6790, "token_acc": 0.9072804862278546 }, { "epoch": 0.8704842428900845, "grad_norm": 4.65625, "learning_rate": 9.034400570841551e-07, "loss": 0.22703733444213867, "step": 6795, "token_acc": 0.9107196692364012 }, { "epoch": 0.8711247758134768, "grad_norm": 3.0, "learning_rate": 8.946614210757221e-07, "loss": 0.22760224342346191, "step": 6800, "token_acc": 0.9104903571737438 }, { "epoch": 0.8711247758134768, "eval_loss": 0.33460694551467896, "eval_runtime": 104.7122, "eval_samples_per_second": 95.5, "eval_steps_per_second": 11.937, "eval_token_acc": 0.8824462518894198, "step": 6800 }, { "epoch": 0.8717653087368691, "grad_norm": 3.1875, "learning_rate": 8.859236464802756e-07, "loss": 0.22689156532287597, "step": 6805, "token_acc": 0.9119266844505637 }, { "epoch": 0.8724058416602614, "grad_norm": 2.84375, "learning_rate": 8.772267725097361e-07, "loss": 0.23056597709655763, "step": 6810, "token_acc": 0.9115866839602248 }, { "epoch": 0.8730463745836536, "grad_norm": 3.375, "learning_rate": 8.685708381924784e-07, "loss": 0.23043975830078126, "step": 6815, "token_acc": 0.9112482202183199 }, { "epoch": 0.8736869075070458, "grad_norm": 2.828125, "learning_rate": 8.599558823731524e-07, "loss": 0.22515459060668946, "step": 6820, "token_acc": 0.9119896305897602 }, { "epoch": 0.8743274404304381, "grad_norm": 4.8125, "learning_rate": 8.513819437125148e-07, "loss": 0.2265780448913574, "step": 6825, "token_acc": 0.9123948304276173 }, { "epoch": 0.8749679733538304, "grad_norm": 3.21875, "learning_rate": 8.428490606872519e-07, "loss": 0.22168455123901368, "step": 6830, "token_acc": 0.9113792656026989 }, { "epoch": 0.8756085062772226, "grad_norm": 2.90625, "learning_rate": 8.343572715898041e-07, "loss": 0.2171454668045044, "step": 6835, "token_acc": 0.914162535029101 }, { "epoch": 0.8762490392006149, "grad_norm": 3.109375, "learning_rate": 8.259066145282024e-07, "loss": 0.21893837451934814, "step": 6840, "token_acc": 0.9133895738697815 }, { "epoch": 0.8768895721240072, "grad_norm": 3.953125, "learning_rate": 8.17497127425888e-07, "loss": 0.22579605579376222, "step": 6845, "token_acc": 0.9109958954417801 }, { "epoch": 0.8775301050473995, "grad_norm": 2.90625, "learning_rate": 8.091288480215509e-07, "loss": 0.2259922981262207, "step": 6850, "token_acc": 0.9134802754081324 }, { "epoch": 0.8781706379707916, "grad_norm": 2.921875, "learning_rate": 8.008018138689477e-07, "loss": 0.23148341178894044, "step": 6855, "token_acc": 0.9080638206123329 }, { "epoch": 0.8788111708941839, "grad_norm": 2.75, "learning_rate": 7.925160623367534e-07, "loss": 0.22035045623779298, "step": 6860, "token_acc": 0.9124632924512005 }, { "epoch": 0.8794517038175762, "grad_norm": 3.125, "learning_rate": 7.842716306083709e-07, "loss": 0.22205777168273927, "step": 6865, "token_acc": 0.9132595729968018 }, { "epoch": 0.8800922367409685, "grad_norm": 2.640625, "learning_rate": 7.760685556817837e-07, "loss": 0.22817633152008057, "step": 6870, "token_acc": 0.908126751455055 }, { "epoch": 0.8807327696643608, "grad_norm": 5.34375, "learning_rate": 7.679068743693741e-07, "loss": 0.2194456100463867, "step": 6875, "token_acc": 0.914544352044352 }, { "epoch": 0.881373302587753, "grad_norm": 3.546875, "learning_rate": 7.59786623297768e-07, "loss": 0.22601814270019532, "step": 6880, "token_acc": 0.911353032659409 }, { "epoch": 0.8820138355111453, "grad_norm": 3.0, "learning_rate": 7.517078389076715e-07, "loss": 0.23260602951049805, "step": 6885, "token_acc": 0.9088834345261163 }, { "epoch": 0.8826543684345375, "grad_norm": 2.765625, "learning_rate": 7.43670557453694e-07, "loss": 0.22155818939208985, "step": 6890, "token_acc": 0.9120893334483056 }, { "epoch": 0.8832949013579298, "grad_norm": 56.0, "learning_rate": 7.35674815004207e-07, "loss": 0.23186612129211426, "step": 6895, "token_acc": 0.9093760742523204 }, { "epoch": 0.883935434281322, "grad_norm": 3.671875, "learning_rate": 7.277206474411591e-07, "loss": 0.22928218841552733, "step": 6900, "token_acc": 0.9102180604326527 }, { "epoch": 0.883935434281322, "eval_loss": 0.3345526456832886, "eval_runtime": 103.6333, "eval_samples_per_second": 96.494, "eval_steps_per_second": 12.062, "eval_token_acc": 0.8824102629407954, "step": 6900 }, { "epoch": 0.8845759672047143, "grad_norm": 3.9375, "learning_rate": 7.198080904599314e-07, "loss": 0.22185420989990234, "step": 6905, "token_acc": 0.9122216468151217 }, { "epoch": 0.8852165001281066, "grad_norm": 3.5, "learning_rate": 7.119371795691732e-07, "loss": 0.22938218116760253, "step": 6910, "token_acc": 0.9106750053914169 }, { "epoch": 0.8858570330514989, "grad_norm": 3.078125, "learning_rate": 7.041079500906389e-07, "loss": 0.22525992393493652, "step": 6915, "token_acc": 0.9117697816895332 }, { "epoch": 0.8864975659748912, "grad_norm": 3.015625, "learning_rate": 6.963204371590327e-07, "loss": 0.22642955780029297, "step": 6920, "token_acc": 0.9109674639086404 }, { "epoch": 0.8871380988982833, "grad_norm": 10.5, "learning_rate": 6.885746757218504e-07, "loss": 0.2312746524810791, "step": 6925, "token_acc": 0.9084555651423641 }, { "epoch": 0.8877786318216756, "grad_norm": 4.15625, "learning_rate": 6.808707005392234e-07, "loss": 0.22308661937713622, "step": 6930, "token_acc": 0.9129004329004329 }, { "epoch": 0.8884191647450679, "grad_norm": 4.21875, "learning_rate": 6.73208546183759e-07, "loss": 0.23537328243255615, "step": 6935, "token_acc": 0.9080825451418745 }, { "epoch": 0.8890596976684602, "grad_norm": 2.828125, "learning_rate": 6.655882470403918e-07, "loss": 0.22550048828125, "step": 6940, "token_acc": 0.9109734436598362 }, { "epoch": 0.8897002305918524, "grad_norm": 2.734375, "learning_rate": 6.580098373062227e-07, "loss": 0.21899161338806153, "step": 6945, "token_acc": 0.9117697816895332 }, { "epoch": 0.8903407635152447, "grad_norm": 2.453125, "learning_rate": 6.504733509903693e-07, "loss": 0.22932813167572022, "step": 6950, "token_acc": 0.9097964815453604 }, { "epoch": 0.890981296438637, "grad_norm": 4.28125, "learning_rate": 6.429788219138111e-07, "loss": 0.22290611267089844, "step": 6955, "token_acc": 0.9123820195664354 }, { "epoch": 0.8916218293620292, "grad_norm": 4.53125, "learning_rate": 6.355262837092424e-07, "loss": 0.2280646324157715, "step": 6960, "token_acc": 0.909892094063024 }, { "epoch": 0.8922623622854214, "grad_norm": 3.609375, "learning_rate": 6.281157698209139e-07, "loss": 0.23290627002716063, "step": 6965, "token_acc": 0.9101938603687233 }, { "epoch": 0.8929028952088137, "grad_norm": 4.15625, "learning_rate": 6.207473135044905e-07, "loss": 0.22637267112731935, "step": 6970, "token_acc": 0.9106436069523318 }, { "epoch": 0.893543428132206, "grad_norm": 2.84375, "learning_rate": 6.134209478268904e-07, "loss": 0.22555007934570312, "step": 6975, "token_acc": 0.9121092067866678 }, { "epoch": 0.8941839610555983, "grad_norm": 3.59375, "learning_rate": 6.061367056661582e-07, "loss": 0.2194199800491333, "step": 6980, "token_acc": 0.914880720439884 }, { "epoch": 0.8948244939789906, "grad_norm": 3.34375, "learning_rate": 5.988946197112866e-07, "loss": 0.22179160118103028, "step": 6985, "token_acc": 0.9131431041936878 }, { "epoch": 0.8954650269023828, "grad_norm": 2.546875, "learning_rate": 5.916947224621039e-07, "loss": 0.2265388011932373, "step": 6990, "token_acc": 0.9122412824612194 }, { "epoch": 0.896105559825775, "grad_norm": 3.109375, "learning_rate": 5.845370462290978e-07, "loss": 0.22730591297149658, "step": 6995, "token_acc": 0.909079168281028 }, { "epoch": 0.8967460927491673, "grad_norm": 3.15625, "learning_rate": 5.774216231332875e-07, "loss": 0.22771029472351073, "step": 7000, "token_acc": 0.9113219754151392 }, { "epoch": 0.8967460927491673, "eval_loss": 0.33467555046081543, "eval_runtime": 103.9504, "eval_samples_per_second": 96.2, "eval_steps_per_second": 12.025, "eval_token_acc": 0.8823936526568149, "step": 7000 }, { "epoch": 0.8973866256725596, "grad_norm": 3.375, "learning_rate": 5.703484851060825e-07, "loss": 0.23281164169311525, "step": 7005, "token_acc": 0.9091221882929766 }, { "epoch": 0.8980271585959518, "grad_norm": 3.078125, "learning_rate": 5.633176638891191e-07, "loss": 0.2271268367767334, "step": 7010, "token_acc": 0.9112791702679343 }, { "epoch": 0.8986676915193441, "grad_norm": 4.0625, "learning_rate": 5.563291910341462e-07, "loss": 0.22890748977661132, "step": 7015, "token_acc": 0.9119872731963196 }, { "epoch": 0.8993082244427364, "grad_norm": 3.109375, "learning_rate": 5.493830979028569e-07, "loss": 0.22680349349975587, "step": 7020, "token_acc": 0.913206895061995 }, { "epoch": 0.8999487573661287, "grad_norm": 23.25, "learning_rate": 5.424794156667645e-07, "loss": 0.22985119819641114, "step": 7025, "token_acc": 0.9109837054918527 }, { "epoch": 0.9005892902895208, "grad_norm": 4.3125, "learning_rate": 5.356181753070588e-07, "loss": 0.22275919914245607, "step": 7030, "token_acc": 0.9113749190589251 }, { "epoch": 0.9012298232129131, "grad_norm": 3.203125, "learning_rate": 5.287994076144643e-07, "loss": 0.22965426445007325, "step": 7035, "token_acc": 0.9091612903225806 }, { "epoch": 0.9018703561363054, "grad_norm": 5.8125, "learning_rate": 5.220231431891032e-07, "loss": 0.2193136692047119, "step": 7040, "token_acc": 0.9128340853870184 }, { "epoch": 0.9025108890596977, "grad_norm": 19.625, "learning_rate": 5.152894124403618e-07, "loss": 0.2251948356628418, "step": 7045, "token_acc": 0.9117292456079917 }, { "epoch": 0.90315142198309, "grad_norm": 3.84375, "learning_rate": 5.085982455867477e-07, "loss": 0.22256324291229249, "step": 7050, "token_acc": 0.9116405307599518 }, { "epoch": 0.9037919549064822, "grad_norm": 4.1875, "learning_rate": 5.019496726557571e-07, "loss": 0.23459949493408203, "step": 7055, "token_acc": 0.9083365578915689 }, { "epoch": 0.9044324878298745, "grad_norm": 5.3125, "learning_rate": 4.953437234837444e-07, "loss": 0.22082552909851075, "step": 7060, "token_acc": 0.9143460643158893 }, { "epoch": 0.9050730207532667, "grad_norm": 2.859375, "learning_rate": 4.887804277157803e-07, "loss": 0.228281831741333, "step": 7065, "token_acc": 0.9109963417258446 }, { "epoch": 0.905713553676659, "grad_norm": 2.671875, "learning_rate": 4.822598148055235e-07, "loss": 0.2322796106338501, "step": 7070, "token_acc": 0.9108313211452225 }, { "epoch": 0.9063540866000512, "grad_norm": 3.078125, "learning_rate": 4.757819140150888e-07, "loss": 0.23224186897277832, "step": 7075, "token_acc": 0.9088714544357273 }, { "epoch": 0.9069946195234435, "grad_norm": 2.484375, "learning_rate": 4.693467544149133e-07, "loss": 0.21920361518859863, "step": 7080, "token_acc": 0.9131602894657018 }, { "epoch": 0.9076351524468358, "grad_norm": 3.484375, "learning_rate": 4.629543648836288e-07, "loss": 0.21608197689056396, "step": 7085, "token_acc": 0.9152600757836721 }, { "epoch": 0.9082756853702281, "grad_norm": 3.203125, "learning_rate": 4.566047741079316e-07, "loss": 0.2328326940536499, "step": 7090, "token_acc": 0.9090478037846459 }, { "epoch": 0.9089162182936202, "grad_norm": 3.515625, "learning_rate": 4.5029801058244726e-07, "loss": 0.23201301097869872, "step": 7095, "token_acc": 0.9082667817828621 }, { "epoch": 0.9095567512170125, "grad_norm": 3.234375, "learning_rate": 4.4403410260961733e-07, "loss": 0.22749040126800538, "step": 7100, "token_acc": 0.9113321799307958 }, { "epoch": 0.9095567512170125, "eval_loss": 0.3351185917854309, "eval_runtime": 103.2381, "eval_samples_per_second": 96.863, "eval_steps_per_second": 12.108, "eval_token_acc": 0.8822164762943564, "step": 7100 }, { "epoch": 0.9101972841404048, "grad_norm": 4.125, "learning_rate": 4.3781307829955375e-07, "loss": 0.22915854454040527, "step": 7105, "token_acc": 0.9114854122803249 }, { "epoch": 0.9108378170637971, "grad_norm": 2.9375, "learning_rate": 4.3163496556993143e-07, "loss": 0.22949614524841308, "step": 7110, "token_acc": 0.9098265398355787 }, { "epoch": 0.9114783499871894, "grad_norm": 4.65625, "learning_rate": 4.2549979214584703e-07, "loss": 0.234299373626709, "step": 7115, "token_acc": 0.9077444835579978 }, { "epoch": 0.9121188829105816, "grad_norm": 2.78125, "learning_rate": 4.194075855597046e-07, "loss": 0.21983301639556885, "step": 7120, "token_acc": 0.9148798481384012 }, { "epoch": 0.9127594158339739, "grad_norm": 2.59375, "learning_rate": 4.133583731510893e-07, "loss": 0.23418021202087402, "step": 7125, "token_acc": 0.9072267311345191 }, { "epoch": 0.9133999487573661, "grad_norm": 2.75, "learning_rate": 4.073521820666393e-07, "loss": 0.22026586532592773, "step": 7130, "token_acc": 0.9122671141517147 }, { "epoch": 0.9140404816807584, "grad_norm": 4.09375, "learning_rate": 4.0138903925993957e-07, "loss": 0.22925994396209717, "step": 7135, "token_acc": 0.910432351043235 }, { "epoch": 0.9146810146041506, "grad_norm": 3.140625, "learning_rate": 3.954689714913762e-07, "loss": 0.22760000228881835, "step": 7140, "token_acc": 0.911580763424628 }, { "epoch": 0.9153215475275429, "grad_norm": 3.9375, "learning_rate": 3.895920053280422e-07, "loss": 0.22435307502746582, "step": 7145, "token_acc": 0.9124141209004882 }, { "epoch": 0.9159620804509352, "grad_norm": 2.9375, "learning_rate": 3.837581671435997e-07, "loss": 0.2232006549835205, "step": 7150, "token_acc": 0.911838464199239 }, { "epoch": 0.9166026133743275, "grad_norm": 3.671875, "learning_rate": 3.779674831181701e-07, "loss": 0.2235502243041992, "step": 7155, "token_acc": 0.9120366369999136 }, { "epoch": 0.9172431462977197, "grad_norm": 2.71875, "learning_rate": 3.722199792382164e-07, "loss": 0.22374234199523926, "step": 7160, "token_acc": 0.9131279129984464 }, { "epoch": 0.9178836792211119, "grad_norm": 3.828125, "learning_rate": 3.665156812964221e-07, "loss": 0.22843289375305176, "step": 7165, "token_acc": 0.9109647990360616 }, { "epoch": 0.9185242121445042, "grad_norm": 2.609375, "learning_rate": 3.608546148915804e-07, "loss": 0.22373640537261963, "step": 7170, "token_acc": 0.9105750592289468 }, { "epoch": 0.9191647450678965, "grad_norm": 3.453125, "learning_rate": 3.552368054284772e-07, "loss": 0.21737513542175294, "step": 7175, "token_acc": 0.9161251191404558 }, { "epoch": 0.9198052779912887, "grad_norm": 3.515625, "learning_rate": 3.496622781177761e-07, "loss": 0.22703731060028076, "step": 7180, "token_acc": 0.9097332931190486 }, { "epoch": 0.920445810914681, "grad_norm": 3.984375, "learning_rate": 3.441310579759072e-07, "loss": 0.22722623348236085, "step": 7185, "token_acc": 0.9093100331425128 }, { "epoch": 0.9210863438380733, "grad_norm": 3.234375, "learning_rate": 3.386431698249526e-07, "loss": 0.2288762092590332, "step": 7190, "token_acc": 0.9103326439158911 }, { "epoch": 0.9217268767614656, "grad_norm": 3.484375, "learning_rate": 3.3319863829253895e-07, "loss": 0.22250890731811523, "step": 7195, "token_acc": 0.9124709527498064 }, { "epoch": 0.9223674096848578, "grad_norm": 3.390625, "learning_rate": 3.277974878117207e-07, "loss": 0.22609634399414064, "step": 7200, "token_acc": 0.9101618122977346 }, { "epoch": 0.9223674096848578, "eval_loss": 0.3345736861228943, "eval_runtime": 102.7358, "eval_samples_per_second": 97.337, "eval_steps_per_second": 12.167, "eval_token_acc": 0.8823410534242101, "step": 7200 }, { "epoch": 0.92300794260825, "grad_norm": 3.390625, "learning_rate": 3.2243974262087805e-07, "loss": 0.2214569091796875, "step": 7205, "token_acc": 0.9122594594594594 }, { "epoch": 0.9236484755316423, "grad_norm": 3.125, "learning_rate": 3.171254267636015e-07, "loss": 0.23588757514953612, "step": 7210, "token_acc": 0.9061101549053356 }, { "epoch": 0.9242890084550346, "grad_norm": 5.15625, "learning_rate": 3.1185456408858505e-07, "loss": 0.22405190467834474, "step": 7215, "token_acc": 0.9125026992010364 }, { "epoch": 0.9249295413784269, "grad_norm": 2.921875, "learning_rate": 3.0662717824952894e-07, "loss": 0.22633728981018067, "step": 7220, "token_acc": 0.9114834596829773 }, { "epoch": 0.9255700743018191, "grad_norm": 2.890625, "learning_rate": 3.014432927050126e-07, "loss": 0.22840723991394044, "step": 7225, "token_acc": 0.910606582801999 }, { "epoch": 0.9262106072252114, "grad_norm": 4.09375, "learning_rate": 2.9630293071841397e-07, "loss": 0.22615447044372558, "step": 7230, "token_acc": 0.9125701943844492 }, { "epoch": 0.9268511401486036, "grad_norm": 8.75, "learning_rate": 2.912061153577872e-07, "loss": 0.22545180320739747, "step": 7235, "token_acc": 0.9107838891294933 }, { "epoch": 0.9274916730719959, "grad_norm": 3.375, "learning_rate": 2.861528694957649e-07, "loss": 0.22807738780975342, "step": 7240, "token_acc": 0.9106092073381793 }, { "epoch": 0.9281322059953881, "grad_norm": 3.140625, "learning_rate": 2.8114321580945846e-07, "loss": 0.23368797302246094, "step": 7245, "token_acc": 0.9072138340431023 }, { "epoch": 0.9287727389187804, "grad_norm": 3.359375, "learning_rate": 2.761771767803512e-07, "loss": 0.2348182201385498, "step": 7250, "token_acc": 0.9079960428405522 }, { "epoch": 0.9294132718421727, "grad_norm": 11.25, "learning_rate": 2.71254774694204e-07, "loss": 0.22567691802978515, "step": 7255, "token_acc": 0.9117279965569185 }, { "epoch": 0.930053804765565, "grad_norm": 2.546875, "learning_rate": 2.6637603164094584e-07, "loss": 0.2227564811706543, "step": 7260, "token_acc": 0.9113984055160526 }, { "epoch": 0.9306943376889573, "grad_norm": 12.5625, "learning_rate": 2.615409695145832e-07, "loss": 0.22351694107055664, "step": 7265, "token_acc": 0.9124437910757524 }, { "epoch": 0.9313348706123494, "grad_norm": 3.125, "learning_rate": 2.567496100130973e-07, "loss": 0.22547354698181152, "step": 7270, "token_acc": 0.9113066735688711 }, { "epoch": 0.9319754035357417, "grad_norm": 3.453125, "learning_rate": 2.5200197463834843e-07, "loss": 0.23171014785766603, "step": 7275, "token_acc": 0.9080668134144763 }, { "epoch": 0.932615936459134, "grad_norm": 9.375, "learning_rate": 2.472980846959794e-07, "loss": 0.22420947551727294, "step": 7280, "token_acc": 0.9112663303582977 }, { "epoch": 0.9332564693825263, "grad_norm": 9.25, "learning_rate": 2.4263796129532e-07, "loss": 0.22904155254364014, "step": 7285, "token_acc": 0.9106152457113376 }, { "epoch": 0.9338970023059185, "grad_norm": 4.625, "learning_rate": 2.3802162534929063e-07, "loss": 0.22856383323669432, "step": 7290, "token_acc": 0.9091379087501615 }, { "epoch": 0.9345375352293108, "grad_norm": 3.203125, "learning_rate": 2.33449097574312e-07, "loss": 0.23378937244415282, "step": 7295, "token_acc": 0.90822689545435 }, { "epoch": 0.9351780681527031, "grad_norm": 2.71875, "learning_rate": 2.2892039849020552e-07, "loss": 0.22789459228515624, "step": 7300, "token_acc": 0.9115628641719539 }, { "epoch": 0.9351780681527031, "eval_loss": 0.3358408808708191, "eval_runtime": 109.8432, "eval_samples_per_second": 91.039, "eval_steps_per_second": 11.38, "eval_token_acc": 0.882407494560132, "step": 7300 }, { "epoch": 0.9358186010760953, "grad_norm": 4.15625, "learning_rate": 2.2443554842011107e-07, "loss": 0.22101092338562012, "step": 7305, "token_acc": 0.9120034542314335 }, { "epoch": 0.9364591339994875, "grad_norm": 2.796875, "learning_rate": 2.199945674903836e-07, "loss": 0.22407989501953124, "step": 7310, "token_acc": 0.9132890651948948 }, { "epoch": 0.9370996669228798, "grad_norm": 3.25, "learning_rate": 2.155974756305157e-07, "loss": 0.22648565769195556, "step": 7315, "token_acc": 0.9100392258286996 }, { "epoch": 0.9377401998462721, "grad_norm": 3.484375, "learning_rate": 2.112442925730407e-07, "loss": 0.2312589168548584, "step": 7320, "token_acc": 0.9074082027056045 }, { "epoch": 0.9383807327696644, "grad_norm": 3.171875, "learning_rate": 2.0693503785344294e-07, "loss": 0.2254408359527588, "step": 7325, "token_acc": 0.9107181367263317 }, { "epoch": 0.9390212656930567, "grad_norm": 4.40625, "learning_rate": 2.0266973081007335e-07, "loss": 0.22427871227264404, "step": 7330, "token_acc": 0.9129628831314394 }, { "epoch": 0.9396617986164489, "grad_norm": 2.53125, "learning_rate": 1.9844839058406174e-07, "loss": 0.23152542114257812, "step": 7335, "token_acc": 0.9090241796200346 }, { "epoch": 0.9403023315398411, "grad_norm": 3.28125, "learning_rate": 1.9427103611923458e-07, "loss": 0.23547761440277098, "step": 7340, "token_acc": 0.9070280082987552 }, { "epoch": 0.9409428644632334, "grad_norm": 3.71875, "learning_rate": 1.9013768616201856e-07, "loss": 0.23559412956237794, "step": 7345, "token_acc": 0.9068175949040199 }, { "epoch": 0.9415833973866257, "grad_norm": 6.5, "learning_rate": 1.860483592613749e-07, "loss": 0.23159332275390626, "step": 7350, "token_acc": 0.9084540336098337 }, { "epoch": 0.9422239303100179, "grad_norm": 3.40625, "learning_rate": 1.8200307376869396e-07, "loss": 0.228605318069458, "step": 7355, "token_acc": 0.9085082587749483 }, { "epoch": 0.9428644632334102, "grad_norm": 3.234375, "learning_rate": 1.7800184783773433e-07, "loss": 0.22635889053344727, "step": 7360, "token_acc": 0.9109769247358206 }, { "epoch": 0.9435049961568025, "grad_norm": 3.484375, "learning_rate": 1.7404469942452597e-07, "loss": 0.21885204315185547, "step": 7365, "token_acc": 0.9139455635595048 }, { "epoch": 0.9441455290801947, "grad_norm": 3.234375, "learning_rate": 1.7013164628729483e-07, "loss": 0.22530250549316405, "step": 7370, "token_acc": 0.9120013769955678 }, { "epoch": 0.944786062003587, "grad_norm": 3.421875, "learning_rate": 1.6626270598638972e-07, "loss": 0.23129520416259766, "step": 7375, "token_acc": 0.9073307460112118 }, { "epoch": 0.9454265949269792, "grad_norm": 3.359375, "learning_rate": 1.624378958841888e-07, "loss": 0.23236556053161622, "step": 7380, "token_acc": 0.9091888166113815 }, { "epoch": 0.9460671278503715, "grad_norm": 4.34375, "learning_rate": 1.5865723314503535e-07, "loss": 0.2195420265197754, "step": 7385, "token_acc": 0.9148448976064979 }, { "epoch": 0.9467076607737638, "grad_norm": 3.5625, "learning_rate": 1.5492073473515334e-07, "loss": 0.2299337387084961, "step": 7390, "token_acc": 0.9092899459088177 }, { "epoch": 0.9473481936971561, "grad_norm": 3.296875, "learning_rate": 1.5122841742257533e-07, "loss": 0.2305469512939453, "step": 7395, "token_acc": 0.9094121703154628 }, { "epoch": 0.9479887266205483, "grad_norm": 3.0, "learning_rate": 1.475802977770646e-07, "loss": 0.2353046417236328, "step": 7400, "token_acc": 0.9065898637321068 }, { "epoch": 0.9479887266205483, "eval_loss": 0.3345721662044525, "eval_runtime": 102.9085, "eval_samples_per_second": 97.174, "eval_steps_per_second": 12.147, "eval_token_acc": 0.8821223513518003, "step": 7400 }, { "epoch": 0.9486292595439405, "grad_norm": 3.3125, "learning_rate": 1.43976392170041e-07, "loss": 0.22667450904846193, "step": 7405, "token_acc": 0.9105691056910569 }, { "epoch": 0.9492697924673328, "grad_norm": 3.65625, "learning_rate": 1.404167167745074e-07, "loss": 0.23315582275390626, "step": 7410, "token_acc": 0.9090674355553643 }, { "epoch": 0.9499103253907251, "grad_norm": 4.09375, "learning_rate": 1.3690128756498e-07, "loss": 0.2316906452178955, "step": 7415, "token_acc": 0.9079900017238407 }, { "epoch": 0.9505508583141173, "grad_norm": 3.5, "learning_rate": 1.3343012031741155e-07, "loss": 0.22472708225250243, "step": 7420, "token_acc": 0.9114065659825309 }, { "epoch": 0.9511913912375096, "grad_norm": 3.453125, "learning_rate": 1.30003230609127e-07, "loss": 0.22815487384796143, "step": 7425, "token_acc": 0.9114841828147253 }, { "epoch": 0.9518319241609019, "grad_norm": 3.375, "learning_rate": 1.266206338187448e-07, "loss": 0.2252589225769043, "step": 7430, "token_acc": 0.911378744712078 }, { "epoch": 0.9524724570842942, "grad_norm": 4.03125, "learning_rate": 1.2328234512611893e-07, "loss": 0.23881807327270507, "step": 7435, "token_acc": 0.9063224808865218 }, { "epoch": 0.9531129900076863, "grad_norm": 3.765625, "learning_rate": 1.1998837951226027e-07, "loss": 0.2236201286315918, "step": 7440, "token_acc": 0.9124207256568445 }, { "epoch": 0.9537535229310786, "grad_norm": 3.671875, "learning_rate": 1.1673875175927773e-07, "loss": 0.22488207817077638, "step": 7445, "token_acc": 0.9117697816895332 }, { "epoch": 0.9543940558544709, "grad_norm": 15.125, "learning_rate": 1.1353347645030488e-07, "loss": 0.23006877899169922, "step": 7450, "token_acc": 0.9091143483305402 }, { "epoch": 0.9550345887778632, "grad_norm": 3.90625, "learning_rate": 1.1037256796943896e-07, "loss": 0.23117449283599853, "step": 7455, "token_acc": 0.9106395825246906 }, { "epoch": 0.9556751217012555, "grad_norm": 3.453125, "learning_rate": 1.072560405016776e-07, "loss": 0.22410707473754882, "step": 7460, "token_acc": 0.913257805067889 }, { "epoch": 0.9563156546246477, "grad_norm": 3.1875, "learning_rate": 1.0418390803284772e-07, "loss": 0.22124795913696288, "step": 7465, "token_acc": 0.9126255442044916 }, { "epoch": 0.95695618754804, "grad_norm": 3.0, "learning_rate": 1.0115618434955233e-07, "loss": 0.22695465087890626, "step": 7470, "token_acc": 0.9115334773218142 }, { "epoch": 0.9575967204714322, "grad_norm": 2.796875, "learning_rate": 9.817288303910267e-08, "loss": 0.22336146831512452, "step": 7475, "token_acc": 0.9113754903228587 }, { "epoch": 0.9582372533948245, "grad_norm": 2.828125, "learning_rate": 9.523401748945837e-08, "loss": 0.22532784938812256, "step": 7480, "token_acc": 0.9119910089046425 }, { "epoch": 0.9588777863182167, "grad_norm": 5.875, "learning_rate": 9.233960088916749e-08, "loss": 0.23188343048095703, "step": 7485, "token_acc": 0.9097377954114197 }, { "epoch": 0.959518319241609, "grad_norm": 12.875, "learning_rate": 8.948964622730761e-08, "loss": 0.22753703594207764, "step": 7490, "token_acc": 0.911497176359012 }, { "epoch": 0.9601588521650013, "grad_norm": 4.84375, "learning_rate": 8.668416629342813e-08, "loss": 0.23263895511627197, "step": 7495, "token_acc": 0.9096037898363479 }, { "epoch": 0.9607993850883936, "grad_norm": 3.40625, "learning_rate": 8.392317367749259e-08, "loss": 0.23171706199645997, "step": 7500, "token_acc": 0.9093455125166962 }, { "epoch": 0.9607993850883936, "eval_loss": 0.3347827196121216, "eval_runtime": 102.5334, "eval_samples_per_second": 97.529, "eval_steps_per_second": 12.191, "eval_token_acc": 0.8825099246446784, "step": 7500 }, { "epoch": 0.9614399180117859, "grad_norm": 2.796875, "learning_rate": 8.120668076982085e-08, "loss": 0.23077220916748048, "step": 7505, "token_acc": 0.9088870682592385 }, { "epoch": 0.962080450935178, "grad_norm": 3.296875, "learning_rate": 7.853469976103367e-08, "loss": 0.2174984931945801, "step": 7510, "token_acc": 0.9130472325360505 }, { "epoch": 0.9627209838585703, "grad_norm": 2.546875, "learning_rate": 7.590724264200044e-08, "loss": 0.2254131555557251, "step": 7515, "token_acc": 0.9099638305201516 }, { "epoch": 0.9633615167819626, "grad_norm": 2.828125, "learning_rate": 7.332432120378263e-08, "loss": 0.21682121753692626, "step": 7520, "token_acc": 0.9132147340172272 }, { "epoch": 0.9640020497053549, "grad_norm": 4.53125, "learning_rate": 7.07859470375838e-08, "loss": 0.22123939990997316, "step": 7525, "token_acc": 0.913344287814581 }, { "epoch": 0.9646425826287471, "grad_norm": 4.0, "learning_rate": 6.829213153469294e-08, "loss": 0.2257563591003418, "step": 7530, "token_acc": 0.9111034244206156 }, { "epoch": 0.9652831155521394, "grad_norm": 3.25, "learning_rate": 6.584288588643795e-08, "loss": 0.21516809463500977, "step": 7535, "token_acc": 0.9163821788168186 }, { "epoch": 0.9659236484755317, "grad_norm": 5.46875, "learning_rate": 6.343822108413111e-08, "loss": 0.23532419204711913, "step": 7540, "token_acc": 0.9062943071965628 }, { "epoch": 0.9665641813989239, "grad_norm": 3.515625, "learning_rate": 6.10781479190281e-08, "loss": 0.22491927146911622, "step": 7545, "token_acc": 0.9103519579545944 }, { "epoch": 0.9672047143223161, "grad_norm": 3.078125, "learning_rate": 5.8762676982265785e-08, "loss": 0.23122644424438477, "step": 7550, "token_acc": 0.9087346024636058 }, { "epoch": 0.9678452472457084, "grad_norm": 2.75, "learning_rate": 5.649181866483e-08, "loss": 0.22680530548095704, "step": 7555, "token_acc": 0.911484593837535 }, { "epoch": 0.9684857801691007, "grad_norm": 4.34375, "learning_rate": 5.426558315749675e-08, "loss": 0.22133951187133788, "step": 7560, "token_acc": 0.9124602287384986 }, { "epoch": 0.969126313092493, "grad_norm": 4.71875, "learning_rate": 5.208398045079222e-08, "loss": 0.2312103033065796, "step": 7565, "token_acc": 0.9101176672678862 }, { "epoch": 0.9697668460158853, "grad_norm": 3.078125, "learning_rate": 4.994702033494947e-08, "loss": 0.22229225635528566, "step": 7570, "token_acc": 0.9136234136234136 }, { "epoch": 0.9704073789392775, "grad_norm": 3.125, "learning_rate": 4.785471239985851e-08, "loss": 0.2287161111831665, "step": 7575, "token_acc": 0.9087663454920853 }, { "epoch": 0.9710479118626697, "grad_norm": 2.75, "learning_rate": 4.5807066035028494e-08, "loss": 0.226922607421875, "step": 7580, "token_acc": 0.9117697816895332 }, { "epoch": 0.971688444786062, "grad_norm": 7.21875, "learning_rate": 4.3804090429543366e-08, "loss": 0.23184614181518554, "step": 7585, "token_acc": 0.9090673798636874 }, { "epoch": 0.9723289777094543, "grad_norm": 3.640625, "learning_rate": 4.184579457202298e-08, "loss": 0.22905006408691406, "step": 7590, "token_acc": 0.9108255451713395 }, { "epoch": 0.9729695106328465, "grad_norm": 3.046875, "learning_rate": 3.993218725057868e-08, "loss": 0.22608802318572999, "step": 7595, "token_acc": 0.911466643667256 }, { "epoch": 0.9736100435562388, "grad_norm": 3.140625, "learning_rate": 3.806327705277557e-08, "loss": 0.23126821517944335, "step": 7600, "token_acc": 0.9085889305897071 }, { "epoch": 0.9736100435562388, "eval_loss": 0.33461424708366394, "eval_runtime": 103.0674, "eval_samples_per_second": 97.024, "eval_steps_per_second": 12.128, "eval_token_acc": 0.8822912225722686, "step": 7600 }, { "epoch": 0.9742505764796311, "grad_norm": 2.890625, "learning_rate": 3.6239072365596984e-08, "loss": 0.23053784370422364, "step": 7605, "token_acc": 0.9106689064047896 }, { "epoch": 0.9748911094030234, "grad_norm": 3.484375, "learning_rate": 3.4459581375403395e-08, "loss": 0.22285847663879393, "step": 7610, "token_acc": 0.9107173725151253 }, { "epoch": 0.9755316423264155, "grad_norm": 3.390625, "learning_rate": 3.2724812067895795e-08, "loss": 0.22678759098052978, "step": 7615, "token_acc": 0.9103671706263499 }, { "epoch": 0.9761721752498078, "grad_norm": 3.125, "learning_rate": 3.103477222808016e-08, "loss": 0.22554678916931153, "step": 7620, "token_acc": 0.9114635094845094 }, { "epoch": 0.9768127081732001, "grad_norm": 3.171875, "learning_rate": 2.938946944023635e-08, "loss": 0.2290804386138916, "step": 7625, "token_acc": 0.9116782006920415 }, { "epoch": 0.9774532410965924, "grad_norm": 2.96875, "learning_rate": 2.7788911087877067e-08, "loss": 0.22720465660095215, "step": 7630, "token_acc": 0.910641229921192 }, { "epoch": 0.9780937740199847, "grad_norm": 3.40625, "learning_rate": 2.6233104353720063e-08, "loss": 0.22470946311950685, "step": 7635, "token_acc": 0.9122565074987071 }, { "epoch": 0.9787343069433769, "grad_norm": 3.4375, "learning_rate": 2.4722056219654843e-08, "loss": 0.22875847816467285, "step": 7640, "token_acc": 0.9108098773959592 }, { "epoch": 0.9793748398667691, "grad_norm": 3.609375, "learning_rate": 2.3255773466708266e-08, "loss": 0.2208240509033203, "step": 7645, "token_acc": 0.9132234969378072 }, { "epoch": 0.9800153727901614, "grad_norm": 4.75, "learning_rate": 2.1834262675021202e-08, "loss": 0.2248084306716919, "step": 7650, "token_acc": 0.9121955410455775 }, { "epoch": 0.9806559057135537, "grad_norm": 3.625, "learning_rate": 2.0457530223809695e-08, "loss": 0.2183553695678711, "step": 7655, "token_acc": 0.9145100069013112 }, { "epoch": 0.9812964386369459, "grad_norm": 12.8125, "learning_rate": 1.912558229134387e-08, "loss": 0.22381486892700195, "step": 7660, "token_acc": 0.9117545822218398 }, { "epoch": 0.9819369715603382, "grad_norm": 2.734375, "learning_rate": 1.7838424854915714e-08, "loss": 0.22946505546569823, "step": 7665, "token_acc": 0.9092513668259503 }, { "epoch": 0.9825775044837305, "grad_norm": 3.125, "learning_rate": 1.659606369081468e-08, "loss": 0.23253355026245118, "step": 7670, "token_acc": 0.9086559186136736 }, { "epoch": 0.9832180374071228, "grad_norm": 4.375, "learning_rate": 1.5398504374302124e-08, "loss": 0.21708984375, "step": 7675, "token_acc": 0.9138436341694681 }, { "epoch": 0.9838585703305149, "grad_norm": 6.65625, "learning_rate": 1.424575227958358e-08, "loss": 0.2197282314300537, "step": 7680, "token_acc": 0.9130565972671236 }, { "epoch": 0.9844991032539072, "grad_norm": 3.65625, "learning_rate": 1.3137812579785415e-08, "loss": 0.22876739501953125, "step": 7685, "token_acc": 0.9095221666379162 }, { "epoch": 0.9851396361772995, "grad_norm": 2.953125, "learning_rate": 1.2074690246937081e-08, "loss": 0.22080717086791993, "step": 7690, "token_acc": 0.9124427942319316 }, { "epoch": 0.9857801691006918, "grad_norm": 5.46875, "learning_rate": 1.1056390051936705e-08, "loss": 0.23291680812835694, "step": 7695, "token_acc": 0.9067782067782068 }, { "epoch": 0.986420702024084, "grad_norm": 3.1875, "learning_rate": 1.008291656454441e-08, "loss": 0.22717700004577637, "step": 7700, "token_acc": 0.9099401610056395 }, { "epoch": 0.986420702024084, "eval_loss": 0.33420565724372864, "eval_runtime": 102.458, "eval_samples_per_second": 97.601, "eval_steps_per_second": 12.2, "eval_token_acc": 0.8827535421430588, "step": 7700 }, { "epoch": 0.9870612349474763, "grad_norm": 2.84375, "learning_rate": 9.154274153351239e-09, "loss": 0.2262244701385498, "step": 7705, "token_acc": 0.9095655175389394 }, { "epoch": 0.9877017678708686, "grad_norm": 2.875, "learning_rate": 8.270466985761393e-09, "loss": 0.22812366485595703, "step": 7710, "token_acc": 0.9118508311084317 }, { "epoch": 0.9883423007942608, "grad_norm": 2.875, "learning_rate": 7.431499027976685e-09, "loss": 0.2235403537750244, "step": 7715, "token_acc": 0.9136387118994317 }, { "epoch": 0.988982833717653, "grad_norm": 2.84375, "learning_rate": 6.637374044978772e-09, "loss": 0.2251359224319458, "step": 7720, "token_acc": 0.9123222748815166 }, { "epoch": 0.9896233666410453, "grad_norm": 3.171875, "learning_rate": 5.88809560050696e-09, "loss": 0.22147438526153565, "step": 7725, "token_acc": 0.9122496546961326 }, { "epoch": 0.9902638995644376, "grad_norm": 11.375, "learning_rate": 5.1836670570493135e-09, "loss": 0.23077549934387206, "step": 7730, "token_acc": 0.9092400690846286 }, { "epoch": 0.9909044324878299, "grad_norm": 3.09375, "learning_rate": 4.524091575819345e-09, "loss": 0.23009955883026123, "step": 7735, "token_acc": 0.9085662603901977 }, { "epoch": 0.9915449654112222, "grad_norm": 3.125, "learning_rate": 3.9093721167526854e-09, "loss": 0.2305884838104248, "step": 7740, "token_acc": 0.9105272196462305 }, { "epoch": 0.9921854983346144, "grad_norm": 2.859375, "learning_rate": 3.339511438481546e-09, "loss": 0.23010706901550293, "step": 7745, "token_acc": 0.9100301334481274 }, { "epoch": 0.9928260312580066, "grad_norm": 2.671875, "learning_rate": 2.8145120983336106e-09, "loss": 0.23720641136169435, "step": 7750, "token_acc": 0.9077843280691941 }, { "epoch": 0.9934665641813989, "grad_norm": 3.515625, "learning_rate": 2.334376452310938e-09, "loss": 0.2344132900238037, "step": 7755, "token_acc": 0.9084367459496725 }, { "epoch": 0.9941070971047912, "grad_norm": 3.03125, "learning_rate": 1.899106655087746e-09, "loss": 0.23155610561370848, "step": 7760, "token_acc": 0.9105244966732913 }, { "epoch": 0.9947476300281834, "grad_norm": 2.8125, "learning_rate": 1.5087046599926435e-09, "loss": 0.22453222274780274, "step": 7765, "token_acc": 0.9133017649591046 }, { "epoch": 0.9953881629515757, "grad_norm": 2.984375, "learning_rate": 1.1631722190086348e-09, "loss": 0.22471303939819337, "step": 7770, "token_acc": 0.9115940774092995 }, { "epoch": 0.996028695874968, "grad_norm": 3.28125, "learning_rate": 8.625108827564621e-10, "loss": 0.22228624820709228, "step": 7775, "token_acc": 0.9125803251822142 }, { "epoch": 0.9966692287983603, "grad_norm": 4.40625, "learning_rate": 6.067220004946084e-10, "loss": 0.226347017288208, "step": 7780, "token_acc": 0.9129628831314394 }, { "epoch": 0.9973097617217525, "grad_norm": 2.671875, "learning_rate": 3.958067201093041e-10, "loss": 0.2226627826690674, "step": 7785, "token_acc": 0.91291213533575 }, { "epoch": 0.9979502946451447, "grad_norm": 5.34375, "learning_rate": 2.297659881111969e-10, "loss": 0.22344522476196288, "step": 7790, "token_acc": 0.9124097007223942 }, { "epoch": 0.998590827568537, "grad_norm": 2.765625, "learning_rate": 1.0860054962980038e-10, "loss": 0.22511889934539794, "step": 7795, "token_acc": 0.9111601540525337 }, { "epoch": 0.9992313604919293, "grad_norm": 2.859375, "learning_rate": 3.23109484112738e-11, "loss": 0.22252602577209474, "step": 7800, "token_acc": 0.9127169127169127 }, { "epoch": 0.9992313604919293, "eval_loss": 0.3345825672149658, "eval_runtime": 103.1214, "eval_samples_per_second": 96.973, "eval_steps_per_second": 12.122, "eval_token_acc": 0.8827258583364247, "step": 7800 }, { "epoch": 0.9998718934153216, "grad_norm": 7.1875, "learning_rate": 8.975268150912541e-13, "loss": 0.22932782173156738, "step": 7805, "token_acc": 0.9116207163102293 }, { "epoch": 1.0, "eval_loss": 0.33455130457878113, "eval_runtime": 101.6743, "eval_samples_per_second": 98.353, "eval_steps_per_second": 12.294, "eval_token_acc": 0.8823687372308442, "step": 7806 } ], "logging_steps": 5, "max_steps": 7806, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.2803424419153183e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }