diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,81976 @@ +{ + "best_global_step": 6824, + "best_metric": 3.07427096, + "best_model_checkpoint": "/inspire/hdd/project/deepanalysis/guitao-25013/Muse/workspace/Finals/ckpt/Muse_4b_main_2e-4/v0-20251228-195009/checkpoint-6824", + "epoch": 6.0, + "eval_steps": 500, + "global_step": 10236, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005863383172090296, + "grad_norm": 584.2804376471709, + "learning_rate": 1.1723329425556858e-07, + "loss": 24.62055206298828, + "step": 1, + "token_acc": 0.007686354255189996 + }, + { + "epoch": 0.0011726766344180592, + "grad_norm": 585.5252394109084, + "learning_rate": 2.3446658851113715e-07, + "loss": 24.677995681762695, + "step": 2, + "token_acc": 0.008093503709854125 + }, + { + "epoch": 0.001759014951627089, + "grad_norm": 585.7065303934635, + "learning_rate": 3.516998827667058e-07, + "loss": 24.603187561035156, + "step": 3, + "token_acc": 0.008458956494478706 + }, + { + "epoch": 0.0023453532688361184, + "grad_norm": 574.3386500798841, + "learning_rate": 4.689331770222743e-07, + "loss": 24.61533546447754, + "step": 4, + "token_acc": 0.008429689439370778 + }, + { + "epoch": 0.002931691586045148, + "grad_norm": 586.9828711617042, + "learning_rate": 5.861664712778429e-07, + "loss": 24.568662643432617, + "step": 5, + "token_acc": 0.00815803723341387 + }, + { + "epoch": 0.003518029903254178, + "grad_norm": 581.1894751304563, + "learning_rate": 7.033997655334116e-07, + "loss": 24.425533294677734, + "step": 6, + "token_acc": 0.008231019863606543 + }, + { + "epoch": 0.004104368220463207, + "grad_norm": 576.582735702365, + "learning_rate": 8.206330597889802e-07, + "loss": 24.278076171875, + "step": 7, + "token_acc": 0.008396241956107345 + }, + { + "epoch": 0.004690706537672237, + "grad_norm": 559.7170641683947, + "learning_rate": 9.378663540445486e-07, + "loss": 23.611698150634766, + "step": 8, + "token_acc": 0.008459112531510722 + }, + { + "epoch": 0.005277044854881266, + "grad_norm": 553.2699738013441, + "learning_rate": 1.0550996483001172e-06, + "loss": 23.430171966552734, + "step": 9, + "token_acc": 0.008667536718691499 + }, + { + "epoch": 0.005863383172090296, + "grad_norm": 470.6857657644861, + "learning_rate": 1.1723329425556858e-06, + "loss": 21.311586380004883, + "step": 10, + "token_acc": 0.008872265692218927 + }, + { + "epoch": 0.006449721489299325, + "grad_norm": 465.08434745509754, + "learning_rate": 1.2895662368112545e-06, + "loss": 21.151081085205078, + "step": 11, + "token_acc": 0.009220426307274241 + }, + { + "epoch": 0.007036059806508356, + "grad_norm": 435.77600914753805, + "learning_rate": 1.4067995310668232e-06, + "loss": 20.59938621520996, + "step": 12, + "token_acc": 0.009601930789116425 + }, + { + "epoch": 0.007622398123717385, + "grad_norm": 247.10891420254228, + "learning_rate": 1.5240328253223916e-06, + "loss": 18.007747650146484, + "step": 13, + "token_acc": 0.007578861663965909 + }, + { + "epoch": 0.008208736440926415, + "grad_norm": 237.9853724112765, + "learning_rate": 1.6412661195779603e-06, + "loss": 17.838977813720703, + "step": 14, + "token_acc": 0.007391450069974441 + }, + { + "epoch": 0.008795074758135445, + "grad_norm": 226.18401166349722, + "learning_rate": 1.7584994138335288e-06, + "loss": 17.58853530883789, + "step": 15, + "token_acc": 0.00753833547933981 + }, + { + "epoch": 0.009381413075344474, + "grad_norm": 212.04199238263578, + "learning_rate": 1.8757327080890972e-06, + "loss": 17.134864807128906, + "step": 16, + "token_acc": 0.00739965274231437 + }, + { + "epoch": 0.009967751392553504, + "grad_norm": 203.06989705564112, + "learning_rate": 1.992966002344666e-06, + "loss": 16.87917709350586, + "step": 17, + "token_acc": 0.007535626739789771 + }, + { + "epoch": 0.010554089709762533, + "grad_norm": 215.18456945822558, + "learning_rate": 2.1101992966002344e-06, + "loss": 15.041175842285156, + "step": 18, + "token_acc": 0.007634953613450764 + }, + { + "epoch": 0.011140428026971563, + "grad_norm": 227.02143780993362, + "learning_rate": 2.2274325908558035e-06, + "loss": 14.484148025512695, + "step": 19, + "token_acc": 0.006797998652741568 + }, + { + "epoch": 0.011726766344180592, + "grad_norm": 183.1105892045818, + "learning_rate": 2.3446658851113717e-06, + "loss": 14.200407028198242, + "step": 20, + "token_acc": 0.0065096879784594264 + }, + { + "epoch": 0.012313104661389622, + "grad_norm": 129.9586725795648, + "learning_rate": 2.4618991793669404e-06, + "loss": 13.929039001464844, + "step": 21, + "token_acc": 0.006199476003017114 + }, + { + "epoch": 0.01289944297859865, + "grad_norm": 93.69074694202115, + "learning_rate": 2.579132473622509e-06, + "loss": 13.488332748413086, + "step": 22, + "token_acc": 0.005730457203949437 + }, + { + "epoch": 0.013485781295807681, + "grad_norm": 74.83117664490601, + "learning_rate": 2.6963657678780773e-06, + "loss": 13.16888427734375, + "step": 23, + "token_acc": 0.005586667167793307 + }, + { + "epoch": 0.014072119613016711, + "grad_norm": 59.01013500162835, + "learning_rate": 2.8135990621336464e-06, + "loss": 12.899251937866211, + "step": 24, + "token_acc": 0.005102690654354402 + }, + { + "epoch": 0.01465845793022574, + "grad_norm": 27.82824358924883, + "learning_rate": 2.9308323563892146e-06, + "loss": 12.312295913696289, + "step": 25, + "token_acc": 0.003928652065929104 + }, + { + "epoch": 0.01524479624743477, + "grad_norm": 14.386423261749979, + "learning_rate": 3.0480656506447833e-06, + "loss": 12.14841079711914, + "step": 26, + "token_acc": 0.0035273685317681225 + }, + { + "epoch": 0.0158311345646438, + "grad_norm": 7.000013089518929, + "learning_rate": 3.1652989449003515e-06, + "loss": 12.062915802001953, + "step": 27, + "token_acc": 0.006908471711882418 + }, + { + "epoch": 0.01641747288185283, + "grad_norm": 4.7552241501145, + "learning_rate": 3.2825322391559206e-06, + "loss": 12.0260009765625, + "step": 28, + "token_acc": 0.008021770475822993 + }, + { + "epoch": 0.017003811199061858, + "grad_norm": 2.9608083367079754, + "learning_rate": 3.3997655334114893e-06, + "loss": 11.992738723754883, + "step": 29, + "token_acc": 0.00914451688065049 + }, + { + "epoch": 0.01759014951627089, + "grad_norm": 2.141838994655946, + "learning_rate": 3.5169988276670575e-06, + "loss": 11.975879669189453, + "step": 30, + "token_acc": 0.009006903876792352 + }, + { + "epoch": 0.01817648783347992, + "grad_norm": 1.2898203877804213, + "learning_rate": 3.6342321219226262e-06, + "loss": 11.957919120788574, + "step": 31, + "token_acc": 0.009075683445647444 + }, + { + "epoch": 0.018762826150688947, + "grad_norm": 0.9495857459426789, + "learning_rate": 3.7514654161781945e-06, + "loss": 11.939529418945312, + "step": 32, + "token_acc": 0.009957634316987306 + }, + { + "epoch": 0.019349164467897976, + "grad_norm": 0.6584091413204045, + "learning_rate": 3.8686987104337636e-06, + "loss": 11.937095642089844, + "step": 33, + "token_acc": 0.009662900129048503 + }, + { + "epoch": 0.019935502785107008, + "grad_norm": 0.688413907968875, + "learning_rate": 3.985932004689332e-06, + "loss": 11.92875862121582, + "step": 34, + "token_acc": 0.009882162229565434 + }, + { + "epoch": 0.020521841102316037, + "grad_norm": 0.5648461988789549, + "learning_rate": 4.103165298944901e-06, + "loss": 11.915830612182617, + "step": 35, + "token_acc": 0.010540905856617228 + }, + { + "epoch": 0.021108179419525065, + "grad_norm": 0.48416007694292196, + "learning_rate": 4.220398593200469e-06, + "loss": 11.917096138000488, + "step": 36, + "token_acc": 0.010059371030048482 + }, + { + "epoch": 0.021694517736734097, + "grad_norm": 0.46871120331217914, + "learning_rate": 4.337631887456037e-06, + "loss": 11.916532516479492, + "step": 37, + "token_acc": 0.009734524520581022 + }, + { + "epoch": 0.022280856053943126, + "grad_norm": 0.46509747869489587, + "learning_rate": 4.454865181711607e-06, + "loss": 11.914100646972656, + "step": 38, + "token_acc": 0.009585416977414215 + }, + { + "epoch": 0.022867194371152155, + "grad_norm": 0.4139732923446741, + "learning_rate": 4.572098475967175e-06, + "loss": 11.90422248840332, + "step": 39, + "token_acc": 0.010022894937674892 + }, + { + "epoch": 0.023453532688361183, + "grad_norm": 0.43569069749728834, + "learning_rate": 4.689331770222743e-06, + "loss": 11.899462699890137, + "step": 40, + "token_acc": 0.00998906306963179 + }, + { + "epoch": 0.024039871005570215, + "grad_norm": 0.4427395068446952, + "learning_rate": 4.806565064478312e-06, + "loss": 11.893888473510742, + "step": 41, + "token_acc": 0.009981918543302703 + }, + { + "epoch": 0.024626209322779244, + "grad_norm": 0.41295279047613054, + "learning_rate": 4.923798358733881e-06, + "loss": 11.887333869934082, + "step": 42, + "token_acc": 0.010011369961669247 + }, + { + "epoch": 0.025212547639988273, + "grad_norm": 0.44474419506351187, + "learning_rate": 5.041031652989449e-06, + "loss": 11.893207550048828, + "step": 43, + "token_acc": 0.008911209871034638 + }, + { + "epoch": 0.0257988859571973, + "grad_norm": 0.45640471360812557, + "learning_rate": 5.158264947245018e-06, + "loss": 11.87860107421875, + "step": 44, + "token_acc": 0.00932694862883434 + }, + { + "epoch": 0.026385224274406333, + "grad_norm": 0.5137403210835171, + "learning_rate": 5.275498241500587e-06, + "loss": 11.868346214294434, + "step": 45, + "token_acc": 0.009457271649877254 + }, + { + "epoch": 0.026971562591615362, + "grad_norm": 0.48121145685232086, + "learning_rate": 5.3927315357561546e-06, + "loss": 11.854333877563477, + "step": 46, + "token_acc": 0.009954296567402753 + }, + { + "epoch": 0.02755790090882439, + "grad_norm": 0.5757201494805352, + "learning_rate": 5.509964830011723e-06, + "loss": 11.848532676696777, + "step": 47, + "token_acc": 0.00974159406980461 + }, + { + "epoch": 0.028144239226033423, + "grad_norm": 0.538540019928557, + "learning_rate": 5.627198124267293e-06, + "loss": 11.834402084350586, + "step": 48, + "token_acc": 0.010114053269194533 + }, + { + "epoch": 0.02873057754324245, + "grad_norm": 0.6339441614898844, + "learning_rate": 5.7444314185228606e-06, + "loss": 11.829710006713867, + "step": 49, + "token_acc": 0.00965653692444547 + }, + { + "epoch": 0.02931691586045148, + "grad_norm": 0.6309417881063738, + "learning_rate": 5.861664712778429e-06, + "loss": 11.810378074645996, + "step": 50, + "token_acc": 0.010177322843888137 + }, + { + "epoch": 0.02990325417766051, + "grad_norm": 0.8037078188876334, + "learning_rate": 5.978898007033998e-06, + "loss": 11.799762725830078, + "step": 51, + "token_acc": 0.00994983299427297 + }, + { + "epoch": 0.03048959249486954, + "grad_norm": 0.7589222871188954, + "learning_rate": 6.096131301289567e-06, + "loss": 11.775461196899414, + "step": 52, + "token_acc": 0.010665230272017236 + }, + { + "epoch": 0.03107593081207857, + "grad_norm": 0.891510021568239, + "learning_rate": 6.213364595545135e-06, + "loss": 11.758807182312012, + "step": 53, + "token_acc": 0.010381088131673077 + }, + { + "epoch": 0.0316622691292876, + "grad_norm": 1.0221106596531047, + "learning_rate": 6.330597889800703e-06, + "loss": 11.738049507141113, + "step": 54, + "token_acc": 0.009833599549249058 + }, + { + "epoch": 0.03224860744649663, + "grad_norm": 1.2614499319305532, + "learning_rate": 6.447831184056272e-06, + "loss": 11.710914611816406, + "step": 55, + "token_acc": 0.009808991012105984 + }, + { + "epoch": 0.03283494576370566, + "grad_norm": 1.4989446931123451, + "learning_rate": 6.565064478311841e-06, + "loss": 11.674249649047852, + "step": 56, + "token_acc": 0.010150821438190143 + }, + { + "epoch": 0.03342128408091469, + "grad_norm": 2.0703781344125622, + "learning_rate": 6.68229777256741e-06, + "loss": 11.643367767333984, + "step": 57, + "token_acc": 0.009861996179177854 + }, + { + "epoch": 0.034007622398123716, + "grad_norm": 2.7785040239090337, + "learning_rate": 6.799531066822979e-06, + "loss": 11.613290786743164, + "step": 58, + "token_acc": 0.009541249531284511 + }, + { + "epoch": 0.034593960715332744, + "grad_norm": 2.457524019443084, + "learning_rate": 6.916764361078546e-06, + "loss": 11.562480926513672, + "step": 59, + "token_acc": 0.01030941659537149 + }, + { + "epoch": 0.03518029903254178, + "grad_norm": 15.595939019506787, + "learning_rate": 7.033997655334115e-06, + "loss": 11.552815437316895, + "step": 60, + "token_acc": 0.009987452006606464 + }, + { + "epoch": 0.03576663734975081, + "grad_norm": 2.9202319636395058, + "learning_rate": 7.151230949589684e-06, + "loss": 11.480331420898438, + "step": 61, + "token_acc": 0.010081490959432815 + }, + { + "epoch": 0.03635297566695984, + "grad_norm": 2.4298458485314085, + "learning_rate": 7.2684642438452524e-06, + "loss": 11.429304122924805, + "step": 62, + "token_acc": 0.009682069716251167 + }, + { + "epoch": 0.036939313984168866, + "grad_norm": 28.665591175799804, + "learning_rate": 7.38569753810082e-06, + "loss": 11.454853057861328, + "step": 63, + "token_acc": 0.009593557236477977 + }, + { + "epoch": 0.037525652301377894, + "grad_norm": 5.095442403329534, + "learning_rate": 7.502930832356389e-06, + "loss": 11.323440551757812, + "step": 64, + "token_acc": 0.009494669682006818 + }, + { + "epoch": 0.03811199061858692, + "grad_norm": 5.679296974668429, + "learning_rate": 7.620164126611958e-06, + "loss": 11.302069664001465, + "step": 65, + "token_acc": 0.009763125864741436 + }, + { + "epoch": 0.03869832893579595, + "grad_norm": 5.267458930049111, + "learning_rate": 7.737397420867527e-06, + "loss": 11.252420425415039, + "step": 66, + "token_acc": 0.009664113140836771 + }, + { + "epoch": 0.03928466725300499, + "grad_norm": 5.744586630979143, + "learning_rate": 7.854630715123097e-06, + "loss": 11.17559814453125, + "step": 67, + "token_acc": 0.009575511602198475 + }, + { + "epoch": 0.039871005570214016, + "grad_norm": 12.534206770958226, + "learning_rate": 7.971864009378664e-06, + "loss": 11.125930786132812, + "step": 68, + "token_acc": 0.009746186611111684 + }, + { + "epoch": 0.040457343887423045, + "grad_norm": 9.747991427787758, + "learning_rate": 8.089097303634232e-06, + "loss": 11.078652381896973, + "step": 69, + "token_acc": 0.01034937890285831 + }, + { + "epoch": 0.04104368220463207, + "grad_norm": 5.9133151907886115, + "learning_rate": 8.206330597889802e-06, + "loss": 11.016322135925293, + "step": 70, + "token_acc": 0.009599121765713144 + }, + { + "epoch": 0.0416300205218411, + "grad_norm": 7.909715485811744, + "learning_rate": 8.32356389214537e-06, + "loss": 10.97691535949707, + "step": 71, + "token_acc": 0.009460101329955669 + }, + { + "epoch": 0.04221635883905013, + "grad_norm": 4.201718910330622, + "learning_rate": 8.440797186400937e-06, + "loss": 10.892742156982422, + "step": 72, + "token_acc": 0.010042107779976887 + }, + { + "epoch": 0.04280269715625916, + "grad_norm": 5.546328235312145, + "learning_rate": 8.558030480656507e-06, + "loss": 10.84189224243164, + "step": 73, + "token_acc": 0.00965858873464549 + }, + { + "epoch": 0.043389035473468195, + "grad_norm": 5.738914691308477, + "learning_rate": 8.675263774912075e-06, + "loss": 10.781645774841309, + "step": 74, + "token_acc": 0.00988893140261154 + }, + { + "epoch": 0.04397537379067722, + "grad_norm": 3.862020542912135, + "learning_rate": 8.792497069167643e-06, + "loss": 10.690727233886719, + "step": 75, + "token_acc": 0.010311249857864977 + }, + { + "epoch": 0.04456171210788625, + "grad_norm": 4.329753834367066, + "learning_rate": 8.909730363423214e-06, + "loss": 10.62625503540039, + "step": 76, + "token_acc": 0.009845417740154582 + }, + { + "epoch": 0.04514805042509528, + "grad_norm": 5.141225655320356, + "learning_rate": 9.026963657678782e-06, + "loss": 10.559036254882812, + "step": 77, + "token_acc": 0.010230417527201116 + }, + { + "epoch": 0.04573438874230431, + "grad_norm": 7.79799248442612, + "learning_rate": 9.14419695193435e-06, + "loss": 10.475767135620117, + "step": 78, + "token_acc": 0.009848835496833646 + }, + { + "epoch": 0.04632072705951334, + "grad_norm": 5.76678187359056, + "learning_rate": 9.261430246189919e-06, + "loss": 10.391319274902344, + "step": 79, + "token_acc": 0.010374260292944979 + }, + { + "epoch": 0.046907065376722366, + "grad_norm": 4.725496790498841, + "learning_rate": 9.378663540445487e-06, + "loss": 10.323711395263672, + "step": 80, + "token_acc": 0.010034155478255395 + }, + { + "epoch": 0.047493403693931395, + "grad_norm": 6.699213727205319, + "learning_rate": 9.495896834701056e-06, + "loss": 10.274932861328125, + "step": 81, + "token_acc": 0.010143984210553942 + }, + { + "epoch": 0.04807974201114043, + "grad_norm": 7.551130505152835, + "learning_rate": 9.613130128956624e-06, + "loss": 10.208810806274414, + "step": 82, + "token_acc": 0.009235846796731522 + }, + { + "epoch": 0.04866608032834946, + "grad_norm": 4.267879546505137, + "learning_rate": 9.730363423212192e-06, + "loss": 10.137231826782227, + "step": 83, + "token_acc": 0.009774705608551664 + }, + { + "epoch": 0.04925241864555849, + "grad_norm": 5.926476780253599, + "learning_rate": 9.847596717467761e-06, + "loss": 10.076465606689453, + "step": 84, + "token_acc": 0.00937833267980549 + }, + { + "epoch": 0.049838756962767516, + "grad_norm": 9.139763924223722, + "learning_rate": 9.96483001172333e-06, + "loss": 10.046934127807617, + "step": 85, + "token_acc": 0.009814420553032374 + }, + { + "epoch": 0.050425095279976545, + "grad_norm": 3.0602941868449354, + "learning_rate": 1.0082063305978899e-05, + "loss": 9.955390930175781, + "step": 86, + "token_acc": 0.008730275995944383 + }, + { + "epoch": 0.051011433597185574, + "grad_norm": 4.009236491484756, + "learning_rate": 1.0199296600234468e-05, + "loss": 9.890663146972656, + "step": 87, + "token_acc": 0.009424335670251352 + }, + { + "epoch": 0.0515977719143946, + "grad_norm": 12.892889552308237, + "learning_rate": 1.0316529894490036e-05, + "loss": 9.887292861938477, + "step": 88, + "token_acc": 0.009787613606229357 + }, + { + "epoch": 0.05218411023160364, + "grad_norm": 7.248811196026036, + "learning_rate": 1.0433763188745604e-05, + "loss": 9.79570484161377, + "step": 89, + "token_acc": 0.009228338721805463 + }, + { + "epoch": 0.052770448548812667, + "grad_norm": 11.890879296453985, + "learning_rate": 1.0550996483001173e-05, + "loss": 9.779861450195312, + "step": 90, + "token_acc": 0.009727726177726582 + }, + { + "epoch": 0.053356786866021695, + "grad_norm": 9.250824864575373, + "learning_rate": 1.0668229777256741e-05, + "loss": 9.756024360656738, + "step": 91, + "token_acc": 0.010158740210668524 + }, + { + "epoch": 0.053943125183230724, + "grad_norm": 4.638097055819151, + "learning_rate": 1.0785463071512309e-05, + "loss": 9.630939483642578, + "step": 92, + "token_acc": 0.009815266901281929 + }, + { + "epoch": 0.05452946350043975, + "grad_norm": 16.090620397340597, + "learning_rate": 1.0902696365767879e-05, + "loss": 9.688409805297852, + "step": 93, + "token_acc": 0.009961478678110428 + }, + { + "epoch": 0.05511580181764878, + "grad_norm": 16.640299084865184, + "learning_rate": 1.1019929660023446e-05, + "loss": 9.71513557434082, + "step": 94, + "token_acc": 0.009475612488303746 + }, + { + "epoch": 0.05570214013485781, + "grad_norm": 11.309254640285038, + "learning_rate": 1.1137162954279014e-05, + "loss": 9.633021354675293, + "step": 95, + "token_acc": 0.01003043354640326 + }, + { + "epoch": 0.056288478452066845, + "grad_norm": 2.423524902674893, + "learning_rate": 1.1254396248534585e-05, + "loss": 9.489806175231934, + "step": 96, + "token_acc": 0.01216712036927447 + }, + { + "epoch": 0.056874816769275874, + "grad_norm": 2.753859300151003, + "learning_rate": 1.1371629542790153e-05, + "loss": 9.52678108215332, + "step": 97, + "token_acc": 0.011481548078652043 + }, + { + "epoch": 0.0574611550864849, + "grad_norm": 2.045608031903105, + "learning_rate": 1.1488862837045721e-05, + "loss": 9.473427772521973, + "step": 98, + "token_acc": 0.011738283236948985 + }, + { + "epoch": 0.05804749340369393, + "grad_norm": 1.9804489391067366, + "learning_rate": 1.160609613130129e-05, + "loss": 9.454517364501953, + "step": 99, + "token_acc": 0.010973224189932379 + }, + { + "epoch": 0.05863383172090296, + "grad_norm": 1.8514809324227361, + "learning_rate": 1.1723329425556858e-05, + "loss": 9.397886276245117, + "step": 100, + "token_acc": 0.011507198891024228 + }, + { + "epoch": 0.05922017003811199, + "grad_norm": 1.764706822609502, + "learning_rate": 1.1840562719812428e-05, + "loss": 9.400192260742188, + "step": 101, + "token_acc": 0.011799695525480124 + }, + { + "epoch": 0.05980650835532102, + "grad_norm": 1.9003077492099714, + "learning_rate": 1.1957796014067996e-05, + "loss": 9.331049919128418, + "step": 102, + "token_acc": 0.011042258066142622 + }, + { + "epoch": 0.06039284667253005, + "grad_norm": 1.4989488763131005, + "learning_rate": 1.2075029308323564e-05, + "loss": 9.37513542175293, + "step": 103, + "token_acc": 0.011396305491181799 + }, + { + "epoch": 0.06097918498973908, + "grad_norm": 1.5982239206402578, + "learning_rate": 1.2192262602579133e-05, + "loss": 9.262404441833496, + "step": 104, + "token_acc": 0.011589492193744338 + }, + { + "epoch": 0.06156552330694811, + "grad_norm": 1.340719061509751, + "learning_rate": 1.2309495896834701e-05, + "loss": 9.31516170501709, + "step": 105, + "token_acc": 0.011250097929125428 + }, + { + "epoch": 0.06215186162415714, + "grad_norm": 1.2080849609378068, + "learning_rate": 1.242672919109027e-05, + "loss": 9.287046432495117, + "step": 106, + "token_acc": 0.012050421500488889 + }, + { + "epoch": 0.06273819994136617, + "grad_norm": 1.3475762654660048, + "learning_rate": 1.254396248534584e-05, + "loss": 9.218914031982422, + "step": 107, + "token_acc": 0.01137484366861183 + }, + { + "epoch": 0.0633245382585752, + "grad_norm": 1.264803927924952, + "learning_rate": 1.2661195779601406e-05, + "loss": 9.20506477355957, + "step": 108, + "token_acc": 0.011012659841656362 + }, + { + "epoch": 0.06391087657578423, + "grad_norm": 1.0501627514033995, + "learning_rate": 1.2778429073856976e-05, + "loss": 9.221490859985352, + "step": 109, + "token_acc": 0.011815973159538777 + }, + { + "epoch": 0.06449721489299326, + "grad_norm": 1.1351548669273637, + "learning_rate": 1.2895662368112543e-05, + "loss": 9.166486740112305, + "step": 110, + "token_acc": 0.011135588616953858 + }, + { + "epoch": 0.06508355321020229, + "grad_norm": 1.151934615978955, + "learning_rate": 1.3012895662368113e-05, + "loss": 9.145608901977539, + "step": 111, + "token_acc": 0.011828176856410338 + }, + { + "epoch": 0.06566989152741132, + "grad_norm": 1.0585231459700444, + "learning_rate": 1.3130128956623683e-05, + "loss": 9.142675399780273, + "step": 112, + "token_acc": 0.011521914309518282 + }, + { + "epoch": 0.06625622984462035, + "grad_norm": 0.9875605344566676, + "learning_rate": 1.324736225087925e-05, + "loss": 9.238873481750488, + "step": 113, + "token_acc": 0.011856729851139261 + }, + { + "epoch": 0.06684256816182937, + "grad_norm": 1.0325355052786365, + "learning_rate": 1.336459554513482e-05, + "loss": 9.105939865112305, + "step": 114, + "token_acc": 0.011576966306819666 + }, + { + "epoch": 0.0674289064790384, + "grad_norm": 1.1325458448605066, + "learning_rate": 1.3481828839390388e-05, + "loss": 9.110050201416016, + "step": 115, + "token_acc": 0.011211430980672428 + }, + { + "epoch": 0.06801524479624743, + "grad_norm": 1.090821686429762, + "learning_rate": 1.3599062133645957e-05, + "loss": 9.112598419189453, + "step": 116, + "token_acc": 0.011903571597720325 + }, + { + "epoch": 0.06860158311345646, + "grad_norm": 0.952304136534885, + "learning_rate": 1.3716295427901523e-05, + "loss": 9.133591651916504, + "step": 117, + "token_acc": 0.012701033123868357 + }, + { + "epoch": 0.06918792143066549, + "grad_norm": 1.1331670203367967, + "learning_rate": 1.3833528722157093e-05, + "loss": 9.020330429077148, + "step": 118, + "token_acc": 0.012102275793135877 + }, + { + "epoch": 0.06977425974787452, + "grad_norm": 0.9634373184547826, + "learning_rate": 1.395076201641266e-05, + "loss": 9.114398956298828, + "step": 119, + "token_acc": 0.012448431162093276 + }, + { + "epoch": 0.07036059806508356, + "grad_norm": 0.8265779678478534, + "learning_rate": 1.406799531066823e-05, + "loss": 9.048168182373047, + "step": 120, + "token_acc": 0.011561001740279884 + }, + { + "epoch": 0.07094693638229259, + "grad_norm": 0.833447735319011, + "learning_rate": 1.41852286049238e-05, + "loss": 9.062210083007812, + "step": 121, + "token_acc": 0.01157997128167122 + }, + { + "epoch": 0.07153327469950162, + "grad_norm": 0.7656553702940753, + "learning_rate": 1.4302461899179368e-05, + "loss": 9.11124038696289, + "step": 122, + "token_acc": 0.012586028094654473 + }, + { + "epoch": 0.07211961301671065, + "grad_norm": 0.7852937585282732, + "learning_rate": 1.4419695193434937e-05, + "loss": 9.070611000061035, + "step": 123, + "token_acc": 0.011125447142537066 + }, + { + "epoch": 0.07270595133391967, + "grad_norm": 0.7642616437853497, + "learning_rate": 1.4536928487690505e-05, + "loss": 9.045153617858887, + "step": 124, + "token_acc": 0.011685694001289344 + }, + { + "epoch": 0.0732922896511287, + "grad_norm": 0.6978454932933204, + "learning_rate": 1.4654161781946074e-05, + "loss": 9.031429290771484, + "step": 125, + "token_acc": 0.011616803743544022 + }, + { + "epoch": 0.07387862796833773, + "grad_norm": 0.7597479616255512, + "learning_rate": 1.477139507620164e-05, + "loss": 9.09515380859375, + "step": 126, + "token_acc": 0.012234666235394008 + }, + { + "epoch": 0.07446496628554676, + "grad_norm": 0.748173937002817, + "learning_rate": 1.4888628370457212e-05, + "loss": 9.073539733886719, + "step": 127, + "token_acc": 0.01131718594020911 + }, + { + "epoch": 0.07505130460275579, + "grad_norm": 0.8594747492046234, + "learning_rate": 1.5005861664712778e-05, + "loss": 9.14208984375, + "step": 128, + "token_acc": 0.012112918486366284 + }, + { + "epoch": 0.07563764291996482, + "grad_norm": 0.7994670855063187, + "learning_rate": 1.5123094958968347e-05, + "loss": 9.087879180908203, + "step": 129, + "token_acc": 0.012230560517448794 + }, + { + "epoch": 0.07622398123717385, + "grad_norm": 0.8570311648892158, + "learning_rate": 1.5240328253223915e-05, + "loss": 9.015796661376953, + "step": 130, + "token_acc": 0.011548020087755196 + }, + { + "epoch": 0.07681031955438287, + "grad_norm": 0.7219671720650426, + "learning_rate": 1.5357561547479485e-05, + "loss": 9.11224365234375, + "step": 131, + "token_acc": 0.011591454742862385 + }, + { + "epoch": 0.0773966578715919, + "grad_norm": 0.7638687168526499, + "learning_rate": 1.5474794841735054e-05, + "loss": 9.054033279418945, + "step": 132, + "token_acc": 0.012042958660582874 + }, + { + "epoch": 0.07798299618880093, + "grad_norm": 0.97440464015292, + "learning_rate": 1.559202813599062e-05, + "loss": 9.11070442199707, + "step": 133, + "token_acc": 0.011402734220381816 + }, + { + "epoch": 0.07856933450600997, + "grad_norm": 0.9001307159126438, + "learning_rate": 1.5709261430246193e-05, + "loss": 9.088046073913574, + "step": 134, + "token_acc": 0.012035650089874176 + }, + { + "epoch": 0.079155672823219, + "grad_norm": 0.8229793059931592, + "learning_rate": 1.582649472450176e-05, + "loss": 9.12911319732666, + "step": 135, + "token_acc": 0.012511244824788131 + }, + { + "epoch": 0.07974201114042803, + "grad_norm": 0.8517253815463744, + "learning_rate": 1.594372801875733e-05, + "loss": 8.992467880249023, + "step": 136, + "token_acc": 0.012404457301708667 + }, + { + "epoch": 0.08032834945763706, + "grad_norm": 0.6550721875663478, + "learning_rate": 1.6060961313012895e-05, + "loss": 9.066499710083008, + "step": 137, + "token_acc": 0.011989589001714632 + }, + { + "epoch": 0.08091468777484609, + "grad_norm": 0.7043109470028683, + "learning_rate": 1.6178194607268465e-05, + "loss": 9.069589614868164, + "step": 138, + "token_acc": 0.012002772152274216 + }, + { + "epoch": 0.08150102609205512, + "grad_norm": 0.7186750105655103, + "learning_rate": 1.6295427901524034e-05, + "loss": 9.092554092407227, + "step": 139, + "token_acc": 0.01172333546141648 + }, + { + "epoch": 0.08208736440926415, + "grad_norm": 0.7578414288822541, + "learning_rate": 1.6412661195779604e-05, + "loss": 9.005353927612305, + "step": 140, + "token_acc": 0.012403433476394849 + }, + { + "epoch": 0.08267370272647317, + "grad_norm": 0.6863964235343788, + "learning_rate": 1.6529894490035173e-05, + "loss": 9.08322525024414, + "step": 141, + "token_acc": 0.012483895800871413 + }, + { + "epoch": 0.0832600410436822, + "grad_norm": 0.7115942082178114, + "learning_rate": 1.664712778429074e-05, + "loss": 9.083134651184082, + "step": 142, + "token_acc": 0.011353050913913107 + }, + { + "epoch": 0.08384637936089123, + "grad_norm": 0.6761155619240857, + "learning_rate": 1.676436107854631e-05, + "loss": 9.069738388061523, + "step": 143, + "token_acc": 0.011400751517881825 + }, + { + "epoch": 0.08443271767810026, + "grad_norm": 0.7425388051750792, + "learning_rate": 1.6881594372801875e-05, + "loss": 8.993913650512695, + "step": 144, + "token_acc": 0.012067434266875038 + }, + { + "epoch": 0.08501905599530929, + "grad_norm": 0.9021054749102797, + "learning_rate": 1.6998827667057444e-05, + "loss": 9.140275955200195, + "step": 145, + "token_acc": 0.012101386584342878 + }, + { + "epoch": 0.08560539431251832, + "grad_norm": 0.8504217653550193, + "learning_rate": 1.7116060961313014e-05, + "loss": 9.014819145202637, + "step": 146, + "token_acc": 0.012131624988214094 + }, + { + "epoch": 0.08619173262972735, + "grad_norm": 0.8358068078543422, + "learning_rate": 1.7233294255568583e-05, + "loss": 9.017570495605469, + "step": 147, + "token_acc": 0.011914827195983298 + }, + { + "epoch": 0.08677807094693639, + "grad_norm": 0.7227265093302682, + "learning_rate": 1.735052754982415e-05, + "loss": 9.058671951293945, + "step": 148, + "token_acc": 0.014255455800161248 + }, + { + "epoch": 0.08736440926414542, + "grad_norm": 0.7834286440676679, + "learning_rate": 1.746776084407972e-05, + "loss": 9.038613319396973, + "step": 149, + "token_acc": 0.014479944047774593 + }, + { + "epoch": 0.08795074758135445, + "grad_norm": 0.696907119834478, + "learning_rate": 1.7584994138335285e-05, + "loss": 9.044143676757812, + "step": 150, + "token_acc": 0.013753129701304278 + }, + { + "epoch": 0.08853708589856348, + "grad_norm": 0.6962648070847909, + "learning_rate": 1.7702227432590858e-05, + "loss": 9.083137512207031, + "step": 151, + "token_acc": 0.014561923822757769 + }, + { + "epoch": 0.0891234242157725, + "grad_norm": 0.7437592420092028, + "learning_rate": 1.7819460726846428e-05, + "loss": 9.040020942687988, + "step": 152, + "token_acc": 0.014798572838693756 + }, + { + "epoch": 0.08970976253298153, + "grad_norm": 0.707776049021692, + "learning_rate": 1.7936694021101994e-05, + "loss": 9.024340629577637, + "step": 153, + "token_acc": 0.014677986703254204 + }, + { + "epoch": 0.09029610085019056, + "grad_norm": 0.7257843762539763, + "learning_rate": 1.8053927315357563e-05, + "loss": 9.047564506530762, + "step": 154, + "token_acc": 0.014282354619899112 + }, + { + "epoch": 0.09088243916739959, + "grad_norm": 0.641385919441577, + "learning_rate": 1.817116060961313e-05, + "loss": 9.074129104614258, + "step": 155, + "token_acc": 0.013060438286499033 + }, + { + "epoch": 0.09146877748460862, + "grad_norm": 0.7037882352890727, + "learning_rate": 1.82883939038687e-05, + "loss": 9.017114639282227, + "step": 156, + "token_acc": 0.01331883095490841 + }, + { + "epoch": 0.09205511580181765, + "grad_norm": 0.72330399374591, + "learning_rate": 1.840562719812427e-05, + "loss": 9.049703598022461, + "step": 157, + "token_acc": 0.013519621187834705 + }, + { + "epoch": 0.09264145411902668, + "grad_norm": 0.8153885218738263, + "learning_rate": 1.8522860492379838e-05, + "loss": 9.003667831420898, + "step": 158, + "token_acc": 0.012964697709666554 + }, + { + "epoch": 0.0932277924362357, + "grad_norm": 9.31449519764132, + "learning_rate": 1.8640093786635404e-05, + "loss": 8.997857093811035, + "step": 159, + "token_acc": 0.013419320710095996 + }, + { + "epoch": 0.09381413075344473, + "grad_norm": 1.005337534873343, + "learning_rate": 1.8757327080890974e-05, + "loss": 9.076769828796387, + "step": 160, + "token_acc": 0.013001575410405827 + }, + { + "epoch": 0.09440046907065376, + "grad_norm": 0.7684279289802697, + "learning_rate": 1.8874560375146543e-05, + "loss": 9.057096481323242, + "step": 161, + "token_acc": 0.01347530621106096 + }, + { + "epoch": 0.09498680738786279, + "grad_norm": 0.9409533191933808, + "learning_rate": 1.8991793669402113e-05, + "loss": 9.03506088256836, + "step": 162, + "token_acc": 0.014003734329154442 + }, + { + "epoch": 0.09557314570507183, + "grad_norm": 0.8645441983935221, + "learning_rate": 1.9109026963657682e-05, + "loss": 9.019186973571777, + "step": 163, + "token_acc": 0.01292129111088484 + }, + { + "epoch": 0.09615948402228086, + "grad_norm": 0.9393253312821077, + "learning_rate": 1.9226260257913248e-05, + "loss": 9.03659439086914, + "step": 164, + "token_acc": 0.012735673011338043 + }, + { + "epoch": 0.09674582233948989, + "grad_norm": 0.7811423991520909, + "learning_rate": 1.9343493552168818e-05, + "loss": 9.040750503540039, + "step": 165, + "token_acc": 0.014251560049570644 + }, + { + "epoch": 0.09733216065669892, + "grad_norm": 0.6971025306360749, + "learning_rate": 1.9460726846424384e-05, + "loss": 9.061229705810547, + "step": 166, + "token_acc": 0.013292242560079692 + }, + { + "epoch": 0.09791849897390795, + "grad_norm": 0.8212909189612504, + "learning_rate": 1.9577960140679953e-05, + "loss": 8.96902847290039, + "step": 167, + "token_acc": 0.014282893606546027 + }, + { + "epoch": 0.09850483729111698, + "grad_norm": 1.200594312016131, + "learning_rate": 1.9695193434935523e-05, + "loss": 8.952869415283203, + "step": 168, + "token_acc": 0.014163986808955163 + }, + { + "epoch": 0.099091175608326, + "grad_norm": 5.824729012820525, + "learning_rate": 1.9812426729191092e-05, + "loss": 8.955404281616211, + "step": 169, + "token_acc": 0.01576004892542661 + }, + { + "epoch": 0.09967751392553503, + "grad_norm": 1.4242937668335696, + "learning_rate": 1.992966002344666e-05, + "loss": 8.959024429321289, + "step": 170, + "token_acc": 0.014124234872105026 + }, + { + "epoch": 0.10026385224274406, + "grad_norm": 1.3609270715797308, + "learning_rate": 2.0046893317702228e-05, + "loss": 8.948944091796875, + "step": 171, + "token_acc": 0.016941815646295533 + }, + { + "epoch": 0.10085019055995309, + "grad_norm": 2.566289899575198, + "learning_rate": 2.0164126611957798e-05, + "loss": 8.961725234985352, + "step": 172, + "token_acc": 0.015024575656908973 + }, + { + "epoch": 0.10143652887716212, + "grad_norm": 1.375139853508446, + "learning_rate": 2.0281359906213364e-05, + "loss": 8.978058815002441, + "step": 173, + "token_acc": 0.015213520107315722 + }, + { + "epoch": 0.10202286719437115, + "grad_norm": 1.0423960317679877, + "learning_rate": 2.0398593200468937e-05, + "loss": 8.9154052734375, + "step": 174, + "token_acc": 0.0152281594750071 + }, + { + "epoch": 0.10260920551158018, + "grad_norm": 2.301740162866994, + "learning_rate": 2.0515826494724503e-05, + "loss": 8.96900749206543, + "step": 175, + "token_acc": 0.015455946544198266 + }, + { + "epoch": 0.1031955438287892, + "grad_norm": 1.5482325621563005, + "learning_rate": 2.0633059788980072e-05, + "loss": 8.960334777832031, + "step": 176, + "token_acc": 0.017298787210584345 + }, + { + "epoch": 0.10378188214599825, + "grad_norm": 0.8556971596952577, + "learning_rate": 2.075029308323564e-05, + "loss": 8.910372734069824, + "step": 177, + "token_acc": 0.016071924195301374 + }, + { + "epoch": 0.10436822046320728, + "grad_norm": 5.295799730411908, + "learning_rate": 2.0867526377491208e-05, + "loss": 8.86441707611084, + "step": 178, + "token_acc": 0.015786640844907197 + }, + { + "epoch": 0.1049545587804163, + "grad_norm": 3.5152963718692534, + "learning_rate": 2.0984759671746777e-05, + "loss": 8.927906036376953, + "step": 179, + "token_acc": 0.01683879018052864 + }, + { + "epoch": 0.10554089709762533, + "grad_norm": 3.138251926137507, + "learning_rate": 2.1101992966002347e-05, + "loss": 8.926370620727539, + "step": 180, + "token_acc": 0.015805829122476772 + }, + { + "epoch": 0.10612723541483436, + "grad_norm": 9.333767190674024, + "learning_rate": 2.1219226260257916e-05, + "loss": 8.921234130859375, + "step": 181, + "token_acc": 0.016851821229386207 + }, + { + "epoch": 0.10671357373204339, + "grad_norm": 3.323326480985714, + "learning_rate": 2.1336459554513483e-05, + "loss": 8.781801223754883, + "step": 182, + "token_acc": 0.01659174478355853 + }, + { + "epoch": 0.10729991204925242, + "grad_norm": 14.405946436689645, + "learning_rate": 2.1453692848769052e-05, + "loss": 8.9700927734375, + "step": 183, + "token_acc": 0.015191283957919641 + }, + { + "epoch": 0.10788625036646145, + "grad_norm": 13.346227364046948, + "learning_rate": 2.1570926143024618e-05, + "loss": 8.917417526245117, + "step": 184, + "token_acc": 0.01641444240406353 + }, + { + "epoch": 0.10847258868367048, + "grad_norm": 6.895339235739814, + "learning_rate": 2.1688159437280188e-05, + "loss": 8.860979080200195, + "step": 185, + "token_acc": 0.016249619733633078 + }, + { + "epoch": 0.1090589270008795, + "grad_norm": 12.261686386665007, + "learning_rate": 2.1805392731535757e-05, + "loss": 8.921403884887695, + "step": 186, + "token_acc": 0.015335129246208268 + }, + { + "epoch": 0.10964526531808853, + "grad_norm": 11.243118734511503, + "learning_rate": 2.1922626025791327e-05, + "loss": 8.925213813781738, + "step": 187, + "token_acc": 0.015112607008083487 + }, + { + "epoch": 0.11023160363529756, + "grad_norm": 6.1716959372824505, + "learning_rate": 2.2039859320046893e-05, + "loss": 8.8018798828125, + "step": 188, + "token_acc": 0.016534557011216457 + }, + { + "epoch": 0.11081794195250659, + "grad_norm": 3.101446594108936, + "learning_rate": 2.2157092614302462e-05, + "loss": 8.832223892211914, + "step": 189, + "token_acc": 0.01590452254985572 + }, + { + "epoch": 0.11140428026971562, + "grad_norm": 5.06177739064044, + "learning_rate": 2.227432590855803e-05, + "loss": 8.88629150390625, + "step": 190, + "token_acc": 0.01556583153825492 + }, + { + "epoch": 0.11199061858692466, + "grad_norm": 1.7667216614786856, + "learning_rate": 2.23915592028136e-05, + "loss": 8.804533958435059, + "step": 191, + "token_acc": 0.015173052296839346 + }, + { + "epoch": 0.11257695690413369, + "grad_norm": 3.658817747813563, + "learning_rate": 2.250879249706917e-05, + "loss": 8.903039932250977, + "step": 192, + "token_acc": 0.01608010181553726 + }, + { + "epoch": 0.11316329522134272, + "grad_norm": 3.961014838557213, + "learning_rate": 2.2626025791324737e-05, + "loss": 8.841934204101562, + "step": 193, + "token_acc": 0.01553905562836573 + }, + { + "epoch": 0.11374963353855175, + "grad_norm": 4.719743908208459, + "learning_rate": 2.2743259085580307e-05, + "loss": 8.834268569946289, + "step": 194, + "token_acc": 0.014658590076126611 + }, + { + "epoch": 0.11433597185576078, + "grad_norm": 2.4698910205375064, + "learning_rate": 2.2860492379835873e-05, + "loss": 8.811420440673828, + "step": 195, + "token_acc": 0.015618440936949038 + }, + { + "epoch": 0.1149223101729698, + "grad_norm": 7.209373338178073, + "learning_rate": 2.2977725674091442e-05, + "loss": 8.798686981201172, + "step": 196, + "token_acc": 0.01697163252817443 + }, + { + "epoch": 0.11550864849017883, + "grad_norm": 5.336832747788366, + "learning_rate": 2.3094958968347012e-05, + "loss": 8.872821807861328, + "step": 197, + "token_acc": 0.016580901911036543 + }, + { + "epoch": 0.11609498680738786, + "grad_norm": 1.0947098512600604, + "learning_rate": 2.321219226260258e-05, + "loss": 8.73609733581543, + "step": 198, + "token_acc": 0.017363586019040856 + }, + { + "epoch": 0.11668132512459689, + "grad_norm": 5.9811775760993, + "learning_rate": 2.3329425556858147e-05, + "loss": 8.715837478637695, + "step": 199, + "token_acc": 0.016491450941757517 + }, + { + "epoch": 0.11726766344180592, + "grad_norm": 2.0633491724212685, + "learning_rate": 2.3446658851113717e-05, + "loss": 8.695581436157227, + "step": 200, + "token_acc": 0.015138196091258485 + }, + { + "epoch": 0.11785400175901495, + "grad_norm": 10.591654191799721, + "learning_rate": 2.3563892145369286e-05, + "loss": 8.832849502563477, + "step": 201, + "token_acc": 0.015996674908346834 + }, + { + "epoch": 0.11844034007622398, + "grad_norm": 13.136811922185172, + "learning_rate": 2.3681125439624856e-05, + "loss": 8.710956573486328, + "step": 202, + "token_acc": 0.01733241936261253 + }, + { + "epoch": 0.119026678393433, + "grad_norm": 3.2711713145232273, + "learning_rate": 2.3798358733880426e-05, + "loss": 8.791112899780273, + "step": 203, + "token_acc": 0.01658490333767251 + }, + { + "epoch": 0.11961301671064203, + "grad_norm": 10.597238560834104, + "learning_rate": 2.391559202813599e-05, + "loss": 8.680379867553711, + "step": 204, + "token_acc": 0.016328462403123706 + }, + { + "epoch": 0.12019935502785108, + "grad_norm": 13.504735178378652, + "learning_rate": 2.403282532239156e-05, + "loss": 8.875692367553711, + "step": 205, + "token_acc": 0.015691883679761105 + }, + { + "epoch": 0.1207856933450601, + "grad_norm": 8.13171450433769, + "learning_rate": 2.4150058616647127e-05, + "loss": 8.767316818237305, + "step": 206, + "token_acc": 0.016828419899942028 + }, + { + "epoch": 0.12137203166226913, + "grad_norm": 1.9173355946554083, + "learning_rate": 2.4267291910902697e-05, + "loss": 8.74384593963623, + "step": 207, + "token_acc": 0.016572997407502156 + }, + { + "epoch": 0.12195836997947816, + "grad_norm": 3.4276935341479318, + "learning_rate": 2.4384525205158266e-05, + "loss": 8.787282943725586, + "step": 208, + "token_acc": 0.0161269036872806 + }, + { + "epoch": 0.12254470829668719, + "grad_norm": 1.2552424841660939, + "learning_rate": 2.4501758499413836e-05, + "loss": 8.732902526855469, + "step": 209, + "token_acc": 0.015460392390613068 + }, + { + "epoch": 0.12313104661389622, + "grad_norm": 2.8609025149950833, + "learning_rate": 2.4618991793669402e-05, + "loss": 8.645153045654297, + "step": 210, + "token_acc": 0.016576407000640725 + }, + { + "epoch": 0.12371738493110525, + "grad_norm": 1.8799254088976998, + "learning_rate": 2.473622508792497e-05, + "loss": 8.766607284545898, + "step": 211, + "token_acc": 0.017658121765497414 + }, + { + "epoch": 0.12430372324831428, + "grad_norm": 1.0712384945420417, + "learning_rate": 2.485345838218054e-05, + "loss": 8.612556457519531, + "step": 212, + "token_acc": 0.01722776504600158 + }, + { + "epoch": 0.1248900615655233, + "grad_norm": 2.7720756099267305, + "learning_rate": 2.4970691676436107e-05, + "loss": 8.535324096679688, + "step": 213, + "token_acc": 0.017316714193726428 + }, + { + "epoch": 0.12547639988273235, + "grad_norm": 8.642709154598062, + "learning_rate": 2.508792497069168e-05, + "loss": 8.709345817565918, + "step": 214, + "token_acc": 0.016801567304730695 + }, + { + "epoch": 0.12606273819994138, + "grad_norm": 5.202525132830785, + "learning_rate": 2.520515826494725e-05, + "loss": 8.681105613708496, + "step": 215, + "token_acc": 0.015975298354672072 + }, + { + "epoch": 0.1266490765171504, + "grad_norm": 8.168177418130778, + "learning_rate": 2.5322391559202812e-05, + "loss": 8.639896392822266, + "step": 216, + "token_acc": 0.016853400577382514 + }, + { + "epoch": 0.12723541483435943, + "grad_norm": 5.221265842440695, + "learning_rate": 2.5439624853458382e-05, + "loss": 8.569507598876953, + "step": 217, + "token_acc": 0.01684437386569873 + }, + { + "epoch": 0.12782175315156846, + "grad_norm": 6.303391587658705, + "learning_rate": 2.555685814771395e-05, + "loss": 8.726802825927734, + "step": 218, + "token_acc": 0.016575941281428277 + }, + { + "epoch": 0.1284080914687775, + "grad_norm": 8.912004600500023, + "learning_rate": 2.567409144196952e-05, + "loss": 8.578927993774414, + "step": 219, + "token_acc": 0.01656643387633698 + }, + { + "epoch": 0.12899442978598652, + "grad_norm": 3.7996099592616632, + "learning_rate": 2.5791324736225087e-05, + "loss": 8.55063247680664, + "step": 220, + "token_acc": 0.01638220035436132 + }, + { + "epoch": 0.12958076810319555, + "grad_norm": 3.6968260263310673, + "learning_rate": 2.5908558030480656e-05, + "loss": 8.614810943603516, + "step": 221, + "token_acc": 0.017415957877683273 + }, + { + "epoch": 0.13016710642040458, + "grad_norm": 17.05818270793626, + "learning_rate": 2.6025791324736226e-05, + "loss": 8.583649635314941, + "step": 222, + "token_acc": 0.017697264823808075 + }, + { + "epoch": 0.1307534447376136, + "grad_norm": 16.282298485283953, + "learning_rate": 2.6143024618991796e-05, + "loss": 8.66443920135498, + "step": 223, + "token_acc": 0.016362078958571627 + }, + { + "epoch": 0.13133978305482263, + "grad_norm": 2.0644641560828476, + "learning_rate": 2.6260257913247365e-05, + "loss": 8.622711181640625, + "step": 224, + "token_acc": 0.01609407258292875 + }, + { + "epoch": 0.13192612137203166, + "grad_norm": 2.9579060756891993, + "learning_rate": 2.637749120750293e-05, + "loss": 8.510727882385254, + "step": 225, + "token_acc": 0.01712861956339319 + }, + { + "epoch": 0.1325124596892407, + "grad_norm": 2.2077382104085386, + "learning_rate": 2.64947245017585e-05, + "loss": 8.509180068969727, + "step": 226, + "token_acc": 0.017173266755904484 + }, + { + "epoch": 0.13309879800644972, + "grad_norm": 9.986535544055963, + "learning_rate": 2.661195779601407e-05, + "loss": 8.598289489746094, + "step": 227, + "token_acc": 0.016466763159575147 + }, + { + "epoch": 0.13368513632365875, + "grad_norm": 6.5569825066103835, + "learning_rate": 2.672919109026964e-05, + "loss": 8.589065551757812, + "step": 228, + "token_acc": 0.016002592717545207 + }, + { + "epoch": 0.13427147464086778, + "grad_norm": 13.528946435191246, + "learning_rate": 2.6846424384525202e-05, + "loss": 8.567931175231934, + "step": 229, + "token_acc": 0.016549324269644734 + }, + { + "epoch": 0.1348578129580768, + "grad_norm": 14.673761218136983, + "learning_rate": 2.6963657678780775e-05, + "loss": 8.519822120666504, + "step": 230, + "token_acc": 0.017126661208036784 + }, + { + "epoch": 0.13544415127528583, + "grad_norm": 5.382544931895434, + "learning_rate": 2.7080890973036345e-05, + "loss": 8.577202796936035, + "step": 231, + "token_acc": 0.015222634267783511 + }, + { + "epoch": 0.13603048959249486, + "grad_norm": 4.550736155538764, + "learning_rate": 2.7198124267291914e-05, + "loss": 8.50474739074707, + "step": 232, + "token_acc": 0.016359756787404724 + }, + { + "epoch": 0.1366168279097039, + "grad_norm": 9.984215509657442, + "learning_rate": 2.7315357561547484e-05, + "loss": 8.471184730529785, + "step": 233, + "token_acc": 0.018521768477894027 + }, + { + "epoch": 0.13720316622691292, + "grad_norm": 5.524981881058105, + "learning_rate": 2.7432590855803047e-05, + "loss": 8.468493461608887, + "step": 234, + "token_acc": 0.017912650035211357 + }, + { + "epoch": 0.13778950454412195, + "grad_norm": 11.699236774247959, + "learning_rate": 2.7549824150058616e-05, + "loss": 8.444255828857422, + "step": 235, + "token_acc": 0.01791036236821437 + }, + { + "epoch": 0.13837584286133098, + "grad_norm": 10.123664924444109, + "learning_rate": 2.7667057444314186e-05, + "loss": 8.387200355529785, + "step": 236, + "token_acc": 0.01708580705129367 + }, + { + "epoch": 0.13896218117854, + "grad_norm": 3.9487437276302484, + "learning_rate": 2.778429073856976e-05, + "loss": 8.458383560180664, + "step": 237, + "token_acc": 0.017343458903028196 + }, + { + "epoch": 0.13954851949574903, + "grad_norm": 3.963875412289304, + "learning_rate": 2.790152403282532e-05, + "loss": 8.412214279174805, + "step": 238, + "token_acc": 0.019628865768343048 + }, + { + "epoch": 0.14013485781295806, + "grad_norm": 12.211930197030025, + "learning_rate": 2.801875732708089e-05, + "loss": 8.418054580688477, + "step": 239, + "token_acc": 0.017701699611281117 + }, + { + "epoch": 0.14072119613016712, + "grad_norm": 10.909038604133126, + "learning_rate": 2.813599062133646e-05, + "loss": 8.36636734008789, + "step": 240, + "token_acc": 0.017211151251472892 + }, + { + "epoch": 0.14130753444737615, + "grad_norm": 8.28542484631731, + "learning_rate": 2.825322391559203e-05, + "loss": 8.370955467224121, + "step": 241, + "token_acc": 0.019585555889333625 + }, + { + "epoch": 0.14189387276458518, + "grad_norm": 7.33590152292038, + "learning_rate": 2.83704572098476e-05, + "loss": 8.39145278930664, + "step": 242, + "token_acc": 0.018641399386104405 + }, + { + "epoch": 0.1424802110817942, + "grad_norm": 7.749166741267496, + "learning_rate": 2.8487690504103166e-05, + "loss": 8.324542999267578, + "step": 243, + "token_acc": 0.018362774783092776 + }, + { + "epoch": 0.14306654939900323, + "grad_norm": 5.324767302634211, + "learning_rate": 2.8604923798358735e-05, + "loss": 8.325507164001465, + "step": 244, + "token_acc": 0.01934745508491453 + }, + { + "epoch": 0.14365288771621226, + "grad_norm": 9.781482280787388, + "learning_rate": 2.8722157092614305e-05, + "loss": 8.440896987915039, + "step": 245, + "token_acc": 0.01824733374953695 + }, + { + "epoch": 0.1442392260334213, + "grad_norm": 10.380814201783155, + "learning_rate": 2.8839390386869874e-05, + "loss": 8.244488716125488, + "step": 246, + "token_acc": 0.020205417955481815 + }, + { + "epoch": 0.14482556435063032, + "grad_norm": 5.154728848909781, + "learning_rate": 2.895662368112544e-05, + "loss": 8.224533081054688, + "step": 247, + "token_acc": 0.01909172108635162 + }, + { + "epoch": 0.14541190266783935, + "grad_norm": 5.502263745507572, + "learning_rate": 2.907385697538101e-05, + "loss": 8.275815963745117, + "step": 248, + "token_acc": 0.019192503889150177 + }, + { + "epoch": 0.14599824098504838, + "grad_norm": 7.585891046065305, + "learning_rate": 2.919109026963658e-05, + "loss": 8.325769424438477, + "step": 249, + "token_acc": 0.020240734874881216 + }, + { + "epoch": 0.1465845793022574, + "grad_norm": 3.7468291672525176, + "learning_rate": 2.930832356389215e-05, + "loss": 8.226460456848145, + "step": 250, + "token_acc": 0.021003967820610828 + }, + { + "epoch": 0.14717091761946643, + "grad_norm": 11.951610769329216, + "learning_rate": 2.9425556858147718e-05, + "loss": 8.30726432800293, + "step": 251, + "token_acc": 0.01735603104933756 + }, + { + "epoch": 0.14775725593667546, + "grad_norm": 10.49913542957289, + "learning_rate": 2.954279015240328e-05, + "loss": 8.246274948120117, + "step": 252, + "token_acc": 0.01941996525233919 + }, + { + "epoch": 0.1483435942538845, + "grad_norm": 4.437843733752013, + "learning_rate": 2.9660023446658854e-05, + "loss": 8.106080055236816, + "step": 253, + "token_acc": 0.021376591873862948 + }, + { + "epoch": 0.14892993257109352, + "grad_norm": 5.150232708191367, + "learning_rate": 2.9777256740914423e-05, + "loss": 8.203289031982422, + "step": 254, + "token_acc": 0.0213694924389258 + }, + { + "epoch": 0.14951627088830255, + "grad_norm": 6.97369549993389, + "learning_rate": 2.9894490035169993e-05, + "loss": 8.237259864807129, + "step": 255, + "token_acc": 0.02066425045949239 + }, + { + "epoch": 0.15010260920551158, + "grad_norm": 4.124770989228856, + "learning_rate": 3.0011723329425556e-05, + "loss": 8.218684196472168, + "step": 256, + "token_acc": 0.02030398362175691 + }, + { + "epoch": 0.1506889475227206, + "grad_norm": 4.511899369399747, + "learning_rate": 3.0128956623681125e-05, + "loss": 8.083969116210938, + "step": 257, + "token_acc": 0.02240091461469559 + }, + { + "epoch": 0.15127528583992964, + "grad_norm": 6.980317013092456, + "learning_rate": 3.0246189917936695e-05, + "loss": 8.092212677001953, + "step": 258, + "token_acc": 0.022515734420707385 + }, + { + "epoch": 0.15186162415713866, + "grad_norm": 3.317675318677222, + "learning_rate": 3.0363423212192264e-05, + "loss": 8.109350204467773, + "step": 259, + "token_acc": 0.021867027916403905 + }, + { + "epoch": 0.1524479624743477, + "grad_norm": 11.65603149025743, + "learning_rate": 3.048065650644783e-05, + "loss": 8.081318855285645, + "step": 260, + "token_acc": 0.023007410084944876 + }, + { + "epoch": 0.15303430079155672, + "grad_norm": 8.447222871175889, + "learning_rate": 3.05978898007034e-05, + "loss": 8.224822998046875, + "step": 261, + "token_acc": 0.020889620909093368 + }, + { + "epoch": 0.15362063910876575, + "grad_norm": 8.744415179863386, + "learning_rate": 3.071512309495897e-05, + "loss": 8.100028038024902, + "step": 262, + "token_acc": 0.021298408978786383 + }, + { + "epoch": 0.15420697742597478, + "grad_norm": 4.970656588146134, + "learning_rate": 3.083235638921454e-05, + "loss": 8.10700798034668, + "step": 263, + "token_acc": 0.023156668608037275 + }, + { + "epoch": 0.1547933157431838, + "grad_norm": 12.728606559643172, + "learning_rate": 3.094958968347011e-05, + "loss": 8.07313060760498, + "step": 264, + "token_acc": 0.023119693382585425 + }, + { + "epoch": 0.15537965406039284, + "grad_norm": 11.2290241235744, + "learning_rate": 3.106682297772567e-05, + "loss": 8.088520050048828, + "step": 265, + "token_acc": 0.023887609373042917 + }, + { + "epoch": 0.15596599237760186, + "grad_norm": 5.6683871927044525, + "learning_rate": 3.118405627198124e-05, + "loss": 7.989017486572266, + "step": 266, + "token_acc": 0.023079469594480113 + }, + { + "epoch": 0.1565523306948109, + "grad_norm": 4.550302568228976, + "learning_rate": 3.130128956623681e-05, + "loss": 8.032395362854004, + "step": 267, + "token_acc": 0.024050520610699337 + }, + { + "epoch": 0.15713866901201995, + "grad_norm": 6.713584702227589, + "learning_rate": 3.1418522860492386e-05, + "loss": 8.073318481445312, + "step": 268, + "token_acc": 0.02364868286457964 + }, + { + "epoch": 0.15772500732922898, + "grad_norm": 3.1993517533470652, + "learning_rate": 3.153575615474795e-05, + "loss": 7.910236358642578, + "step": 269, + "token_acc": 0.02569669008944696 + }, + { + "epoch": 0.158311345646438, + "grad_norm": 12.713592150068552, + "learning_rate": 3.165298944900352e-05, + "loss": 8.088440895080566, + "step": 270, + "token_acc": 0.02105357438504401 + }, + { + "epoch": 0.15889768396364704, + "grad_norm": 11.677275340003138, + "learning_rate": 3.177022274325909e-05, + "loss": 7.953788757324219, + "step": 271, + "token_acc": 0.022620417049463826 + }, + { + "epoch": 0.15948402228085606, + "grad_norm": 4.557457205967349, + "learning_rate": 3.188745603751466e-05, + "loss": 7.925534725189209, + "step": 272, + "token_acc": 0.02491990124847167 + }, + { + "epoch": 0.1600703605980651, + "grad_norm": 8.31131892366096, + "learning_rate": 3.200468933177023e-05, + "loss": 7.940683841705322, + "step": 273, + "token_acc": 0.024467306800732145 + }, + { + "epoch": 0.16065669891527412, + "grad_norm": 5.278990594903741, + "learning_rate": 3.212192262602579e-05, + "loss": 8.003097534179688, + "step": 274, + "token_acc": 0.023447019707503708 + }, + { + "epoch": 0.16124303723248315, + "grad_norm": 6.065647567055946, + "learning_rate": 3.223915592028136e-05, + "loss": 7.935528755187988, + "step": 275, + "token_acc": 0.024805437322650642 + }, + { + "epoch": 0.16182937554969218, + "grad_norm": 2.5249131219658145, + "learning_rate": 3.235638921453693e-05, + "loss": 7.942893028259277, + "step": 276, + "token_acc": 0.0249650614526599 + }, + { + "epoch": 0.1624157138669012, + "grad_norm": 11.358404063997227, + "learning_rate": 3.24736225087925e-05, + "loss": 7.845043182373047, + "step": 277, + "token_acc": 0.027566706532497525 + }, + { + "epoch": 0.16300205218411024, + "grad_norm": 6.6310532158063245, + "learning_rate": 3.259085580304807e-05, + "loss": 7.773580551147461, + "step": 278, + "token_acc": 0.02728788508370985 + }, + { + "epoch": 0.16358839050131926, + "grad_norm": 11.793723136999827, + "learning_rate": 3.270808909730364e-05, + "loss": 7.852505207061768, + "step": 279, + "token_acc": 0.02475860361475613 + }, + { + "epoch": 0.1641747288185283, + "grad_norm": 10.028946127084307, + "learning_rate": 3.282532239155921e-05, + "loss": 7.7964186668396, + "step": 280, + "token_acc": 0.0256860655973738 + }, + { + "epoch": 0.16476106713573732, + "grad_norm": 6.988201898998311, + "learning_rate": 3.294255568581478e-05, + "loss": 7.760739326477051, + "step": 281, + "token_acc": 0.029985963318567 + }, + { + "epoch": 0.16534740545294635, + "grad_norm": 7.0124436464722, + "learning_rate": 3.3059788980070346e-05, + "loss": 7.711109161376953, + "step": 282, + "token_acc": 0.029875808366755168 + }, + { + "epoch": 0.16593374377015538, + "grad_norm": 7.6616465604716995, + "learning_rate": 3.317702227432591e-05, + "loss": 7.811481952667236, + "step": 283, + "token_acc": 0.024813128320286103 + }, + { + "epoch": 0.1665200820873644, + "grad_norm": 7.060172207640459, + "learning_rate": 3.329425556858148e-05, + "loss": 7.762804985046387, + "step": 284, + "token_acc": 0.027798685207458658 + }, + { + "epoch": 0.16710642040457344, + "grad_norm": 4.784394588924861, + "learning_rate": 3.341148886283705e-05, + "loss": 7.639404773712158, + "step": 285, + "token_acc": 0.03056520248815575 + }, + { + "epoch": 0.16769275872178246, + "grad_norm": 1.5110618201600867, + "learning_rate": 3.352872215709262e-05, + "loss": 7.698606491088867, + "step": 286, + "token_acc": 0.029386059615246204 + }, + { + "epoch": 0.1682790970389915, + "grad_norm": 9.132929671450656, + "learning_rate": 3.364595545134818e-05, + "loss": 7.63512659072876, + "step": 287, + "token_acc": 0.029604158172619553 + }, + { + "epoch": 0.16886543535620052, + "grad_norm": 5.870506476761636, + "learning_rate": 3.376318874560375e-05, + "loss": 7.667540550231934, + "step": 288, + "token_acc": 0.02818635598471411 + }, + { + "epoch": 0.16945177367340955, + "grad_norm": 11.39258145479053, + "learning_rate": 3.388042203985932e-05, + "loss": 7.682027816772461, + "step": 289, + "token_acc": 0.03005629348996723 + }, + { + "epoch": 0.17003811199061858, + "grad_norm": 9.26264010364802, + "learning_rate": 3.399765533411489e-05, + "loss": 7.638578414916992, + "step": 290, + "token_acc": 0.03236761528777258 + }, + { + "epoch": 0.1706244503078276, + "grad_norm": 5.576872460057638, + "learning_rate": 3.411488862837046e-05, + "loss": 7.578747749328613, + "step": 291, + "token_acc": 0.031305617085568685 + }, + { + "epoch": 0.17121078862503664, + "grad_norm": 4.854351933904024, + "learning_rate": 3.423212192262603e-05, + "loss": 7.461580753326416, + "step": 292, + "token_acc": 0.034631231017537345 + }, + { + "epoch": 0.17179712694224566, + "grad_norm": 7.95537002735846, + "learning_rate": 3.43493552168816e-05, + "loss": 7.587418079376221, + "step": 293, + "token_acc": 0.03271779374030124 + }, + { + "epoch": 0.1723834652594547, + "grad_norm": 3.8841427282950365, + "learning_rate": 3.446658851113717e-05, + "loss": 7.523349285125732, + "step": 294, + "token_acc": 0.033108201384131716 + }, + { + "epoch": 0.17296980357666372, + "grad_norm": 3.271674798272289, + "learning_rate": 3.4583821805392736e-05, + "loss": 7.487792015075684, + "step": 295, + "token_acc": 0.03521963536444989 + }, + { + "epoch": 0.17355614189387278, + "grad_norm": 4.937817206942403, + "learning_rate": 3.47010550996483e-05, + "loss": 7.451592445373535, + "step": 296, + "token_acc": 0.0363185712795222 + }, + { + "epoch": 0.1741424802110818, + "grad_norm": 2.2811487818554443, + "learning_rate": 3.481828839390387e-05, + "loss": 7.391551494598389, + "step": 297, + "token_acc": 0.036459766394216504 + }, + { + "epoch": 0.17472881852829084, + "grad_norm": 4.656159086752642, + "learning_rate": 3.493552168815944e-05, + "loss": 7.45686149597168, + "step": 298, + "token_acc": 0.034385240265990706 + }, + { + "epoch": 0.17531515684549986, + "grad_norm": 6.681774346496673, + "learning_rate": 3.505275498241501e-05, + "loss": 7.279962062835693, + "step": 299, + "token_acc": 0.036752212223622564 + }, + { + "epoch": 0.1759014951627089, + "grad_norm": 3.006122301061248, + "learning_rate": 3.516998827667057e-05, + "loss": 7.289892196655273, + "step": 300, + "token_acc": 0.03830300798789032 + }, + { + "epoch": 0.17648783347991792, + "grad_norm": 7.749870951875948, + "learning_rate": 3.528722157092615e-05, + "loss": 7.46074104309082, + "step": 301, + "token_acc": 0.03685454597347739 + }, + { + "epoch": 0.17707417179712695, + "grad_norm": 4.507101515014864, + "learning_rate": 3.5404454865181716e-05, + "loss": 7.326682090759277, + "step": 302, + "token_acc": 0.03886698788144255 + }, + { + "epoch": 0.17766051011433598, + "grad_norm": 6.117967333469634, + "learning_rate": 3.5521688159437286e-05, + "loss": 7.266670227050781, + "step": 303, + "token_acc": 0.03918212128075341 + }, + { + "epoch": 0.178246848431545, + "grad_norm": 4.15836396325537, + "learning_rate": 3.5638921453692855e-05, + "loss": 7.240307807922363, + "step": 304, + "token_acc": 0.03883024628167284 + }, + { + "epoch": 0.17883318674875404, + "grad_norm": 4.675615474675205, + "learning_rate": 3.575615474794842e-05, + "loss": 7.203600883483887, + "step": 305, + "token_acc": 0.03955861084206741 + }, + { + "epoch": 0.17941952506596306, + "grad_norm": 4.909434257054275, + "learning_rate": 3.587338804220399e-05, + "loss": 7.126448631286621, + "step": 306, + "token_acc": 0.041463668269855716 + }, + { + "epoch": 0.1800058633831721, + "grad_norm": 5.506219891330509, + "learning_rate": 3.599062133645956e-05, + "loss": 7.274214744567871, + "step": 307, + "token_acc": 0.04332027850304613 + }, + { + "epoch": 0.18059220170038112, + "grad_norm": 3.781047379953383, + "learning_rate": 3.6107854630715126e-05, + "loss": 7.200839996337891, + "step": 308, + "token_acc": 0.04252858212020492 + }, + { + "epoch": 0.18117854001759015, + "grad_norm": 5.139988025106219, + "learning_rate": 3.622508792497069e-05, + "loss": 7.19188117980957, + "step": 309, + "token_acc": 0.04257972609502846 + }, + { + "epoch": 0.18176487833479918, + "grad_norm": 4.531100392177371, + "learning_rate": 3.634232121922626e-05, + "loss": 7.082777500152588, + "step": 310, + "token_acc": 0.045122115215524825 + }, + { + "epoch": 0.1823512166520082, + "grad_norm": 7.489128865005872, + "learning_rate": 3.645955451348183e-05, + "loss": 7.129194259643555, + "step": 311, + "token_acc": 0.043057136052081404 + }, + { + "epoch": 0.18293755496921724, + "grad_norm": 2.659038273550176, + "learning_rate": 3.65767878077374e-05, + "loss": 6.948609828948975, + "step": 312, + "token_acc": 0.0508242877217278 + }, + { + "epoch": 0.18352389328642627, + "grad_norm": 7.652578801222082, + "learning_rate": 3.669402110199297e-05, + "loss": 7.0207414627075195, + "step": 313, + "token_acc": 0.05026367704743116 + }, + { + "epoch": 0.1841102316036353, + "grad_norm": 5.807980961527149, + "learning_rate": 3.681125439624854e-05, + "loss": 7.00704288482666, + "step": 314, + "token_acc": 0.04856493435284969 + }, + { + "epoch": 0.18469656992084432, + "grad_norm": 3.0899571116273266, + "learning_rate": 3.6928487690504106e-05, + "loss": 6.859892845153809, + "step": 315, + "token_acc": 0.05295855992685723 + }, + { + "epoch": 0.18528290823805335, + "grad_norm": 9.622625931652337, + "learning_rate": 3.7045720984759676e-05, + "loss": 6.883492469787598, + "step": 316, + "token_acc": 0.051444997690056976 + }, + { + "epoch": 0.18586924655526238, + "grad_norm": 4.960630303186147, + "learning_rate": 3.7162954279015245e-05, + "loss": 6.878410339355469, + "step": 317, + "token_acc": 0.05236103850845095 + }, + { + "epoch": 0.1864555848724714, + "grad_norm": 8.175282079681656, + "learning_rate": 3.728018757327081e-05, + "loss": 6.859060287475586, + "step": 318, + "token_acc": 0.0509749130328842 + }, + { + "epoch": 0.18704192318968044, + "grad_norm": 6.115616224953417, + "learning_rate": 3.739742086752638e-05, + "loss": 6.931354999542236, + "step": 319, + "token_acc": 0.050930681099246186 + }, + { + "epoch": 0.18762826150688947, + "grad_norm": 6.459522185580921, + "learning_rate": 3.751465416178195e-05, + "loss": 6.845908164978027, + "step": 320, + "token_acc": 0.054294137119413215 + }, + { + "epoch": 0.1882145998240985, + "grad_norm": 4.692766055687629, + "learning_rate": 3.763188745603752e-05, + "loss": 6.842883586883545, + "step": 321, + "token_acc": 0.05154300972725539 + }, + { + "epoch": 0.18880093814130752, + "grad_norm": 7.042664582653199, + "learning_rate": 3.7749120750293086e-05, + "loss": 6.786920547485352, + "step": 322, + "token_acc": 0.05923182977914182 + }, + { + "epoch": 0.18938727645851655, + "grad_norm": 5.431578393082523, + "learning_rate": 3.786635404454865e-05, + "loss": 6.746435165405273, + "step": 323, + "token_acc": 0.05848481009985855 + }, + { + "epoch": 0.18997361477572558, + "grad_norm": 4.075530250856771, + "learning_rate": 3.7983587338804225e-05, + "loss": 6.725821495056152, + "step": 324, + "token_acc": 0.057761391070179406 + }, + { + "epoch": 0.19055995309293464, + "grad_norm": 6.003719707092143, + "learning_rate": 3.8100820633059795e-05, + "loss": 6.6018218994140625, + "step": 325, + "token_acc": 0.06687685381913036 + }, + { + "epoch": 0.19114629141014366, + "grad_norm": 4.201184398536375, + "learning_rate": 3.8218053927315364e-05, + "loss": 6.591614246368408, + "step": 326, + "token_acc": 0.06588746070688217 + }, + { + "epoch": 0.1917326297273527, + "grad_norm": 3.6096920754357935, + "learning_rate": 3.833528722157093e-05, + "loss": 6.626930236816406, + "step": 327, + "token_acc": 0.0645815578218781 + }, + { + "epoch": 0.19231896804456172, + "grad_norm": 6.394680532706906, + "learning_rate": 3.8452520515826496e-05, + "loss": 6.544139385223389, + "step": 328, + "token_acc": 0.0649723083069822 + }, + { + "epoch": 0.19290530636177075, + "grad_norm": 4.488174882421174, + "learning_rate": 3.8569753810082066e-05, + "loss": 6.556469440460205, + "step": 329, + "token_acc": 0.06285310734463277 + }, + { + "epoch": 0.19349164467897978, + "grad_norm": 5.898014370678826, + "learning_rate": 3.8686987104337636e-05, + "loss": 6.525766372680664, + "step": 330, + "token_acc": 0.07114072078716843 + }, + { + "epoch": 0.1940779829961888, + "grad_norm": 4.575229544093522, + "learning_rate": 3.88042203985932e-05, + "loss": 6.511011600494385, + "step": 331, + "token_acc": 0.07230858509008153 + }, + { + "epoch": 0.19466432131339784, + "grad_norm": 4.944474437604798, + "learning_rate": 3.892145369284877e-05, + "loss": 6.596277236938477, + "step": 332, + "token_acc": 0.06643491598196785 + }, + { + "epoch": 0.19525065963060687, + "grad_norm": 6.28951471147423, + "learning_rate": 3.903868698710434e-05, + "loss": 6.444920539855957, + "step": 333, + "token_acc": 0.07534779227512313 + }, + { + "epoch": 0.1958369979478159, + "grad_norm": 5.054363457676562, + "learning_rate": 3.915592028135991e-05, + "loss": 6.509368419647217, + "step": 334, + "token_acc": 0.06765982279145505 + }, + { + "epoch": 0.19642333626502492, + "grad_norm": 5.824608881894826, + "learning_rate": 3.9273153575615476e-05, + "loss": 6.386015892028809, + "step": 335, + "token_acc": 0.07861068389479499 + }, + { + "epoch": 0.19700967458223395, + "grad_norm": 4.696158819548083, + "learning_rate": 3.9390386869871046e-05, + "loss": 6.4276933670043945, + "step": 336, + "token_acc": 0.07102308817045741 + }, + { + "epoch": 0.19759601289944298, + "grad_norm": 4.966826764596138, + "learning_rate": 3.9507620164126615e-05, + "loss": 6.30764627456665, + "step": 337, + "token_acc": 0.08435175928724391 + }, + { + "epoch": 0.198182351216652, + "grad_norm": 3.5096544889134917, + "learning_rate": 3.9624853458382185e-05, + "loss": 6.336080551147461, + "step": 338, + "token_acc": 0.07917522227729282 + }, + { + "epoch": 0.19876868953386104, + "grad_norm": 4.475172724159575, + "learning_rate": 3.9742086752637754e-05, + "loss": 6.37416934967041, + "step": 339, + "token_acc": 0.07943514235455079 + }, + { + "epoch": 0.19935502785107007, + "grad_norm": 3.6635807270155736, + "learning_rate": 3.985932004689332e-05, + "loss": 6.239297866821289, + "step": 340, + "token_acc": 0.08446195973949082 + }, + { + "epoch": 0.1999413661682791, + "grad_norm": 7.001244208216398, + "learning_rate": 3.997655334114889e-05, + "loss": 6.343474388122559, + "step": 341, + "token_acc": 0.081755765294107 + }, + { + "epoch": 0.20052770448548812, + "grad_norm": 4.380565562063101, + "learning_rate": 4.0093786635404456e-05, + "loss": 6.209048271179199, + "step": 342, + "token_acc": 0.08533622570830124 + }, + { + "epoch": 0.20111404280269715, + "grad_norm": 5.347243976155213, + "learning_rate": 4.0211019929660026e-05, + "loss": 6.310052394866943, + "step": 343, + "token_acc": 0.0825726972771285 + }, + { + "epoch": 0.20170038111990618, + "grad_norm": 4.812075288202936, + "learning_rate": 4.0328253223915595e-05, + "loss": 6.210603713989258, + "step": 344, + "token_acc": 0.08699436953622759 + }, + { + "epoch": 0.2022867194371152, + "grad_norm": 7.151960616505751, + "learning_rate": 4.044548651817116e-05, + "loss": 6.280752182006836, + "step": 345, + "token_acc": 0.08506893140265447 + }, + { + "epoch": 0.20287305775432424, + "grad_norm": 3.8099430441961077, + "learning_rate": 4.056271981242673e-05, + "loss": 6.210249900817871, + "step": 346, + "token_acc": 0.08739278807262285 + }, + { + "epoch": 0.20345939607153327, + "grad_norm": 9.325598699943097, + "learning_rate": 4.06799531066823e-05, + "loss": 6.164924621582031, + "step": 347, + "token_acc": 0.08574209856965663 + }, + { + "epoch": 0.2040457343887423, + "grad_norm": 6.221575328878346, + "learning_rate": 4.079718640093787e-05, + "loss": 6.174298286437988, + "step": 348, + "token_acc": 0.08908505885098864 + }, + { + "epoch": 0.20463207270595132, + "grad_norm": 6.149406958395271, + "learning_rate": 4.0914419695193436e-05, + "loss": 6.135800361633301, + "step": 349, + "token_acc": 0.09458441428215142 + }, + { + "epoch": 0.20521841102316035, + "grad_norm": 5.832286508856503, + "learning_rate": 4.1031652989449006e-05, + "loss": 6.150198459625244, + "step": 350, + "token_acc": 0.09132320819287217 + }, + { + "epoch": 0.20580474934036938, + "grad_norm": 4.692689603551784, + "learning_rate": 4.1148886283704575e-05, + "loss": 6.036190986633301, + "step": 351, + "token_acc": 0.09885695917711347 + }, + { + "epoch": 0.2063910876575784, + "grad_norm": 4.601553977320466, + "learning_rate": 4.1266119577960145e-05, + "loss": 6.0102338790893555, + "step": 352, + "token_acc": 0.09844902060736734 + }, + { + "epoch": 0.20697742597478747, + "grad_norm": 4.7297483925632715, + "learning_rate": 4.1383352872215714e-05, + "loss": 6.127054214477539, + "step": 353, + "token_acc": 0.08951850197088224 + }, + { + "epoch": 0.2075637642919965, + "grad_norm": 5.755085063329411, + "learning_rate": 4.150058616647128e-05, + "loss": 6.058117389678955, + "step": 354, + "token_acc": 0.09324453412615784 + }, + { + "epoch": 0.20815010260920552, + "grad_norm": 5.341676855616793, + "learning_rate": 4.1617819460726846e-05, + "loss": 5.9347381591796875, + "step": 355, + "token_acc": 0.10432863113897596 + }, + { + "epoch": 0.20873644092641455, + "grad_norm": 5.395934916865773, + "learning_rate": 4.1735052754982416e-05, + "loss": 6.054256439208984, + "step": 356, + "token_acc": 0.09926763192347078 + }, + { + "epoch": 0.20932277924362358, + "grad_norm": 5.715870640293422, + "learning_rate": 4.1852286049237985e-05, + "loss": 5.997771739959717, + "step": 357, + "token_acc": 0.09700314021161956 + }, + { + "epoch": 0.2099091175608326, + "grad_norm": 3.5615944041553607, + "learning_rate": 4.1969519343493555e-05, + "loss": 5.895035743713379, + "step": 358, + "token_acc": 0.10641487918318648 + }, + { + "epoch": 0.21049545587804164, + "grad_norm": 5.845631819196494, + "learning_rate": 4.2086752637749124e-05, + "loss": 5.883098602294922, + "step": 359, + "token_acc": 0.10628197675940909 + }, + { + "epoch": 0.21108179419525067, + "grad_norm": 3.5360548107743766, + "learning_rate": 4.2203985932004694e-05, + "loss": 5.856866836547852, + "step": 360, + "token_acc": 0.10811878248745563 + }, + { + "epoch": 0.2116681325124597, + "grad_norm": 5.967455575364195, + "learning_rate": 4.2321219226260263e-05, + "loss": 5.933638572692871, + "step": 361, + "token_acc": 0.10281906474726657 + }, + { + "epoch": 0.21225447082966872, + "grad_norm": 4.229025443485074, + "learning_rate": 4.243845252051583e-05, + "loss": 5.913669109344482, + "step": 362, + "token_acc": 0.1050239128567077 + }, + { + "epoch": 0.21284080914687775, + "grad_norm": 4.529543652120965, + "learning_rate": 4.2555685814771396e-05, + "loss": 5.800420761108398, + "step": 363, + "token_acc": 0.11265826075653192 + }, + { + "epoch": 0.21342714746408678, + "grad_norm": 5.686450704195899, + "learning_rate": 4.2672919109026965e-05, + "loss": 5.866429328918457, + "step": 364, + "token_acc": 0.1057487212408623 + }, + { + "epoch": 0.2140134857812958, + "grad_norm": 4.077313786677668, + "learning_rate": 4.2790152403282535e-05, + "loss": 5.747271537780762, + "step": 365, + "token_acc": 0.11442014276304292 + }, + { + "epoch": 0.21459982409850484, + "grad_norm": 6.672966648822387, + "learning_rate": 4.2907385697538104e-05, + "loss": 5.730378150939941, + "step": 366, + "token_acc": 0.11439315034922191 + }, + { + "epoch": 0.21518616241571387, + "grad_norm": 2.7752563636797394, + "learning_rate": 4.302461899179367e-05, + "loss": 5.722157955169678, + "step": 367, + "token_acc": 0.11698321017378943 + }, + { + "epoch": 0.2157725007329229, + "grad_norm": 8.241006700989637, + "learning_rate": 4.3141852286049237e-05, + "loss": 5.77549934387207, + "step": 368, + "token_acc": 0.11451595990905626 + }, + { + "epoch": 0.21635883905013192, + "grad_norm": 5.269185513436011, + "learning_rate": 4.3259085580304806e-05, + "loss": 5.682419776916504, + "step": 369, + "token_acc": 0.12174918787329506 + }, + { + "epoch": 0.21694517736734095, + "grad_norm": 5.705536472809987, + "learning_rate": 4.3376318874560376e-05, + "loss": 5.659437656402588, + "step": 370, + "token_acc": 0.11823349019193148 + }, + { + "epoch": 0.21753151568454998, + "grad_norm": 5.457771189060109, + "learning_rate": 4.3493552168815945e-05, + "loss": 5.651723861694336, + "step": 371, + "token_acc": 0.12209207554199754 + }, + { + "epoch": 0.218117854001759, + "grad_norm": 3.1384572810976428, + "learning_rate": 4.3610785463071515e-05, + "loss": 5.68341064453125, + "step": 372, + "token_acc": 0.1172328908535052 + }, + { + "epoch": 0.21870419231896804, + "grad_norm": 6.773099153647383, + "learning_rate": 4.3728018757327084e-05, + "loss": 5.680211067199707, + "step": 373, + "token_acc": 0.1181388900922757 + }, + { + "epoch": 0.21929053063617707, + "grad_norm": 3.9603005154282895, + "learning_rate": 4.3845252051582654e-05, + "loss": 5.662923812866211, + "step": 374, + "token_acc": 0.12033459095852522 + }, + { + "epoch": 0.2198768689533861, + "grad_norm": 4.891664810590013, + "learning_rate": 4.396248534583822e-05, + "loss": 5.65479850769043, + "step": 375, + "token_acc": 0.11968197935379797 + }, + { + "epoch": 0.22046320727059512, + "grad_norm": 4.451815920987612, + "learning_rate": 4.4079718640093786e-05, + "loss": 5.569565773010254, + "step": 376, + "token_acc": 0.12573834146086366 + }, + { + "epoch": 0.22104954558780415, + "grad_norm": 3.6537013107000744, + "learning_rate": 4.4196951934349355e-05, + "loss": 5.645937919616699, + "step": 377, + "token_acc": 0.1212388384973406 + }, + { + "epoch": 0.22163588390501318, + "grad_norm": 3.8855429079124764, + "learning_rate": 4.4314185228604925e-05, + "loss": 5.52020263671875, + "step": 378, + "token_acc": 0.12892019050975298 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 5.587337565697956, + "learning_rate": 4.4431418522860494e-05, + "loss": 5.542252540588379, + "step": 379, + "token_acc": 0.13029775523505793 + }, + { + "epoch": 0.22280856053943124, + "grad_norm": 2.542651347601647, + "learning_rate": 4.454865181711606e-05, + "loss": 5.49782657623291, + "step": 380, + "token_acc": 0.12988392743879235 + }, + { + "epoch": 0.2233948988566403, + "grad_norm": 7.035213414596234, + "learning_rate": 4.4665885111371633e-05, + "loss": 5.471724987030029, + "step": 381, + "token_acc": 0.13173882035975593 + }, + { + "epoch": 0.22398123717384932, + "grad_norm": 4.007597974750773, + "learning_rate": 4.47831184056272e-05, + "loss": 5.421903610229492, + "step": 382, + "token_acc": 0.1353257786024259 + }, + { + "epoch": 0.22456757549105835, + "grad_norm": 6.123846328762211, + "learning_rate": 4.490035169988277e-05, + "loss": 5.4561638832092285, + "step": 383, + "token_acc": 0.13245959184311193 + }, + { + "epoch": 0.22515391380826738, + "grad_norm": 3.915813309836124, + "learning_rate": 4.501758499413834e-05, + "loss": 5.427371978759766, + "step": 384, + "token_acc": 0.13743677567931242 + }, + { + "epoch": 0.2257402521254764, + "grad_norm": 5.051389456308207, + "learning_rate": 4.5134818288393905e-05, + "loss": 5.4404191970825195, + "step": 385, + "token_acc": 0.13028250727540924 + }, + { + "epoch": 0.22632659044268544, + "grad_norm": 3.7147143922772434, + "learning_rate": 4.5252051582649474e-05, + "loss": 5.433538436889648, + "step": 386, + "token_acc": 0.1338136581432912 + }, + { + "epoch": 0.22691292875989447, + "grad_norm": 3.842044243055899, + "learning_rate": 4.5369284876905044e-05, + "loss": 5.426792144775391, + "step": 387, + "token_acc": 0.13355265071679231 + }, + { + "epoch": 0.2274992670771035, + "grad_norm": 4.494266776847402, + "learning_rate": 4.548651817116061e-05, + "loss": 5.399279594421387, + "step": 388, + "token_acc": 0.13653473341978656 + }, + { + "epoch": 0.22808560539431252, + "grad_norm": 4.0793197936546415, + "learning_rate": 4.5603751465416176e-05, + "loss": 5.424678325653076, + "step": 389, + "token_acc": 0.13530543056793912 + }, + { + "epoch": 0.22867194371152155, + "grad_norm": 4.937034919559551, + "learning_rate": 4.5720984759671746e-05, + "loss": 5.320995330810547, + "step": 390, + "token_acc": 0.1419976913509804 + }, + { + "epoch": 0.22925828202873058, + "grad_norm": 5.49299598158518, + "learning_rate": 4.5838218053927315e-05, + "loss": 5.3018798828125, + "step": 391, + "token_acc": 0.14444821839035063 + }, + { + "epoch": 0.2298446203459396, + "grad_norm": 3.8325353825306303, + "learning_rate": 4.5955451348182885e-05, + "loss": 5.3399224281311035, + "step": 392, + "token_acc": 0.14038277289548506 + }, + { + "epoch": 0.23043095866314864, + "grad_norm": 5.20437433350758, + "learning_rate": 4.6072684642438454e-05, + "loss": 5.305937767028809, + "step": 393, + "token_acc": 0.14236234978409376 + }, + { + "epoch": 0.23101729698035767, + "grad_norm": 3.728657536120107, + "learning_rate": 4.6189917936694024e-05, + "loss": 5.201369285583496, + "step": 394, + "token_acc": 0.1493447073297205 + }, + { + "epoch": 0.2316036352975667, + "grad_norm": 2.9344606334239405, + "learning_rate": 4.630715123094959e-05, + "loss": 5.297689437866211, + "step": 395, + "token_acc": 0.14088288217292513 + }, + { + "epoch": 0.23218997361477572, + "grad_norm": 5.086839632347553, + "learning_rate": 4.642438452520516e-05, + "loss": 5.243603706359863, + "step": 396, + "token_acc": 0.14902199223803364 + }, + { + "epoch": 0.23277631193198475, + "grad_norm": 3.210427720308006, + "learning_rate": 4.654161781946073e-05, + "loss": 5.270680904388428, + "step": 397, + "token_acc": 0.14516630888662863 + }, + { + "epoch": 0.23336265024919378, + "grad_norm": 4.778326085569365, + "learning_rate": 4.6658851113716295e-05, + "loss": 5.325577735900879, + "step": 398, + "token_acc": 0.13558950291864524 + }, + { + "epoch": 0.2339489885664028, + "grad_norm": 3.6557315264797308, + "learning_rate": 4.6776084407971864e-05, + "loss": 5.205186367034912, + "step": 399, + "token_acc": 0.14969502934116122 + }, + { + "epoch": 0.23453532688361184, + "grad_norm": 5.398471352403463, + "learning_rate": 4.6893317702227434e-05, + "loss": 5.219882011413574, + "step": 400, + "token_acc": 0.14776963416945726 + }, + { + "epoch": 0.23512166520082087, + "grad_norm": 3.5376565697782043, + "learning_rate": 4.7010550996483003e-05, + "loss": 5.163187026977539, + "step": 401, + "token_acc": 0.15280870318055817 + }, + { + "epoch": 0.2357080035180299, + "grad_norm": 6.476770641315054, + "learning_rate": 4.712778429073857e-05, + "loss": 5.174548149108887, + "step": 402, + "token_acc": 0.14979405187777747 + }, + { + "epoch": 0.23629434183523892, + "grad_norm": 4.423286572106772, + "learning_rate": 4.7245017584994136e-05, + "loss": 5.136268138885498, + "step": 403, + "token_acc": 0.15266173311052758 + }, + { + "epoch": 0.23688068015244795, + "grad_norm": 4.778090928904879, + "learning_rate": 4.736225087924971e-05, + "loss": 5.147398471832275, + "step": 404, + "token_acc": 0.15271420087556561 + }, + { + "epoch": 0.23746701846965698, + "grad_norm": 3.827407600350183, + "learning_rate": 4.747948417350528e-05, + "loss": 5.075559616088867, + "step": 405, + "token_acc": 0.15962479565608195 + }, + { + "epoch": 0.238053356786866, + "grad_norm": 3.0031702278636634, + "learning_rate": 4.759671746776085e-05, + "loss": 5.111988067626953, + "step": 406, + "token_acc": 0.15723925391241372 + }, + { + "epoch": 0.23863969510407504, + "grad_norm": 5.420472709231712, + "learning_rate": 4.7713950762016414e-05, + "loss": 5.096987724304199, + "step": 407, + "token_acc": 0.1567969515346462 + }, + { + "epoch": 0.23922603342128407, + "grad_norm": 5.449500272978671, + "learning_rate": 4.783118405627198e-05, + "loss": 5.149758338928223, + "step": 408, + "token_acc": 0.1505394283172061 + }, + { + "epoch": 0.23981237173849312, + "grad_norm": 4.608375821577883, + "learning_rate": 4.794841735052755e-05, + "loss": 5.046197891235352, + "step": 409, + "token_acc": 0.15915106813062027 + }, + { + "epoch": 0.24039871005570215, + "grad_norm": 3.4576392609912467, + "learning_rate": 4.806565064478312e-05, + "loss": 5.0519208908081055, + "step": 410, + "token_acc": 0.16108123017514825 + }, + { + "epoch": 0.24098504837291118, + "grad_norm": 4.179272701853546, + "learning_rate": 4.8182883939038685e-05, + "loss": 4.946943283081055, + "step": 411, + "token_acc": 0.16871528649976275 + }, + { + "epoch": 0.2415713866901202, + "grad_norm": 3.675872246847288, + "learning_rate": 4.8300117233294255e-05, + "loss": 5.007920265197754, + "step": 412, + "token_acc": 0.16317211767851533 + }, + { + "epoch": 0.24215772500732924, + "grad_norm": 4.9142371455491975, + "learning_rate": 4.8417350527549824e-05, + "loss": 5.013101577758789, + "step": 413, + "token_acc": 0.15803190248750165 + }, + { + "epoch": 0.24274406332453827, + "grad_norm": 3.1770908201582193, + "learning_rate": 4.8534583821805394e-05, + "loss": 4.974532604217529, + "step": 414, + "token_acc": 0.16620475884176283 + }, + { + "epoch": 0.2433304016417473, + "grad_norm": 5.089356478874053, + "learning_rate": 4.865181711606096e-05, + "loss": 5.017539978027344, + "step": 415, + "token_acc": 0.15953471249268353 + }, + { + "epoch": 0.24391673995895632, + "grad_norm": 3.0222424743635856, + "learning_rate": 4.876905041031653e-05, + "loss": 5.045172691345215, + "step": 416, + "token_acc": 0.1563271753528109 + }, + { + "epoch": 0.24450307827616535, + "grad_norm": 4.265443053126318, + "learning_rate": 4.88862837045721e-05, + "loss": 4.937203407287598, + "step": 417, + "token_acc": 0.16491518885726497 + }, + { + "epoch": 0.24508941659337438, + "grad_norm": 3.310746147527371, + "learning_rate": 4.900351699882767e-05, + "loss": 4.929368019104004, + "step": 418, + "token_acc": 0.16673037442441518 + }, + { + "epoch": 0.2456757549105834, + "grad_norm": 3.9085209966297474, + "learning_rate": 4.912075029308324e-05, + "loss": 4.954566955566406, + "step": 419, + "token_acc": 0.163182649935057 + }, + { + "epoch": 0.24626209322779244, + "grad_norm": 5.4866721056533585, + "learning_rate": 4.9237983587338804e-05, + "loss": 4.969094276428223, + "step": 420, + "token_acc": 0.15997041320646874 + }, + { + "epoch": 0.24684843154500147, + "grad_norm": 3.3752213125571915, + "learning_rate": 4.9355216881594373e-05, + "loss": 4.968939304351807, + "step": 421, + "token_acc": 0.1638807205676675 + }, + { + "epoch": 0.2474347698622105, + "grad_norm": 4.646425434911336, + "learning_rate": 4.947245017584994e-05, + "loss": 4.950305938720703, + "step": 422, + "token_acc": 0.16355955125645616 + }, + { + "epoch": 0.24802110817941952, + "grad_norm": 3.8480374814367377, + "learning_rate": 4.958968347010551e-05, + "loss": 4.9508562088012695, + "step": 423, + "token_acc": 0.16265855959744208 + }, + { + "epoch": 0.24860744649662855, + "grad_norm": 2.7654982744712124, + "learning_rate": 4.970691676436108e-05, + "loss": 4.805039405822754, + "step": 424, + "token_acc": 0.1760382252263728 + }, + { + "epoch": 0.24919378481383758, + "grad_norm": 4.423193541228044, + "learning_rate": 4.9824150058616645e-05, + "loss": 4.82535457611084, + "step": 425, + "token_acc": 0.17515355442282757 + }, + { + "epoch": 0.2497801231310466, + "grad_norm": 4.61103093354574, + "learning_rate": 4.9941383352872214e-05, + "loss": 4.852334499359131, + "step": 426, + "token_acc": 0.1716508990275111 + }, + { + "epoch": 0.25036646144825564, + "grad_norm": 3.5509009977431463, + "learning_rate": 5.005861664712779e-05, + "loss": 4.856379508972168, + "step": 427, + "token_acc": 0.17005668037198926 + }, + { + "epoch": 0.2509527997654647, + "grad_norm": 3.2601719431377774, + "learning_rate": 5.017584994138336e-05, + "loss": 4.8676605224609375, + "step": 428, + "token_acc": 0.16826130775559006 + }, + { + "epoch": 0.2515391380826737, + "grad_norm": 4.232915465436352, + "learning_rate": 5.029308323563893e-05, + "loss": 4.851711273193359, + "step": 429, + "token_acc": 0.17420452430952443 + }, + { + "epoch": 0.25212547639988275, + "grad_norm": 3.884975874428379, + "learning_rate": 5.04103165298945e-05, + "loss": 4.870162487030029, + "step": 430, + "token_acc": 0.1702800798191851 + }, + { + "epoch": 0.25271181471709175, + "grad_norm": 4.682919672338841, + "learning_rate": 5.0527549824150055e-05, + "loss": 4.756889343261719, + "step": 431, + "token_acc": 0.18132139178353895 + }, + { + "epoch": 0.2532981530343008, + "grad_norm": 3.3294638413452087, + "learning_rate": 5.0644783118405625e-05, + "loss": 4.807443618774414, + "step": 432, + "token_acc": 0.1746914565177199 + }, + { + "epoch": 0.2538844913515098, + "grad_norm": 4.029265109002957, + "learning_rate": 5.0762016412661194e-05, + "loss": 4.8091630935668945, + "step": 433, + "token_acc": 0.17244274906875232 + }, + { + "epoch": 0.25447082966871887, + "grad_norm": 4.566084594423671, + "learning_rate": 5.0879249706916764e-05, + "loss": 4.794814586639404, + "step": 434, + "token_acc": 0.17268670627378901 + }, + { + "epoch": 0.25505716798592787, + "grad_norm": 2.671062649057688, + "learning_rate": 5.099648300117233e-05, + "loss": 4.72795295715332, + "step": 435, + "token_acc": 0.17818418092444874 + }, + { + "epoch": 0.2556435063031369, + "grad_norm": 4.478555987155282, + "learning_rate": 5.11137162954279e-05, + "loss": 4.773293495178223, + "step": 436, + "token_acc": 0.1773602397430093 + }, + { + "epoch": 0.2562298446203459, + "grad_norm": 2.657925898560469, + "learning_rate": 5.123094958968347e-05, + "loss": 4.810330867767334, + "step": 437, + "token_acc": 0.1714175287751882 + }, + { + "epoch": 0.256816182937555, + "grad_norm": 6.023820501185185, + "learning_rate": 5.134818288393904e-05, + "loss": 4.823758125305176, + "step": 438, + "token_acc": 0.17141139311959966 + }, + { + "epoch": 0.257402521254764, + "grad_norm": 3.3973305175625237, + "learning_rate": 5.146541617819461e-05, + "loss": 4.7239603996276855, + "step": 439, + "token_acc": 0.18132137159824996 + }, + { + "epoch": 0.25798885957197304, + "grad_norm": 5.365086437028126, + "learning_rate": 5.1582649472450174e-05, + "loss": 4.755255222320557, + "step": 440, + "token_acc": 0.17837833469374562 + }, + { + "epoch": 0.25857519788918204, + "grad_norm": 4.070008151234846, + "learning_rate": 5.1699882766705743e-05, + "loss": 4.7630934715271, + "step": 441, + "token_acc": 0.17507118100162025 + }, + { + "epoch": 0.2591615362063911, + "grad_norm": 4.015443808366917, + "learning_rate": 5.181711606096131e-05, + "loss": 4.7503342628479, + "step": 442, + "token_acc": 0.1794966939295083 + }, + { + "epoch": 0.2597478745236001, + "grad_norm": 3.7405364322711496, + "learning_rate": 5.193434935521688e-05, + "loss": 4.753812313079834, + "step": 443, + "token_acc": 0.17403056114921983 + }, + { + "epoch": 0.26033421284080915, + "grad_norm": 4.255768332015965, + "learning_rate": 5.205158264947245e-05, + "loss": 4.750164031982422, + "step": 444, + "token_acc": 0.17661911771639313 + }, + { + "epoch": 0.26092055115801815, + "grad_norm": 3.344182570884632, + "learning_rate": 5.216881594372802e-05, + "loss": 4.712779998779297, + "step": 445, + "token_acc": 0.17827867186589672 + }, + { + "epoch": 0.2615068894752272, + "grad_norm": 3.2942120060425526, + "learning_rate": 5.228604923798359e-05, + "loss": 4.713029861450195, + "step": 446, + "token_acc": 0.17849801394105191 + }, + { + "epoch": 0.2620932277924362, + "grad_norm": 3.631913296407001, + "learning_rate": 5.240328253223916e-05, + "loss": 4.672717094421387, + "step": 447, + "token_acc": 0.1823320681065526 + }, + { + "epoch": 0.26267956610964527, + "grad_norm": 3.5917764041423896, + "learning_rate": 5.252051582649473e-05, + "loss": 4.751844882965088, + "step": 448, + "token_acc": 0.17539330716575505 + }, + { + "epoch": 0.26326590442685427, + "grad_norm": 4.66199453953803, + "learning_rate": 5.263774912075029e-05, + "loss": 4.6465606689453125, + "step": 449, + "token_acc": 0.18647601638893072 + }, + { + "epoch": 0.2638522427440633, + "grad_norm": 3.110033847968872, + "learning_rate": 5.275498241500586e-05, + "loss": 4.732669830322266, + "step": 450, + "token_acc": 0.17615747478164193 + }, + { + "epoch": 0.2644385810612723, + "grad_norm": 6.136425267770306, + "learning_rate": 5.287221570926143e-05, + "loss": 4.752108573913574, + "step": 451, + "token_acc": 0.17556507392085308 + }, + { + "epoch": 0.2650249193784814, + "grad_norm": 2.454061213324948, + "learning_rate": 5.2989449003517e-05, + "loss": 4.7041215896606445, + "step": 452, + "token_acc": 0.17846731011200684 + }, + { + "epoch": 0.26561125769569044, + "grad_norm": 5.095585691439736, + "learning_rate": 5.310668229777257e-05, + "loss": 4.647067070007324, + "step": 453, + "token_acc": 0.18367408715393832 + }, + { + "epoch": 0.26619759601289944, + "grad_norm": 3.912351219840878, + "learning_rate": 5.322391559202814e-05, + "loss": 4.601974010467529, + "step": 454, + "token_acc": 0.18942634666886832 + }, + { + "epoch": 0.2667839343301085, + "grad_norm": 3.1909408395324905, + "learning_rate": 5.334114888628371e-05, + "loss": 4.691964149475098, + "step": 455, + "token_acc": 0.17834420158649184 + }, + { + "epoch": 0.2673702726473175, + "grad_norm": 3.6206275155656824, + "learning_rate": 5.345838218053928e-05, + "loss": 4.594367027282715, + "step": 456, + "token_acc": 0.18779055573401862 + }, + { + "epoch": 0.26795661096452655, + "grad_norm": 3.0073995915152962, + "learning_rate": 5.357561547479485e-05, + "loss": 4.679717063903809, + "step": 457, + "token_acc": 0.18085086832674813 + }, + { + "epoch": 0.26854294928173555, + "grad_norm": 3.7129917382547335, + "learning_rate": 5.3692848769050405e-05, + "loss": 4.6553473472595215, + "step": 458, + "token_acc": 0.1805312309849467 + }, + { + "epoch": 0.2691292875989446, + "grad_norm": 2.7666424999717423, + "learning_rate": 5.3810082063305974e-05, + "loss": 4.669938087463379, + "step": 459, + "token_acc": 0.1797411519543521 + }, + { + "epoch": 0.2697156259161536, + "grad_norm": 3.4278824474403264, + "learning_rate": 5.392731535756155e-05, + "loss": 4.65788459777832, + "step": 460, + "token_acc": 0.17988725461707755 + }, + { + "epoch": 0.27030196423336267, + "grad_norm": 3.8472485397278873, + "learning_rate": 5.404454865181712e-05, + "loss": 4.689602851867676, + "step": 461, + "token_acc": 0.17704811200316223 + }, + { + "epoch": 0.27088830255057167, + "grad_norm": 2.823584758680837, + "learning_rate": 5.416178194607269e-05, + "loss": 4.56749153137207, + "step": 462, + "token_acc": 0.1920817936233539 + }, + { + "epoch": 0.2714746408677807, + "grad_norm": 3.6490140941426192, + "learning_rate": 5.427901524032826e-05, + "loss": 4.528097152709961, + "step": 463, + "token_acc": 0.19415251296067332 + }, + { + "epoch": 0.2720609791849897, + "grad_norm": 4.7132989375231755, + "learning_rate": 5.439624853458383e-05, + "loss": 4.581571578979492, + "step": 464, + "token_acc": 0.18678572372070906 + }, + { + "epoch": 0.2726473175021988, + "grad_norm": 2.755636297750266, + "learning_rate": 5.45134818288394e-05, + "loss": 4.650444984436035, + "step": 465, + "token_acc": 0.17997130267645373 + }, + { + "epoch": 0.2732336558194078, + "grad_norm": 4.397395293628881, + "learning_rate": 5.463071512309497e-05, + "loss": 4.568827152252197, + "step": 466, + "token_acc": 0.1866361042097733 + }, + { + "epoch": 0.27381999413661684, + "grad_norm": 2.986730148647632, + "learning_rate": 5.4747948417350524e-05, + "loss": 4.6009202003479, + "step": 467, + "token_acc": 0.18425193276548105 + }, + { + "epoch": 0.27440633245382584, + "grad_norm": 4.699682073573299, + "learning_rate": 5.486518171160609e-05, + "loss": 4.602838516235352, + "step": 468, + "token_acc": 0.1868637541790316 + }, + { + "epoch": 0.2749926707710349, + "grad_norm": 2.8796390464984034, + "learning_rate": 5.498241500586166e-05, + "loss": 4.545544624328613, + "step": 469, + "token_acc": 0.18741544055470996 + }, + { + "epoch": 0.2755790090882439, + "grad_norm": 5.3082079793418115, + "learning_rate": 5.509964830011723e-05, + "loss": 4.601395130157471, + "step": 470, + "token_acc": 0.18333350748140098 + }, + { + "epoch": 0.27616534740545295, + "grad_norm": 3.121777272541811, + "learning_rate": 5.52168815943728e-05, + "loss": 4.559847354888916, + "step": 471, + "token_acc": 0.19044785427520886 + }, + { + "epoch": 0.27675168572266196, + "grad_norm": 3.5974998778561282, + "learning_rate": 5.533411488862837e-05, + "loss": 4.589479446411133, + "step": 472, + "token_acc": 0.18454644037680137 + }, + { + "epoch": 0.277338024039871, + "grad_norm": 3.7081488343461184, + "learning_rate": 5.545134818288395e-05, + "loss": 4.587972640991211, + "step": 473, + "token_acc": 0.1865380705053056 + }, + { + "epoch": 0.27792436235708, + "grad_norm": 3.132876670561137, + "learning_rate": 5.556858147713952e-05, + "loss": 4.512928485870361, + "step": 474, + "token_acc": 0.19116964313349222 + }, + { + "epoch": 0.27851070067428907, + "grad_norm": 4.563254403654539, + "learning_rate": 5.568581477139509e-05, + "loss": 4.464240074157715, + "step": 475, + "token_acc": 0.1967592958287914 + }, + { + "epoch": 0.27909703899149807, + "grad_norm": 2.531606706540315, + "learning_rate": 5.580304806565064e-05, + "loss": 4.568399429321289, + "step": 476, + "token_acc": 0.18655219884448201 + }, + { + "epoch": 0.2796833773087071, + "grad_norm": 3.246063240066807, + "learning_rate": 5.592028135990621e-05, + "loss": 4.511410236358643, + "step": 477, + "token_acc": 0.1905886843449062 + }, + { + "epoch": 0.2802697156259161, + "grad_norm": 2.9008077674361146, + "learning_rate": 5.603751465416178e-05, + "loss": 4.5342512130737305, + "step": 478, + "token_acc": 0.1906546070687063 + }, + { + "epoch": 0.2808560539431252, + "grad_norm": 4.482368078854062, + "learning_rate": 5.615474794841735e-05, + "loss": 4.595100402832031, + "step": 479, + "token_acc": 0.18227198923855414 + }, + { + "epoch": 0.28144239226033424, + "grad_norm": 2.8591624906446964, + "learning_rate": 5.627198124267292e-05, + "loss": 4.464855194091797, + "step": 480, + "token_acc": 0.19437381590122166 + }, + { + "epoch": 0.28202873057754324, + "grad_norm": 3.732394683730308, + "learning_rate": 5.638921453692849e-05, + "loss": 4.487558364868164, + "step": 481, + "token_acc": 0.19142626325801995 + }, + { + "epoch": 0.2826150688947523, + "grad_norm": 3.0738507311070222, + "learning_rate": 5.650644783118406e-05, + "loss": 4.506575107574463, + "step": 482, + "token_acc": 0.18991745469496188 + }, + { + "epoch": 0.2832014072119613, + "grad_norm": 2.640228879884843, + "learning_rate": 5.662368112543963e-05, + "loss": 4.508360385894775, + "step": 483, + "token_acc": 0.1901190896678247 + }, + { + "epoch": 0.28378774552917035, + "grad_norm": 4.1319681597755284, + "learning_rate": 5.67409144196952e-05, + "loss": 4.513545036315918, + "step": 484, + "token_acc": 0.1895675356064771 + }, + { + "epoch": 0.28437408384637936, + "grad_norm": 1.9575377377133154, + "learning_rate": 5.685814771395076e-05, + "loss": 4.48956298828125, + "step": 485, + "token_acc": 0.19050407965148464 + }, + { + "epoch": 0.2849604221635884, + "grad_norm": 4.50932084457957, + "learning_rate": 5.697538100820633e-05, + "loss": 4.452964782714844, + "step": 486, + "token_acc": 0.1954853463892806 + }, + { + "epoch": 0.2855467604807974, + "grad_norm": 2.9427026965492, + "learning_rate": 5.70926143024619e-05, + "loss": 4.47849178314209, + "step": 487, + "token_acc": 0.19249258797281396 + }, + { + "epoch": 0.28613309879800647, + "grad_norm": 3.480974505558243, + "learning_rate": 5.720984759671747e-05, + "loss": 4.5085368156433105, + "step": 488, + "token_acc": 0.19087664413766048 + }, + { + "epoch": 0.28671943711521547, + "grad_norm": 3.7232006890841616, + "learning_rate": 5.732708089097304e-05, + "loss": 4.407660484313965, + "step": 489, + "token_acc": 0.1992382192647413 + }, + { + "epoch": 0.2873057754324245, + "grad_norm": 2.7136169159354, + "learning_rate": 5.744431418522861e-05, + "loss": 4.480723857879639, + "step": 490, + "token_acc": 0.19082514871551567 + }, + { + "epoch": 0.2878921137496335, + "grad_norm": 2.9036343074174873, + "learning_rate": 5.756154747948418e-05, + "loss": 4.38192892074585, + "step": 491, + "token_acc": 0.20023494677312043 + }, + { + "epoch": 0.2884784520668426, + "grad_norm": 3.5612553025697067, + "learning_rate": 5.767878077373975e-05, + "loss": 4.463859558105469, + "step": 492, + "token_acc": 0.19478429944158673 + }, + { + "epoch": 0.2890647903840516, + "grad_norm": 4.205181719826563, + "learning_rate": 5.779601406799532e-05, + "loss": 4.446661472320557, + "step": 493, + "token_acc": 0.19458880257165817 + }, + { + "epoch": 0.28965112870126064, + "grad_norm": 3.5094414677268015, + "learning_rate": 5.791324736225088e-05, + "loss": 4.450451850891113, + "step": 494, + "token_acc": 0.19391485323613777 + }, + { + "epoch": 0.29023746701846964, + "grad_norm": 3.34698573124997, + "learning_rate": 5.803048065650645e-05, + "loss": 4.456144332885742, + "step": 495, + "token_acc": 0.19304447214220327 + }, + { + "epoch": 0.2908238053356787, + "grad_norm": 2.753792317524598, + "learning_rate": 5.814771395076202e-05, + "loss": 4.411314010620117, + "step": 496, + "token_acc": 0.19554556279769367 + }, + { + "epoch": 0.2914101436528877, + "grad_norm": 3.8606605865456904, + "learning_rate": 5.826494724501759e-05, + "loss": 4.470166206359863, + "step": 497, + "token_acc": 0.1917171439892062 + }, + { + "epoch": 0.29199648197009676, + "grad_norm": 2.455948618551917, + "learning_rate": 5.838218053927316e-05, + "loss": 4.377389430999756, + "step": 498, + "token_acc": 0.19982438861704732 + }, + { + "epoch": 0.29258282028730576, + "grad_norm": 5.343417119635586, + "learning_rate": 5.849941383352873e-05, + "loss": 4.406900882720947, + "step": 499, + "token_acc": 0.1980844031385122 + }, + { + "epoch": 0.2931691586045148, + "grad_norm": 2.9239887603882244, + "learning_rate": 5.86166471277843e-05, + "loss": 4.51575231552124, + "step": 500, + "token_acc": 0.1866204138401301 + }, + { + "epoch": 0.2937554969217238, + "grad_norm": 2.800433513644758, + "learning_rate": 5.873388042203987e-05, + "loss": 4.3633198738098145, + "step": 501, + "token_acc": 0.20142863367361846 + }, + { + "epoch": 0.29434183523893287, + "grad_norm": 3.828413682092989, + "learning_rate": 5.8851113716295437e-05, + "loss": 4.386670112609863, + "step": 502, + "token_acc": 0.20040446044356172 + }, + { + "epoch": 0.29492817355614187, + "grad_norm": 2.8804133009076853, + "learning_rate": 5.896834701055099e-05, + "loss": 4.323336601257324, + "step": 503, + "token_acc": 0.20636026312722425 + }, + { + "epoch": 0.2955145118733509, + "grad_norm": 2.643471056930714, + "learning_rate": 5.908558030480656e-05, + "loss": 4.417023658752441, + "step": 504, + "token_acc": 0.19429314731992475 + }, + { + "epoch": 0.2961008501905599, + "grad_norm": 3.369462429534447, + "learning_rate": 5.920281359906213e-05, + "loss": 4.443114757537842, + "step": 505, + "token_acc": 0.19048538440746032 + }, + { + "epoch": 0.296687188507769, + "grad_norm": 3.7701041467513283, + "learning_rate": 5.932004689331771e-05, + "loss": 4.392706394195557, + "step": 506, + "token_acc": 0.19715420061212915 + }, + { + "epoch": 0.297273526824978, + "grad_norm": 2.4770220668655187, + "learning_rate": 5.943728018757328e-05, + "loss": 4.408752918243408, + "step": 507, + "token_acc": 0.19628239302219208 + }, + { + "epoch": 0.29785986514218704, + "grad_norm": 3.7387126188839277, + "learning_rate": 5.955451348182885e-05, + "loss": 4.347857475280762, + "step": 508, + "token_acc": 0.20171198176137803 + }, + { + "epoch": 0.2984462034593961, + "grad_norm": 3.23206754272277, + "learning_rate": 5.9671746776084416e-05, + "loss": 4.395596981048584, + "step": 509, + "token_acc": 0.19687374442765032 + }, + { + "epoch": 0.2990325417766051, + "grad_norm": 3.358368391772889, + "learning_rate": 5.9788980070339986e-05, + "loss": 4.325115203857422, + "step": 510, + "token_acc": 0.2025297587359724 + }, + { + "epoch": 0.29961888009381415, + "grad_norm": 2.6976862248522515, + "learning_rate": 5.990621336459554e-05, + "loss": 4.369183540344238, + "step": 511, + "token_acc": 0.19953872324438277 + }, + { + "epoch": 0.30020521841102316, + "grad_norm": 2.4095643340089823, + "learning_rate": 6.002344665885111e-05, + "loss": 4.426271438598633, + "step": 512, + "token_acc": 0.19135800869278996 + }, + { + "epoch": 0.3007915567282322, + "grad_norm": 3.8703607332517675, + "learning_rate": 6.014067995310668e-05, + "loss": 4.4024763107299805, + "step": 513, + "token_acc": 0.1927203055160008 + }, + { + "epoch": 0.3013778950454412, + "grad_norm": 1.6644479259440794, + "learning_rate": 6.025791324736225e-05, + "loss": 4.343623161315918, + "step": 514, + "token_acc": 0.19906187400591813 + }, + { + "epoch": 0.30196423336265027, + "grad_norm": 3.480729105453486, + "learning_rate": 6.037514654161782e-05, + "loss": 4.367980480194092, + "step": 515, + "token_acc": 0.1987372864490044 + }, + { + "epoch": 0.30255057167985927, + "grad_norm": 2.6814707008914342, + "learning_rate": 6.049237983587339e-05, + "loss": 4.429714679718018, + "step": 516, + "token_acc": 0.19339287208232242 + }, + { + "epoch": 0.3031369099970683, + "grad_norm": 2.785312769386459, + "learning_rate": 6.060961313012896e-05, + "loss": 4.3388566970825195, + "step": 517, + "token_acc": 0.20033506676621549 + }, + { + "epoch": 0.3037232483142773, + "grad_norm": 2.495024892629932, + "learning_rate": 6.072684642438453e-05, + "loss": 4.331653594970703, + "step": 518, + "token_acc": 0.20044875808808182 + }, + { + "epoch": 0.3043095866314864, + "grad_norm": 3.4708620091733438, + "learning_rate": 6.08440797186401e-05, + "loss": 4.390961647033691, + "step": 519, + "token_acc": 0.1969736034213751 + }, + { + "epoch": 0.3048959249486954, + "grad_norm": 3.033005292285393, + "learning_rate": 6.096131301289566e-05, + "loss": 4.349078178405762, + "step": 520, + "token_acc": 0.19735757707282164 + }, + { + "epoch": 0.30548226326590444, + "grad_norm": 2.5937866371977747, + "learning_rate": 6.107854630715122e-05, + "loss": 4.394092559814453, + "step": 521, + "token_acc": 0.1937719465109225 + }, + { + "epoch": 0.30606860158311344, + "grad_norm": 3.1840985236302886, + "learning_rate": 6.11957796014068e-05, + "loss": 4.303842544555664, + "step": 522, + "token_acc": 0.20364115984287895 + }, + { + "epoch": 0.3066549399003225, + "grad_norm": 3.1289547158734514, + "learning_rate": 6.131301289566238e-05, + "loss": 4.325160503387451, + "step": 523, + "token_acc": 0.20073300492610838 + }, + { + "epoch": 0.3072412782175315, + "grad_norm": 3.0988410176487204, + "learning_rate": 6.143024618991794e-05, + "loss": 4.369857311248779, + "step": 524, + "token_acc": 0.1947928981836714 + }, + { + "epoch": 0.30782761653474056, + "grad_norm": 2.41363271069774, + "learning_rate": 6.154747948417352e-05, + "loss": 4.2893781661987305, + "step": 525, + "token_acc": 0.20386245102430636 + }, + { + "epoch": 0.30841395485194956, + "grad_norm": 3.4778544856462688, + "learning_rate": 6.166471277842908e-05, + "loss": 4.30779504776001, + "step": 526, + "token_acc": 0.2030679195216372 + }, + { + "epoch": 0.3090002931691586, + "grad_norm": 2.251000694494349, + "learning_rate": 6.178194607268465e-05, + "loss": 4.313277244567871, + "step": 527, + "token_acc": 0.20145233757064016 + }, + { + "epoch": 0.3095866314863676, + "grad_norm": 3.6065247266884075, + "learning_rate": 6.189917936694022e-05, + "loss": 4.308966159820557, + "step": 528, + "token_acc": 0.2023708563351735 + }, + { + "epoch": 0.31017296980357667, + "grad_norm": 2.961381538778253, + "learning_rate": 6.201641266119578e-05, + "loss": 4.340670108795166, + "step": 529, + "token_acc": 0.1994566223499258 + }, + { + "epoch": 0.31075930812078567, + "grad_norm": 3.125363991499523, + "learning_rate": 6.213364595545134e-05, + "loss": 4.285861015319824, + "step": 530, + "token_acc": 0.20383868029825172 + }, + { + "epoch": 0.3113456464379947, + "grad_norm": 2.289866650220009, + "learning_rate": 6.225087924970692e-05, + "loss": 4.2917022705078125, + "step": 531, + "token_acc": 0.20222967263608363 + }, + { + "epoch": 0.31193198475520373, + "grad_norm": 3.382631909872567, + "learning_rate": 6.236811254396248e-05, + "loss": 4.259230136871338, + "step": 532, + "token_acc": 0.2058734019101656 + }, + { + "epoch": 0.3125183230724128, + "grad_norm": 3.110635914721146, + "learning_rate": 6.248534583821806e-05, + "loss": 4.2336039543151855, + "step": 533, + "token_acc": 0.20849226414753255 + }, + { + "epoch": 0.3131046613896218, + "grad_norm": 3.0024036914041328, + "learning_rate": 6.260257913247362e-05, + "loss": 4.2339277267456055, + "step": 534, + "token_acc": 0.20773492774766358 + }, + { + "epoch": 0.31369099970683084, + "grad_norm": 2.4222094918046535, + "learning_rate": 6.27198124267292e-05, + "loss": 4.2574992179870605, + "step": 535, + "token_acc": 0.20562153012503526 + }, + { + "epoch": 0.3142773380240399, + "grad_norm": 3.3072388970515414, + "learning_rate": 6.283704572098477e-05, + "loss": 4.30474853515625, + "step": 536, + "token_acc": 0.2000345180293251 + }, + { + "epoch": 0.3148636763412489, + "grad_norm": 2.4305479203971174, + "learning_rate": 6.295427901524034e-05, + "loss": 4.312208652496338, + "step": 537, + "token_acc": 0.1995582754336723 + }, + { + "epoch": 0.31545001465845796, + "grad_norm": 2.9078070087946886, + "learning_rate": 6.30715123094959e-05, + "loss": 4.245410919189453, + "step": 538, + "token_acc": 0.20610069637343187 + }, + { + "epoch": 0.31603635297566696, + "grad_norm": 3.0994651985418256, + "learning_rate": 6.318874560375146e-05, + "loss": 4.257229804992676, + "step": 539, + "token_acc": 0.20460338566165456 + }, + { + "epoch": 0.316622691292876, + "grad_norm": 2.3054571015684946, + "learning_rate": 6.330597889800704e-05, + "loss": 4.27479362487793, + "step": 540, + "token_acc": 0.2029582760249069 + }, + { + "epoch": 0.317209029610085, + "grad_norm": 3.8815160826069444, + "learning_rate": 6.34232121922626e-05, + "loss": 4.2639007568359375, + "step": 541, + "token_acc": 0.20444951698603656 + }, + { + "epoch": 0.31779536792729407, + "grad_norm": 2.1634279733391244, + "learning_rate": 6.354044548651818e-05, + "loss": 4.313498497009277, + "step": 542, + "token_acc": 0.20060012214875594 + }, + { + "epoch": 0.31838170624450307, + "grad_norm": 3.305437100668334, + "learning_rate": 6.365767878077374e-05, + "loss": 4.308589935302734, + "step": 543, + "token_acc": 0.1977927693446581 + }, + { + "epoch": 0.3189680445617121, + "grad_norm": 3.025695879328745, + "learning_rate": 6.377491207502932e-05, + "loss": 4.358048439025879, + "step": 544, + "token_acc": 0.19489793545686834 + }, + { + "epoch": 0.31955438287892113, + "grad_norm": 3.12511711141683, + "learning_rate": 6.389214536928488e-05, + "loss": 4.206076622009277, + "step": 545, + "token_acc": 0.21111521694751448 + }, + { + "epoch": 0.3201407211961302, + "grad_norm": 2.0898254045131877, + "learning_rate": 6.400937866354045e-05, + "loss": 4.251198768615723, + "step": 546, + "token_acc": 0.20295281799774112 + }, + { + "epoch": 0.3207270595133392, + "grad_norm": 2.364299326785073, + "learning_rate": 6.412661195779602e-05, + "loss": 4.267653942108154, + "step": 547, + "token_acc": 0.20484509545151638 + }, + { + "epoch": 0.32131339783054824, + "grad_norm": 2.9441084483992395, + "learning_rate": 6.424384525205158e-05, + "loss": 4.3124775886535645, + "step": 548, + "token_acc": 0.19838106032508473 + }, + { + "epoch": 0.32189973614775724, + "grad_norm": 3.449394281505019, + "learning_rate": 6.436107854630716e-05, + "loss": 4.265442371368408, + "step": 549, + "token_acc": 0.20195000544906644 + }, + { + "epoch": 0.3224860744649663, + "grad_norm": 2.5815111328172615, + "learning_rate": 6.447831184056272e-05, + "loss": 4.201301097869873, + "step": 550, + "token_acc": 0.20851545146530456 + }, + { + "epoch": 0.3230724127821753, + "grad_norm": 3.2272637224586096, + "learning_rate": 6.45955451348183e-05, + "loss": 4.259978771209717, + "step": 551, + "token_acc": 0.20282464020967703 + }, + { + "epoch": 0.32365875109938436, + "grad_norm": 2.0735665000713617, + "learning_rate": 6.471277842907386e-05, + "loss": 4.219204902648926, + "step": 552, + "token_acc": 0.20681528987982986 + }, + { + "epoch": 0.32424508941659336, + "grad_norm": 2.9399438736941934, + "learning_rate": 6.483001172332943e-05, + "loss": 4.272747993469238, + "step": 553, + "token_acc": 0.20200138337145113 + }, + { + "epoch": 0.3248314277338024, + "grad_norm": 2.4112188164337893, + "learning_rate": 6.4947245017585e-05, + "loss": 4.183073997497559, + "step": 554, + "token_acc": 0.2122227612683225 + }, + { + "epoch": 0.3254177660510114, + "grad_norm": 3.008468569567238, + "learning_rate": 6.506447831184057e-05, + "loss": 4.204877853393555, + "step": 555, + "token_acc": 0.21000351304153025 + }, + { + "epoch": 0.32600410436822047, + "grad_norm": 3.102461422246427, + "learning_rate": 6.518171160609614e-05, + "loss": 4.25691032409668, + "step": 556, + "token_acc": 0.20268615728654027 + }, + { + "epoch": 0.32659044268542947, + "grad_norm": 3.669523274302679, + "learning_rate": 6.52989449003517e-05, + "loss": 4.295387268066406, + "step": 557, + "token_acc": 0.19698025911524897 + }, + { + "epoch": 0.32717678100263853, + "grad_norm": 1.96765863474173, + "learning_rate": 6.541617819460728e-05, + "loss": 4.196666717529297, + "step": 558, + "token_acc": 0.20732685099124679 + }, + { + "epoch": 0.32776311931984753, + "grad_norm": 2.432340640743014, + "learning_rate": 6.553341148886284e-05, + "loss": 4.208634853363037, + "step": 559, + "token_acc": 0.20784644069711897 + }, + { + "epoch": 0.3283494576370566, + "grad_norm": 3.0206287169947728, + "learning_rate": 6.565064478311841e-05, + "loss": 4.251628875732422, + "step": 560, + "token_acc": 0.20332906600440653 + }, + { + "epoch": 0.3289357959542656, + "grad_norm": 2.116632864769787, + "learning_rate": 6.576787807737398e-05, + "loss": 4.215981483459473, + "step": 561, + "token_acc": 0.20722201670426776 + }, + { + "epoch": 0.32952213427147464, + "grad_norm": 3.2491172523454166, + "learning_rate": 6.588511137162955e-05, + "loss": 4.176368236541748, + "step": 562, + "token_acc": 0.2099354680968941 + }, + { + "epoch": 0.33010847258868364, + "grad_norm": 1.9983710024669683, + "learning_rate": 6.600234466588512e-05, + "loss": 4.156754970550537, + "step": 563, + "token_acc": 0.2149327449049561 + }, + { + "epoch": 0.3306948109058927, + "grad_norm": 2.8918187477077786, + "learning_rate": 6.611957796014069e-05, + "loss": 4.195796012878418, + "step": 564, + "token_acc": 0.20799567081102985 + }, + { + "epoch": 0.33128114922310176, + "grad_norm": 2.405068276555556, + "learning_rate": 6.623681125439624e-05, + "loss": 4.185660362243652, + "step": 565, + "token_acc": 0.2096683954273283 + }, + { + "epoch": 0.33186748754031076, + "grad_norm": 2.2574345534241917, + "learning_rate": 6.635404454865182e-05, + "loss": 4.200215816497803, + "step": 566, + "token_acc": 0.2059081648644413 + }, + { + "epoch": 0.3324538258575198, + "grad_norm": 2.8489403260176243, + "learning_rate": 6.647127784290738e-05, + "loss": 4.239872932434082, + "step": 567, + "token_acc": 0.20448478931461037 + }, + { + "epoch": 0.3330401641747288, + "grad_norm": 2.388464643582114, + "learning_rate": 6.658851113716296e-05, + "loss": 4.209165573120117, + "step": 568, + "token_acc": 0.20629989740598587 + }, + { + "epoch": 0.33362650249193787, + "grad_norm": 2.7440106681362613, + "learning_rate": 6.670574443141853e-05, + "loss": 4.1976447105407715, + "step": 569, + "token_acc": 0.2070436644140365 + }, + { + "epoch": 0.33421284080914687, + "grad_norm": 3.399737707685417, + "learning_rate": 6.68229777256741e-05, + "loss": 4.252682685852051, + "step": 570, + "token_acc": 0.20181014251032653 + }, + { + "epoch": 0.33479917912635593, + "grad_norm": 2.470896500135244, + "learning_rate": 6.694021101992967e-05, + "loss": 4.138561248779297, + "step": 571, + "token_acc": 0.21314833832699862 + }, + { + "epoch": 0.33538551744356493, + "grad_norm": 2.4140149900254064, + "learning_rate": 6.705744431418523e-05, + "loss": 4.170652389526367, + "step": 572, + "token_acc": 0.20969149388007258 + }, + { + "epoch": 0.335971855760774, + "grad_norm": 2.822389985905648, + "learning_rate": 6.717467760844081e-05, + "loss": 4.154342174530029, + "step": 573, + "token_acc": 0.21218587763162086 + }, + { + "epoch": 0.336558194077983, + "grad_norm": 3.222209285487207, + "learning_rate": 6.729191090269636e-05, + "loss": 4.208691596984863, + "step": 574, + "token_acc": 0.20521084935746625 + }, + { + "epoch": 0.33714453239519204, + "grad_norm": 2.5246955124082144, + "learning_rate": 6.740914419695194e-05, + "loss": 4.15161657333374, + "step": 575, + "token_acc": 0.21294184498742869 + }, + { + "epoch": 0.33773087071240104, + "grad_norm": 3.066062702228726, + "learning_rate": 6.75263774912075e-05, + "loss": 4.170627593994141, + "step": 576, + "token_acc": 0.2093553177326028 + }, + { + "epoch": 0.3383172090296101, + "grad_norm": 2.6538439988281746, + "learning_rate": 6.764361078546308e-05, + "loss": 4.179351806640625, + "step": 577, + "token_acc": 0.20783089318924616 + }, + { + "epoch": 0.3389035473468191, + "grad_norm": 2.4515128383597626, + "learning_rate": 6.776084407971864e-05, + "loss": 4.2190728187561035, + "step": 578, + "token_acc": 0.2043195899375451 + }, + { + "epoch": 0.33948988566402816, + "grad_norm": 2.419640052287422, + "learning_rate": 6.787807737397421e-05, + "loss": 4.163544654846191, + "step": 579, + "token_acc": 0.21081945719155631 + }, + { + "epoch": 0.34007622398123716, + "grad_norm": 2.298839773360609, + "learning_rate": 6.799531066822978e-05, + "loss": 4.110836029052734, + "step": 580, + "token_acc": 0.21495364823106514 + }, + { + "epoch": 0.3406625622984462, + "grad_norm": 3.3550963244004826, + "learning_rate": 6.811254396248535e-05, + "loss": 4.175626754760742, + "step": 581, + "token_acc": 0.20792943824519308 + }, + { + "epoch": 0.3412489006156552, + "grad_norm": 2.0308058243557907, + "learning_rate": 6.822977725674092e-05, + "loss": 4.170405864715576, + "step": 582, + "token_acc": 0.20964526761889812 + }, + { + "epoch": 0.34183523893286427, + "grad_norm": 3.650819037132007, + "learning_rate": 6.834701055099648e-05, + "loss": 4.214597225189209, + "step": 583, + "token_acc": 0.20386166106908057 + }, + { + "epoch": 0.3424215772500733, + "grad_norm": 1.811216058241486, + "learning_rate": 6.846424384525206e-05, + "loss": 4.193975448608398, + "step": 584, + "token_acc": 0.20604070531994234 + }, + { + "epoch": 0.34300791556728233, + "grad_norm": 3.5601494060460728, + "learning_rate": 6.858147713950762e-05, + "loss": 4.249112129211426, + "step": 585, + "token_acc": 0.20160157087321603 + }, + { + "epoch": 0.34359425388449133, + "grad_norm": 2.546240696896199, + "learning_rate": 6.86987104337632e-05, + "loss": 4.188510417938232, + "step": 586, + "token_acc": 0.20822171119698443 + }, + { + "epoch": 0.3441805922017004, + "grad_norm": 2.2424369094428407, + "learning_rate": 6.881594372801876e-05, + "loss": 4.160363674163818, + "step": 587, + "token_acc": 0.2108433895806685 + }, + { + "epoch": 0.3447669305189094, + "grad_norm": 3.0594478016209186, + "learning_rate": 6.893317702227433e-05, + "loss": 4.154229164123535, + "step": 588, + "token_acc": 0.21069161160004213 + }, + { + "epoch": 0.34535326883611844, + "grad_norm": 1.9778745335785097, + "learning_rate": 6.90504103165299e-05, + "loss": 4.158848762512207, + "step": 589, + "token_acc": 0.21057924164727437 + }, + { + "epoch": 0.34593960715332744, + "grad_norm": 2.4539916534467014, + "learning_rate": 6.916764361078547e-05, + "loss": 4.145071983337402, + "step": 590, + "token_acc": 0.2105528348693428 + }, + { + "epoch": 0.3465259454705365, + "grad_norm": 2.1635838933477642, + "learning_rate": 6.928487690504104e-05, + "loss": 4.216288089752197, + "step": 591, + "token_acc": 0.20310441229456674 + }, + { + "epoch": 0.34711228378774556, + "grad_norm": 2.532584626642596, + "learning_rate": 6.94021101992966e-05, + "loss": 4.083550453186035, + "step": 592, + "token_acc": 0.21601965729738162 + }, + { + "epoch": 0.34769862210495456, + "grad_norm": 2.039146815016486, + "learning_rate": 6.951934349355217e-05, + "loss": 4.123083114624023, + "step": 593, + "token_acc": 0.21277903624920874 + }, + { + "epoch": 0.3482849604221636, + "grad_norm": 2.808290026352396, + "learning_rate": 6.963657678780774e-05, + "loss": 4.155226707458496, + "step": 594, + "token_acc": 0.2085782112488905 + }, + { + "epoch": 0.3488712987393726, + "grad_norm": 2.4468154808366966, + "learning_rate": 6.975381008206331e-05, + "loss": 4.1117119789123535, + "step": 595, + "token_acc": 0.21326740303108144 + }, + { + "epoch": 0.34945763705658167, + "grad_norm": 2.5366374410948596, + "learning_rate": 6.987104337631888e-05, + "loss": 4.112312316894531, + "step": 596, + "token_acc": 0.21480364630228507 + }, + { + "epoch": 0.3500439753737907, + "grad_norm": 2.4227846174641616, + "learning_rate": 6.998827667057445e-05, + "loss": 4.150864601135254, + "step": 597, + "token_acc": 0.20841413373046058 + }, + { + "epoch": 0.35063031369099973, + "grad_norm": 2.2854830698100397, + "learning_rate": 7.010550996483002e-05, + "loss": 4.147131443023682, + "step": 598, + "token_acc": 0.2085973335004138 + }, + { + "epoch": 0.35121665200820873, + "grad_norm": 3.0727805184674257, + "learning_rate": 7.022274325908559e-05, + "loss": 4.153531074523926, + "step": 599, + "token_acc": 0.2079611058898861 + }, + { + "epoch": 0.3518029903254178, + "grad_norm": 2.5258316759349153, + "learning_rate": 7.033997655334114e-05, + "loss": 4.129731178283691, + "step": 600, + "token_acc": 0.21162862738303378 + }, + { + "epoch": 0.3523893286426268, + "grad_norm": 2.9267199154449934, + "learning_rate": 7.045720984759672e-05, + "loss": 4.118048191070557, + "step": 601, + "token_acc": 0.2126418067910963 + }, + { + "epoch": 0.35297566695983584, + "grad_norm": 1.8731295565234167, + "learning_rate": 7.05744431418523e-05, + "loss": 4.1715192794799805, + "step": 602, + "token_acc": 0.20527919047183255 + }, + { + "epoch": 0.35356200527704484, + "grad_norm": 3.3143169992915245, + "learning_rate": 7.069167643610786e-05, + "loss": 4.100879669189453, + "step": 603, + "token_acc": 0.21716443857009016 + }, + { + "epoch": 0.3541483435942539, + "grad_norm": 2.018374253692198, + "learning_rate": 7.080890973036343e-05, + "loss": 4.091705322265625, + "step": 604, + "token_acc": 0.21426541631145213 + }, + { + "epoch": 0.3547346819114629, + "grad_norm": 2.885438639183108, + "learning_rate": 7.0926143024619e-05, + "loss": 4.0958991050720215, + "step": 605, + "token_acc": 0.21525665386851 + }, + { + "epoch": 0.35532102022867196, + "grad_norm": 2.2996061397666074, + "learning_rate": 7.104337631887457e-05, + "loss": 4.094749927520752, + "step": 606, + "token_acc": 0.2160558994417651 + }, + { + "epoch": 0.35590735854588096, + "grad_norm": 2.511428308734274, + "learning_rate": 7.116060961313013e-05, + "loss": 4.122867584228516, + "step": 607, + "token_acc": 0.21326891173011467 + }, + { + "epoch": 0.35649369686309, + "grad_norm": 3.19272148743335, + "learning_rate": 7.127784290738571e-05, + "loss": 4.177186012268066, + "step": 608, + "token_acc": 0.20434262095356895 + }, + { + "epoch": 0.357080035180299, + "grad_norm": 1.5290924558721875, + "learning_rate": 7.139507620164126e-05, + "loss": 4.097538471221924, + "step": 609, + "token_acc": 0.21519980028796862 + }, + { + "epoch": 0.35766637349750807, + "grad_norm": 3.8177361886489027, + "learning_rate": 7.151230949589684e-05, + "loss": 4.184349060058594, + "step": 610, + "token_acc": 0.20378709185527077 + }, + { + "epoch": 0.3582527118147171, + "grad_norm": 1.9845041679115725, + "learning_rate": 7.16295427901524e-05, + "loss": 4.1518073081970215, + "step": 611, + "token_acc": 0.20982409381663114 + }, + { + "epoch": 0.35883905013192613, + "grad_norm": 2.9172907817702947, + "learning_rate": 7.174677608440797e-05, + "loss": 4.14028263092041, + "step": 612, + "token_acc": 0.21218426241624838 + }, + { + "epoch": 0.35942538844913513, + "grad_norm": 2.2453526658169802, + "learning_rate": 7.186400937866354e-05, + "loss": 4.142838001251221, + "step": 613, + "token_acc": 0.21205322308721536 + }, + { + "epoch": 0.3600117267663442, + "grad_norm": 2.5834409682522037, + "learning_rate": 7.198124267291911e-05, + "loss": 4.2115254402160645, + "step": 614, + "token_acc": 0.2025347590813764 + }, + { + "epoch": 0.3605980650835532, + "grad_norm": 2.0144628590268856, + "learning_rate": 7.209847596717468e-05, + "loss": 4.098343372344971, + "step": 615, + "token_acc": 0.21502927868335459 + }, + { + "epoch": 0.36118440340076224, + "grad_norm": 2.7391891206161736, + "learning_rate": 7.221570926143025e-05, + "loss": 4.07414436340332, + "step": 616, + "token_acc": 0.2168717809704527 + }, + { + "epoch": 0.36177074171797124, + "grad_norm": 2.149948650310363, + "learning_rate": 7.233294255568583e-05, + "loss": 4.059930801391602, + "step": 617, + "token_acc": 0.21909523528854014 + }, + { + "epoch": 0.3623570800351803, + "grad_norm": 2.075780985724799, + "learning_rate": 7.245017584994138e-05, + "loss": 4.108323097229004, + "step": 618, + "token_acc": 0.2142750568838823 + }, + { + "epoch": 0.3629434183523893, + "grad_norm": 2.585159275197605, + "learning_rate": 7.256740914419695e-05, + "loss": 4.162170886993408, + "step": 619, + "token_acc": 0.2068520399525986 + }, + { + "epoch": 0.36352975666959836, + "grad_norm": 1.6350827403615937, + "learning_rate": 7.268464243845252e-05, + "loss": 4.104890823364258, + "step": 620, + "token_acc": 0.21055449430171663 + }, + { + "epoch": 0.3641160949868074, + "grad_norm": 2.7039293247821834, + "learning_rate": 7.28018757327081e-05, + "loss": 4.109732151031494, + "step": 621, + "token_acc": 0.2145976358849946 + }, + { + "epoch": 0.3647024333040164, + "grad_norm": 2.1190350311590067, + "learning_rate": 7.291910902696366e-05, + "loss": 4.059538841247559, + "step": 622, + "token_acc": 0.2174160411729703 + }, + { + "epoch": 0.36528877162122547, + "grad_norm": 2.902336514881658, + "learning_rate": 7.303634232121923e-05, + "loss": 4.145930290222168, + "step": 623, + "token_acc": 0.21163515634174201 + }, + { + "epoch": 0.3658751099384345, + "grad_norm": 1.7549445332593754, + "learning_rate": 7.31535756154748e-05, + "loss": 4.1091227531433105, + "step": 624, + "token_acc": 0.2112800324090298 + }, + { + "epoch": 0.36646144825564353, + "grad_norm": 3.9523416271040603, + "learning_rate": 7.327080890973037e-05, + "loss": 4.074059963226318, + "step": 625, + "token_acc": 0.2165876496065328 + }, + { + "epoch": 0.36704778657285253, + "grad_norm": 2.3403679467796605, + "learning_rate": 7.338804220398593e-05, + "loss": 4.123082637786865, + "step": 626, + "token_acc": 0.21221446641182018 + }, + { + "epoch": 0.3676341248900616, + "grad_norm": 2.1452631986183737, + "learning_rate": 7.35052754982415e-05, + "loss": 4.087530136108398, + "step": 627, + "token_acc": 0.21535591662386114 + }, + { + "epoch": 0.3682204632072706, + "grad_norm": 2.9309804271543367, + "learning_rate": 7.362250879249707e-05, + "loss": 4.101961612701416, + "step": 628, + "token_acc": 0.21293058083188984 + }, + { + "epoch": 0.36880680152447964, + "grad_norm": 1.7598176871254188, + "learning_rate": 7.373974208675264e-05, + "loss": 4.1332550048828125, + "step": 629, + "token_acc": 0.2088044037612862 + }, + { + "epoch": 0.36939313984168864, + "grad_norm": 3.0626904431246156, + "learning_rate": 7.385697538100821e-05, + "loss": 4.096952438354492, + "step": 630, + "token_acc": 0.2133911631997453 + }, + { + "epoch": 0.3699794781588977, + "grad_norm": 1.6434192302561115, + "learning_rate": 7.397420867526378e-05, + "loss": 4.105629920959473, + "step": 631, + "token_acc": 0.21183037001693572 + }, + { + "epoch": 0.3705658164761067, + "grad_norm": 3.1225308376453125, + "learning_rate": 7.409144196951935e-05, + "loss": 4.0817790031433105, + "step": 632, + "token_acc": 0.2125951542719465 + }, + { + "epoch": 0.37115215479331576, + "grad_norm": 1.876114939668875, + "learning_rate": 7.420867526377491e-05, + "loss": 4.103361129760742, + "step": 633, + "token_acc": 0.2110155437428117 + }, + { + "epoch": 0.37173849311052476, + "grad_norm": 2.9456311645822697, + "learning_rate": 7.432590855803049e-05, + "loss": 4.085481643676758, + "step": 634, + "token_acc": 0.21446758808643177 + }, + { + "epoch": 0.3723248314277338, + "grad_norm": 2.222620899646469, + "learning_rate": 7.444314185228605e-05, + "loss": 4.087406158447266, + "step": 635, + "token_acc": 0.21455811377482395 + }, + { + "epoch": 0.3729111697449428, + "grad_norm": 1.8617111992948212, + "learning_rate": 7.456037514654162e-05, + "loss": 4.094298362731934, + "step": 636, + "token_acc": 0.21440071372902908 + }, + { + "epoch": 0.3734975080621519, + "grad_norm": 2.4678181433695707, + "learning_rate": 7.467760844079719e-05, + "loss": 4.054091930389404, + "step": 637, + "token_acc": 0.21730131701682898 + }, + { + "epoch": 0.3740838463793609, + "grad_norm": 1.8739959514199582, + "learning_rate": 7.479484173505276e-05, + "loss": 4.097439765930176, + "step": 638, + "token_acc": 0.21199458815407057 + }, + { + "epoch": 0.37467018469656993, + "grad_norm": 1.999997947980484, + "learning_rate": 7.491207502930833e-05, + "loss": 4.060850143432617, + "step": 639, + "token_acc": 0.2155914878931368 + }, + { + "epoch": 0.37525652301377893, + "grad_norm": 2.473943136769906, + "learning_rate": 7.50293083235639e-05, + "loss": 4.12377405166626, + "step": 640, + "token_acc": 0.20844757986509665 + }, + { + "epoch": 0.375842861330988, + "grad_norm": 2.1540266521538496, + "learning_rate": 7.514654161781947e-05, + "loss": 4.0693206787109375, + "step": 641, + "token_acc": 0.21374037776709007 + }, + { + "epoch": 0.376429199648197, + "grad_norm": 1.9555041691960107, + "learning_rate": 7.526377491207503e-05, + "loss": 4.0703816413879395, + "step": 642, + "token_acc": 0.21338682410864312 + }, + { + "epoch": 0.37701553796540604, + "grad_norm": 2.4746565620809533, + "learning_rate": 7.538100820633061e-05, + "loss": 3.993250846862793, + "step": 643, + "token_acc": 0.22307548327388813 + }, + { + "epoch": 0.37760187628261505, + "grad_norm": 1.9200736941317331, + "learning_rate": 7.549824150058617e-05, + "loss": 4.02085018157959, + "step": 644, + "token_acc": 0.21993525156256702 + }, + { + "epoch": 0.3781882145998241, + "grad_norm": 2.4149432761682417, + "learning_rate": 7.561547479484174e-05, + "loss": 4.071619033813477, + "step": 645, + "token_acc": 0.21326240469941257 + }, + { + "epoch": 0.3787745529170331, + "grad_norm": 2.4871582498174125, + "learning_rate": 7.57327080890973e-05, + "loss": 4.132684707641602, + "step": 646, + "token_acc": 0.20779481374768471 + }, + { + "epoch": 0.37936089123424216, + "grad_norm": 3.226823534661959, + "learning_rate": 7.584994138335287e-05, + "loss": 4.0648603439331055, + "step": 647, + "token_acc": 0.21545968622224757 + }, + { + "epoch": 0.37994722955145116, + "grad_norm": 1.6540072681096925, + "learning_rate": 7.596717467760845e-05, + "loss": 4.031717300415039, + "step": 648, + "token_acc": 0.2201859905981044 + }, + { + "epoch": 0.3805335678686602, + "grad_norm": 3.138959935778616, + "learning_rate": 7.608440797186401e-05, + "loss": 4.13758659362793, + "step": 649, + "token_acc": 0.20802588963436297 + }, + { + "epoch": 0.3811199061858693, + "grad_norm": 2.1891622390419894, + "learning_rate": 7.620164126611959e-05, + "loss": 4.064674377441406, + "step": 650, + "token_acc": 0.2122129264246783 + }, + { + "epoch": 0.3817062445030783, + "grad_norm": 2.632488521285851, + "learning_rate": 7.631887456037515e-05, + "loss": 4.001059055328369, + "step": 651, + "token_acc": 0.22232432998164614 + }, + { + "epoch": 0.38229258282028733, + "grad_norm": 1.5309038291721875, + "learning_rate": 7.643610785463073e-05, + "loss": 4.113205432891846, + "step": 652, + "token_acc": 0.20970721296591022 + }, + { + "epoch": 0.38287892113749633, + "grad_norm": 3.0572790928646048, + "learning_rate": 7.655334114888629e-05, + "loss": 4.093487739562988, + "step": 653, + "token_acc": 0.21182909957443227 + }, + { + "epoch": 0.3834652594547054, + "grad_norm": 2.5229273446299123, + "learning_rate": 7.667057444314185e-05, + "loss": 4.037172794342041, + "step": 654, + "token_acc": 0.2184483672218355 + }, + { + "epoch": 0.3840515977719144, + "grad_norm": 1.9075802163317703, + "learning_rate": 7.678780773739742e-05, + "loss": 4.034085273742676, + "step": 655, + "token_acc": 0.2168918608454039 + }, + { + "epoch": 0.38463793608912344, + "grad_norm": 2.411474115819284, + "learning_rate": 7.690504103165299e-05, + "loss": 4.028837203979492, + "step": 656, + "token_acc": 0.21688751715757676 + }, + { + "epoch": 0.38522427440633245, + "grad_norm": 2.1772932845239636, + "learning_rate": 7.702227432590856e-05, + "loss": 4.046817779541016, + "step": 657, + "token_acc": 0.21557185728576192 + }, + { + "epoch": 0.3858106127235415, + "grad_norm": 2.4533435573107263, + "learning_rate": 7.713950762016413e-05, + "loss": 4.056399822235107, + "step": 658, + "token_acc": 0.2157916559510001 + }, + { + "epoch": 0.3863969510407505, + "grad_norm": 1.8473469022474964, + "learning_rate": 7.72567409144197e-05, + "loss": 4.04714298248291, + "step": 659, + "token_acc": 0.21505665695962742 + }, + { + "epoch": 0.38698328935795956, + "grad_norm": 2.3602063123064188, + "learning_rate": 7.737397420867527e-05, + "loss": 4.014403820037842, + "step": 660, + "token_acc": 0.21875912798096045 + }, + { + "epoch": 0.38756962767516856, + "grad_norm": 1.7020453106770677, + "learning_rate": 7.749120750293083e-05, + "loss": 4.058602333068848, + "step": 661, + "token_acc": 0.21461313141590313 + }, + { + "epoch": 0.3881559659923776, + "grad_norm": 2.0873313035369665, + "learning_rate": 7.76084407971864e-05, + "loss": 4.054744720458984, + "step": 662, + "token_acc": 0.21395478518271455 + }, + { + "epoch": 0.3887423043095866, + "grad_norm": 2.395388425986399, + "learning_rate": 7.772567409144197e-05, + "loss": 3.9661686420440674, + "step": 663, + "token_acc": 0.22505120987458538 + }, + { + "epoch": 0.3893286426267957, + "grad_norm": 1.9608797379106884, + "learning_rate": 7.784290738569754e-05, + "loss": 3.990251064300537, + "step": 664, + "token_acc": 0.21994407252760115 + }, + { + "epoch": 0.3899149809440047, + "grad_norm": 2.553547491895268, + "learning_rate": 7.796014067995311e-05, + "loss": 4.010693550109863, + "step": 665, + "token_acc": 0.22014327655842145 + }, + { + "epoch": 0.39050131926121373, + "grad_norm": 2.073981011067883, + "learning_rate": 7.807737397420867e-05, + "loss": 4.03361701965332, + "step": 666, + "token_acc": 0.2182426436154111 + }, + { + "epoch": 0.39108765757842273, + "grad_norm": 1.918682275933873, + "learning_rate": 7.819460726846425e-05, + "loss": 4.050731658935547, + "step": 667, + "token_acc": 0.21442472323545989 + }, + { + "epoch": 0.3916739958956318, + "grad_norm": 2.1443675765705796, + "learning_rate": 7.831184056271981e-05, + "loss": 3.9616920948028564, + "step": 668, + "token_acc": 0.2242642406901948 + }, + { + "epoch": 0.3922603342128408, + "grad_norm": 2.1331902977640156, + "learning_rate": 7.842907385697539e-05, + "loss": 4.044347286224365, + "step": 669, + "token_acc": 0.21538822330878069 + }, + { + "epoch": 0.39284667253004985, + "grad_norm": 2.243072466940306, + "learning_rate": 7.854630715123095e-05, + "loss": 4.085768222808838, + "step": 670, + "token_acc": 0.21124857053747792 + }, + { + "epoch": 0.39343301084725885, + "grad_norm": 2.231107447143818, + "learning_rate": 7.866354044548652e-05, + "loss": 3.9956047534942627, + "step": 671, + "token_acc": 0.2188850407541879 + }, + { + "epoch": 0.3940193491644679, + "grad_norm": 2.108116358043997, + "learning_rate": 7.878077373974209e-05, + "loss": 3.9872591495513916, + "step": 672, + "token_acc": 0.22259305938062415 + }, + { + "epoch": 0.3946056874816769, + "grad_norm": 2.335134786070096, + "learning_rate": 7.889800703399765e-05, + "loss": 3.9818410873413086, + "step": 673, + "token_acc": 0.22226903001881143 + }, + { + "epoch": 0.39519202579888596, + "grad_norm": 1.7637353035663494, + "learning_rate": 7.901524032825323e-05, + "loss": 3.9797415733337402, + "step": 674, + "token_acc": 0.22384945423246957 + }, + { + "epoch": 0.39577836411609496, + "grad_norm": 2.751577869702306, + "learning_rate": 7.91324736225088e-05, + "loss": 4.02831506729126, + "step": 675, + "token_acc": 0.21854878487728793 + }, + { + "epoch": 0.396364702433304, + "grad_norm": 1.605864066444226, + "learning_rate": 7.924970691676437e-05, + "loss": 3.9465856552124023, + "step": 676, + "token_acc": 0.2257202965851656 + }, + { + "epoch": 0.3969510407505131, + "grad_norm": 2.8483845164092987, + "learning_rate": 7.936694021101993e-05, + "loss": 3.9910123348236084, + "step": 677, + "token_acc": 0.2207799455615958 + }, + { + "epoch": 0.3975373790677221, + "grad_norm": 1.4685836708648312, + "learning_rate": 7.948417350527551e-05, + "loss": 3.9373085498809814, + "step": 678, + "token_acc": 0.22478049654830656 + }, + { + "epoch": 0.39812371738493113, + "grad_norm": 3.1584410509641083, + "learning_rate": 7.960140679953107e-05, + "loss": 4.052556037902832, + "step": 679, + "token_acc": 0.21437743014739127 + }, + { + "epoch": 0.39871005570214013, + "grad_norm": 2.1134430010740166, + "learning_rate": 7.971864009378663e-05, + "loss": 3.953364849090576, + "step": 680, + "token_acc": 0.22410616518009976 + }, + { + "epoch": 0.3992963940193492, + "grad_norm": 2.027947057181343, + "learning_rate": 7.983587338804221e-05, + "loss": 4.093752384185791, + "step": 681, + "token_acc": 0.20953233488854062 + }, + { + "epoch": 0.3998827323365582, + "grad_norm": 2.834836713241114, + "learning_rate": 7.995310668229777e-05, + "loss": 4.011902809143066, + "step": 682, + "token_acc": 0.21620308608178657 + }, + { + "epoch": 0.40046907065376725, + "grad_norm": 1.9358391805959974, + "learning_rate": 8.007033997655335e-05, + "loss": 4.031554222106934, + "step": 683, + "token_acc": 0.21570020881772092 + }, + { + "epoch": 0.40105540897097625, + "grad_norm": 1.795843654663749, + "learning_rate": 8.018757327080891e-05, + "loss": 3.986050605773926, + "step": 684, + "token_acc": 0.222041572229191 + }, + { + "epoch": 0.4016417472881853, + "grad_norm": 2.6068865630161007, + "learning_rate": 8.030480656506449e-05, + "loss": 3.9698944091796875, + "step": 685, + "token_acc": 0.22357636607220177 + }, + { + "epoch": 0.4022280856053943, + "grad_norm": 1.8456236622891797, + "learning_rate": 8.042203985932005e-05, + "loss": 4.0039215087890625, + "step": 686, + "token_acc": 0.21838219936472802 + }, + { + "epoch": 0.40281442392260336, + "grad_norm": 2.276514144099755, + "learning_rate": 8.053927315357563e-05, + "loss": 3.993375062942505, + "step": 687, + "token_acc": 0.21902647209953063 + }, + { + "epoch": 0.40340076223981236, + "grad_norm": 1.9406005967692974, + "learning_rate": 8.065650644783119e-05, + "loss": 3.9986319541931152, + "step": 688, + "token_acc": 0.22091693269034843 + }, + { + "epoch": 0.4039871005570214, + "grad_norm": 2.6068922881409, + "learning_rate": 8.077373974208675e-05, + "loss": 3.9992117881774902, + "step": 689, + "token_acc": 0.21998947532651608 + }, + { + "epoch": 0.4045734388742304, + "grad_norm": 2.1736297833674647, + "learning_rate": 8.089097303634232e-05, + "loss": 3.9667046070098877, + "step": 690, + "token_acc": 0.220259143520898 + }, + { + "epoch": 0.4051597771914395, + "grad_norm": 2.27013025247477, + "learning_rate": 8.100820633059789e-05, + "loss": 4.015791416168213, + "step": 691, + "token_acc": 0.2171304206329835 + }, + { + "epoch": 0.4057461155086485, + "grad_norm": 1.8133066440047596, + "learning_rate": 8.112543962485345e-05, + "loss": 4.029618263244629, + "step": 692, + "token_acc": 0.21445345442750569 + }, + { + "epoch": 0.40633245382585753, + "grad_norm": 1.947593396191246, + "learning_rate": 8.124267291910903e-05, + "loss": 4.012633323669434, + "step": 693, + "token_acc": 0.2172589234258634 + }, + { + "epoch": 0.40691879214306653, + "grad_norm": 2.3347922630746054, + "learning_rate": 8.13599062133646e-05, + "loss": 4.041747093200684, + "step": 694, + "token_acc": 0.21478468911923815 + }, + { + "epoch": 0.4075051304602756, + "grad_norm": 1.5684402865571183, + "learning_rate": 8.147713950762017e-05, + "loss": 3.923807382583618, + "step": 695, + "token_acc": 0.22580291453496612 + }, + { + "epoch": 0.4080914687774846, + "grad_norm": 2.130446863345715, + "learning_rate": 8.159437280187575e-05, + "loss": 3.9689764976501465, + "step": 696, + "token_acc": 0.2218048385437413 + }, + { + "epoch": 0.40867780709469365, + "grad_norm": 2.2117512943963296, + "learning_rate": 8.171160609613131e-05, + "loss": 3.9482274055480957, + "step": 697, + "token_acc": 0.2253584146845398 + }, + { + "epoch": 0.40926414541190265, + "grad_norm": 2.2471329420627564, + "learning_rate": 8.182883939038687e-05, + "loss": 4.002993583679199, + "step": 698, + "token_acc": 0.21702742510308898 + }, + { + "epoch": 0.4098504837291117, + "grad_norm": 2.5516320367202474, + "learning_rate": 8.194607268464243e-05, + "loss": 3.9073610305786133, + "step": 699, + "token_acc": 0.2278225212855082 + }, + { + "epoch": 0.4104368220463207, + "grad_norm": 2.07434391491364, + "learning_rate": 8.206330597889801e-05, + "loss": 3.996926784515381, + "step": 700, + "token_acc": 0.21590331255668146 + }, + { + "epoch": 0.41102316036352976, + "grad_norm": 2.701039367480318, + "learning_rate": 8.218053927315357e-05, + "loss": 4.009852409362793, + "step": 701, + "token_acc": 0.2172141838888435 + }, + { + "epoch": 0.41160949868073876, + "grad_norm": 1.4646660186614113, + "learning_rate": 8.229777256740915e-05, + "loss": 3.9965920448303223, + "step": 702, + "token_acc": 0.2191698207378447 + }, + { + "epoch": 0.4121958369979478, + "grad_norm": 3.1570371849933956, + "learning_rate": 8.241500586166471e-05, + "loss": 3.992090940475464, + "step": 703, + "token_acc": 0.2190071815547436 + }, + { + "epoch": 0.4127821753151568, + "grad_norm": 1.8870292738736854, + "learning_rate": 8.253223915592029e-05, + "loss": 4.004321098327637, + "step": 704, + "token_acc": 0.21755276839848503 + }, + { + "epoch": 0.4133685136323659, + "grad_norm": 3.3549004761100933, + "learning_rate": 8.264947245017585e-05, + "loss": 4.063796043395996, + "step": 705, + "token_acc": 0.21098198024245524 + }, + { + "epoch": 0.41395485194957493, + "grad_norm": 2.16753937155867, + "learning_rate": 8.276670574443143e-05, + "loss": 3.979288101196289, + "step": 706, + "token_acc": 0.2197244342267462 + }, + { + "epoch": 0.41454119026678393, + "grad_norm": 2.256985192036278, + "learning_rate": 8.288393903868699e-05, + "loss": 3.9586453437805176, + "step": 707, + "token_acc": 0.22309999231025557 + }, + { + "epoch": 0.415127528583993, + "grad_norm": 1.954939127077894, + "learning_rate": 8.300117233294255e-05, + "loss": 3.9463043212890625, + "step": 708, + "token_acc": 0.2236039054298286 + }, + { + "epoch": 0.415713866901202, + "grad_norm": 2.4362819756259326, + "learning_rate": 8.311840562719813e-05, + "loss": 3.9839253425598145, + "step": 709, + "token_acc": 0.22196783379716523 + }, + { + "epoch": 0.41630020521841105, + "grad_norm": 2.062117685664566, + "learning_rate": 8.323563892145369e-05, + "loss": 3.9828412532806396, + "step": 710, + "token_acc": 0.22041821921958635 + }, + { + "epoch": 0.41688654353562005, + "grad_norm": 2.518798149480698, + "learning_rate": 8.335287221570927e-05, + "loss": 3.966951608657837, + "step": 711, + "token_acc": 0.22057594887185403 + }, + { + "epoch": 0.4174728818528291, + "grad_norm": 2.33488563735422, + "learning_rate": 8.347010550996483e-05, + "loss": 4.020444869995117, + "step": 712, + "token_acc": 0.21567288505328716 + }, + { + "epoch": 0.4180592201700381, + "grad_norm": 1.3085221419826674, + "learning_rate": 8.358733880422041e-05, + "loss": 4.00489616394043, + "step": 713, + "token_acc": 0.21908290268196823 + }, + { + "epoch": 0.41864555848724716, + "grad_norm": 2.4149560137622115, + "learning_rate": 8.370457209847597e-05, + "loss": 3.9871957302093506, + "step": 714, + "token_acc": 0.21834809649697443 + }, + { + "epoch": 0.41923189680445616, + "grad_norm": 1.848581982176165, + "learning_rate": 8.382180539273155e-05, + "loss": 3.9747753143310547, + "step": 715, + "token_acc": 0.22014599739217036 + }, + { + "epoch": 0.4198182351216652, + "grad_norm": 1.5880853884188684, + "learning_rate": 8.393903868698711e-05, + "loss": 3.9152612686157227, + "step": 716, + "token_acc": 0.2266055384096822 + }, + { + "epoch": 0.4204045734388742, + "grad_norm": 2.4278481046131786, + "learning_rate": 8.405627198124267e-05, + "loss": 3.990471601486206, + "step": 717, + "token_acc": 0.21986459581955026 + }, + { + "epoch": 0.4209909117560833, + "grad_norm": 1.496198977995066, + "learning_rate": 8.417350527549825e-05, + "loss": 3.996959686279297, + "step": 718, + "token_acc": 0.21578277516266198 + }, + { + "epoch": 0.4215772500732923, + "grad_norm": 2.229378172850658, + "learning_rate": 8.429073856975381e-05, + "loss": 3.946504592895508, + "step": 719, + "token_acc": 0.22425191965447533 + }, + { + "epoch": 0.42216358839050133, + "grad_norm": 1.7659846802731216, + "learning_rate": 8.440797186400939e-05, + "loss": 4.021200180053711, + "step": 720, + "token_acc": 0.21562805573663624 + }, + { + "epoch": 0.42274992670771033, + "grad_norm": 2.3525238749731665, + "learning_rate": 8.452520515826495e-05, + "loss": 3.9007182121276855, + "step": 721, + "token_acc": 0.2278418843187129 + }, + { + "epoch": 0.4233362650249194, + "grad_norm": 2.0762583390587555, + "learning_rate": 8.464243845252053e-05, + "loss": 3.995725631713867, + "step": 722, + "token_acc": 0.21636987180662792 + }, + { + "epoch": 0.4239226033421284, + "grad_norm": 1.835997138157056, + "learning_rate": 8.475967174677609e-05, + "loss": 3.9783501625061035, + "step": 723, + "token_acc": 0.2200947039352703 + }, + { + "epoch": 0.42450894165933745, + "grad_norm": 1.998538865801269, + "learning_rate": 8.487690504103167e-05, + "loss": 3.899714946746826, + "step": 724, + "token_acc": 0.2277796328981751 + }, + { + "epoch": 0.42509527997654645, + "grad_norm": 2.056393049375904, + "learning_rate": 8.499413833528722e-05, + "loss": 3.9990315437316895, + "step": 725, + "token_acc": 0.21626505392136294 + }, + { + "epoch": 0.4256816182937555, + "grad_norm": 1.5564394332706086, + "learning_rate": 8.511137162954279e-05, + "loss": 4.009754657745361, + "step": 726, + "token_acc": 0.2143423783411283 + }, + { + "epoch": 0.4262679566109645, + "grad_norm": 2.1011744301973794, + "learning_rate": 8.522860492379835e-05, + "loss": 3.9437594413757324, + "step": 727, + "token_acc": 0.22307386260338471 + }, + { + "epoch": 0.42685429492817356, + "grad_norm": 2.300110050957418, + "learning_rate": 8.534583821805393e-05, + "loss": 3.9627723693847656, + "step": 728, + "token_acc": 0.22014249117395904 + }, + { + "epoch": 0.42744063324538256, + "grad_norm": 1.8150002936726186, + "learning_rate": 8.54630715123095e-05, + "loss": 3.937901258468628, + "step": 729, + "token_acc": 0.22241908243543698 + }, + { + "epoch": 0.4280269715625916, + "grad_norm": 2.70277092710677, + "learning_rate": 8.558030480656507e-05, + "loss": 3.9719107151031494, + "step": 730, + "token_acc": 0.22138685801847835 + }, + { + "epoch": 0.4286133098798006, + "grad_norm": 1.6532035295187189, + "learning_rate": 8.569753810082065e-05, + "loss": 3.9723806381225586, + "step": 731, + "token_acc": 0.21880205655526994 + }, + { + "epoch": 0.4291996481970097, + "grad_norm": 2.676508541580671, + "learning_rate": 8.581477139507621e-05, + "loss": 3.9484610557556152, + "step": 732, + "token_acc": 0.2227352200484422 + }, + { + "epoch": 0.42978598651421873, + "grad_norm": 1.403127241890776, + "learning_rate": 8.593200468933178e-05, + "loss": 3.9220519065856934, + "step": 733, + "token_acc": 0.22626335247921198 + }, + { + "epoch": 0.43037232483142773, + "grad_norm": 2.6423420760706486, + "learning_rate": 8.604923798358733e-05, + "loss": 3.9348416328430176, + "step": 734, + "token_acc": 0.22170235924365955 + }, + { + "epoch": 0.4309586631486368, + "grad_norm": 1.9143950897663173, + "learning_rate": 8.616647127784291e-05, + "loss": 3.951483726501465, + "step": 735, + "token_acc": 0.2222365654851916 + }, + { + "epoch": 0.4315450014658458, + "grad_norm": 1.814817755227424, + "learning_rate": 8.628370457209847e-05, + "loss": 3.900132179260254, + "step": 736, + "token_acc": 0.2292070940655613 + }, + { + "epoch": 0.43213133978305485, + "grad_norm": 1.8966274546804156, + "learning_rate": 8.640093786635405e-05, + "loss": 3.9608585834503174, + "step": 737, + "token_acc": 0.2217940825334792 + }, + { + "epoch": 0.43271767810026385, + "grad_norm": 2.2331692203473326, + "learning_rate": 8.651817116060961e-05, + "loss": 3.9567646980285645, + "step": 738, + "token_acc": 0.2201034012744563 + }, + { + "epoch": 0.4333040164174729, + "grad_norm": 1.6670104427552728, + "learning_rate": 8.663540445486519e-05, + "loss": 3.947218179702759, + "step": 739, + "token_acc": 0.22202044884573338 + }, + { + "epoch": 0.4338903547346819, + "grad_norm": 2.0507402177009824, + "learning_rate": 8.675263774912075e-05, + "loss": 3.910508871078491, + "step": 740, + "token_acc": 0.22730556287898762 + }, + { + "epoch": 0.43447669305189096, + "grad_norm": 1.615803099592315, + "learning_rate": 8.686987104337633e-05, + "loss": 3.9932494163513184, + "step": 741, + "token_acc": 0.21694113506191537 + }, + { + "epoch": 0.43506303136909996, + "grad_norm": 2.5371822313297403, + "learning_rate": 8.698710433763189e-05, + "loss": 3.8925976753234863, + "step": 742, + "token_acc": 0.22879213092889494 + }, + { + "epoch": 0.435649369686309, + "grad_norm": 1.8626483187207974, + "learning_rate": 8.710433763188745e-05, + "loss": 3.937966823577881, + "step": 743, + "token_acc": 0.22448765850708127 + }, + { + "epoch": 0.436235708003518, + "grad_norm": 2.250001136779272, + "learning_rate": 8.722157092614303e-05, + "loss": 3.900315046310425, + "step": 744, + "token_acc": 0.22572346116801262 + }, + { + "epoch": 0.4368220463207271, + "grad_norm": 2.0386335384596954, + "learning_rate": 8.733880422039859e-05, + "loss": 3.938730239868164, + "step": 745, + "token_acc": 0.22347982681328787 + }, + { + "epoch": 0.4374083846379361, + "grad_norm": 2.0306205239517334, + "learning_rate": 8.745603751465417e-05, + "loss": 3.966027021408081, + "step": 746, + "token_acc": 0.2189314855824024 + }, + { + "epoch": 0.43799472295514513, + "grad_norm": 1.9042181280416175, + "learning_rate": 8.757327080890973e-05, + "loss": 3.906756639480591, + "step": 747, + "token_acc": 0.2251283743647284 + }, + { + "epoch": 0.43858106127235413, + "grad_norm": 1.83388543747683, + "learning_rate": 8.769050410316531e-05, + "loss": 3.9863650798797607, + "step": 748, + "token_acc": 0.21566247599724647 + }, + { + "epoch": 0.4391673995895632, + "grad_norm": 2.4927173945530576, + "learning_rate": 8.780773739742087e-05, + "loss": 3.9408254623413086, + "step": 749, + "token_acc": 0.2226859005154706 + }, + { + "epoch": 0.4397537379067722, + "grad_norm": 1.878251095328266, + "learning_rate": 8.792497069167645e-05, + "loss": 3.9462904930114746, + "step": 750, + "token_acc": 0.22154756755045843 + }, + { + "epoch": 0.44034007622398125, + "grad_norm": 2.5811473775151583, + "learning_rate": 8.804220398593201e-05, + "loss": 3.9655089378356934, + "step": 751, + "token_acc": 0.21821542515543843 + }, + { + "epoch": 0.44092641454119025, + "grad_norm": 1.5868598891613437, + "learning_rate": 8.815943728018757e-05, + "loss": 3.9307169914245605, + "step": 752, + "token_acc": 0.22196996782266715 + }, + { + "epoch": 0.4415127528583993, + "grad_norm": 2.2626821868614995, + "learning_rate": 8.827667057444315e-05, + "loss": 3.926063060760498, + "step": 753, + "token_acc": 0.22557470488856743 + }, + { + "epoch": 0.4420990911756083, + "grad_norm": 1.9137229422291535, + "learning_rate": 8.839390386869871e-05, + "loss": 3.8885974884033203, + "step": 754, + "token_acc": 0.22743625191613862 + }, + { + "epoch": 0.44268542949281736, + "grad_norm": 2.42027527214035, + "learning_rate": 8.851113716295429e-05, + "loss": 3.951854705810547, + "step": 755, + "token_acc": 0.21919975367235237 + }, + { + "epoch": 0.44327176781002636, + "grad_norm": 1.4274348182652419, + "learning_rate": 8.862837045720985e-05, + "loss": 3.924478054046631, + "step": 756, + "token_acc": 0.22586739733819544 + }, + { + "epoch": 0.4438581061272354, + "grad_norm": 2.387809799570043, + "learning_rate": 8.874560375146543e-05, + "loss": 3.9574551582336426, + "step": 757, + "token_acc": 0.22065239441538465 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 2.066025639346712, + "learning_rate": 8.886283704572099e-05, + "loss": 3.9644346237182617, + "step": 758, + "token_acc": 0.21807099281297007 + }, + { + "epoch": 0.4450307827616535, + "grad_norm": 2.0071041068521813, + "learning_rate": 8.898007033997657e-05, + "loss": 3.89326810836792, + "step": 759, + "token_acc": 0.2262106542215205 + }, + { + "epoch": 0.4456171210788625, + "grad_norm": 2.0702333406413, + "learning_rate": 8.909730363423211e-05, + "loss": 3.949096202850342, + "step": 760, + "token_acc": 0.21959702504489864 + }, + { + "epoch": 0.44620345939607153, + "grad_norm": 1.7844832049053576, + "learning_rate": 8.921453692848769e-05, + "loss": 3.9036145210266113, + "step": 761, + "token_acc": 0.22562241895689575 + }, + { + "epoch": 0.4467897977132806, + "grad_norm": 2.035225491767218, + "learning_rate": 8.933177022274327e-05, + "loss": 3.894869327545166, + "step": 762, + "token_acc": 0.2258279057563059 + }, + { + "epoch": 0.4473761360304896, + "grad_norm": 1.8586834290729655, + "learning_rate": 8.944900351699883e-05, + "loss": 3.9075770378112793, + "step": 763, + "token_acc": 0.2255222223975676 + }, + { + "epoch": 0.44796247434769865, + "grad_norm": 1.7336219261594272, + "learning_rate": 8.95662368112544e-05, + "loss": 3.8645763397216797, + "step": 764, + "token_acc": 0.22972833902867018 + }, + { + "epoch": 0.44854881266490765, + "grad_norm": 2.2428110263096537, + "learning_rate": 8.968347010550997e-05, + "loss": 3.932220935821533, + "step": 765, + "token_acc": 0.22227289432809236 + }, + { + "epoch": 0.4491351509821167, + "grad_norm": 1.3581669888821015, + "learning_rate": 8.980070339976554e-05, + "loss": 3.8672127723693848, + "step": 766, + "token_acc": 0.23058741996545196 + }, + { + "epoch": 0.4497214892993257, + "grad_norm": 2.6220000478513033, + "learning_rate": 8.991793669402111e-05, + "loss": 3.922210454940796, + "step": 767, + "token_acc": 0.2243839689135488 + }, + { + "epoch": 0.45030782761653476, + "grad_norm": 1.3507666299828758, + "learning_rate": 9.003516998827668e-05, + "loss": 3.9137024879455566, + "step": 768, + "token_acc": 0.22226228221795702 + }, + { + "epoch": 0.45089416593374376, + "grad_norm": 1.9320738465240743, + "learning_rate": 9.015240328253223e-05, + "loss": 3.960395336151123, + "step": 769, + "token_acc": 0.21898678494363363 + }, + { + "epoch": 0.4514805042509528, + "grad_norm": 1.6960861932966782, + "learning_rate": 9.026963657678781e-05, + "loss": 3.899871587753296, + "step": 770, + "token_acc": 0.2266482478203641 + }, + { + "epoch": 0.4520668425681618, + "grad_norm": 2.4099589458695125, + "learning_rate": 9.038686987104337e-05, + "loss": 3.891418933868408, + "step": 771, + "token_acc": 0.22540586273314767 + }, + { + "epoch": 0.4526531808853709, + "grad_norm": 1.5763104772797327, + "learning_rate": 9.050410316529895e-05, + "loss": 3.8581748008728027, + "step": 772, + "token_acc": 0.23018679436762654 + }, + { + "epoch": 0.4532395192025799, + "grad_norm": 2.549973842613314, + "learning_rate": 9.062133645955451e-05, + "loss": 3.918881893157959, + "step": 773, + "token_acc": 0.22459708741725015 + }, + { + "epoch": 0.45382585751978893, + "grad_norm": 1.736984546111328, + "learning_rate": 9.073856975381009e-05, + "loss": 3.9528517723083496, + "step": 774, + "token_acc": 0.22035208243881493 + }, + { + "epoch": 0.45441219583699793, + "grad_norm": 1.698273811652094, + "learning_rate": 9.085580304806566e-05, + "loss": 3.9416563510894775, + "step": 775, + "token_acc": 0.2212251668707295 + }, + { + "epoch": 0.454998534154207, + "grad_norm": 2.069628020016678, + "learning_rate": 9.097303634232123e-05, + "loss": 3.903822898864746, + "step": 776, + "token_acc": 0.22593174857732728 + }, + { + "epoch": 0.455584872471416, + "grad_norm": 1.8985676809172887, + "learning_rate": 9.10902696365768e-05, + "loss": 3.8916778564453125, + "step": 777, + "token_acc": 0.22497064181408177 + }, + { + "epoch": 0.45617121078862505, + "grad_norm": 2.249942313911619, + "learning_rate": 9.120750293083235e-05, + "loss": 3.893077850341797, + "step": 778, + "token_acc": 0.22636775568028203 + }, + { + "epoch": 0.45675754910583405, + "grad_norm": 1.7904222203599198, + "learning_rate": 9.132473622508793e-05, + "loss": 3.8899126052856445, + "step": 779, + "token_acc": 0.2254609469100343 + }, + { + "epoch": 0.4573438874230431, + "grad_norm": 1.7742101311256486, + "learning_rate": 9.144196951934349e-05, + "loss": 3.8878064155578613, + "step": 780, + "token_acc": 0.2239327950160751 + }, + { + "epoch": 0.4579302257402521, + "grad_norm": 1.6449534718641463, + "learning_rate": 9.155920281359907e-05, + "loss": 3.867175579071045, + "step": 781, + "token_acc": 0.22982883813379193 + }, + { + "epoch": 0.45851656405746116, + "grad_norm": 1.7486791327143634, + "learning_rate": 9.167643610785463e-05, + "loss": 3.8803577423095703, + "step": 782, + "token_acc": 0.22562936079944795 + }, + { + "epoch": 0.45910290237467016, + "grad_norm": 1.6895566187152857, + "learning_rate": 9.17936694021102e-05, + "loss": 3.902367353439331, + "step": 783, + "token_acc": 0.22516781262920058 + }, + { + "epoch": 0.4596892406918792, + "grad_norm": 2.576619700263025, + "learning_rate": 9.191090269636577e-05, + "loss": 3.921027183532715, + "step": 784, + "token_acc": 0.22181978404718755 + }, + { + "epoch": 0.4602755790090882, + "grad_norm": 1.5998442620731603, + "learning_rate": 9.202813599062135e-05, + "loss": 3.902172088623047, + "step": 785, + "token_acc": 0.22429702639920943 + }, + { + "epoch": 0.4608619173262973, + "grad_norm": 2.800536871811806, + "learning_rate": 9.214536928487691e-05, + "loss": 3.870784282684326, + "step": 786, + "token_acc": 0.22730087367374868 + }, + { + "epoch": 0.4614482556435063, + "grad_norm": 1.5157492029140756, + "learning_rate": 9.226260257913247e-05, + "loss": 3.9423084259033203, + "step": 787, + "token_acc": 0.21816674971535385 + }, + { + "epoch": 0.46203459396071533, + "grad_norm": 2.5524842560316743, + "learning_rate": 9.237983587338805e-05, + "loss": 3.909968852996826, + "step": 788, + "token_acc": 0.22396359055214693 + }, + { + "epoch": 0.46262093227792433, + "grad_norm": 1.8200972978087542, + "learning_rate": 9.249706916764361e-05, + "loss": 3.8912699222564697, + "step": 789, + "token_acc": 0.2261728535641084 + }, + { + "epoch": 0.4632072705951334, + "grad_norm": 1.7984710543234683, + "learning_rate": 9.261430246189919e-05, + "loss": 3.8978404998779297, + "step": 790, + "token_acc": 0.22575110823458103 + }, + { + "epoch": 0.46379360891234245, + "grad_norm": 1.869542470040667, + "learning_rate": 9.273153575615475e-05, + "loss": 3.928298234939575, + "step": 791, + "token_acc": 0.22056627989208727 + }, + { + "epoch": 0.46437994722955145, + "grad_norm": 1.9655390950080824, + "learning_rate": 9.284876905041033e-05, + "loss": 3.8563456535339355, + "step": 792, + "token_acc": 0.22948494983277593 + }, + { + "epoch": 0.4649662855467605, + "grad_norm": 1.7826658467215866, + "learning_rate": 9.296600234466589e-05, + "loss": 3.9013195037841797, + "step": 793, + "token_acc": 0.22432930503774495 + }, + { + "epoch": 0.4655526238639695, + "grad_norm": 1.6155847072960474, + "learning_rate": 9.308323563892146e-05, + "loss": 3.880685329437256, + "step": 794, + "token_acc": 0.22561012673616387 + }, + { + "epoch": 0.46613896218117856, + "grad_norm": 2.464187024479387, + "learning_rate": 9.320046893317703e-05, + "loss": 3.8675413131713867, + "step": 795, + "token_acc": 0.2293023035287835 + }, + { + "epoch": 0.46672530049838756, + "grad_norm": 2.117923485251248, + "learning_rate": 9.331770222743259e-05, + "loss": 3.890979290008545, + "step": 796, + "token_acc": 0.2274577739281074 + }, + { + "epoch": 0.4673116388155966, + "grad_norm": 1.8135485629403558, + "learning_rate": 9.343493552168817e-05, + "loss": 3.8722591400146484, + "step": 797, + "token_acc": 0.22754903413092575 + }, + { + "epoch": 0.4678979771328056, + "grad_norm": 1.6666757203163662, + "learning_rate": 9.355216881594373e-05, + "loss": 3.7965779304504395, + "step": 798, + "token_acc": 0.23548949511358522 + }, + { + "epoch": 0.4684843154500147, + "grad_norm": 1.6498589259186138, + "learning_rate": 9.36694021101993e-05, + "loss": 3.8730015754699707, + "step": 799, + "token_acc": 0.22462406762145762 + }, + { + "epoch": 0.4690706537672237, + "grad_norm": 1.8834256598113779, + "learning_rate": 9.378663540445487e-05, + "loss": 3.8841490745544434, + "step": 800, + "token_acc": 0.2255475091680745 + }, + { + "epoch": 0.46965699208443273, + "grad_norm": 2.0334556878234915, + "learning_rate": 9.390386869871044e-05, + "loss": 3.8848514556884766, + "step": 801, + "token_acc": 0.22690193035731007 + }, + { + "epoch": 0.47024333040164173, + "grad_norm": 2.0377081679199613, + "learning_rate": 9.402110199296601e-05, + "loss": 3.877686023712158, + "step": 802, + "token_acc": 0.2246462658967874 + }, + { + "epoch": 0.4708296687188508, + "grad_norm": 2.0015218844171843, + "learning_rate": 9.413833528722158e-05, + "loss": 3.8933260440826416, + "step": 803, + "token_acc": 0.2268061021422694 + }, + { + "epoch": 0.4714160070360598, + "grad_norm": 2.0478849793439164, + "learning_rate": 9.425556858147715e-05, + "loss": 3.9260916709899902, + "step": 804, + "token_acc": 0.22048878420875148 + }, + { + "epoch": 0.47200234535326885, + "grad_norm": 1.8694828328303443, + "learning_rate": 9.437280187573271e-05, + "loss": 3.889716863632202, + "step": 805, + "token_acc": 0.22447748263432293 + }, + { + "epoch": 0.47258868367047785, + "grad_norm": 1.8970059184469343, + "learning_rate": 9.449003516998827e-05, + "loss": 3.8182625770568848, + "step": 806, + "token_acc": 0.2320792620500958 + }, + { + "epoch": 0.4731750219876869, + "grad_norm": 1.7310927049039988, + "learning_rate": 9.460726846424385e-05, + "loss": 3.9098448753356934, + "step": 807, + "token_acc": 0.22147892786801618 + }, + { + "epoch": 0.4737613603048959, + "grad_norm": 2.1400799019598016, + "learning_rate": 9.472450175849942e-05, + "loss": 3.884195566177368, + "step": 808, + "token_acc": 0.2241544220289735 + }, + { + "epoch": 0.47434769862210496, + "grad_norm": 1.5526087586862078, + "learning_rate": 9.484173505275499e-05, + "loss": 3.8570384979248047, + "step": 809, + "token_acc": 0.22767766931192512 + }, + { + "epoch": 0.47493403693931396, + "grad_norm": 2.7078440330444646, + "learning_rate": 9.495896834701056e-05, + "loss": 3.8440260887145996, + "step": 810, + "token_acc": 0.2282954245620147 + }, + { + "epoch": 0.475520375256523, + "grad_norm": 1.1888403967427135, + "learning_rate": 9.507620164126613e-05, + "loss": 3.896343946456909, + "step": 811, + "token_acc": 0.22439760246719737 + }, + { + "epoch": 0.476106713573732, + "grad_norm": 2.3604336924134524, + "learning_rate": 9.51934349355217e-05, + "loss": 3.8686041831970215, + "step": 812, + "token_acc": 0.22957351201370874 + }, + { + "epoch": 0.4766930518909411, + "grad_norm": 2.124829798786561, + "learning_rate": 9.531066822977726e-05, + "loss": 3.8537416458129883, + "step": 813, + "token_acc": 0.22822571306228304 + }, + { + "epoch": 0.4772793902081501, + "grad_norm": 2.119433619207387, + "learning_rate": 9.542790152403283e-05, + "loss": 3.876708984375, + "step": 814, + "token_acc": 0.2248204058179777 + }, + { + "epoch": 0.47786572852535913, + "grad_norm": 1.9560413764089617, + "learning_rate": 9.554513481828839e-05, + "loss": 3.894780397415161, + "step": 815, + "token_acc": 0.2237485651875396 + }, + { + "epoch": 0.47845206684256814, + "grad_norm": 2.0383280700068243, + "learning_rate": 9.566236811254397e-05, + "loss": 3.897106647491455, + "step": 816, + "token_acc": 0.2221390579942326 + }, + { + "epoch": 0.4790384051597772, + "grad_norm": 1.846400839215357, + "learning_rate": 9.577960140679953e-05, + "loss": 3.8745126724243164, + "step": 817, + "token_acc": 0.22795407309778476 + }, + { + "epoch": 0.47962474347698625, + "grad_norm": 1.7701654805658873, + "learning_rate": 9.58968347010551e-05, + "loss": 3.862316131591797, + "step": 818, + "token_acc": 0.22820242274212202 + }, + { + "epoch": 0.48021108179419525, + "grad_norm": 1.716739886261576, + "learning_rate": 9.601406799531067e-05, + "loss": 3.8438267707824707, + "step": 819, + "token_acc": 0.23013671227251908 + }, + { + "epoch": 0.4807974201114043, + "grad_norm": 1.8530692487312084, + "learning_rate": 9.613130128956624e-05, + "loss": 3.8538131713867188, + "step": 820, + "token_acc": 0.22755805208956828 + }, + { + "epoch": 0.4813837584286133, + "grad_norm": 1.7477164041091169, + "learning_rate": 9.624853458382182e-05, + "loss": 3.883127212524414, + "step": 821, + "token_acc": 0.22496032785369838 + }, + { + "epoch": 0.48197009674582236, + "grad_norm": 2.0859655635067873, + "learning_rate": 9.636576787807737e-05, + "loss": 3.873800277709961, + "step": 822, + "token_acc": 0.22847557625010845 + }, + { + "epoch": 0.48255643506303136, + "grad_norm": 1.6743160528208407, + "learning_rate": 9.648300117233295e-05, + "loss": 3.8300459384918213, + "step": 823, + "token_acc": 0.2311213819101965 + }, + { + "epoch": 0.4831427733802404, + "grad_norm": 2.377277112859616, + "learning_rate": 9.660023446658851e-05, + "loss": 3.863558769226074, + "step": 824, + "token_acc": 0.22637062959109683 + }, + { + "epoch": 0.4837291116974494, + "grad_norm": 1.3795840428609316, + "learning_rate": 9.671746776084409e-05, + "loss": 3.9056777954101562, + "step": 825, + "token_acc": 0.22270229737941022 + }, + { + "epoch": 0.4843154500146585, + "grad_norm": 2.5341038548867947, + "learning_rate": 9.683470105509965e-05, + "loss": 3.840391159057617, + "step": 826, + "token_acc": 0.23263906782718133 + }, + { + "epoch": 0.4849017883318675, + "grad_norm": 1.5028432775377962, + "learning_rate": 9.695193434935522e-05, + "loss": 3.810879707336426, + "step": 827, + "token_acc": 0.23285418668243293 + }, + { + "epoch": 0.48548812664907653, + "grad_norm": 1.8140075471214623, + "learning_rate": 9.706916764361079e-05, + "loss": 3.917994260787964, + "step": 828, + "token_acc": 0.22115723101134763 + }, + { + "epoch": 0.48607446496628554, + "grad_norm": 1.9311951666675868, + "learning_rate": 9.718640093786636e-05, + "loss": 3.866739273071289, + "step": 829, + "token_acc": 0.22711367867703428 + }, + { + "epoch": 0.4866608032834946, + "grad_norm": 1.5652147363547808, + "learning_rate": 9.730363423212193e-05, + "loss": 3.8571949005126953, + "step": 830, + "token_acc": 0.22794324157886736 + }, + { + "epoch": 0.4872471416007036, + "grad_norm": 1.5963904007804146, + "learning_rate": 9.742086752637749e-05, + "loss": 3.8184940814971924, + "step": 831, + "token_acc": 0.23298841173483809 + }, + { + "epoch": 0.48783347991791265, + "grad_norm": 1.934804762361602, + "learning_rate": 9.753810082063307e-05, + "loss": 3.9114720821380615, + "step": 832, + "token_acc": 0.22209300212607438 + }, + { + "epoch": 0.48841981823512165, + "grad_norm": 2.181504126974787, + "learning_rate": 9.765533411488863e-05, + "loss": 3.825620651245117, + "step": 833, + "token_acc": 0.23236013563877253 + }, + { + "epoch": 0.4890061565523307, + "grad_norm": 1.3501053666987277, + "learning_rate": 9.77725674091442e-05, + "loss": 3.8362808227539062, + "step": 834, + "token_acc": 0.22815712722812798 + }, + { + "epoch": 0.4895924948695397, + "grad_norm": 2.1709396292068512, + "learning_rate": 9.788980070339977e-05, + "loss": 3.8445987701416016, + "step": 835, + "token_acc": 0.22872417662200514 + }, + { + "epoch": 0.49017883318674876, + "grad_norm": 1.8524811589318733, + "learning_rate": 9.800703399765534e-05, + "loss": 3.8186087608337402, + "step": 836, + "token_acc": 0.22899310066872072 + }, + { + "epoch": 0.49076517150395776, + "grad_norm": 1.9013615109708906, + "learning_rate": 9.81242672919109e-05, + "loss": 3.896653175354004, + "step": 837, + "token_acc": 0.22417524877975442 + }, + { + "epoch": 0.4913515098211668, + "grad_norm": 1.5993231767825782, + "learning_rate": 9.824150058616648e-05, + "loss": 3.8276803493499756, + "step": 838, + "token_acc": 0.2329621095750128 + }, + { + "epoch": 0.4919378481383758, + "grad_norm": 1.5379382240093624, + "learning_rate": 9.835873388042205e-05, + "loss": 3.857423782348633, + "step": 839, + "token_acc": 0.22638995819417768 + }, + { + "epoch": 0.4925241864555849, + "grad_norm": 1.9325385810138958, + "learning_rate": 9.847596717467761e-05, + "loss": 3.8191981315612793, + "step": 840, + "token_acc": 0.23254505214970866 + }, + { + "epoch": 0.4931105247727939, + "grad_norm": 2.0185300492717646, + "learning_rate": 9.859320046893318e-05, + "loss": 3.8605804443359375, + "step": 841, + "token_acc": 0.2257750988692554 + }, + { + "epoch": 0.49369686309000294, + "grad_norm": 2.137494691892694, + "learning_rate": 9.871043376318875e-05, + "loss": 3.8438796997070312, + "step": 842, + "token_acc": 0.22847942050462317 + }, + { + "epoch": 0.49428320140721194, + "grad_norm": 1.6701073786712586, + "learning_rate": 9.882766705744432e-05, + "loss": 3.8718926906585693, + "step": 843, + "token_acc": 0.22499588537324386 + }, + { + "epoch": 0.494869539724421, + "grad_norm": 2.44596872111063, + "learning_rate": 9.894490035169989e-05, + "loss": 3.8270504474639893, + "step": 844, + "token_acc": 0.23281811158439822 + }, + { + "epoch": 0.49545587804163, + "grad_norm": 1.4396085371604697, + "learning_rate": 9.906213364595546e-05, + "loss": 3.787402629852295, + "step": 845, + "token_acc": 0.2341805672679405 + }, + { + "epoch": 0.49604221635883905, + "grad_norm": 2.1093892691958294, + "learning_rate": 9.917936694021102e-05, + "loss": 3.825852632522583, + "step": 846, + "token_acc": 0.2288999992382442 + }, + { + "epoch": 0.4966285546760481, + "grad_norm": 1.3845878898796742, + "learning_rate": 9.92966002344666e-05, + "loss": 3.8680436611175537, + "step": 847, + "token_acc": 0.22720113956684423 + }, + { + "epoch": 0.4972148929932571, + "grad_norm": 1.7096703494505658, + "learning_rate": 9.941383352872216e-05, + "loss": 3.8637142181396484, + "step": 848, + "token_acc": 0.22786956150280868 + }, + { + "epoch": 0.49780123131046616, + "grad_norm": 1.9757041679481984, + "learning_rate": 9.953106682297773e-05, + "loss": 3.9013869762420654, + "step": 849, + "token_acc": 0.22096929045834154 + }, + { + "epoch": 0.49838756962767516, + "grad_norm": 1.7708102881508065, + "learning_rate": 9.964830011723329e-05, + "loss": 3.8716068267822266, + "step": 850, + "token_acc": 0.2249626042643815 + }, + { + "epoch": 0.4989739079448842, + "grad_norm": 1.7217166317282757, + "learning_rate": 9.976553341148887e-05, + "loss": 3.804633140563965, + "step": 851, + "token_acc": 0.2343589863950368 + }, + { + "epoch": 0.4995602462620932, + "grad_norm": 1.80491131215425, + "learning_rate": 9.988276670574443e-05, + "loss": 3.856325626373291, + "step": 852, + "token_acc": 0.22735655345687358 + }, + { + "epoch": 0.5001465845793023, + "grad_norm": 1.7306387003338115, + "learning_rate": 0.0001, + "loss": 3.8415231704711914, + "step": 853, + "token_acc": 0.22943609841062487 + }, + { + "epoch": 0.5007329228965113, + "grad_norm": 1.4543956068024768, + "learning_rate": 0.00010011723329425558, + "loss": 3.833892345428467, + "step": 854, + "token_acc": 0.22797933966267506 + }, + { + "epoch": 0.5013192612137203, + "grad_norm": 2.4912687753891865, + "learning_rate": 0.00010023446658851114, + "loss": 3.8768229484558105, + "step": 855, + "token_acc": 0.22621239130262039 + }, + { + "epoch": 0.5019055995309294, + "grad_norm": 1.6059915633961455, + "learning_rate": 0.00010035169988276672, + "loss": 3.8482866287231445, + "step": 856, + "token_acc": 0.22745292368681863 + }, + { + "epoch": 0.5024919378481384, + "grad_norm": 1.8684922419831205, + "learning_rate": 0.00010046893317702228, + "loss": 3.8241138458251953, + "step": 857, + "token_acc": 0.2299262666576473 + }, + { + "epoch": 0.5030782761653474, + "grad_norm": 1.733250431300054, + "learning_rate": 0.00010058616647127786, + "loss": 3.84586501121521, + "step": 858, + "token_acc": 0.2268165269153445 + }, + { + "epoch": 0.5036646144825564, + "grad_norm": 1.7962357758711094, + "learning_rate": 0.00010070339976553342, + "loss": 3.859968662261963, + "step": 859, + "token_acc": 0.22739971672492432 + }, + { + "epoch": 0.5042509527997655, + "grad_norm": 1.968146608694918, + "learning_rate": 0.000100820633059789, + "loss": 3.840559959411621, + "step": 860, + "token_acc": 0.22714941899052896 + }, + { + "epoch": 0.5048372911169745, + "grad_norm": 2.029459209917352, + "learning_rate": 0.00010093786635404456, + "loss": 3.8006794452667236, + "step": 861, + "token_acc": 0.23248480901095095 + }, + { + "epoch": 0.5054236294341835, + "grad_norm": 1.9457035827435212, + "learning_rate": 0.00010105509964830011, + "loss": 3.8426709175109863, + "step": 862, + "token_acc": 0.22913558299736517 + }, + { + "epoch": 0.5060099677513925, + "grad_norm": 1.8138451516963194, + "learning_rate": 0.00010117233294255569, + "loss": 3.8550000190734863, + "step": 863, + "token_acc": 0.22490673112290474 + }, + { + "epoch": 0.5065963060686016, + "grad_norm": 1.7123521094464096, + "learning_rate": 0.00010128956623681125, + "loss": 3.871476411819458, + "step": 864, + "token_acc": 0.2238877891227339 + }, + { + "epoch": 0.5071826443858106, + "grad_norm": 1.964610991944096, + "learning_rate": 0.00010140679953106683, + "loss": 3.8169126510620117, + "step": 865, + "token_acc": 0.23134838785521156 + }, + { + "epoch": 0.5077689827030196, + "grad_norm": 1.7420097589471077, + "learning_rate": 0.00010152403282532239, + "loss": 3.8456084728240967, + "step": 866, + "token_acc": 0.2285793508997261 + }, + { + "epoch": 0.5083553210202286, + "grad_norm": 2.094009505514912, + "learning_rate": 0.00010164126611957796, + "loss": 3.7970852851867676, + "step": 867, + "token_acc": 0.23256878923058702 + }, + { + "epoch": 0.5089416593374377, + "grad_norm": 1.4531493423760264, + "learning_rate": 0.00010175849941383353, + "loss": 3.7433066368103027, + "step": 868, + "token_acc": 0.23901962450089204 + }, + { + "epoch": 0.5095279976546467, + "grad_norm": 2.4676101825746053, + "learning_rate": 0.0001018757327080891, + "loss": 3.8639516830444336, + "step": 869, + "token_acc": 0.22586000257010755 + }, + { + "epoch": 0.5101143359718557, + "grad_norm": 1.4596223269244695, + "learning_rate": 0.00010199296600234467, + "loss": 3.8177967071533203, + "step": 870, + "token_acc": 0.23146747739655388 + }, + { + "epoch": 0.5107006742890647, + "grad_norm": 2.086274189884841, + "learning_rate": 0.00010211019929660024, + "loss": 3.8644027709960938, + "step": 871, + "token_acc": 0.22530568621330047 + }, + { + "epoch": 0.5112870126062738, + "grad_norm": 1.4906891681833554, + "learning_rate": 0.0001022274325908558, + "loss": 3.7592356204986572, + "step": 872, + "token_acc": 0.23870524787070854 + }, + { + "epoch": 0.5118733509234829, + "grad_norm": 1.813031339407144, + "learning_rate": 0.00010234466588511138, + "loss": 3.8510303497314453, + "step": 873, + "token_acc": 0.2251747542497511 + }, + { + "epoch": 0.5124596892406919, + "grad_norm": 1.9028976719121513, + "learning_rate": 0.00010246189917936694, + "loss": 3.790393352508545, + "step": 874, + "token_acc": 0.23384112795006262 + }, + { + "epoch": 0.513046027557901, + "grad_norm": 1.9782628570344742, + "learning_rate": 0.00010257913247362252, + "loss": 3.766150951385498, + "step": 875, + "token_acc": 0.23384121362014829 + }, + { + "epoch": 0.51363236587511, + "grad_norm": 1.5214097874722905, + "learning_rate": 0.00010269636576787808, + "loss": 3.776675224304199, + "step": 876, + "token_acc": 0.2348864863742107 + }, + { + "epoch": 0.514218704192319, + "grad_norm": 1.7257292481633826, + "learning_rate": 0.00010281359906213366, + "loss": 3.7921090126037598, + "step": 877, + "token_acc": 0.2324697342975973 + }, + { + "epoch": 0.514805042509528, + "grad_norm": 1.8749626116343154, + "learning_rate": 0.00010293083235638922, + "loss": 3.7958459854125977, + "step": 878, + "token_acc": 0.2325338070225837 + }, + { + "epoch": 0.5153913808267371, + "grad_norm": 1.7442970438803282, + "learning_rate": 0.0001030480656506448, + "loss": 3.835383653640747, + "step": 879, + "token_acc": 0.22866630324251225 + }, + { + "epoch": 0.5159777191439461, + "grad_norm": 1.6921559194705589, + "learning_rate": 0.00010316529894490035, + "loss": 3.7514145374298096, + "step": 880, + "token_acc": 0.23953068970578983 + }, + { + "epoch": 0.5165640574611551, + "grad_norm": 1.7685275933137885, + "learning_rate": 0.00010328253223915591, + "loss": 3.811249256134033, + "step": 881, + "token_acc": 0.22951919281562566 + }, + { + "epoch": 0.5171503957783641, + "grad_norm": 1.5976335129970325, + "learning_rate": 0.00010339976553341149, + "loss": 3.76723313331604, + "step": 882, + "token_acc": 0.23343578348406657 + }, + { + "epoch": 0.5177367340955732, + "grad_norm": 1.446161383733489, + "learning_rate": 0.00010351699882766705, + "loss": 3.8000478744506836, + "step": 883, + "token_acc": 0.23038577026820703 + }, + { + "epoch": 0.5183230724127822, + "grad_norm": 2.6438643568947913, + "learning_rate": 0.00010363423212192263, + "loss": 3.7621946334838867, + "step": 884, + "token_acc": 0.2357914611851902 + }, + { + "epoch": 0.5189094107299912, + "grad_norm": 1.7984800858785115, + "learning_rate": 0.00010375146541617819, + "loss": 3.7875049114227295, + "step": 885, + "token_acc": 0.2339973790343062 + }, + { + "epoch": 0.5194957490472002, + "grad_norm": 2.057739881614775, + "learning_rate": 0.00010386869871043376, + "loss": 3.8160524368286133, + "step": 886, + "token_acc": 0.2294129875871361 + }, + { + "epoch": 0.5200820873644093, + "grad_norm": 1.3221060403789813, + "learning_rate": 0.00010398593200468934, + "loss": 3.811464548110962, + "step": 887, + "token_acc": 0.2297662252217836 + }, + { + "epoch": 0.5206684256816183, + "grad_norm": 2.579994915319218, + "learning_rate": 0.0001041031652989449, + "loss": 3.7498552799224854, + "step": 888, + "token_acc": 0.23858625456937563 + }, + { + "epoch": 0.5212547639988273, + "grad_norm": 1.6194049991898507, + "learning_rate": 0.00010422039859320048, + "loss": 3.7258219718933105, + "step": 889, + "token_acc": 0.24174800448857742 + }, + { + "epoch": 0.5218411023160363, + "grad_norm": 2.2252441754806904, + "learning_rate": 0.00010433763188745604, + "loss": 3.9231722354888916, + "step": 890, + "token_acc": 0.22072242471334583 + }, + { + "epoch": 0.5224274406332454, + "grad_norm": 1.508934049123201, + "learning_rate": 0.00010445486518171162, + "loss": 3.821866989135742, + "step": 891, + "token_acc": 0.2280791724033397 + }, + { + "epoch": 0.5230137789504544, + "grad_norm": 1.9029437974971617, + "learning_rate": 0.00010457209847596718, + "loss": 3.783266305923462, + "step": 892, + "token_acc": 0.2320349860006442 + }, + { + "epoch": 0.5236001172676634, + "grad_norm": 2.061112201745218, + "learning_rate": 0.00010468933177022276, + "loss": 3.8460941314697266, + "step": 893, + "token_acc": 0.22568094427846033 + }, + { + "epoch": 0.5241864555848724, + "grad_norm": 1.6800004489068228, + "learning_rate": 0.00010480656506447832, + "loss": 3.8569884300231934, + "step": 894, + "token_acc": 0.22416407289258547 + }, + { + "epoch": 0.5247727939020815, + "grad_norm": 1.876459454000299, + "learning_rate": 0.0001049237983587339, + "loss": 3.776334524154663, + "step": 895, + "token_acc": 0.23372316963315007 + }, + { + "epoch": 0.5253591322192905, + "grad_norm": 2.196412564145204, + "learning_rate": 0.00010504103165298946, + "loss": 3.7707877159118652, + "step": 896, + "token_acc": 0.23543779904921974 + }, + { + "epoch": 0.5259454705364995, + "grad_norm": 1.262367050458971, + "learning_rate": 0.00010515826494724504, + "loss": 3.8026881217956543, + "step": 897, + "token_acc": 0.23261858518295406 + }, + { + "epoch": 0.5265318088537085, + "grad_norm": 2.003904875585833, + "learning_rate": 0.00010527549824150059, + "loss": 3.8653600215911865, + "step": 898, + "token_acc": 0.22501260742304274 + }, + { + "epoch": 0.5271181471709177, + "grad_norm": 2.025059662809299, + "learning_rate": 0.00010539273153575615, + "loss": 3.8105030059814453, + "step": 899, + "token_acc": 0.23099826192095518 + }, + { + "epoch": 0.5277044854881267, + "grad_norm": 1.9684297037770544, + "learning_rate": 0.00010550996483001172, + "loss": 3.772536277770996, + "step": 900, + "token_acc": 0.2345436096073513 + }, + { + "epoch": 0.5282908238053357, + "grad_norm": 1.675156861231727, + "learning_rate": 0.00010562719812426729, + "loss": 3.7684578895568848, + "step": 901, + "token_acc": 0.23502143355597646 + }, + { + "epoch": 0.5288771621225447, + "grad_norm": 1.7973141739107814, + "learning_rate": 0.00010574443141852286, + "loss": 3.8040270805358887, + "step": 902, + "token_acc": 0.23088953797132236 + }, + { + "epoch": 0.5294635004397538, + "grad_norm": 2.20925332979252, + "learning_rate": 0.00010586166471277843, + "loss": 3.8547840118408203, + "step": 903, + "token_acc": 0.22518435723729946 + }, + { + "epoch": 0.5300498387569628, + "grad_norm": 1.4999529571292936, + "learning_rate": 0.000105978898007034, + "loss": 3.8437447547912598, + "step": 904, + "token_acc": 0.22553469606208443 + }, + { + "epoch": 0.5306361770741718, + "grad_norm": 1.8571005394765723, + "learning_rate": 0.00010609613130128957, + "loss": 3.819962978363037, + "step": 905, + "token_acc": 0.2276766843785414 + }, + { + "epoch": 0.5312225153913809, + "grad_norm": 1.92435176695105, + "learning_rate": 0.00010621336459554514, + "loss": 3.8028135299682617, + "step": 906, + "token_acc": 0.2323573534194266 + }, + { + "epoch": 0.5318088537085899, + "grad_norm": 1.578137152214784, + "learning_rate": 0.0001063305978898007, + "loss": 3.7573342323303223, + "step": 907, + "token_acc": 0.2364324050767776 + }, + { + "epoch": 0.5323951920257989, + "grad_norm": 1.9135186047711916, + "learning_rate": 0.00010644783118405628, + "loss": 3.7947237491607666, + "step": 908, + "token_acc": 0.2321420236187394 + }, + { + "epoch": 0.5329815303430079, + "grad_norm": 1.9462589392980687, + "learning_rate": 0.00010656506447831184, + "loss": 3.7544782161712646, + "step": 909, + "token_acc": 0.23690342058916833 + }, + { + "epoch": 0.533567868660217, + "grad_norm": 1.2507640993549884, + "learning_rate": 0.00010668229777256742, + "loss": 3.808830738067627, + "step": 910, + "token_acc": 0.23140994503387446 + }, + { + "epoch": 0.534154206977426, + "grad_norm": 2.378214650949949, + "learning_rate": 0.00010679953106682298, + "loss": 3.867976188659668, + "step": 911, + "token_acc": 0.22268661018519473 + }, + { + "epoch": 0.534740545294635, + "grad_norm": 1.3070852369055637, + "learning_rate": 0.00010691676436107856, + "loss": 3.8175320625305176, + "step": 912, + "token_acc": 0.2271232033616834 + }, + { + "epoch": 0.535326883611844, + "grad_norm": 1.8245769629936717, + "learning_rate": 0.00010703399765533414, + "loss": 3.8083512783050537, + "step": 913, + "token_acc": 0.23271369949745815 + }, + { + "epoch": 0.5359132219290531, + "grad_norm": 1.539603236136721, + "learning_rate": 0.0001071512309495897, + "loss": 3.7720625400543213, + "step": 914, + "token_acc": 0.23479382305773572 + }, + { + "epoch": 0.5364995602462621, + "grad_norm": 2.1312462253701034, + "learning_rate": 0.00010726846424384527, + "loss": 3.767002582550049, + "step": 915, + "token_acc": 0.2339139624387474 + }, + { + "epoch": 0.5370858985634711, + "grad_norm": 1.799797612861782, + "learning_rate": 0.00010738569753810081, + "loss": 3.755925178527832, + "step": 916, + "token_acc": 0.23522587046955107 + }, + { + "epoch": 0.5376722368806801, + "grad_norm": 1.3268815812318113, + "learning_rate": 0.00010750293083235639, + "loss": 3.7807416915893555, + "step": 917, + "token_acc": 0.23370628297782575 + }, + { + "epoch": 0.5382585751978892, + "grad_norm": 1.8272570883668895, + "learning_rate": 0.00010762016412661195, + "loss": 3.7635884284973145, + "step": 918, + "token_acc": 0.23440094899169633 + }, + { + "epoch": 0.5388449135150982, + "grad_norm": 1.6569650320141702, + "learning_rate": 0.00010773739742086753, + "loss": 3.8084869384765625, + "step": 919, + "token_acc": 0.23000301153747846 + }, + { + "epoch": 0.5394312518323072, + "grad_norm": 1.6493331312472133, + "learning_rate": 0.0001078546307151231, + "loss": 3.7704248428344727, + "step": 920, + "token_acc": 0.23372646514871298 + }, + { + "epoch": 0.5400175901495162, + "grad_norm": 1.901996980044487, + "learning_rate": 0.00010797186400937866, + "loss": 3.7993061542510986, + "step": 921, + "token_acc": 0.22834509795462757 + }, + { + "epoch": 0.5406039284667253, + "grad_norm": 1.857921370257786, + "learning_rate": 0.00010808909730363424, + "loss": 3.78391695022583, + "step": 922, + "token_acc": 0.23128040698530647 + }, + { + "epoch": 0.5411902667839343, + "grad_norm": 1.5643984865513767, + "learning_rate": 0.0001082063305978898, + "loss": 3.778407096862793, + "step": 923, + "token_acc": 0.23182526824975774 + }, + { + "epoch": 0.5417766051011433, + "grad_norm": 2.160622739821634, + "learning_rate": 0.00010832356389214538, + "loss": 3.8278236389160156, + "step": 924, + "token_acc": 0.2279319383069058 + }, + { + "epoch": 0.5423629434183523, + "grad_norm": 1.7577557152303356, + "learning_rate": 0.00010844079718640094, + "loss": 3.7881975173950195, + "step": 925, + "token_acc": 0.23241702532693262 + }, + { + "epoch": 0.5429492817355615, + "grad_norm": 1.6376321911289973, + "learning_rate": 0.00010855803048065652, + "loss": 3.774430751800537, + "step": 926, + "token_acc": 0.23250080286045907 + }, + { + "epoch": 0.5435356200527705, + "grad_norm": 2.2510626806122014, + "learning_rate": 0.00010867526377491208, + "loss": 3.8066251277923584, + "step": 927, + "token_acc": 0.22921893027637527 + }, + { + "epoch": 0.5441219583699795, + "grad_norm": 1.3507963593683292, + "learning_rate": 0.00010879249706916766, + "loss": 3.8082144260406494, + "step": 928, + "token_acc": 0.22806348944760885 + }, + { + "epoch": 0.5447082966871885, + "grad_norm": 1.721975309121372, + "learning_rate": 0.00010890973036342322, + "loss": 3.76438307762146, + "step": 929, + "token_acc": 0.23699217163945577 + }, + { + "epoch": 0.5452946350043976, + "grad_norm": 1.7543096211850333, + "learning_rate": 0.0001090269636576788, + "loss": 3.758983850479126, + "step": 930, + "token_acc": 0.23427981306851373 + }, + { + "epoch": 0.5458809733216066, + "grad_norm": 2.2392104181490744, + "learning_rate": 0.00010914419695193436, + "loss": 3.8164052963256836, + "step": 931, + "token_acc": 0.22810268298007255 + }, + { + "epoch": 0.5464673116388156, + "grad_norm": 1.3443931431814997, + "learning_rate": 0.00010926143024618994, + "loss": 3.7565994262695312, + "step": 932, + "token_acc": 0.2363223524834168 + }, + { + "epoch": 0.5470536499560247, + "grad_norm": 1.9892276651408798, + "learning_rate": 0.00010937866354044548, + "loss": 3.7771010398864746, + "step": 933, + "token_acc": 0.2334187228022037 + }, + { + "epoch": 0.5476399882732337, + "grad_norm": 1.422651020578365, + "learning_rate": 0.00010949589683470105, + "loss": 3.787266969680786, + "step": 934, + "token_acc": 0.23238941357290635 + }, + { + "epoch": 0.5482263265904427, + "grad_norm": 1.54850828933813, + "learning_rate": 0.00010961313012895662, + "loss": 3.7013561725616455, + "step": 935, + "token_acc": 0.243059228782932 + }, + { + "epoch": 0.5488126649076517, + "grad_norm": 1.7871858193673842, + "learning_rate": 0.00010973036342321219, + "loss": 3.786162853240967, + "step": 936, + "token_acc": 0.23171379610290174 + }, + { + "epoch": 0.5493990032248608, + "grad_norm": 1.5475729081770393, + "learning_rate": 0.00010984759671746776, + "loss": 3.7843947410583496, + "step": 937, + "token_acc": 0.23255325185737855 + }, + { + "epoch": 0.5499853415420698, + "grad_norm": 1.6485880047228505, + "learning_rate": 0.00010996483001172333, + "loss": 3.780978202819824, + "step": 938, + "token_acc": 0.23280722687956737 + }, + { + "epoch": 0.5505716798592788, + "grad_norm": 1.7559637306591538, + "learning_rate": 0.0001100820633059789, + "loss": 3.7709059715270996, + "step": 939, + "token_acc": 0.23403717382974898 + }, + { + "epoch": 0.5511580181764878, + "grad_norm": 2.0445619793785146, + "learning_rate": 0.00011019929660023446, + "loss": 3.835878610610962, + "step": 940, + "token_acc": 0.2242450152284803 + }, + { + "epoch": 0.5517443564936969, + "grad_norm": 1.71281684025806, + "learning_rate": 0.00011031652989449004, + "loss": 3.7487008571624756, + "step": 941, + "token_acc": 0.23653796922134923 + }, + { + "epoch": 0.5523306948109059, + "grad_norm": 2.0816977456445978, + "learning_rate": 0.0001104337631887456, + "loss": 3.805837392807007, + "step": 942, + "token_acc": 0.2290530772408096 + }, + { + "epoch": 0.5529170331281149, + "grad_norm": 1.3372348512750465, + "learning_rate": 0.00011055099648300118, + "loss": 3.74919056892395, + "step": 943, + "token_acc": 0.2353206636242987 + }, + { + "epoch": 0.5535033714453239, + "grad_norm": 2.3229828997542636, + "learning_rate": 0.00011066822977725674, + "loss": 3.7610578536987305, + "step": 944, + "token_acc": 0.23237212880888647 + }, + { + "epoch": 0.554089709762533, + "grad_norm": 1.6633473163118637, + "learning_rate": 0.00011078546307151232, + "loss": 3.7717862129211426, + "step": 945, + "token_acc": 0.23228038610099191 + }, + { + "epoch": 0.554676048079742, + "grad_norm": 1.6284272767111407, + "learning_rate": 0.0001109026963657679, + "loss": 3.761871337890625, + "step": 946, + "token_acc": 0.23458261587264753 + }, + { + "epoch": 0.555262386396951, + "grad_norm": 1.812537473430118, + "learning_rate": 0.00011101992966002346, + "loss": 3.7693562507629395, + "step": 947, + "token_acc": 0.23165263188819182 + }, + { + "epoch": 0.55584872471416, + "grad_norm": 2.014904504689635, + "learning_rate": 0.00011113716295427903, + "loss": 3.7694320678710938, + "step": 948, + "token_acc": 0.23309949999466945 + }, + { + "epoch": 0.5564350630313691, + "grad_norm": 2.0184673137181286, + "learning_rate": 0.0001112543962485346, + "loss": 3.763669967651367, + "step": 949, + "token_acc": 0.23315400723497204 + }, + { + "epoch": 0.5570214013485781, + "grad_norm": 1.7798335442382502, + "learning_rate": 0.00011137162954279017, + "loss": 3.7868223190307617, + "step": 950, + "token_acc": 0.2307915083561389 + }, + { + "epoch": 0.5576077396657871, + "grad_norm": 2.453680530842864, + "learning_rate": 0.00011148886283704571, + "loss": 3.7713940143585205, + "step": 951, + "token_acc": 0.23199098815326546 + }, + { + "epoch": 0.5581940779829961, + "grad_norm": 1.49336162751524, + "learning_rate": 0.00011160609613130129, + "loss": 3.7290735244750977, + "step": 952, + "token_acc": 0.23638256641437336 + }, + { + "epoch": 0.5587804163002053, + "grad_norm": 1.921848350128607, + "learning_rate": 0.00011172332942555686, + "loss": 3.79306960105896, + "step": 953, + "token_acc": 0.2287944816136972 + }, + { + "epoch": 0.5593667546174143, + "grad_norm": 1.716134004776264, + "learning_rate": 0.00011184056271981242, + "loss": 3.704774856567383, + "step": 954, + "token_acc": 0.23754556342766878 + }, + { + "epoch": 0.5599530929346233, + "grad_norm": 1.8374662527703833, + "learning_rate": 0.000111957796014068, + "loss": 3.708160400390625, + "step": 955, + "token_acc": 0.23707179071339932 + }, + { + "epoch": 0.5605394312518323, + "grad_norm": 1.8095621647414326, + "learning_rate": 0.00011207502930832356, + "loss": 3.649742603302002, + "step": 956, + "token_acc": 0.24341598602930808 + }, + { + "epoch": 0.5611257695690414, + "grad_norm": 1.3847027472870947, + "learning_rate": 0.00011219226260257914, + "loss": 3.6778266429901123, + "step": 957, + "token_acc": 0.2415768115942029 + }, + { + "epoch": 0.5617121078862504, + "grad_norm": 2.056697609684765, + "learning_rate": 0.0001123094958968347, + "loss": 3.6843035221099854, + "step": 958, + "token_acc": 0.24111701958960155 + }, + { + "epoch": 0.5622984462034594, + "grad_norm": 2.518201220929928, + "learning_rate": 0.00011242672919109028, + "loss": 3.7458324432373047, + "step": 959, + "token_acc": 0.2350365776766022 + }, + { + "epoch": 0.5628847845206685, + "grad_norm": 2.2554232379779724, + "learning_rate": 0.00011254396248534584, + "loss": 3.723310947418213, + "step": 960, + "token_acc": 0.23437844569264096 + }, + { + "epoch": 0.5634711228378775, + "grad_norm": 1.402262058604264, + "learning_rate": 0.00011266119577960142, + "loss": 3.7047066688537598, + "step": 961, + "token_acc": 0.23892739345075129 + }, + { + "epoch": 0.5640574611550865, + "grad_norm": 2.0569922023113025, + "learning_rate": 0.00011277842907385698, + "loss": 3.7138237953186035, + "step": 962, + "token_acc": 0.23666115056891182 + }, + { + "epoch": 0.5646437994722955, + "grad_norm": 2.071576549804621, + "learning_rate": 0.00011289566236811256, + "loss": 3.661585807800293, + "step": 963, + "token_acc": 0.24430577383601143 + }, + { + "epoch": 0.5652301377895046, + "grad_norm": 1.6598968721395821, + "learning_rate": 0.00011301289566236812, + "loss": 3.656017303466797, + "step": 964, + "token_acc": 0.24012215520433164 + }, + { + "epoch": 0.5658164761067136, + "grad_norm": 2.4405248472422114, + "learning_rate": 0.0001131301289566237, + "loss": 3.6877269744873047, + "step": 965, + "token_acc": 0.23853472562390013 + }, + { + "epoch": 0.5664028144239226, + "grad_norm": 1.5687434808702136, + "learning_rate": 0.00011324736225087926, + "loss": 3.691927909851074, + "step": 966, + "token_acc": 0.24020295742562833 + }, + { + "epoch": 0.5669891527411316, + "grad_norm": 1.9184569816695656, + "learning_rate": 0.00011336459554513483, + "loss": 3.737863063812256, + "step": 967, + "token_acc": 0.23096642686474808 + }, + { + "epoch": 0.5675754910583407, + "grad_norm": 1.3414726313118897, + "learning_rate": 0.0001134818288393904, + "loss": 3.6842947006225586, + "step": 968, + "token_acc": 0.2379509556552493 + }, + { + "epoch": 0.5681618293755497, + "grad_norm": 2.287396406370554, + "learning_rate": 0.00011359906213364595, + "loss": 3.6681060791015625, + "step": 969, + "token_acc": 0.24232340861891272 + }, + { + "epoch": 0.5687481676927587, + "grad_norm": 1.5438155016982122, + "learning_rate": 0.00011371629542790152, + "loss": 3.7281970977783203, + "step": 970, + "token_acc": 0.23182750012187267 + }, + { + "epoch": 0.5693345060099677, + "grad_norm": 1.7546438757153469, + "learning_rate": 0.00011383352872215709, + "loss": 3.6396799087524414, + "step": 971, + "token_acc": 0.24385292367941566 + }, + { + "epoch": 0.5699208443271768, + "grad_norm": 2.070338535019644, + "learning_rate": 0.00011395076201641266, + "loss": 3.6432807445526123, + "step": 972, + "token_acc": 0.24361332245036413 + }, + { + "epoch": 0.5705071826443858, + "grad_norm": 1.3792662791881012, + "learning_rate": 0.00011406799531066822, + "loss": 3.674607515335083, + "step": 973, + "token_acc": 0.23749031633744447 + }, + { + "epoch": 0.5710935209615948, + "grad_norm": 1.6276588527399602, + "learning_rate": 0.0001141852286049238, + "loss": 3.6293632984161377, + "step": 974, + "token_acc": 0.2411245255372049 + }, + { + "epoch": 0.5716798592788038, + "grad_norm": 2.0925242589180106, + "learning_rate": 0.00011430246189917936, + "loss": 3.6402673721313477, + "step": 975, + "token_acc": 0.24347585106603403 + }, + { + "epoch": 0.5722661975960129, + "grad_norm": 2.140237356216463, + "learning_rate": 0.00011441969519343494, + "loss": 3.604214668273926, + "step": 976, + "token_acc": 0.24692300579867119 + }, + { + "epoch": 0.5728525359132219, + "grad_norm": 1.7583526965991143, + "learning_rate": 0.0001145369284876905, + "loss": 3.6000659465789795, + "step": 977, + "token_acc": 0.24751831368944927 + }, + { + "epoch": 0.5734388742304309, + "grad_norm": 1.7354266205377469, + "learning_rate": 0.00011465416178194608, + "loss": 3.615461587905884, + "step": 978, + "token_acc": 0.24299503445480342 + }, + { + "epoch": 0.5740252125476399, + "grad_norm": 1.8684657430988447, + "learning_rate": 0.00011477139507620166, + "loss": 3.6727147102355957, + "step": 979, + "token_acc": 0.23572003625371524 + }, + { + "epoch": 0.574611550864849, + "grad_norm": 2.0169862020881597, + "learning_rate": 0.00011488862837045722, + "loss": 3.6161770820617676, + "step": 980, + "token_acc": 0.24400889121338912 + }, + { + "epoch": 0.575197889182058, + "grad_norm": 2.086021665945833, + "learning_rate": 0.0001150058616647128, + "loss": 3.685018539428711, + "step": 981, + "token_acc": 0.23590091302457505 + }, + { + "epoch": 0.575784227499267, + "grad_norm": 1.7657187461744435, + "learning_rate": 0.00011512309495896836, + "loss": 3.6133508682250977, + "step": 982, + "token_acc": 0.24450684730965278 + }, + { + "epoch": 0.576370565816476, + "grad_norm": 1.6640375825801352, + "learning_rate": 0.00011524032825322393, + "loss": 3.644221782684326, + "step": 983, + "token_acc": 0.24128423865563492 + }, + { + "epoch": 0.5769569041336852, + "grad_norm": 1.6260652624574337, + "learning_rate": 0.0001153575615474795, + "loss": 3.55635929107666, + "step": 984, + "token_acc": 0.2501803600928462 + }, + { + "epoch": 0.5775432424508942, + "grad_norm": 1.9391807019065288, + "learning_rate": 0.00011547479484173507, + "loss": 3.57075834274292, + "step": 985, + "token_acc": 0.24781159366696073 + }, + { + "epoch": 0.5781295807681032, + "grad_norm": 1.5168846814109316, + "learning_rate": 0.00011559202813599064, + "loss": 3.5659799575805664, + "step": 986, + "token_acc": 0.25015804945894404 + }, + { + "epoch": 0.5787159190853123, + "grad_norm": 2.253670056227686, + "learning_rate": 0.00011570926143024618, + "loss": 3.651451349258423, + "step": 987, + "token_acc": 0.23741272833647534 + }, + { + "epoch": 0.5793022574025213, + "grad_norm": 1.5832478240738754, + "learning_rate": 0.00011582649472450176, + "loss": 3.618858814239502, + "step": 988, + "token_acc": 0.24266613656488883 + }, + { + "epoch": 0.5798885957197303, + "grad_norm": 1.9384595711708528, + "learning_rate": 0.00011594372801875732, + "loss": 3.591041326522827, + "step": 989, + "token_acc": 0.2462241108431388 + }, + { + "epoch": 0.5804749340369393, + "grad_norm": 1.9409775983104318, + "learning_rate": 0.0001160609613130129, + "loss": 3.6330628395080566, + "step": 990, + "token_acc": 0.24072135756398977 + }, + { + "epoch": 0.5810612723541484, + "grad_norm": 1.8649630902274321, + "learning_rate": 0.00011617819460726846, + "loss": 3.5837771892547607, + "step": 991, + "token_acc": 0.24744177658697444 + }, + { + "epoch": 0.5816476106713574, + "grad_norm": 2.007962694836894, + "learning_rate": 0.00011629542790152404, + "loss": 3.622023105621338, + "step": 992, + "token_acc": 0.240656168937927 + }, + { + "epoch": 0.5822339489885664, + "grad_norm": 1.6953485128758858, + "learning_rate": 0.0001164126611957796, + "loss": 3.644559860229492, + "step": 993, + "token_acc": 0.23858164042858698 + }, + { + "epoch": 0.5828202873057754, + "grad_norm": 1.8791298938661651, + "learning_rate": 0.00011652989449003518, + "loss": 3.5799875259399414, + "step": 994, + "token_acc": 0.24684730786184395 + }, + { + "epoch": 0.5834066256229845, + "grad_norm": 1.9800222468744946, + "learning_rate": 0.00011664712778429074, + "loss": 3.588566780090332, + "step": 995, + "token_acc": 0.2442843915584062 + }, + { + "epoch": 0.5839929639401935, + "grad_norm": 1.6936884817749778, + "learning_rate": 0.00011676436107854632, + "loss": 3.603142261505127, + "step": 996, + "token_acc": 0.2431729586761323 + }, + { + "epoch": 0.5845793022574025, + "grad_norm": 1.917207538491688, + "learning_rate": 0.00011688159437280188, + "loss": 3.5945591926574707, + "step": 997, + "token_acc": 0.24411139261113315 + }, + { + "epoch": 0.5851656405746115, + "grad_norm": 1.8963396992981865, + "learning_rate": 0.00011699882766705746, + "loss": 3.60992431640625, + "step": 998, + "token_acc": 0.2410411698255303 + }, + { + "epoch": 0.5857519788918206, + "grad_norm": 2.114140147694409, + "learning_rate": 0.00011711606096131302, + "loss": 3.5892937183380127, + "step": 999, + "token_acc": 0.24507057797210516 + }, + { + "epoch": 0.5863383172090296, + "grad_norm": 1.5279872071452991, + "learning_rate": 0.0001172332942555686, + "loss": 3.5577597618103027, + "step": 1000, + "token_acc": 0.24804607157548333 + }, + { + "epoch": 0.5869246555262386, + "grad_norm": 1.8380089007207305, + "learning_rate": 0.00011735052754982416, + "loss": 3.5900192260742188, + "step": 1001, + "token_acc": 0.24536310495550057 + }, + { + "epoch": 0.5875109938434476, + "grad_norm": 1.5345365657474852, + "learning_rate": 0.00011746776084407973, + "loss": 3.629021406173706, + "step": 1002, + "token_acc": 0.24034049022633877 + }, + { + "epoch": 0.5880973321606567, + "grad_norm": 2.0175488502446672, + "learning_rate": 0.0001175849941383353, + "loss": 3.6154723167419434, + "step": 1003, + "token_acc": 0.2418381409494844 + }, + { + "epoch": 0.5886836704778657, + "grad_norm": 1.6442053720874912, + "learning_rate": 0.00011770222743259087, + "loss": 3.567788600921631, + "step": 1004, + "token_acc": 0.2463777426236573 + }, + { + "epoch": 0.5892700087950747, + "grad_norm": 2.3014889718091567, + "learning_rate": 0.00011781946072684642, + "loss": 3.59326171875, + "step": 1005, + "token_acc": 0.24266820304692757 + }, + { + "epoch": 0.5898563471122837, + "grad_norm": 1.6215891741265107, + "learning_rate": 0.00011793669402110198, + "loss": 3.587190628051758, + "step": 1006, + "token_acc": 0.24356150170363747 + }, + { + "epoch": 0.5904426854294929, + "grad_norm": 2.1067037835523723, + "learning_rate": 0.00011805392731535756, + "loss": 3.5938265323638916, + "step": 1007, + "token_acc": 0.2460848367664104 + }, + { + "epoch": 0.5910290237467019, + "grad_norm": 2.0533641055689604, + "learning_rate": 0.00011817116060961312, + "loss": 3.5194754600524902, + "step": 1008, + "token_acc": 0.252768340254184 + }, + { + "epoch": 0.5916153620639109, + "grad_norm": 1.851628456518367, + "learning_rate": 0.0001182883939038687, + "loss": 3.5379581451416016, + "step": 1009, + "token_acc": 0.25209016208226664 + }, + { + "epoch": 0.5922017003811199, + "grad_norm": 1.9423931309802098, + "learning_rate": 0.00011840562719812426, + "loss": 3.5537705421447754, + "step": 1010, + "token_acc": 0.24610016526699166 + }, + { + "epoch": 0.592788038698329, + "grad_norm": 1.4879750769961273, + "learning_rate": 0.00011852286049237984, + "loss": 3.5941834449768066, + "step": 1011, + "token_acc": 0.24298500492917552 + }, + { + "epoch": 0.593374377015538, + "grad_norm": 1.7865343627672126, + "learning_rate": 0.00011864009378663542, + "loss": 3.5110604763031006, + "step": 1012, + "token_acc": 0.2530759994172055 + }, + { + "epoch": 0.593960715332747, + "grad_norm": 2.2710812531249096, + "learning_rate": 0.00011875732708089098, + "loss": 3.5475857257843018, + "step": 1013, + "token_acc": 0.24811421104637663 + }, + { + "epoch": 0.594547053649956, + "grad_norm": 1.7002491154210158, + "learning_rate": 0.00011887456037514655, + "loss": 3.5835108757019043, + "step": 1014, + "token_acc": 0.24467408543787103 + }, + { + "epoch": 0.5951333919671651, + "grad_norm": 2.2862630423372297, + "learning_rate": 0.00011899179366940212, + "loss": 3.5338692665100098, + "step": 1015, + "token_acc": 0.2500873650439978 + }, + { + "epoch": 0.5957197302843741, + "grad_norm": 1.3693836962539019, + "learning_rate": 0.0001191090269636577, + "loss": 3.564061164855957, + "step": 1016, + "token_acc": 0.24591350811180113 + }, + { + "epoch": 0.5963060686015831, + "grad_norm": 2.4472082545421423, + "learning_rate": 0.00011922626025791326, + "loss": 3.5913989543914795, + "step": 1017, + "token_acc": 0.24268384782403474 + }, + { + "epoch": 0.5968924069187922, + "grad_norm": 1.405334717963625, + "learning_rate": 0.00011934349355216883, + "loss": 3.565342903137207, + "step": 1018, + "token_acc": 0.24828950467910144 + }, + { + "epoch": 0.5974787452360012, + "grad_norm": 2.331025007443443, + "learning_rate": 0.0001194607268464244, + "loss": 3.5455679893493652, + "step": 1019, + "token_acc": 0.24851790871264526 + }, + { + "epoch": 0.5980650835532102, + "grad_norm": 1.7212425563401157, + "learning_rate": 0.00011957796014067997, + "loss": 3.5542187690734863, + "step": 1020, + "token_acc": 0.2487347733348725 + }, + { + "epoch": 0.5986514218704192, + "grad_norm": 1.4235933517609585, + "learning_rate": 0.00011969519343493553, + "loss": 3.5619888305664062, + "step": 1021, + "token_acc": 0.2462524929210144 + }, + { + "epoch": 0.5992377601876283, + "grad_norm": 1.437780044595699, + "learning_rate": 0.00011981242672919108, + "loss": 3.528001308441162, + "step": 1022, + "token_acc": 0.2525707935525284 + }, + { + "epoch": 0.5998240985048373, + "grad_norm": 1.8358999290776543, + "learning_rate": 0.00011992966002344666, + "loss": 3.5367085933685303, + "step": 1023, + "token_acc": 0.2500249428939008 + }, + { + "epoch": 0.6004104368220463, + "grad_norm": 1.7309382950918344, + "learning_rate": 0.00012004689331770222, + "loss": 3.5310699939727783, + "step": 1024, + "token_acc": 0.2503434770126728 + }, + { + "epoch": 0.6009967751392553, + "grad_norm": 1.7271494701512455, + "learning_rate": 0.0001201641266119578, + "loss": 3.519312620162964, + "step": 1025, + "token_acc": 0.24971892370711457 + }, + { + "epoch": 0.6015831134564644, + "grad_norm": 1.6427595325413202, + "learning_rate": 0.00012028135990621336, + "loss": 3.5315117835998535, + "step": 1026, + "token_acc": 0.24985313527576236 + }, + { + "epoch": 0.6021694517736734, + "grad_norm": 2.4052887256712334, + "learning_rate": 0.00012039859320046894, + "loss": 3.54502534866333, + "step": 1027, + "token_acc": 0.24903145572272464 + }, + { + "epoch": 0.6027557900908824, + "grad_norm": 1.6272249422076415, + "learning_rate": 0.0001205158264947245, + "loss": 3.530763626098633, + "step": 1028, + "token_acc": 0.25066152732294644 + }, + { + "epoch": 0.6033421284080914, + "grad_norm": 1.6975485142504392, + "learning_rate": 0.00012063305978898008, + "loss": 3.5671520233154297, + "step": 1029, + "token_acc": 0.24636307143198585 + }, + { + "epoch": 0.6039284667253005, + "grad_norm": 1.9413921795179907, + "learning_rate": 0.00012075029308323564, + "loss": 3.528944253921509, + "step": 1030, + "token_acc": 0.24968577844973627 + }, + { + "epoch": 0.6045148050425095, + "grad_norm": 1.4661973754385087, + "learning_rate": 0.00012086752637749122, + "loss": 3.5883588790893555, + "step": 1031, + "token_acc": 0.24417072832176165 + }, + { + "epoch": 0.6051011433597185, + "grad_norm": 1.3027521205746548, + "learning_rate": 0.00012098475967174678, + "loss": 3.521371841430664, + "step": 1032, + "token_acc": 0.2522332377904402 + }, + { + "epoch": 0.6056874816769275, + "grad_norm": 1.564390745879449, + "learning_rate": 0.00012110199296600236, + "loss": 3.545868158340454, + "step": 1033, + "token_acc": 0.24922245260663506 + }, + { + "epoch": 0.6062738199941367, + "grad_norm": 2.0398894688915297, + "learning_rate": 0.00012121922626025792, + "loss": 3.5891737937927246, + "step": 1034, + "token_acc": 0.24372606620154225 + }, + { + "epoch": 0.6068601583113457, + "grad_norm": 1.7606517882190575, + "learning_rate": 0.0001213364595545135, + "loss": 3.5038208961486816, + "step": 1035, + "token_acc": 0.2536491752291313 + }, + { + "epoch": 0.6074464966285547, + "grad_norm": 2.333883518702912, + "learning_rate": 0.00012145369284876906, + "loss": 3.5442709922790527, + "step": 1036, + "token_acc": 0.24726089257643302 + }, + { + "epoch": 0.6080328349457637, + "grad_norm": 1.114544756021895, + "learning_rate": 0.00012157092614302463, + "loss": 3.54575514793396, + "step": 1037, + "token_acc": 0.2496079379540105 + }, + { + "epoch": 0.6086191732629728, + "grad_norm": 2.5881243038473682, + "learning_rate": 0.0001216881594372802, + "loss": 3.5461928844451904, + "step": 1038, + "token_acc": 0.2511278234838494 + }, + { + "epoch": 0.6092055115801818, + "grad_norm": 1.5597162463514849, + "learning_rate": 0.00012180539273153577, + "loss": 3.5267446041107178, + "step": 1039, + "token_acc": 0.25204129706155526 + }, + { + "epoch": 0.6097918498973908, + "grad_norm": 2.516686640525285, + "learning_rate": 0.00012192262602579132, + "loss": 3.530693531036377, + "step": 1040, + "token_acc": 0.25008318395998586 + }, + { + "epoch": 0.6103781882145998, + "grad_norm": 1.6850491492445012, + "learning_rate": 0.00012203985932004688, + "loss": 3.5639073848724365, + "step": 1041, + "token_acc": 0.24865802386467173 + }, + { + "epoch": 0.6109645265318089, + "grad_norm": 1.4679810620191514, + "learning_rate": 0.00012215709261430245, + "loss": 3.519987106323242, + "step": 1042, + "token_acc": 0.25096974597261174 + }, + { + "epoch": 0.6115508648490179, + "grad_norm": 1.5092842726632996, + "learning_rate": 0.00012227432590855802, + "loss": 3.550071954727173, + "step": 1043, + "token_acc": 0.24859595676997523 + }, + { + "epoch": 0.6121372031662269, + "grad_norm": 1.9934694210660846, + "learning_rate": 0.0001223915592028136, + "loss": 3.4802637100219727, + "step": 1044, + "token_acc": 0.2578290441368231 + }, + { + "epoch": 0.612723541483436, + "grad_norm": 2.060691871395609, + "learning_rate": 0.00012250879249706918, + "loss": 3.5283963680267334, + "step": 1045, + "token_acc": 0.24987356309728265 + }, + { + "epoch": 0.613309879800645, + "grad_norm": 1.610775755258291, + "learning_rate": 0.00012262602579132475, + "loss": 3.513293981552124, + "step": 1046, + "token_acc": 0.2521797858335686 + }, + { + "epoch": 0.613896218117854, + "grad_norm": 1.567077117287682, + "learning_rate": 0.0001227432590855803, + "loss": 3.5368242263793945, + "step": 1047, + "token_acc": 0.24913905961650692 + }, + { + "epoch": 0.614482556435063, + "grad_norm": 1.2397506038477286, + "learning_rate": 0.00012286049237983588, + "loss": 3.4795591831207275, + "step": 1048, + "token_acc": 0.2553865767673025 + }, + { + "epoch": 0.6150688947522721, + "grad_norm": 2.040014403136, + "learning_rate": 0.00012297772567409145, + "loss": 3.528956651687622, + "step": 1049, + "token_acc": 0.25158624874025626 + }, + { + "epoch": 0.6156552330694811, + "grad_norm": 1.3637790558172789, + "learning_rate": 0.00012309495896834703, + "loss": 3.5730528831481934, + "step": 1050, + "token_acc": 0.24380263666934499 + }, + { + "epoch": 0.6162415713866901, + "grad_norm": 2.0188828791662043, + "learning_rate": 0.00012321219226260258, + "loss": 3.4821696281433105, + "step": 1051, + "token_acc": 0.2546078234599534 + }, + { + "epoch": 0.6168279097038991, + "grad_norm": 1.6221663290511539, + "learning_rate": 0.00012332942555685816, + "loss": 3.5753936767578125, + "step": 1052, + "token_acc": 0.24374691758948086 + }, + { + "epoch": 0.6174142480211082, + "grad_norm": 1.3056918655843865, + "learning_rate": 0.00012344665885111373, + "loss": 3.489816188812256, + "step": 1053, + "token_acc": 0.2523561981796686 + }, + { + "epoch": 0.6180005863383172, + "grad_norm": 1.7606388602798402, + "learning_rate": 0.0001235638921453693, + "loss": 3.536217212677002, + "step": 1054, + "token_acc": 0.24904610632801868 + }, + { + "epoch": 0.6185869246555262, + "grad_norm": 1.8754227219939648, + "learning_rate": 0.00012368112543962486, + "loss": 3.4936347007751465, + "step": 1055, + "token_acc": 0.25389181524029886 + }, + { + "epoch": 0.6191732629727352, + "grad_norm": 1.627350070674053, + "learning_rate": 0.00012379835873388043, + "loss": 3.5174760818481445, + "step": 1056, + "token_acc": 0.2514314801821509 + }, + { + "epoch": 0.6197596012899443, + "grad_norm": 2.038677065727236, + "learning_rate": 0.000123915592028136, + "loss": 3.519094944000244, + "step": 1057, + "token_acc": 0.2526970338486536 + }, + { + "epoch": 0.6203459396071533, + "grad_norm": 2.1219571998137474, + "learning_rate": 0.00012403282532239156, + "loss": 3.512622833251953, + "step": 1058, + "token_acc": 0.25292715119002 + }, + { + "epoch": 0.6209322779243623, + "grad_norm": 1.299068027420354, + "learning_rate": 0.00012415005861664714, + "loss": 3.5118026733398438, + "step": 1059, + "token_acc": 0.2518489933332446 + }, + { + "epoch": 0.6215186162415713, + "grad_norm": 1.7519605670437755, + "learning_rate": 0.00012426729191090268, + "loss": 3.5191869735717773, + "step": 1060, + "token_acc": 0.2499653107622026 + }, + { + "epoch": 0.6221049545587805, + "grad_norm": 1.8271166639104148, + "learning_rate": 0.00012438452520515826, + "loss": 3.5297532081604004, + "step": 1061, + "token_acc": 0.2502845398745261 + }, + { + "epoch": 0.6226912928759895, + "grad_norm": 1.8922368681259265, + "learning_rate": 0.00012450175849941384, + "loss": 3.515394687652588, + "step": 1062, + "token_acc": 0.2519606520606354 + }, + { + "epoch": 0.6232776311931985, + "grad_norm": 1.539420971927679, + "learning_rate": 0.0001246189917936694, + "loss": 3.4920270442962646, + "step": 1063, + "token_acc": 0.2542557358840476 + }, + { + "epoch": 0.6238639695104075, + "grad_norm": 1.9999533787208568, + "learning_rate": 0.00012473622508792496, + "loss": 3.5001182556152344, + "step": 1064, + "token_acc": 0.25403221544608034 + }, + { + "epoch": 0.6244503078276166, + "grad_norm": 1.7162620503065502, + "learning_rate": 0.00012485345838218054, + "loss": 3.550278663635254, + "step": 1065, + "token_acc": 0.24859334770137545 + }, + { + "epoch": 0.6250366461448256, + "grad_norm": 2.0711543389115703, + "learning_rate": 0.00012497069167643612, + "loss": 3.5615978240966797, + "step": 1066, + "token_acc": 0.2443112978698021 + }, + { + "epoch": 0.6256229844620346, + "grad_norm": 1.358213927617702, + "learning_rate": 0.0001250879249706917, + "loss": 3.5284581184387207, + "step": 1067, + "token_acc": 0.24863853143998796 + }, + { + "epoch": 0.6262093227792436, + "grad_norm": 2.4227399183564216, + "learning_rate": 0.00012520515826494724, + "loss": 3.548661231994629, + "step": 1068, + "token_acc": 0.24701709041728034 + }, + { + "epoch": 0.6267956610964527, + "grad_norm": 1.6267457221115287, + "learning_rate": 0.00012532239155920282, + "loss": 3.444316864013672, + "step": 1069, + "token_acc": 0.26093052867080957 + }, + { + "epoch": 0.6273819994136617, + "grad_norm": 2.1591777111398778, + "learning_rate": 0.0001254396248534584, + "loss": 3.5324316024780273, + "step": 1070, + "token_acc": 0.2469512036434613 + }, + { + "epoch": 0.6279683377308707, + "grad_norm": 1.8230344124061826, + "learning_rate": 0.00012555685814771397, + "loss": 3.5422282218933105, + "step": 1071, + "token_acc": 0.24894721302814166 + }, + { + "epoch": 0.6285546760480798, + "grad_norm": 1.590985352686556, + "learning_rate": 0.00012567409144196955, + "loss": 3.5391898155212402, + "step": 1072, + "token_acc": 0.24830772853022545 + }, + { + "epoch": 0.6291410143652888, + "grad_norm": 2.5241418747300806, + "learning_rate": 0.0001257913247362251, + "loss": 3.5325958728790283, + "step": 1073, + "token_acc": 0.24972000347854503 + }, + { + "epoch": 0.6297273526824978, + "grad_norm": 1.186593901828575, + "learning_rate": 0.00012590855803048067, + "loss": 3.550349235534668, + "step": 1074, + "token_acc": 0.24905727527038732 + }, + { + "epoch": 0.6303136909997068, + "grad_norm": 1.8635284633897857, + "learning_rate": 0.00012602579132473625, + "loss": 3.477795124053955, + "step": 1075, + "token_acc": 0.2554012947507995 + }, + { + "epoch": 0.6309000293169159, + "grad_norm": 1.2308935970133303, + "learning_rate": 0.0001261430246189918, + "loss": 3.4932165145874023, + "step": 1076, + "token_acc": 0.2534502923976608 + }, + { + "epoch": 0.6314863676341249, + "grad_norm": 1.6951857620001005, + "learning_rate": 0.00012626025791324735, + "loss": 3.4882161617279053, + "step": 1077, + "token_acc": 0.25234785113039837 + }, + { + "epoch": 0.6320727059513339, + "grad_norm": 2.084310509374996, + "learning_rate": 0.00012637749120750292, + "loss": 3.517474889755249, + "step": 1078, + "token_acc": 0.250917833673959 + }, + { + "epoch": 0.6326590442685429, + "grad_norm": 1.5971346790556673, + "learning_rate": 0.0001264947245017585, + "loss": 3.5417568683624268, + "step": 1079, + "token_acc": 0.244869416447781 + }, + { + "epoch": 0.633245382585752, + "grad_norm": 1.7307696948581068, + "learning_rate": 0.00012661195779601407, + "loss": 3.5061228275299072, + "step": 1080, + "token_acc": 0.25109964992918415 + }, + { + "epoch": 0.633831720902961, + "grad_norm": 1.422347164184185, + "learning_rate": 0.00012672919109026965, + "loss": 3.4645237922668457, + "step": 1081, + "token_acc": 0.2591485483660982 + }, + { + "epoch": 0.63441805922017, + "grad_norm": 1.5545632056258527, + "learning_rate": 0.0001268464243845252, + "loss": 3.4975290298461914, + "step": 1082, + "token_acc": 0.2529178124045218 + }, + { + "epoch": 0.635004397537379, + "grad_norm": 1.7581962442441528, + "learning_rate": 0.00012696365767878078, + "loss": 3.577753782272339, + "step": 1083, + "token_acc": 0.24156160931829354 + }, + { + "epoch": 0.6355907358545881, + "grad_norm": 2.029382316035274, + "learning_rate": 0.00012708089097303635, + "loss": 3.5328097343444824, + "step": 1084, + "token_acc": 0.25042286473071024 + }, + { + "epoch": 0.6361770741717971, + "grad_norm": 1.744941716582373, + "learning_rate": 0.00012719812426729193, + "loss": 3.5380783081054688, + "step": 1085, + "token_acc": 0.24770657102253502 + }, + { + "epoch": 0.6367634124890061, + "grad_norm": 2.1457773193567626, + "learning_rate": 0.00012731535756154748, + "loss": 3.552659511566162, + "step": 1086, + "token_acc": 0.24597393222785918 + }, + { + "epoch": 0.6373497508062151, + "grad_norm": 1.1034536806597226, + "learning_rate": 0.00012743259085580305, + "loss": 3.5078885555267334, + "step": 1087, + "token_acc": 0.2502465059597607 + }, + { + "epoch": 0.6379360891234243, + "grad_norm": 2.650228498942423, + "learning_rate": 0.00012754982415005863, + "loss": 3.509427070617676, + "step": 1088, + "token_acc": 0.2510381370216017 + }, + { + "epoch": 0.6385224274406333, + "grad_norm": 1.3299083509225766, + "learning_rate": 0.0001276670574443142, + "loss": 3.5170092582702637, + "step": 1089, + "token_acc": 0.25079531505728314 + }, + { + "epoch": 0.6391087657578423, + "grad_norm": 2.413161617838325, + "learning_rate": 0.00012778429073856976, + "loss": 3.515328884124756, + "step": 1090, + "token_acc": 0.252939483361085 + }, + { + "epoch": 0.6396951040750513, + "grad_norm": 1.5195580244089815, + "learning_rate": 0.00012790152403282533, + "loss": 3.5002281665802, + "step": 1091, + "token_acc": 0.2523816193161548 + }, + { + "epoch": 0.6402814423922604, + "grad_norm": 1.827565115895193, + "learning_rate": 0.0001280187573270809, + "loss": 3.4980390071868896, + "step": 1092, + "token_acc": 0.25216120565226546 + }, + { + "epoch": 0.6408677807094694, + "grad_norm": 1.3941520607158913, + "learning_rate": 0.00012813599062133646, + "loss": 3.578462600708008, + "step": 1093, + "token_acc": 0.24434646673142302 + }, + { + "epoch": 0.6414541190266784, + "grad_norm": 1.6278234400731246, + "learning_rate": 0.00012825322391559203, + "loss": 3.492392063140869, + "step": 1094, + "token_acc": 0.25403324518525894 + }, + { + "epoch": 0.6420404573438874, + "grad_norm": 1.498206000755919, + "learning_rate": 0.00012837045720984758, + "loss": 3.4665799140930176, + "step": 1095, + "token_acc": 0.2553039102827884 + }, + { + "epoch": 0.6426267956610965, + "grad_norm": 1.8632723546682861, + "learning_rate": 0.00012848769050410316, + "loss": 3.488762378692627, + "step": 1096, + "token_acc": 0.25406301404529924 + }, + { + "epoch": 0.6432131339783055, + "grad_norm": 1.6216461967877078, + "learning_rate": 0.00012860492379835874, + "loss": 3.483595371246338, + "step": 1097, + "token_acc": 0.2537727592364937 + }, + { + "epoch": 0.6437994722955145, + "grad_norm": 1.530482642788609, + "learning_rate": 0.0001287221570926143, + "loss": 3.5086817741394043, + "step": 1098, + "token_acc": 0.25126196268846124 + }, + { + "epoch": 0.6443858106127235, + "grad_norm": 1.5286022307510954, + "learning_rate": 0.00012883939038686986, + "loss": 3.517098903656006, + "step": 1099, + "token_acc": 0.25082736680740236 + }, + { + "epoch": 0.6449721489299326, + "grad_norm": 1.8723222869852563, + "learning_rate": 0.00012895662368112544, + "loss": 3.5396828651428223, + "step": 1100, + "token_acc": 0.24596171980528583 + }, + { + "epoch": 0.6455584872471416, + "grad_norm": 1.2057591076547487, + "learning_rate": 0.00012907385697538101, + "loss": 3.5273144245147705, + "step": 1101, + "token_acc": 0.2507911666147651 + }, + { + "epoch": 0.6461448255643506, + "grad_norm": 1.56219115690715, + "learning_rate": 0.0001291910902696366, + "loss": 3.467332601547241, + "step": 1102, + "token_acc": 0.25445789832907806 + }, + { + "epoch": 0.6467311638815597, + "grad_norm": 1.8889136923915626, + "learning_rate": 0.00012930832356389214, + "loss": 3.4939475059509277, + "step": 1103, + "token_acc": 0.25297546229087486 + }, + { + "epoch": 0.6473175021987687, + "grad_norm": 1.369326308728851, + "learning_rate": 0.00012942555685814772, + "loss": 3.47835111618042, + "step": 1104, + "token_acc": 0.25634802333960266 + }, + { + "epoch": 0.6479038405159777, + "grad_norm": 1.7569054045285029, + "learning_rate": 0.0001295427901524033, + "loss": 3.535393238067627, + "step": 1105, + "token_acc": 0.2467586940201812 + }, + { + "epoch": 0.6484901788331867, + "grad_norm": 1.4719551234319308, + "learning_rate": 0.00012966002344665887, + "loss": 3.478846549987793, + "step": 1106, + "token_acc": 0.25528716923713135 + }, + { + "epoch": 0.6490765171503958, + "grad_norm": 2.0550446970114646, + "learning_rate": 0.00012977725674091445, + "loss": 3.497086524963379, + "step": 1107, + "token_acc": 0.2520218300360883 + }, + { + "epoch": 0.6496628554676048, + "grad_norm": 1.6069513815207634, + "learning_rate": 0.00012989449003517, + "loss": 3.5329363346099854, + "step": 1108, + "token_acc": 0.24734512155017582 + }, + { + "epoch": 0.6502491937848138, + "grad_norm": 1.2804483281919128, + "learning_rate": 0.00013001172332942557, + "loss": 3.424595832824707, + "step": 1109, + "token_acc": 0.2618272569444444 + }, + { + "epoch": 0.6508355321020228, + "grad_norm": 2.039101242262156, + "learning_rate": 0.00013012895662368115, + "loss": 3.5017282962799072, + "step": 1110, + "token_acc": 0.2523237122569409 + }, + { + "epoch": 0.6514218704192319, + "grad_norm": 1.3664521359312445, + "learning_rate": 0.0001302461899179367, + "loss": 3.445675849914551, + "step": 1111, + "token_acc": 0.2569971745844011 + }, + { + "epoch": 0.6520082087364409, + "grad_norm": 2.1525887606817453, + "learning_rate": 0.00013036342321219227, + "loss": 3.497997283935547, + "step": 1112, + "token_acc": 0.2519394990908191 + }, + { + "epoch": 0.6525945470536499, + "grad_norm": 1.3205000448848785, + "learning_rate": 0.00013048065650644782, + "loss": 3.463045835494995, + "step": 1113, + "token_acc": 0.25626202755753985 + }, + { + "epoch": 0.6531808853708589, + "grad_norm": 2.1881990930939876, + "learning_rate": 0.0001305978898007034, + "loss": 3.532336473464966, + "step": 1114, + "token_acc": 0.2501864373418055 + }, + { + "epoch": 0.653767223688068, + "grad_norm": 1.3557148426305365, + "learning_rate": 0.00013071512309495897, + "loss": 3.5048410892486572, + "step": 1115, + "token_acc": 0.2526701980424238 + }, + { + "epoch": 0.6543535620052771, + "grad_norm": 2.090818690777382, + "learning_rate": 0.00013083235638921455, + "loss": 3.5050439834594727, + "step": 1116, + "token_acc": 0.2500398582588884 + }, + { + "epoch": 0.6549399003224861, + "grad_norm": 1.399410097228022, + "learning_rate": 0.0001309495896834701, + "loss": 3.448899030685425, + "step": 1117, + "token_acc": 0.25704342492384785 + }, + { + "epoch": 0.6555262386396951, + "grad_norm": 1.2068609383343245, + "learning_rate": 0.00013106682297772568, + "loss": 3.5348777770996094, + "step": 1118, + "token_acc": 0.24623374349546712 + }, + { + "epoch": 0.6561125769569042, + "grad_norm": 1.958200666790498, + "learning_rate": 0.00013118405627198125, + "loss": 3.5034704208374023, + "step": 1119, + "token_acc": 0.25266238264113544 + }, + { + "epoch": 0.6566989152741132, + "grad_norm": 1.489734388560857, + "learning_rate": 0.00013130128956623683, + "loss": 3.477001667022705, + "step": 1120, + "token_acc": 0.25504203392592795 + }, + { + "epoch": 0.6572852535913222, + "grad_norm": 1.6449328602949582, + "learning_rate": 0.00013141852286049238, + "loss": 3.4948043823242188, + "step": 1121, + "token_acc": 0.2527632082769042 + }, + { + "epoch": 0.6578715919085312, + "grad_norm": 1.5537009140122566, + "learning_rate": 0.00013153575615474795, + "loss": 3.535090208053589, + "step": 1122, + "token_acc": 0.24608306261345386 + }, + { + "epoch": 0.6584579302257403, + "grad_norm": 1.4467939324894372, + "learning_rate": 0.00013165298944900353, + "loss": 3.507394790649414, + "step": 1123, + "token_acc": 0.25186225504791665 + }, + { + "epoch": 0.6590442685429493, + "grad_norm": 1.848536295175601, + "learning_rate": 0.0001317702227432591, + "loss": 3.4809556007385254, + "step": 1124, + "token_acc": 0.25317463170164367 + }, + { + "epoch": 0.6596306068601583, + "grad_norm": 1.3540894022617835, + "learning_rate": 0.00013188745603751466, + "loss": 3.420164108276367, + "step": 1125, + "token_acc": 0.2625032262142417 + }, + { + "epoch": 0.6602169451773673, + "grad_norm": 1.7658244362011437, + "learning_rate": 0.00013200468933177023, + "loss": 3.476072311401367, + "step": 1126, + "token_acc": 0.2514708501888419 + }, + { + "epoch": 0.6608032834945764, + "grad_norm": 1.2030198146154674, + "learning_rate": 0.0001321219226260258, + "loss": 3.4701123237609863, + "step": 1127, + "token_acc": 0.2556297174224939 + }, + { + "epoch": 0.6613896218117854, + "grad_norm": 1.9478204145441387, + "learning_rate": 0.00013223915592028138, + "loss": 3.509843111038208, + "step": 1128, + "token_acc": 0.2506740699833325 + }, + { + "epoch": 0.6619759601289944, + "grad_norm": 1.5709301998951004, + "learning_rate": 0.00013235638921453693, + "loss": 3.4634764194488525, + "step": 1129, + "token_acc": 0.2570987277974231 + }, + { + "epoch": 0.6625622984462035, + "grad_norm": 1.6644135462592604, + "learning_rate": 0.00013247362250879248, + "loss": 3.4557394981384277, + "step": 1130, + "token_acc": 0.25608796032454983 + }, + { + "epoch": 0.6631486367634125, + "grad_norm": 1.3209046507378825, + "learning_rate": 0.00013259085580304806, + "loss": 3.492227792739868, + "step": 1131, + "token_acc": 0.2507001174390543 + }, + { + "epoch": 0.6637349750806215, + "grad_norm": 1.601190670556154, + "learning_rate": 0.00013270808909730364, + "loss": 3.505174160003662, + "step": 1132, + "token_acc": 0.25047178863868985 + }, + { + "epoch": 0.6643213133978305, + "grad_norm": 1.2937312085375858, + "learning_rate": 0.0001328253223915592, + "loss": 3.5209531784057617, + "step": 1133, + "token_acc": 0.25001055319867455 + }, + { + "epoch": 0.6649076517150396, + "grad_norm": 1.5136100568934243, + "learning_rate": 0.00013294255568581476, + "loss": 3.5230441093444824, + "step": 1134, + "token_acc": 0.2491668412807695 + }, + { + "epoch": 0.6654939900322486, + "grad_norm": 1.3476310158466427, + "learning_rate": 0.00013305978898007034, + "loss": 3.476250410079956, + "step": 1135, + "token_acc": 0.2552725572454639 + }, + { + "epoch": 0.6660803283494576, + "grad_norm": 1.612856545570486, + "learning_rate": 0.00013317702227432591, + "loss": 3.4437780380249023, + "step": 1136, + "token_acc": 0.26018507119185746 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.6901966602018512, + "learning_rate": 0.0001332942555685815, + "loss": 3.492274522781372, + "step": 1137, + "token_acc": 0.2519536002335868 + }, + { + "epoch": 0.6672530049838757, + "grad_norm": 1.6451507271722177, + "learning_rate": 0.00013341148886283707, + "loss": 3.4644644260406494, + "step": 1138, + "token_acc": 0.2554292386809039 + }, + { + "epoch": 0.6678393433010847, + "grad_norm": 1.7317421627476886, + "learning_rate": 0.00013352872215709262, + "loss": 3.485468626022339, + "step": 1139, + "token_acc": 0.2518462972399936 + }, + { + "epoch": 0.6684256816182937, + "grad_norm": 1.406611673191925, + "learning_rate": 0.0001336459554513482, + "loss": 3.4783644676208496, + "step": 1140, + "token_acc": 0.25571126222589574 + }, + { + "epoch": 0.6690120199355027, + "grad_norm": 1.4433754487081043, + "learning_rate": 0.00013376318874560377, + "loss": 3.477328300476074, + "step": 1141, + "token_acc": 0.2538552512580294 + }, + { + "epoch": 0.6695983582527119, + "grad_norm": 1.6052659185720268, + "learning_rate": 0.00013388042203985934, + "loss": 3.451526641845703, + "step": 1142, + "token_acc": 0.2585951539327396 + }, + { + "epoch": 0.6701846965699209, + "grad_norm": 1.935123880711027, + "learning_rate": 0.0001339976553341149, + "loss": 3.463585138320923, + "step": 1143, + "token_acc": 0.2562888924376216 + }, + { + "epoch": 0.6707710348871299, + "grad_norm": 1.7345535778589711, + "learning_rate": 0.00013411488862837047, + "loss": 3.493476390838623, + "step": 1144, + "token_acc": 0.2517500955086711 + }, + { + "epoch": 0.6713573732043389, + "grad_norm": 1.9665978031881401, + "learning_rate": 0.00013423212192262605, + "loss": 3.4867167472839355, + "step": 1145, + "token_acc": 0.2516953431275492 + }, + { + "epoch": 0.671943711521548, + "grad_norm": 1.2055017270599082, + "learning_rate": 0.00013434935521688162, + "loss": 3.4312050342559814, + "step": 1146, + "token_acc": 0.26020613419226907 + }, + { + "epoch": 0.672530049838757, + "grad_norm": 1.9362557441138761, + "learning_rate": 0.00013446658851113717, + "loss": 3.4426937103271484, + "step": 1147, + "token_acc": 0.25910963571782586 + }, + { + "epoch": 0.673116388155966, + "grad_norm": 1.4747065249377906, + "learning_rate": 0.00013458382180539272, + "loss": 3.4932241439819336, + "step": 1148, + "token_acc": 0.25150536348048014 + }, + { + "epoch": 0.673702726473175, + "grad_norm": 1.7730607946554675, + "learning_rate": 0.0001347010550996483, + "loss": 3.4713006019592285, + "step": 1149, + "token_acc": 0.25320306808930026 + }, + { + "epoch": 0.6742890647903841, + "grad_norm": 1.4865355728025393, + "learning_rate": 0.00013481828839390387, + "loss": 3.488659381866455, + "step": 1150, + "token_acc": 0.25279314240933526 + }, + { + "epoch": 0.6748754031075931, + "grad_norm": 1.648445234210883, + "learning_rate": 0.00013493552168815945, + "loss": 3.4634125232696533, + "step": 1151, + "token_acc": 0.25615860421577596 + }, + { + "epoch": 0.6754617414248021, + "grad_norm": 1.4977150332918276, + "learning_rate": 0.000135052754982415, + "loss": 3.4460153579711914, + "step": 1152, + "token_acc": 0.2573434857078827 + }, + { + "epoch": 0.6760480797420111, + "grad_norm": 1.6378107882960005, + "learning_rate": 0.00013516998827667058, + "loss": 3.458651065826416, + "step": 1153, + "token_acc": 0.25469644059239316 + }, + { + "epoch": 0.6766344180592202, + "grad_norm": 1.4152790273997304, + "learning_rate": 0.00013528722157092615, + "loss": 3.4933724403381348, + "step": 1154, + "token_acc": 0.25211046502314705 + }, + { + "epoch": 0.6772207563764292, + "grad_norm": 2.382720307770914, + "learning_rate": 0.00013540445486518173, + "loss": 3.4635462760925293, + "step": 1155, + "token_acc": 0.2553938716700291 + }, + { + "epoch": 0.6778070946936382, + "grad_norm": 0.9658115323946195, + "learning_rate": 0.00013552168815943728, + "loss": 3.487952470779419, + "step": 1156, + "token_acc": 0.25169223508409233 + }, + { + "epoch": 0.6783934330108473, + "grad_norm": 2.370806679735023, + "learning_rate": 0.00013563892145369285, + "loss": 3.427581310272217, + "step": 1157, + "token_acc": 0.2600588493489181 + }, + { + "epoch": 0.6789797713280563, + "grad_norm": 1.545395581925495, + "learning_rate": 0.00013575615474794843, + "loss": 3.5274524688720703, + "step": 1158, + "token_acc": 0.24657122937012402 + }, + { + "epoch": 0.6795661096452653, + "grad_norm": 2.5188462999917864, + "learning_rate": 0.000135873388042204, + "loss": 3.491748809814453, + "step": 1159, + "token_acc": 0.2526028742489678 + }, + { + "epoch": 0.6801524479624743, + "grad_norm": 1.5118078808710111, + "learning_rate": 0.00013599062133645955, + "loss": 3.411062717437744, + "step": 1160, + "token_acc": 0.26083639642138845 + }, + { + "epoch": 0.6807387862796834, + "grad_norm": 1.715140864966689, + "learning_rate": 0.00013610785463071513, + "loss": 3.5262625217437744, + "step": 1161, + "token_acc": 0.24832881662149955 + }, + { + "epoch": 0.6813251245968924, + "grad_norm": 1.8745914458701944, + "learning_rate": 0.0001362250879249707, + "loss": 3.4538118839263916, + "step": 1162, + "token_acc": 0.2572644479944114 + }, + { + "epoch": 0.6819114629141014, + "grad_norm": 1.6597030765621454, + "learning_rate": 0.00013634232121922628, + "loss": 3.503361701965332, + "step": 1163, + "token_acc": 0.25056399655830935 + }, + { + "epoch": 0.6824978012313104, + "grad_norm": 2.075399261672626, + "learning_rate": 0.00013645955451348183, + "loss": 3.4680135250091553, + "step": 1164, + "token_acc": 0.25656342223268597 + }, + { + "epoch": 0.6830841395485195, + "grad_norm": 1.3848175296512513, + "learning_rate": 0.00013657678780773738, + "loss": 3.4982101917266846, + "step": 1165, + "token_acc": 0.24971137746221148 + }, + { + "epoch": 0.6836704778657285, + "grad_norm": 1.8366172051748701, + "learning_rate": 0.00013669402110199296, + "loss": 3.479870557785034, + "step": 1166, + "token_acc": 0.25457881128949855 + }, + { + "epoch": 0.6842568161829375, + "grad_norm": 1.6676678514974097, + "learning_rate": 0.00013681125439624853, + "loss": 3.505056858062744, + "step": 1167, + "token_acc": 0.2515033080573154 + }, + { + "epoch": 0.6848431545001465, + "grad_norm": 1.545090889989793, + "learning_rate": 0.0001369284876905041, + "loss": 3.493459701538086, + "step": 1168, + "token_acc": 0.25335046719247495 + }, + { + "epoch": 0.6854294928173557, + "grad_norm": 1.3231709050723404, + "learning_rate": 0.00013704572098475966, + "loss": 3.4779584407806396, + "step": 1169, + "token_acc": 0.25170333343946005 + }, + { + "epoch": 0.6860158311345647, + "grad_norm": 1.274085655715862, + "learning_rate": 0.00013716295427901524, + "loss": 3.4401192665100098, + "step": 1170, + "token_acc": 0.25725683596271526 + }, + { + "epoch": 0.6866021694517737, + "grad_norm": 1.5637680209566254, + "learning_rate": 0.0001372801875732708, + "loss": 3.4660308361053467, + "step": 1171, + "token_acc": 0.25411138938210914 + }, + { + "epoch": 0.6871885077689827, + "grad_norm": 2.0975606997572345, + "learning_rate": 0.0001373974208675264, + "loss": 3.4714128971099854, + "step": 1172, + "token_acc": 0.2526440723348163 + }, + { + "epoch": 0.6877748460861918, + "grad_norm": 1.2120288961775754, + "learning_rate": 0.00013751465416178197, + "loss": 3.46285343170166, + "step": 1173, + "token_acc": 0.2542201591511936 + }, + { + "epoch": 0.6883611844034008, + "grad_norm": 1.9373218951808908, + "learning_rate": 0.00013763188745603751, + "loss": 3.4868929386138916, + "step": 1174, + "token_acc": 0.2544079962912207 + }, + { + "epoch": 0.6889475227206098, + "grad_norm": 1.2102680020113552, + "learning_rate": 0.0001377491207502931, + "loss": 3.4891185760498047, + "step": 1175, + "token_acc": 0.25215280838980186 + }, + { + "epoch": 0.6895338610378188, + "grad_norm": 2.120038596659682, + "learning_rate": 0.00013786635404454867, + "loss": 3.500941038131714, + "step": 1176, + "token_acc": 0.2507066243395172 + }, + { + "epoch": 0.6901201993550279, + "grad_norm": 1.3143829428572822, + "learning_rate": 0.00013798358733880424, + "loss": 3.467639446258545, + "step": 1177, + "token_acc": 0.2514419411656458 + }, + { + "epoch": 0.6907065376722369, + "grad_norm": 2.216158579059092, + "learning_rate": 0.0001381008206330598, + "loss": 3.5305237770080566, + "step": 1178, + "token_acc": 0.24700482402920784 + }, + { + "epoch": 0.6912928759894459, + "grad_norm": 1.5039306542106392, + "learning_rate": 0.00013821805392731537, + "loss": 3.49605131149292, + "step": 1179, + "token_acc": 0.2527539731284977 + }, + { + "epoch": 0.6918792143066549, + "grad_norm": 1.4823825099735322, + "learning_rate": 0.00013833528722157095, + "loss": 3.4661059379577637, + "step": 1180, + "token_acc": 0.25359403415729426 + }, + { + "epoch": 0.692465552623864, + "grad_norm": 1.2336420752638715, + "learning_rate": 0.00013845252051582652, + "loss": 3.5125927925109863, + "step": 1181, + "token_acc": 0.24958010983545514 + }, + { + "epoch": 0.693051890941073, + "grad_norm": 1.5499329078256574, + "learning_rate": 0.00013856975381008207, + "loss": 3.4173507690429688, + "step": 1182, + "token_acc": 0.2592866948644761 + }, + { + "epoch": 0.693638229258282, + "grad_norm": 1.704060419983667, + "learning_rate": 0.00013868698710433762, + "loss": 3.436004161834717, + "step": 1183, + "token_acc": 0.2581195476575121 + }, + { + "epoch": 0.6942245675754911, + "grad_norm": 1.5624784944045353, + "learning_rate": 0.0001388042203985932, + "loss": 3.4286084175109863, + "step": 1184, + "token_acc": 0.2586951738775933 + }, + { + "epoch": 0.6948109058927001, + "grad_norm": 1.6598018768124778, + "learning_rate": 0.00013892145369284877, + "loss": 3.4594101905822754, + "step": 1185, + "token_acc": 0.2564892108369623 + }, + { + "epoch": 0.6953972442099091, + "grad_norm": 1.630253826126385, + "learning_rate": 0.00013903868698710435, + "loss": 3.4498209953308105, + "step": 1186, + "token_acc": 0.25624231551434823 + }, + { + "epoch": 0.6959835825271181, + "grad_norm": 1.2925787028343558, + "learning_rate": 0.0001391559202813599, + "loss": 3.47819185256958, + "step": 1187, + "token_acc": 0.2531844705595316 + }, + { + "epoch": 0.6965699208443272, + "grad_norm": 1.7395817545075576, + "learning_rate": 0.00013927315357561547, + "loss": 3.4408469200134277, + "step": 1188, + "token_acc": 0.2551191272939393 + }, + { + "epoch": 0.6971562591615362, + "grad_norm": 1.4333187255233182, + "learning_rate": 0.00013939038686987105, + "loss": 3.4492459297180176, + "step": 1189, + "token_acc": 0.25624443284895826 + }, + { + "epoch": 0.6977425974787452, + "grad_norm": 1.8954820162326889, + "learning_rate": 0.00013950762016412663, + "loss": 3.497751235961914, + "step": 1190, + "token_acc": 0.2515698723518011 + }, + { + "epoch": 0.6983289357959542, + "grad_norm": 1.0768710632639547, + "learning_rate": 0.00013962485345838218, + "loss": 3.466113567352295, + "step": 1191, + "token_acc": 0.2554972332765328 + }, + { + "epoch": 0.6989152741131633, + "grad_norm": 2.202932416719709, + "learning_rate": 0.00013974208675263775, + "loss": 3.4374327659606934, + "step": 1192, + "token_acc": 0.2571900485705379 + }, + { + "epoch": 0.6995016124303723, + "grad_norm": 1.2470936737497804, + "learning_rate": 0.00013985932004689333, + "loss": 3.480226516723633, + "step": 1193, + "token_acc": 0.2524677906487747 + }, + { + "epoch": 0.7000879507475813, + "grad_norm": 1.566575092916346, + "learning_rate": 0.0001399765533411489, + "loss": 3.4235100746154785, + "step": 1194, + "token_acc": 0.2607239874492492 + }, + { + "epoch": 0.7006742890647903, + "grad_norm": 1.6899843128382854, + "learning_rate": 0.00014009378663540445, + "loss": 3.486229181289673, + "step": 1195, + "token_acc": 0.25187803566081685 + }, + { + "epoch": 0.7012606273819995, + "grad_norm": 1.4562076013023741, + "learning_rate": 0.00014021101992966003, + "loss": 3.45029878616333, + "step": 1196, + "token_acc": 0.2562012248282901 + }, + { + "epoch": 0.7018469656992085, + "grad_norm": 1.800918602986552, + "learning_rate": 0.0001403282532239156, + "loss": 3.4462833404541016, + "step": 1197, + "token_acc": 0.25553232973492196 + }, + { + "epoch": 0.7024333040164175, + "grad_norm": 1.4128034542778685, + "learning_rate": 0.00014044548651817118, + "loss": 3.5032455921173096, + "step": 1198, + "token_acc": 0.24997202990068662 + }, + { + "epoch": 0.7030196423336265, + "grad_norm": 1.4769940162096624, + "learning_rate": 0.00014056271981242676, + "loss": 3.485568046569824, + "step": 1199, + "token_acc": 0.2522193805646135 + }, + { + "epoch": 0.7036059806508356, + "grad_norm": 1.3036482528323072, + "learning_rate": 0.00014067995310668228, + "loss": 3.474188804626465, + "step": 1200, + "token_acc": 0.2537372817175032 + }, + { + "epoch": 0.7041923189680446, + "grad_norm": 1.4723395310551526, + "learning_rate": 0.00014079718640093786, + "loss": 3.4770054817199707, + "step": 1201, + "token_acc": 0.25310733851583983 + }, + { + "epoch": 0.7047786572852536, + "grad_norm": 1.4346245786714835, + "learning_rate": 0.00014091441969519343, + "loss": 3.4641005992889404, + "step": 1202, + "token_acc": 0.2551397503438611 + }, + { + "epoch": 0.7053649956024626, + "grad_norm": 1.9910262359209665, + "learning_rate": 0.000141031652989449, + "loss": 3.499342679977417, + "step": 1203, + "token_acc": 0.2502527659057731 + }, + { + "epoch": 0.7059513339196717, + "grad_norm": 1.1008432105637287, + "learning_rate": 0.0001411488862837046, + "loss": 3.413872480392456, + "step": 1204, + "token_acc": 0.2591675108691064 + }, + { + "epoch": 0.7065376722368807, + "grad_norm": 1.9399993744358432, + "learning_rate": 0.00014126611957796014, + "loss": 3.451542615890503, + "step": 1205, + "token_acc": 0.2568799910138893 + }, + { + "epoch": 0.7071240105540897, + "grad_norm": 1.3098845892311908, + "learning_rate": 0.0001413833528722157, + "loss": 3.46754789352417, + "step": 1206, + "token_acc": 0.2540631990827233 + }, + { + "epoch": 0.7077103488712987, + "grad_norm": 1.9204628656422873, + "learning_rate": 0.0001415005861664713, + "loss": 3.489941120147705, + "step": 1207, + "token_acc": 0.25179534387438607 + }, + { + "epoch": 0.7082966871885078, + "grad_norm": 1.1355505445567018, + "learning_rate": 0.00014161781946072686, + "loss": 3.461859703063965, + "step": 1208, + "token_acc": 0.2556939287453718 + }, + { + "epoch": 0.7088830255057168, + "grad_norm": 1.7566338045040304, + "learning_rate": 0.00014173505275498241, + "loss": 3.433861255645752, + "step": 1209, + "token_acc": 0.25782065972398266 + }, + { + "epoch": 0.7094693638229258, + "grad_norm": 1.588510194498836, + "learning_rate": 0.000141852286049238, + "loss": 3.4759602546691895, + "step": 1210, + "token_acc": 0.25250954370965856 + }, + { + "epoch": 0.7100557021401348, + "grad_norm": 1.4002796477850517, + "learning_rate": 0.00014196951934349357, + "loss": 3.443500518798828, + "step": 1211, + "token_acc": 0.25674212854540723 + }, + { + "epoch": 0.7106420404573439, + "grad_norm": 1.4191876157312409, + "learning_rate": 0.00014208675263774914, + "loss": 3.4555506706237793, + "step": 1212, + "token_acc": 0.2563782834829119 + }, + { + "epoch": 0.7112283787745529, + "grad_norm": 1.5005901937434414, + "learning_rate": 0.0001422039859320047, + "loss": 3.430723190307617, + "step": 1213, + "token_acc": 0.25753400348646643 + }, + { + "epoch": 0.7118147170917619, + "grad_norm": 1.6088040769827598, + "learning_rate": 0.00014232121922626027, + "loss": 3.4346261024475098, + "step": 1214, + "token_acc": 0.2557810155872556 + }, + { + "epoch": 0.712401055408971, + "grad_norm": 1.3423283013158216, + "learning_rate": 0.00014243845252051584, + "loss": 3.4267687797546387, + "step": 1215, + "token_acc": 0.25865128660159714 + }, + { + "epoch": 0.71298739372618, + "grad_norm": 1.6098935344855423, + "learning_rate": 0.00014255568581477142, + "loss": 3.4264941215515137, + "step": 1216, + "token_acc": 0.25920328329785747 + }, + { + "epoch": 0.713573732043389, + "grad_norm": 2.020396576089131, + "learning_rate": 0.00014267291910902697, + "loss": 3.4637906551361084, + "step": 1217, + "token_acc": 0.25425219345065625 + }, + { + "epoch": 0.714160070360598, + "grad_norm": 1.6529506103097913, + "learning_rate": 0.00014279015240328252, + "loss": 3.469149112701416, + "step": 1218, + "token_acc": 0.2524846926964005 + }, + { + "epoch": 0.7147464086778071, + "grad_norm": 1.4243087086883681, + "learning_rate": 0.0001429073856975381, + "loss": 3.425438165664673, + "step": 1219, + "token_acc": 0.2580548409199622 + }, + { + "epoch": 0.7153327469950161, + "grad_norm": 1.747125321342865, + "learning_rate": 0.00014302461899179367, + "loss": 3.462268590927124, + "step": 1220, + "token_acc": 0.25417433336061224 + }, + { + "epoch": 0.7159190853122251, + "grad_norm": 1.1244870775445328, + "learning_rate": 0.00014314185228604925, + "loss": 3.441065549850464, + "step": 1221, + "token_acc": 0.2571446563058865 + }, + { + "epoch": 0.7165054236294341, + "grad_norm": 1.7202691526506555, + "learning_rate": 0.0001432590855803048, + "loss": 3.4279541969299316, + "step": 1222, + "token_acc": 0.25888663563480474 + }, + { + "epoch": 0.7170917619466433, + "grad_norm": 1.2019259449746946, + "learning_rate": 0.00014337631887456037, + "loss": 3.457341432571411, + "step": 1223, + "token_acc": 0.25481550847347406 + }, + { + "epoch": 0.7176781002638523, + "grad_norm": 1.4470131795694363, + "learning_rate": 0.00014349355216881595, + "loss": 3.3964061737060547, + "step": 1224, + "token_acc": 0.2638310887113639 + }, + { + "epoch": 0.7182644385810613, + "grad_norm": 1.3847022714095332, + "learning_rate": 0.00014361078546307153, + "loss": 3.4749650955200195, + "step": 1225, + "token_acc": 0.25443063008424854 + }, + { + "epoch": 0.7188507768982703, + "grad_norm": 2.4091107675152097, + "learning_rate": 0.00014372801875732708, + "loss": 3.4837698936462402, + "step": 1226, + "token_acc": 0.25130256462188916 + }, + { + "epoch": 0.7194371152154794, + "grad_norm": 1.070933155686379, + "learning_rate": 0.00014384525205158265, + "loss": 3.4488720893859863, + "step": 1227, + "token_acc": 0.2558907879417146 + }, + { + "epoch": 0.7200234535326884, + "grad_norm": 1.90006729372095, + "learning_rate": 0.00014396248534583823, + "loss": 3.4941225051879883, + "step": 1228, + "token_acc": 0.24938360225960488 + }, + { + "epoch": 0.7206097918498974, + "grad_norm": 1.4756717727350943, + "learning_rate": 0.0001440797186400938, + "loss": 3.41542911529541, + "step": 1229, + "token_acc": 0.2618272421771129 + }, + { + "epoch": 0.7211961301671064, + "grad_norm": 1.717199483069328, + "learning_rate": 0.00014419695193434935, + "loss": 3.458010673522949, + "step": 1230, + "token_acc": 0.2551103571437696 + }, + { + "epoch": 0.7217824684843155, + "grad_norm": 1.5982566554184876, + "learning_rate": 0.00014431418522860493, + "loss": 3.472919464111328, + "step": 1231, + "token_acc": 0.2528250758134579 + }, + { + "epoch": 0.7223688068015245, + "grad_norm": 1.5848855529324406, + "learning_rate": 0.0001444314185228605, + "loss": 3.4550588130950928, + "step": 1232, + "token_acc": 0.25651067677394257 + }, + { + "epoch": 0.7229551451187335, + "grad_norm": 1.1142469544379312, + "learning_rate": 0.00014454865181711608, + "loss": 3.4776453971862793, + "step": 1233, + "token_acc": 0.25244300518134716 + }, + { + "epoch": 0.7235414834359425, + "grad_norm": 1.4252964258088752, + "learning_rate": 0.00014466588511137166, + "loss": 3.4053268432617188, + "step": 1234, + "token_acc": 0.26168830280563043 + }, + { + "epoch": 0.7241278217531516, + "grad_norm": 1.5215669994356336, + "learning_rate": 0.0001447831184056272, + "loss": 3.419320583343506, + "step": 1235, + "token_acc": 0.2621442456807112 + }, + { + "epoch": 0.7247141600703606, + "grad_norm": 1.7620545535774552, + "learning_rate": 0.00014490035169988276, + "loss": 3.4716854095458984, + "step": 1236, + "token_acc": 0.2529282595010434 + }, + { + "epoch": 0.7253004983875696, + "grad_norm": 1.125677312131224, + "learning_rate": 0.00014501758499413833, + "loss": 3.4332993030548096, + "step": 1237, + "token_acc": 0.2582909167235137 + }, + { + "epoch": 0.7258868367047786, + "grad_norm": 1.6530519878535423, + "learning_rate": 0.0001451348182883939, + "loss": 3.438295841217041, + "step": 1238, + "token_acc": 0.25633007656415147 + }, + { + "epoch": 0.7264731750219877, + "grad_norm": 1.3717098277020645, + "learning_rate": 0.00014525205158264949, + "loss": 3.470151424407959, + "step": 1239, + "token_acc": 0.2530085862583296 + }, + { + "epoch": 0.7270595133391967, + "grad_norm": 1.3297734199312607, + "learning_rate": 0.00014536928487690504, + "loss": 3.403834819793701, + "step": 1240, + "token_acc": 0.2608286551005952 + }, + { + "epoch": 0.7276458516564057, + "grad_norm": 1.0204923456862767, + "learning_rate": 0.0001454865181711606, + "loss": 3.4251315593719482, + "step": 1241, + "token_acc": 0.2607074554425213 + }, + { + "epoch": 0.7282321899736148, + "grad_norm": 1.8614463934254661, + "learning_rate": 0.0001456037514654162, + "loss": 3.4099807739257812, + "step": 1242, + "token_acc": 0.2611788128381741 + }, + { + "epoch": 0.7288185282908238, + "grad_norm": 1.3041772868140586, + "learning_rate": 0.00014572098475967176, + "loss": 3.4974818229675293, + "step": 1243, + "token_acc": 0.25138140499575157 + }, + { + "epoch": 0.7294048666080328, + "grad_norm": 1.0564648533267627, + "learning_rate": 0.0001458382180539273, + "loss": 3.4022369384765625, + "step": 1244, + "token_acc": 0.26258460024501434 + }, + { + "epoch": 0.7299912049252418, + "grad_norm": 1.5660257008685003, + "learning_rate": 0.0001459554513481829, + "loss": 3.5017075538635254, + "step": 1245, + "token_acc": 0.2478392421708435 + }, + { + "epoch": 0.7305775432424509, + "grad_norm": 1.2967295612996657, + "learning_rate": 0.00014607268464243847, + "loss": 3.4555978775024414, + "step": 1246, + "token_acc": 0.25633834953517004 + }, + { + "epoch": 0.73116388155966, + "grad_norm": 1.6036217524756642, + "learning_rate": 0.00014618991793669404, + "loss": 3.4531102180480957, + "step": 1247, + "token_acc": 0.25551073004975366 + }, + { + "epoch": 0.731750219876869, + "grad_norm": 1.8233658190421305, + "learning_rate": 0.0001463071512309496, + "loss": 3.4363183975219727, + "step": 1248, + "token_acc": 0.25739716565920356 + }, + { + "epoch": 0.732336558194078, + "grad_norm": 1.4089515172355789, + "learning_rate": 0.00014642438452520517, + "loss": 3.4581356048583984, + "step": 1249, + "token_acc": 0.2550260601121613 + }, + { + "epoch": 0.7329228965112871, + "grad_norm": 1.607016573916611, + "learning_rate": 0.00014654161781946074, + "loss": 3.4439473152160645, + "step": 1250, + "token_acc": 0.25563950842057354 + }, + { + "epoch": 0.7335092348284961, + "grad_norm": 1.1982802132043384, + "learning_rate": 0.00014665885111371632, + "loss": 3.4321069717407227, + "step": 1251, + "token_acc": 0.2599147181673893 + }, + { + "epoch": 0.7340955731457051, + "grad_norm": 1.7916351410224547, + "learning_rate": 0.00014677608440797187, + "loss": 3.421072006225586, + "step": 1252, + "token_acc": 0.25949011141664224 + }, + { + "epoch": 0.7346819114629141, + "grad_norm": 1.0673689664702601, + "learning_rate": 0.00014689331770222742, + "loss": 3.436981439590454, + "step": 1253, + "token_acc": 0.2575099137726195 + }, + { + "epoch": 0.7352682497801232, + "grad_norm": 2.200834619563645, + "learning_rate": 0.000147010550996483, + "loss": 3.5111794471740723, + "step": 1254, + "token_acc": 0.24833392065005597 + }, + { + "epoch": 0.7358545880973322, + "grad_norm": 1.2114155715238122, + "learning_rate": 0.00014712778429073857, + "loss": 3.465855598449707, + "step": 1255, + "token_acc": 0.25337390507231616 + }, + { + "epoch": 0.7364409264145412, + "grad_norm": 1.8737492870695667, + "learning_rate": 0.00014724501758499415, + "loss": 3.4212727546691895, + "step": 1256, + "token_acc": 0.2583062956261309 + }, + { + "epoch": 0.7370272647317502, + "grad_norm": 1.2925024235484885, + "learning_rate": 0.0001473622508792497, + "loss": 3.516406536102295, + "step": 1257, + "token_acc": 0.24767110093880912 + }, + { + "epoch": 0.7376136030489593, + "grad_norm": 1.3301291188019166, + "learning_rate": 0.00014747948417350527, + "loss": 3.4706199169158936, + "step": 1258, + "token_acc": 0.2524147376225389 + }, + { + "epoch": 0.7381999413661683, + "grad_norm": 1.462833984668704, + "learning_rate": 0.00014759671746776085, + "loss": 3.42460298538208, + "step": 1259, + "token_acc": 0.2579313166414959 + }, + { + "epoch": 0.7387862796833773, + "grad_norm": 1.463724768341424, + "learning_rate": 0.00014771395076201643, + "loss": 3.4307332038879395, + "step": 1260, + "token_acc": 0.25681902280526564 + }, + { + "epoch": 0.7393726180005863, + "grad_norm": 1.9347344545582497, + "learning_rate": 0.00014783118405627197, + "loss": 3.4678549766540527, + "step": 1261, + "token_acc": 0.25190761193722877 + }, + { + "epoch": 0.7399589563177954, + "grad_norm": 1.071088548058867, + "learning_rate": 0.00014794841735052755, + "loss": 3.409794569015503, + "step": 1262, + "token_acc": 0.2600979207995055 + }, + { + "epoch": 0.7405452946350044, + "grad_norm": 1.3377183751567074, + "learning_rate": 0.00014806565064478313, + "loss": 3.426347255706787, + "step": 1263, + "token_acc": 0.2566860011934008 + }, + { + "epoch": 0.7411316329522134, + "grad_norm": 1.2992008834059154, + "learning_rate": 0.0001481828839390387, + "loss": 3.3940846920013428, + "step": 1264, + "token_acc": 0.2626419599212203 + }, + { + "epoch": 0.7417179712694224, + "grad_norm": 1.4215781057479917, + "learning_rate": 0.00014830011723329428, + "loss": 3.489041805267334, + "step": 1265, + "token_acc": 0.2505786647472776 + }, + { + "epoch": 0.7423043095866315, + "grad_norm": 1.6380426361759095, + "learning_rate": 0.00014841735052754983, + "loss": 3.400296688079834, + "step": 1266, + "token_acc": 0.26068273928385105 + }, + { + "epoch": 0.7428906479038405, + "grad_norm": 1.4624959953044716, + "learning_rate": 0.0001485345838218054, + "loss": 3.386765480041504, + "step": 1267, + "token_acc": 0.2634867093277663 + }, + { + "epoch": 0.7434769862210495, + "grad_norm": 1.1925947158925931, + "learning_rate": 0.00014865181711606098, + "loss": 3.405383348464966, + "step": 1268, + "token_acc": 0.26165629461706974 + }, + { + "epoch": 0.7440633245382586, + "grad_norm": 1.5621420987415842, + "learning_rate": 0.00014876905041031656, + "loss": 3.455237865447998, + "step": 1269, + "token_acc": 0.25608269723867205 + }, + { + "epoch": 0.7446496628554676, + "grad_norm": 1.2501024366577473, + "learning_rate": 0.0001488862837045721, + "loss": 3.451732635498047, + "step": 1270, + "token_acc": 0.25428297483582685 + }, + { + "epoch": 0.7452360011726766, + "grad_norm": 1.5867269645544448, + "learning_rate": 0.00014900351699882766, + "loss": 3.4027304649353027, + "step": 1271, + "token_acc": 0.26125793800224917 + }, + { + "epoch": 0.7458223394898856, + "grad_norm": 1.4362521735255311, + "learning_rate": 0.00014912075029308323, + "loss": 3.4019625186920166, + "step": 1272, + "token_acc": 0.2621157117107969 + }, + { + "epoch": 0.7464086778070947, + "grad_norm": 1.4031492824481668, + "learning_rate": 0.0001492379835873388, + "loss": 3.4212403297424316, + "step": 1273, + "token_acc": 0.25754086028153916 + }, + { + "epoch": 0.7469950161243037, + "grad_norm": 1.1736845539272112, + "learning_rate": 0.00014935521688159439, + "loss": 3.372968912124634, + "step": 1274, + "token_acc": 0.2637764211331364 + }, + { + "epoch": 0.7475813544415127, + "grad_norm": 2.1327757325866665, + "learning_rate": 0.00014947245017584993, + "loss": 3.4224917888641357, + "step": 1275, + "token_acc": 0.2572750514305629 + }, + { + "epoch": 0.7481676927587217, + "grad_norm": 0.9704083716251074, + "learning_rate": 0.0001495896834701055, + "loss": 3.469961643218994, + "step": 1276, + "token_acc": 0.2511131296796077 + }, + { + "epoch": 0.7487540310759309, + "grad_norm": 1.684596950507075, + "learning_rate": 0.0001497069167643611, + "loss": 3.4014203548431396, + "step": 1277, + "token_acc": 0.2611449381377652 + }, + { + "epoch": 0.7493403693931399, + "grad_norm": 1.495602383274155, + "learning_rate": 0.00014982415005861666, + "loss": 3.457026720046997, + "step": 1278, + "token_acc": 0.25410032171363417 + }, + { + "epoch": 0.7499267077103489, + "grad_norm": 1.1631171518798684, + "learning_rate": 0.0001499413833528722, + "loss": 3.440493106842041, + "step": 1279, + "token_acc": 0.2563788021259717 + }, + { + "epoch": 0.7505130460275579, + "grad_norm": 1.4239185209019283, + "learning_rate": 0.0001500586166471278, + "loss": 3.4252877235412598, + "step": 1280, + "token_acc": 0.2577691483379487 + }, + { + "epoch": 0.751099384344767, + "grad_norm": 1.503648105025796, + "learning_rate": 0.00015017584994138336, + "loss": 3.4105794429779053, + "step": 1281, + "token_acc": 0.25994719461785376 + }, + { + "epoch": 0.751685722661976, + "grad_norm": 1.4989788864720033, + "learning_rate": 0.00015029308323563894, + "loss": 3.3793835639953613, + "step": 1282, + "token_acc": 0.2641823459503802 + }, + { + "epoch": 0.752272060979185, + "grad_norm": 1.1259577353541548, + "learning_rate": 0.0001504103165298945, + "loss": 3.4287967681884766, + "step": 1283, + "token_acc": 0.258247277481624 + }, + { + "epoch": 0.752858399296394, + "grad_norm": 1.365508010565486, + "learning_rate": 0.00015052754982415007, + "loss": 3.4712073802948, + "step": 1284, + "token_acc": 0.2511922578844111 + }, + { + "epoch": 0.7534447376136031, + "grad_norm": 1.3879643463695752, + "learning_rate": 0.00015064478311840564, + "loss": 3.4485044479370117, + "step": 1285, + "token_acc": 0.2555265362183873 + }, + { + "epoch": 0.7540310759308121, + "grad_norm": 1.083976351073512, + "learning_rate": 0.00015076201641266122, + "loss": 3.3934457302093506, + "step": 1286, + "token_acc": 0.26216669351915256 + }, + { + "epoch": 0.7546174142480211, + "grad_norm": 1.9741451219382833, + "learning_rate": 0.00015087924970691677, + "loss": 3.487504005432129, + "step": 1287, + "token_acc": 0.2494730862545856 + }, + { + "epoch": 0.7552037525652301, + "grad_norm": 1.1428712878378593, + "learning_rate": 0.00015099648300117234, + "loss": 3.423361301422119, + "step": 1288, + "token_acc": 0.2589780945256001 + }, + { + "epoch": 0.7557900908824392, + "grad_norm": 2.101406189706504, + "learning_rate": 0.0001511137162954279, + "loss": 3.5219991207122803, + "step": 1289, + "token_acc": 0.24635049896560543 + }, + { + "epoch": 0.7563764291996482, + "grad_norm": 1.1749592828379896, + "learning_rate": 0.00015123094958968347, + "loss": 3.4067013263702393, + "step": 1290, + "token_acc": 0.2583606817438206 + }, + { + "epoch": 0.7569627675168572, + "grad_norm": 1.7453724325699769, + "learning_rate": 0.00015134818288393905, + "loss": 3.4472789764404297, + "step": 1291, + "token_acc": 0.25690357139000186 + }, + { + "epoch": 0.7575491058340662, + "grad_norm": 1.4255935976090912, + "learning_rate": 0.0001514654161781946, + "loss": 3.451256513595581, + "step": 1292, + "token_acc": 0.2533840859858418 + }, + { + "epoch": 0.7581354441512753, + "grad_norm": 1.3609357668653277, + "learning_rate": 0.00015158264947245017, + "loss": 3.456042528152466, + "step": 1293, + "token_acc": 0.2553718680096888 + }, + { + "epoch": 0.7587217824684843, + "grad_norm": 1.1425740824308384, + "learning_rate": 0.00015169988276670575, + "loss": 3.444441318511963, + "step": 1294, + "token_acc": 0.25538292193245155 + }, + { + "epoch": 0.7593081207856933, + "grad_norm": 1.349462185130714, + "learning_rate": 0.00015181711606096132, + "loss": 3.437311887741089, + "step": 1295, + "token_acc": 0.25674294578147616 + }, + { + "epoch": 0.7598944591029023, + "grad_norm": 1.1815147139675841, + "learning_rate": 0.0001519343493552169, + "loss": 3.3935465812683105, + "step": 1296, + "token_acc": 0.2591039459381589 + }, + { + "epoch": 0.7604807974201114, + "grad_norm": 1.3949443956168464, + "learning_rate": 0.00015205158264947245, + "loss": 3.437002182006836, + "step": 1297, + "token_acc": 0.2576903945336021 + }, + { + "epoch": 0.7610671357373204, + "grad_norm": 1.2568676986391838, + "learning_rate": 0.00015216881594372803, + "loss": 3.4558348655700684, + "step": 1298, + "token_acc": 0.2528200862920414 + }, + { + "epoch": 0.7616534740545294, + "grad_norm": 1.723296330765033, + "learning_rate": 0.0001522860492379836, + "loss": 3.407172203063965, + "step": 1299, + "token_acc": 0.2594460144525037 + }, + { + "epoch": 0.7622398123717385, + "grad_norm": 1.184642490472284, + "learning_rate": 0.00015240328253223918, + "loss": 3.3951706886291504, + "step": 1300, + "token_acc": 0.2610623112112838 + }, + { + "epoch": 0.7628261506889475, + "grad_norm": 1.391509690869005, + "learning_rate": 0.00015252051582649473, + "loss": 3.4404056072235107, + "step": 1301, + "token_acc": 0.25584921351277345 + }, + { + "epoch": 0.7634124890061565, + "grad_norm": 1.808794960352588, + "learning_rate": 0.0001526377491207503, + "loss": 3.410461902618408, + "step": 1302, + "token_acc": 0.2592892654233827 + }, + { + "epoch": 0.7639988273233655, + "grad_norm": 1.4129400271120751, + "learning_rate": 0.00015275498241500588, + "loss": 3.376692295074463, + "step": 1303, + "token_acc": 0.26259100967086174 + }, + { + "epoch": 0.7645851656405747, + "grad_norm": 1.3652975227714779, + "learning_rate": 0.00015287221570926146, + "loss": 3.443056583404541, + "step": 1304, + "token_acc": 0.25679442869800867 + }, + { + "epoch": 0.7651715039577837, + "grad_norm": 1.6500751105813, + "learning_rate": 0.000152989449003517, + "loss": 3.4749786853790283, + "step": 1305, + "token_acc": 0.25160368078042383 + }, + { + "epoch": 0.7657578422749927, + "grad_norm": 1.3643491048592706, + "learning_rate": 0.00015310668229777258, + "loss": 3.4558467864990234, + "step": 1306, + "token_acc": 0.25366106791004345 + }, + { + "epoch": 0.7663441805922017, + "grad_norm": 1.1966787537750312, + "learning_rate": 0.00015322391559202813, + "loss": 3.441133499145508, + "step": 1307, + "token_acc": 0.25537933176417443 + }, + { + "epoch": 0.7669305189094108, + "grad_norm": 1.5129240749984183, + "learning_rate": 0.0001533411488862837, + "loss": 3.3756179809570312, + "step": 1308, + "token_acc": 0.26349618123386875 + }, + { + "epoch": 0.7675168572266198, + "grad_norm": 1.1951269731142888, + "learning_rate": 0.00015345838218053928, + "loss": 3.39105224609375, + "step": 1309, + "token_acc": 0.25945828131054377 + }, + { + "epoch": 0.7681031955438288, + "grad_norm": 1.894693856226019, + "learning_rate": 0.00015357561547479483, + "loss": 3.372954845428467, + "step": 1310, + "token_acc": 0.2660941659615742 + }, + { + "epoch": 0.7686895338610378, + "grad_norm": 1.2533924620598944, + "learning_rate": 0.0001536928487690504, + "loss": 3.451615571975708, + "step": 1311, + "token_acc": 0.25490759007235286 + }, + { + "epoch": 0.7692758721782469, + "grad_norm": 1.6002890668463903, + "learning_rate": 0.00015381008206330599, + "loss": 3.440751075744629, + "step": 1312, + "token_acc": 0.2539912917271408 + }, + { + "epoch": 0.7698622104954559, + "grad_norm": 1.6960338715166758, + "learning_rate": 0.00015392731535756156, + "loss": 3.4475626945495605, + "step": 1313, + "token_acc": 0.2562255252516928 + }, + { + "epoch": 0.7704485488126649, + "grad_norm": 1.6382056215658634, + "learning_rate": 0.0001540445486518171, + "loss": 3.4409759044647217, + "step": 1314, + "token_acc": 0.2567159275735413 + }, + { + "epoch": 0.7710348871298739, + "grad_norm": 1.112109441841703, + "learning_rate": 0.0001541617819460727, + "loss": 3.400407314300537, + "step": 1315, + "token_acc": 0.2598234620379898 + }, + { + "epoch": 0.771621225447083, + "grad_norm": 1.7300435782980532, + "learning_rate": 0.00015427901524032826, + "loss": 3.423166036605835, + "step": 1316, + "token_acc": 0.25859233614990024 + }, + { + "epoch": 0.772207563764292, + "grad_norm": 1.0481917826667875, + "learning_rate": 0.00015439624853458384, + "loss": 3.36118221282959, + "step": 1317, + "token_acc": 0.2661040212904172 + }, + { + "epoch": 0.772793902081501, + "grad_norm": 1.5133302293363555, + "learning_rate": 0.0001545134818288394, + "loss": 3.4344353675842285, + "step": 1318, + "token_acc": 0.2549398361836664 + }, + { + "epoch": 0.77338024039871, + "grad_norm": 1.3923986861306115, + "learning_rate": 0.00015463071512309497, + "loss": 3.4415740966796875, + "step": 1319, + "token_acc": 0.2538437828151693 + }, + { + "epoch": 0.7739665787159191, + "grad_norm": 1.26715567321541, + "learning_rate": 0.00015474794841735054, + "loss": 3.4111063480377197, + "step": 1320, + "token_acc": 0.26099016709775424 + }, + { + "epoch": 0.7745529170331281, + "grad_norm": 1.6039435091472998, + "learning_rate": 0.00015486518171160612, + "loss": 3.3888659477233887, + "step": 1321, + "token_acc": 0.26226347675832495 + }, + { + "epoch": 0.7751392553503371, + "grad_norm": 1.299338894205598, + "learning_rate": 0.00015498241500586167, + "loss": 3.432020425796509, + "step": 1322, + "token_acc": 0.2575829078896564 + }, + { + "epoch": 0.7757255936675461, + "grad_norm": 2.1581629296393174, + "learning_rate": 0.00015509964830011724, + "loss": 3.439408302307129, + "step": 1323, + "token_acc": 0.2557061556649163 + }, + { + "epoch": 0.7763119319847552, + "grad_norm": 1.0742571991536303, + "learning_rate": 0.0001552168815943728, + "loss": 3.416118860244751, + "step": 1324, + "token_acc": 0.25807778076515525 + }, + { + "epoch": 0.7768982703019642, + "grad_norm": 2.070348002401218, + "learning_rate": 0.00015533411488862837, + "loss": 3.4130992889404297, + "step": 1325, + "token_acc": 0.25778490509608987 + }, + { + "epoch": 0.7774846086191732, + "grad_norm": 1.3415453751304076, + "learning_rate": 0.00015545134818288395, + "loss": 3.4504311084747314, + "step": 1326, + "token_acc": 0.25675405082858765 + }, + { + "epoch": 0.7780709469363823, + "grad_norm": 1.585903227171164, + "learning_rate": 0.0001555685814771395, + "loss": 3.4499378204345703, + "step": 1327, + "token_acc": 0.25374800427513955 + }, + { + "epoch": 0.7786572852535913, + "grad_norm": 1.2295335910085605, + "learning_rate": 0.00015568581477139507, + "loss": 3.4052810668945312, + "step": 1328, + "token_acc": 0.26061243094850095 + }, + { + "epoch": 0.7792436235708003, + "grad_norm": 1.5659238291239055, + "learning_rate": 0.00015580304806565065, + "loss": 3.402463912963867, + "step": 1329, + "token_acc": 0.2592266290350665 + }, + { + "epoch": 0.7798299618880093, + "grad_norm": 1.4527356133551028, + "learning_rate": 0.00015592028135990622, + "loss": 3.4638214111328125, + "step": 1330, + "token_acc": 0.2531017698294788 + }, + { + "epoch": 0.7804163002052185, + "grad_norm": 0.9328913251748306, + "learning_rate": 0.0001560375146541618, + "loss": 3.3938584327697754, + "step": 1331, + "token_acc": 0.2614598071754304 + }, + { + "epoch": 0.7810026385224275, + "grad_norm": 1.278046136398353, + "learning_rate": 0.00015615474794841735, + "loss": 3.432962417602539, + "step": 1332, + "token_acc": 0.25674984414214236 + }, + { + "epoch": 0.7815889768396365, + "grad_norm": 1.6059735919672473, + "learning_rate": 0.00015627198124267293, + "loss": 3.447343587875366, + "step": 1333, + "token_acc": 0.2535945372241119 + }, + { + "epoch": 0.7821753151568455, + "grad_norm": 1.4555663814285882, + "learning_rate": 0.0001563892145369285, + "loss": 3.4815449714660645, + "step": 1334, + "token_acc": 0.25159082057194737 + }, + { + "epoch": 0.7827616534740546, + "grad_norm": 1.3086545765548627, + "learning_rate": 0.00015650644783118408, + "loss": 3.4199166297912598, + "step": 1335, + "token_acc": 0.2569769906803189 + }, + { + "epoch": 0.7833479917912636, + "grad_norm": 1.4963351416344797, + "learning_rate": 0.00015662368112543963, + "loss": 3.386457920074463, + "step": 1336, + "token_acc": 0.26096128968056526 + }, + { + "epoch": 0.7839343301084726, + "grad_norm": 1.5883719751491485, + "learning_rate": 0.0001567409144196952, + "loss": 3.3737833499908447, + "step": 1337, + "token_acc": 0.2638557454155593 + }, + { + "epoch": 0.7845206684256816, + "grad_norm": 1.2353488782439634, + "learning_rate": 0.00015685814771395078, + "loss": 3.436501979827881, + "step": 1338, + "token_acc": 0.2563241222176783 + }, + { + "epoch": 0.7851070067428907, + "grad_norm": 1.7294240285955818, + "learning_rate": 0.00015697538100820636, + "loss": 3.4303267002105713, + "step": 1339, + "token_acc": 0.2563434290566703 + }, + { + "epoch": 0.7856933450600997, + "grad_norm": 1.246095593696147, + "learning_rate": 0.0001570926143024619, + "loss": 3.4334189891815186, + "step": 1340, + "token_acc": 0.2560173915307411 + }, + { + "epoch": 0.7862796833773087, + "grad_norm": 1.2776804162160535, + "learning_rate": 0.00015720984759671748, + "loss": 3.436596393585205, + "step": 1341, + "token_acc": 0.255089413858814 + }, + { + "epoch": 0.7868660216945177, + "grad_norm": 1.514762385377181, + "learning_rate": 0.00015732708089097303, + "loss": 3.430767059326172, + "step": 1342, + "token_acc": 0.2558594645833015 + }, + { + "epoch": 0.7874523600117268, + "grad_norm": 1.4027348139873257, + "learning_rate": 0.0001574443141852286, + "loss": 3.3967700004577637, + "step": 1343, + "token_acc": 0.26001397638893686 + }, + { + "epoch": 0.7880386983289358, + "grad_norm": 0.849016016734809, + "learning_rate": 0.00015756154747948418, + "loss": 3.4489917755126953, + "step": 1344, + "token_acc": 0.2538228204150157 + }, + { + "epoch": 0.7886250366461448, + "grad_norm": 1.2740348555365664, + "learning_rate": 0.00015767878077373973, + "loss": 3.395728826522827, + "step": 1345, + "token_acc": 0.2604047013591269 + }, + { + "epoch": 0.7892113749633538, + "grad_norm": 1.7118483109604976, + "learning_rate": 0.0001577960140679953, + "loss": 3.40120792388916, + "step": 1346, + "token_acc": 0.25980947824911343 + }, + { + "epoch": 0.7897977132805629, + "grad_norm": 1.0549361486081228, + "learning_rate": 0.00015791324736225089, + "loss": 3.4053001403808594, + "step": 1347, + "token_acc": 0.2620256645529124 + }, + { + "epoch": 0.7903840515977719, + "grad_norm": 1.950275266239379, + "learning_rate": 0.00015803048065650646, + "loss": 3.4412758350372314, + "step": 1348, + "token_acc": 0.254904207412934 + }, + { + "epoch": 0.7909703899149809, + "grad_norm": 1.1228903245861406, + "learning_rate": 0.000158147713950762, + "loss": 3.432236433029175, + "step": 1349, + "token_acc": 0.2571157247051683 + }, + { + "epoch": 0.7915567282321899, + "grad_norm": 1.3719579410268588, + "learning_rate": 0.0001582649472450176, + "loss": 3.3730268478393555, + "step": 1350, + "token_acc": 0.26413195183073307 + }, + { + "epoch": 0.792143066549399, + "grad_norm": 1.3933650515460838, + "learning_rate": 0.00015838218053927316, + "loss": 3.457695245742798, + "step": 1351, + "token_acc": 0.25246354793104175 + }, + { + "epoch": 0.792729404866608, + "grad_norm": 1.2398987593095994, + "learning_rate": 0.00015849941383352874, + "loss": 3.423532009124756, + "step": 1352, + "token_acc": 0.2594626607177961 + }, + { + "epoch": 0.793315743183817, + "grad_norm": 1.2534835683702565, + "learning_rate": 0.0001586166471277843, + "loss": 3.4474854469299316, + "step": 1353, + "token_acc": 0.25201994779531195 + }, + { + "epoch": 0.7939020815010261, + "grad_norm": 1.285133784299824, + "learning_rate": 0.00015873388042203987, + "loss": 3.47641658782959, + "step": 1354, + "token_acc": 0.2512447501694354 + }, + { + "epoch": 0.7944884198182351, + "grad_norm": 1.109232567566872, + "learning_rate": 0.00015885111371629544, + "loss": 3.397341012954712, + "step": 1355, + "token_acc": 0.26022165954090737 + }, + { + "epoch": 0.7950747581354441, + "grad_norm": 1.762289433959861, + "learning_rate": 0.00015896834701055102, + "loss": 3.3867056369781494, + "step": 1356, + "token_acc": 0.2630039572503675 + }, + { + "epoch": 0.7956610964526531, + "grad_norm": 1.120508650628853, + "learning_rate": 0.0001590855803048066, + "loss": 3.3182373046875, + "step": 1357, + "token_acc": 0.2717155273177257 + }, + { + "epoch": 0.7962474347698623, + "grad_norm": 1.5455193392268172, + "learning_rate": 0.00015920281359906214, + "loss": 3.47340989112854, + "step": 1358, + "token_acc": 0.24862044616470957 + }, + { + "epoch": 0.7968337730870713, + "grad_norm": 1.257763918145666, + "learning_rate": 0.00015932004689331772, + "loss": 3.4118173122406006, + "step": 1359, + "token_acc": 0.2599451845241889 + }, + { + "epoch": 0.7974201114042803, + "grad_norm": 1.5886698994149002, + "learning_rate": 0.00015943728018757327, + "loss": 3.4121928215026855, + "step": 1360, + "token_acc": 0.25824333705439445 + }, + { + "epoch": 0.7980064497214893, + "grad_norm": 1.1914023578162718, + "learning_rate": 0.00015955451348182884, + "loss": 3.391681432723999, + "step": 1361, + "token_acc": 0.2615334035739896 + }, + { + "epoch": 0.7985927880386984, + "grad_norm": 1.4961224966281925, + "learning_rate": 0.00015967174677608442, + "loss": 3.343844413757324, + "step": 1362, + "token_acc": 0.26985119112605627 + }, + { + "epoch": 0.7991791263559074, + "grad_norm": 1.0959747188625273, + "learning_rate": 0.00015978898007033997, + "loss": 3.4245657920837402, + "step": 1363, + "token_acc": 0.25810379864372207 + }, + { + "epoch": 0.7997654646731164, + "grad_norm": 1.7365757187062323, + "learning_rate": 0.00015990621336459555, + "loss": 3.386023998260498, + "step": 1364, + "token_acc": 0.2617228654632686 + }, + { + "epoch": 0.8003518029903254, + "grad_norm": 1.5458352769590975, + "learning_rate": 0.00016002344665885112, + "loss": 3.4225351810455322, + "step": 1365, + "token_acc": 0.2577579675334096 + }, + { + "epoch": 0.8009381413075345, + "grad_norm": 1.1889062179803223, + "learning_rate": 0.0001601406799531067, + "loss": 3.4193105697631836, + "step": 1366, + "token_acc": 0.25750404225916945 + }, + { + "epoch": 0.8015244796247435, + "grad_norm": 1.2835901201805882, + "learning_rate": 0.00016025791324736225, + "loss": 3.412130355834961, + "step": 1367, + "token_acc": 0.2601837543628296 + }, + { + "epoch": 0.8021108179419525, + "grad_norm": 1.2469611497385342, + "learning_rate": 0.00016037514654161782, + "loss": 3.4162330627441406, + "step": 1368, + "token_acc": 0.2566167064388572 + }, + { + "epoch": 0.8026971562591615, + "grad_norm": 1.2230045755904724, + "learning_rate": 0.0001604923798358734, + "loss": 3.3724398612976074, + "step": 1369, + "token_acc": 0.264433783198075 + }, + { + "epoch": 0.8032834945763706, + "grad_norm": 1.5382512758511608, + "learning_rate": 0.00016060961313012898, + "loss": 3.3815901279449463, + "step": 1370, + "token_acc": 0.26097927461139897 + }, + { + "epoch": 0.8038698328935796, + "grad_norm": 1.2621043706789439, + "learning_rate": 0.00016072684642438453, + "loss": 3.412454128265381, + "step": 1371, + "token_acc": 0.25885567801793125 + }, + { + "epoch": 0.8044561712107886, + "grad_norm": 1.5347829120672927, + "learning_rate": 0.0001608440797186401, + "loss": 3.4170382022857666, + "step": 1372, + "token_acc": 0.2598218303370546 + }, + { + "epoch": 0.8050425095279976, + "grad_norm": 1.11545032567644, + "learning_rate": 0.00016096131301289568, + "loss": 3.407500743865967, + "step": 1373, + "token_acc": 0.25997555783126564 + }, + { + "epoch": 0.8056288478452067, + "grad_norm": 1.4832187245841353, + "learning_rate": 0.00016107854630715126, + "loss": 3.4161930084228516, + "step": 1374, + "token_acc": 0.25709930931113406 + }, + { + "epoch": 0.8062151861624157, + "grad_norm": 1.2183620021343144, + "learning_rate": 0.0001611957796014068, + "loss": 3.401772975921631, + "step": 1375, + "token_acc": 0.2607040862194731 + }, + { + "epoch": 0.8068015244796247, + "grad_norm": 1.6897382285213896, + "learning_rate": 0.00016131301289566238, + "loss": 3.3984644412994385, + "step": 1376, + "token_acc": 0.2604184746425713 + }, + { + "epoch": 0.8073878627968337, + "grad_norm": 1.148681955178217, + "learning_rate": 0.00016143024618991796, + "loss": 3.4220640659332275, + "step": 1377, + "token_acc": 0.25745807360267076 + }, + { + "epoch": 0.8079742011140428, + "grad_norm": 1.3871973480585504, + "learning_rate": 0.0001615474794841735, + "loss": 3.4250235557556152, + "step": 1378, + "token_acc": 0.25745404825903884 + }, + { + "epoch": 0.8085605394312518, + "grad_norm": 1.2059697467200812, + "learning_rate": 0.00016166471277842908, + "loss": 3.391935348510742, + "step": 1379, + "token_acc": 0.2591476636645117 + }, + { + "epoch": 0.8091468777484608, + "grad_norm": 1.6435658372062054, + "learning_rate": 0.00016178194607268463, + "loss": 3.442290782928467, + "step": 1380, + "token_acc": 0.25447408569698116 + }, + { + "epoch": 0.8097332160656698, + "grad_norm": 1.0838856957383594, + "learning_rate": 0.0001618991793669402, + "loss": 3.424408435821533, + "step": 1381, + "token_acc": 0.255436966909302 + }, + { + "epoch": 0.810319554382879, + "grad_norm": 1.3007345446042187, + "learning_rate": 0.00016201641266119578, + "loss": 3.4230763912200928, + "step": 1382, + "token_acc": 0.25723885236577204 + }, + { + "epoch": 0.810905892700088, + "grad_norm": 1.4520047599653023, + "learning_rate": 0.00016213364595545136, + "loss": 3.4378623962402344, + "step": 1383, + "token_acc": 0.2536172648810061 + }, + { + "epoch": 0.811492231017297, + "grad_norm": 1.2279829832455855, + "learning_rate": 0.0001622508792497069, + "loss": 3.3899765014648438, + "step": 1384, + "token_acc": 0.2612733355143047 + }, + { + "epoch": 0.8120785693345061, + "grad_norm": 1.492650011748883, + "learning_rate": 0.00016236811254396249, + "loss": 3.385045051574707, + "step": 1385, + "token_acc": 0.2609395538203836 + }, + { + "epoch": 0.8126649076517151, + "grad_norm": 1.1957336848313698, + "learning_rate": 0.00016248534583821806, + "loss": 3.39316463470459, + "step": 1386, + "token_acc": 0.2610308908506195 + }, + { + "epoch": 0.8132512459689241, + "grad_norm": 0.9913932941187936, + "learning_rate": 0.00016260257913247364, + "loss": 3.370640754699707, + "step": 1387, + "token_acc": 0.2630724498988089 + }, + { + "epoch": 0.8138375842861331, + "grad_norm": 1.778919795056691, + "learning_rate": 0.0001627198124267292, + "loss": 3.451665163040161, + "step": 1388, + "token_acc": 0.25280098208899393 + }, + { + "epoch": 0.8144239226033422, + "grad_norm": 1.4065760392925368, + "learning_rate": 0.00016283704572098476, + "loss": 3.3990657329559326, + "step": 1389, + "token_acc": 0.25854908427720236 + }, + { + "epoch": 0.8150102609205512, + "grad_norm": 1.5865502333380712, + "learning_rate": 0.00016295427901524034, + "loss": 3.3746392726898193, + "step": 1390, + "token_acc": 0.2614234161466408 + }, + { + "epoch": 0.8155965992377602, + "grad_norm": 0.8139752770538708, + "learning_rate": 0.00016307151230949592, + "loss": 3.3624978065490723, + "step": 1391, + "token_acc": 0.26318448483848683 + }, + { + "epoch": 0.8161829375549692, + "grad_norm": 1.6002682530181054, + "learning_rate": 0.0001631887456037515, + "loss": 3.418271541595459, + "step": 1392, + "token_acc": 0.2594736223651742 + }, + { + "epoch": 0.8167692758721783, + "grad_norm": 1.095166228887982, + "learning_rate": 0.00016330597889800704, + "loss": 3.3780651092529297, + "step": 1393, + "token_acc": 0.2626238116887429 + }, + { + "epoch": 0.8173556141893873, + "grad_norm": 1.4719030043321082, + "learning_rate": 0.00016342321219226262, + "loss": 3.386111259460449, + "step": 1394, + "token_acc": 0.25987115905994823 + }, + { + "epoch": 0.8179419525065963, + "grad_norm": 1.033948798933131, + "learning_rate": 0.0001635404454865182, + "loss": 3.3902335166931152, + "step": 1395, + "token_acc": 0.26015015350120285 + }, + { + "epoch": 0.8185282908238053, + "grad_norm": 1.165489899796703, + "learning_rate": 0.00016365767878077374, + "loss": 3.3687996864318848, + "step": 1396, + "token_acc": 0.263056588230324 + }, + { + "epoch": 0.8191146291410144, + "grad_norm": 1.123544971930076, + "learning_rate": 0.00016377491207502932, + "loss": 3.416421890258789, + "step": 1397, + "token_acc": 0.25979316409747855 + }, + { + "epoch": 0.8197009674582234, + "grad_norm": 1.386911827937946, + "learning_rate": 0.00016389214536928487, + "loss": 3.457995891571045, + "step": 1398, + "token_acc": 0.25363085330885976 + }, + { + "epoch": 0.8202873057754324, + "grad_norm": 1.589659105437681, + "learning_rate": 0.00016400937866354045, + "loss": 3.4378838539123535, + "step": 1399, + "token_acc": 0.25467336992293643 + }, + { + "epoch": 0.8208736440926414, + "grad_norm": 1.2484402205481129, + "learning_rate": 0.00016412661195779602, + "loss": 3.386201858520508, + "step": 1400, + "token_acc": 0.2603389146661287 + }, + { + "epoch": 0.8214599824098505, + "grad_norm": 1.7403796109007217, + "learning_rate": 0.0001642438452520516, + "loss": 3.4289164543151855, + "step": 1401, + "token_acc": 0.2564007197617557 + }, + { + "epoch": 0.8220463207270595, + "grad_norm": 0.9383213961790757, + "learning_rate": 0.00016436107854630715, + "loss": 3.3875303268432617, + "step": 1402, + "token_acc": 0.26220817272226665 + }, + { + "epoch": 0.8226326590442685, + "grad_norm": 1.735972178394569, + "learning_rate": 0.00016447831184056272, + "loss": 3.3960866928100586, + "step": 1403, + "token_acc": 0.26150425467806787 + }, + { + "epoch": 0.8232189973614775, + "grad_norm": 1.0435792060814726, + "learning_rate": 0.0001645955451348183, + "loss": 3.356168270111084, + "step": 1404, + "token_acc": 0.2658659432290763 + }, + { + "epoch": 0.8238053356786866, + "grad_norm": 1.1976902584261255, + "learning_rate": 0.00016471277842907388, + "loss": 3.3646349906921387, + "step": 1405, + "token_acc": 0.26182173567103045 + }, + { + "epoch": 0.8243916739958956, + "grad_norm": 1.2784395426836739, + "learning_rate": 0.00016483001172332943, + "loss": 3.3935470581054688, + "step": 1406, + "token_acc": 0.258898987515772 + }, + { + "epoch": 0.8249780123131046, + "grad_norm": 1.4494234385569889, + "learning_rate": 0.000164947245017585, + "loss": 3.4402318000793457, + "step": 1407, + "token_acc": 0.2535252485579404 + }, + { + "epoch": 0.8255643506303136, + "grad_norm": 1.0203063742982343, + "learning_rate": 0.00016506447831184058, + "loss": 3.3333630561828613, + "step": 1408, + "token_acc": 0.2681763254949554 + }, + { + "epoch": 0.8261506889475227, + "grad_norm": 1.622378942597262, + "learning_rate": 0.00016518171160609615, + "loss": 3.4576644897460938, + "step": 1409, + "token_acc": 0.252765525169086 + }, + { + "epoch": 0.8267370272647317, + "grad_norm": 1.1218495729331617, + "learning_rate": 0.0001652989449003517, + "loss": 3.37540340423584, + "step": 1410, + "token_acc": 0.2617770668673943 + }, + { + "epoch": 0.8273233655819408, + "grad_norm": 1.1296709021138347, + "learning_rate": 0.00016541617819460728, + "loss": 3.4006543159484863, + "step": 1411, + "token_acc": 0.2601903311762467 + }, + { + "epoch": 0.8279097038991499, + "grad_norm": 1.1791375750025688, + "learning_rate": 0.00016553341148886286, + "loss": 3.3970694541931152, + "step": 1412, + "token_acc": 0.25899754965961636 + }, + { + "epoch": 0.8284960422163589, + "grad_norm": 0.8829390034570248, + "learning_rate": 0.0001656506447831184, + "loss": 3.346217393875122, + "step": 1413, + "token_acc": 0.2659625198584681 + }, + { + "epoch": 0.8290823805335679, + "grad_norm": 1.2655513374992022, + "learning_rate": 0.00016576787807737398, + "loss": 3.424145221710205, + "step": 1414, + "token_acc": 0.2559237366869084 + }, + { + "epoch": 0.8296687188507769, + "grad_norm": 1.1905826909476929, + "learning_rate": 0.00016588511137162953, + "loss": 3.3977012634277344, + "step": 1415, + "token_acc": 0.26070246134311365 + }, + { + "epoch": 0.830255057167986, + "grad_norm": 1.5004251628761933, + "learning_rate": 0.0001660023446658851, + "loss": 3.4330170154571533, + "step": 1416, + "token_acc": 0.2561364192522185 + }, + { + "epoch": 0.830841395485195, + "grad_norm": 1.757025167113541, + "learning_rate": 0.00016611957796014068, + "loss": 3.3519184589385986, + "step": 1417, + "token_acc": 0.26574610198664134 + }, + { + "epoch": 0.831427733802404, + "grad_norm": 0.9523233453133809, + "learning_rate": 0.00016623681125439626, + "loss": 3.4260971546173096, + "step": 1418, + "token_acc": 0.25653466896227894 + }, + { + "epoch": 0.832014072119613, + "grad_norm": 1.8447147161589188, + "learning_rate": 0.0001663540445486518, + "loss": 3.4153528213500977, + "step": 1419, + "token_acc": 0.2566249853682866 + }, + { + "epoch": 0.8326004104368221, + "grad_norm": 1.208719964117776, + "learning_rate": 0.00016647127784290739, + "loss": 3.3849730491638184, + "step": 1420, + "token_acc": 0.2612123694450133 + }, + { + "epoch": 0.8331867487540311, + "grad_norm": 1.3798265107313714, + "learning_rate": 0.00016658851113716296, + "loss": 3.442805290222168, + "step": 1421, + "token_acc": 0.25360380085504053 + }, + { + "epoch": 0.8337730870712401, + "grad_norm": 1.2747954913225212, + "learning_rate": 0.00016670574443141854, + "loss": 3.4103610515594482, + "step": 1422, + "token_acc": 0.25752999994664305 + }, + { + "epoch": 0.8343594253884491, + "grad_norm": 1.514706727116007, + "learning_rate": 0.00016682297772567411, + "loss": 3.3485260009765625, + "step": 1423, + "token_acc": 0.26686710305982647 + }, + { + "epoch": 0.8349457637056582, + "grad_norm": 1.0380132802134088, + "learning_rate": 0.00016694021101992966, + "loss": 3.3454668521881104, + "step": 1424, + "token_acc": 0.2657586972612879 + }, + { + "epoch": 0.8355321020228672, + "grad_norm": 1.3767539039099006, + "learning_rate": 0.00016705744431418524, + "loss": 3.3448781967163086, + "step": 1425, + "token_acc": 0.26729262288914446 + }, + { + "epoch": 0.8361184403400762, + "grad_norm": 1.3993449161884737, + "learning_rate": 0.00016717467760844082, + "loss": 3.3844680786132812, + "step": 1426, + "token_acc": 0.2601063046689782 + }, + { + "epoch": 0.8367047786572852, + "grad_norm": 1.371283751056096, + "learning_rate": 0.0001672919109026964, + "loss": 3.40334415435791, + "step": 1427, + "token_acc": 0.2597881625743528 + }, + { + "epoch": 0.8372911169744943, + "grad_norm": 1.152520742035871, + "learning_rate": 0.00016740914419695194, + "loss": 3.4200756549835205, + "step": 1428, + "token_acc": 0.2566177504073463 + }, + { + "epoch": 0.8378774552917033, + "grad_norm": 1.7393925132629913, + "learning_rate": 0.00016752637749120752, + "loss": 3.3893885612487793, + "step": 1429, + "token_acc": 0.2589688966794248 + }, + { + "epoch": 0.8384637936089123, + "grad_norm": 1.0337838621521869, + "learning_rate": 0.0001676436107854631, + "loss": 3.398236036300659, + "step": 1430, + "token_acc": 0.2590882267295784 + }, + { + "epoch": 0.8390501319261213, + "grad_norm": 1.5778013181855102, + "learning_rate": 0.00016776084407971864, + "loss": 3.4039158821105957, + "step": 1431, + "token_acc": 0.2594672722469467 + }, + { + "epoch": 0.8396364702433304, + "grad_norm": 0.9642733717561321, + "learning_rate": 0.00016787807737397422, + "loss": 3.3822526931762695, + "step": 1432, + "token_acc": 0.2612234161482514 + }, + { + "epoch": 0.8402228085605394, + "grad_norm": 1.5432076240472155, + "learning_rate": 0.00016799531066822977, + "loss": 3.4229960441589355, + "step": 1433, + "token_acc": 0.25693975760302645 + }, + { + "epoch": 0.8408091468777484, + "grad_norm": 1.5409483700047024, + "learning_rate": 0.00016811254396248535, + "loss": 3.388065814971924, + "step": 1434, + "token_acc": 0.2598596917352489 + }, + { + "epoch": 0.8413954851949574, + "grad_norm": 1.2065574350500252, + "learning_rate": 0.00016822977725674092, + "loss": 3.385983467102051, + "step": 1435, + "token_acc": 0.26186015079438696 + }, + { + "epoch": 0.8419818235121665, + "grad_norm": 1.3630722912432809, + "learning_rate": 0.0001683470105509965, + "loss": 3.424725294113159, + "step": 1436, + "token_acc": 0.2560455528432426 + }, + { + "epoch": 0.8425681618293756, + "grad_norm": 1.2030244597933781, + "learning_rate": 0.00016846424384525205, + "loss": 3.4501466751098633, + "step": 1437, + "token_acc": 0.25450108416901945 + }, + { + "epoch": 0.8431545001465846, + "grad_norm": 1.0413621962361068, + "learning_rate": 0.00016858147713950762, + "loss": 3.409201145172119, + "step": 1438, + "token_acc": 0.2596036829779237 + }, + { + "epoch": 0.8437408384637937, + "grad_norm": 1.2623221674477745, + "learning_rate": 0.0001686987104337632, + "loss": 3.385174512863159, + "step": 1439, + "token_acc": 0.2593272105823389 + }, + { + "epoch": 0.8443271767810027, + "grad_norm": 0.9411913690571712, + "learning_rate": 0.00016881594372801878, + "loss": 3.38513445854187, + "step": 1440, + "token_acc": 0.2620564172446704 + }, + { + "epoch": 0.8449135150982117, + "grad_norm": 1.360864186045538, + "learning_rate": 0.00016893317702227432, + "loss": 3.4050650596618652, + "step": 1441, + "token_acc": 0.2587739324443522 + }, + { + "epoch": 0.8454998534154207, + "grad_norm": 1.1054797246575565, + "learning_rate": 0.0001690504103165299, + "loss": 3.3806333541870117, + "step": 1442, + "token_acc": 0.25974824519608286 + }, + { + "epoch": 0.8460861917326298, + "grad_norm": 1.4179040184340541, + "learning_rate": 0.00016916764361078548, + "loss": 3.431645393371582, + "step": 1443, + "token_acc": 0.2547193630874358 + }, + { + "epoch": 0.8466725300498388, + "grad_norm": 0.9801711189000787, + "learning_rate": 0.00016928487690504105, + "loss": 3.3983826637268066, + "step": 1444, + "token_acc": 0.25908235005167984 + }, + { + "epoch": 0.8472588683670478, + "grad_norm": 1.2106105538420087, + "learning_rate": 0.0001694021101992966, + "loss": 3.380951404571533, + "step": 1445, + "token_acc": 0.2609729458551294 + }, + { + "epoch": 0.8478452066842568, + "grad_norm": 1.3818651652596927, + "learning_rate": 0.00016951934349355218, + "loss": 3.404722213745117, + "step": 1446, + "token_acc": 0.25812670153223144 + }, + { + "epoch": 0.8484315450014659, + "grad_norm": 1.3789259248888228, + "learning_rate": 0.00016963657678780776, + "loss": 3.359693765640259, + "step": 1447, + "token_acc": 0.2657556477441266 + }, + { + "epoch": 0.8490178833186749, + "grad_norm": 1.108831893018942, + "learning_rate": 0.00016975381008206333, + "loss": 3.412679672241211, + "step": 1448, + "token_acc": 0.25687653173773756 + }, + { + "epoch": 0.8496042216358839, + "grad_norm": 1.2391722519744244, + "learning_rate": 0.00016987104337631888, + "loss": 3.39201283454895, + "step": 1449, + "token_acc": 0.26091004481529106 + }, + { + "epoch": 0.8501905599530929, + "grad_norm": 1.218245224599716, + "learning_rate": 0.00016998827667057443, + "loss": 3.380063772201538, + "step": 1450, + "token_acc": 0.26086066380566164 + }, + { + "epoch": 0.850776898270302, + "grad_norm": 1.4912963455771948, + "learning_rate": 0.00017010550996483, + "loss": 3.3462018966674805, + "step": 1451, + "token_acc": 0.2675874434800686 + }, + { + "epoch": 0.851363236587511, + "grad_norm": 1.1009741632522507, + "learning_rate": 0.00017022274325908558, + "loss": 3.3902649879455566, + "step": 1452, + "token_acc": 0.2602115225749865 + }, + { + "epoch": 0.85194957490472, + "grad_norm": 1.4848246642260061, + "learning_rate": 0.00017033997655334116, + "loss": 3.4019291400909424, + "step": 1453, + "token_acc": 0.2597050539924878 + }, + { + "epoch": 0.852535913221929, + "grad_norm": 1.079156558361002, + "learning_rate": 0.0001704572098475967, + "loss": 3.396552801132202, + "step": 1454, + "token_acc": 0.2582360587379828 + }, + { + "epoch": 0.8531222515391381, + "grad_norm": 1.342325168555911, + "learning_rate": 0.00017057444314185228, + "loss": 3.3619723320007324, + "step": 1455, + "token_acc": 0.26495436483215856 + }, + { + "epoch": 0.8537085898563471, + "grad_norm": 1.3079130800654015, + "learning_rate": 0.00017069167643610786, + "loss": 3.2954182624816895, + "step": 1456, + "token_acc": 0.27105714749903537 + }, + { + "epoch": 0.8542949281735561, + "grad_norm": 1.284983108830768, + "learning_rate": 0.00017080890973036344, + "loss": 3.3426520824432373, + "step": 1457, + "token_acc": 0.2663694074568128 + }, + { + "epoch": 0.8548812664907651, + "grad_norm": 1.4309767515396352, + "learning_rate": 0.000170926143024619, + "loss": 3.397672414779663, + "step": 1458, + "token_acc": 0.26012984336356143 + }, + { + "epoch": 0.8554676048079742, + "grad_norm": 1.1185046909742629, + "learning_rate": 0.00017104337631887456, + "loss": 3.316408634185791, + "step": 1459, + "token_acc": 0.27010468388273534 + }, + { + "epoch": 0.8560539431251832, + "grad_norm": 1.5050914891082074, + "learning_rate": 0.00017116060961313014, + "loss": 3.3816070556640625, + "step": 1460, + "token_acc": 0.26188591728174637 + }, + { + "epoch": 0.8566402814423922, + "grad_norm": 1.218411932089343, + "learning_rate": 0.00017127784290738572, + "loss": 3.3504467010498047, + "step": 1461, + "token_acc": 0.26462275727212514 + }, + { + "epoch": 0.8572266197596012, + "grad_norm": 1.3706023121231508, + "learning_rate": 0.0001713950762016413, + "loss": 3.39461088180542, + "step": 1462, + "token_acc": 0.26109746758562413 + }, + { + "epoch": 0.8578129580768104, + "grad_norm": 0.9611026830318272, + "learning_rate": 0.00017151230949589684, + "loss": 3.3955650329589844, + "step": 1463, + "token_acc": 0.2602593257114245 + }, + { + "epoch": 0.8583992963940194, + "grad_norm": 1.3148266847507588, + "learning_rate": 0.00017162954279015242, + "loss": 3.368884801864624, + "step": 1464, + "token_acc": 0.2636086704765741 + }, + { + "epoch": 0.8589856347112284, + "grad_norm": 1.1069858969393984, + "learning_rate": 0.000171746776084408, + "loss": 3.3595385551452637, + "step": 1465, + "token_acc": 0.2638883811294543 + }, + { + "epoch": 0.8595719730284375, + "grad_norm": 1.3789698013931453, + "learning_rate": 0.00017186400937866357, + "loss": 3.3402113914489746, + "step": 1466, + "token_acc": 0.266412021149323 + }, + { + "epoch": 0.8601583113456465, + "grad_norm": 1.2340979412692332, + "learning_rate": 0.00017198124267291912, + "loss": 3.3974380493164062, + "step": 1467, + "token_acc": 0.25799578504774023 + }, + { + "epoch": 0.8607446496628555, + "grad_norm": 1.2950079205083143, + "learning_rate": 0.00017209847596717467, + "loss": 3.3972349166870117, + "step": 1468, + "token_acc": 0.2576024040391237 + }, + { + "epoch": 0.8613309879800645, + "grad_norm": 1.062214385558007, + "learning_rate": 0.00017221570926143024, + "loss": 3.3747265338897705, + "step": 1469, + "token_acc": 0.26251473168687633 + }, + { + "epoch": 0.8619173262972736, + "grad_norm": 1.4292231887001252, + "learning_rate": 0.00017233294255568582, + "loss": 3.3396902084350586, + "step": 1470, + "token_acc": 0.2678973850748157 + }, + { + "epoch": 0.8625036646144826, + "grad_norm": 1.2545980116133342, + "learning_rate": 0.0001724501758499414, + "loss": 3.417757987976074, + "step": 1471, + "token_acc": 0.25574512002091915 + }, + { + "epoch": 0.8630900029316916, + "grad_norm": 0.8717300775605415, + "learning_rate": 0.00017256740914419695, + "loss": 3.4178900718688965, + "step": 1472, + "token_acc": 0.25856996608206484 + }, + { + "epoch": 0.8636763412489006, + "grad_norm": 1.4890831767095827, + "learning_rate": 0.00017268464243845252, + "loss": 3.3594353199005127, + "step": 1473, + "token_acc": 0.265992410650493 + }, + { + "epoch": 0.8642626795661097, + "grad_norm": 0.9579228720723092, + "learning_rate": 0.0001728018757327081, + "loss": 3.413370370864868, + "step": 1474, + "token_acc": 0.2575329530414322 + }, + { + "epoch": 0.8648490178833187, + "grad_norm": 1.155826149920576, + "learning_rate": 0.00017291910902696367, + "loss": 3.3810291290283203, + "step": 1475, + "token_acc": 0.25937102297327436 + }, + { + "epoch": 0.8654353562005277, + "grad_norm": 1.399884311989567, + "learning_rate": 0.00017303634232121922, + "loss": 3.3536834716796875, + "step": 1476, + "token_acc": 0.2650819792508051 + }, + { + "epoch": 0.8660216945177367, + "grad_norm": 1.2756653677508316, + "learning_rate": 0.0001731535756154748, + "loss": 3.3735735416412354, + "step": 1477, + "token_acc": 0.2613435984024083 + }, + { + "epoch": 0.8666080328349458, + "grad_norm": 1.0320424687456473, + "learning_rate": 0.00017327080890973038, + "loss": 3.41560697555542, + "step": 1478, + "token_acc": 0.25624816779042187 + }, + { + "epoch": 0.8671943711521548, + "grad_norm": 1.2321800203382742, + "learning_rate": 0.00017338804220398595, + "loss": 3.3585915565490723, + "step": 1479, + "token_acc": 0.2626471017819138 + }, + { + "epoch": 0.8677807094693638, + "grad_norm": 1.1478764139345106, + "learning_rate": 0.0001735052754982415, + "loss": 3.3904266357421875, + "step": 1480, + "token_acc": 0.26105803395183574 + }, + { + "epoch": 0.8683670477865728, + "grad_norm": 1.0322712865679524, + "learning_rate": 0.00017362250879249708, + "loss": 3.3808116912841797, + "step": 1481, + "token_acc": 0.2612883553896913 + }, + { + "epoch": 0.8689533861037819, + "grad_norm": 1.6175135364430304, + "learning_rate": 0.00017373974208675265, + "loss": 3.324183940887451, + "step": 1482, + "token_acc": 0.26901215572439974 + }, + { + "epoch": 0.8695397244209909, + "grad_norm": 0.8772822431254642, + "learning_rate": 0.00017385697538100823, + "loss": 3.369882583618164, + "step": 1483, + "token_acc": 0.2621472591631528 + }, + { + "epoch": 0.8701260627381999, + "grad_norm": 1.1641519696498677, + "learning_rate": 0.00017397420867526378, + "loss": 3.393336772918701, + "step": 1484, + "token_acc": 0.25840881388702086 + }, + { + "epoch": 0.8707124010554089, + "grad_norm": 1.2882711860033051, + "learning_rate": 0.00017409144196951933, + "loss": 3.4106829166412354, + "step": 1485, + "token_acc": 0.2574341184670473 + }, + { + "epoch": 0.871298739372618, + "grad_norm": 1.2626286397657405, + "learning_rate": 0.0001742086752637749, + "loss": 3.404301404953003, + "step": 1486, + "token_acc": 0.25690334006230003 + }, + { + "epoch": 0.871885077689827, + "grad_norm": 1.1819986252413377, + "learning_rate": 0.00017432590855803048, + "loss": 3.369124412536621, + "step": 1487, + "token_acc": 0.26201105658610085 + }, + { + "epoch": 0.872471416007036, + "grad_norm": 1.2573971511765212, + "learning_rate": 0.00017444314185228606, + "loss": 3.408914089202881, + "step": 1488, + "token_acc": 0.25828872275893644 + }, + { + "epoch": 0.873057754324245, + "grad_norm": 1.100159807291389, + "learning_rate": 0.00017456037514654163, + "loss": 3.3941423892974854, + "step": 1489, + "token_acc": 0.26022142840368545 + }, + { + "epoch": 0.8736440926414542, + "grad_norm": 1.1743512324022334, + "learning_rate": 0.00017467760844079718, + "loss": 3.3964481353759766, + "step": 1490, + "token_acc": 0.25867709547625317 + }, + { + "epoch": 0.8742304309586632, + "grad_norm": 1.2954899078211197, + "learning_rate": 0.00017479484173505276, + "loss": 3.376728057861328, + "step": 1491, + "token_acc": 0.2615496460684905 + }, + { + "epoch": 0.8748167692758722, + "grad_norm": 1.2763621724988437, + "learning_rate": 0.00017491207502930834, + "loss": 3.3485350608825684, + "step": 1492, + "token_acc": 0.2655099026138166 + }, + { + "epoch": 0.8754031075930812, + "grad_norm": 0.9991407635116363, + "learning_rate": 0.0001750293083235639, + "loss": 3.3698904514312744, + "step": 1493, + "token_acc": 0.2607958864768984 + }, + { + "epoch": 0.8759894459102903, + "grad_norm": 1.352455401779166, + "learning_rate": 0.00017514654161781946, + "loss": 3.3584060668945312, + "step": 1494, + "token_acc": 0.26509359311277486 + }, + { + "epoch": 0.8765757842274993, + "grad_norm": 1.1208433807693403, + "learning_rate": 0.00017526377491207504, + "loss": 3.3936424255371094, + "step": 1495, + "token_acc": 0.25898503057794126 + }, + { + "epoch": 0.8771621225447083, + "grad_norm": 1.5152089916871343, + "learning_rate": 0.00017538100820633061, + "loss": 3.3777639865875244, + "step": 1496, + "token_acc": 0.2621632182858071 + }, + { + "epoch": 0.8777484608619174, + "grad_norm": 0.736659725372689, + "learning_rate": 0.0001754982415005862, + "loss": 3.4409842491149902, + "step": 1497, + "token_acc": 0.2521883962115106 + }, + { + "epoch": 0.8783347991791264, + "grad_norm": 1.1438099619520186, + "learning_rate": 0.00017561547479484174, + "loss": 3.3725433349609375, + "step": 1498, + "token_acc": 0.2612237375598968 + }, + { + "epoch": 0.8789211374963354, + "grad_norm": 1.6226355212468893, + "learning_rate": 0.00017573270808909732, + "loss": 3.3909592628479004, + "step": 1499, + "token_acc": 0.25940590002381725 + }, + { + "epoch": 0.8795074758135444, + "grad_norm": 0.9613854092073866, + "learning_rate": 0.0001758499413833529, + "loss": 3.383650064468384, + "step": 1500, + "token_acc": 0.26161368531566925 + }, + { + "epoch": 0.8800938141307535, + "grad_norm": 0.9310447066820396, + "learning_rate": 0.00017596717467760847, + "loss": 3.367323637008667, + "step": 1501, + "token_acc": 0.26356278589925874 + }, + { + "epoch": 0.8806801524479625, + "grad_norm": 1.0902444785263428, + "learning_rate": 0.00017608440797186402, + "loss": 3.354609489440918, + "step": 1502, + "token_acc": 0.26554700512764196 + }, + { + "epoch": 0.8812664907651715, + "grad_norm": 1.4774117537193938, + "learning_rate": 0.00017620164126611957, + "loss": 3.387373685836792, + "step": 1503, + "token_acc": 0.25984581780804317 + }, + { + "epoch": 0.8818528290823805, + "grad_norm": 2.266694979750916, + "learning_rate": 0.00017631887456037514, + "loss": 3.407223701477051, + "step": 1504, + "token_acc": 0.25702938882074367 + }, + { + "epoch": 0.8824391673995896, + "grad_norm": 2.0749896393951426, + "learning_rate": 0.00017643610785463072, + "loss": 3.404712677001953, + "step": 1505, + "token_acc": 0.26099025840006607 + }, + { + "epoch": 0.8830255057167986, + "grad_norm": 1.173192163339113, + "learning_rate": 0.0001765533411488863, + "loss": 3.351816177368164, + "step": 1506, + "token_acc": 0.2640559527221185 + }, + { + "epoch": 0.8836118440340076, + "grad_norm": 1.452009753822364, + "learning_rate": 0.00017667057444314185, + "loss": 3.4304556846618652, + "step": 1507, + "token_acc": 0.25553283149286343 + }, + { + "epoch": 0.8841981823512166, + "grad_norm": 1.15685785181313, + "learning_rate": 0.00017678780773739742, + "loss": 3.407529354095459, + "step": 1508, + "token_acc": 0.25884024471705913 + }, + { + "epoch": 0.8847845206684257, + "grad_norm": 1.5138501965965592, + "learning_rate": 0.000176905041031653, + "loss": 3.3573224544525146, + "step": 1509, + "token_acc": 0.2630266569036829 + }, + { + "epoch": 0.8853708589856347, + "grad_norm": 1.0157207828911852, + "learning_rate": 0.00017702227432590857, + "loss": 3.323700428009033, + "step": 1510, + "token_acc": 0.2691512717812135 + }, + { + "epoch": 0.8859571973028437, + "grad_norm": 1.3384678807577561, + "learning_rate": 0.00017713950762016412, + "loss": 3.406172037124634, + "step": 1511, + "token_acc": 0.25799224993945263 + }, + { + "epoch": 0.8865435356200527, + "grad_norm": 1.2471878125708253, + "learning_rate": 0.0001772567409144197, + "loss": 3.3650341033935547, + "step": 1512, + "token_acc": 0.2642705110952654 + }, + { + "epoch": 0.8871298739372618, + "grad_norm": 1.519185393671763, + "learning_rate": 0.00017737397420867528, + "loss": 3.360307216644287, + "step": 1513, + "token_acc": 0.26373175542406313 + }, + { + "epoch": 0.8877162122544708, + "grad_norm": 0.9706358535167096, + "learning_rate": 0.00017749120750293085, + "loss": 3.426914930343628, + "step": 1514, + "token_acc": 0.252681631322408 + }, + { + "epoch": 0.8883025505716798, + "grad_norm": 1.2134110848710862, + "learning_rate": 0.00017760844079718643, + "loss": 3.417804718017578, + "step": 1515, + "token_acc": 0.2581786868065955 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 1.6261002168992242, + "learning_rate": 0.00017772567409144198, + "loss": 3.3730387687683105, + "step": 1516, + "token_acc": 0.2605859012068107 + }, + { + "epoch": 0.889475227206098, + "grad_norm": 1.0997783048125969, + "learning_rate": 0.00017784290738569755, + "loss": 3.37394642829895, + "step": 1517, + "token_acc": 0.2603151199541986 + }, + { + "epoch": 0.890061565523307, + "grad_norm": 1.6285489879453978, + "learning_rate": 0.00017796014067995313, + "loss": 3.3897604942321777, + "step": 1518, + "token_acc": 0.26049355181506895 + }, + { + "epoch": 0.890647903840516, + "grad_norm": 3.3286981841905225, + "learning_rate": 0.0001780773739742087, + "loss": 3.4052653312683105, + "step": 1519, + "token_acc": 0.26048819927977573 + }, + { + "epoch": 0.891234242157725, + "grad_norm": 1.159283016464403, + "learning_rate": 0.00017819460726846423, + "loss": 3.361818313598633, + "step": 1520, + "token_acc": 0.26199120707289775 + }, + { + "epoch": 0.8918205804749341, + "grad_norm": 1.8574366638650486, + "learning_rate": 0.0001783118405627198, + "loss": 3.392158269882202, + "step": 1521, + "token_acc": 0.25933242648280447 + }, + { + "epoch": 0.8924069187921431, + "grad_norm": 1.2350760832240921, + "learning_rate": 0.00017842907385697538, + "loss": 3.419607162475586, + "step": 1522, + "token_acc": 0.25626366190631006 + }, + { + "epoch": 0.8929932571093521, + "grad_norm": 1.3560927403181309, + "learning_rate": 0.00017854630715123096, + "loss": 3.3976871967315674, + "step": 1523, + "token_acc": 0.25769128754226445 + }, + { + "epoch": 0.8935795954265612, + "grad_norm": 1.1181138982280656, + "learning_rate": 0.00017866354044548653, + "loss": 3.3918638229370117, + "step": 1524, + "token_acc": 0.2582132367106412 + }, + { + "epoch": 0.8941659337437702, + "grad_norm": 1.554557886780218, + "learning_rate": 0.00017878077373974208, + "loss": 3.3687663078308105, + "step": 1525, + "token_acc": 0.2614903793607753 + }, + { + "epoch": 0.8947522720609792, + "grad_norm": 1.1530624476758216, + "learning_rate": 0.00017889800703399766, + "loss": 3.374950647354126, + "step": 1526, + "token_acc": 0.26278817244505454 + }, + { + "epoch": 0.8953386103781882, + "grad_norm": 1.333841294396085, + "learning_rate": 0.00017901524032825324, + "loss": 3.3191957473754883, + "step": 1527, + "token_acc": 0.26790004551083263 + }, + { + "epoch": 0.8959249486953973, + "grad_norm": 0.9422572212953042, + "learning_rate": 0.0001791324736225088, + "loss": 3.421724319458008, + "step": 1528, + "token_acc": 0.2570019734002097 + }, + { + "epoch": 0.8965112870126063, + "grad_norm": 1.3299214955353553, + "learning_rate": 0.00017924970691676436, + "loss": 3.3493521213531494, + "step": 1529, + "token_acc": 0.26553432711582686 + }, + { + "epoch": 0.8970976253298153, + "grad_norm": 1.4589638858468008, + "learning_rate": 0.00017936694021101994, + "loss": 3.321837902069092, + "step": 1530, + "token_acc": 0.2685723435320426 + }, + { + "epoch": 0.8976839636470243, + "grad_norm": 1.2870542763683752, + "learning_rate": 0.00017948417350527551, + "loss": 3.3878555297851562, + "step": 1531, + "token_acc": 0.25975829316325505 + }, + { + "epoch": 0.8982703019642334, + "grad_norm": 1.2187544875158403, + "learning_rate": 0.0001796014067995311, + "loss": 3.3984103202819824, + "step": 1532, + "token_acc": 0.25735855262312646 + }, + { + "epoch": 0.8988566402814424, + "grad_norm": 1.29561914753019, + "learning_rate": 0.00017971864009378664, + "loss": 3.3379642963409424, + "step": 1533, + "token_acc": 0.2644837250747305 + }, + { + "epoch": 0.8994429785986514, + "grad_norm": 1.2966455411514421, + "learning_rate": 0.00017983587338804222, + "loss": 3.3390920162200928, + "step": 1534, + "token_acc": 0.26739066089357233 + }, + { + "epoch": 0.9000293169158604, + "grad_norm": 1.1386317992267077, + "learning_rate": 0.0001799531066822978, + "loss": 3.3433074951171875, + "step": 1535, + "token_acc": 0.2656255790928296 + }, + { + "epoch": 0.9006156552330695, + "grad_norm": 1.312571750111663, + "learning_rate": 0.00018007033997655337, + "loss": 3.3589138984680176, + "step": 1536, + "token_acc": 0.2636712168848914 + }, + { + "epoch": 0.9012019935502785, + "grad_norm": 1.211894350862937, + "learning_rate": 0.00018018757327080892, + "loss": 3.3316750526428223, + "step": 1537, + "token_acc": 0.2656379854089685 + }, + { + "epoch": 0.9017883318674875, + "grad_norm": 1.6156276018651505, + "learning_rate": 0.00018030480656506447, + "loss": 3.342571258544922, + "step": 1538, + "token_acc": 0.26371886229368413 + }, + { + "epoch": 0.9023746701846965, + "grad_norm": 0.9816790098360255, + "learning_rate": 0.00018042203985932004, + "loss": 3.3893649578094482, + "step": 1539, + "token_acc": 0.2592479339932253 + }, + { + "epoch": 0.9029610085019056, + "grad_norm": 1.4254902800405698, + "learning_rate": 0.00018053927315357562, + "loss": 3.345700740814209, + "step": 1540, + "token_acc": 0.2653448586075908 + }, + { + "epoch": 0.9035473468191146, + "grad_norm": 1.2825574459255813, + "learning_rate": 0.0001806565064478312, + "loss": 3.3366165161132812, + "step": 1541, + "token_acc": 0.2659552316501822 + }, + { + "epoch": 0.9041336851363236, + "grad_norm": 1.4129078528307166, + "learning_rate": 0.00018077373974208674, + "loss": 3.340075969696045, + "step": 1542, + "token_acc": 0.2659273755164166 + }, + { + "epoch": 0.9047200234535326, + "grad_norm": 0.9069217276918115, + "learning_rate": 0.00018089097303634232, + "loss": 3.3417882919311523, + "step": 1543, + "token_acc": 0.26521126684839 + }, + { + "epoch": 0.9053063617707418, + "grad_norm": 1.5657331153023728, + "learning_rate": 0.0001810082063305979, + "loss": 3.3857264518737793, + "step": 1544, + "token_acc": 0.26106904070891346 + }, + { + "epoch": 0.9058927000879508, + "grad_norm": 1.1012884221426182, + "learning_rate": 0.00018112543962485347, + "loss": 3.3069815635681152, + "step": 1545, + "token_acc": 0.27012238945341804 + }, + { + "epoch": 0.9064790384051598, + "grad_norm": 1.1739170478926193, + "learning_rate": 0.00018124267291910902, + "loss": 3.280078887939453, + "step": 1546, + "token_acc": 0.2727946511868975 + }, + { + "epoch": 0.9070653767223688, + "grad_norm": 1.175053454563694, + "learning_rate": 0.0001813599062133646, + "loss": 3.3163278102874756, + "step": 1547, + "token_acc": 0.26934326329982056 + }, + { + "epoch": 0.9076517150395779, + "grad_norm": 1.24818481620032, + "learning_rate": 0.00018147713950762018, + "loss": 3.3843674659729004, + "step": 1548, + "token_acc": 0.2582754371568118 + }, + { + "epoch": 0.9082380533567869, + "grad_norm": 1.3122585625308938, + "learning_rate": 0.00018159437280187575, + "loss": 3.3370323181152344, + "step": 1549, + "token_acc": 0.2653369841906081 + }, + { + "epoch": 0.9088243916739959, + "grad_norm": 1.286304506506278, + "learning_rate": 0.00018171160609613133, + "loss": 3.3194799423217773, + "step": 1550, + "token_acc": 0.26853451254059285 + }, + { + "epoch": 0.909410729991205, + "grad_norm": 1.0788506085500287, + "learning_rate": 0.00018182883939038688, + "loss": 3.353987216949463, + "step": 1551, + "token_acc": 0.264657159612043 + }, + { + "epoch": 0.909997068308414, + "grad_norm": 1.4452448808436773, + "learning_rate": 0.00018194607268464245, + "loss": 3.378814697265625, + "step": 1552, + "token_acc": 0.2593958374111046 + }, + { + "epoch": 0.910583406625623, + "grad_norm": 1.2357466809858635, + "learning_rate": 0.00018206330597889803, + "loss": 3.342198371887207, + "step": 1553, + "token_acc": 0.26663439312892834 + }, + { + "epoch": 0.911169744942832, + "grad_norm": 1.2129130656116176, + "learning_rate": 0.0001821805392731536, + "loss": 3.3796777725219727, + "step": 1554, + "token_acc": 0.2617166528059962 + }, + { + "epoch": 0.9117560832600411, + "grad_norm": 1.146604556201497, + "learning_rate": 0.00018229777256740915, + "loss": 3.396350860595703, + "step": 1555, + "token_acc": 0.25920019468298 + }, + { + "epoch": 0.9123424215772501, + "grad_norm": 2.05040081403037, + "learning_rate": 0.0001824150058616647, + "loss": 3.4083476066589355, + "step": 1556, + "token_acc": 0.25626040551594265 + }, + { + "epoch": 0.9129287598944591, + "grad_norm": 0.7773064290736583, + "learning_rate": 0.00018253223915592028, + "loss": 3.4253311157226562, + "step": 1557, + "token_acc": 0.25439652848210964 + }, + { + "epoch": 0.9135150982116681, + "grad_norm": 1.79627057337562, + "learning_rate": 0.00018264947245017586, + "loss": 3.391282558441162, + "step": 1558, + "token_acc": 0.2600664629429257 + }, + { + "epoch": 0.9141014365288772, + "grad_norm": 0.9184980870345989, + "learning_rate": 0.00018276670574443143, + "loss": 3.3942761421203613, + "step": 1559, + "token_acc": 0.25757309601731765 + }, + { + "epoch": 0.9146877748460862, + "grad_norm": 1.5377769612778618, + "learning_rate": 0.00018288393903868698, + "loss": 3.384425163269043, + "step": 1560, + "token_acc": 0.26069879383911443 + }, + { + "epoch": 0.9152741131632952, + "grad_norm": 1.2520046523629007, + "learning_rate": 0.00018300117233294256, + "loss": 3.295742988586426, + "step": 1561, + "token_acc": 0.27246077902770727 + }, + { + "epoch": 0.9158604514805042, + "grad_norm": 0.9420638984060122, + "learning_rate": 0.00018311840562719813, + "loss": 3.352247714996338, + "step": 1562, + "token_acc": 0.26366578156920917 + }, + { + "epoch": 0.9164467897977133, + "grad_norm": 1.1499957980941797, + "learning_rate": 0.0001832356389214537, + "loss": 3.325246810913086, + "step": 1563, + "token_acc": 0.26839646056747535 + }, + { + "epoch": 0.9170331281149223, + "grad_norm": 1.199653384799704, + "learning_rate": 0.00018335287221570926, + "loss": 3.3027188777923584, + "step": 1564, + "token_acc": 0.2728387244873506 + }, + { + "epoch": 0.9176194664321313, + "grad_norm": 1.3214843926296007, + "learning_rate": 0.00018347010550996484, + "loss": 3.3684563636779785, + "step": 1565, + "token_acc": 0.26195374187315873 + }, + { + "epoch": 0.9182058047493403, + "grad_norm": 1.0088904138662307, + "learning_rate": 0.0001835873388042204, + "loss": 3.3322877883911133, + "step": 1566, + "token_acc": 0.2671360414082261 + }, + { + "epoch": 0.9187921430665494, + "grad_norm": 1.8745183782424188, + "learning_rate": 0.000183704572098476, + "loss": 3.4052467346191406, + "step": 1567, + "token_acc": 0.25619687218573595 + }, + { + "epoch": 0.9193784813837584, + "grad_norm": 1.1478874787274114, + "learning_rate": 0.00018382180539273154, + "loss": 3.348512649536133, + "step": 1568, + "token_acc": 0.2656653861424669 + }, + { + "epoch": 0.9199648197009674, + "grad_norm": 1.3820644524817856, + "learning_rate": 0.00018393903868698711, + "loss": 3.327195167541504, + "step": 1569, + "token_acc": 0.2666618390221951 + }, + { + "epoch": 0.9205511580181764, + "grad_norm": 1.0844950939908533, + "learning_rate": 0.0001840562719812427, + "loss": 3.343712091445923, + "step": 1570, + "token_acc": 0.2670703929511573 + }, + { + "epoch": 0.9211374963353856, + "grad_norm": 1.4791056070916166, + "learning_rate": 0.00018417350527549827, + "loss": 3.3564248085021973, + "step": 1571, + "token_acc": 0.263818210393397 + }, + { + "epoch": 0.9217238346525946, + "grad_norm": 1.0240460325274745, + "learning_rate": 0.00018429073856975382, + "loss": 3.361377477645874, + "step": 1572, + "token_acc": 0.263559146354769 + }, + { + "epoch": 0.9223101729698036, + "grad_norm": 1.7780360415280625, + "learning_rate": 0.00018440797186400937, + "loss": 3.3474106788635254, + "step": 1573, + "token_acc": 0.26460279598935516 + }, + { + "epoch": 0.9228965112870126, + "grad_norm": 1.2707892836571437, + "learning_rate": 0.00018452520515826494, + "loss": 3.3712053298950195, + "step": 1574, + "token_acc": 0.2607989121897075 + }, + { + "epoch": 0.9234828496042217, + "grad_norm": 1.3658681696317136, + "learning_rate": 0.00018464243845252052, + "loss": 3.3270702362060547, + "step": 1575, + "token_acc": 0.26739641964942834 + }, + { + "epoch": 0.9240691879214307, + "grad_norm": 1.1444399495291406, + "learning_rate": 0.0001847596717467761, + "loss": 3.358936071395874, + "step": 1576, + "token_acc": 0.26351826678444296 + }, + { + "epoch": 0.9246555262386397, + "grad_norm": 1.7313194132063021, + "learning_rate": 0.00018487690504103164, + "loss": 3.3354296684265137, + "step": 1577, + "token_acc": 0.26686180890037264 + }, + { + "epoch": 0.9252418645558487, + "grad_norm": 0.9809863176003378, + "learning_rate": 0.00018499413833528722, + "loss": 3.4418704509735107, + "step": 1578, + "token_acc": 0.2533676839741049 + }, + { + "epoch": 0.9258282028730578, + "grad_norm": 1.4626773683461787, + "learning_rate": 0.0001851113716295428, + "loss": 3.3287243843078613, + "step": 1579, + "token_acc": 0.2658683516410984 + }, + { + "epoch": 0.9264145411902668, + "grad_norm": 1.1107742365425293, + "learning_rate": 0.00018522860492379837, + "loss": 3.3714849948883057, + "step": 1580, + "token_acc": 0.26082237584088525 + }, + { + "epoch": 0.9270008795074758, + "grad_norm": 1.1379131287331465, + "learning_rate": 0.00018534583821805395, + "loss": 3.3280763626098633, + "step": 1581, + "token_acc": 0.26735450497238705 + }, + { + "epoch": 0.9275872178246849, + "grad_norm": 1.1422660187588143, + "learning_rate": 0.0001854630715123095, + "loss": 3.3566975593566895, + "step": 1582, + "token_acc": 0.2637835350171905 + }, + { + "epoch": 0.9281735561418939, + "grad_norm": 1.667148809225169, + "learning_rate": 0.00018558030480656507, + "loss": 3.348209857940674, + "step": 1583, + "token_acc": 0.26456177855970214 + }, + { + "epoch": 0.9287598944591029, + "grad_norm": 1.0589916525400245, + "learning_rate": 0.00018569753810082065, + "loss": 3.3752589225769043, + "step": 1584, + "token_acc": 0.26295822252197704 + }, + { + "epoch": 0.9293462327763119, + "grad_norm": 1.5126729922013449, + "learning_rate": 0.00018581477139507623, + "loss": 3.3634002208709717, + "step": 1585, + "token_acc": 0.2607700258397933 + }, + { + "epoch": 0.929932571093521, + "grad_norm": 1.0568507383446433, + "learning_rate": 0.00018593200468933178, + "loss": 3.3675577640533447, + "step": 1586, + "token_acc": 0.2624037973977843 + }, + { + "epoch": 0.93051890941073, + "grad_norm": 1.7091785296659054, + "learning_rate": 0.00018604923798358735, + "loss": 3.3551533222198486, + "step": 1587, + "token_acc": 0.26344831526029333 + }, + { + "epoch": 0.931105247727939, + "grad_norm": 0.8770975885347173, + "learning_rate": 0.00018616647127784293, + "loss": 3.40421986579895, + "step": 1588, + "token_acc": 0.2572160555444066 + }, + { + "epoch": 0.931691586045148, + "grad_norm": 1.3252112943791836, + "learning_rate": 0.0001862837045720985, + "loss": 3.374990463256836, + "step": 1589, + "token_acc": 0.26150502451369695 + }, + { + "epoch": 0.9322779243623571, + "grad_norm": 1.1006264437220463, + "learning_rate": 0.00018640093786635405, + "loss": 3.3426198959350586, + "step": 1590, + "token_acc": 0.26584830068414994 + }, + { + "epoch": 0.9328642626795661, + "grad_norm": 1.298907876748038, + "learning_rate": 0.0001865181711606096, + "loss": 3.3545188903808594, + "step": 1591, + "token_acc": 0.26420038535645474 + }, + { + "epoch": 0.9334506009967751, + "grad_norm": 1.1254981141043234, + "learning_rate": 0.00018663540445486518, + "loss": 3.3663008213043213, + "step": 1592, + "token_acc": 0.2600399224246426 + }, + { + "epoch": 0.9340369393139841, + "grad_norm": 0.9673224135870939, + "learning_rate": 0.00018675263774912076, + "loss": 3.329442024230957, + "step": 1593, + "token_acc": 0.26830519442939144 + }, + { + "epoch": 0.9346232776311932, + "grad_norm": 1.2807819401048186, + "learning_rate": 0.00018686987104337633, + "loss": 3.3085124492645264, + "step": 1594, + "token_acc": 0.26717921147902635 + }, + { + "epoch": 0.9352096159484022, + "grad_norm": 1.536074091228922, + "learning_rate": 0.00018698710433763188, + "loss": 3.3698372840881348, + "step": 1595, + "token_acc": 0.2624507783912302 + }, + { + "epoch": 0.9357959542656112, + "grad_norm": 0.9167595605290084, + "learning_rate": 0.00018710433763188746, + "loss": 3.299405097961426, + "step": 1596, + "token_acc": 0.2704570749056434 + }, + { + "epoch": 0.9363822925828202, + "grad_norm": 1.3910912934898383, + "learning_rate": 0.00018722157092614303, + "loss": 3.395594596862793, + "step": 1597, + "token_acc": 0.25920034359614724 + }, + { + "epoch": 0.9369686309000294, + "grad_norm": 1.0860076009830906, + "learning_rate": 0.0001873388042203986, + "loss": 3.328111171722412, + "step": 1598, + "token_acc": 0.26592719458402675 + }, + { + "epoch": 0.9375549692172384, + "grad_norm": 1.0385921769905857, + "learning_rate": 0.00018745603751465416, + "loss": 3.3274950981140137, + "step": 1599, + "token_acc": 0.26740883122430453 + }, + { + "epoch": 0.9381413075344474, + "grad_norm": 1.2096505355335772, + "learning_rate": 0.00018757327080890974, + "loss": 3.3526041507720947, + "step": 1600, + "token_acc": 0.26282343576619055 + }, + { + "epoch": 0.9387276458516564, + "grad_norm": 1.7862437778363993, + "learning_rate": 0.0001876905041031653, + "loss": 3.371318817138672, + "step": 1601, + "token_acc": 0.26098816986804657 + }, + { + "epoch": 0.9393139841688655, + "grad_norm": 0.9278779931498861, + "learning_rate": 0.0001878077373974209, + "loss": 3.3075666427612305, + "step": 1602, + "token_acc": 0.2711867494651585 + }, + { + "epoch": 0.9399003224860745, + "grad_norm": 1.3853507347174254, + "learning_rate": 0.00018792497069167644, + "loss": 3.386536121368408, + "step": 1603, + "token_acc": 0.2577131342005218 + }, + { + "epoch": 0.9404866608032835, + "grad_norm": 1.2455254804914728, + "learning_rate": 0.00018804220398593201, + "loss": 3.3553366661071777, + "step": 1604, + "token_acc": 0.26208293829629725 + }, + { + "epoch": 0.9410729991204925, + "grad_norm": 1.1034562345082979, + "learning_rate": 0.0001881594372801876, + "loss": 3.4065918922424316, + "step": 1605, + "token_acc": 0.2567852023170673 + }, + { + "epoch": 0.9416593374377016, + "grad_norm": 1.7314831405483755, + "learning_rate": 0.00018827667057444317, + "loss": 3.404006004333496, + "step": 1606, + "token_acc": 0.2566374214229216 + }, + { + "epoch": 0.9422456757549106, + "grad_norm": 0.9528185692483703, + "learning_rate": 0.00018839390386869872, + "loss": 3.3299367427825928, + "step": 1607, + "token_acc": 0.2652288450898681 + }, + { + "epoch": 0.9428320140721196, + "grad_norm": 1.6403223377372316, + "learning_rate": 0.0001885111371629543, + "loss": 3.364982843399048, + "step": 1608, + "token_acc": 0.26326353771937766 + }, + { + "epoch": 0.9434183523893287, + "grad_norm": 1.0774998417516244, + "learning_rate": 0.00018862837045720984, + "loss": 3.382481813430786, + "step": 1609, + "token_acc": 0.2604807886000413 + }, + { + "epoch": 0.9440046907065377, + "grad_norm": 1.2288036718456758, + "learning_rate": 0.00018874560375146542, + "loss": 3.4060754776000977, + "step": 1610, + "token_acc": 0.25696048285763134 + }, + { + "epoch": 0.9445910290237467, + "grad_norm": 1.2183498809049207, + "learning_rate": 0.000188862837045721, + "loss": 3.350726842880249, + "step": 1611, + "token_acc": 0.26291805881769165 + }, + { + "epoch": 0.9451773673409557, + "grad_norm": 1.2340283636660205, + "learning_rate": 0.00018898007033997654, + "loss": 3.3825385570526123, + "step": 1612, + "token_acc": 0.26280442628044265 + }, + { + "epoch": 0.9457637056581648, + "grad_norm": 1.4681438532432032, + "learning_rate": 0.00018909730363423212, + "loss": 3.3893299102783203, + "step": 1613, + "token_acc": 0.2599766233697703 + }, + { + "epoch": 0.9463500439753738, + "grad_norm": 0.9809257475169157, + "learning_rate": 0.0001892145369284877, + "loss": 3.3195793628692627, + "step": 1614, + "token_acc": 0.2687134502923977 + }, + { + "epoch": 0.9469363822925828, + "grad_norm": 1.080422657307698, + "learning_rate": 0.00018933177022274327, + "loss": 3.3488292694091797, + "step": 1615, + "token_acc": 0.26532750725639975 + }, + { + "epoch": 0.9475227206097918, + "grad_norm": 1.0009022253200996, + "learning_rate": 0.00018944900351699885, + "loss": 3.341477870941162, + "step": 1616, + "token_acc": 0.2667368677920066 + }, + { + "epoch": 0.9481090589270009, + "grad_norm": 1.15274710216785, + "learning_rate": 0.0001895662368112544, + "loss": 3.3550496101379395, + "step": 1617, + "token_acc": 0.26292694117521526 + }, + { + "epoch": 0.9486953972442099, + "grad_norm": 1.5075813308380936, + "learning_rate": 0.00018968347010550997, + "loss": 3.3716845512390137, + "step": 1618, + "token_acc": 0.26060553414618737 + }, + { + "epoch": 0.9492817355614189, + "grad_norm": 1.2939910636519527, + "learning_rate": 0.00018980070339976555, + "loss": 3.3509111404418945, + "step": 1619, + "token_acc": 0.26327204522639613 + }, + { + "epoch": 0.9498680738786279, + "grad_norm": 1.1685680176518416, + "learning_rate": 0.00018991793669402113, + "loss": 3.3880279064178467, + "step": 1620, + "token_acc": 0.25947568727898707 + }, + { + "epoch": 0.950454412195837, + "grad_norm": 1.1646410392004614, + "learning_rate": 0.00019003516998827668, + "loss": 3.379462718963623, + "step": 1621, + "token_acc": 0.25987230167435893 + }, + { + "epoch": 0.951040750513046, + "grad_norm": 1.1853606258623022, + "learning_rate": 0.00019015240328253225, + "loss": 3.3054118156433105, + "step": 1622, + "token_acc": 0.26873828734523614 + }, + { + "epoch": 0.951627088830255, + "grad_norm": 1.1596618641034058, + "learning_rate": 0.00019026963657678783, + "loss": 3.3406171798706055, + "step": 1623, + "token_acc": 0.2640560872859681 + }, + { + "epoch": 0.952213427147464, + "grad_norm": 1.0399427731744997, + "learning_rate": 0.0001903868698710434, + "loss": 3.3570809364318848, + "step": 1624, + "token_acc": 0.26354488550827987 + }, + { + "epoch": 0.9527997654646732, + "grad_norm": 1.3145697144554607, + "learning_rate": 0.00019050410316529895, + "loss": 3.3618650436401367, + "step": 1625, + "token_acc": 0.2605064394938216 + }, + { + "epoch": 0.9533861037818822, + "grad_norm": 1.234950117312448, + "learning_rate": 0.00019062133645955453, + "loss": 3.333404541015625, + "step": 1626, + "token_acc": 0.2664859959286409 + }, + { + "epoch": 0.9539724420990912, + "grad_norm": 0.7601010073161639, + "learning_rate": 0.00019073856975381008, + "loss": 3.2304720878601074, + "step": 1627, + "token_acc": 0.2797544757569719 + }, + { + "epoch": 0.9545587804163002, + "grad_norm": 1.0934566867466122, + "learning_rate": 0.00019085580304806566, + "loss": 3.337761163711548, + "step": 1628, + "token_acc": 0.2670156927712458 + }, + { + "epoch": 0.9551451187335093, + "grad_norm": 1.3532055534212197, + "learning_rate": 0.00019097303634232123, + "loss": 3.3167316913604736, + "step": 1629, + "token_acc": 0.268028252558853 + }, + { + "epoch": 0.9557314570507183, + "grad_norm": 0.9863954009406589, + "learning_rate": 0.00019109026963657678, + "loss": 3.370453357696533, + "step": 1630, + "token_acc": 0.26140677374670884 + }, + { + "epoch": 0.9563177953679273, + "grad_norm": 1.083660466001697, + "learning_rate": 0.00019120750293083236, + "loss": 3.321348190307617, + "step": 1631, + "token_acc": 0.26936538699218204 + }, + { + "epoch": 0.9569041336851363, + "grad_norm": 1.025468990014639, + "learning_rate": 0.00019132473622508793, + "loss": 3.3492209911346436, + "step": 1632, + "token_acc": 0.26293236022416266 + }, + { + "epoch": 0.9574904720023454, + "grad_norm": 1.3090003190742197, + "learning_rate": 0.0001914419695193435, + "loss": 3.3504490852355957, + "step": 1633, + "token_acc": 0.26397720093024507 + }, + { + "epoch": 0.9580768103195544, + "grad_norm": 1.0616255883973102, + "learning_rate": 0.00019155920281359906, + "loss": 3.3855478763580322, + "step": 1634, + "token_acc": 0.2592721395986335 + }, + { + "epoch": 0.9586631486367634, + "grad_norm": 1.268569969842506, + "learning_rate": 0.00019167643610785463, + "loss": 3.3541502952575684, + "step": 1635, + "token_acc": 0.26462931404016854 + }, + { + "epoch": 0.9592494869539725, + "grad_norm": 0.9399033412577804, + "learning_rate": 0.0001917936694021102, + "loss": 3.3510520458221436, + "step": 1636, + "token_acc": 0.26333195104076196 + }, + { + "epoch": 0.9598358252711815, + "grad_norm": 1.3434907248263739, + "learning_rate": 0.0001919109026963658, + "loss": 3.329986095428467, + "step": 1637, + "token_acc": 0.26799489666829956 + }, + { + "epoch": 0.9604221635883905, + "grad_norm": 0.9557131361184009, + "learning_rate": 0.00019202813599062134, + "loss": 3.313842296600342, + "step": 1638, + "token_acc": 0.2673025062213013 + }, + { + "epoch": 0.9610085019055995, + "grad_norm": 1.3746672616070599, + "learning_rate": 0.0001921453692848769, + "loss": 3.3858346939086914, + "step": 1639, + "token_acc": 0.26089356096697436 + }, + { + "epoch": 0.9615948402228086, + "grad_norm": 1.2466323604766458, + "learning_rate": 0.0001922626025791325, + "loss": 3.3362507820129395, + "step": 1640, + "token_acc": 0.2641340981062967 + }, + { + "epoch": 0.9621811785400176, + "grad_norm": 1.1661886070180711, + "learning_rate": 0.00019237983587338807, + "loss": 3.34262752532959, + "step": 1641, + "token_acc": 0.2638271435191551 + }, + { + "epoch": 0.9627675168572266, + "grad_norm": 1.0763985551493827, + "learning_rate": 0.00019249706916764364, + "loss": 3.38920259475708, + "step": 1642, + "token_acc": 0.2574238146236978 + }, + { + "epoch": 0.9633538551744356, + "grad_norm": 1.4672173336155059, + "learning_rate": 0.0001926143024618992, + "loss": 3.4152050018310547, + "step": 1643, + "token_acc": 0.2557633730982246 + }, + { + "epoch": 0.9639401934916447, + "grad_norm": 1.0097615490274985, + "learning_rate": 0.00019273153575615474, + "loss": 3.3637962341308594, + "step": 1644, + "token_acc": 0.2617014567698808 + }, + { + "epoch": 0.9645265318088537, + "grad_norm": 1.4088918757593316, + "learning_rate": 0.00019284876905041032, + "loss": 3.3202617168426514, + "step": 1645, + "token_acc": 0.26727753405137655 + }, + { + "epoch": 0.9651128701260627, + "grad_norm": 0.8159601394987848, + "learning_rate": 0.0001929660023446659, + "loss": 3.355109691619873, + "step": 1646, + "token_acc": 0.2618737966175746 + }, + { + "epoch": 0.9656992084432717, + "grad_norm": 1.3380815813123037, + "learning_rate": 0.00019308323563892147, + "loss": 3.286242723464966, + "step": 1647, + "token_acc": 0.2713701593331223 + }, + { + "epoch": 0.9662855467604808, + "grad_norm": 1.069492747115931, + "learning_rate": 0.00019320046893317702, + "loss": 3.333033323287964, + "step": 1648, + "token_acc": 0.2669163912730537 + }, + { + "epoch": 0.9668718850776898, + "grad_norm": 1.409310148376673, + "learning_rate": 0.0001933177022274326, + "loss": 3.369143009185791, + "step": 1649, + "token_acc": 0.2602760636990466 + }, + { + "epoch": 0.9674582233948988, + "grad_norm": 0.8765148746893767, + "learning_rate": 0.00019343493552168817, + "loss": 3.3058595657348633, + "step": 1650, + "token_acc": 0.26922890280001943 + }, + { + "epoch": 0.9680445617121078, + "grad_norm": 1.2416515988593004, + "learning_rate": 0.00019355216881594375, + "loss": 3.374819278717041, + "step": 1651, + "token_acc": 0.260814549785172 + }, + { + "epoch": 0.968630900029317, + "grad_norm": 1.193887346640774, + "learning_rate": 0.0001936694021101993, + "loss": 3.3142249584198, + "step": 1652, + "token_acc": 0.2698347298974594 + }, + { + "epoch": 0.969217238346526, + "grad_norm": 1.301001313648477, + "learning_rate": 0.00019378663540445487, + "loss": 3.3638806343078613, + "step": 1653, + "token_acc": 0.26062861295798373 + }, + { + "epoch": 0.969803576663735, + "grad_norm": 1.2074897494380648, + "learning_rate": 0.00019390386869871045, + "loss": 3.3101613521575928, + "step": 1654, + "token_acc": 0.2697356348872009 + }, + { + "epoch": 0.970389914980944, + "grad_norm": 1.188749264177797, + "learning_rate": 0.00019402110199296603, + "loss": 3.3214831352233887, + "step": 1655, + "token_acc": 0.2668062699061004 + }, + { + "epoch": 0.9709762532981531, + "grad_norm": 1.1283344902120356, + "learning_rate": 0.00019413833528722157, + "loss": 3.32655930519104, + "step": 1656, + "token_acc": 0.26514057794140405 + }, + { + "epoch": 0.9715625916153621, + "grad_norm": 1.1359896286023063, + "learning_rate": 0.00019425556858147715, + "loss": 3.3051419258117676, + "step": 1657, + "token_acc": 0.2687050172108063 + }, + { + "epoch": 0.9721489299325711, + "grad_norm": 1.0982522470553582, + "learning_rate": 0.00019437280187573273, + "loss": 3.3482327461242676, + "step": 1658, + "token_acc": 0.2631812769889645 + }, + { + "epoch": 0.9727352682497801, + "grad_norm": 1.3726150948191387, + "learning_rate": 0.0001944900351699883, + "loss": 3.3184661865234375, + "step": 1659, + "token_acc": 0.2679870865960748 + }, + { + "epoch": 0.9733216065669892, + "grad_norm": 0.82247418854886, + "learning_rate": 0.00019460726846424385, + "loss": 3.3374791145324707, + "step": 1660, + "token_acc": 0.2626613638465706 + }, + { + "epoch": 0.9739079448841982, + "grad_norm": 1.1078922024567954, + "learning_rate": 0.00019472450175849943, + "loss": 3.3170008659362793, + "step": 1661, + "token_acc": 0.26823623554516496 + }, + { + "epoch": 0.9744942832014072, + "grad_norm": 1.4095959082789207, + "learning_rate": 0.00019484173505275498, + "loss": 3.3128089904785156, + "step": 1662, + "token_acc": 0.2679425069351745 + }, + { + "epoch": 0.9750806215186162, + "grad_norm": 1.124856845808925, + "learning_rate": 0.00019495896834701055, + "loss": 3.341902732849121, + "step": 1663, + "token_acc": 0.2643564710876091 + }, + { + "epoch": 0.9756669598358253, + "grad_norm": 1.254935463989328, + "learning_rate": 0.00019507620164126613, + "loss": 3.384016513824463, + "step": 1664, + "token_acc": 0.26044897038053194 + }, + { + "epoch": 0.9762532981530343, + "grad_norm": 0.9877355941561444, + "learning_rate": 0.00019519343493552168, + "loss": 3.329348564147949, + "step": 1665, + "token_acc": 0.2653352833827317 + }, + { + "epoch": 0.9768396364702433, + "grad_norm": 1.2405044810523733, + "learning_rate": 0.00019531066822977726, + "loss": 3.363208770751953, + "step": 1666, + "token_acc": 0.2611990303444244 + }, + { + "epoch": 0.9774259747874524, + "grad_norm": 1.3913658890510827, + "learning_rate": 0.00019542790152403283, + "loss": 3.3351492881774902, + "step": 1667, + "token_acc": 0.26598281772008886 + }, + { + "epoch": 0.9780123131046614, + "grad_norm": 1.0463702076469992, + "learning_rate": 0.0001955451348182884, + "loss": 3.325552463531494, + "step": 1668, + "token_acc": 0.26524881838646086 + }, + { + "epoch": 0.9785986514218704, + "grad_norm": 1.3362195386632891, + "learning_rate": 0.00019566236811254396, + "loss": 3.3576231002807617, + "step": 1669, + "token_acc": 0.26250827864410864 + }, + { + "epoch": 0.9791849897390794, + "grad_norm": 1.2372034602087982, + "learning_rate": 0.00019577960140679953, + "loss": 3.3630270957946777, + "step": 1670, + "token_acc": 0.261734803136759 + }, + { + "epoch": 0.9797713280562885, + "grad_norm": 1.0897028163957025, + "learning_rate": 0.0001958968347010551, + "loss": 3.269686222076416, + "step": 1671, + "token_acc": 0.27446752437768035 + }, + { + "epoch": 0.9803576663734975, + "grad_norm": 0.8559570187978328, + "learning_rate": 0.0001960140679953107, + "loss": 3.369194269180298, + "step": 1672, + "token_acc": 0.25938527753984114 + }, + { + "epoch": 0.9809440046907065, + "grad_norm": 1.1928655157623724, + "learning_rate": 0.00019613130128956624, + "loss": 3.3436310291290283, + "step": 1673, + "token_acc": 0.26232732332187825 + }, + { + "epoch": 0.9815303430079155, + "grad_norm": 1.110231146103432, + "learning_rate": 0.0001962485345838218, + "loss": 3.3459889888763428, + "step": 1674, + "token_acc": 0.2655485240258312 + }, + { + "epoch": 0.9821166813251246, + "grad_norm": 1.139243369163609, + "learning_rate": 0.0001963657678780774, + "loss": 3.3928275108337402, + "step": 1675, + "token_acc": 0.2585317398404427 + }, + { + "epoch": 0.9827030196423336, + "grad_norm": 1.3943270513203412, + "learning_rate": 0.00019648300117233296, + "loss": 3.3056437969207764, + "step": 1676, + "token_acc": 0.2691272676063551 + }, + { + "epoch": 0.9832893579595426, + "grad_norm": 0.8656601760412752, + "learning_rate": 0.00019660023446658854, + "loss": 3.3615996837615967, + "step": 1677, + "token_acc": 0.2604645273325856 + }, + { + "epoch": 0.9838756962767516, + "grad_norm": 1.0567517919130265, + "learning_rate": 0.0001967174677608441, + "loss": 3.355379104614258, + "step": 1678, + "token_acc": 0.2640887747217391 + }, + { + "epoch": 0.9844620345939608, + "grad_norm": 1.0912951100232295, + "learning_rate": 0.00019683470105509967, + "loss": 3.3363449573516846, + "step": 1679, + "token_acc": 0.2645881705565771 + }, + { + "epoch": 0.9850483729111698, + "grad_norm": 1.0629044257606912, + "learning_rate": 0.00019695193434935522, + "loss": 3.3193612098693848, + "step": 1680, + "token_acc": 0.26749073252232025 + }, + { + "epoch": 0.9856347112283788, + "grad_norm": 1.2775943008961042, + "learning_rate": 0.0001970691676436108, + "loss": 3.3616628646850586, + "step": 1681, + "token_acc": 0.26352890950894214 + }, + { + "epoch": 0.9862210495455878, + "grad_norm": 1.0545305769637172, + "learning_rate": 0.00019718640093786637, + "loss": 3.313585042953491, + "step": 1682, + "token_acc": 0.26803396115113975 + }, + { + "epoch": 0.9868073878627969, + "grad_norm": 1.4977277396578403, + "learning_rate": 0.00019730363423212192, + "loss": 3.3703408241271973, + "step": 1683, + "token_acc": 0.2614033910136795 + }, + { + "epoch": 0.9873937261800059, + "grad_norm": 0.7038650037196329, + "learning_rate": 0.0001974208675263775, + "loss": 3.3422064781188965, + "step": 1684, + "token_acc": 0.2643694724138552 + }, + { + "epoch": 0.9879800644972149, + "grad_norm": 0.9994849745233718, + "learning_rate": 0.00019753810082063307, + "loss": 3.3329501152038574, + "step": 1685, + "token_acc": 0.2643449852989219 + }, + { + "epoch": 0.9885664028144239, + "grad_norm": 1.240826872987356, + "learning_rate": 0.00019765533411488865, + "loss": 3.3225011825561523, + "step": 1686, + "token_acc": 0.2679423794747557 + }, + { + "epoch": 0.989152741131633, + "grad_norm": 0.7525156425662606, + "learning_rate": 0.0001977725674091442, + "loss": 3.2979655265808105, + "step": 1687, + "token_acc": 0.2704380875334561 + }, + { + "epoch": 0.989739079448842, + "grad_norm": 1.129174319725173, + "learning_rate": 0.00019788980070339977, + "loss": 3.35269832611084, + "step": 1688, + "token_acc": 0.26368715681762234 + }, + { + "epoch": 0.990325417766051, + "grad_norm": 1.0608179493864747, + "learning_rate": 0.00019800703399765535, + "loss": 3.3505702018737793, + "step": 1689, + "token_acc": 0.26353862997797095 + }, + { + "epoch": 0.99091175608326, + "grad_norm": 1.2304127161934297, + "learning_rate": 0.00019812426729191092, + "loss": 3.3202266693115234, + "step": 1690, + "token_acc": 0.2672586436934661 + }, + { + "epoch": 0.9914980944004691, + "grad_norm": 1.3773044486110175, + "learning_rate": 0.00019824150058616647, + "loss": 3.3369665145874023, + "step": 1691, + "token_acc": 0.2653060689611792 + }, + { + "epoch": 0.9920844327176781, + "grad_norm": 1.439023029743615, + "learning_rate": 0.00019835873388042205, + "loss": 3.3842334747314453, + "step": 1692, + "token_acc": 0.259608531169472 + }, + { + "epoch": 0.9926707710348871, + "grad_norm": 0.6211096328699252, + "learning_rate": 0.00019847596717467763, + "loss": 3.3407256603240967, + "step": 1693, + "token_acc": 0.26415282302576665 + }, + { + "epoch": 0.9932571093520962, + "grad_norm": 1.1934650957874657, + "learning_rate": 0.0001985932004689332, + "loss": 3.346158981323242, + "step": 1694, + "token_acc": 0.2637598514346047 + }, + { + "epoch": 0.9938434476693052, + "grad_norm": 1.2851550551049356, + "learning_rate": 0.00019871043376318875, + "loss": 3.325650691986084, + "step": 1695, + "token_acc": 0.26750293137378156 + }, + { + "epoch": 0.9944297859865142, + "grad_norm": 1.1508433553141901, + "learning_rate": 0.00019882766705744433, + "loss": 3.2948293685913086, + "step": 1696, + "token_acc": 0.27120840789266654 + }, + { + "epoch": 0.9950161243037232, + "grad_norm": 0.925348185731345, + "learning_rate": 0.0001989449003516999, + "loss": 3.3334450721740723, + "step": 1697, + "token_acc": 0.2650206492295395 + }, + { + "epoch": 0.9956024626209323, + "grad_norm": 1.0793354554960248, + "learning_rate": 0.00019906213364595545, + "loss": 3.3594651222229004, + "step": 1698, + "token_acc": 0.2616705998862176 + }, + { + "epoch": 0.9961888009381413, + "grad_norm": 1.1589122134078231, + "learning_rate": 0.00019917936694021103, + "loss": 3.356414318084717, + "step": 1699, + "token_acc": 0.26265433904851077 + }, + { + "epoch": 0.9967751392553503, + "grad_norm": 0.8063315224940307, + "learning_rate": 0.00019929660023446658, + "loss": 3.2872724533081055, + "step": 1700, + "token_acc": 0.27135269402531514 + }, + { + "epoch": 0.9973614775725593, + "grad_norm": 0.9275315135898576, + "learning_rate": 0.00019941383352872216, + "loss": 3.3003945350646973, + "step": 1701, + "token_acc": 0.2693471513356969 + }, + { + "epoch": 0.9979478158897684, + "grad_norm": 1.3977515363576045, + "learning_rate": 0.00019953106682297773, + "loss": 3.3070321083068848, + "step": 1702, + "token_acc": 0.2674597083653108 + }, + { + "epoch": 0.9985341542069774, + "grad_norm": 0.8099153127824107, + "learning_rate": 0.0001996483001172333, + "loss": 3.3445329666137695, + "step": 1703, + "token_acc": 0.26612909436781346 + }, + { + "epoch": 0.9991204925241864, + "grad_norm": 1.004579303177753, + "learning_rate": 0.00019976553341148886, + "loss": 3.2992019653320312, + "step": 1704, + "token_acc": 0.27126144178848344 + }, + { + "epoch": 0.9997068308413954, + "grad_norm": 1.1612980474686398, + "learning_rate": 0.00019988276670574443, + "loss": 3.3426826000213623, + "step": 1705, + "token_acc": 0.26442122186495176 + }, + { + "epoch": 1.0, + "grad_norm": 1.2283793216853351, + "learning_rate": 0.0002, + "loss": 3.2606582641601562, + "step": 1706, + "token_acc": 0.27698120124646125 + }, + { + "epoch": 1.0, + "eval_loss": 3.3105568885803223, + "eval_runtime": 16.4318, + "eval_samples_per_second": 15.58, + "eval_steps_per_second": 1.947, + "eval_token_acc": 0.26746655304875117, + "step": 1706 + }, + { + "epoch": 1.0005863383172091, + "grad_norm": 1.556494600009794, + "learning_rate": 0.0001999999995303174, + "loss": 3.328307628631592, + "step": 1707, + "token_acc": 0.26595388183638846 + }, + { + "epoch": 1.001172676634418, + "grad_norm": 0.8523857318890425, + "learning_rate": 0.0001999999981212695, + "loss": 3.280609130859375, + "step": 1708, + "token_acc": 0.27219681858250355 + }, + { + "epoch": 1.0017590149516271, + "grad_norm": 1.0527442206149304, + "learning_rate": 0.00019999999577285642, + "loss": 3.310378074645996, + "step": 1709, + "token_acc": 0.2680256480687648 + }, + { + "epoch": 1.0023453532688362, + "grad_norm": 1.4864315522560045, + "learning_rate": 0.00019999999248507814, + "loss": 3.2888007164001465, + "step": 1710, + "token_acc": 0.27098477695009415 + }, + { + "epoch": 1.0029316915860451, + "grad_norm": 0.9322965113037861, + "learning_rate": 0.00019999998825793463, + "loss": 3.3101136684417725, + "step": 1711, + "token_acc": 0.26868223099627375 + }, + { + "epoch": 1.0035180299032542, + "grad_norm": 1.2140821221169382, + "learning_rate": 0.000199999983091426, + "loss": 3.400871753692627, + "step": 1712, + "token_acc": 0.258787012987013 + }, + { + "epoch": 1.0041043682204631, + "grad_norm": 1.009684052655398, + "learning_rate": 0.0001999999769855523, + "loss": 3.3012166023254395, + "step": 1713, + "token_acc": 0.2689036216709844 + }, + { + "epoch": 1.0046907065376722, + "grad_norm": 1.202943246939225, + "learning_rate": 0.00019999996994031353, + "loss": 3.296599864959717, + "step": 1714, + "token_acc": 0.2679038800635739 + }, + { + "epoch": 1.0052770448548813, + "grad_norm": 0.9546427191555766, + "learning_rate": 0.00019999996195570985, + "loss": 3.2607038021087646, + "step": 1715, + "token_acc": 0.27512077985191963 + }, + { + "epoch": 1.0058633831720902, + "grad_norm": 1.2335410018132984, + "learning_rate": 0.00019999995303174125, + "loss": 3.291771650314331, + "step": 1716, + "token_acc": 0.27041116645418095 + }, + { + "epoch": 1.0064497214892993, + "grad_norm": 1.0921421651203864, + "learning_rate": 0.00019999994316840782, + "loss": 3.312577962875366, + "step": 1717, + "token_acc": 0.26718313356119155 + }, + { + "epoch": 1.0070360598065085, + "grad_norm": 1.3695469400955087, + "learning_rate": 0.0001999999323657097, + "loss": 3.313555955886841, + "step": 1718, + "token_acc": 0.2650570854629135 + }, + { + "epoch": 1.0076223981237173, + "grad_norm": 1.2628121447009166, + "learning_rate": 0.00019999992062364697, + "loss": 3.315274238586426, + "step": 1719, + "token_acc": 0.26700405538482214 + }, + { + "epoch": 1.0082087364409265, + "grad_norm": 0.9494902519256, + "learning_rate": 0.00019999990794221972, + "loss": 3.328627347946167, + "step": 1720, + "token_acc": 0.2664342695895 + }, + { + "epoch": 1.0087950747581353, + "grad_norm": 1.3881544086186914, + "learning_rate": 0.0001999998943214281, + "loss": 3.323829174041748, + "step": 1721, + "token_acc": 0.2675207274672372 + }, + { + "epoch": 1.0093814130753445, + "grad_norm": 1.1653045499417287, + "learning_rate": 0.00019999987976127224, + "loss": 3.3238110542297363, + "step": 1722, + "token_acc": 0.2664469337147566 + }, + { + "epoch": 1.0099677513925536, + "grad_norm": 1.2575657364561457, + "learning_rate": 0.00019999986426175225, + "loss": 3.2585737705230713, + "step": 1723, + "token_acc": 0.27267261489622663 + }, + { + "epoch": 1.0105540897097625, + "grad_norm": 0.9936817246844969, + "learning_rate": 0.00019999984782286827, + "loss": 3.33901309967041, + "step": 1724, + "token_acc": 0.2643873417123293 + }, + { + "epoch": 1.0111404280269716, + "grad_norm": 1.340730015302387, + "learning_rate": 0.00019999983044462048, + "loss": 3.309812068939209, + "step": 1725, + "token_acc": 0.26730696234676476 + }, + { + "epoch": 1.0117267663441807, + "grad_norm": 1.1609375856392976, + "learning_rate": 0.00019999981212700903, + "loss": 3.327427387237549, + "step": 1726, + "token_acc": 0.26356468645519043 + }, + { + "epoch": 1.0123131046613896, + "grad_norm": 1.1536328502356665, + "learning_rate": 0.0001999997928700341, + "loss": 3.3147873878479004, + "step": 1727, + "token_acc": 0.264424529776481 + }, + { + "epoch": 1.0128994429785987, + "grad_norm": 1.2723917601515504, + "learning_rate": 0.00019999977267369588, + "loss": 3.351543664932251, + "step": 1728, + "token_acc": 0.26157961462485024 + }, + { + "epoch": 1.0134857812958076, + "grad_norm": 1.2415985145042923, + "learning_rate": 0.00019999975153799454, + "loss": 3.297794818878174, + "step": 1729, + "token_acc": 0.268834089803858 + }, + { + "epoch": 1.0140721196130167, + "grad_norm": 1.491086041243301, + "learning_rate": 0.00019999972946293027, + "loss": 3.32405161857605, + "step": 1730, + "token_acc": 0.26578664402198915 + }, + { + "epoch": 1.0146584579302258, + "grad_norm": 0.9843345055647239, + "learning_rate": 0.0001999997064485033, + "loss": 3.259993076324463, + "step": 1731, + "token_acc": 0.2712384590244301 + }, + { + "epoch": 1.0152447962474347, + "grad_norm": 1.450624321829804, + "learning_rate": 0.00019999968249471387, + "loss": 3.259441375732422, + "step": 1732, + "token_acc": 0.2731529558877592 + }, + { + "epoch": 1.0158311345646438, + "grad_norm": 0.7765770096713421, + "learning_rate": 0.00019999965760156215, + "loss": 3.2235169410705566, + "step": 1733, + "token_acc": 0.2790696458905005 + }, + { + "epoch": 1.016417472881853, + "grad_norm": 1.1086717941818662, + "learning_rate": 0.00019999963176904837, + "loss": 3.3074679374694824, + "step": 1734, + "token_acc": 0.2657508289909995 + }, + { + "epoch": 1.0170038111990618, + "grad_norm": 0.8405811975346262, + "learning_rate": 0.00019999960499717282, + "loss": 3.3117685317993164, + "step": 1735, + "token_acc": 0.26819336840551106 + }, + { + "epoch": 1.017590149516271, + "grad_norm": 0.7431296755526309, + "learning_rate": 0.00019999957728593574, + "loss": 3.2740464210510254, + "step": 1736, + "token_acc": 0.272194382292572 + }, + { + "epoch": 1.01817648783348, + "grad_norm": 1.0002176285988111, + "learning_rate": 0.00019999954863533738, + "loss": 3.332535743713379, + "step": 1737, + "token_acc": 0.26577302609376474 + }, + { + "epoch": 1.018762826150689, + "grad_norm": 1.2184169161517018, + "learning_rate": 0.00019999951904537802, + "loss": 3.2924976348876953, + "step": 1738, + "token_acc": 0.26945296707130617 + }, + { + "epoch": 1.019349164467898, + "grad_norm": 0.9445123407964942, + "learning_rate": 0.00019999948851605792, + "loss": 3.255767583847046, + "step": 1739, + "token_acc": 0.2741988044308533 + }, + { + "epoch": 1.019935502785107, + "grad_norm": 1.1780655715075063, + "learning_rate": 0.0001999994570473774, + "loss": 3.308506965637207, + "step": 1740, + "token_acc": 0.26806187491382694 + }, + { + "epoch": 1.020521841102316, + "grad_norm": 1.194753028822761, + "learning_rate": 0.0001999994246393367, + "loss": 3.284492015838623, + "step": 1741, + "token_acc": 0.2715702479338843 + }, + { + "epoch": 1.0211081794195251, + "grad_norm": 1.3119055079428288, + "learning_rate": 0.00019999939129193616, + "loss": 3.300604820251465, + "step": 1742, + "token_acc": 0.27166271324997654 + }, + { + "epoch": 1.021694517736734, + "grad_norm": 1.0244102968927506, + "learning_rate": 0.00019999935700517612, + "loss": 3.3619349002838135, + "step": 1743, + "token_acc": 0.26027573111536045 + }, + { + "epoch": 1.0222808560539431, + "grad_norm": 1.1205548988389924, + "learning_rate": 0.00019999932177905682, + "loss": 3.287616729736328, + "step": 1744, + "token_acc": 0.2713776186858607 + }, + { + "epoch": 1.0228671943711523, + "grad_norm": 0.906680758714488, + "learning_rate": 0.0001999992856135787, + "loss": 3.3427040576934814, + "step": 1745, + "token_acc": 0.2634611589476199 + }, + { + "epoch": 1.0234535326883611, + "grad_norm": 1.0278739172048916, + "learning_rate": 0.000199999248508742, + "loss": 3.2839174270629883, + "step": 1746, + "token_acc": 0.2727939603264358 + }, + { + "epoch": 1.0240398710055703, + "grad_norm": 1.0618976481102271, + "learning_rate": 0.00019999921046454712, + "loss": 3.272448778152466, + "step": 1747, + "token_acc": 0.272406176332544 + }, + { + "epoch": 1.0246262093227791, + "grad_norm": 1.1981723135672615, + "learning_rate": 0.00019999917148099444, + "loss": 3.2798118591308594, + "step": 1748, + "token_acc": 0.2707797500811168 + }, + { + "epoch": 1.0252125476399883, + "grad_norm": 0.8220649631902591, + "learning_rate": 0.00019999913155808424, + "loss": 3.3087692260742188, + "step": 1749, + "token_acc": 0.2675946817596594 + }, + { + "epoch": 1.0257988859571974, + "grad_norm": 0.7858420635565981, + "learning_rate": 0.00019999909069581698, + "loss": 3.2957398891448975, + "step": 1750, + "token_acc": 0.2678634647537192 + }, + { + "epoch": 1.0263852242744063, + "grad_norm": 1.0644351666311864, + "learning_rate": 0.000199999048894193, + "loss": 3.276577949523926, + "step": 1751, + "token_acc": 0.2719083224269991 + }, + { + "epoch": 1.0269715625916154, + "grad_norm": 1.0826172294506995, + "learning_rate": 0.00019999900615321275, + "loss": 3.2733945846557617, + "step": 1752, + "token_acc": 0.27152142087227915 + }, + { + "epoch": 1.0275579009088245, + "grad_norm": 0.910384673197123, + "learning_rate": 0.00019999896247287655, + "loss": 3.290496349334717, + "step": 1753, + "token_acc": 0.26887546914838023 + }, + { + "epoch": 1.0281442392260334, + "grad_norm": 1.2049532798048312, + "learning_rate": 0.00019999891785318485, + "loss": 3.284515142440796, + "step": 1754, + "token_acc": 0.2718623126829334 + }, + { + "epoch": 1.0287305775432425, + "grad_norm": 1.3463184648379212, + "learning_rate": 0.00019999887229413808, + "loss": 3.3190548419952393, + "step": 1755, + "token_acc": 0.26470618212580327 + }, + { + "epoch": 1.0293169158604514, + "grad_norm": 0.9365315302293586, + "learning_rate": 0.00019999882579573662, + "loss": 3.2841455936431885, + "step": 1756, + "token_acc": 0.2714503637956142 + }, + { + "epoch": 1.0299032541776605, + "grad_norm": 0.8056751332984439, + "learning_rate": 0.00019999877835798097, + "loss": 3.294391632080078, + "step": 1757, + "token_acc": 0.269180780278424 + }, + { + "epoch": 1.0304895924948696, + "grad_norm": 0.807073807025428, + "learning_rate": 0.00019999872998087157, + "loss": 3.2585020065307617, + "step": 1758, + "token_acc": 0.2738279981233534 + }, + { + "epoch": 1.0310759308120785, + "grad_norm": 0.9188963200611165, + "learning_rate": 0.00019999868066440882, + "loss": 3.3134231567382812, + "step": 1759, + "token_acc": 0.2672857108687968 + }, + { + "epoch": 1.0316622691292876, + "grad_norm": 1.0913130207182808, + "learning_rate": 0.00019999863040859323, + "loss": 3.2983882427215576, + "step": 1760, + "token_acc": 0.267619630223296 + }, + { + "epoch": 1.0322486074464967, + "grad_norm": 1.1510614450793855, + "learning_rate": 0.00019999857921342522, + "loss": 3.3039979934692383, + "step": 1761, + "token_acc": 0.2680319802876973 + }, + { + "epoch": 1.0328349457637056, + "grad_norm": 0.9742543785483267, + "learning_rate": 0.00019999852707890537, + "loss": 3.297025680541992, + "step": 1762, + "token_acc": 0.2686513171894284 + }, + { + "epoch": 1.0334212840809147, + "grad_norm": 0.975133901184013, + "learning_rate": 0.00019999847400503407, + "loss": 3.3022875785827637, + "step": 1763, + "token_acc": 0.2681696764678582 + }, + { + "epoch": 1.0340076223981236, + "grad_norm": 1.1341862309020048, + "learning_rate": 0.00019999841999181184, + "loss": 3.3030343055725098, + "step": 1764, + "token_acc": 0.2676047242141809 + }, + { + "epoch": 1.0345939607153327, + "grad_norm": 1.1853320474359959, + "learning_rate": 0.00019999836503923924, + "loss": 3.260789155960083, + "step": 1765, + "token_acc": 0.273162407391992 + }, + { + "epoch": 1.0351802990325418, + "grad_norm": 1.0721449050596954, + "learning_rate": 0.00019999830914731672, + "loss": 3.257016181945801, + "step": 1766, + "token_acc": 0.27417226188508353 + }, + { + "epoch": 1.0357666373497507, + "grad_norm": 0.8758866062121071, + "learning_rate": 0.00019999825231604485, + "loss": 3.293463706970215, + "step": 1767, + "token_acc": 0.2697982297340385 + }, + { + "epoch": 1.0363529756669598, + "grad_norm": 0.9709334656668901, + "learning_rate": 0.00019999819454542415, + "loss": 3.3248915672302246, + "step": 1768, + "token_acc": 0.26673016544960665 + }, + { + "epoch": 1.036939313984169, + "grad_norm": 1.158069377903993, + "learning_rate": 0.00019999813583545514, + "loss": 3.2606422901153564, + "step": 1769, + "token_acc": 0.27214280169910254 + }, + { + "epoch": 1.0375256523013778, + "grad_norm": 0.9039828439206207, + "learning_rate": 0.0001999980761861384, + "loss": 3.2774105072021484, + "step": 1770, + "token_acc": 0.27073095704008543 + }, + { + "epoch": 1.038111990618587, + "grad_norm": 0.8163781196590087, + "learning_rate": 0.00019999801559747452, + "loss": 3.2986931800842285, + "step": 1771, + "token_acc": 0.2703006784472543 + }, + { + "epoch": 1.038698328935796, + "grad_norm": 1.0642042112355978, + "learning_rate": 0.000199997954069464, + "loss": 3.329517364501953, + "step": 1772, + "token_acc": 0.2645986330401327 + }, + { + "epoch": 1.039284667253005, + "grad_norm": 1.0848399924353889, + "learning_rate": 0.00019999789160210746, + "loss": 3.3166861534118652, + "step": 1773, + "token_acc": 0.2661930023498191 + }, + { + "epoch": 1.039871005570214, + "grad_norm": 0.9107319036886131, + "learning_rate": 0.0001999978281954055, + "loss": 3.263587236404419, + "step": 1774, + "token_acc": 0.27257831783802067 + }, + { + "epoch": 1.040457343887423, + "grad_norm": 1.1628462014535277, + "learning_rate": 0.00019999776384935865, + "loss": 3.2985498905181885, + "step": 1775, + "token_acc": 0.26976071607855084 + }, + { + "epoch": 1.041043682204632, + "grad_norm": 1.0262176865398889, + "learning_rate": 0.0001999976985639676, + "loss": 3.3277931213378906, + "step": 1776, + "token_acc": 0.26588533180912494 + }, + { + "epoch": 1.0416300205218412, + "grad_norm": 0.9108082402598152, + "learning_rate": 0.00019999763233923289, + "loss": 3.295567512512207, + "step": 1777, + "token_acc": 0.2681652723673287 + }, + { + "epoch": 1.04221635883905, + "grad_norm": 0.7922922566611076, + "learning_rate": 0.00019999756517515515, + "loss": 3.3071703910827637, + "step": 1778, + "token_acc": 0.2682451743505674 + }, + { + "epoch": 1.0428026971562592, + "grad_norm": 1.018941359511883, + "learning_rate": 0.00019999749707173508, + "loss": 3.277599334716797, + "step": 1779, + "token_acc": 0.27027566350238036 + }, + { + "epoch": 1.0433890354734683, + "grad_norm": 1.141157542909064, + "learning_rate": 0.00019999742802897326, + "loss": 3.316535711288452, + "step": 1780, + "token_acc": 0.26503940615143745 + }, + { + "epoch": 1.0439753737906772, + "grad_norm": 0.8583974318986433, + "learning_rate": 0.00019999735804687035, + "loss": 3.2898850440979004, + "step": 1781, + "token_acc": 0.2694515670782688 + }, + { + "epoch": 1.0445617121078863, + "grad_norm": 0.799766314629756, + "learning_rate": 0.000199997287125427, + "loss": 3.2240023612976074, + "step": 1782, + "token_acc": 0.27754897839715503 + }, + { + "epoch": 1.0451480504250952, + "grad_norm": 0.7868300315672145, + "learning_rate": 0.00019999721526464388, + "loss": 3.294559955596924, + "step": 1783, + "token_acc": 0.27013611583264885 + }, + { + "epoch": 1.0457343887423043, + "grad_norm": 0.910621209933701, + "learning_rate": 0.00019999714246452167, + "loss": 3.249177932739258, + "step": 1784, + "token_acc": 0.27533909017691893 + }, + { + "epoch": 1.0463207270595134, + "grad_norm": 0.9453268505360458, + "learning_rate": 0.00019999706872506109, + "loss": 3.3015735149383545, + "step": 1785, + "token_acc": 0.2685105329668182 + }, + { + "epoch": 1.0469070653767223, + "grad_norm": 1.0734196248765913, + "learning_rate": 0.00019999699404626278, + "loss": 3.3147549629211426, + "step": 1786, + "token_acc": 0.26804633536700084 + }, + { + "epoch": 1.0474934036939314, + "grad_norm": 0.8869495141916742, + "learning_rate": 0.00019999691842812744, + "loss": 3.3309764862060547, + "step": 1787, + "token_acc": 0.2650002479271981 + }, + { + "epoch": 1.0480797420111405, + "grad_norm": 0.7685706828134822, + "learning_rate": 0.00019999684187065584, + "loss": 3.2902088165283203, + "step": 1788, + "token_acc": 0.2699603973251964 + }, + { + "epoch": 1.0486660803283494, + "grad_norm": 0.9020925336833913, + "learning_rate": 0.0001999967643738486, + "loss": 3.3249220848083496, + "step": 1789, + "token_acc": 0.26559681807510116 + }, + { + "epoch": 1.0492524186455585, + "grad_norm": 0.858729389464275, + "learning_rate": 0.00019999668593770654, + "loss": 3.266615390777588, + "step": 1790, + "token_acc": 0.2732078374609987 + }, + { + "epoch": 1.0498387569627674, + "grad_norm": 1.1730061251275634, + "learning_rate": 0.00019999660656223038, + "loss": 3.3166589736938477, + "step": 1791, + "token_acc": 0.26601517773931566 + }, + { + "epoch": 1.0504250952799765, + "grad_norm": 1.053190266511645, + "learning_rate": 0.00019999652624742083, + "loss": 3.2984392642974854, + "step": 1792, + "token_acc": 0.2692320630558596 + }, + { + "epoch": 1.0510114335971856, + "grad_norm": 0.7347243315133648, + "learning_rate": 0.00019999644499327866, + "loss": 3.247314453125, + "step": 1793, + "token_acc": 0.27509218527623236 + }, + { + "epoch": 1.0515977719143945, + "grad_norm": 0.8659809903170016, + "learning_rate": 0.00019999636279980463, + "loss": 3.308168411254883, + "step": 1794, + "token_acc": 0.26753872361826386 + }, + { + "epoch": 1.0521841102316036, + "grad_norm": 0.6952421235492701, + "learning_rate": 0.00019999627966699952, + "loss": 3.288480758666992, + "step": 1795, + "token_acc": 0.2701284869957663 + }, + { + "epoch": 1.0527704485488127, + "grad_norm": 0.820818645842747, + "learning_rate": 0.00019999619559486412, + "loss": 3.269139289855957, + "step": 1796, + "token_acc": 0.2713431240959616 + }, + { + "epoch": 1.0533567868660216, + "grad_norm": 0.8511836269489489, + "learning_rate": 0.0001999961105833992, + "loss": 3.262439250946045, + "step": 1797, + "token_acc": 0.27265030078449615 + }, + { + "epoch": 1.0539431251832307, + "grad_norm": 0.8689921812019348, + "learning_rate": 0.00019999602463260555, + "loss": 3.311861515045166, + "step": 1798, + "token_acc": 0.2675016376260972 + }, + { + "epoch": 1.0545294635004399, + "grad_norm": 1.0833239741221916, + "learning_rate": 0.00019999593774248405, + "loss": 3.2748827934265137, + "step": 1799, + "token_acc": 0.2728999686389833 + }, + { + "epoch": 1.0551158018176487, + "grad_norm": 1.2047799553899454, + "learning_rate": 0.0001999958499130354, + "loss": 3.268369674682617, + "step": 1800, + "token_acc": 0.2714039075076445 + }, + { + "epoch": 1.0557021401348579, + "grad_norm": 0.703747546598522, + "learning_rate": 0.00019999576114426053, + "loss": 3.3230302333831787, + "step": 1801, + "token_acc": 0.26502325605472743 + }, + { + "epoch": 1.0562884784520667, + "grad_norm": 0.8702269717656477, + "learning_rate": 0.0001999956714361602, + "loss": 3.3232264518737793, + "step": 1802, + "token_acc": 0.26534232662588064 + }, + { + "epoch": 1.0568748167692759, + "grad_norm": 1.362888073173297, + "learning_rate": 0.00019999558078873531, + "loss": 3.3137786388397217, + "step": 1803, + "token_acc": 0.2652842483969347 + }, + { + "epoch": 1.057461155086485, + "grad_norm": 1.0817603877332413, + "learning_rate": 0.00019999548920198668, + "loss": 3.3073887825012207, + "step": 1804, + "token_acc": 0.2677067626484421 + }, + { + "epoch": 1.0580474934036939, + "grad_norm": 1.008988663572395, + "learning_rate": 0.00019999539667591516, + "loss": 3.2882063388824463, + "step": 1805, + "token_acc": 0.2694314283704975 + }, + { + "epoch": 1.058633831720903, + "grad_norm": 1.036650086221023, + "learning_rate": 0.00019999530321052162, + "loss": 3.276179790496826, + "step": 1806, + "token_acc": 0.2710957839782997 + }, + { + "epoch": 1.059220170038112, + "grad_norm": 1.2501711170050571, + "learning_rate": 0.00019999520880580697, + "loss": 3.2764010429382324, + "step": 1807, + "token_acc": 0.26930443521865643 + }, + { + "epoch": 1.059806508355321, + "grad_norm": 0.7549989365623009, + "learning_rate": 0.00019999511346177208, + "loss": 3.2647905349731445, + "step": 1808, + "token_acc": 0.27399514922327 + }, + { + "epoch": 1.06039284667253, + "grad_norm": 1.1043727468325106, + "learning_rate": 0.00019999501717841785, + "loss": 3.296976327896118, + "step": 1809, + "token_acc": 0.2662570815052756 + }, + { + "epoch": 1.060979184989739, + "grad_norm": 0.8399871366697835, + "learning_rate": 0.00019999491995574515, + "loss": 3.29217529296875, + "step": 1810, + "token_acc": 0.26831980887259543 + }, + { + "epoch": 1.061565523306948, + "grad_norm": 0.791199311126801, + "learning_rate": 0.00019999482179375498, + "loss": 3.291705369949341, + "step": 1811, + "token_acc": 0.268534645772606 + }, + { + "epoch": 1.0621518616241572, + "grad_norm": 0.9071268839262596, + "learning_rate": 0.00019999472269244815, + "loss": 3.27934193611145, + "step": 1812, + "token_acc": 0.27001719356193726 + }, + { + "epoch": 1.062738199941366, + "grad_norm": 0.9246439977527963, + "learning_rate": 0.00019999462265182566, + "loss": 3.2667436599731445, + "step": 1813, + "token_acc": 0.27299279065171583 + }, + { + "epoch": 1.0633245382585752, + "grad_norm": 1.0563031336108097, + "learning_rate": 0.00019999452167188844, + "loss": 3.3085427284240723, + "step": 1814, + "token_acc": 0.2687913473201375 + }, + { + "epoch": 1.0639108765757843, + "grad_norm": 1.285219725925068, + "learning_rate": 0.00019999441975263743, + "loss": 3.289022445678711, + "step": 1815, + "token_acc": 0.2705862229325579 + }, + { + "epoch": 1.0644972148929932, + "grad_norm": 1.055197371119659, + "learning_rate": 0.0001999943168940736, + "loss": 3.2957544326782227, + "step": 1816, + "token_acc": 0.268383770728044 + }, + { + "epoch": 1.0650835532102023, + "grad_norm": 0.8367355036894132, + "learning_rate": 0.00019999421309619788, + "loss": 3.2617506980895996, + "step": 1817, + "token_acc": 0.27230388703642827 + }, + { + "epoch": 1.0656698915274112, + "grad_norm": 1.1100409339317048, + "learning_rate": 0.0001999941083590113, + "loss": 3.2972044944763184, + "step": 1818, + "token_acc": 0.2690728317053265 + }, + { + "epoch": 1.0662562298446203, + "grad_norm": 0.9979100975216163, + "learning_rate": 0.0001999940026825148, + "loss": 3.30416202545166, + "step": 1819, + "token_acc": 0.26464092233246855 + }, + { + "epoch": 1.0668425681618294, + "grad_norm": 1.1617221906558397, + "learning_rate": 0.00019999389606670937, + "loss": 3.303950786590576, + "step": 1820, + "token_acc": 0.26752964638011467 + }, + { + "epoch": 1.0674289064790383, + "grad_norm": 0.8527361789114442, + "learning_rate": 0.00019999378851159606, + "loss": 3.2986960411071777, + "step": 1821, + "token_acc": 0.2685309317645327 + }, + { + "epoch": 1.0680152447962474, + "grad_norm": 0.7592424934168102, + "learning_rate": 0.00019999368001717585, + "loss": 3.2949836254119873, + "step": 1822, + "token_acc": 0.2695125763386424 + }, + { + "epoch": 1.0686015831134565, + "grad_norm": 0.7034165253275559, + "learning_rate": 0.00019999357058344975, + "loss": 3.260822296142578, + "step": 1823, + "token_acc": 0.2720663221610568 + }, + { + "epoch": 1.0691879214306654, + "grad_norm": 0.8712614654449726, + "learning_rate": 0.0001999934602104188, + "loss": 3.300623893737793, + "step": 1824, + "token_acc": 0.26847036711493205 + }, + { + "epoch": 1.0697742597478745, + "grad_norm": 0.9007256456017521, + "learning_rate": 0.00019999334889808404, + "loss": 3.2684574127197266, + "step": 1825, + "token_acc": 0.2723802977407159 + }, + { + "epoch": 1.0703605980650837, + "grad_norm": 0.7507976999403495, + "learning_rate": 0.00019999323664644648, + "loss": 3.2420732975006104, + "step": 1826, + "token_acc": 0.27524754026446074 + }, + { + "epoch": 1.0709469363822925, + "grad_norm": 0.810335357863364, + "learning_rate": 0.00019999312345550725, + "loss": 3.233001708984375, + "step": 1827, + "token_acc": 0.2759885658568814 + }, + { + "epoch": 1.0715332746995017, + "grad_norm": 0.9445326861640987, + "learning_rate": 0.00019999300932526735, + "loss": 3.2508413791656494, + "step": 1828, + "token_acc": 0.2707376405760607 + }, + { + "epoch": 1.0721196130167105, + "grad_norm": 0.8669637829950226, + "learning_rate": 0.00019999289425572786, + "loss": 3.2938332557678223, + "step": 1829, + "token_acc": 0.2667393692262709 + }, + { + "epoch": 1.0727059513339197, + "grad_norm": 1.124917753942543, + "learning_rate": 0.00019999277824688986, + "loss": 3.3348937034606934, + "step": 1830, + "token_acc": 0.2609762902404251 + }, + { + "epoch": 1.0732922896511288, + "grad_norm": 1.060397605529051, + "learning_rate": 0.00019999266129875446, + "loss": 3.2559690475463867, + "step": 1831, + "token_acc": 0.27436314305923587 + }, + { + "epoch": 1.0738786279683377, + "grad_norm": 1.3091179763595335, + "learning_rate": 0.0001999925434113228, + "loss": 3.2178902626037598, + "step": 1832, + "token_acc": 0.2782505960152876 + }, + { + "epoch": 1.0744649662855468, + "grad_norm": 1.007301994772881, + "learning_rate": 0.00019999242458459588, + "loss": 3.2721283435821533, + "step": 1833, + "token_acc": 0.27125916679669215 + }, + { + "epoch": 1.0750513046027559, + "grad_norm": 1.1357547058198343, + "learning_rate": 0.00019999230481857486, + "loss": 3.2517542839050293, + "step": 1834, + "token_acc": 0.2733259049680942 + }, + { + "epoch": 1.0756376429199648, + "grad_norm": 0.8691403991492244, + "learning_rate": 0.0001999921841132609, + "loss": 3.3080880641937256, + "step": 1835, + "token_acc": 0.26807200439487433 + }, + { + "epoch": 1.0762239812371739, + "grad_norm": 0.7002440657151844, + "learning_rate": 0.00019999206246865513, + "loss": 3.2627310752868652, + "step": 1836, + "token_acc": 0.2732244404741222 + }, + { + "epoch": 1.0768103195543828, + "grad_norm": 0.8886735739645918, + "learning_rate": 0.00019999193988475865, + "loss": 3.3120410442352295, + "step": 1837, + "token_acc": 0.26791576631874814 + }, + { + "epoch": 1.077396657871592, + "grad_norm": 1.330173506907914, + "learning_rate": 0.00019999181636157264, + "loss": 3.269960880279541, + "step": 1838, + "token_acc": 0.27057833744945314 + }, + { + "epoch": 1.077982996188801, + "grad_norm": 0.7155288492670615, + "learning_rate": 0.00019999169189909827, + "loss": 3.293656826019287, + "step": 1839, + "token_acc": 0.26868622646776047 + }, + { + "epoch": 1.07856933450601, + "grad_norm": 1.022679874427088, + "learning_rate": 0.00019999156649733667, + "loss": 3.3083438873291016, + "step": 1840, + "token_acc": 0.26548180989068626 + }, + { + "epoch": 1.079155672823219, + "grad_norm": 1.1952614128090076, + "learning_rate": 0.00019999144015628905, + "loss": 3.256667137145996, + "step": 1841, + "token_acc": 0.27316613772995846 + }, + { + "epoch": 1.0797420111404281, + "grad_norm": 0.6980783547281257, + "learning_rate": 0.0001999913128759566, + "loss": 3.262158155441284, + "step": 1842, + "token_acc": 0.2726297984282676 + }, + { + "epoch": 1.080328349457637, + "grad_norm": 0.972063667146567, + "learning_rate": 0.00019999118465634051, + "loss": 3.3121566772460938, + "step": 1843, + "token_acc": 0.2663731749262482 + }, + { + "epoch": 1.0809146877748461, + "grad_norm": 1.219059018842931, + "learning_rate": 0.00019999105549744196, + "loss": 3.2812654972076416, + "step": 1844, + "token_acc": 0.2690675897090897 + }, + { + "epoch": 1.081501026092055, + "grad_norm": 0.8471561043926981, + "learning_rate": 0.0001999909253992622, + "loss": 3.266209125518799, + "step": 1845, + "token_acc": 0.2721826676317694 + }, + { + "epoch": 1.0820873644092641, + "grad_norm": 0.6927929587665244, + "learning_rate": 0.00019999079436180245, + "loss": 3.221024751663208, + "step": 1846, + "token_acc": 0.2784002218205718 + }, + { + "epoch": 1.0826737027264732, + "grad_norm": 0.8530468672856815, + "learning_rate": 0.0001999906623850639, + "loss": 3.2521119117736816, + "step": 1847, + "token_acc": 0.2743287859953603 + }, + { + "epoch": 1.0832600410436821, + "grad_norm": 1.053296951556091, + "learning_rate": 0.00019999052946904783, + "loss": 3.2244858741760254, + "step": 1848, + "token_acc": 0.2772258336714868 + }, + { + "epoch": 1.0838463793608912, + "grad_norm": 1.0873412752462581, + "learning_rate": 0.00019999039561375545, + "loss": 3.1957848072052, + "step": 1849, + "token_acc": 0.2817763057614455 + }, + { + "epoch": 1.0844327176781003, + "grad_norm": 0.893711618354491, + "learning_rate": 0.00019999026081918807, + "loss": 3.283287286758423, + "step": 1850, + "token_acc": 0.2702660709828274 + }, + { + "epoch": 1.0850190559953092, + "grad_norm": 0.72093050589574, + "learning_rate": 0.0001999901250853469, + "loss": 3.2703752517700195, + "step": 1851, + "token_acc": 0.2711028626304261 + }, + { + "epoch": 1.0856053943125183, + "grad_norm": 0.7998399852054139, + "learning_rate": 0.0001999899884122333, + "loss": 3.2745933532714844, + "step": 1852, + "token_acc": 0.2721095961381231 + }, + { + "epoch": 1.0861917326297275, + "grad_norm": 0.7439192601798515, + "learning_rate": 0.00019998985079984843, + "loss": 3.291837215423584, + "step": 1853, + "token_acc": 0.2682023104372479 + }, + { + "epoch": 1.0867780709469363, + "grad_norm": 0.9396164741457514, + "learning_rate": 0.0001999897122481937, + "loss": 3.313176155090332, + "step": 1854, + "token_acc": 0.26611744102036494 + }, + { + "epoch": 1.0873644092641455, + "grad_norm": 1.3512446163325371, + "learning_rate": 0.00019998957275727032, + "loss": 3.2301125526428223, + "step": 1855, + "token_acc": 0.27694490151951745 + }, + { + "epoch": 1.0879507475813543, + "grad_norm": 0.6356362614994411, + "learning_rate": 0.00019998943232707968, + "loss": 3.2800469398498535, + "step": 1856, + "token_acc": 0.2689310999918318 + }, + { + "epoch": 1.0885370858985635, + "grad_norm": 0.8873756071458211, + "learning_rate": 0.000199989290957623, + "loss": 3.335606575012207, + "step": 1857, + "token_acc": 0.2634876940533554 + }, + { + "epoch": 1.0891234242157726, + "grad_norm": 1.490022148237173, + "learning_rate": 0.00019998914864890175, + "loss": 3.2807559967041016, + "step": 1858, + "token_acc": 0.26815423812720474 + }, + { + "epoch": 1.0897097625329815, + "grad_norm": 0.8737319010951705, + "learning_rate": 0.00019998900540091713, + "loss": 3.283431053161621, + "step": 1859, + "token_acc": 0.26906003232567494 + }, + { + "epoch": 1.0902961008501906, + "grad_norm": 1.4000082887324203, + "learning_rate": 0.00019998886121367056, + "loss": 3.2860045433044434, + "step": 1860, + "token_acc": 0.26787274709833064 + }, + { + "epoch": 1.0908824391673997, + "grad_norm": 0.7021338758272468, + "learning_rate": 0.00019998871608716337, + "loss": 3.3341658115386963, + "step": 1861, + "token_acc": 0.2641494583682047 + }, + { + "epoch": 1.0914687774846086, + "grad_norm": 0.9623430337957842, + "learning_rate": 0.00019998857002139693, + "loss": 3.266134262084961, + "step": 1862, + "token_acc": 0.27455271220134675 + }, + { + "epoch": 1.0920551158018177, + "grad_norm": 0.6703763160659446, + "learning_rate": 0.00019998842301637262, + "loss": 3.284419536590576, + "step": 1863, + "token_acc": 0.2699447357233952 + }, + { + "epoch": 1.0926414541190266, + "grad_norm": 0.7072274104346352, + "learning_rate": 0.0001999882750720918, + "loss": 3.2228569984436035, + "step": 1864, + "token_acc": 0.2781263660692391 + }, + { + "epoch": 1.0932277924362357, + "grad_norm": 0.7321361759974758, + "learning_rate": 0.00019998812618855587, + "loss": 3.2342801094055176, + "step": 1865, + "token_acc": 0.27553758028469605 + }, + { + "epoch": 1.0938141307534448, + "grad_norm": 0.8849821970747961, + "learning_rate": 0.00019998797636576625, + "loss": 3.2230300903320312, + "step": 1866, + "token_acc": 0.2776290101997139 + }, + { + "epoch": 1.0944004690706537, + "grad_norm": 0.944092279363582, + "learning_rate": 0.0001999878256037243, + "loss": 3.2624435424804688, + "step": 1867, + "token_acc": 0.27430302545008073 + }, + { + "epoch": 1.0949868073878628, + "grad_norm": 1.0826813034969316, + "learning_rate": 0.00019998767390243147, + "loss": 3.30016827583313, + "step": 1868, + "token_acc": 0.26709118682616967 + }, + { + "epoch": 1.095573145705072, + "grad_norm": 1.1165912813478263, + "learning_rate": 0.00019998752126188917, + "loss": 3.2919421195983887, + "step": 1869, + "token_acc": 0.2697934673588236 + }, + { + "epoch": 1.0961594840222808, + "grad_norm": 1.0671160764336696, + "learning_rate": 0.00019998736768209887, + "loss": 3.3012094497680664, + "step": 1870, + "token_acc": 0.2694465171167453 + }, + { + "epoch": 1.09674582233949, + "grad_norm": 1.3854577877137213, + "learning_rate": 0.00019998721316306196, + "loss": 3.2641139030456543, + "step": 1871, + "token_acc": 0.27333703783307733 + }, + { + "epoch": 1.0973321606566988, + "grad_norm": 0.816386087943341, + "learning_rate": 0.00019998705770477994, + "loss": 3.2627835273742676, + "step": 1872, + "token_acc": 0.27380241122280463 + }, + { + "epoch": 1.097918498973908, + "grad_norm": 1.089044996735678, + "learning_rate": 0.00019998690130725426, + "loss": 3.245826244354248, + "step": 1873, + "token_acc": 0.27344066893427044 + }, + { + "epoch": 1.098504837291117, + "grad_norm": 1.2041683248652024, + "learning_rate": 0.00019998674397048632, + "loss": 3.2560391426086426, + "step": 1874, + "token_acc": 0.2755694643171402 + }, + { + "epoch": 1.099091175608326, + "grad_norm": 1.0372118770152572, + "learning_rate": 0.00019998658569447773, + "loss": 3.272392749786377, + "step": 1875, + "token_acc": 0.2698370931562452 + }, + { + "epoch": 1.099677513925535, + "grad_norm": 1.175723248517185, + "learning_rate": 0.00019998642647922984, + "loss": 3.3034539222717285, + "step": 1876, + "token_acc": 0.26746391901179145 + }, + { + "epoch": 1.1002638522427441, + "grad_norm": 1.0801960673699682, + "learning_rate": 0.00019998626632474422, + "loss": 3.2207512855529785, + "step": 1877, + "token_acc": 0.2797358622447505 + }, + { + "epoch": 1.100850190559953, + "grad_norm": 1.1218411404944004, + "learning_rate": 0.00019998610523102236, + "loss": 3.2619869709014893, + "step": 1878, + "token_acc": 0.2740995549412434 + }, + { + "epoch": 1.1014365288771621, + "grad_norm": 0.978740176291759, + "learning_rate": 0.00019998594319806578, + "loss": 3.3516883850097656, + "step": 1879, + "token_acc": 0.25971205559600913 + }, + { + "epoch": 1.1020228671943713, + "grad_norm": 1.0325344995137953, + "learning_rate": 0.000199985780225876, + "loss": 3.2373228073120117, + "step": 1880, + "token_acc": 0.2749891017429188 + }, + { + "epoch": 1.1026092055115801, + "grad_norm": 1.0222006803207744, + "learning_rate": 0.00019998561631445457, + "loss": 3.3411693572998047, + "step": 1881, + "token_acc": 0.26362470444181724 + }, + { + "epoch": 1.1031955438287893, + "grad_norm": 1.264597044475318, + "learning_rate": 0.00019998545146380296, + "loss": 3.251077175140381, + "step": 1882, + "token_acc": 0.27242163349768794 + }, + { + "epoch": 1.1037818821459981, + "grad_norm": 0.8723940992647987, + "learning_rate": 0.0001999852856739228, + "loss": 3.277228832244873, + "step": 1883, + "token_acc": 0.27135951569103867 + }, + { + "epoch": 1.1043682204632073, + "grad_norm": 0.9850632801138791, + "learning_rate": 0.0001999851189448156, + "loss": 3.322584629058838, + "step": 1884, + "token_acc": 0.2658655524353838 + }, + { + "epoch": 1.1049545587804164, + "grad_norm": 0.9902402188049431, + "learning_rate": 0.00019998495127648293, + "loss": 3.2704763412475586, + "step": 1885, + "token_acc": 0.2714962568155683 + }, + { + "epoch": 1.1055408970976253, + "grad_norm": 0.9124318344783483, + "learning_rate": 0.00019998478266892636, + "loss": 3.260925769805908, + "step": 1886, + "token_acc": 0.273031196589223 + }, + { + "epoch": 1.1061272354148344, + "grad_norm": 0.9050821086194911, + "learning_rate": 0.00019998461312214754, + "loss": 3.261286735534668, + "step": 1887, + "token_acc": 0.27357934417540575 + }, + { + "epoch": 1.1067135737320435, + "grad_norm": 0.7104105119569273, + "learning_rate": 0.000199984442636148, + "loss": 3.2960314750671387, + "step": 1888, + "token_acc": 0.2684728940323172 + }, + { + "epoch": 1.1072999120492524, + "grad_norm": 0.8027407709505193, + "learning_rate": 0.0001999842712109293, + "loss": 3.2970781326293945, + "step": 1889, + "token_acc": 0.2678171615045574 + }, + { + "epoch": 1.1078862503664615, + "grad_norm": 0.7217541932961473, + "learning_rate": 0.00019998409884649317, + "loss": 3.2484469413757324, + "step": 1890, + "token_acc": 0.2740503407640718 + }, + { + "epoch": 1.1084725886836704, + "grad_norm": 0.5599878746408855, + "learning_rate": 0.0001999839255428411, + "loss": 3.2927956581115723, + "step": 1891, + "token_acc": 0.26801284754215277 + }, + { + "epoch": 1.1090589270008795, + "grad_norm": 0.8896501995553592, + "learning_rate": 0.00019998375129997483, + "loss": 3.266360282897949, + "step": 1892, + "token_acc": 0.27072063507773086 + }, + { + "epoch": 1.1096452653180886, + "grad_norm": 0.7384035138966728, + "learning_rate": 0.00019998357611789592, + "loss": 3.2611944675445557, + "step": 1893, + "token_acc": 0.2720865153258709 + }, + { + "epoch": 1.1102316036352975, + "grad_norm": 0.6556593809311072, + "learning_rate": 0.00019998339999660605, + "loss": 3.2644712924957275, + "step": 1894, + "token_acc": 0.2736169697406971 + }, + { + "epoch": 1.1108179419525066, + "grad_norm": 0.9243238387160311, + "learning_rate": 0.00019998322293610684, + "loss": 3.277118682861328, + "step": 1895, + "token_acc": 0.269430762896659 + }, + { + "epoch": 1.1114042802697157, + "grad_norm": 0.9078650178153008, + "learning_rate": 0.00019998304493640002, + "loss": 3.2808914184570312, + "step": 1896, + "token_acc": 0.27006933621704143 + }, + { + "epoch": 1.1119906185869246, + "grad_norm": 0.9420049031876414, + "learning_rate": 0.0001999828659974872, + "loss": 3.264310121536255, + "step": 1897, + "token_acc": 0.27259516087132096 + }, + { + "epoch": 1.1125769569041337, + "grad_norm": 1.1332686217221968, + "learning_rate": 0.0001999826861193701, + "loss": 3.2504005432128906, + "step": 1898, + "token_acc": 0.2756216605566724 + }, + { + "epoch": 1.1131632952213426, + "grad_norm": 0.9309216392145607, + "learning_rate": 0.00019998250530205036, + "loss": 3.2979202270507812, + "step": 1899, + "token_acc": 0.2698678803039699 + }, + { + "epoch": 1.1137496335385517, + "grad_norm": 0.879448792237718, + "learning_rate": 0.00019998232354552972, + "loss": 3.215097427368164, + "step": 1900, + "token_acc": 0.27841257569900785 + }, + { + "epoch": 1.1143359718557608, + "grad_norm": 0.7579504968972564, + "learning_rate": 0.0001999821408498099, + "loss": 3.303497791290283, + "step": 1901, + "token_acc": 0.26813595729057516 + }, + { + "epoch": 1.1149223101729697, + "grad_norm": 0.763066652633548, + "learning_rate": 0.00019998195721489256, + "loss": 3.2818799018859863, + "step": 1902, + "token_acc": 0.27061246618089946 + }, + { + "epoch": 1.1155086484901788, + "grad_norm": 1.0251203513603857, + "learning_rate": 0.00019998177264077952, + "loss": 3.235461711883545, + "step": 1903, + "token_acc": 0.2764125273168302 + }, + { + "epoch": 1.116094986807388, + "grad_norm": 0.9161793449526694, + "learning_rate": 0.00019998158712747238, + "loss": 3.3044650554656982, + "step": 1904, + "token_acc": 0.2677502962283379 + }, + { + "epoch": 1.1166813251245968, + "grad_norm": 0.5852962554527037, + "learning_rate": 0.000199981400674973, + "loss": 3.2016355991363525, + "step": 1905, + "token_acc": 0.2807474966253843 + }, + { + "epoch": 1.117267663441806, + "grad_norm": 0.7826824908010281, + "learning_rate": 0.0001999812132832831, + "loss": 3.2903404235839844, + "step": 1906, + "token_acc": 0.2689765607368161 + }, + { + "epoch": 1.117854001759015, + "grad_norm": 1.3109488063799364, + "learning_rate": 0.00019998102495240438, + "loss": 3.3185501098632812, + "step": 1907, + "token_acc": 0.26501113948812693 + }, + { + "epoch": 1.118440340076224, + "grad_norm": 0.7801529538690938, + "learning_rate": 0.0001999808356823387, + "loss": 3.2621328830718994, + "step": 1908, + "token_acc": 0.271211303397889 + }, + { + "epoch": 1.119026678393433, + "grad_norm": 0.9244284006318981, + "learning_rate": 0.00019998064547308776, + "loss": 3.225433349609375, + "step": 1909, + "token_acc": 0.2752866920349615 + }, + { + "epoch": 1.119613016710642, + "grad_norm": 0.8327282322235704, + "learning_rate": 0.0001999804543246534, + "loss": 3.2406206130981445, + "step": 1910, + "token_acc": 0.27458419847164844 + }, + { + "epoch": 1.120199355027851, + "grad_norm": 0.8573451591305861, + "learning_rate": 0.0001999802622370374, + "loss": 3.2883567810058594, + "step": 1911, + "token_acc": 0.2680903920669972 + }, + { + "epoch": 1.1207856933450602, + "grad_norm": 0.9516033291377468, + "learning_rate": 0.00019998006921024156, + "loss": 3.279902935028076, + "step": 1912, + "token_acc": 0.26893878170140073 + }, + { + "epoch": 1.121372031662269, + "grad_norm": 0.7209634110149035, + "learning_rate": 0.0001999798752442677, + "loss": 3.2574493885040283, + "step": 1913, + "token_acc": 0.27425602873268345 + }, + { + "epoch": 1.1219583699794782, + "grad_norm": 0.8473539207558439, + "learning_rate": 0.00019997968033911762, + "loss": 3.306807041168213, + "step": 1914, + "token_acc": 0.2668744631631088 + }, + { + "epoch": 1.1225447082966873, + "grad_norm": 0.9133499325320734, + "learning_rate": 0.00019997948449479317, + "loss": 3.2702248096466064, + "step": 1915, + "token_acc": 0.27176241939117085 + }, + { + "epoch": 1.1231310466138962, + "grad_norm": 1.2364189553174405, + "learning_rate": 0.0001999792877112962, + "loss": 3.271911144256592, + "step": 1916, + "token_acc": 0.26938964837458607 + }, + { + "epoch": 1.1237173849311053, + "grad_norm": 0.7910159191946837, + "learning_rate": 0.00019997908998862853, + "loss": 3.2572860717773438, + "step": 1917, + "token_acc": 0.2728228611084094 + }, + { + "epoch": 1.1243037232483142, + "grad_norm": 0.8390285877249307, + "learning_rate": 0.00019997889132679204, + "loss": 3.27970552444458, + "step": 1918, + "token_acc": 0.27095307252671735 + }, + { + "epoch": 1.1248900615655233, + "grad_norm": 0.7623328731020307, + "learning_rate": 0.00019997869172578862, + "loss": 3.295478582382202, + "step": 1919, + "token_acc": 0.2679587057931986 + }, + { + "epoch": 1.1254763998827324, + "grad_norm": 0.9580346972410974, + "learning_rate": 0.00019997849118562005, + "loss": 3.279165744781494, + "step": 1920, + "token_acc": 0.2711731792504322 + }, + { + "epoch": 1.1260627381999413, + "grad_norm": 1.3158502452665057, + "learning_rate": 0.00019997828970628833, + "loss": 3.2494115829467773, + "step": 1921, + "token_acc": 0.27161823030455445 + }, + { + "epoch": 1.1266490765171504, + "grad_norm": 0.6457485214399122, + "learning_rate": 0.00019997808728779525, + "loss": 3.2810144424438477, + "step": 1922, + "token_acc": 0.2719407215367957 + }, + { + "epoch": 1.1272354148343595, + "grad_norm": 0.7927644439403694, + "learning_rate": 0.0001999778839301428, + "loss": 3.2561545372009277, + "step": 1923, + "token_acc": 0.2740903478246925 + }, + { + "epoch": 1.1278217531515684, + "grad_norm": 1.1914709675638202, + "learning_rate": 0.00019997767963333285, + "loss": 3.2644879817962646, + "step": 1924, + "token_acc": 0.2732024267727836 + }, + { + "epoch": 1.1284080914687775, + "grad_norm": 0.74748013902585, + "learning_rate": 0.00019997747439736734, + "loss": 3.25892972946167, + "step": 1925, + "token_acc": 0.27457159469896436 + }, + { + "epoch": 1.1289944297859864, + "grad_norm": 0.7377013543754479, + "learning_rate": 0.00019997726822224815, + "loss": 3.27091121673584, + "step": 1926, + "token_acc": 0.27047361504245854 + }, + { + "epoch": 1.1295807681031955, + "grad_norm": 0.9968560683067745, + "learning_rate": 0.00019997706110797724, + "loss": 3.266421318054199, + "step": 1927, + "token_acc": 0.2703210360604249 + }, + { + "epoch": 1.1301671064204046, + "grad_norm": 0.9327762840593035, + "learning_rate": 0.00019997685305455658, + "loss": 3.2666382789611816, + "step": 1928, + "token_acc": 0.2724808558442495 + }, + { + "epoch": 1.1307534447376135, + "grad_norm": 1.089282866912289, + "learning_rate": 0.00019997664406198813, + "loss": 3.218949556350708, + "step": 1929, + "token_acc": 0.2790187156750647 + }, + { + "epoch": 1.1313397830548226, + "grad_norm": 0.8085210582791142, + "learning_rate": 0.0001999764341302738, + "loss": 3.2501697540283203, + "step": 1930, + "token_acc": 0.27364933396443303 + }, + { + "epoch": 1.1319261213720317, + "grad_norm": 0.6959949615508015, + "learning_rate": 0.00019997622325941555, + "loss": 3.2533717155456543, + "step": 1931, + "token_acc": 0.27409006175464495 + }, + { + "epoch": 1.1325124596892406, + "grad_norm": 0.8481027511717674, + "learning_rate": 0.00019997601144941546, + "loss": 3.2488253116607666, + "step": 1932, + "token_acc": 0.27500396786794934 + }, + { + "epoch": 1.1330987980064497, + "grad_norm": 0.841987039466926, + "learning_rate": 0.00019997579870027545, + "loss": 3.2770237922668457, + "step": 1933, + "token_acc": 0.2708072353565416 + }, + { + "epoch": 1.1336851363236589, + "grad_norm": 0.9206310826549635, + "learning_rate": 0.00019997558501199753, + "loss": 3.3043227195739746, + "step": 1934, + "token_acc": 0.26589753328979404 + }, + { + "epoch": 1.1342714746408677, + "grad_norm": 0.749045382110028, + "learning_rate": 0.0001999753703845837, + "loss": 3.241389751434326, + "step": 1935, + "token_acc": 0.2754146346646267 + }, + { + "epoch": 1.1348578129580769, + "grad_norm": 0.5879098678686154, + "learning_rate": 0.00019997515481803602, + "loss": 3.2266345024108887, + "step": 1936, + "token_acc": 0.27851853000304916 + }, + { + "epoch": 1.1354441512752858, + "grad_norm": 0.6078205697295972, + "learning_rate": 0.00019997493831235642, + "loss": 3.2653117179870605, + "step": 1937, + "token_acc": 0.27090985218280783 + }, + { + "epoch": 1.1360304895924949, + "grad_norm": 0.6744283889950183, + "learning_rate": 0.00019997472086754703, + "loss": 3.2614212036132812, + "step": 1938, + "token_acc": 0.2717488194407598 + }, + { + "epoch": 1.136616827909704, + "grad_norm": 0.7901297961529072, + "learning_rate": 0.00019997450248360985, + "loss": 3.300222396850586, + "step": 1939, + "token_acc": 0.2661325920398533 + }, + { + "epoch": 1.1372031662269129, + "grad_norm": 0.8473773438046425, + "learning_rate": 0.00019997428316054694, + "loss": 3.2864084243774414, + "step": 1940, + "token_acc": 0.2703568523467954 + }, + { + "epoch": 1.137789504544122, + "grad_norm": 1.1686955481581123, + "learning_rate": 0.00019997406289836033, + "loss": 3.2466187477111816, + "step": 1941, + "token_acc": 0.27399214843326036 + }, + { + "epoch": 1.1383758428613309, + "grad_norm": 0.9373845068040074, + "learning_rate": 0.00019997384169705214, + "loss": 3.2632014751434326, + "step": 1942, + "token_acc": 0.2723962530882955 + }, + { + "epoch": 1.13896218117854, + "grad_norm": 0.8803843651518309, + "learning_rate": 0.00019997361955662442, + "loss": 3.2998671531677246, + "step": 1943, + "token_acc": 0.26663300250174304 + }, + { + "epoch": 1.139548519495749, + "grad_norm": 0.8697430520871321, + "learning_rate": 0.00019997339647707924, + "loss": 3.315596103668213, + "step": 1944, + "token_acc": 0.2656780890695745 + }, + { + "epoch": 1.140134857812958, + "grad_norm": 0.9277301011220579, + "learning_rate": 0.00019997317245841877, + "loss": 3.2290682792663574, + "step": 1945, + "token_acc": 0.27639741113377964 + }, + { + "epoch": 1.140721196130167, + "grad_norm": 1.2182329091203932, + "learning_rate": 0.000199972947500645, + "loss": 3.2481350898742676, + "step": 1946, + "token_acc": 0.2749248382884388 + }, + { + "epoch": 1.1413075344473762, + "grad_norm": 0.9417842095689553, + "learning_rate": 0.00019997272160376012, + "loss": 3.309577703475952, + "step": 1947, + "token_acc": 0.2660076398665799 + }, + { + "epoch": 1.141893872764585, + "grad_norm": 0.8144209020152119, + "learning_rate": 0.00019997249476776626, + "loss": 3.2357430458068848, + "step": 1948, + "token_acc": 0.27612199380090463 + }, + { + "epoch": 1.1424802110817942, + "grad_norm": 0.695459287540676, + "learning_rate": 0.0001999722669926655, + "loss": 3.25854754447937, + "step": 1949, + "token_acc": 0.27435871082212676 + }, + { + "epoch": 1.1430665493990033, + "grad_norm": 0.8129316378146372, + "learning_rate": 0.00019997203827846, + "loss": 3.3170485496520996, + "step": 1950, + "token_acc": 0.2654045212951503 + }, + { + "epoch": 1.1436528877162122, + "grad_norm": 0.7584360415146316, + "learning_rate": 0.00019997180862515196, + "loss": 3.290923595428467, + "step": 1951, + "token_acc": 0.26751815149262753 + }, + { + "epoch": 1.1442392260334213, + "grad_norm": 0.6701358583432266, + "learning_rate": 0.00019997157803274346, + "loss": 3.2826781272888184, + "step": 1952, + "token_acc": 0.2691333738498118 + }, + { + "epoch": 1.1448255643506302, + "grad_norm": 0.6356742044277998, + "learning_rate": 0.00019997134650123668, + "loss": 3.253901720046997, + "step": 1953, + "token_acc": 0.27367806780997855 + }, + { + "epoch": 1.1454119026678393, + "grad_norm": 0.8124574061078009, + "learning_rate": 0.00019997111403063383, + "loss": 3.2431864738464355, + "step": 1954, + "token_acc": 0.2740222609343209 + }, + { + "epoch": 1.1459982409850484, + "grad_norm": 0.932957826122799, + "learning_rate": 0.00019997088062093706, + "loss": 3.2937915325164795, + "step": 1955, + "token_acc": 0.2693883749752535 + }, + { + "epoch": 1.1465845793022573, + "grad_norm": 0.8784728539545127, + "learning_rate": 0.00019997064627214861, + "loss": 3.301311492919922, + "step": 1956, + "token_acc": 0.2658231471674348 + }, + { + "epoch": 1.1471709176194664, + "grad_norm": 0.9398565733266478, + "learning_rate": 0.00019997041098427065, + "loss": 3.2685933113098145, + "step": 1957, + "token_acc": 0.2717892958881396 + }, + { + "epoch": 1.1477572559366755, + "grad_norm": 0.9989404646996672, + "learning_rate": 0.00019997017475730539, + "loss": 3.2379813194274902, + "step": 1958, + "token_acc": 0.2758472597261068 + }, + { + "epoch": 1.1483435942538844, + "grad_norm": 0.958986700205622, + "learning_rate": 0.00019996993759125502, + "loss": 3.2710471153259277, + "step": 1959, + "token_acc": 0.27277080243606366 + }, + { + "epoch": 1.1489299325710935, + "grad_norm": 0.6672534510044124, + "learning_rate": 0.0001999696994861218, + "loss": 3.2638556957244873, + "step": 1960, + "token_acc": 0.27444183698886954 + }, + { + "epoch": 1.1495162708883027, + "grad_norm": 0.8300351749240533, + "learning_rate": 0.000199969460441908, + "loss": 3.2085366249084473, + "step": 1961, + "token_acc": 0.2780671778079071 + }, + { + "epoch": 1.1501026092055116, + "grad_norm": 1.0218739127993308, + "learning_rate": 0.00019996922045861578, + "loss": 3.2892489433288574, + "step": 1962, + "token_acc": 0.2695970600838863 + }, + { + "epoch": 1.1506889475227207, + "grad_norm": 0.8623101638235369, + "learning_rate": 0.0001999689795362475, + "loss": 3.2662177085876465, + "step": 1963, + "token_acc": 0.27107829949305523 + }, + { + "epoch": 1.1512752858399296, + "grad_norm": 0.8477698554209213, + "learning_rate": 0.00019996873767480535, + "loss": 3.2148969173431396, + "step": 1964, + "token_acc": 0.278102336568739 + }, + { + "epoch": 1.1518616241571387, + "grad_norm": 0.8845595793807728, + "learning_rate": 0.00019996849487429158, + "loss": 3.2937536239624023, + "step": 1965, + "token_acc": 0.2692820363247805 + }, + { + "epoch": 1.1524479624743478, + "grad_norm": 0.985028635816655, + "learning_rate": 0.00019996825113470856, + "loss": 3.276862621307373, + "step": 1966, + "token_acc": 0.269620024570337 + }, + { + "epoch": 1.1530343007915567, + "grad_norm": 0.6553812430577929, + "learning_rate": 0.0001999680064560585, + "loss": 3.2651476860046387, + "step": 1967, + "token_acc": 0.2728559725768096 + }, + { + "epoch": 1.1536206391087658, + "grad_norm": 0.673768152013136, + "learning_rate": 0.00019996776083834375, + "loss": 3.2586464881896973, + "step": 1968, + "token_acc": 0.27199175735720266 + }, + { + "epoch": 1.1542069774259747, + "grad_norm": 0.7660411828513154, + "learning_rate": 0.00019996751428156658, + "loss": 3.212815284729004, + "step": 1969, + "token_acc": 0.2786290108724476 + }, + { + "epoch": 1.1547933157431838, + "grad_norm": 0.9964241024652095, + "learning_rate": 0.0001999672667857293, + "loss": 3.218221664428711, + "step": 1970, + "token_acc": 0.2782306018854242 + }, + { + "epoch": 1.155379654060393, + "grad_norm": 0.8397888760741828, + "learning_rate": 0.00019996701835083428, + "loss": 3.3010244369506836, + "step": 1971, + "token_acc": 0.26843564143276194 + }, + { + "epoch": 1.1559659923776018, + "grad_norm": 0.7689723436886481, + "learning_rate": 0.00019996676897688384, + "loss": 3.2293550968170166, + "step": 1972, + "token_acc": 0.27704683934769464 + }, + { + "epoch": 1.156552330694811, + "grad_norm": 0.853284027378518, + "learning_rate": 0.0001999665186638803, + "loss": 3.2223005294799805, + "step": 1973, + "token_acc": 0.27791917861510806 + }, + { + "epoch": 1.15713866901202, + "grad_norm": 0.9891626363442542, + "learning_rate": 0.00019996626741182602, + "loss": 3.2942628860473633, + "step": 1974, + "token_acc": 0.2664776430547771 + }, + { + "epoch": 1.157725007329229, + "grad_norm": 0.9271522061011388, + "learning_rate": 0.00019996601522072338, + "loss": 3.2600202560424805, + "step": 1975, + "token_acc": 0.2732699905345899 + }, + { + "epoch": 1.158311345646438, + "grad_norm": 0.8254289934400133, + "learning_rate": 0.0001999657620905747, + "loss": 3.2105512619018555, + "step": 1976, + "token_acc": 0.2811946145953173 + }, + { + "epoch": 1.1588976839636471, + "grad_norm": 0.8296627513688101, + "learning_rate": 0.00019996550802138242, + "loss": 3.297508955001831, + "step": 1977, + "token_acc": 0.2684389024014251 + }, + { + "epoch": 1.159484022280856, + "grad_norm": 0.7257055600701903, + "learning_rate": 0.0001999652530131489, + "loss": 3.283249855041504, + "step": 1978, + "token_acc": 0.269326168903832 + }, + { + "epoch": 1.1600703605980651, + "grad_norm": 0.7027186500898142, + "learning_rate": 0.00019996499706587652, + "loss": 3.2580084800720215, + "step": 1979, + "token_acc": 0.27335557797263393 + }, + { + "epoch": 1.160656698915274, + "grad_norm": 0.8093970817807803, + "learning_rate": 0.0001999647401795677, + "loss": 3.2579898834228516, + "step": 1980, + "token_acc": 0.2714008811482285 + }, + { + "epoch": 1.1612430372324831, + "grad_norm": 0.7827391216836376, + "learning_rate": 0.00019996448235422488, + "loss": 3.2746973037719727, + "step": 1981, + "token_acc": 0.27259540630511236 + }, + { + "epoch": 1.1618293755496922, + "grad_norm": 0.6506335267933542, + "learning_rate": 0.0001999642235898504, + "loss": 3.2467989921569824, + "step": 1982, + "token_acc": 0.2744970328556955 + }, + { + "epoch": 1.1624157138669011, + "grad_norm": 0.7348201980745489, + "learning_rate": 0.00019996396388644676, + "loss": 3.242039680480957, + "step": 1983, + "token_acc": 0.2756301775996537 + }, + { + "epoch": 1.1630020521841102, + "grad_norm": 0.7300972178012441, + "learning_rate": 0.00019996370324401637, + "loss": 3.247382402420044, + "step": 1984, + "token_acc": 0.2748797946231797 + }, + { + "epoch": 1.1635883905013193, + "grad_norm": 0.8353060349608965, + "learning_rate": 0.00019996344166256172, + "loss": 3.253366708755493, + "step": 1985, + "token_acc": 0.27241338966525835 + }, + { + "epoch": 1.1641747288185282, + "grad_norm": 0.8356436308209866, + "learning_rate": 0.00019996317914208525, + "loss": 3.27044677734375, + "step": 1986, + "token_acc": 0.27158714445984294 + }, + { + "epoch": 1.1647610671357373, + "grad_norm": 0.7822850471316256, + "learning_rate": 0.00019996291568258939, + "loss": 3.2056796550750732, + "step": 1987, + "token_acc": 0.2808533593901086 + }, + { + "epoch": 1.1653474054529465, + "grad_norm": 0.8682312917673789, + "learning_rate": 0.00019996265128407662, + "loss": 3.259547233581543, + "step": 1988, + "token_acc": 0.2729803630432055 + }, + { + "epoch": 1.1659337437701554, + "grad_norm": 0.8675749657265975, + "learning_rate": 0.00019996238594654947, + "loss": 3.2365829944610596, + "step": 1989, + "token_acc": 0.27469171142431853 + }, + { + "epoch": 1.1665200820873645, + "grad_norm": 0.8032126529475314, + "learning_rate": 0.0001999621196700104, + "loss": 3.284641742706299, + "step": 1990, + "token_acc": 0.26934748807034475 + }, + { + "epoch": 1.1671064204045734, + "grad_norm": 0.778041515573633, + "learning_rate": 0.0001999618524544619, + "loss": 3.266928195953369, + "step": 1991, + "token_acc": 0.27106657122405153 + }, + { + "epoch": 1.1676927587217825, + "grad_norm": 0.8690471278502755, + "learning_rate": 0.00019996158429990652, + "loss": 3.2374746799468994, + "step": 1992, + "token_acc": 0.2763543248320766 + }, + { + "epoch": 1.1682790970389916, + "grad_norm": 0.861630358486042, + "learning_rate": 0.00019996131520634672, + "loss": 3.2934985160827637, + "step": 1993, + "token_acc": 0.2671104362541348 + }, + { + "epoch": 1.1688654353562005, + "grad_norm": 0.9036689254364504, + "learning_rate": 0.00019996104517378511, + "loss": 3.294443130493164, + "step": 1994, + "token_acc": 0.26843497874555394 + }, + { + "epoch": 1.1694517736734096, + "grad_norm": 0.9412080658009019, + "learning_rate": 0.00019996077420222417, + "loss": 3.249305486679077, + "step": 1995, + "token_acc": 0.2716355183147056 + }, + { + "epoch": 1.1700381119906185, + "grad_norm": 0.8857930200724065, + "learning_rate": 0.00019996050229166643, + "loss": 3.2995262145996094, + "step": 1996, + "token_acc": 0.2678745764658588 + }, + { + "epoch": 1.1706244503078276, + "grad_norm": 0.6518930795464495, + "learning_rate": 0.00019996022944211448, + "loss": 3.2961931228637695, + "step": 1997, + "token_acc": 0.2659757341989117 + }, + { + "epoch": 1.1712107886250367, + "grad_norm": 0.7335991476032812, + "learning_rate": 0.00019995995565357086, + "loss": 3.253488063812256, + "step": 1998, + "token_acc": 0.27380656405787446 + }, + { + "epoch": 1.1717971269422456, + "grad_norm": 0.7292390556697382, + "learning_rate": 0.00019995968092603817, + "loss": 3.2600302696228027, + "step": 1999, + "token_acc": 0.27267906096340455 + }, + { + "epoch": 1.1723834652594547, + "grad_norm": 0.5177465826850679, + "learning_rate": 0.00019995940525951896, + "loss": 3.26132869720459, + "step": 2000, + "token_acc": 0.2734660728458835 + }, + { + "epoch": 1.1729698035766638, + "grad_norm": 0.718937689264216, + "learning_rate": 0.0001999591286540159, + "loss": 3.2502331733703613, + "step": 2001, + "token_acc": 0.27259292796207085 + }, + { + "epoch": 1.1735561418938727, + "grad_norm": 0.6157077969268541, + "learning_rate": 0.00019995885110953146, + "loss": 3.221062183380127, + "step": 2002, + "token_acc": 0.2772924220541152 + }, + { + "epoch": 1.1741424802110818, + "grad_norm": 0.8024661926483448, + "learning_rate": 0.0001999585726260683, + "loss": 3.2899162769317627, + "step": 2003, + "token_acc": 0.2689389087294471 + }, + { + "epoch": 1.174728818528291, + "grad_norm": 1.1519905683398246, + "learning_rate": 0.00019995829320362907, + "loss": 3.257702350616455, + "step": 2004, + "token_acc": 0.27184555577518615 + }, + { + "epoch": 1.1753151568454998, + "grad_norm": 0.8024844038706597, + "learning_rate": 0.00019995801284221638, + "loss": 3.268692970275879, + "step": 2005, + "token_acc": 0.2712818958787032 + }, + { + "epoch": 1.175901495162709, + "grad_norm": 0.6693598356277342, + "learning_rate": 0.00019995773154183284, + "loss": 3.2361884117126465, + "step": 2006, + "token_acc": 0.27483449756932576 + }, + { + "epoch": 1.1764878334799178, + "grad_norm": 0.670025167376958, + "learning_rate": 0.00019995744930248114, + "loss": 3.2938122749328613, + "step": 2007, + "token_acc": 0.26667725279117777 + }, + { + "epoch": 1.177074171797127, + "grad_norm": 0.8540840861124067, + "learning_rate": 0.00019995716612416383, + "loss": 3.282114028930664, + "step": 2008, + "token_acc": 0.26711221272012353 + }, + { + "epoch": 1.177660510114336, + "grad_norm": 0.7931607422309142, + "learning_rate": 0.0001999568820068837, + "loss": 3.236847162246704, + "step": 2009, + "token_acc": 0.27360249839761763 + }, + { + "epoch": 1.178246848431545, + "grad_norm": 0.8554516183858673, + "learning_rate": 0.00019995659695064332, + "loss": 3.2495715618133545, + "step": 2010, + "token_acc": 0.27311921905757475 + }, + { + "epoch": 1.178833186748754, + "grad_norm": 0.7745890448428505, + "learning_rate": 0.00019995631095544542, + "loss": 3.195685863494873, + "step": 2011, + "token_acc": 0.28077329904283893 + }, + { + "epoch": 1.1794195250659631, + "grad_norm": 0.8852564516137074, + "learning_rate": 0.00019995602402129268, + "loss": 3.289680004119873, + "step": 2012, + "token_acc": 0.2668665512427849 + }, + { + "epoch": 1.180005863383172, + "grad_norm": 0.9622267091569193, + "learning_rate": 0.00019995573614818777, + "loss": 3.2482614517211914, + "step": 2013, + "token_acc": 0.27555830661858915 + }, + { + "epoch": 1.1805922017003811, + "grad_norm": 0.9613379214541318, + "learning_rate": 0.00019995544733613342, + "loss": 3.242762565612793, + "step": 2014, + "token_acc": 0.27423466367819177 + }, + { + "epoch": 1.1811785400175903, + "grad_norm": 0.9445665991133994, + "learning_rate": 0.0001999551575851323, + "loss": 3.233274459838867, + "step": 2015, + "token_acc": 0.2763170836091241 + }, + { + "epoch": 1.1817648783347992, + "grad_norm": 0.8930833042601309, + "learning_rate": 0.00019995486689518722, + "loss": 3.261763572692871, + "step": 2016, + "token_acc": 0.2726266801184626 + }, + { + "epoch": 1.1823512166520083, + "grad_norm": 0.7972642302546983, + "learning_rate": 0.00019995457526630084, + "loss": 3.2242867946624756, + "step": 2017, + "token_acc": 0.27707603005088915 + }, + { + "epoch": 1.1829375549692172, + "grad_norm": 0.7404690405877784, + "learning_rate": 0.0001999542826984759, + "loss": 3.2541773319244385, + "step": 2018, + "token_acc": 0.2733113797783366 + }, + { + "epoch": 1.1835238932864263, + "grad_norm": 0.8161207325155335, + "learning_rate": 0.00019995398919171517, + "loss": 3.2292823791503906, + "step": 2019, + "token_acc": 0.2751745130082022 + }, + { + "epoch": 1.1841102316036354, + "grad_norm": 0.7570681518435783, + "learning_rate": 0.0001999536947460214, + "loss": 3.2853622436523438, + "step": 2020, + "token_acc": 0.26895290852946085 + }, + { + "epoch": 1.1846965699208443, + "grad_norm": 0.6825551127485251, + "learning_rate": 0.0001999533993613974, + "loss": 3.254007339477539, + "step": 2021, + "token_acc": 0.27249751875006395 + }, + { + "epoch": 1.1852829082380534, + "grad_norm": 0.74254226673996, + "learning_rate": 0.00019995310303784584, + "loss": 3.2557644844055176, + "step": 2022, + "token_acc": 0.27399547794818707 + }, + { + "epoch": 1.1858692465552623, + "grad_norm": 0.6435715481447366, + "learning_rate": 0.0001999528057753696, + "loss": 3.2203073501586914, + "step": 2023, + "token_acc": 0.278271369756778 + }, + { + "epoch": 1.1864555848724714, + "grad_norm": 0.6747413634798002, + "learning_rate": 0.00019995250757397142, + "loss": 3.277912139892578, + "step": 2024, + "token_acc": 0.2721270641938896 + }, + { + "epoch": 1.1870419231896805, + "grad_norm": 0.7371871656659613, + "learning_rate": 0.0001999522084336541, + "loss": 3.258727550506592, + "step": 2025, + "token_acc": 0.27213489184468626 + }, + { + "epoch": 1.1876282615068894, + "grad_norm": 0.7492219471380487, + "learning_rate": 0.00019995190835442051, + "loss": 3.2612719535827637, + "step": 2026, + "token_acc": 0.27125480802989754 + }, + { + "epoch": 1.1882145998240985, + "grad_norm": 0.8117697901910712, + "learning_rate": 0.00019995160733627342, + "loss": 3.2734169960021973, + "step": 2027, + "token_acc": 0.27141197086658353 + }, + { + "epoch": 1.1888009381413076, + "grad_norm": 0.7554405098347979, + "learning_rate": 0.00019995130537921565, + "loss": 3.2358875274658203, + "step": 2028, + "token_acc": 0.2743048526894978 + }, + { + "epoch": 1.1893872764585165, + "grad_norm": 0.7519687425779817, + "learning_rate": 0.00019995100248325007, + "loss": 3.250917673110962, + "step": 2029, + "token_acc": 0.27256431749306853 + }, + { + "epoch": 1.1899736147757256, + "grad_norm": 0.7660760097585455, + "learning_rate": 0.0001999506986483795, + "loss": 3.264845371246338, + "step": 2030, + "token_acc": 0.27109547774583653 + }, + { + "epoch": 1.1905599530929347, + "grad_norm": 0.7227019360396504, + "learning_rate": 0.0001999503938746068, + "loss": 3.224827766418457, + "step": 2031, + "token_acc": 0.27476198214511416 + }, + { + "epoch": 1.1911462914101436, + "grad_norm": 0.7096499179706384, + "learning_rate": 0.00019995008816193485, + "loss": 3.237521171569824, + "step": 2032, + "token_acc": 0.27524512826837694 + }, + { + "epoch": 1.1917326297273527, + "grad_norm": 0.6703253516677996, + "learning_rate": 0.00019994978151036648, + "loss": 3.3084309101104736, + "step": 2033, + "token_acc": 0.26603097552728905 + }, + { + "epoch": 1.1923189680445616, + "grad_norm": 0.6022497359643784, + "learning_rate": 0.0001999494739199046, + "loss": 3.210780143737793, + "step": 2034, + "token_acc": 0.27962328185095997 + }, + { + "epoch": 1.1929053063617707, + "grad_norm": 1.0656626725126377, + "learning_rate": 0.0001999491653905521, + "loss": 3.2924139499664307, + "step": 2035, + "token_acc": 0.268935212925878 + }, + { + "epoch": 1.1934916446789798, + "grad_norm": 1.014876992677786, + "learning_rate": 0.0001999488559223119, + "loss": 3.2329001426696777, + "step": 2036, + "token_acc": 0.27664740420749845 + }, + { + "epoch": 1.1940779829961887, + "grad_norm": 0.7789300777988992, + "learning_rate": 0.00019994854551518682, + "loss": 3.264383316040039, + "step": 2037, + "token_acc": 0.2710244596791369 + }, + { + "epoch": 1.1946643213133978, + "grad_norm": 0.6606641768237064, + "learning_rate": 0.0001999482341691799, + "loss": 3.206993579864502, + "step": 2038, + "token_acc": 0.2794790145330045 + }, + { + "epoch": 1.195250659630607, + "grad_norm": 0.7800734140061765, + "learning_rate": 0.000199947921884294, + "loss": 3.2454488277435303, + "step": 2039, + "token_acc": 0.27575220119487787 + }, + { + "epoch": 1.1958369979478158, + "grad_norm": 0.7206048992364572, + "learning_rate": 0.00019994760866053198, + "loss": 3.2386507987976074, + "step": 2040, + "token_acc": 0.2736122628864098 + }, + { + "epoch": 1.196423336265025, + "grad_norm": 0.614807989212569, + "learning_rate": 0.00019994729449789692, + "loss": 3.2631406784057617, + "step": 2041, + "token_acc": 0.27201316675394627 + }, + { + "epoch": 1.197009674582234, + "grad_norm": 0.7670226964696416, + "learning_rate": 0.00019994697939639173, + "loss": 3.238020896911621, + "step": 2042, + "token_acc": 0.27551113864719984 + }, + { + "epoch": 1.197596012899443, + "grad_norm": 0.7732689235303511, + "learning_rate": 0.0001999466633560193, + "loss": 3.283295154571533, + "step": 2043, + "token_acc": 0.26933461658516283 + }, + { + "epoch": 1.198182351216652, + "grad_norm": 0.7873626704170653, + "learning_rate": 0.0001999463463767827, + "loss": 3.217207670211792, + "step": 2044, + "token_acc": 0.2780436341932072 + }, + { + "epoch": 1.198768689533861, + "grad_norm": 0.6780692112504618, + "learning_rate": 0.0001999460284586848, + "loss": 3.267735242843628, + "step": 2045, + "token_acc": 0.2701111054956922 + }, + { + "epoch": 1.19935502785107, + "grad_norm": 0.6411496423631139, + "learning_rate": 0.00019994570960172868, + "loss": 3.235504627227783, + "step": 2046, + "token_acc": 0.2770185407894702 + }, + { + "epoch": 1.1999413661682792, + "grad_norm": 0.6949354759461744, + "learning_rate": 0.00019994538980591726, + "loss": 3.192155361175537, + "step": 2047, + "token_acc": 0.2809576303059385 + }, + { + "epoch": 1.200527704485488, + "grad_norm": 0.6618181695803856, + "learning_rate": 0.0001999450690712536, + "loss": 3.3199121952056885, + "step": 2048, + "token_acc": 0.26607390054307317 + }, + { + "epoch": 1.2011140428026972, + "grad_norm": 0.5337850893424061, + "learning_rate": 0.0001999447473977407, + "loss": 3.2487683296203613, + "step": 2049, + "token_acc": 0.27245741332192736 + }, + { + "epoch": 1.201700381119906, + "grad_norm": 0.6664085775810086, + "learning_rate": 0.0001999444247853816, + "loss": 3.2674710750579834, + "step": 2050, + "token_acc": 0.2721702610611665 + }, + { + "epoch": 1.2022867194371152, + "grad_norm": 0.823499590078672, + "learning_rate": 0.00019994410123417924, + "loss": 3.2266602516174316, + "step": 2051, + "token_acc": 0.27755598656219616 + }, + { + "epoch": 1.2028730577543243, + "grad_norm": 0.8721375598893872, + "learning_rate": 0.00019994377674413676, + "loss": 3.2453181743621826, + "step": 2052, + "token_acc": 0.2741025907813963 + }, + { + "epoch": 1.2034593960715332, + "grad_norm": 0.9244862466117538, + "learning_rate": 0.00019994345131525717, + "loss": 3.2178244590759277, + "step": 2053, + "token_acc": 0.2791771509525723 + }, + { + "epoch": 1.2040457343887423, + "grad_norm": 1.1196099245212572, + "learning_rate": 0.00019994312494754356, + "loss": 3.256829261779785, + "step": 2054, + "token_acc": 0.2720128265783525 + }, + { + "epoch": 1.2046320727059514, + "grad_norm": 0.8641842542700614, + "learning_rate": 0.00019994279764099892, + "loss": 3.1919198036193848, + "step": 2055, + "token_acc": 0.2803798969817114 + }, + { + "epoch": 1.2052184110231603, + "grad_norm": 0.7219574640915918, + "learning_rate": 0.0001999424693956264, + "loss": 3.264314651489258, + "step": 2056, + "token_acc": 0.2727636143242163 + }, + { + "epoch": 1.2058047493403694, + "grad_norm": 0.9909495877523862, + "learning_rate": 0.00019994214021142902, + "loss": 3.2319424152374268, + "step": 2057, + "token_acc": 0.2761497505554857 + }, + { + "epoch": 1.2063910876575785, + "grad_norm": 1.0646397989858591, + "learning_rate": 0.00019994181008840996, + "loss": 3.298980474472046, + "step": 2058, + "token_acc": 0.2668688002601858 + }, + { + "epoch": 1.2069774259747874, + "grad_norm": 0.8597897116920034, + "learning_rate": 0.00019994147902657224, + "loss": 3.2467713356018066, + "step": 2059, + "token_acc": 0.27467695235883466 + }, + { + "epoch": 1.2075637642919965, + "grad_norm": 1.0475000077040948, + "learning_rate": 0.00019994114702591898, + "loss": 3.238417625427246, + "step": 2060, + "token_acc": 0.2745846389509174 + }, + { + "epoch": 1.2081501026092054, + "grad_norm": 0.8130668100084034, + "learning_rate": 0.00019994081408645332, + "loss": 3.274354934692383, + "step": 2061, + "token_acc": 0.26963046020117226 + }, + { + "epoch": 1.2087364409264145, + "grad_norm": 0.7261892841799678, + "learning_rate": 0.0001999404802081784, + "loss": 3.253032684326172, + "step": 2062, + "token_acc": 0.27494122754998307 + }, + { + "epoch": 1.2093227792436236, + "grad_norm": 0.7616039116543535, + "learning_rate": 0.00019994014539109737, + "loss": 3.2325727939605713, + "step": 2063, + "token_acc": 0.2752068788984797 + }, + { + "epoch": 1.2099091175608325, + "grad_norm": 0.9753137276825926, + "learning_rate": 0.0001999398096352133, + "loss": 3.176018476486206, + "step": 2064, + "token_acc": 0.28387230460402774 + }, + { + "epoch": 1.2104954558780416, + "grad_norm": 0.9444559467709397, + "learning_rate": 0.00019993947294052943, + "loss": 3.261976718902588, + "step": 2065, + "token_acc": 0.27122822830783916 + }, + { + "epoch": 1.2110817941952507, + "grad_norm": 0.6061907031630014, + "learning_rate": 0.00019993913530704882, + "loss": 3.2700295448303223, + "step": 2066, + "token_acc": 0.2695068367421966 + }, + { + "epoch": 1.2116681325124596, + "grad_norm": 0.7702134973146593, + "learning_rate": 0.00019993879673477474, + "loss": 3.2154996395111084, + "step": 2067, + "token_acc": 0.27732970636255355 + }, + { + "epoch": 1.2122544708296688, + "grad_norm": 0.9520840502911053, + "learning_rate": 0.00019993845722371032, + "loss": 3.248842239379883, + "step": 2068, + "token_acc": 0.27322229835199086 + }, + { + "epoch": 1.2128408091468779, + "grad_norm": 0.8658794283548628, + "learning_rate": 0.0001999381167738588, + "loss": 3.2003722190856934, + "step": 2069, + "token_acc": 0.2797433590113028 + }, + { + "epoch": 1.2134271474640868, + "grad_norm": 0.9016477761825271, + "learning_rate": 0.0001999377753852233, + "loss": 3.217028856277466, + "step": 2070, + "token_acc": 0.2782054450615919 + }, + { + "epoch": 1.2140134857812959, + "grad_norm": 0.8036189779348722, + "learning_rate": 0.00019993743305780708, + "loss": 3.295497417449951, + "step": 2071, + "token_acc": 0.26756264710507865 + }, + { + "epoch": 1.2145998240985048, + "grad_norm": 0.6545245660927652, + "learning_rate": 0.00019993708979161335, + "loss": 3.2754857540130615, + "step": 2072, + "token_acc": 0.26995701387291304 + }, + { + "epoch": 1.2151861624157139, + "grad_norm": 0.5805237574808724, + "learning_rate": 0.00019993674558664533, + "loss": 3.259950876235962, + "step": 2073, + "token_acc": 0.27178820685442473 + }, + { + "epoch": 1.215772500732923, + "grad_norm": 0.5642645501949415, + "learning_rate": 0.00019993640044290624, + "loss": 3.2452166080474854, + "step": 2074, + "token_acc": 0.27498536915921434 + }, + { + "epoch": 1.2163588390501319, + "grad_norm": 0.4813788786612914, + "learning_rate": 0.00019993605436039932, + "loss": 3.238658905029297, + "step": 2075, + "token_acc": 0.2766798943202436 + }, + { + "epoch": 1.216945177367341, + "grad_norm": 0.5817760126336858, + "learning_rate": 0.00019993570733912788, + "loss": 3.200277805328369, + "step": 2076, + "token_acc": 0.27990723225622555 + }, + { + "epoch": 1.2175315156845499, + "grad_norm": 0.5940748385262843, + "learning_rate": 0.00019993535937909508, + "loss": 3.214750051498413, + "step": 2077, + "token_acc": 0.27732758284727005 + }, + { + "epoch": 1.218117854001759, + "grad_norm": 0.5495133965349347, + "learning_rate": 0.0001999350104803043, + "loss": 3.2320947647094727, + "step": 2078, + "token_acc": 0.2753966405259208 + }, + { + "epoch": 1.218704192318968, + "grad_norm": 0.6913603813879012, + "learning_rate": 0.00019993466064275873, + "loss": 3.2195510864257812, + "step": 2079, + "token_acc": 0.27786837328082215 + }, + { + "epoch": 1.219290530636177, + "grad_norm": 0.9379586266692725, + "learning_rate": 0.0001999343098664617, + "loss": 3.2361984252929688, + "step": 2080, + "token_acc": 0.2748539705019098 + }, + { + "epoch": 1.219876868953386, + "grad_norm": 0.9134461505820622, + "learning_rate": 0.00019993395815141648, + "loss": 3.2189457416534424, + "step": 2081, + "token_acc": 0.2780203900779996 + }, + { + "epoch": 1.2204632072705952, + "grad_norm": 0.7060965669344349, + "learning_rate": 0.0001999336054976264, + "loss": 3.271450996398926, + "step": 2082, + "token_acc": 0.2709042554596282 + }, + { + "epoch": 1.221049545587804, + "grad_norm": 0.6668726382203133, + "learning_rate": 0.00019993325190509472, + "loss": 3.25471830368042, + "step": 2083, + "token_acc": 0.2727644723844672 + }, + { + "epoch": 1.2216358839050132, + "grad_norm": 0.7428538402139995, + "learning_rate": 0.00019993289737382482, + "loss": 3.2256226539611816, + "step": 2084, + "token_acc": 0.27700972329217766 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.7248845749148265, + "learning_rate": 0.00019993254190382004, + "loss": 3.270136594772339, + "step": 2085, + "token_acc": 0.269031450338762 + }, + { + "epoch": 1.2228085605394312, + "grad_norm": 0.8480222408794976, + "learning_rate": 0.00019993218549508364, + "loss": 3.234304189682007, + "step": 2086, + "token_acc": 0.2751677675797537 + }, + { + "epoch": 1.2233948988566403, + "grad_norm": 0.7953781322910523, + "learning_rate": 0.00019993182814761906, + "loss": 3.258174419403076, + "step": 2087, + "token_acc": 0.27418361734209395 + }, + { + "epoch": 1.2239812371738492, + "grad_norm": 0.8043188982441075, + "learning_rate": 0.0001999314698614296, + "loss": 3.2224857807159424, + "step": 2088, + "token_acc": 0.2771479979646542 + }, + { + "epoch": 1.2245675754910583, + "grad_norm": 0.8763197021257622, + "learning_rate": 0.00019993111063651867, + "loss": 3.238600254058838, + "step": 2089, + "token_acc": 0.2759119171410032 + }, + { + "epoch": 1.2251539138082674, + "grad_norm": 0.7835168773415344, + "learning_rate": 0.00019993075047288955, + "loss": 3.236215591430664, + "step": 2090, + "token_acc": 0.2735578123699326 + }, + { + "epoch": 1.2257402521254763, + "grad_norm": 0.6740945104488825, + "learning_rate": 0.00019993038937054573, + "loss": 3.2422947883605957, + "step": 2091, + "token_acc": 0.2759259456409492 + }, + { + "epoch": 1.2263265904426854, + "grad_norm": 0.5212942601727388, + "learning_rate": 0.00019993002732949055, + "loss": 3.2242431640625, + "step": 2092, + "token_acc": 0.27570878172601965 + }, + { + "epoch": 1.2269129287598945, + "grad_norm": 0.6049830425773788, + "learning_rate": 0.00019992966434972741, + "loss": 3.2530179023742676, + "step": 2093, + "token_acc": 0.27245231121600905 + }, + { + "epoch": 1.2274992670771034, + "grad_norm": 0.6753899437272635, + "learning_rate": 0.00019992930043125976, + "loss": 3.265923500061035, + "step": 2094, + "token_acc": 0.2701323961830084 + }, + { + "epoch": 1.2280856053943126, + "grad_norm": 0.456299638698701, + "learning_rate": 0.00019992893557409098, + "loss": 3.2533345222473145, + "step": 2095, + "token_acc": 0.27371421752562286 + }, + { + "epoch": 1.2286719437115217, + "grad_norm": 0.5209591167763342, + "learning_rate": 0.0001999285697782245, + "loss": 3.207162618637085, + "step": 2096, + "token_acc": 0.2784138670686107 + }, + { + "epoch": 1.2292582820287306, + "grad_norm": 0.6513276798334263, + "learning_rate": 0.00019992820304366374, + "loss": 3.265101909637451, + "step": 2097, + "token_acc": 0.2713824629637132 + }, + { + "epoch": 1.2298446203459397, + "grad_norm": 0.8913523771844125, + "learning_rate": 0.0001999278353704122, + "loss": 3.2463419437408447, + "step": 2098, + "token_acc": 0.27331340158766665 + }, + { + "epoch": 1.2304309586631486, + "grad_norm": 1.1717570085462516, + "learning_rate": 0.00019992746675847326, + "loss": 3.2408082485198975, + "step": 2099, + "token_acc": 0.27433389028481053 + }, + { + "epoch": 1.2310172969803577, + "grad_norm": 0.7572878790321899, + "learning_rate": 0.00019992709720785046, + "loss": 3.2798378467559814, + "step": 2100, + "token_acc": 0.2700344377649185 + }, + { + "epoch": 1.2316036352975668, + "grad_norm": 0.8730022204651231, + "learning_rate": 0.00019992672671854722, + "loss": 3.2689807415008545, + "step": 2101, + "token_acc": 0.2717733871682256 + }, + { + "epoch": 1.2321899736147757, + "grad_norm": 1.1999072009775023, + "learning_rate": 0.00019992635529056708, + "loss": 3.3413915634155273, + "step": 2102, + "token_acc": 0.2625287819785024 + }, + { + "epoch": 1.2327763119319848, + "grad_norm": 0.7171919039153778, + "learning_rate": 0.00019992598292391343, + "loss": 3.2402501106262207, + "step": 2103, + "token_acc": 0.2761459413290932 + }, + { + "epoch": 1.2333626502491937, + "grad_norm": 0.8064018182677524, + "learning_rate": 0.00019992560961858984, + "loss": 3.262726306915283, + "step": 2104, + "token_acc": 0.271804857497036 + }, + { + "epoch": 1.2339489885664028, + "grad_norm": 0.8459765717445233, + "learning_rate": 0.0001999252353745998, + "loss": 3.27354097366333, + "step": 2105, + "token_acc": 0.2695342313298159 + }, + { + "epoch": 1.234535326883612, + "grad_norm": 0.836947634583542, + "learning_rate": 0.00019992486019194684, + "loss": 3.260399580001831, + "step": 2106, + "token_acc": 0.27135008623470236 + }, + { + "epoch": 1.2351216652008208, + "grad_norm": 0.7602294558882761, + "learning_rate": 0.0001999244840706344, + "loss": 3.2782716751098633, + "step": 2107, + "token_acc": 0.2698450443930367 + }, + { + "epoch": 1.23570800351803, + "grad_norm": 0.7814186784810427, + "learning_rate": 0.00019992410701066615, + "loss": 3.2937588691711426, + "step": 2108, + "token_acc": 0.26817424134866674 + }, + { + "epoch": 1.236294341835239, + "grad_norm": 0.6259638405837912, + "learning_rate": 0.00019992372901204557, + "loss": 3.2281618118286133, + "step": 2109, + "token_acc": 0.27528459848538006 + }, + { + "epoch": 1.236880680152448, + "grad_norm": 0.483736593563296, + "learning_rate": 0.00019992335007477618, + "loss": 3.2342374324798584, + "step": 2110, + "token_acc": 0.27565841476256625 + }, + { + "epoch": 1.237467018469657, + "grad_norm": 0.630266095374544, + "learning_rate": 0.00019992297019886154, + "loss": 3.2631545066833496, + "step": 2111, + "token_acc": 0.27094199318790463 + }, + { + "epoch": 1.2380533567868661, + "grad_norm": 0.5639592873036668, + "learning_rate": 0.0001999225893843053, + "loss": 3.198659896850586, + "step": 2112, + "token_acc": 0.2802096105336322 + }, + { + "epoch": 1.238639695104075, + "grad_norm": 0.6632747779578492, + "learning_rate": 0.00019992220763111093, + "loss": 3.2554709911346436, + "step": 2113, + "token_acc": 0.27210225544652716 + }, + { + "epoch": 1.2392260334212841, + "grad_norm": 0.7272417788706874, + "learning_rate": 0.0001999218249392821, + "loss": 3.237191677093506, + "step": 2114, + "token_acc": 0.27372554095743534 + }, + { + "epoch": 1.239812371738493, + "grad_norm": 0.8577853457392106, + "learning_rate": 0.00019992144130882234, + "loss": 3.253605365753174, + "step": 2115, + "token_acc": 0.27277816302086616 + }, + { + "epoch": 1.2403987100557021, + "grad_norm": 0.8920984218240926, + "learning_rate": 0.00019992105673973532, + "loss": 3.2642173767089844, + "step": 2116, + "token_acc": 0.2720777279521674 + }, + { + "epoch": 1.2409850483729112, + "grad_norm": 0.6724170158262036, + "learning_rate": 0.0001999206712320246, + "loss": 3.2222015857696533, + "step": 2117, + "token_acc": 0.2753767035227565 + }, + { + "epoch": 1.2415713866901201, + "grad_norm": 0.6083123287550836, + "learning_rate": 0.0001999202847856938, + "loss": 3.2815628051757812, + "step": 2118, + "token_acc": 0.26882221370623344 + }, + { + "epoch": 1.2421577250073292, + "grad_norm": 0.7345078030171832, + "learning_rate": 0.0001999198974007466, + "loss": 3.2437543869018555, + "step": 2119, + "token_acc": 0.2739233463562791 + }, + { + "epoch": 1.2427440633245384, + "grad_norm": 0.7783908533656655, + "learning_rate": 0.00019991950907718658, + "loss": 3.2664730548858643, + "step": 2120, + "token_acc": 0.27040531532654466 + }, + { + "epoch": 1.2433304016417472, + "grad_norm": 0.9970579713164444, + "learning_rate": 0.00019991911981501743, + "loss": 3.259303569793701, + "step": 2121, + "token_acc": 0.27222284366033406 + }, + { + "epoch": 1.2439167399589564, + "grad_norm": 1.0198371246734628, + "learning_rate": 0.0001999187296142428, + "loss": 3.2523155212402344, + "step": 2122, + "token_acc": 0.27243841255547696 + }, + { + "epoch": 1.2445030782761655, + "grad_norm": 0.7793279231356892, + "learning_rate": 0.00019991833847486635, + "loss": 3.2021541595458984, + "step": 2123, + "token_acc": 0.27830900598079394 + }, + { + "epoch": 1.2450894165933744, + "grad_norm": 0.6446590922413522, + "learning_rate": 0.00019991794639689174, + "loss": 3.266526699066162, + "step": 2124, + "token_acc": 0.27196870191286004 + }, + { + "epoch": 1.2456757549105835, + "grad_norm": 0.594158856943162, + "learning_rate": 0.00019991755338032266, + "loss": 3.2651641368865967, + "step": 2125, + "token_acc": 0.27245110411774415 + }, + { + "epoch": 1.2462620932277924, + "grad_norm": 0.5342609035495294, + "learning_rate": 0.0001999171594251628, + "loss": 3.302712917327881, + "step": 2126, + "token_acc": 0.26477727599302064 + }, + { + "epoch": 1.2468484315450015, + "grad_norm": 0.4775861843555774, + "learning_rate": 0.0001999167645314159, + "loss": 3.281754493713379, + "step": 2127, + "token_acc": 0.2664507966795083 + }, + { + "epoch": 1.2474347698622106, + "grad_norm": 0.5566738635080677, + "learning_rate": 0.0001999163686990856, + "loss": 3.23808217048645, + "step": 2128, + "token_acc": 0.2747500603212236 + }, + { + "epoch": 1.2480211081794195, + "grad_norm": 0.45640086438844707, + "learning_rate": 0.00019991597192817566, + "loss": 3.2125957012176514, + "step": 2129, + "token_acc": 0.27919218914341043 + }, + { + "epoch": 1.2486074464966286, + "grad_norm": 0.5992324717617522, + "learning_rate": 0.00019991557421868982, + "loss": 3.2420685291290283, + "step": 2130, + "token_acc": 0.2736017205457356 + }, + { + "epoch": 1.2491937848138375, + "grad_norm": 0.5895001888250889, + "learning_rate": 0.0001999151755706318, + "loss": 3.243584156036377, + "step": 2131, + "token_acc": 0.27488625013871937 + }, + { + "epoch": 1.2497801231310466, + "grad_norm": 0.729691625239649, + "learning_rate": 0.00019991477598400533, + "loss": 3.2531533241271973, + "step": 2132, + "token_acc": 0.2725829221224955 + }, + { + "epoch": 1.2503664614482557, + "grad_norm": 0.8477726169408945, + "learning_rate": 0.0001999143754588142, + "loss": 3.272512674331665, + "step": 2133, + "token_acc": 0.2689022057886013 + }, + { + "epoch": 1.2509527997654648, + "grad_norm": 1.065825354563309, + "learning_rate": 0.00019991397399506214, + "loss": 3.281550884246826, + "step": 2134, + "token_acc": 0.2683666234096352 + }, + { + "epoch": 1.2515391380826737, + "grad_norm": 0.8596864082259226, + "learning_rate": 0.00019991357159275291, + "loss": 3.239239454269409, + "step": 2135, + "token_acc": 0.27454686045387716 + }, + { + "epoch": 1.2521254763998828, + "grad_norm": 0.7655384878888009, + "learning_rate": 0.0001999131682518903, + "loss": 3.2360548973083496, + "step": 2136, + "token_acc": 0.275677134795408 + }, + { + "epoch": 1.2527118147170917, + "grad_norm": 0.5258642112913008, + "learning_rate": 0.00019991276397247814, + "loss": 3.2244338989257812, + "step": 2137, + "token_acc": 0.2766056994735059 + }, + { + "epoch": 1.2532981530343008, + "grad_norm": 0.6186875016615384, + "learning_rate": 0.00019991235875452018, + "loss": 3.2318358421325684, + "step": 2138, + "token_acc": 0.2740722568241444 + }, + { + "epoch": 1.25388449135151, + "grad_norm": 0.7519474609825382, + "learning_rate": 0.00019991195259802026, + "loss": 3.216973304748535, + "step": 2139, + "token_acc": 0.2766929594528477 + }, + { + "epoch": 1.2544708296687188, + "grad_norm": 0.665647492742681, + "learning_rate": 0.00019991154550298216, + "loss": 3.2894692420959473, + "step": 2140, + "token_acc": 0.2688509385961731 + }, + { + "epoch": 1.255057167985928, + "grad_norm": 0.7019323356323866, + "learning_rate": 0.00019991113746940973, + "loss": 3.2308433055877686, + "step": 2141, + "token_acc": 0.27640619617721846 + }, + { + "epoch": 1.2556435063031368, + "grad_norm": 0.6241000967432053, + "learning_rate": 0.00019991072849730678, + "loss": 3.2318615913391113, + "step": 2142, + "token_acc": 0.2759584386046786 + }, + { + "epoch": 1.256229844620346, + "grad_norm": 0.6953932918913984, + "learning_rate": 0.0001999103185866772, + "loss": 3.2314140796661377, + "step": 2143, + "token_acc": 0.2751824911822845 + }, + { + "epoch": 1.256816182937555, + "grad_norm": 0.7471169400723318, + "learning_rate": 0.00019990990773752478, + "loss": 3.3068342208862305, + "step": 2144, + "token_acc": 0.26609806696390076 + }, + { + "epoch": 1.257402521254764, + "grad_norm": 0.6696148307418435, + "learning_rate": 0.0001999094959498534, + "loss": 3.2601871490478516, + "step": 2145, + "token_acc": 0.2703857538432159 + }, + { + "epoch": 1.257988859571973, + "grad_norm": 0.7970819925096037, + "learning_rate": 0.00019990908322366696, + "loss": 3.2590270042419434, + "step": 2146, + "token_acc": 0.2714718917658036 + }, + { + "epoch": 1.258575197889182, + "grad_norm": 0.6530741985561475, + "learning_rate": 0.00019990866955896933, + "loss": 3.1904618740081787, + "step": 2147, + "token_acc": 0.2813886848666039 + }, + { + "epoch": 1.259161536206391, + "grad_norm": 0.6327277126307087, + "learning_rate": 0.00019990825495576434, + "loss": 3.2751193046569824, + "step": 2148, + "token_acc": 0.26960616068318705 + }, + { + "epoch": 1.2597478745236002, + "grad_norm": 0.6823775712105917, + "learning_rate": 0.00019990783941405593, + "loss": 3.1972270011901855, + "step": 2149, + "token_acc": 0.2791242397366087 + }, + { + "epoch": 1.2603342128408093, + "grad_norm": 0.5467204222176628, + "learning_rate": 0.000199907422933848, + "loss": 3.2215945720672607, + "step": 2150, + "token_acc": 0.27730997797339524 + }, + { + "epoch": 1.2609205511580182, + "grad_norm": 0.6335703896690619, + "learning_rate": 0.00019990700551514445, + "loss": 3.224214553833008, + "step": 2151, + "token_acc": 0.2768862396189084 + }, + { + "epoch": 1.2615068894752273, + "grad_norm": 0.5949194678482058, + "learning_rate": 0.00019990658715794923, + "loss": 3.2176411151885986, + "step": 2152, + "token_acc": 0.27845557104300117 + }, + { + "epoch": 1.2620932277924362, + "grad_norm": 0.6106785645344613, + "learning_rate": 0.00019990616786226624, + "loss": 3.22220516204834, + "step": 2153, + "token_acc": 0.2765824224342079 + }, + { + "epoch": 1.2626795661096453, + "grad_norm": 0.6497760827356881, + "learning_rate": 0.00019990574762809943, + "loss": 3.235352039337158, + "step": 2154, + "token_acc": 0.2742665407610396 + }, + { + "epoch": 1.2632659044268544, + "grad_norm": 0.7023374739174322, + "learning_rate": 0.00019990532645545274, + "loss": 3.2108068466186523, + "step": 2155, + "token_acc": 0.27878084179970974 + }, + { + "epoch": 1.2638522427440633, + "grad_norm": 0.7291585526232633, + "learning_rate": 0.00019990490434433012, + "loss": 3.233272075653076, + "step": 2156, + "token_acc": 0.27409529052496 + }, + { + "epoch": 1.2644385810612724, + "grad_norm": 0.7176754275713474, + "learning_rate": 0.00019990448129473558, + "loss": 3.298462390899658, + "step": 2157, + "token_acc": 0.266765168235606 + }, + { + "epoch": 1.2650249193784813, + "grad_norm": 0.712373129786324, + "learning_rate": 0.00019990405730667304, + "loss": 3.269559383392334, + "step": 2158, + "token_acc": 0.2699186564516341 + }, + { + "epoch": 1.2656112576956904, + "grad_norm": 0.6114463624677939, + "learning_rate": 0.0001999036323801465, + "loss": 3.2140755653381348, + "step": 2159, + "token_acc": 0.2767617469399277 + }, + { + "epoch": 1.2661975960128995, + "grad_norm": 0.5830773296686367, + "learning_rate": 0.00019990320651515998, + "loss": 3.2144436836242676, + "step": 2160, + "token_acc": 0.2772574890816264 + }, + { + "epoch": 1.2667839343301086, + "grad_norm": 0.7750272980897177, + "learning_rate": 0.00019990277971171746, + "loss": 3.235363721847534, + "step": 2161, + "token_acc": 0.27473418425258866 + }, + { + "epoch": 1.2673702726473175, + "grad_norm": 0.7512707441949172, + "learning_rate": 0.0001999023519698229, + "loss": 3.2060887813568115, + "step": 2162, + "token_acc": 0.27986954920835305 + }, + { + "epoch": 1.2679566109645266, + "grad_norm": 0.6293976369808847, + "learning_rate": 0.00019990192328948037, + "loss": 3.269015312194824, + "step": 2163, + "token_acc": 0.2693025795368286 + }, + { + "epoch": 1.2685429492817355, + "grad_norm": 0.7412463955948765, + "learning_rate": 0.00019990149367069392, + "loss": 3.199249744415283, + "step": 2164, + "token_acc": 0.2781958390606435 + }, + { + "epoch": 1.2691292875989446, + "grad_norm": 0.5633395651180294, + "learning_rate": 0.00019990106311346755, + "loss": 3.1954383850097656, + "step": 2165, + "token_acc": 0.2825022938820282 + }, + { + "epoch": 1.2697156259161537, + "grad_norm": 0.663621156911283, + "learning_rate": 0.00019990063161780532, + "loss": 3.2284841537475586, + "step": 2166, + "token_acc": 0.2753045252798392 + }, + { + "epoch": 1.2703019642333626, + "grad_norm": 0.8266267092526703, + "learning_rate": 0.00019990019918371123, + "loss": 3.248342514038086, + "step": 2167, + "token_acc": 0.2741044207317073 + }, + { + "epoch": 1.2708883025505717, + "grad_norm": 0.7546182676163214, + "learning_rate": 0.00019989976581118944, + "loss": 3.263092517852783, + "step": 2168, + "token_acc": 0.2699587672382286 + }, + { + "epoch": 1.2714746408677806, + "grad_norm": 0.7404262312100173, + "learning_rate": 0.00019989933150024394, + "loss": 3.2622618675231934, + "step": 2169, + "token_acc": 0.27007631612986677 + }, + { + "epoch": 1.2720609791849897, + "grad_norm": 0.5533614220311185, + "learning_rate": 0.00019989889625087883, + "loss": 3.2398970127105713, + "step": 2170, + "token_acc": 0.27420291627181853 + }, + { + "epoch": 1.2726473175021988, + "grad_norm": 0.5833857254368834, + "learning_rate": 0.00019989846006309822, + "loss": 3.273425340652466, + "step": 2171, + "token_acc": 0.2690484607187183 + }, + { + "epoch": 1.2732336558194077, + "grad_norm": 0.5783378198681538, + "learning_rate": 0.00019989802293690615, + "loss": 3.267561674118042, + "step": 2172, + "token_acc": 0.27068126050831265 + }, + { + "epoch": 1.2738199941366168, + "grad_norm": 0.6052277147591019, + "learning_rate": 0.00019989758487230682, + "loss": 3.2454724311828613, + "step": 2173, + "token_acc": 0.274316649888388 + }, + { + "epoch": 1.2744063324538257, + "grad_norm": 0.7487010540600042, + "learning_rate": 0.00019989714586930428, + "loss": 3.2361464500427246, + "step": 2174, + "token_acc": 0.27505867716321397 + }, + { + "epoch": 1.2749926707710348, + "grad_norm": 0.683503620306193, + "learning_rate": 0.00019989670592790267, + "loss": 3.2607080936431885, + "step": 2175, + "token_acc": 0.27137833854745713 + }, + { + "epoch": 1.275579009088244, + "grad_norm": 0.4819795493747239, + "learning_rate": 0.00019989626504810613, + "loss": 3.2419040203094482, + "step": 2176, + "token_acc": 0.2741549246443665 + }, + { + "epoch": 1.276165347405453, + "grad_norm": 0.5235147978251798, + "learning_rate": 0.00019989582322991876, + "loss": 3.235147476196289, + "step": 2177, + "token_acc": 0.2746303321991518 + }, + { + "epoch": 1.276751685722662, + "grad_norm": 0.4527633688792298, + "learning_rate": 0.00019989538047334476, + "loss": 3.2353568077087402, + "step": 2178, + "token_acc": 0.27443259686864074 + }, + { + "epoch": 1.277338024039871, + "grad_norm": 0.6131312861860545, + "learning_rate": 0.00019989493677838825, + "loss": 3.222853183746338, + "step": 2179, + "token_acc": 0.2749989909725679 + }, + { + "epoch": 1.27792436235708, + "grad_norm": 0.5677673801830521, + "learning_rate": 0.00019989449214505346, + "loss": 3.2522151470184326, + "step": 2180, + "token_acc": 0.27095981002827463 + }, + { + "epoch": 1.278510700674289, + "grad_norm": 0.5601336655207524, + "learning_rate": 0.00019989404657334449, + "loss": 3.2426445484161377, + "step": 2181, + "token_acc": 0.2739189555273256 + }, + { + "epoch": 1.2790970389914982, + "grad_norm": 0.5945498272627211, + "learning_rate": 0.00019989360006326558, + "loss": 3.270915985107422, + "step": 2182, + "token_acc": 0.27023450880052824 + }, + { + "epoch": 1.279683377308707, + "grad_norm": 0.6923372350605109, + "learning_rate": 0.00019989315261482093, + "loss": 3.2347657680511475, + "step": 2183, + "token_acc": 0.2753687582340937 + }, + { + "epoch": 1.2802697156259162, + "grad_norm": 0.8223412916588332, + "learning_rate": 0.00019989270422801468, + "loss": 3.2032394409179688, + "step": 2184, + "token_acc": 0.27703133047923395 + }, + { + "epoch": 1.280856053943125, + "grad_norm": 1.052744822789687, + "learning_rate": 0.0001998922549028511, + "loss": 3.2600834369659424, + "step": 2185, + "token_acc": 0.2696017944204055 + }, + { + "epoch": 1.2814423922603342, + "grad_norm": 0.9838125937422946, + "learning_rate": 0.00019989180463933436, + "loss": 3.20468807220459, + "step": 2186, + "token_acc": 0.27884265872101655 + }, + { + "epoch": 1.2820287305775433, + "grad_norm": 0.8908789010185257, + "learning_rate": 0.00019989135343746877, + "loss": 3.2697415351867676, + "step": 2187, + "token_acc": 0.2686644022393985 + }, + { + "epoch": 1.2826150688947524, + "grad_norm": 1.0217860724860865, + "learning_rate": 0.0001998909012972585, + "loss": 3.2619426250457764, + "step": 2188, + "token_acc": 0.270508053897804 + }, + { + "epoch": 1.2832014072119613, + "grad_norm": 0.6154334343656209, + "learning_rate": 0.0001998904482187078, + "loss": 3.211843967437744, + "step": 2189, + "token_acc": 0.2774608888627537 + }, + { + "epoch": 1.2837877455291704, + "grad_norm": 0.6684831294288789, + "learning_rate": 0.00019988999420182096, + "loss": 3.234035015106201, + "step": 2190, + "token_acc": 0.2760094847890219 + }, + { + "epoch": 1.2843740838463793, + "grad_norm": 0.7719757671130388, + "learning_rate": 0.0001998895392466022, + "loss": 3.232257127761841, + "step": 2191, + "token_acc": 0.27577705805578384 + }, + { + "epoch": 1.2849604221635884, + "grad_norm": 0.723670775455715, + "learning_rate": 0.00019988908335305587, + "loss": 3.240888833999634, + "step": 2192, + "token_acc": 0.2752683200188974 + }, + { + "epoch": 1.2855467604807975, + "grad_norm": 0.8035035576830801, + "learning_rate": 0.00019988862652118615, + "loss": 3.2716798782348633, + "step": 2193, + "token_acc": 0.26834522381463477 + }, + { + "epoch": 1.2861330987980064, + "grad_norm": 0.6514107213947139, + "learning_rate": 0.00019988816875099742, + "loss": 3.2054567337036133, + "step": 2194, + "token_acc": 0.2789974576578349 + }, + { + "epoch": 1.2867194371152155, + "grad_norm": 0.6919988548748391, + "learning_rate": 0.00019988771004249395, + "loss": 3.234520196914673, + "step": 2195, + "token_acc": 0.2748395634490793 + }, + { + "epoch": 1.2873057754324244, + "grad_norm": 0.8253730982703739, + "learning_rate": 0.00019988725039568004, + "loss": 3.30932354927063, + "step": 2196, + "token_acc": 0.26580008168628916 + }, + { + "epoch": 1.2878921137496335, + "grad_norm": 0.6938011649814418, + "learning_rate": 0.00019988678981056, + "loss": 3.2249906063079834, + "step": 2197, + "token_acc": 0.2765212639921562 + }, + { + "epoch": 1.2884784520668426, + "grad_norm": 0.534500212770452, + "learning_rate": 0.00019988632828713817, + "loss": 3.259059429168701, + "step": 2198, + "token_acc": 0.27322731297967445 + }, + { + "epoch": 1.2890647903840515, + "grad_norm": 0.5421164903860508, + "learning_rate": 0.00019988586582541892, + "loss": 3.2708330154418945, + "step": 2199, + "token_acc": 0.27159299246282337 + }, + { + "epoch": 1.2896511287012606, + "grad_norm": 0.7825625360605715, + "learning_rate": 0.00019988540242540652, + "loss": 3.178575277328491, + "step": 2200, + "token_acc": 0.2823877524765213 + }, + { + "epoch": 1.2902374670184695, + "grad_norm": 0.7519683535990888, + "learning_rate": 0.0001998849380871054, + "loss": 3.2659499645233154, + "step": 2201, + "token_acc": 0.271232045986735 + }, + { + "epoch": 1.2908238053356786, + "grad_norm": 0.6429486171573856, + "learning_rate": 0.00019988447281051982, + "loss": 3.2065045833587646, + "step": 2202, + "token_acc": 0.2772391075439612 + }, + { + "epoch": 1.2914101436528878, + "grad_norm": 0.6309962708672037, + "learning_rate": 0.00019988400659565425, + "loss": 3.1658382415771484, + "step": 2203, + "token_acc": 0.28441528545119704 + }, + { + "epoch": 1.2919964819700969, + "grad_norm": 0.6632401044365421, + "learning_rate": 0.00019988353944251302, + "loss": 3.262974739074707, + "step": 2204, + "token_acc": 0.27275845896147405 + }, + { + "epoch": 1.2925828202873058, + "grad_norm": 0.7247508636439692, + "learning_rate": 0.00019988307135110055, + "loss": 3.2054362297058105, + "step": 2205, + "token_acc": 0.2795609575947167 + }, + { + "epoch": 1.2931691586045149, + "grad_norm": 0.7405059386418791, + "learning_rate": 0.00019988260232142122, + "loss": 3.1694211959838867, + "step": 2206, + "token_acc": 0.28214910316292363 + }, + { + "epoch": 1.2937554969217238, + "grad_norm": 0.6922066938209215, + "learning_rate": 0.0001998821323534794, + "loss": 3.2345194816589355, + "step": 2207, + "token_acc": 0.2743523647891383 + }, + { + "epoch": 1.2943418352389329, + "grad_norm": 0.7606908562672414, + "learning_rate": 0.00019988166144727958, + "loss": 3.233335018157959, + "step": 2208, + "token_acc": 0.27613305123384374 + }, + { + "epoch": 1.294928173556142, + "grad_norm": 0.6492810415886225, + "learning_rate": 0.00019988118960282615, + "loss": 3.2524380683898926, + "step": 2209, + "token_acc": 0.27368770134546144 + }, + { + "epoch": 1.2955145118733509, + "grad_norm": 0.5637800516502484, + "learning_rate": 0.0001998807168201235, + "loss": 3.2404277324676514, + "step": 2210, + "token_acc": 0.27279534934691513 + }, + { + "epoch": 1.29610085019056, + "grad_norm": 0.6166664962410506, + "learning_rate": 0.0001998802430991761, + "loss": 3.2613275051116943, + "step": 2211, + "token_acc": 0.2733495702928017 + }, + { + "epoch": 1.2966871885077689, + "grad_norm": 0.676391139608334, + "learning_rate": 0.00019987976843998843, + "loss": 3.2212557792663574, + "step": 2212, + "token_acc": 0.27698590788507216 + }, + { + "epoch": 1.297273526824978, + "grad_norm": 0.6862026576124307, + "learning_rate": 0.00019987929284256492, + "loss": 3.2168734073638916, + "step": 2213, + "token_acc": 0.2778010260225353 + }, + { + "epoch": 1.297859865142187, + "grad_norm": 0.7131653085970625, + "learning_rate": 0.00019987881630691005, + "loss": 3.236105442047119, + "step": 2214, + "token_acc": 0.27362127759839877 + }, + { + "epoch": 1.2984462034593962, + "grad_norm": 0.7137339920348341, + "learning_rate": 0.00019987833883302827, + "loss": 3.2486140727996826, + "step": 2215, + "token_acc": 0.27384627721422555 + }, + { + "epoch": 1.299032541776605, + "grad_norm": 0.7174074078760778, + "learning_rate": 0.00019987786042092412, + "loss": 3.2277421951293945, + "step": 2216, + "token_acc": 0.2757485193479669 + }, + { + "epoch": 1.2996188800938142, + "grad_norm": 0.5986700207760487, + "learning_rate": 0.00019987738107060202, + "loss": 3.2506535053253174, + "step": 2217, + "token_acc": 0.27179324956204826 + }, + { + "epoch": 1.300205218411023, + "grad_norm": 0.9365846432662207, + "learning_rate": 0.00019987690078206654, + "loss": 3.238294839859009, + "step": 2218, + "token_acc": 0.2738704964148108 + }, + { + "epoch": 1.3007915567282322, + "grad_norm": 0.8365571196498061, + "learning_rate": 0.00019987641955532216, + "loss": 3.2023885250091553, + "step": 2219, + "token_acc": 0.27832308986219834 + }, + { + "epoch": 1.3013778950454413, + "grad_norm": 0.5952536669010751, + "learning_rate": 0.00019987593739037338, + "loss": 3.202956199645996, + "step": 2220, + "token_acc": 0.27807432958894485 + }, + { + "epoch": 1.3019642333626502, + "grad_norm": 0.6546631752159368, + "learning_rate": 0.00019987545428722477, + "loss": 3.2611093521118164, + "step": 2221, + "token_acc": 0.27222647648774906 + }, + { + "epoch": 1.3025505716798593, + "grad_norm": 0.5203480196727166, + "learning_rate": 0.00019987497024588085, + "loss": 3.208035945892334, + "step": 2222, + "token_acc": 0.27859294453760636 + }, + { + "epoch": 1.3031369099970682, + "grad_norm": 0.691001425135421, + "learning_rate": 0.00019987448526634615, + "loss": 3.2042624950408936, + "step": 2223, + "token_acc": 0.2787840675272461 + }, + { + "epoch": 1.3037232483142773, + "grad_norm": 0.6866352526769937, + "learning_rate": 0.00019987399934862524, + "loss": 3.217952251434326, + "step": 2224, + "token_acc": 0.27749061270317077 + }, + { + "epoch": 1.3043095866314864, + "grad_norm": 0.6644913935534694, + "learning_rate": 0.00019987351249272273, + "loss": 3.2209808826446533, + "step": 2225, + "token_acc": 0.27552274296424983 + }, + { + "epoch": 1.3048959249486953, + "grad_norm": 0.4563799464246214, + "learning_rate": 0.00019987302469864313, + "loss": 3.2819745540618896, + "step": 2226, + "token_acc": 0.2690673817288386 + }, + { + "epoch": 1.3054822632659044, + "grad_norm": 0.6332059547099614, + "learning_rate": 0.00019987253596639103, + "loss": 3.245044231414795, + "step": 2227, + "token_acc": 0.27424212759452116 + }, + { + "epoch": 1.3060686015831133, + "grad_norm": 0.6401487888261346, + "learning_rate": 0.00019987204629597107, + "loss": 3.211263656616211, + "step": 2228, + "token_acc": 0.27881904752375686 + }, + { + "epoch": 1.3066549399003224, + "grad_norm": 0.5456118423587977, + "learning_rate": 0.00019987155568738777, + "loss": 3.1943864822387695, + "step": 2229, + "token_acc": 0.28030357454407373 + }, + { + "epoch": 1.3072412782175316, + "grad_norm": 0.5887965883742874, + "learning_rate": 0.00019987106414064582, + "loss": 3.211409568786621, + "step": 2230, + "token_acc": 0.2771836961832181 + }, + { + "epoch": 1.3078276165347407, + "grad_norm": 0.7659352395773921, + "learning_rate": 0.00019987057165574975, + "loss": 3.222933769226074, + "step": 2231, + "token_acc": 0.2776629356776802 + }, + { + "epoch": 1.3084139548519496, + "grad_norm": 0.8456032469825049, + "learning_rate": 0.0001998700782327043, + "loss": 3.2683181762695312, + "step": 2232, + "token_acc": 0.2706966732297522 + }, + { + "epoch": 1.3090002931691587, + "grad_norm": 0.9133910492023074, + "learning_rate": 0.00019986958387151402, + "loss": 3.2389771938323975, + "step": 2233, + "token_acc": 0.27440895604793214 + }, + { + "epoch": 1.3095866314863676, + "grad_norm": 0.8930941498191963, + "learning_rate": 0.00019986908857218353, + "loss": 3.2202579975128174, + "step": 2234, + "token_acc": 0.2770410053726919 + }, + { + "epoch": 1.3101729698035767, + "grad_norm": 0.5511377077731109, + "learning_rate": 0.00019986859233471756, + "loss": 3.2436373233795166, + "step": 2235, + "token_acc": 0.27343714568499344 + }, + { + "epoch": 1.3107593081207858, + "grad_norm": 0.6617750902178912, + "learning_rate": 0.00019986809515912078, + "loss": 3.2722220420837402, + "step": 2236, + "token_acc": 0.2708851563178634 + }, + { + "epoch": 1.3113456464379947, + "grad_norm": 1.009080456548298, + "learning_rate": 0.00019986759704539777, + "loss": 3.2702579498291016, + "step": 2237, + "token_acc": 0.26745342785637244 + }, + { + "epoch": 1.3119319847552038, + "grad_norm": 0.8767344557309221, + "learning_rate": 0.0001998670979935533, + "loss": 3.2472572326660156, + "step": 2238, + "token_acc": 0.27203619909502263 + }, + { + "epoch": 1.3125183230724127, + "grad_norm": 0.6228109132234975, + "learning_rate": 0.00019986659800359197, + "loss": 3.261760711669922, + "step": 2239, + "token_acc": 0.2709176576406974 + }, + { + "epoch": 1.3131046613896218, + "grad_norm": 0.765110871533862, + "learning_rate": 0.00019986609707551856, + "loss": 3.208376407623291, + "step": 2240, + "token_acc": 0.2765927422911097 + }, + { + "epoch": 1.313690999706831, + "grad_norm": 0.8750214227254173, + "learning_rate": 0.0001998655952093377, + "loss": 3.183520793914795, + "step": 2241, + "token_acc": 0.28137957899160954 + }, + { + "epoch": 1.31427733802404, + "grad_norm": 0.80459460675569, + "learning_rate": 0.00019986509240505417, + "loss": 3.2631258964538574, + "step": 2242, + "token_acc": 0.2708663223753606 + }, + { + "epoch": 1.314863676341249, + "grad_norm": 0.5970988188248154, + "learning_rate": 0.00019986458866267268, + "loss": 3.2179551124572754, + "step": 2243, + "token_acc": 0.27717829783674536 + }, + { + "epoch": 1.315450014658458, + "grad_norm": 0.6176680112576002, + "learning_rate": 0.00019986408398219791, + "loss": 3.232510566711426, + "step": 2244, + "token_acc": 0.2741283894483491 + }, + { + "epoch": 1.316036352975667, + "grad_norm": 0.700601417333807, + "learning_rate": 0.00019986357836363467, + "loss": 3.192729949951172, + "step": 2245, + "token_acc": 0.2806099774578204 + }, + { + "epoch": 1.316622691292876, + "grad_norm": 0.5887891281797769, + "learning_rate": 0.00019986307180698768, + "loss": 3.2269630432128906, + "step": 2246, + "token_acc": 0.2763050440454765 + }, + { + "epoch": 1.3172090296100851, + "grad_norm": 0.689922146390308, + "learning_rate": 0.0001998625643122617, + "loss": 3.225935935974121, + "step": 2247, + "token_acc": 0.27600047096377506 + }, + { + "epoch": 1.317795367927294, + "grad_norm": 0.6515404401005768, + "learning_rate": 0.00019986205587946146, + "loss": 3.2339959144592285, + "step": 2248, + "token_acc": 0.27438105406292734 + }, + { + "epoch": 1.3183817062445031, + "grad_norm": 0.5004332671999709, + "learning_rate": 0.00019986154650859182, + "loss": 3.197260856628418, + "step": 2249, + "token_acc": 0.27783522204887656 + }, + { + "epoch": 1.318968044561712, + "grad_norm": 0.5869524509361221, + "learning_rate": 0.0001998610361996575, + "loss": 3.2309985160827637, + "step": 2250, + "token_acc": 0.2741105999591882 + }, + { + "epoch": 1.3195543828789211, + "grad_norm": 0.7122138470828963, + "learning_rate": 0.00019986052495266326, + "loss": 3.1646969318389893, + "step": 2251, + "token_acc": 0.28456891106107973 + }, + { + "epoch": 1.3201407211961302, + "grad_norm": 0.6227812834177969, + "learning_rate": 0.000199860012767614, + "loss": 3.279928684234619, + "step": 2252, + "token_acc": 0.26896610901360735 + }, + { + "epoch": 1.3207270595133391, + "grad_norm": 0.6328106520193089, + "learning_rate": 0.0001998594996445145, + "loss": 3.2405104637145996, + "step": 2253, + "token_acc": 0.27473872467040533 + }, + { + "epoch": 1.3213133978305482, + "grad_norm": 0.6028168797756295, + "learning_rate": 0.00019985898558336953, + "loss": 3.237025499343872, + "step": 2254, + "token_acc": 0.27497195012795783 + }, + { + "epoch": 1.3218997361477571, + "grad_norm": 0.6781529378114041, + "learning_rate": 0.00019985847058418395, + "loss": 3.2425265312194824, + "step": 2255, + "token_acc": 0.2728344552643618 + }, + { + "epoch": 1.3224860744649662, + "grad_norm": 0.6709203509667832, + "learning_rate": 0.00019985795464696262, + "loss": 3.250643014907837, + "step": 2256, + "token_acc": 0.27371561223096397 + }, + { + "epoch": 1.3230724127821754, + "grad_norm": 0.6159761443485521, + "learning_rate": 0.00019985743777171036, + "loss": 3.1767172813415527, + "step": 2257, + "token_acc": 0.28017775419513163 + }, + { + "epoch": 1.3236587510993845, + "grad_norm": 0.827291773636289, + "learning_rate": 0.00019985691995843202, + "loss": 3.199859857559204, + "step": 2258, + "token_acc": 0.27957185621610464 + }, + { + "epoch": 1.3242450894165934, + "grad_norm": 0.5843705585078939, + "learning_rate": 0.0001998564012071325, + "loss": 3.253561496734619, + "step": 2259, + "token_acc": 0.27299517032981224 + }, + { + "epoch": 1.3248314277338025, + "grad_norm": 0.7190081360302373, + "learning_rate": 0.0001998558815178166, + "loss": 3.2165870666503906, + "step": 2260, + "token_acc": 0.27615683998320906 + }, + { + "epoch": 1.3254177660510114, + "grad_norm": 0.8784520031631571, + "learning_rate": 0.00019985536089048932, + "loss": 3.26008939743042, + "step": 2261, + "token_acc": 0.27026334489490245 + }, + { + "epoch": 1.3260041043682205, + "grad_norm": 0.7390322447560675, + "learning_rate": 0.0001998548393251554, + "loss": 3.2137928009033203, + "step": 2262, + "token_acc": 0.2756235250566841 + }, + { + "epoch": 1.3265904426854296, + "grad_norm": 0.5443228917546807, + "learning_rate": 0.00019985431682181988, + "loss": 3.201265335083008, + "step": 2263, + "token_acc": 0.2792324227966235 + }, + { + "epoch": 1.3271767810026385, + "grad_norm": 0.5575561333066834, + "learning_rate": 0.0001998537933804876, + "loss": 3.19649076461792, + "step": 2264, + "token_acc": 0.28025750505624664 + }, + { + "epoch": 1.3277631193198476, + "grad_norm": 0.48236433167077003, + "learning_rate": 0.00019985326900116347, + "loss": 3.2390646934509277, + "step": 2265, + "token_acc": 0.27410909127562105 + }, + { + "epoch": 1.3283494576370565, + "grad_norm": 0.5218286509775047, + "learning_rate": 0.00019985274368385244, + "loss": 3.232114553451538, + "step": 2266, + "token_acc": 0.2740335892048921 + }, + { + "epoch": 1.3289357959542656, + "grad_norm": 0.5376447609392558, + "learning_rate": 0.00019985221742855943, + "loss": 3.213789939880371, + "step": 2267, + "token_acc": 0.2782256401797864 + }, + { + "epoch": 1.3295221342714747, + "grad_norm": 0.5982102119809181, + "learning_rate": 0.0001998516902352894, + "loss": 3.208268404006958, + "step": 2268, + "token_acc": 0.27627221298694604 + }, + { + "epoch": 1.3301084725886836, + "grad_norm": 0.5571299573171761, + "learning_rate": 0.0001998511621040473, + "loss": 3.2628281116485596, + "step": 2269, + "token_acc": 0.27125648071476416 + }, + { + "epoch": 1.3306948109058927, + "grad_norm": 0.6002613267922414, + "learning_rate": 0.0001998506330348381, + "loss": 3.192324161529541, + "step": 2270, + "token_acc": 0.2790401160982689 + }, + { + "epoch": 1.3312811492231018, + "grad_norm": 0.6672970821280866, + "learning_rate": 0.00019985010302766673, + "loss": 3.2170486450195312, + "step": 2271, + "token_acc": 0.2753741905854105 + }, + { + "epoch": 1.3318674875403107, + "grad_norm": 0.6512075256222587, + "learning_rate": 0.00019984957208253815, + "loss": 3.2053141593933105, + "step": 2272, + "token_acc": 0.27761073667795305 + }, + { + "epoch": 1.3324538258575198, + "grad_norm": 0.526392866128523, + "learning_rate": 0.00019984904019945744, + "loss": 3.2353649139404297, + "step": 2273, + "token_acc": 0.27385611882231153 + }, + { + "epoch": 1.333040164174729, + "grad_norm": 0.5005279643343734, + "learning_rate": 0.00019984850737842953, + "loss": 3.2453670501708984, + "step": 2274, + "token_acc": 0.2743985618357743 + }, + { + "epoch": 1.3336265024919378, + "grad_norm": 0.5653080210755295, + "learning_rate": 0.00019984797361945943, + "loss": 3.2648744583129883, + "step": 2275, + "token_acc": 0.2692078742081471 + }, + { + "epoch": 1.334212840809147, + "grad_norm": 0.6128537822142958, + "learning_rate": 0.0001998474389225522, + "loss": 3.2634871006011963, + "step": 2276, + "token_acc": 0.2705781708585041 + }, + { + "epoch": 1.3347991791263558, + "grad_norm": 0.543765648879881, + "learning_rate": 0.0001998469032877128, + "loss": 3.234138250350952, + "step": 2277, + "token_acc": 0.27253592501906637 + }, + { + "epoch": 1.335385517443565, + "grad_norm": 0.6509227426809466, + "learning_rate": 0.0001998463667149463, + "loss": 3.158964157104492, + "step": 2278, + "token_acc": 0.28470620575905436 + }, + { + "epoch": 1.335971855760774, + "grad_norm": 0.6258121495301059, + "learning_rate": 0.00019984582920425773, + "loss": 3.2377443313598633, + "step": 2279, + "token_acc": 0.272644602264654 + }, + { + "epoch": 1.336558194077983, + "grad_norm": 0.5524376756774653, + "learning_rate": 0.00019984529075565212, + "loss": 3.2220206260681152, + "step": 2280, + "token_acc": 0.27683494164872263 + }, + { + "epoch": 1.337144532395192, + "grad_norm": 0.594201190056979, + "learning_rate": 0.00019984475136913457, + "loss": 3.2362303733825684, + "step": 2281, + "token_acc": 0.2747722914353091 + }, + { + "epoch": 1.337730870712401, + "grad_norm": 0.6699420353678947, + "learning_rate": 0.00019984421104471013, + "loss": 3.1627087593078613, + "step": 2282, + "token_acc": 0.28453556234677063 + }, + { + "epoch": 1.33831720902961, + "grad_norm": 0.5502641986777778, + "learning_rate": 0.00019984366978238383, + "loss": 3.1949961185455322, + "step": 2283, + "token_acc": 0.27853903232527205 + }, + { + "epoch": 1.3389035473468192, + "grad_norm": 0.543135943297599, + "learning_rate": 0.0001998431275821608, + "loss": 3.2371275424957275, + "step": 2284, + "token_acc": 0.2733758035044426 + }, + { + "epoch": 1.3394898856640283, + "grad_norm": 0.6619079985080465, + "learning_rate": 0.00019984258444404615, + "loss": 3.233591079711914, + "step": 2285, + "token_acc": 0.2740162125326743 + }, + { + "epoch": 1.3400762239812372, + "grad_norm": 0.5913094287642736, + "learning_rate": 0.00019984204036804494, + "loss": 3.328991174697876, + "step": 2286, + "token_acc": 0.26215950066267335 + }, + { + "epoch": 1.3406625622984463, + "grad_norm": 0.6062428982943917, + "learning_rate": 0.0001998414953541623, + "loss": 3.1826672554016113, + "step": 2287, + "token_acc": 0.28090535346045103 + }, + { + "epoch": 1.3412489006156552, + "grad_norm": 0.7284424605106146, + "learning_rate": 0.00019984094940240338, + "loss": 3.25720477104187, + "step": 2288, + "token_acc": 0.26918541280909797 + }, + { + "epoch": 1.3418352389328643, + "grad_norm": 0.7718300280624945, + "learning_rate": 0.00019984040251277329, + "loss": 3.2442948818206787, + "step": 2289, + "token_acc": 0.27410718749259994 + }, + { + "epoch": 1.3424215772500734, + "grad_norm": 0.807390073401052, + "learning_rate": 0.00019983985468527708, + "loss": 3.2178585529327393, + "step": 2290, + "token_acc": 0.2757340080535077 + }, + { + "epoch": 1.3430079155672823, + "grad_norm": 0.6404755554627158, + "learning_rate": 0.00019983930591992, + "loss": 3.2205185890197754, + "step": 2291, + "token_acc": 0.27700200160759036 + }, + { + "epoch": 1.3435942538844914, + "grad_norm": 0.553621751543449, + "learning_rate": 0.00019983875621670723, + "loss": 3.2219204902648926, + "step": 2292, + "token_acc": 0.2742485572707437 + }, + { + "epoch": 1.3441805922017003, + "grad_norm": 0.7061550096923472, + "learning_rate": 0.0001998382055756438, + "loss": 3.2496695518493652, + "step": 2293, + "token_acc": 0.2730970952124798 + }, + { + "epoch": 1.3447669305189094, + "grad_norm": 0.6849586960660874, + "learning_rate": 0.000199837653996735, + "loss": 3.274606466293335, + "step": 2294, + "token_acc": 0.26955158341042634 + }, + { + "epoch": 1.3453532688361185, + "grad_norm": 0.7289622826884206, + "learning_rate": 0.00019983710147998597, + "loss": 3.163226366043091, + "step": 2295, + "token_acc": 0.28403002514659875 + }, + { + "epoch": 1.3459396071533274, + "grad_norm": 0.7688357287609365, + "learning_rate": 0.00019983654802540194, + "loss": 3.2314798831939697, + "step": 2296, + "token_acc": 0.2742084156658738 + }, + { + "epoch": 1.3465259454705365, + "grad_norm": 0.5642437501537321, + "learning_rate": 0.00019983599363298802, + "loss": 3.2372398376464844, + "step": 2297, + "token_acc": 0.27441326111931724 + }, + { + "epoch": 1.3471122837877456, + "grad_norm": 0.6241511696739673, + "learning_rate": 0.00019983543830274952, + "loss": 3.2144174575805664, + "step": 2298, + "token_acc": 0.2769318829278027 + }, + { + "epoch": 1.3476986221049545, + "grad_norm": 0.5972181630469828, + "learning_rate": 0.00019983488203469157, + "loss": 3.274454116821289, + "step": 2299, + "token_acc": 0.26843078732784215 + }, + { + "epoch": 1.3482849604221636, + "grad_norm": 0.5406282947798213, + "learning_rate": 0.00019983432482881944, + "loss": 3.2105770111083984, + "step": 2300, + "token_acc": 0.27812797117207977 + }, + { + "epoch": 1.3488712987393727, + "grad_norm": 0.5522460594731188, + "learning_rate": 0.00019983376668513835, + "loss": 3.185898780822754, + "step": 2301, + "token_acc": 0.2811006360565548 + }, + { + "epoch": 1.3494576370565816, + "grad_norm": 0.520622267451164, + "learning_rate": 0.00019983320760365357, + "loss": 3.185917615890503, + "step": 2302, + "token_acc": 0.278965231810255 + }, + { + "epoch": 1.3500439753737907, + "grad_norm": 0.6353993616355507, + "learning_rate": 0.00019983264758437032, + "loss": 3.2158405780792236, + "step": 2303, + "token_acc": 0.27638747397794217 + }, + { + "epoch": 1.3506303136909996, + "grad_norm": 0.8916374222969062, + "learning_rate": 0.00019983208662729385, + "loss": 3.200443744659424, + "step": 2304, + "token_acc": 0.2782498921532562 + }, + { + "epoch": 1.3512166520082087, + "grad_norm": 0.8026032583630455, + "learning_rate": 0.00019983152473242947, + "loss": 3.260380268096924, + "step": 2305, + "token_acc": 0.27103291479357206 + }, + { + "epoch": 1.3518029903254178, + "grad_norm": 0.6341304989208912, + "learning_rate": 0.00019983096189978244, + "loss": 3.1936960220336914, + "step": 2306, + "token_acc": 0.28008496663886573 + }, + { + "epoch": 1.3523893286426267, + "grad_norm": 0.584122256168139, + "learning_rate": 0.00019983039812935806, + "loss": 3.2230868339538574, + "step": 2307, + "token_acc": 0.2768247829597217 + }, + { + "epoch": 1.3529756669598358, + "grad_norm": 0.6495324025482342, + "learning_rate": 0.0001998298334211616, + "loss": 3.30298113822937, + "step": 2308, + "token_acc": 0.2657771204253706 + }, + { + "epoch": 1.3535620052770447, + "grad_norm": 0.5313045079574161, + "learning_rate": 0.00019982926777519836, + "loss": 3.273165702819824, + "step": 2309, + "token_acc": 0.26882349484167056 + }, + { + "epoch": 1.3541483435942538, + "grad_norm": 0.5889348974434815, + "learning_rate": 0.0001998287011914737, + "loss": 3.207526922225952, + "step": 2310, + "token_acc": 0.2781034070731651 + }, + { + "epoch": 1.354734681911463, + "grad_norm": 0.6029770858900678, + "learning_rate": 0.00019982813366999295, + "loss": 3.217254638671875, + "step": 2311, + "token_acc": 0.2779865550355765 + }, + { + "epoch": 1.355321020228672, + "grad_norm": 0.4482869287343374, + "learning_rate": 0.00019982756521076133, + "loss": 3.1985254287719727, + "step": 2312, + "token_acc": 0.27998129403376787 + }, + { + "epoch": 1.355907358545881, + "grad_norm": 0.5913790783237538, + "learning_rate": 0.00019982699581378428, + "loss": 3.203127145767212, + "step": 2313, + "token_acc": 0.2761919782722597 + }, + { + "epoch": 1.35649369686309, + "grad_norm": 0.5958282437330858, + "learning_rate": 0.0001998264254790671, + "loss": 3.253300189971924, + "step": 2314, + "token_acc": 0.2701720766009949 + }, + { + "epoch": 1.357080035180299, + "grad_norm": 0.4361716900632453, + "learning_rate": 0.0001998258542066152, + "loss": 3.1949198246002197, + "step": 2315, + "token_acc": 0.27966022329485196 + }, + { + "epoch": 1.357666373497508, + "grad_norm": 0.5560691823152235, + "learning_rate": 0.00019982528199643393, + "loss": 3.215174674987793, + "step": 2316, + "token_acc": 0.2759738021371941 + }, + { + "epoch": 1.3582527118147172, + "grad_norm": 0.6014850157988356, + "learning_rate": 0.00019982470884852865, + "loss": 3.2045233249664307, + "step": 2317, + "token_acc": 0.276458151314626 + }, + { + "epoch": 1.358839050131926, + "grad_norm": 0.5856864457684887, + "learning_rate": 0.00019982413476290472, + "loss": 3.2175469398498535, + "step": 2318, + "token_acc": 0.27388324415318943 + }, + { + "epoch": 1.3594253884491352, + "grad_norm": 0.5750138873305165, + "learning_rate": 0.00019982355973956756, + "loss": 3.2004001140594482, + "step": 2319, + "token_acc": 0.27929333692045555 + }, + { + "epoch": 1.360011726766344, + "grad_norm": 0.8072528462993173, + "learning_rate": 0.00019982298377852257, + "loss": 3.1702215671539307, + "step": 2320, + "token_acc": 0.28318932990754014 + }, + { + "epoch": 1.3605980650835532, + "grad_norm": 0.7485061521492183, + "learning_rate": 0.00019982240687977518, + "loss": 3.254049301147461, + "step": 2321, + "token_acc": 0.27151328816823866 + }, + { + "epoch": 1.3611844034007623, + "grad_norm": 0.737655221888409, + "learning_rate": 0.00019982182904333077, + "loss": 3.223850727081299, + "step": 2322, + "token_acc": 0.27522641618527494 + }, + { + "epoch": 1.3617707417179712, + "grad_norm": 0.6708598417714791, + "learning_rate": 0.0001998212502691948, + "loss": 3.2333619594573975, + "step": 2323, + "token_acc": 0.27585207675678725 + }, + { + "epoch": 1.3623570800351803, + "grad_norm": 0.5251337659452316, + "learning_rate": 0.00019982067055737267, + "loss": 3.2231106758117676, + "step": 2324, + "token_acc": 0.275502283707791 + }, + { + "epoch": 1.3629434183523892, + "grad_norm": 0.5057244774599616, + "learning_rate": 0.00019982008990786988, + "loss": 3.2434182167053223, + "step": 2325, + "token_acc": 0.2733519512015874 + }, + { + "epoch": 1.3635297566695983, + "grad_norm": 0.6742115577405382, + "learning_rate": 0.00019981950832069186, + "loss": 3.2518272399902344, + "step": 2326, + "token_acc": 0.2722143936349346 + }, + { + "epoch": 1.3641160949868074, + "grad_norm": 0.7967552228871582, + "learning_rate": 0.00019981892579584406, + "loss": 3.2046470642089844, + "step": 2327, + "token_acc": 0.2771851183075018 + }, + { + "epoch": 1.3647024333040165, + "grad_norm": 0.6365444965383319, + "learning_rate": 0.00019981834233333197, + "loss": 3.259977340698242, + "step": 2328, + "token_acc": 0.26872736003123754 + }, + { + "epoch": 1.3652887716212254, + "grad_norm": 0.5333450647854192, + "learning_rate": 0.00019981775793316103, + "loss": 3.192427396774292, + "step": 2329, + "token_acc": 0.279133975670701 + }, + { + "epoch": 1.3658751099384345, + "grad_norm": 0.5252276987943667, + "learning_rate": 0.00019981717259533676, + "loss": 3.23946475982666, + "step": 2330, + "token_acc": 0.27350827744334877 + }, + { + "epoch": 1.3664614482556434, + "grad_norm": 0.5863381450647908, + "learning_rate": 0.0001998165863198647, + "loss": 3.1982219219207764, + "step": 2331, + "token_acc": 0.27799400090831317 + }, + { + "epoch": 1.3670477865728525, + "grad_norm": 0.6108867726281499, + "learning_rate": 0.00019981599910675026, + "loss": 3.2414894104003906, + "step": 2332, + "token_acc": 0.2736137221125383 + }, + { + "epoch": 1.3676341248900616, + "grad_norm": 0.7113411580867973, + "learning_rate": 0.00019981541095599905, + "loss": 3.242356300354004, + "step": 2333, + "token_acc": 0.27478681788759746 + }, + { + "epoch": 1.3682204632072705, + "grad_norm": 0.5791722020006749, + "learning_rate": 0.00019981482186761656, + "loss": 3.2137610912323, + "step": 2334, + "token_acc": 0.27603771301583113 + }, + { + "epoch": 1.3688068015244796, + "grad_norm": 0.5424069615258724, + "learning_rate": 0.0001998142318416083, + "loss": 3.1894490718841553, + "step": 2335, + "token_acc": 0.279772065405126 + }, + { + "epoch": 1.3693931398416885, + "grad_norm": 0.5035732296910779, + "learning_rate": 0.00019981364087797986, + "loss": 3.1840128898620605, + "step": 2336, + "token_acc": 0.2813259961761568 + }, + { + "epoch": 1.3699794781588976, + "grad_norm": 0.6264023162641467, + "learning_rate": 0.00019981304897673675, + "loss": 3.2200043201446533, + "step": 2337, + "token_acc": 0.27721157032861726 + }, + { + "epoch": 1.3705658164761068, + "grad_norm": 0.6514884948140748, + "learning_rate": 0.00019981245613788452, + "loss": 3.2242660522460938, + "step": 2338, + "token_acc": 0.2758242906170619 + }, + { + "epoch": 1.3711521547933159, + "grad_norm": 0.5291656236228282, + "learning_rate": 0.00019981186236142878, + "loss": 3.199312686920166, + "step": 2339, + "token_acc": 0.2786369761920041 + }, + { + "epoch": 1.3717384931105248, + "grad_norm": 0.4565481505770869, + "learning_rate": 0.0001998112676473751, + "loss": 3.2137539386749268, + "step": 2340, + "token_acc": 0.27764540774914515 + }, + { + "epoch": 1.3723248314277339, + "grad_norm": 0.5558804257886234, + "learning_rate": 0.00019981067199572908, + "loss": 3.249399185180664, + "step": 2341, + "token_acc": 0.27489022621241227 + }, + { + "epoch": 1.3729111697449428, + "grad_norm": 0.7062373053707898, + "learning_rate": 0.00019981007540649626, + "loss": 3.239898681640625, + "step": 2342, + "token_acc": 0.2759300705867622 + }, + { + "epoch": 1.3734975080621519, + "grad_norm": 0.8363165494654855, + "learning_rate": 0.00019980947787968226, + "loss": 3.2085115909576416, + "step": 2343, + "token_acc": 0.2780588166091669 + }, + { + "epoch": 1.374083846379361, + "grad_norm": 0.6247519410570893, + "learning_rate": 0.00019980887941529273, + "loss": 3.2052500247955322, + "step": 2344, + "token_acc": 0.27813829500780873 + }, + { + "epoch": 1.3746701846965699, + "grad_norm": 0.5549906609830094, + "learning_rate": 0.0001998082800133333, + "loss": 3.275214195251465, + "step": 2345, + "token_acc": 0.2680938476347191 + }, + { + "epoch": 1.375256523013779, + "grad_norm": 0.683637149844554, + "learning_rate": 0.00019980767967380954, + "loss": 3.1837263107299805, + "step": 2346, + "token_acc": 0.28113576082953984 + }, + { + "epoch": 1.3758428613309879, + "grad_norm": 0.6688027626371712, + "learning_rate": 0.00019980707839672713, + "loss": 3.220227003097534, + "step": 2347, + "token_acc": 0.27730263331451516 + }, + { + "epoch": 1.376429199648197, + "grad_norm": 0.6017286270436637, + "learning_rate": 0.00019980647618209171, + "loss": 3.2132978439331055, + "step": 2348, + "token_acc": 0.27634790503912193 + }, + { + "epoch": 1.377015537965406, + "grad_norm": 0.4831958397930829, + "learning_rate": 0.00019980587302990895, + "loss": 3.2312119007110596, + "step": 2349, + "token_acc": 0.27301190201642417 + }, + { + "epoch": 1.377601876282615, + "grad_norm": 0.5930291861968073, + "learning_rate": 0.0001998052689401845, + "loss": 3.1645560264587402, + "step": 2350, + "token_acc": 0.2840330780360248 + }, + { + "epoch": 1.378188214599824, + "grad_norm": 0.5622695346890244, + "learning_rate": 0.00019980466391292403, + "loss": 3.2629756927490234, + "step": 2351, + "token_acc": 0.2710935200370043 + }, + { + "epoch": 1.378774552917033, + "grad_norm": 0.6856628267589485, + "learning_rate": 0.0001998040579481332, + "loss": 3.2333879470825195, + "step": 2352, + "token_acc": 0.27330692104856597 + }, + { + "epoch": 1.379360891234242, + "grad_norm": 0.6863675546101641, + "learning_rate": 0.00019980345104581777, + "loss": 3.2529211044311523, + "step": 2353, + "token_acc": 0.2715952691528902 + }, + { + "epoch": 1.3799472295514512, + "grad_norm": 0.6899547873172952, + "learning_rate": 0.0001998028432059834, + "loss": 3.2395944595336914, + "step": 2354, + "token_acc": 0.2726540005264743 + }, + { + "epoch": 1.3805335678686603, + "grad_norm": 0.6177271704604851, + "learning_rate": 0.0001998022344286358, + "loss": 3.229825973510742, + "step": 2355, + "token_acc": 0.2758487321610102 + }, + { + "epoch": 1.3811199061858692, + "grad_norm": 0.623916065446246, + "learning_rate": 0.0001998016247137807, + "loss": 3.2398109436035156, + "step": 2356, + "token_acc": 0.2735699090934939 + }, + { + "epoch": 1.3817062445030783, + "grad_norm": 0.567433224816844, + "learning_rate": 0.0001998010140614238, + "loss": 3.215573787689209, + "step": 2357, + "token_acc": 0.27671625281107 + }, + { + "epoch": 1.3822925828202872, + "grad_norm": 0.6598461412279835, + "learning_rate": 0.00019980040247157091, + "loss": 3.1701865196228027, + "step": 2358, + "token_acc": 0.28190160952444854 + }, + { + "epoch": 1.3828789211374963, + "grad_norm": 0.6705453813496319, + "learning_rate": 0.00019979978994422767, + "loss": 3.2331960201263428, + "step": 2359, + "token_acc": 0.2736695959484404 + }, + { + "epoch": 1.3834652594547054, + "grad_norm": 0.6921995770859319, + "learning_rate": 0.00019979917647939988, + "loss": 3.2129034996032715, + "step": 2360, + "token_acc": 0.27447688934007985 + }, + { + "epoch": 1.3840515977719143, + "grad_norm": 0.6961748960749062, + "learning_rate": 0.00019979856207709334, + "loss": 3.2598257064819336, + "step": 2361, + "token_acc": 0.27075521005119985 + }, + { + "epoch": 1.3846379360891234, + "grad_norm": 0.6664236138689095, + "learning_rate": 0.00019979794673731375, + "loss": 3.2386069297790527, + "step": 2362, + "token_acc": 0.27316250203976356 + }, + { + "epoch": 1.3852242744063323, + "grad_norm": 0.5350056839982936, + "learning_rate": 0.00019979733046006696, + "loss": 3.2003543376922607, + "step": 2363, + "token_acc": 0.27673492378530756 + }, + { + "epoch": 1.3858106127235414, + "grad_norm": 0.5791184182126338, + "learning_rate": 0.00019979671324535874, + "loss": 3.2376790046691895, + "step": 2364, + "token_acc": 0.27297032744464056 + }, + { + "epoch": 1.3863969510407506, + "grad_norm": 0.5941116777472292, + "learning_rate": 0.00019979609509319487, + "loss": 3.2144241333007812, + "step": 2365, + "token_acc": 0.2756934051551069 + }, + { + "epoch": 1.3869832893579597, + "grad_norm": 0.5665023755124522, + "learning_rate": 0.00019979547600358115, + "loss": 3.249159336090088, + "step": 2366, + "token_acc": 0.2718142607769924 + }, + { + "epoch": 1.3875696276751686, + "grad_norm": 0.5406878436917767, + "learning_rate": 0.0001997948559765234, + "loss": 3.207214832305908, + "step": 2367, + "token_acc": 0.2771076390537852 + }, + { + "epoch": 1.3881559659923777, + "grad_norm": 0.6048104840838411, + "learning_rate": 0.00019979423501202746, + "loss": 3.2486190795898438, + "step": 2368, + "token_acc": 0.2729608237791709 + }, + { + "epoch": 1.3887423043095866, + "grad_norm": 0.6415322916823561, + "learning_rate": 0.00019979361311009918, + "loss": 3.2393012046813965, + "step": 2369, + "token_acc": 0.2724976393447826 + }, + { + "epoch": 1.3893286426267957, + "grad_norm": 0.7798264213529195, + "learning_rate": 0.00019979299027074435, + "loss": 3.228355884552002, + "step": 2370, + "token_acc": 0.27472287316675004 + }, + { + "epoch": 1.3899149809440048, + "grad_norm": 0.714062649363242, + "learning_rate": 0.0001997923664939689, + "loss": 3.142852783203125, + "step": 2371, + "token_acc": 0.2855816119518031 + }, + { + "epoch": 1.3905013192612137, + "grad_norm": 0.4872556186733384, + "learning_rate": 0.00019979174177977858, + "loss": 3.2286200523376465, + "step": 2372, + "token_acc": 0.2735760971055089 + }, + { + "epoch": 1.3910876575784228, + "grad_norm": 0.518015842590596, + "learning_rate": 0.00019979111612817934, + "loss": 3.178013801574707, + "step": 2373, + "token_acc": 0.27998639617031584 + }, + { + "epoch": 1.3916739958956317, + "grad_norm": 0.6617056547679382, + "learning_rate": 0.00019979048953917703, + "loss": 3.2063825130462646, + "step": 2374, + "token_acc": 0.2755368359051696 + }, + { + "epoch": 1.3922603342128408, + "grad_norm": 0.5868555124254369, + "learning_rate": 0.00019978986201277753, + "loss": 3.2318530082702637, + "step": 2375, + "token_acc": 0.2750218121213267 + }, + { + "epoch": 1.39284667253005, + "grad_norm": 0.6275465511085035, + "learning_rate": 0.00019978923354898678, + "loss": 3.2033700942993164, + "step": 2376, + "token_acc": 0.2784625454430749 + }, + { + "epoch": 1.3934330108472588, + "grad_norm": 0.7537940168930265, + "learning_rate": 0.00019978860414781061, + "loss": 3.250974655151367, + "step": 2377, + "token_acc": 0.27068091131419747 + }, + { + "epoch": 1.394019349164468, + "grad_norm": 0.7391654192718113, + "learning_rate": 0.000199787973809255, + "loss": 3.231107234954834, + "step": 2378, + "token_acc": 0.2753643515957184 + }, + { + "epoch": 1.3946056874816768, + "grad_norm": 0.7801532073842946, + "learning_rate": 0.00019978734253332583, + "loss": 3.2028980255126953, + "step": 2379, + "token_acc": 0.2796999285299275 + }, + { + "epoch": 1.395192025798886, + "grad_norm": 0.8112482551412615, + "learning_rate": 0.00019978671032002903, + "loss": 3.2664527893066406, + "step": 2380, + "token_acc": 0.27029385400112643 + }, + { + "epoch": 1.395778364116095, + "grad_norm": 0.6584939446017312, + "learning_rate": 0.00019978607716937056, + "loss": 3.2783241271972656, + "step": 2381, + "token_acc": 0.2664420988812552 + }, + { + "epoch": 1.3963647024333041, + "grad_norm": 0.504938948741125, + "learning_rate": 0.00019978544308135634, + "loss": 3.2081007957458496, + "step": 2382, + "token_acc": 0.27739006404087646 + }, + { + "epoch": 1.396951040750513, + "grad_norm": 0.63134838024265, + "learning_rate": 0.00019978480805599237, + "loss": 3.2313427925109863, + "step": 2383, + "token_acc": 0.2729746265227571 + }, + { + "epoch": 1.3975373790677221, + "grad_norm": 0.697818896389499, + "learning_rate": 0.0001997841720932846, + "loss": 3.2521848678588867, + "step": 2384, + "token_acc": 0.27125546625616476 + }, + { + "epoch": 1.398123717384931, + "grad_norm": 0.5015365181197838, + "learning_rate": 0.000199783535193239, + "loss": 3.227332592010498, + "step": 2385, + "token_acc": 0.276129883843717 + }, + { + "epoch": 1.3987100557021401, + "grad_norm": 0.5464465531096463, + "learning_rate": 0.00019978289735586149, + "loss": 3.240340232849121, + "step": 2386, + "token_acc": 0.2731998514020727 + }, + { + "epoch": 1.3992963940193492, + "grad_norm": 0.7590414798245688, + "learning_rate": 0.00019978225858115816, + "loss": 3.249244213104248, + "step": 2387, + "token_acc": 0.2715142894349337 + }, + { + "epoch": 1.3998827323365581, + "grad_norm": 0.6888253009341045, + "learning_rate": 0.00019978161886913495, + "loss": 3.229674816131592, + "step": 2388, + "token_acc": 0.2764979406055231 + }, + { + "epoch": 1.4004690706537672, + "grad_norm": 0.6199260499984348, + "learning_rate": 0.0001997809782197979, + "loss": 3.2093381881713867, + "step": 2389, + "token_acc": 0.27615976536629977 + }, + { + "epoch": 1.4010554089709761, + "grad_norm": 0.5217379877598975, + "learning_rate": 0.00019978033663315304, + "loss": 3.2073476314544678, + "step": 2390, + "token_acc": 0.2771758081719724 + }, + { + "epoch": 1.4016417472881852, + "grad_norm": 0.567808325841439, + "learning_rate": 0.00019977969410920634, + "loss": 3.2124552726745605, + "step": 2391, + "token_acc": 0.27608767711418963 + }, + { + "epoch": 1.4022280856053944, + "grad_norm": 0.6140491549357279, + "learning_rate": 0.00019977905064796388, + "loss": 3.236581802368164, + "step": 2392, + "token_acc": 0.2758689023584681 + }, + { + "epoch": 1.4028144239226035, + "grad_norm": 0.5686894086779063, + "learning_rate": 0.00019977840624943167, + "loss": 3.2035863399505615, + "step": 2393, + "token_acc": 0.2768361581920904 + }, + { + "epoch": 1.4034007622398124, + "grad_norm": 0.5972326681685846, + "learning_rate": 0.00019977776091361583, + "loss": 3.2221245765686035, + "step": 2394, + "token_acc": 0.27841265236558416 + }, + { + "epoch": 1.4039871005570215, + "grad_norm": 0.4943320081203837, + "learning_rate": 0.00019977711464052233, + "loss": 3.184516668319702, + "step": 2395, + "token_acc": 0.2796301576106801 + }, + { + "epoch": 1.4045734388742304, + "grad_norm": 0.7119464937378809, + "learning_rate": 0.00019977646743015733, + "loss": 3.2168521881103516, + "step": 2396, + "token_acc": 0.27513242951460637 + }, + { + "epoch": 1.4051597771914395, + "grad_norm": 0.6645047305604501, + "learning_rate": 0.00019977581928252685, + "loss": 3.195078134536743, + "step": 2397, + "token_acc": 0.2799557319452792 + }, + { + "epoch": 1.4057461155086486, + "grad_norm": 0.44322205655539476, + "learning_rate": 0.000199775170197637, + "loss": 3.194641590118408, + "step": 2398, + "token_acc": 0.27951687403049474 + }, + { + "epoch": 1.4063324538258575, + "grad_norm": 0.5374841510988142, + "learning_rate": 0.00019977452017549388, + "loss": 3.2574822902679443, + "step": 2399, + "token_acc": 0.2714746574602407 + }, + { + "epoch": 1.4069187921430666, + "grad_norm": 0.6055705981763689, + "learning_rate": 0.0001997738692161036, + "loss": 3.228893280029297, + "step": 2400, + "token_acc": 0.2745166001672188 + }, + { + "epoch": 1.4075051304602755, + "grad_norm": 0.5640662013810072, + "learning_rate": 0.00019977321731947225, + "loss": 3.267703056335449, + "step": 2401, + "token_acc": 0.26852412883925514 + }, + { + "epoch": 1.4080914687774846, + "grad_norm": 0.6622212614390544, + "learning_rate": 0.00019977256448560596, + "loss": 3.259082317352295, + "step": 2402, + "token_acc": 0.27010537296814735 + }, + { + "epoch": 1.4086778070946937, + "grad_norm": 0.7015160617451744, + "learning_rate": 0.0001997719107145109, + "loss": 3.2116522789001465, + "step": 2403, + "token_acc": 0.2758896892948403 + }, + { + "epoch": 1.4092641454119026, + "grad_norm": 0.720953842758773, + "learning_rate": 0.00019977125600619314, + "loss": 3.187607526779175, + "step": 2404, + "token_acc": 0.28193377084275123 + }, + { + "epoch": 1.4098504837291117, + "grad_norm": 0.6453306553580302, + "learning_rate": 0.00019977060036065894, + "loss": 3.2051162719726562, + "step": 2405, + "token_acc": 0.27592966712226574 + }, + { + "epoch": 1.4104368220463206, + "grad_norm": 0.5671117551022952, + "learning_rate": 0.00019976994377791433, + "loss": 3.2236409187316895, + "step": 2406, + "token_acc": 0.2759743129761069 + }, + { + "epoch": 1.4110231603635297, + "grad_norm": 0.5408282221800969, + "learning_rate": 0.00019976928625796555, + "loss": 3.2435684204101562, + "step": 2407, + "token_acc": 0.2732884516333782 + }, + { + "epoch": 1.4116094986807388, + "grad_norm": 0.6399059777064569, + "learning_rate": 0.0001997686278008188, + "loss": 3.207069158554077, + "step": 2408, + "token_acc": 0.27848512135363224 + }, + { + "epoch": 1.412195836997948, + "grad_norm": 0.7672870758021896, + "learning_rate": 0.0001997679684064802, + "loss": 3.2588136196136475, + "step": 2409, + "token_acc": 0.2709196503106131 + }, + { + "epoch": 1.4127821753151568, + "grad_norm": 0.8975147344890393, + "learning_rate": 0.00019976730807495598, + "loss": 3.2286903858184814, + "step": 2410, + "token_acc": 0.27536499722786917 + }, + { + "epoch": 1.413368513632366, + "grad_norm": 0.8407919415761599, + "learning_rate": 0.00019976664680625237, + "loss": 3.176316261291504, + "step": 2411, + "token_acc": 0.280994843702989 + }, + { + "epoch": 1.4139548519495748, + "grad_norm": 0.6143956569068743, + "learning_rate": 0.0001997659846003755, + "loss": 3.227346420288086, + "step": 2412, + "token_acc": 0.27461865863007906 + }, + { + "epoch": 1.414541190266784, + "grad_norm": 0.48061711290683096, + "learning_rate": 0.0001997653214573317, + "loss": 3.223888397216797, + "step": 2413, + "token_acc": 0.27643683101921507 + }, + { + "epoch": 1.415127528583993, + "grad_norm": 0.7327587026591392, + "learning_rate": 0.0001997646573771271, + "loss": 3.1877684593200684, + "step": 2414, + "token_acc": 0.2809499477589744 + }, + { + "epoch": 1.415713866901202, + "grad_norm": 0.6464437940365194, + "learning_rate": 0.00019976399235976797, + "loss": 3.2386980056762695, + "step": 2415, + "token_acc": 0.27239908876227015 + }, + { + "epoch": 1.416300205218411, + "grad_norm": 0.5445316993199244, + "learning_rate": 0.00019976332640526059, + "loss": 3.204814910888672, + "step": 2416, + "token_acc": 0.2777111016947002 + }, + { + "epoch": 1.41688654353562, + "grad_norm": 0.5315443796877389, + "learning_rate": 0.0001997626595136112, + "loss": 3.207308292388916, + "step": 2417, + "token_acc": 0.27723640192492977 + }, + { + "epoch": 1.417472881852829, + "grad_norm": 0.49204973705629174, + "learning_rate": 0.00019976199168482604, + "loss": 3.2157540321350098, + "step": 2418, + "token_acc": 0.27719271109736227 + }, + { + "epoch": 1.4180592201700382, + "grad_norm": 0.4954114593314, + "learning_rate": 0.00019976132291891138, + "loss": 3.249311923980713, + "step": 2419, + "token_acc": 0.2708770871888711 + }, + { + "epoch": 1.4186455584872473, + "grad_norm": 0.5442182335881889, + "learning_rate": 0.00019976065321587353, + "loss": 3.236985206604004, + "step": 2420, + "token_acc": 0.2732306121254993 + }, + { + "epoch": 1.4192318968044562, + "grad_norm": 0.5813570205458434, + "learning_rate": 0.00019975998257571877, + "loss": 3.2398853302001953, + "step": 2421, + "token_acc": 0.27186714667517653 + }, + { + "epoch": 1.4198182351216653, + "grad_norm": 0.5748631709304792, + "learning_rate": 0.00019975931099845343, + "loss": 3.1971547603607178, + "step": 2422, + "token_acc": 0.2776223706609061 + }, + { + "epoch": 1.4204045734388742, + "grad_norm": 0.5024429192673745, + "learning_rate": 0.00019975863848408377, + "loss": 3.1993765830993652, + "step": 2423, + "token_acc": 0.277317748066927 + }, + { + "epoch": 1.4209909117560833, + "grad_norm": 0.5594769247118319, + "learning_rate": 0.00019975796503261613, + "loss": 3.170154094696045, + "step": 2424, + "token_acc": 0.2817924620004809 + }, + { + "epoch": 1.4215772500732924, + "grad_norm": 0.6378007881785445, + "learning_rate": 0.00019975729064405684, + "loss": 3.1824638843536377, + "step": 2425, + "token_acc": 0.2817877124458989 + }, + { + "epoch": 1.4221635883905013, + "grad_norm": 0.6649885201154627, + "learning_rate": 0.00019975661531841223, + "loss": 3.2195241451263428, + "step": 2426, + "token_acc": 0.27712886836974426 + }, + { + "epoch": 1.4227499267077104, + "grad_norm": 0.5871732574327972, + "learning_rate": 0.00019975593905568862, + "loss": 3.236100673675537, + "step": 2427, + "token_acc": 0.27433189030173033 + }, + { + "epoch": 1.4233362650249193, + "grad_norm": 0.5080365770420008, + "learning_rate": 0.0001997552618558924, + "loss": 3.2706072330474854, + "step": 2428, + "token_acc": 0.2694888404568753 + }, + { + "epoch": 1.4239226033421284, + "grad_norm": 0.6149748132532346, + "learning_rate": 0.00019975458371902994, + "loss": 3.1961092948913574, + "step": 2429, + "token_acc": 0.27900624991879003 + }, + { + "epoch": 1.4245089416593375, + "grad_norm": 0.6128820510083652, + "learning_rate": 0.00019975390464510757, + "loss": 3.214637517929077, + "step": 2430, + "token_acc": 0.27467537661645114 + }, + { + "epoch": 1.4250952799765464, + "grad_norm": 0.5678014153886253, + "learning_rate": 0.00019975322463413169, + "loss": 3.1884846687316895, + "step": 2431, + "token_acc": 0.2799892381143349 + }, + { + "epoch": 1.4256816182937555, + "grad_norm": 0.5115537102444275, + "learning_rate": 0.00019975254368610865, + "loss": 3.1780881881713867, + "step": 2432, + "token_acc": 0.28135368786338316 + }, + { + "epoch": 1.4262679566109644, + "grad_norm": 0.7145595930189065, + "learning_rate": 0.0001997518618010449, + "loss": 3.251774787902832, + "step": 2433, + "token_acc": 0.26977892492425093 + }, + { + "epoch": 1.4268542949281735, + "grad_norm": 0.6457555340355297, + "learning_rate": 0.00019975117897894684, + "loss": 3.21073055267334, + "step": 2434, + "token_acc": 0.27644302320374464 + }, + { + "epoch": 1.4274406332453826, + "grad_norm": 0.5887215595468299, + "learning_rate": 0.00019975049521982086, + "loss": 3.2040278911590576, + "step": 2435, + "token_acc": 0.2785312525379015 + }, + { + "epoch": 1.4280269715625917, + "grad_norm": 0.678209814023707, + "learning_rate": 0.00019974981052367342, + "loss": 3.212954521179199, + "step": 2436, + "token_acc": 0.2769654705519899 + }, + { + "epoch": 1.4286133098798006, + "grad_norm": 0.4820990664345289, + "learning_rate": 0.00019974912489051087, + "loss": 3.2021005153656006, + "step": 2437, + "token_acc": 0.2775631795839333 + }, + { + "epoch": 1.4291996481970097, + "grad_norm": 0.5112590163606578, + "learning_rate": 0.00019974843832033977, + "loss": 3.2184808254241943, + "step": 2438, + "token_acc": 0.27523801017191374 + }, + { + "epoch": 1.4297859865142186, + "grad_norm": 0.535891472216452, + "learning_rate": 0.00019974775081316642, + "loss": 3.209672451019287, + "step": 2439, + "token_acc": 0.2755418848167539 + }, + { + "epoch": 1.4303723248314277, + "grad_norm": 0.5010239049644959, + "learning_rate": 0.00019974706236899743, + "loss": 3.221233606338501, + "step": 2440, + "token_acc": 0.2752595612569502 + }, + { + "epoch": 1.4309586631486368, + "grad_norm": 0.5494723549490004, + "learning_rate": 0.00019974637298783918, + "loss": 3.211958646774292, + "step": 2441, + "token_acc": 0.27455440690385946 + }, + { + "epoch": 1.4315450014658457, + "grad_norm": 0.5352035058258378, + "learning_rate": 0.00019974568266969818, + "loss": 3.177605152130127, + "step": 2442, + "token_acc": 0.28160656644363297 + }, + { + "epoch": 1.4321313397830548, + "grad_norm": 0.6491733593368556, + "learning_rate": 0.0001997449914145809, + "loss": 3.204132080078125, + "step": 2443, + "token_acc": 0.27808162441723394 + }, + { + "epoch": 1.4327176781002637, + "grad_norm": 0.5205690583051633, + "learning_rate": 0.00019974429922249383, + "loss": 3.21420955657959, + "step": 2444, + "token_acc": 0.27544076062242245 + }, + { + "epoch": 1.4333040164174728, + "grad_norm": 0.6189631960741147, + "learning_rate": 0.00019974360609344345, + "loss": 3.203617572784424, + "step": 2445, + "token_acc": 0.2771131527410854 + }, + { + "epoch": 1.433890354734682, + "grad_norm": 0.7812580128702256, + "learning_rate": 0.0001997429120274363, + "loss": 3.217839241027832, + "step": 2446, + "token_acc": 0.2766352362819467 + }, + { + "epoch": 1.434476693051891, + "grad_norm": 0.7860513794869695, + "learning_rate": 0.00019974221702447894, + "loss": 3.2327585220336914, + "step": 2447, + "token_acc": 0.2743000894337432 + }, + { + "epoch": 1.4350630313691, + "grad_norm": 0.972253037056436, + "learning_rate": 0.0001997415210845778, + "loss": 3.2005929946899414, + "step": 2448, + "token_acc": 0.27736807947158515 + }, + { + "epoch": 1.435649369686309, + "grad_norm": 0.9164018339659405, + "learning_rate": 0.00019974082420773953, + "loss": 3.2413651943206787, + "step": 2449, + "token_acc": 0.2726021183641963 + }, + { + "epoch": 1.436235708003518, + "grad_norm": 0.6397973299426116, + "learning_rate": 0.00019974012639397058, + "loss": 3.176875114440918, + "step": 2450, + "token_acc": 0.28275956496876337 + }, + { + "epoch": 1.436822046320727, + "grad_norm": 0.5675265328723258, + "learning_rate": 0.00019973942764327753, + "loss": 3.213367462158203, + "step": 2451, + "token_acc": 0.27693923433360423 + }, + { + "epoch": 1.4374083846379362, + "grad_norm": 0.7443237591104463, + "learning_rate": 0.000199738727955667, + "loss": 3.156369686126709, + "step": 2452, + "token_acc": 0.28361667580674643 + }, + { + "epoch": 1.437994722955145, + "grad_norm": 0.6221376707507675, + "learning_rate": 0.00019973802733114552, + "loss": 3.183365821838379, + "step": 2453, + "token_acc": 0.28065817145761135 + }, + { + "epoch": 1.4385810612723542, + "grad_norm": 0.7968473205957822, + "learning_rate": 0.00019973732576971962, + "loss": 3.2436492443084717, + "step": 2454, + "token_acc": 0.27124884623651835 + }, + { + "epoch": 1.439167399589563, + "grad_norm": 0.8105287534621836, + "learning_rate": 0.00019973662327139597, + "loss": 3.1755919456481934, + "step": 2455, + "token_acc": 0.2830058134448836 + }, + { + "epoch": 1.4397537379067722, + "grad_norm": 0.6506472426254918, + "learning_rate": 0.00019973591983618117, + "loss": 3.222956657409668, + "step": 2456, + "token_acc": 0.27398947306421645 + }, + { + "epoch": 1.4403400762239813, + "grad_norm": 0.5257835253078588, + "learning_rate": 0.00019973521546408175, + "loss": 3.1910247802734375, + "step": 2457, + "token_acc": 0.2784298055929884 + }, + { + "epoch": 1.4409264145411902, + "grad_norm": 0.5783526811024802, + "learning_rate": 0.00019973451015510444, + "loss": 3.2501659393310547, + "step": 2458, + "token_acc": 0.27256398984900254 + }, + { + "epoch": 1.4415127528583993, + "grad_norm": 0.5021171327916099, + "learning_rate": 0.00019973380390925574, + "loss": 3.204328775405884, + "step": 2459, + "token_acc": 0.276011258463376 + }, + { + "epoch": 1.4420990911756082, + "grad_norm": 0.6881020903119974, + "learning_rate": 0.00019973309672654236, + "loss": 3.207953453063965, + "step": 2460, + "token_acc": 0.2765532796417194 + }, + { + "epoch": 1.4426854294928173, + "grad_norm": 0.6899068798047286, + "learning_rate": 0.00019973238860697095, + "loss": 3.2005624771118164, + "step": 2461, + "token_acc": 0.27745833125569164 + }, + { + "epoch": 1.4432717678100264, + "grad_norm": 0.42998025252895544, + "learning_rate": 0.00019973167955054813, + "loss": 3.184542655944824, + "step": 2462, + "token_acc": 0.2789898037916017 + }, + { + "epoch": 1.4438581061272355, + "grad_norm": 0.5517684630508338, + "learning_rate": 0.00019973096955728056, + "loss": 3.1826236248016357, + "step": 2463, + "token_acc": 0.2817761181059024 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.5959620886106191, + "learning_rate": 0.00019973025862717492, + "loss": 3.210742712020874, + "step": 2464, + "token_acc": 0.2760825731126846 + }, + { + "epoch": 1.4450307827616535, + "grad_norm": 0.6175176816843638, + "learning_rate": 0.00019972954676023789, + "loss": 3.2387290000915527, + "step": 2465, + "token_acc": 0.27281263573345654 + }, + { + "epoch": 1.4456171210788624, + "grad_norm": 0.5435348542510129, + "learning_rate": 0.00019972883395647615, + "loss": 3.2437338829040527, + "step": 2466, + "token_acc": 0.2719556463504217 + }, + { + "epoch": 1.4462034593960715, + "grad_norm": 0.500203085694502, + "learning_rate": 0.0001997281202158964, + "loss": 3.1948535442352295, + "step": 2467, + "token_acc": 0.27947645129877596 + }, + { + "epoch": 1.4467897977132806, + "grad_norm": 0.5265721057911557, + "learning_rate": 0.00019972740553850539, + "loss": 3.207122325897217, + "step": 2468, + "token_acc": 0.2768359936565258 + }, + { + "epoch": 1.4473761360304895, + "grad_norm": 0.4829913796755665, + "learning_rate": 0.00019972668992430977, + "loss": 3.2574994564056396, + "step": 2469, + "token_acc": 0.2711800433262895 + }, + { + "epoch": 1.4479624743476986, + "grad_norm": 0.6122344296399751, + "learning_rate": 0.00019972597337331624, + "loss": 3.2081079483032227, + "step": 2470, + "token_acc": 0.27587271688616466 + }, + { + "epoch": 1.4485488126649075, + "grad_norm": 0.6038361584270306, + "learning_rate": 0.00019972525588553158, + "loss": 3.1788229942321777, + "step": 2471, + "token_acc": 0.28153393442867 + }, + { + "epoch": 1.4491351509821166, + "grad_norm": 0.564385493509247, + "learning_rate": 0.00019972453746096256, + "loss": 3.247359275817871, + "step": 2472, + "token_acc": 0.27234472354539724 + }, + { + "epoch": 1.4497214892993258, + "grad_norm": 0.5059872544628228, + "learning_rate": 0.0001997238180996159, + "loss": 3.163174629211426, + "step": 2473, + "token_acc": 0.28155848717225523 + }, + { + "epoch": 1.4503078276165349, + "grad_norm": 0.4907589108075454, + "learning_rate": 0.0001997230978014983, + "loss": 3.213125467300415, + "step": 2474, + "token_acc": 0.2760072251982666 + }, + { + "epoch": 1.4508941659337438, + "grad_norm": 0.49759636932039647, + "learning_rate": 0.00019972237656661662, + "loss": 3.2524139881134033, + "step": 2475, + "token_acc": 0.2692037756976918 + }, + { + "epoch": 1.4514805042509529, + "grad_norm": 0.6335015042988192, + "learning_rate": 0.00019972165439497753, + "loss": 3.271270751953125, + "step": 2476, + "token_acc": 0.26658019033458025 + }, + { + "epoch": 1.4520668425681618, + "grad_norm": 0.5829846017636969, + "learning_rate": 0.00019972093128658793, + "loss": 3.1781558990478516, + "step": 2477, + "token_acc": 0.2814562592726567 + }, + { + "epoch": 1.4526531808853709, + "grad_norm": 0.4196788623607607, + "learning_rate": 0.00019972020724145454, + "loss": 3.2532718181610107, + "step": 2478, + "token_acc": 0.2708930404653948 + }, + { + "epoch": 1.45323951920258, + "grad_norm": 0.5933753110909107, + "learning_rate": 0.00019971948225958416, + "loss": 3.1762609481811523, + "step": 2479, + "token_acc": 0.2810328849749037 + }, + { + "epoch": 1.4538258575197889, + "grad_norm": 0.642452763253212, + "learning_rate": 0.00019971875634098365, + "loss": 3.2116127014160156, + "step": 2480, + "token_acc": 0.275731276626071 + }, + { + "epoch": 1.454412195836998, + "grad_norm": 0.6106044558450442, + "learning_rate": 0.00019971802948565975, + "loss": 3.197416305541992, + "step": 2481, + "token_acc": 0.2787896599885042 + }, + { + "epoch": 1.4549985341542069, + "grad_norm": 0.4585504986172105, + "learning_rate": 0.00019971730169361939, + "loss": 3.167546510696411, + "step": 2482, + "token_acc": 0.283260456400101 + }, + { + "epoch": 1.455584872471416, + "grad_norm": 0.49179417002102926, + "learning_rate": 0.00019971657296486933, + "loss": 3.1961326599121094, + "step": 2483, + "token_acc": 0.27924459351874814 + }, + { + "epoch": 1.456171210788625, + "grad_norm": 0.5750534623613722, + "learning_rate": 0.00019971584329941643, + "loss": 3.195237398147583, + "step": 2484, + "token_acc": 0.27979462959032053 + }, + { + "epoch": 1.456757549105834, + "grad_norm": 0.6616069458002177, + "learning_rate": 0.00019971511269726756, + "loss": 3.1538100242614746, + "step": 2485, + "token_acc": 0.284324502866934 + }, + { + "epoch": 1.457343887423043, + "grad_norm": 0.6106337556194826, + "learning_rate": 0.00019971438115842956, + "loss": 3.1540393829345703, + "step": 2486, + "token_acc": 0.2842917941967422 + }, + { + "epoch": 1.457930225740252, + "grad_norm": 0.7204347558629824, + "learning_rate": 0.00019971364868290933, + "loss": 3.1896331310272217, + "step": 2487, + "token_acc": 0.28045013633152155 + }, + { + "epoch": 1.458516564057461, + "grad_norm": 0.6753786806621657, + "learning_rate": 0.0001997129152707137, + "loss": 3.192776679992676, + "step": 2488, + "token_acc": 0.28070888458700094 + }, + { + "epoch": 1.4591029023746702, + "grad_norm": 0.46156542412579743, + "learning_rate": 0.00019971218092184963, + "loss": 3.189014434814453, + "step": 2489, + "token_acc": 0.27789217890758755 + }, + { + "epoch": 1.4596892406918793, + "grad_norm": 0.6302476502557568, + "learning_rate": 0.000199711445636324, + "loss": 3.160496234893799, + "step": 2490, + "token_acc": 0.2818571182035446 + }, + { + "epoch": 1.4602755790090882, + "grad_norm": 0.6700090303154498, + "learning_rate": 0.00019971070941414366, + "loss": 3.169593095779419, + "step": 2491, + "token_acc": 0.28137520772375546 + }, + { + "epoch": 1.4608619173262973, + "grad_norm": 0.7256959653113945, + "learning_rate": 0.0001997099722553156, + "loss": 3.2104599475860596, + "step": 2492, + "token_acc": 0.27613938937579086 + }, + { + "epoch": 1.4614482556435062, + "grad_norm": 0.7035745442690808, + "learning_rate": 0.0001997092341598467, + "loss": 3.226490020751953, + "step": 2493, + "token_acc": 0.27574394804959385 + }, + { + "epoch": 1.4620345939607153, + "grad_norm": 0.5944670778921513, + "learning_rate": 0.00019970849512774392, + "loss": 3.181095600128174, + "step": 2494, + "token_acc": 0.2797733911191252 + }, + { + "epoch": 1.4626209322779244, + "grad_norm": 0.5894188467720394, + "learning_rate": 0.00019970775515901416, + "loss": 3.2125582695007324, + "step": 2495, + "token_acc": 0.2759883176224896 + }, + { + "epoch": 1.4632072705951333, + "grad_norm": 0.5368885645687302, + "learning_rate": 0.00019970701425366441, + "loss": 3.2335870265960693, + "step": 2496, + "token_acc": 0.2732706999526237 + }, + { + "epoch": 1.4637936089123424, + "grad_norm": 0.6562322633980282, + "learning_rate": 0.0001997062724117016, + "loss": 3.234812021255493, + "step": 2497, + "token_acc": 0.27149178826231113 + }, + { + "epoch": 1.4643799472295513, + "grad_norm": 0.5122770302918647, + "learning_rate": 0.00019970552963313276, + "loss": 3.2412772178649902, + "step": 2498, + "token_acc": 0.2729527556984066 + }, + { + "epoch": 1.4649662855467604, + "grad_norm": 0.5521648029820367, + "learning_rate": 0.00019970478591796478, + "loss": 3.223781108856201, + "step": 2499, + "token_acc": 0.2747450525567885 + }, + { + "epoch": 1.4655526238639696, + "grad_norm": 0.5130770167363079, + "learning_rate": 0.0001997040412662047, + "loss": 3.2385120391845703, + "step": 2500, + "token_acc": 0.27294334934186665 + }, + { + "epoch": 1.4661389621811787, + "grad_norm": 0.4811323211021129, + "learning_rate": 0.00019970329567785952, + "loss": 3.192143440246582, + "step": 2501, + "token_acc": 0.2798596619365609 + }, + { + "epoch": 1.4667253004983876, + "grad_norm": 0.4194047828520244, + "learning_rate": 0.0001997025491529362, + "loss": 3.157729148864746, + "step": 2502, + "token_acc": 0.28574631215531543 + }, + { + "epoch": 1.4673116388155967, + "grad_norm": 0.47850992005191706, + "learning_rate": 0.00019970180169144185, + "loss": 3.239119052886963, + "step": 2503, + "token_acc": 0.27235876215887334 + }, + { + "epoch": 1.4678979771328056, + "grad_norm": 0.5177957871782771, + "learning_rate": 0.00019970105329338334, + "loss": 3.1877291202545166, + "step": 2504, + "token_acc": 0.2771754225211358 + }, + { + "epoch": 1.4684843154500147, + "grad_norm": 0.6531418016107837, + "learning_rate": 0.00019970030395876785, + "loss": 3.1982970237731934, + "step": 2505, + "token_acc": 0.27739468161906067 + }, + { + "epoch": 1.4690706537672238, + "grad_norm": 0.6542004822307297, + "learning_rate": 0.00019969955368760232, + "loss": 3.208113193511963, + "step": 2506, + "token_acc": 0.276475195475846 + }, + { + "epoch": 1.4696569920844327, + "grad_norm": 0.6121823158096926, + "learning_rate": 0.0001996988024798938, + "loss": 3.205082893371582, + "step": 2507, + "token_acc": 0.27654330188880133 + }, + { + "epoch": 1.4702433304016418, + "grad_norm": 0.5519609062019573, + "learning_rate": 0.00019969805033564944, + "loss": 3.219461441040039, + "step": 2508, + "token_acc": 0.2752365888881248 + }, + { + "epoch": 1.4708296687188507, + "grad_norm": 0.5722058555205045, + "learning_rate": 0.0001996972972548762, + "loss": 3.1967005729675293, + "step": 2509, + "token_acc": 0.2791950701041383 + }, + { + "epoch": 1.4714160070360598, + "grad_norm": 0.726680201817417, + "learning_rate": 0.00019969654323758121, + "loss": 3.180673360824585, + "step": 2510, + "token_acc": 0.28260201723973244 + }, + { + "epoch": 1.472002345353269, + "grad_norm": 0.7854221005748253, + "learning_rate": 0.00019969578828377154, + "loss": 3.200195789337158, + "step": 2511, + "token_acc": 0.2782077423951851 + }, + { + "epoch": 1.4725886836704778, + "grad_norm": 0.5887666723660547, + "learning_rate": 0.0001996950323934543, + "loss": 3.250013828277588, + "step": 2512, + "token_acc": 0.2720941860525054 + }, + { + "epoch": 1.473175021987687, + "grad_norm": 0.5420748971636962, + "learning_rate": 0.0001996942755666365, + "loss": 3.200059413909912, + "step": 2513, + "token_acc": 0.27972385645520587 + }, + { + "epoch": 1.4737613603048958, + "grad_norm": 0.47571610123741964, + "learning_rate": 0.00019969351780332536, + "loss": 3.190324068069458, + "step": 2514, + "token_acc": 0.2793133982347034 + }, + { + "epoch": 1.474347698622105, + "grad_norm": 0.6183824062005063, + "learning_rate": 0.000199692759103528, + "loss": 3.2048568725585938, + "step": 2515, + "token_acc": 0.2773247020869146 + }, + { + "epoch": 1.474934036939314, + "grad_norm": 0.5427911055629165, + "learning_rate": 0.00019969199946725147, + "loss": 3.208702325820923, + "step": 2516, + "token_acc": 0.2763342050752977 + }, + { + "epoch": 1.4755203752565231, + "grad_norm": 0.4922526607332147, + "learning_rate": 0.00019969123889450294, + "loss": 3.2250969409942627, + "step": 2517, + "token_acc": 0.2744276248172996 + }, + { + "epoch": 1.476106713573732, + "grad_norm": 0.7313959804633519, + "learning_rate": 0.00019969047738528956, + "loss": 3.213158130645752, + "step": 2518, + "token_acc": 0.27674735567136743 + }, + { + "epoch": 1.4766930518909411, + "grad_norm": 0.5834036986696088, + "learning_rate": 0.0001996897149396185, + "loss": 3.1552886962890625, + "step": 2519, + "token_acc": 0.28381487715613757 + }, + { + "epoch": 1.47727939020815, + "grad_norm": 0.6075928501263398, + "learning_rate": 0.00019968895155749686, + "loss": 3.2194197177886963, + "step": 2520, + "token_acc": 0.27472501930481985 + }, + { + "epoch": 1.4778657285253591, + "grad_norm": 0.5101410971984783, + "learning_rate": 0.00019968818723893188, + "loss": 3.2222201824188232, + "step": 2521, + "token_acc": 0.27400174883691963 + }, + { + "epoch": 1.4784520668425682, + "grad_norm": 0.6437041639971338, + "learning_rate": 0.00019968742198393072, + "loss": 3.141871929168701, + "step": 2522, + "token_acc": 0.284780965783233 + }, + { + "epoch": 1.4790384051597771, + "grad_norm": 0.5121587196636244, + "learning_rate": 0.00019968665579250052, + "loss": 3.2558109760284424, + "step": 2523, + "token_acc": 0.26877669995826364 + }, + { + "epoch": 1.4796247434769862, + "grad_norm": 0.6496642947867797, + "learning_rate": 0.00019968588866464858, + "loss": 3.2151339054107666, + "step": 2524, + "token_acc": 0.2754572951173844 + }, + { + "epoch": 1.4802110817941951, + "grad_norm": 0.7097584022477003, + "learning_rate": 0.00019968512060038199, + "loss": 3.1849985122680664, + "step": 2525, + "token_acc": 0.2789456645467172 + }, + { + "epoch": 1.4807974201114043, + "grad_norm": 0.5734534267527766, + "learning_rate": 0.00019968435159970803, + "loss": 3.227900505065918, + "step": 2526, + "token_acc": 0.2764788783290572 + }, + { + "epoch": 1.4813837584286134, + "grad_norm": 0.6627169235649571, + "learning_rate": 0.00019968358166263394, + "loss": 3.2236788272857666, + "step": 2527, + "token_acc": 0.2744796961104275 + }, + { + "epoch": 1.4819700967458225, + "grad_norm": 0.5891236881226227, + "learning_rate": 0.00019968281078916691, + "loss": 3.217014789581299, + "step": 2528, + "token_acc": 0.27542196531791907 + }, + { + "epoch": 1.4825564350630314, + "grad_norm": 0.6223975932642402, + "learning_rate": 0.00019968203897931418, + "loss": 3.2257957458496094, + "step": 2529, + "token_acc": 0.2728936105043405 + }, + { + "epoch": 1.4831427733802405, + "grad_norm": 0.7123687047983446, + "learning_rate": 0.00019968126623308305, + "loss": 3.1915369033813477, + "step": 2530, + "token_acc": 0.27894352576500875 + }, + { + "epoch": 1.4837291116974494, + "grad_norm": 0.5008508330566309, + "learning_rate": 0.00019968049255048072, + "loss": 3.2058990001678467, + "step": 2531, + "token_acc": 0.2779154603358425 + }, + { + "epoch": 1.4843154500146585, + "grad_norm": 0.5905457680258205, + "learning_rate": 0.0001996797179315145, + "loss": 3.2080085277557373, + "step": 2532, + "token_acc": 0.27641812966857454 + }, + { + "epoch": 1.4849017883318676, + "grad_norm": 0.6573248059047676, + "learning_rate": 0.00019967894237619166, + "loss": 3.1979641914367676, + "step": 2533, + "token_acc": 0.2769533304369698 + }, + { + "epoch": 1.4854881266490765, + "grad_norm": 0.5719084354713385, + "learning_rate": 0.00019967816588451945, + "loss": 3.212754726409912, + "step": 2534, + "token_acc": 0.27594639359973333 + }, + { + "epoch": 1.4860744649662856, + "grad_norm": 0.6130197424173706, + "learning_rate": 0.00019967738845650518, + "loss": 3.1927692890167236, + "step": 2535, + "token_acc": 0.27878230146247207 + }, + { + "epoch": 1.4866608032834945, + "grad_norm": 0.5401694099444807, + "learning_rate": 0.0001996766100921562, + "loss": 3.224625825881958, + "step": 2536, + "token_acc": 0.273315781736539 + }, + { + "epoch": 1.4872471416007036, + "grad_norm": 0.45654371177895586, + "learning_rate": 0.00019967583079147976, + "loss": 3.1967859268188477, + "step": 2537, + "token_acc": 0.2805367278045989 + }, + { + "epoch": 1.4878334799179127, + "grad_norm": 0.568102826814915, + "learning_rate": 0.0001996750505544832, + "loss": 3.1914587020874023, + "step": 2538, + "token_acc": 0.2796358439137668 + }, + { + "epoch": 1.4884198182351216, + "grad_norm": 0.5407867112711078, + "learning_rate": 0.00019967426938117386, + "loss": 3.188796043395996, + "step": 2539, + "token_acc": 0.2804518599933688 + }, + { + "epoch": 1.4890061565523307, + "grad_norm": 0.5245662356683642, + "learning_rate": 0.00019967348727155908, + "loss": 3.243316650390625, + "step": 2540, + "token_acc": 0.2711999186548855 + }, + { + "epoch": 1.4895924948695396, + "grad_norm": 0.5958080055453131, + "learning_rate": 0.0001996727042256462, + "loss": 3.229617118835449, + "step": 2541, + "token_acc": 0.27406487204055857 + }, + { + "epoch": 1.4901788331867487, + "grad_norm": 0.659684891150021, + "learning_rate": 0.00019967192024344254, + "loss": 3.1845126152038574, + "step": 2542, + "token_acc": 0.28146634590131214 + }, + { + "epoch": 1.4907651715039578, + "grad_norm": 0.5195139402251614, + "learning_rate": 0.00019967113532495554, + "loss": 3.225705146789551, + "step": 2543, + "token_acc": 0.27475822274107653 + }, + { + "epoch": 1.491351509821167, + "grad_norm": 0.5026947473730926, + "learning_rate": 0.00019967034947019255, + "loss": 3.198336124420166, + "step": 2544, + "token_acc": 0.2772863046982627 + }, + { + "epoch": 1.4919378481383758, + "grad_norm": 0.4792965054812544, + "learning_rate": 0.0001996695626791609, + "loss": 3.184157609939575, + "step": 2545, + "token_acc": 0.27936705773972415 + }, + { + "epoch": 1.492524186455585, + "grad_norm": 0.5166776279078952, + "learning_rate": 0.000199668774951868, + "loss": 3.234355926513672, + "step": 2546, + "token_acc": 0.2726265891745137 + }, + { + "epoch": 1.4931105247727938, + "grad_norm": 0.46635813548353355, + "learning_rate": 0.00019966798628832128, + "loss": 3.2106258869171143, + "step": 2547, + "token_acc": 0.2743577288573169 + }, + { + "epoch": 1.493696863090003, + "grad_norm": 0.47208036868593695, + "learning_rate": 0.00019966719668852815, + "loss": 3.2191061973571777, + "step": 2548, + "token_acc": 0.2737214241428892 + }, + { + "epoch": 1.494283201407212, + "grad_norm": 0.4072077133491671, + "learning_rate": 0.00019966640615249598, + "loss": 3.195671319961548, + "step": 2549, + "token_acc": 0.27851041351447486 + }, + { + "epoch": 1.494869539724421, + "grad_norm": 0.5007068754891894, + "learning_rate": 0.00019966561468023227, + "loss": 3.1850838661193848, + "step": 2550, + "token_acc": 0.27543604098848623 + }, + { + "epoch": 1.49545587804163, + "grad_norm": 0.5484460382686189, + "learning_rate": 0.00019966482227174438, + "loss": 3.1656007766723633, + "step": 2551, + "token_acc": 0.2817036151228038 + }, + { + "epoch": 1.496042216358839, + "grad_norm": 0.4472232777621029, + "learning_rate": 0.00019966402892703978, + "loss": 3.228626251220703, + "step": 2552, + "token_acc": 0.27372426994913096 + }, + { + "epoch": 1.496628554676048, + "grad_norm": 0.5370529019064525, + "learning_rate": 0.00019966323464612592, + "loss": 3.1678030490875244, + "step": 2553, + "token_acc": 0.28093113452666824 + }, + { + "epoch": 1.4972148929932572, + "grad_norm": 0.5188188924773037, + "learning_rate": 0.0001996624394290103, + "loss": 3.16164231300354, + "step": 2554, + "token_acc": 0.2828802311161986 + }, + { + "epoch": 1.4978012313104663, + "grad_norm": 0.5491705479819814, + "learning_rate": 0.00019966164327570032, + "loss": 3.204102039337158, + "step": 2555, + "token_acc": 0.27742823436885566 + }, + { + "epoch": 1.4983875696276752, + "grad_norm": 0.4946163233521165, + "learning_rate": 0.00019966084618620354, + "loss": 3.1950695514678955, + "step": 2556, + "token_acc": 0.27978114896791845 + }, + { + "epoch": 1.4989739079448843, + "grad_norm": 0.44101646202070716, + "learning_rate": 0.0001996600481605274, + "loss": 3.241257429122925, + "step": 2557, + "token_acc": 0.2737059898066241 + }, + { + "epoch": 1.4995602462620932, + "grad_norm": 0.5287581474783024, + "learning_rate": 0.00019965924919867939, + "loss": 3.205601453781128, + "step": 2558, + "token_acc": 0.27714623154134815 + }, + { + "epoch": 1.5001465845793023, + "grad_norm": 0.5615238682513923, + "learning_rate": 0.000199658449300667, + "loss": 3.1826610565185547, + "step": 2559, + "token_acc": 0.2795690325224993 + }, + { + "epoch": 1.5007329228965114, + "grad_norm": 0.5499962633852297, + "learning_rate": 0.00019965764846649776, + "loss": 3.1878738403320312, + "step": 2560, + "token_acc": 0.2794553280211387 + }, + { + "epoch": 1.5013192612137203, + "grad_norm": 0.5262601707034167, + "learning_rate": 0.00019965684669617927, + "loss": 3.18379545211792, + "step": 2561, + "token_acc": 0.27958872974826027 + }, + { + "epoch": 1.5019055995309294, + "grad_norm": 0.4890548671392714, + "learning_rate": 0.00019965604398971895, + "loss": 3.2516584396362305, + "step": 2562, + "token_acc": 0.27112523358590834 + }, + { + "epoch": 1.5024919378481383, + "grad_norm": 0.5510605958581747, + "learning_rate": 0.0001996552403471244, + "loss": 3.174196243286133, + "step": 2563, + "token_acc": 0.2815971559334802 + }, + { + "epoch": 1.5030782761653474, + "grad_norm": 0.679424420448652, + "learning_rate": 0.00019965443576840314, + "loss": 3.2020723819732666, + "step": 2564, + "token_acc": 0.27895799482989964 + }, + { + "epoch": 1.5036646144825565, + "grad_norm": 0.6111051627171314, + "learning_rate": 0.00019965363025356277, + "loss": 3.1935744285583496, + "step": 2565, + "token_acc": 0.2779376021038371 + }, + { + "epoch": 1.5042509527997656, + "grad_norm": 0.5954299683518938, + "learning_rate": 0.0001996528238026108, + "loss": 3.188821315765381, + "step": 2566, + "token_acc": 0.27774087768753236 + }, + { + "epoch": 1.5048372911169745, + "grad_norm": 0.6188524450667723, + "learning_rate": 0.00019965201641555485, + "loss": 3.195739507675171, + "step": 2567, + "token_acc": 0.2796716656173524 + }, + { + "epoch": 1.5054236294341834, + "grad_norm": 0.6079346382368913, + "learning_rate": 0.00019965120809240248, + "loss": 3.204178810119629, + "step": 2568, + "token_acc": 0.2761797866168941 + }, + { + "epoch": 1.5060099677513925, + "grad_norm": 0.5856612871716462, + "learning_rate": 0.00019965039883316127, + "loss": 3.2181806564331055, + "step": 2569, + "token_acc": 0.2737018028687985 + }, + { + "epoch": 1.5065963060686016, + "grad_norm": 0.6220883117403615, + "learning_rate": 0.0001996495886378389, + "loss": 3.1793951988220215, + "step": 2570, + "token_acc": 0.2790577931264043 + }, + { + "epoch": 1.5071826443858107, + "grad_norm": 0.8896693590849469, + "learning_rate": 0.00019964877750644288, + "loss": 3.2209482192993164, + "step": 2571, + "token_acc": 0.27534816586801114 + }, + { + "epoch": 1.5077689827030196, + "grad_norm": 0.7766437880647261, + "learning_rate": 0.00019964796543898088, + "loss": 3.2114968299865723, + "step": 2572, + "token_acc": 0.2768520780038822 + }, + { + "epoch": 1.5083553210202285, + "grad_norm": 0.5061783353958996, + "learning_rate": 0.00019964715243546053, + "loss": 3.1883745193481445, + "step": 2573, + "token_acc": 0.2800476750744604 + }, + { + "epoch": 1.5089416593374376, + "grad_norm": 0.5876043860514126, + "learning_rate": 0.00019964633849588946, + "loss": 3.230588674545288, + "step": 2574, + "token_acc": 0.2737344191118766 + }, + { + "epoch": 1.5095279976546467, + "grad_norm": 0.5490222411240606, + "learning_rate": 0.00019964552362027532, + "loss": 3.225931406021118, + "step": 2575, + "token_acc": 0.27572280226782847 + }, + { + "epoch": 1.5101143359718558, + "grad_norm": 0.49634159791958343, + "learning_rate": 0.00019964470780862574, + "loss": 3.183867931365967, + "step": 2576, + "token_acc": 0.2834025110211658 + }, + { + "epoch": 1.5107006742890647, + "grad_norm": 0.6039783785082427, + "learning_rate": 0.00019964389106094844, + "loss": 3.197291374206543, + "step": 2577, + "token_acc": 0.27901096736060943 + }, + { + "epoch": 1.5112870126062738, + "grad_norm": 0.6449220446386443, + "learning_rate": 0.000199643073377251, + "loss": 3.2544384002685547, + "step": 2578, + "token_acc": 0.26886562540617825 + }, + { + "epoch": 1.5118733509234827, + "grad_norm": 0.518953885142126, + "learning_rate": 0.0001996422547575412, + "loss": 3.210023880004883, + "step": 2579, + "token_acc": 0.2760992423707493 + }, + { + "epoch": 1.5124596892406919, + "grad_norm": 0.42408514917536944, + "learning_rate": 0.00019964143520182667, + "loss": 3.1973390579223633, + "step": 2580, + "token_acc": 0.2768521919998344 + }, + { + "epoch": 1.513046027557901, + "grad_norm": 0.5466683052297299, + "learning_rate": 0.00019964061471011512, + "loss": 3.2478909492492676, + "step": 2581, + "token_acc": 0.27186737810999884 + }, + { + "epoch": 1.51363236587511, + "grad_norm": 0.6076113311440955, + "learning_rate": 0.00019963979328241428, + "loss": 3.250032424926758, + "step": 2582, + "token_acc": 0.27027553278722244 + }, + { + "epoch": 1.514218704192319, + "grad_norm": 0.6575414575084427, + "learning_rate": 0.00019963897091873184, + "loss": 3.2322471141815186, + "step": 2583, + "token_acc": 0.27306669045532544 + }, + { + "epoch": 1.5148050425095279, + "grad_norm": 0.5153929024427344, + "learning_rate": 0.00019963814761907552, + "loss": 3.195371150970459, + "step": 2584, + "token_acc": 0.27909082964575405 + }, + { + "epoch": 1.515391380826737, + "grad_norm": 0.5355900598890126, + "learning_rate": 0.00019963732338345306, + "loss": 3.211622953414917, + "step": 2585, + "token_acc": 0.27775738837240616 + }, + { + "epoch": 1.515977719143946, + "grad_norm": 0.5007075998587486, + "learning_rate": 0.00019963649821187223, + "loss": 3.228663444519043, + "step": 2586, + "token_acc": 0.27121009478397395 + }, + { + "epoch": 1.5165640574611552, + "grad_norm": 0.5738858838888441, + "learning_rate": 0.00019963567210434078, + "loss": 3.187683582305908, + "step": 2587, + "token_acc": 0.2799089048245198 + }, + { + "epoch": 1.517150395778364, + "grad_norm": 0.47501189834544905, + "learning_rate": 0.0001996348450608664, + "loss": 3.197097063064575, + "step": 2588, + "token_acc": 0.2761465863892387 + }, + { + "epoch": 1.5177367340955732, + "grad_norm": 0.45559079551047527, + "learning_rate": 0.00019963401708145698, + "loss": 3.1959357261657715, + "step": 2589, + "token_acc": 0.2776377602428715 + }, + { + "epoch": 1.518323072412782, + "grad_norm": 0.475828542760936, + "learning_rate": 0.00019963318816612017, + "loss": 3.1912906169891357, + "step": 2590, + "token_acc": 0.27945222164264666 + }, + { + "epoch": 1.5189094107299912, + "grad_norm": 0.4692414063210235, + "learning_rate": 0.00019963235831486383, + "loss": 3.187471866607666, + "step": 2591, + "token_acc": 0.27807520185933976 + }, + { + "epoch": 1.5194957490472003, + "grad_norm": 0.5019159356266234, + "learning_rate": 0.00019963152752769573, + "loss": 3.213104009628296, + "step": 2592, + "token_acc": 0.2756329589037356 + }, + { + "epoch": 1.5200820873644094, + "grad_norm": 0.5053822379157825, + "learning_rate": 0.00019963069580462373, + "loss": 3.2158172130584717, + "step": 2593, + "token_acc": 0.2762074884447156 + }, + { + "epoch": 1.5206684256816183, + "grad_norm": 0.48863112503599165, + "learning_rate": 0.00019962986314565556, + "loss": 3.2027974128723145, + "step": 2594, + "token_acc": 0.2782632491106449 + }, + { + "epoch": 1.5212547639988272, + "grad_norm": 0.4879702271670185, + "learning_rate": 0.00019962902955079909, + "loss": 3.2264890670776367, + "step": 2595, + "token_acc": 0.27760177811295905 + }, + { + "epoch": 1.5218411023160363, + "grad_norm": 0.5216363861434566, + "learning_rate": 0.00019962819502006212, + "loss": 3.14943790435791, + "step": 2596, + "token_acc": 0.28374012017098155 + }, + { + "epoch": 1.5224274406332454, + "grad_norm": 0.6062754835776472, + "learning_rate": 0.00019962735955345254, + "loss": 3.217008352279663, + "step": 2597, + "token_acc": 0.2761816870789298 + }, + { + "epoch": 1.5230137789504545, + "grad_norm": 0.6130860589728432, + "learning_rate": 0.0001996265231509781, + "loss": 3.1947407722473145, + "step": 2598, + "token_acc": 0.2762130818891434 + }, + { + "epoch": 1.5236001172676634, + "grad_norm": 0.5512465483085283, + "learning_rate": 0.0001996256858126468, + "loss": 3.187373638153076, + "step": 2599, + "token_acc": 0.2788363181117269 + }, + { + "epoch": 1.5241864555848723, + "grad_norm": 0.4863912519444314, + "learning_rate": 0.00019962484753846638, + "loss": 3.23612904548645, + "step": 2600, + "token_acc": 0.2708356227749822 + }, + { + "epoch": 1.5247727939020814, + "grad_norm": 0.5688343992849446, + "learning_rate": 0.0001996240083284448, + "loss": 3.1475396156311035, + "step": 2601, + "token_acc": 0.2843472738912127 + }, + { + "epoch": 1.5253591322192905, + "grad_norm": 0.5127350507540789, + "learning_rate": 0.00019962316818258988, + "loss": 3.1505918502807617, + "step": 2602, + "token_acc": 0.28518486598293546 + }, + { + "epoch": 1.5259454705364996, + "grad_norm": 0.45775306404651056, + "learning_rate": 0.00019962232710090956, + "loss": 3.178192138671875, + "step": 2603, + "token_acc": 0.2800021140811014 + }, + { + "epoch": 1.5265318088537085, + "grad_norm": 0.4377406264509686, + "learning_rate": 0.0001996214850834117, + "loss": 3.189859390258789, + "step": 2604, + "token_acc": 0.27828651125722254 + }, + { + "epoch": 1.5271181471709177, + "grad_norm": 0.4769385208152471, + "learning_rate": 0.00019962064213010426, + "loss": 3.19708514213562, + "step": 2605, + "token_acc": 0.27913751349145405 + }, + { + "epoch": 1.5277044854881265, + "grad_norm": 0.47255046494484093, + "learning_rate": 0.0001996197982409951, + "loss": 3.1860170364379883, + "step": 2606, + "token_acc": 0.2778017286702698 + }, + { + "epoch": 1.5282908238053357, + "grad_norm": 0.6331900327363463, + "learning_rate": 0.00019961895341609215, + "loss": 3.226107120513916, + "step": 2607, + "token_acc": 0.27322191058551754 + }, + { + "epoch": 1.5288771621225448, + "grad_norm": 0.6149696737319772, + "learning_rate": 0.00019961810765540343, + "loss": 3.1917643547058105, + "step": 2608, + "token_acc": 0.2792448388313568 + }, + { + "epoch": 1.5294635004397539, + "grad_norm": 0.5239107493552146, + "learning_rate": 0.00019961726095893677, + "loss": 3.1599059104919434, + "step": 2609, + "token_acc": 0.28273398939268524 + }, + { + "epoch": 1.5300498387569628, + "grad_norm": 0.4945186851477478, + "learning_rate": 0.0001996164133267002, + "loss": 3.2321929931640625, + "step": 2610, + "token_acc": 0.27270286543930394 + }, + { + "epoch": 1.5306361770741717, + "grad_norm": 0.5753562879050244, + "learning_rate": 0.00019961556475870168, + "loss": 3.2100563049316406, + "step": 2611, + "token_acc": 0.2773342589378688 + }, + { + "epoch": 1.5312225153913808, + "grad_norm": 0.5160719678668378, + "learning_rate": 0.00019961471525494916, + "loss": 3.20725154876709, + "step": 2612, + "token_acc": 0.27617841178653846 + }, + { + "epoch": 1.5318088537085899, + "grad_norm": 0.5256446973537284, + "learning_rate": 0.0001996138648154506, + "loss": 3.1539833545684814, + "step": 2613, + "token_acc": 0.2822624860346754 + }, + { + "epoch": 1.532395192025799, + "grad_norm": 0.6137646415632242, + "learning_rate": 0.00019961301344021404, + "loss": 3.2356555461883545, + "step": 2614, + "token_acc": 0.27469596079437625 + }, + { + "epoch": 1.5329815303430079, + "grad_norm": 0.6521672014337166, + "learning_rate": 0.00019961216112924742, + "loss": 3.150270938873291, + "step": 2615, + "token_acc": 0.28337966119473434 + }, + { + "epoch": 1.533567868660217, + "grad_norm": 0.5332738384184421, + "learning_rate": 0.00019961130788255879, + "loss": 3.181734561920166, + "step": 2616, + "token_acc": 0.27969242713490344 + }, + { + "epoch": 1.5341542069774259, + "grad_norm": 0.4156409182836854, + "learning_rate": 0.00019961045370015613, + "loss": 3.2311954498291016, + "step": 2617, + "token_acc": 0.2751625786077235 + }, + { + "epoch": 1.534740545294635, + "grad_norm": 0.5004138723909748, + "learning_rate": 0.00019960959858204754, + "loss": 3.2367100715637207, + "step": 2618, + "token_acc": 0.2735227854582693 + }, + { + "epoch": 1.535326883611844, + "grad_norm": 0.6042173400527904, + "learning_rate": 0.00019960874252824095, + "loss": 3.194169044494629, + "step": 2619, + "token_acc": 0.2786639051183539 + }, + { + "epoch": 1.5359132219290532, + "grad_norm": 0.6074869541352533, + "learning_rate": 0.00019960788553874447, + "loss": 3.1802079677581787, + "step": 2620, + "token_acc": 0.27933774463502375 + }, + { + "epoch": 1.536499560246262, + "grad_norm": 0.5655148941661284, + "learning_rate": 0.0001996070276135661, + "loss": 3.2038466930389404, + "step": 2621, + "token_acc": 0.27691549642769153 + }, + { + "epoch": 1.537085898563471, + "grad_norm": 0.5229282285106968, + "learning_rate": 0.00019960616875271394, + "loss": 3.1922194957733154, + "step": 2622, + "token_acc": 0.27793751538514244 + }, + { + "epoch": 1.53767223688068, + "grad_norm": 0.662935111578389, + "learning_rate": 0.00019960530895619605, + "loss": 3.2193267345428467, + "step": 2623, + "token_acc": 0.2761253467358765 + }, + { + "epoch": 1.5382585751978892, + "grad_norm": 0.6163901793771169, + "learning_rate": 0.00019960444822402052, + "loss": 3.255826473236084, + "step": 2624, + "token_acc": 0.2709860892208593 + }, + { + "epoch": 1.5388449135150983, + "grad_norm": 0.5201199129258697, + "learning_rate": 0.0001996035865561954, + "loss": 3.194967269897461, + "step": 2625, + "token_acc": 0.276978892168926 + }, + { + "epoch": 1.5394312518323072, + "grad_norm": 0.5252987951046296, + "learning_rate": 0.0001996027239527288, + "loss": 3.2182321548461914, + "step": 2626, + "token_acc": 0.27467083397769293 + }, + { + "epoch": 1.5400175901495161, + "grad_norm": 0.5261210305135171, + "learning_rate": 0.00019960186041362882, + "loss": 3.1783833503723145, + "step": 2627, + "token_acc": 0.27898189905479703 + }, + { + "epoch": 1.5406039284667252, + "grad_norm": 0.5690603608090683, + "learning_rate": 0.00019960099593890359, + "loss": 3.1578307151794434, + "step": 2628, + "token_acc": 0.2823582628857462 + }, + { + "epoch": 1.5411902667839343, + "grad_norm": 0.6391591437890906, + "learning_rate": 0.0001996001305285612, + "loss": 3.191183567047119, + "step": 2629, + "token_acc": 0.27895011696747823 + }, + { + "epoch": 1.5417766051011434, + "grad_norm": 0.5516666086835875, + "learning_rate": 0.0001995992641826098, + "loss": 3.150019645690918, + "step": 2630, + "token_acc": 0.2834971021296807 + }, + { + "epoch": 1.5423629434183523, + "grad_norm": 0.4992849421599657, + "learning_rate": 0.00019959839690105756, + "loss": 3.1774020195007324, + "step": 2631, + "token_acc": 0.2818502258432184 + }, + { + "epoch": 1.5429492817355615, + "grad_norm": 0.6586184011729296, + "learning_rate": 0.00019959752868391255, + "loss": 3.149879217147827, + "step": 2632, + "token_acc": 0.2849889433512346 + }, + { + "epoch": 1.5435356200527703, + "grad_norm": 0.544734172641942, + "learning_rate": 0.000199596659531183, + "loss": 3.1702375411987305, + "step": 2633, + "token_acc": 0.28135284737918703 + }, + { + "epoch": 1.5441219583699795, + "grad_norm": 0.5693199786630552, + "learning_rate": 0.000199595789442877, + "loss": 3.2126379013061523, + "step": 2634, + "token_acc": 0.2774177174120658 + }, + { + "epoch": 1.5447082966871886, + "grad_norm": 0.6306540179255922, + "learning_rate": 0.0001995949184190028, + "loss": 3.2129197120666504, + "step": 2635, + "token_acc": 0.277051965255377 + }, + { + "epoch": 1.5452946350043977, + "grad_norm": 0.7031656807168499, + "learning_rate": 0.00019959404645956852, + "loss": 3.151315212249756, + "step": 2636, + "token_acc": 0.2851654513357908 + }, + { + "epoch": 1.5458809733216066, + "grad_norm": 0.6385330011965263, + "learning_rate": 0.0001995931735645824, + "loss": 3.206514358520508, + "step": 2637, + "token_acc": 0.2767687727285706 + }, + { + "epoch": 1.5464673116388155, + "grad_norm": 0.5427616952591011, + "learning_rate": 0.0001995922997340526, + "loss": 3.222639560699463, + "step": 2638, + "token_acc": 0.2738811113390889 + }, + { + "epoch": 1.5470536499560246, + "grad_norm": 0.6360618557810592, + "learning_rate": 0.00019959142496798736, + "loss": 3.174762725830078, + "step": 2639, + "token_acc": 0.28050552040844295 + }, + { + "epoch": 1.5476399882732337, + "grad_norm": 0.6747420405956391, + "learning_rate": 0.00019959054926639488, + "loss": 3.182965040206909, + "step": 2640, + "token_acc": 0.2802981890009137 + }, + { + "epoch": 1.5482263265904428, + "grad_norm": 0.6363593952484315, + "learning_rate": 0.0001995896726292834, + "loss": 3.257631301879883, + "step": 2641, + "token_acc": 0.2707891266835962 + }, + { + "epoch": 1.5488126649076517, + "grad_norm": 0.6625944911321094, + "learning_rate": 0.00019958879505666116, + "loss": 3.1755876541137695, + "step": 2642, + "token_acc": 0.28102680515634526 + }, + { + "epoch": 1.5493990032248608, + "grad_norm": 0.5672519022346407, + "learning_rate": 0.00019958791654853635, + "loss": 3.1996805667877197, + "step": 2643, + "token_acc": 0.27751013453968626 + }, + { + "epoch": 1.5499853415420697, + "grad_norm": 0.5406927723118318, + "learning_rate": 0.00019958703710491727, + "loss": 3.1929984092712402, + "step": 2644, + "token_acc": 0.27944954318506704 + }, + { + "epoch": 1.5505716798592788, + "grad_norm": 0.507959746515845, + "learning_rate": 0.00019958615672581217, + "loss": 3.1744046211242676, + "step": 2645, + "token_acc": 0.2798236925249174 + }, + { + "epoch": 1.551158018176488, + "grad_norm": 0.4750646893039651, + "learning_rate": 0.00019958527541122934, + "loss": 3.2198405265808105, + "step": 2646, + "token_acc": 0.2751212389841998 + }, + { + "epoch": 1.551744356493697, + "grad_norm": 0.4857295736421394, + "learning_rate": 0.00019958439316117703, + "loss": 3.1757779121398926, + "step": 2647, + "token_acc": 0.28054909628328795 + }, + { + "epoch": 1.552330694810906, + "grad_norm": 0.5201699525293312, + "learning_rate": 0.0001995835099756635, + "loss": 3.171346664428711, + "step": 2648, + "token_acc": 0.2825549642999701 + }, + { + "epoch": 1.5529170331281148, + "grad_norm": 0.5029864219315964, + "learning_rate": 0.00019958262585469716, + "loss": 3.2130441665649414, + "step": 2649, + "token_acc": 0.2756864398078347 + }, + { + "epoch": 1.553503371445324, + "grad_norm": 0.6019664214965286, + "learning_rate": 0.00019958174079828618, + "loss": 3.2080445289611816, + "step": 2650, + "token_acc": 0.2769343593556634 + }, + { + "epoch": 1.554089709762533, + "grad_norm": 0.7028617053857716, + "learning_rate": 0.00019958085480643897, + "loss": 3.239995241165161, + "step": 2651, + "token_acc": 0.27251708474853703 + }, + { + "epoch": 1.5546760480797421, + "grad_norm": 0.5436524782703289, + "learning_rate": 0.00019957996787916377, + "loss": 3.1789169311523438, + "step": 2652, + "token_acc": 0.2803233368021612 + }, + { + "epoch": 1.555262386396951, + "grad_norm": 0.4757696680370547, + "learning_rate": 0.000199579080016469, + "loss": 3.210191011428833, + "step": 2653, + "token_acc": 0.27740491585797705 + }, + { + "epoch": 1.55584872471416, + "grad_norm": 0.6083067017802386, + "learning_rate": 0.00019957819121836295, + "loss": 3.2088663578033447, + "step": 2654, + "token_acc": 0.2763939105107056 + }, + { + "epoch": 1.556435063031369, + "grad_norm": 0.6294612819493758, + "learning_rate": 0.00019957730148485397, + "loss": 3.23964262008667, + "step": 2655, + "token_acc": 0.27324605844445343 + }, + { + "epoch": 1.5570214013485781, + "grad_norm": 0.4144174263100731, + "learning_rate": 0.00019957641081595043, + "loss": 3.2180051803588867, + "step": 2656, + "token_acc": 0.274720674888109 + }, + { + "epoch": 1.5576077396657872, + "grad_norm": 0.5328353351184156, + "learning_rate": 0.00019957551921166066, + "loss": 3.224337100982666, + "step": 2657, + "token_acc": 0.2755813180530997 + }, + { + "epoch": 1.5581940779829961, + "grad_norm": 0.659026532932319, + "learning_rate": 0.0001995746266719931, + "loss": 3.1951637268066406, + "step": 2658, + "token_acc": 0.27689645966814525 + }, + { + "epoch": 1.5587804163002053, + "grad_norm": 0.5667812985259825, + "learning_rate": 0.0001995737331969561, + "loss": 3.1846213340759277, + "step": 2659, + "token_acc": 0.2788577125691601 + }, + { + "epoch": 1.5593667546174141, + "grad_norm": 0.479856358444414, + "learning_rate": 0.00019957283878655803, + "loss": 3.1924424171447754, + "step": 2660, + "token_acc": 0.27787733055606606 + }, + { + "epoch": 1.5599530929346233, + "grad_norm": 0.5176713862951597, + "learning_rate": 0.00019957194344080733, + "loss": 3.195065975189209, + "step": 2661, + "token_acc": 0.27675317847365927 + }, + { + "epoch": 1.5605394312518324, + "grad_norm": 0.5146265093831452, + "learning_rate": 0.00019957104715971242, + "loss": 3.1754794120788574, + "step": 2662, + "token_acc": 0.28130853907516157 + }, + { + "epoch": 1.5611257695690415, + "grad_norm": 0.5683193146204041, + "learning_rate": 0.00019957014994328168, + "loss": 3.1656100749969482, + "step": 2663, + "token_acc": 0.2802190425537408 + }, + { + "epoch": 1.5617121078862504, + "grad_norm": 0.43697032957482, + "learning_rate": 0.00019956925179152353, + "loss": 3.1724894046783447, + "step": 2664, + "token_acc": 0.2798126161954595 + }, + { + "epoch": 1.5622984462034593, + "grad_norm": 0.5494073812297098, + "learning_rate": 0.00019956835270444647, + "loss": 3.2296042442321777, + "step": 2665, + "token_acc": 0.27485585123638306 + }, + { + "epoch": 1.5628847845206684, + "grad_norm": 0.6155876772911851, + "learning_rate": 0.00019956745268205888, + "loss": 3.1836578845977783, + "step": 2666, + "token_acc": 0.27898089511219526 + }, + { + "epoch": 1.5634711228378775, + "grad_norm": 0.6041915727364132, + "learning_rate": 0.00019956655172436924, + "loss": 3.219562530517578, + "step": 2667, + "token_acc": 0.27594651000170095 + }, + { + "epoch": 1.5640574611550866, + "grad_norm": 0.5596558073345214, + "learning_rate": 0.00019956564983138604, + "loss": 3.269540786743164, + "step": 2668, + "token_acc": 0.26898652084133196 + }, + { + "epoch": 1.5646437994722955, + "grad_norm": 0.4728116260076817, + "learning_rate": 0.0001995647470031177, + "loss": 3.2143819332122803, + "step": 2669, + "token_acc": 0.27603340449444247 + }, + { + "epoch": 1.5652301377895046, + "grad_norm": 0.4644898856649255, + "learning_rate": 0.00019956384323957274, + "loss": 3.1786398887634277, + "step": 2670, + "token_acc": 0.27788483651733953 + }, + { + "epoch": 1.5658164761067135, + "grad_norm": 0.5413404826495073, + "learning_rate": 0.00019956293854075962, + "loss": 3.24879789352417, + "step": 2671, + "token_acc": 0.27239148791536777 + }, + { + "epoch": 1.5664028144239226, + "grad_norm": 0.5852980635252474, + "learning_rate": 0.00019956203290668687, + "loss": 3.1647677421569824, + "step": 2672, + "token_acc": 0.28221272648271706 + }, + { + "epoch": 1.5669891527411317, + "grad_norm": 0.7232027440384753, + "learning_rate": 0.00019956112633736297, + "loss": 3.205904722213745, + "step": 2673, + "token_acc": 0.2773987235837631 + }, + { + "epoch": 1.5675754910583408, + "grad_norm": 0.6398004954008825, + "learning_rate": 0.00019956021883279647, + "loss": 3.1643640995025635, + "step": 2674, + "token_acc": 0.28241405935695385 + }, + { + "epoch": 1.5681618293755497, + "grad_norm": 0.5213728902471416, + "learning_rate": 0.00019955931039299584, + "loss": 3.1736936569213867, + "step": 2675, + "token_acc": 0.28060349495198783 + }, + { + "epoch": 1.5687481676927586, + "grad_norm": 0.459778616600603, + "learning_rate": 0.0001995584010179697, + "loss": 3.189206123352051, + "step": 2676, + "token_acc": 0.2775991119733177 + }, + { + "epoch": 1.5693345060099677, + "grad_norm": 0.5447058164749606, + "learning_rate": 0.0001995574907077265, + "loss": 3.211618661880493, + "step": 2677, + "token_acc": 0.27584803650844403 + }, + { + "epoch": 1.5699208443271768, + "grad_norm": 0.5974072672707506, + "learning_rate": 0.0001995565794622748, + "loss": 3.1757566928863525, + "step": 2678, + "token_acc": 0.27955970128395047 + }, + { + "epoch": 1.570507182644386, + "grad_norm": 0.5087768719774082, + "learning_rate": 0.00019955566728162324, + "loss": 3.1685850620269775, + "step": 2679, + "token_acc": 0.2823508306514743 + }, + { + "epoch": 1.5710935209615948, + "grad_norm": 0.5180332022246887, + "learning_rate": 0.00019955475416578034, + "loss": 3.1553893089294434, + "step": 2680, + "token_acc": 0.28352413904004065 + }, + { + "epoch": 1.5716798592788037, + "grad_norm": 0.5078383827595645, + "learning_rate": 0.00019955384011475466, + "loss": 3.171081781387329, + "step": 2681, + "token_acc": 0.2821244942123537 + }, + { + "epoch": 1.5722661975960128, + "grad_norm": 0.45968922884256685, + "learning_rate": 0.0001995529251285548, + "loss": 3.180799961090088, + "step": 2682, + "token_acc": 0.279760293325273 + }, + { + "epoch": 1.572852535913222, + "grad_norm": 0.38608565678066015, + "learning_rate": 0.00019955200920718935, + "loss": 3.1922671794891357, + "step": 2683, + "token_acc": 0.2782131502229626 + }, + { + "epoch": 1.573438874230431, + "grad_norm": 0.4120686703038631, + "learning_rate": 0.00019955109235066692, + "loss": 3.1866798400878906, + "step": 2684, + "token_acc": 0.2787603731018378 + }, + { + "epoch": 1.57402521254764, + "grad_norm": 0.390296803124471, + "learning_rate": 0.00019955017455899614, + "loss": 3.2009198665618896, + "step": 2685, + "token_acc": 0.2778909601112092 + }, + { + "epoch": 1.574611550864849, + "grad_norm": 0.36461532552874704, + "learning_rate": 0.00019954925583218563, + "loss": 3.1958224773406982, + "step": 2686, + "token_acc": 0.27872545694960343 + }, + { + "epoch": 1.575197889182058, + "grad_norm": 0.4338738884702662, + "learning_rate": 0.00019954833617024398, + "loss": 3.2154078483581543, + "step": 2687, + "token_acc": 0.27445210850178753 + }, + { + "epoch": 1.575784227499267, + "grad_norm": 0.4951501833715675, + "learning_rate": 0.00019954741557317985, + "loss": 3.1404380798339844, + "step": 2688, + "token_acc": 0.2859533352225956 + }, + { + "epoch": 1.5763705658164762, + "grad_norm": 0.4442526149505525, + "learning_rate": 0.00019954649404100192, + "loss": 3.23830246925354, + "step": 2689, + "token_acc": 0.27081915756158476 + }, + { + "epoch": 1.5769569041336853, + "grad_norm": 0.47908842238193333, + "learning_rate": 0.0001995455715737188, + "loss": 3.1975960731506348, + "step": 2690, + "token_acc": 0.2759045852785061 + }, + { + "epoch": 1.5775432424508942, + "grad_norm": 0.4323151601088509, + "learning_rate": 0.00019954464817133918, + "loss": 3.2066562175750732, + "step": 2691, + "token_acc": 0.2761840301701575 + }, + { + "epoch": 1.578129580768103, + "grad_norm": 0.4377696635345492, + "learning_rate": 0.00019954372383387172, + "loss": 3.155653715133667, + "step": 2692, + "token_acc": 0.28266484842753836 + }, + { + "epoch": 1.5787159190853122, + "grad_norm": 0.42547903489818345, + "learning_rate": 0.00019954279856132515, + "loss": 3.1684136390686035, + "step": 2693, + "token_acc": 0.2815018958563869 + }, + { + "epoch": 1.5793022574025213, + "grad_norm": 0.4406429540119083, + "learning_rate": 0.0001995418723537081, + "loss": 3.16751766204834, + "step": 2694, + "token_acc": 0.281215789641502 + }, + { + "epoch": 1.5798885957197304, + "grad_norm": 0.45730537507453584, + "learning_rate": 0.00019954094521102927, + "loss": 3.2066006660461426, + "step": 2695, + "token_acc": 0.27585700036747335 + }, + { + "epoch": 1.5804749340369393, + "grad_norm": 0.5479819208363274, + "learning_rate": 0.0001995400171332974, + "loss": 3.167570114135742, + "step": 2696, + "token_acc": 0.28002477289345834 + }, + { + "epoch": 1.5810612723541484, + "grad_norm": 0.6279364661837348, + "learning_rate": 0.0001995390881205212, + "loss": 3.21063232421875, + "step": 2697, + "token_acc": 0.27513481949644486 + }, + { + "epoch": 1.5816476106713573, + "grad_norm": 0.5812659942346973, + "learning_rate": 0.0001995381581727094, + "loss": 3.187704563140869, + "step": 2698, + "token_acc": 0.27986604887640754 + }, + { + "epoch": 1.5822339489885664, + "grad_norm": 0.5916367253251017, + "learning_rate": 0.00019953722728987075, + "loss": 3.1654257774353027, + "step": 2699, + "token_acc": 0.28133582679321756 + }, + { + "epoch": 1.5828202873057755, + "grad_norm": 0.5577317903217045, + "learning_rate": 0.00019953629547201398, + "loss": 3.217228412628174, + "step": 2700, + "token_acc": 0.27419637325609664 + }, + { + "epoch": 1.5834066256229846, + "grad_norm": 0.619298920858638, + "learning_rate": 0.0001995353627191478, + "loss": 3.206254482269287, + "step": 2701, + "token_acc": 0.27676129094422647 + }, + { + "epoch": 1.5839929639401935, + "grad_norm": 0.5749382188890245, + "learning_rate": 0.00019953442903128106, + "loss": 3.1901440620422363, + "step": 2702, + "token_acc": 0.27944942212554313 + }, + { + "epoch": 1.5845793022574024, + "grad_norm": 0.518724504608595, + "learning_rate": 0.0001995334944084225, + "loss": 3.2007551193237305, + "step": 2703, + "token_acc": 0.2781976286192018 + }, + { + "epoch": 1.5851656405746115, + "grad_norm": 0.5017412619657687, + "learning_rate": 0.00019953255885058082, + "loss": 3.158094882965088, + "step": 2704, + "token_acc": 0.2820509585639959 + }, + { + "epoch": 1.5857519788918206, + "grad_norm": 0.49752078884240086, + "learning_rate": 0.0001995316223577649, + "loss": 3.185879945755005, + "step": 2705, + "token_acc": 0.2782083482208537 + }, + { + "epoch": 1.5863383172090297, + "grad_norm": 0.4458699434766253, + "learning_rate": 0.00019953068492998353, + "loss": 3.163083553314209, + "step": 2706, + "token_acc": 0.28040390383596864 + }, + { + "epoch": 1.5869246555262386, + "grad_norm": 0.5639789481036582, + "learning_rate": 0.00019952974656724546, + "loss": 3.2054343223571777, + "step": 2707, + "token_acc": 0.2781341618619222 + }, + { + "epoch": 1.5875109938434475, + "grad_norm": 0.5635996894132868, + "learning_rate": 0.00019952880726955953, + "loss": 3.1374831199645996, + "step": 2708, + "token_acc": 0.28522633396174457 + }, + { + "epoch": 1.5880973321606566, + "grad_norm": 0.4575514254484415, + "learning_rate": 0.00019952786703693461, + "loss": 3.1733005046844482, + "step": 2709, + "token_acc": 0.27982315344827136 + }, + { + "epoch": 1.5886836704778657, + "grad_norm": 0.43108544564551143, + "learning_rate": 0.00019952692586937948, + "loss": 3.1765170097351074, + "step": 2710, + "token_acc": 0.2815371987566775 + }, + { + "epoch": 1.5892700087950749, + "grad_norm": 0.46493941079626905, + "learning_rate": 0.000199525983766903, + "loss": 3.1962127685546875, + "step": 2711, + "token_acc": 0.27802675738578203 + }, + { + "epoch": 1.5898563471122837, + "grad_norm": 0.5392401843766587, + "learning_rate": 0.00019952504072951398, + "loss": 3.1908326148986816, + "step": 2712, + "token_acc": 0.2791409961261202 + }, + { + "epoch": 1.5904426854294929, + "grad_norm": 0.4911716648950553, + "learning_rate": 0.00019952409675722137, + "loss": 3.133856773376465, + "step": 2713, + "token_acc": 0.2855261546000245 + }, + { + "epoch": 1.5910290237467017, + "grad_norm": 0.4706750182588433, + "learning_rate": 0.00019952315185003396, + "loss": 3.18925404548645, + "step": 2714, + "token_acc": 0.27776780197204076 + }, + { + "epoch": 1.5916153620639109, + "grad_norm": 0.5703214209218508, + "learning_rate": 0.00019952220600796063, + "loss": 3.239129066467285, + "step": 2715, + "token_acc": 0.2723365881121179 + }, + { + "epoch": 1.59220170038112, + "grad_norm": 0.5799968257842225, + "learning_rate": 0.0001995212592310103, + "loss": 3.225174903869629, + "step": 2716, + "token_acc": 0.274231573444851 + }, + { + "epoch": 1.592788038698329, + "grad_norm": 0.5976963321482966, + "learning_rate": 0.00019952031151919183, + "loss": 3.2250146865844727, + "step": 2717, + "token_acc": 0.27567759665128494 + }, + { + "epoch": 1.593374377015538, + "grad_norm": 0.5470508560318481, + "learning_rate": 0.00019951936287251415, + "loss": 3.174149513244629, + "step": 2718, + "token_acc": 0.2816732072972812 + }, + { + "epoch": 1.5939607153327469, + "grad_norm": 0.5089985677498646, + "learning_rate": 0.00019951841329098616, + "loss": 3.167757511138916, + "step": 2719, + "token_acc": 0.280138073321695 + }, + { + "epoch": 1.594547053649956, + "grad_norm": 0.5175469222570545, + "learning_rate": 0.0001995174627746168, + "loss": 3.190876007080078, + "step": 2720, + "token_acc": 0.27822243367891153 + }, + { + "epoch": 1.595133391967165, + "grad_norm": 0.4874609566704708, + "learning_rate": 0.00019951651132341496, + "loss": 3.1772279739379883, + "step": 2721, + "token_acc": 0.2787389515155891 + }, + { + "epoch": 1.5957197302843742, + "grad_norm": 0.5352029315649367, + "learning_rate": 0.0001995155589373896, + "loss": 3.1827778816223145, + "step": 2722, + "token_acc": 0.281116451297919 + }, + { + "epoch": 1.596306068601583, + "grad_norm": 0.7246851472059336, + "learning_rate": 0.00019951460561654964, + "loss": 3.19466495513916, + "step": 2723, + "token_acc": 0.27677593216642693 + }, + { + "epoch": 1.5968924069187922, + "grad_norm": 0.7422870273747878, + "learning_rate": 0.00019951365136090408, + "loss": 3.2076501846313477, + "step": 2724, + "token_acc": 0.2752647582564694 + }, + { + "epoch": 1.597478745236001, + "grad_norm": 0.46800913628303287, + "learning_rate": 0.00019951269617046188, + "loss": 3.1576504707336426, + "step": 2725, + "token_acc": 0.28191748949427425 + }, + { + "epoch": 1.5980650835532102, + "grad_norm": 0.6107549778916752, + "learning_rate": 0.00019951174004523194, + "loss": 3.1973519325256348, + "step": 2726, + "token_acc": 0.27642424589557874 + }, + { + "epoch": 1.5986514218704193, + "grad_norm": 0.6172861175810778, + "learning_rate": 0.0001995107829852233, + "loss": 3.2109103202819824, + "step": 2727, + "token_acc": 0.2767768183327798 + }, + { + "epoch": 1.5992377601876284, + "grad_norm": 0.525818498176962, + "learning_rate": 0.00019950982499044502, + "loss": 3.1743531227111816, + "step": 2728, + "token_acc": 0.28082511182724545 + }, + { + "epoch": 1.5998240985048373, + "grad_norm": 0.5983146747536802, + "learning_rate": 0.00019950886606090598, + "loss": 3.2880496978759766, + "step": 2729, + "token_acc": 0.26584680729288257 + }, + { + "epoch": 1.6004104368220462, + "grad_norm": 0.4831373520742412, + "learning_rate": 0.00019950790619661522, + "loss": 3.1485490798950195, + "step": 2730, + "token_acc": 0.2836629001883239 + }, + { + "epoch": 1.6009967751392553, + "grad_norm": 0.6913192914898139, + "learning_rate": 0.0001995069453975818, + "loss": 3.185734748840332, + "step": 2731, + "token_acc": 0.27820943271148185 + }, + { + "epoch": 1.6015831134564644, + "grad_norm": 0.6058190723599078, + "learning_rate": 0.00019950598366381468, + "loss": 3.21842622756958, + "step": 2732, + "token_acc": 0.27552854817867856 + }, + { + "epoch": 1.6021694517736735, + "grad_norm": 0.5033882498220025, + "learning_rate": 0.00019950502099532296, + "loss": 3.1752514839172363, + "step": 2733, + "token_acc": 0.28058724241852495 + }, + { + "epoch": 1.6027557900908824, + "grad_norm": 0.588131090385271, + "learning_rate": 0.00019950405739211564, + "loss": 3.2080841064453125, + "step": 2734, + "token_acc": 0.27592276668628507 + }, + { + "epoch": 1.6033421284080913, + "grad_norm": 0.5080300539935406, + "learning_rate": 0.0001995030928542018, + "loss": 3.2264797687530518, + "step": 2735, + "token_acc": 0.27153140444863944 + }, + { + "epoch": 1.6039284667253004, + "grad_norm": 0.5451005728266386, + "learning_rate": 0.00019950212738159044, + "loss": 3.2129735946655273, + "step": 2736, + "token_acc": 0.2755253702090592 + }, + { + "epoch": 1.6045148050425095, + "grad_norm": 0.5973111399793514, + "learning_rate": 0.00019950116097429071, + "loss": 3.2457523345947266, + "step": 2737, + "token_acc": 0.27139484380235684 + }, + { + "epoch": 1.6051011433597187, + "grad_norm": 0.5126073270540585, + "learning_rate": 0.00019950019363231163, + "loss": 3.1716482639312744, + "step": 2738, + "token_acc": 0.2805184499404841 + }, + { + "epoch": 1.6056874816769275, + "grad_norm": 0.41094424643029165, + "learning_rate": 0.00019949922535566234, + "loss": 3.160980224609375, + "step": 2739, + "token_acc": 0.2823145319602347 + }, + { + "epoch": 1.6062738199941367, + "grad_norm": 0.548692898926108, + "learning_rate": 0.00019949825614435187, + "loss": 3.1645078659057617, + "step": 2740, + "token_acc": 0.28048002368470754 + }, + { + "epoch": 1.6068601583113455, + "grad_norm": 0.48559433319884715, + "learning_rate": 0.0001994972859983894, + "loss": 3.173524856567383, + "step": 2741, + "token_acc": 0.28287436230226554 + }, + { + "epoch": 1.6074464966285547, + "grad_norm": 0.4460907615137774, + "learning_rate": 0.00019949631491778398, + "loss": 3.15598464012146, + "step": 2742, + "token_acc": 0.28203202286300516 + }, + { + "epoch": 1.6080328349457638, + "grad_norm": 0.45976814831807006, + "learning_rate": 0.00019949534290254474, + "loss": 3.16910457611084, + "step": 2743, + "token_acc": 0.28094700900226377 + }, + { + "epoch": 1.6086191732629729, + "grad_norm": 0.3915946096315081, + "learning_rate": 0.00019949436995268086, + "loss": 3.1151230335235596, + "step": 2744, + "token_acc": 0.2892059609069751 + }, + { + "epoch": 1.6092055115801818, + "grad_norm": 0.6197851649176394, + "learning_rate": 0.0001994933960682014, + "loss": 3.155679225921631, + "step": 2745, + "token_acc": 0.28426036862149345 + }, + { + "epoch": 1.6097918498973907, + "grad_norm": 0.637770260883947, + "learning_rate": 0.0001994924212491156, + "loss": 3.19277286529541, + "step": 2746, + "token_acc": 0.277201729030205 + }, + { + "epoch": 1.6103781882145998, + "grad_norm": 0.4495472654610107, + "learning_rate": 0.00019949144549543253, + "loss": 3.215533971786499, + "step": 2747, + "token_acc": 0.27371296076922874 + }, + { + "epoch": 1.6109645265318089, + "grad_norm": 0.49777329723678526, + "learning_rate": 0.0001994904688071614, + "loss": 3.1957359313964844, + "step": 2748, + "token_acc": 0.27660357646472933 + }, + { + "epoch": 1.611550864849018, + "grad_norm": 0.544561957830631, + "learning_rate": 0.0001994894911843114, + "loss": 3.2205286026000977, + "step": 2749, + "token_acc": 0.2740589779704717 + }, + { + "epoch": 1.6121372031662269, + "grad_norm": 0.4828421329681335, + "learning_rate": 0.0001994885126268917, + "loss": 3.1864824295043945, + "step": 2750, + "token_acc": 0.27895519251574136 + }, + { + "epoch": 1.612723541483436, + "grad_norm": 0.5080422877395361, + "learning_rate": 0.0001994875331349115, + "loss": 3.1510653495788574, + "step": 2751, + "token_acc": 0.2836355525579987 + }, + { + "epoch": 1.6133098798006449, + "grad_norm": 0.5548527930230237, + "learning_rate": 0.00019948655270837993, + "loss": 3.157036781311035, + "step": 2752, + "token_acc": 0.28403876786039767 + }, + { + "epoch": 1.613896218117854, + "grad_norm": 0.46216027103306473, + "learning_rate": 0.00019948557134730628, + "loss": 3.168409824371338, + "step": 2753, + "token_acc": 0.2813674256727674 + }, + { + "epoch": 1.614482556435063, + "grad_norm": 0.4858823496593671, + "learning_rate": 0.00019948458905169977, + "loss": 3.16196608543396, + "step": 2754, + "token_acc": 0.28132535713811996 + }, + { + "epoch": 1.6150688947522722, + "grad_norm": 0.517089754032563, + "learning_rate": 0.0001994836058215696, + "loss": 3.2013792991638184, + "step": 2755, + "token_acc": 0.27720342154794125 + }, + { + "epoch": 1.6156552330694811, + "grad_norm": 0.5278272720208319, + "learning_rate": 0.000199482621656925, + "loss": 3.1952462196350098, + "step": 2756, + "token_acc": 0.27911832004911186 + }, + { + "epoch": 1.61624157138669, + "grad_norm": 0.4625206974032342, + "learning_rate": 0.00019948163655777518, + "loss": 3.2086873054504395, + "step": 2757, + "token_acc": 0.2774865796939432 + }, + { + "epoch": 1.6168279097038991, + "grad_norm": 0.42925296836957116, + "learning_rate": 0.0001994806505241295, + "loss": 3.1837313175201416, + "step": 2758, + "token_acc": 0.2777092888410627 + }, + { + "epoch": 1.6174142480211082, + "grad_norm": 0.5208040772991201, + "learning_rate": 0.00019947966355599714, + "loss": 3.2236547470092773, + "step": 2759, + "token_acc": 0.2710851188856017 + }, + { + "epoch": 1.6180005863383173, + "grad_norm": 0.5824409233992457, + "learning_rate": 0.00019947867565338738, + "loss": 3.2049150466918945, + "step": 2760, + "token_acc": 0.2781085939574694 + }, + { + "epoch": 1.6185869246555262, + "grad_norm": 0.7489718278310702, + "learning_rate": 0.00019947768681630951, + "loss": 3.2080612182617188, + "step": 2761, + "token_acc": 0.27595411057843283 + }, + { + "epoch": 1.6191732629727351, + "grad_norm": 0.7295635521381536, + "learning_rate": 0.00019947669704477284, + "loss": 3.1222336292266846, + "step": 2762, + "token_acc": 0.2876163039669872 + }, + { + "epoch": 1.6197596012899442, + "grad_norm": 0.693750412192316, + "learning_rate": 0.00019947570633878665, + "loss": 3.151315689086914, + "step": 2763, + "token_acc": 0.2832147979918719 + }, + { + "epoch": 1.6203459396071533, + "grad_norm": 0.7141214163460657, + "learning_rate": 0.00019947471469836022, + "loss": 3.1913294792175293, + "step": 2764, + "token_acc": 0.2762640125182772 + }, + { + "epoch": 1.6209322779243625, + "grad_norm": 0.5977329495917485, + "learning_rate": 0.00019947372212350293, + "loss": 3.1339991092681885, + "step": 2765, + "token_acc": 0.2844116214982933 + }, + { + "epoch": 1.6215186162415713, + "grad_norm": 0.5402359990732961, + "learning_rate": 0.000199472728614224, + "loss": 3.1637227535247803, + "step": 2766, + "token_acc": 0.28120816137576576 + }, + { + "epoch": 1.6221049545587805, + "grad_norm": 0.6132452949488804, + "learning_rate": 0.00019947173417053285, + "loss": 3.179791212081909, + "step": 2767, + "token_acc": 0.2818497784617017 + }, + { + "epoch": 1.6226912928759893, + "grad_norm": 0.5474417208949522, + "learning_rate": 0.00019947073879243883, + "loss": 3.1509242057800293, + "step": 2768, + "token_acc": 0.28458485950716145 + }, + { + "epoch": 1.6232776311931985, + "grad_norm": 0.46283082939957776, + "learning_rate": 0.00019946974247995124, + "loss": 3.203836441040039, + "step": 2769, + "token_acc": 0.27665301255990193 + }, + { + "epoch": 1.6238639695104076, + "grad_norm": 0.5812805316464508, + "learning_rate": 0.00019946874523307947, + "loss": 3.176304340362549, + "step": 2770, + "token_acc": 0.2798545052075638 + }, + { + "epoch": 1.6244503078276167, + "grad_norm": 0.5505599696850706, + "learning_rate": 0.00019946774705183285, + "loss": 3.176671266555786, + "step": 2771, + "token_acc": 0.27985580311161706 + }, + { + "epoch": 1.6250366461448256, + "grad_norm": 0.5357277806754507, + "learning_rate": 0.0001994667479362208, + "loss": 3.178579092025757, + "step": 2772, + "token_acc": 0.28141497705282614 + }, + { + "epoch": 1.6256229844620345, + "grad_norm": 0.6477300828637749, + "learning_rate": 0.00019946574788625267, + "loss": 3.2067723274230957, + "step": 2773, + "token_acc": 0.2753825811491719 + }, + { + "epoch": 1.6262093227792436, + "grad_norm": 0.5183817930345591, + "learning_rate": 0.00019946474690193787, + "loss": 3.248183250427246, + "step": 2774, + "token_acc": 0.27132052554557057 + }, + { + "epoch": 1.6267956610964527, + "grad_norm": 0.4787561276120512, + "learning_rate": 0.00019946374498328582, + "loss": 3.1965436935424805, + "step": 2775, + "token_acc": 0.2777290393791798 + }, + { + "epoch": 1.6273819994136618, + "grad_norm": 0.5559703945608404, + "learning_rate": 0.00019946274213030588, + "loss": 3.216770648956299, + "step": 2776, + "token_acc": 0.2768947462877564 + }, + { + "epoch": 1.6279683377308707, + "grad_norm": 0.4279772556025272, + "learning_rate": 0.0001994617383430075, + "loss": 3.153958320617676, + "step": 2777, + "token_acc": 0.282367425317854 + }, + { + "epoch": 1.6285546760480798, + "grad_norm": 0.3907500799893467, + "learning_rate": 0.0001994607336214002, + "loss": 3.156097173690796, + "step": 2778, + "token_acc": 0.2828516197181266 + }, + { + "epoch": 1.6291410143652887, + "grad_norm": 0.4678167192536674, + "learning_rate": 0.00019945972796549323, + "loss": 3.1726467609405518, + "step": 2779, + "token_acc": 0.2798339177741428 + }, + { + "epoch": 1.6297273526824978, + "grad_norm": 0.41685268286936933, + "learning_rate": 0.0001994587213752962, + "loss": 3.216644287109375, + "step": 2780, + "token_acc": 0.27539233973682 + }, + { + "epoch": 1.630313690999707, + "grad_norm": 0.42632467830232973, + "learning_rate": 0.00019945771385081852, + "loss": 3.1680068969726562, + "step": 2781, + "token_acc": 0.28034242248304897 + }, + { + "epoch": 1.630900029316916, + "grad_norm": 0.48642994751214513, + "learning_rate": 0.0001994567053920696, + "loss": 3.178630828857422, + "step": 2782, + "token_acc": 0.2807507335631356 + }, + { + "epoch": 1.631486367634125, + "grad_norm": 0.6684322972811882, + "learning_rate": 0.00019945569599905894, + "loss": 3.1403369903564453, + "step": 2783, + "token_acc": 0.28564544796260943 + }, + { + "epoch": 1.6320727059513338, + "grad_norm": 0.6217527901632093, + "learning_rate": 0.0001994546856717961, + "loss": 3.173407554626465, + "step": 2784, + "token_acc": 0.2812266485003551 + }, + { + "epoch": 1.632659044268543, + "grad_norm": 0.447455355709841, + "learning_rate": 0.00019945367441029043, + "loss": 3.1835484504699707, + "step": 2785, + "token_acc": 0.28042461018670617 + }, + { + "epoch": 1.633245382585752, + "grad_norm": 0.5709725194838717, + "learning_rate": 0.00019945266221455153, + "loss": 3.148618698120117, + "step": 2786, + "token_acc": 0.28191742174891027 + }, + { + "epoch": 1.6338317209029611, + "grad_norm": 0.6456972940792354, + "learning_rate": 0.00019945164908458888, + "loss": 3.1668946743011475, + "step": 2787, + "token_acc": 0.2824771258615904 + }, + { + "epoch": 1.63441805922017, + "grad_norm": 0.5409629829091859, + "learning_rate": 0.00019945063502041204, + "loss": 3.217435121536255, + "step": 2788, + "token_acc": 0.2754680326833172 + }, + { + "epoch": 1.635004397537379, + "grad_norm": 0.5476724181885979, + "learning_rate": 0.00019944962002203044, + "loss": 3.222550392150879, + "step": 2789, + "token_acc": 0.2746753077743942 + }, + { + "epoch": 1.635590735854588, + "grad_norm": 0.5563318894192146, + "learning_rate": 0.0001994486040894537, + "loss": 3.2024638652801514, + "step": 2790, + "token_acc": 0.275915849585174 + }, + { + "epoch": 1.6361770741717971, + "grad_norm": 0.5108529937699117, + "learning_rate": 0.00019944758722269132, + "loss": 3.213064670562744, + "step": 2791, + "token_acc": 0.2746729773332048 + }, + { + "epoch": 1.6367634124890063, + "grad_norm": 0.4669508848959545, + "learning_rate": 0.00019944656942175287, + "loss": 3.1516754627227783, + "step": 2792, + "token_acc": 0.2810695622879796 + }, + { + "epoch": 1.6373497508062151, + "grad_norm": 0.5512676952212291, + "learning_rate": 0.0001994455506866479, + "loss": 3.2636749744415283, + "step": 2793, + "token_acc": 0.26851698925407724 + }, + { + "epoch": 1.6379360891234243, + "grad_norm": 0.7116761369698605, + "learning_rate": 0.000199444531017386, + "loss": 3.210907459259033, + "step": 2794, + "token_acc": 0.2757381205213318 + }, + { + "epoch": 1.6385224274406331, + "grad_norm": 0.5933682303517525, + "learning_rate": 0.00019944351041397673, + "loss": 3.1618857383728027, + "step": 2795, + "token_acc": 0.28142113891818066 + }, + { + "epoch": 1.6391087657578423, + "grad_norm": 0.5454498753958668, + "learning_rate": 0.0001994424888764297, + "loss": 3.1935489177703857, + "step": 2796, + "token_acc": 0.27928899562346 + }, + { + "epoch": 1.6396951040750514, + "grad_norm": 0.44898988179923366, + "learning_rate": 0.00019944146640475446, + "loss": 3.233721971511841, + "step": 2797, + "token_acc": 0.2727671048196832 + }, + { + "epoch": 1.6402814423922605, + "grad_norm": 0.5118375794797234, + "learning_rate": 0.00019944044299896065, + "loss": 3.232759475708008, + "step": 2798, + "token_acc": 0.27262206470242945 + }, + { + "epoch": 1.6408677807094694, + "grad_norm": 0.46028969475881437, + "learning_rate": 0.00019943941865905787, + "loss": 3.2208476066589355, + "step": 2799, + "token_acc": 0.2740351849494095 + }, + { + "epoch": 1.6414541190266783, + "grad_norm": 0.39303700313203016, + "learning_rate": 0.00019943839338505576, + "loss": 3.2424120903015137, + "step": 2800, + "token_acc": 0.2709996169653302 + }, + { + "epoch": 1.6420404573438874, + "grad_norm": 0.4456127668592045, + "learning_rate": 0.00019943736717696392, + "loss": 3.1765012741088867, + "step": 2801, + "token_acc": 0.27878677068096863 + }, + { + "epoch": 1.6426267956610965, + "grad_norm": 0.4070435637317442, + "learning_rate": 0.000199436340034792, + "loss": 3.12042236328125, + "step": 2802, + "token_acc": 0.2877063544734332 + }, + { + "epoch": 1.6432131339783056, + "grad_norm": 0.3966690487864105, + "learning_rate": 0.0001994353119585497, + "loss": 3.1749918460845947, + "step": 2803, + "token_acc": 0.28035081916432375 + }, + { + "epoch": 1.6437994722955145, + "grad_norm": 0.4093232052551597, + "learning_rate": 0.0001994342829482466, + "loss": 3.1941781044006348, + "step": 2804, + "token_acc": 0.27880860950659914 + }, + { + "epoch": 1.6443858106127234, + "grad_norm": 0.3810772658009836, + "learning_rate": 0.00019943325300389244, + "loss": 3.184230327606201, + "step": 2805, + "token_acc": 0.279265041260886 + }, + { + "epoch": 1.6449721489299325, + "grad_norm": 0.45041957857186704, + "learning_rate": 0.00019943222212549683, + "loss": 3.19789457321167, + "step": 2806, + "token_acc": 0.2778620645244381 + }, + { + "epoch": 1.6455584872471416, + "grad_norm": 0.5384638761927716, + "learning_rate": 0.00019943119031306947, + "loss": 3.1767196655273438, + "step": 2807, + "token_acc": 0.27961630695443646 + }, + { + "epoch": 1.6461448255643507, + "grad_norm": 0.5611399780308302, + "learning_rate": 0.00019943015756662008, + "loss": 3.186880588531494, + "step": 2808, + "token_acc": 0.277730273909917 + }, + { + "epoch": 1.6467311638815598, + "grad_norm": 0.5147937348325553, + "learning_rate": 0.00019942912388615832, + "loss": 3.1641759872436523, + "step": 2809, + "token_acc": 0.28220344265190156 + }, + { + "epoch": 1.6473175021987687, + "grad_norm": 0.5654180450263895, + "learning_rate": 0.00019942808927169393, + "loss": 3.181835651397705, + "step": 2810, + "token_acc": 0.2800724241731822 + }, + { + "epoch": 1.6479038405159776, + "grad_norm": 0.4710954340628746, + "learning_rate": 0.00019942705372323665, + "loss": 3.110374927520752, + "step": 2811, + "token_acc": 0.2905339968612091 + }, + { + "epoch": 1.6484901788331867, + "grad_norm": 0.641923435500879, + "learning_rate": 0.00019942601724079614, + "loss": 3.148357391357422, + "step": 2812, + "token_acc": 0.2847719991926427 + }, + { + "epoch": 1.6490765171503958, + "grad_norm": 0.7035982045092186, + "learning_rate": 0.00019942497982438221, + "loss": 3.2139718532562256, + "step": 2813, + "token_acc": 0.27800097019465536 + }, + { + "epoch": 1.649662855467605, + "grad_norm": 0.6533232878713798, + "learning_rate": 0.00019942394147400458, + "loss": 3.1651039123535156, + "step": 2814, + "token_acc": 0.27936322402774777 + }, + { + "epoch": 1.6502491937848138, + "grad_norm": 0.5097307738582846, + "learning_rate": 0.00019942290218967297, + "loss": 3.2052364349365234, + "step": 2815, + "token_acc": 0.2763967126035146 + }, + { + "epoch": 1.6508355321020227, + "grad_norm": 0.5340012826822406, + "learning_rate": 0.00019942186197139717, + "loss": 3.178485631942749, + "step": 2816, + "token_acc": 0.28015954427557277 + }, + { + "epoch": 1.6514218704192318, + "grad_norm": 0.5458380092394408, + "learning_rate": 0.00019942082081918696, + "loss": 3.128283977508545, + "step": 2817, + "token_acc": 0.28661020253216507 + }, + { + "epoch": 1.652008208736441, + "grad_norm": 0.5302909609158541, + "learning_rate": 0.00019941977873305208, + "loss": 3.1871461868286133, + "step": 2818, + "token_acc": 0.2791355145052794 + }, + { + "epoch": 1.65259454705365, + "grad_norm": 0.5545279452543547, + "learning_rate": 0.00019941873571300238, + "loss": 3.1247646808624268, + "step": 2819, + "token_acc": 0.28715901906460584 + }, + { + "epoch": 1.653180885370859, + "grad_norm": 0.5637508909733369, + "learning_rate": 0.0001994176917590476, + "loss": 3.2287073135375977, + "step": 2820, + "token_acc": 0.2727975106896406 + }, + { + "epoch": 1.653767223688068, + "grad_norm": 0.4334130476328985, + "learning_rate": 0.00019941664687119761, + "loss": 3.1863481998443604, + "step": 2821, + "token_acc": 0.2769217046741501 + }, + { + "epoch": 1.654353562005277, + "grad_norm": 0.5661342567934482, + "learning_rate": 0.00019941560104946214, + "loss": 3.1425609588623047, + "step": 2822, + "token_acc": 0.28483202234412275 + }, + { + "epoch": 1.654939900322486, + "grad_norm": 0.4724971327625286, + "learning_rate": 0.00019941455429385113, + "loss": 3.1545259952545166, + "step": 2823, + "token_acc": 0.2845815757493748 + }, + { + "epoch": 1.6555262386396952, + "grad_norm": 0.42441310994818526, + "learning_rate": 0.0001994135066043743, + "loss": 3.1781954765319824, + "step": 2824, + "token_acc": 0.2791140845395099 + }, + { + "epoch": 1.6561125769569043, + "grad_norm": 0.4805471471675668, + "learning_rate": 0.00019941245798104154, + "loss": 3.1931538581848145, + "step": 2825, + "token_acc": 0.2764317471032799 + }, + { + "epoch": 1.6566989152741132, + "grad_norm": 0.4490735814543902, + "learning_rate": 0.0001994114084238627, + "loss": 3.1688804626464844, + "step": 2826, + "token_acc": 0.2796020803880624 + }, + { + "epoch": 1.657285253591322, + "grad_norm": 0.4837428920863334, + "learning_rate": 0.00019941035793284763, + "loss": 3.180664300918579, + "step": 2827, + "token_acc": 0.27999723307118457 + }, + { + "epoch": 1.6578715919085312, + "grad_norm": 0.6014004035986636, + "learning_rate": 0.00019940930650800623, + "loss": 3.1675233840942383, + "step": 2828, + "token_acc": 0.28181392715021164 + }, + { + "epoch": 1.6584579302257403, + "grad_norm": 0.6296702881561701, + "learning_rate": 0.0001994082541493483, + "loss": 3.1702237129211426, + "step": 2829, + "token_acc": 0.28171520863661 + }, + { + "epoch": 1.6590442685429494, + "grad_norm": 0.4554945635315403, + "learning_rate": 0.00019940720085688383, + "loss": 3.200129747390747, + "step": 2830, + "token_acc": 0.2752150997189167 + }, + { + "epoch": 1.6596306068601583, + "grad_norm": 0.468217256444725, + "learning_rate": 0.00019940614663062264, + "loss": 3.1139862537384033, + "step": 2831, + "token_acc": 0.287813491589331 + }, + { + "epoch": 1.6602169451773672, + "grad_norm": 0.4843141143557423, + "learning_rate": 0.00019940509147057465, + "loss": 3.161217451095581, + "step": 2832, + "token_acc": 0.2827087828607178 + }, + { + "epoch": 1.6608032834945763, + "grad_norm": 0.5172996191376336, + "learning_rate": 0.00019940403537674976, + "loss": 3.174161434173584, + "step": 2833, + "token_acc": 0.28115372826499174 + }, + { + "epoch": 1.6613896218117854, + "grad_norm": 0.5258942444428928, + "learning_rate": 0.00019940297834915793, + "loss": 3.162675619125366, + "step": 2834, + "token_acc": 0.28078772839358296 + }, + { + "epoch": 1.6619759601289945, + "grad_norm": 0.5964774941323191, + "learning_rate": 0.00019940192038780908, + "loss": 3.19197416305542, + "step": 2835, + "token_acc": 0.27723134520978415 + }, + { + "epoch": 1.6625622984462036, + "grad_norm": 0.5935246696336092, + "learning_rate": 0.0001994008614927131, + "loss": 3.1661930084228516, + "step": 2836, + "token_acc": 0.28156718862608926 + }, + { + "epoch": 1.6631486367634125, + "grad_norm": 0.47680265846594466, + "learning_rate": 0.00019939980166387998, + "loss": 3.1605894565582275, + "step": 2837, + "token_acc": 0.2811075905914905 + }, + { + "epoch": 1.6637349750806214, + "grad_norm": 0.4575440808755721, + "learning_rate": 0.00019939874090131967, + "loss": 3.230023145675659, + "step": 2838, + "token_acc": 0.2740931006392277 + }, + { + "epoch": 1.6643213133978305, + "grad_norm": 0.5302389176435637, + "learning_rate": 0.00019939767920504212, + "loss": 3.191206693649292, + "step": 2839, + "token_acc": 0.2803357288368818 + }, + { + "epoch": 1.6649076517150396, + "grad_norm": 0.5891384990559664, + "learning_rate": 0.00019939661657505733, + "loss": 3.1735386848449707, + "step": 2840, + "token_acc": 0.2803433542674766 + }, + { + "epoch": 1.6654939900322487, + "grad_norm": 0.5056553838673572, + "learning_rate": 0.00019939555301137527, + "loss": 3.1374077796936035, + "step": 2841, + "token_acc": 0.28513276380859126 + }, + { + "epoch": 1.6660803283494576, + "grad_norm": 0.5211869762050975, + "learning_rate": 0.0001993944885140059, + "loss": 3.178830146789551, + "step": 2842, + "token_acc": 0.2801529820412386 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.5290404941384889, + "learning_rate": 0.00019939342308295928, + "loss": 3.1898064613342285, + "step": 2843, + "token_acc": 0.27836815514908336 + }, + { + "epoch": 1.6672530049838756, + "grad_norm": 0.5248195790713743, + "learning_rate": 0.00019939235671824536, + "loss": 3.194986343383789, + "step": 2844, + "token_acc": 0.27819109028808875 + }, + { + "epoch": 1.6678393433010847, + "grad_norm": 0.5342485486341926, + "learning_rate": 0.0001993912894198742, + "loss": 3.234722375869751, + "step": 2845, + "token_acc": 0.2717894357785718 + }, + { + "epoch": 1.6684256816182939, + "grad_norm": 0.5723597126667632, + "learning_rate": 0.0001993902211878558, + "loss": 3.1826910972595215, + "step": 2846, + "token_acc": 0.2799006350647 + }, + { + "epoch": 1.6690120199355027, + "grad_norm": 0.5875655218507776, + "learning_rate": 0.0001993891520222002, + "loss": 3.2054429054260254, + "step": 2847, + "token_acc": 0.2763533220517509 + }, + { + "epoch": 1.6695983582527119, + "grad_norm": 0.45210739813903605, + "learning_rate": 0.00019938808192291742, + "loss": 3.1546449661254883, + "step": 2848, + "token_acc": 0.2835193023624874 + }, + { + "epoch": 1.6701846965699207, + "grad_norm": 0.410651589274117, + "learning_rate": 0.0001993870108900176, + "loss": 3.158632755279541, + "step": 2849, + "token_acc": 0.2825274691294399 + }, + { + "epoch": 1.6707710348871299, + "grad_norm": 0.5076025403410077, + "learning_rate": 0.0001993859389235107, + "loss": 3.151688814163208, + "step": 2850, + "token_acc": 0.2817990995219111 + }, + { + "epoch": 1.671357373204339, + "grad_norm": 0.5655705922447349, + "learning_rate": 0.00019938486602340684, + "loss": 3.1843314170837402, + "step": 2851, + "token_acc": 0.2783322364600674 + }, + { + "epoch": 1.671943711521548, + "grad_norm": 0.5054898852378661, + "learning_rate": 0.00019938379218971606, + "loss": 3.244879722595215, + "step": 2852, + "token_acc": 0.2693527268138005 + }, + { + "epoch": 1.672530049838757, + "grad_norm": 0.5145462850095652, + "learning_rate": 0.00019938271742244847, + "loss": 3.1979732513427734, + "step": 2853, + "token_acc": 0.2773263196081988 + }, + { + "epoch": 1.6731163881559659, + "grad_norm": 0.5860352353182224, + "learning_rate": 0.0001993816417216142, + "loss": 3.1838808059692383, + "step": 2854, + "token_acc": 0.279310915503884 + }, + { + "epoch": 1.673702726473175, + "grad_norm": 0.531101742611855, + "learning_rate": 0.0001993805650872233, + "loss": 3.1591529846191406, + "step": 2855, + "token_acc": 0.28283665538030855 + }, + { + "epoch": 1.674289064790384, + "grad_norm": 0.505796845679621, + "learning_rate": 0.00019937948751928592, + "loss": 3.19934344291687, + "step": 2856, + "token_acc": 0.27863207889495456 + }, + { + "epoch": 1.6748754031075932, + "grad_norm": 0.5913169894322906, + "learning_rate": 0.00019937840901781216, + "loss": 3.218362808227539, + "step": 2857, + "token_acc": 0.27454606545970217 + }, + { + "epoch": 1.675461741424802, + "grad_norm": 0.47681022526918515, + "learning_rate": 0.0001993773295828122, + "loss": 3.187335729598999, + "step": 2858, + "token_acc": 0.2782354546217782 + }, + { + "epoch": 1.676048079742011, + "grad_norm": 0.48875808842423657, + "learning_rate": 0.0001993762492142961, + "loss": 3.1691136360168457, + "step": 2859, + "token_acc": 0.28176882419385774 + }, + { + "epoch": 1.67663441805922, + "grad_norm": 0.5893408164355202, + "learning_rate": 0.00019937516791227407, + "loss": 3.1616156101226807, + "step": 2860, + "token_acc": 0.2805149123662818 + }, + { + "epoch": 1.6772207563764292, + "grad_norm": 0.49386637891633156, + "learning_rate": 0.00019937408567675624, + "loss": 3.156148910522461, + "step": 2861, + "token_acc": 0.28219289263705577 + }, + { + "epoch": 1.6778070946936383, + "grad_norm": 0.5274373794424084, + "learning_rate": 0.0001993730025077528, + "loss": 3.195784568786621, + "step": 2862, + "token_acc": 0.27668566810426987 + }, + { + "epoch": 1.6783934330108474, + "grad_norm": 0.5805842082848377, + "learning_rate": 0.00019937191840527387, + "loss": 3.163623094558716, + "step": 2863, + "token_acc": 0.2815180458293623 + }, + { + "epoch": 1.6789797713280563, + "grad_norm": 0.5690415842433055, + "learning_rate": 0.0001993708333693297, + "loss": 3.207353115081787, + "step": 2864, + "token_acc": 0.27436192075097954 + }, + { + "epoch": 1.6795661096452652, + "grad_norm": 0.4920691316075731, + "learning_rate": 0.00019936974739993043, + "loss": 3.185525894165039, + "step": 2865, + "token_acc": 0.280002003780818 + }, + { + "epoch": 1.6801524479624743, + "grad_norm": 0.4295407615000572, + "learning_rate": 0.0001993686604970863, + "loss": 3.162886142730713, + "step": 2866, + "token_acc": 0.280847477088161 + }, + { + "epoch": 1.6807387862796834, + "grad_norm": 0.47064503676598063, + "learning_rate": 0.00019936757266080752, + "loss": 3.225635290145874, + "step": 2867, + "token_acc": 0.27406725135931587 + }, + { + "epoch": 1.6813251245968925, + "grad_norm": 0.45731242505805786, + "learning_rate": 0.00019936648389110427, + "loss": 3.191657781600952, + "step": 2868, + "token_acc": 0.2790455386540409 + }, + { + "epoch": 1.6819114629141014, + "grad_norm": 0.4245866315215397, + "learning_rate": 0.00019936539418798684, + "loss": 3.1368765830993652, + "step": 2869, + "token_acc": 0.2837512944739622 + }, + { + "epoch": 1.6824978012313103, + "grad_norm": 0.45209880995773777, + "learning_rate": 0.0001993643035514654, + "loss": 3.225928783416748, + "step": 2870, + "token_acc": 0.272837426717328 + }, + { + "epoch": 1.6830841395485194, + "grad_norm": 0.5005974334931792, + "learning_rate": 0.00019936321198155024, + "loss": 3.171158790588379, + "step": 2871, + "token_acc": 0.27911255157080567 + }, + { + "epoch": 1.6836704778657285, + "grad_norm": 0.5394073750087237, + "learning_rate": 0.00019936211947825156, + "loss": 3.1790037155151367, + "step": 2872, + "token_acc": 0.27890558369220725 + }, + { + "epoch": 1.6842568161829377, + "grad_norm": 0.3825921915654264, + "learning_rate": 0.00019936102604157968, + "loss": 3.15989351272583, + "step": 2873, + "token_acc": 0.28138625616671104 + }, + { + "epoch": 1.6848431545001465, + "grad_norm": 0.46364738182691906, + "learning_rate": 0.00019935993167154487, + "loss": 3.1814894676208496, + "step": 2874, + "token_acc": 0.2791355754550869 + }, + { + "epoch": 1.6854294928173557, + "grad_norm": 0.535513402830021, + "learning_rate": 0.0001993588363681574, + "loss": 3.1777539253234863, + "step": 2875, + "token_acc": 0.2805452131121931 + }, + { + "epoch": 1.6860158311345645, + "grad_norm": 0.5640000929035263, + "learning_rate": 0.0001993577401314275, + "loss": 3.195171594619751, + "step": 2876, + "token_acc": 0.2782568544991382 + }, + { + "epoch": 1.6866021694517737, + "grad_norm": 0.48746766377487216, + "learning_rate": 0.00019935664296136555, + "loss": 3.2331278324127197, + "step": 2877, + "token_acc": 0.27207686175189627 + }, + { + "epoch": 1.6871885077689828, + "grad_norm": 0.5287358491402241, + "learning_rate": 0.00019935554485798183, + "loss": 3.1588358879089355, + "step": 2878, + "token_acc": 0.2815789021012874 + }, + { + "epoch": 1.6877748460861919, + "grad_norm": 0.5550813944975893, + "learning_rate": 0.00019935444582128663, + "loss": 3.1451661586761475, + "step": 2879, + "token_acc": 0.2844005191578331 + }, + { + "epoch": 1.6883611844034008, + "grad_norm": 0.4958963197967804, + "learning_rate": 0.0001993533458512903, + "loss": 3.176665782928467, + "step": 2880, + "token_acc": 0.27947364202849184 + }, + { + "epoch": 1.6889475227206097, + "grad_norm": 0.4833627588146955, + "learning_rate": 0.00019935224494800315, + "loss": 3.151298999786377, + "step": 2881, + "token_acc": 0.28536362463160836 + }, + { + "epoch": 1.6895338610378188, + "grad_norm": 0.48442665402806734, + "learning_rate": 0.00019935114311143558, + "loss": 3.200427532196045, + "step": 2882, + "token_acc": 0.2765808244412231 + }, + { + "epoch": 1.6901201993550279, + "grad_norm": 0.4945639896516093, + "learning_rate": 0.00019935004034159787, + "loss": 3.172539234161377, + "step": 2883, + "token_acc": 0.2806612085212444 + }, + { + "epoch": 1.690706537672237, + "grad_norm": 0.5308268897978228, + "learning_rate": 0.00019934893663850042, + "loss": 3.1641626358032227, + "step": 2884, + "token_acc": 0.2804209247143987 + }, + { + "epoch": 1.6912928759894459, + "grad_norm": 0.42741321845992697, + "learning_rate": 0.00019934783200215356, + "loss": 3.199472188949585, + "step": 2885, + "token_acc": 0.2758055863710463 + }, + { + "epoch": 1.6918792143066548, + "grad_norm": 0.4461637606371874, + "learning_rate": 0.00019934672643256768, + "loss": 3.1667284965515137, + "step": 2886, + "token_acc": 0.2798052557547137 + }, + { + "epoch": 1.692465552623864, + "grad_norm": 0.37347174832680125, + "learning_rate": 0.00019934561992975323, + "loss": 3.148986577987671, + "step": 2887, + "token_acc": 0.2835601598121861 + }, + { + "epoch": 1.693051890941073, + "grad_norm": 0.4827196511133989, + "learning_rate": 0.0001993445124937205, + "loss": 3.1320087909698486, + "step": 2888, + "token_acc": 0.2863869140782155 + }, + { + "epoch": 1.6936382292582821, + "grad_norm": 0.5928113365653325, + "learning_rate": 0.00019934340412448, + "loss": 3.104896068572998, + "step": 2889, + "token_acc": 0.2885311382212618 + }, + { + "epoch": 1.6942245675754912, + "grad_norm": 0.5769506545441206, + "learning_rate": 0.00019934229482204203, + "loss": 3.1635777950286865, + "step": 2890, + "token_acc": 0.2805257529761754 + }, + { + "epoch": 1.6948109058927001, + "grad_norm": 0.5741931261972979, + "learning_rate": 0.00019934118458641708, + "loss": 3.188312292098999, + "step": 2891, + "token_acc": 0.27832153907422724 + }, + { + "epoch": 1.695397244209909, + "grad_norm": 0.5481898046958634, + "learning_rate": 0.00019934007341761557, + "loss": 3.2050981521606445, + "step": 2892, + "token_acc": 0.2754472905460247 + }, + { + "epoch": 1.6959835825271181, + "grad_norm": 0.5889394511835178, + "learning_rate": 0.00019933896131564796, + "loss": 3.191642999649048, + "step": 2893, + "token_acc": 0.27666597242910207 + }, + { + "epoch": 1.6965699208443272, + "grad_norm": 0.5215155630058546, + "learning_rate": 0.00019933784828052463, + "loss": 3.182464838027954, + "step": 2894, + "token_acc": 0.27765188553808584 + }, + { + "epoch": 1.6971562591615363, + "grad_norm": 0.4124214757736865, + "learning_rate": 0.0001993367343122561, + "loss": 3.1560251712799072, + "step": 2895, + "token_acc": 0.28177326663083474 + }, + { + "epoch": 1.6977425974787452, + "grad_norm": 0.4565107019668311, + "learning_rate": 0.0001993356194108528, + "loss": 3.163017749786377, + "step": 2896, + "token_acc": 0.27998352129357845 + }, + { + "epoch": 1.6983289357959541, + "grad_norm": 0.46896999094016806, + "learning_rate": 0.00019933450357632518, + "loss": 3.179335594177246, + "step": 2897, + "token_acc": 0.27935651549550733 + }, + { + "epoch": 1.6989152741131632, + "grad_norm": 0.54030386956148, + "learning_rate": 0.0001993333868086838, + "loss": 3.145297050476074, + "step": 2898, + "token_acc": 0.28323526192623016 + }, + { + "epoch": 1.6995016124303723, + "grad_norm": 0.5992877342188514, + "learning_rate": 0.00019933226910793907, + "loss": 3.1982274055480957, + "step": 2899, + "token_acc": 0.27621170065122236 + }, + { + "epoch": 1.7000879507475815, + "grad_norm": 0.4978431391472942, + "learning_rate": 0.00019933115047410157, + "loss": 3.1967391967773438, + "step": 2900, + "token_acc": 0.2771879983383139 + }, + { + "epoch": 1.7006742890647903, + "grad_norm": 0.3541573185415294, + "learning_rate": 0.0001993300309071817, + "loss": 3.1787989139556885, + "step": 2901, + "token_acc": 0.278938959475094 + }, + { + "epoch": 1.7012606273819995, + "grad_norm": 0.5585327563935711, + "learning_rate": 0.0001993289104071901, + "loss": 3.211782217025757, + "step": 2902, + "token_acc": 0.2758927825142093 + }, + { + "epoch": 1.7018469656992083, + "grad_norm": 0.472150796169749, + "learning_rate": 0.00019932778897413717, + "loss": 3.1785478591918945, + "step": 2903, + "token_acc": 0.2790779318136225 + }, + { + "epoch": 1.7024333040164175, + "grad_norm": 0.4999137610265311, + "learning_rate": 0.00019932666660803355, + "loss": 3.155221939086914, + "step": 2904, + "token_acc": 0.28294798827534534 + }, + { + "epoch": 1.7030196423336266, + "grad_norm": 0.4005926558521058, + "learning_rate": 0.00019932554330888972, + "loss": 3.183867931365967, + "step": 2905, + "token_acc": 0.2772364740503499 + }, + { + "epoch": 1.7036059806508357, + "grad_norm": 0.4284079358266169, + "learning_rate": 0.00019932441907671627, + "loss": 3.149411201477051, + "step": 2906, + "token_acc": 0.2830999812065401 + }, + { + "epoch": 1.7041923189680446, + "grad_norm": 0.41098272253810497, + "learning_rate": 0.00019932329391152376, + "loss": 3.1500651836395264, + "step": 2907, + "token_acc": 0.2852632483842481 + }, + { + "epoch": 1.7047786572852535, + "grad_norm": 0.4161714813761226, + "learning_rate": 0.00019932216781332274, + "loss": 3.133920669555664, + "step": 2908, + "token_acc": 0.28551931357716437 + }, + { + "epoch": 1.7053649956024626, + "grad_norm": 0.5852421122953465, + "learning_rate": 0.00019932104078212377, + "loss": 3.20621395111084, + "step": 2909, + "token_acc": 0.27520247352269317 + }, + { + "epoch": 1.7059513339196717, + "grad_norm": 0.5208252732729349, + "learning_rate": 0.00019931991281793747, + "loss": 3.199059009552002, + "step": 2910, + "token_acc": 0.276131851543956 + }, + { + "epoch": 1.7065376722368808, + "grad_norm": 0.43240647867487997, + "learning_rate": 0.0001993187839207744, + "loss": 3.231233596801758, + "step": 2911, + "token_acc": 0.27231693415137115 + }, + { + "epoch": 1.7071240105540897, + "grad_norm": 0.497565831245306, + "learning_rate": 0.00019931765409064522, + "loss": 3.1517786979675293, + "step": 2912, + "token_acc": 0.2827456479538712 + }, + { + "epoch": 1.7077103488712986, + "grad_norm": 0.5521926148334956, + "learning_rate": 0.0001993165233275605, + "loss": 3.1693263053894043, + "step": 2913, + "token_acc": 0.2823290453622207 + }, + { + "epoch": 1.7082966871885077, + "grad_norm": 0.47795892042186205, + "learning_rate": 0.00019931539163153087, + "loss": 3.1930737495422363, + "step": 2914, + "token_acc": 0.2781994977883777 + }, + { + "epoch": 1.7088830255057168, + "grad_norm": 0.525501649135695, + "learning_rate": 0.000199314259002567, + "loss": 3.158327102661133, + "step": 2915, + "token_acc": 0.2821428660696209 + }, + { + "epoch": 1.709469363822926, + "grad_norm": 0.5044557197640296, + "learning_rate": 0.00019931312544067944, + "loss": 3.150179386138916, + "step": 2916, + "token_acc": 0.2823472966088675 + }, + { + "epoch": 1.7100557021401348, + "grad_norm": 0.45471575655936647, + "learning_rate": 0.00019931199094587893, + "loss": 3.1563398838043213, + "step": 2917, + "token_acc": 0.2827311188241892 + }, + { + "epoch": 1.710642040457344, + "grad_norm": 0.4285700704788987, + "learning_rate": 0.00019931085551817606, + "loss": 3.167116641998291, + "step": 2918, + "token_acc": 0.28129183864371565 + }, + { + "epoch": 1.7112283787745528, + "grad_norm": 0.3865481749065967, + "learning_rate": 0.00019930971915758155, + "loss": 3.2068588733673096, + "step": 2919, + "token_acc": 0.27631501094609784 + }, + { + "epoch": 1.711814717091762, + "grad_norm": 0.3600539480078632, + "learning_rate": 0.00019930858186410606, + "loss": 3.141140937805176, + "step": 2920, + "token_acc": 0.2842637126483831 + }, + { + "epoch": 1.712401055408971, + "grad_norm": 0.4203679226575949, + "learning_rate": 0.00019930744363776023, + "loss": 3.1812448501586914, + "step": 2921, + "token_acc": 0.27773104633727097 + }, + { + "epoch": 1.7129873937261801, + "grad_norm": 0.45423744609617117, + "learning_rate": 0.00019930630447855482, + "loss": 3.1945416927337646, + "step": 2922, + "token_acc": 0.27645626667357354 + }, + { + "epoch": 1.713573732043389, + "grad_norm": 0.590883518978047, + "learning_rate": 0.00019930516438650047, + "loss": 3.165205240249634, + "step": 2923, + "token_acc": 0.2807734240565356 + }, + { + "epoch": 1.714160070360598, + "grad_norm": 0.6288219517397439, + "learning_rate": 0.00019930402336160792, + "loss": 3.1942429542541504, + "step": 2924, + "token_acc": 0.2786602284797641 + }, + { + "epoch": 1.714746408677807, + "grad_norm": 0.5029922238265038, + "learning_rate": 0.0001993028814038879, + "loss": 3.18017578125, + "step": 2925, + "token_acc": 0.2822263903759493 + }, + { + "epoch": 1.7153327469950161, + "grad_norm": 0.431579024019595, + "learning_rate": 0.0001993017385133511, + "loss": 3.1377241611480713, + "step": 2926, + "token_acc": 0.28584441373423175 + }, + { + "epoch": 1.7159190853122253, + "grad_norm": 0.4072035207708839, + "learning_rate": 0.00019930059469000828, + "loss": 3.139044761657715, + "step": 2927, + "token_acc": 0.28454719632136927 + }, + { + "epoch": 1.7165054236294341, + "grad_norm": 0.4686421589854534, + "learning_rate": 0.0001992994499338702, + "loss": 3.217597007751465, + "step": 2928, + "token_acc": 0.2740496111149091 + }, + { + "epoch": 1.7170917619466433, + "grad_norm": 0.49424836300744446, + "learning_rate": 0.00019929830424494758, + "loss": 3.1672630310058594, + "step": 2929, + "token_acc": 0.27884598131320254 + }, + { + "epoch": 1.7176781002638521, + "grad_norm": 0.3828704871206606, + "learning_rate": 0.00019929715762325118, + "loss": 3.1571407318115234, + "step": 2930, + "token_acc": 0.28220529348569195 + }, + { + "epoch": 1.7182644385810613, + "grad_norm": 0.4249963223417634, + "learning_rate": 0.0001992960100687918, + "loss": 3.155071258544922, + "step": 2931, + "token_acc": 0.28383223243134875 + }, + { + "epoch": 1.7188507768982704, + "grad_norm": 0.43678675139626283, + "learning_rate": 0.00019929486158158026, + "loss": 3.1963229179382324, + "step": 2932, + "token_acc": 0.2767800938601923 + }, + { + "epoch": 1.7194371152154795, + "grad_norm": 0.4222783201707346, + "learning_rate": 0.00019929371216162724, + "loss": 3.1996073722839355, + "step": 2933, + "token_acc": 0.2763362212060528 + }, + { + "epoch": 1.7200234535326884, + "grad_norm": 0.40576027123520764, + "learning_rate": 0.00019929256180894363, + "loss": 3.219005584716797, + "step": 2934, + "token_acc": 0.2743520359089452 + }, + { + "epoch": 1.7206097918498973, + "grad_norm": 0.4079332691990933, + "learning_rate": 0.00019929141052354017, + "loss": 3.1944832801818848, + "step": 2935, + "token_acc": 0.27836408198721796 + }, + { + "epoch": 1.7211961301671064, + "grad_norm": 0.40567734116732407, + "learning_rate": 0.00019929025830542772, + "loss": 3.1318769454956055, + "step": 2936, + "token_acc": 0.28480665580345954 + }, + { + "epoch": 1.7217824684843155, + "grad_norm": 0.4677194849468562, + "learning_rate": 0.00019928910515461707, + "loss": 3.127445697784424, + "step": 2937, + "token_acc": 0.28622995232179343 + }, + { + "epoch": 1.7223688068015246, + "grad_norm": 0.5505240163025872, + "learning_rate": 0.0001992879510711191, + "loss": 3.1670050621032715, + "step": 2938, + "token_acc": 0.282949513554612 + }, + { + "epoch": 1.7229551451187335, + "grad_norm": 0.46898597172313217, + "learning_rate": 0.0001992867960549446, + "loss": 3.1783666610717773, + "step": 2939, + "token_acc": 0.27973292817793766 + }, + { + "epoch": 1.7235414834359424, + "grad_norm": 0.4200944151114649, + "learning_rate": 0.00019928564010610446, + "loss": 3.1799914836883545, + "step": 2940, + "token_acc": 0.2791474867453043 + }, + { + "epoch": 1.7241278217531515, + "grad_norm": 0.47003934108964335, + "learning_rate": 0.0001992844832246095, + "loss": 3.1871261596679688, + "step": 2941, + "token_acc": 0.27674479797624535 + }, + { + "epoch": 1.7247141600703606, + "grad_norm": 0.4548504319968893, + "learning_rate": 0.00019928332541047062, + "loss": 3.145134210586548, + "step": 2942, + "token_acc": 0.2828656992456029 + }, + { + "epoch": 1.7253004983875697, + "grad_norm": 0.5929313399835197, + "learning_rate": 0.00019928216666369866, + "loss": 3.223623275756836, + "step": 2943, + "token_acc": 0.27338482728679925 + }, + { + "epoch": 1.7258868367047786, + "grad_norm": 0.4946492615337616, + "learning_rate": 0.00019928100698430457, + "loss": 3.1497507095336914, + "step": 2944, + "token_acc": 0.28352766821664094 + }, + { + "epoch": 1.7264731750219877, + "grad_norm": 0.44667284874524504, + "learning_rate": 0.00019927984637229916, + "loss": 3.1424336433410645, + "step": 2945, + "token_acc": 0.28461872035112157 + }, + { + "epoch": 1.7270595133391966, + "grad_norm": 0.4875574370392208, + "learning_rate": 0.0001992786848276934, + "loss": 3.139509677886963, + "step": 2946, + "token_acc": 0.28392719261490457 + }, + { + "epoch": 1.7276458516564057, + "grad_norm": 0.5383482075781147, + "learning_rate": 0.00019927752235049818, + "loss": 3.210911512374878, + "step": 2947, + "token_acc": 0.2761984728818451 + }, + { + "epoch": 1.7282321899736148, + "grad_norm": 0.5675300881849085, + "learning_rate": 0.00019927635894072441, + "loss": 3.204254388809204, + "step": 2948, + "token_acc": 0.2760426158997297 + }, + { + "epoch": 1.728818528290824, + "grad_norm": 0.5185310735844899, + "learning_rate": 0.000199275194598383, + "loss": 3.1933059692382812, + "step": 2949, + "token_acc": 0.2753251075158714 + }, + { + "epoch": 1.7294048666080328, + "grad_norm": 0.44282516835485364, + "learning_rate": 0.00019927402932348495, + "loss": 3.1646814346313477, + "step": 2950, + "token_acc": 0.2821495417370398 + }, + { + "epoch": 1.7299912049252417, + "grad_norm": 0.6059149019326739, + "learning_rate": 0.00019927286311604116, + "loss": 3.1785902976989746, + "step": 2951, + "token_acc": 0.2787837699549734 + }, + { + "epoch": 1.7305775432424508, + "grad_norm": 0.6009788122075265, + "learning_rate": 0.00019927169597606259, + "loss": 3.1773481369018555, + "step": 2952, + "token_acc": 0.2785320789447859 + }, + { + "epoch": 1.73116388155966, + "grad_norm": 0.4108489499660139, + "learning_rate": 0.00019927052790356018, + "loss": 3.190739154815674, + "step": 2953, + "token_acc": 0.2777231600696069 + }, + { + "epoch": 1.731750219876869, + "grad_norm": 0.5943709731097288, + "learning_rate": 0.00019926935889854496, + "loss": 3.1499979496002197, + "step": 2954, + "token_acc": 0.28312958305897784 + }, + { + "epoch": 1.732336558194078, + "grad_norm": 0.5570602788999452, + "learning_rate": 0.00019926818896102785, + "loss": 3.1789417266845703, + "step": 2955, + "token_acc": 0.2800467377901401 + }, + { + "epoch": 1.732922896511287, + "grad_norm": 0.45445196617147793, + "learning_rate": 0.0001992670180910199, + "loss": 3.168210983276367, + "step": 2956, + "token_acc": 0.2819754329651425 + }, + { + "epoch": 1.733509234828496, + "grad_norm": 0.5279809570234126, + "learning_rate": 0.00019926584628853207, + "loss": 3.1595590114593506, + "step": 2957, + "token_acc": 0.2831381109572995 + }, + { + "epoch": 1.734095573145705, + "grad_norm": 0.44546732674888434, + "learning_rate": 0.00019926467355357538, + "loss": 3.141796112060547, + "step": 2958, + "token_acc": 0.28208896778572545 + }, + { + "epoch": 1.7346819114629142, + "grad_norm": 0.5877392437830491, + "learning_rate": 0.00019926349988616085, + "loss": 3.2119336128234863, + "step": 2959, + "token_acc": 0.2753157357290625 + }, + { + "epoch": 1.7352682497801233, + "grad_norm": 0.607689834386909, + "learning_rate": 0.0001992623252862995, + "loss": 3.2077219486236572, + "step": 2960, + "token_acc": 0.2747951398700198 + }, + { + "epoch": 1.7358545880973322, + "grad_norm": 0.464827305686298, + "learning_rate": 0.00019926114975400233, + "loss": 3.1841320991516113, + "step": 2961, + "token_acc": 0.27840309944306885 + }, + { + "epoch": 1.736440926414541, + "grad_norm": 0.5245681655155779, + "learning_rate": 0.00019925997328928044, + "loss": 3.1783597469329834, + "step": 2962, + "token_acc": 0.27932300814306243 + }, + { + "epoch": 1.7370272647317502, + "grad_norm": 0.4342385783593552, + "learning_rate": 0.00019925879589214484, + "loss": 3.1499011516571045, + "step": 2963, + "token_acc": 0.2833291091944262 + }, + { + "epoch": 1.7376136030489593, + "grad_norm": 0.5632735135512961, + "learning_rate": 0.00019925761756260662, + "loss": 3.15573787689209, + "step": 2964, + "token_acc": 0.28356300499437803 + }, + { + "epoch": 1.7381999413661684, + "grad_norm": 0.6530433317404106, + "learning_rate": 0.00019925643830067684, + "loss": 3.247023105621338, + "step": 2965, + "token_acc": 0.26976083718972377 + }, + { + "epoch": 1.7387862796833773, + "grad_norm": 0.5584649856214898, + "learning_rate": 0.00019925525810636654, + "loss": 3.2268121242523193, + "step": 2966, + "token_acc": 0.2721100807782146 + }, + { + "epoch": 1.7393726180005862, + "grad_norm": 0.5187481398812246, + "learning_rate": 0.00019925407697968687, + "loss": 3.2307956218719482, + "step": 2967, + "token_acc": 0.2724502197370537 + }, + { + "epoch": 1.7399589563177953, + "grad_norm": 0.4993459761634379, + "learning_rate": 0.00019925289492064887, + "loss": 3.14756441116333, + "step": 2968, + "token_acc": 0.284447056937505 + }, + { + "epoch": 1.7405452946350044, + "grad_norm": 0.5812028092626107, + "learning_rate": 0.00019925171192926368, + "loss": 3.1684460639953613, + "step": 2969, + "token_acc": 0.2806512964356571 + }, + { + "epoch": 1.7411316329522135, + "grad_norm": 0.4636865440896602, + "learning_rate": 0.0001992505280055424, + "loss": 3.179443120956421, + "step": 2970, + "token_acc": 0.2778191494901732 + }, + { + "epoch": 1.7417179712694224, + "grad_norm": 0.5199778602800091, + "learning_rate": 0.00019924934314949615, + "loss": 3.167562961578369, + "step": 2971, + "token_acc": 0.2773112278059198 + }, + { + "epoch": 1.7423043095866315, + "grad_norm": 0.5894125515202733, + "learning_rate": 0.00019924815736113604, + "loss": 3.16648530960083, + "step": 2972, + "token_acc": 0.279594569755534 + }, + { + "epoch": 1.7428906479038404, + "grad_norm": 0.6045610857425243, + "learning_rate": 0.00019924697064047325, + "loss": 3.2132866382598877, + "step": 2973, + "token_acc": 0.2751497635596008 + }, + { + "epoch": 1.7434769862210495, + "grad_norm": 0.5335964002396725, + "learning_rate": 0.00019924578298751892, + "loss": 3.1057212352752686, + "step": 2974, + "token_acc": 0.2878994852723486 + }, + { + "epoch": 1.7440633245382586, + "grad_norm": 0.4881282210096283, + "learning_rate": 0.00019924459440228418, + "loss": 3.102341890335083, + "step": 2975, + "token_acc": 0.2914280976833117 + }, + { + "epoch": 1.7446496628554677, + "grad_norm": 0.4900026835507201, + "learning_rate": 0.0001992434048847802, + "loss": 3.1748886108398438, + "step": 2976, + "token_acc": 0.2813820432445144 + }, + { + "epoch": 1.7452360011726766, + "grad_norm": 0.4473743388228127, + "learning_rate": 0.0001992422144350182, + "loss": 3.161172389984131, + "step": 2977, + "token_acc": 0.28295660782049165 + }, + { + "epoch": 1.7458223394898855, + "grad_norm": 0.3869530721282537, + "learning_rate": 0.0001992410230530093, + "loss": 3.1615891456604004, + "step": 2978, + "token_acc": 0.28125982721989706 + }, + { + "epoch": 1.7464086778070946, + "grad_norm": 0.5210363465722914, + "learning_rate": 0.0001992398307387647, + "loss": 3.178642511367798, + "step": 2979, + "token_acc": 0.2799093771803463 + }, + { + "epoch": 1.7469950161243037, + "grad_norm": 0.49233621170987435, + "learning_rate": 0.00019923863749229565, + "loss": 3.1947317123413086, + "step": 2980, + "token_acc": 0.2767130119697093 + }, + { + "epoch": 1.7475813544415129, + "grad_norm": 0.5036711434451157, + "learning_rate": 0.0001992374433136133, + "loss": 3.186971664428711, + "step": 2981, + "token_acc": 0.2781111932674747 + }, + { + "epoch": 1.7481676927587217, + "grad_norm": 0.5637417984398475, + "learning_rate": 0.00019923624820272892, + "loss": 3.1288182735443115, + "step": 2982, + "token_acc": 0.2842603793982137 + }, + { + "epoch": 1.7487540310759309, + "grad_norm": 0.45974986236021675, + "learning_rate": 0.0001992350521596537, + "loss": 3.1544909477233887, + "step": 2983, + "token_acc": 0.2832700126997001 + }, + { + "epoch": 1.7493403693931397, + "grad_norm": 0.4974945816916409, + "learning_rate": 0.00019923385518439888, + "loss": 3.1807093620300293, + "step": 2984, + "token_acc": 0.276837415890835 + }, + { + "epoch": 1.7499267077103489, + "grad_norm": 0.5223041141785619, + "learning_rate": 0.00019923265727697572, + "loss": 3.149597644805908, + "step": 2985, + "token_acc": 0.2835854969595085 + }, + { + "epoch": 1.750513046027558, + "grad_norm": 0.4795707459014642, + "learning_rate": 0.00019923145843739546, + "loss": 3.155219793319702, + "step": 2986, + "token_acc": 0.282635992248865 + }, + { + "epoch": 1.751099384344767, + "grad_norm": 0.4800813183042214, + "learning_rate": 0.00019923025866566934, + "loss": 3.1162140369415283, + "step": 2987, + "token_acc": 0.28655299941605683 + }, + { + "epoch": 1.751685722661976, + "grad_norm": 0.47063526100676, + "learning_rate": 0.00019922905796180868, + "loss": 3.1883671283721924, + "step": 2988, + "token_acc": 0.2774695374265878 + }, + { + "epoch": 1.7522720609791849, + "grad_norm": 0.4740801150819172, + "learning_rate": 0.00019922785632582474, + "loss": 3.1551594734191895, + "step": 2989, + "token_acc": 0.28174274394414545 + }, + { + "epoch": 1.752858399296394, + "grad_norm": 0.42031980860106155, + "learning_rate": 0.00019922665375772877, + "loss": 3.0999274253845215, + "step": 2990, + "token_acc": 0.2921123869663792 + }, + { + "epoch": 1.753444737613603, + "grad_norm": 0.4279337941873416, + "learning_rate": 0.0001992254502575321, + "loss": 3.1406030654907227, + "step": 2991, + "token_acc": 0.28388867436032333 + }, + { + "epoch": 1.7540310759308122, + "grad_norm": 0.4822528199621917, + "learning_rate": 0.00019922424582524605, + "loss": 3.140556573867798, + "step": 2992, + "token_acc": 0.2823709581803621 + }, + { + "epoch": 1.754617414248021, + "grad_norm": 0.46709560319205784, + "learning_rate": 0.00019922304046088193, + "loss": 3.17539644241333, + "step": 2993, + "token_acc": 0.2804816219702862 + }, + { + "epoch": 1.75520375256523, + "grad_norm": 0.4735641819553356, + "learning_rate": 0.00019922183416445105, + "loss": 3.187039375305176, + "step": 2994, + "token_acc": 0.2771921039453918 + }, + { + "epoch": 1.755790090882439, + "grad_norm": 0.4243697906927226, + "learning_rate": 0.0001992206269359647, + "loss": 3.1152756214141846, + "step": 2995, + "token_acc": 0.286199438614131 + }, + { + "epoch": 1.7563764291996482, + "grad_norm": 0.3803997306002624, + "learning_rate": 0.0001992194187754343, + "loss": 3.159212827682495, + "step": 2996, + "token_acc": 0.28233056531132567 + }, + { + "epoch": 1.7569627675168573, + "grad_norm": 0.47718705361028896, + "learning_rate": 0.00019921820968287114, + "loss": 3.1570539474487305, + "step": 2997, + "token_acc": 0.2809807326764675 + }, + { + "epoch": 1.7575491058340662, + "grad_norm": 0.4334016567262529, + "learning_rate": 0.00019921699965828662, + "loss": 3.221966505050659, + "step": 2998, + "token_acc": 0.2720114493884986 + }, + { + "epoch": 1.7581354441512753, + "grad_norm": 0.4224043239909534, + "learning_rate": 0.00019921578870169207, + "loss": 3.139960289001465, + "step": 2999, + "token_acc": 0.2831893670949925 + }, + { + "epoch": 1.7587217824684842, + "grad_norm": 0.5760770644554536, + "learning_rate": 0.0001992145768130989, + "loss": 3.186737537384033, + "step": 3000, + "token_acc": 0.2788789185291265 + }, + { + "epoch": 1.7593081207856933, + "grad_norm": 0.5278759212283373, + "learning_rate": 0.00019921336399251845, + "loss": 3.180309295654297, + "step": 3001, + "token_acc": 0.27854078742595695 + }, + { + "epoch": 1.7598944591029024, + "grad_norm": 0.5175272863777317, + "learning_rate": 0.00019921215023996214, + "loss": 3.1663174629211426, + "step": 3002, + "token_acc": 0.2823732731222378 + }, + { + "epoch": 1.7604807974201115, + "grad_norm": 0.5391112315476505, + "learning_rate": 0.00019921093555544137, + "loss": 3.1592674255371094, + "step": 3003, + "token_acc": 0.2818145646797544 + }, + { + "epoch": 1.7610671357373204, + "grad_norm": 0.36222608133305795, + "learning_rate": 0.00019920971993896754, + "loss": 3.1571292877197266, + "step": 3004, + "token_acc": 0.28130298056669434 + }, + { + "epoch": 1.7616534740545293, + "grad_norm": 0.4627275609861067, + "learning_rate": 0.0001992085033905521, + "loss": 3.1610560417175293, + "step": 3005, + "token_acc": 0.2795801400064243 + }, + { + "epoch": 1.7622398123717384, + "grad_norm": 0.5197302972298847, + "learning_rate": 0.00019920728591020644, + "loss": 3.1341347694396973, + "step": 3006, + "token_acc": 0.2844290169037867 + }, + { + "epoch": 1.7628261506889475, + "grad_norm": 0.3826671986522429, + "learning_rate": 0.000199206067497942, + "loss": 3.1703736782073975, + "step": 3007, + "token_acc": 0.2788192245630119 + }, + { + "epoch": 1.7634124890061567, + "grad_norm": 0.3791269480787098, + "learning_rate": 0.00019920484815377028, + "loss": 3.132236957550049, + "step": 3008, + "token_acc": 0.2844655062234226 + }, + { + "epoch": 1.7639988273233655, + "grad_norm": 0.46593906643907695, + "learning_rate": 0.00019920362787770267, + "loss": 3.167722702026367, + "step": 3009, + "token_acc": 0.28091726384364823 + }, + { + "epoch": 1.7645851656405747, + "grad_norm": 0.5206506000069724, + "learning_rate": 0.00019920240666975063, + "loss": 3.1813013553619385, + "step": 3010, + "token_acc": 0.27919728675133726 + }, + { + "epoch": 1.7651715039577835, + "grad_norm": 0.6684871069170841, + "learning_rate": 0.00019920118452992566, + "loss": 3.1811587810516357, + "step": 3011, + "token_acc": 0.2798628190413729 + }, + { + "epoch": 1.7657578422749927, + "grad_norm": 0.4810106260010527, + "learning_rate": 0.00019919996145823928, + "loss": 3.149163246154785, + "step": 3012, + "token_acc": 0.28248165787102586 + }, + { + "epoch": 1.7663441805922018, + "grad_norm": 0.4199080589424211, + "learning_rate": 0.0001991987374547029, + "loss": 3.176048755645752, + "step": 3013, + "token_acc": 0.28029436286693443 + }, + { + "epoch": 1.7669305189094109, + "grad_norm": 0.656167772073662, + "learning_rate": 0.00019919751251932805, + "loss": 3.181730270385742, + "step": 3014, + "token_acc": 0.2788619373725757 + }, + { + "epoch": 1.7675168572266198, + "grad_norm": 0.6365634301450452, + "learning_rate": 0.00019919628665212625, + "loss": 3.1575212478637695, + "step": 3015, + "token_acc": 0.2834912413841664 + }, + { + "epoch": 1.7681031955438287, + "grad_norm": 0.6061216058130342, + "learning_rate": 0.00019919505985310903, + "loss": 3.1693007946014404, + "step": 3016, + "token_acc": 0.28168481568282167 + }, + { + "epoch": 1.7686895338610378, + "grad_norm": 0.5224879460581037, + "learning_rate": 0.00019919383212228787, + "loss": 3.177699089050293, + "step": 3017, + "token_acc": 0.27810073050913203 + }, + { + "epoch": 1.7692758721782469, + "grad_norm": 0.4708567621285417, + "learning_rate": 0.00019919260345967432, + "loss": 3.159008264541626, + "step": 3018, + "token_acc": 0.2821663586934847 + }, + { + "epoch": 1.769862210495456, + "grad_norm": 0.569703262239816, + "learning_rate": 0.00019919137386527992, + "loss": 3.1542842388153076, + "step": 3019, + "token_acc": 0.28311320853762245 + }, + { + "epoch": 1.770448548812665, + "grad_norm": 0.5081818788344902, + "learning_rate": 0.0001991901433391162, + "loss": 3.203446626663208, + "step": 3020, + "token_acc": 0.2757139325009272 + }, + { + "epoch": 1.7710348871298738, + "grad_norm": 0.37851780497659443, + "learning_rate": 0.00019918891188119479, + "loss": 3.192237615585327, + "step": 3021, + "token_acc": 0.27776048751838484 + }, + { + "epoch": 1.771621225447083, + "grad_norm": 0.5072145605374052, + "learning_rate": 0.0001991876794915272, + "loss": 3.1420278549194336, + "step": 3022, + "token_acc": 0.28276179634438037 + }, + { + "epoch": 1.772207563764292, + "grad_norm": 0.42189941961971417, + "learning_rate": 0.000199186446170125, + "loss": 3.1737537384033203, + "step": 3023, + "token_acc": 0.279103261384525 + }, + { + "epoch": 1.7727939020815011, + "grad_norm": 0.43456976457360336, + "learning_rate": 0.00019918521191699982, + "loss": 3.1521992683410645, + "step": 3024, + "token_acc": 0.2824392609228256 + }, + { + "epoch": 1.77338024039871, + "grad_norm": 0.5967042142888881, + "learning_rate": 0.00019918397673216321, + "loss": 3.148320198059082, + "step": 3025, + "token_acc": 0.2832938618237096 + }, + { + "epoch": 1.7739665787159191, + "grad_norm": 0.5760626797180998, + "learning_rate": 0.0001991827406156268, + "loss": 3.2021431922912598, + "step": 3026, + "token_acc": 0.2751859699966751 + }, + { + "epoch": 1.774552917033128, + "grad_norm": 0.5881445536292035, + "learning_rate": 0.0001991815035674022, + "loss": 3.160079002380371, + "step": 3027, + "token_acc": 0.28160910509831033 + }, + { + "epoch": 1.7751392553503371, + "grad_norm": 0.6072959336180391, + "learning_rate": 0.000199180265587501, + "loss": 3.178093671798706, + "step": 3028, + "token_acc": 0.2801709860088017 + }, + { + "epoch": 1.7757255936675462, + "grad_norm": 0.4275048324707711, + "learning_rate": 0.00019917902667593486, + "loss": 3.1129212379455566, + "step": 3029, + "token_acc": 0.289480570456627 + }, + { + "epoch": 1.7763119319847553, + "grad_norm": 0.62312483881686, + "learning_rate": 0.00019917778683271542, + "loss": 3.1683366298675537, + "step": 3030, + "token_acc": 0.27920484358164543 + }, + { + "epoch": 1.7768982703019642, + "grad_norm": 0.5855244741365908, + "learning_rate": 0.0001991765460578543, + "loss": 3.1667299270629883, + "step": 3031, + "token_acc": 0.2795283266566668 + }, + { + "epoch": 1.7774846086191731, + "grad_norm": 0.4476736547873714, + "learning_rate": 0.00019917530435136315, + "loss": 3.1795425415039062, + "step": 3032, + "token_acc": 0.2780890166001695 + }, + { + "epoch": 1.7780709469363822, + "grad_norm": 0.4544646699813525, + "learning_rate": 0.00019917406171325372, + "loss": 3.1279683113098145, + "step": 3033, + "token_acc": 0.28464937491018827 + }, + { + "epoch": 1.7786572852535913, + "grad_norm": 0.5585388899459923, + "learning_rate": 0.00019917281814353757, + "loss": 3.1551103591918945, + "step": 3034, + "token_acc": 0.2820819062360247 + }, + { + "epoch": 1.7792436235708005, + "grad_norm": 0.5288719587818126, + "learning_rate": 0.00019917157364222646, + "loss": 3.1912384033203125, + "step": 3035, + "token_acc": 0.27838537919125056 + }, + { + "epoch": 1.7798299618880093, + "grad_norm": 0.3976081046917742, + "learning_rate": 0.000199170328209332, + "loss": 3.155442953109741, + "step": 3036, + "token_acc": 0.281032449059833 + }, + { + "epoch": 1.7804163002052185, + "grad_norm": 0.4651851421845836, + "learning_rate": 0.000199169081844866, + "loss": 3.1420576572418213, + "step": 3037, + "token_acc": 0.2857071839539088 + }, + { + "epoch": 1.7810026385224274, + "grad_norm": 0.4282274246645864, + "learning_rate": 0.0001991678345488401, + "loss": 3.177009344100952, + "step": 3038, + "token_acc": 0.27881092756124404 + }, + { + "epoch": 1.7815889768396365, + "grad_norm": 0.4288496461862397, + "learning_rate": 0.000199166586321266, + "loss": 3.1417412757873535, + "step": 3039, + "token_acc": 0.2827716383923786 + }, + { + "epoch": 1.7821753151568456, + "grad_norm": 0.40504853086945547, + "learning_rate": 0.00019916533716215544, + "loss": 3.1637840270996094, + "step": 3040, + "token_acc": 0.27994391960460285 + }, + { + "epoch": 1.7827616534740547, + "grad_norm": 0.4298805277151954, + "learning_rate": 0.0001991640870715202, + "loss": 3.1684818267822266, + "step": 3041, + "token_acc": 0.28085395330924645 + }, + { + "epoch": 1.7833479917912636, + "grad_norm": 0.48612480313029516, + "learning_rate": 0.00019916283604937197, + "loss": 3.147237777709961, + "step": 3042, + "token_acc": 0.2839322341699118 + }, + { + "epoch": 1.7839343301084725, + "grad_norm": 0.4735647922796634, + "learning_rate": 0.00019916158409572253, + "loss": 3.201700448989868, + "step": 3043, + "token_acc": 0.2793902308678923 + }, + { + "epoch": 1.7845206684256816, + "grad_norm": 0.47738699892800107, + "learning_rate": 0.0001991603312105836, + "loss": 3.154893398284912, + "step": 3044, + "token_acc": 0.28147513754801207 + }, + { + "epoch": 1.7851070067428907, + "grad_norm": 0.5457774547215198, + "learning_rate": 0.00019915907739396702, + "loss": 3.14322566986084, + "step": 3045, + "token_acc": 0.2843496606461381 + }, + { + "epoch": 1.7856933450600998, + "grad_norm": 0.6286875772424948, + "learning_rate": 0.00019915782264588448, + "loss": 3.175854206085205, + "step": 3046, + "token_acc": 0.28110637141403133 + }, + { + "epoch": 1.7862796833773087, + "grad_norm": 0.4347377571502573, + "learning_rate": 0.00019915656696634787, + "loss": 3.20487904548645, + "step": 3047, + "token_acc": 0.27684753305095283 + }, + { + "epoch": 1.7868660216945176, + "grad_norm": 0.41599768451301034, + "learning_rate": 0.00019915531035536889, + "loss": 3.1952695846557617, + "step": 3048, + "token_acc": 0.27704563179423514 + }, + { + "epoch": 1.7874523600117267, + "grad_norm": 0.5451411727324161, + "learning_rate": 0.00019915405281295935, + "loss": 3.126847743988037, + "step": 3049, + "token_acc": 0.28492401262253875 + }, + { + "epoch": 1.7880386983289358, + "grad_norm": 0.5327343397423094, + "learning_rate": 0.00019915279433913114, + "loss": 3.1617894172668457, + "step": 3050, + "token_acc": 0.2805701092265341 + }, + { + "epoch": 1.788625036646145, + "grad_norm": 0.6607358887310494, + "learning_rate": 0.00019915153493389604, + "loss": 3.1562929153442383, + "step": 3051, + "token_acc": 0.2825911016558186 + }, + { + "epoch": 1.7892113749633538, + "grad_norm": 0.48660973076480024, + "learning_rate": 0.00019915027459726587, + "loss": 3.141599655151367, + "step": 3052, + "token_acc": 0.2835019790428629 + }, + { + "epoch": 1.789797713280563, + "grad_norm": 0.5029640909091398, + "learning_rate": 0.0001991490133292525, + "loss": 3.1314804553985596, + "step": 3053, + "token_acc": 0.28575136743628443 + }, + { + "epoch": 1.7903840515977718, + "grad_norm": 0.4483339364013523, + "learning_rate": 0.0001991477511298677, + "loss": 3.1960182189941406, + "step": 3054, + "token_acc": 0.2757918220285726 + }, + { + "epoch": 1.790970389914981, + "grad_norm": 0.4076418437939134, + "learning_rate": 0.00019914648799912343, + "loss": 3.1197569370269775, + "step": 3055, + "token_acc": 0.28692908517720045 + }, + { + "epoch": 1.79155672823219, + "grad_norm": 0.4656427706249111, + "learning_rate": 0.00019914522393703148, + "loss": 3.1805639266967773, + "step": 3056, + "token_acc": 0.2792051125278751 + }, + { + "epoch": 1.7921430665493991, + "grad_norm": 0.3995577284508524, + "learning_rate": 0.00019914395894360376, + "loss": 3.1196675300598145, + "step": 3057, + "token_acc": 0.287803298357192 + }, + { + "epoch": 1.792729404866608, + "grad_norm": 0.3999947543571872, + "learning_rate": 0.00019914269301885216, + "loss": 3.1162376403808594, + "step": 3058, + "token_acc": 0.28868699573169915 + }, + { + "epoch": 1.793315743183817, + "grad_norm": 0.4848404806778235, + "learning_rate": 0.00019914142616278853, + "loss": 3.151334524154663, + "step": 3059, + "token_acc": 0.2840709265143338 + }, + { + "epoch": 1.793902081501026, + "grad_norm": 0.5072766140544103, + "learning_rate": 0.0001991401583754248, + "loss": 3.2069458961486816, + "step": 3060, + "token_acc": 0.275489189146041 + }, + { + "epoch": 1.7944884198182351, + "grad_norm": 0.42656632984038606, + "learning_rate": 0.00019913888965677288, + "loss": 3.173333168029785, + "step": 3061, + "token_acc": 0.27925479802407005 + }, + { + "epoch": 1.7950747581354443, + "grad_norm": 0.5381640481886466, + "learning_rate": 0.0001991376200068447, + "loss": 3.1604881286621094, + "step": 3062, + "token_acc": 0.28115659502324025 + }, + { + "epoch": 1.7956610964526531, + "grad_norm": 0.6167835803124507, + "learning_rate": 0.00019913634942565214, + "loss": 3.2319087982177734, + "step": 3063, + "token_acc": 0.27010222636096465 + }, + { + "epoch": 1.7962474347698623, + "grad_norm": 0.5446378898704225, + "learning_rate": 0.00019913507791320716, + "loss": 3.204888343811035, + "step": 3064, + "token_acc": 0.27742048163064026 + }, + { + "epoch": 1.7968337730870712, + "grad_norm": 0.46935161188374286, + "learning_rate": 0.00019913380546952175, + "loss": 3.116142749786377, + "step": 3065, + "token_acc": 0.2882703049171321 + }, + { + "epoch": 1.7974201114042803, + "grad_norm": 0.4913748013302024, + "learning_rate": 0.0001991325320946078, + "loss": 3.1805639266967773, + "step": 3066, + "token_acc": 0.2785570981031671 + }, + { + "epoch": 1.7980064497214894, + "grad_norm": 0.4811261140490574, + "learning_rate": 0.00019913125778847727, + "loss": 3.1658692359924316, + "step": 3067, + "token_acc": 0.28080753602546915 + }, + { + "epoch": 1.7985927880386985, + "grad_norm": 0.5044322634295477, + "learning_rate": 0.00019912998255114218, + "loss": 3.2018842697143555, + "step": 3068, + "token_acc": 0.2766618925260341 + }, + { + "epoch": 1.7991791263559074, + "grad_norm": 0.4577689454026066, + "learning_rate": 0.0001991287063826145, + "loss": 3.218240261077881, + "step": 3069, + "token_acc": 0.27432852023635884 + }, + { + "epoch": 1.7997654646731163, + "grad_norm": 0.4946044488876774, + "learning_rate": 0.00019912742928290619, + "loss": 3.1451101303100586, + "step": 3070, + "token_acc": 0.2819843874202556 + }, + { + "epoch": 1.8003518029903254, + "grad_norm": 0.5411147607557574, + "learning_rate": 0.00019912615125202923, + "loss": 3.2268686294555664, + "step": 3071, + "token_acc": 0.2741400034342625 + }, + { + "epoch": 1.8009381413075345, + "grad_norm": 0.4538215012039641, + "learning_rate": 0.00019912487228999565, + "loss": 3.1676385402679443, + "step": 3072, + "token_acc": 0.2793928795503998 + }, + { + "epoch": 1.8015244796247436, + "grad_norm": 0.40272706270350467, + "learning_rate": 0.0001991235923968175, + "loss": 3.149338483810425, + "step": 3073, + "token_acc": 0.2834419796883512 + }, + { + "epoch": 1.8021108179419525, + "grad_norm": 0.4820116694752974, + "learning_rate": 0.00019912231157250676, + "loss": 3.1644253730773926, + "step": 3074, + "token_acc": 0.2815013196507523 + }, + { + "epoch": 1.8026971562591614, + "grad_norm": 0.37437633237231566, + "learning_rate": 0.0001991210298170755, + "loss": 3.14346981048584, + "step": 3075, + "token_acc": 0.28332368268674 + }, + { + "epoch": 1.8032834945763705, + "grad_norm": 0.4542849510565713, + "learning_rate": 0.00019911974713053568, + "loss": 3.1468682289123535, + "step": 3076, + "token_acc": 0.2828481960932949 + }, + { + "epoch": 1.8038698328935796, + "grad_norm": 0.4786750086557114, + "learning_rate": 0.00019911846351289945, + "loss": 3.2040350437164307, + "step": 3077, + "token_acc": 0.2752003594853844 + }, + { + "epoch": 1.8044561712107887, + "grad_norm": 0.4683742858060124, + "learning_rate": 0.00019911717896417877, + "loss": 3.1499221324920654, + "step": 3078, + "token_acc": 0.2820556092521849 + }, + { + "epoch": 1.8050425095279976, + "grad_norm": 0.4596835123780443, + "learning_rate": 0.0001991158934843858, + "loss": 3.1615498065948486, + "step": 3079, + "token_acc": 0.28226946178306445 + }, + { + "epoch": 1.8056288478452067, + "grad_norm": 0.3979062359320113, + "learning_rate": 0.00019911460707353252, + "loss": 3.1861116886138916, + "step": 3080, + "token_acc": 0.2778765450334724 + }, + { + "epoch": 1.8062151861624156, + "grad_norm": 0.539597419602727, + "learning_rate": 0.0001991133197316311, + "loss": 3.175027370452881, + "step": 3081, + "token_acc": 0.2790517783594051 + }, + { + "epoch": 1.8068015244796247, + "grad_norm": 0.5410597373827273, + "learning_rate": 0.00019911203145869363, + "loss": 3.1609606742858887, + "step": 3082, + "token_acc": 0.28009506219082186 + }, + { + "epoch": 1.8073878627968338, + "grad_norm": 0.45497342033714855, + "learning_rate": 0.0001991107422547321, + "loss": 3.163029670715332, + "step": 3083, + "token_acc": 0.28220511931280096 + }, + { + "epoch": 1.807974201114043, + "grad_norm": 0.46070598860710826, + "learning_rate": 0.00019910945211975878, + "loss": 3.1028380393981934, + "step": 3084, + "token_acc": 0.2891598972581285 + }, + { + "epoch": 1.8085605394312518, + "grad_norm": 0.3973208565188474, + "learning_rate": 0.00019910816105378566, + "loss": 3.174038887023926, + "step": 3085, + "token_acc": 0.2800026711185309 + }, + { + "epoch": 1.8091468777484607, + "grad_norm": 0.5937607544865409, + "learning_rate": 0.00019910686905682493, + "loss": 3.144839286804199, + "step": 3086, + "token_acc": 0.28407432680699174 + }, + { + "epoch": 1.8097332160656698, + "grad_norm": 0.44947229912356496, + "learning_rate": 0.0001991055761288887, + "loss": 3.121633529663086, + "step": 3087, + "token_acc": 0.2861460381906997 + }, + { + "epoch": 1.810319554382879, + "grad_norm": 0.4881463974021204, + "learning_rate": 0.00019910428226998917, + "loss": 3.191561698913574, + "step": 3088, + "token_acc": 0.2780304822771585 + }, + { + "epoch": 1.810905892700088, + "grad_norm": 0.4986072395865644, + "learning_rate": 0.0001991029874801384, + "loss": 3.105213165283203, + "step": 3089, + "token_acc": 0.2889542538437207 + }, + { + "epoch": 1.811492231017297, + "grad_norm": 0.39515824330155225, + "learning_rate": 0.00019910169175934862, + "loss": 3.141718864440918, + "step": 3090, + "token_acc": 0.28354780383614947 + }, + { + "epoch": 1.812078569334506, + "grad_norm": 0.44609364501065435, + "learning_rate": 0.000199100395107632, + "loss": 3.207822799682617, + "step": 3091, + "token_acc": 0.27437709957327594 + }, + { + "epoch": 1.812664907651715, + "grad_norm": 0.39730960624598455, + "learning_rate": 0.00019909909752500072, + "loss": 3.1471433639526367, + "step": 3092, + "token_acc": 0.28447041859192285 + }, + { + "epoch": 1.813251245968924, + "grad_norm": 0.4299903857887841, + "learning_rate": 0.00019909779901146696, + "loss": 3.0998849868774414, + "step": 3093, + "token_acc": 0.2882964324801995 + }, + { + "epoch": 1.8138375842861332, + "grad_norm": 0.42318956721727574, + "learning_rate": 0.00019909649956704288, + "loss": 3.146383285522461, + "step": 3094, + "token_acc": 0.2846735484499744 + }, + { + "epoch": 1.8144239226033423, + "grad_norm": 0.3964355897550008, + "learning_rate": 0.00019909519919174073, + "loss": 3.156064987182617, + "step": 3095, + "token_acc": 0.28207419695773744 + }, + { + "epoch": 1.8150102609205512, + "grad_norm": 0.3886094065854018, + "learning_rate": 0.0001990938978855727, + "loss": 3.1582894325256348, + "step": 3096, + "token_acc": 0.2823546332005199 + }, + { + "epoch": 1.81559659923776, + "grad_norm": 0.4322618227611294, + "learning_rate": 0.00019909259564855107, + "loss": 3.126788854598999, + "step": 3097, + "token_acc": 0.2877920745701149 + }, + { + "epoch": 1.8161829375549692, + "grad_norm": 0.49541260098442413, + "learning_rate": 0.000199091292480688, + "loss": 3.12581205368042, + "step": 3098, + "token_acc": 0.2866413067800403 + }, + { + "epoch": 1.8167692758721783, + "grad_norm": 0.5257304249492958, + "learning_rate": 0.0001990899883819958, + "loss": 3.19355845451355, + "step": 3099, + "token_acc": 0.2769926104760851 + }, + { + "epoch": 1.8173556141893874, + "grad_norm": 0.5266293716946155, + "learning_rate": 0.00019908868335248665, + "loss": 3.18611741065979, + "step": 3100, + "token_acc": 0.2790668954746952 + }, + { + "epoch": 1.8179419525065963, + "grad_norm": 0.530398560636547, + "learning_rate": 0.00019908737739217288, + "loss": 3.201910972595215, + "step": 3101, + "token_acc": 0.27632447555378586 + }, + { + "epoch": 1.8185282908238052, + "grad_norm": 0.5592629899128742, + "learning_rate": 0.0001990860705010667, + "loss": 3.1702399253845215, + "step": 3102, + "token_acc": 0.27971160674770223 + }, + { + "epoch": 1.8191146291410143, + "grad_norm": 0.4928914900045002, + "learning_rate": 0.0001990847626791804, + "loss": 3.1639957427978516, + "step": 3103, + "token_acc": 0.2814083925195036 + }, + { + "epoch": 1.8197009674582234, + "grad_norm": 0.4619609014090586, + "learning_rate": 0.00019908345392652627, + "loss": 3.1560728549957275, + "step": 3104, + "token_acc": 0.28103688870317955 + }, + { + "epoch": 1.8202873057754325, + "grad_norm": 0.46498529137086914, + "learning_rate": 0.00019908214424311659, + "loss": 3.1222422122955322, + "step": 3105, + "token_acc": 0.28770440183283025 + }, + { + "epoch": 1.8208736440926414, + "grad_norm": 0.3822225674850646, + "learning_rate": 0.00019908083362896374, + "loss": 3.158790111541748, + "step": 3106, + "token_acc": 0.2818113135268579 + }, + { + "epoch": 1.8214599824098505, + "grad_norm": 0.45697850826883385, + "learning_rate": 0.00019907952208407994, + "loss": 3.168910503387451, + "step": 3107, + "token_acc": 0.2804518996633234 + }, + { + "epoch": 1.8220463207270594, + "grad_norm": 0.5235350239621589, + "learning_rate": 0.00019907820960847751, + "loss": 3.1152772903442383, + "step": 3108, + "token_acc": 0.287955321479604 + }, + { + "epoch": 1.8226326590442685, + "grad_norm": 0.4414073006027613, + "learning_rate": 0.00019907689620216886, + "loss": 3.1422181129455566, + "step": 3109, + "token_acc": 0.2832322958648017 + }, + { + "epoch": 1.8232189973614776, + "grad_norm": 0.47080282638609633, + "learning_rate": 0.00019907558186516627, + "loss": 3.181764602661133, + "step": 3110, + "token_acc": 0.2803719336435384 + }, + { + "epoch": 1.8238053356786867, + "grad_norm": 0.5955304612722607, + "learning_rate": 0.00019907426659748209, + "loss": 3.1344666481018066, + "step": 3111, + "token_acc": 0.2840586464393233 + }, + { + "epoch": 1.8243916739958956, + "grad_norm": 0.5011131899334585, + "learning_rate": 0.00019907295039912865, + "loss": 3.1668143272399902, + "step": 3112, + "token_acc": 0.2806568501851981 + }, + { + "epoch": 1.8249780123131045, + "grad_norm": 0.3836304485547096, + "learning_rate": 0.00019907163327011836, + "loss": 3.1633336544036865, + "step": 3113, + "token_acc": 0.28461729602346963 + }, + { + "epoch": 1.8255643506303136, + "grad_norm": 0.5570824636065277, + "learning_rate": 0.00019907031521046358, + "loss": 3.139370918273926, + "step": 3114, + "token_acc": 0.28400644443233736 + }, + { + "epoch": 1.8261506889475227, + "grad_norm": 0.48508610386875756, + "learning_rate": 0.0001990689962201767, + "loss": 3.155259132385254, + "step": 3115, + "token_acc": 0.28220868759026685 + }, + { + "epoch": 1.8267370272647319, + "grad_norm": 0.4578595611560492, + "learning_rate": 0.00019906767629927007, + "loss": 3.1544992923736572, + "step": 3116, + "token_acc": 0.2832061108127092 + }, + { + "epoch": 1.8273233655819408, + "grad_norm": 0.4664607070688245, + "learning_rate": 0.00019906635544775613, + "loss": 3.16265869140625, + "step": 3117, + "token_acc": 0.28168601181982805 + }, + { + "epoch": 1.8279097038991499, + "grad_norm": 0.542635133224902, + "learning_rate": 0.00019906503366564726, + "loss": 3.124886989593506, + "step": 3118, + "token_acc": 0.28445999281441525 + }, + { + "epoch": 1.8284960422163588, + "grad_norm": 0.37000960648454784, + "learning_rate": 0.00019906371095295593, + "loss": 3.1056067943573, + "step": 3119, + "token_acc": 0.28758862933026913 + }, + { + "epoch": 1.8290823805335679, + "grad_norm": 0.48343534455547665, + "learning_rate": 0.00019906238730969447, + "loss": 3.17891001701355, + "step": 3120, + "token_acc": 0.27801841715957565 + }, + { + "epoch": 1.829668718850777, + "grad_norm": 0.5271375500367259, + "learning_rate": 0.0001990610627358754, + "loss": 3.131997585296631, + "step": 3121, + "token_acc": 0.28568261826472935 + }, + { + "epoch": 1.830255057167986, + "grad_norm": 0.3844341963235111, + "learning_rate": 0.00019905973723151116, + "loss": 3.1893386840820312, + "step": 3122, + "token_acc": 0.2754078405948585 + }, + { + "epoch": 1.830841395485195, + "grad_norm": 0.4408772790351935, + "learning_rate": 0.00019905841079661415, + "loss": 3.1531100273132324, + "step": 3123, + "token_acc": 0.28219660365334487 + }, + { + "epoch": 1.8314277338024039, + "grad_norm": 0.5017661646029039, + "learning_rate": 0.00019905708343119684, + "loss": 3.083510160446167, + "step": 3124, + "token_acc": 0.29125712797208386 + }, + { + "epoch": 1.832014072119613, + "grad_norm": 0.4635979401537257, + "learning_rate": 0.00019905575513527173, + "loss": 3.1558456420898438, + "step": 3125, + "token_acc": 0.2812394270892072 + }, + { + "epoch": 1.832600410436822, + "grad_norm": 0.4390360060925875, + "learning_rate": 0.0001990544259088513, + "loss": 3.167863607406616, + "step": 3126, + "token_acc": 0.28058437367169914 + }, + { + "epoch": 1.8331867487540312, + "grad_norm": 0.3825523965730083, + "learning_rate": 0.00019905309575194796, + "loss": 3.180602550506592, + "step": 3127, + "token_acc": 0.2799893286089116 + }, + { + "epoch": 1.83377308707124, + "grad_norm": 0.3999150554311957, + "learning_rate": 0.00019905176466457428, + "loss": 3.2018299102783203, + "step": 3128, + "token_acc": 0.274423176519083 + }, + { + "epoch": 1.834359425388449, + "grad_norm": 0.43843222210410726, + "learning_rate": 0.00019905043264674277, + "loss": 3.1406569480895996, + "step": 3129, + "token_acc": 0.28449841246243535 + }, + { + "epoch": 1.834945763705658, + "grad_norm": 0.4199883691086164, + "learning_rate": 0.0001990490996984659, + "loss": 3.1223912239074707, + "step": 3130, + "token_acc": 0.28435085515085823 + }, + { + "epoch": 1.8355321020228672, + "grad_norm": 0.49060521693446923, + "learning_rate": 0.0001990477658197562, + "loss": 3.1625328063964844, + "step": 3131, + "token_acc": 0.28045417811106366 + }, + { + "epoch": 1.8361184403400763, + "grad_norm": 0.5913217811317946, + "learning_rate": 0.00019904643101062623, + "loss": 3.162557601928711, + "step": 3132, + "token_acc": 0.28365348013619696 + }, + { + "epoch": 1.8367047786572852, + "grad_norm": 0.5151613602192372, + "learning_rate": 0.0001990450952710885, + "loss": 3.1815497875213623, + "step": 3133, + "token_acc": 0.2780936978297162 + }, + { + "epoch": 1.8372911169744943, + "grad_norm": 0.48828518108244184, + "learning_rate": 0.00019904375860115556, + "loss": 3.175595760345459, + "step": 3134, + "token_acc": 0.2802329372923842 + }, + { + "epoch": 1.8378774552917032, + "grad_norm": 0.5194582629651129, + "learning_rate": 0.00019904242100083995, + "loss": 3.173851490020752, + "step": 3135, + "token_acc": 0.27954982705873693 + }, + { + "epoch": 1.8384637936089123, + "grad_norm": 0.47738387383060926, + "learning_rate": 0.0001990410824701543, + "loss": 3.15293550491333, + "step": 3136, + "token_acc": 0.28130395461645974 + }, + { + "epoch": 1.8390501319261214, + "grad_norm": 0.5504641213053837, + "learning_rate": 0.0001990397430091111, + "loss": 3.176800012588501, + "step": 3137, + "token_acc": 0.27962055671257907 + }, + { + "epoch": 1.8396364702433305, + "grad_norm": 0.47256662147259404, + "learning_rate": 0.000199038402617723, + "loss": 3.148754596710205, + "step": 3138, + "token_acc": 0.28285508560456557 + }, + { + "epoch": 1.8402228085605394, + "grad_norm": 0.4510580171307375, + "learning_rate": 0.00019903706129600256, + "loss": 3.150977373123169, + "step": 3139, + "token_acc": 0.2831304148795884 + }, + { + "epoch": 1.8408091468777483, + "grad_norm": 0.4536714716277695, + "learning_rate": 0.00019903571904396236, + "loss": 3.131408214569092, + "step": 3140, + "token_acc": 0.28619942215701516 + }, + { + "epoch": 1.8413954851949574, + "grad_norm": 0.40055272148812404, + "learning_rate": 0.00019903437586161506, + "loss": 3.176074981689453, + "step": 3141, + "token_acc": 0.2797853212745535 + }, + { + "epoch": 1.8419818235121665, + "grad_norm": 0.5316181387254535, + "learning_rate": 0.00019903303174897326, + "loss": 3.1866860389709473, + "step": 3142, + "token_acc": 0.2789888101526589 + }, + { + "epoch": 1.8425681618293757, + "grad_norm": 0.5494955938967461, + "learning_rate": 0.00019903168670604954, + "loss": 3.120840072631836, + "step": 3143, + "token_acc": 0.2872036809020802 + }, + { + "epoch": 1.8431545001465846, + "grad_norm": 0.6059995295340069, + "learning_rate": 0.00019903034073285659, + "loss": 3.1684136390686035, + "step": 3144, + "token_acc": 0.2804469658263335 + }, + { + "epoch": 1.8437408384637937, + "grad_norm": 0.4685524579929522, + "learning_rate": 0.00019902899382940703, + "loss": 3.1659278869628906, + "step": 3145, + "token_acc": 0.28042383381337793 + }, + { + "epoch": 1.8443271767810026, + "grad_norm": 0.49366014345488596, + "learning_rate": 0.0001990276459957135, + "loss": 3.148099899291992, + "step": 3146, + "token_acc": 0.28271852902504424 + }, + { + "epoch": 1.8449135150982117, + "grad_norm": 0.4906771288993331, + "learning_rate": 0.0001990262972317887, + "loss": 3.182332992553711, + "step": 3147, + "token_acc": 0.27778937507000806 + }, + { + "epoch": 1.8454998534154208, + "grad_norm": 0.36889227100532984, + "learning_rate": 0.00019902494753764528, + "loss": 3.136589527130127, + "step": 3148, + "token_acc": 0.2857649357607337 + }, + { + "epoch": 1.8460861917326299, + "grad_norm": 0.4447326938565774, + "learning_rate": 0.0001990235969132959, + "loss": 3.1523277759552, + "step": 3149, + "token_acc": 0.28308221063436245 + }, + { + "epoch": 1.8466725300498388, + "grad_norm": 0.4246869726872641, + "learning_rate": 0.00019902224535875326, + "loss": 3.125051498413086, + "step": 3150, + "token_acc": 0.2865712395596402 + }, + { + "epoch": 1.8472588683670477, + "grad_norm": 0.45784392703608906, + "learning_rate": 0.00019902089287403008, + "loss": 3.15807843208313, + "step": 3151, + "token_acc": 0.2820328311221048 + }, + { + "epoch": 1.8478452066842568, + "grad_norm": 0.4949707066269936, + "learning_rate": 0.00019901953945913902, + "loss": 3.156583786010742, + "step": 3152, + "token_acc": 0.28144236428128333 + }, + { + "epoch": 1.848431545001466, + "grad_norm": 0.5047709018593335, + "learning_rate": 0.00019901818511409283, + "loss": 3.1799654960632324, + "step": 3153, + "token_acc": 0.27888227779383284 + }, + { + "epoch": 1.849017883318675, + "grad_norm": 0.5342647379919129, + "learning_rate": 0.0001990168298389042, + "loss": 3.160592555999756, + "step": 3154, + "token_acc": 0.28125910430681283 + }, + { + "epoch": 1.849604221635884, + "grad_norm": 0.37399436600903335, + "learning_rate": 0.00019901547363358591, + "loss": 3.170722007751465, + "step": 3155, + "token_acc": 0.27877136000662006 + }, + { + "epoch": 1.8501905599530928, + "grad_norm": 0.42857212560436597, + "learning_rate": 0.00019901411649815066, + "loss": 3.1543173789978027, + "step": 3156, + "token_acc": 0.28016198112987284 + }, + { + "epoch": 1.850776898270302, + "grad_norm": 0.3761209757628327, + "learning_rate": 0.00019901275843261121, + "loss": 3.1635982990264893, + "step": 3157, + "token_acc": 0.2823472037395881 + }, + { + "epoch": 1.851363236587511, + "grad_norm": 0.4259229383619852, + "learning_rate": 0.00019901139943698031, + "loss": 3.129589557647705, + "step": 3158, + "token_acc": 0.2839228416389456 + }, + { + "epoch": 1.8519495749047201, + "grad_norm": 0.39051938087359483, + "learning_rate": 0.00019901003951127073, + "loss": 3.174107551574707, + "step": 3159, + "token_acc": 0.2797626425384234 + }, + { + "epoch": 1.852535913221929, + "grad_norm": 0.3915902518792056, + "learning_rate": 0.00019900867865549527, + "loss": 3.1363000869750977, + "step": 3160, + "token_acc": 0.28624257911026424 + }, + { + "epoch": 1.8531222515391381, + "grad_norm": 0.3785700978596616, + "learning_rate": 0.00019900731686966664, + "loss": 3.180429458618164, + "step": 3161, + "token_acc": 0.2794011360533656 + }, + { + "epoch": 1.853708589856347, + "grad_norm": 0.38563795576470977, + "learning_rate": 0.00019900595415379773, + "loss": 3.126704216003418, + "step": 3162, + "token_acc": 0.28596632939375904 + }, + { + "epoch": 1.8542949281735561, + "grad_norm": 0.3914382463597483, + "learning_rate": 0.00019900459050790125, + "loss": 3.181446075439453, + "step": 3163, + "token_acc": 0.2805578819718734 + }, + { + "epoch": 1.8548812664907652, + "grad_norm": 0.4024508840454432, + "learning_rate": 0.0001990032259319901, + "loss": 3.2028589248657227, + "step": 3164, + "token_acc": 0.27596925794587185 + }, + { + "epoch": 1.8554676048079743, + "grad_norm": 0.4120602701393418, + "learning_rate": 0.00019900186042607701, + "loss": 3.168173313140869, + "step": 3165, + "token_acc": 0.27970881257714814 + }, + { + "epoch": 1.8560539431251832, + "grad_norm": 0.4295809551244402, + "learning_rate": 0.0001990004939901749, + "loss": 3.1263885498046875, + "step": 3166, + "token_acc": 0.2859093509630666 + }, + { + "epoch": 1.8566402814423921, + "grad_norm": 0.4991480125804199, + "learning_rate": 0.0001989991266242965, + "loss": 3.1439974308013916, + "step": 3167, + "token_acc": 0.28274302782106886 + }, + { + "epoch": 1.8572266197596012, + "grad_norm": 0.4152419616776265, + "learning_rate": 0.00019899775832845474, + "loss": 3.133056402206421, + "step": 3168, + "token_acc": 0.285669307442251 + }, + { + "epoch": 1.8578129580768104, + "grad_norm": 0.4198853473450349, + "learning_rate": 0.0001989963891026624, + "loss": 3.138800621032715, + "step": 3169, + "token_acc": 0.2836669862119605 + }, + { + "epoch": 1.8583992963940195, + "grad_norm": 0.3865886311465718, + "learning_rate": 0.00019899501894693242, + "loss": 3.152841567993164, + "step": 3170, + "token_acc": 0.2819349054640347 + }, + { + "epoch": 1.8589856347112284, + "grad_norm": 0.4376017857952635, + "learning_rate": 0.00019899364786127763, + "loss": 3.117344379425049, + "step": 3171, + "token_acc": 0.28789297230690447 + }, + { + "epoch": 1.8595719730284375, + "grad_norm": 0.4605694190844209, + "learning_rate": 0.0001989922758457109, + "loss": 3.159036636352539, + "step": 3172, + "token_acc": 0.28163598598360523 + }, + { + "epoch": 1.8601583113456464, + "grad_norm": 0.42679047445479973, + "learning_rate": 0.00019899090290024515, + "loss": 3.1550045013427734, + "step": 3173, + "token_acc": 0.2815179716566147 + }, + { + "epoch": 1.8607446496628555, + "grad_norm": 0.38364390450781116, + "learning_rate": 0.00019898952902489328, + "loss": 3.154336929321289, + "step": 3174, + "token_acc": 0.28197236661312997 + }, + { + "epoch": 1.8613309879800646, + "grad_norm": 0.4613518652409264, + "learning_rate": 0.00019898815421966814, + "loss": 3.152181386947632, + "step": 3175, + "token_acc": 0.2814420961136387 + }, + { + "epoch": 1.8619173262972737, + "grad_norm": 0.541889744943375, + "learning_rate": 0.0001989867784845827, + "loss": 3.1032278537750244, + "step": 3176, + "token_acc": 0.28945720753263016 + }, + { + "epoch": 1.8625036646144826, + "grad_norm": 0.4744886296487689, + "learning_rate": 0.00019898540181964984, + "loss": 3.183441638946533, + "step": 3177, + "token_acc": 0.27838271030453443 + }, + { + "epoch": 1.8630900029316915, + "grad_norm": 0.4113562635132549, + "learning_rate": 0.00019898402422488252, + "loss": 3.155787467956543, + "step": 3178, + "token_acc": 0.28137896858683337 + }, + { + "epoch": 1.8636763412489006, + "grad_norm": 0.46275175698303483, + "learning_rate": 0.0001989826457002937, + "loss": 3.1900312900543213, + "step": 3179, + "token_acc": 0.2780106990330221 + }, + { + "epoch": 1.8642626795661097, + "grad_norm": 0.5517062487342601, + "learning_rate": 0.00019898126624589625, + "loss": 3.137910842895508, + "step": 3180, + "token_acc": 0.28437317442136334 + }, + { + "epoch": 1.8648490178833188, + "grad_norm": 0.5793527723350571, + "learning_rate": 0.00019897988586170325, + "loss": 3.170560836791992, + "step": 3181, + "token_acc": 0.280449121261692 + }, + { + "epoch": 1.8654353562005277, + "grad_norm": 0.4169418895038681, + "learning_rate": 0.00019897850454772756, + "loss": 3.175182342529297, + "step": 3182, + "token_acc": 0.27981192173517366 + }, + { + "epoch": 1.8660216945177366, + "grad_norm": 0.470673492609235, + "learning_rate": 0.0001989771223039822, + "loss": 3.155707359313965, + "step": 3183, + "token_acc": 0.2796852139664154 + }, + { + "epoch": 1.8666080328349457, + "grad_norm": 0.4759097125799998, + "learning_rate": 0.00019897573913048015, + "loss": 3.1349759101867676, + "step": 3184, + "token_acc": 0.2847529225552344 + }, + { + "epoch": 1.8671943711521548, + "grad_norm": 0.4436084008989719, + "learning_rate": 0.0001989743550272344, + "loss": 3.1443428993225098, + "step": 3185, + "token_acc": 0.28360019536849257 + }, + { + "epoch": 1.867780709469364, + "grad_norm": 0.4973986451857044, + "learning_rate": 0.00019897296999425796, + "loss": 3.1864750385284424, + "step": 3186, + "token_acc": 0.2769893738091889 + }, + { + "epoch": 1.8683670477865728, + "grad_norm": 0.67971881169631, + "learning_rate": 0.00019897158403156385, + "loss": 3.1825883388519287, + "step": 3187, + "token_acc": 0.27941157717821263 + }, + { + "epoch": 1.868953386103782, + "grad_norm": 0.551316890839125, + "learning_rate": 0.00019897019713916505, + "loss": 3.126471519470215, + "step": 3188, + "token_acc": 0.28547865170896464 + }, + { + "epoch": 1.8695397244209908, + "grad_norm": 0.560415571591795, + "learning_rate": 0.0001989688093170746, + "loss": 3.1342384815216064, + "step": 3189, + "token_acc": 0.2844964558750535 + }, + { + "epoch": 1.8701260627382, + "grad_norm": 0.37715300556323406, + "learning_rate": 0.00019896742056530557, + "loss": 3.146027088165283, + "step": 3190, + "token_acc": 0.2810979523515864 + }, + { + "epoch": 1.870712401055409, + "grad_norm": 0.5176268270274003, + "learning_rate": 0.00019896603088387097, + "loss": 3.1563191413879395, + "step": 3191, + "token_acc": 0.2839779473681423 + }, + { + "epoch": 1.8712987393726181, + "grad_norm": 0.5449484842845632, + "learning_rate": 0.00019896464027278386, + "loss": 3.1326215267181396, + "step": 3192, + "token_acc": 0.2843115874577804 + }, + { + "epoch": 1.871885077689827, + "grad_norm": 0.4325658013105419, + "learning_rate": 0.00019896324873205736, + "loss": 3.163142681121826, + "step": 3193, + "token_acc": 0.280619848060109 + }, + { + "epoch": 1.872471416007036, + "grad_norm": 0.4338395486238251, + "learning_rate": 0.00019896185626170447, + "loss": 3.138472080230713, + "step": 3194, + "token_acc": 0.2846876903961837 + }, + { + "epoch": 1.873057754324245, + "grad_norm": 0.38759165733547385, + "learning_rate": 0.00019896046286173826, + "loss": 3.1290807723999023, + "step": 3195, + "token_acc": 0.2846563168577541 + }, + { + "epoch": 1.8736440926414542, + "grad_norm": 0.3883007953784523, + "learning_rate": 0.00019895906853217187, + "loss": 3.130443811416626, + "step": 3196, + "token_acc": 0.2842502250411282 + }, + { + "epoch": 1.8742304309586633, + "grad_norm": 0.40024998634002007, + "learning_rate": 0.00019895767327301843, + "loss": 3.1582517623901367, + "step": 3197, + "token_acc": 0.2824491552781664 + }, + { + "epoch": 1.8748167692758722, + "grad_norm": 0.33091114363166074, + "learning_rate": 0.00019895627708429095, + "loss": 3.15684175491333, + "step": 3198, + "token_acc": 0.28035562250311086 + }, + { + "epoch": 1.875403107593081, + "grad_norm": 0.464193434237222, + "learning_rate": 0.0001989548799660026, + "loss": 3.125330686569214, + "step": 3199, + "token_acc": 0.28594692320888926 + }, + { + "epoch": 1.8759894459102902, + "grad_norm": 0.41848552746109946, + "learning_rate": 0.00019895348191816648, + "loss": 3.1433699131011963, + "step": 3200, + "token_acc": 0.285457865335396 + }, + { + "epoch": 1.8765757842274993, + "grad_norm": 0.36822204876631076, + "learning_rate": 0.00019895208294079574, + "loss": 3.1163330078125, + "step": 3201, + "token_acc": 0.2858785297202118 + }, + { + "epoch": 1.8771621225447084, + "grad_norm": 0.42315000973182176, + "learning_rate": 0.00019895068303390354, + "loss": 3.087249755859375, + "step": 3202, + "token_acc": 0.29129069798912016 + }, + { + "epoch": 1.8777484608619175, + "grad_norm": 0.42522593857781243, + "learning_rate": 0.00019894928219750302, + "loss": 3.164109230041504, + "step": 3203, + "token_acc": 0.28085653347620915 + }, + { + "epoch": 1.8783347991791264, + "grad_norm": 0.39354903853590534, + "learning_rate": 0.0001989478804316073, + "loss": 3.155294418334961, + "step": 3204, + "token_acc": 0.2827194182648332 + }, + { + "epoch": 1.8789211374963353, + "grad_norm": 0.4630169258894746, + "learning_rate": 0.0001989464777362296, + "loss": 3.157960891723633, + "step": 3205, + "token_acc": 0.2803755176100145 + }, + { + "epoch": 1.8795074758135444, + "grad_norm": 0.5026799394237053, + "learning_rate": 0.00019894507411138306, + "loss": 3.1797566413879395, + "step": 3206, + "token_acc": 0.2790555533410399 + }, + { + "epoch": 1.8800938141307535, + "grad_norm": 0.4373500449653247, + "learning_rate": 0.00019894366955708085, + "loss": 3.12471866607666, + "step": 3207, + "token_acc": 0.2846829782460095 + }, + { + "epoch": 1.8806801524479626, + "grad_norm": 0.4406153282876968, + "learning_rate": 0.00019894226407333624, + "loss": 3.1485939025878906, + "step": 3208, + "token_acc": 0.2831579057943611 + }, + { + "epoch": 1.8812664907651715, + "grad_norm": 0.38880043225933736, + "learning_rate": 0.0001989408576601624, + "loss": 3.144099712371826, + "step": 3209, + "token_acc": 0.2819289096906153 + }, + { + "epoch": 1.8818528290823804, + "grad_norm": 0.40873807140141116, + "learning_rate": 0.00019893945031757246, + "loss": 3.166224479675293, + "step": 3210, + "token_acc": 0.280845210137918 + }, + { + "epoch": 1.8824391673995895, + "grad_norm": 0.4396707790499638, + "learning_rate": 0.00019893804204557976, + "loss": 3.1492788791656494, + "step": 3211, + "token_acc": 0.28203996610648213 + }, + { + "epoch": 1.8830255057167986, + "grad_norm": 0.41933922322374717, + "learning_rate": 0.00019893663284419746, + "loss": 3.169053554534912, + "step": 3212, + "token_acc": 0.2790902012522845 + }, + { + "epoch": 1.8836118440340077, + "grad_norm": 0.5402001093625535, + "learning_rate": 0.0001989352227134388, + "loss": 3.166184425354004, + "step": 3213, + "token_acc": 0.2798218323566291 + }, + { + "epoch": 1.8841981823512166, + "grad_norm": 0.45991063578681857, + "learning_rate": 0.00019893381165331708, + "loss": 3.143467903137207, + "step": 3214, + "token_acc": 0.28353184217909155 + }, + { + "epoch": 1.8847845206684257, + "grad_norm": 0.4392756768707656, + "learning_rate": 0.00019893239966384547, + "loss": 3.1961116790771484, + "step": 3215, + "token_acc": 0.2780671288642478 + }, + { + "epoch": 1.8853708589856346, + "grad_norm": 0.4851611001595174, + "learning_rate": 0.0001989309867450373, + "loss": 3.1460814476013184, + "step": 3216, + "token_acc": 0.28068519498493855 + }, + { + "epoch": 1.8859571973028437, + "grad_norm": 0.4769388617482181, + "learning_rate": 0.0001989295728969058, + "loss": 3.171431064605713, + "step": 3217, + "token_acc": 0.27904988294801036 + }, + { + "epoch": 1.8865435356200528, + "grad_norm": 0.4281180237668282, + "learning_rate": 0.00019892815811946432, + "loss": 3.144864797592163, + "step": 3218, + "token_acc": 0.28378929745637566 + }, + { + "epoch": 1.887129873937262, + "grad_norm": 0.4114349528380137, + "learning_rate": 0.00019892674241272604, + "loss": 3.1885626316070557, + "step": 3219, + "token_acc": 0.27822747734415015 + }, + { + "epoch": 1.8877162122544708, + "grad_norm": 0.4530915036357786, + "learning_rate": 0.00019892532577670436, + "loss": 3.157224655151367, + "step": 3220, + "token_acc": 0.28090873634945396 + }, + { + "epoch": 1.8883025505716797, + "grad_norm": 0.41687071293147554, + "learning_rate": 0.00019892390821141254, + "loss": 3.1666622161865234, + "step": 3221, + "token_acc": 0.28051364536669016 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.47071131866251914, + "learning_rate": 0.0001989224897168639, + "loss": 3.181093692779541, + "step": 3222, + "token_acc": 0.27842009522615574 + }, + { + "epoch": 1.889475227206098, + "grad_norm": 0.5178138802282173, + "learning_rate": 0.00019892107029307175, + "loss": 3.178314208984375, + "step": 3223, + "token_acc": 0.2774907174897517 + }, + { + "epoch": 1.890061565523307, + "grad_norm": 0.5301123123952524, + "learning_rate": 0.0001989196499400495, + "loss": 3.15483021736145, + "step": 3224, + "token_acc": 0.28167424058735036 + }, + { + "epoch": 1.890647903840516, + "grad_norm": 0.38066199442346643, + "learning_rate": 0.0001989182286578104, + "loss": 3.1507301330566406, + "step": 3225, + "token_acc": 0.28354695732450613 + }, + { + "epoch": 1.8912342421577248, + "grad_norm": 0.5204048495102469, + "learning_rate": 0.00019891680644636782, + "loss": 3.1960787773132324, + "step": 3226, + "token_acc": 0.275404504348358 + }, + { + "epoch": 1.891820580474934, + "grad_norm": 0.49118000046949734, + "learning_rate": 0.00019891538330573516, + "loss": 3.191270112991333, + "step": 3227, + "token_acc": 0.2767167457549299 + }, + { + "epoch": 1.892406918792143, + "grad_norm": 0.3899909943241877, + "learning_rate": 0.00019891395923592574, + "loss": 3.1280264854431152, + "step": 3228, + "token_acc": 0.286118061584027 + }, + { + "epoch": 1.8929932571093522, + "grad_norm": 0.42050702720869215, + "learning_rate": 0.00019891253423695297, + "loss": 3.1218748092651367, + "step": 3229, + "token_acc": 0.28625676140866796 + }, + { + "epoch": 1.8935795954265613, + "grad_norm": 0.42389573750178067, + "learning_rate": 0.00019891110830883024, + "loss": 3.086203098297119, + "step": 3230, + "token_acc": 0.2896606914212548 + }, + { + "epoch": 1.8941659337437702, + "grad_norm": 0.3980895946819844, + "learning_rate": 0.00019890968145157092, + "loss": 3.151057720184326, + "step": 3231, + "token_acc": 0.28283329047377925 + }, + { + "epoch": 1.894752272060979, + "grad_norm": 0.5630381563534455, + "learning_rate": 0.0001989082536651884, + "loss": 3.1902682781219482, + "step": 3232, + "token_acc": 0.2777019180107638 + }, + { + "epoch": 1.8953386103781882, + "grad_norm": 0.4678747353585678, + "learning_rate": 0.0001989068249496962, + "loss": 3.1839427947998047, + "step": 3233, + "token_acc": 0.2783641160949868 + }, + { + "epoch": 1.8959249486953973, + "grad_norm": 0.3531644593514653, + "learning_rate": 0.00019890539530510756, + "loss": 3.1549038887023926, + "step": 3234, + "token_acc": 0.28208884575617266 + }, + { + "epoch": 1.8965112870126064, + "grad_norm": 0.4652094218639067, + "learning_rate": 0.0001989039647314361, + "loss": 3.1172502040863037, + "step": 3235, + "token_acc": 0.28850686655926067 + }, + { + "epoch": 1.8970976253298153, + "grad_norm": 0.423795459729361, + "learning_rate": 0.00019890253322869507, + "loss": 3.153761148452759, + "step": 3236, + "token_acc": 0.2811508508797231 + }, + { + "epoch": 1.8976839636470242, + "grad_norm": 0.4079519359854486, + "learning_rate": 0.0001989011007968981, + "loss": 3.1571176052093506, + "step": 3237, + "token_acc": 0.28128988572027397 + }, + { + "epoch": 1.8982703019642333, + "grad_norm": 0.5022121493751411, + "learning_rate": 0.0001988996674360585, + "loss": 3.1455979347229004, + "step": 3238, + "token_acc": 0.2820751399745747 + }, + { + "epoch": 1.8988566402814424, + "grad_norm": 0.44162989100445377, + "learning_rate": 0.0001988982331461898, + "loss": 3.111053466796875, + "step": 3239, + "token_acc": 0.28854914518878155 + }, + { + "epoch": 1.8994429785986515, + "grad_norm": 0.4108162647140152, + "learning_rate": 0.0001988967979273055, + "loss": 3.148909091949463, + "step": 3240, + "token_acc": 0.28336517697199404 + }, + { + "epoch": 1.9000293169158604, + "grad_norm": 0.46593593405901823, + "learning_rate": 0.00019889536177941902, + "loss": 3.1266627311706543, + "step": 3241, + "token_acc": 0.2853437335726309 + }, + { + "epoch": 1.9006156552330695, + "grad_norm": 0.4885165951808546, + "learning_rate": 0.0001988939247025439, + "loss": 3.1374144554138184, + "step": 3242, + "token_acc": 0.28303717388126154 + }, + { + "epoch": 1.9012019935502784, + "grad_norm": 0.4270875478585992, + "learning_rate": 0.0001988924866966936, + "loss": 3.158627986907959, + "step": 3243, + "token_acc": 0.280642790182376 + }, + { + "epoch": 1.9017883318674875, + "grad_norm": 0.378508542696163, + "learning_rate": 0.00019889104776188167, + "loss": 3.1566808223724365, + "step": 3244, + "token_acc": 0.2791309951210531 + }, + { + "epoch": 1.9023746701846966, + "grad_norm": 0.4332333932746044, + "learning_rate": 0.00019888960789812157, + "loss": 3.181943893432617, + "step": 3245, + "token_acc": 0.2781220370245639 + }, + { + "epoch": 1.9029610085019057, + "grad_norm": 0.3622081683512169, + "learning_rate": 0.0001988881671054269, + "loss": 3.1232476234436035, + "step": 3246, + "token_acc": 0.2884079992922822 + }, + { + "epoch": 1.9035473468191146, + "grad_norm": 0.4354392331831683, + "learning_rate": 0.0001988867253838111, + "loss": 3.1134345531463623, + "step": 3247, + "token_acc": 0.2862761521054132 + }, + { + "epoch": 1.9041336851363235, + "grad_norm": 0.4529090136982653, + "learning_rate": 0.00019888528273328778, + "loss": 3.1472015380859375, + "step": 3248, + "token_acc": 0.2817212417093578 + }, + { + "epoch": 1.9047200234535326, + "grad_norm": 0.3402119759869073, + "learning_rate": 0.0001988838391538705, + "loss": 3.17484712600708, + "step": 3249, + "token_acc": 0.2786861618207602 + }, + { + "epoch": 1.9053063617707418, + "grad_norm": 0.5259127551499162, + "learning_rate": 0.0001988823946455728, + "loss": 3.1805531978607178, + "step": 3250, + "token_acc": 0.27801108337828806 + }, + { + "epoch": 1.9058927000879509, + "grad_norm": 0.6804863844074353, + "learning_rate": 0.00019888094920840826, + "loss": 3.110687732696533, + "step": 3251, + "token_acc": 0.28825601101578224 + }, + { + "epoch": 1.9064790384051598, + "grad_norm": 0.5303752984573485, + "learning_rate": 0.00019887950284239043, + "loss": 3.1947684288024902, + "step": 3252, + "token_acc": 0.27786302676530206 + }, + { + "epoch": 1.9070653767223686, + "grad_norm": 0.6977649656088845, + "learning_rate": 0.00019887805554753293, + "loss": 3.1728124618530273, + "step": 3253, + "token_acc": 0.2783591859200066 + }, + { + "epoch": 1.9076517150395778, + "grad_norm": 0.565955665320739, + "learning_rate": 0.0001988766073238493, + "loss": 3.152412176132202, + "step": 3254, + "token_acc": 0.2821726788637258 + }, + { + "epoch": 1.9082380533567869, + "grad_norm": 0.429956343896276, + "learning_rate": 0.00019887515817135318, + "loss": 3.138113498687744, + "step": 3255, + "token_acc": 0.2829673441137608 + }, + { + "epoch": 1.908824391673996, + "grad_norm": 0.4172307717284425, + "learning_rate": 0.00019887370809005823, + "loss": 3.1706557273864746, + "step": 3256, + "token_acc": 0.28047691039032435 + }, + { + "epoch": 1.909410729991205, + "grad_norm": 0.4403381961855377, + "learning_rate": 0.000198872257079978, + "loss": 3.145526885986328, + "step": 3257, + "token_acc": 0.2823988848335231 + }, + { + "epoch": 1.909997068308414, + "grad_norm": 0.45722631012763915, + "learning_rate": 0.00019887080514112616, + "loss": 3.1019086837768555, + "step": 3258, + "token_acc": 0.28945744064694895 + }, + { + "epoch": 1.9105834066256229, + "grad_norm": 0.43816366204106433, + "learning_rate": 0.00019886935227351628, + "loss": 3.133572816848755, + "step": 3259, + "token_acc": 0.2841028609526861 + }, + { + "epoch": 1.911169744942832, + "grad_norm": 0.4360792024039627, + "learning_rate": 0.00019886789847716214, + "loss": 3.0969831943511963, + "step": 3260, + "token_acc": 0.28956911514858685 + }, + { + "epoch": 1.911756083260041, + "grad_norm": 0.5317691126725539, + "learning_rate": 0.00019886644375207728, + "loss": 3.141867160797119, + "step": 3261, + "token_acc": 0.28271220459342566 + }, + { + "epoch": 1.9123424215772502, + "grad_norm": 0.5305958061096548, + "learning_rate": 0.0001988649880982754, + "loss": 3.178436279296875, + "step": 3262, + "token_acc": 0.2790539063069442 + }, + { + "epoch": 1.912928759894459, + "grad_norm": 0.4118645617158987, + "learning_rate": 0.0001988635315157702, + "loss": 3.121683120727539, + "step": 3263, + "token_acc": 0.28587478432998514 + }, + { + "epoch": 1.913515098211668, + "grad_norm": 0.4905472482001144, + "learning_rate": 0.00019886207400457532, + "loss": 3.1227633953094482, + "step": 3264, + "token_acc": 0.28750758181006864 + }, + { + "epoch": 1.914101436528877, + "grad_norm": 0.5474341061494424, + "learning_rate": 0.0001988606155647045, + "loss": 3.111152172088623, + "step": 3265, + "token_acc": 0.28842247058114895 + }, + { + "epoch": 1.9146877748460862, + "grad_norm": 0.5576072595580398, + "learning_rate": 0.0001988591561961714, + "loss": 3.160722494125366, + "step": 3266, + "token_acc": 0.2810477272490385 + }, + { + "epoch": 1.9152741131632953, + "grad_norm": 0.4483321811592267, + "learning_rate": 0.00019885769589898976, + "loss": 3.158362865447998, + "step": 3267, + "token_acc": 0.28155793510752297 + }, + { + "epoch": 1.9158604514805042, + "grad_norm": 0.5518105917816112, + "learning_rate": 0.00019885623467317325, + "loss": 3.152453899383545, + "step": 3268, + "token_acc": 0.28189682214766887 + }, + { + "epoch": 1.9164467897977133, + "grad_norm": 0.46328509549078806, + "learning_rate": 0.00019885477251873563, + "loss": 3.1459293365478516, + "step": 3269, + "token_acc": 0.2838478452183762 + }, + { + "epoch": 1.9170331281149222, + "grad_norm": 0.4901014470659852, + "learning_rate": 0.00019885330943569061, + "loss": 3.1369147300720215, + "step": 3270, + "token_acc": 0.2839331734899286 + }, + { + "epoch": 1.9176194664321313, + "grad_norm": 0.46140819061057525, + "learning_rate": 0.000198851845424052, + "loss": 3.1623873710632324, + "step": 3271, + "token_acc": 0.28042558065675 + }, + { + "epoch": 1.9182058047493404, + "grad_norm": 0.4297237070282822, + "learning_rate": 0.0001988503804838335, + "loss": 3.136974334716797, + "step": 3272, + "token_acc": 0.2852420885107425 + }, + { + "epoch": 1.9187921430665495, + "grad_norm": 0.43603662214846406, + "learning_rate": 0.00019884891461504888, + "loss": 3.114511489868164, + "step": 3273, + "token_acc": 0.28738246610019147 + }, + { + "epoch": 1.9193784813837584, + "grad_norm": 0.40496148678850025, + "learning_rate": 0.00019884744781771188, + "loss": 3.106574058532715, + "step": 3274, + "token_acc": 0.2887540598216222 + }, + { + "epoch": 1.9199648197009673, + "grad_norm": 0.5032773562536458, + "learning_rate": 0.00019884598009183631, + "loss": 3.1133415699005127, + "step": 3275, + "token_acc": 0.28727047766969843 + }, + { + "epoch": 1.9205511580181764, + "grad_norm": 0.42612099397598424, + "learning_rate": 0.000198844511437436, + "loss": 3.171154737472534, + "step": 3276, + "token_acc": 0.2806739911535418 + }, + { + "epoch": 1.9211374963353856, + "grad_norm": 0.40530336978197173, + "learning_rate": 0.00019884304185452463, + "loss": 3.1084368228912354, + "step": 3277, + "token_acc": 0.28794486958531595 + }, + { + "epoch": 1.9217238346525947, + "grad_norm": 0.5002513987649199, + "learning_rate": 0.0001988415713431161, + "loss": 3.146696090698242, + "step": 3278, + "token_acc": 0.28150187394212944 + }, + { + "epoch": 1.9223101729698036, + "grad_norm": 0.49568001781253934, + "learning_rate": 0.00019884009990322424, + "loss": 3.1555867195129395, + "step": 3279, + "token_acc": 0.2810712071786429 + }, + { + "epoch": 1.9228965112870124, + "grad_norm": 0.41848514741737625, + "learning_rate": 0.0001988386275348628, + "loss": 3.1539266109466553, + "step": 3280, + "token_acc": 0.283250924972562 + }, + { + "epoch": 1.9234828496042216, + "grad_norm": 0.3924025283135276, + "learning_rate": 0.00019883715423804561, + "loss": 3.142360210418701, + "step": 3281, + "token_acc": 0.2814026341496278 + }, + { + "epoch": 1.9240691879214307, + "grad_norm": 0.49662023762200874, + "learning_rate": 0.00019883568001278658, + "loss": 3.1667699813842773, + "step": 3282, + "token_acc": 0.2814986148839461 + }, + { + "epoch": 1.9246555262386398, + "grad_norm": 0.47243956565238826, + "learning_rate": 0.00019883420485909952, + "loss": 3.130139112472534, + "step": 3283, + "token_acc": 0.2833863507219747 + }, + { + "epoch": 1.9252418645558487, + "grad_norm": 0.4353539007460067, + "learning_rate": 0.00019883272877699825, + "loss": 3.1643974781036377, + "step": 3284, + "token_acc": 0.27937373221816764 + }, + { + "epoch": 1.9258282028730578, + "grad_norm": 0.4536369818582669, + "learning_rate": 0.0001988312517664967, + "loss": 3.161803722381592, + "step": 3285, + "token_acc": 0.2811986579964589 + }, + { + "epoch": 1.9264145411902667, + "grad_norm": 0.4128846601990197, + "learning_rate": 0.0001988297738276087, + "loss": 3.1450726985931396, + "step": 3286, + "token_acc": 0.2837359733159987 + }, + { + "epoch": 1.9270008795074758, + "grad_norm": 0.4858051560011724, + "learning_rate": 0.00019882829496034815, + "loss": 3.1175756454467773, + "step": 3287, + "token_acc": 0.2867556918262933 + }, + { + "epoch": 1.927587217824685, + "grad_norm": 0.39386646450620316, + "learning_rate": 0.00019882681516472897, + "loss": 3.1193647384643555, + "step": 3288, + "token_acc": 0.2862091812102598 + }, + { + "epoch": 1.928173556141894, + "grad_norm": 0.44727425742997867, + "learning_rate": 0.00019882533444076503, + "loss": 3.0887303352355957, + "step": 3289, + "token_acc": 0.29138277747966024 + }, + { + "epoch": 1.928759894459103, + "grad_norm": 0.4319031486000401, + "learning_rate": 0.00019882385278847023, + "loss": 3.186368942260742, + "step": 3290, + "token_acc": 0.2747711554109032 + }, + { + "epoch": 1.9293462327763118, + "grad_norm": 0.45965988926961476, + "learning_rate": 0.00019882237020785848, + "loss": 3.165771961212158, + "step": 3291, + "token_acc": 0.28079758001439975 + }, + { + "epoch": 1.929932571093521, + "grad_norm": 0.47732279940864375, + "learning_rate": 0.00019882088669894373, + "loss": 3.1610002517700195, + "step": 3292, + "token_acc": 0.28118670383106836 + }, + { + "epoch": 1.93051890941073, + "grad_norm": 0.5632949973091116, + "learning_rate": 0.00019881940226173993, + "loss": 3.1582798957824707, + "step": 3293, + "token_acc": 0.28076239765178435 + }, + { + "epoch": 1.9311052477279391, + "grad_norm": 0.6016462719645413, + "learning_rate": 0.00019881791689626101, + "loss": 3.18418288230896, + "step": 3294, + "token_acc": 0.27669563157498034 + }, + { + "epoch": 1.931691586045148, + "grad_norm": 0.5180000945548677, + "learning_rate": 0.0001988164306025209, + "loss": 3.158323287963867, + "step": 3295, + "token_acc": 0.2817328122831182 + }, + { + "epoch": 1.9322779243623571, + "grad_norm": 0.6408493537146691, + "learning_rate": 0.00019881494338053361, + "loss": 3.1660799980163574, + "step": 3296, + "token_acc": 0.28048748919250543 + }, + { + "epoch": 1.932864262679566, + "grad_norm": 0.5210578365408927, + "learning_rate": 0.00019881345523031305, + "loss": 3.147230386734009, + "step": 3297, + "token_acc": 0.28441532785528556 + }, + { + "epoch": 1.9334506009967751, + "grad_norm": 0.42222538281236116, + "learning_rate": 0.00019881196615187326, + "loss": 3.145153045654297, + "step": 3298, + "token_acc": 0.2823341970077519 + }, + { + "epoch": 1.9340369393139842, + "grad_norm": 0.4735551531552477, + "learning_rate": 0.00019881047614522817, + "loss": 3.1301538944244385, + "step": 3299, + "token_acc": 0.2849992345711411 + }, + { + "epoch": 1.9346232776311933, + "grad_norm": 0.5227444240607516, + "learning_rate": 0.00019880898521039184, + "loss": 3.1397571563720703, + "step": 3300, + "token_acc": 0.28414099763452816 + }, + { + "epoch": 1.9352096159484022, + "grad_norm": 0.3761506949759527, + "learning_rate": 0.0001988074933473782, + "loss": 3.170996904373169, + "step": 3301, + "token_acc": 0.28052839287370296 + }, + { + "epoch": 1.9357959542656111, + "grad_norm": 0.41975710479807016, + "learning_rate": 0.00019880600055620135, + "loss": 3.1350131034851074, + "step": 3302, + "token_acc": 0.28523785384594863 + }, + { + "epoch": 1.9363822925828202, + "grad_norm": 0.3863424758899897, + "learning_rate": 0.00019880450683687525, + "loss": 3.1554925441741943, + "step": 3303, + "token_acc": 0.28064895879180607 + }, + { + "epoch": 1.9369686309000294, + "grad_norm": 0.415419385074687, + "learning_rate": 0.00019880301218941392, + "loss": 3.1456151008605957, + "step": 3304, + "token_acc": 0.284612005124684 + }, + { + "epoch": 1.9375549692172385, + "grad_norm": 0.396386282676236, + "learning_rate": 0.00019880151661383146, + "loss": 3.145545721054077, + "step": 3305, + "token_acc": 0.28296389916830456 + }, + { + "epoch": 1.9381413075344474, + "grad_norm": 0.34068428027016884, + "learning_rate": 0.00019880002011014186, + "loss": 3.1382217407226562, + "step": 3306, + "token_acc": 0.28485461297363274 + }, + { + "epoch": 1.9387276458516562, + "grad_norm": 0.3733837201393907, + "learning_rate": 0.00019879852267835926, + "loss": 3.1436362266540527, + "step": 3307, + "token_acc": 0.2833681594257234 + }, + { + "epoch": 1.9393139841688654, + "grad_norm": 0.4271428322944052, + "learning_rate": 0.00019879702431849762, + "loss": 3.169227123260498, + "step": 3308, + "token_acc": 0.27909203036542757 + }, + { + "epoch": 1.9399003224860745, + "grad_norm": 0.3735020631025754, + "learning_rate": 0.00019879552503057109, + "loss": 3.181687831878662, + "step": 3309, + "token_acc": 0.27802120971142197 + }, + { + "epoch": 1.9404866608032836, + "grad_norm": 0.3493616037545797, + "learning_rate": 0.00019879402481459373, + "loss": 3.1482725143432617, + "step": 3310, + "token_acc": 0.28244964462710653 + }, + { + "epoch": 1.9410729991204925, + "grad_norm": 0.3638311640730497, + "learning_rate": 0.0001987925236705796, + "loss": 3.1121773719787598, + "step": 3311, + "token_acc": 0.28694095177462997 + }, + { + "epoch": 1.9416593374377016, + "grad_norm": 0.4579330539231343, + "learning_rate": 0.0001987910215985429, + "loss": 3.148362398147583, + "step": 3312, + "token_acc": 0.28170394605061094 + }, + { + "epoch": 1.9422456757549105, + "grad_norm": 0.5126014948678351, + "learning_rate": 0.00019878951859849764, + "loss": 3.1898064613342285, + "step": 3313, + "token_acc": 0.27752203929105806 + }, + { + "epoch": 1.9428320140721196, + "grad_norm": 0.4775509119943933, + "learning_rate": 0.00019878801467045794, + "loss": 3.183704376220703, + "step": 3314, + "token_acc": 0.2772992384687874 + }, + { + "epoch": 1.9434183523893287, + "grad_norm": 0.3792946399382829, + "learning_rate": 0.000198786509814438, + "loss": 3.169325828552246, + "step": 3315, + "token_acc": 0.28031376746132874 + }, + { + "epoch": 1.9440046907065378, + "grad_norm": 0.5277176341625125, + "learning_rate": 0.00019878500403045193, + "loss": 3.1394429206848145, + "step": 3316, + "token_acc": 0.2827082241745049 + }, + { + "epoch": 1.9445910290237467, + "grad_norm": 0.48063624585973536, + "learning_rate": 0.00019878349731851383, + "loss": 3.1598827838897705, + "step": 3317, + "token_acc": 0.2785116999942403 + }, + { + "epoch": 1.9451773673409556, + "grad_norm": 0.39340284887613947, + "learning_rate": 0.0001987819896786379, + "loss": 3.1648051738739014, + "step": 3318, + "token_acc": 0.2801377908588172 + }, + { + "epoch": 1.9457637056581647, + "grad_norm": 0.5167857954480706, + "learning_rate": 0.00019878048111083828, + "loss": 3.1408488750457764, + "step": 3319, + "token_acc": 0.2840455056745914 + }, + { + "epoch": 1.9463500439753738, + "grad_norm": 0.43064435349633695, + "learning_rate": 0.00019877897161512916, + "loss": 3.149380922317505, + "step": 3320, + "token_acc": 0.28209120356176437 + }, + { + "epoch": 1.946936382292583, + "grad_norm": 0.39222523034198636, + "learning_rate": 0.0001987774611915247, + "loss": 3.087924003601074, + "step": 3321, + "token_acc": 0.29100672121566334 + }, + { + "epoch": 1.9475227206097918, + "grad_norm": 0.47295727228284346, + "learning_rate": 0.00019877594984003908, + "loss": 3.149669885635376, + "step": 3322, + "token_acc": 0.2809865064491827 + }, + { + "epoch": 1.948109058927001, + "grad_norm": 0.32591262465000376, + "learning_rate": 0.00019877443756068655, + "loss": 3.1068527698516846, + "step": 3323, + "token_acc": 0.2885880770954729 + }, + { + "epoch": 1.9486953972442098, + "grad_norm": 0.4387291187794354, + "learning_rate": 0.00019877292435348124, + "loss": 3.195404529571533, + "step": 3324, + "token_acc": 0.27587786983245816 + }, + { + "epoch": 1.949281735561419, + "grad_norm": 0.4497504094418033, + "learning_rate": 0.00019877141021843744, + "loss": 3.0984115600585938, + "step": 3325, + "token_acc": 0.29035156379190596 + }, + { + "epoch": 1.949868073878628, + "grad_norm": 0.49371039634781844, + "learning_rate": 0.0001987698951555693, + "loss": 3.1666388511657715, + "step": 3326, + "token_acc": 0.27935762843245027 + }, + { + "epoch": 1.9504544121958372, + "grad_norm": 0.4759427493389348, + "learning_rate": 0.0001987683791648911, + "loss": 3.1158556938171387, + "step": 3327, + "token_acc": 0.2856056083554776 + }, + { + "epoch": 1.951040750513046, + "grad_norm": 0.41721154036558145, + "learning_rate": 0.0001987668622464171, + "loss": 3.1539034843444824, + "step": 3328, + "token_acc": 0.28220472042139383 + }, + { + "epoch": 1.951627088830255, + "grad_norm": 0.4375019697132133, + "learning_rate": 0.0001987653444001615, + "loss": 3.151810646057129, + "step": 3329, + "token_acc": 0.28326284115060074 + }, + { + "epoch": 1.952213427147464, + "grad_norm": 0.4656794834872127, + "learning_rate": 0.00019876382562613855, + "loss": 3.1609416007995605, + "step": 3330, + "token_acc": 0.2799758578221472 + }, + { + "epoch": 1.9527997654646732, + "grad_norm": 0.43300670870512536, + "learning_rate": 0.0001987623059243626, + "loss": 3.1560091972351074, + "step": 3331, + "token_acc": 0.2809870713534952 + }, + { + "epoch": 1.9533861037818823, + "grad_norm": 0.40685596219884385, + "learning_rate": 0.00019876078529484784, + "loss": 3.098418712615967, + "step": 3332, + "token_acc": 0.28920361631235375 + }, + { + "epoch": 1.9539724420990912, + "grad_norm": 0.4838990784783711, + "learning_rate": 0.00019875926373760856, + "loss": 3.1322765350341797, + "step": 3333, + "token_acc": 0.28403591826440594 + }, + { + "epoch": 1.9545587804163, + "grad_norm": 0.41922430021704327, + "learning_rate": 0.00019875774125265911, + "loss": 3.144613742828369, + "step": 3334, + "token_acc": 0.28423512758693553 + }, + { + "epoch": 1.9551451187335092, + "grad_norm": 0.4577425714268011, + "learning_rate": 0.00019875621784001376, + "loss": 3.1639633178710938, + "step": 3335, + "token_acc": 0.2799635240142184 + }, + { + "epoch": 1.9557314570507183, + "grad_norm": 0.4800700659895863, + "learning_rate": 0.0001987546934996868, + "loss": 3.1706573963165283, + "step": 3336, + "token_acc": 0.27969955598776725 + }, + { + "epoch": 1.9563177953679274, + "grad_norm": 0.4314120027220838, + "learning_rate": 0.00019875316823169257, + "loss": 3.170544147491455, + "step": 3337, + "token_acc": 0.2772709211255529 + }, + { + "epoch": 1.9569041336851363, + "grad_norm": 0.4620926943256627, + "learning_rate": 0.0001987516420360454, + "loss": 3.1554579734802246, + "step": 3338, + "token_acc": 0.2813799386623303 + }, + { + "epoch": 1.9574904720023454, + "grad_norm": 0.5480097916324622, + "learning_rate": 0.00019875011491275963, + "loss": 3.132462978363037, + "step": 3339, + "token_acc": 0.28333226190093536 + }, + { + "epoch": 1.9580768103195543, + "grad_norm": 0.5265670614061894, + "learning_rate": 0.0001987485868618496, + "loss": 3.1695430278778076, + "step": 3340, + "token_acc": 0.27784372034166405 + }, + { + "epoch": 1.9586631486367634, + "grad_norm": 0.469388764590352, + "learning_rate": 0.00019874705788332965, + "loss": 3.171898365020752, + "step": 3341, + "token_acc": 0.2783012653291384 + }, + { + "epoch": 1.9592494869539725, + "grad_norm": 0.5154525378605237, + "learning_rate": 0.00019874552797721415, + "loss": 3.185338020324707, + "step": 3342, + "token_acc": 0.2791533016992286 + }, + { + "epoch": 1.9598358252711816, + "grad_norm": 0.48672872858945443, + "learning_rate": 0.00019874399714351747, + "loss": 3.162562608718872, + "step": 3343, + "token_acc": 0.2797225679097112 + }, + { + "epoch": 1.9604221635883905, + "grad_norm": 0.563296153780986, + "learning_rate": 0.00019874246538225402, + "loss": 3.175508737564087, + "step": 3344, + "token_acc": 0.27854774120885994 + }, + { + "epoch": 1.9610085019055994, + "grad_norm": 0.4600294746050792, + "learning_rate": 0.00019874093269343817, + "loss": 3.1723577976226807, + "step": 3345, + "token_acc": 0.28015407295684436 + }, + { + "epoch": 1.9615948402228085, + "grad_norm": 0.41544442130913456, + "learning_rate": 0.0001987393990770843, + "loss": 3.1486682891845703, + "step": 3346, + "token_acc": 0.2807905904542359 + }, + { + "epoch": 1.9621811785400176, + "grad_norm": 0.4325196323458287, + "learning_rate": 0.00019873786453320682, + "loss": 3.15328049659729, + "step": 3347, + "token_acc": 0.2827602383569081 + }, + { + "epoch": 1.9627675168572267, + "grad_norm": 0.4361981691380038, + "learning_rate": 0.00019873632906182017, + "loss": 3.1095821857452393, + "step": 3348, + "token_acc": 0.2869903838512278 + }, + { + "epoch": 1.9633538551744356, + "grad_norm": 0.39307651954305384, + "learning_rate": 0.00019873479266293873, + "loss": 3.1318202018737793, + "step": 3349, + "token_acc": 0.28561577858605425 + }, + { + "epoch": 1.9639401934916447, + "grad_norm": 0.38923621504452854, + "learning_rate": 0.00019873325533657698, + "loss": 3.1935160160064697, + "step": 3350, + "token_acc": 0.2759723291360192 + }, + { + "epoch": 1.9645265318088536, + "grad_norm": 0.4452045939892435, + "learning_rate": 0.00019873171708274936, + "loss": 3.127716541290283, + "step": 3351, + "token_acc": 0.28447302913317724 + }, + { + "epoch": 1.9651128701260627, + "grad_norm": 0.41973476067702986, + "learning_rate": 0.00019873017790147026, + "loss": 3.152920722961426, + "step": 3352, + "token_acc": 0.28289961935313973 + }, + { + "epoch": 1.9656992084432718, + "grad_norm": 0.43602126808209046, + "learning_rate": 0.0001987286377927542, + "loss": 3.1512486934661865, + "step": 3353, + "token_acc": 0.2825903589051495 + }, + { + "epoch": 1.966285546760481, + "grad_norm": 0.4422121562419161, + "learning_rate": 0.00019872709675661563, + "loss": 3.161752700805664, + "step": 3354, + "token_acc": 0.2802415432622347 + }, + { + "epoch": 1.9668718850776898, + "grad_norm": 0.48987517201512987, + "learning_rate": 0.000198725554793069, + "loss": 3.0940101146698, + "step": 3355, + "token_acc": 0.28962415129118946 + }, + { + "epoch": 1.9674582233948987, + "grad_norm": 0.457699278304656, + "learning_rate": 0.00019872401190212884, + "loss": 3.1350083351135254, + "step": 3356, + "token_acc": 0.28572879145779667 + }, + { + "epoch": 1.9680445617121078, + "grad_norm": 0.40411359416622067, + "learning_rate": 0.0001987224680838096, + "loss": 3.1508283615112305, + "step": 3357, + "token_acc": 0.2820381212072607 + }, + { + "epoch": 1.968630900029317, + "grad_norm": 0.5006961450935864, + "learning_rate": 0.00019872092333812584, + "loss": 3.1286706924438477, + "step": 3358, + "token_acc": 0.2856202527709176 + }, + { + "epoch": 1.969217238346526, + "grad_norm": 0.3607412408312874, + "learning_rate": 0.00019871937766509202, + "loss": 3.0948667526245117, + "step": 3359, + "token_acc": 0.2901008152145106 + }, + { + "epoch": 1.969803576663735, + "grad_norm": 0.4733421343166111, + "learning_rate": 0.00019871783106472263, + "loss": 3.1291213035583496, + "step": 3360, + "token_acc": 0.28497296286484547 + }, + { + "epoch": 1.9703899149809438, + "grad_norm": 0.4503017132922778, + "learning_rate": 0.0001987162835370323, + "loss": 3.182110071182251, + "step": 3361, + "token_acc": 0.2754765786435286 + }, + { + "epoch": 1.970976253298153, + "grad_norm": 0.41805652952339045, + "learning_rate": 0.00019871473508203548, + "loss": 3.175023317337036, + "step": 3362, + "token_acc": 0.27923016517375854 + }, + { + "epoch": 1.971562591615362, + "grad_norm": 0.39126822844907544, + "learning_rate": 0.00019871318569974675, + "loss": 3.1669154167175293, + "step": 3363, + "token_acc": 0.28012119405439856 + }, + { + "epoch": 1.9721489299325712, + "grad_norm": 0.41518041785061105, + "learning_rate": 0.00019871163539018064, + "loss": 3.149796962738037, + "step": 3364, + "token_acc": 0.2815502463349828 + }, + { + "epoch": 1.97273526824978, + "grad_norm": 0.3815359428472301, + "learning_rate": 0.00019871008415335174, + "loss": 3.1111063957214355, + "step": 3365, + "token_acc": 0.2875617283950617 + }, + { + "epoch": 1.9733216065669892, + "grad_norm": 0.3772803270741792, + "learning_rate": 0.00019870853198927465, + "loss": 3.130253791809082, + "step": 3366, + "token_acc": 0.2863953610411419 + }, + { + "epoch": 1.973907944884198, + "grad_norm": 0.4216104835740131, + "learning_rate": 0.00019870697889796385, + "loss": 3.1638271808624268, + "step": 3367, + "token_acc": 0.27893247800859944 + }, + { + "epoch": 1.9744942832014072, + "grad_norm": 0.42528007468966156, + "learning_rate": 0.00019870542487943405, + "loss": 3.1918272972106934, + "step": 3368, + "token_acc": 0.27538355524225167 + }, + { + "epoch": 1.9750806215186163, + "grad_norm": 0.40591839088801823, + "learning_rate": 0.00019870386993369973, + "loss": 3.091796875, + "step": 3369, + "token_acc": 0.2925289121409811 + }, + { + "epoch": 1.9756669598358254, + "grad_norm": 0.41318265937469273, + "learning_rate": 0.0001987023140607756, + "loss": 3.1552672386169434, + "step": 3370, + "token_acc": 0.2815559777337886 + }, + { + "epoch": 1.9762532981530343, + "grad_norm": 0.42682423412482684, + "learning_rate": 0.00019870075726067624, + "loss": 3.16373348236084, + "step": 3371, + "token_acc": 0.279953922833294 + }, + { + "epoch": 1.9768396364702432, + "grad_norm": 0.37378410161823516, + "learning_rate": 0.00019869919953341625, + "loss": 3.1154117584228516, + "step": 3372, + "token_acc": 0.28725539049979126 + }, + { + "epoch": 1.9774259747874523, + "grad_norm": 0.4600561649469349, + "learning_rate": 0.00019869764087901027, + "loss": 3.170482635498047, + "step": 3373, + "token_acc": 0.27974898297973255 + }, + { + "epoch": 1.9780123131046614, + "grad_norm": 0.4685646491125004, + "learning_rate": 0.00019869608129747298, + "loss": 3.1628518104553223, + "step": 3374, + "token_acc": 0.27904471748623205 + }, + { + "epoch": 1.9785986514218705, + "grad_norm": 0.43323598555098025, + "learning_rate": 0.000198694520788819, + "loss": 3.1561319828033447, + "step": 3375, + "token_acc": 0.2815776272251415 + }, + { + "epoch": 1.9791849897390794, + "grad_norm": 0.41106599945008765, + "learning_rate": 0.00019869295935306296, + "loss": 3.1661226749420166, + "step": 3376, + "token_acc": 0.27984305748059296 + }, + { + "epoch": 1.9797713280562885, + "grad_norm": 0.43136435540252194, + "learning_rate": 0.00019869139699021956, + "loss": 3.1784708499908447, + "step": 3377, + "token_acc": 0.277418543987401 + }, + { + "epoch": 1.9803576663734974, + "grad_norm": 0.3785295059250903, + "learning_rate": 0.00019868983370030348, + "loss": 3.1546106338500977, + "step": 3378, + "token_acc": 0.28043269355634626 + }, + { + "epoch": 1.9809440046907065, + "grad_norm": 0.38796815634361265, + "learning_rate": 0.0001986882694833294, + "loss": 3.1565778255462646, + "step": 3379, + "token_acc": 0.2815353324425814 + }, + { + "epoch": 1.9815303430079156, + "grad_norm": 0.46409959664125106, + "learning_rate": 0.000198686704339312, + "loss": 3.139758586883545, + "step": 3380, + "token_acc": 0.2824740250121675 + }, + { + "epoch": 1.9821166813251248, + "grad_norm": 0.5320599989592694, + "learning_rate": 0.000198685138268266, + "loss": 3.126986026763916, + "step": 3381, + "token_acc": 0.2844456209191035 + }, + { + "epoch": 1.9827030196423336, + "grad_norm": 0.4424768302535164, + "learning_rate": 0.00019868357127020612, + "loss": 3.149501323699951, + "step": 3382, + "token_acc": 0.28110199385533485 + }, + { + "epoch": 1.9832893579595425, + "grad_norm": 0.4068907625625445, + "learning_rate": 0.00019868200334514707, + "loss": 3.1123039722442627, + "step": 3383, + "token_acc": 0.28754924014405053 + }, + { + "epoch": 1.9838756962767516, + "grad_norm": 0.41291295574925646, + "learning_rate": 0.00019868043449310357, + "loss": 3.161336660385132, + "step": 3384, + "token_acc": 0.2824079239985364 + }, + { + "epoch": 1.9844620345939608, + "grad_norm": 0.5100690021328849, + "learning_rate": 0.00019867886471409033, + "loss": 3.113718032836914, + "step": 3385, + "token_acc": 0.28607224676812304 + }, + { + "epoch": 1.9850483729111699, + "grad_norm": 0.4253595109390804, + "learning_rate": 0.00019867729400812215, + "loss": 3.158912420272827, + "step": 3386, + "token_acc": 0.28058674159910285 + }, + { + "epoch": 1.9856347112283788, + "grad_norm": 0.47414792190475336, + "learning_rate": 0.00019867572237521376, + "loss": 3.140915870666504, + "step": 3387, + "token_acc": 0.2838225022814289 + }, + { + "epoch": 1.9862210495455876, + "grad_norm": 0.5041684332202685, + "learning_rate": 0.00019867414981537994, + "loss": 3.1242451667785645, + "step": 3388, + "token_acc": 0.2864778810267547 + }, + { + "epoch": 1.9868073878627968, + "grad_norm": 0.31775098694333365, + "learning_rate": 0.00019867257632863545, + "loss": 3.145717144012451, + "step": 3389, + "token_acc": 0.28361240805884236 + }, + { + "epoch": 1.9873937261800059, + "grad_norm": 0.4548406079373691, + "learning_rate": 0.00019867100191499505, + "loss": 3.1325159072875977, + "step": 3390, + "token_acc": 0.28403392041748204 + }, + { + "epoch": 1.987980064497215, + "grad_norm": 0.4459615977449683, + "learning_rate": 0.00019866942657447356, + "loss": 3.1439404487609863, + "step": 3391, + "token_acc": 0.28202944397128604 + }, + { + "epoch": 1.9885664028144239, + "grad_norm": 0.3444542004327783, + "learning_rate": 0.00019866785030708576, + "loss": 3.1276817321777344, + "step": 3392, + "token_acc": 0.2840370166432796 + }, + { + "epoch": 1.989152741131633, + "grad_norm": 0.4310743480927085, + "learning_rate": 0.00019866627311284645, + "loss": 3.1900343894958496, + "step": 3393, + "token_acc": 0.2770741904949914 + }, + { + "epoch": 1.9897390794488419, + "grad_norm": 0.3660813583455183, + "learning_rate": 0.0001986646949917705, + "loss": 3.1720938682556152, + "step": 3394, + "token_acc": 0.27853221239349224 + }, + { + "epoch": 1.990325417766051, + "grad_norm": 0.4419095801887838, + "learning_rate": 0.00019866311594387263, + "loss": 3.148575782775879, + "step": 3395, + "token_acc": 0.2823388978685363 + }, + { + "epoch": 1.99091175608326, + "grad_norm": 0.3771321482302679, + "learning_rate": 0.0001986615359691678, + "loss": 3.161640167236328, + "step": 3396, + "token_acc": 0.28125916460681216 + }, + { + "epoch": 1.9914980944004692, + "grad_norm": 0.3976610643672019, + "learning_rate": 0.00019865995506767074, + "loss": 3.1200175285339355, + "step": 3397, + "token_acc": 0.2871431249804702 + }, + { + "epoch": 1.992084432717678, + "grad_norm": 0.31356070794172497, + "learning_rate": 0.0001986583732393964, + "loss": 3.149045944213867, + "step": 3398, + "token_acc": 0.2845456971942661 + }, + { + "epoch": 1.992670771034887, + "grad_norm": 0.3957748313816249, + "learning_rate": 0.00019865679048435952, + "loss": 3.183539390563965, + "step": 3399, + "token_acc": 0.27813685026502694 + }, + { + "epoch": 1.993257109352096, + "grad_norm": 0.4635986896605718, + "learning_rate": 0.00019865520680257507, + "loss": 3.182753562927246, + "step": 3400, + "token_acc": 0.2778512426403186 + }, + { + "epoch": 1.9938434476693052, + "grad_norm": 0.36538791459746844, + "learning_rate": 0.0001986536221940579, + "loss": 3.1634182929992676, + "step": 3401, + "token_acc": 0.27964701758102634 + }, + { + "epoch": 1.9944297859865143, + "grad_norm": 0.45411599852976825, + "learning_rate": 0.00019865203665882288, + "loss": 3.1822521686553955, + "step": 3402, + "token_acc": 0.27824841501106046 + }, + { + "epoch": 1.9950161243037232, + "grad_norm": 0.5380577160784337, + "learning_rate": 0.00019865045019688488, + "loss": 3.170513153076172, + "step": 3403, + "token_acc": 0.27743396541467946 + }, + { + "epoch": 1.9956024626209323, + "grad_norm": 0.46977869056729854, + "learning_rate": 0.00019864886280825887, + "loss": 3.1478023529052734, + "step": 3404, + "token_acc": 0.28211187162501444 + }, + { + "epoch": 1.9961888009381412, + "grad_norm": 0.4067343623388572, + "learning_rate": 0.00019864727449295973, + "loss": 3.167191982269287, + "step": 3405, + "token_acc": 0.2790123591045766 + }, + { + "epoch": 1.9967751392553503, + "grad_norm": 0.5719470991780657, + "learning_rate": 0.00019864568525100235, + "loss": 3.1479108333587646, + "step": 3406, + "token_acc": 0.2808350046251519 + }, + { + "epoch": 1.9973614775725594, + "grad_norm": 0.5640755107525108, + "learning_rate": 0.0001986440950824017, + "loss": 3.1524441242218018, + "step": 3407, + "token_acc": 0.2820435300396182 + }, + { + "epoch": 1.9979478158897686, + "grad_norm": 0.41340584585630324, + "learning_rate": 0.0001986425039871727, + "loss": 3.1483821868896484, + "step": 3408, + "token_acc": 0.28315104139120134 + }, + { + "epoch": 1.9985341542069774, + "grad_norm": 0.451575231052006, + "learning_rate": 0.0001986409119653303, + "loss": 3.119262218475342, + "step": 3409, + "token_acc": 0.28730103037007076 + }, + { + "epoch": 1.9991204925241863, + "grad_norm": 0.43381716125735736, + "learning_rate": 0.00019863931901688942, + "loss": 3.137439727783203, + "step": 3410, + "token_acc": 0.28318712429290516 + }, + { + "epoch": 1.9997068308413954, + "grad_norm": 0.4454884243532071, + "learning_rate": 0.00019863772514186508, + "loss": 3.137871265411377, + "step": 3411, + "token_acc": 0.2824936790643489 + }, + { + "epoch": 2.0, + "grad_norm": 0.5569415830635872, + "learning_rate": 0.00019863613034027224, + "loss": 3.1297950744628906, + "step": 3412, + "token_acc": 0.2860031799662129 + }, + { + "epoch": 2.0, + "eval_loss": 3.125737190246582, + "eval_runtime": 16.5539, + "eval_samples_per_second": 15.465, + "eval_steps_per_second": 1.933, + "eval_token_acc": 0.284939371791372, + "step": 3412 + }, + { + "epoch": 2.000586338317209, + "grad_norm": 0.49998177441428876, + "learning_rate": 0.00019863453461212586, + "loss": 3.0684006214141846, + "step": 3413, + "token_acc": 0.2918300454927708 + }, + { + "epoch": 2.0011726766344182, + "grad_norm": 0.5058384648990105, + "learning_rate": 0.00019863293795744093, + "loss": 3.1008150577545166, + "step": 3414, + "token_acc": 0.28668079556569936 + }, + { + "epoch": 2.001759014951627, + "grad_norm": 0.512666475312042, + "learning_rate": 0.00019863134037623246, + "loss": 3.144676446914673, + "step": 3415, + "token_acc": 0.28229278178549544 + }, + { + "epoch": 2.002345353268836, + "grad_norm": 0.4595128529384985, + "learning_rate": 0.00019862974186851548, + "loss": 3.1008362770080566, + "step": 3416, + "token_acc": 0.2864899099397408 + }, + { + "epoch": 2.002931691586045, + "grad_norm": 0.4934855750548127, + "learning_rate": 0.00019862814243430497, + "loss": 3.083914279937744, + "step": 3417, + "token_acc": 0.2897059667914207 + }, + { + "epoch": 2.0035180299032542, + "grad_norm": 0.4384831085729346, + "learning_rate": 0.00019862654207361595, + "loss": 3.0431952476501465, + "step": 3418, + "token_acc": 0.295449011206499 + }, + { + "epoch": 2.0041043682204633, + "grad_norm": 0.5781943659392206, + "learning_rate": 0.00019862494078646346, + "loss": 3.1228113174438477, + "step": 3419, + "token_acc": 0.284195790366606 + }, + { + "epoch": 2.0046907065376725, + "grad_norm": 0.4053322140989937, + "learning_rate": 0.00019862333857286258, + "loss": 3.0822365283966064, + "step": 3420, + "token_acc": 0.28940557612319917 + }, + { + "epoch": 2.005277044854881, + "grad_norm": 0.41631005009025635, + "learning_rate": 0.0001986217354328283, + "loss": 3.0864272117614746, + "step": 3421, + "token_acc": 0.2897685593697841 + }, + { + "epoch": 2.0058633831720902, + "grad_norm": 0.4859054693647988, + "learning_rate": 0.00019862013136637575, + "loss": 3.0980827808380127, + "step": 3422, + "token_acc": 0.290450429600486 + }, + { + "epoch": 2.0064497214892993, + "grad_norm": 0.4716793129928112, + "learning_rate": 0.00019861852637351993, + "loss": 3.0215601921081543, + "step": 3423, + "token_acc": 0.2977535771448794 + }, + { + "epoch": 2.0070360598065085, + "grad_norm": 0.3894036170417833, + "learning_rate": 0.00019861692045427593, + "loss": 3.0676109790802, + "step": 3424, + "token_acc": 0.2916630855178341 + }, + { + "epoch": 2.0076223981237176, + "grad_norm": 0.49506202086101725, + "learning_rate": 0.00019861531360865887, + "loss": 3.062096118927002, + "step": 3425, + "token_acc": 0.29257836496758355 + }, + { + "epoch": 2.0082087364409262, + "grad_norm": 0.5123133372513321, + "learning_rate": 0.00019861370583668385, + "loss": 3.070327043533325, + "step": 3426, + "token_acc": 0.2919775506934651 + }, + { + "epoch": 2.0087950747581353, + "grad_norm": 0.3602691345753264, + "learning_rate": 0.0001986120971383659, + "loss": 3.081812858581543, + "step": 3427, + "token_acc": 0.28976210521496504 + }, + { + "epoch": 2.0093814130753445, + "grad_norm": 0.4063315734456028, + "learning_rate": 0.0001986104875137202, + "loss": 3.067838430404663, + "step": 3428, + "token_acc": 0.29133206102777987 + }, + { + "epoch": 2.0099677513925536, + "grad_norm": 0.4386921411857858, + "learning_rate": 0.00019860887696276184, + "loss": 3.0660181045532227, + "step": 3429, + "token_acc": 0.2910207190101613 + }, + { + "epoch": 2.0105540897097627, + "grad_norm": 0.3945006421655463, + "learning_rate": 0.00019860726548550598, + "loss": 3.036353826522827, + "step": 3430, + "token_acc": 0.29546336597042605 + }, + { + "epoch": 2.0111404280269713, + "grad_norm": 0.3728931073773427, + "learning_rate": 0.00019860565308196774, + "loss": 3.0315330028533936, + "step": 3431, + "token_acc": 0.2956075014443872 + }, + { + "epoch": 2.0117267663441805, + "grad_norm": 0.3634816282563962, + "learning_rate": 0.0001986040397521622, + "loss": 3.0377249717712402, + "step": 3432, + "token_acc": 0.2943529859744875 + }, + { + "epoch": 2.0123131046613896, + "grad_norm": 0.4164182457161111, + "learning_rate": 0.00019860242549610464, + "loss": 3.0665740966796875, + "step": 3433, + "token_acc": 0.2913614950713841 + }, + { + "epoch": 2.0128994429785987, + "grad_norm": 0.5309312411672618, + "learning_rate": 0.0001986008103138101, + "loss": 3.046165943145752, + "step": 3434, + "token_acc": 0.2952960849782499 + }, + { + "epoch": 2.013485781295808, + "grad_norm": 0.48631486628813625, + "learning_rate": 0.0001985991942052939, + "loss": 3.0645008087158203, + "step": 3435, + "token_acc": 0.29291761998940036 + }, + { + "epoch": 2.014072119613017, + "grad_norm": 0.43106715431796166, + "learning_rate": 0.00019859757717057108, + "loss": 3.063448667526245, + "step": 3436, + "token_acc": 0.29248950728223944 + }, + { + "epoch": 2.0146584579302256, + "grad_norm": 0.5491545871820467, + "learning_rate": 0.0001985959592096569, + "loss": 3.0395636558532715, + "step": 3437, + "token_acc": 0.29448824328339207 + }, + { + "epoch": 2.0152447962474347, + "grad_norm": 0.43026686914734347, + "learning_rate": 0.00019859434032256653, + "loss": 3.0461792945861816, + "step": 3438, + "token_acc": 0.29394151291843723 + }, + { + "epoch": 2.015831134564644, + "grad_norm": 0.4737817193480008, + "learning_rate": 0.0001985927205093152, + "loss": 3.0697813034057617, + "step": 3439, + "token_acc": 0.28973387451513577 + }, + { + "epoch": 2.016417472881853, + "grad_norm": 0.5152683331568874, + "learning_rate": 0.00019859109976991813, + "loss": 3.0762979984283447, + "step": 3440, + "token_acc": 0.29219706265867046 + }, + { + "epoch": 2.017003811199062, + "grad_norm": 0.37020473520260655, + "learning_rate": 0.00019858947810439052, + "loss": 3.063088893890381, + "step": 3441, + "token_acc": 0.2914163203726741 + }, + { + "epoch": 2.0175901495162707, + "grad_norm": 0.46565884157360854, + "learning_rate": 0.0001985878555127476, + "loss": 3.0816354751586914, + "step": 3442, + "token_acc": 0.2916014828594348 + }, + { + "epoch": 2.01817648783348, + "grad_norm": 0.40670115880252516, + "learning_rate": 0.0001985862319950046, + "loss": 3.1309866905212402, + "step": 3443, + "token_acc": 0.281848419868346 + }, + { + "epoch": 2.018762826150689, + "grad_norm": 0.3987171154986315, + "learning_rate": 0.00019858460755117684, + "loss": 3.087066650390625, + "step": 3444, + "token_acc": 0.28855074872693753 + }, + { + "epoch": 2.019349164467898, + "grad_norm": 0.4644166502967986, + "learning_rate": 0.00019858298218127955, + "loss": 3.088263511657715, + "step": 3445, + "token_acc": 0.288081842214614 + }, + { + "epoch": 2.019935502785107, + "grad_norm": 0.4039692632761868, + "learning_rate": 0.00019858135588532796, + "loss": 3.0819506645202637, + "step": 3446, + "token_acc": 0.2898649526896724 + }, + { + "epoch": 2.0205218411023163, + "grad_norm": 0.4255264216949989, + "learning_rate": 0.00019857972866333737, + "loss": 3.0539302825927734, + "step": 3447, + "token_acc": 0.29272292467946565 + }, + { + "epoch": 2.021108179419525, + "grad_norm": 0.32938719001048683, + "learning_rate": 0.00019857810051532307, + "loss": 3.023671865463257, + "step": 3448, + "token_acc": 0.2993614499292624 + }, + { + "epoch": 2.021694517736734, + "grad_norm": 0.3666656134199677, + "learning_rate": 0.00019857647144130036, + "loss": 3.032489061355591, + "step": 3449, + "token_acc": 0.29779452364550413 + }, + { + "epoch": 2.022280856053943, + "grad_norm": 0.3731779593142578, + "learning_rate": 0.00019857484144128451, + "loss": 3.063467025756836, + "step": 3450, + "token_acc": 0.29030540042911274 + }, + { + "epoch": 2.0228671943711523, + "grad_norm": 0.3733023019833434, + "learning_rate": 0.00019857321051529087, + "loss": 3.065704584121704, + "step": 3451, + "token_acc": 0.29059855505190363 + }, + { + "epoch": 2.0234535326883614, + "grad_norm": 0.41854461610494786, + "learning_rate": 0.00019857157866333479, + "loss": 3.117169141769409, + "step": 3452, + "token_acc": 0.28622212213580833 + }, + { + "epoch": 2.02403987100557, + "grad_norm": 0.383719543772172, + "learning_rate": 0.0001985699458854315, + "loss": 2.9990878105163574, + "step": 3453, + "token_acc": 0.3008129651276572 + }, + { + "epoch": 2.024626209322779, + "grad_norm": 0.3909280486530855, + "learning_rate": 0.0001985683121815964, + "loss": 3.1234984397888184, + "step": 3454, + "token_acc": 0.28396082501690095 + }, + { + "epoch": 2.0252125476399883, + "grad_norm": 0.4681280032661688, + "learning_rate": 0.00019856667755184483, + "loss": 3.0707132816314697, + "step": 3455, + "token_acc": 0.289514245679589 + }, + { + "epoch": 2.0257988859571974, + "grad_norm": 0.3912012569231109, + "learning_rate": 0.00019856504199619213, + "loss": 3.0272216796875, + "step": 3456, + "token_acc": 0.2984804400458735 + }, + { + "epoch": 2.0263852242744065, + "grad_norm": 0.3685152149409096, + "learning_rate": 0.00019856340551465375, + "loss": 3.0364389419555664, + "step": 3457, + "token_acc": 0.29583195774182897 + }, + { + "epoch": 2.026971562591615, + "grad_norm": 0.377225996263772, + "learning_rate": 0.00019856176810724492, + "loss": 3.0778818130493164, + "step": 3458, + "token_acc": 0.28806333260880385 + }, + { + "epoch": 2.0275579009088243, + "grad_norm": 0.46385651039535053, + "learning_rate": 0.0001985601297739811, + "loss": 2.998626947402954, + "step": 3459, + "token_acc": 0.30145061915386256 + }, + { + "epoch": 2.0281442392260334, + "grad_norm": 0.48582397782206593, + "learning_rate": 0.0001985584905148777, + "loss": 3.0886669158935547, + "step": 3460, + "token_acc": 0.2898088681087191 + }, + { + "epoch": 2.0287305775432425, + "grad_norm": 0.5842700349621689, + "learning_rate": 0.00019855685032995005, + "loss": 3.066650390625, + "step": 3461, + "token_acc": 0.29034488409968556 + }, + { + "epoch": 2.0293169158604516, + "grad_norm": 0.5087797994759428, + "learning_rate": 0.00019855520921921365, + "loss": 3.0560476779937744, + "step": 3462, + "token_acc": 0.29298299146153484 + }, + { + "epoch": 2.0299032541776607, + "grad_norm": 0.4210059422162485, + "learning_rate": 0.00019855356718268384, + "loss": 3.0664210319519043, + "step": 3463, + "token_acc": 0.2936837703651165 + }, + { + "epoch": 2.0304895924948694, + "grad_norm": 0.4591620472515439, + "learning_rate": 0.00019855192422037608, + "loss": 3.0583577156066895, + "step": 3464, + "token_acc": 0.2933609325859828 + }, + { + "epoch": 2.0310759308120785, + "grad_norm": 0.5921027615980743, + "learning_rate": 0.00019855028033230576, + "loss": 3.0732574462890625, + "step": 3465, + "token_acc": 0.2918837444648508 + }, + { + "epoch": 2.0316622691292876, + "grad_norm": 0.5137673535451546, + "learning_rate": 0.00019854863551848836, + "loss": 3.0678906440734863, + "step": 3466, + "token_acc": 0.2931201793793116 + }, + { + "epoch": 2.0322486074464967, + "grad_norm": 0.3943921958960617, + "learning_rate": 0.00019854698977893938, + "loss": 3.0622215270996094, + "step": 3467, + "token_acc": 0.2919001417793718 + }, + { + "epoch": 2.032834945763706, + "grad_norm": 0.4995869652818478, + "learning_rate": 0.00019854534311367417, + "loss": 3.094608783721924, + "step": 3468, + "token_acc": 0.28634284886548234 + }, + { + "epoch": 2.0334212840809145, + "grad_norm": 0.533878743324846, + "learning_rate": 0.00019854369552270827, + "loss": 3.0816850662231445, + "step": 3469, + "token_acc": 0.28921093562263495 + }, + { + "epoch": 2.0340076223981236, + "grad_norm": 0.40059612158034985, + "learning_rate": 0.00019854204700605715, + "loss": 3.0667741298675537, + "step": 3470, + "token_acc": 0.2913932884513108 + }, + { + "epoch": 2.0345939607153327, + "grad_norm": 0.3892015949638416, + "learning_rate": 0.00019854039756373622, + "loss": 3.0442728996276855, + "step": 3471, + "token_acc": 0.29434659742643227 + }, + { + "epoch": 2.035180299032542, + "grad_norm": 0.4033420378507342, + "learning_rate": 0.0001985387471957611, + "loss": 3.072866439819336, + "step": 3472, + "token_acc": 0.2899768014355261 + }, + { + "epoch": 2.035766637349751, + "grad_norm": 0.43480898610428975, + "learning_rate": 0.00019853709590214727, + "loss": 3.086298942565918, + "step": 3473, + "token_acc": 0.2895063902065166 + }, + { + "epoch": 2.03635297566696, + "grad_norm": 0.41313741237025026, + "learning_rate": 0.00019853544368291014, + "loss": 3.0476536750793457, + "step": 3474, + "token_acc": 0.2943198957719525 + }, + { + "epoch": 2.0369393139841687, + "grad_norm": 0.34062561269281616, + "learning_rate": 0.0001985337905380653, + "loss": 3.030444622039795, + "step": 3475, + "token_acc": 0.2968849954268857 + }, + { + "epoch": 2.037525652301378, + "grad_norm": 0.4832973665437105, + "learning_rate": 0.00019853213646762828, + "loss": 3.0658798217773438, + "step": 3476, + "token_acc": 0.29068780439834585 + }, + { + "epoch": 2.038111990618587, + "grad_norm": 0.3406440089866578, + "learning_rate": 0.00019853048147161465, + "loss": 3.0281901359558105, + "step": 3477, + "token_acc": 0.294366284410772 + }, + { + "epoch": 2.038698328935796, + "grad_norm": 0.34721901517972675, + "learning_rate": 0.00019852882555003988, + "loss": 3.0644588470458984, + "step": 3478, + "token_acc": 0.2929816358199139 + }, + { + "epoch": 2.039284667253005, + "grad_norm": 0.3278211538544744, + "learning_rate": 0.00019852716870291957, + "loss": 3.0754313468933105, + "step": 3479, + "token_acc": 0.29226345761473105 + }, + { + "epoch": 2.039871005570214, + "grad_norm": 0.36777062679505923, + "learning_rate": 0.00019852551093026927, + "loss": 3.0932412147521973, + "step": 3480, + "token_acc": 0.2874230559476872 + }, + { + "epoch": 2.040457343887423, + "grad_norm": 0.3582524845269563, + "learning_rate": 0.00019852385223210459, + "loss": 3.059009075164795, + "step": 3481, + "token_acc": 0.2947999550255073 + }, + { + "epoch": 2.041043682204632, + "grad_norm": 0.3687789468112822, + "learning_rate": 0.00019852219260844105, + "loss": 3.057910919189453, + "step": 3482, + "token_acc": 0.292932579009593 + }, + { + "epoch": 2.041630020521841, + "grad_norm": 0.41719645953801243, + "learning_rate": 0.00019852053205929426, + "loss": 3.035706043243408, + "step": 3483, + "token_acc": 0.29699823927248126 + }, + { + "epoch": 2.0422163588390503, + "grad_norm": 0.4020654247008344, + "learning_rate": 0.00019851887058467985, + "loss": 3.092768669128418, + "step": 3484, + "token_acc": 0.28870452936573976 + }, + { + "epoch": 2.042802697156259, + "grad_norm": 0.4206807940837558, + "learning_rate": 0.00019851720818461343, + "loss": 3.0886054039001465, + "step": 3485, + "token_acc": 0.28889771778598083 + }, + { + "epoch": 2.043389035473468, + "grad_norm": 0.4154958071402737, + "learning_rate": 0.00019851554485911054, + "loss": 3.057370185852051, + "step": 3486, + "token_acc": 0.29132255749434965 + }, + { + "epoch": 2.043975373790677, + "grad_norm": 0.410118890770655, + "learning_rate": 0.0001985138806081869, + "loss": 3.0689868927001953, + "step": 3487, + "token_acc": 0.2920596962047739 + }, + { + "epoch": 2.0445617121078863, + "grad_norm": 0.5391771603918918, + "learning_rate": 0.0001985122154318581, + "loss": 3.072880744934082, + "step": 3488, + "token_acc": 0.29119332693132827 + }, + { + "epoch": 2.0451480504250954, + "grad_norm": 0.4158624111171172, + "learning_rate": 0.00019851054933013975, + "loss": 3.0308005809783936, + "step": 3489, + "token_acc": 0.29695606072361447 + }, + { + "epoch": 2.0457343887423045, + "grad_norm": 0.37581621398878684, + "learning_rate": 0.00019850888230304756, + "loss": 3.028323173522949, + "step": 3490, + "token_acc": 0.29698904068610427 + }, + { + "epoch": 2.046320727059513, + "grad_norm": 0.43321913811391444, + "learning_rate": 0.00019850721435059717, + "loss": 3.059274196624756, + "step": 3491, + "token_acc": 0.2940401405451448 + }, + { + "epoch": 2.0469070653767223, + "grad_norm": 0.5072053974047472, + "learning_rate": 0.0001985055454728042, + "loss": 3.0779662132263184, + "step": 3492, + "token_acc": 0.2899087070220951 + }, + { + "epoch": 2.0474934036939314, + "grad_norm": 0.4286230359049069, + "learning_rate": 0.00019850387566968443, + "loss": 3.050110340118408, + "step": 3493, + "token_acc": 0.29475422006370006 + }, + { + "epoch": 2.0480797420111405, + "grad_norm": 0.4311795484638781, + "learning_rate": 0.00019850220494125345, + "loss": 3.088566303253174, + "step": 3494, + "token_acc": 0.28928430270458944 + }, + { + "epoch": 2.0486660803283496, + "grad_norm": 0.5333324649753729, + "learning_rate": 0.00019850053328752699, + "loss": 3.0589442253112793, + "step": 3495, + "token_acc": 0.2923652698479981 + }, + { + "epoch": 2.0492524186455583, + "grad_norm": 0.3646003012931469, + "learning_rate": 0.00019849886070852073, + "loss": 3.0413968563079834, + "step": 3496, + "token_acc": 0.29438729036060657 + }, + { + "epoch": 2.0498387569627674, + "grad_norm": 0.4564982574744297, + "learning_rate": 0.00019849718720425043, + "loss": 3.089567184448242, + "step": 3497, + "token_acc": 0.28769977714926864 + }, + { + "epoch": 2.0504250952799765, + "grad_norm": 0.4059631300798474, + "learning_rate": 0.00019849551277473175, + "loss": 3.08150053024292, + "step": 3498, + "token_acc": 0.28999146906990986 + }, + { + "epoch": 2.0510114335971856, + "grad_norm": 0.425047068153382, + "learning_rate": 0.0001984938374199805, + "loss": 3.053403854370117, + "step": 3499, + "token_acc": 0.2946768459578217 + }, + { + "epoch": 2.0515977719143947, + "grad_norm": 0.46989438353270113, + "learning_rate": 0.00019849216114001234, + "loss": 3.061589241027832, + "step": 3500, + "token_acc": 0.2916836650093831 + }, + { + "epoch": 2.052184110231604, + "grad_norm": 0.3759395336356404, + "learning_rate": 0.00019849048393484305, + "loss": 3.0550172328948975, + "step": 3501, + "token_acc": 0.29278502437383536 + }, + { + "epoch": 2.0527704485488125, + "grad_norm": 0.4078171189552231, + "learning_rate": 0.00019848880580448838, + "loss": 3.027357578277588, + "step": 3502, + "token_acc": 0.29764322521976355 + }, + { + "epoch": 2.0533567868660216, + "grad_norm": 0.3494611134069768, + "learning_rate": 0.0001984871267489641, + "loss": 3.0791642665863037, + "step": 3503, + "token_acc": 0.28908811097408005 + }, + { + "epoch": 2.0539431251832307, + "grad_norm": 0.42854641975385266, + "learning_rate": 0.00019848544676828595, + "loss": 3.0633440017700195, + "step": 3504, + "token_acc": 0.2923390811811173 + }, + { + "epoch": 2.05452946350044, + "grad_norm": 0.34141891620291975, + "learning_rate": 0.00019848376586246977, + "loss": 3.0867271423339844, + "step": 3505, + "token_acc": 0.28649241731234837 + }, + { + "epoch": 2.055115801817649, + "grad_norm": 0.4106934820384049, + "learning_rate": 0.00019848208403153131, + "loss": 3.045044422149658, + "step": 3506, + "token_acc": 0.29336567278454706 + }, + { + "epoch": 2.0557021401348576, + "grad_norm": 0.423858969468152, + "learning_rate": 0.0001984804012754864, + "loss": 3.0614547729492188, + "step": 3507, + "token_acc": 0.2937394146002519 + }, + { + "epoch": 2.0562884784520667, + "grad_norm": 0.3661184348548931, + "learning_rate": 0.00019847871759435078, + "loss": 3.078434705734253, + "step": 3508, + "token_acc": 0.29179172681670423 + }, + { + "epoch": 2.056874816769276, + "grad_norm": 0.37746873292411587, + "learning_rate": 0.00019847703298814034, + "loss": 3.074838399887085, + "step": 3509, + "token_acc": 0.2877032958018007 + }, + { + "epoch": 2.057461155086485, + "grad_norm": 0.4806787404143008, + "learning_rate": 0.00019847534745687085, + "loss": 3.074542999267578, + "step": 3510, + "token_acc": 0.2896674140982699 + }, + { + "epoch": 2.058047493403694, + "grad_norm": 0.44611190387507604, + "learning_rate": 0.0001984736610005582, + "loss": 3.037825345993042, + "step": 3511, + "token_acc": 0.29517590067757393 + }, + { + "epoch": 2.0586338317209028, + "grad_norm": 0.503512057534701, + "learning_rate": 0.00019847197361921818, + "loss": 3.0500144958496094, + "step": 3512, + "token_acc": 0.29422766432879965 + }, + { + "epoch": 2.059220170038112, + "grad_norm": 0.46373397802087224, + "learning_rate": 0.00019847028531286666, + "loss": 3.036174774169922, + "step": 3513, + "token_acc": 0.2946828143935494 + }, + { + "epoch": 2.059806508355321, + "grad_norm": 0.4001270520429139, + "learning_rate": 0.0001984685960815195, + "loss": 3.0489931106567383, + "step": 3514, + "token_acc": 0.29345834367458584 + }, + { + "epoch": 2.06039284667253, + "grad_norm": 0.48306416006728575, + "learning_rate": 0.0001984669059251926, + "loss": 3.057803153991699, + "step": 3515, + "token_acc": 0.29279026266844005 + }, + { + "epoch": 2.060979184989739, + "grad_norm": 0.4813301321880618, + "learning_rate": 0.00019846521484390177, + "loss": 3.0901288986206055, + "step": 3516, + "token_acc": 0.28752015242561924 + }, + { + "epoch": 2.0615655233069483, + "grad_norm": 0.3701617411106914, + "learning_rate": 0.0001984635228376629, + "loss": 3.0545034408569336, + "step": 3517, + "token_acc": 0.294136307132062 + }, + { + "epoch": 2.062151861624157, + "grad_norm": 0.39510654241557325, + "learning_rate": 0.00019846182990649198, + "loss": 3.0678539276123047, + "step": 3518, + "token_acc": 0.29048785186987913 + }, + { + "epoch": 2.062738199941366, + "grad_norm": 0.3848599230601269, + "learning_rate": 0.00019846013605040482, + "loss": 3.051362991333008, + "step": 3519, + "token_acc": 0.29433504343937333 + }, + { + "epoch": 2.063324538258575, + "grad_norm": 0.47196422224442947, + "learning_rate": 0.00019845844126941734, + "loss": 3.066368341445923, + "step": 3520, + "token_acc": 0.29149874039342605 + }, + { + "epoch": 2.0639108765757843, + "grad_norm": 0.41602885467357575, + "learning_rate": 0.0001984567455635455, + "loss": 3.0600249767303467, + "step": 3521, + "token_acc": 0.2937404009241823 + }, + { + "epoch": 2.0644972148929934, + "grad_norm": 0.3666488477481219, + "learning_rate": 0.0001984550489328052, + "loss": 3.0707387924194336, + "step": 3522, + "token_acc": 0.29189663987833814 + }, + { + "epoch": 2.065083553210202, + "grad_norm": 0.39247047727059314, + "learning_rate": 0.0001984533513772124, + "loss": 3.0812063217163086, + "step": 3523, + "token_acc": 0.2898525144189089 + }, + { + "epoch": 2.065669891527411, + "grad_norm": 0.4739724032491044, + "learning_rate": 0.000198451652896783, + "loss": 3.0333967208862305, + "step": 3524, + "token_acc": 0.29536122574097257 + }, + { + "epoch": 2.0662562298446203, + "grad_norm": 0.4046881247518256, + "learning_rate": 0.00019844995349153303, + "loss": 3.118180751800537, + "step": 3525, + "token_acc": 0.28439861418010204 + }, + { + "epoch": 2.0668425681618294, + "grad_norm": 0.41100666233856453, + "learning_rate": 0.00019844825316147837, + "loss": 3.0766966342926025, + "step": 3526, + "token_acc": 0.2892769383593873 + }, + { + "epoch": 2.0674289064790385, + "grad_norm": 0.46547796178567485, + "learning_rate": 0.00019844655190663505, + "loss": 3.047614097595215, + "step": 3527, + "token_acc": 0.294341582500903 + }, + { + "epoch": 2.068015244796247, + "grad_norm": 0.35337948019950033, + "learning_rate": 0.00019844484972701904, + "loss": 3.045591354370117, + "step": 3528, + "token_acc": 0.29449108994356576 + }, + { + "epoch": 2.0686015831134563, + "grad_norm": 0.4224475103668469, + "learning_rate": 0.0001984431466226463, + "loss": 3.0759239196777344, + "step": 3529, + "token_acc": 0.2911184509717808 + }, + { + "epoch": 2.0691879214306654, + "grad_norm": 0.4288655247363128, + "learning_rate": 0.0001984414425935329, + "loss": 3.0538578033447266, + "step": 3530, + "token_acc": 0.2934134629579427 + }, + { + "epoch": 2.0697742597478745, + "grad_norm": 0.4671349349305833, + "learning_rate": 0.00019843973763969476, + "loss": 3.075728416442871, + "step": 3531, + "token_acc": 0.2908591003335437 + }, + { + "epoch": 2.0703605980650837, + "grad_norm": 0.38404919381097924, + "learning_rate": 0.00019843803176114794, + "loss": 3.1061019897460938, + "step": 3532, + "token_acc": 0.28568734431519643 + }, + { + "epoch": 2.0709469363822928, + "grad_norm": 0.4100207609153704, + "learning_rate": 0.00019843632495790842, + "loss": 3.0284202098846436, + "step": 3533, + "token_acc": 0.29634538768004903 + }, + { + "epoch": 2.0715332746995014, + "grad_norm": 0.45128159799533396, + "learning_rate": 0.00019843461722999231, + "loss": 3.0789566040039062, + "step": 3534, + "token_acc": 0.29046187324898687 + }, + { + "epoch": 2.0721196130167105, + "grad_norm": 0.38901709703660614, + "learning_rate": 0.0001984329085774156, + "loss": 3.0560970306396484, + "step": 3535, + "token_acc": 0.29271873239187507 + }, + { + "epoch": 2.0727059513339197, + "grad_norm": 0.4465639630016911, + "learning_rate": 0.0001984311990001944, + "loss": 3.0605874061584473, + "step": 3536, + "token_acc": 0.2925848534185065 + }, + { + "epoch": 2.0732922896511288, + "grad_norm": 0.35969836533497596, + "learning_rate": 0.0001984294884983447, + "loss": 3.044510841369629, + "step": 3537, + "token_acc": 0.2960840079950481 + }, + { + "epoch": 2.073878627968338, + "grad_norm": 0.36275055131073514, + "learning_rate": 0.00019842777707188255, + "loss": 3.061445713043213, + "step": 3538, + "token_acc": 0.2928920044764984 + }, + { + "epoch": 2.0744649662855466, + "grad_norm": 0.4217384446246162, + "learning_rate": 0.0001984260647208241, + "loss": 3.050929546356201, + "step": 3539, + "token_acc": 0.29208985944555915 + }, + { + "epoch": 2.0750513046027557, + "grad_norm": 0.46711186579893876, + "learning_rate": 0.0001984243514451854, + "loss": 3.0724148750305176, + "step": 3540, + "token_acc": 0.2902850297529916 + }, + { + "epoch": 2.0756376429199648, + "grad_norm": 0.3748401946186969, + "learning_rate": 0.00019842263724498252, + "loss": 3.0598361492156982, + "step": 3541, + "token_acc": 0.29168408703341925 + }, + { + "epoch": 2.076223981237174, + "grad_norm": 0.43450602566105617, + "learning_rate": 0.00019842092212023164, + "loss": 3.0598678588867188, + "step": 3542, + "token_acc": 0.29293968737458453 + }, + { + "epoch": 2.076810319554383, + "grad_norm": 0.4359906955665724, + "learning_rate": 0.0001984192060709488, + "loss": 3.0696616172790527, + "step": 3543, + "token_acc": 0.29211011531225833 + }, + { + "epoch": 2.077396657871592, + "grad_norm": 0.3959562260704288, + "learning_rate": 0.00019841748909715014, + "loss": 3.1017396450042725, + "step": 3544, + "token_acc": 0.28789396891037483 + }, + { + "epoch": 2.077982996188801, + "grad_norm": 0.44461100843793183, + "learning_rate": 0.00019841577119885178, + "loss": 3.035391330718994, + "step": 3545, + "token_acc": 0.29514798431866746 + }, + { + "epoch": 2.07856933450601, + "grad_norm": 0.3532264613789898, + "learning_rate": 0.00019841405237606987, + "loss": 3.0907740592956543, + "step": 3546, + "token_acc": 0.2869562303794467 + }, + { + "epoch": 2.079155672823219, + "grad_norm": 0.4557584814246335, + "learning_rate": 0.00019841233262882056, + "loss": 3.084862232208252, + "step": 3547, + "token_acc": 0.28934312617317165 + }, + { + "epoch": 2.079742011140428, + "grad_norm": 0.49553249109578207, + "learning_rate": 0.00019841061195711998, + "loss": 3.0846166610717773, + "step": 3548, + "token_acc": 0.2892670938341002 + }, + { + "epoch": 2.0803283494576372, + "grad_norm": 0.4253727510143276, + "learning_rate": 0.00019840889036098434, + "loss": 3.095020055770874, + "step": 3549, + "token_acc": 0.28618513563594217 + }, + { + "epoch": 2.080914687774846, + "grad_norm": 0.5072426324183269, + "learning_rate": 0.00019840716784042973, + "loss": 3.0633978843688965, + "step": 3550, + "token_acc": 0.2922023444535381 + }, + { + "epoch": 2.081501026092055, + "grad_norm": 0.5077644718027557, + "learning_rate": 0.00019840544439547243, + "loss": 3.046329975128174, + "step": 3551, + "token_acc": 0.2928570870132882 + }, + { + "epoch": 2.082087364409264, + "grad_norm": 0.396293448052419, + "learning_rate": 0.00019840372002612858, + "loss": 3.0718302726745605, + "step": 3552, + "token_acc": 0.2918956853142913 + }, + { + "epoch": 2.0826737027264732, + "grad_norm": 0.4145590438564462, + "learning_rate": 0.00019840199473241437, + "loss": 3.0591797828674316, + "step": 3553, + "token_acc": 0.29409241618862164 + }, + { + "epoch": 2.0832600410436823, + "grad_norm": 0.4891850166619332, + "learning_rate": 0.000198400268514346, + "loss": 3.0417685508728027, + "step": 3554, + "token_acc": 0.29463054650288295 + }, + { + "epoch": 2.0838463793608915, + "grad_norm": 0.41410691911906794, + "learning_rate": 0.00019839854137193976, + "loss": 3.053943634033203, + "step": 3555, + "token_acc": 0.29240936998944117 + }, + { + "epoch": 2.0844327176781, + "grad_norm": 0.37777659197907265, + "learning_rate": 0.0001983968133052118, + "loss": 3.0568556785583496, + "step": 3556, + "token_acc": 0.29333862248084247 + }, + { + "epoch": 2.0850190559953092, + "grad_norm": 0.366472130976773, + "learning_rate": 0.00019839508431417833, + "loss": 3.0705909729003906, + "step": 3557, + "token_acc": 0.2892433806745124 + }, + { + "epoch": 2.0856053943125183, + "grad_norm": 0.3240194660734338, + "learning_rate": 0.00019839335439885564, + "loss": 3.0572164058685303, + "step": 3558, + "token_acc": 0.2940084037419833 + }, + { + "epoch": 2.0861917326297275, + "grad_norm": 0.3625460327748282, + "learning_rate": 0.00019839162355926, + "loss": 3.0532264709472656, + "step": 3559, + "token_acc": 0.2932914433561264 + }, + { + "epoch": 2.0867780709469366, + "grad_norm": 0.4250733354309948, + "learning_rate": 0.0001983898917954076, + "loss": 3.072566032409668, + "step": 3560, + "token_acc": 0.290817749767538 + }, + { + "epoch": 2.0873644092641452, + "grad_norm": 0.49356300637028977, + "learning_rate": 0.0001983881591073148, + "loss": 3.0586729049682617, + "step": 3561, + "token_acc": 0.29224468469704684 + }, + { + "epoch": 2.0879507475813543, + "grad_norm": 0.34772695162412937, + "learning_rate": 0.0001983864254949978, + "loss": 3.0868895053863525, + "step": 3562, + "token_acc": 0.2887401761554135 + }, + { + "epoch": 2.0885370858985635, + "grad_norm": 0.35740403260207454, + "learning_rate": 0.0001983846909584729, + "loss": 3.071143627166748, + "step": 3563, + "token_acc": 0.2886153719809987 + }, + { + "epoch": 2.0891234242157726, + "grad_norm": 0.4774885905723965, + "learning_rate": 0.0001983829554977564, + "loss": 3.059811592102051, + "step": 3564, + "token_acc": 0.29389422311813695 + }, + { + "epoch": 2.0897097625329817, + "grad_norm": 0.4847454805016588, + "learning_rate": 0.00019838121911286462, + "loss": 3.1391334533691406, + "step": 3565, + "token_acc": 0.2814356069413033 + }, + { + "epoch": 2.0902961008501904, + "grad_norm": 0.3587535203891912, + "learning_rate": 0.00019837948180381388, + "loss": 2.9756364822387695, + "step": 3566, + "token_acc": 0.30496048826788513 + }, + { + "epoch": 2.0908824391673995, + "grad_norm": 0.390749577880432, + "learning_rate": 0.00019837774357062046, + "loss": 3.079336166381836, + "step": 3567, + "token_acc": 0.2900676104167093 + }, + { + "epoch": 2.0914687774846086, + "grad_norm": 0.6380019264262023, + "learning_rate": 0.0001983760044133007, + "loss": 3.0843658447265625, + "step": 3568, + "token_acc": 0.2902829855146074 + }, + { + "epoch": 2.0920551158018177, + "grad_norm": 0.5799534810194714, + "learning_rate": 0.00019837426433187092, + "loss": 3.0439553260803223, + "step": 3569, + "token_acc": 0.2951409162579062 + }, + { + "epoch": 2.092641454119027, + "grad_norm": 0.40703731287461004, + "learning_rate": 0.0001983725233263475, + "loss": 3.046616315841675, + "step": 3570, + "token_acc": 0.2942870987864201 + }, + { + "epoch": 2.093227792436236, + "grad_norm": 0.5884535435377783, + "learning_rate": 0.0001983707813967468, + "loss": 3.0519230365753174, + "step": 3571, + "token_acc": 0.293330434509278 + }, + { + "epoch": 2.0938141307534446, + "grad_norm": 0.7312989872820534, + "learning_rate": 0.00019836903854308514, + "loss": 3.0527243614196777, + "step": 3572, + "token_acc": 0.2947487193722418 + }, + { + "epoch": 2.0944004690706537, + "grad_norm": 0.5193068659164491, + "learning_rate": 0.00019836729476537893, + "loss": 3.0395874977111816, + "step": 3573, + "token_acc": 0.2959683803038354 + }, + { + "epoch": 2.094986807387863, + "grad_norm": 0.5114592300729086, + "learning_rate": 0.00019836555006364455, + "loss": 3.0629146099090576, + "step": 3574, + "token_acc": 0.2909723401545385 + }, + { + "epoch": 2.095573145705072, + "grad_norm": 0.5611570010123049, + "learning_rate": 0.00019836380443789836, + "loss": 3.0437188148498535, + "step": 3575, + "token_acc": 0.2947131670255222 + }, + { + "epoch": 2.096159484022281, + "grad_norm": 0.3803346739074631, + "learning_rate": 0.00019836205788815677, + "loss": 3.07240629196167, + "step": 3576, + "token_acc": 0.29029812262903415 + }, + { + "epoch": 2.0967458223394897, + "grad_norm": 0.5190646534061584, + "learning_rate": 0.00019836031041443623, + "loss": 3.052419662475586, + "step": 3577, + "token_acc": 0.2956201283206957 + }, + { + "epoch": 2.097332160656699, + "grad_norm": 0.3816142594046331, + "learning_rate": 0.00019835856201675306, + "loss": 3.02846097946167, + "step": 3578, + "token_acc": 0.2976310870031565 + }, + { + "epoch": 2.097918498973908, + "grad_norm": 0.5663487880740444, + "learning_rate": 0.00019835681269512377, + "loss": 3.1270220279693604, + "step": 3579, + "token_acc": 0.28162552237934163 + }, + { + "epoch": 2.098504837291117, + "grad_norm": 0.45960083021039966, + "learning_rate": 0.00019835506244956475, + "loss": 3.0054173469543457, + "step": 3580, + "token_acc": 0.30061533805118157 + }, + { + "epoch": 2.099091175608326, + "grad_norm": 0.4427309389453107, + "learning_rate": 0.00019835331128009246, + "loss": 3.041897773742676, + "step": 3581, + "token_acc": 0.2945209058583184 + }, + { + "epoch": 2.099677513925535, + "grad_norm": 0.4573966337718266, + "learning_rate": 0.00019835155918672333, + "loss": 3.0773720741271973, + "step": 3582, + "token_acc": 0.2885612635437443 + }, + { + "epoch": 2.100263852242744, + "grad_norm": 0.4249680322796581, + "learning_rate": 0.00019834980616947388, + "loss": 3.079244613647461, + "step": 3583, + "token_acc": 0.29033944765154196 + }, + { + "epoch": 2.100850190559953, + "grad_norm": 0.3891381530475288, + "learning_rate": 0.00019834805222836046, + "loss": 3.045846462249756, + "step": 3584, + "token_acc": 0.2951125642321624 + }, + { + "epoch": 2.101436528877162, + "grad_norm": 0.34748584613210326, + "learning_rate": 0.00019834629736339968, + "loss": 3.005671501159668, + "step": 3585, + "token_acc": 0.29949670803564177 + }, + { + "epoch": 2.1020228671943713, + "grad_norm": 0.44467570604991064, + "learning_rate": 0.00019834454157460792, + "loss": 3.073763370513916, + "step": 3586, + "token_acc": 0.2902493151183876 + }, + { + "epoch": 2.1026092055115804, + "grad_norm": 0.3458869544268884, + "learning_rate": 0.00019834278486200173, + "loss": 3.090545415878296, + "step": 3587, + "token_acc": 0.2888066756883443 + }, + { + "epoch": 2.103195543828789, + "grad_norm": 0.3900961667263977, + "learning_rate": 0.00019834102722559758, + "loss": 3.091229200363159, + "step": 3588, + "token_acc": 0.2885511324323078 + }, + { + "epoch": 2.103781882145998, + "grad_norm": 0.3320571433168456, + "learning_rate": 0.00019833926866541198, + "loss": 3.06968355178833, + "step": 3589, + "token_acc": 0.2912585402238717 + }, + { + "epoch": 2.1043682204632073, + "grad_norm": 0.3838331464453844, + "learning_rate": 0.0001983375091814615, + "loss": 3.0774216651916504, + "step": 3590, + "token_acc": 0.2901162507280102 + }, + { + "epoch": 2.1049545587804164, + "grad_norm": 0.41270502186090874, + "learning_rate": 0.00019833574877376262, + "loss": 3.0986461639404297, + "step": 3591, + "token_acc": 0.2887607917456307 + }, + { + "epoch": 2.1055408970976255, + "grad_norm": 0.3003580476476052, + "learning_rate": 0.0001983339874423319, + "loss": 3.118112564086914, + "step": 3592, + "token_acc": 0.2849725548312168 + }, + { + "epoch": 2.106127235414834, + "grad_norm": 0.4178560943863483, + "learning_rate": 0.00019833222518718583, + "loss": 3.03887939453125, + "step": 3593, + "token_acc": 0.2962801343916902 + }, + { + "epoch": 2.1067135737320433, + "grad_norm": 0.38321917436233666, + "learning_rate": 0.00019833046200834107, + "loss": 3.068979263305664, + "step": 3594, + "token_acc": 0.2901594531868983 + }, + { + "epoch": 2.1072999120492524, + "grad_norm": 0.3756314445747655, + "learning_rate": 0.0001983286979058141, + "loss": 3.1027374267578125, + "step": 3595, + "token_acc": 0.286743620214658 + }, + { + "epoch": 2.1078862503664615, + "grad_norm": 0.32342494623087414, + "learning_rate": 0.0001983269328796215, + "loss": 3.083435297012329, + "step": 3596, + "token_acc": 0.29027164009254364 + }, + { + "epoch": 2.1084725886836706, + "grad_norm": 0.37319058647254666, + "learning_rate": 0.00019832516692977988, + "loss": 3.0633890628814697, + "step": 3597, + "token_acc": 0.2916057334424412 + }, + { + "epoch": 2.1090589270008797, + "grad_norm": 0.4336198739366461, + "learning_rate": 0.0001983234000563058, + "loss": 3.0593342781066895, + "step": 3598, + "token_acc": 0.29293118752245495 + }, + { + "epoch": 2.1096452653180884, + "grad_norm": 0.32690073017604865, + "learning_rate": 0.00019832163225921585, + "loss": 3.117563247680664, + "step": 3599, + "token_acc": 0.28314599168368987 + }, + { + "epoch": 2.1102316036352975, + "grad_norm": 0.3650514687629308, + "learning_rate": 0.00019831986353852668, + "loss": 3.080416679382324, + "step": 3600, + "token_acc": 0.28977178319271435 + }, + { + "epoch": 2.1108179419525066, + "grad_norm": 0.44222125901793113, + "learning_rate": 0.00019831809389425487, + "loss": 3.0483977794647217, + "step": 3601, + "token_acc": 0.2936824368114064 + }, + { + "epoch": 2.1114042802697157, + "grad_norm": 0.3303022505116269, + "learning_rate": 0.00019831632332641705, + "loss": 3.019620418548584, + "step": 3602, + "token_acc": 0.29712843988971543 + }, + { + "epoch": 2.111990618586925, + "grad_norm": 0.4310503879767, + "learning_rate": 0.00019831455183502987, + "loss": 3.0526235103607178, + "step": 3603, + "token_acc": 0.2936478526876064 + }, + { + "epoch": 2.1125769569041335, + "grad_norm": 0.3252515406254481, + "learning_rate": 0.00019831277942010996, + "loss": 3.005171537399292, + "step": 3604, + "token_acc": 0.3017371700667372 + }, + { + "epoch": 2.1131632952213426, + "grad_norm": 0.4063407584818642, + "learning_rate": 0.00019831100608167393, + "loss": 3.0962395668029785, + "step": 3605, + "token_acc": 0.2898247611172116 + }, + { + "epoch": 2.1137496335385517, + "grad_norm": 0.401776452029666, + "learning_rate": 0.0001983092318197385, + "loss": 3.0379395484924316, + "step": 3606, + "token_acc": 0.2968602661264974 + }, + { + "epoch": 2.114335971855761, + "grad_norm": 0.45533308632482494, + "learning_rate": 0.00019830745663432033, + "loss": 3.047060489654541, + "step": 3607, + "token_acc": 0.2928545838763441 + }, + { + "epoch": 2.11492231017297, + "grad_norm": 0.38065552204836894, + "learning_rate": 0.00019830568052543604, + "loss": 3.055933952331543, + "step": 3608, + "token_acc": 0.29325146180943207 + }, + { + "epoch": 2.115508648490179, + "grad_norm": 0.3793836552798932, + "learning_rate": 0.00019830390349310237, + "loss": 3.0560786724090576, + "step": 3609, + "token_acc": 0.29247514516061657 + }, + { + "epoch": 2.1160949868073877, + "grad_norm": 0.34437856825947905, + "learning_rate": 0.00019830212553733598, + "loss": 3.0915746688842773, + "step": 3610, + "token_acc": 0.2892308702095056 + }, + { + "epoch": 2.116681325124597, + "grad_norm": 0.32393239925124906, + "learning_rate": 0.00019830034665815357, + "loss": 3.0861353874206543, + "step": 3611, + "token_acc": 0.28766087306173427 + }, + { + "epoch": 2.117267663441806, + "grad_norm": 0.3560461128357273, + "learning_rate": 0.0001982985668555719, + "loss": 3.0503063201904297, + "step": 3612, + "token_acc": 0.2921358264896557 + }, + { + "epoch": 2.117854001759015, + "grad_norm": 0.3320827010446719, + "learning_rate": 0.00019829678612960766, + "loss": 3.023803472518921, + "step": 3613, + "token_acc": 0.29769529437364495 + }, + { + "epoch": 2.118440340076224, + "grad_norm": 0.36805577620539576, + "learning_rate": 0.00019829500448027753, + "loss": 3.077540874481201, + "step": 3614, + "token_acc": 0.29110032746741815 + }, + { + "epoch": 2.119026678393433, + "grad_norm": 0.30927608147347996, + "learning_rate": 0.0001982932219075983, + "loss": 3.0324454307556152, + "step": 3615, + "token_acc": 0.29671131236388637 + }, + { + "epoch": 2.119613016710642, + "grad_norm": 0.33998613401633365, + "learning_rate": 0.00019829143841158673, + "loss": 3.0263547897338867, + "step": 3616, + "token_acc": 0.29628313150815516 + }, + { + "epoch": 2.120199355027851, + "grad_norm": 0.3396585255933413, + "learning_rate": 0.00019828965399225953, + "loss": 3.0975794792175293, + "step": 3617, + "token_acc": 0.2871586301540765 + }, + { + "epoch": 2.12078569334506, + "grad_norm": 0.36105248356697384, + "learning_rate": 0.00019828786864963346, + "loss": 3.059741735458374, + "step": 3618, + "token_acc": 0.2938550148957299 + }, + { + "epoch": 2.1213720316622693, + "grad_norm": 0.33108308210637405, + "learning_rate": 0.00019828608238372532, + "loss": 3.068727970123291, + "step": 3619, + "token_acc": 0.28999059891953044 + }, + { + "epoch": 2.121958369979478, + "grad_norm": 0.3608578403710765, + "learning_rate": 0.00019828429519455187, + "loss": 3.0625953674316406, + "step": 3620, + "token_acc": 0.29295049379322013 + }, + { + "epoch": 2.122544708296687, + "grad_norm": 0.4259711691064494, + "learning_rate": 0.00019828250708212993, + "loss": 3.092543601989746, + "step": 3621, + "token_acc": 0.2874105797193742 + }, + { + "epoch": 2.123131046613896, + "grad_norm": 0.341405840844254, + "learning_rate": 0.00019828071804647626, + "loss": 3.0377516746520996, + "step": 3622, + "token_acc": 0.2950345631802328 + }, + { + "epoch": 2.1237173849311053, + "grad_norm": 0.4702461284941566, + "learning_rate": 0.00019827892808760766, + "loss": 3.068410873413086, + "step": 3623, + "token_acc": 0.2917305352720484 + }, + { + "epoch": 2.1243037232483144, + "grad_norm": 0.4682700889031738, + "learning_rate": 0.00019827713720554097, + "loss": 3.0613012313842773, + "step": 3624, + "token_acc": 0.2921854644668456 + }, + { + "epoch": 2.1248900615655235, + "grad_norm": 0.33271664388555416, + "learning_rate": 0.000198275345400293, + "loss": 3.0392584800720215, + "step": 3625, + "token_acc": 0.2965059206496532 + }, + { + "epoch": 2.125476399882732, + "grad_norm": 0.43401411141698526, + "learning_rate": 0.00019827355267188065, + "loss": 3.0414907932281494, + "step": 3626, + "token_acc": 0.2959803243409423 + }, + { + "epoch": 2.1260627381999413, + "grad_norm": 0.43417332445628765, + "learning_rate": 0.00019827175902032063, + "loss": 2.9982614517211914, + "step": 3627, + "token_acc": 0.3014506581665152 + }, + { + "epoch": 2.1266490765171504, + "grad_norm": 0.4581844679578655, + "learning_rate": 0.00019826996444562988, + "loss": 3.056980609893799, + "step": 3628, + "token_acc": 0.2935469278474155 + }, + { + "epoch": 2.1272354148343595, + "grad_norm": 0.44801763389926275, + "learning_rate": 0.00019826816894782525, + "loss": 3.05255126953125, + "step": 3629, + "token_acc": 0.2929825891654879 + }, + { + "epoch": 2.1278217531515686, + "grad_norm": 0.4038462941930441, + "learning_rate": 0.00019826637252692356, + "loss": 3.0721967220306396, + "step": 3630, + "token_acc": 0.2903196482233252 + }, + { + "epoch": 2.1284080914687773, + "grad_norm": 0.49993955074044333, + "learning_rate": 0.00019826457518294172, + "loss": 3.086122512817383, + "step": 3631, + "token_acc": 0.28825710108604846 + }, + { + "epoch": 2.1289944297859864, + "grad_norm": 0.40774511724742474, + "learning_rate": 0.00019826277691589663, + "loss": 3.057485580444336, + "step": 3632, + "token_acc": 0.2919997121459009 + }, + { + "epoch": 2.1295807681031955, + "grad_norm": 0.33267034489594965, + "learning_rate": 0.00019826097772580517, + "loss": 3.0426793098449707, + "step": 3633, + "token_acc": 0.2927497341906739 + }, + { + "epoch": 2.1301671064204046, + "grad_norm": 0.3668040859612643, + "learning_rate": 0.0001982591776126842, + "loss": 3.0451996326446533, + "step": 3634, + "token_acc": 0.2947234168142339 + }, + { + "epoch": 2.1307534447376137, + "grad_norm": 0.3871607421722465, + "learning_rate": 0.00019825737657655067, + "loss": 3.0688562393188477, + "step": 3635, + "token_acc": 0.2916939141579278 + }, + { + "epoch": 2.1313397830548224, + "grad_norm": 0.4196305514267362, + "learning_rate": 0.0001982555746174215, + "loss": 3.067142963409424, + "step": 3636, + "token_acc": 0.292745619200713 + }, + { + "epoch": 2.1319261213720315, + "grad_norm": 0.42732608281155054, + "learning_rate": 0.0001982537717353136, + "loss": 3.072443962097168, + "step": 3637, + "token_acc": 0.2909729998098578 + }, + { + "epoch": 2.1325124596892406, + "grad_norm": 0.34238518770614135, + "learning_rate": 0.00019825196793024391, + "loss": 3.088256359100342, + "step": 3638, + "token_acc": 0.2889087585962052 + }, + { + "epoch": 2.1330987980064497, + "grad_norm": 0.3673543886394881, + "learning_rate": 0.0001982501632022294, + "loss": 3.078174591064453, + "step": 3639, + "token_acc": 0.28934118473298936 + }, + { + "epoch": 2.133685136323659, + "grad_norm": 0.3707717997723761, + "learning_rate": 0.000198248357551287, + "loss": 3.109175682067871, + "step": 3640, + "token_acc": 0.284658011273742 + }, + { + "epoch": 2.134271474640868, + "grad_norm": 0.36075496666507634, + "learning_rate": 0.00019824655097743367, + "loss": 3.0754547119140625, + "step": 3641, + "token_acc": 0.2921208131773927 + }, + { + "epoch": 2.1348578129580766, + "grad_norm": 0.37531363618475966, + "learning_rate": 0.00019824474348068637, + "loss": 3.0740091800689697, + "step": 3642, + "token_acc": 0.29018498930369846 + }, + { + "epoch": 2.1354441512752858, + "grad_norm": 0.4778193315338914, + "learning_rate": 0.00019824293506106206, + "loss": 3.0912137031555176, + "step": 3643, + "token_acc": 0.2896417318355617 + }, + { + "epoch": 2.136030489592495, + "grad_norm": 0.47419736043746874, + "learning_rate": 0.00019824112571857782, + "loss": 3.0936503410339355, + "step": 3644, + "token_acc": 0.2892625004539017 + }, + { + "epoch": 2.136616827909704, + "grad_norm": 0.37064465349918374, + "learning_rate": 0.00019823931545325053, + "loss": 3.093658208847046, + "step": 3645, + "token_acc": 0.2884854673313202 + }, + { + "epoch": 2.137203166226913, + "grad_norm": 0.42386620265996405, + "learning_rate": 0.0001982375042650973, + "loss": 3.0095455646514893, + "step": 3646, + "token_acc": 0.2975479631134509 + }, + { + "epoch": 2.1377895045441218, + "grad_norm": 0.38172988276294245, + "learning_rate": 0.00019823569215413503, + "loss": 3.098313808441162, + "step": 3647, + "token_acc": 0.28812614862004854 + }, + { + "epoch": 2.138375842861331, + "grad_norm": 0.3086022099696844, + "learning_rate": 0.00019823387912038087, + "loss": 3.024193286895752, + "step": 3648, + "token_acc": 0.2973437992872654 + }, + { + "epoch": 2.13896218117854, + "grad_norm": 0.4149553561911551, + "learning_rate": 0.00019823206516385175, + "loss": 3.102437734603882, + "step": 3649, + "token_acc": 0.2871082475186682 + }, + { + "epoch": 2.139548519495749, + "grad_norm": 0.35447322722730157, + "learning_rate": 0.00019823025028456478, + "loss": 3.039538860321045, + "step": 3650, + "token_acc": 0.2963027653486353 + }, + { + "epoch": 2.140134857812958, + "grad_norm": 0.3255488570801878, + "learning_rate": 0.00019822843448253694, + "loss": 3.057581901550293, + "step": 3651, + "token_acc": 0.29423873722289995 + }, + { + "epoch": 2.1407211961301673, + "grad_norm": 0.35937595188739524, + "learning_rate": 0.00019822661775778535, + "loss": 3.068493366241455, + "step": 3652, + "token_acc": 0.2912323808421418 + }, + { + "epoch": 2.141307534447376, + "grad_norm": 0.3656363996058169, + "learning_rate": 0.00019822480011032702, + "loss": 3.123793125152588, + "step": 3653, + "token_acc": 0.2831064512280795 + }, + { + "epoch": 2.141893872764585, + "grad_norm": 0.3319438930487994, + "learning_rate": 0.0001982229815401791, + "loss": 3.050457715988159, + "step": 3654, + "token_acc": 0.2943593819162207 + }, + { + "epoch": 2.142480211081794, + "grad_norm": 0.36744461951942464, + "learning_rate": 0.00019822116204735858, + "loss": 3.0768117904663086, + "step": 3655, + "token_acc": 0.28981598109126677 + }, + { + "epoch": 2.1430665493990033, + "grad_norm": 0.34098719091200086, + "learning_rate": 0.0001982193416318826, + "loss": 3.019047260284424, + "step": 3656, + "token_acc": 0.2988497291877813 + }, + { + "epoch": 2.1436528877162124, + "grad_norm": 0.3734420531063816, + "learning_rate": 0.0001982175202937683, + "loss": 3.0314571857452393, + "step": 3657, + "token_acc": 0.2969687103517598 + }, + { + "epoch": 2.144239226033421, + "grad_norm": 0.42365889381392163, + "learning_rate": 0.0001982156980330327, + "loss": 3.0838067531585693, + "step": 3658, + "token_acc": 0.28869940999896493 + }, + { + "epoch": 2.14482556435063, + "grad_norm": 0.38891779796554593, + "learning_rate": 0.000198213874849693, + "loss": 3.061216354370117, + "step": 3659, + "token_acc": 0.291639099595341 + }, + { + "epoch": 2.1454119026678393, + "grad_norm": 0.3325639042483432, + "learning_rate": 0.00019821205074376625, + "loss": 3.082282304763794, + "step": 3660, + "token_acc": 0.2883842363549314 + }, + { + "epoch": 2.1459982409850484, + "grad_norm": 0.3796427915147779, + "learning_rate": 0.00019821022571526965, + "loss": 3.0447168350219727, + "step": 3661, + "token_acc": 0.2934772797518593 + }, + { + "epoch": 2.1465845793022575, + "grad_norm": 0.42503563264359, + "learning_rate": 0.0001982083997642203, + "loss": 3.074496269226074, + "step": 3662, + "token_acc": 0.2913147753630993 + }, + { + "epoch": 2.1471709176194667, + "grad_norm": 0.4017721397348817, + "learning_rate": 0.0001982065728906354, + "loss": 3.087440013885498, + "step": 3663, + "token_acc": 0.2895735220605646 + }, + { + "epoch": 2.1477572559366753, + "grad_norm": 0.39635501228307246, + "learning_rate": 0.00019820474509453208, + "loss": 3.038784980773926, + "step": 3664, + "token_acc": 0.29654691111341086 + }, + { + "epoch": 2.1483435942538844, + "grad_norm": 0.4024113592292357, + "learning_rate": 0.0001982029163759275, + "loss": 3.0895156860351562, + "step": 3665, + "token_acc": 0.28952099237591566 + }, + { + "epoch": 2.1489299325710935, + "grad_norm": 0.44816104698756043, + "learning_rate": 0.00019820108673483886, + "loss": 3.0261764526367188, + "step": 3666, + "token_acc": 0.29754149285524095 + }, + { + "epoch": 2.1495162708883027, + "grad_norm": 0.5295161669300592, + "learning_rate": 0.00019819925617128333, + "loss": 3.055893659591675, + "step": 3667, + "token_acc": 0.2922143921059951 + }, + { + "epoch": 2.1501026092055118, + "grad_norm": 0.3745309728424183, + "learning_rate": 0.0001981974246852781, + "loss": 3.045581340789795, + "step": 3668, + "token_acc": 0.2951537127711179 + }, + { + "epoch": 2.1506889475227204, + "grad_norm": 0.46366028984048285, + "learning_rate": 0.00019819559227684041, + "loss": 3.039191722869873, + "step": 3669, + "token_acc": 0.2964182206817488 + }, + { + "epoch": 2.1512752858399296, + "grad_norm": 0.46228103744879745, + "learning_rate": 0.00019819375894598745, + "loss": 3.114645004272461, + "step": 3670, + "token_acc": 0.2831469669188854 + }, + { + "epoch": 2.1518616241571387, + "grad_norm": 0.3885683672920312, + "learning_rate": 0.00019819192469273643, + "loss": 3.0345194339752197, + "step": 3671, + "token_acc": 0.2951403904568877 + }, + { + "epoch": 2.1524479624743478, + "grad_norm": 0.391563814427664, + "learning_rate": 0.0001981900895171046, + "loss": 3.062636375427246, + "step": 3672, + "token_acc": 0.2917323983230324 + }, + { + "epoch": 2.153034300791557, + "grad_norm": 0.35677312329398886, + "learning_rate": 0.0001981882534191092, + "loss": 3.0529580116271973, + "step": 3673, + "token_acc": 0.2935443357233669 + }, + { + "epoch": 2.1536206391087656, + "grad_norm": 0.36994516324867055, + "learning_rate": 0.00019818641639876745, + "loss": 3.1113665103912354, + "step": 3674, + "token_acc": 0.2855378192782019 + }, + { + "epoch": 2.1542069774259747, + "grad_norm": 0.35141992234043656, + "learning_rate": 0.00019818457845609665, + "loss": 3.0312695503234863, + "step": 3675, + "token_acc": 0.29815066016783515 + }, + { + "epoch": 2.154793315743184, + "grad_norm": 0.3352608846684348, + "learning_rate": 0.00019818273959111403, + "loss": 3.055251359939575, + "step": 3676, + "token_acc": 0.2929994477719723 + }, + { + "epoch": 2.155379654060393, + "grad_norm": 0.3835726200052877, + "learning_rate": 0.00019818089980383686, + "loss": 3.064302921295166, + "step": 3677, + "token_acc": 0.29110212655902795 + }, + { + "epoch": 2.155965992377602, + "grad_norm": 0.4107215412375538, + "learning_rate": 0.00019817905909428246, + "loss": 3.0144827365875244, + "step": 3678, + "token_acc": 0.29772714736374833 + }, + { + "epoch": 2.1565523306948107, + "grad_norm": 0.3996105029647825, + "learning_rate": 0.0001981772174624681, + "loss": 3.096673011779785, + "step": 3679, + "token_acc": 0.28871941043396254 + }, + { + "epoch": 2.15713866901202, + "grad_norm": 0.42256106857756087, + "learning_rate": 0.00019817537490841102, + "loss": 3.072201728820801, + "step": 3680, + "token_acc": 0.2923764787289252 + }, + { + "epoch": 2.157725007329229, + "grad_norm": 0.43398330282105035, + "learning_rate": 0.00019817353143212864, + "loss": 3.045921802520752, + "step": 3681, + "token_acc": 0.294512248707754 + }, + { + "epoch": 2.158311345646438, + "grad_norm": 0.38959889749447035, + "learning_rate": 0.00019817168703363823, + "loss": 3.0750393867492676, + "step": 3682, + "token_acc": 0.2915563169793132 + }, + { + "epoch": 2.158897683963647, + "grad_norm": 0.40338758755030873, + "learning_rate": 0.00019816984171295708, + "loss": 3.1409642696380615, + "step": 3683, + "token_acc": 0.2817941674770178 + }, + { + "epoch": 2.1594840222808562, + "grad_norm": 0.4803805033609024, + "learning_rate": 0.00019816799547010255, + "loss": 3.0706191062927246, + "step": 3684, + "token_acc": 0.29218219470403345 + }, + { + "epoch": 2.160070360598065, + "grad_norm": 0.41681756888313753, + "learning_rate": 0.000198166148305092, + "loss": 3.067589282989502, + "step": 3685, + "token_acc": 0.28988539109580935 + }, + { + "epoch": 2.160656698915274, + "grad_norm": 0.4722609300380041, + "learning_rate": 0.00019816430021794279, + "loss": 3.0518429279327393, + "step": 3686, + "token_acc": 0.294074548468628 + }, + { + "epoch": 2.161243037232483, + "grad_norm": 0.512334638116427, + "learning_rate": 0.0001981624512086722, + "loss": 3.054753065109253, + "step": 3687, + "token_acc": 0.2924440385192344 + }, + { + "epoch": 2.1618293755496922, + "grad_norm": 0.43203731777618976, + "learning_rate": 0.0001981606012772977, + "loss": 3.097407817840576, + "step": 3688, + "token_acc": 0.2886756882589548 + }, + { + "epoch": 2.1624157138669013, + "grad_norm": 0.49109976001930006, + "learning_rate": 0.00019815875042383663, + "loss": 3.067046642303467, + "step": 3689, + "token_acc": 0.29054734331194076 + }, + { + "epoch": 2.16300205218411, + "grad_norm": 0.41444076177104566, + "learning_rate": 0.00019815689864830635, + "loss": 3.0211291313171387, + "step": 3690, + "token_acc": 0.29784430133670087 + }, + { + "epoch": 2.163588390501319, + "grad_norm": 0.41823770915807945, + "learning_rate": 0.00019815504595072428, + "loss": 3.05775785446167, + "step": 3691, + "token_acc": 0.2954785125329384 + }, + { + "epoch": 2.1641747288185282, + "grad_norm": 0.3505367231140321, + "learning_rate": 0.00019815319233110784, + "loss": 3.0847840309143066, + "step": 3692, + "token_acc": 0.28727815071343954 + }, + { + "epoch": 2.1647610671357373, + "grad_norm": 0.38823383222485475, + "learning_rate": 0.00019815133778947438, + "loss": 3.060549736022949, + "step": 3693, + "token_acc": 0.29188239766158525 + }, + { + "epoch": 2.1653474054529465, + "grad_norm": 0.38876453238600234, + "learning_rate": 0.00019814948232584135, + "loss": 3.081352710723877, + "step": 3694, + "token_acc": 0.28898431134161956 + }, + { + "epoch": 2.1659337437701556, + "grad_norm": 0.34525571729830784, + "learning_rate": 0.00019814762594022624, + "loss": 3.085944175720215, + "step": 3695, + "token_acc": 0.2898330377722016 + }, + { + "epoch": 2.1665200820873642, + "grad_norm": 0.3991600859950033, + "learning_rate": 0.00019814576863264646, + "loss": 3.0598087310791016, + "step": 3696, + "token_acc": 0.2908605857757875 + }, + { + "epoch": 2.1671064204045734, + "grad_norm": 0.3223574454318465, + "learning_rate": 0.00019814391040311936, + "loss": 3.0664358139038086, + "step": 3697, + "token_acc": 0.29078196534218365 + }, + { + "epoch": 2.1676927587217825, + "grad_norm": 0.40365444226727976, + "learning_rate": 0.00019814205125166253, + "loss": 3.0346033573150635, + "step": 3698, + "token_acc": 0.29554123577404773 + }, + { + "epoch": 2.1682790970389916, + "grad_norm": 0.3736109211610393, + "learning_rate": 0.00019814019117829335, + "loss": 3.033038377761841, + "step": 3699, + "token_acc": 0.2984612110468904 + }, + { + "epoch": 2.1688654353562007, + "grad_norm": 0.44397360630406835, + "learning_rate": 0.00019813833018302935, + "loss": 3.0904383659362793, + "step": 3700, + "token_acc": 0.28873367509653275 + }, + { + "epoch": 2.1694517736734094, + "grad_norm": 0.33196705625696693, + "learning_rate": 0.00019813646826588794, + "loss": 3.027376174926758, + "step": 3701, + "token_acc": 0.29806078785082013 + }, + { + "epoch": 2.1700381119906185, + "grad_norm": 0.32084385391430037, + "learning_rate": 0.00019813460542688667, + "loss": 3.019136905670166, + "step": 3702, + "token_acc": 0.297492864306102 + }, + { + "epoch": 2.1706244503078276, + "grad_norm": 0.3707520187668173, + "learning_rate": 0.000198132741666043, + "loss": 3.0417094230651855, + "step": 3703, + "token_acc": 0.2953370317331078 + }, + { + "epoch": 2.1712107886250367, + "grad_norm": 0.34314510502736817, + "learning_rate": 0.0001981308769833745, + "loss": 3.0925612449645996, + "step": 3704, + "token_acc": 0.2871026767322786 + }, + { + "epoch": 2.171797126942246, + "grad_norm": 0.3903723409006208, + "learning_rate": 0.00019812901137889862, + "loss": 3.080919027328491, + "step": 3705, + "token_acc": 0.28975915854582107 + }, + { + "epoch": 2.172383465259455, + "grad_norm": 0.38624853678336335, + "learning_rate": 0.0001981271448526329, + "loss": 3.0907652378082275, + "step": 3706, + "token_acc": 0.28893884618889937 + }, + { + "epoch": 2.1729698035766636, + "grad_norm": 0.3537372371911025, + "learning_rate": 0.0001981252774045949, + "loss": 3.0456995964050293, + "step": 3707, + "token_acc": 0.2950165346045653 + }, + { + "epoch": 2.1735561418938727, + "grad_norm": 0.4416946674237238, + "learning_rate": 0.00019812340903480212, + "loss": 3.0639238357543945, + "step": 3708, + "token_acc": 0.2912564735361303 + }, + { + "epoch": 2.174142480211082, + "grad_norm": 0.42151114482919977, + "learning_rate": 0.00019812153974327215, + "loss": 3.069520950317383, + "step": 3709, + "token_acc": 0.2913897195151553 + }, + { + "epoch": 2.174728818528291, + "grad_norm": 0.3907630218206638, + "learning_rate": 0.00019811966953002256, + "loss": 3.044097423553467, + "step": 3710, + "token_acc": 0.2941923600463518 + }, + { + "epoch": 2.1753151568455, + "grad_norm": 0.48432813726910307, + "learning_rate": 0.00019811779839507088, + "loss": 3.0825676918029785, + "step": 3711, + "token_acc": 0.28926997674851435 + }, + { + "epoch": 2.1759014951627087, + "grad_norm": 0.3411011523866471, + "learning_rate": 0.00019811592633843468, + "loss": 3.1120285987854004, + "step": 3712, + "token_acc": 0.2856225382167172 + }, + { + "epoch": 2.176487833479918, + "grad_norm": 0.3884136175374284, + "learning_rate": 0.00019811405336013155, + "loss": 3.0508508682250977, + "step": 3713, + "token_acc": 0.29531432083904263 + }, + { + "epoch": 2.177074171797127, + "grad_norm": 0.35211602105428624, + "learning_rate": 0.00019811217946017916, + "loss": 3.058058261871338, + "step": 3714, + "token_acc": 0.2938839083768557 + }, + { + "epoch": 2.177660510114336, + "grad_norm": 0.33612920765302057, + "learning_rate": 0.000198110304638595, + "loss": 3.0568323135375977, + "step": 3715, + "token_acc": 0.290863650679156 + }, + { + "epoch": 2.178246848431545, + "grad_norm": 0.360766931547209, + "learning_rate": 0.00019810842889539675, + "loss": 3.0815136432647705, + "step": 3716, + "token_acc": 0.2905791015204376 + }, + { + "epoch": 2.1788331867487543, + "grad_norm": 0.4020471899748261, + "learning_rate": 0.000198106552230602, + "loss": 3.06553316116333, + "step": 3717, + "token_acc": 0.2923001327241021 + }, + { + "epoch": 2.179419525065963, + "grad_norm": 0.42272187443519893, + "learning_rate": 0.00019810467464422842, + "loss": 3.0341875553131104, + "step": 3718, + "token_acc": 0.29733858539266506 + }, + { + "epoch": 2.180005863383172, + "grad_norm": 0.3360927921504579, + "learning_rate": 0.00019810279613629358, + "loss": 3.051051139831543, + "step": 3719, + "token_acc": 0.29595057293800087 + }, + { + "epoch": 2.180592201700381, + "grad_norm": 0.39729716729628617, + "learning_rate": 0.00019810091670681518, + "loss": 3.0622317790985107, + "step": 3720, + "token_acc": 0.2912510805890941 + }, + { + "epoch": 2.1811785400175903, + "grad_norm": 0.32908310056703416, + "learning_rate": 0.0001980990363558109, + "loss": 3.054521083831787, + "step": 3721, + "token_acc": 0.2921703505833829 + }, + { + "epoch": 2.1817648783347994, + "grad_norm": 0.386307670285044, + "learning_rate": 0.0001980971550832983, + "loss": 3.0914382934570312, + "step": 3722, + "token_acc": 0.2895025584216912 + }, + { + "epoch": 2.182351216652008, + "grad_norm": 0.313379061385587, + "learning_rate": 0.00019809527288929517, + "loss": 3.1284141540527344, + "step": 3723, + "token_acc": 0.28403632430311665 + }, + { + "epoch": 2.182937554969217, + "grad_norm": 0.4012839364386284, + "learning_rate": 0.0001980933897738191, + "loss": 3.079728364944458, + "step": 3724, + "token_acc": 0.29032503575787 + }, + { + "epoch": 2.1835238932864263, + "grad_norm": 0.3376288134715868, + "learning_rate": 0.00019809150573688782, + "loss": 3.0606932640075684, + "step": 3725, + "token_acc": 0.2940896670912368 + }, + { + "epoch": 2.1841102316036354, + "grad_norm": 0.37850435503879637, + "learning_rate": 0.00019808962077851904, + "loss": 3.089174509048462, + "step": 3726, + "token_acc": 0.28817984538179847 + }, + { + "epoch": 2.1846965699208445, + "grad_norm": 0.3417610880713508, + "learning_rate": 0.00019808773489873044, + "loss": 3.0899174213409424, + "step": 3727, + "token_acc": 0.28772069888307916 + }, + { + "epoch": 2.185282908238053, + "grad_norm": 0.3448155256888338, + "learning_rate": 0.00019808584809753973, + "loss": 3.073763370513916, + "step": 3728, + "token_acc": 0.2902577979965322 + }, + { + "epoch": 2.1858692465552623, + "grad_norm": 0.37869887681035297, + "learning_rate": 0.0001980839603749647, + "loss": 3.096000909805298, + "step": 3729, + "token_acc": 0.2867157637629594 + }, + { + "epoch": 2.1864555848724714, + "grad_norm": 0.30815461213676887, + "learning_rate": 0.000198082071731023, + "loss": 3.082163095474243, + "step": 3730, + "token_acc": 0.2895739185747386 + }, + { + "epoch": 2.1870419231896805, + "grad_norm": 0.3798261165595876, + "learning_rate": 0.0001980801821657324, + "loss": 3.0781822204589844, + "step": 3731, + "token_acc": 0.2920762308794104 + }, + { + "epoch": 2.1876282615068896, + "grad_norm": 0.38542476451329444, + "learning_rate": 0.00019807829167911066, + "loss": 3.0817389488220215, + "step": 3732, + "token_acc": 0.2891235556101292 + }, + { + "epoch": 2.1882145998240983, + "grad_norm": 0.3513885841387477, + "learning_rate": 0.00019807640027117552, + "loss": 3.0607097148895264, + "step": 3733, + "token_acc": 0.29277806342781737 + }, + { + "epoch": 2.1888009381413074, + "grad_norm": 0.41253787986230267, + "learning_rate": 0.00019807450794194479, + "loss": 3.0622215270996094, + "step": 3734, + "token_acc": 0.292786076673758 + }, + { + "epoch": 2.1893872764585165, + "grad_norm": 0.4603926510231856, + "learning_rate": 0.00019807261469143616, + "loss": 3.1241812705993652, + "step": 3735, + "token_acc": 0.28356866414875226 + }, + { + "epoch": 2.1899736147757256, + "grad_norm": 0.43023403271799454, + "learning_rate": 0.0001980707205196675, + "loss": 3.0950329303741455, + "step": 3736, + "token_acc": 0.289215398804732 + }, + { + "epoch": 2.1905599530929347, + "grad_norm": 0.39486298422908156, + "learning_rate": 0.00019806882542665658, + "loss": 3.0755863189697266, + "step": 3737, + "token_acc": 0.289848019280402 + }, + { + "epoch": 2.191146291410144, + "grad_norm": 0.4274911388865813, + "learning_rate": 0.0001980669294124212, + "loss": 3.053633689880371, + "step": 3738, + "token_acc": 0.2946253276592132 + }, + { + "epoch": 2.1917326297273525, + "grad_norm": 0.35166222960225946, + "learning_rate": 0.00019806503247697915, + "loss": 3.119694232940674, + "step": 3739, + "token_acc": 0.28351120089426696 + }, + { + "epoch": 2.1923189680445616, + "grad_norm": 0.3398753185384973, + "learning_rate": 0.00019806313462034827, + "loss": 3.097565174102783, + "step": 3740, + "token_acc": 0.28844821614132315 + }, + { + "epoch": 2.1929053063617707, + "grad_norm": 0.35258555656756, + "learning_rate": 0.00019806123584254637, + "loss": 3.084041118621826, + "step": 3741, + "token_acc": 0.2908209720884709 + }, + { + "epoch": 2.19349164467898, + "grad_norm": 0.42318864486046637, + "learning_rate": 0.0001980593361435913, + "loss": 3.0764129161834717, + "step": 3742, + "token_acc": 0.2896307161708884 + }, + { + "epoch": 2.194077982996189, + "grad_norm": 0.3976186552131438, + "learning_rate": 0.0001980574355235009, + "loss": 3.0898749828338623, + "step": 3743, + "token_acc": 0.28834941437490325 + }, + { + "epoch": 2.1946643213133976, + "grad_norm": 0.3839498437849161, + "learning_rate": 0.00019805553398229308, + "loss": 3.068784236907959, + "step": 3744, + "token_acc": 0.29113553374720536 + }, + { + "epoch": 2.1952506596306067, + "grad_norm": 0.43724637168886465, + "learning_rate": 0.0001980536315199856, + "loss": 3.0914196968078613, + "step": 3745, + "token_acc": 0.28861965310653986 + }, + { + "epoch": 2.195836997947816, + "grad_norm": 0.43664382706208227, + "learning_rate": 0.00019805172813659638, + "loss": 3.111490249633789, + "step": 3746, + "token_acc": 0.2863194515501893 + }, + { + "epoch": 2.196423336265025, + "grad_norm": 0.31336303931539444, + "learning_rate": 0.0001980498238321433, + "loss": 3.097074508666992, + "step": 3747, + "token_acc": 0.2879774102795686 + }, + { + "epoch": 2.197009674582234, + "grad_norm": 0.43244391044620856, + "learning_rate": 0.00019804791860664428, + "loss": 3.108273983001709, + "step": 3748, + "token_acc": 0.2849440632497388 + }, + { + "epoch": 2.197596012899443, + "grad_norm": 0.38702353495385483, + "learning_rate": 0.00019804601246011715, + "loss": 3.087984085083008, + "step": 3749, + "token_acc": 0.28965369701831795 + }, + { + "epoch": 2.198182351216652, + "grad_norm": 0.4649827179467996, + "learning_rate": 0.00019804410539257984, + "loss": 3.1016461849212646, + "step": 3750, + "token_acc": 0.2874370685697149 + }, + { + "epoch": 2.198768689533861, + "grad_norm": 0.4312944788086562, + "learning_rate": 0.00019804219740405033, + "loss": 3.0383925437927246, + "step": 3751, + "token_acc": 0.29606947056280386 + }, + { + "epoch": 2.19935502785107, + "grad_norm": 0.3723340407649122, + "learning_rate": 0.00019804028849454644, + "loss": 3.0107102394104004, + "step": 3752, + "token_acc": 0.29872457718259815 + }, + { + "epoch": 2.199941366168279, + "grad_norm": 0.3469114638423638, + "learning_rate": 0.0001980383786640862, + "loss": 3.0981040000915527, + "step": 3753, + "token_acc": 0.2874047811733272 + }, + { + "epoch": 2.2005277044854883, + "grad_norm": 0.4506402245473934, + "learning_rate": 0.00019803646791268745, + "loss": 3.0969290733337402, + "step": 3754, + "token_acc": 0.28787476987255656 + }, + { + "epoch": 2.201114042802697, + "grad_norm": 0.46638260040785406, + "learning_rate": 0.00019803455624036823, + "loss": 3.027515411376953, + "step": 3755, + "token_acc": 0.29801522892406235 + }, + { + "epoch": 2.201700381119906, + "grad_norm": 0.33156631027238403, + "learning_rate": 0.0001980326436471464, + "loss": 3.046071767807007, + "step": 3756, + "token_acc": 0.29211779875956434 + }, + { + "epoch": 2.202286719437115, + "grad_norm": 0.45936327928293064, + "learning_rate": 0.00019803073013304005, + "loss": 3.0666890144348145, + "step": 3757, + "token_acc": 0.29142374356481754 + }, + { + "epoch": 2.2028730577543243, + "grad_norm": 0.4330485638394041, + "learning_rate": 0.00019802881569806706, + "loss": 3.0573019981384277, + "step": 3758, + "token_acc": 0.2932921376132419 + }, + { + "epoch": 2.2034593960715334, + "grad_norm": 0.32823989054508146, + "learning_rate": 0.00019802690034224544, + "loss": 3.039001941680908, + "step": 3759, + "token_acc": 0.2973704437443842 + }, + { + "epoch": 2.2040457343887425, + "grad_norm": 0.3985716694670369, + "learning_rate": 0.00019802498406559319, + "loss": 3.0624570846557617, + "step": 3760, + "token_acc": 0.2942314959779733 + }, + { + "epoch": 2.204632072705951, + "grad_norm": 0.3475653878603628, + "learning_rate": 0.0001980230668681283, + "loss": 3.035620927810669, + "step": 3761, + "token_acc": 0.29664889769896335 + }, + { + "epoch": 2.2052184110231603, + "grad_norm": 0.303161854563871, + "learning_rate": 0.00019802114874986878, + "loss": 3.047999382019043, + "step": 3762, + "token_acc": 0.2938517105855886 + }, + { + "epoch": 2.2058047493403694, + "grad_norm": 0.3183014408933996, + "learning_rate": 0.00019801922971083267, + "loss": 3.0662879943847656, + "step": 3763, + "token_acc": 0.2949339884929763 + }, + { + "epoch": 2.2063910876575785, + "grad_norm": 0.30922080298438437, + "learning_rate": 0.00019801730975103798, + "loss": 3.089285373687744, + "step": 3764, + "token_acc": 0.2897957524435264 + }, + { + "epoch": 2.2069774259747876, + "grad_norm": 0.3647627126866834, + "learning_rate": 0.00019801538887050276, + "loss": 3.0610663890838623, + "step": 3765, + "token_acc": 0.29263268093089784 + }, + { + "epoch": 2.2075637642919963, + "grad_norm": 0.3548869738237354, + "learning_rate": 0.00019801346706924496, + "loss": 3.097532272338867, + "step": 3766, + "token_acc": 0.2893190312194055 + }, + { + "epoch": 2.2081501026092054, + "grad_norm": 0.386845850131625, + "learning_rate": 0.0001980115443472828, + "loss": 3.076988458633423, + "step": 3767, + "token_acc": 0.2919435906513681 + }, + { + "epoch": 2.2087364409264145, + "grad_norm": 0.39030053704045653, + "learning_rate": 0.0001980096207046342, + "loss": 3.042177677154541, + "step": 3768, + "token_acc": 0.29525644968029946 + }, + { + "epoch": 2.2093227792436236, + "grad_norm": 0.3761438614966452, + "learning_rate": 0.00019800769614131732, + "loss": 3.0546975135803223, + "step": 3769, + "token_acc": 0.295493929725487 + }, + { + "epoch": 2.2099091175608327, + "grad_norm": 0.3405743866745887, + "learning_rate": 0.00019800577065735018, + "loss": 3.1010794639587402, + "step": 3770, + "token_acc": 0.2866519756918455 + }, + { + "epoch": 2.210495455878042, + "grad_norm": 0.41410058291171886, + "learning_rate": 0.0001980038442527509, + "loss": 3.045254707336426, + "step": 3771, + "token_acc": 0.2944036184564366 + }, + { + "epoch": 2.2110817941952505, + "grad_norm": 0.4401809710796626, + "learning_rate": 0.00019800191692753756, + "loss": 3.090334892272949, + "step": 3772, + "token_acc": 0.2887820109418976 + }, + { + "epoch": 2.2116681325124596, + "grad_norm": 0.38983100982597424, + "learning_rate": 0.00019799998868172826, + "loss": 3.06935453414917, + "step": 3773, + "token_acc": 0.2912610884364482 + }, + { + "epoch": 2.2122544708296688, + "grad_norm": 0.29262096958798306, + "learning_rate": 0.00019799805951534113, + "loss": 3.0704498291015625, + "step": 3774, + "token_acc": 0.29252268168594997 + }, + { + "epoch": 2.212840809146878, + "grad_norm": 0.3892065196542264, + "learning_rate": 0.00019799612942839428, + "loss": 3.072063446044922, + "step": 3775, + "token_acc": 0.28996862356321096 + }, + { + "epoch": 2.213427147464087, + "grad_norm": 0.36885457081080436, + "learning_rate": 0.00019799419842090585, + "loss": 3.0799636840820312, + "step": 3776, + "token_acc": 0.29031433682596475 + }, + { + "epoch": 2.2140134857812956, + "grad_norm": 0.39705518386303346, + "learning_rate": 0.00019799226649289397, + "loss": 3.0894603729248047, + "step": 3777, + "token_acc": 0.287961105148163 + }, + { + "epoch": 2.2145998240985048, + "grad_norm": 0.38423808063994297, + "learning_rate": 0.00019799033364437677, + "loss": 3.0624430179595947, + "step": 3778, + "token_acc": 0.29273675914968667 + }, + { + "epoch": 2.215186162415714, + "grad_norm": 0.3979961271454017, + "learning_rate": 0.00019798839987537247, + "loss": 3.086780548095703, + "step": 3779, + "token_acc": 0.29031007957700494 + }, + { + "epoch": 2.215772500732923, + "grad_norm": 0.3749297890915773, + "learning_rate": 0.00019798646518589917, + "loss": 3.1032357215881348, + "step": 3780, + "token_acc": 0.28681809337525216 + }, + { + "epoch": 2.216358839050132, + "grad_norm": 0.33741139449465946, + "learning_rate": 0.00019798452957597512, + "loss": 3.097425937652588, + "step": 3781, + "token_acc": 0.2862956873656254 + }, + { + "epoch": 2.2169451773673408, + "grad_norm": 0.32814606864043283, + "learning_rate": 0.0001979825930456184, + "loss": 3.0357799530029297, + "step": 3782, + "token_acc": 0.29361347340336835 + }, + { + "epoch": 2.21753151568455, + "grad_norm": 0.35110870899416013, + "learning_rate": 0.00019798065559484726, + "loss": 3.0791003704071045, + "step": 3783, + "token_acc": 0.2904191383988618 + }, + { + "epoch": 2.218117854001759, + "grad_norm": 0.33338815320793347, + "learning_rate": 0.00019797871722367986, + "loss": 3.023411273956299, + "step": 3784, + "token_acc": 0.29736474903320975 + }, + { + "epoch": 2.218704192318968, + "grad_norm": 0.32189336014873654, + "learning_rate": 0.0001979767779321345, + "loss": 3.054868221282959, + "step": 3785, + "token_acc": 0.29375726530664387 + }, + { + "epoch": 2.219290530636177, + "grad_norm": 0.3950931748868594, + "learning_rate": 0.0001979748377202293, + "loss": 3.0988640785217285, + "step": 3786, + "token_acc": 0.2864457880236215 + }, + { + "epoch": 2.219876868953386, + "grad_norm": 0.35024807643860684, + "learning_rate": 0.00019797289658798252, + "loss": 3.0997066497802734, + "step": 3787, + "token_acc": 0.28544134307038627 + }, + { + "epoch": 2.220463207270595, + "grad_norm": 0.3157189785983546, + "learning_rate": 0.00019797095453541244, + "loss": 3.0601439476013184, + "step": 3788, + "token_acc": 0.29199822072851284 + }, + { + "epoch": 2.221049545587804, + "grad_norm": 0.3721884559213196, + "learning_rate": 0.0001979690115625372, + "loss": 3.0136845111846924, + "step": 3789, + "token_acc": 0.2987582272352564 + }, + { + "epoch": 2.221635883905013, + "grad_norm": 0.3563329142879425, + "learning_rate": 0.00019796706766937513, + "loss": 3.0651297569274902, + "step": 3790, + "token_acc": 0.29267789972690267 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.32371765088238985, + "learning_rate": 0.00019796512285594447, + "loss": 3.068263292312622, + "step": 3791, + "token_acc": 0.29246109402665177 + }, + { + "epoch": 2.2228085605394314, + "grad_norm": 0.3576982355119628, + "learning_rate": 0.0001979631771222635, + "loss": 3.0744235515594482, + "step": 3792, + "token_acc": 0.2912931966168587 + }, + { + "epoch": 2.22339489885664, + "grad_norm": 0.3509447133289146, + "learning_rate": 0.0001979612304683505, + "loss": 3.0494871139526367, + "step": 3793, + "token_acc": 0.29288084913996837 + }, + { + "epoch": 2.223981237173849, + "grad_norm": 0.3245086267603309, + "learning_rate": 0.00019795928289422375, + "loss": 3.0446319580078125, + "step": 3794, + "token_acc": 0.2952821348974849 + }, + { + "epoch": 2.2245675754910583, + "grad_norm": 0.2887129474105757, + "learning_rate": 0.00019795733439990153, + "loss": 3.083066940307617, + "step": 3795, + "token_acc": 0.29071148546364944 + }, + { + "epoch": 2.2251539138082674, + "grad_norm": 0.3746965333089583, + "learning_rate": 0.0001979553849854021, + "loss": 3.096168279647827, + "step": 3796, + "token_acc": 0.2875678476585271 + }, + { + "epoch": 2.2257402521254765, + "grad_norm": 0.4970990265567727, + "learning_rate": 0.0001979534346507439, + "loss": 3.102025032043457, + "step": 3797, + "token_acc": 0.28624045484727756 + }, + { + "epoch": 2.226326590442685, + "grad_norm": 0.528823959950421, + "learning_rate": 0.00019795148339594513, + "loss": 3.1168415546417236, + "step": 3798, + "token_acc": 0.2835878135157938 + }, + { + "epoch": 2.2269129287598943, + "grad_norm": 0.37567138498554514, + "learning_rate": 0.00019794953122102417, + "loss": 3.0929999351501465, + "step": 3799, + "token_acc": 0.28861375482612334 + }, + { + "epoch": 2.2274992670771034, + "grad_norm": 0.4782532200071338, + "learning_rate": 0.00019794757812599938, + "loss": 3.065145492553711, + "step": 3800, + "token_acc": 0.2919840760241428 + }, + { + "epoch": 2.2280856053943126, + "grad_norm": 0.3529207924501019, + "learning_rate": 0.00019794562411088907, + "loss": 3.035465955734253, + "step": 3801, + "token_acc": 0.29645383812850684 + }, + { + "epoch": 2.2286719437115217, + "grad_norm": 0.4843789820636367, + "learning_rate": 0.0001979436691757116, + "loss": 3.0939836502075195, + "step": 3802, + "token_acc": 0.28905246300916304 + }, + { + "epoch": 2.2292582820287308, + "grad_norm": 0.32421219430694725, + "learning_rate": 0.00019794171332048532, + "loss": 3.0550537109375, + "step": 3803, + "token_acc": 0.2923750065304843 + }, + { + "epoch": 2.2298446203459394, + "grad_norm": 0.41660244963102266, + "learning_rate": 0.00019793975654522865, + "loss": 3.0699100494384766, + "step": 3804, + "token_acc": 0.291048301435472 + }, + { + "epoch": 2.2304309586631486, + "grad_norm": 0.39986855016388834, + "learning_rate": 0.00019793779884995992, + "loss": 3.0800912380218506, + "step": 3805, + "token_acc": 0.2904607195164564 + }, + { + "epoch": 2.2310172969803577, + "grad_norm": 0.3917467610845519, + "learning_rate": 0.00019793584023469754, + "loss": 3.0499680042266846, + "step": 3806, + "token_acc": 0.2944743265778527 + }, + { + "epoch": 2.231603635297567, + "grad_norm": 0.39406904650667385, + "learning_rate": 0.00019793388069945994, + "loss": 3.1288533210754395, + "step": 3807, + "token_acc": 0.2828698410015765 + }, + { + "epoch": 2.232189973614776, + "grad_norm": 0.3974274998257378, + "learning_rate": 0.00019793192024426546, + "loss": 3.0647244453430176, + "step": 3808, + "token_acc": 0.29065758570858685 + }, + { + "epoch": 2.2327763119319846, + "grad_norm": 0.35364735209597536, + "learning_rate": 0.00019792995886913257, + "loss": 3.072148323059082, + "step": 3809, + "token_acc": 0.29011482542273326 + }, + { + "epoch": 2.2333626502491937, + "grad_norm": 0.3485360375514206, + "learning_rate": 0.0001979279965740797, + "loss": 3.080418348312378, + "step": 3810, + "token_acc": 0.2898313014861661 + }, + { + "epoch": 2.233948988566403, + "grad_norm": 0.4027758447025723, + "learning_rate": 0.00019792603335912524, + "loss": 3.0551085472106934, + "step": 3811, + "token_acc": 0.2929281457204357 + }, + { + "epoch": 2.234535326883612, + "grad_norm": 0.37345352377432484, + "learning_rate": 0.0001979240692242877, + "loss": 3.04703426361084, + "step": 3812, + "token_acc": 0.2945132982314027 + }, + { + "epoch": 2.235121665200821, + "grad_norm": 0.38550662001234454, + "learning_rate": 0.0001979221041695854, + "loss": 3.114452838897705, + "step": 3813, + "token_acc": 0.28602288409647986 + }, + { + "epoch": 2.23570800351803, + "grad_norm": 0.3949782844491148, + "learning_rate": 0.00019792013819503693, + "loss": 3.0616583824157715, + "step": 3814, + "token_acc": 0.2910285624849123 + }, + { + "epoch": 2.236294341835239, + "grad_norm": 0.3184934679251132, + "learning_rate": 0.00019791817130066072, + "loss": 3.1092820167541504, + "step": 3815, + "token_acc": 0.28525305162989606 + }, + { + "epoch": 2.236880680152448, + "grad_norm": 0.39104097290584805, + "learning_rate": 0.00019791620348647522, + "loss": 3.0384013652801514, + "step": 3816, + "token_acc": 0.2947135282440177 + }, + { + "epoch": 2.237467018469657, + "grad_norm": 0.3085213492631322, + "learning_rate": 0.00019791423475249892, + "loss": 3.0341625213623047, + "step": 3817, + "token_acc": 0.29787547682162024 + }, + { + "epoch": 2.238053356786866, + "grad_norm": 0.32723313844278035, + "learning_rate": 0.00019791226509875034, + "loss": 3.032844066619873, + "step": 3818, + "token_acc": 0.2974975041939833 + }, + { + "epoch": 2.2386396951040752, + "grad_norm": 0.34137692023776933, + "learning_rate": 0.00019791029452524797, + "loss": 3.0497708320617676, + "step": 3819, + "token_acc": 0.294521695865085 + }, + { + "epoch": 2.239226033421284, + "grad_norm": 0.3248472053422265, + "learning_rate": 0.00019790832303201032, + "loss": 3.042205810546875, + "step": 3820, + "token_acc": 0.29325077867325955 + }, + { + "epoch": 2.239812371738493, + "grad_norm": 0.36807091289750077, + "learning_rate": 0.00019790635061905592, + "loss": 3.0656116008758545, + "step": 3821, + "token_acc": 0.29280753305729934 + }, + { + "epoch": 2.240398710055702, + "grad_norm": 0.3697817276162584, + "learning_rate": 0.00019790437728640329, + "loss": 3.081299304962158, + "step": 3822, + "token_acc": 0.2899944442974682 + }, + { + "epoch": 2.2409850483729112, + "grad_norm": 0.34097419942137264, + "learning_rate": 0.00019790240303407092, + "loss": 3.0831947326660156, + "step": 3823, + "token_acc": 0.2899033144339468 + }, + { + "epoch": 2.2415713866901203, + "grad_norm": 0.35510218559109447, + "learning_rate": 0.00019790042786207743, + "loss": 3.051969528198242, + "step": 3824, + "token_acc": 0.29546849203924275 + }, + { + "epoch": 2.2421577250073295, + "grad_norm": 0.3226829422229039, + "learning_rate": 0.00019789845177044132, + "loss": 3.0572009086608887, + "step": 3825, + "token_acc": 0.29258560005020684 + }, + { + "epoch": 2.242744063324538, + "grad_norm": 0.3602060258290102, + "learning_rate": 0.00019789647475918122, + "loss": 3.0947394371032715, + "step": 3826, + "token_acc": 0.28806298483269654 + }, + { + "epoch": 2.2433304016417472, + "grad_norm": 0.3786246733709363, + "learning_rate": 0.00019789449682831563, + "loss": 3.0828568935394287, + "step": 3827, + "token_acc": 0.2879579940784604 + }, + { + "epoch": 2.2439167399589564, + "grad_norm": 0.5105043078199382, + "learning_rate": 0.00019789251797786315, + "loss": 3.097623348236084, + "step": 3828, + "token_acc": 0.2876481104228163 + }, + { + "epoch": 2.2445030782761655, + "grad_norm": 0.5331372829565559, + "learning_rate": 0.00019789053820784238, + "loss": 3.027785301208496, + "step": 3829, + "token_acc": 0.2957140666530362 + }, + { + "epoch": 2.2450894165933746, + "grad_norm": 0.34342607185948726, + "learning_rate": 0.0001978885575182719, + "loss": 3.069899082183838, + "step": 3830, + "token_acc": 0.29045358287696915 + }, + { + "epoch": 2.2456757549105832, + "grad_norm": 0.42637796526843513, + "learning_rate": 0.00019788657590917038, + "loss": 3.0789177417755127, + "step": 3831, + "token_acc": 0.28990223013522015 + }, + { + "epoch": 2.2462620932277924, + "grad_norm": 0.36022269529952483, + "learning_rate": 0.0001978845933805563, + "loss": 3.0939173698425293, + "step": 3832, + "token_acc": 0.28790971366114665 + }, + { + "epoch": 2.2468484315450015, + "grad_norm": 0.39269173541876246, + "learning_rate": 0.00019788260993244843, + "loss": 3.062408924102783, + "step": 3833, + "token_acc": 0.29059209989390355 + }, + { + "epoch": 2.2474347698622106, + "grad_norm": 0.31103159161968225, + "learning_rate": 0.0001978806255648653, + "loss": 3.0612998008728027, + "step": 3834, + "token_acc": 0.2915513857885721 + }, + { + "epoch": 2.2480211081794197, + "grad_norm": 0.3808013998924554, + "learning_rate": 0.00019787864027782562, + "loss": 3.0871999263763428, + "step": 3835, + "token_acc": 0.28910718601466207 + }, + { + "epoch": 2.2486074464966284, + "grad_norm": 0.31738546832779685, + "learning_rate": 0.00019787665407134801, + "loss": 3.0428075790405273, + "step": 3836, + "token_acc": 0.2943471007057312 + }, + { + "epoch": 2.2491937848138375, + "grad_norm": 0.3885261989418085, + "learning_rate": 0.0001978746669454511, + "loss": 3.0900988578796387, + "step": 3837, + "token_acc": 0.2885441958981247 + }, + { + "epoch": 2.2497801231310466, + "grad_norm": 0.45066357840273746, + "learning_rate": 0.00019787267890015363, + "loss": 3.0723986625671387, + "step": 3838, + "token_acc": 0.2923333271717575 + }, + { + "epoch": 2.2503664614482557, + "grad_norm": 0.4983521815503289, + "learning_rate": 0.00019787068993547418, + "loss": 3.0541491508483887, + "step": 3839, + "token_acc": 0.2945795259324838 + }, + { + "epoch": 2.250952799765465, + "grad_norm": 0.44011693361142934, + "learning_rate": 0.00019786870005143148, + "loss": 3.0687098503112793, + "step": 3840, + "token_acc": 0.29009782356882896 + }, + { + "epoch": 2.2515391380826735, + "grad_norm": 0.3332672507034587, + "learning_rate": 0.0001978667092480442, + "loss": 3.050999164581299, + "step": 3841, + "token_acc": 0.2927275158259582 + }, + { + "epoch": 2.2521254763998826, + "grad_norm": 0.33316882156129396, + "learning_rate": 0.0001978647175253311, + "loss": 3.0840840339660645, + "step": 3842, + "token_acc": 0.29060568826103866 + }, + { + "epoch": 2.2527118147170917, + "grad_norm": 0.3803384958834979, + "learning_rate": 0.00019786272488331088, + "loss": 3.082376480102539, + "step": 3843, + "token_acc": 0.289594089957894 + }, + { + "epoch": 2.253298153034301, + "grad_norm": 0.4251140913978163, + "learning_rate": 0.0001978607313220022, + "loss": 3.053457260131836, + "step": 3844, + "token_acc": 0.29341770321834454 + }, + { + "epoch": 2.25388449135151, + "grad_norm": 0.35962353817743903, + "learning_rate": 0.00019785873684142382, + "loss": 3.0669326782226562, + "step": 3845, + "token_acc": 0.2906754631383887 + }, + { + "epoch": 2.254470829668719, + "grad_norm": 0.4387624314495781, + "learning_rate": 0.00019785674144159448, + "loss": 3.0842366218566895, + "step": 3846, + "token_acc": 0.2890290709889671 + }, + { + "epoch": 2.2550571679859277, + "grad_norm": 0.35356744404633866, + "learning_rate": 0.0001978547451225329, + "loss": 3.0830812454223633, + "step": 3847, + "token_acc": 0.29014837767779994 + }, + { + "epoch": 2.255643506303137, + "grad_norm": 0.3477602693635744, + "learning_rate": 0.00019785274788425788, + "loss": 3.0593464374542236, + "step": 3848, + "token_acc": 0.293080381679594 + }, + { + "epoch": 2.256229844620346, + "grad_norm": 0.35894957550749595, + "learning_rate": 0.00019785074972678812, + "loss": 3.036929130554199, + "step": 3849, + "token_acc": 0.29453205483794276 + }, + { + "epoch": 2.256816182937555, + "grad_norm": 0.42093371558242826, + "learning_rate": 0.00019784875065014242, + "loss": 3.0954580307006836, + "step": 3850, + "token_acc": 0.2863352363979236 + }, + { + "epoch": 2.257402521254764, + "grad_norm": 0.3809209028622022, + "learning_rate": 0.0001978467506543396, + "loss": 3.0760891437530518, + "step": 3851, + "token_acc": 0.29075386597938147 + }, + { + "epoch": 2.257988859571973, + "grad_norm": 0.43697845784981676, + "learning_rate": 0.00019784474973939838, + "loss": 3.0397307872772217, + "step": 3852, + "token_acc": 0.29563900441351215 + }, + { + "epoch": 2.258575197889182, + "grad_norm": 0.4293624524475749, + "learning_rate": 0.0001978427479053376, + "loss": 3.085088014602661, + "step": 3853, + "token_acc": 0.28990424970086004 + }, + { + "epoch": 2.259161536206391, + "grad_norm": 0.46268425793005175, + "learning_rate": 0.00019784074515217604, + "loss": 3.090554714202881, + "step": 3854, + "token_acc": 0.29035512138076713 + }, + { + "epoch": 2.2597478745236, + "grad_norm": 0.40647065091778983, + "learning_rate": 0.0001978387414799325, + "loss": 3.062056541442871, + "step": 3855, + "token_acc": 0.2921633820510225 + }, + { + "epoch": 2.2603342128408093, + "grad_norm": 0.37803857639264177, + "learning_rate": 0.00019783673688862586, + "loss": 3.0450124740600586, + "step": 3856, + "token_acc": 0.29378786168491555 + }, + { + "epoch": 2.2609205511580184, + "grad_norm": 0.39589980240767714, + "learning_rate": 0.0001978347313782749, + "loss": 3.0835564136505127, + "step": 3857, + "token_acc": 0.28886139126472965 + }, + { + "epoch": 2.261506889475227, + "grad_norm": 0.3950371376683415, + "learning_rate": 0.00019783272494889844, + "loss": 3.087594509124756, + "step": 3858, + "token_acc": 0.2871967087976178 + }, + { + "epoch": 2.262093227792436, + "grad_norm": 0.35490649937903, + "learning_rate": 0.00019783071760051538, + "loss": 3.073307752609253, + "step": 3859, + "token_acc": 0.2903033875766009 + }, + { + "epoch": 2.2626795661096453, + "grad_norm": 0.4002758399775946, + "learning_rate": 0.00019782870933314457, + "loss": 3.059218168258667, + "step": 3860, + "token_acc": 0.29250263416546646 + }, + { + "epoch": 2.2632659044268544, + "grad_norm": 0.3943013465477426, + "learning_rate": 0.00019782670014680486, + "loss": 3.0819945335388184, + "step": 3861, + "token_acc": 0.2900453661697991 + }, + { + "epoch": 2.2638522427440635, + "grad_norm": 0.3869606624731942, + "learning_rate": 0.00019782469004151515, + "loss": 3.0478851795196533, + "step": 3862, + "token_acc": 0.29412500814703774 + }, + { + "epoch": 2.264438581061272, + "grad_norm": 0.380158075377517, + "learning_rate": 0.00019782267901729425, + "loss": 3.011404037475586, + "step": 3863, + "token_acc": 0.29967902034504634 + }, + { + "epoch": 2.2650249193784813, + "grad_norm": 0.3743676319930695, + "learning_rate": 0.00019782066707416113, + "loss": 3.0840048789978027, + "step": 3864, + "token_acc": 0.29036241134196356 + }, + { + "epoch": 2.2656112576956904, + "grad_norm": 0.42456445124439374, + "learning_rate": 0.0001978186542121346, + "loss": 3.1118805408477783, + "step": 3865, + "token_acc": 0.2845688650290346 + }, + { + "epoch": 2.2661975960128995, + "grad_norm": 0.4683345251303445, + "learning_rate": 0.0001978166404312337, + "loss": 3.05707049369812, + "step": 3866, + "token_acc": 0.2921084164899667 + }, + { + "epoch": 2.2667839343301086, + "grad_norm": 0.4168564471174331, + "learning_rate": 0.00019781462573147725, + "loss": 3.061349630355835, + "step": 3867, + "token_acc": 0.2907291959728967 + }, + { + "epoch": 2.2673702726473177, + "grad_norm": 0.3897562301307626, + "learning_rate": 0.0001978126101128842, + "loss": 3.031933307647705, + "step": 3868, + "token_acc": 0.2967982359657668 + }, + { + "epoch": 2.2679566109645264, + "grad_norm": 0.4335121229192419, + "learning_rate": 0.00019781059357547345, + "loss": 3.0813913345336914, + "step": 3869, + "token_acc": 0.289440160386766 + }, + { + "epoch": 2.2685429492817355, + "grad_norm": 0.3989462236388706, + "learning_rate": 0.000197808576119264, + "loss": 3.056244373321533, + "step": 3870, + "token_acc": 0.29418721433647393 + }, + { + "epoch": 2.2691292875989446, + "grad_norm": 0.41244545306837865, + "learning_rate": 0.00019780655774427478, + "loss": 3.1044414043426514, + "step": 3871, + "token_acc": 0.2865758296937521 + }, + { + "epoch": 2.2697156259161537, + "grad_norm": 0.40007065004324244, + "learning_rate": 0.00019780453845052475, + "loss": 3.0269341468811035, + "step": 3872, + "token_acc": 0.2985389844446699 + }, + { + "epoch": 2.270301964233363, + "grad_norm": 0.3455341494471005, + "learning_rate": 0.00019780251823803285, + "loss": 3.0718140602111816, + "step": 3873, + "token_acc": 0.2918600097632676 + }, + { + "epoch": 2.2708883025505715, + "grad_norm": 0.35191370222477264, + "learning_rate": 0.0001978004971068181, + "loss": 3.105452537536621, + "step": 3874, + "token_acc": 0.2871991577929339 + }, + { + "epoch": 2.2714746408677806, + "grad_norm": 0.369239621976737, + "learning_rate": 0.00019779847505689948, + "loss": 3.074030876159668, + "step": 3875, + "token_acc": 0.2907204365160919 + }, + { + "epoch": 2.2720609791849897, + "grad_norm": 0.34297005557027793, + "learning_rate": 0.00019779645208829595, + "loss": 3.0372865200042725, + "step": 3876, + "token_acc": 0.29658090572627727 + }, + { + "epoch": 2.272647317502199, + "grad_norm": 0.34038265953210595, + "learning_rate": 0.00019779442820102653, + "loss": 3.0855207443237305, + "step": 3877, + "token_acc": 0.2888565526380751 + }, + { + "epoch": 2.273233655819408, + "grad_norm": 0.3572219542224862, + "learning_rate": 0.00019779240339511025, + "loss": 3.1044750213623047, + "step": 3878, + "token_acc": 0.2871144883014916 + }, + { + "epoch": 2.273819994136617, + "grad_norm": 0.37208387117372316, + "learning_rate": 0.00019779037767056611, + "loss": 3.040769577026367, + "step": 3879, + "token_acc": 0.29628549692379874 + }, + { + "epoch": 2.2744063324538257, + "grad_norm": 0.38764424822561205, + "learning_rate": 0.00019778835102741315, + "loss": 3.0973968505859375, + "step": 3880, + "token_acc": 0.2881991620645655 + }, + { + "epoch": 2.274992670771035, + "grad_norm": 0.34203350960186524, + "learning_rate": 0.0001977863234656704, + "loss": 3.0668859481811523, + "step": 3881, + "token_acc": 0.2923337234746323 + }, + { + "epoch": 2.275579009088244, + "grad_norm": 0.3339718170418426, + "learning_rate": 0.00019778429498535692, + "loss": 3.0442793369293213, + "step": 3882, + "token_acc": 0.29317082669782357 + }, + { + "epoch": 2.276165347405453, + "grad_norm": 0.3772828287091329, + "learning_rate": 0.00019778226558649176, + "loss": 3.0768418312072754, + "step": 3883, + "token_acc": 0.2891283146293293 + }, + { + "epoch": 2.2767516857226617, + "grad_norm": 0.3514258317399042, + "learning_rate": 0.00019778023526909397, + "loss": 3.0617470741271973, + "step": 3884, + "token_acc": 0.29051059565547876 + }, + { + "epoch": 2.277338024039871, + "grad_norm": 0.3730563696832072, + "learning_rate": 0.00019777820403318262, + "loss": 3.075042247772217, + "step": 3885, + "token_acc": 0.2908457325591896 + }, + { + "epoch": 2.27792436235708, + "grad_norm": 0.32662730412792623, + "learning_rate": 0.00019777617187877682, + "loss": 3.1011276245117188, + "step": 3886, + "token_acc": 0.28689720938240865 + }, + { + "epoch": 2.278510700674289, + "grad_norm": 0.31635983598614803, + "learning_rate": 0.0001977741388058956, + "loss": 3.071061611175537, + "step": 3887, + "token_acc": 0.290070867354799 + }, + { + "epoch": 2.279097038991498, + "grad_norm": 0.33721267383444226, + "learning_rate": 0.00019777210481455813, + "loss": 3.1046292781829834, + "step": 3888, + "token_acc": 0.2880738953131714 + }, + { + "epoch": 2.2796833773087073, + "grad_norm": 0.3196585850021692, + "learning_rate": 0.00019777006990478349, + "loss": 3.103748321533203, + "step": 3889, + "token_acc": 0.28723306661230313 + }, + { + "epoch": 2.280269715625916, + "grad_norm": 0.33120359335203414, + "learning_rate": 0.0001977680340765908, + "loss": 3.0703234672546387, + "step": 3890, + "token_acc": 0.2924486015628797 + }, + { + "epoch": 2.280856053943125, + "grad_norm": 0.301223330458454, + "learning_rate": 0.00019776599732999913, + "loss": 3.012737274169922, + "step": 3891, + "token_acc": 0.2993307568568336 + }, + { + "epoch": 2.281442392260334, + "grad_norm": 0.34457857747464643, + "learning_rate": 0.00019776395966502767, + "loss": 3.124857187271118, + "step": 3892, + "token_acc": 0.28540306757977685 + }, + { + "epoch": 2.2820287305775433, + "grad_norm": 0.33680874522147286, + "learning_rate": 0.00019776192108169558, + "loss": 3.0902318954467773, + "step": 3893, + "token_acc": 0.28752942925168085 + }, + { + "epoch": 2.2826150688947524, + "grad_norm": 0.29992755828963785, + "learning_rate": 0.00019775988158002194, + "loss": 3.072676181793213, + "step": 3894, + "token_acc": 0.29093481667764826 + }, + { + "epoch": 2.283201407211961, + "grad_norm": 0.32217562503923125, + "learning_rate": 0.00019775784116002595, + "loss": 3.06217622756958, + "step": 3895, + "token_acc": 0.2922255724879798 + }, + { + "epoch": 2.28378774552917, + "grad_norm": 0.36714026462731875, + "learning_rate": 0.0001977557998217268, + "loss": 3.069631576538086, + "step": 3896, + "token_acc": 0.2911714425268642 + }, + { + "epoch": 2.2843740838463793, + "grad_norm": 0.35990979575254617, + "learning_rate": 0.00019775375756514362, + "loss": 3.0887770652770996, + "step": 3897, + "token_acc": 0.289622237826688 + }, + { + "epoch": 2.2849604221635884, + "grad_norm": 0.3330263531585479, + "learning_rate": 0.00019775171439029562, + "loss": 3.0638556480407715, + "step": 3898, + "token_acc": 0.29386143123710834 + }, + { + "epoch": 2.2855467604807975, + "grad_norm": 0.2963750565116905, + "learning_rate": 0.00019774967029720196, + "loss": 3.1000118255615234, + "step": 3899, + "token_acc": 0.28890363648676104 + }, + { + "epoch": 2.2861330987980066, + "grad_norm": 0.3354805972963505, + "learning_rate": 0.0001977476252858819, + "loss": 3.084341049194336, + "step": 3900, + "token_acc": 0.2889458094846578 + }, + { + "epoch": 2.2867194371152153, + "grad_norm": 0.38336428578348897, + "learning_rate": 0.0001977455793563546, + "loss": 3.0917348861694336, + "step": 3901, + "token_acc": 0.2873119958922573 + }, + { + "epoch": 2.2873057754324244, + "grad_norm": 0.3492427682653205, + "learning_rate": 0.00019774353250863932, + "loss": 3.0789947509765625, + "step": 3902, + "token_acc": 0.2897766186518221 + }, + { + "epoch": 2.2878921137496335, + "grad_norm": 0.3280790319423024, + "learning_rate": 0.00019774148474275524, + "loss": 3.060258388519287, + "step": 3903, + "token_acc": 0.2928114930182599 + }, + { + "epoch": 2.2884784520668426, + "grad_norm": 0.37595969062209994, + "learning_rate": 0.00019773943605872163, + "loss": 3.0321550369262695, + "step": 3904, + "token_acc": 0.2983767905351547 + }, + { + "epoch": 2.2890647903840518, + "grad_norm": 0.41823000450298575, + "learning_rate": 0.0001977373864565577, + "loss": 3.1204299926757812, + "step": 3905, + "token_acc": 0.28475537227325365 + }, + { + "epoch": 2.2896511287012604, + "grad_norm": 0.39220491819646247, + "learning_rate": 0.00019773533593628274, + "loss": 3.037033796310425, + "step": 3906, + "token_acc": 0.29528027227621273 + }, + { + "epoch": 2.2902374670184695, + "grad_norm": 0.3761338896329433, + "learning_rate": 0.00019773328449791601, + "loss": 3.0608339309692383, + "step": 3907, + "token_acc": 0.29134067613297576 + }, + { + "epoch": 2.2908238053356786, + "grad_norm": 0.4725410650045336, + "learning_rate": 0.00019773123214147679, + "loss": 3.0568594932556152, + "step": 3908, + "token_acc": 0.2913151074154603 + }, + { + "epoch": 2.2914101436528878, + "grad_norm": 0.3158851759938663, + "learning_rate": 0.0001977291788669843, + "loss": 3.0284476280212402, + "step": 3909, + "token_acc": 0.29672530733361596 + }, + { + "epoch": 2.291996481970097, + "grad_norm": 0.41928382392857605, + "learning_rate": 0.00019772712467445788, + "loss": 3.0885353088378906, + "step": 3910, + "token_acc": 0.2878898543642635 + }, + { + "epoch": 2.292582820287306, + "grad_norm": 0.39580765785621064, + "learning_rate": 0.0001977250695639168, + "loss": 3.077415943145752, + "step": 3911, + "token_acc": 0.28921388469970977 + }, + { + "epoch": 2.2931691586045146, + "grad_norm": 0.4880177730409939, + "learning_rate": 0.00019772301353538038, + "loss": 3.0579886436462402, + "step": 3912, + "token_acc": 0.2918747524764806 + }, + { + "epoch": 2.2937554969217238, + "grad_norm": 0.3548938789884596, + "learning_rate": 0.00019772095658886793, + "loss": 3.06215500831604, + "step": 3913, + "token_acc": 0.2917418144159296 + }, + { + "epoch": 2.294341835238933, + "grad_norm": 0.4000728819185458, + "learning_rate": 0.0001977188987243988, + "loss": 3.070331573486328, + "step": 3914, + "token_acc": 0.29202924315051604 + }, + { + "epoch": 2.294928173556142, + "grad_norm": 0.41387056754323365, + "learning_rate": 0.00019771683994199228, + "loss": 3.0733089447021484, + "step": 3915, + "token_acc": 0.29079799552525876 + }, + { + "epoch": 2.295514511873351, + "grad_norm": 0.368425537632172, + "learning_rate": 0.00019771478024166773, + "loss": 3.0414767265319824, + "step": 3916, + "token_acc": 0.2939419557656969 + }, + { + "epoch": 2.2961008501905598, + "grad_norm": 0.4792296598248775, + "learning_rate": 0.00019771271962344447, + "loss": 3.090397357940674, + "step": 3917, + "token_acc": 0.2880425252369274 + }, + { + "epoch": 2.296687188507769, + "grad_norm": 0.5752338769050087, + "learning_rate": 0.0001977106580873419, + "loss": 3.1029324531555176, + "step": 3918, + "token_acc": 0.28563329494176143 + }, + { + "epoch": 2.297273526824978, + "grad_norm": 0.4579930708991789, + "learning_rate": 0.00019770859563337934, + "loss": 3.087873935699463, + "step": 3919, + "token_acc": 0.2874762808349146 + }, + { + "epoch": 2.297859865142187, + "grad_norm": 0.44713112297759794, + "learning_rate": 0.00019770653226157617, + "loss": 3.0868358612060547, + "step": 3920, + "token_acc": 0.28901353194489143 + }, + { + "epoch": 2.298446203459396, + "grad_norm": 0.48385659959776484, + "learning_rate": 0.00019770446797195187, + "loss": 3.090729236602783, + "step": 3921, + "token_acc": 0.2905875785948246 + }, + { + "epoch": 2.2990325417766053, + "grad_norm": 0.380021781282166, + "learning_rate": 0.0001977024027645257, + "loss": 3.08174467086792, + "step": 3922, + "token_acc": 0.2890153299361966 + }, + { + "epoch": 2.299618880093814, + "grad_norm": 0.3288134995101452, + "learning_rate": 0.0001977003366393171, + "loss": 3.0719780921936035, + "step": 3923, + "token_acc": 0.29157037848580203 + }, + { + "epoch": 2.300205218411023, + "grad_norm": 0.33002493717042414, + "learning_rate": 0.0001976982695963455, + "loss": 3.0802690982818604, + "step": 3924, + "token_acc": 0.2887173187369119 + }, + { + "epoch": 2.300791556728232, + "grad_norm": 0.29661916318359, + "learning_rate": 0.0001976962016356303, + "loss": 3.035904884338379, + "step": 3925, + "token_acc": 0.2945318222271227 + }, + { + "epoch": 2.3013778950454413, + "grad_norm": 0.2888831355919122, + "learning_rate": 0.00019769413275719098, + "loss": 3.07951021194458, + "step": 3926, + "token_acc": 0.2898424586582524 + }, + { + "epoch": 2.3019642333626504, + "grad_norm": 0.35350921861194695, + "learning_rate": 0.0001976920629610469, + "loss": 3.0688276290893555, + "step": 3927, + "token_acc": 0.28918185548451997 + }, + { + "epoch": 2.302550571679859, + "grad_norm": 0.35487593752783886, + "learning_rate": 0.00019768999224721752, + "loss": 3.0326390266418457, + "step": 3928, + "token_acc": 0.29618882000769936 + }, + { + "epoch": 2.303136909997068, + "grad_norm": 0.30245074871517424, + "learning_rate": 0.0001976879206157223, + "loss": 3.0536086559295654, + "step": 3929, + "token_acc": 0.29366098377969796 + }, + { + "epoch": 2.3037232483142773, + "grad_norm": 0.37021968241025716, + "learning_rate": 0.0001976858480665807, + "loss": 3.063889741897583, + "step": 3930, + "token_acc": 0.29233408135365885 + }, + { + "epoch": 2.3043095866314864, + "grad_norm": 0.35570209454030016, + "learning_rate": 0.0001976837745998122, + "loss": 3.0275192260742188, + "step": 3931, + "token_acc": 0.29613200406999185 + }, + { + "epoch": 2.3048959249486956, + "grad_norm": 0.45177582271090566, + "learning_rate": 0.00019768170021543626, + "loss": 3.029999256134033, + "step": 3932, + "token_acc": 0.29564851185292446 + }, + { + "epoch": 2.3054822632659047, + "grad_norm": 0.5483197251038302, + "learning_rate": 0.00019767962491347237, + "loss": 3.078869104385376, + "step": 3933, + "token_acc": 0.2902977359660912 + }, + { + "epoch": 2.3060686015831133, + "grad_norm": 0.44266582453522674, + "learning_rate": 0.00019767754869394005, + "loss": 3.041951894760132, + "step": 3934, + "token_acc": 0.29301954274264536 + }, + { + "epoch": 2.3066549399003224, + "grad_norm": 0.31380464085717574, + "learning_rate": 0.00019767547155685877, + "loss": 3.084629535675049, + "step": 3935, + "token_acc": 0.28886242892126346 + }, + { + "epoch": 2.3072412782175316, + "grad_norm": 0.39543992471987766, + "learning_rate": 0.00019767339350224808, + "loss": 3.045207977294922, + "step": 3936, + "token_acc": 0.29422917229442425 + }, + { + "epoch": 2.3078276165347407, + "grad_norm": 0.46559447260023007, + "learning_rate": 0.00019767131453012743, + "loss": 3.1069793701171875, + "step": 3937, + "token_acc": 0.28597872428068294 + }, + { + "epoch": 2.3084139548519493, + "grad_norm": 0.5168921879271132, + "learning_rate": 0.00019766923464051642, + "loss": 3.089266777038574, + "step": 3938, + "token_acc": 0.2876342421460436 + }, + { + "epoch": 2.3090002931691584, + "grad_norm": 0.3508368107430926, + "learning_rate": 0.00019766715383343457, + "loss": 3.0697221755981445, + "step": 3939, + "token_acc": 0.29131249175570506 + }, + { + "epoch": 2.3095866314863676, + "grad_norm": 0.3805241050145065, + "learning_rate": 0.0001976650721089014, + "loss": 3.09342098236084, + "step": 3940, + "token_acc": 0.28739591781386337 + }, + { + "epoch": 2.3101729698035767, + "grad_norm": 0.42671834029274397, + "learning_rate": 0.00019766298946693648, + "loss": 3.0487489700317383, + "step": 3941, + "token_acc": 0.29326043466764246 + }, + { + "epoch": 2.310759308120786, + "grad_norm": 0.35535042637947967, + "learning_rate": 0.00019766090590755936, + "loss": 3.034674644470215, + "step": 3942, + "token_acc": 0.2965640023217256 + }, + { + "epoch": 2.311345646437995, + "grad_norm": 0.3157537783986005, + "learning_rate": 0.00019765882143078968, + "loss": 3.0724880695343018, + "step": 3943, + "token_acc": 0.29286366839111716 + }, + { + "epoch": 2.3119319847552036, + "grad_norm": 0.3623918428308919, + "learning_rate": 0.00019765673603664693, + "loss": 3.0392496585845947, + "step": 3944, + "token_acc": 0.29375016289191797 + }, + { + "epoch": 2.3125183230724127, + "grad_norm": 0.2952976239043702, + "learning_rate": 0.00019765464972515076, + "loss": 3.06953763961792, + "step": 3945, + "token_acc": 0.2895000790545081 + }, + { + "epoch": 2.313104661389622, + "grad_norm": 0.3403753795015596, + "learning_rate": 0.00019765256249632075, + "loss": 3.0562973022460938, + "step": 3946, + "token_acc": 0.292378230939083 + }, + { + "epoch": 2.313690999706831, + "grad_norm": 0.30309617389538196, + "learning_rate": 0.00019765047435017647, + "loss": 3.1248483657836914, + "step": 3947, + "token_acc": 0.2838542470943101 + }, + { + "epoch": 2.31427733802404, + "grad_norm": 0.3548060391917873, + "learning_rate": 0.0001976483852867376, + "loss": 3.083674907684326, + "step": 3948, + "token_acc": 0.290798976164146 + }, + { + "epoch": 2.3148636763412487, + "grad_norm": 0.37399859025584714, + "learning_rate": 0.00019764629530602372, + "loss": 3.050013780593872, + "step": 3949, + "token_acc": 0.2936035816626335 + }, + { + "epoch": 2.315450014658458, + "grad_norm": 0.30935216047609504, + "learning_rate": 0.00019764420440805447, + "loss": 3.1083614826202393, + "step": 3950, + "token_acc": 0.28609565477625026 + }, + { + "epoch": 2.316036352975667, + "grad_norm": 0.4139210947749515, + "learning_rate": 0.0001976421125928495, + "loss": 3.095822334289551, + "step": 3951, + "token_acc": 0.28736773662635057 + }, + { + "epoch": 2.316622691292876, + "grad_norm": 0.3820220865098993, + "learning_rate": 0.00019764001986042847, + "loss": 3.069571018218994, + "step": 3952, + "token_acc": 0.2908962369587932 + }, + { + "epoch": 2.317209029610085, + "grad_norm": 0.3271748428578365, + "learning_rate": 0.00019763792621081103, + "loss": 3.0487797260284424, + "step": 3953, + "token_acc": 0.2947320713352165 + }, + { + "epoch": 2.3177953679272942, + "grad_norm": 0.3777244235634175, + "learning_rate": 0.00019763583164401682, + "loss": 3.0504446029663086, + "step": 3954, + "token_acc": 0.2940915797454866 + }, + { + "epoch": 2.318381706244503, + "grad_norm": 0.3556832797408692, + "learning_rate": 0.00019763373616006556, + "loss": 3.0575156211853027, + "step": 3955, + "token_acc": 0.2922865650187381 + }, + { + "epoch": 2.318968044561712, + "grad_norm": 0.34913945763265647, + "learning_rate": 0.0001976316397589769, + "loss": 3.0987510681152344, + "step": 3956, + "token_acc": 0.288177359801569 + }, + { + "epoch": 2.319554382878921, + "grad_norm": 0.440126868231471, + "learning_rate": 0.00019762954244077055, + "loss": 3.0395970344543457, + "step": 3957, + "token_acc": 0.2957527461595652 + }, + { + "epoch": 2.3201407211961302, + "grad_norm": 0.3509503185405221, + "learning_rate": 0.0001976274442054662, + "loss": 3.11032772064209, + "step": 3958, + "token_acc": 0.28468988473591955 + }, + { + "epoch": 2.3207270595133394, + "grad_norm": 0.3501176029079926, + "learning_rate": 0.00019762534505308356, + "loss": 3.08274245262146, + "step": 3959, + "token_acc": 0.28878287990480533 + }, + { + "epoch": 2.321313397830548, + "grad_norm": 0.31064228163735674, + "learning_rate": 0.00019762324498364236, + "loss": 3.0849123001098633, + "step": 3960, + "token_acc": 0.2906457581903237 + }, + { + "epoch": 2.321899736147757, + "grad_norm": 0.40329396017342894, + "learning_rate": 0.00019762114399716232, + "loss": 3.0859622955322266, + "step": 3961, + "token_acc": 0.28828072707496655 + }, + { + "epoch": 2.3224860744649662, + "grad_norm": 0.33496347279919436, + "learning_rate": 0.00019761904209366317, + "loss": 3.072143077850342, + "step": 3962, + "token_acc": 0.2910849119088621 + }, + { + "epoch": 2.3230724127821754, + "grad_norm": 0.31563389722342244, + "learning_rate": 0.00019761693927316469, + "loss": 3.0302629470825195, + "step": 3963, + "token_acc": 0.2973203686660531 + }, + { + "epoch": 2.3236587510993845, + "grad_norm": 0.3391197355017933, + "learning_rate": 0.00019761483553568657, + "loss": 3.086792469024658, + "step": 3964, + "token_acc": 0.2884605844771272 + }, + { + "epoch": 2.3242450894165936, + "grad_norm": 0.3309174055092619, + "learning_rate": 0.00019761273088124862, + "loss": 3.0333189964294434, + "step": 3965, + "token_acc": 0.295703423053648 + }, + { + "epoch": 2.3248314277338022, + "grad_norm": 0.30668035491719037, + "learning_rate": 0.00019761062530987062, + "loss": 3.0695810317993164, + "step": 3966, + "token_acc": 0.2917963840022063 + }, + { + "epoch": 2.3254177660510114, + "grad_norm": 0.32186820375791175, + "learning_rate": 0.00019760851882157234, + "loss": 3.092996120452881, + "step": 3967, + "token_acc": 0.28774220235783143 + }, + { + "epoch": 2.3260041043682205, + "grad_norm": 0.3754082831054445, + "learning_rate": 0.0001976064114163735, + "loss": 3.0739240646362305, + "step": 3968, + "token_acc": 0.2905131372501527 + }, + { + "epoch": 2.3265904426854296, + "grad_norm": 0.31148615337814417, + "learning_rate": 0.000197604303094294, + "loss": 3.09426212310791, + "step": 3969, + "token_acc": 0.2887943685920468 + }, + { + "epoch": 2.3271767810026387, + "grad_norm": 0.3138526101525699, + "learning_rate": 0.00019760219385535357, + "loss": 3.0846781730651855, + "step": 3970, + "token_acc": 0.28889640622538754 + }, + { + "epoch": 2.3277631193198474, + "grad_norm": 0.3615140072700527, + "learning_rate": 0.00019760008369957205, + "loss": 3.08321213722229, + "step": 3971, + "token_acc": 0.2888261803031269 + }, + { + "epoch": 2.3283494576370565, + "grad_norm": 0.38084727284845, + "learning_rate": 0.00019759797262696927, + "loss": 3.093390464782715, + "step": 3972, + "token_acc": 0.28630255911888564 + }, + { + "epoch": 2.3289357959542656, + "grad_norm": 0.33983768672838577, + "learning_rate": 0.00019759586063756505, + "loss": 3.101447582244873, + "step": 3973, + "token_acc": 0.2863724941026106 + }, + { + "epoch": 2.3295221342714747, + "grad_norm": 0.33771826777627234, + "learning_rate": 0.00019759374773137923, + "loss": 3.1065969467163086, + "step": 3974, + "token_acc": 0.2843523520202387 + }, + { + "epoch": 2.330108472588684, + "grad_norm": 0.32914220404682254, + "learning_rate": 0.00019759163390843166, + "loss": 3.0686240196228027, + "step": 3975, + "token_acc": 0.29178785059151935 + }, + { + "epoch": 2.330694810905893, + "grad_norm": 0.2927059725789342, + "learning_rate": 0.0001975895191687422, + "loss": 3.0464584827423096, + "step": 3976, + "token_acc": 0.2947440742013054 + }, + { + "epoch": 2.3312811492231016, + "grad_norm": 0.37207018854899204, + "learning_rate": 0.00019758740351233072, + "loss": 3.040398597717285, + "step": 3977, + "token_acc": 0.29562048583144657 + }, + { + "epoch": 2.3318674875403107, + "grad_norm": 0.36751652393791073, + "learning_rate": 0.00019758528693921706, + "loss": 3.056946277618408, + "step": 3978, + "token_acc": 0.29244770901459444 + }, + { + "epoch": 2.33245382585752, + "grad_norm": 0.37590298052767956, + "learning_rate": 0.00019758316944942114, + "loss": 3.0540237426757812, + "step": 3979, + "token_acc": 0.293398796547552 + }, + { + "epoch": 2.333040164174729, + "grad_norm": 0.38418453359137894, + "learning_rate": 0.00019758105104296283, + "loss": 3.027588129043579, + "step": 3980, + "token_acc": 0.29923165394076867 + }, + { + "epoch": 2.333626502491938, + "grad_norm": 0.4380591346155051, + "learning_rate": 0.00019757893171986203, + "loss": 3.1071159839630127, + "step": 3981, + "token_acc": 0.2857442573423933 + }, + { + "epoch": 2.3342128408091467, + "grad_norm": 0.35478252859661724, + "learning_rate": 0.00019757681148013868, + "loss": 3.085390567779541, + "step": 3982, + "token_acc": 0.289410124817842 + }, + { + "epoch": 2.334799179126356, + "grad_norm": 0.40210671407512893, + "learning_rate": 0.00019757469032381266, + "loss": 3.0347161293029785, + "step": 3983, + "token_acc": 0.2951946159571457 + }, + { + "epoch": 2.335385517443565, + "grad_norm": 0.3634283300651795, + "learning_rate": 0.0001975725682509039, + "loss": 3.0693979263305664, + "step": 3984, + "token_acc": 0.2900009425622919 + }, + { + "epoch": 2.335971855760774, + "grad_norm": 0.3304167996947664, + "learning_rate": 0.00019757044526143235, + "loss": 3.048264980316162, + "step": 3985, + "token_acc": 0.29586135920902706 + }, + { + "epoch": 2.336558194077983, + "grad_norm": 0.3192896611692682, + "learning_rate": 0.00019756832135541796, + "loss": 3.062061309814453, + "step": 3986, + "token_acc": 0.29373342903987093 + }, + { + "epoch": 2.3371445323951923, + "grad_norm": 0.4365166073229798, + "learning_rate": 0.00019756619653288064, + "loss": 3.024679660797119, + "step": 3987, + "token_acc": 0.29773537429230446 + }, + { + "epoch": 2.337730870712401, + "grad_norm": 0.460682966241692, + "learning_rate": 0.0001975640707938404, + "loss": 3.0796947479248047, + "step": 3988, + "token_acc": 0.2877266222939887 + }, + { + "epoch": 2.33831720902961, + "grad_norm": 0.31069688326522954, + "learning_rate": 0.00019756194413831716, + "loss": 3.0663650035858154, + "step": 3989, + "token_acc": 0.292067858780376 + }, + { + "epoch": 2.338903547346819, + "grad_norm": 0.4254019623003263, + "learning_rate": 0.00019755981656633095, + "loss": 3.072375774383545, + "step": 3990, + "token_acc": 0.289774355717019 + }, + { + "epoch": 2.3394898856640283, + "grad_norm": 0.4180347623050922, + "learning_rate": 0.0001975576880779017, + "loss": 3.052222967147827, + "step": 3991, + "token_acc": 0.2926384296710621 + }, + { + "epoch": 2.340076223981237, + "grad_norm": 0.3397134867987974, + "learning_rate": 0.00019755555867304945, + "loss": 3.0734167098999023, + "step": 3992, + "token_acc": 0.2887498779227855 + }, + { + "epoch": 2.340662562298446, + "grad_norm": 0.37739143006517256, + "learning_rate": 0.0001975534283517942, + "loss": 3.044403314590454, + "step": 3993, + "token_acc": 0.2937327133523069 + }, + { + "epoch": 2.341248900615655, + "grad_norm": 0.4565405374669542, + "learning_rate": 0.0001975512971141559, + "loss": 3.0844674110412598, + "step": 3994, + "token_acc": 0.288412790560224 + }, + { + "epoch": 2.3418352389328643, + "grad_norm": 0.360839361569355, + "learning_rate": 0.00019754916496015463, + "loss": 3.0569329261779785, + "step": 3995, + "token_acc": 0.29257328372102565 + }, + { + "epoch": 2.3424215772500734, + "grad_norm": 0.37213110531571003, + "learning_rate": 0.0001975470318898104, + "loss": 3.079150438308716, + "step": 3996, + "token_acc": 0.29086718463719935 + }, + { + "epoch": 2.3430079155672825, + "grad_norm": 0.3539666838712452, + "learning_rate": 0.00019754489790314327, + "loss": 3.0526463985443115, + "step": 3997, + "token_acc": 0.29498118110341687 + }, + { + "epoch": 2.343594253884491, + "grad_norm": 0.3564909839532416, + "learning_rate": 0.00019754276300017326, + "loss": 3.0762414932250977, + "step": 3998, + "token_acc": 0.2892633996073788 + }, + { + "epoch": 2.3441805922017003, + "grad_norm": 0.34385310366567623, + "learning_rate": 0.00019754062718092043, + "loss": 3.0470519065856934, + "step": 3999, + "token_acc": 0.29430920307524217 + }, + { + "epoch": 2.3447669305189094, + "grad_norm": 0.37361099402393605, + "learning_rate": 0.00019753849044540483, + "loss": 3.015875816345215, + "step": 4000, + "token_acc": 0.29831418284732614 + }, + { + "epoch": 2.3453532688361185, + "grad_norm": 0.38353953015617503, + "learning_rate": 0.0001975363527936466, + "loss": 3.0424609184265137, + "step": 4001, + "token_acc": 0.29575657568703506 + }, + { + "epoch": 2.3459396071533276, + "grad_norm": 0.4132729845447788, + "learning_rate": 0.0001975342142256657, + "loss": 3.100404739379883, + "step": 4002, + "token_acc": 0.2857146605857517 + }, + { + "epoch": 2.3465259454705363, + "grad_norm": 0.4200571316947381, + "learning_rate": 0.00019753207474148234, + "loss": 3.0756542682647705, + "step": 4003, + "token_acc": 0.29013264662884425 + }, + { + "epoch": 2.3471122837877454, + "grad_norm": 0.30704287318361, + "learning_rate": 0.00019752993434111652, + "loss": 3.1128041744232178, + "step": 4004, + "token_acc": 0.28493046473214984 + }, + { + "epoch": 2.3476986221049545, + "grad_norm": 0.3463450225968444, + "learning_rate": 0.0001975277930245884, + "loss": 3.088301181793213, + "step": 4005, + "token_acc": 0.289170695369655 + }, + { + "epoch": 2.3482849604221636, + "grad_norm": 0.33521686160970243, + "learning_rate": 0.00019752565079191815, + "loss": 3.11116361618042, + "step": 4006, + "token_acc": 0.2855750590020229 + }, + { + "epoch": 2.3488712987393727, + "grad_norm": 0.42626517629359884, + "learning_rate": 0.00019752350764312574, + "loss": 3.0494749546051025, + "step": 4007, + "token_acc": 0.2940345746014735 + }, + { + "epoch": 2.349457637056582, + "grad_norm": 0.47921665366143085, + "learning_rate": 0.00019752136357823144, + "loss": 3.05031681060791, + "step": 4008, + "token_acc": 0.29465879620867624 + }, + { + "epoch": 2.3500439753737905, + "grad_norm": 0.36943211416243066, + "learning_rate": 0.00019751921859725532, + "loss": 3.1038665771484375, + "step": 4009, + "token_acc": 0.2869698219748311 + }, + { + "epoch": 2.3506303136909996, + "grad_norm": 0.30127973139343633, + "learning_rate": 0.00019751707270021756, + "loss": 3.085176467895508, + "step": 4010, + "token_acc": 0.2884002169197397 + }, + { + "epoch": 2.3512166520082087, + "grad_norm": 0.3731755540011698, + "learning_rate": 0.00019751492588713831, + "loss": 3.0522384643554688, + "step": 4011, + "token_acc": 0.29326455401386253 + }, + { + "epoch": 2.351802990325418, + "grad_norm": 0.3491736705298053, + "learning_rate": 0.0001975127781580377, + "loss": 3.070321559906006, + "step": 4012, + "token_acc": 0.29081333506291346 + }, + { + "epoch": 2.352389328642627, + "grad_norm": 0.3930004697026496, + "learning_rate": 0.000197510629512936, + "loss": 3.0925259590148926, + "step": 4013, + "token_acc": 0.28640986053870593 + }, + { + "epoch": 2.3529756669598356, + "grad_norm": 0.3579399373517333, + "learning_rate": 0.0001975084799518533, + "loss": 3.072746753692627, + "step": 4014, + "token_acc": 0.29037428781724506 + }, + { + "epoch": 2.3535620052770447, + "grad_norm": 0.36180353325130354, + "learning_rate": 0.0001975063294748098, + "loss": 3.065969944000244, + "step": 4015, + "token_acc": 0.2904752809510781 + }, + { + "epoch": 2.354148343594254, + "grad_norm": 0.3831685656311953, + "learning_rate": 0.00019750417808182577, + "loss": 3.0430374145507812, + "step": 4016, + "token_acc": 0.29508434628604907 + }, + { + "epoch": 2.354734681911463, + "grad_norm": 0.48109924320896097, + "learning_rate": 0.00019750202577292135, + "loss": 3.078310966491699, + "step": 4017, + "token_acc": 0.2875778507530291 + }, + { + "epoch": 2.355321020228672, + "grad_norm": 0.3661878565261176, + "learning_rate": 0.00019749987254811678, + "loss": 3.0924625396728516, + "step": 4018, + "token_acc": 0.28921013881085456 + }, + { + "epoch": 2.355907358545881, + "grad_norm": 0.3323767354235495, + "learning_rate": 0.0001974977184074323, + "loss": 3.0396461486816406, + "step": 4019, + "token_acc": 0.2962768305758194 + }, + { + "epoch": 2.35649369686309, + "grad_norm": 0.35590729016155426, + "learning_rate": 0.00019749556335088813, + "loss": 3.0730013847351074, + "step": 4020, + "token_acc": 0.29127455868890173 + }, + { + "epoch": 2.357080035180299, + "grad_norm": 0.39505741672431927, + "learning_rate": 0.00019749340737850455, + "loss": 3.095806121826172, + "step": 4021, + "token_acc": 0.28693255454071853 + }, + { + "epoch": 2.357666373497508, + "grad_norm": 0.3931286101181088, + "learning_rate": 0.00019749125049030176, + "loss": 3.0822367668151855, + "step": 4022, + "token_acc": 0.2881409772380501 + }, + { + "epoch": 2.358252711814717, + "grad_norm": 0.33811271720159, + "learning_rate": 0.00019748909268630006, + "loss": 3.047116279602051, + "step": 4023, + "token_acc": 0.29533874718010317 + }, + { + "epoch": 2.3588390501319263, + "grad_norm": 0.3779504439344047, + "learning_rate": 0.00019748693396651966, + "loss": 3.0572638511657715, + "step": 4024, + "token_acc": 0.29200676675829984 + }, + { + "epoch": 2.359425388449135, + "grad_norm": 0.327363311130571, + "learning_rate": 0.0001974847743309809, + "loss": 3.0716371536254883, + "step": 4025, + "token_acc": 0.2926272303864231 + }, + { + "epoch": 2.360011726766344, + "grad_norm": 0.35657928337489797, + "learning_rate": 0.00019748261377970405, + "loss": 3.079113721847534, + "step": 4026, + "token_acc": 0.2899916916980891 + }, + { + "epoch": 2.360598065083553, + "grad_norm": 0.41645378136936156, + "learning_rate": 0.0001974804523127094, + "loss": 3.0654830932617188, + "step": 4027, + "token_acc": 0.2919762740261795 + }, + { + "epoch": 2.3611844034007623, + "grad_norm": 0.39155550318973775, + "learning_rate": 0.00019747828993001726, + "loss": 3.078742504119873, + "step": 4028, + "token_acc": 0.29030880777531093 + }, + { + "epoch": 2.3617707417179714, + "grad_norm": 0.3349853077166782, + "learning_rate": 0.00019747612663164793, + "loss": 3.089872360229492, + "step": 4029, + "token_acc": 0.28676440818640053 + }, + { + "epoch": 2.3623570800351805, + "grad_norm": 0.3516355739479714, + "learning_rate": 0.00019747396241762174, + "loss": 3.0489275455474854, + "step": 4030, + "token_acc": 0.2926922681472342 + }, + { + "epoch": 2.362943418352389, + "grad_norm": 0.34540238508652715, + "learning_rate": 0.00019747179728795905, + "loss": 3.0936219692230225, + "step": 4031, + "token_acc": 0.28934985470578584 + }, + { + "epoch": 2.3635297566695983, + "grad_norm": 0.3396951313757897, + "learning_rate": 0.00019746963124268017, + "loss": 3.0573508739471436, + "step": 4032, + "token_acc": 0.29258172380836445 + }, + { + "epoch": 2.3641160949868074, + "grad_norm": 0.352775218091441, + "learning_rate": 0.00019746746428180544, + "loss": 3.0272040367126465, + "step": 4033, + "token_acc": 0.2970396413317087 + }, + { + "epoch": 2.3647024333040165, + "grad_norm": 0.32972058822562744, + "learning_rate": 0.0001974652964053552, + "loss": 3.0845980644226074, + "step": 4034, + "token_acc": 0.2878482466905141 + }, + { + "epoch": 2.3652887716212256, + "grad_norm": 0.41054318855458716, + "learning_rate": 0.00019746312761334984, + "loss": 3.0185203552246094, + "step": 4035, + "token_acc": 0.2995907223382485 + }, + { + "epoch": 2.3658751099384343, + "grad_norm": 0.3054874564322032, + "learning_rate": 0.0001974609579058097, + "loss": 3.086775064468384, + "step": 4036, + "token_acc": 0.2880820919117935 + }, + { + "epoch": 2.3664614482556434, + "grad_norm": 0.373699783046111, + "learning_rate": 0.00019745878728275526, + "loss": 3.0533313751220703, + "step": 4037, + "token_acc": 0.2936753281542995 + }, + { + "epoch": 2.3670477865728525, + "grad_norm": 0.3822475943474728, + "learning_rate": 0.00019745661574420683, + "loss": 3.0833749771118164, + "step": 4038, + "token_acc": 0.29057075000716925 + }, + { + "epoch": 2.3676341248900616, + "grad_norm": 0.43347225058887195, + "learning_rate": 0.00019745444329018476, + "loss": 3.0364768505096436, + "step": 4039, + "token_acc": 0.29621901787494187 + }, + { + "epoch": 2.3682204632072708, + "grad_norm": 0.2987437226451078, + "learning_rate": 0.00019745226992070957, + "loss": 3.0547916889190674, + "step": 4040, + "token_acc": 0.2934936550010884 + }, + { + "epoch": 2.36880680152448, + "grad_norm": 0.3701991464576738, + "learning_rate": 0.00019745009563580158, + "loss": 3.0611977577209473, + "step": 4041, + "token_acc": 0.29276847343357054 + }, + { + "epoch": 2.3693931398416885, + "grad_norm": 0.4346137899255046, + "learning_rate": 0.00019744792043548131, + "loss": 3.0678791999816895, + "step": 4042, + "token_acc": 0.28965079858145015 + }, + { + "epoch": 2.3699794781588976, + "grad_norm": 0.34573849844965404, + "learning_rate": 0.00019744574431976913, + "loss": 3.0146141052246094, + "step": 4043, + "token_acc": 0.299043251781686 + }, + { + "epoch": 2.3705658164761068, + "grad_norm": 0.34549166340211396, + "learning_rate": 0.00019744356728868546, + "loss": 3.0047109127044678, + "step": 4044, + "token_acc": 0.2996667387560158 + }, + { + "epoch": 2.371152154793316, + "grad_norm": 0.3189120652155531, + "learning_rate": 0.0001974413893422508, + "loss": 3.05513596534729, + "step": 4045, + "token_acc": 0.2935813953488372 + }, + { + "epoch": 2.3717384931105245, + "grad_norm": 0.35265155484296584, + "learning_rate": 0.0001974392104804856, + "loss": 3.060643196105957, + "step": 4046, + "token_acc": 0.2915285889525623 + }, + { + "epoch": 2.3723248314277336, + "grad_norm": 0.3312010661027504, + "learning_rate": 0.00019743703070341031, + "loss": 3.0882105827331543, + "step": 4047, + "token_acc": 0.2879098647818853 + }, + { + "epoch": 2.3729111697449428, + "grad_norm": 0.3645549961071063, + "learning_rate": 0.0001974348500110454, + "loss": 3.045698881149292, + "step": 4048, + "token_acc": 0.29487339858856887 + }, + { + "epoch": 2.373497508062152, + "grad_norm": 0.34492936738773283, + "learning_rate": 0.00019743266840341138, + "loss": 3.0732100009918213, + "step": 4049, + "token_acc": 0.2907530565113851 + }, + { + "epoch": 2.374083846379361, + "grad_norm": 0.27640734855562965, + "learning_rate": 0.00019743048588052872, + "loss": 3.1030168533325195, + "step": 4050, + "token_acc": 0.2869292225315003 + }, + { + "epoch": 2.37467018469657, + "grad_norm": 0.3983753935949502, + "learning_rate": 0.00019742830244241793, + "loss": 3.065342426300049, + "step": 4051, + "token_acc": 0.29336490720460784 + }, + { + "epoch": 2.3752565230137788, + "grad_norm": 0.31185930933014083, + "learning_rate": 0.0001974261180890996, + "loss": 3.053422451019287, + "step": 4052, + "token_acc": 0.29468053665900745 + }, + { + "epoch": 2.375842861330988, + "grad_norm": 0.3197135069383617, + "learning_rate": 0.0001974239328205941, + "loss": 3.0621118545532227, + "step": 4053, + "token_acc": 0.29194192082637827 + }, + { + "epoch": 2.376429199648197, + "grad_norm": 0.39122664859307044, + "learning_rate": 0.00019742174663692203, + "loss": 3.084141731262207, + "step": 4054, + "token_acc": 0.28973876932573867 + }, + { + "epoch": 2.377015537965406, + "grad_norm": 0.41626362090386515, + "learning_rate": 0.00019741955953810395, + "loss": 3.0489418506622314, + "step": 4055, + "token_acc": 0.29564469750298067 + }, + { + "epoch": 2.377601876282615, + "grad_norm": 0.37015406747247503, + "learning_rate": 0.00019741737152416036, + "loss": 3.0619077682495117, + "step": 4056, + "token_acc": 0.29191145033332805 + }, + { + "epoch": 2.378188214599824, + "grad_norm": 0.3181942191240523, + "learning_rate": 0.00019741518259511187, + "loss": 3.0216617584228516, + "step": 4057, + "token_acc": 0.29683271157925534 + }, + { + "epoch": 2.378774552917033, + "grad_norm": 0.3514363996627795, + "learning_rate": 0.000197412992750979, + "loss": 3.075828790664673, + "step": 4058, + "token_acc": 0.2909657038749614 + }, + { + "epoch": 2.379360891234242, + "grad_norm": 0.349406071338303, + "learning_rate": 0.00019741080199178233, + "loss": 3.0800254344940186, + "step": 4059, + "token_acc": 0.29020382647351645 + }, + { + "epoch": 2.379947229551451, + "grad_norm": 0.32957404637611476, + "learning_rate": 0.0001974086103175424, + "loss": 3.052034616470337, + "step": 4060, + "token_acc": 0.2935027760770708 + }, + { + "epoch": 2.3805335678686603, + "grad_norm": 0.34928983401039476, + "learning_rate": 0.0001974064177282799, + "loss": 3.086732864379883, + "step": 4061, + "token_acc": 0.28873960232970164 + }, + { + "epoch": 2.3811199061858694, + "grad_norm": 0.3808710384732253, + "learning_rate": 0.00019740422422401531, + "loss": 3.0616822242736816, + "step": 4062, + "token_acc": 0.2920802584738794 + }, + { + "epoch": 2.381706244503078, + "grad_norm": 0.34980090771970274, + "learning_rate": 0.0001974020298047693, + "loss": 3.0339975357055664, + "step": 4063, + "token_acc": 0.29591441921031497 + }, + { + "epoch": 2.382292582820287, + "grad_norm": 0.3736204739614879, + "learning_rate": 0.00019739983447056249, + "loss": 3.016005516052246, + "step": 4064, + "token_acc": 0.2977012510617546 + }, + { + "epoch": 2.3828789211374963, + "grad_norm": 0.30737906401497556, + "learning_rate": 0.00019739763822141545, + "loss": 3.06400990486145, + "step": 4065, + "token_acc": 0.29100658614385483 + }, + { + "epoch": 2.3834652594547054, + "grad_norm": 0.33083162449094927, + "learning_rate": 0.00019739544105734888, + "loss": 3.0641822814941406, + "step": 4066, + "token_acc": 0.2911175580221998 + }, + { + "epoch": 2.3840515977719146, + "grad_norm": 0.4035405708881859, + "learning_rate": 0.00019739324297838337, + "loss": 3.1035690307617188, + "step": 4067, + "token_acc": 0.2875145778095913 + }, + { + "epoch": 2.3846379360891232, + "grad_norm": 0.3810975307925663, + "learning_rate": 0.00019739104398453958, + "loss": 3.060784339904785, + "step": 4068, + "token_acc": 0.2923531228385404 + }, + { + "epoch": 2.3852242744063323, + "grad_norm": 0.35022919064215496, + "learning_rate": 0.00019738884407583814, + "loss": 3.045821189880371, + "step": 4069, + "token_acc": 0.29527681052095317 + }, + { + "epoch": 2.3858106127235414, + "grad_norm": 0.2754435580939001, + "learning_rate": 0.0001973866432522998, + "loss": 3.030118703842163, + "step": 4070, + "token_acc": 0.2973155948249999 + }, + { + "epoch": 2.3863969510407506, + "grad_norm": 0.3811984634491082, + "learning_rate": 0.00019738444151394516, + "loss": 3.0654428005218506, + "step": 4071, + "token_acc": 0.2905192696424429 + }, + { + "epoch": 2.3869832893579597, + "grad_norm": 0.48580361423973595, + "learning_rate": 0.00019738223886079487, + "loss": 3.0588324069976807, + "step": 4072, + "token_acc": 0.29053195631237066 + }, + { + "epoch": 2.387569627675169, + "grad_norm": 0.5161100308390308, + "learning_rate": 0.0001973800352928697, + "loss": 3.027744770050049, + "step": 4073, + "token_acc": 0.2952268830664751 + }, + { + "epoch": 2.3881559659923774, + "grad_norm": 0.369702203119271, + "learning_rate": 0.00019737783081019036, + "loss": 3.0774636268615723, + "step": 4074, + "token_acc": 0.29210474376823653 + }, + { + "epoch": 2.3887423043095866, + "grad_norm": 0.34796003103025214, + "learning_rate": 0.0001973756254127775, + "loss": 3.083624839782715, + "step": 4075, + "token_acc": 0.2905960863554185 + }, + { + "epoch": 2.3893286426267957, + "grad_norm": 0.4789495072196346, + "learning_rate": 0.00019737341910065182, + "loss": 3.072279453277588, + "step": 4076, + "token_acc": 0.2904567491707068 + }, + { + "epoch": 2.389914980944005, + "grad_norm": 0.4530878018597216, + "learning_rate": 0.00019737121187383407, + "loss": 3.0857090950012207, + "step": 4077, + "token_acc": 0.28888513672898286 + }, + { + "epoch": 2.390501319261214, + "grad_norm": 0.40261593006764923, + "learning_rate": 0.00019736900373234503, + "loss": 3.063866138458252, + "step": 4078, + "token_acc": 0.292847459288176 + }, + { + "epoch": 2.3910876575784226, + "grad_norm": 0.429483274924287, + "learning_rate": 0.0001973667946762054, + "loss": 3.058635711669922, + "step": 4079, + "token_acc": 0.29353800494805043 + }, + { + "epoch": 2.3916739958956317, + "grad_norm": 0.3807741827597395, + "learning_rate": 0.00019736458470543593, + "loss": 3.026242256164551, + "step": 4080, + "token_acc": 0.29548366972954004 + }, + { + "epoch": 2.392260334212841, + "grad_norm": 0.5044305111070406, + "learning_rate": 0.0001973623738200574, + "loss": 3.0663278102874756, + "step": 4081, + "token_acc": 0.29209834220890524 + }, + { + "epoch": 2.39284667253005, + "grad_norm": 0.4321753551820758, + "learning_rate": 0.00019736016202009053, + "loss": 3.0572004318237305, + "step": 4082, + "token_acc": 0.29191245426567514 + }, + { + "epoch": 2.393433010847259, + "grad_norm": 0.44689884797087115, + "learning_rate": 0.00019735794930555618, + "loss": 3.0680956840515137, + "step": 4083, + "token_acc": 0.2908540093398363 + }, + { + "epoch": 2.394019349164468, + "grad_norm": 0.3841559342714359, + "learning_rate": 0.00019735573567647508, + "loss": 3.053382396697998, + "step": 4084, + "token_acc": 0.29260940251998374 + }, + { + "epoch": 2.394605687481677, + "grad_norm": 0.4815775001158555, + "learning_rate": 0.000197353521132868, + "loss": 3.0845413208007812, + "step": 4085, + "token_acc": 0.28775227350731897 + }, + { + "epoch": 2.395192025798886, + "grad_norm": 0.36144887283273275, + "learning_rate": 0.00019735130567475579, + "loss": 3.0509257316589355, + "step": 4086, + "token_acc": 0.29350153469680507 + }, + { + "epoch": 2.395778364116095, + "grad_norm": 0.3859121847675607, + "learning_rate": 0.00019734908930215924, + "loss": 3.1050310134887695, + "step": 4087, + "token_acc": 0.28709304160311366 + }, + { + "epoch": 2.396364702433304, + "grad_norm": 0.32370890561199395, + "learning_rate": 0.00019734687201509917, + "loss": 3.02191424369812, + "step": 4088, + "token_acc": 0.29807665012477386 + }, + { + "epoch": 2.3969510407505132, + "grad_norm": 0.367394004580195, + "learning_rate": 0.00019734465381359646, + "loss": 3.0496182441711426, + "step": 4089, + "token_acc": 0.29433242389508635 + }, + { + "epoch": 2.397537379067722, + "grad_norm": 0.31435121089871226, + "learning_rate": 0.00019734243469767186, + "loss": 3.0896310806274414, + "step": 4090, + "token_acc": 0.28738281209264493 + }, + { + "epoch": 2.398123717384931, + "grad_norm": 0.3524199001755509, + "learning_rate": 0.00019734021466734627, + "loss": 3.10182523727417, + "step": 4091, + "token_acc": 0.28656505421294304 + }, + { + "epoch": 2.39871005570214, + "grad_norm": 0.3044099532406422, + "learning_rate": 0.00019733799372264054, + "loss": 3.067777156829834, + "step": 4092, + "token_acc": 0.2913250129841515 + }, + { + "epoch": 2.3992963940193492, + "grad_norm": 0.3019869864125417, + "learning_rate": 0.0001973357718635755, + "loss": 3.0055742263793945, + "step": 4093, + "token_acc": 0.29962626690836647 + }, + { + "epoch": 2.3998827323365584, + "grad_norm": 0.31222147639654224, + "learning_rate": 0.00019733354909017204, + "loss": 3.0861077308654785, + "step": 4094, + "token_acc": 0.28815394734731203 + }, + { + "epoch": 2.4004690706537675, + "grad_norm": 0.36231464577666816, + "learning_rate": 0.00019733132540245108, + "loss": 3.080594301223755, + "step": 4095, + "token_acc": 0.2894749670940379 + }, + { + "epoch": 2.401055408970976, + "grad_norm": 0.3065063954729968, + "learning_rate": 0.00019732910080043342, + "loss": 3.1249656677246094, + "step": 4096, + "token_acc": 0.28354588336414144 + }, + { + "epoch": 2.4016417472881852, + "grad_norm": 0.3621373442185483, + "learning_rate": 0.00019732687528414006, + "loss": 3.073049306869507, + "step": 4097, + "token_acc": 0.2896589747999474 + }, + { + "epoch": 2.4022280856053944, + "grad_norm": 0.32959188632789627, + "learning_rate": 0.00019732464885359186, + "loss": 3.0778708457946777, + "step": 4098, + "token_acc": 0.2895755477653437 + }, + { + "epoch": 2.4028144239226035, + "grad_norm": 0.30329079378524876, + "learning_rate": 0.00019732242150880967, + "loss": 3.087095022201538, + "step": 4099, + "token_acc": 0.28905990776298224 + }, + { + "epoch": 2.403400762239812, + "grad_norm": 0.3396481053098095, + "learning_rate": 0.00019732019324981455, + "loss": 3.0798850059509277, + "step": 4100, + "token_acc": 0.2901997614031525 + }, + { + "epoch": 2.4039871005570213, + "grad_norm": 0.3175410560600566, + "learning_rate": 0.0001973179640766273, + "loss": 3.0958402156829834, + "step": 4101, + "token_acc": 0.2885734687482341 + }, + { + "epoch": 2.4045734388742304, + "grad_norm": 0.3036458834613522, + "learning_rate": 0.00019731573398926896, + "loss": 3.0448741912841797, + "step": 4102, + "token_acc": 0.29506101699256015 + }, + { + "epoch": 2.4051597771914395, + "grad_norm": 0.35794590630812734, + "learning_rate": 0.0001973135029877604, + "loss": 3.0634560585021973, + "step": 4103, + "token_acc": 0.290748252396199 + }, + { + "epoch": 2.4057461155086486, + "grad_norm": 0.40810975726245907, + "learning_rate": 0.00019731127107212263, + "loss": 3.1196486949920654, + "step": 4104, + "token_acc": 0.28546552154985333 + }, + { + "epoch": 2.4063324538258577, + "grad_norm": 0.37404703842103976, + "learning_rate": 0.00019730903824237655, + "loss": 3.0985782146453857, + "step": 4105, + "token_acc": 0.2873534947663295 + }, + { + "epoch": 2.4069187921430664, + "grad_norm": 0.3294698250429917, + "learning_rate": 0.00019730680449854323, + "loss": 3.0477705001831055, + "step": 4106, + "token_acc": 0.2959189304437254 + }, + { + "epoch": 2.4075051304602755, + "grad_norm": 0.3352970478197089, + "learning_rate": 0.00019730456984064362, + "loss": 3.042135715484619, + "step": 4107, + "token_acc": 0.2948903040198235 + }, + { + "epoch": 2.4080914687774846, + "grad_norm": 0.38868138584621537, + "learning_rate": 0.00019730233426869863, + "loss": 3.0357906818389893, + "step": 4108, + "token_acc": 0.2951099211945912 + }, + { + "epoch": 2.4086778070946937, + "grad_norm": 0.399042247796688, + "learning_rate": 0.00019730009778272937, + "loss": 3.0694785118103027, + "step": 4109, + "token_acc": 0.2914336310402214 + }, + { + "epoch": 2.409264145411903, + "grad_norm": 0.302635024218202, + "learning_rate": 0.0001972978603827568, + "loss": 3.0530624389648438, + "step": 4110, + "token_acc": 0.2929539497544827 + }, + { + "epoch": 2.4098504837291115, + "grad_norm": 0.29388734814551826, + "learning_rate": 0.00019729562206880193, + "loss": 3.051225423812866, + "step": 4111, + "token_acc": 0.2926953020134228 + }, + { + "epoch": 2.4104368220463206, + "grad_norm": 0.39619104895108326, + "learning_rate": 0.00019729338284088583, + "loss": 3.0711686611175537, + "step": 4112, + "token_acc": 0.28973295608697336 + }, + { + "epoch": 2.4110231603635297, + "grad_norm": 0.34309009998653844, + "learning_rate": 0.00019729114269902948, + "loss": 3.0238823890686035, + "step": 4113, + "token_acc": 0.2985431417135006 + }, + { + "epoch": 2.411609498680739, + "grad_norm": 0.3224358113733176, + "learning_rate": 0.00019728890164325393, + "loss": 3.065692901611328, + "step": 4114, + "token_acc": 0.29268561386021746 + }, + { + "epoch": 2.412195836997948, + "grad_norm": 0.3107994413040964, + "learning_rate": 0.0001972866596735803, + "loss": 3.0782065391540527, + "step": 4115, + "token_acc": 0.2911000199221986 + }, + { + "epoch": 2.412782175315157, + "grad_norm": 0.31777169765460433, + "learning_rate": 0.00019728441679002954, + "loss": 3.097886562347412, + "step": 4116, + "token_acc": 0.2865172175434872 + }, + { + "epoch": 2.4133685136323657, + "grad_norm": 0.3743217794705939, + "learning_rate": 0.00019728217299262282, + "loss": 3.057987689971924, + "step": 4117, + "token_acc": 0.2942075350773582 + }, + { + "epoch": 2.413954851949575, + "grad_norm": 0.3986550537729199, + "learning_rate": 0.00019727992828138114, + "loss": 3.070807456970215, + "step": 4118, + "token_acc": 0.28964028288661303 + }, + { + "epoch": 2.414541190266784, + "grad_norm": 0.39077653382619687, + "learning_rate": 0.00019727768265632563, + "loss": 3.0946884155273438, + "step": 4119, + "token_acc": 0.28825668210241856 + }, + { + "epoch": 2.415127528583993, + "grad_norm": 0.4492547497691847, + "learning_rate": 0.00019727543611747737, + "loss": 3.0969133377075195, + "step": 4120, + "token_acc": 0.2855234579734677 + }, + { + "epoch": 2.415713866901202, + "grad_norm": 0.3518914393279753, + "learning_rate": 0.00019727318866485748, + "loss": 3.068103075027466, + "step": 4121, + "token_acc": 0.29150716395056325 + }, + { + "epoch": 2.416300205218411, + "grad_norm": 0.31176922519806244, + "learning_rate": 0.00019727094029848706, + "loss": 3.055452346801758, + "step": 4122, + "token_acc": 0.2929140573793147 + }, + { + "epoch": 2.41688654353562, + "grad_norm": 0.3098967678578531, + "learning_rate": 0.00019726869101838724, + "loss": 3.0346579551696777, + "step": 4123, + "token_acc": 0.29467759755336015 + }, + { + "epoch": 2.417472881852829, + "grad_norm": 0.38102398840965657, + "learning_rate": 0.0001972664408245791, + "loss": 3.0992612838745117, + "step": 4124, + "token_acc": 0.2863387774347281 + }, + { + "epoch": 2.418059220170038, + "grad_norm": 0.3855656998064484, + "learning_rate": 0.00019726418971708384, + "loss": 3.096004009246826, + "step": 4125, + "token_acc": 0.2860356162000273 + }, + { + "epoch": 2.4186455584872473, + "grad_norm": 0.3505684632959905, + "learning_rate": 0.0001972619376959226, + "loss": 3.0726895332336426, + "step": 4126, + "token_acc": 0.29149255515560457 + }, + { + "epoch": 2.4192318968044564, + "grad_norm": 0.39080868730996693, + "learning_rate": 0.00019725968476111652, + "loss": 3.068895101547241, + "step": 4127, + "token_acc": 0.28996430580516047 + }, + { + "epoch": 2.419818235121665, + "grad_norm": 0.3309174377376033, + "learning_rate": 0.00019725743091268672, + "loss": 3.0411195755004883, + "step": 4128, + "token_acc": 0.29428724965380404 + }, + { + "epoch": 2.420404573438874, + "grad_norm": 0.33577124031501776, + "learning_rate": 0.00019725517615065444, + "loss": 3.0655517578125, + "step": 4129, + "token_acc": 0.2920482066961917 + }, + { + "epoch": 2.4209909117560833, + "grad_norm": 0.32180739995738616, + "learning_rate": 0.00019725292047504084, + "loss": 3.0562782287597656, + "step": 4130, + "token_acc": 0.2916351999042912 + }, + { + "epoch": 2.4215772500732924, + "grad_norm": 0.38895569592641843, + "learning_rate": 0.0001972506638858671, + "loss": 3.048586130142212, + "step": 4131, + "token_acc": 0.29324345460466095 + }, + { + "epoch": 2.4221635883905015, + "grad_norm": 0.36371703847741926, + "learning_rate": 0.0001972484063831544, + "loss": 3.040928363800049, + "step": 4132, + "token_acc": 0.2956311746497446 + }, + { + "epoch": 2.42274992670771, + "grad_norm": 0.28469923903113414, + "learning_rate": 0.00019724614796692397, + "loss": 3.079281806945801, + "step": 4133, + "token_acc": 0.29095520762785015 + }, + { + "epoch": 2.4233362650249193, + "grad_norm": 0.36273128920223086, + "learning_rate": 0.00019724388863719703, + "loss": 3.025628089904785, + "step": 4134, + "token_acc": 0.2965192001995218 + }, + { + "epoch": 2.4239226033421284, + "grad_norm": 0.3570521791072007, + "learning_rate": 0.0001972416283939948, + "loss": 3.0567104816436768, + "step": 4135, + "token_acc": 0.2926176922381856 + }, + { + "epoch": 2.4245089416593375, + "grad_norm": 0.34870473101868044, + "learning_rate": 0.00019723936723733848, + "loss": 3.0123395919799805, + "step": 4136, + "token_acc": 0.299218596233823 + }, + { + "epoch": 2.4250952799765466, + "grad_norm": 0.3971293349911346, + "learning_rate": 0.00019723710516724935, + "loss": 3.111453056335449, + "step": 4137, + "token_acc": 0.2853437941544416 + }, + { + "epoch": 2.4256816182937557, + "grad_norm": 0.36110919750980897, + "learning_rate": 0.00019723484218374865, + "loss": 3.0785531997680664, + "step": 4138, + "token_acc": 0.2899353374074712 + }, + { + "epoch": 2.4262679566109644, + "grad_norm": 0.33600988158608386, + "learning_rate": 0.00019723257828685765, + "loss": 3.0615620613098145, + "step": 4139, + "token_acc": 0.29270153296868673 + }, + { + "epoch": 2.4268542949281735, + "grad_norm": 0.3933557955203856, + "learning_rate": 0.00019723031347659758, + "loss": 3.048780679702759, + "step": 4140, + "token_acc": 0.2940633466177132 + }, + { + "epoch": 2.4274406332453826, + "grad_norm": 0.3837126450145685, + "learning_rate": 0.00019722804775298974, + "loss": 3.0851893424987793, + "step": 4141, + "token_acc": 0.28838084363768474 + }, + { + "epoch": 2.4280269715625917, + "grad_norm": 0.3324783939656905, + "learning_rate": 0.0001972257811160554, + "loss": 3.0466110706329346, + "step": 4142, + "token_acc": 0.2926182655251791 + }, + { + "epoch": 2.4286133098798004, + "grad_norm": 0.3478762704169274, + "learning_rate": 0.00019722351356581586, + "loss": 3.0483133792877197, + "step": 4143, + "token_acc": 0.2948984068220969 + }, + { + "epoch": 2.4291996481970095, + "grad_norm": 0.3069792766496504, + "learning_rate": 0.00019722124510229244, + "loss": 3.065051317214966, + "step": 4144, + "token_acc": 0.2908824360335046 + }, + { + "epoch": 2.4297859865142186, + "grad_norm": 0.36254686565607297, + "learning_rate": 0.0001972189757255064, + "loss": 3.0799670219421387, + "step": 4145, + "token_acc": 0.29067325198636323 + }, + { + "epoch": 2.4303723248314277, + "grad_norm": 0.36935567914896816, + "learning_rate": 0.00019721670543547908, + "loss": 3.106159210205078, + "step": 4146, + "token_acc": 0.2846457553137973 + }, + { + "epoch": 2.430958663148637, + "grad_norm": 0.2964785609075707, + "learning_rate": 0.00019721443423223185, + "loss": 3.071908473968506, + "step": 4147, + "token_acc": 0.2901360978474581 + }, + { + "epoch": 2.431545001465846, + "grad_norm": 0.33990902720305727, + "learning_rate": 0.000197212162115786, + "loss": 3.049102783203125, + "step": 4148, + "token_acc": 0.294573433173412 + }, + { + "epoch": 2.432131339783055, + "grad_norm": 0.34388051718932294, + "learning_rate": 0.00019720988908616288, + "loss": 3.123523473739624, + "step": 4149, + "token_acc": 0.2840472310199563 + }, + { + "epoch": 2.4327176781002637, + "grad_norm": 0.3391233561606437, + "learning_rate": 0.00019720761514338385, + "loss": 3.0774483680725098, + "step": 4150, + "token_acc": 0.2912793178095323 + }, + { + "epoch": 2.433304016417473, + "grad_norm": 0.3827550094511301, + "learning_rate": 0.00019720534028747023, + "loss": 3.109848976135254, + "step": 4151, + "token_acc": 0.2850491157108364 + }, + { + "epoch": 2.433890354734682, + "grad_norm": 0.395406057378136, + "learning_rate": 0.00019720306451844345, + "loss": 3.0666651725769043, + "step": 4152, + "token_acc": 0.29243923704860775 + }, + { + "epoch": 2.434476693051891, + "grad_norm": 0.33043570258587207, + "learning_rate": 0.00019720078783632485, + "loss": 3.0184414386749268, + "step": 4153, + "token_acc": 0.2987525882262023 + }, + { + "epoch": 2.4350630313690997, + "grad_norm": 0.3005273790691233, + "learning_rate": 0.00019719851024113585, + "loss": 3.0411694049835205, + "step": 4154, + "token_acc": 0.2944877148633673 + }, + { + "epoch": 2.435649369686309, + "grad_norm": 0.32225554289521124, + "learning_rate": 0.0001971962317328978, + "loss": 3.0806777477264404, + "step": 4155, + "token_acc": 0.2903598560683219 + }, + { + "epoch": 2.436235708003518, + "grad_norm": 0.40039868801302597, + "learning_rate": 0.00019719395231163213, + "loss": 3.0519824028015137, + "step": 4156, + "token_acc": 0.293357453376935 + }, + { + "epoch": 2.436822046320727, + "grad_norm": 0.3390099732280655, + "learning_rate": 0.00019719167197736025, + "loss": 3.105412721633911, + "step": 4157, + "token_acc": 0.28408085232659214 + }, + { + "epoch": 2.437408384637936, + "grad_norm": 0.34405013110154337, + "learning_rate": 0.00019718939073010358, + "loss": 3.063797950744629, + "step": 4158, + "token_acc": 0.29279636802336656 + }, + { + "epoch": 2.4379947229551453, + "grad_norm": 0.36500065444887214, + "learning_rate": 0.0001971871085698836, + "loss": 3.10847806930542, + "step": 4159, + "token_acc": 0.2860872122950217 + }, + { + "epoch": 2.438581061272354, + "grad_norm": 0.3187053065425624, + "learning_rate": 0.0001971848254967216, + "loss": 3.0630266666412354, + "step": 4160, + "token_acc": 0.29082313786027497 + }, + { + "epoch": 2.439167399589563, + "grad_norm": 0.33857458848324584, + "learning_rate": 0.00019718254151063918, + "loss": 3.052246570587158, + "step": 4161, + "token_acc": 0.2940540258792562 + }, + { + "epoch": 2.439753737906772, + "grad_norm": 0.3179457269775509, + "learning_rate": 0.00019718025661165776, + "loss": 3.072429656982422, + "step": 4162, + "token_acc": 0.29034941763727123 + }, + { + "epoch": 2.4403400762239813, + "grad_norm": 0.30982036407592634, + "learning_rate": 0.00019717797079979872, + "loss": 3.072239398956299, + "step": 4163, + "token_acc": 0.2904579133023296 + }, + { + "epoch": 2.4409264145411904, + "grad_norm": 0.3382823840619922, + "learning_rate": 0.00019717568407508362, + "loss": 3.066800832748413, + "step": 4164, + "token_acc": 0.29034281361097003 + }, + { + "epoch": 2.441512752858399, + "grad_norm": 0.3502895905393527, + "learning_rate": 0.00019717339643753393, + "loss": 3.0801172256469727, + "step": 4165, + "token_acc": 0.2905032450031264 + }, + { + "epoch": 2.442099091175608, + "grad_norm": 0.3267901712832557, + "learning_rate": 0.0001971711078871711, + "loss": 3.0572760105133057, + "step": 4166, + "token_acc": 0.2924925370841633 + }, + { + "epoch": 2.4426854294928173, + "grad_norm": 0.2941538290492267, + "learning_rate": 0.00019716881842401666, + "loss": 3.1201086044311523, + "step": 4167, + "token_acc": 0.2841321382786749 + }, + { + "epoch": 2.4432717678100264, + "grad_norm": 0.37166343604598084, + "learning_rate": 0.00019716652804809213, + "loss": 3.0689282417297363, + "step": 4168, + "token_acc": 0.2922231580636105 + }, + { + "epoch": 2.4438581061272355, + "grad_norm": 0.47423055990950175, + "learning_rate": 0.00019716423675941898, + "loss": 3.1105926036834717, + "step": 4169, + "token_acc": 0.28560351107131016 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.4801363226790007, + "learning_rate": 0.00019716194455801875, + "loss": 3.09112548828125, + "step": 4170, + "token_acc": 0.2895343922349478 + }, + { + "epoch": 2.4450307827616533, + "grad_norm": 0.3155882264602728, + "learning_rate": 0.000197159651443913, + "loss": 3.0731444358825684, + "step": 4171, + "token_acc": 0.29129294559448804 + }, + { + "epoch": 2.4456171210788624, + "grad_norm": 0.39674761151744875, + "learning_rate": 0.0001971573574171232, + "loss": 3.089888572692871, + "step": 4172, + "token_acc": 0.2871859881094699 + }, + { + "epoch": 2.4462034593960715, + "grad_norm": 0.3415928025320668, + "learning_rate": 0.000197155062477671, + "loss": 3.0826191902160645, + "step": 4173, + "token_acc": 0.28909895802120433 + }, + { + "epoch": 2.4467897977132806, + "grad_norm": 0.392140189451025, + "learning_rate": 0.0001971527666255779, + "loss": 3.0924389362335205, + "step": 4174, + "token_acc": 0.2880713360995019 + }, + { + "epoch": 2.4473761360304898, + "grad_norm": 0.34498438867737363, + "learning_rate": 0.00019715046986086546, + "loss": 3.0283966064453125, + "step": 4175, + "token_acc": 0.2980381199901945 + }, + { + "epoch": 2.4479624743476984, + "grad_norm": 0.402308367945163, + "learning_rate": 0.00019714817218355525, + "loss": 3.1077630519866943, + "step": 4176, + "token_acc": 0.28314003356330414 + }, + { + "epoch": 2.4485488126649075, + "grad_norm": 0.40895309108295946, + "learning_rate": 0.00019714587359366892, + "loss": 3.064359188079834, + "step": 4177, + "token_acc": 0.29165812127618174 + }, + { + "epoch": 2.4491351509821166, + "grad_norm": 0.3906467171937156, + "learning_rate": 0.00019714357409122797, + "loss": 3.0735812187194824, + "step": 4178, + "token_acc": 0.2896861058941919 + }, + { + "epoch": 2.4497214892993258, + "grad_norm": 0.37923065082947793, + "learning_rate": 0.0001971412736762541, + "loss": 3.0578784942626953, + "step": 4179, + "token_acc": 0.29231884439992617 + }, + { + "epoch": 2.450307827616535, + "grad_norm": 0.3535407469192918, + "learning_rate": 0.0001971389723487688, + "loss": 3.045675277709961, + "step": 4180, + "token_acc": 0.2945538274346544 + }, + { + "epoch": 2.450894165933744, + "grad_norm": 0.3384029661148089, + "learning_rate": 0.00019713667010879375, + "loss": 3.067142963409424, + "step": 4181, + "token_acc": 0.29043250063591924 + }, + { + "epoch": 2.4514805042509527, + "grad_norm": 0.3657740464408111, + "learning_rate": 0.0001971343669563506, + "loss": 3.0889322757720947, + "step": 4182, + "token_acc": 0.28872409223152506 + }, + { + "epoch": 2.4520668425681618, + "grad_norm": 0.40084416027978254, + "learning_rate": 0.00019713206289146098, + "loss": 3.1017560958862305, + "step": 4183, + "token_acc": 0.2857810464427161 + }, + { + "epoch": 2.452653180885371, + "grad_norm": 0.3185543569110051, + "learning_rate": 0.0001971297579141465, + "loss": 3.083775043487549, + "step": 4184, + "token_acc": 0.29060125416958976 + }, + { + "epoch": 2.45323951920258, + "grad_norm": 0.37930038598613913, + "learning_rate": 0.0001971274520244288, + "loss": 3.050126314163208, + "step": 4185, + "token_acc": 0.2910102499210684 + }, + { + "epoch": 2.453825857519789, + "grad_norm": 0.31118492689585786, + "learning_rate": 0.0001971251452223296, + "loss": 3.004601001739502, + "step": 4186, + "token_acc": 0.30126722682741003 + }, + { + "epoch": 2.4544121958369978, + "grad_norm": 0.3790168078683845, + "learning_rate": 0.00019712283750787055, + "loss": 3.053802967071533, + "step": 4187, + "token_acc": 0.29303486688547564 + }, + { + "epoch": 2.454998534154207, + "grad_norm": 0.41641676076270695, + "learning_rate": 0.0001971205288810733, + "loss": 3.118191957473755, + "step": 4188, + "token_acc": 0.2840567209043978 + }, + { + "epoch": 2.455584872471416, + "grad_norm": 0.31704039251858346, + "learning_rate": 0.00019711821934195956, + "loss": 3.0292046070098877, + "step": 4189, + "token_acc": 0.29699596927523003 + }, + { + "epoch": 2.456171210788625, + "grad_norm": 0.3862951830052431, + "learning_rate": 0.000197115908890551, + "loss": 3.09126353263855, + "step": 4190, + "token_acc": 0.28726345927363345 + }, + { + "epoch": 2.456757549105834, + "grad_norm": 0.39928113181654984, + "learning_rate": 0.00019711359752686938, + "loss": 2.996582508087158, + "step": 4191, + "token_acc": 0.29963132098285616 + }, + { + "epoch": 2.4573438874230433, + "grad_norm": 0.3638933434570716, + "learning_rate": 0.00019711128525093635, + "loss": 3.0232417583465576, + "step": 4192, + "token_acc": 0.2977536056808513 + }, + { + "epoch": 2.457930225740252, + "grad_norm": 0.34467744356244073, + "learning_rate": 0.00019710897206277363, + "loss": 3.0724964141845703, + "step": 4193, + "token_acc": 0.29059954949349764 + }, + { + "epoch": 2.458516564057461, + "grad_norm": 0.4193261392216931, + "learning_rate": 0.000197106657962403, + "loss": 3.0611634254455566, + "step": 4194, + "token_acc": 0.2911900609454875 + }, + { + "epoch": 2.45910290237467, + "grad_norm": 0.3358736270166866, + "learning_rate": 0.00019710434294984618, + "loss": 3.071831226348877, + "step": 4195, + "token_acc": 0.29266754270696455 + }, + { + "epoch": 2.4596892406918793, + "grad_norm": 0.4862797502874084, + "learning_rate": 0.0001971020270251249, + "loss": 3.093724012374878, + "step": 4196, + "token_acc": 0.28651427707351496 + }, + { + "epoch": 2.460275579009088, + "grad_norm": 0.42867232936043154, + "learning_rate": 0.00019709971018826088, + "loss": 3.072751760482788, + "step": 4197, + "token_acc": 0.29046735795523176 + }, + { + "epoch": 2.460861917326297, + "grad_norm": 0.36915236645100996, + "learning_rate": 0.00019709739243927595, + "loss": 3.047736883163452, + "step": 4198, + "token_acc": 0.29470716282661996 + }, + { + "epoch": 2.4614482556435062, + "grad_norm": 0.3746889346217329, + "learning_rate": 0.00019709507377819189, + "loss": 3.0751123428344727, + "step": 4199, + "token_acc": 0.2896070464584206 + }, + { + "epoch": 2.4620345939607153, + "grad_norm": 0.3841745057035483, + "learning_rate": 0.00019709275420503044, + "loss": 3.0227293968200684, + "step": 4200, + "token_acc": 0.2971805455803467 + }, + { + "epoch": 2.4626209322779244, + "grad_norm": 0.34086264104313463, + "learning_rate": 0.00019709043371981337, + "loss": 3.060875654220581, + "step": 4201, + "token_acc": 0.29305365597758154 + }, + { + "epoch": 2.4632072705951336, + "grad_norm": 0.3438539940142994, + "learning_rate": 0.00019708811232256251, + "loss": 3.1130895614624023, + "step": 4202, + "token_acc": 0.2862487403207026 + }, + { + "epoch": 2.4637936089123427, + "grad_norm": 0.4128262369767911, + "learning_rate": 0.0001970857900132997, + "loss": 3.1298396587371826, + "step": 4203, + "token_acc": 0.2837900470024483 + }, + { + "epoch": 2.4643799472295513, + "grad_norm": 0.3043408648852795, + "learning_rate": 0.0001970834667920467, + "loss": 3.0559239387512207, + "step": 4204, + "token_acc": 0.29351395949218856 + }, + { + "epoch": 2.4649662855467604, + "grad_norm": 0.35043657334144934, + "learning_rate": 0.00019708114265882534, + "loss": 3.078447103500366, + "step": 4205, + "token_acc": 0.2899473147295848 + }, + { + "epoch": 2.4655526238639696, + "grad_norm": 0.39087018238146287, + "learning_rate": 0.00019707881761365744, + "loss": 3.044036388397217, + "step": 4206, + "token_acc": 0.2941168844638042 + }, + { + "epoch": 2.4661389621811787, + "grad_norm": 0.4085056434103663, + "learning_rate": 0.00019707649165656493, + "loss": 3.0631327629089355, + "step": 4207, + "token_acc": 0.29160898828903375 + }, + { + "epoch": 2.4667253004983873, + "grad_norm": 0.39348139726028375, + "learning_rate": 0.00019707416478756954, + "loss": 3.1001412868499756, + "step": 4208, + "token_acc": 0.2875622587837119 + }, + { + "epoch": 2.4673116388155965, + "grad_norm": 0.32406822916694694, + "learning_rate": 0.0001970718370066932, + "loss": 3.0393834114074707, + "step": 4209, + "token_acc": 0.294212247841233 + }, + { + "epoch": 2.4678979771328056, + "grad_norm": 0.34330933735561014, + "learning_rate": 0.00019706950831395776, + "loss": 3.063748836517334, + "step": 4210, + "token_acc": 0.291339089425369 + }, + { + "epoch": 2.4684843154500147, + "grad_norm": 0.3742329277997824, + "learning_rate": 0.0001970671787093851, + "loss": 3.083432197570801, + "step": 4211, + "token_acc": 0.28902603814770544 + }, + { + "epoch": 2.469070653767224, + "grad_norm": 0.3129515303554952, + "learning_rate": 0.00019706484819299706, + "loss": 3.12119197845459, + "step": 4212, + "token_acc": 0.28283226923745997 + }, + { + "epoch": 2.469656992084433, + "grad_norm": 0.3931215599906331, + "learning_rate": 0.0001970625167648156, + "loss": 3.0883567333221436, + "step": 4213, + "token_acc": 0.286826200462866 + }, + { + "epoch": 2.4702433304016416, + "grad_norm": 0.31525792328379915, + "learning_rate": 0.00019706018442486255, + "loss": 3.094846248626709, + "step": 4214, + "token_acc": 0.28654941448654464 + }, + { + "epoch": 2.4708296687188507, + "grad_norm": 0.3961034145120364, + "learning_rate": 0.00019705785117315992, + "loss": 3.084347724914551, + "step": 4215, + "token_acc": 0.289566771899129 + }, + { + "epoch": 2.47141600703606, + "grad_norm": 0.3596066696582057, + "learning_rate": 0.0001970555170097295, + "loss": 3.038320779800415, + "step": 4216, + "token_acc": 0.29354741857581096 + }, + { + "epoch": 2.472002345353269, + "grad_norm": 0.32150312180483404, + "learning_rate": 0.00019705318193459333, + "loss": 3.0578691959381104, + "step": 4217, + "token_acc": 0.2936278439578622 + }, + { + "epoch": 2.472588683670478, + "grad_norm": 0.3587195332214216, + "learning_rate": 0.00019705084594777328, + "loss": 3.031719446182251, + "step": 4218, + "token_acc": 0.2964798569891725 + }, + { + "epoch": 2.4731750219876867, + "grad_norm": 0.3200242024476419, + "learning_rate": 0.00019704850904929131, + "loss": 3.0855536460876465, + "step": 4219, + "token_acc": 0.28723426741562347 + }, + { + "epoch": 2.473761360304896, + "grad_norm": 0.36876044848082185, + "learning_rate": 0.00019704617123916937, + "loss": 3.0524094104766846, + "step": 4220, + "token_acc": 0.29610902156062935 + }, + { + "epoch": 2.474347698622105, + "grad_norm": 0.37855788483316366, + "learning_rate": 0.00019704383251742944, + "loss": 3.0626096725463867, + "step": 4221, + "token_acc": 0.29265976814262995 + }, + { + "epoch": 2.474934036939314, + "grad_norm": 0.38244532946259363, + "learning_rate": 0.00019704149288409344, + "loss": 3.053119421005249, + "step": 4222, + "token_acc": 0.2910223390555656 + }, + { + "epoch": 2.475520375256523, + "grad_norm": 0.35240126209972156, + "learning_rate": 0.0001970391523391834, + "loss": 3.0764575004577637, + "step": 4223, + "token_acc": 0.2894879376786543 + }, + { + "epoch": 2.4761067135737322, + "grad_norm": 0.37133657307962803, + "learning_rate": 0.00019703681088272128, + "loss": 3.0852982997894287, + "step": 4224, + "token_acc": 0.28919917156553326 + }, + { + "epoch": 2.476693051890941, + "grad_norm": 0.345985293597402, + "learning_rate": 0.00019703446851472909, + "loss": 3.076150894165039, + "step": 4225, + "token_acc": 0.29017247300832283 + }, + { + "epoch": 2.47727939020815, + "grad_norm": 0.3560659436054095, + "learning_rate": 0.00019703212523522877, + "loss": 3.042570114135742, + "step": 4226, + "token_acc": 0.29543749787647766 + }, + { + "epoch": 2.477865728525359, + "grad_norm": 0.3464179235906566, + "learning_rate": 0.00019702978104424245, + "loss": 3.071335554122925, + "step": 4227, + "token_acc": 0.29064548444858135 + }, + { + "epoch": 2.4784520668425682, + "grad_norm": 0.33198616086563704, + "learning_rate": 0.00019702743594179206, + "loss": 3.064554452896118, + "step": 4228, + "token_acc": 0.29197052038335886 + }, + { + "epoch": 2.4790384051597774, + "grad_norm": 0.3584311210096221, + "learning_rate": 0.00019702508992789969, + "loss": 3.090202808380127, + "step": 4229, + "token_acc": 0.2888358959257935 + }, + { + "epoch": 2.479624743476986, + "grad_norm": 0.38222222078627915, + "learning_rate": 0.0001970227430025873, + "loss": 2.983476161956787, + "step": 4230, + "token_acc": 0.3023422304561807 + }, + { + "epoch": 2.480211081794195, + "grad_norm": 0.3935060002683657, + "learning_rate": 0.000197020395165877, + "loss": 3.04555606842041, + "step": 4231, + "token_acc": 0.2929944550005954 + }, + { + "epoch": 2.4807974201114043, + "grad_norm": 0.2911962759615048, + "learning_rate": 0.00019701804641779084, + "loss": 3.080777168273926, + "step": 4232, + "token_acc": 0.29016147739369447 + }, + { + "epoch": 2.4813837584286134, + "grad_norm": 0.3708685431130037, + "learning_rate": 0.00019701569675835084, + "loss": 3.052978754043579, + "step": 4233, + "token_acc": 0.2930309012402967 + }, + { + "epoch": 2.4819700967458225, + "grad_norm": 0.4310917365080607, + "learning_rate": 0.00019701334618757907, + "loss": 3.0925943851470947, + "step": 4234, + "token_acc": 0.2857361497695195 + }, + { + "epoch": 2.4825564350630316, + "grad_norm": 0.3066174501027044, + "learning_rate": 0.0001970109947054977, + "loss": 3.057028293609619, + "step": 4235, + "token_acc": 0.294473452656315 + }, + { + "epoch": 2.4831427733802403, + "grad_norm": 0.35815747343620646, + "learning_rate": 0.00019700864231212873, + "loss": 3.091637134552002, + "step": 4236, + "token_acc": 0.28774892706579397 + }, + { + "epoch": 2.4837291116974494, + "grad_norm": 0.42772081640843995, + "learning_rate": 0.00019700628900749426, + "loss": 3.0547335147857666, + "step": 4237, + "token_acc": 0.2929629600377008 + }, + { + "epoch": 2.4843154500146585, + "grad_norm": 0.35350017000636264, + "learning_rate": 0.00019700393479161647, + "loss": 3.049715518951416, + "step": 4238, + "token_acc": 0.2955851520319816 + }, + { + "epoch": 2.4849017883318676, + "grad_norm": 0.3242280283403194, + "learning_rate": 0.00019700157966451743, + "loss": 3.0756049156188965, + "step": 4239, + "token_acc": 0.2884507483660296 + }, + { + "epoch": 2.4854881266490767, + "grad_norm": 0.32674039912560665, + "learning_rate": 0.00019699922362621924, + "loss": 3.114276885986328, + "step": 4240, + "token_acc": 0.284856702747494 + }, + { + "epoch": 2.4860744649662854, + "grad_norm": 0.2950398621703499, + "learning_rate": 0.00019699686667674405, + "loss": 3.0686473846435547, + "step": 4241, + "token_acc": 0.28999309200380324 + }, + { + "epoch": 2.4866608032834945, + "grad_norm": 0.3252651916377247, + "learning_rate": 0.00019699450881611398, + "loss": 3.041412353515625, + "step": 4242, + "token_acc": 0.2955505935145705 + }, + { + "epoch": 2.4872471416007036, + "grad_norm": 0.3156980311698982, + "learning_rate": 0.00019699215004435124, + "loss": 3.0570967197418213, + "step": 4243, + "token_acc": 0.2916413015602455 + }, + { + "epoch": 2.4878334799179127, + "grad_norm": 0.35878732717885875, + "learning_rate": 0.00019698979036147793, + "loss": 3.0648746490478516, + "step": 4244, + "token_acc": 0.29183389127067433 + }, + { + "epoch": 2.488419818235122, + "grad_norm": 0.3529111266813683, + "learning_rate": 0.00019698742976751623, + "loss": 3.0728635787963867, + "step": 4245, + "token_acc": 0.29015767661550174 + }, + { + "epoch": 2.489006156552331, + "grad_norm": 0.32780756168528424, + "learning_rate": 0.00019698506826248835, + "loss": 3.0516066551208496, + "step": 4246, + "token_acc": 0.2932093497162575 + }, + { + "epoch": 2.4895924948695396, + "grad_norm": 0.49324166135780145, + "learning_rate": 0.00019698270584641642, + "loss": 3.0820698738098145, + "step": 4247, + "token_acc": 0.2889302644889001 + }, + { + "epoch": 2.4901788331867487, + "grad_norm": 0.6081554803831578, + "learning_rate": 0.00019698034251932264, + "loss": 3.0680360794067383, + "step": 4248, + "token_acc": 0.29175285041889987 + }, + { + "epoch": 2.490765171503958, + "grad_norm": 0.38874890448722127, + "learning_rate": 0.00019697797828122923, + "loss": 3.059863328933716, + "step": 4249, + "token_acc": 0.29178568568301344 + }, + { + "epoch": 2.491351509821167, + "grad_norm": 0.36229122970867705, + "learning_rate": 0.0001969756131321584, + "loss": 3.071688652038574, + "step": 4250, + "token_acc": 0.29014610731667206 + }, + { + "epoch": 2.4919378481383756, + "grad_norm": 0.34572328325591845, + "learning_rate": 0.0001969732470721324, + "loss": 3.057602882385254, + "step": 4251, + "token_acc": 0.2933786700329845 + }, + { + "epoch": 2.4925241864555847, + "grad_norm": 0.3962962759369881, + "learning_rate": 0.00019697088010117337, + "loss": 3.062619209289551, + "step": 4252, + "token_acc": 0.29112977420963093 + }, + { + "epoch": 2.493110524772794, + "grad_norm": 0.3245900387261144, + "learning_rate": 0.0001969685122193036, + "loss": 3.086968421936035, + "step": 4253, + "token_acc": 0.28669597332984703 + }, + { + "epoch": 2.493696863090003, + "grad_norm": 0.35953877848511784, + "learning_rate": 0.00019696614342654532, + "loss": 3.055788278579712, + "step": 4254, + "token_acc": 0.29377657207499425 + }, + { + "epoch": 2.494283201407212, + "grad_norm": 0.3106456372865276, + "learning_rate": 0.0001969637737229208, + "loss": 3.0775251388549805, + "step": 4255, + "token_acc": 0.29064671269905135 + }, + { + "epoch": 2.494869539724421, + "grad_norm": 0.37130104272404985, + "learning_rate": 0.0001969614031084523, + "loss": 3.0594663619995117, + "step": 4256, + "token_acc": 0.2911690455384018 + }, + { + "epoch": 2.49545587804163, + "grad_norm": 0.310730363746335, + "learning_rate": 0.00019695903158316205, + "loss": 3.0579566955566406, + "step": 4257, + "token_acc": 0.292399312845963 + }, + { + "epoch": 2.496042216358839, + "grad_norm": 0.3647755546587646, + "learning_rate": 0.00019695665914707235, + "loss": 3.0624704360961914, + "step": 4258, + "token_acc": 0.29135647986821683 + }, + { + "epoch": 2.496628554676048, + "grad_norm": 0.34203754206018394, + "learning_rate": 0.0001969542858002055, + "loss": 3.0100512504577637, + "step": 4259, + "token_acc": 0.29760081941083943 + }, + { + "epoch": 2.497214892993257, + "grad_norm": 0.30909799388302905, + "learning_rate": 0.0001969519115425838, + "loss": 3.025747776031494, + "step": 4260, + "token_acc": 0.29853345937504894 + }, + { + "epoch": 2.4978012313104663, + "grad_norm": 0.35429943796887864, + "learning_rate": 0.00019694953637422948, + "loss": 3.1122536659240723, + "step": 4261, + "token_acc": 0.28549065142004076 + }, + { + "epoch": 2.498387569627675, + "grad_norm": 0.2958711474779633, + "learning_rate": 0.00019694716029516497, + "loss": 3.0388975143432617, + "step": 4262, + "token_acc": 0.29537905071233483 + }, + { + "epoch": 2.498973907944884, + "grad_norm": 0.33326262913838706, + "learning_rate": 0.00019694478330541245, + "loss": 3.065325975418091, + "step": 4263, + "token_acc": 0.291666772528983 + }, + { + "epoch": 2.499560246262093, + "grad_norm": 0.29915169741530456, + "learning_rate": 0.0001969424054049944, + "loss": 3.1092684268951416, + "step": 4264, + "token_acc": 0.28678721240790683 + }, + { + "epoch": 2.5001465845793023, + "grad_norm": 0.3586325927376122, + "learning_rate": 0.00019694002659393305, + "loss": 3.059685707092285, + "step": 4265, + "token_acc": 0.29272692922772353 + }, + { + "epoch": 2.5007329228965114, + "grad_norm": 0.2951741782608592, + "learning_rate": 0.00019693764687225078, + "loss": 3.0540084838867188, + "step": 4266, + "token_acc": 0.29301378916298704 + }, + { + "epoch": 2.5013192612137205, + "grad_norm": 0.36848177083445705, + "learning_rate": 0.00019693526623996993, + "loss": 3.0449187755584717, + "step": 4267, + "token_acc": 0.2955846940369666 + }, + { + "epoch": 2.5019055995309296, + "grad_norm": 0.38196588176099133, + "learning_rate": 0.00019693288469711294, + "loss": 3.069197177886963, + "step": 4268, + "token_acc": 0.2907751230289706 + }, + { + "epoch": 2.5024919378481383, + "grad_norm": 0.38074972265356, + "learning_rate": 0.00019693050224370203, + "loss": 3.067047357559204, + "step": 4269, + "token_acc": 0.290188198268282 + }, + { + "epoch": 2.5030782761653474, + "grad_norm": 0.29349626900861914, + "learning_rate": 0.00019692811887975974, + "loss": 3.0979738235473633, + "step": 4270, + "token_acc": 0.2871397754015111 + }, + { + "epoch": 2.5036646144825565, + "grad_norm": 0.3362206857377582, + "learning_rate": 0.00019692573460530834, + "loss": 3.0864076614379883, + "step": 4271, + "token_acc": 0.2898633302262075 + }, + { + "epoch": 2.5042509527997656, + "grad_norm": 0.30654983838389, + "learning_rate": 0.00019692334942037027, + "loss": 3.039487361907959, + "step": 4272, + "token_acc": 0.29460107762716564 + }, + { + "epoch": 2.5048372911169743, + "grad_norm": 0.3096373344717664, + "learning_rate": 0.00019692096332496798, + "loss": 3.075105667114258, + "step": 4273, + "token_acc": 0.2879035124163442 + }, + { + "epoch": 2.5054236294341834, + "grad_norm": 0.33270065585083247, + "learning_rate": 0.00019691857631912377, + "loss": 3.098588466644287, + "step": 4274, + "token_acc": 0.28900199982980174 + }, + { + "epoch": 2.5060099677513925, + "grad_norm": 0.2912978189495693, + "learning_rate": 0.0001969161884028602, + "loss": 3.0480523109436035, + "step": 4275, + "token_acc": 0.29444588769835167 + }, + { + "epoch": 2.5065963060686016, + "grad_norm": 0.3875537821570306, + "learning_rate": 0.00019691379957619963, + "loss": 3.0992660522460938, + "step": 4276, + "token_acc": 0.28795357292537016 + }, + { + "epoch": 2.5071826443858107, + "grad_norm": 0.3573293807075769, + "learning_rate": 0.00019691140983916448, + "loss": 3.066530227661133, + "step": 4277, + "token_acc": 0.29093963155705216 + }, + { + "epoch": 2.50776898270302, + "grad_norm": 0.3204927335745619, + "learning_rate": 0.00019690901919177723, + "loss": 3.0874500274658203, + "step": 4278, + "token_acc": 0.2891207428220814 + }, + { + "epoch": 2.5083553210202285, + "grad_norm": 0.39469853255710097, + "learning_rate": 0.00019690662763406034, + "loss": 3.0678510665893555, + "step": 4279, + "token_acc": 0.292323678262773 + }, + { + "epoch": 2.5089416593374376, + "grad_norm": 0.33547454289248085, + "learning_rate": 0.00019690423516603627, + "loss": 3.06976318359375, + "step": 4280, + "token_acc": 0.29084254057102277 + }, + { + "epoch": 2.5095279976546467, + "grad_norm": 0.2846069944961093, + "learning_rate": 0.00019690184178772747, + "loss": 3.0748887062072754, + "step": 4281, + "token_acc": 0.2905438016791925 + }, + { + "epoch": 2.510114335971856, + "grad_norm": 0.2884888568963429, + "learning_rate": 0.00019689944749915646, + "loss": 3.0691356658935547, + "step": 4282, + "token_acc": 0.29161151110106653 + }, + { + "epoch": 2.5107006742890645, + "grad_norm": 0.2948983658280612, + "learning_rate": 0.00019689705230034572, + "loss": 3.0616114139556885, + "step": 4283, + "token_acc": 0.2921807189089474 + }, + { + "epoch": 2.5112870126062736, + "grad_norm": 0.35217252631866836, + "learning_rate": 0.00019689465619131773, + "loss": 3.0825552940368652, + "step": 4284, + "token_acc": 0.28761349902383093 + }, + { + "epoch": 2.5118733509234827, + "grad_norm": 0.3087504648428041, + "learning_rate": 0.00019689225917209502, + "loss": 3.0944318771362305, + "step": 4285, + "token_acc": 0.2890250681631681 + }, + { + "epoch": 2.512459689240692, + "grad_norm": 0.3145846193285609, + "learning_rate": 0.0001968898612427001, + "loss": 3.076207160949707, + "step": 4286, + "token_acc": 0.2914058492948969 + }, + { + "epoch": 2.513046027557901, + "grad_norm": 0.2654875367016934, + "learning_rate": 0.0001968874624031555, + "loss": 3.073927164077759, + "step": 4287, + "token_acc": 0.2920855189750035 + }, + { + "epoch": 2.51363236587511, + "grad_norm": 0.2996510314217945, + "learning_rate": 0.00019688506265348372, + "loss": 3.0828936100006104, + "step": 4288, + "token_acc": 0.28901591721442055 + }, + { + "epoch": 2.514218704192319, + "grad_norm": 0.3479351354338647, + "learning_rate": 0.00019688266199370736, + "loss": 3.0713021755218506, + "step": 4289, + "token_acc": 0.290365323228964 + }, + { + "epoch": 2.514805042509528, + "grad_norm": 0.2952253270233297, + "learning_rate": 0.00019688026042384893, + "loss": 3.0414981842041016, + "step": 4290, + "token_acc": 0.2945317903560479 + }, + { + "epoch": 2.515391380826737, + "grad_norm": 0.29482419781438785, + "learning_rate": 0.000196877857943931, + "loss": 3.053361415863037, + "step": 4291, + "token_acc": 0.29271100805663475 + }, + { + "epoch": 2.515977719143946, + "grad_norm": 0.29923135721811017, + "learning_rate": 0.00019687545455397617, + "loss": 3.074878454208374, + "step": 4292, + "token_acc": 0.2909384444501847 + }, + { + "epoch": 2.516564057461155, + "grad_norm": 0.3153730248985017, + "learning_rate": 0.00019687305025400693, + "loss": 3.07342529296875, + "step": 4293, + "token_acc": 0.29024290470741143 + }, + { + "epoch": 2.517150395778364, + "grad_norm": 0.3277224471189852, + "learning_rate": 0.00019687064504404596, + "loss": 3.0775301456451416, + "step": 4294, + "token_acc": 0.2902218583664022 + }, + { + "epoch": 2.517736734095573, + "grad_norm": 0.38624938110520857, + "learning_rate": 0.0001968682389241158, + "loss": 3.0417943000793457, + "step": 4295, + "token_acc": 0.29495728555743017 + }, + { + "epoch": 2.518323072412782, + "grad_norm": 0.30754087268882213, + "learning_rate": 0.00019686583189423905, + "loss": 3.060079574584961, + "step": 4296, + "token_acc": 0.292336483329794 + }, + { + "epoch": 2.518909410729991, + "grad_norm": 0.36677655515284674, + "learning_rate": 0.00019686342395443837, + "loss": 3.0611917972564697, + "step": 4297, + "token_acc": 0.29271803772429106 + }, + { + "epoch": 2.5194957490472003, + "grad_norm": 0.4774930882813341, + "learning_rate": 0.00019686101510473633, + "loss": 3.1029200553894043, + "step": 4298, + "token_acc": 0.28623283342838207 + }, + { + "epoch": 2.5200820873644094, + "grad_norm": 0.5453976021391657, + "learning_rate": 0.0001968586053451556, + "loss": 3.0844669342041016, + "step": 4299, + "token_acc": 0.28902091615977105 + }, + { + "epoch": 2.5206684256816185, + "grad_norm": 0.36815015962285796, + "learning_rate": 0.00019685619467571877, + "loss": 3.0689823627471924, + "step": 4300, + "token_acc": 0.29033517273195053 + }, + { + "epoch": 2.521254763998827, + "grad_norm": 0.3548288985881831, + "learning_rate": 0.00019685378309644848, + "loss": 3.0501694679260254, + "step": 4301, + "token_acc": 0.293307648017908 + }, + { + "epoch": 2.5218411023160363, + "grad_norm": 0.31235179900632937, + "learning_rate": 0.00019685137060736744, + "loss": 3.057091236114502, + "step": 4302, + "token_acc": 0.29191728232090075 + }, + { + "epoch": 2.5224274406332454, + "grad_norm": 0.3231893918214211, + "learning_rate": 0.0001968489572084983, + "loss": 3.0587756633758545, + "step": 4303, + "token_acc": 0.2907863910068024 + }, + { + "epoch": 2.5230137789504545, + "grad_norm": 0.31483750730230553, + "learning_rate": 0.0001968465428998637, + "loss": 3.045316696166992, + "step": 4304, + "token_acc": 0.2941504803232982 + }, + { + "epoch": 2.523600117267663, + "grad_norm": 0.33571919566464925, + "learning_rate": 0.0001968441276814863, + "loss": 3.056135892868042, + "step": 4305, + "token_acc": 0.29355658305517124 + }, + { + "epoch": 2.5241864555848723, + "grad_norm": 0.3018960509991811, + "learning_rate": 0.00019684171155338884, + "loss": 3.066023349761963, + "step": 4306, + "token_acc": 0.2917258974794195 + }, + { + "epoch": 2.5247727939020814, + "grad_norm": 0.31703761013763104, + "learning_rate": 0.000196839294515594, + "loss": 3.057170867919922, + "step": 4307, + "token_acc": 0.29054298999133105 + }, + { + "epoch": 2.5253591322192905, + "grad_norm": 0.23553694752904497, + "learning_rate": 0.0001968368765681245, + "loss": 3.0816755294799805, + "step": 4308, + "token_acc": 0.28833104949115657 + }, + { + "epoch": 2.5259454705364996, + "grad_norm": 0.35316999371866, + "learning_rate": 0.00019683445771100303, + "loss": 3.082458257675171, + "step": 4309, + "token_acc": 0.28933210420507693 + }, + { + "epoch": 2.5265318088537088, + "grad_norm": 0.3305611543492661, + "learning_rate": 0.00019683203794425226, + "loss": 3.0755796432495117, + "step": 4310, + "token_acc": 0.2882406994284743 + }, + { + "epoch": 2.527118147170918, + "grad_norm": 0.3154034592729068, + "learning_rate": 0.00019682961726789504, + "loss": 3.040978193283081, + "step": 4311, + "token_acc": 0.2952644091414348 + }, + { + "epoch": 2.5277044854881265, + "grad_norm": 0.3168003929354628, + "learning_rate": 0.00019682719568195402, + "loss": 3.0796010494232178, + "step": 4312, + "token_acc": 0.28949092533018855 + }, + { + "epoch": 2.5282908238053357, + "grad_norm": 0.2833203244253017, + "learning_rate": 0.00019682477318645197, + "loss": 3.055309295654297, + "step": 4313, + "token_acc": 0.29305594756679665 + }, + { + "epoch": 2.5288771621225448, + "grad_norm": 0.3111957412581656, + "learning_rate": 0.00019682234978141166, + "loss": 3.133759021759033, + "step": 4314, + "token_acc": 0.28245084441543605 + }, + { + "epoch": 2.529463500439754, + "grad_norm": 0.319591606272368, + "learning_rate": 0.0001968199254668558, + "loss": 3.0856704711914062, + "step": 4315, + "token_acc": 0.28993842996408414 + }, + { + "epoch": 2.5300498387569625, + "grad_norm": 0.31273918054458233, + "learning_rate": 0.00019681750024280728, + "loss": 3.067540168762207, + "step": 4316, + "token_acc": 0.29115876696979964 + }, + { + "epoch": 2.5306361770741717, + "grad_norm": 0.36443284001692516, + "learning_rate": 0.00019681507410928878, + "loss": 3.118192195892334, + "step": 4317, + "token_acc": 0.2864790826908778 + }, + { + "epoch": 2.5312225153913808, + "grad_norm": 0.314060342364928, + "learning_rate": 0.0001968126470663231, + "loss": 3.0855259895324707, + "step": 4318, + "token_acc": 0.2882240621246701 + }, + { + "epoch": 2.53180885370859, + "grad_norm": 0.3338798956370352, + "learning_rate": 0.00019681021911393306, + "loss": 3.0896735191345215, + "step": 4319, + "token_acc": 0.28827129614505065 + }, + { + "epoch": 2.532395192025799, + "grad_norm": 0.340146771123498, + "learning_rate": 0.00019680779025214146, + "loss": 3.0404653549194336, + "step": 4320, + "token_acc": 0.29519595605660925 + }, + { + "epoch": 2.532981530343008, + "grad_norm": 0.30899001914948426, + "learning_rate": 0.00019680536048097115, + "loss": 3.045503616333008, + "step": 4321, + "token_acc": 0.2943045172832407 + }, + { + "epoch": 2.533567868660217, + "grad_norm": 0.34070681977036404, + "learning_rate": 0.00019680292980044493, + "loss": 3.047095775604248, + "step": 4322, + "token_acc": 0.2937312549594502 + }, + { + "epoch": 2.534154206977426, + "grad_norm": 0.40976098473253214, + "learning_rate": 0.0001968004982105856, + "loss": 3.033961296081543, + "step": 4323, + "token_acc": 0.2969539254248568 + }, + { + "epoch": 2.534740545294635, + "grad_norm": 0.39619804035293127, + "learning_rate": 0.00019679806571141603, + "loss": 3.080564498901367, + "step": 4324, + "token_acc": 0.28873699789246887 + }, + { + "epoch": 2.535326883611844, + "grad_norm": 0.2826258319258942, + "learning_rate": 0.00019679563230295908, + "loss": 3.0360283851623535, + "step": 4325, + "token_acc": 0.29602856100484704 + }, + { + "epoch": 2.535913221929053, + "grad_norm": 0.3383981438704743, + "learning_rate": 0.0001967931979852376, + "loss": 3.0303406715393066, + "step": 4326, + "token_acc": 0.29549217797643235 + }, + { + "epoch": 2.536499560246262, + "grad_norm": 0.36010800439295604, + "learning_rate": 0.00019679076275827445, + "loss": 3.069512367248535, + "step": 4327, + "token_acc": 0.2918079059469398 + }, + { + "epoch": 2.537085898563471, + "grad_norm": 0.31853265407731124, + "learning_rate": 0.0001967883266220925, + "loss": 3.0459797382354736, + "step": 4328, + "token_acc": 0.2953695654994356 + }, + { + "epoch": 2.53767223688068, + "grad_norm": 0.32174772129878987, + "learning_rate": 0.00019678588957671464, + "loss": 3.0687904357910156, + "step": 4329, + "token_acc": 0.2903558469989662 + }, + { + "epoch": 2.538258575197889, + "grad_norm": 0.37946690235253305, + "learning_rate": 0.00019678345162216378, + "loss": 3.1021206378936768, + "step": 4330, + "token_acc": 0.28620961018520685 + }, + { + "epoch": 2.5388449135150983, + "grad_norm": 0.3124169856712645, + "learning_rate": 0.00019678101275846284, + "loss": 3.0927093029022217, + "step": 4331, + "token_acc": 0.2868429764035296 + }, + { + "epoch": 2.5394312518323074, + "grad_norm": 0.34274850699130816, + "learning_rate": 0.00019677857298563468, + "loss": 3.0585250854492188, + "step": 4332, + "token_acc": 0.29139852060397603 + }, + { + "epoch": 2.540017590149516, + "grad_norm": 0.2923296091069028, + "learning_rate": 0.0001967761323037022, + "loss": 3.0273396968841553, + "step": 4333, + "token_acc": 0.2966150872878518 + }, + { + "epoch": 2.5406039284667252, + "grad_norm": 0.3248173566948958, + "learning_rate": 0.0001967736907126884, + "loss": 3.077577590942383, + "step": 4334, + "token_acc": 0.28987661348555044 + }, + { + "epoch": 2.5411902667839343, + "grad_norm": 0.3325764933676299, + "learning_rate": 0.0001967712482126162, + "loss": 3.0596702098846436, + "step": 4335, + "token_acc": 0.29220380109457844 + }, + { + "epoch": 2.5417766051011434, + "grad_norm": 0.29495416159286175, + "learning_rate": 0.00019676880480350847, + "loss": 3.0884342193603516, + "step": 4336, + "token_acc": 0.28674425630921546 + }, + { + "epoch": 2.542362943418352, + "grad_norm": 0.31906955088729705, + "learning_rate": 0.00019676636048538825, + "loss": 3.0372214317321777, + "step": 4337, + "token_acc": 0.29461517277956834 + }, + { + "epoch": 2.5429492817355612, + "grad_norm": 0.38596150792753015, + "learning_rate": 0.00019676391525827848, + "loss": 3.1169283390045166, + "step": 4338, + "token_acc": 0.2853148630615746 + }, + { + "epoch": 2.5435356200527703, + "grad_norm": 0.4171515406592132, + "learning_rate": 0.00019676146912220207, + "loss": 3.0613787174224854, + "step": 4339, + "token_acc": 0.29045808446690446 + }, + { + "epoch": 2.5441219583699795, + "grad_norm": 0.40773296826936284, + "learning_rate": 0.0001967590220771821, + "loss": 3.049337863922119, + "step": 4340, + "token_acc": 0.29383906251648084 + }, + { + "epoch": 2.5447082966871886, + "grad_norm": 0.3744202307544062, + "learning_rate": 0.00019675657412324146, + "loss": 3.0532612800598145, + "step": 4341, + "token_acc": 0.29447773950157036 + }, + { + "epoch": 2.5452946350043977, + "grad_norm": 0.42487171907621696, + "learning_rate": 0.00019675412526040323, + "loss": 3.073619842529297, + "step": 4342, + "token_acc": 0.29084579131144217 + }, + { + "epoch": 2.545880973321607, + "grad_norm": 0.4430334500454967, + "learning_rate": 0.00019675167548869035, + "loss": 3.0856785774230957, + "step": 4343, + "token_acc": 0.28847597597597596 + }, + { + "epoch": 2.5464673116388155, + "grad_norm": 0.42276574376526593, + "learning_rate": 0.00019674922480812583, + "loss": 3.0611069202423096, + "step": 4344, + "token_acc": 0.2915815626619517 + }, + { + "epoch": 2.5470536499560246, + "grad_norm": 0.2951208183344554, + "learning_rate": 0.00019674677321873275, + "loss": 3.053767681121826, + "step": 4345, + "token_acc": 0.2926701529957454 + }, + { + "epoch": 2.5476399882732337, + "grad_norm": 0.4834778382856313, + "learning_rate": 0.0001967443207205341, + "loss": 3.1018574237823486, + "step": 4346, + "token_acc": 0.2857452583020529 + }, + { + "epoch": 2.548226326590443, + "grad_norm": 0.28849019103372164, + "learning_rate": 0.00019674186731355288, + "loss": 3.126852035522461, + "step": 4347, + "token_acc": 0.2839649898927895 + }, + { + "epoch": 2.5488126649076515, + "grad_norm": 0.4404181145353068, + "learning_rate": 0.0001967394129978122, + "loss": 3.0917136669158936, + "step": 4348, + "token_acc": 0.2871915418153199 + }, + { + "epoch": 2.5493990032248606, + "grad_norm": 0.30384673166089216, + "learning_rate": 0.00019673695777333512, + "loss": 3.045119285583496, + "step": 4349, + "token_acc": 0.293204212944232 + }, + { + "epoch": 2.5499853415420697, + "grad_norm": 0.4201201654133906, + "learning_rate": 0.00019673450164014463, + "loss": 3.0878076553344727, + "step": 4350, + "token_acc": 0.28939099488486236 + }, + { + "epoch": 2.550571679859279, + "grad_norm": 0.2729500462034345, + "learning_rate": 0.00019673204459826388, + "loss": 3.0816612243652344, + "step": 4351, + "token_acc": 0.28802842012123453 + }, + { + "epoch": 2.551158018176488, + "grad_norm": 0.3952804575887593, + "learning_rate": 0.0001967295866477159, + "loss": 3.074885129928589, + "step": 4352, + "token_acc": 0.2901008722152999 + }, + { + "epoch": 2.551744356493697, + "grad_norm": 0.3109425019840235, + "learning_rate": 0.00019672712778852382, + "loss": 3.0515828132629395, + "step": 4353, + "token_acc": 0.2931193708231135 + }, + { + "epoch": 2.552330694810906, + "grad_norm": 0.345899174736339, + "learning_rate": 0.0001967246680207107, + "loss": 3.0541086196899414, + "step": 4354, + "token_acc": 0.29222225513933364 + }, + { + "epoch": 2.552917033128115, + "grad_norm": 0.38654748318407606, + "learning_rate": 0.00019672220734429967, + "loss": 3.111905336380005, + "step": 4355, + "token_acc": 0.28456465966487615 + }, + { + "epoch": 2.553503371445324, + "grad_norm": 0.33738221730942225, + "learning_rate": 0.00019671974575931385, + "loss": 3.0592422485351562, + "step": 4356, + "token_acc": 0.29193751380289334 + }, + { + "epoch": 2.554089709762533, + "grad_norm": 0.3578770150362226, + "learning_rate": 0.00019671728326577635, + "loss": 3.064779043197632, + "step": 4357, + "token_acc": 0.2917376779190185 + }, + { + "epoch": 2.554676048079742, + "grad_norm": 0.3863932125854447, + "learning_rate": 0.00019671481986371027, + "loss": 3.03214168548584, + "step": 4358, + "token_acc": 0.2957747948163584 + }, + { + "epoch": 2.555262386396951, + "grad_norm": 0.30995878812369565, + "learning_rate": 0.0001967123555531388, + "loss": 3.0910720825195312, + "step": 4359, + "token_acc": 0.28800116840405915 + }, + { + "epoch": 2.55584872471416, + "grad_norm": 0.3819498869674784, + "learning_rate": 0.00019670989033408507, + "loss": 3.0519962310791016, + "step": 4360, + "token_acc": 0.29198846682358426 + }, + { + "epoch": 2.556435063031369, + "grad_norm": 0.3721631762602892, + "learning_rate": 0.00019670742420657225, + "loss": 3.0353198051452637, + "step": 4361, + "token_acc": 0.2957572216153045 + }, + { + "epoch": 2.557021401348578, + "grad_norm": 0.3541723204248817, + "learning_rate": 0.00019670495717062346, + "loss": 3.1144042015075684, + "step": 4362, + "token_acc": 0.28357453725551884 + }, + { + "epoch": 2.5576077396657872, + "grad_norm": 0.3543586093298492, + "learning_rate": 0.00019670248922626192, + "loss": 3.1077284812927246, + "step": 4363, + "token_acc": 0.2841821775873277 + }, + { + "epoch": 2.5581940779829964, + "grad_norm": 0.314163116849466, + "learning_rate": 0.00019670002037351086, + "loss": 3.0905518531799316, + "step": 4364, + "token_acc": 0.28717304574159414 + }, + { + "epoch": 2.5587804163002055, + "grad_norm": 0.2956714527013071, + "learning_rate": 0.00019669755061239337, + "loss": 3.066251039505005, + "step": 4365, + "token_acc": 0.29134437605714486 + }, + { + "epoch": 2.559366754617414, + "grad_norm": 0.3306390850186592, + "learning_rate": 0.00019669507994293266, + "loss": 3.097242593765259, + "step": 4366, + "token_acc": 0.2856347173333266 + }, + { + "epoch": 2.5599530929346233, + "grad_norm": 0.31475363380465343, + "learning_rate": 0.00019669260836515203, + "loss": 3.056725025177002, + "step": 4367, + "token_acc": 0.2918700121304986 + }, + { + "epoch": 2.5605394312518324, + "grad_norm": 0.35817930149525296, + "learning_rate": 0.0001966901358790746, + "loss": 3.0887980461120605, + "step": 4368, + "token_acc": 0.2872980851786035 + }, + { + "epoch": 2.5611257695690415, + "grad_norm": 0.3842368806563799, + "learning_rate": 0.00019668766248472362, + "loss": 3.100215435028076, + "step": 4369, + "token_acc": 0.28725450901803606 + }, + { + "epoch": 2.56171210788625, + "grad_norm": 0.3162344267452049, + "learning_rate": 0.00019668518818212238, + "loss": 3.1013846397399902, + "step": 4370, + "token_acc": 0.2875088417212239 + }, + { + "epoch": 2.5622984462034593, + "grad_norm": 0.37850795328190984, + "learning_rate": 0.0001966827129712941, + "loss": 3.0717766284942627, + "step": 4371, + "token_acc": 0.2891443259967143 + }, + { + "epoch": 2.5628847845206684, + "grad_norm": 0.3423010292330842, + "learning_rate": 0.00019668023685226195, + "loss": 3.0707814693450928, + "step": 4372, + "token_acc": 0.2896238791785463 + }, + { + "epoch": 2.5634711228378775, + "grad_norm": 0.2829031212605263, + "learning_rate": 0.0001966777598250493, + "loss": 3.092684030532837, + "step": 4373, + "token_acc": 0.28880511813048354 + }, + { + "epoch": 2.5640574611550866, + "grad_norm": 0.37486135791478675, + "learning_rate": 0.00019667528188967937, + "loss": 3.078953742980957, + "step": 4374, + "token_acc": 0.28989758338430305 + }, + { + "epoch": 2.5646437994722957, + "grad_norm": 0.38901017750281114, + "learning_rate": 0.0001966728030461754, + "loss": 3.0494942665100098, + "step": 4375, + "token_acc": 0.29251891431568344 + }, + { + "epoch": 2.565230137789505, + "grad_norm": 0.45183970380483385, + "learning_rate": 0.00019667032329456077, + "loss": 3.1119227409362793, + "step": 4376, + "token_acc": 0.2854487589526336 + }, + { + "epoch": 2.5658164761067135, + "grad_norm": 0.35230564987409213, + "learning_rate": 0.00019666784263485868, + "loss": 3.070038318634033, + "step": 4377, + "token_acc": 0.2899215103392527 + }, + { + "epoch": 2.5664028144239226, + "grad_norm": 0.35518878657327324, + "learning_rate": 0.00019666536106709246, + "loss": 3.0616188049316406, + "step": 4378, + "token_acc": 0.29071801626330485 + }, + { + "epoch": 2.5669891527411317, + "grad_norm": 0.34082797562021155, + "learning_rate": 0.00019666287859128545, + "loss": 3.0412020683288574, + "step": 4379, + "token_acc": 0.2939832629227996 + }, + { + "epoch": 2.567575491058341, + "grad_norm": 0.35255619286108814, + "learning_rate": 0.00019666039520746095, + "loss": 3.0532445907592773, + "step": 4380, + "token_acc": 0.2925111018191634 + }, + { + "epoch": 2.5681618293755495, + "grad_norm": 0.29233296397113484, + "learning_rate": 0.0001966579109156423, + "loss": 3.1050941944122314, + "step": 4381, + "token_acc": 0.28545357304897306 + }, + { + "epoch": 2.5687481676927586, + "grad_norm": 0.3440338865386845, + "learning_rate": 0.0001966554257158528, + "loss": 3.0384020805358887, + "step": 4382, + "token_acc": 0.29407245136145393 + }, + { + "epoch": 2.5693345060099677, + "grad_norm": 0.3638994869978608, + "learning_rate": 0.00019665293960811583, + "loss": 3.064739942550659, + "step": 4383, + "token_acc": 0.2919915288708573 + }, + { + "epoch": 2.569920844327177, + "grad_norm": 0.3951104643217597, + "learning_rate": 0.00019665045259245473, + "loss": 3.0822486877441406, + "step": 4384, + "token_acc": 0.28899868859907274 + }, + { + "epoch": 2.570507182644386, + "grad_norm": 0.33588356678262443, + "learning_rate": 0.00019664796466889288, + "loss": 3.0625858306884766, + "step": 4385, + "token_acc": 0.2923373364170113 + }, + { + "epoch": 2.571093520961595, + "grad_norm": 0.33576426720680497, + "learning_rate": 0.00019664547583745363, + "loss": 3.082542896270752, + "step": 4386, + "token_acc": 0.29134112995365674 + }, + { + "epoch": 2.5716798592788037, + "grad_norm": 0.26428207277732607, + "learning_rate": 0.00019664298609816037, + "loss": 3.0884761810302734, + "step": 4387, + "token_acc": 0.28682554638594665 + }, + { + "epoch": 2.572266197596013, + "grad_norm": 0.33908583219150185, + "learning_rate": 0.0001966404954510365, + "loss": 3.0769479274749756, + "step": 4388, + "token_acc": 0.29077000628809885 + }, + { + "epoch": 2.572852535913222, + "grad_norm": 0.33852640032145337, + "learning_rate": 0.00019663800389610537, + "loss": 3.0566697120666504, + "step": 4389, + "token_acc": 0.29196055870015014 + }, + { + "epoch": 2.573438874230431, + "grad_norm": 0.29405558012155864, + "learning_rate": 0.00019663551143339042, + "loss": 3.0920510292053223, + "step": 4390, + "token_acc": 0.28844251173752045 + }, + { + "epoch": 2.5740252125476397, + "grad_norm": 0.4084063439903854, + "learning_rate": 0.00019663301806291505, + "loss": 3.0672731399536133, + "step": 4391, + "token_acc": 0.291824211720072 + }, + { + "epoch": 2.574611550864849, + "grad_norm": 0.34410287516604726, + "learning_rate": 0.00019663052378470267, + "loss": 3.069742441177368, + "step": 4392, + "token_acc": 0.2895558121774426 + }, + { + "epoch": 2.575197889182058, + "grad_norm": 0.2637874115580758, + "learning_rate": 0.0001966280285987768, + "loss": 3.0326242446899414, + "step": 4393, + "token_acc": 0.29645017255027756 + }, + { + "epoch": 2.575784227499267, + "grad_norm": 0.334124981646552, + "learning_rate": 0.00019662553250516076, + "loss": 3.081714391708374, + "step": 4394, + "token_acc": 0.2903737337163789 + }, + { + "epoch": 2.576370565816476, + "grad_norm": 0.31844492322124635, + "learning_rate": 0.00019662303550387807, + "loss": 3.063903331756592, + "step": 4395, + "token_acc": 0.2918590018316768 + }, + { + "epoch": 2.5769569041336853, + "grad_norm": 0.2949480035136617, + "learning_rate": 0.00019662053759495214, + "loss": 3.0998377799987793, + "step": 4396, + "token_acc": 0.28477315389428554 + }, + { + "epoch": 2.5775432424508944, + "grad_norm": 0.3084170075036543, + "learning_rate": 0.00019661803877840645, + "loss": 3.1447291374206543, + "step": 4397, + "token_acc": 0.27942980988350785 + }, + { + "epoch": 2.578129580768103, + "grad_norm": 0.3748756166993346, + "learning_rate": 0.0001966155390542645, + "loss": 3.0738232135772705, + "step": 4398, + "token_acc": 0.28976208036987133 + }, + { + "epoch": 2.578715919085312, + "grad_norm": 0.389510707359839, + "learning_rate": 0.00019661303842254975, + "loss": 3.082246780395508, + "step": 4399, + "token_acc": 0.2892691525423729 + }, + { + "epoch": 2.5793022574025213, + "grad_norm": 0.3414748871019285, + "learning_rate": 0.0001966105368832857, + "loss": 3.088263750076294, + "step": 4400, + "token_acc": 0.28817628772594395 + }, + { + "epoch": 2.5798885957197304, + "grad_norm": 0.40813253384065784, + "learning_rate": 0.00019660803443649584, + "loss": 3.0422964096069336, + "step": 4401, + "token_acc": 0.29398536676880743 + }, + { + "epoch": 2.580474934036939, + "grad_norm": 0.3490543368972251, + "learning_rate": 0.00019660553108220366, + "loss": 3.07165789604187, + "step": 4402, + "token_acc": 0.29213844637019354 + }, + { + "epoch": 2.581061272354148, + "grad_norm": 0.3645319784631988, + "learning_rate": 0.00019660302682043268, + "loss": 3.07938289642334, + "step": 4403, + "token_acc": 0.28949733728848787 + }, + { + "epoch": 2.5816476106713573, + "grad_norm": 0.3767381711633503, + "learning_rate": 0.00019660052165120648, + "loss": 3.075606107711792, + "step": 4404, + "token_acc": 0.28972351170739175 + }, + { + "epoch": 2.5822339489885664, + "grad_norm": 0.36943117028402317, + "learning_rate": 0.00019659801557454852, + "loss": 3.0139575004577637, + "step": 4405, + "token_acc": 0.29775966682224536 + }, + { + "epoch": 2.5828202873057755, + "grad_norm": 0.34320773816502176, + "learning_rate": 0.0001965955085904824, + "loss": 3.0313358306884766, + "step": 4406, + "token_acc": 0.2956824775277748 + }, + { + "epoch": 2.5834066256229846, + "grad_norm": 0.40876089144779704, + "learning_rate": 0.0001965930006990316, + "loss": 3.0329337120056152, + "step": 4407, + "token_acc": 0.29685239554080506 + }, + { + "epoch": 2.5839929639401937, + "grad_norm": 0.40094509642801773, + "learning_rate": 0.00019659049190021973, + "loss": 3.067540168762207, + "step": 4408, + "token_acc": 0.29222723022892205 + }, + { + "epoch": 2.5845793022574024, + "grad_norm": 0.32291604738213864, + "learning_rate": 0.00019658798219407037, + "loss": 3.044240951538086, + "step": 4409, + "token_acc": 0.2941207851824378 + }, + { + "epoch": 2.5851656405746115, + "grad_norm": 0.34848239300088957, + "learning_rate": 0.00019658547158060705, + "loss": 3.109755754470825, + "step": 4410, + "token_acc": 0.2851129963492076 + }, + { + "epoch": 2.5857519788918206, + "grad_norm": 0.36670955079564616, + "learning_rate": 0.0001965829600598534, + "loss": 3.013775110244751, + "step": 4411, + "token_acc": 0.2976083510576951 + }, + { + "epoch": 2.5863383172090297, + "grad_norm": 0.30723042181299637, + "learning_rate": 0.00019658044763183296, + "loss": 3.1021902561187744, + "step": 4412, + "token_acc": 0.28611202490512144 + }, + { + "epoch": 2.5869246555262384, + "grad_norm": 0.3748923942741152, + "learning_rate": 0.00019657793429656936, + "loss": 3.0385971069335938, + "step": 4413, + "token_acc": 0.29457802162336616 + }, + { + "epoch": 2.5875109938434475, + "grad_norm": 0.3372145824329986, + "learning_rate": 0.00019657542005408623, + "loss": 3.08402681350708, + "step": 4414, + "token_acc": 0.288815297623435 + }, + { + "epoch": 2.5880973321606566, + "grad_norm": 0.3480607292581736, + "learning_rate": 0.00019657290490440713, + "loss": 3.050720453262329, + "step": 4415, + "token_acc": 0.29246168031965364 + }, + { + "epoch": 2.5886836704778657, + "grad_norm": 0.38232386761433246, + "learning_rate": 0.00019657038884755574, + "loss": 3.045194149017334, + "step": 4416, + "token_acc": 0.2950650778724957 + }, + { + "epoch": 2.589270008795075, + "grad_norm": 0.44161308260079746, + "learning_rate": 0.0001965678718835557, + "loss": 3.0886642932891846, + "step": 4417, + "token_acc": 0.2887354879196737 + }, + { + "epoch": 2.589856347112284, + "grad_norm": 0.35795736262184297, + "learning_rate": 0.0001965653540124306, + "loss": 3.1190571784973145, + "step": 4418, + "token_acc": 0.284661424715409 + }, + { + "epoch": 2.590442685429493, + "grad_norm": 0.3766037021079068, + "learning_rate": 0.00019656283523420413, + "loss": 3.047330379486084, + "step": 4419, + "token_acc": 0.2939204320394759 + }, + { + "epoch": 2.5910290237467017, + "grad_norm": 0.35550401658543784, + "learning_rate": 0.00019656031554889992, + "loss": 3.069953441619873, + "step": 4420, + "token_acc": 0.29171317072665454 + }, + { + "epoch": 2.591615362063911, + "grad_norm": 0.5485291351804926, + "learning_rate": 0.0001965577949565417, + "loss": 3.06333589553833, + "step": 4421, + "token_acc": 0.2910507048835579 + }, + { + "epoch": 2.59220170038112, + "grad_norm": 0.3680642390723448, + "learning_rate": 0.0001965552734571531, + "loss": 3.068913459777832, + "step": 4422, + "token_acc": 0.2919476925942185 + }, + { + "epoch": 2.592788038698329, + "grad_norm": 0.4229449852100515, + "learning_rate": 0.00019655275105075784, + "loss": 3.0234322547912598, + "step": 4423, + "token_acc": 0.2989145026031933 + }, + { + "epoch": 2.5933743770155377, + "grad_norm": 0.45032929612938555, + "learning_rate": 0.00019655022773737955, + "loss": 3.097522020339966, + "step": 4424, + "token_acc": 0.2873199940532033 + }, + { + "epoch": 2.593960715332747, + "grad_norm": 2.2950824445847053, + "learning_rate": 0.000196547703517042, + "loss": 3.2044525146484375, + "step": 4425, + "token_acc": 0.2770094356215149 + }, + { + "epoch": 2.594547053649956, + "grad_norm": 2.8396545605446724, + "learning_rate": 0.00019654517838976884, + "loss": 3.1636412143707275, + "step": 4426, + "token_acc": 0.2820985229821497 + }, + { + "epoch": 2.595133391967165, + "grad_norm": 0.8294171997733754, + "learning_rate": 0.00019654265235558385, + "loss": 3.0723445415496826, + "step": 4427, + "token_acc": 0.2902975595535566 + }, + { + "epoch": 2.595719730284374, + "grad_norm": 1.4731651143309001, + "learning_rate": 0.0001965401254145107, + "loss": 3.100215435028076, + "step": 4428, + "token_acc": 0.2879225438134057 + }, + { + "epoch": 2.5963060686015833, + "grad_norm": 0.9117299372915686, + "learning_rate": 0.00019653759756657323, + "loss": 3.0904886722564697, + "step": 4429, + "token_acc": 0.2904010224530058 + }, + { + "epoch": 2.5968924069187924, + "grad_norm": 0.8977769326077327, + "learning_rate": 0.00019653506881179506, + "loss": 3.105140209197998, + "step": 4430, + "token_acc": 0.28762170611305243 + }, + { + "epoch": 2.597478745236001, + "grad_norm": 0.5054889864204122, + "learning_rate": 0.0001965325391502, + "loss": 3.0985279083251953, + "step": 4431, + "token_acc": 0.2868811145105115 + }, + { + "epoch": 2.59806508355321, + "grad_norm": 0.6355593035266394, + "learning_rate": 0.00019653000858181185, + "loss": 3.1147515773773193, + "step": 4432, + "token_acc": 0.28274495500179786 + }, + { + "epoch": 2.5986514218704193, + "grad_norm": 0.5392172003877407, + "learning_rate": 0.00019652747710665437, + "loss": 3.0859718322753906, + "step": 4433, + "token_acc": 0.28845969392105036 + }, + { + "epoch": 2.5992377601876284, + "grad_norm": 0.5208474751743597, + "learning_rate": 0.00019652494472475126, + "loss": 3.077242136001587, + "step": 4434, + "token_acc": 0.2893993013771469 + }, + { + "epoch": 2.599824098504837, + "grad_norm": 0.4556370885993351, + "learning_rate": 0.00019652241143612638, + "loss": 3.0691652297973633, + "step": 4435, + "token_acc": 0.2916762576379007 + }, + { + "epoch": 2.600410436822046, + "grad_norm": 0.32485519985349476, + "learning_rate": 0.0001965198772408035, + "loss": 3.0671253204345703, + "step": 4436, + "token_acc": 0.29105503005352873 + }, + { + "epoch": 2.6009967751392553, + "grad_norm": 0.4185910103056575, + "learning_rate": 0.00019651734213880644, + "loss": 3.0384130477905273, + "step": 4437, + "token_acc": 0.29465028325722264 + }, + { + "epoch": 2.6015831134564644, + "grad_norm": 0.3830457101204748, + "learning_rate": 0.00019651480613015903, + "loss": 3.0776829719543457, + "step": 4438, + "token_acc": 0.2898653732501256 + }, + { + "epoch": 2.6021694517736735, + "grad_norm": 2.029732030279408, + "learning_rate": 0.00019651226921488504, + "loss": 3.1110501289367676, + "step": 4439, + "token_acc": 0.28992283589045115 + }, + { + "epoch": 2.6027557900908826, + "grad_norm": 0.4157322918489538, + "learning_rate": 0.00019650973139300834, + "loss": 3.072413921356201, + "step": 4440, + "token_acc": 0.29016718312809137 + }, + { + "epoch": 2.6033421284080913, + "grad_norm": 0.3738115483296638, + "learning_rate": 0.00019650719266455278, + "loss": 3.0831398963928223, + "step": 4441, + "token_acc": 0.2906055274055062 + }, + { + "epoch": 2.6039284667253004, + "grad_norm": 0.7281054454723008, + "learning_rate": 0.00019650465302954219, + "loss": 3.198808431625366, + "step": 4442, + "token_acc": 0.28240875428093115 + }, + { + "epoch": 2.6045148050425095, + "grad_norm": 0.36734657523112113, + "learning_rate": 0.0001965021124880004, + "loss": 3.0845210552215576, + "step": 4443, + "token_acc": 0.28914049719202034 + }, + { + "epoch": 2.6051011433597187, + "grad_norm": 0.35737744432983204, + "learning_rate": 0.00019649957103995132, + "loss": 3.0540714263916016, + "step": 4444, + "token_acc": 0.29415140074649276 + }, + { + "epoch": 2.6056874816769273, + "grad_norm": 0.35772084734332854, + "learning_rate": 0.0001964970286854188, + "loss": 3.103898286819458, + "step": 4445, + "token_acc": 0.2847180639383614 + }, + { + "epoch": 2.6062738199941364, + "grad_norm": 0.3821353993324753, + "learning_rate": 0.00019649448542442672, + "loss": 3.075505256652832, + "step": 4446, + "token_acc": 0.28804238012589695 + }, + { + "epoch": 2.6068601583113455, + "grad_norm": 0.3256203646199461, + "learning_rate": 0.000196491941256999, + "loss": 3.071077346801758, + "step": 4447, + "token_acc": 0.2910581249298213 + }, + { + "epoch": 2.6074464966285547, + "grad_norm": 0.35843790969846373, + "learning_rate": 0.0001964893961831595, + "loss": 3.101072072982788, + "step": 4448, + "token_acc": 0.28657332686686227 + }, + { + "epoch": 2.6080328349457638, + "grad_norm": 0.3441457314408639, + "learning_rate": 0.00019648685020293215, + "loss": 3.0406718254089355, + "step": 4449, + "token_acc": 0.29567943872284613 + }, + { + "epoch": 2.608619173262973, + "grad_norm": 0.33621993909040554, + "learning_rate": 0.00019648430331634085, + "loss": 3.0481178760528564, + "step": 4450, + "token_acc": 0.2921173366605581 + }, + { + "epoch": 2.609205511580182, + "grad_norm": 0.2906904104418168, + "learning_rate": 0.00019648175552340952, + "loss": 3.066950798034668, + "step": 4451, + "token_acc": 0.2919402158720509 + }, + { + "epoch": 2.6097918498973907, + "grad_norm": 0.34704656141721263, + "learning_rate": 0.00019647920682416215, + "loss": 3.0664334297180176, + "step": 4452, + "token_acc": 0.2924489869586069 + }, + { + "epoch": 2.6103781882145998, + "grad_norm": 0.2909445400642656, + "learning_rate": 0.0001964766572186226, + "loss": 3.0556480884552, + "step": 4453, + "token_acc": 0.29229036660174623 + }, + { + "epoch": 2.610964526531809, + "grad_norm": 0.36327754891629727, + "learning_rate": 0.0001964741067068149, + "loss": 3.0715274810791016, + "step": 4454, + "token_acc": 0.2909199277605251 + }, + { + "epoch": 2.611550864849018, + "grad_norm": 0.5738670581320662, + "learning_rate": 0.00019647155528876293, + "loss": 3.050835132598877, + "step": 4455, + "token_acc": 0.29428908545311366 + }, + { + "epoch": 2.6121372031662267, + "grad_norm": 0.293080853181563, + "learning_rate": 0.00019646900296449072, + "loss": 3.0443215370178223, + "step": 4456, + "token_acc": 0.29371182844488214 + }, + { + "epoch": 2.6127235414834358, + "grad_norm": 0.3922183584826324, + "learning_rate": 0.0001964664497340222, + "loss": 3.0840888023376465, + "step": 4457, + "token_acc": 0.28788662241592183 + }, + { + "epoch": 2.613309879800645, + "grad_norm": 0.29859083695848426, + "learning_rate": 0.00019646389559738138, + "loss": 3.068246364593506, + "step": 4458, + "token_acc": 0.29035752090393624 + }, + { + "epoch": 2.613896218117854, + "grad_norm": 0.33772709071421625, + "learning_rate": 0.00019646134055459227, + "loss": 3.081051826477051, + "step": 4459, + "token_acc": 0.2889718948132104 + }, + { + "epoch": 2.614482556435063, + "grad_norm": 0.3536834559960459, + "learning_rate": 0.00019645878460567882, + "loss": 3.117042064666748, + "step": 4460, + "token_acc": 0.28482590949792796 + }, + { + "epoch": 2.615068894752272, + "grad_norm": 0.33771190453939687, + "learning_rate": 0.0001964562277506651, + "loss": 3.0764381885528564, + "step": 4461, + "token_acc": 0.2889233603693272 + }, + { + "epoch": 2.6156552330694813, + "grad_norm": 0.35424024724714576, + "learning_rate": 0.00019645366998957507, + "loss": 3.0462307929992676, + "step": 4462, + "token_acc": 0.2931284693248195 + }, + { + "epoch": 2.61624157138669, + "grad_norm": 0.30341854096965054, + "learning_rate": 0.0001964511113224328, + "loss": 3.0258350372314453, + "step": 4463, + "token_acc": 0.2968459137804808 + }, + { + "epoch": 2.616827909703899, + "grad_norm": 0.2982402917000955, + "learning_rate": 0.0001964485517492623, + "loss": 3.005655288696289, + "step": 4464, + "token_acc": 0.3001810802372776 + }, + { + "epoch": 2.6174142480211082, + "grad_norm": 0.26640662699611395, + "learning_rate": 0.00019644599127008761, + "loss": 3.0821139812469482, + "step": 4465, + "token_acc": 0.28849303529925596 + }, + { + "epoch": 2.6180005863383173, + "grad_norm": 0.2617223132507397, + "learning_rate": 0.0001964434298849328, + "loss": 3.0709388256073, + "step": 4466, + "token_acc": 0.29271523526922644 + }, + { + "epoch": 2.618586924655526, + "grad_norm": 0.31109899583608125, + "learning_rate": 0.00019644086759382194, + "loss": 3.0869507789611816, + "step": 4467, + "token_acc": 0.2893768888958306 + }, + { + "epoch": 2.619173262972735, + "grad_norm": 0.2812213263130152, + "learning_rate": 0.00019643830439677906, + "loss": 3.0369787216186523, + "step": 4468, + "token_acc": 0.29494939600014436 + }, + { + "epoch": 2.6197596012899442, + "grad_norm": 0.2808366453872767, + "learning_rate": 0.00019643574029382829, + "loss": 3.0415759086608887, + "step": 4469, + "token_acc": 0.295452570897067 + }, + { + "epoch": 2.6203459396071533, + "grad_norm": 0.3527981814005396, + "learning_rate": 0.00019643317528499367, + "loss": 3.057199001312256, + "step": 4470, + "token_acc": 0.2912875843086168 + }, + { + "epoch": 2.6209322779243625, + "grad_norm": 0.3252137599709574, + "learning_rate": 0.00019643060937029933, + "loss": 3.0361056327819824, + "step": 4471, + "token_acc": 0.2960444136016655 + }, + { + "epoch": 2.6215186162415716, + "grad_norm": 0.30557122713452234, + "learning_rate": 0.00019642804254976936, + "loss": 3.0279526710510254, + "step": 4472, + "token_acc": 0.2976592940260301 + }, + { + "epoch": 2.6221049545587807, + "grad_norm": 0.3343228268098006, + "learning_rate": 0.00019642547482342785, + "loss": 3.0579023361206055, + "step": 4473, + "token_acc": 0.2931988095332745 + }, + { + "epoch": 2.6226912928759893, + "grad_norm": 0.29480812293668984, + "learning_rate": 0.00019642290619129894, + "loss": 3.106870174407959, + "step": 4474, + "token_acc": 0.28628196889859914 + }, + { + "epoch": 2.6232776311931985, + "grad_norm": 0.3143597338444688, + "learning_rate": 0.00019642033665340675, + "loss": 3.0419559478759766, + "step": 4475, + "token_acc": 0.29422649790289596 + }, + { + "epoch": 2.6238639695104076, + "grad_norm": 0.34809137420782466, + "learning_rate": 0.00019641776620977545, + "loss": 3.068683624267578, + "step": 4476, + "token_acc": 0.28931212093829234 + }, + { + "epoch": 2.6244503078276167, + "grad_norm": 0.29454093956690963, + "learning_rate": 0.00019641519486042914, + "loss": 3.077669382095337, + "step": 4477, + "token_acc": 0.2887501024592873 + }, + { + "epoch": 2.6250366461448253, + "grad_norm": 0.354110602969907, + "learning_rate": 0.00019641262260539202, + "loss": 3.0290963649749756, + "step": 4478, + "token_acc": 0.29660718747928255 + }, + { + "epoch": 2.6256229844620345, + "grad_norm": 0.29489555346241825, + "learning_rate": 0.00019641004944468822, + "loss": 3.0545310974121094, + "step": 4479, + "token_acc": 0.2925549708751268 + }, + { + "epoch": 2.6262093227792436, + "grad_norm": 0.2844289747362478, + "learning_rate": 0.0001964074753783419, + "loss": 3.045168876647949, + "step": 4480, + "token_acc": 0.29424680127304825 + }, + { + "epoch": 2.6267956610964527, + "grad_norm": 0.3417634942417672, + "learning_rate": 0.00019640490040637726, + "loss": 3.086907386779785, + "step": 4481, + "token_acc": 0.2886968656075569 + }, + { + "epoch": 2.627381999413662, + "grad_norm": 0.32940580828988714, + "learning_rate": 0.0001964023245288185, + "loss": 3.050446033477783, + "step": 4482, + "token_acc": 0.2932167797247183 + }, + { + "epoch": 2.627968337730871, + "grad_norm": 0.30048130896487857, + "learning_rate": 0.00019639974774568982, + "loss": 3.0426900386810303, + "step": 4483, + "token_acc": 0.29266286334945707 + }, + { + "epoch": 2.62855467604808, + "grad_norm": 0.36789060580501176, + "learning_rate": 0.00019639717005701538, + "loss": 3.067312717437744, + "step": 4484, + "token_acc": 0.291606935095155 + }, + { + "epoch": 2.6291410143652887, + "grad_norm": 0.4460827920534385, + "learning_rate": 0.00019639459146281944, + "loss": 3.0965256690979004, + "step": 4485, + "token_acc": 0.2864489908877523 + }, + { + "epoch": 2.629727352682498, + "grad_norm": 0.33776738565909503, + "learning_rate": 0.00019639201196312622, + "loss": 3.0572757720947266, + "step": 4486, + "token_acc": 0.29244934969108377 + }, + { + "epoch": 2.630313690999707, + "grad_norm": 0.32062546011264975, + "learning_rate": 0.00019638943155795993, + "loss": 3.0528969764709473, + "step": 4487, + "token_acc": 0.29478522931780926 + }, + { + "epoch": 2.630900029316916, + "grad_norm": 0.3505622839783323, + "learning_rate": 0.0001963868502473448, + "loss": 3.061879873275757, + "step": 4488, + "token_acc": 0.2926566427518798 + }, + { + "epoch": 2.6314863676341247, + "grad_norm": 0.32965761663284, + "learning_rate": 0.0001963842680313051, + "loss": 3.090459108352661, + "step": 4489, + "token_acc": 0.28715031600629626 + }, + { + "epoch": 2.632072705951334, + "grad_norm": 0.2686659302108614, + "learning_rate": 0.0001963816849098651, + "loss": 3.0227527618408203, + "step": 4490, + "token_acc": 0.2952963223859966 + }, + { + "epoch": 2.632659044268543, + "grad_norm": 0.32480965974043335, + "learning_rate": 0.00019637910088304904, + "loss": 3.0999596118927, + "step": 4491, + "token_acc": 0.2867034512604133 + }, + { + "epoch": 2.633245382585752, + "grad_norm": 0.41912251319870225, + "learning_rate": 0.0001963765159508812, + "loss": 3.140795946121216, + "step": 4492, + "token_acc": 0.281638156319302 + }, + { + "epoch": 2.633831720902961, + "grad_norm": 0.33534683800568893, + "learning_rate": 0.00019637393011338582, + "loss": 3.045501708984375, + "step": 4493, + "token_acc": 0.2945371648471159 + }, + { + "epoch": 2.6344180592201702, + "grad_norm": 0.30462158647759885, + "learning_rate": 0.0001963713433705873, + "loss": 3.054668426513672, + "step": 4494, + "token_acc": 0.2924317401617169 + }, + { + "epoch": 2.635004397537379, + "grad_norm": 0.2705180193136794, + "learning_rate": 0.00019636875572250984, + "loss": 3.0756068229675293, + "step": 4495, + "token_acc": 0.2903723125612401 + }, + { + "epoch": 2.635590735854588, + "grad_norm": 0.303878079355125, + "learning_rate": 0.00019636616716917776, + "loss": 3.0634098052978516, + "step": 4496, + "token_acc": 0.29232823821152876 + }, + { + "epoch": 2.636177074171797, + "grad_norm": 0.3084281445601369, + "learning_rate": 0.00019636357771061542, + "loss": 3.0523176193237305, + "step": 4497, + "token_acc": 0.29193260975214647 + }, + { + "epoch": 2.6367634124890063, + "grad_norm": 0.2645435797616772, + "learning_rate": 0.0001963609873468471, + "loss": 3.055009365081787, + "step": 4498, + "token_acc": 0.2945348949158364 + }, + { + "epoch": 2.637349750806215, + "grad_norm": 0.2563803415794547, + "learning_rate": 0.00019635839607789714, + "loss": 3.065328598022461, + "step": 4499, + "token_acc": 0.2902637278634221 + }, + { + "epoch": 2.637936089123424, + "grad_norm": 0.2495922398337573, + "learning_rate": 0.00019635580390378994, + "loss": 3.051077365875244, + "step": 4500, + "token_acc": 0.29343870746644274 + }, + { + "epoch": 2.638522427440633, + "grad_norm": 0.3005039331543738, + "learning_rate": 0.0001963532108245498, + "loss": 3.104482650756836, + "step": 4501, + "token_acc": 0.2871441269157217 + }, + { + "epoch": 2.6391087657578423, + "grad_norm": 0.244025820666221, + "learning_rate": 0.00019635061684020104, + "loss": 3.038386821746826, + "step": 4502, + "token_acc": 0.29478540825540767 + }, + { + "epoch": 2.6396951040750514, + "grad_norm": 0.33492953010619697, + "learning_rate": 0.00019634802195076808, + "loss": 3.0687317848205566, + "step": 4503, + "token_acc": 0.29183692986061033 + }, + { + "epoch": 2.6402814423922605, + "grad_norm": 0.45877832460916906, + "learning_rate": 0.00019634542615627529, + "loss": 3.0338287353515625, + "step": 4504, + "token_acc": 0.29636347456473716 + }, + { + "epoch": 2.6408677807094696, + "grad_norm": 0.5157500661351426, + "learning_rate": 0.00019634282945674706, + "loss": 3.0727882385253906, + "step": 4505, + "token_acc": 0.28951413223660744 + }, + { + "epoch": 2.6414541190266783, + "grad_norm": 0.4824121877087122, + "learning_rate": 0.00019634023185220777, + "loss": 3.0695605278015137, + "step": 4506, + "token_acc": 0.28954831495106276 + }, + { + "epoch": 2.6420404573438874, + "grad_norm": 0.286016254417242, + "learning_rate": 0.0001963376333426818, + "loss": 3.0731043815612793, + "step": 4507, + "token_acc": 0.2885248050661376 + }, + { + "epoch": 2.6426267956610965, + "grad_norm": 0.33144016997840975, + "learning_rate": 0.00019633503392819362, + "loss": 3.0521934032440186, + "step": 4508, + "token_acc": 0.2935126706881073 + }, + { + "epoch": 2.6432131339783056, + "grad_norm": 0.32347629295441577, + "learning_rate": 0.00019633243360876756, + "loss": 3.0892527103424072, + "step": 4509, + "token_acc": 0.2884905232550385 + }, + { + "epoch": 2.6437994722955143, + "grad_norm": 0.3478656689219456, + "learning_rate": 0.00019632983238442812, + "loss": 3.067091941833496, + "step": 4510, + "token_acc": 0.2895294096144256 + }, + { + "epoch": 2.6443858106127234, + "grad_norm": 0.3501867607845672, + "learning_rate": 0.00019632723025519972, + "loss": 3.088876247406006, + "step": 4511, + "token_acc": 0.2876259274018534 + }, + { + "epoch": 2.6449721489299325, + "grad_norm": 0.38770102565818937, + "learning_rate": 0.00019632462722110678, + "loss": 3.086182117462158, + "step": 4512, + "token_acc": 0.28858747808729257 + }, + { + "epoch": 2.6455584872471416, + "grad_norm": 0.3904764530469908, + "learning_rate": 0.00019632202328217376, + "loss": 3.0932693481445312, + "step": 4513, + "token_acc": 0.2866015301459907 + }, + { + "epoch": 2.6461448255643507, + "grad_norm": 0.376584849692192, + "learning_rate": 0.00019631941843842516, + "loss": 3.069999933242798, + "step": 4514, + "token_acc": 0.289378201775143 + }, + { + "epoch": 2.64673116388156, + "grad_norm": 0.3075806878304348, + "learning_rate": 0.00019631681268988537, + "loss": 3.0741350650787354, + "step": 4515, + "token_acc": 0.28925101848950174 + }, + { + "epoch": 2.647317502198769, + "grad_norm": 0.34225947912693605, + "learning_rate": 0.00019631420603657894, + "loss": 3.085578441619873, + "step": 4516, + "token_acc": 0.28889610147757566 + }, + { + "epoch": 2.6479038405159776, + "grad_norm": 0.4404033423864138, + "learning_rate": 0.00019631159847853034, + "loss": 3.06289005279541, + "step": 4517, + "token_acc": 0.2918366963160765 + }, + { + "epoch": 2.6484901788331867, + "grad_norm": 0.3335503426914167, + "learning_rate": 0.00019630899001576405, + "loss": 3.0790886878967285, + "step": 4518, + "token_acc": 0.290332262503402 + }, + { + "epoch": 2.649076517150396, + "grad_norm": 0.3353423529590062, + "learning_rate": 0.00019630638064830456, + "loss": 3.0785131454467773, + "step": 4519, + "token_acc": 0.28918866965628204 + }, + { + "epoch": 2.649662855467605, + "grad_norm": 0.3248139451470929, + "learning_rate": 0.0001963037703761764, + "loss": 3.055438995361328, + "step": 4520, + "token_acc": 0.29214887369308773 + }, + { + "epoch": 2.6502491937848136, + "grad_norm": 0.34058343256543366, + "learning_rate": 0.0001963011591994041, + "loss": 3.054819345474243, + "step": 4521, + "token_acc": 0.29368104000684897 + }, + { + "epoch": 2.6508355321020227, + "grad_norm": 0.40593421952064956, + "learning_rate": 0.00019629854711801216, + "loss": 3.0736405849456787, + "step": 4522, + "token_acc": 0.28880714580688577 + }, + { + "epoch": 2.651421870419232, + "grad_norm": 0.36446617155678246, + "learning_rate": 0.00019629593413202515, + "loss": 3.0906848907470703, + "step": 4523, + "token_acc": 0.28699349838276356 + }, + { + "epoch": 2.652008208736441, + "grad_norm": 0.34285056848024126, + "learning_rate": 0.0001962933202414676, + "loss": 3.0449209213256836, + "step": 4524, + "token_acc": 0.2946073166952517 + }, + { + "epoch": 2.65259454705365, + "grad_norm": 0.313530514146754, + "learning_rate": 0.00019629070544636406, + "loss": 3.0669126510620117, + "step": 4525, + "token_acc": 0.28972421667661036 + }, + { + "epoch": 2.653180885370859, + "grad_norm": 0.30858193941186796, + "learning_rate": 0.0001962880897467391, + "loss": 3.036386489868164, + "step": 4526, + "token_acc": 0.29459631243857626 + }, + { + "epoch": 2.6537672236880683, + "grad_norm": 0.34712111926513606, + "learning_rate": 0.00019628547314261727, + "loss": 3.090385675430298, + "step": 4527, + "token_acc": 0.28859511718697883 + }, + { + "epoch": 2.654353562005277, + "grad_norm": 0.32449936222745696, + "learning_rate": 0.00019628285563402318, + "loss": 3.1185836791992188, + "step": 4528, + "token_acc": 0.28416327438078653 + }, + { + "epoch": 2.654939900322486, + "grad_norm": 0.2884555769027673, + "learning_rate": 0.00019628023722098142, + "loss": 3.055527687072754, + "step": 4529, + "token_acc": 0.29156829978927723 + }, + { + "epoch": 2.655526238639695, + "grad_norm": 0.34335689643317113, + "learning_rate": 0.00019627761790351654, + "loss": 3.065922975540161, + "step": 4530, + "token_acc": 0.2940832334013665 + }, + { + "epoch": 2.6561125769569043, + "grad_norm": 0.35574508501819285, + "learning_rate": 0.0001962749976816532, + "loss": 3.0582306385040283, + "step": 4531, + "token_acc": 0.2911504247575853 + }, + { + "epoch": 2.656698915274113, + "grad_norm": 0.2513292997031737, + "learning_rate": 0.00019627237655541594, + "loss": 3.1087393760681152, + "step": 4532, + "token_acc": 0.2871675141598827 + }, + { + "epoch": 2.657285253591322, + "grad_norm": 0.3782240492427561, + "learning_rate": 0.00019626975452482947, + "loss": 3.0797410011291504, + "step": 4533, + "token_acc": 0.2892079484024141 + }, + { + "epoch": 2.657871591908531, + "grad_norm": 0.3181669356709658, + "learning_rate": 0.00019626713158991837, + "loss": 3.070539951324463, + "step": 4534, + "token_acc": 0.29027567323284686 + }, + { + "epoch": 2.6584579302257403, + "grad_norm": 0.329932175131568, + "learning_rate": 0.00019626450775070731, + "loss": 3.1051158905029297, + "step": 4535, + "token_acc": 0.28634751773049644 + }, + { + "epoch": 2.6590442685429494, + "grad_norm": 0.41972730105232214, + "learning_rate": 0.0001962618830072209, + "loss": 3.0316390991210938, + "step": 4536, + "token_acc": 0.2967513651332702 + }, + { + "epoch": 2.6596306068601585, + "grad_norm": 0.3315416172347069, + "learning_rate": 0.00019625925735948383, + "loss": 3.0803680419921875, + "step": 4537, + "token_acc": 0.2887639933230218 + }, + { + "epoch": 2.660216945177367, + "grad_norm": 0.2924363430608812, + "learning_rate": 0.00019625663080752076, + "loss": 3.0637307167053223, + "step": 4538, + "token_acc": 0.291914218980734 + }, + { + "epoch": 2.6608032834945763, + "grad_norm": 0.42741954444102404, + "learning_rate": 0.00019625400335135628, + "loss": 3.122969388961792, + "step": 4539, + "token_acc": 0.283416222150501 + }, + { + "epoch": 2.6613896218117854, + "grad_norm": 0.39760594169462443, + "learning_rate": 0.00019625137499101522, + "loss": 3.1098380088806152, + "step": 4540, + "token_acc": 0.28538005067882094 + }, + { + "epoch": 2.6619759601289945, + "grad_norm": 0.3327046832783997, + "learning_rate": 0.00019624874572652217, + "loss": 3.072350263595581, + "step": 4541, + "token_acc": 0.29083413914194106 + }, + { + "epoch": 2.6625622984462036, + "grad_norm": 0.302333026609362, + "learning_rate": 0.00019624611555790183, + "loss": 3.0577166080474854, + "step": 4542, + "token_acc": 0.29270956292709566 + }, + { + "epoch": 2.6631486367634123, + "grad_norm": 0.3189625248580394, + "learning_rate": 0.00019624348448517894, + "loss": 3.022976875305176, + "step": 4543, + "token_acc": 0.2968805130653162 + }, + { + "epoch": 2.6637349750806214, + "grad_norm": 0.29349139824092646, + "learning_rate": 0.0001962408525083782, + "loss": 3.0747759342193604, + "step": 4544, + "token_acc": 0.2901611125558817 + }, + { + "epoch": 2.6643213133978305, + "grad_norm": 0.2951850680590209, + "learning_rate": 0.00019623821962752437, + "loss": 3.0890145301818848, + "step": 4545, + "token_acc": 0.28779748047836323 + }, + { + "epoch": 2.6649076517150396, + "grad_norm": 0.32825599759014656, + "learning_rate": 0.00019623558584264206, + "loss": 3.0595803260803223, + "step": 4546, + "token_acc": 0.2944165409010937 + }, + { + "epoch": 2.6654939900322487, + "grad_norm": 0.3872003481927382, + "learning_rate": 0.0001962329511537562, + "loss": 3.0966062545776367, + "step": 4547, + "token_acc": 0.28734901962843273 + }, + { + "epoch": 2.666080328349458, + "grad_norm": 0.30175631248902285, + "learning_rate": 0.00019623031556089137, + "loss": 3.0942349433898926, + "step": 4548, + "token_acc": 0.28668121252325696 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.30181930527781375, + "learning_rate": 0.00019622767906407243, + "loss": 3.0267717838287354, + "step": 4549, + "token_acc": 0.2973882897170164 + }, + { + "epoch": 2.6672530049838756, + "grad_norm": 0.2750983562882502, + "learning_rate": 0.00019622504166332414, + "loss": 3.0798733234405518, + "step": 4550, + "token_acc": 0.28871550017468267 + }, + { + "epoch": 2.6678393433010847, + "grad_norm": 0.3122238136987398, + "learning_rate": 0.00019622240335867122, + "loss": 3.0714807510375977, + "step": 4551, + "token_acc": 0.28987423802574247 + }, + { + "epoch": 2.668425681618294, + "grad_norm": 0.32047764793144706, + "learning_rate": 0.00019621976415013848, + "loss": 3.074183702468872, + "step": 4552, + "token_acc": 0.2890211835217236 + }, + { + "epoch": 2.6690120199355025, + "grad_norm": 0.4659150980283412, + "learning_rate": 0.0001962171240377507, + "loss": 3.0646438598632812, + "step": 4553, + "token_acc": 0.2908177721229358 + }, + { + "epoch": 2.6695983582527116, + "grad_norm": 0.38432784871959863, + "learning_rate": 0.0001962144830215327, + "loss": 3.0881080627441406, + "step": 4554, + "token_acc": 0.28787901967383656 + }, + { + "epoch": 2.6701846965699207, + "grad_norm": 0.3117405621375754, + "learning_rate": 0.0001962118411015093, + "loss": 3.0630271434783936, + "step": 4555, + "token_acc": 0.29029482200402246 + }, + { + "epoch": 2.67077103488713, + "grad_norm": 0.33309754223634663, + "learning_rate": 0.0001962091982777053, + "loss": 3.056474208831787, + "step": 4556, + "token_acc": 0.29094307652999474 + }, + { + "epoch": 2.671357373204339, + "grad_norm": 0.3005710929492111, + "learning_rate": 0.0001962065545501455, + "loss": 3.1018667221069336, + "step": 4557, + "token_acc": 0.28600581127345615 + }, + { + "epoch": 2.671943711521548, + "grad_norm": 0.36358990948741654, + "learning_rate": 0.0001962039099188548, + "loss": 3.0533246994018555, + "step": 4558, + "token_acc": 0.29397992535756506 + }, + { + "epoch": 2.672530049838757, + "grad_norm": 0.3618927866280974, + "learning_rate": 0.00019620126438385795, + "loss": 3.094520092010498, + "step": 4559, + "token_acc": 0.2867888427311946 + }, + { + "epoch": 2.673116388155966, + "grad_norm": 0.3114031401666575, + "learning_rate": 0.00019619861794517988, + "loss": 3.0540623664855957, + "step": 4560, + "token_acc": 0.29173182855036806 + }, + { + "epoch": 2.673702726473175, + "grad_norm": 0.4022924847718842, + "learning_rate": 0.00019619597060284544, + "loss": 3.0916781425476074, + "step": 4561, + "token_acc": 0.2888479878611593 + }, + { + "epoch": 2.674289064790384, + "grad_norm": 0.36090436212272575, + "learning_rate": 0.0001961933223568795, + "loss": 3.104236602783203, + "step": 4562, + "token_acc": 0.28566051435923995 + }, + { + "epoch": 2.674875403107593, + "grad_norm": 0.3819390744440609, + "learning_rate": 0.00019619067320730685, + "loss": 3.111236333847046, + "step": 4563, + "token_acc": 0.28461039344068895 + }, + { + "epoch": 2.675461741424802, + "grad_norm": 0.3581275614592982, + "learning_rate": 0.0001961880231541525, + "loss": 3.097654104232788, + "step": 4564, + "token_acc": 0.28869158949851514 + }, + { + "epoch": 2.676048079742011, + "grad_norm": 0.36163033831251296, + "learning_rate": 0.00019618537219744128, + "loss": 3.108797550201416, + "step": 4565, + "token_acc": 0.28488825843466886 + }, + { + "epoch": 2.67663441805922, + "grad_norm": 0.41489009308025604, + "learning_rate": 0.00019618272033719808, + "loss": 3.0790529251098633, + "step": 4566, + "token_acc": 0.28864593699306135 + }, + { + "epoch": 2.677220756376429, + "grad_norm": 0.3750461873095386, + "learning_rate": 0.00019618006757344782, + "loss": 3.070324182510376, + "step": 4567, + "token_acc": 0.29061931464174456 + }, + { + "epoch": 2.6778070946936383, + "grad_norm": 0.3162441869869595, + "learning_rate": 0.00019617741390621544, + "loss": 3.1262261867523193, + "step": 4568, + "token_acc": 0.28117845837744254 + }, + { + "epoch": 2.6783934330108474, + "grad_norm": 0.3499900932547495, + "learning_rate": 0.00019617475933552588, + "loss": 3.0733556747436523, + "step": 4569, + "token_acc": 0.2916897372905501 + }, + { + "epoch": 2.6789797713280565, + "grad_norm": 0.35222157983834007, + "learning_rate": 0.00019617210386140403, + "loss": 3.06453013420105, + "step": 4570, + "token_acc": 0.2899688670236305 + }, + { + "epoch": 2.679566109645265, + "grad_norm": 0.382571075804764, + "learning_rate": 0.00019616944748387488, + "loss": 3.0631256103515625, + "step": 4571, + "token_acc": 0.29064297730057526 + }, + { + "epoch": 2.6801524479624743, + "grad_norm": 0.3118164015162934, + "learning_rate": 0.00019616679020296334, + "loss": 3.0829179286956787, + "step": 4572, + "token_acc": 0.2883335547376619 + }, + { + "epoch": 2.6807387862796834, + "grad_norm": 0.3379179181178882, + "learning_rate": 0.0001961641320186944, + "loss": 3.039517879486084, + "step": 4573, + "token_acc": 0.295798018577866 + }, + { + "epoch": 2.6813251245968925, + "grad_norm": 0.32282983948174376, + "learning_rate": 0.000196161472931093, + "loss": 3.024003505706787, + "step": 4574, + "token_acc": 0.2962658415405439 + }, + { + "epoch": 2.681911462914101, + "grad_norm": 0.30593756624321233, + "learning_rate": 0.00019615881294018418, + "loss": 3.0250720977783203, + "step": 4575, + "token_acc": 0.2970103663338939 + }, + { + "epoch": 2.6824978012313103, + "grad_norm": 0.3173413310128304, + "learning_rate": 0.00019615615204599286, + "loss": 3.075173854827881, + "step": 4576, + "token_acc": 0.29134404883857584 + }, + { + "epoch": 2.6830841395485194, + "grad_norm": 0.290468111537404, + "learning_rate": 0.00019615349024854412, + "loss": 3.0718934535980225, + "step": 4577, + "token_acc": 0.29100289622168934 + }, + { + "epoch": 2.6836704778657285, + "grad_norm": 0.32407332951306195, + "learning_rate": 0.00019615082754786283, + "loss": 3.0403764247894287, + "step": 4578, + "token_acc": 0.29489088496944577 + }, + { + "epoch": 2.6842568161829377, + "grad_norm": 0.28160077769186653, + "learning_rate": 0.0001961481639439741, + "loss": 3.0857834815979004, + "step": 4579, + "token_acc": 0.28730971094341595 + }, + { + "epoch": 2.6848431545001468, + "grad_norm": 0.37236130523301947, + "learning_rate": 0.00019614549943690297, + "loss": 3.096175193786621, + "step": 4580, + "token_acc": 0.2859703155549015 + }, + { + "epoch": 2.685429492817356, + "grad_norm": 0.3292317410680187, + "learning_rate": 0.0001961428340266744, + "loss": 3.0943541526794434, + "step": 4581, + "token_acc": 0.2862232503741792 + }, + { + "epoch": 2.6860158311345645, + "grad_norm": 0.3449888147804372, + "learning_rate": 0.00019614016771331345, + "loss": 3.1412627696990967, + "step": 4582, + "token_acc": 0.2807060528765929 + }, + { + "epoch": 2.6866021694517737, + "grad_norm": 0.34187128117194526, + "learning_rate": 0.00019613750049684518, + "loss": 3.0711050033569336, + "step": 4583, + "token_acc": 0.2899561084206417 + }, + { + "epoch": 2.6871885077689828, + "grad_norm": 0.31729738211976827, + "learning_rate": 0.00019613483237729463, + "loss": 3.053450584411621, + "step": 4584, + "token_acc": 0.29359155835799766 + }, + { + "epoch": 2.687774846086192, + "grad_norm": 0.32844287331706523, + "learning_rate": 0.0001961321633546869, + "loss": 3.0576930046081543, + "step": 4585, + "token_acc": 0.2919579721145779 + }, + { + "epoch": 2.6883611844034006, + "grad_norm": 0.349752905400812, + "learning_rate": 0.00019612949342904704, + "loss": 3.098722457885742, + "step": 4586, + "token_acc": 0.288147978259826 + }, + { + "epoch": 2.6889475227206097, + "grad_norm": 0.34465407614689486, + "learning_rate": 0.0001961268226004001, + "loss": 3.0849053859710693, + "step": 4587, + "token_acc": 0.28720458562802054 + }, + { + "epoch": 2.6895338610378188, + "grad_norm": 0.25791209653377434, + "learning_rate": 0.00019612415086877122, + "loss": 3.0729269981384277, + "step": 4588, + "token_acc": 0.2909098928927267 + }, + { + "epoch": 2.690120199355028, + "grad_norm": 0.30326907633575656, + "learning_rate": 0.00019612147823418546, + "loss": 3.034104824066162, + "step": 4589, + "token_acc": 0.29637118992882827 + }, + { + "epoch": 2.690706537672237, + "grad_norm": 0.26346440519259406, + "learning_rate": 0.0001961188046966679, + "loss": 3.087848663330078, + "step": 4590, + "token_acc": 0.28711999201128663 + }, + { + "epoch": 2.691292875989446, + "grad_norm": 0.3094679598124755, + "learning_rate": 0.00019611613025624373, + "loss": 3.0497069358825684, + "step": 4591, + "token_acc": 0.2938933372682722 + }, + { + "epoch": 2.6918792143066548, + "grad_norm": 0.3358553416485472, + "learning_rate": 0.00019611345491293803, + "loss": 3.058612823486328, + "step": 4592, + "token_acc": 0.29250981653671565 + }, + { + "epoch": 2.692465552623864, + "grad_norm": 0.2937235097564311, + "learning_rate": 0.00019611077866677593, + "loss": 3.081667423248291, + "step": 4593, + "token_acc": 0.2882895002354207 + }, + { + "epoch": 2.693051890941073, + "grad_norm": 0.3379997161411641, + "learning_rate": 0.00019610810151778258, + "loss": 3.0104331970214844, + "step": 4594, + "token_acc": 0.29800038029261977 + }, + { + "epoch": 2.693638229258282, + "grad_norm": 0.35560811230546824, + "learning_rate": 0.00019610542346598314, + "loss": 3.044588088989258, + "step": 4595, + "token_acc": 0.293914189873611 + }, + { + "epoch": 2.6942245675754912, + "grad_norm": 0.34088460525265607, + "learning_rate": 0.00019610274451140272, + "loss": 3.115767240524292, + "step": 4596, + "token_acc": 0.28402360832203544 + }, + { + "epoch": 2.6948109058927, + "grad_norm": 0.33497354791512735, + "learning_rate": 0.00019610006465406655, + "loss": 3.086986780166626, + "step": 4597, + "token_acc": 0.2868543587682675 + }, + { + "epoch": 2.695397244209909, + "grad_norm": 0.27525214080721966, + "learning_rate": 0.0001960973838939997, + "loss": 3.080409526824951, + "step": 4598, + "token_acc": 0.28872607012260704 + }, + { + "epoch": 2.695983582527118, + "grad_norm": 0.2995414140533698, + "learning_rate": 0.0001960947022312275, + "loss": 3.0814218521118164, + "step": 4599, + "token_acc": 0.28911611674451765 + }, + { + "epoch": 2.6965699208443272, + "grad_norm": 0.2849168993826068, + "learning_rate": 0.00019609201966577504, + "loss": 3.0284667015075684, + "step": 4600, + "token_acc": 0.2964905885646967 + }, + { + "epoch": 2.6971562591615363, + "grad_norm": 0.2587181971182644, + "learning_rate": 0.00019608933619766754, + "loss": 3.072713851928711, + "step": 4601, + "token_acc": 0.28916574474504675 + }, + { + "epoch": 2.6977425974787455, + "grad_norm": 0.2786439890523753, + "learning_rate": 0.0001960866518269302, + "loss": 3.0935587882995605, + "step": 4602, + "token_acc": 0.2861736170299637 + }, + { + "epoch": 2.698328935795954, + "grad_norm": 0.338772670331652, + "learning_rate": 0.00019608396655358822, + "loss": 3.113826036453247, + "step": 4603, + "token_acc": 0.28389691079576224 + }, + { + "epoch": 2.6989152741131632, + "grad_norm": 0.37271601858279096, + "learning_rate": 0.00019608128037766688, + "loss": 3.0474891662597656, + "step": 4604, + "token_acc": 0.2954168820304049 + }, + { + "epoch": 2.6995016124303723, + "grad_norm": 0.385045455634142, + "learning_rate": 0.00019607859329919138, + "loss": 3.005220413208008, + "step": 4605, + "token_acc": 0.3008332465368191 + }, + { + "epoch": 2.7000879507475815, + "grad_norm": 0.2849201771932255, + "learning_rate": 0.00019607590531818696, + "loss": 3.039815902709961, + "step": 4606, + "token_acc": 0.2928555611582 + }, + { + "epoch": 2.70067428906479, + "grad_norm": 0.353030282025018, + "learning_rate": 0.00019607321643467889, + "loss": 3.046417236328125, + "step": 4607, + "token_acc": 0.294735450085296 + }, + { + "epoch": 2.7012606273819992, + "grad_norm": 0.3707303185644731, + "learning_rate": 0.00019607052664869236, + "loss": 3.0585556030273438, + "step": 4608, + "token_acc": 0.2932592550005488 + }, + { + "epoch": 2.7018469656992083, + "grad_norm": 0.38869429678549705, + "learning_rate": 0.00019606783596025273, + "loss": 3.0804758071899414, + "step": 4609, + "token_acc": 0.28922501882327634 + }, + { + "epoch": 2.7024333040164175, + "grad_norm": 0.3603531737301099, + "learning_rate": 0.00019606514436938522, + "loss": 3.029695510864258, + "step": 4610, + "token_acc": 0.295240389763358 + }, + { + "epoch": 2.7030196423336266, + "grad_norm": 0.37622594772044854, + "learning_rate": 0.00019606245187611516, + "loss": 3.0557150840759277, + "step": 4611, + "token_acc": 0.2921033866725961 + }, + { + "epoch": 2.7036059806508357, + "grad_norm": 0.3142220577318543, + "learning_rate": 0.00019605975848046775, + "loss": 3.0820374488830566, + "step": 4612, + "token_acc": 0.2884864366176436 + }, + { + "epoch": 2.704192318968045, + "grad_norm": 0.3402226734775937, + "learning_rate": 0.00019605706418246842, + "loss": 3.0157523155212402, + "step": 4613, + "token_acc": 0.29827982850569906 + }, + { + "epoch": 2.7047786572852535, + "grad_norm": 0.41825789334529473, + "learning_rate": 0.00019605436898214234, + "loss": 3.0657761096954346, + "step": 4614, + "token_acc": 0.2901373858316064 + }, + { + "epoch": 2.7053649956024626, + "grad_norm": 0.42276411250977985, + "learning_rate": 0.00019605167287951498, + "loss": 3.069960355758667, + "step": 4615, + "token_acc": 0.289633954590243 + }, + { + "epoch": 2.7059513339196717, + "grad_norm": 0.3333705382069517, + "learning_rate": 0.00019604897587461152, + "loss": 3.0744457244873047, + "step": 4616, + "token_acc": 0.290705683454739 + }, + { + "epoch": 2.706537672236881, + "grad_norm": 0.36885850245526164, + "learning_rate": 0.0001960462779674574, + "loss": 3.115445137023926, + "step": 4617, + "token_acc": 0.28453691504479295 + }, + { + "epoch": 2.7071240105540895, + "grad_norm": 0.3507145322574794, + "learning_rate": 0.00019604357915807788, + "loss": 3.1081814765930176, + "step": 4618, + "token_acc": 0.28615999035806633 + }, + { + "epoch": 2.7077103488712986, + "grad_norm": 0.3375902388952492, + "learning_rate": 0.00019604087944649837, + "loss": 3.101623058319092, + "step": 4619, + "token_acc": 0.28516436244733306 + }, + { + "epoch": 2.7082966871885077, + "grad_norm": 0.3043907305211772, + "learning_rate": 0.00019603817883274424, + "loss": 3.050922393798828, + "step": 4620, + "token_acc": 0.2914007916338575 + }, + { + "epoch": 2.708883025505717, + "grad_norm": 0.35341317383193205, + "learning_rate": 0.0001960354773168408, + "loss": 3.052631139755249, + "step": 4621, + "token_acc": 0.2927062964920549 + }, + { + "epoch": 2.709469363822926, + "grad_norm": 0.297846945583081, + "learning_rate": 0.00019603277489881347, + "loss": 3.0719237327575684, + "step": 4622, + "token_acc": 0.2900396840988566 + }, + { + "epoch": 2.710055702140135, + "grad_norm": 0.3690801624691033, + "learning_rate": 0.00019603007157868762, + "loss": 3.0560717582702637, + "step": 4623, + "token_acc": 0.29208165988733914 + }, + { + "epoch": 2.710642040457344, + "grad_norm": 0.3498312818189329, + "learning_rate": 0.00019602736735648867, + "loss": 3.0670833587646484, + "step": 4624, + "token_acc": 0.2920055654939376 + }, + { + "epoch": 2.711228378774553, + "grad_norm": 0.34390655350503474, + "learning_rate": 0.00019602466223224202, + "loss": 3.060535430908203, + "step": 4625, + "token_acc": 0.29069353292723166 + }, + { + "epoch": 2.711814717091762, + "grad_norm": 0.32532908919171283, + "learning_rate": 0.00019602195620597302, + "loss": 3.1041665077209473, + "step": 4626, + "token_acc": 0.2853834748893112 + }, + { + "epoch": 2.712401055408971, + "grad_norm": 0.43594270548360947, + "learning_rate": 0.00019601924927770714, + "loss": 3.1092512607574463, + "step": 4627, + "token_acc": 0.2844591191448878 + }, + { + "epoch": 2.71298739372618, + "grad_norm": 0.4630780654760894, + "learning_rate": 0.00019601654144746981, + "loss": 3.0721325874328613, + "step": 4628, + "token_acc": 0.2916355781949886 + }, + { + "epoch": 2.713573732043389, + "grad_norm": 0.3553615385212874, + "learning_rate": 0.00019601383271528646, + "loss": 3.0570480823516846, + "step": 4629, + "token_acc": 0.2922392415838757 + }, + { + "epoch": 2.714160070360598, + "grad_norm": 0.35840586420755255, + "learning_rate": 0.0001960111230811825, + "loss": 3.0486514568328857, + "step": 4630, + "token_acc": 0.2923069325459536 + }, + { + "epoch": 2.714746408677807, + "grad_norm": 0.3552950301349356, + "learning_rate": 0.00019600841254518346, + "loss": 3.0606350898742676, + "step": 4631, + "token_acc": 0.2923529301793922 + }, + { + "epoch": 2.715332746995016, + "grad_norm": 0.3539224366752878, + "learning_rate": 0.00019600570110731472, + "loss": 3.090754508972168, + "step": 4632, + "token_acc": 0.28820463086434717 + }, + { + "epoch": 2.7159190853122253, + "grad_norm": 0.36553392520341765, + "learning_rate": 0.0001960029887676018, + "loss": 3.0873289108276367, + "step": 4633, + "token_acc": 0.2893336101938699 + }, + { + "epoch": 2.7165054236294344, + "grad_norm": 0.3605116182937075, + "learning_rate": 0.0001960002755260702, + "loss": 3.0798168182373047, + "step": 4634, + "token_acc": 0.2882106541758875 + }, + { + "epoch": 2.7170917619466435, + "grad_norm": 0.32631907619650247, + "learning_rate": 0.00019599756138274533, + "loss": 3.1280314922332764, + "step": 4635, + "token_acc": 0.2823222726487539 + }, + { + "epoch": 2.717678100263852, + "grad_norm": 0.4181030200341294, + "learning_rate": 0.00019599484633765277, + "loss": 3.046919584274292, + "step": 4636, + "token_acc": 0.29325448630278916 + }, + { + "epoch": 2.7182644385810613, + "grad_norm": 0.37397930933740153, + "learning_rate": 0.00019599213039081794, + "loss": 3.0940630435943604, + "step": 4637, + "token_acc": 0.28769915706681287 + }, + { + "epoch": 2.7188507768982704, + "grad_norm": 0.33913666556650074, + "learning_rate": 0.0001959894135422664, + "loss": 3.045283317565918, + "step": 4638, + "token_acc": 0.292696962705854 + }, + { + "epoch": 2.7194371152154795, + "grad_norm": 0.3470344307622619, + "learning_rate": 0.00019598669579202368, + "loss": 3.094909429550171, + "step": 4639, + "token_acc": 0.28872493146443223 + }, + { + "epoch": 2.720023453532688, + "grad_norm": 0.37438002093932476, + "learning_rate": 0.00019598397714011532, + "loss": 3.0970559120178223, + "step": 4640, + "token_acc": 0.2849042199340155 + }, + { + "epoch": 2.7206097918498973, + "grad_norm": 0.3608262904244165, + "learning_rate": 0.00019598125758656682, + "loss": 3.0626635551452637, + "step": 4641, + "token_acc": 0.2916284567259685 + }, + { + "epoch": 2.7211961301671064, + "grad_norm": 0.3493893388056432, + "learning_rate": 0.00019597853713140373, + "loss": 3.0793392658233643, + "step": 4642, + "token_acc": 0.2890838029449191 + }, + { + "epoch": 2.7217824684843155, + "grad_norm": 0.3560141970482984, + "learning_rate": 0.00019597581577465164, + "loss": 3.0795159339904785, + "step": 4643, + "token_acc": 0.28829914011855473 + }, + { + "epoch": 2.7223688068015246, + "grad_norm": 0.3593867827080234, + "learning_rate": 0.00019597309351633606, + "loss": 3.0684590339660645, + "step": 4644, + "token_acc": 0.2917868430034574 + }, + { + "epoch": 2.7229551451187337, + "grad_norm": 0.32180550235222366, + "learning_rate": 0.00019597037035648258, + "loss": 3.0816004276275635, + "step": 4645, + "token_acc": 0.2886940889045958 + }, + { + "epoch": 2.7235414834359424, + "grad_norm": 0.35681733398488874, + "learning_rate": 0.0001959676462951168, + "loss": 3.078744888305664, + "step": 4646, + "token_acc": 0.28831233400060186 + }, + { + "epoch": 2.7241278217531515, + "grad_norm": 0.3028995387458006, + "learning_rate": 0.00019596492133226433, + "loss": 3.074610471725464, + "step": 4647, + "token_acc": 0.29086400567381804 + }, + { + "epoch": 2.7247141600703606, + "grad_norm": 0.33198462887991426, + "learning_rate": 0.00019596219546795074, + "loss": 3.0617027282714844, + "step": 4648, + "token_acc": 0.29290141340562464 + }, + { + "epoch": 2.7253004983875697, + "grad_norm": 0.3956395327224407, + "learning_rate": 0.00019595946870220161, + "loss": 3.0271615982055664, + "step": 4649, + "token_acc": 0.29771774592562267 + }, + { + "epoch": 2.7258868367047784, + "grad_norm": 0.36416418492854763, + "learning_rate": 0.00019595674103504258, + "loss": 3.021888256072998, + "step": 4650, + "token_acc": 0.29820106042201755 + }, + { + "epoch": 2.7264731750219875, + "grad_norm": 0.29589315395755755, + "learning_rate": 0.00019595401246649927, + "loss": 3.0241734981536865, + "step": 4651, + "token_acc": 0.29568094805468154 + }, + { + "epoch": 2.7270595133391966, + "grad_norm": 0.29836452049302997, + "learning_rate": 0.00019595128299659733, + "loss": 3.0576462745666504, + "step": 4652, + "token_acc": 0.29146687709566044 + }, + { + "epoch": 2.7276458516564057, + "grad_norm": 0.2730215587276771, + "learning_rate": 0.00019594855262536237, + "loss": 3.045133113861084, + "step": 4653, + "token_acc": 0.2934791028390864 + }, + { + "epoch": 2.728232189973615, + "grad_norm": 0.2905621118751534, + "learning_rate": 0.00019594582135282006, + "loss": 3.0950140953063965, + "step": 4654, + "token_acc": 0.2875177080850631 + }, + { + "epoch": 2.728818528290824, + "grad_norm": 0.2628072451628686, + "learning_rate": 0.00019594308917899605, + "loss": 3.0977776050567627, + "step": 4655, + "token_acc": 0.28793800059027674 + }, + { + "epoch": 2.729404866608033, + "grad_norm": 0.2558353075979535, + "learning_rate": 0.00019594035610391597, + "loss": 3.014267921447754, + "step": 4656, + "token_acc": 0.29720147806054054 + }, + { + "epoch": 2.7299912049252417, + "grad_norm": 0.33100191650068966, + "learning_rate": 0.00019593762212760557, + "loss": 3.0784945487976074, + "step": 4657, + "token_acc": 0.28895315997573273 + }, + { + "epoch": 2.730577543242451, + "grad_norm": 0.32806953792579385, + "learning_rate": 0.00019593488725009044, + "loss": 3.0503933429718018, + "step": 4658, + "token_acc": 0.2932190818854645 + }, + { + "epoch": 2.73116388155966, + "grad_norm": 0.31314419072789296, + "learning_rate": 0.00019593215147139636, + "loss": 3.0369620323181152, + "step": 4659, + "token_acc": 0.2955304721523686 + }, + { + "epoch": 2.731750219876869, + "grad_norm": 0.28879276370844303, + "learning_rate": 0.00019592941479154897, + "loss": 3.063556671142578, + "step": 4660, + "token_acc": 0.2921654838326351 + }, + { + "epoch": 2.7323365581940777, + "grad_norm": 0.2984387593755698, + "learning_rate": 0.000195926677210574, + "loss": 3.0611438751220703, + "step": 4661, + "token_acc": 0.29205801789978864 + }, + { + "epoch": 2.732922896511287, + "grad_norm": 0.2929929716247293, + "learning_rate": 0.00019592393872849714, + "loss": 3.013617515563965, + "step": 4662, + "token_acc": 0.29880121098368567 + }, + { + "epoch": 2.733509234828496, + "grad_norm": 0.37261649586524026, + "learning_rate": 0.00019592119934534416, + "loss": 3.077099323272705, + "step": 4663, + "token_acc": 0.28830960779811726 + }, + { + "epoch": 2.734095573145705, + "grad_norm": 0.35175936282198433, + "learning_rate": 0.00019591845906114075, + "loss": 3.044412851333618, + "step": 4664, + "token_acc": 0.2937069167404873 + }, + { + "epoch": 2.734681911462914, + "grad_norm": 0.3144549791936237, + "learning_rate": 0.0001959157178759127, + "loss": 3.0678317546844482, + "step": 4665, + "token_acc": 0.2916704256937016 + }, + { + "epoch": 2.7352682497801233, + "grad_norm": 0.29176557350581506, + "learning_rate": 0.00019591297578968567, + "loss": 3.073025703430176, + "step": 4666, + "token_acc": 0.2906773030120212 + }, + { + "epoch": 2.7358545880973324, + "grad_norm": 0.29566861411407064, + "learning_rate": 0.00019591023280248553, + "loss": 3.073455333709717, + "step": 4667, + "token_acc": 0.2911539056336219 + }, + { + "epoch": 2.736440926414541, + "grad_norm": 0.3009505906839044, + "learning_rate": 0.00019590748891433796, + "loss": 3.0105578899383545, + "step": 4668, + "token_acc": 0.2973537931574415 + }, + { + "epoch": 2.73702726473175, + "grad_norm": 0.33511579029209515, + "learning_rate": 0.0001959047441252688, + "loss": 3.106764316558838, + "step": 4669, + "token_acc": 0.2853260017353307 + }, + { + "epoch": 2.7376136030489593, + "grad_norm": 0.4605083147386973, + "learning_rate": 0.0001959019984353038, + "loss": 3.071453094482422, + "step": 4670, + "token_acc": 0.28911006520143356 + }, + { + "epoch": 2.7381999413661684, + "grad_norm": 0.4578590824992916, + "learning_rate": 0.0001958992518444687, + "loss": 3.030712842941284, + "step": 4671, + "token_acc": 0.2959021262043306 + }, + { + "epoch": 2.738786279683377, + "grad_norm": 0.26593354325576213, + "learning_rate": 0.00019589650435278944, + "loss": 3.0777077674865723, + "step": 4672, + "token_acc": 0.2885764418207303 + }, + { + "epoch": 2.739372618000586, + "grad_norm": 0.4433735877345629, + "learning_rate": 0.00019589375596029167, + "loss": 3.0942602157592773, + "step": 4673, + "token_acc": 0.287688164690784 + }, + { + "epoch": 2.7399589563177953, + "grad_norm": 0.3059161207637901, + "learning_rate": 0.0001958910066670013, + "loss": 3.0496675968170166, + "step": 4674, + "token_acc": 0.2961370736612304 + }, + { + "epoch": 2.7405452946350044, + "grad_norm": 0.3635916470076969, + "learning_rate": 0.00019588825647294417, + "loss": 3.0929830074310303, + "step": 4675, + "token_acc": 0.28623168349715716 + }, + { + "epoch": 2.7411316329522135, + "grad_norm": 0.26899822975510257, + "learning_rate": 0.00019588550537814604, + "loss": 3.0790436267852783, + "step": 4676, + "token_acc": 0.29015201586252476 + }, + { + "epoch": 2.7417179712694226, + "grad_norm": 0.3231045312810313, + "learning_rate": 0.00019588275338263284, + "loss": 3.0601134300231934, + "step": 4677, + "token_acc": 0.291321438902434 + }, + { + "epoch": 2.7423043095866317, + "grad_norm": 0.35540835698269974, + "learning_rate": 0.00019588000048643034, + "loss": 3.099900245666504, + "step": 4678, + "token_acc": 0.2856986295650722 + }, + { + "epoch": 2.7428906479038404, + "grad_norm": 0.3956783976685458, + "learning_rate": 0.00019587724668956446, + "loss": 3.09187912940979, + "step": 4679, + "token_acc": 0.28746296921526465 + }, + { + "epoch": 2.7434769862210495, + "grad_norm": 0.3443251424533783, + "learning_rate": 0.00019587449199206104, + "loss": 3.040811538696289, + "step": 4680, + "token_acc": 0.29417252970503904 + }, + { + "epoch": 2.7440633245382586, + "grad_norm": 0.3242456837977996, + "learning_rate": 0.00019587173639394593, + "loss": 3.1106135845184326, + "step": 4681, + "token_acc": 0.2858626656005841 + }, + { + "epoch": 2.7446496628554677, + "grad_norm": 0.27962492594436156, + "learning_rate": 0.00019586897989524505, + "loss": 3.0799636840820312, + "step": 4682, + "token_acc": 0.28810755073258026 + }, + { + "epoch": 2.7452360011726764, + "grad_norm": 0.3200563141330118, + "learning_rate": 0.0001958662224959843, + "loss": 3.022454023361206, + "step": 4683, + "token_acc": 0.2946517134149012 + }, + { + "epoch": 2.7458223394898855, + "grad_norm": 0.3470764414478528, + "learning_rate": 0.00019586346419618958, + "loss": 3.0411646366119385, + "step": 4684, + "token_acc": 0.2956099928516978 + }, + { + "epoch": 2.7464086778070946, + "grad_norm": 0.371575536355943, + "learning_rate": 0.0001958607049958868, + "loss": 3.0789613723754883, + "step": 4685, + "token_acc": 0.2882726959956508 + }, + { + "epoch": 2.7469950161243037, + "grad_norm": 0.36172382853085894, + "learning_rate": 0.00019585794489510185, + "loss": 3.064236640930176, + "step": 4686, + "token_acc": 0.2914867132110191 + }, + { + "epoch": 2.747581354441513, + "grad_norm": 0.2998610455800225, + "learning_rate": 0.00019585518389386066, + "loss": 3.029071807861328, + "step": 4687, + "token_acc": 0.2972378247066233 + }, + { + "epoch": 2.748167692758722, + "grad_norm": 0.3266438526671952, + "learning_rate": 0.0001958524219921892, + "loss": 3.0545945167541504, + "step": 4688, + "token_acc": 0.2934507244725807 + }, + { + "epoch": 2.748754031075931, + "grad_norm": 0.3390500727553841, + "learning_rate": 0.0001958496591901134, + "loss": 3.0861992835998535, + "step": 4689, + "token_acc": 0.2885668548319151 + }, + { + "epoch": 2.7493403693931397, + "grad_norm": 0.31174469725482884, + "learning_rate": 0.00019584689548765924, + "loss": 3.0544002056121826, + "step": 4690, + "token_acc": 0.2940541296376857 + }, + { + "epoch": 2.749926707710349, + "grad_norm": 0.3764127403414886, + "learning_rate": 0.00019584413088485263, + "loss": 3.046809673309326, + "step": 4691, + "token_acc": 0.2941862662349933 + }, + { + "epoch": 2.750513046027558, + "grad_norm": 0.35695470617247205, + "learning_rate": 0.00019584136538171957, + "loss": 3.065302848815918, + "step": 4692, + "token_acc": 0.29076216254163384 + }, + { + "epoch": 2.751099384344767, + "grad_norm": 0.3389711880486392, + "learning_rate": 0.00019583859897828602, + "loss": 3.098149299621582, + "step": 4693, + "token_acc": 0.28518565650365424 + }, + { + "epoch": 2.7516857226619758, + "grad_norm": 0.28084049582478204, + "learning_rate": 0.00019583583167457797, + "loss": 3.1127569675445557, + "step": 4694, + "token_acc": 0.2848683358794438 + }, + { + "epoch": 2.752272060979185, + "grad_norm": 0.3247790265293051, + "learning_rate": 0.0001958330634706214, + "loss": 3.0625367164611816, + "step": 4695, + "token_acc": 0.291680886594017 + }, + { + "epoch": 2.752858399296394, + "grad_norm": 0.3106951595498024, + "learning_rate": 0.00019583029436644238, + "loss": 3.033874988555908, + "step": 4696, + "token_acc": 0.29539137757927825 + }, + { + "epoch": 2.753444737613603, + "grad_norm": 0.29705095587870434, + "learning_rate": 0.00019582752436206685, + "loss": 3.070064067840576, + "step": 4697, + "token_acc": 0.2887197727197061 + }, + { + "epoch": 2.754031075930812, + "grad_norm": 0.2929945761325786, + "learning_rate": 0.0001958247534575209, + "loss": 3.112363338470459, + "step": 4698, + "token_acc": 0.28455431060983494 + }, + { + "epoch": 2.7546174142480213, + "grad_norm": 0.3359301507738248, + "learning_rate": 0.00019582198165283047, + "loss": 3.0615921020507812, + "step": 4699, + "token_acc": 0.29164272577066774 + }, + { + "epoch": 2.75520375256523, + "grad_norm": 0.30165111441009423, + "learning_rate": 0.00019581920894802166, + "loss": 3.0730247497558594, + "step": 4700, + "token_acc": 0.289022546472928 + }, + { + "epoch": 2.755790090882439, + "grad_norm": 0.2925426932606334, + "learning_rate": 0.00019581643534312052, + "loss": 3.063750743865967, + "step": 4701, + "token_acc": 0.2912387791741472 + }, + { + "epoch": 2.756376429199648, + "grad_norm": 0.3313376692642333, + "learning_rate": 0.00019581366083815306, + "loss": 3.0759363174438477, + "step": 4702, + "token_acc": 0.2897901469499947 + }, + { + "epoch": 2.7569627675168573, + "grad_norm": 0.2562860886490982, + "learning_rate": 0.00019581088543314535, + "loss": 3.0696358680725098, + "step": 4703, + "token_acc": 0.29079656358875594 + }, + { + "epoch": 2.757549105834066, + "grad_norm": 0.28867385766927933, + "learning_rate": 0.0001958081091281235, + "loss": 3.087944507598877, + "step": 4704, + "token_acc": 0.2875663123764376 + }, + { + "epoch": 2.758135444151275, + "grad_norm": 0.3305289556075944, + "learning_rate": 0.0001958053319231136, + "loss": 3.0573315620422363, + "step": 4705, + "token_acc": 0.2941450695322377 + }, + { + "epoch": 2.758721782468484, + "grad_norm": 0.26457058929173943, + "learning_rate": 0.00019580255381814165, + "loss": 3.0935044288635254, + "step": 4706, + "token_acc": 0.28809805744742284 + }, + { + "epoch": 2.7593081207856933, + "grad_norm": 0.33978038309946346, + "learning_rate": 0.00019579977481323384, + "loss": 3.050231456756592, + "step": 4707, + "token_acc": 0.2941964704463532 + }, + { + "epoch": 2.7598944591029024, + "grad_norm": 0.34472759426439287, + "learning_rate": 0.00019579699490841624, + "loss": 3.0440878868103027, + "step": 4708, + "token_acc": 0.2937197206604339 + }, + { + "epoch": 2.7604807974201115, + "grad_norm": 0.29485957217769715, + "learning_rate": 0.00019579421410371493, + "loss": 3.0662012100219727, + "step": 4709, + "token_acc": 0.2916552940435449 + }, + { + "epoch": 2.7610671357373207, + "grad_norm": 0.336811692543625, + "learning_rate": 0.00019579143239915607, + "loss": 3.0733067989349365, + "step": 4710, + "token_acc": 0.2893607446271376 + }, + { + "epoch": 2.7616534740545293, + "grad_norm": 0.3391421590046798, + "learning_rate": 0.00019578864979476578, + "loss": 3.1234397888183594, + "step": 4711, + "token_acc": 0.2837658190051358 + }, + { + "epoch": 2.7622398123717384, + "grad_norm": 0.3173781862702908, + "learning_rate": 0.00019578586629057023, + "loss": 3.077023506164551, + "step": 4712, + "token_acc": 0.28865633628787213 + }, + { + "epoch": 2.7628261506889475, + "grad_norm": 0.295856290220241, + "learning_rate": 0.0001957830818865955, + "loss": 3.073726177215576, + "step": 4713, + "token_acc": 0.2900482635746166 + }, + { + "epoch": 2.7634124890061567, + "grad_norm": 0.2988660181723311, + "learning_rate": 0.0001957802965828678, + "loss": 3.0881896018981934, + "step": 4714, + "token_acc": 0.287873678465978 + }, + { + "epoch": 2.7639988273233653, + "grad_norm": 0.33936644452665277, + "learning_rate": 0.0001957775103794133, + "loss": 3.0753698348999023, + "step": 4715, + "token_acc": 0.28874593145701705 + }, + { + "epoch": 2.7645851656405744, + "grad_norm": 0.32235436266320716, + "learning_rate": 0.00019577472327625815, + "loss": 3.050734519958496, + "step": 4716, + "token_acc": 0.292624627093809 + }, + { + "epoch": 2.7651715039577835, + "grad_norm": 0.3926682045040899, + "learning_rate": 0.0001957719352734285, + "loss": 3.082221269607544, + "step": 4717, + "token_acc": 0.2890802136508495 + }, + { + "epoch": 2.7657578422749927, + "grad_norm": 0.5463267742163548, + "learning_rate": 0.0001957691463709506, + "loss": 3.0633373260498047, + "step": 4718, + "token_acc": 0.2933403151084927 + }, + { + "epoch": 2.7663441805922018, + "grad_norm": 0.45305651236167416, + "learning_rate": 0.0001957663565688506, + "loss": 3.0627689361572266, + "step": 4719, + "token_acc": 0.29162365997302125 + }, + { + "epoch": 2.766930518909411, + "grad_norm": 0.36121123116610665, + "learning_rate": 0.00019576356586715473, + "loss": 3.0541634559631348, + "step": 4720, + "token_acc": 0.2935886254275925 + }, + { + "epoch": 2.76751685722662, + "grad_norm": 0.38688455449479037, + "learning_rate": 0.00019576077426588923, + "loss": 3.0042762756347656, + "step": 4721, + "token_acc": 0.30013287338699374 + }, + { + "epoch": 2.7681031955438287, + "grad_norm": 0.2947530000112747, + "learning_rate": 0.0001957579817650803, + "loss": 3.052546977996826, + "step": 4722, + "token_acc": 0.2927042164110335 + }, + { + "epoch": 2.7686895338610378, + "grad_norm": 0.32017641901143074, + "learning_rate": 0.00019575518836475412, + "loss": 3.0398685932159424, + "step": 4723, + "token_acc": 0.2960986305903769 + }, + { + "epoch": 2.769275872178247, + "grad_norm": 0.3504907476961808, + "learning_rate": 0.000195752394064937, + "loss": 3.0588197708129883, + "step": 4724, + "token_acc": 0.292848503970132 + }, + { + "epoch": 2.769862210495456, + "grad_norm": 0.2917694815622761, + "learning_rate": 0.00019574959886565517, + "loss": 3.063797950744629, + "step": 4725, + "token_acc": 0.29095620509643644 + }, + { + "epoch": 2.7704485488126647, + "grad_norm": 0.30190155906695665, + "learning_rate": 0.00019574680276693486, + "loss": 3.0904717445373535, + "step": 4726, + "token_acc": 0.2884245279006465 + }, + { + "epoch": 2.771034887129874, + "grad_norm": 0.312431659698821, + "learning_rate": 0.00019574400576880238, + "loss": 3.0925369262695312, + "step": 4727, + "token_acc": 0.28749324352417777 + }, + { + "epoch": 2.771621225447083, + "grad_norm": 0.3773457249388687, + "learning_rate": 0.000195741207871284, + "loss": 3.0582962036132812, + "step": 4728, + "token_acc": 0.29231622686425685 + }, + { + "epoch": 2.772207563764292, + "grad_norm": 0.3209095466808538, + "learning_rate": 0.00019573840907440598, + "loss": 3.128178358078003, + "step": 4729, + "token_acc": 0.2824768929654992 + }, + { + "epoch": 2.772793902081501, + "grad_norm": 0.29226061188838, + "learning_rate": 0.00019573560937819457, + "loss": 3.0923616886138916, + "step": 4730, + "token_acc": 0.2864712196533112 + }, + { + "epoch": 2.7733802403987102, + "grad_norm": 0.32371053912124587, + "learning_rate": 0.00019573280878267614, + "loss": 3.029703140258789, + "step": 4731, + "token_acc": 0.29724591218459495 + }, + { + "epoch": 2.7739665787159193, + "grad_norm": 0.2786054331978042, + "learning_rate": 0.000195730007287877, + "loss": 3.0493087768554688, + "step": 4732, + "token_acc": 0.294330019413535 + }, + { + "epoch": 2.774552917033128, + "grad_norm": 0.32943201248548615, + "learning_rate": 0.0001957272048938234, + "loss": 3.090531826019287, + "step": 4733, + "token_acc": 0.28691924350759895 + }, + { + "epoch": 2.775139255350337, + "grad_norm": 0.27186942100487144, + "learning_rate": 0.00019572440160054173, + "loss": 3.0443716049194336, + "step": 4734, + "token_acc": 0.29338803405018826 + }, + { + "epoch": 2.7757255936675462, + "grad_norm": 0.3302503932228583, + "learning_rate": 0.00019572159740805832, + "loss": 3.067739248275757, + "step": 4735, + "token_acc": 0.29002010226042696 + }, + { + "epoch": 2.7763119319847553, + "grad_norm": 0.2985261874916844, + "learning_rate": 0.00019571879231639945, + "loss": 3.0648317337036133, + "step": 4736, + "token_acc": 0.2914637576808632 + }, + { + "epoch": 2.776898270301964, + "grad_norm": 0.3310816357812853, + "learning_rate": 0.00019571598632559152, + "loss": 3.067718982696533, + "step": 4737, + "token_acc": 0.2905018476514384 + }, + { + "epoch": 2.777484608619173, + "grad_norm": 0.29818552734573334, + "learning_rate": 0.00019571317943566086, + "loss": 3.1026358604431152, + "step": 4738, + "token_acc": 0.2855509040896296 + }, + { + "epoch": 2.7780709469363822, + "grad_norm": 0.3330834669152819, + "learning_rate": 0.00019571037164663389, + "loss": 3.021570920944214, + "step": 4739, + "token_acc": 0.2981759041933874 + }, + { + "epoch": 2.7786572852535913, + "grad_norm": 0.3039648515280741, + "learning_rate": 0.00019570756295853693, + "loss": 3.03800106048584, + "step": 4740, + "token_acc": 0.2946683961484478 + }, + { + "epoch": 2.7792436235708005, + "grad_norm": 0.26935054939240355, + "learning_rate": 0.00019570475337139635, + "loss": 3.1004624366760254, + "step": 4741, + "token_acc": 0.286403310917846 + }, + { + "epoch": 2.7798299618880096, + "grad_norm": 0.29738923224375247, + "learning_rate": 0.0001957019428852386, + "loss": 3.0832347869873047, + "step": 4742, + "token_acc": 0.2880243846665558 + }, + { + "epoch": 2.7804163002052187, + "grad_norm": 0.27315316878689616, + "learning_rate": 0.00019569913150009008, + "loss": 3.057856559753418, + "step": 4743, + "token_acc": 0.29203503220936605 + }, + { + "epoch": 2.7810026385224274, + "grad_norm": 0.29283060585933907, + "learning_rate": 0.00019569631921597717, + "loss": 3.0598127841949463, + "step": 4744, + "token_acc": 0.29049602010457287 + }, + { + "epoch": 2.7815889768396365, + "grad_norm": 0.29953033051413813, + "learning_rate": 0.00019569350603292627, + "loss": 3.0425472259521484, + "step": 4745, + "token_acc": 0.29547998117877244 + }, + { + "epoch": 2.7821753151568456, + "grad_norm": 0.2697598650110707, + "learning_rate": 0.00019569069195096386, + "loss": 3.0947084426879883, + "step": 4746, + "token_acc": 0.2875841711050182 + }, + { + "epoch": 2.7827616534740547, + "grad_norm": 0.31858661201747285, + "learning_rate": 0.00019568787697011632, + "loss": 3.071944236755371, + "step": 4747, + "token_acc": 0.2898569201307825 + }, + { + "epoch": 2.7833479917912634, + "grad_norm": 0.3676013530382402, + "learning_rate": 0.00019568506109041015, + "loss": 3.077906608581543, + "step": 4748, + "token_acc": 0.28924578356454117 + }, + { + "epoch": 2.7839343301084725, + "grad_norm": 0.326779862425377, + "learning_rate": 0.00019568224431187173, + "loss": 3.077691078186035, + "step": 4749, + "token_acc": 0.28780931288470024 + }, + { + "epoch": 2.7845206684256816, + "grad_norm": 0.33694893861544417, + "learning_rate": 0.00019567942663452757, + "loss": 3.041159152984619, + "step": 4750, + "token_acc": 0.29606528199342097 + }, + { + "epoch": 2.7851070067428907, + "grad_norm": 0.3178259923850441, + "learning_rate": 0.00019567660805840413, + "loss": 3.1025707721710205, + "step": 4751, + "token_acc": 0.28807764538445485 + }, + { + "epoch": 2.7856933450601, + "grad_norm": 0.3658597927252303, + "learning_rate": 0.00019567378858352785, + "loss": 3.055793285369873, + "step": 4752, + "token_acc": 0.29271552055813477 + }, + { + "epoch": 2.786279683377309, + "grad_norm": 0.3376723299126635, + "learning_rate": 0.0001956709682099253, + "loss": 3.0499935150146484, + "step": 4753, + "token_acc": 0.29120883277893084 + }, + { + "epoch": 2.7868660216945176, + "grad_norm": 0.36056985141640185, + "learning_rate": 0.0001956681469376229, + "loss": 3.047048568725586, + "step": 4754, + "token_acc": 0.2925351547627261 + }, + { + "epoch": 2.7874523600117267, + "grad_norm": 0.33932736999124347, + "learning_rate": 0.00019566532476664717, + "loss": 3.090956211090088, + "step": 4755, + "token_acc": 0.28699214608487994 + }, + { + "epoch": 2.788038698328936, + "grad_norm": 0.3352132222736056, + "learning_rate": 0.00019566250169702462, + "loss": 3.1232051849365234, + "step": 4756, + "token_acc": 0.28255951601430795 + }, + { + "epoch": 2.788625036646145, + "grad_norm": 0.28133736988114944, + "learning_rate": 0.0001956596777287818, + "loss": 3.0788848400115967, + "step": 4757, + "token_acc": 0.28894088537464363 + }, + { + "epoch": 2.7892113749633536, + "grad_norm": 0.3331359371690295, + "learning_rate": 0.00019565685286194518, + "loss": 3.056391716003418, + "step": 4758, + "token_acc": 0.2908862787387981 + }, + { + "epoch": 2.7897977132805627, + "grad_norm": 0.28472302880336786, + "learning_rate": 0.0001956540270965413, + "loss": 3.076047897338867, + "step": 4759, + "token_acc": 0.2881463130435695 + }, + { + "epoch": 2.790384051597772, + "grad_norm": 0.3156466927795733, + "learning_rate": 0.00019565120043259682, + "loss": 3.0509047508239746, + "step": 4760, + "token_acc": 0.29341020199140505 + }, + { + "epoch": 2.790970389914981, + "grad_norm": 0.31371019360064517, + "learning_rate": 0.00019564837287013815, + "loss": 3.0290744304656982, + "step": 4761, + "token_acc": 0.29575254166229953 + }, + { + "epoch": 2.79155672823219, + "grad_norm": 0.281034923604501, + "learning_rate": 0.0001956455444091919, + "loss": 3.0546388626098633, + "step": 4762, + "token_acc": 0.2940789215878365 + }, + { + "epoch": 2.792143066549399, + "grad_norm": 0.311617018580613, + "learning_rate": 0.00019564271504978465, + "loss": 3.0565614700317383, + "step": 4763, + "token_acc": 0.2922941892768093 + }, + { + "epoch": 2.7927294048666083, + "grad_norm": 0.4114744474658279, + "learning_rate": 0.000195639884791943, + "loss": 3.0467729568481445, + "step": 4764, + "token_acc": 0.2934740312935201 + }, + { + "epoch": 2.793315743183817, + "grad_norm": 0.395083272653278, + "learning_rate": 0.00019563705363569346, + "loss": 3.1033642292022705, + "step": 4765, + "token_acc": 0.28690961750521465 + }, + { + "epoch": 2.793902081501026, + "grad_norm": 0.37157473582972894, + "learning_rate": 0.0001956342215810627, + "loss": 3.055727958679199, + "step": 4766, + "token_acc": 0.2921332124268661 + }, + { + "epoch": 2.794488419818235, + "grad_norm": 0.33450703465703785, + "learning_rate": 0.0001956313886280773, + "loss": 3.0712943077087402, + "step": 4767, + "token_acc": 0.29050169014157695 + }, + { + "epoch": 2.7950747581354443, + "grad_norm": 0.3466754361645007, + "learning_rate": 0.00019562855477676388, + "loss": 3.0822036266326904, + "step": 4768, + "token_acc": 0.2875308568883901 + }, + { + "epoch": 2.795661096452653, + "grad_norm": 0.3206547441146517, + "learning_rate": 0.000195625720027149, + "loss": 3.0682313442230225, + "step": 4769, + "token_acc": 0.2898575821500163 + }, + { + "epoch": 2.796247434769862, + "grad_norm": 0.3005907915558481, + "learning_rate": 0.0001956228843792594, + "loss": 3.093679428100586, + "step": 4770, + "token_acc": 0.2874054326286171 + }, + { + "epoch": 2.796833773087071, + "grad_norm": 0.2661451093039696, + "learning_rate": 0.0001956200478331216, + "loss": 3.107438087463379, + "step": 4771, + "token_acc": 0.2844080935730453 + }, + { + "epoch": 2.7974201114042803, + "grad_norm": 0.3434832824587024, + "learning_rate": 0.00019561721038876235, + "loss": 3.087347984313965, + "step": 4772, + "token_acc": 0.28813515411060936 + }, + { + "epoch": 2.7980064497214894, + "grad_norm": 0.3336395347608038, + "learning_rate": 0.0001956143720462082, + "loss": 3.065269947052002, + "step": 4773, + "token_acc": 0.29143902820112866 + }, + { + "epoch": 2.7985927880386985, + "grad_norm": 0.3183739760996455, + "learning_rate": 0.0001956115328054859, + "loss": 3.084470510482788, + "step": 4774, + "token_acc": 0.2878377561982711 + }, + { + "epoch": 2.7991791263559076, + "grad_norm": 0.31227590923188003, + "learning_rate": 0.00019560869266662209, + "loss": 3.079319715499878, + "step": 4775, + "token_acc": 0.2889203923034294 + }, + { + "epoch": 2.7997654646731163, + "grad_norm": 0.3549238363443517, + "learning_rate": 0.00019560585162964344, + "loss": 3.0619454383850098, + "step": 4776, + "token_acc": 0.29122783575408545 + }, + { + "epoch": 2.8003518029903254, + "grad_norm": 0.3927413516897163, + "learning_rate": 0.00019560300969457663, + "loss": 3.0831174850463867, + "step": 4777, + "token_acc": 0.28833676767296845 + }, + { + "epoch": 2.8009381413075345, + "grad_norm": 0.41221947619570604, + "learning_rate": 0.0001956001668614484, + "loss": 3.09385347366333, + "step": 4778, + "token_acc": 0.2855751419482141 + }, + { + "epoch": 2.8015244796247436, + "grad_norm": 0.3641670349002705, + "learning_rate": 0.0001955973231302854, + "loss": 3.0538108348846436, + "step": 4779, + "token_acc": 0.294092991467595 + }, + { + "epoch": 2.8021108179419523, + "grad_norm": 0.36265745122581533, + "learning_rate": 0.00019559447850111438, + "loss": 3.0987319946289062, + "step": 4780, + "token_acc": 0.28809029983106993 + }, + { + "epoch": 2.8026971562591614, + "grad_norm": 0.3891272704505942, + "learning_rate": 0.00019559163297396207, + "loss": 3.0733296871185303, + "step": 4781, + "token_acc": 0.28964453166740944 + }, + { + "epoch": 2.8032834945763705, + "grad_norm": 0.3929774321343106, + "learning_rate": 0.00019558878654885514, + "loss": 3.0486974716186523, + "step": 4782, + "token_acc": 0.2914900571203262 + }, + { + "epoch": 2.8038698328935796, + "grad_norm": 0.32330171386368967, + "learning_rate": 0.0001955859392258204, + "loss": 3.0741257667541504, + "step": 4783, + "token_acc": 0.28917372154970067 + }, + { + "epoch": 2.8044561712107887, + "grad_norm": 0.3919995427880726, + "learning_rate": 0.00019558309100488455, + "loss": 3.071486711502075, + "step": 4784, + "token_acc": 0.28951822002812655 + }, + { + "epoch": 2.805042509527998, + "grad_norm": 0.3322446673937404, + "learning_rate": 0.00019558024188607434, + "loss": 3.068291664123535, + "step": 4785, + "token_acc": 0.29114804392807553 + }, + { + "epoch": 2.805628847845207, + "grad_norm": 0.3997891503729205, + "learning_rate": 0.00019557739186941658, + "loss": 3.0418734550476074, + "step": 4786, + "token_acc": 0.2946515619287484 + }, + { + "epoch": 2.8062151861624156, + "grad_norm": 0.2781158824893155, + "learning_rate": 0.00019557454095493802, + "loss": 3.036565065383911, + "step": 4787, + "token_acc": 0.29455762202380176 + }, + { + "epoch": 2.8068015244796247, + "grad_norm": 0.3533572699291522, + "learning_rate": 0.00019557168914266542, + "loss": 3.0746285915374756, + "step": 4788, + "token_acc": 0.28938235880015517 + }, + { + "epoch": 2.807387862796834, + "grad_norm": 0.3484236456138922, + "learning_rate": 0.00019556883643262558, + "loss": 3.076582908630371, + "step": 4789, + "token_acc": 0.2894416045537373 + }, + { + "epoch": 2.807974201114043, + "grad_norm": 0.31151568791183704, + "learning_rate": 0.00019556598282484533, + "loss": 3.068746328353882, + "step": 4790, + "token_acc": 0.2907734137876514 + }, + { + "epoch": 2.8085605394312516, + "grad_norm": 0.30422662595008054, + "learning_rate": 0.00019556312831935143, + "loss": 3.0603930950164795, + "step": 4791, + "token_acc": 0.29182015923616245 + }, + { + "epoch": 2.8091468777484607, + "grad_norm": 0.2844521900280327, + "learning_rate": 0.0001955602729161707, + "loss": 3.0700931549072266, + "step": 4792, + "token_acc": 0.2885733871180363 + }, + { + "epoch": 2.80973321606567, + "grad_norm": 0.314068305913703, + "learning_rate": 0.00019555741661533, + "loss": 3.0719380378723145, + "step": 4793, + "token_acc": 0.29097032249040383 + }, + { + "epoch": 2.810319554382879, + "grad_norm": 0.2867334866449183, + "learning_rate": 0.0001955545594168561, + "loss": 3.0702767372131348, + "step": 4794, + "token_acc": 0.28948765022993145 + }, + { + "epoch": 2.810905892700088, + "grad_norm": 0.27354229014054054, + "learning_rate": 0.0001955517013207759, + "loss": 3.0938868522644043, + "step": 4795, + "token_acc": 0.28598251795463636 + }, + { + "epoch": 2.811492231017297, + "grad_norm": 0.361403078013731, + "learning_rate": 0.00019554884232711624, + "loss": 3.048142433166504, + "step": 4796, + "token_acc": 0.29303709294301006 + }, + { + "epoch": 2.8120785693345063, + "grad_norm": 0.32596841347359246, + "learning_rate": 0.00019554598243590394, + "loss": 3.0081427097320557, + "step": 4797, + "token_acc": 0.2989001133184032 + }, + { + "epoch": 2.812664907651715, + "grad_norm": 0.27486598572727083, + "learning_rate": 0.00019554312164716587, + "loss": 3.0953097343444824, + "step": 4798, + "token_acc": 0.2862303270854006 + }, + { + "epoch": 2.813251245968924, + "grad_norm": 0.2796793463258275, + "learning_rate": 0.00019554025996092894, + "loss": 3.0770645141601562, + "step": 4799, + "token_acc": 0.28943960565616056 + }, + { + "epoch": 2.813837584286133, + "grad_norm": 0.2928882554213848, + "learning_rate": 0.00019553739737722002, + "loss": 3.066865921020508, + "step": 4800, + "token_acc": 0.29224330236285273 + }, + { + "epoch": 2.8144239226033423, + "grad_norm": 0.30580552927601373, + "learning_rate": 0.00019553453389606598, + "loss": 3.0743274688720703, + "step": 4801, + "token_acc": 0.2898601247350462 + }, + { + "epoch": 2.815010260920551, + "grad_norm": 0.34627133378798947, + "learning_rate": 0.0001955316695174937, + "loss": 3.0507407188415527, + "step": 4802, + "token_acc": 0.2917941039038274 + }, + { + "epoch": 2.81559659923776, + "grad_norm": 0.29319270367760336, + "learning_rate": 0.00019552880424153016, + "loss": 3.068697929382324, + "step": 4803, + "token_acc": 0.29112078083927656 + }, + { + "epoch": 2.816182937554969, + "grad_norm": 0.2890990606826052, + "learning_rate": 0.00019552593806820221, + "loss": 3.0224251747131348, + "step": 4804, + "token_acc": 0.29567714049423427 + }, + { + "epoch": 2.8167692758721783, + "grad_norm": 0.31283062532229844, + "learning_rate": 0.0001955230709975368, + "loss": 3.0380756855010986, + "step": 4805, + "token_acc": 0.2946384078734272 + }, + { + "epoch": 2.8173556141893874, + "grad_norm": 0.3302896935973322, + "learning_rate": 0.00019552020302956085, + "loss": 3.0549888610839844, + "step": 4806, + "token_acc": 0.2918140181290734 + }, + { + "epoch": 2.8179419525065965, + "grad_norm": 0.33032486295032326, + "learning_rate": 0.0001955173341643013, + "loss": 3.055619716644287, + "step": 4807, + "token_acc": 0.29314283965790694 + }, + { + "epoch": 2.818528290823805, + "grad_norm": 0.3040012386436506, + "learning_rate": 0.0001955144644017851, + "loss": 3.1090478897094727, + "step": 4808, + "token_acc": 0.2845434293653076 + }, + { + "epoch": 2.8191146291410143, + "grad_norm": 0.33060042235189196, + "learning_rate": 0.00019551159374203925, + "loss": 3.035707473754883, + "step": 4809, + "token_acc": 0.29444163196717615 + }, + { + "epoch": 2.8197009674582234, + "grad_norm": 0.412898277380587, + "learning_rate": 0.00019550872218509065, + "loss": 3.043006181716919, + "step": 4810, + "token_acc": 0.2955802625062561 + }, + { + "epoch": 2.8202873057754325, + "grad_norm": 0.3190992896487028, + "learning_rate": 0.00019550584973096634, + "loss": 3.0928313732147217, + "step": 4811, + "token_acc": 0.28715416425356377 + }, + { + "epoch": 2.820873644092641, + "grad_norm": 0.3072258631131407, + "learning_rate": 0.00019550297637969323, + "loss": 3.0841221809387207, + "step": 4812, + "token_acc": 0.2886044743815788 + }, + { + "epoch": 2.8214599824098503, + "grad_norm": 0.319172103854892, + "learning_rate": 0.00019550010213129838, + "loss": 3.07167387008667, + "step": 4813, + "token_acc": 0.2896572915913528 + }, + { + "epoch": 2.8220463207270594, + "grad_norm": 0.2666530398333794, + "learning_rate": 0.00019549722698580872, + "loss": 3.0691330432891846, + "step": 4814, + "token_acc": 0.29123104117796034 + }, + { + "epoch": 2.8226326590442685, + "grad_norm": 0.25879114069297915, + "learning_rate": 0.00019549435094325132, + "loss": 3.086653232574463, + "step": 4815, + "token_acc": 0.2903096298883956 + }, + { + "epoch": 2.8232189973614776, + "grad_norm": 0.24183900721932308, + "learning_rate": 0.00019549147400365317, + "loss": 3.071302890777588, + "step": 4816, + "token_acc": 0.28998693729903535 + }, + { + "epoch": 2.8238053356786867, + "grad_norm": 0.3007085857779486, + "learning_rate": 0.00019548859616704128, + "loss": 3.063629388809204, + "step": 4817, + "token_acc": 0.291296979636463 + }, + { + "epoch": 2.824391673995896, + "grad_norm": 0.2988575184131992, + "learning_rate": 0.00019548571743344275, + "loss": 3.0706238746643066, + "step": 4818, + "token_acc": 0.29067878410667286 + }, + { + "epoch": 2.8249780123131045, + "grad_norm": 0.33728449152656953, + "learning_rate": 0.00019548283780288452, + "loss": 3.0786819458007812, + "step": 4819, + "token_acc": 0.28905718620833987 + }, + { + "epoch": 2.8255643506303136, + "grad_norm": 0.36388693238450476, + "learning_rate": 0.00019547995727539372, + "loss": 3.0740575790405273, + "step": 4820, + "token_acc": 0.2904994036197687 + }, + { + "epoch": 2.8261506889475227, + "grad_norm": 0.3013146052521194, + "learning_rate": 0.00019547707585099738, + "loss": 3.0459136962890625, + "step": 4821, + "token_acc": 0.29231553528306947 + }, + { + "epoch": 2.826737027264732, + "grad_norm": 0.3032751842751434, + "learning_rate": 0.00019547419352972258, + "loss": 3.024435043334961, + "step": 4822, + "token_acc": 0.29680993981551645 + }, + { + "epoch": 2.8273233655819405, + "grad_norm": 0.33773897338389397, + "learning_rate": 0.0001954713103115964, + "loss": 3.035207986831665, + "step": 4823, + "token_acc": 0.29501511728748153 + }, + { + "epoch": 2.8279097038991496, + "grad_norm": 0.3004819294348285, + "learning_rate": 0.00019546842619664584, + "loss": 3.0687451362609863, + "step": 4824, + "token_acc": 0.2916019065381826 + }, + { + "epoch": 2.8284960422163588, + "grad_norm": 0.3201179298297896, + "learning_rate": 0.0001954655411848981, + "loss": 3.0945825576782227, + "step": 4825, + "token_acc": 0.2866025715029788 + }, + { + "epoch": 2.829082380533568, + "grad_norm": 0.2804669391596908, + "learning_rate": 0.00019546265527638026, + "loss": 3.0753512382507324, + "step": 4826, + "token_acc": 0.28883611078124793 + }, + { + "epoch": 2.829668718850777, + "grad_norm": 0.27488218924690433, + "learning_rate": 0.00019545976847111938, + "loss": 3.03167462348938, + "step": 4827, + "token_acc": 0.2962654161951937 + }, + { + "epoch": 2.830255057167986, + "grad_norm": 0.3349400520603764, + "learning_rate": 0.00019545688076914264, + "loss": 3.0509636402130127, + "step": 4828, + "token_acc": 0.2910965147453083 + }, + { + "epoch": 2.830841395485195, + "grad_norm": 0.31931893238218856, + "learning_rate": 0.00019545399217047712, + "loss": 3.106464385986328, + "step": 4829, + "token_acc": 0.2845313687990928 + }, + { + "epoch": 2.831427733802404, + "grad_norm": 0.29040699046720536, + "learning_rate": 0.00019545110267514995, + "loss": 3.018885612487793, + "step": 4830, + "token_acc": 0.2979952538093374 + }, + { + "epoch": 2.832014072119613, + "grad_norm": 0.3311284952422099, + "learning_rate": 0.00019544821228318832, + "loss": 3.0981290340423584, + "step": 4831, + "token_acc": 0.2868815117329128 + }, + { + "epoch": 2.832600410436822, + "grad_norm": 0.3488209986385302, + "learning_rate": 0.00019544532099461935, + "loss": 3.032848358154297, + "step": 4832, + "token_acc": 0.29663976318771534 + }, + { + "epoch": 2.833186748754031, + "grad_norm": 0.4315055900502949, + "learning_rate": 0.00019544242880947018, + "loss": 3.0497515201568604, + "step": 4833, + "token_acc": 0.2940612020286376 + }, + { + "epoch": 2.83377308707124, + "grad_norm": 0.55374461309147, + "learning_rate": 0.00019543953572776803, + "loss": 3.0710690021514893, + "step": 4834, + "token_acc": 0.2905672003199758 + }, + { + "epoch": 2.834359425388449, + "grad_norm": 0.38188431680644663, + "learning_rate": 0.00019543664174954004, + "loss": 3.086817979812622, + "step": 4835, + "token_acc": 0.2890708811768873 + }, + { + "epoch": 2.834945763705658, + "grad_norm": 0.3797547727488267, + "learning_rate": 0.00019543374687481342, + "loss": 3.020171880722046, + "step": 4836, + "token_acc": 0.2982751157574406 + }, + { + "epoch": 2.835532102022867, + "grad_norm": 0.4167338356417922, + "learning_rate": 0.0001954308511036153, + "loss": 3.09006667137146, + "step": 4837, + "token_acc": 0.28785823232975155 + }, + { + "epoch": 2.8361184403400763, + "grad_norm": 0.3203852242666415, + "learning_rate": 0.000195427954435973, + "loss": 3.0870513916015625, + "step": 4838, + "token_acc": 0.28868300509821176 + }, + { + "epoch": 2.8367047786572854, + "grad_norm": 0.43809277582392286, + "learning_rate": 0.00019542505687191358, + "loss": 3.0553090572357178, + "step": 4839, + "token_acc": 0.29217339728712155 + }, + { + "epoch": 2.8372911169744945, + "grad_norm": 0.3376205603932334, + "learning_rate": 0.0001954221584114644, + "loss": 3.074392557144165, + "step": 4840, + "token_acc": 0.28855033528874713 + }, + { + "epoch": 2.837877455291703, + "grad_norm": 0.413718309111033, + "learning_rate": 0.0001954192590546526, + "loss": 3.065587282180786, + "step": 4841, + "token_acc": 0.28989833452806396 + }, + { + "epoch": 2.8384637936089123, + "grad_norm": 0.3795802555153408, + "learning_rate": 0.00019541635880150545, + "loss": 3.066976547241211, + "step": 4842, + "token_acc": 0.29106589305379943 + }, + { + "epoch": 2.8390501319261214, + "grad_norm": 0.3247764503371684, + "learning_rate": 0.00019541345765205015, + "loss": 3.0576696395874023, + "step": 4843, + "token_acc": 0.2903086962202711 + }, + { + "epoch": 2.8396364702433305, + "grad_norm": 0.4234376828592185, + "learning_rate": 0.000195410555606314, + "loss": 3.0026135444641113, + "step": 4844, + "token_acc": 0.2998630354012603 + }, + { + "epoch": 2.840222808560539, + "grad_norm": 0.3391443912458567, + "learning_rate": 0.00019540765266432426, + "loss": 3.068728446960449, + "step": 4845, + "token_acc": 0.2889766071419453 + }, + { + "epoch": 2.8408091468777483, + "grad_norm": 0.3663031667230038, + "learning_rate": 0.0001954047488261082, + "loss": 3.0565459728240967, + "step": 4846, + "token_acc": 0.2918244356064417 + }, + { + "epoch": 2.8413954851949574, + "grad_norm": 0.38458118818323517, + "learning_rate": 0.00019540184409169308, + "loss": 3.0283827781677246, + "step": 4847, + "token_acc": 0.29816596964184844 + }, + { + "epoch": 2.8419818235121665, + "grad_norm": 0.4213115324657733, + "learning_rate": 0.00019539893846110616, + "loss": 3.101609230041504, + "step": 4848, + "token_acc": 0.2860203744249679 + }, + { + "epoch": 2.8425681618293757, + "grad_norm": 0.3537049660591789, + "learning_rate": 0.0001953960319343748, + "loss": 3.1010212898254395, + "step": 4849, + "token_acc": 0.2846537919532161 + }, + { + "epoch": 2.8431545001465848, + "grad_norm": 0.381323900636401, + "learning_rate": 0.00019539312451152623, + "loss": 3.1073460578918457, + "step": 4850, + "token_acc": 0.28463405526512325 + }, + { + "epoch": 2.843740838463794, + "grad_norm": 0.3834835020609835, + "learning_rate": 0.00019539021619258782, + "loss": 3.0617198944091797, + "step": 4851, + "token_acc": 0.2928236820929369 + }, + { + "epoch": 2.8443271767810026, + "grad_norm": 0.4026338214935844, + "learning_rate": 0.00019538730697758687, + "loss": 3.08662748336792, + "step": 4852, + "token_acc": 0.28897779709329824 + }, + { + "epoch": 2.8449135150982117, + "grad_norm": 0.29185528775253, + "learning_rate": 0.0001953843968665507, + "loss": 3.0339646339416504, + "step": 4853, + "token_acc": 0.29589362239822775 + }, + { + "epoch": 2.8454998534154208, + "grad_norm": 0.4035163244853389, + "learning_rate": 0.00019538148585950666, + "loss": 3.072492837905884, + "step": 4854, + "token_acc": 0.28918785014672815 + }, + { + "epoch": 2.84608619173263, + "grad_norm": 0.28911351690481685, + "learning_rate": 0.0001953785739564821, + "loss": 3.093196392059326, + "step": 4855, + "token_acc": 0.2887643048261868 + }, + { + "epoch": 2.8466725300498386, + "grad_norm": 0.3287491819913607, + "learning_rate": 0.00019537566115750433, + "loss": 3.060621500015259, + "step": 4856, + "token_acc": 0.2912318012944219 + }, + { + "epoch": 2.8472588683670477, + "grad_norm": 0.3031585022389628, + "learning_rate": 0.00019537274746260077, + "loss": 3.0571258068084717, + "step": 4857, + "token_acc": 0.2920591592112806 + }, + { + "epoch": 2.847845206684257, + "grad_norm": 0.29596368297161246, + "learning_rate": 0.00019536983287179877, + "loss": 3.103498935699463, + "step": 4858, + "token_acc": 0.28440152299369487 + }, + { + "epoch": 2.848431545001466, + "grad_norm": 0.30920269739266804, + "learning_rate": 0.00019536691738512567, + "loss": 3.069467544555664, + "step": 4859, + "token_acc": 0.29090645894332035 + }, + { + "epoch": 2.849017883318675, + "grad_norm": 0.2717254454120263, + "learning_rate": 0.00019536400100260892, + "loss": 3.079941987991333, + "step": 4860, + "token_acc": 0.2911455223591263 + }, + { + "epoch": 2.849604221635884, + "grad_norm": 0.33628052562924493, + "learning_rate": 0.00019536108372427587, + "loss": 3.0555896759033203, + "step": 4861, + "token_acc": 0.290561073472341 + }, + { + "epoch": 2.850190559953093, + "grad_norm": 0.28501255791407487, + "learning_rate": 0.00019535816555015396, + "loss": 3.0917677879333496, + "step": 4862, + "token_acc": 0.2867813115093753 + }, + { + "epoch": 2.850776898270302, + "grad_norm": 0.3054068772088216, + "learning_rate": 0.00019535524648027055, + "loss": 3.0719780921936035, + "step": 4863, + "token_acc": 0.2909968498861029 + }, + { + "epoch": 2.851363236587511, + "grad_norm": 0.2985137496979471, + "learning_rate": 0.0001953523265146531, + "loss": 3.0235595703125, + "step": 4864, + "token_acc": 0.296651389974607 + }, + { + "epoch": 2.85194957490472, + "grad_norm": 0.2866215802214872, + "learning_rate": 0.00019534940565332906, + "loss": 3.054766893386841, + "step": 4865, + "token_acc": 0.2916961168018736 + }, + { + "epoch": 2.852535913221929, + "grad_norm": 0.24659156770918106, + "learning_rate": 0.00019534648389632578, + "loss": 3.0671842098236084, + "step": 4866, + "token_acc": 0.2920148172176907 + }, + { + "epoch": 2.853122251539138, + "grad_norm": 0.28445428017836805, + "learning_rate": 0.00019534356124367084, + "loss": 3.011157989501953, + "step": 4867, + "token_acc": 0.2980853068287034 + }, + { + "epoch": 2.853708589856347, + "grad_norm": 0.30445143180651935, + "learning_rate": 0.00019534063769539157, + "loss": 3.094839572906494, + "step": 4868, + "token_acc": 0.28696462776146087 + }, + { + "epoch": 2.854294928173556, + "grad_norm": 0.30367132114162815, + "learning_rate": 0.0001953377132515155, + "loss": 3.0809757709503174, + "step": 4869, + "token_acc": 0.28882139858954975 + }, + { + "epoch": 2.8548812664907652, + "grad_norm": 0.27759814970319696, + "learning_rate": 0.00019533478791207008, + "loss": 3.130516529083252, + "step": 4870, + "token_acc": 0.2818313419058565 + }, + { + "epoch": 2.8554676048079743, + "grad_norm": 0.29533898152526294, + "learning_rate": 0.00019533186167708277, + "loss": 3.101151466369629, + "step": 4871, + "token_acc": 0.28588318813882724 + }, + { + "epoch": 2.8560539431251835, + "grad_norm": 0.28787873888458543, + "learning_rate": 0.0001953289345465811, + "loss": 3.043135643005371, + "step": 4872, + "token_acc": 0.29304473538295606 + }, + { + "epoch": 2.856640281442392, + "grad_norm": 0.3224552936060447, + "learning_rate": 0.00019532600652059256, + "loss": 3.0701546669006348, + "step": 4873, + "token_acc": 0.2900308031035979 + }, + { + "epoch": 2.8572266197596012, + "grad_norm": 0.35021669424150276, + "learning_rate": 0.00019532307759914463, + "loss": 3.0508546829223633, + "step": 4874, + "token_acc": 0.29209976836406165 + }, + { + "epoch": 2.8578129580768104, + "grad_norm": 0.2601008953817544, + "learning_rate": 0.00019532014778226483, + "loss": 3.061588764190674, + "step": 4875, + "token_acc": 0.2907599404545861 + }, + { + "epoch": 2.8583992963940195, + "grad_norm": 0.36007255118109815, + "learning_rate": 0.0001953172170699807, + "loss": 3.061830997467041, + "step": 4876, + "token_acc": 0.29151114184880444 + }, + { + "epoch": 2.858985634711228, + "grad_norm": 0.3268574077839835, + "learning_rate": 0.00019531428546231972, + "loss": 3.0864903926849365, + "step": 4877, + "token_acc": 0.2878912831202473 + }, + { + "epoch": 2.8595719730284372, + "grad_norm": 0.3666618062626624, + "learning_rate": 0.00019531135295930953, + "loss": 3.0952935218811035, + "step": 4878, + "token_acc": 0.28681671687894156 + }, + { + "epoch": 2.8601583113456464, + "grad_norm": 0.2923946530845022, + "learning_rate": 0.00019530841956097756, + "loss": 3.061192035675049, + "step": 4879, + "token_acc": 0.2905892146982791 + }, + { + "epoch": 2.8607446496628555, + "grad_norm": 0.36756679019953775, + "learning_rate": 0.00019530548526735145, + "loss": 3.061936855316162, + "step": 4880, + "token_acc": 0.29129293371253556 + }, + { + "epoch": 2.8613309879800646, + "grad_norm": 0.31838698203652577, + "learning_rate": 0.0001953025500784587, + "loss": 3.118236541748047, + "step": 4881, + "token_acc": 0.28292039035494476 + }, + { + "epoch": 2.8619173262972737, + "grad_norm": 0.29507769731151284, + "learning_rate": 0.00019529961399432694, + "loss": 3.072221517562866, + "step": 4882, + "token_acc": 0.2905912732569688 + }, + { + "epoch": 2.862503664614483, + "grad_norm": 0.31811885578532173, + "learning_rate": 0.00019529667701498373, + "loss": 3.059858798980713, + "step": 4883, + "token_acc": 0.29262427393011936 + }, + { + "epoch": 2.8630900029316915, + "grad_norm": 0.35718945232677896, + "learning_rate": 0.0001952937391404566, + "loss": 3.08382511138916, + "step": 4884, + "token_acc": 0.28940702330568835 + }, + { + "epoch": 2.8636763412489006, + "grad_norm": 0.30314472896270445, + "learning_rate": 0.00019529080037077327, + "loss": 3.0534591674804688, + "step": 4885, + "token_acc": 0.2919510403791479 + }, + { + "epoch": 2.8642626795661097, + "grad_norm": 0.3514504752727815, + "learning_rate": 0.00019528786070596124, + "loss": 3.0765669345855713, + "step": 4886, + "token_acc": 0.28895427566556614 + }, + { + "epoch": 2.864849017883319, + "grad_norm": 0.4280942191060666, + "learning_rate": 0.0001952849201460482, + "loss": 3.042485237121582, + "step": 4887, + "token_acc": 0.2937603972823868 + }, + { + "epoch": 2.8654353562005275, + "grad_norm": 0.35443377793076597, + "learning_rate": 0.00019528197869106165, + "loss": 3.051652431488037, + "step": 4888, + "token_acc": 0.2934732877222476 + }, + { + "epoch": 2.8660216945177366, + "grad_norm": 0.3033428252202999, + "learning_rate": 0.0001952790363410294, + "loss": 3.0494492053985596, + "step": 4889, + "token_acc": 0.2940113398539273 + }, + { + "epoch": 2.8666080328349457, + "grad_norm": 0.37533714897006115, + "learning_rate": 0.00019527609309597893, + "loss": 3.1051902770996094, + "step": 4890, + "token_acc": 0.2859208554213232 + }, + { + "epoch": 2.867194371152155, + "grad_norm": 0.26587952960690314, + "learning_rate": 0.00019527314895593796, + "loss": 3.0404930114746094, + "step": 4891, + "token_acc": 0.29563901073068916 + }, + { + "epoch": 2.867780709469364, + "grad_norm": 0.34869614202683685, + "learning_rate": 0.00019527020392093417, + "loss": 3.031594753265381, + "step": 4892, + "token_acc": 0.29602170693022584 + }, + { + "epoch": 2.868367047786573, + "grad_norm": 0.3144740185604402, + "learning_rate": 0.00019526725799099516, + "loss": 3.0938291549682617, + "step": 4893, + "token_acc": 0.2881541375775532 + }, + { + "epoch": 2.868953386103782, + "grad_norm": 0.3226685545681531, + "learning_rate": 0.00019526431116614863, + "loss": 3.060157060623169, + "step": 4894, + "token_acc": 0.2908762123118127 + }, + { + "epoch": 2.869539724420991, + "grad_norm": 0.2919388821578318, + "learning_rate": 0.0001952613634464223, + "loss": 3.099207878112793, + "step": 4895, + "token_acc": 0.2872777432926409 + }, + { + "epoch": 2.8701260627382, + "grad_norm": 0.3223641289830053, + "learning_rate": 0.00019525841483184378, + "loss": 3.011539936065674, + "step": 4896, + "token_acc": 0.29779659992801577 + }, + { + "epoch": 2.870712401055409, + "grad_norm": 0.3255575485130732, + "learning_rate": 0.00019525546532244084, + "loss": 3.0442047119140625, + "step": 4897, + "token_acc": 0.29523556693786407 + }, + { + "epoch": 2.871298739372618, + "grad_norm": 0.3284871722886573, + "learning_rate": 0.0001952525149182412, + "loss": 3.055866241455078, + "step": 4898, + "token_acc": 0.2915886285875718 + }, + { + "epoch": 2.871885077689827, + "grad_norm": 0.3227825198785674, + "learning_rate": 0.00019524956361927247, + "loss": 3.06754994392395, + "step": 4899, + "token_acc": 0.2899600570396618 + }, + { + "epoch": 2.872471416007036, + "grad_norm": 0.30161036823209014, + "learning_rate": 0.0001952466114255625, + "loss": 3.0585498809814453, + "step": 4900, + "token_acc": 0.2912838015227094 + }, + { + "epoch": 2.873057754324245, + "grad_norm": 0.3018326401099606, + "learning_rate": 0.0001952436583371389, + "loss": 3.045538902282715, + "step": 4901, + "token_acc": 0.29375840204788595 + }, + { + "epoch": 2.873644092641454, + "grad_norm": 0.2875467379991328, + "learning_rate": 0.00019524070435402954, + "loss": 3.0933055877685547, + "step": 4902, + "token_acc": 0.2868314807846806 + }, + { + "epoch": 2.8742304309586633, + "grad_norm": 0.2966062176744263, + "learning_rate": 0.00019523774947626205, + "loss": 3.0361576080322266, + "step": 4903, + "token_acc": 0.29406529499450806 + }, + { + "epoch": 2.8748167692758724, + "grad_norm": 0.30411610714541404, + "learning_rate": 0.00019523479370386426, + "loss": 3.090198040008545, + "step": 4904, + "token_acc": 0.28629630216001983 + }, + { + "epoch": 2.875403107593081, + "grad_norm": 0.28227136568442396, + "learning_rate": 0.0001952318370368639, + "loss": 3.0773372650146484, + "step": 4905, + "token_acc": 0.2884729404570411 + }, + { + "epoch": 2.87598944591029, + "grad_norm": 0.33037340078595684, + "learning_rate": 0.00019522887947528877, + "loss": 3.0895557403564453, + "step": 4906, + "token_acc": 0.2874898352918666 + }, + { + "epoch": 2.8765757842274993, + "grad_norm": 0.31841402243589145, + "learning_rate": 0.00019522592101916663, + "loss": 3.0663156509399414, + "step": 4907, + "token_acc": 0.2916984224011237 + }, + { + "epoch": 2.8771621225447084, + "grad_norm": 0.3086965140967257, + "learning_rate": 0.0001952229616685253, + "loss": 3.0425424575805664, + "step": 4908, + "token_acc": 0.2942559235868207 + }, + { + "epoch": 2.8777484608619175, + "grad_norm": 0.28716991918330004, + "learning_rate": 0.00019522000142339254, + "loss": 3.0544233322143555, + "step": 4909, + "token_acc": 0.2911381364168764 + }, + { + "epoch": 2.878334799179126, + "grad_norm": 0.28715482071723203, + "learning_rate": 0.00019521704028379618, + "loss": 3.0353474617004395, + "step": 4910, + "token_acc": 0.29551806468363256 + }, + { + "epoch": 2.8789211374963353, + "grad_norm": 0.3197001284280265, + "learning_rate": 0.00019521407824976404, + "loss": 3.086859703063965, + "step": 4911, + "token_acc": 0.2868983155202627 + }, + { + "epoch": 2.8795074758135444, + "grad_norm": 0.306877706993746, + "learning_rate": 0.00019521111532132395, + "loss": 3.0582661628723145, + "step": 4912, + "token_acc": 0.29250594903416466 + }, + { + "epoch": 2.8800938141307535, + "grad_norm": 0.32556874648938794, + "learning_rate": 0.0001952081514985037, + "loss": 3.070643901824951, + "step": 4913, + "token_acc": 0.29001512272674396 + }, + { + "epoch": 2.8806801524479626, + "grad_norm": 0.36671361547295483, + "learning_rate": 0.0001952051867813312, + "loss": 3.057034492492676, + "step": 4914, + "token_acc": 0.2897786822691427 + }, + { + "epoch": 2.8812664907651717, + "grad_norm": 0.3048496487672765, + "learning_rate": 0.00019520222116983422, + "loss": 3.0235021114349365, + "step": 4915, + "token_acc": 0.29852030193280393 + }, + { + "epoch": 2.8818528290823804, + "grad_norm": 0.2638143710500929, + "learning_rate": 0.00019519925466404068, + "loss": 3.0681281089782715, + "step": 4916, + "token_acc": 0.2916421432134477 + }, + { + "epoch": 2.8824391673995895, + "grad_norm": 0.2982293147052228, + "learning_rate": 0.0001951962872639784, + "loss": 3.0866174697875977, + "step": 4917, + "token_acc": 0.28765797049066055 + }, + { + "epoch": 2.8830255057167986, + "grad_norm": 0.3540207206095341, + "learning_rate": 0.0001951933189696753, + "loss": 3.0931448936462402, + "step": 4918, + "token_acc": 0.2883862402306779 + }, + { + "epoch": 2.8836118440340077, + "grad_norm": 0.31971507281384876, + "learning_rate": 0.00019519034978115926, + "loss": 3.0571348667144775, + "step": 4919, + "token_acc": 0.2932009103208147 + }, + { + "epoch": 2.8841981823512164, + "grad_norm": 0.27405814727918415, + "learning_rate": 0.00019518737969845813, + "loss": 3.041811943054199, + "step": 4920, + "token_acc": 0.29337055352582153 + }, + { + "epoch": 2.8847845206684255, + "grad_norm": 0.36428684911126924, + "learning_rate": 0.00019518440872159985, + "loss": 3.085550308227539, + "step": 4921, + "token_acc": 0.29055522320903576 + }, + { + "epoch": 2.8853708589856346, + "grad_norm": 0.2700458102631939, + "learning_rate": 0.0001951814368506123, + "loss": 3.027010917663574, + "step": 4922, + "token_acc": 0.2973262750444045 + }, + { + "epoch": 2.8859571973028437, + "grad_norm": 0.2775340947246322, + "learning_rate": 0.00019517846408552344, + "loss": 3.061241865158081, + "step": 4923, + "token_acc": 0.29162124362526387 + }, + { + "epoch": 2.886543535620053, + "grad_norm": 0.25164059915431164, + "learning_rate": 0.0001951754904263611, + "loss": 3.051241159439087, + "step": 4924, + "token_acc": 0.2936431476893905 + }, + { + "epoch": 2.887129873937262, + "grad_norm": 0.28960076649361527, + "learning_rate": 0.00019517251587315333, + "loss": 3.048980474472046, + "step": 4925, + "token_acc": 0.29265462372831214 + }, + { + "epoch": 2.887716212254471, + "grad_norm": 0.30774919758035363, + "learning_rate": 0.000195169540425928, + "loss": 3.069098472595215, + "step": 4926, + "token_acc": 0.29126203282379115 + }, + { + "epoch": 2.8883025505716797, + "grad_norm": 0.2542348221507696, + "learning_rate": 0.00019516656408471308, + "loss": 3.070157289505005, + "step": 4927, + "token_acc": 0.29073473255046084 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.30542715171935125, + "learning_rate": 0.00019516358684953654, + "loss": 3.052456855773926, + "step": 4928, + "token_acc": 0.29356946773846526 + }, + { + "epoch": 2.889475227206098, + "grad_norm": 0.30027747246176606, + "learning_rate": 0.00019516060872042633, + "loss": 3.0504140853881836, + "step": 4929, + "token_acc": 0.2912003191236374 + }, + { + "epoch": 2.890061565523307, + "grad_norm": 0.3218139644405959, + "learning_rate": 0.00019515762969741043, + "loss": 3.042534112930298, + "step": 4930, + "token_acc": 0.2941679956501266 + }, + { + "epoch": 2.8906479038405157, + "grad_norm": 0.35429420847453924, + "learning_rate": 0.00019515464978051684, + "loss": 3.0787360668182373, + "step": 4931, + "token_acc": 0.2892535280844871 + }, + { + "epoch": 2.891234242157725, + "grad_norm": 0.3075804671801152, + "learning_rate": 0.00019515166896977353, + "loss": 3.075737953186035, + "step": 4932, + "token_acc": 0.28799967496686185 + }, + { + "epoch": 2.891820580474934, + "grad_norm": 0.3109209487414417, + "learning_rate": 0.0001951486872652085, + "loss": 3.0373785495758057, + "step": 4933, + "token_acc": 0.2958004147090475 + }, + { + "epoch": 2.892406918792143, + "grad_norm": 0.40040084121628716, + "learning_rate": 0.00019514570466684975, + "loss": 3.091073989868164, + "step": 4934, + "token_acc": 0.28876125636003774 + }, + { + "epoch": 2.892993257109352, + "grad_norm": 0.3575109866455189, + "learning_rate": 0.00019514272117472536, + "loss": 3.065265655517578, + "step": 4935, + "token_acc": 0.29073600591763554 + }, + { + "epoch": 2.8935795954265613, + "grad_norm": 0.28472537333202524, + "learning_rate": 0.0001951397367888633, + "loss": 3.0574464797973633, + "step": 4936, + "token_acc": 0.2931033112266933 + }, + { + "epoch": 2.8941659337437704, + "grad_norm": 0.3784304959634468, + "learning_rate": 0.0001951367515092916, + "loss": 3.0993096828460693, + "step": 4937, + "token_acc": 0.28689739070761267 + }, + { + "epoch": 2.894752272060979, + "grad_norm": 0.32214704026602126, + "learning_rate": 0.00019513376533603834, + "loss": 3.0546460151672363, + "step": 4938, + "token_acc": 0.29124363346357707 + }, + { + "epoch": 2.895338610378188, + "grad_norm": 0.37367857818457406, + "learning_rate": 0.0001951307782691315, + "loss": 3.0816681385040283, + "step": 4939, + "token_acc": 0.2890486828017395 + }, + { + "epoch": 2.8959249486953973, + "grad_norm": 0.34377161438503284, + "learning_rate": 0.00019512779030859923, + "loss": 3.0321311950683594, + "step": 4940, + "token_acc": 0.29831264256548357 + }, + { + "epoch": 2.8965112870126064, + "grad_norm": 0.30743489572237015, + "learning_rate": 0.00019512480145446955, + "loss": 3.0444631576538086, + "step": 4941, + "token_acc": 0.2953198061684297 + }, + { + "epoch": 2.897097625329815, + "grad_norm": 0.345661441854576, + "learning_rate": 0.00019512181170677054, + "loss": 3.0745506286621094, + "step": 4942, + "token_acc": 0.2908424560486967 + }, + { + "epoch": 2.897683963647024, + "grad_norm": 0.28453720045887443, + "learning_rate": 0.00019511882106553027, + "loss": 3.0129456520080566, + "step": 4943, + "token_acc": 0.298100760931167 + }, + { + "epoch": 2.8982703019642333, + "grad_norm": 0.33390322793680943, + "learning_rate": 0.00019511582953077688, + "loss": 3.0728859901428223, + "step": 4944, + "token_acc": 0.2920691026610161 + }, + { + "epoch": 2.8988566402814424, + "grad_norm": 0.2989739154637284, + "learning_rate": 0.00019511283710253844, + "loss": 3.0774335861206055, + "step": 4945, + "token_acc": 0.28968851320055583 + }, + { + "epoch": 2.8994429785986515, + "grad_norm": 0.3056859449014606, + "learning_rate": 0.00019510984378084303, + "loss": 3.0677621364593506, + "step": 4946, + "token_acc": 0.2913852132876839 + }, + { + "epoch": 2.9000293169158606, + "grad_norm": 0.31106774588008657, + "learning_rate": 0.0001951068495657188, + "loss": 3.0612406730651855, + "step": 4947, + "token_acc": 0.2904933265799153 + }, + { + "epoch": 2.9006156552330697, + "grad_norm": 0.3431997253882104, + "learning_rate": 0.00019510385445719393, + "loss": 3.0397167205810547, + "step": 4948, + "token_acc": 0.29528332380218303 + }, + { + "epoch": 2.9012019935502784, + "grad_norm": 0.35228440865085725, + "learning_rate": 0.00019510085845529646, + "loss": 3.114530563354492, + "step": 4949, + "token_acc": 0.28503666512518994 + }, + { + "epoch": 2.9017883318674875, + "grad_norm": 0.3439863715588709, + "learning_rate": 0.0001950978615600546, + "loss": 3.1159915924072266, + "step": 4950, + "token_acc": 0.28478790405758303 + }, + { + "epoch": 2.9023746701846966, + "grad_norm": 0.2871610711295451, + "learning_rate": 0.00019509486377149643, + "loss": 3.0912458896636963, + "step": 4951, + "token_acc": 0.2878681893324321 + }, + { + "epoch": 2.9029610085019057, + "grad_norm": 0.26930918905170254, + "learning_rate": 0.0001950918650896502, + "loss": 3.057864189147949, + "step": 4952, + "token_acc": 0.2929889837309404 + }, + { + "epoch": 2.9035473468191144, + "grad_norm": 0.2974530225849852, + "learning_rate": 0.00019508886551454401, + "loss": 3.09619140625, + "step": 4953, + "token_acc": 0.2869263498501634 + }, + { + "epoch": 2.9041336851363235, + "grad_norm": 0.3568310165975505, + "learning_rate": 0.00019508586504620606, + "loss": 3.0311264991760254, + "step": 4954, + "token_acc": 0.29708723152446814 + }, + { + "epoch": 2.9047200234535326, + "grad_norm": 0.3554506512897541, + "learning_rate": 0.00019508286368466457, + "loss": 3.0833358764648438, + "step": 4955, + "token_acc": 0.28782082762542327 + }, + { + "epoch": 2.9053063617707418, + "grad_norm": 0.28909144020956834, + "learning_rate": 0.00019507986142994769, + "loss": 3.087094783782959, + "step": 4956, + "token_acc": 0.28886779879008523 + }, + { + "epoch": 2.905892700087951, + "grad_norm": 0.3578905680315287, + "learning_rate": 0.00019507685828208362, + "loss": 3.0739173889160156, + "step": 4957, + "token_acc": 0.2886727387596083 + }, + { + "epoch": 2.90647903840516, + "grad_norm": 0.37917783175413816, + "learning_rate": 0.00019507385424110058, + "loss": 3.082059144973755, + "step": 4958, + "token_acc": 0.28770687905213665 + }, + { + "epoch": 2.9070653767223686, + "grad_norm": 0.37415863577809955, + "learning_rate": 0.0001950708493070268, + "loss": 3.061110496520996, + "step": 4959, + "token_acc": 0.2916764963937051 + }, + { + "epoch": 2.9076517150395778, + "grad_norm": 0.27267786356567114, + "learning_rate": 0.0001950678434798905, + "loss": 3.092258930206299, + "step": 4960, + "token_acc": 0.2866734228527996 + }, + { + "epoch": 2.908238053356787, + "grad_norm": 0.3462552538764763, + "learning_rate": 0.00019506483675971992, + "loss": 3.0762271881103516, + "step": 4961, + "token_acc": 0.2893115142997401 + }, + { + "epoch": 2.908824391673996, + "grad_norm": 0.3393021951700508, + "learning_rate": 0.00019506182914654332, + "loss": 3.097916841506958, + "step": 4962, + "token_acc": 0.2852461936180077 + }, + { + "epoch": 2.909410729991205, + "grad_norm": 0.26711967742588283, + "learning_rate": 0.00019505882064038888, + "loss": 3.0462584495544434, + "step": 4963, + "token_acc": 0.2939512233044212 + }, + { + "epoch": 2.9099970683084138, + "grad_norm": 0.3123835303399173, + "learning_rate": 0.00019505581124128497, + "loss": 3.0573606491088867, + "step": 4964, + "token_acc": 0.2918963629969737 + }, + { + "epoch": 2.910583406625623, + "grad_norm": 0.30336828617282463, + "learning_rate": 0.00019505280094925977, + "loss": 3.061286449432373, + "step": 4965, + "token_acc": 0.2926676760734421 + }, + { + "epoch": 2.911169744942832, + "grad_norm": 0.30746302459703834, + "learning_rate": 0.0001950497897643416, + "loss": 3.0631346702575684, + "step": 4966, + "token_acc": 0.29078807715031135 + }, + { + "epoch": 2.911756083260041, + "grad_norm": 0.265751204133962, + "learning_rate": 0.00019504677768655872, + "loss": 3.035701274871826, + "step": 4967, + "token_acc": 0.29216665782285595 + }, + { + "epoch": 2.91234242157725, + "grad_norm": 0.30833498572244683, + "learning_rate": 0.00019504376471593947, + "loss": 3.0718464851379395, + "step": 4968, + "token_acc": 0.28913336049047866 + }, + { + "epoch": 2.9129287598944593, + "grad_norm": 0.291196404745855, + "learning_rate": 0.0001950407508525121, + "loss": 3.085510730743408, + "step": 4969, + "token_acc": 0.2885935298652277 + }, + { + "epoch": 2.913515098211668, + "grad_norm": 0.2515015882422125, + "learning_rate": 0.00019503773609630499, + "loss": 3.0286383628845215, + "step": 4970, + "token_acc": 0.29546393585807607 + }, + { + "epoch": 2.914101436528877, + "grad_norm": 0.3035115624745694, + "learning_rate": 0.00019503472044734633, + "loss": 3.0399529933929443, + "step": 4971, + "token_acc": 0.2947626763997923 + }, + { + "epoch": 2.914687774846086, + "grad_norm": 0.4358318558998412, + "learning_rate": 0.0001950317039056646, + "loss": 3.006024122238159, + "step": 4972, + "token_acc": 0.2992748846407383 + }, + { + "epoch": 2.9152741131632953, + "grad_norm": 0.41211619124687177, + "learning_rate": 0.00019502868647128805, + "loss": 3.0472846031188965, + "step": 4973, + "token_acc": 0.29414905003660313 + }, + { + "epoch": 2.915860451480504, + "grad_norm": 0.33289942951443097, + "learning_rate": 0.00019502566814424505, + "loss": 3.0885910987854004, + "step": 4974, + "token_acc": 0.2887135983324472 + }, + { + "epoch": 2.916446789797713, + "grad_norm": 0.33744657897442476, + "learning_rate": 0.00019502264892456395, + "loss": 3.0814619064331055, + "step": 4975, + "token_acc": 0.28835315324661853 + }, + { + "epoch": 2.917033128114922, + "grad_norm": 0.28690395825190873, + "learning_rate": 0.00019501962881227308, + "loss": 3.0762784481048584, + "step": 4976, + "token_acc": 0.2885173023490781 + }, + { + "epoch": 2.9176194664321313, + "grad_norm": 0.36021876624438814, + "learning_rate": 0.00019501660780740085, + "loss": 3.0600881576538086, + "step": 4977, + "token_acc": 0.29150309437169647 + }, + { + "epoch": 2.9182058047493404, + "grad_norm": 0.30353187066328163, + "learning_rate": 0.00019501358590997563, + "loss": 3.0010123252868652, + "step": 4978, + "token_acc": 0.30099704091825774 + }, + { + "epoch": 2.9187921430665495, + "grad_norm": 0.3269541427919076, + "learning_rate": 0.0001950105631200258, + "loss": 3.0445008277893066, + "step": 4979, + "token_acc": 0.2922260121766416 + }, + { + "epoch": 2.9193784813837587, + "grad_norm": 0.34086756024199444, + "learning_rate": 0.00019500753943757975, + "loss": 3.034757614135742, + "step": 4980, + "token_acc": 0.2958107931135221 + }, + { + "epoch": 2.9199648197009673, + "grad_norm": 0.32390192531144235, + "learning_rate": 0.00019500451486266593, + "loss": 3.070679187774658, + "step": 4981, + "token_acc": 0.2914003932653312 + }, + { + "epoch": 2.9205511580181764, + "grad_norm": 0.36074936740183655, + "learning_rate": 0.0001950014893953127, + "loss": 3.053281784057617, + "step": 4982, + "token_acc": 0.2919729868353565 + }, + { + "epoch": 2.9211374963353856, + "grad_norm": 0.2739619090305267, + "learning_rate": 0.00019499846303554845, + "loss": 3.0226783752441406, + "step": 4983, + "token_acc": 0.2965813365354875 + }, + { + "epoch": 2.9217238346525947, + "grad_norm": 0.43881135421249245, + "learning_rate": 0.0001949954357834017, + "loss": 3.064603805541992, + "step": 4984, + "token_acc": 0.2913822130718987 + }, + { + "epoch": 2.9223101729698033, + "grad_norm": 0.3433560664286759, + "learning_rate": 0.0001949924076389008, + "loss": 3.0674147605895996, + "step": 4985, + "token_acc": 0.29210365103384 + }, + { + "epoch": 2.9228965112870124, + "grad_norm": 0.3291007238750204, + "learning_rate": 0.00019498937860207426, + "loss": 3.0799214839935303, + "step": 4986, + "token_acc": 0.2903882085378998 + }, + { + "epoch": 2.9234828496042216, + "grad_norm": 0.4141460207208036, + "learning_rate": 0.0001949863486729505, + "loss": 3.097292900085449, + "step": 4987, + "token_acc": 0.2853096206524446 + }, + { + "epoch": 2.9240691879214307, + "grad_norm": 0.3837779918947824, + "learning_rate": 0.000194983317851558, + "loss": 3.061333656311035, + "step": 4988, + "token_acc": 0.29114903311343077 + }, + { + "epoch": 2.92465552623864, + "grad_norm": 0.28843549687321396, + "learning_rate": 0.0001949802861379252, + "loss": 3.0804803371429443, + "step": 4989, + "token_acc": 0.28959805897801616 + }, + { + "epoch": 2.925241864555849, + "grad_norm": 0.34539893993096815, + "learning_rate": 0.00019497725353208062, + "loss": 3.0722172260284424, + "step": 4990, + "token_acc": 0.28805101818099715 + }, + { + "epoch": 2.925828202873058, + "grad_norm": 0.29860061980350566, + "learning_rate": 0.0001949742200340527, + "loss": 3.0547733306884766, + "step": 4991, + "token_acc": 0.29433025513588645 + }, + { + "epoch": 2.9264145411902667, + "grad_norm": 0.284183977617941, + "learning_rate": 0.00019497118564387001, + "loss": 3.0908350944519043, + "step": 4992, + "token_acc": 0.28759007384174856 + }, + { + "epoch": 2.927000879507476, + "grad_norm": 0.31597819335816807, + "learning_rate": 0.00019496815036156096, + "loss": 3.014202117919922, + "step": 4993, + "token_acc": 0.2978801276999319 + }, + { + "epoch": 2.927587217824685, + "grad_norm": 0.3237899926362453, + "learning_rate": 0.00019496511418715413, + "loss": 3.0479018688201904, + "step": 4994, + "token_acc": 0.29319137206257323 + }, + { + "epoch": 2.928173556141894, + "grad_norm": 0.3313212970970978, + "learning_rate": 0.00019496207712067803, + "loss": 3.073309898376465, + "step": 4995, + "token_acc": 0.28928840993484645 + }, + { + "epoch": 2.9287598944591027, + "grad_norm": 0.33032164822963594, + "learning_rate": 0.00019495903916216115, + "loss": 3.0593535900115967, + "step": 4996, + "token_acc": 0.2924629058940183 + }, + { + "epoch": 2.929346232776312, + "grad_norm": 0.3187986021205402, + "learning_rate": 0.00019495600031163205, + "loss": 3.0064120292663574, + "step": 4997, + "token_acc": 0.29896513334345426 + }, + { + "epoch": 2.929932571093521, + "grad_norm": 0.32373557724083457, + "learning_rate": 0.0001949529605691193, + "loss": 3.0537562370300293, + "step": 4998, + "token_acc": 0.2930048143534549 + }, + { + "epoch": 2.93051890941073, + "grad_norm": 0.29210231839052564, + "learning_rate": 0.00019494991993465144, + "loss": 3.0679166316986084, + "step": 4999, + "token_acc": 0.28981495535312324 + }, + { + "epoch": 2.931105247727939, + "grad_norm": 0.2850504935544495, + "learning_rate": 0.00019494687840825706, + "loss": 3.048098087310791, + "step": 5000, + "token_acc": 0.29239563440727734 + }, + { + "epoch": 2.9316915860451482, + "grad_norm": 0.3329465506136974, + "learning_rate": 0.00019494383598996464, + "loss": 3.0638246536254883, + "step": 5001, + "token_acc": 0.29252030057290623 + }, + { + "epoch": 2.9322779243623573, + "grad_norm": 0.303197221270058, + "learning_rate": 0.00019494079267980285, + "loss": 3.062152862548828, + "step": 5002, + "token_acc": 0.2916552067024923 + }, + { + "epoch": 2.932864262679566, + "grad_norm": 0.2963560862066697, + "learning_rate": 0.00019493774847780025, + "loss": 3.0843563079833984, + "step": 5003, + "token_acc": 0.28678526148345895 + }, + { + "epoch": 2.933450600996775, + "grad_norm": 0.26764886399433274, + "learning_rate": 0.0001949347033839854, + "loss": 3.060847282409668, + "step": 5004, + "token_acc": 0.290324684132002 + }, + { + "epoch": 2.9340369393139842, + "grad_norm": 0.3610937398613952, + "learning_rate": 0.00019493165739838697, + "loss": 3.061427354812622, + "step": 5005, + "token_acc": 0.2920216207299187 + }, + { + "epoch": 2.9346232776311933, + "grad_norm": 0.31418550079421076, + "learning_rate": 0.00019492861052103355, + "loss": 3.0480644702911377, + "step": 5006, + "token_acc": 0.2951172609275392 + }, + { + "epoch": 2.935209615948402, + "grad_norm": 0.3278359143463752, + "learning_rate": 0.00019492556275195371, + "loss": 3.105445384979248, + "step": 5007, + "token_acc": 0.2860137644330093 + }, + { + "epoch": 2.935795954265611, + "grad_norm": 0.3219034016040833, + "learning_rate": 0.00019492251409117617, + "loss": 3.0747108459472656, + "step": 5008, + "token_acc": 0.28919790461936185 + }, + { + "epoch": 2.9363822925828202, + "grad_norm": 0.2899224127215569, + "learning_rate": 0.0001949194645387295, + "loss": 3.0997304916381836, + "step": 5009, + "token_acc": 0.2857688496435602 + }, + { + "epoch": 2.9369686309000294, + "grad_norm": 0.3308220068886812, + "learning_rate": 0.00019491641409464237, + "loss": 3.0625200271606445, + "step": 5010, + "token_acc": 0.2905061745436784 + }, + { + "epoch": 2.9375549692172385, + "grad_norm": 0.27756265092410454, + "learning_rate": 0.00019491336275894342, + "loss": 3.0758347511291504, + "step": 5011, + "token_acc": 0.29020997669296045 + }, + { + "epoch": 2.9381413075344476, + "grad_norm": 0.27831303293579907, + "learning_rate": 0.0001949103105316613, + "loss": 3.0570194721221924, + "step": 5012, + "token_acc": 0.2916499981816198 + }, + { + "epoch": 2.9387276458516562, + "grad_norm": 0.39776774872097453, + "learning_rate": 0.00019490725741282475, + "loss": 3.0666885375976562, + "step": 5013, + "token_acc": 0.2909821121610218 + }, + { + "epoch": 2.9393139841688654, + "grad_norm": 0.28961615331830315, + "learning_rate": 0.0001949042034024624, + "loss": 3.0699684619903564, + "step": 5014, + "token_acc": 0.2905948440215713 + }, + { + "epoch": 2.9399003224860745, + "grad_norm": 0.35643917902424743, + "learning_rate": 0.00019490114850060294, + "loss": 3.0714263916015625, + "step": 5015, + "token_acc": 0.28959007195194403 + }, + { + "epoch": 2.9404866608032836, + "grad_norm": 0.33408277575780176, + "learning_rate": 0.00019489809270727503, + "loss": 3.1006455421447754, + "step": 5016, + "token_acc": 0.28470290320279623 + }, + { + "epoch": 2.9410729991204922, + "grad_norm": 0.2949549434916837, + "learning_rate": 0.00019489503602250748, + "loss": 3.0249931812286377, + "step": 5017, + "token_acc": 0.29810700677122814 + }, + { + "epoch": 2.9416593374377014, + "grad_norm": 0.3783333342199044, + "learning_rate": 0.00019489197844632888, + "loss": 3.0486960411071777, + "step": 5018, + "token_acc": 0.2921325680151058 + }, + { + "epoch": 2.9422456757549105, + "grad_norm": 0.3112857463269938, + "learning_rate": 0.00019488891997876805, + "loss": 3.0538716316223145, + "step": 5019, + "token_acc": 0.29226542528724075 + }, + { + "epoch": 2.9428320140721196, + "grad_norm": 0.33954124774377736, + "learning_rate": 0.00019488586061985368, + "loss": 3.079308032989502, + "step": 5020, + "token_acc": 0.2891891891891892 + }, + { + "epoch": 2.9434183523893287, + "grad_norm": 0.28972224156682785, + "learning_rate": 0.0001948828003696145, + "loss": 3.0692856311798096, + "step": 5021, + "token_acc": 0.2927767548723609 + }, + { + "epoch": 2.944004690706538, + "grad_norm": 0.35896931661555187, + "learning_rate": 0.00019487973922807926, + "loss": 3.0493698120117188, + "step": 5022, + "token_acc": 0.2933337573945678 + }, + { + "epoch": 2.944591029023747, + "grad_norm": 0.3452991960705216, + "learning_rate": 0.00019487667719527674, + "loss": 3.084245204925537, + "step": 5023, + "token_acc": 0.28967244641583173 + }, + { + "epoch": 2.9451773673409556, + "grad_norm": 0.36452231075163316, + "learning_rate": 0.00019487361427123569, + "loss": 3.063624858856201, + "step": 5024, + "token_acc": 0.293508327781479 + }, + { + "epoch": 2.9457637056581647, + "grad_norm": 0.28587299764911195, + "learning_rate": 0.00019487055045598487, + "loss": 3.0622692108154297, + "step": 5025, + "token_acc": 0.2903872186489029 + }, + { + "epoch": 2.946350043975374, + "grad_norm": 0.40573345242216996, + "learning_rate": 0.00019486748574955304, + "loss": 3.086578845977783, + "step": 5026, + "token_acc": 0.28955412222867305 + }, + { + "epoch": 2.946936382292583, + "grad_norm": 0.3180596990704223, + "learning_rate": 0.00019486442015196904, + "loss": 3.1424732208251953, + "step": 5027, + "token_acc": 0.2800487991877163 + }, + { + "epoch": 2.9475227206097916, + "grad_norm": 0.3461832212807444, + "learning_rate": 0.00019486135366326166, + "loss": 3.0806894302368164, + "step": 5028, + "token_acc": 0.28855204355035446 + }, + { + "epoch": 2.9481090589270007, + "grad_norm": 0.23505485145733143, + "learning_rate": 0.00019485828628345965, + "loss": 3.0355334281921387, + "step": 5029, + "token_acc": 0.29463172007458177 + }, + { + "epoch": 2.94869539724421, + "grad_norm": 0.3035267087257974, + "learning_rate": 0.0001948552180125919, + "loss": 3.058065891265869, + "step": 5030, + "token_acc": 0.29121044780856264 + }, + { + "epoch": 2.949281735561419, + "grad_norm": 0.2720137156926839, + "learning_rate": 0.0001948521488506872, + "loss": 3.0414891242980957, + "step": 5031, + "token_acc": 0.29394376643516956 + }, + { + "epoch": 2.949868073878628, + "grad_norm": 0.2900881447741355, + "learning_rate": 0.00019484907879777433, + "loss": 3.0601658821105957, + "step": 5032, + "token_acc": 0.2912491827503176 + }, + { + "epoch": 2.950454412195837, + "grad_norm": 0.3115283956790449, + "learning_rate": 0.00019484600785388222, + "loss": 3.1212158203125, + "step": 5033, + "token_acc": 0.2840862131629915 + }, + { + "epoch": 2.9510407505130463, + "grad_norm": 0.32101519084641617, + "learning_rate": 0.00019484293601903965, + "loss": 3.0383353233337402, + "step": 5034, + "token_acc": 0.29502749831615904 + }, + { + "epoch": 2.951627088830255, + "grad_norm": 0.327139323056121, + "learning_rate": 0.0001948398632932755, + "loss": 3.0773799419403076, + "step": 5035, + "token_acc": 0.2901628727259103 + }, + { + "epoch": 2.952213427147464, + "grad_norm": 0.32345577854914903, + "learning_rate": 0.00019483678967661865, + "loss": 3.0634474754333496, + "step": 5036, + "token_acc": 0.29101146210178097 + }, + { + "epoch": 2.952799765464673, + "grad_norm": 0.2794735094951863, + "learning_rate": 0.00019483371516909793, + "loss": 3.073577880859375, + "step": 5037, + "token_acc": 0.28845179247560904 + }, + { + "epoch": 2.9533861037818823, + "grad_norm": 0.3306208381665803, + "learning_rate": 0.00019483063977074227, + "loss": 3.0366334915161133, + "step": 5038, + "token_acc": 0.29502401691218777 + }, + { + "epoch": 2.953972442099091, + "grad_norm": 0.2834816971284872, + "learning_rate": 0.0001948275634815805, + "loss": 3.0630838871002197, + "step": 5039, + "token_acc": 0.2918791487981077 + }, + { + "epoch": 2.9545587804163, + "grad_norm": 0.37454056065945, + "learning_rate": 0.0001948244863016416, + "loss": 3.0759198665618896, + "step": 5040, + "token_acc": 0.29131172365702745 + }, + { + "epoch": 2.955145118733509, + "grad_norm": 0.3821083846045049, + "learning_rate": 0.00019482140823095438, + "loss": 3.118229866027832, + "step": 5041, + "token_acc": 0.28398220890293513 + }, + { + "epoch": 2.9557314570507183, + "grad_norm": 0.3811173427739997, + "learning_rate": 0.00019481832926954783, + "loss": 3.0890636444091797, + "step": 5042, + "token_acc": 0.2887295377637665 + }, + { + "epoch": 2.9563177953679274, + "grad_norm": 0.31037224595026536, + "learning_rate": 0.00019481524941745087, + "loss": 3.0643105506896973, + "step": 5043, + "token_acc": 0.29118336050855503 + }, + { + "epoch": 2.9569041336851365, + "grad_norm": 0.31979574438554376, + "learning_rate": 0.0001948121686746924, + "loss": 3.047090768814087, + "step": 5044, + "token_acc": 0.2938781676438642 + }, + { + "epoch": 2.9574904720023456, + "grad_norm": 0.300147036133735, + "learning_rate": 0.00019480908704130133, + "loss": 3.073992967605591, + "step": 5045, + "token_acc": 0.2889291955727374 + }, + { + "epoch": 2.9580768103195543, + "grad_norm": 0.3098519553080766, + "learning_rate": 0.00019480600451730667, + "loss": 3.0736918449401855, + "step": 5046, + "token_acc": 0.29125437701763424 + }, + { + "epoch": 2.9586631486367634, + "grad_norm": 0.2943943551142779, + "learning_rate": 0.00019480292110273732, + "loss": 3.0501723289489746, + "step": 5047, + "token_acc": 0.29159042862671486 + }, + { + "epoch": 2.9592494869539725, + "grad_norm": 0.2534114135452284, + "learning_rate": 0.0001947998367976223, + "loss": 3.1003289222717285, + "step": 5048, + "token_acc": 0.2876312198548982 + }, + { + "epoch": 2.9598358252711816, + "grad_norm": 0.326650918324217, + "learning_rate": 0.00019479675160199056, + "loss": 3.0861802101135254, + "step": 5049, + "token_acc": 0.28796822841762215 + }, + { + "epoch": 2.9604221635883903, + "grad_norm": 0.2614504042885108, + "learning_rate": 0.00019479366551587108, + "loss": 3.0408339500427246, + "step": 5050, + "token_acc": 0.29532137239743006 + }, + { + "epoch": 2.9610085019055994, + "grad_norm": 0.29156987513359794, + "learning_rate": 0.00019479057853929283, + "loss": 3.0684614181518555, + "step": 5051, + "token_acc": 0.2913058118391811 + }, + { + "epoch": 2.9615948402228085, + "grad_norm": 0.31009327474398224, + "learning_rate": 0.00019478749067228484, + "loss": 3.0783987045288086, + "step": 5052, + "token_acc": 0.29032592305783617 + }, + { + "epoch": 2.9621811785400176, + "grad_norm": 0.28356523304854436, + "learning_rate": 0.0001947844019148761, + "loss": 3.0161633491516113, + "step": 5053, + "token_acc": 0.29830786826593153 + }, + { + "epoch": 2.9627675168572267, + "grad_norm": 0.29066555233193414, + "learning_rate": 0.00019478131226709564, + "loss": 3.055677890777588, + "step": 5054, + "token_acc": 0.29240020805202327 + }, + { + "epoch": 2.963353855174436, + "grad_norm": 0.25840812363500565, + "learning_rate": 0.00019477822172897247, + "loss": 3.032134532928467, + "step": 5055, + "token_acc": 0.29633352339782265 + }, + { + "epoch": 2.963940193491645, + "grad_norm": 0.2707797261331279, + "learning_rate": 0.00019477513030053558, + "loss": 3.0665526390075684, + "step": 5056, + "token_acc": 0.29080377202769414 + }, + { + "epoch": 2.9645265318088536, + "grad_norm": 0.2605838223033037, + "learning_rate": 0.0001947720379818141, + "loss": 3.0407423973083496, + "step": 5057, + "token_acc": 0.2947788773188956 + }, + { + "epoch": 2.9651128701260627, + "grad_norm": 0.3205731368842676, + "learning_rate": 0.000194768944772837, + "loss": 3.021084785461426, + "step": 5058, + "token_acc": 0.2978306186623326 + }, + { + "epoch": 2.965699208443272, + "grad_norm": 0.34301685824444256, + "learning_rate": 0.0001947658506736334, + "loss": 3.0289154052734375, + "step": 5059, + "token_acc": 0.2946764699098693 + }, + { + "epoch": 2.966285546760481, + "grad_norm": 0.27033898795538125, + "learning_rate": 0.00019476275568423233, + "loss": 3.116395950317383, + "step": 5060, + "token_acc": 0.2839116387248866 + }, + { + "epoch": 2.9668718850776896, + "grad_norm": 0.2820350415568705, + "learning_rate": 0.0001947596598046628, + "loss": 3.0663981437683105, + "step": 5061, + "token_acc": 0.2906437908842085 + }, + { + "epoch": 2.9674582233948987, + "grad_norm": 0.32115416307914063, + "learning_rate": 0.00019475656303495403, + "loss": 3.074695348739624, + "step": 5062, + "token_acc": 0.2889860764478382 + }, + { + "epoch": 2.968044561712108, + "grad_norm": 0.2559949880100656, + "learning_rate": 0.000194753465375135, + "loss": 3.026939630508423, + "step": 5063, + "token_acc": 0.2960930522273386 + }, + { + "epoch": 2.968630900029317, + "grad_norm": 0.3067721216418737, + "learning_rate": 0.00019475036682523486, + "loss": 3.0147125720977783, + "step": 5064, + "token_acc": 0.2973514841830167 + }, + { + "epoch": 2.969217238346526, + "grad_norm": 0.31270101904996167, + "learning_rate": 0.0001947472673852827, + "loss": 3.048008441925049, + "step": 5065, + "token_acc": 0.2940753744325173 + }, + { + "epoch": 2.969803576663735, + "grad_norm": 0.24831209272499682, + "learning_rate": 0.00019474416705530763, + "loss": 3.059325933456421, + "step": 5066, + "token_acc": 0.29287347763654736 + }, + { + "epoch": 2.970389914980944, + "grad_norm": 0.34086496717941267, + "learning_rate": 0.00019474106583533877, + "loss": 3.0491175651550293, + "step": 5067, + "token_acc": 0.29249320220578895 + }, + { + "epoch": 2.970976253298153, + "grad_norm": 0.31107896325796147, + "learning_rate": 0.00019473796372540528, + "loss": 3.0021114349365234, + "step": 5068, + "token_acc": 0.2997472673783575 + }, + { + "epoch": 2.971562591615362, + "grad_norm": 0.3011632955524233, + "learning_rate": 0.0001947348607255363, + "loss": 3.035175323486328, + "step": 5069, + "token_acc": 0.2955362464485884 + }, + { + "epoch": 2.972148929932571, + "grad_norm": 0.3032956738281971, + "learning_rate": 0.0001947317568357609, + "loss": 3.0940096378326416, + "step": 5070, + "token_acc": 0.28624644747616934 + }, + { + "epoch": 2.97273526824978, + "grad_norm": 0.33323611702462846, + "learning_rate": 0.00019472865205610835, + "loss": 3.0671091079711914, + "step": 5071, + "token_acc": 0.2897530582374347 + }, + { + "epoch": 2.973321606566989, + "grad_norm": 0.36946921838589714, + "learning_rate": 0.00019472554638660773, + "loss": 3.065242290496826, + "step": 5072, + "token_acc": 0.2910355582318156 + }, + { + "epoch": 2.973907944884198, + "grad_norm": 0.41988922224095526, + "learning_rate": 0.00019472243982728826, + "loss": 3.076535224914551, + "step": 5073, + "token_acc": 0.28871121562479585 + }, + { + "epoch": 2.974494283201407, + "grad_norm": 0.4794590296986978, + "learning_rate": 0.00019471933237817911, + "loss": 3.040800094604492, + "step": 5074, + "token_acc": 0.29476912099932584 + }, + { + "epoch": 2.9750806215186163, + "grad_norm": 0.38863128098786015, + "learning_rate": 0.00019471622403930948, + "loss": 3.0877060890197754, + "step": 5075, + "token_acc": 0.2879726612746187 + }, + { + "epoch": 2.9756669598358254, + "grad_norm": 0.39644370845163085, + "learning_rate": 0.00019471311481070855, + "loss": 3.0534019470214844, + "step": 5076, + "token_acc": 0.29385373662482095 + }, + { + "epoch": 2.9762532981530345, + "grad_norm": 0.39108778994140214, + "learning_rate": 0.00019471000469240552, + "loss": 3.0581254959106445, + "step": 5077, + "token_acc": 0.2904801765073657 + }, + { + "epoch": 2.976839636470243, + "grad_norm": 0.3897971884991325, + "learning_rate": 0.00019470689368442963, + "loss": 3.026871919631958, + "step": 5078, + "token_acc": 0.2973275434047617 + }, + { + "epoch": 2.9774259747874523, + "grad_norm": 0.36666015103995236, + "learning_rate": 0.00019470378178681008, + "loss": 3.0175135135650635, + "step": 5079, + "token_acc": 0.2992721414365105 + }, + { + "epoch": 2.9780123131046614, + "grad_norm": 0.3919834615243031, + "learning_rate": 0.00019470066899957616, + "loss": 3.058434009552002, + "step": 5080, + "token_acc": 0.2925285501618797 + }, + { + "epoch": 2.9785986514218705, + "grad_norm": 0.3502630441441781, + "learning_rate": 0.000194697555322757, + "loss": 3.0767104625701904, + "step": 5081, + "token_acc": 0.28790827663376944 + }, + { + "epoch": 2.979184989739079, + "grad_norm": 0.3308124297422489, + "learning_rate": 0.00019469444075638194, + "loss": 3.0623905658721924, + "step": 5082, + "token_acc": 0.290237226766078 + }, + { + "epoch": 2.9797713280562883, + "grad_norm": 0.31461500140887944, + "learning_rate": 0.00019469132530048025, + "loss": 3.112497329711914, + "step": 5083, + "token_acc": 0.285479396835668 + }, + { + "epoch": 2.9803576663734974, + "grad_norm": 0.2915778103955219, + "learning_rate": 0.00019468820895508112, + "loss": 3.061370849609375, + "step": 5084, + "token_acc": 0.2924669537692402 + }, + { + "epoch": 2.9809440046907065, + "grad_norm": 0.2898295500975545, + "learning_rate": 0.00019468509172021386, + "loss": 3.0631327629089355, + "step": 5085, + "token_acc": 0.2925041229983508 + }, + { + "epoch": 2.9815303430079156, + "grad_norm": 0.2907425806482855, + "learning_rate": 0.00019468197359590774, + "loss": 3.0638012886047363, + "step": 5086, + "token_acc": 0.2903084219203306 + }, + { + "epoch": 2.9821166813251248, + "grad_norm": 0.26812391809142333, + "learning_rate": 0.0001946788545821921, + "loss": 3.0960702896118164, + "step": 5087, + "token_acc": 0.2849869660052136 + }, + { + "epoch": 2.982703019642334, + "grad_norm": 0.2973472960867228, + "learning_rate": 0.00019467573467909618, + "loss": 3.1034975051879883, + "step": 5088, + "token_acc": 0.2855327043776892 + }, + { + "epoch": 2.9832893579595425, + "grad_norm": 0.28333137227821464, + "learning_rate": 0.00019467261388664931, + "loss": 3.0710272789001465, + "step": 5089, + "token_acc": 0.2913264778834229 + }, + { + "epoch": 2.9838756962767516, + "grad_norm": 0.3127538475349362, + "learning_rate": 0.0001946694922048808, + "loss": 3.072453498840332, + "step": 5090, + "token_acc": 0.290902084169183 + }, + { + "epoch": 2.9844620345939608, + "grad_norm": 0.348293748569071, + "learning_rate": 0.00019466636963382002, + "loss": 3.028822422027588, + "step": 5091, + "token_acc": 0.29601545321907813 + }, + { + "epoch": 2.98504837291117, + "grad_norm": 0.30294241217904117, + "learning_rate": 0.0001946632461734962, + "loss": 3.0734779834747314, + "step": 5092, + "token_acc": 0.2890665115742924 + }, + { + "epoch": 2.9856347112283785, + "grad_norm": 0.33273280994821236, + "learning_rate": 0.00019466012182393878, + "loss": 3.045283794403076, + "step": 5093, + "token_acc": 0.29408210442837374 + }, + { + "epoch": 2.9862210495455876, + "grad_norm": 0.3273073076525325, + "learning_rate": 0.00019465699658517707, + "loss": 3.0563011169433594, + "step": 5094, + "token_acc": 0.29140590822525336 + }, + { + "epoch": 2.9868073878627968, + "grad_norm": 0.32046910625811176, + "learning_rate": 0.00019465387045724042, + "loss": 3.005497455596924, + "step": 5095, + "token_acc": 0.2990356718224541 + }, + { + "epoch": 2.987393726180006, + "grad_norm": 0.25805871783594836, + "learning_rate": 0.0001946507434401582, + "loss": 3.035773754119873, + "step": 5096, + "token_acc": 0.2937105680819528 + }, + { + "epoch": 2.987980064497215, + "grad_norm": 0.32529885878062686, + "learning_rate": 0.00019464761553395982, + "loss": 3.089299201965332, + "step": 5097, + "token_acc": 0.28769768629048476 + }, + { + "epoch": 2.988566402814424, + "grad_norm": 0.3002496596254249, + "learning_rate": 0.0001946444867386746, + "loss": 3.0452799797058105, + "step": 5098, + "token_acc": 0.2938198736018796 + }, + { + "epoch": 2.989152741131633, + "grad_norm": 0.27622070860940207, + "learning_rate": 0.00019464135705433196, + "loss": 3.0655341148376465, + "step": 5099, + "token_acc": 0.29159283323488877 + }, + { + "epoch": 2.989739079448842, + "grad_norm": 0.3154041915479705, + "learning_rate": 0.00019463822648096133, + "loss": 3.029757022857666, + "step": 5100, + "token_acc": 0.2951072141492627 + }, + { + "epoch": 2.990325417766051, + "grad_norm": 0.30749729824971944, + "learning_rate": 0.00019463509501859206, + "loss": 3.0653178691864014, + "step": 5101, + "token_acc": 0.2916677521375092 + }, + { + "epoch": 2.99091175608326, + "grad_norm": 0.2634593593998019, + "learning_rate": 0.0001946319626672536, + "loss": 3.072056293487549, + "step": 5102, + "token_acc": 0.289483906257019 + }, + { + "epoch": 2.991498094400469, + "grad_norm": 0.30658755150656564, + "learning_rate": 0.0001946288294269754, + "loss": 3.059781074523926, + "step": 5103, + "token_acc": 0.29289004426051907 + }, + { + "epoch": 2.992084432717678, + "grad_norm": 0.3155458965303423, + "learning_rate": 0.00019462569529778682, + "loss": 3.0822203159332275, + "step": 5104, + "token_acc": 0.2868187089292617 + }, + { + "epoch": 2.992670771034887, + "grad_norm": 0.42860585615712365, + "learning_rate": 0.00019462256027971735, + "loss": 3.0829527378082275, + "step": 5105, + "token_acc": 0.2894435666963183 + }, + { + "epoch": 2.993257109352096, + "grad_norm": 0.5931398595389955, + "learning_rate": 0.00019461942437279644, + "loss": 3.096341609954834, + "step": 5106, + "token_acc": 0.28655393844658966 + }, + { + "epoch": 2.993843447669305, + "grad_norm": 0.47090810251505416, + "learning_rate": 0.00019461628757705356, + "loss": 3.055384635925293, + "step": 5107, + "token_acc": 0.2936589543578994 + }, + { + "epoch": 2.9944297859865143, + "grad_norm": 0.3358624284703214, + "learning_rate": 0.0001946131498925181, + "loss": 3.0655620098114014, + "step": 5108, + "token_acc": 0.29105537915074536 + }, + { + "epoch": 2.9950161243037234, + "grad_norm": 0.34646293213878354, + "learning_rate": 0.00019461001131921963, + "loss": 3.060105800628662, + "step": 5109, + "token_acc": 0.292280753352287 + }, + { + "epoch": 2.9956024626209325, + "grad_norm": 0.35487299696414576, + "learning_rate": 0.00019460687185718757, + "loss": 3.071585178375244, + "step": 5110, + "token_acc": 0.29089450799949346 + }, + { + "epoch": 2.996188800938141, + "grad_norm": 0.33429780373443735, + "learning_rate": 0.00019460373150645145, + "loss": 3.086029529571533, + "step": 5111, + "token_acc": 0.28776614036163367 + }, + { + "epoch": 2.9967751392553503, + "grad_norm": 0.34637765097363293, + "learning_rate": 0.00019460059026704077, + "loss": 3.0650389194488525, + "step": 5112, + "token_acc": 0.29114903192536684 + }, + { + "epoch": 2.9973614775725594, + "grad_norm": 0.3257436694676698, + "learning_rate": 0.000194597448138985, + "loss": 3.0570435523986816, + "step": 5113, + "token_acc": 0.2934961144709703 + }, + { + "epoch": 2.9979478158897686, + "grad_norm": 0.2806794087998649, + "learning_rate": 0.00019459430512231367, + "loss": 3.0268986225128174, + "step": 5114, + "token_acc": 0.29709987086334855 + }, + { + "epoch": 2.998534154206977, + "grad_norm": 0.34635888337478654, + "learning_rate": 0.00019459116121705634, + "loss": 3.108278512954712, + "step": 5115, + "token_acc": 0.28588031140050624 + }, + { + "epoch": 2.9991204925241863, + "grad_norm": 0.31972129246257647, + "learning_rate": 0.0001945880164232425, + "loss": 2.9964239597320557, + "step": 5116, + "token_acc": 0.300626272156039 + }, + { + "epoch": 2.9997068308413954, + "grad_norm": 0.29687809473523824, + "learning_rate": 0.00019458487074090167, + "loss": 3.0663998126983643, + "step": 5117, + "token_acc": 0.28906655746873877 + }, + { + "epoch": 3.0, + "grad_norm": 0.3301195576428841, + "learning_rate": 0.00019458172417006347, + "loss": 3.0576601028442383, + "step": 5118, + "token_acc": 0.29402854740291945 + }, + { + "epoch": 3.0, + "eval_loss": 3.0766077041625977, + "eval_runtime": 16.7656, + "eval_samples_per_second": 15.269, + "eval_steps_per_second": 1.909, + "eval_token_acc": 0.2895441487563783, + "step": 5118 + }, + { + "epoch": 3.000586338317209, + "grad_norm": 0.32542610512145914, + "learning_rate": 0.00019457857671075743, + "loss": 2.99609375, + "step": 5119, + "token_acc": 0.29915804399768886 + }, + { + "epoch": 3.0011726766344182, + "grad_norm": 0.3484215275556144, + "learning_rate": 0.00019457542836301308, + "loss": 2.924553871154785, + "step": 5120, + "token_acc": 0.3090295235605253 + }, + { + "epoch": 3.001759014951627, + "grad_norm": 0.2952268380512712, + "learning_rate": 0.00019457227912686006, + "loss": 2.9160537719726562, + "step": 5121, + "token_acc": 0.3113160260643334 + }, + { + "epoch": 3.002345353268836, + "grad_norm": 0.3470832856831942, + "learning_rate": 0.00019456912900232788, + "loss": 2.9686379432678223, + "step": 5122, + "token_acc": 0.30280014084096885 + }, + { + "epoch": 3.002931691586045, + "grad_norm": 0.342045200920826, + "learning_rate": 0.00019456597798944616, + "loss": 2.933501958847046, + "step": 5123, + "token_acc": 0.3072107322526551 + }, + { + "epoch": 3.0035180299032542, + "grad_norm": 0.27745103972243146, + "learning_rate": 0.00019456282608824453, + "loss": 2.9482598304748535, + "step": 5124, + "token_acc": 0.3061465909999201 + }, + { + "epoch": 3.0041043682204633, + "grad_norm": 0.31857739348476116, + "learning_rate": 0.00019455967329875255, + "loss": 2.9381589889526367, + "step": 5125, + "token_acc": 0.3080063358742829 + }, + { + "epoch": 3.0046907065376725, + "grad_norm": 0.29967830571185755, + "learning_rate": 0.00019455651962099987, + "loss": 2.9471874237060547, + "step": 5126, + "token_acc": 0.30596984509541264 + }, + { + "epoch": 3.005277044854881, + "grad_norm": 0.36330079033086976, + "learning_rate": 0.00019455336505501614, + "loss": 2.9471545219421387, + "step": 5127, + "token_acc": 0.30482352599829604 + }, + { + "epoch": 3.0058633831720902, + "grad_norm": 0.368025000630573, + "learning_rate": 0.00019455020960083093, + "loss": 2.928900957107544, + "step": 5128, + "token_acc": 0.30792494672646964 + }, + { + "epoch": 3.0064497214892993, + "grad_norm": 0.29734071190141903, + "learning_rate": 0.00019454705325847388, + "loss": 2.931608200073242, + "step": 5129, + "token_acc": 0.3085685284912736 + }, + { + "epoch": 3.0070360598065085, + "grad_norm": 0.3345639676785342, + "learning_rate": 0.0001945438960279747, + "loss": 2.9155187606811523, + "step": 5130, + "token_acc": 0.3093837019912783 + }, + { + "epoch": 3.0076223981237176, + "grad_norm": 0.3429953825908296, + "learning_rate": 0.00019454073790936303, + "loss": 2.9233827590942383, + "step": 5131, + "token_acc": 0.3102762783305516 + }, + { + "epoch": 3.0082087364409262, + "grad_norm": 0.3556854334183303, + "learning_rate": 0.00019453757890266848, + "loss": 2.9480414390563965, + "step": 5132, + "token_acc": 0.3053006594160858 + }, + { + "epoch": 3.0087950747581353, + "grad_norm": 0.3162687449231907, + "learning_rate": 0.00019453441900792079, + "loss": 2.9260048866271973, + "step": 5133, + "token_acc": 0.3093626474483996 + }, + { + "epoch": 3.0093814130753445, + "grad_norm": 0.3036523233786287, + "learning_rate": 0.00019453125822514964, + "loss": 2.8807201385498047, + "step": 5134, + "token_acc": 0.3163385884271098 + }, + { + "epoch": 3.0099677513925536, + "grad_norm": 0.32081759374360624, + "learning_rate": 0.00019452809655438468, + "loss": 2.967503309249878, + "step": 5135, + "token_acc": 0.3018798321127022 + }, + { + "epoch": 3.0105540897097627, + "grad_norm": 0.314895990341606, + "learning_rate": 0.00019452493399565565, + "loss": 2.8954648971557617, + "step": 5136, + "token_acc": 0.3130605014443449 + }, + { + "epoch": 3.0111404280269713, + "grad_norm": 0.2836142606794178, + "learning_rate": 0.00019452177054899222, + "loss": 2.939119338989258, + "step": 5137, + "token_acc": 0.3079360261561113 + }, + { + "epoch": 3.0117267663441805, + "grad_norm": 0.3496845442846436, + "learning_rate": 0.00019451860621442411, + "loss": 2.9109444618225098, + "step": 5138, + "token_acc": 0.3122114496768236 + }, + { + "epoch": 3.0123131046613896, + "grad_norm": 0.33079850545999845, + "learning_rate": 0.00019451544099198108, + "loss": 2.9014229774475098, + "step": 5139, + "token_acc": 0.31152785176667913 + }, + { + "epoch": 3.0128994429785987, + "grad_norm": 0.378778424049442, + "learning_rate": 0.00019451227488169286, + "loss": 2.9063029289245605, + "step": 5140, + "token_acc": 0.31162722404115395 + }, + { + "epoch": 3.013485781295808, + "grad_norm": 0.3741936508738994, + "learning_rate": 0.00019450910788358912, + "loss": 2.9214630126953125, + "step": 5141, + "token_acc": 0.3092676590440075 + }, + { + "epoch": 3.014072119613017, + "grad_norm": 0.3353306802973895, + "learning_rate": 0.00019450593999769972, + "loss": 2.9295449256896973, + "step": 5142, + "token_acc": 0.3067986460097664 + }, + { + "epoch": 3.0146584579302256, + "grad_norm": 0.3654156316820181, + "learning_rate": 0.00019450277122405436, + "loss": 2.9232547283172607, + "step": 5143, + "token_acc": 0.30931743903144726 + }, + { + "epoch": 3.0152447962474347, + "grad_norm": 0.29056176985813614, + "learning_rate": 0.00019449960156268277, + "loss": 2.922541618347168, + "step": 5144, + "token_acc": 0.30918517351813934 + }, + { + "epoch": 3.015831134564644, + "grad_norm": 0.3727929623909324, + "learning_rate": 0.0001944964310136148, + "loss": 2.9728665351867676, + "step": 5145, + "token_acc": 0.300348150764418 + }, + { + "epoch": 3.016417472881853, + "grad_norm": 0.30239551930947406, + "learning_rate": 0.00019449325957688018, + "loss": 2.9441027641296387, + "step": 5146, + "token_acc": 0.30542540708844707 + }, + { + "epoch": 3.017003811199062, + "grad_norm": 0.3192694127207607, + "learning_rate": 0.0001944900872525087, + "loss": 2.9099204540252686, + "step": 5147, + "token_acc": 0.3105837393911463 + }, + { + "epoch": 3.0175901495162707, + "grad_norm": 0.27908739227935875, + "learning_rate": 0.0001944869140405302, + "loss": 2.8911123275756836, + "step": 5148, + "token_acc": 0.31453582229851235 + }, + { + "epoch": 3.01817648783348, + "grad_norm": 0.32842921861514035, + "learning_rate": 0.00019448373994097447, + "loss": 2.9719126224517822, + "step": 5149, + "token_acc": 0.3030040631452741 + }, + { + "epoch": 3.018762826150689, + "grad_norm": 0.3078400062879974, + "learning_rate": 0.00019448056495387133, + "loss": 2.9632067680358887, + "step": 5150, + "token_acc": 0.3045764317700938 + }, + { + "epoch": 3.019349164467898, + "grad_norm": 0.3034017359841723, + "learning_rate": 0.00019447738907925056, + "loss": 2.946415901184082, + "step": 5151, + "token_acc": 0.30527063459937526 + }, + { + "epoch": 3.019935502785107, + "grad_norm": 0.30233509036689804, + "learning_rate": 0.00019447421231714204, + "loss": 2.9368433952331543, + "step": 5152, + "token_acc": 0.30598607039055287 + }, + { + "epoch": 3.0205218411023163, + "grad_norm": 0.2900018232501726, + "learning_rate": 0.0001944710346675756, + "loss": 2.886044979095459, + "step": 5153, + "token_acc": 0.31436047825461716 + }, + { + "epoch": 3.021108179419525, + "grad_norm": 0.2574826967617441, + "learning_rate": 0.00019446785613058112, + "loss": 2.9603962898254395, + "step": 5154, + "token_acc": 0.3052950295123544 + }, + { + "epoch": 3.021694517736734, + "grad_norm": 0.34091645301319146, + "learning_rate": 0.00019446467670618839, + "loss": 2.9147567749023438, + "step": 5155, + "token_acc": 0.3103328161968227 + }, + { + "epoch": 3.022280856053943, + "grad_norm": 0.2709027236500024, + "learning_rate": 0.00019446149639442735, + "loss": 2.9413373470306396, + "step": 5156, + "token_acc": 0.30683219729603045 + }, + { + "epoch": 3.0228671943711523, + "grad_norm": 0.31274479899862245, + "learning_rate": 0.00019445831519532782, + "loss": 2.947415351867676, + "step": 5157, + "token_acc": 0.3064849277829296 + }, + { + "epoch": 3.0234535326883614, + "grad_norm": 0.2679146091813962, + "learning_rate": 0.00019445513310891973, + "loss": 2.9493980407714844, + "step": 5158, + "token_acc": 0.30504360406703374 + }, + { + "epoch": 3.02403987100557, + "grad_norm": 0.2749909295233927, + "learning_rate": 0.0001944519501352329, + "loss": 2.9203662872314453, + "step": 5159, + "token_acc": 0.3096459643646366 + }, + { + "epoch": 3.024626209322779, + "grad_norm": 0.3269742588051401, + "learning_rate": 0.0001944487662742973, + "loss": 2.944547653198242, + "step": 5160, + "token_acc": 0.30578701715202017 + }, + { + "epoch": 3.0252125476399883, + "grad_norm": 0.3384153780227322, + "learning_rate": 0.0001944455815261428, + "loss": 2.9541068077087402, + "step": 5161, + "token_acc": 0.30482071774222397 + }, + { + "epoch": 3.0257988859571974, + "grad_norm": 0.2793764852752462, + "learning_rate": 0.00019444239589079933, + "loss": 2.916184663772583, + "step": 5162, + "token_acc": 0.3095581834780578 + }, + { + "epoch": 3.0263852242744065, + "grad_norm": 0.31434799352913817, + "learning_rate": 0.00019443920936829681, + "loss": 2.893907070159912, + "step": 5163, + "token_acc": 0.31364041557867833 + }, + { + "epoch": 3.026971562591615, + "grad_norm": 0.3429591571424557, + "learning_rate": 0.0001944360219586652, + "loss": 2.9555184841156006, + "step": 5164, + "token_acc": 0.3046860659720348 + }, + { + "epoch": 3.0275579009088243, + "grad_norm": 0.3101454811030327, + "learning_rate": 0.0001944328336619344, + "loss": 2.903646469116211, + "step": 5165, + "token_acc": 0.3118201896842926 + }, + { + "epoch": 3.0281442392260334, + "grad_norm": 0.3618964956564648, + "learning_rate": 0.00019442964447813438, + "loss": 2.9094157218933105, + "step": 5166, + "token_acc": 0.31061253617108037 + }, + { + "epoch": 3.0287305775432425, + "grad_norm": 0.27160278823182604, + "learning_rate": 0.0001944264544072951, + "loss": 2.9083375930786133, + "step": 5167, + "token_acc": 0.31002800140007003 + }, + { + "epoch": 3.0293169158604516, + "grad_norm": 0.3361325535675269, + "learning_rate": 0.0001944232634494465, + "loss": 2.954824447631836, + "step": 5168, + "token_acc": 0.30405085220758704 + }, + { + "epoch": 3.0299032541776607, + "grad_norm": 0.31725615593714823, + "learning_rate": 0.00019442007160461858, + "loss": 2.947997570037842, + "step": 5169, + "token_acc": 0.30623556229844395 + }, + { + "epoch": 3.0304895924948694, + "grad_norm": 0.3348319942756914, + "learning_rate": 0.00019441687887284136, + "loss": 2.8970162868499756, + "step": 5170, + "token_acc": 0.31299684141352047 + }, + { + "epoch": 3.0310759308120785, + "grad_norm": 0.30433916871070316, + "learning_rate": 0.00019441368525414477, + "loss": 2.920292854309082, + "step": 5171, + "token_acc": 0.3080676802989585 + }, + { + "epoch": 3.0316622691292876, + "grad_norm": 0.30170991706218725, + "learning_rate": 0.00019441049074855885, + "loss": 2.901003837585449, + "step": 5172, + "token_acc": 0.3120196562793094 + }, + { + "epoch": 3.0322486074464967, + "grad_norm": 0.31314765821490453, + "learning_rate": 0.00019440729535611352, + "loss": 2.9486870765686035, + "step": 5173, + "token_acc": 0.3048625281359783 + }, + { + "epoch": 3.032834945763706, + "grad_norm": 0.3289469414154259, + "learning_rate": 0.00019440409907683895, + "loss": 2.9477133750915527, + "step": 5174, + "token_acc": 0.304574505640415 + }, + { + "epoch": 3.0334212840809145, + "grad_norm": 0.3122404877796078, + "learning_rate": 0.00019440090191076502, + "loss": 2.8961567878723145, + "step": 5175, + "token_acc": 0.31156532686274707 + }, + { + "epoch": 3.0340076223981236, + "grad_norm": 0.2825798868947717, + "learning_rate": 0.00019439770385792183, + "loss": 2.9087021350860596, + "step": 5176, + "token_acc": 0.31188267369776373 + }, + { + "epoch": 3.0345939607153327, + "grad_norm": 0.2868566570902747, + "learning_rate": 0.00019439450491833945, + "loss": 2.9344851970672607, + "step": 5177, + "token_acc": 0.30849164544689234 + }, + { + "epoch": 3.035180299032542, + "grad_norm": 0.3237326209538367, + "learning_rate": 0.00019439130509204787, + "loss": 2.965400457382202, + "step": 5178, + "token_acc": 0.3029042170644001 + }, + { + "epoch": 3.035766637349751, + "grad_norm": 0.3170684735408952, + "learning_rate": 0.00019438810437907717, + "loss": 2.911686658859253, + "step": 5179, + "token_acc": 0.3099352164312794 + }, + { + "epoch": 3.03635297566696, + "grad_norm": 0.31935816249145776, + "learning_rate": 0.00019438490277945745, + "loss": 2.9230916500091553, + "step": 5180, + "token_acc": 0.30899603771078155 + }, + { + "epoch": 3.0369393139841687, + "grad_norm": 0.30826402757073124, + "learning_rate": 0.0001943817002932187, + "loss": 2.974609375, + "step": 5181, + "token_acc": 0.30249739517256646 + }, + { + "epoch": 3.037525652301378, + "grad_norm": 0.3076041601625042, + "learning_rate": 0.0001943784969203911, + "loss": 2.9336605072021484, + "step": 5182, + "token_acc": 0.30774229411166004 + }, + { + "epoch": 3.038111990618587, + "grad_norm": 0.3013814829956809, + "learning_rate": 0.0001943752926610047, + "loss": 2.9661457538604736, + "step": 5183, + "token_acc": 0.3022054183641409 + }, + { + "epoch": 3.038698328935796, + "grad_norm": 0.3376232199583679, + "learning_rate": 0.00019437208751508958, + "loss": 2.917414665222168, + "step": 5184, + "token_acc": 0.3103650981942797 + }, + { + "epoch": 3.039284667253005, + "grad_norm": 0.26918587394579985, + "learning_rate": 0.00019436888148267585, + "loss": 2.8925657272338867, + "step": 5185, + "token_acc": 0.3134051155421553 + }, + { + "epoch": 3.039871005570214, + "grad_norm": 0.3230848419086703, + "learning_rate": 0.00019436567456379366, + "loss": 2.920283317565918, + "step": 5186, + "token_acc": 0.3095187287849614 + }, + { + "epoch": 3.040457343887423, + "grad_norm": 0.375243178051357, + "learning_rate": 0.00019436246675847313, + "loss": 2.9641270637512207, + "step": 5187, + "token_acc": 0.30445870466105807 + }, + { + "epoch": 3.041043682204632, + "grad_norm": 0.3244980721865245, + "learning_rate": 0.00019435925806674437, + "loss": 2.976930618286133, + "step": 5188, + "token_acc": 0.3030080328586683 + }, + { + "epoch": 3.041630020521841, + "grad_norm": 0.26752102786035004, + "learning_rate": 0.00019435604848863752, + "loss": 2.9252562522888184, + "step": 5189, + "token_acc": 0.31041034224643405 + }, + { + "epoch": 3.0422163588390503, + "grad_norm": 0.2870771300758046, + "learning_rate": 0.00019435283802418275, + "loss": 2.917541980743408, + "step": 5190, + "token_acc": 0.3107720171341012 + }, + { + "epoch": 3.042802697156259, + "grad_norm": 0.28587770938788876, + "learning_rate": 0.0001943496266734102, + "loss": 2.931736469268799, + "step": 5191, + "token_acc": 0.3058618181070361 + }, + { + "epoch": 3.043389035473468, + "grad_norm": 0.2842974678076354, + "learning_rate": 0.00019434641443635006, + "loss": 2.940046787261963, + "step": 5192, + "token_acc": 0.3065712764714284 + }, + { + "epoch": 3.043975373790677, + "grad_norm": 0.2665458085804807, + "learning_rate": 0.0001943432013130325, + "loss": 2.907958984375, + "step": 5193, + "token_acc": 0.3116108062023765 + }, + { + "epoch": 3.0445617121078863, + "grad_norm": 0.340157062638762, + "learning_rate": 0.00019433998730348766, + "loss": 2.9279537200927734, + "step": 5194, + "token_acc": 0.3082989255474905 + }, + { + "epoch": 3.0451480504250954, + "grad_norm": 0.264892515717637, + "learning_rate": 0.0001943367724077458, + "loss": 2.9474236965179443, + "step": 5195, + "token_acc": 0.3056360593505725 + }, + { + "epoch": 3.0457343887423045, + "grad_norm": 0.29087808765955064, + "learning_rate": 0.00019433355662583704, + "loss": 2.8737950325012207, + "step": 5196, + "token_acc": 0.31799685884599865 + }, + { + "epoch": 3.046320727059513, + "grad_norm": 0.2862129675068749, + "learning_rate": 0.00019433033995779164, + "loss": 2.94284987449646, + "step": 5197, + "token_acc": 0.30531864935694486 + }, + { + "epoch": 3.0469070653767223, + "grad_norm": 0.2966285430106267, + "learning_rate": 0.00019432712240363985, + "loss": 2.9285879135131836, + "step": 5198, + "token_acc": 0.30816626132366753 + }, + { + "epoch": 3.0474934036939314, + "grad_norm": 0.2851217920002604, + "learning_rate": 0.0001943239039634118, + "loss": 2.9066271781921387, + "step": 5199, + "token_acc": 0.3099789263590375 + }, + { + "epoch": 3.0480797420111405, + "grad_norm": 0.2553354121019608, + "learning_rate": 0.0001943206846371378, + "loss": 2.908294200897217, + "step": 5200, + "token_acc": 0.31121503686382745 + }, + { + "epoch": 3.0486660803283496, + "grad_norm": 0.30311014096611794, + "learning_rate": 0.00019431746442484808, + "loss": 2.906442165374756, + "step": 5201, + "token_acc": 0.312074025242161 + }, + { + "epoch": 3.0492524186455583, + "grad_norm": 0.3300061461013664, + "learning_rate": 0.00019431424332657288, + "loss": 2.9103844165802, + "step": 5202, + "token_acc": 0.3095899485031731 + }, + { + "epoch": 3.0498387569627674, + "grad_norm": 0.3703954377911579, + "learning_rate": 0.00019431102134234243, + "loss": 2.9320430755615234, + "step": 5203, + "token_acc": 0.3089385535089563 + }, + { + "epoch": 3.0504250952799765, + "grad_norm": 0.351683438408679, + "learning_rate": 0.000194307798472187, + "loss": 2.921031951904297, + "step": 5204, + "token_acc": 0.3102155003286775 + }, + { + "epoch": 3.0510114335971856, + "grad_norm": 0.2855525728809736, + "learning_rate": 0.00019430457471613692, + "loss": 2.936224937438965, + "step": 5205, + "token_acc": 0.30642035098408965 + }, + { + "epoch": 3.0515977719143947, + "grad_norm": 0.26523595453276416, + "learning_rate": 0.00019430135007422243, + "loss": 2.9507851600646973, + "step": 5206, + "token_acc": 0.3052540442246888 + }, + { + "epoch": 3.052184110231604, + "grad_norm": 0.3826239846267414, + "learning_rate": 0.00019429812454647386, + "loss": 2.9499897956848145, + "step": 5207, + "token_acc": 0.30487685641653717 + }, + { + "epoch": 3.0527704485488125, + "grad_norm": 0.3425011585771168, + "learning_rate": 0.00019429489813292143, + "loss": 2.9314184188842773, + "step": 5208, + "token_acc": 0.3081917495434594 + }, + { + "epoch": 3.0533567868660216, + "grad_norm": 0.27309155129912394, + "learning_rate": 0.0001942916708335955, + "loss": 2.972264289855957, + "step": 5209, + "token_acc": 0.3037004493894338 + }, + { + "epoch": 3.0539431251832307, + "grad_norm": 0.3692863638983722, + "learning_rate": 0.0001942884426485264, + "loss": 2.9193172454833984, + "step": 5210, + "token_acc": 0.30953969141064636 + }, + { + "epoch": 3.05452946350044, + "grad_norm": 0.2867693017933724, + "learning_rate": 0.00019428521357774443, + "loss": 2.948643684387207, + "step": 5211, + "token_acc": 0.3054388346585644 + }, + { + "epoch": 3.055115801817649, + "grad_norm": 0.38337303810177514, + "learning_rate": 0.00019428198362127992, + "loss": 2.8976097106933594, + "step": 5212, + "token_acc": 0.3114708679540835 + }, + { + "epoch": 3.0557021401348576, + "grad_norm": 0.3922767667766906, + "learning_rate": 0.00019427875277916324, + "loss": 2.9133996963500977, + "step": 5213, + "token_acc": 0.30922146435042097 + }, + { + "epoch": 3.0562884784520667, + "grad_norm": 0.40924739212001626, + "learning_rate": 0.0001942755210514247, + "loss": 2.9343833923339844, + "step": 5214, + "token_acc": 0.30736532199782923 + }, + { + "epoch": 3.056874816769276, + "grad_norm": 0.38696392641882704, + "learning_rate": 0.0001942722884380947, + "loss": 2.9053173065185547, + "step": 5215, + "token_acc": 0.31169289448761694 + }, + { + "epoch": 3.057461155086485, + "grad_norm": 0.26676459614221326, + "learning_rate": 0.00019426905493920358, + "loss": 2.9307477474212646, + "step": 5216, + "token_acc": 0.3074609762794922 + }, + { + "epoch": 3.058047493403694, + "grad_norm": 0.3135862707371322, + "learning_rate": 0.0001942658205547817, + "loss": 2.936640739440918, + "step": 5217, + "token_acc": 0.30614871342958816 + }, + { + "epoch": 3.0586338317209028, + "grad_norm": 0.27982570220829195, + "learning_rate": 0.00019426258528485946, + "loss": 2.9156956672668457, + "step": 5218, + "token_acc": 0.30996228995402336 + }, + { + "epoch": 3.059220170038112, + "grad_norm": 0.2798509316664159, + "learning_rate": 0.00019425934912946726, + "loss": 2.9157357215881348, + "step": 5219, + "token_acc": 0.30957495614626906 + }, + { + "epoch": 3.059806508355321, + "grad_norm": 0.2834774616842488, + "learning_rate": 0.0001942561120886355, + "loss": 2.8836545944213867, + "step": 5220, + "token_acc": 0.3152024039585758 + }, + { + "epoch": 3.06039284667253, + "grad_norm": 0.2723891098461981, + "learning_rate": 0.00019425287416239458, + "loss": 2.9562482833862305, + "step": 5221, + "token_acc": 0.30617459739284963 + }, + { + "epoch": 3.060979184989739, + "grad_norm": 0.3603106516806401, + "learning_rate": 0.00019424963535077488, + "loss": 2.9155402183532715, + "step": 5222, + "token_acc": 0.3117574179227893 + }, + { + "epoch": 3.0615655233069483, + "grad_norm": 0.2882600230014994, + "learning_rate": 0.0001942463956538069, + "loss": 2.9290101528167725, + "step": 5223, + "token_acc": 0.30854587172978787 + }, + { + "epoch": 3.062151861624157, + "grad_norm": 0.28898272488602517, + "learning_rate": 0.00019424315507152103, + "loss": 2.945805549621582, + "step": 5224, + "token_acc": 0.30653511026961444 + }, + { + "epoch": 3.062738199941366, + "grad_norm": 0.36021558008686405, + "learning_rate": 0.0001942399136039477, + "loss": 2.8986129760742188, + "step": 5225, + "token_acc": 0.3120431803080341 + }, + { + "epoch": 3.063324538258575, + "grad_norm": 0.25730225647986243, + "learning_rate": 0.00019423667125111735, + "loss": 2.930884838104248, + "step": 5226, + "token_acc": 0.3087518142625948 + }, + { + "epoch": 3.0639108765757843, + "grad_norm": 0.33721514069597724, + "learning_rate": 0.00019423342801306047, + "loss": 2.929837703704834, + "step": 5227, + "token_acc": 0.30877443023608747 + }, + { + "epoch": 3.0644972148929934, + "grad_norm": 0.3271741677189038, + "learning_rate": 0.00019423018388980753, + "loss": 2.93837833404541, + "step": 5228, + "token_acc": 0.3065539991009461 + }, + { + "epoch": 3.065083553210202, + "grad_norm": 0.35905935161449604, + "learning_rate": 0.000194226938881389, + "loss": 2.910773754119873, + "step": 5229, + "token_acc": 0.3107101818710129 + }, + { + "epoch": 3.065669891527411, + "grad_norm": 0.3574399221056548, + "learning_rate": 0.00019422369298783534, + "loss": 2.9578981399536133, + "step": 5230, + "token_acc": 0.3033850733019151 + }, + { + "epoch": 3.0662562298446203, + "grad_norm": 0.28197756106453786, + "learning_rate": 0.00019422044620917702, + "loss": 2.9434781074523926, + "step": 5231, + "token_acc": 0.3050355614396995 + }, + { + "epoch": 3.0668425681618294, + "grad_norm": 0.3586332047428451, + "learning_rate": 0.00019421719854544463, + "loss": 2.929109573364258, + "step": 5232, + "token_acc": 0.30949500979190886 + }, + { + "epoch": 3.0674289064790385, + "grad_norm": 0.33396164067036244, + "learning_rate": 0.00019421394999666856, + "loss": 2.894759178161621, + "step": 5233, + "token_acc": 0.3138705020293645 + }, + { + "epoch": 3.068015244796247, + "grad_norm": 0.29983078959669274, + "learning_rate": 0.00019421070056287944, + "loss": 2.9345834255218506, + "step": 5234, + "token_acc": 0.30858320212621904 + }, + { + "epoch": 3.0686015831134563, + "grad_norm": 0.3288632239006383, + "learning_rate": 0.00019420745024410768, + "loss": 2.910721778869629, + "step": 5235, + "token_acc": 0.3104366347177849 + }, + { + "epoch": 3.0691879214306654, + "grad_norm": 0.2666188132915057, + "learning_rate": 0.0001942041990403839, + "loss": 2.9346981048583984, + "step": 5236, + "token_acc": 0.30712224167750984 + }, + { + "epoch": 3.0697742597478745, + "grad_norm": 0.3438648318086929, + "learning_rate": 0.00019420094695173863, + "loss": 2.928624153137207, + "step": 5237, + "token_acc": 0.30755583385514507 + }, + { + "epoch": 3.0703605980650837, + "grad_norm": 0.27603459777552103, + "learning_rate": 0.0001941976939782024, + "loss": 2.9580626487731934, + "step": 5238, + "token_acc": 0.3028971383462629 + }, + { + "epoch": 3.0709469363822928, + "grad_norm": 0.32230224379971845, + "learning_rate": 0.00019419444011980574, + "loss": 2.9392714500427246, + "step": 5239, + "token_acc": 0.305870022685417 + }, + { + "epoch": 3.0715332746995014, + "grad_norm": 0.3315450069377555, + "learning_rate": 0.00019419118537657927, + "loss": 2.9567298889160156, + "step": 5240, + "token_acc": 0.30397701058514914 + }, + { + "epoch": 3.0721196130167105, + "grad_norm": 0.2972335683417017, + "learning_rate": 0.0001941879297485535, + "loss": 2.9409749507904053, + "step": 5241, + "token_acc": 0.307252599385238 + }, + { + "epoch": 3.0727059513339197, + "grad_norm": 0.43243676421492266, + "learning_rate": 0.00019418467323575908, + "loss": 2.9394798278808594, + "step": 5242, + "token_acc": 0.3069827264659353 + }, + { + "epoch": 3.0732922896511288, + "grad_norm": 0.3406213396174589, + "learning_rate": 0.00019418141583822657, + "loss": 2.925457000732422, + "step": 5243, + "token_acc": 0.3092428335502499 + }, + { + "epoch": 3.073878627968338, + "grad_norm": 0.34384397596044963, + "learning_rate": 0.00019417815755598655, + "loss": 2.9534716606140137, + "step": 5244, + "token_acc": 0.3054742010509088 + }, + { + "epoch": 3.0744649662855466, + "grad_norm": 0.32242024908318306, + "learning_rate": 0.00019417489838906965, + "loss": 2.913316249847412, + "step": 5245, + "token_acc": 0.31162490183667196 + }, + { + "epoch": 3.0750513046027557, + "grad_norm": 0.3227160881569984, + "learning_rate": 0.0001941716383375065, + "loss": 2.9513397216796875, + "step": 5246, + "token_acc": 0.3066730376931983 + }, + { + "epoch": 3.0756376429199648, + "grad_norm": 0.3157340048043223, + "learning_rate": 0.0001941683774013277, + "loss": 2.908053159713745, + "step": 5247, + "token_acc": 0.310422894154325 + }, + { + "epoch": 3.076223981237174, + "grad_norm": 0.246754061348291, + "learning_rate": 0.0001941651155805639, + "loss": 2.93650484085083, + "step": 5248, + "token_acc": 0.30523670465213987 + }, + { + "epoch": 3.076810319554383, + "grad_norm": 0.35242963962301815, + "learning_rate": 0.0001941618528752457, + "loss": 2.880807876586914, + "step": 5249, + "token_acc": 0.31444160586907816 + }, + { + "epoch": 3.077396657871592, + "grad_norm": 0.31465438424943026, + "learning_rate": 0.0001941585892854038, + "loss": 2.9655680656433105, + "step": 5250, + "token_acc": 0.30255321720078704 + }, + { + "epoch": 3.077982996188801, + "grad_norm": 0.2872862543584576, + "learning_rate": 0.00019415532481106883, + "loss": 2.9440109729766846, + "step": 5251, + "token_acc": 0.30753821992884844 + }, + { + "epoch": 3.07856933450601, + "grad_norm": 0.2897755761081667, + "learning_rate": 0.00019415205945227143, + "loss": 2.9169178009033203, + "step": 5252, + "token_acc": 0.30991938153516074 + }, + { + "epoch": 3.079155672823219, + "grad_norm": 0.31472093047981187, + "learning_rate": 0.00019414879320904237, + "loss": 2.9474191665649414, + "step": 5253, + "token_acc": 0.30488561418495974 + }, + { + "epoch": 3.079742011140428, + "grad_norm": 0.30690989633985, + "learning_rate": 0.0001941455260814122, + "loss": 2.9245386123657227, + "step": 5254, + "token_acc": 0.30772148797559556 + }, + { + "epoch": 3.0803283494576372, + "grad_norm": 0.2662382184115055, + "learning_rate": 0.00019414225806941172, + "loss": 2.9700088500976562, + "step": 5255, + "token_acc": 0.3019841070725904 + }, + { + "epoch": 3.080914687774846, + "grad_norm": 0.2930865937032236, + "learning_rate": 0.00019413898917307153, + "loss": 2.923515796661377, + "step": 5256, + "token_acc": 0.3065538293081007 + }, + { + "epoch": 3.081501026092055, + "grad_norm": 0.34113560735904935, + "learning_rate": 0.00019413571939242243, + "loss": 2.9175162315368652, + "step": 5257, + "token_acc": 0.3096439991914962 + }, + { + "epoch": 3.082087364409264, + "grad_norm": 0.2668450414223363, + "learning_rate": 0.0001941324487274951, + "loss": 2.928525447845459, + "step": 5258, + "token_acc": 0.30860076271816606 + }, + { + "epoch": 3.0826737027264732, + "grad_norm": 0.2942871762951993, + "learning_rate": 0.00019412917717832024, + "loss": 2.9353747367858887, + "step": 5259, + "token_acc": 0.3087878284230517 + }, + { + "epoch": 3.0832600410436823, + "grad_norm": 0.2935801286227263, + "learning_rate": 0.0001941259047449286, + "loss": 2.9370031356811523, + "step": 5260, + "token_acc": 0.306471992138144 + }, + { + "epoch": 3.0838463793608915, + "grad_norm": 0.24544981147428738, + "learning_rate": 0.00019412263142735094, + "loss": 2.964984893798828, + "step": 5261, + "token_acc": 0.3027644170050066 + }, + { + "epoch": 3.0844327176781, + "grad_norm": 0.26775146706493064, + "learning_rate": 0.00019411935722561796, + "loss": 2.943545341491699, + "step": 5262, + "token_acc": 0.3068719280348697 + }, + { + "epoch": 3.0850190559953092, + "grad_norm": 0.2681334063986024, + "learning_rate": 0.00019411608213976047, + "loss": 2.9939284324645996, + "step": 5263, + "token_acc": 0.3000466770160275 + }, + { + "epoch": 3.0856053943125183, + "grad_norm": 0.295830919167988, + "learning_rate": 0.00019411280616980921, + "loss": 2.9422836303710938, + "step": 5264, + "token_acc": 0.3080937824270129 + }, + { + "epoch": 3.0861917326297275, + "grad_norm": 0.39210307916236614, + "learning_rate": 0.00019410952931579492, + "loss": 2.9263384342193604, + "step": 5265, + "token_acc": 0.31013316714601974 + }, + { + "epoch": 3.0867780709469366, + "grad_norm": 0.39608985235795396, + "learning_rate": 0.00019410625157774848, + "loss": 2.9281978607177734, + "step": 5266, + "token_acc": 0.30927426400214886 + }, + { + "epoch": 3.0873644092641452, + "grad_norm": 0.295105837637503, + "learning_rate": 0.00019410297295570058, + "loss": 3.008229970932007, + "step": 5267, + "token_acc": 0.29762900387559066 + }, + { + "epoch": 3.0879507475813543, + "grad_norm": 0.27846917577705105, + "learning_rate": 0.00019409969344968208, + "loss": 2.9562511444091797, + "step": 5268, + "token_acc": 0.3051255189539441 + }, + { + "epoch": 3.0885370858985635, + "grad_norm": 0.28678201631484906, + "learning_rate": 0.00019409641305972373, + "loss": 3.0088610649108887, + "step": 5269, + "token_acc": 0.2969082764191819 + }, + { + "epoch": 3.0891234242157726, + "grad_norm": 0.2942488709892208, + "learning_rate": 0.00019409313178585634, + "loss": 2.943333625793457, + "step": 5270, + "token_acc": 0.30531008425944955 + }, + { + "epoch": 3.0897097625329817, + "grad_norm": 0.2937022682363384, + "learning_rate": 0.00019408984962811083, + "loss": 2.89353609085083, + "step": 5271, + "token_acc": 0.311956298105583 + }, + { + "epoch": 3.0902961008501904, + "grad_norm": 0.3114187104421518, + "learning_rate": 0.00019408656658651796, + "loss": 2.9096574783325195, + "step": 5272, + "token_acc": 0.31030414566503106 + }, + { + "epoch": 3.0908824391673995, + "grad_norm": 0.24161713507387406, + "learning_rate": 0.00019408328266110858, + "loss": 2.899527072906494, + "step": 5273, + "token_acc": 0.31258726021095784 + }, + { + "epoch": 3.0914687774846086, + "grad_norm": 0.24702987248540267, + "learning_rate": 0.00019407999785191353, + "loss": 2.9260406494140625, + "step": 5274, + "token_acc": 0.30848256834712817 + }, + { + "epoch": 3.0920551158018177, + "grad_norm": 0.2644247465153701, + "learning_rate": 0.0001940767121589637, + "loss": 2.941545248031616, + "step": 5275, + "token_acc": 0.3052283599310318 + }, + { + "epoch": 3.092641454119027, + "grad_norm": 0.2778480258797623, + "learning_rate": 0.00019407342558228988, + "loss": 2.9225854873657227, + "step": 5276, + "token_acc": 0.30939987369276184 + }, + { + "epoch": 3.093227792436236, + "grad_norm": 0.24224388215503337, + "learning_rate": 0.00019407013812192304, + "loss": 2.959624767303467, + "step": 5277, + "token_acc": 0.3035603087202888 + }, + { + "epoch": 3.0938141307534446, + "grad_norm": 0.2873938600489093, + "learning_rate": 0.00019406684977789395, + "loss": 2.9027814865112305, + "step": 5278, + "token_acc": 0.31272768032281567 + }, + { + "epoch": 3.0944004690706537, + "grad_norm": 0.27584699508693716, + "learning_rate": 0.00019406356055023363, + "loss": 2.9270944595336914, + "step": 5279, + "token_acc": 0.3091854529649046 + }, + { + "epoch": 3.094986807387863, + "grad_norm": 0.28728778175043224, + "learning_rate": 0.00019406027043897286, + "loss": 2.965500831604004, + "step": 5280, + "token_acc": 0.30353420553699184 + }, + { + "epoch": 3.095573145705072, + "grad_norm": 0.31176559324320124, + "learning_rate": 0.00019405697944414264, + "loss": 2.9307422637939453, + "step": 5281, + "token_acc": 0.3094938768073504 + }, + { + "epoch": 3.096159484022281, + "grad_norm": 0.36881157131464337, + "learning_rate": 0.0001940536875657738, + "loss": 2.9457814693450928, + "step": 5282, + "token_acc": 0.306535019931067 + }, + { + "epoch": 3.0967458223394897, + "grad_norm": 0.3551174431255161, + "learning_rate": 0.00019405039480389734, + "loss": 2.9334702491760254, + "step": 5283, + "token_acc": 0.3071493085470508 + }, + { + "epoch": 3.097332160656699, + "grad_norm": 0.3243456682694211, + "learning_rate": 0.00019404710115854417, + "loss": 2.929581642150879, + "step": 5284, + "token_acc": 0.308254501221437 + }, + { + "epoch": 3.097918498973908, + "grad_norm": 0.3730683604152476, + "learning_rate": 0.00019404380662974515, + "loss": 2.9672653675079346, + "step": 5285, + "token_acc": 0.3034021562514886 + }, + { + "epoch": 3.098504837291117, + "grad_norm": 0.4005567694623175, + "learning_rate": 0.00019404051121753134, + "loss": 2.953073024749756, + "step": 5286, + "token_acc": 0.30552695261537505 + }, + { + "epoch": 3.099091175608326, + "grad_norm": 0.29494731724503725, + "learning_rate": 0.00019403721492193364, + "loss": 2.8535995483398438, + "step": 5287, + "token_acc": 0.3194527677448265 + }, + { + "epoch": 3.099677513925535, + "grad_norm": 0.40836372664406084, + "learning_rate": 0.00019403391774298304, + "loss": 2.9541144371032715, + "step": 5288, + "token_acc": 0.305690288430377 + }, + { + "epoch": 3.100263852242744, + "grad_norm": 0.3512534198082233, + "learning_rate": 0.00019403061968071046, + "loss": 2.952315330505371, + "step": 5289, + "token_acc": 0.3046761325219743 + }, + { + "epoch": 3.100850190559953, + "grad_norm": 0.2880404152189814, + "learning_rate": 0.00019402732073514693, + "loss": 2.9204468727111816, + "step": 5290, + "token_acc": 0.30863308811172674 + }, + { + "epoch": 3.101436528877162, + "grad_norm": 0.33061426487576784, + "learning_rate": 0.00019402402090632344, + "loss": 2.92392897605896, + "step": 5291, + "token_acc": 0.3094264198821996 + }, + { + "epoch": 3.1020228671943713, + "grad_norm": 0.3375679762301312, + "learning_rate": 0.00019402072019427094, + "loss": 2.946640968322754, + "step": 5292, + "token_acc": 0.30646521972543905 + }, + { + "epoch": 3.1026092055115804, + "grad_norm": 0.3295037384591374, + "learning_rate": 0.0001940174185990205, + "loss": 2.918299436569214, + "step": 5293, + "token_acc": 0.3092454091707742 + }, + { + "epoch": 3.103195543828789, + "grad_norm": 0.2773453587785817, + "learning_rate": 0.0001940141161206031, + "loss": 2.967357635498047, + "step": 5294, + "token_acc": 0.30174333475687876 + }, + { + "epoch": 3.103781882145998, + "grad_norm": 0.39711589345577736, + "learning_rate": 0.00019401081275904973, + "loss": 2.9420785903930664, + "step": 5295, + "token_acc": 0.3061059802187725 + }, + { + "epoch": 3.1043682204632073, + "grad_norm": 0.2776497907277249, + "learning_rate": 0.00019400750851439148, + "loss": 2.913527250289917, + "step": 5296, + "token_acc": 0.31029434683413004 + }, + { + "epoch": 3.1049545587804164, + "grad_norm": 0.3041669323881054, + "learning_rate": 0.00019400420338665936, + "loss": 2.924229621887207, + "step": 5297, + "token_acc": 0.30887089248648797 + }, + { + "epoch": 3.1055408970976255, + "grad_norm": 0.2812968130014946, + "learning_rate": 0.00019400089737588446, + "loss": 2.918821334838867, + "step": 5298, + "token_acc": 0.31000584099964534 + }, + { + "epoch": 3.106127235414834, + "grad_norm": 0.27556102386814124, + "learning_rate": 0.00019399759048209774, + "loss": 2.9605867862701416, + "step": 5299, + "token_acc": 0.3023844111606472 + }, + { + "epoch": 3.1067135737320433, + "grad_norm": 0.2865140845766617, + "learning_rate": 0.00019399428270533035, + "loss": 2.9104089736938477, + "step": 5300, + "token_acc": 0.3106944126838936 + }, + { + "epoch": 3.1072999120492524, + "grad_norm": 0.30365446503702104, + "learning_rate": 0.00019399097404561332, + "loss": 2.9876503944396973, + "step": 5301, + "token_acc": 0.30132215277205426 + }, + { + "epoch": 3.1078862503664615, + "grad_norm": 0.27622312934625304, + "learning_rate": 0.00019398766450297777, + "loss": 2.948108673095703, + "step": 5302, + "token_acc": 0.30504282026766194 + }, + { + "epoch": 3.1084725886836706, + "grad_norm": 0.29743522738915484, + "learning_rate": 0.0001939843540774547, + "loss": 2.936128616333008, + "step": 5303, + "token_acc": 0.3070857847545153 + }, + { + "epoch": 3.1090589270008797, + "grad_norm": 0.29427151603144447, + "learning_rate": 0.00019398104276907533, + "loss": 2.93275785446167, + "step": 5304, + "token_acc": 0.3068207520869387 + }, + { + "epoch": 3.1096452653180884, + "grad_norm": 0.29719659656441677, + "learning_rate": 0.00019397773057787068, + "loss": 2.957824468612671, + "step": 5305, + "token_acc": 0.30477761629810024 + }, + { + "epoch": 3.1102316036352975, + "grad_norm": 0.3105909186102596, + "learning_rate": 0.00019397441750387188, + "loss": 2.9183998107910156, + "step": 5306, + "token_acc": 0.31048551916654205 + }, + { + "epoch": 3.1108179419525066, + "grad_norm": 0.26957806088464414, + "learning_rate": 0.00019397110354711007, + "loss": 2.9416487216949463, + "step": 5307, + "token_acc": 0.30642212201210056 + }, + { + "epoch": 3.1114042802697157, + "grad_norm": 0.32748789515052046, + "learning_rate": 0.00019396778870761638, + "loss": 2.9190382957458496, + "step": 5308, + "token_acc": 0.3102018203403245 + }, + { + "epoch": 3.111990618586925, + "grad_norm": 0.2565018977444655, + "learning_rate": 0.00019396447298542193, + "loss": 2.9649786949157715, + "step": 5309, + "token_acc": 0.30458631912135414 + }, + { + "epoch": 3.1125769569041335, + "grad_norm": 0.2836750798123025, + "learning_rate": 0.0001939611563805579, + "loss": 2.9511170387268066, + "step": 5310, + "token_acc": 0.30518066818034384 + }, + { + "epoch": 3.1131632952213426, + "grad_norm": 0.3176591172033794, + "learning_rate": 0.0001939578388930554, + "loss": 2.935601234436035, + "step": 5311, + "token_acc": 0.3084529149768539 + }, + { + "epoch": 3.1137496335385517, + "grad_norm": 0.30908198506023205, + "learning_rate": 0.0001939545205229456, + "loss": 2.885695457458496, + "step": 5312, + "token_acc": 0.31453876770135825 + }, + { + "epoch": 3.114335971855761, + "grad_norm": 0.26903683298519443, + "learning_rate": 0.0001939512012702597, + "loss": 2.9344825744628906, + "step": 5313, + "token_acc": 0.3069585598263705 + }, + { + "epoch": 3.11492231017297, + "grad_norm": 0.31262387201186514, + "learning_rate": 0.00019394788113502885, + "loss": 2.9133381843566895, + "step": 5314, + "token_acc": 0.3097518346519134 + }, + { + "epoch": 3.115508648490179, + "grad_norm": 0.2730652191570408, + "learning_rate": 0.00019394456011728424, + "loss": 2.937570095062256, + "step": 5315, + "token_acc": 0.30798466211323156 + }, + { + "epoch": 3.1160949868073877, + "grad_norm": 0.3339072777217726, + "learning_rate": 0.00019394123821705713, + "loss": 2.9239795207977295, + "step": 5316, + "token_acc": 0.30889251532197126 + }, + { + "epoch": 3.116681325124597, + "grad_norm": 0.34983158565428213, + "learning_rate": 0.00019393791543437865, + "loss": 2.9083948135375977, + "step": 5317, + "token_acc": 0.3097774490963108 + }, + { + "epoch": 3.117267663441806, + "grad_norm": 0.2671857251093449, + "learning_rate": 0.00019393459176928003, + "loss": 2.943286418914795, + "step": 5318, + "token_acc": 0.30715212969304206 + }, + { + "epoch": 3.117854001759015, + "grad_norm": 0.33354673698416265, + "learning_rate": 0.0001939312672217925, + "loss": 2.9155914783477783, + "step": 5319, + "token_acc": 0.3104030969644832 + }, + { + "epoch": 3.118440340076224, + "grad_norm": 0.282534874042418, + "learning_rate": 0.0001939279417919473, + "loss": 2.86663556098938, + "step": 5320, + "token_acc": 0.31875527837328654 + }, + { + "epoch": 3.119026678393433, + "grad_norm": 0.31113542467319527, + "learning_rate": 0.00019392461547977562, + "loss": 2.9302291870117188, + "step": 5321, + "token_acc": 0.30801729193812477 + }, + { + "epoch": 3.119613016710642, + "grad_norm": 0.3273603579807016, + "learning_rate": 0.00019392128828530877, + "loss": 2.9027786254882812, + "step": 5322, + "token_acc": 0.3111974730943965 + }, + { + "epoch": 3.120199355027851, + "grad_norm": 0.31017293100333126, + "learning_rate": 0.00019391796020857798, + "loss": 2.9264841079711914, + "step": 5323, + "token_acc": 0.30917253100424485 + }, + { + "epoch": 3.12078569334506, + "grad_norm": 0.2936857303423182, + "learning_rate": 0.0001939146312496145, + "loss": 2.8960013389587402, + "step": 5324, + "token_acc": 0.31130930254925254 + }, + { + "epoch": 3.1213720316622693, + "grad_norm": 0.3279480394304328, + "learning_rate": 0.0001939113014084496, + "loss": 2.889918804168701, + "step": 5325, + "token_acc": 0.3125660559002961 + }, + { + "epoch": 3.121958369979478, + "grad_norm": 0.34142747123428907, + "learning_rate": 0.00019390797068511462, + "loss": 2.928982734680176, + "step": 5326, + "token_acc": 0.3078903750530008 + }, + { + "epoch": 3.122544708296687, + "grad_norm": 0.30541291499605416, + "learning_rate": 0.00019390463907964075, + "loss": 2.9199109077453613, + "step": 5327, + "token_acc": 0.3091409535881333 + }, + { + "epoch": 3.123131046613896, + "grad_norm": 0.27028448814851974, + "learning_rate": 0.00019390130659205937, + "loss": 2.904547691345215, + "step": 5328, + "token_acc": 0.31227825020908767 + }, + { + "epoch": 3.1237173849311053, + "grad_norm": 0.3175142317651791, + "learning_rate": 0.00019389797322240173, + "loss": 2.9465131759643555, + "step": 5329, + "token_acc": 0.3052346904946499 + }, + { + "epoch": 3.1243037232483144, + "grad_norm": 0.3003521630194517, + "learning_rate": 0.00019389463897069912, + "loss": 2.9653706550598145, + "step": 5330, + "token_acc": 0.30306370957697437 + }, + { + "epoch": 3.1248900615655235, + "grad_norm": 0.28948798864015673, + "learning_rate": 0.00019389130383698296, + "loss": 2.9111251831054688, + "step": 5331, + "token_acc": 0.31135627241604175 + }, + { + "epoch": 3.125476399882732, + "grad_norm": 0.2986607780908537, + "learning_rate": 0.00019388796782128452, + "loss": 2.9010655879974365, + "step": 5332, + "token_acc": 0.31159625016052395 + }, + { + "epoch": 3.1260627381999413, + "grad_norm": 0.3239253619905989, + "learning_rate": 0.0001938846309236351, + "loss": 2.9301304817199707, + "step": 5333, + "token_acc": 0.3074228903646658 + }, + { + "epoch": 3.1266490765171504, + "grad_norm": 0.350882769602478, + "learning_rate": 0.00019388129314406612, + "loss": 2.928159475326538, + "step": 5334, + "token_acc": 0.3082110160666083 + }, + { + "epoch": 3.1272354148343595, + "grad_norm": 0.30329498553723977, + "learning_rate": 0.00019387795448260885, + "loss": 2.968932867050171, + "step": 5335, + "token_acc": 0.30453604115171357 + }, + { + "epoch": 3.1278217531515686, + "grad_norm": 0.31103423677324843, + "learning_rate": 0.00019387461493929476, + "loss": 2.920726776123047, + "step": 5336, + "token_acc": 0.3084453834820862 + }, + { + "epoch": 3.1284080914687773, + "grad_norm": 0.31044657187543334, + "learning_rate": 0.0001938712745141551, + "loss": 2.927861213684082, + "step": 5337, + "token_acc": 0.30866695903497327 + }, + { + "epoch": 3.1289944297859864, + "grad_norm": 0.28145881657673516, + "learning_rate": 0.00019386793320722134, + "loss": 2.952730417251587, + "step": 5338, + "token_acc": 0.30477910573533795 + }, + { + "epoch": 3.1295807681031955, + "grad_norm": 0.3199176917717775, + "learning_rate": 0.00019386459101852484, + "loss": 2.94476318359375, + "step": 5339, + "token_acc": 0.30487715517826275 + }, + { + "epoch": 3.1301671064204046, + "grad_norm": 0.31147484197388536, + "learning_rate": 0.00019386124794809698, + "loss": 2.9064829349517822, + "step": 5340, + "token_acc": 0.312238571458904 + }, + { + "epoch": 3.1307534447376137, + "grad_norm": 0.36301557524582373, + "learning_rate": 0.0001938579039959692, + "loss": 2.919233560562134, + "step": 5341, + "token_acc": 0.3094864253814416 + }, + { + "epoch": 3.1313397830548224, + "grad_norm": 0.4599309404358186, + "learning_rate": 0.00019385455916217287, + "loss": 2.9691269397735596, + "step": 5342, + "token_acc": 0.30298467578650934 + }, + { + "epoch": 3.1319261213720315, + "grad_norm": 0.49096570313464677, + "learning_rate": 0.0001938512134467394, + "loss": 2.9045820236206055, + "step": 5343, + "token_acc": 0.3110213366184494 + }, + { + "epoch": 3.1325124596892406, + "grad_norm": 0.3421944630402371, + "learning_rate": 0.00019384786684970029, + "loss": 2.9322409629821777, + "step": 5344, + "token_acc": 0.30842541079052915 + }, + { + "epoch": 3.1330987980064497, + "grad_norm": 0.33629500292160724, + "learning_rate": 0.0001938445193710869, + "loss": 2.9425601959228516, + "step": 5345, + "token_acc": 0.30608184942847094 + }, + { + "epoch": 3.133685136323659, + "grad_norm": 0.3896565098209385, + "learning_rate": 0.00019384117101093072, + "loss": 2.9608845710754395, + "step": 5346, + "token_acc": 0.3027661229643044 + }, + { + "epoch": 3.134271474640868, + "grad_norm": 0.28891675514816956, + "learning_rate": 0.00019383782176926321, + "loss": 2.9589552879333496, + "step": 5347, + "token_acc": 0.3032106503847205 + }, + { + "epoch": 3.1348578129580766, + "grad_norm": 0.3149987301232355, + "learning_rate": 0.0001938344716461158, + "loss": 2.9914369583129883, + "step": 5348, + "token_acc": 0.2991584495511558 + }, + { + "epoch": 3.1354441512752858, + "grad_norm": 0.2926856937917814, + "learning_rate": 0.00019383112064151996, + "loss": 2.935220718383789, + "step": 5349, + "token_acc": 0.3078945046732852 + }, + { + "epoch": 3.136030489592495, + "grad_norm": 0.34515804976702014, + "learning_rate": 0.00019382776875550718, + "loss": 2.9218838214874268, + "step": 5350, + "token_acc": 0.3100083287799133 + }, + { + "epoch": 3.136616827909704, + "grad_norm": 0.2812203941084217, + "learning_rate": 0.00019382441598810894, + "loss": 2.8866593837738037, + "step": 5351, + "token_acc": 0.3134866353654479 + }, + { + "epoch": 3.137203166226913, + "grad_norm": 0.32565338480113465, + "learning_rate": 0.00019382106233935677, + "loss": 2.9165635108947754, + "step": 5352, + "token_acc": 0.3096304347268826 + }, + { + "epoch": 3.1377895045441218, + "grad_norm": 0.2578356065473407, + "learning_rate": 0.00019381770780928212, + "loss": 2.951174736022949, + "step": 5353, + "token_acc": 0.30529785025627854 + }, + { + "epoch": 3.138375842861331, + "grad_norm": 0.29404377190990555, + "learning_rate": 0.00019381435239791656, + "loss": 2.951747417449951, + "step": 5354, + "token_acc": 0.3045936322717008 + }, + { + "epoch": 3.13896218117854, + "grad_norm": 0.2888957756054559, + "learning_rate": 0.00019381099610529153, + "loss": 2.9502921104431152, + "step": 5355, + "token_acc": 0.3047815599726547 + }, + { + "epoch": 3.139548519495749, + "grad_norm": 0.3260797383022326, + "learning_rate": 0.00019380763893143862, + "loss": 2.9583778381347656, + "step": 5356, + "token_acc": 0.3054860186418109 + }, + { + "epoch": 3.140134857812958, + "grad_norm": 0.2999891889790078, + "learning_rate": 0.00019380428087638937, + "loss": 2.949016809463501, + "step": 5357, + "token_acc": 0.30712072511875765 + }, + { + "epoch": 3.1407211961301673, + "grad_norm": 0.2616016911648799, + "learning_rate": 0.0001938009219401753, + "loss": 2.940258026123047, + "step": 5358, + "token_acc": 0.3056002603471465 + }, + { + "epoch": 3.141307534447376, + "grad_norm": 0.2808921271452929, + "learning_rate": 0.00019379756212282797, + "loss": 2.9346680641174316, + "step": 5359, + "token_acc": 0.30609792882834697 + }, + { + "epoch": 3.141893872764585, + "grad_norm": 0.24868411859685202, + "learning_rate": 0.0001937942014243789, + "loss": 2.9106290340423584, + "step": 5360, + "token_acc": 0.3103786107072878 + }, + { + "epoch": 3.142480211081794, + "grad_norm": 0.27297900191098723, + "learning_rate": 0.00019379083984485973, + "loss": 2.9510788917541504, + "step": 5361, + "token_acc": 0.3051438491312532 + }, + { + "epoch": 3.1430665493990033, + "grad_norm": 0.2867743444581692, + "learning_rate": 0.000193787477384302, + "loss": 2.935410976409912, + "step": 5362, + "token_acc": 0.3080153247304243 + }, + { + "epoch": 3.1436528877162124, + "grad_norm": 0.26065456018674693, + "learning_rate": 0.00019378411404273732, + "loss": 2.945131301879883, + "step": 5363, + "token_acc": 0.30903977104510016 + }, + { + "epoch": 3.144239226033421, + "grad_norm": 0.2660856830950308, + "learning_rate": 0.0001937807498201972, + "loss": 2.9169299602508545, + "step": 5364, + "token_acc": 0.3121596855514387 + }, + { + "epoch": 3.14482556435063, + "grad_norm": 0.29784719047378566, + "learning_rate": 0.00019377738471671336, + "loss": 2.95880126953125, + "step": 5365, + "token_acc": 0.3025050587023027 + }, + { + "epoch": 3.1454119026678393, + "grad_norm": 0.289038301231129, + "learning_rate": 0.00019377401873231734, + "loss": 2.936163902282715, + "step": 5366, + "token_acc": 0.3070291624876544 + }, + { + "epoch": 3.1459982409850484, + "grad_norm": 0.2672568109746362, + "learning_rate": 0.0001937706518670408, + "loss": 2.9431324005126953, + "step": 5367, + "token_acc": 0.30522628533373186 + }, + { + "epoch": 3.1465845793022575, + "grad_norm": 0.2556023038534498, + "learning_rate": 0.00019376728412091532, + "loss": 2.9482779502868652, + "step": 5368, + "token_acc": 0.3041297623071148 + }, + { + "epoch": 3.1471709176194667, + "grad_norm": 0.33757198042767206, + "learning_rate": 0.00019376391549397255, + "loss": 2.917387008666992, + "step": 5369, + "token_acc": 0.31004216550157887 + }, + { + "epoch": 3.1477572559366753, + "grad_norm": 0.3010495931187666, + "learning_rate": 0.00019376054598624416, + "loss": 2.949450731277466, + "step": 5370, + "token_acc": 0.30425278012277357 + }, + { + "epoch": 3.1483435942538844, + "grad_norm": 0.3096502416314198, + "learning_rate": 0.00019375717559776178, + "loss": 2.8806324005126953, + "step": 5371, + "token_acc": 0.3144549379384781 + }, + { + "epoch": 3.1489299325710935, + "grad_norm": 0.3037628555209738, + "learning_rate": 0.00019375380432855709, + "loss": 2.959415912628174, + "step": 5372, + "token_acc": 0.3040190737467887 + }, + { + "epoch": 3.1495162708883027, + "grad_norm": 0.36340861540468805, + "learning_rate": 0.00019375043217866172, + "loss": 2.9353561401367188, + "step": 5373, + "token_acc": 0.30908735409289845 + }, + { + "epoch": 3.1501026092055118, + "grad_norm": 0.3036538922365206, + "learning_rate": 0.00019374705914810736, + "loss": 2.925947666168213, + "step": 5374, + "token_acc": 0.30884161618002093 + }, + { + "epoch": 3.1506889475227204, + "grad_norm": 0.3001600341269259, + "learning_rate": 0.00019374368523692573, + "loss": 2.9768333435058594, + "step": 5375, + "token_acc": 0.3018817971278239 + }, + { + "epoch": 3.1512752858399296, + "grad_norm": 0.37424349002926516, + "learning_rate": 0.00019374031044514848, + "loss": 2.9330921173095703, + "step": 5376, + "token_acc": 0.3070908457242889 + }, + { + "epoch": 3.1518616241571387, + "grad_norm": 0.3068749590060571, + "learning_rate": 0.00019373693477280735, + "loss": 2.958658218383789, + "step": 5377, + "token_acc": 0.3038902251574628 + }, + { + "epoch": 3.1524479624743478, + "grad_norm": 0.27317204923169974, + "learning_rate": 0.00019373355821993403, + "loss": 2.929501533508301, + "step": 5378, + "token_acc": 0.3080402243160217 + }, + { + "epoch": 3.153034300791557, + "grad_norm": 0.3556376693671732, + "learning_rate": 0.00019373018078656023, + "loss": 2.9665989875793457, + "step": 5379, + "token_acc": 0.3017731480313769 + }, + { + "epoch": 3.1536206391087656, + "grad_norm": 0.3254599522710315, + "learning_rate": 0.00019372680247271767, + "loss": 2.9713268280029297, + "step": 5380, + "token_acc": 0.30193895805851206 + }, + { + "epoch": 3.1542069774259747, + "grad_norm": 0.30618231650994454, + "learning_rate": 0.0001937234232784381, + "loss": 2.9442286491394043, + "step": 5381, + "token_acc": 0.30383559193636633 + }, + { + "epoch": 3.154793315743184, + "grad_norm": 0.31758797844402153, + "learning_rate": 0.00019372004320375327, + "loss": 2.9296531677246094, + "step": 5382, + "token_acc": 0.30862708957325347 + }, + { + "epoch": 3.155379654060393, + "grad_norm": 0.2976222161357982, + "learning_rate": 0.00019371666224869493, + "loss": 2.9181623458862305, + "step": 5383, + "token_acc": 0.30918101008406884 + }, + { + "epoch": 3.155965992377602, + "grad_norm": 0.2943687092180571, + "learning_rate": 0.00019371328041329483, + "loss": 2.971268653869629, + "step": 5384, + "token_acc": 0.30291594653305215 + }, + { + "epoch": 3.1565523306948107, + "grad_norm": 0.3427956489913068, + "learning_rate": 0.00019370989769758478, + "loss": 2.9433507919311523, + "step": 5385, + "token_acc": 0.3054969363270203 + }, + { + "epoch": 3.15713866901202, + "grad_norm": 0.31929296213015407, + "learning_rate": 0.00019370651410159645, + "loss": 2.984891176223755, + "step": 5386, + "token_acc": 0.30059255618361047 + }, + { + "epoch": 3.157725007329229, + "grad_norm": 0.2930381641512369, + "learning_rate": 0.0001937031296253617, + "loss": 2.939363956451416, + "step": 5387, + "token_acc": 0.3052507541003419 + }, + { + "epoch": 3.158311345646438, + "grad_norm": 0.32795796501839464, + "learning_rate": 0.00019369974426891235, + "loss": 2.949565887451172, + "step": 5388, + "token_acc": 0.307638846920892 + }, + { + "epoch": 3.158897683963647, + "grad_norm": 0.3458212580688221, + "learning_rate": 0.00019369635803228016, + "loss": 2.975759983062744, + "step": 5389, + "token_acc": 0.3014857125182363 + }, + { + "epoch": 3.1594840222808562, + "grad_norm": 0.2683708646001357, + "learning_rate": 0.00019369297091549693, + "loss": 2.9430699348449707, + "step": 5390, + "token_acc": 0.30690886841196957 + }, + { + "epoch": 3.160070360598065, + "grad_norm": 0.2992389306741696, + "learning_rate": 0.00019368958291859448, + "loss": 2.961652994155884, + "step": 5391, + "token_acc": 0.3039216947939804 + }, + { + "epoch": 3.160656698915274, + "grad_norm": 0.26411189200969504, + "learning_rate": 0.00019368619404160466, + "loss": 2.9590036869049072, + "step": 5392, + "token_acc": 0.30535378623386494 + }, + { + "epoch": 3.161243037232483, + "grad_norm": 0.2975888052416875, + "learning_rate": 0.0001936828042845593, + "loss": 2.9536237716674805, + "step": 5393, + "token_acc": 0.3039029875145867 + }, + { + "epoch": 3.1618293755496922, + "grad_norm": 0.28949195044322307, + "learning_rate": 0.00019367941364749022, + "loss": 2.9468026161193848, + "step": 5394, + "token_acc": 0.3053276191513247 + }, + { + "epoch": 3.1624157138669013, + "grad_norm": 0.2828267080384573, + "learning_rate": 0.0001936760221304293, + "loss": 2.9120049476623535, + "step": 5395, + "token_acc": 0.31054327341992743 + }, + { + "epoch": 3.16300205218411, + "grad_norm": 0.27990839248494787, + "learning_rate": 0.00019367262973340833, + "loss": 2.9396982192993164, + "step": 5396, + "token_acc": 0.3061681808332408 + }, + { + "epoch": 3.163588390501319, + "grad_norm": 0.2715808957041378, + "learning_rate": 0.00019366923645645928, + "loss": 2.962836503982544, + "step": 5397, + "token_acc": 0.3031536113936928 + }, + { + "epoch": 3.1641747288185282, + "grad_norm": 0.2753853877873122, + "learning_rate": 0.00019366584229961396, + "loss": 2.9541263580322266, + "step": 5398, + "token_acc": 0.30370695171351875 + }, + { + "epoch": 3.1647610671357373, + "grad_norm": 0.2518873068212432, + "learning_rate": 0.00019366244726290427, + "loss": 2.9393396377563477, + "step": 5399, + "token_acc": 0.3072903921500082 + }, + { + "epoch": 3.1653474054529465, + "grad_norm": 0.24005377521421326, + "learning_rate": 0.0001936590513463621, + "loss": 2.9471359252929688, + "step": 5400, + "token_acc": 0.3066001002762932 + }, + { + "epoch": 3.1659337437701556, + "grad_norm": 0.2333059238297225, + "learning_rate": 0.00019365565455001934, + "loss": 2.9684319496154785, + "step": 5401, + "token_acc": 0.3024221663848032 + }, + { + "epoch": 3.1665200820873642, + "grad_norm": 0.250323384350809, + "learning_rate": 0.00019365225687390794, + "loss": 2.9609391689300537, + "step": 5402, + "token_acc": 0.30253484091506705 + }, + { + "epoch": 3.1671064204045734, + "grad_norm": 0.25459963632168203, + "learning_rate": 0.00019364885831805973, + "loss": 2.934001922607422, + "step": 5403, + "token_acc": 0.30783589139026807 + }, + { + "epoch": 3.1676927587217825, + "grad_norm": 0.25809724109803706, + "learning_rate": 0.00019364545888250675, + "loss": 2.949033260345459, + "step": 5404, + "token_acc": 0.3048904295498301 + }, + { + "epoch": 3.1682790970389916, + "grad_norm": 0.26172141249526054, + "learning_rate": 0.00019364205856728083, + "loss": 2.919278621673584, + "step": 5405, + "token_acc": 0.31015854098386364 + }, + { + "epoch": 3.1688654353562007, + "grad_norm": 0.28186947948424884, + "learning_rate": 0.00019363865737241398, + "loss": 2.9443674087524414, + "step": 5406, + "token_acc": 0.3080535616485473 + }, + { + "epoch": 3.1694517736734094, + "grad_norm": 0.25873923019452383, + "learning_rate": 0.00019363525529793812, + "loss": 2.9504218101501465, + "step": 5407, + "token_acc": 0.30423842246158384 + }, + { + "epoch": 3.1700381119906185, + "grad_norm": 0.2633502020229431, + "learning_rate": 0.00019363185234388519, + "loss": 2.9494967460632324, + "step": 5408, + "token_acc": 0.3055652556292761 + }, + { + "epoch": 3.1706244503078276, + "grad_norm": 0.28583949519563906, + "learning_rate": 0.00019362844851028717, + "loss": 2.9417672157287598, + "step": 5409, + "token_acc": 0.30632624447355494 + }, + { + "epoch": 3.1712107886250367, + "grad_norm": 0.3569574017227077, + "learning_rate": 0.0001936250437971761, + "loss": 2.962395668029785, + "step": 5410, + "token_acc": 0.3029179884248003 + }, + { + "epoch": 3.171797126942246, + "grad_norm": 0.4449873235306184, + "learning_rate": 0.00019362163820458385, + "loss": 2.980628252029419, + "step": 5411, + "token_acc": 0.29927414701488914 + }, + { + "epoch": 3.172383465259455, + "grad_norm": 0.41960741424517234, + "learning_rate": 0.00019361823173254247, + "loss": 2.945840835571289, + "step": 5412, + "token_acc": 0.3062181400334698 + }, + { + "epoch": 3.1729698035766636, + "grad_norm": 0.29673124927088296, + "learning_rate": 0.00019361482438108398, + "loss": 2.933629035949707, + "step": 5413, + "token_acc": 0.30711911223883476 + }, + { + "epoch": 3.1735561418938727, + "grad_norm": 0.39812278897062275, + "learning_rate": 0.00019361141615024035, + "loss": 2.9264397621154785, + "step": 5414, + "token_acc": 0.30798003720154904 + }, + { + "epoch": 3.174142480211082, + "grad_norm": 0.34948643822917974, + "learning_rate": 0.00019360800704004363, + "loss": 2.964817523956299, + "step": 5415, + "token_acc": 0.30409428362158947 + }, + { + "epoch": 3.174728818528291, + "grad_norm": 0.2971428397046301, + "learning_rate": 0.00019360459705052577, + "loss": 2.9055821895599365, + "step": 5416, + "token_acc": 0.3122780038729587 + }, + { + "epoch": 3.1753151568455, + "grad_norm": 0.28491817616925313, + "learning_rate": 0.0001936011861817189, + "loss": 2.9157862663269043, + "step": 5417, + "token_acc": 0.31049192040177354 + }, + { + "epoch": 3.1759014951627087, + "grad_norm": 0.3064024195188568, + "learning_rate": 0.000193597774433655, + "loss": 2.9807801246643066, + "step": 5418, + "token_acc": 0.3022662889518414 + }, + { + "epoch": 3.176487833479918, + "grad_norm": 0.2823644680375918, + "learning_rate": 0.00019359436180636612, + "loss": 2.9004621505737305, + "step": 5419, + "token_acc": 0.31279571528072436 + }, + { + "epoch": 3.177074171797127, + "grad_norm": 0.28848919001436, + "learning_rate": 0.00019359094829988436, + "loss": 2.9565320014953613, + "step": 5420, + "token_acc": 0.3040721750381949 + }, + { + "epoch": 3.177660510114336, + "grad_norm": 0.31960333194769136, + "learning_rate": 0.00019358753391424176, + "loss": 2.9562735557556152, + "step": 5421, + "token_acc": 0.302570677441496 + }, + { + "epoch": 3.178246848431545, + "grad_norm": 0.31295974426585066, + "learning_rate": 0.00019358411864947036, + "loss": 2.924454689025879, + "step": 5422, + "token_acc": 0.3079727940136544 + }, + { + "epoch": 3.1788331867487543, + "grad_norm": 0.33653835342619737, + "learning_rate": 0.00019358070250560227, + "loss": 2.949678897857666, + "step": 5423, + "token_acc": 0.3048583317872194 + }, + { + "epoch": 3.179419525065963, + "grad_norm": 0.34397649896274735, + "learning_rate": 0.0001935772854826696, + "loss": 2.9417948722839355, + "step": 5424, + "token_acc": 0.3051552724914744 + }, + { + "epoch": 3.180005863383172, + "grad_norm": 0.27835006280904545, + "learning_rate": 0.00019357386758070443, + "loss": 2.971139907836914, + "step": 5425, + "token_acc": 0.30102860900332956 + }, + { + "epoch": 3.180592201700381, + "grad_norm": 0.2689938583651941, + "learning_rate": 0.00019357044879973886, + "loss": 2.9225621223449707, + "step": 5426, + "token_acc": 0.30885179937957874 + }, + { + "epoch": 3.1811785400175903, + "grad_norm": 0.30389340963963807, + "learning_rate": 0.00019356702913980503, + "loss": 2.9366111755371094, + "step": 5427, + "token_acc": 0.3065503243399579 + }, + { + "epoch": 3.1817648783347994, + "grad_norm": 0.3234591281125431, + "learning_rate": 0.000193563608600935, + "loss": 2.969336986541748, + "step": 5428, + "token_acc": 0.30234184387026963 + }, + { + "epoch": 3.182351216652008, + "grad_norm": 0.27777468741234285, + "learning_rate": 0.000193560187183161, + "loss": 2.9282541275024414, + "step": 5429, + "token_acc": 0.3085211440528681 + }, + { + "epoch": 3.182937554969217, + "grad_norm": 0.34837767627990823, + "learning_rate": 0.00019355676488651508, + "loss": 2.967630386352539, + "step": 5430, + "token_acc": 0.3030258835088581 + }, + { + "epoch": 3.1835238932864263, + "grad_norm": 0.33739995511290216, + "learning_rate": 0.0001935533417110294, + "loss": 2.9137308597564697, + "step": 5431, + "token_acc": 0.3110948905109489 + }, + { + "epoch": 3.1841102316036354, + "grad_norm": 0.28000891194144556, + "learning_rate": 0.0001935499176567362, + "loss": 2.9572746753692627, + "step": 5432, + "token_acc": 0.30369068806750527 + }, + { + "epoch": 3.1846965699208445, + "grad_norm": 0.3381436665561972, + "learning_rate": 0.00019354649272366754, + "loss": 2.929831027984619, + "step": 5433, + "token_acc": 0.3072453410600399 + }, + { + "epoch": 3.185282908238053, + "grad_norm": 0.3724357762112872, + "learning_rate": 0.00019354306691185565, + "loss": 2.9519662857055664, + "step": 5434, + "token_acc": 0.30512366903141075 + }, + { + "epoch": 3.1858692465552623, + "grad_norm": 0.2898397164437663, + "learning_rate": 0.0001935396402213327, + "loss": 2.928469657897949, + "step": 5435, + "token_acc": 0.30796954004186516 + }, + { + "epoch": 3.1864555848724714, + "grad_norm": 0.3085488336756759, + "learning_rate": 0.00019353621265213086, + "loss": 2.946547508239746, + "step": 5436, + "token_acc": 0.30662347463376227 + }, + { + "epoch": 3.1870419231896805, + "grad_norm": 0.3225034949415428, + "learning_rate": 0.00019353278420428235, + "loss": 2.9231815338134766, + "step": 5437, + "token_acc": 0.30875465644581296 + }, + { + "epoch": 3.1876282615068896, + "grad_norm": 0.27385182708442546, + "learning_rate": 0.00019352935487781942, + "loss": 2.9150285720825195, + "step": 5438, + "token_acc": 0.310208522496008 + }, + { + "epoch": 3.1882145998240983, + "grad_norm": 0.30777369736773025, + "learning_rate": 0.00019352592467277417, + "loss": 2.935791015625, + "step": 5439, + "token_acc": 0.3061763078363768 + }, + { + "epoch": 3.1888009381413074, + "grad_norm": 0.29001024259253677, + "learning_rate": 0.0001935224935891789, + "loss": 2.973250150680542, + "step": 5440, + "token_acc": 0.3031434158182186 + }, + { + "epoch": 3.1893872764585165, + "grad_norm": 0.29514472722988533, + "learning_rate": 0.00019351906162706582, + "loss": 2.959594249725342, + "step": 5441, + "token_acc": 0.3032567898483172 + }, + { + "epoch": 3.1899736147757256, + "grad_norm": 0.2751018755807374, + "learning_rate": 0.00019351562878646718, + "loss": 2.9430789947509766, + "step": 5442, + "token_acc": 0.3068423869413968 + }, + { + "epoch": 3.1905599530929347, + "grad_norm": 0.298249531995445, + "learning_rate": 0.00019351219506741525, + "loss": 2.9636337757110596, + "step": 5443, + "token_acc": 0.3028956503350062 + }, + { + "epoch": 3.191146291410144, + "grad_norm": 0.27117408091130357, + "learning_rate": 0.00019350876046994223, + "loss": 2.9704248905181885, + "step": 5444, + "token_acc": 0.3038214350970387 + }, + { + "epoch": 3.1917326297273525, + "grad_norm": 0.31475536959904754, + "learning_rate": 0.0001935053249940804, + "loss": 2.9319753646850586, + "step": 5445, + "token_acc": 0.30702200064567686 + }, + { + "epoch": 3.1923189680445616, + "grad_norm": 0.32835287632516824, + "learning_rate": 0.00019350188863986208, + "loss": 2.9596691131591797, + "step": 5446, + "token_acc": 0.30352112303846174 + }, + { + "epoch": 3.1929053063617707, + "grad_norm": 0.29828913908736276, + "learning_rate": 0.0001934984514073195, + "loss": 2.9990763664245605, + "step": 5447, + "token_acc": 0.2982541814186302 + }, + { + "epoch": 3.19349164467898, + "grad_norm": 0.2849555689360862, + "learning_rate": 0.00019349501329648492, + "loss": 2.9852724075317383, + "step": 5448, + "token_acc": 0.30089769446751297 + }, + { + "epoch": 3.194077982996189, + "grad_norm": 0.28346145087374514, + "learning_rate": 0.00019349157430739071, + "loss": 2.926243305206299, + "step": 5449, + "token_acc": 0.30862306631949765 + }, + { + "epoch": 3.1946643213133976, + "grad_norm": 0.24426226158845873, + "learning_rate": 0.00019348813444006915, + "loss": 2.930239677429199, + "step": 5450, + "token_acc": 0.3079855419337792 + }, + { + "epoch": 3.1952506596306067, + "grad_norm": 0.2553827354932598, + "learning_rate": 0.00019348469369455252, + "loss": 2.992568254470825, + "step": 5451, + "token_acc": 0.30089702620006326 + }, + { + "epoch": 3.195836997947816, + "grad_norm": 0.27511896233710137, + "learning_rate": 0.00019348125207087317, + "loss": 2.9348673820495605, + "step": 5452, + "token_acc": 0.307259675292545 + }, + { + "epoch": 3.196423336265025, + "grad_norm": 0.2824181800161575, + "learning_rate": 0.00019347780956906343, + "loss": 2.8963232040405273, + "step": 5453, + "token_acc": 0.3111624003357113 + }, + { + "epoch": 3.197009674582234, + "grad_norm": 0.2994826572413537, + "learning_rate": 0.00019347436618915562, + "loss": 2.9110381603240967, + "step": 5454, + "token_acc": 0.31093106267678866 + }, + { + "epoch": 3.197596012899443, + "grad_norm": 0.282598293275501, + "learning_rate": 0.00019347092193118212, + "loss": 2.935211420059204, + "step": 5455, + "token_acc": 0.30886008781820834 + }, + { + "epoch": 3.198182351216652, + "grad_norm": 0.29627676008012005, + "learning_rate": 0.00019346747679517524, + "loss": 2.931037187576294, + "step": 5456, + "token_acc": 0.3095992002847154 + }, + { + "epoch": 3.198768689533861, + "grad_norm": 0.30596325176166284, + "learning_rate": 0.00019346403078116737, + "loss": 2.922698497772217, + "step": 5457, + "token_acc": 0.30839446493195 + }, + { + "epoch": 3.19935502785107, + "grad_norm": 0.29713075964557495, + "learning_rate": 0.00019346058388919088, + "loss": 2.9282379150390625, + "step": 5458, + "token_acc": 0.3065554992207012 + }, + { + "epoch": 3.199941366168279, + "grad_norm": 0.2565061554649424, + "learning_rate": 0.00019345713611927816, + "loss": 2.8797237873077393, + "step": 5459, + "token_acc": 0.317002777034522 + }, + { + "epoch": 3.2005277044854883, + "grad_norm": 0.29439983110843776, + "learning_rate": 0.00019345368747146155, + "loss": 2.9747731685638428, + "step": 5460, + "token_acc": 0.3022374137323714 + }, + { + "epoch": 3.201114042802697, + "grad_norm": 0.3150538379039204, + "learning_rate": 0.00019345023794577348, + "loss": 2.937020778656006, + "step": 5461, + "token_acc": 0.30632220503957064 + }, + { + "epoch": 3.201700381119906, + "grad_norm": 0.2970739906773659, + "learning_rate": 0.00019344678754224637, + "loss": 2.9328978061676025, + "step": 5462, + "token_acc": 0.3067275675018051 + }, + { + "epoch": 3.202286719437115, + "grad_norm": 0.27600595667268685, + "learning_rate": 0.0001934433362609126, + "loss": 2.9669508934020996, + "step": 5463, + "token_acc": 0.30264190784093303 + }, + { + "epoch": 3.2028730577543243, + "grad_norm": 0.33063274040076696, + "learning_rate": 0.0001934398841018046, + "loss": 2.9089369773864746, + "step": 5464, + "token_acc": 0.31372093084015457 + }, + { + "epoch": 3.2034593960715334, + "grad_norm": 0.4093476264669585, + "learning_rate": 0.00019343643106495482, + "loss": 2.9509329795837402, + "step": 5465, + "token_acc": 0.30470074351987514 + }, + { + "epoch": 3.2040457343887425, + "grad_norm": 0.3553618297293979, + "learning_rate": 0.00019343297715039565, + "loss": 2.9705023765563965, + "step": 5466, + "token_acc": 0.3017262565352249 + }, + { + "epoch": 3.204632072705951, + "grad_norm": 0.3107527204189969, + "learning_rate": 0.00019342952235815958, + "loss": 2.9682536125183105, + "step": 5467, + "token_acc": 0.30191107443132 + }, + { + "epoch": 3.2052184110231603, + "grad_norm": 0.3412864289018294, + "learning_rate": 0.00019342606668827905, + "loss": 2.9663305282592773, + "step": 5468, + "token_acc": 0.30306588424158276 + }, + { + "epoch": 3.2058047493403694, + "grad_norm": 0.33719025633437755, + "learning_rate": 0.0001934226101407865, + "loss": 2.9414260387420654, + "step": 5469, + "token_acc": 0.30434286188785786 + }, + { + "epoch": 3.2063910876575785, + "grad_norm": 0.30130417864234116, + "learning_rate": 0.00019341915271571444, + "loss": 2.962266445159912, + "step": 5470, + "token_acc": 0.3041836836070812 + }, + { + "epoch": 3.2069774259747876, + "grad_norm": 0.2955779693777121, + "learning_rate": 0.00019341569441309528, + "loss": 2.9580588340759277, + "step": 5471, + "token_acc": 0.3052344766716054 + }, + { + "epoch": 3.2075637642919963, + "grad_norm": 0.2803147039519151, + "learning_rate": 0.0001934122352329616, + "loss": 2.942302703857422, + "step": 5472, + "token_acc": 0.30634347601856626 + }, + { + "epoch": 3.2081501026092054, + "grad_norm": 0.33085506451581237, + "learning_rate": 0.00019340877517534582, + "loss": 2.948922634124756, + "step": 5473, + "token_acc": 0.30460266952749465 + }, + { + "epoch": 3.2087364409264145, + "grad_norm": 0.2572760720654979, + "learning_rate": 0.00019340531424028048, + "loss": 2.930830955505371, + "step": 5474, + "token_acc": 0.30740315585688055 + }, + { + "epoch": 3.2093227792436236, + "grad_norm": 0.29317711744069486, + "learning_rate": 0.00019340185242779808, + "loss": 2.985015869140625, + "step": 5475, + "token_acc": 0.3022481774537696 + }, + { + "epoch": 3.2099091175608327, + "grad_norm": 0.3090127648650641, + "learning_rate": 0.0001933983897379311, + "loss": 2.9688525199890137, + "step": 5476, + "token_acc": 0.30239920877443355 + }, + { + "epoch": 3.210495455878042, + "grad_norm": 0.3039687955769756, + "learning_rate": 0.00019339492617071214, + "loss": 2.9415416717529297, + "step": 5477, + "token_acc": 0.3066315508346621 + }, + { + "epoch": 3.2110817941952505, + "grad_norm": 0.2983207522056923, + "learning_rate": 0.00019339146172617366, + "loss": 2.9704155921936035, + "step": 5478, + "token_acc": 0.304569453927583 + }, + { + "epoch": 3.2116681325124596, + "grad_norm": 0.258094086082446, + "learning_rate": 0.0001933879964043483, + "loss": 2.968628406524658, + "step": 5479, + "token_acc": 0.3020112159520329 + }, + { + "epoch": 3.2122544708296688, + "grad_norm": 0.26412346547226784, + "learning_rate": 0.00019338453020526853, + "loss": 2.9922337532043457, + "step": 5480, + "token_acc": 0.29749492678472944 + }, + { + "epoch": 3.212840809146878, + "grad_norm": 0.2845160462526243, + "learning_rate": 0.00019338106312896694, + "loss": 2.894908905029297, + "step": 5481, + "token_acc": 0.3141979555464065 + }, + { + "epoch": 3.213427147464087, + "grad_norm": 0.283510982160887, + "learning_rate": 0.00019337759517547607, + "loss": 2.9345791339874268, + "step": 5482, + "token_acc": 0.3063695516442463 + }, + { + "epoch": 3.2140134857812956, + "grad_norm": 0.28876894796961844, + "learning_rate": 0.00019337412634482854, + "loss": 2.906684637069702, + "step": 5483, + "token_acc": 0.3120758967897623 + }, + { + "epoch": 3.2145998240985048, + "grad_norm": 0.310608861866727, + "learning_rate": 0.0001933706566370569, + "loss": 2.926724672317505, + "step": 5484, + "token_acc": 0.3080563545328646 + }, + { + "epoch": 3.215186162415714, + "grad_norm": 0.3268638938848502, + "learning_rate": 0.0001933671860521938, + "loss": 2.9390642642974854, + "step": 5485, + "token_acc": 0.30722162894004235 + }, + { + "epoch": 3.215772500732923, + "grad_norm": 0.3204292200792293, + "learning_rate": 0.00019336371459027177, + "loss": 2.94765305519104, + "step": 5486, + "token_acc": 0.30603392107136057 + }, + { + "epoch": 3.216358839050132, + "grad_norm": 0.2976423670287878, + "learning_rate": 0.00019336024225132347, + "loss": 2.935096025466919, + "step": 5487, + "token_acc": 0.30903870358716173 + }, + { + "epoch": 3.2169451773673408, + "grad_norm": 0.289126134572972, + "learning_rate": 0.00019335676903538146, + "loss": 2.9308056831359863, + "step": 5488, + "token_acc": 0.30913258479855305 + }, + { + "epoch": 3.21753151568455, + "grad_norm": 0.3390484709452385, + "learning_rate": 0.00019335329494247846, + "loss": 2.963320255279541, + "step": 5489, + "token_acc": 0.30352812818926656 + }, + { + "epoch": 3.218117854001759, + "grad_norm": 0.337154626330191, + "learning_rate": 0.00019334981997264701, + "loss": 2.9333834648132324, + "step": 5490, + "token_acc": 0.30716619843074444 + }, + { + "epoch": 3.218704192318968, + "grad_norm": 0.2637196302222183, + "learning_rate": 0.0001933463441259198, + "loss": 2.9198415279388428, + "step": 5491, + "token_acc": 0.30909407712206305 + }, + { + "epoch": 3.219290530636177, + "grad_norm": 0.34280919035092056, + "learning_rate": 0.00019334286740232948, + "loss": 2.9395952224731445, + "step": 5492, + "token_acc": 0.3052960497398726 + }, + { + "epoch": 3.219876868953386, + "grad_norm": 0.28690420509662506, + "learning_rate": 0.0001933393898019087, + "loss": 2.935025215148926, + "step": 5493, + "token_acc": 0.3072363724145724 + }, + { + "epoch": 3.220463207270595, + "grad_norm": 0.3019453077610104, + "learning_rate": 0.0001933359113246901, + "loss": 2.9721508026123047, + "step": 5494, + "token_acc": 0.3033023506645941 + }, + { + "epoch": 3.221049545587804, + "grad_norm": 0.39362648374799203, + "learning_rate": 0.00019333243197070644, + "loss": 2.9954428672790527, + "step": 5495, + "token_acc": 0.2989623224854377 + }, + { + "epoch": 3.221635883905013, + "grad_norm": 0.28128572526351425, + "learning_rate": 0.0001933289517399903, + "loss": 2.9351303577423096, + "step": 5496, + "token_acc": 0.30926390519910324 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 0.35144269923284666, + "learning_rate": 0.00019332547063257444, + "loss": 2.94991397857666, + "step": 5497, + "token_acc": 0.3053250711669837 + }, + { + "epoch": 3.2228085605394314, + "grad_norm": 0.33273853778154117, + "learning_rate": 0.00019332198864849157, + "loss": 2.9399988651275635, + "step": 5498, + "token_acc": 0.30674416883418976 + }, + { + "epoch": 3.22339489885664, + "grad_norm": 0.2917690000244775, + "learning_rate": 0.00019331850578777432, + "loss": 2.9403092861175537, + "step": 5499, + "token_acc": 0.30710172744721687 + }, + { + "epoch": 3.223981237173849, + "grad_norm": 0.34727191022192466, + "learning_rate": 0.00019331502205045546, + "loss": 2.9610671997070312, + "step": 5500, + "token_acc": 0.30261669498923055 + }, + { + "epoch": 3.2245675754910583, + "grad_norm": 0.26293768685557267, + "learning_rate": 0.00019331153743656774, + "loss": 2.9483139514923096, + "step": 5501, + "token_acc": 0.30786009640407064 + }, + { + "epoch": 3.2251539138082674, + "grad_norm": 0.30069552675860994, + "learning_rate": 0.00019330805194614387, + "loss": 2.8998894691467285, + "step": 5502, + "token_acc": 0.3125118678826612 + }, + { + "epoch": 3.2257402521254765, + "grad_norm": 0.29072127003909826, + "learning_rate": 0.00019330456557921654, + "loss": 2.9421591758728027, + "step": 5503, + "token_acc": 0.3060407689127529 + }, + { + "epoch": 3.226326590442685, + "grad_norm": 0.29888417341793727, + "learning_rate": 0.0001933010783358186, + "loss": 2.9115114212036133, + "step": 5504, + "token_acc": 0.3115030077121504 + }, + { + "epoch": 3.2269129287598943, + "grad_norm": 0.26148903678665786, + "learning_rate": 0.00019329759021598274, + "loss": 2.9659783840179443, + "step": 5505, + "token_acc": 0.3025953763928274 + }, + { + "epoch": 3.2274992670771034, + "grad_norm": 0.2918553668517596, + "learning_rate": 0.0001932941012197417, + "loss": 2.9634170532226562, + "step": 5506, + "token_acc": 0.30281166622538136 + }, + { + "epoch": 3.2280856053943126, + "grad_norm": 0.277041645156445, + "learning_rate": 0.00019329061134712832, + "loss": 2.908942461013794, + "step": 5507, + "token_acc": 0.3126126976273244 + }, + { + "epoch": 3.2286719437115217, + "grad_norm": 0.29487714155284045, + "learning_rate": 0.00019328712059817535, + "loss": 2.972054958343506, + "step": 5508, + "token_acc": 0.301691821655524 + }, + { + "epoch": 3.2292582820287308, + "grad_norm": 0.2766429547646211, + "learning_rate": 0.00019328362897291562, + "loss": 2.9334068298339844, + "step": 5509, + "token_acc": 0.30700255740989746 + }, + { + "epoch": 3.2298446203459394, + "grad_norm": 0.31559357387305625, + "learning_rate": 0.00019328013647138188, + "loss": 2.9141879081726074, + "step": 5510, + "token_acc": 0.3102283436406022 + }, + { + "epoch": 3.2304309586631486, + "grad_norm": 0.2766219908449118, + "learning_rate": 0.00019327664309360694, + "loss": 2.945896625518799, + "step": 5511, + "token_acc": 0.3038982405054667 + }, + { + "epoch": 3.2310172969803577, + "grad_norm": 0.28842161073908024, + "learning_rate": 0.00019327314883962364, + "loss": 2.994135856628418, + "step": 5512, + "token_acc": 0.2979240908661452 + }, + { + "epoch": 3.231603635297567, + "grad_norm": 0.3162449636531089, + "learning_rate": 0.0001932696537094648, + "loss": 2.9820802211761475, + "step": 5513, + "token_acc": 0.3019043464886957 + }, + { + "epoch": 3.232189973614776, + "grad_norm": 0.38214157279002486, + "learning_rate": 0.00019326615770316323, + "loss": 2.9423506259918213, + "step": 5514, + "token_acc": 0.30582138097063877 + }, + { + "epoch": 3.2327763119319846, + "grad_norm": 0.3161189041597134, + "learning_rate": 0.0001932626608207518, + "loss": 2.937432289123535, + "step": 5515, + "token_acc": 0.30602913721383096 + }, + { + "epoch": 3.2333626502491937, + "grad_norm": 0.2679121421836545, + "learning_rate": 0.00019325916306226333, + "loss": 2.948369026184082, + "step": 5516, + "token_acc": 0.30627930354848326 + }, + { + "epoch": 3.233948988566403, + "grad_norm": 0.29145037105152877, + "learning_rate": 0.00019325566442773072, + "loss": 2.9340388774871826, + "step": 5517, + "token_acc": 0.30677909773801487 + }, + { + "epoch": 3.234535326883612, + "grad_norm": 0.3097700973196709, + "learning_rate": 0.00019325216491718677, + "loss": 2.9388587474823, + "step": 5518, + "token_acc": 0.3073891161483319 + }, + { + "epoch": 3.235121665200821, + "grad_norm": 0.2648594504750986, + "learning_rate": 0.00019324866453066441, + "loss": 2.982898235321045, + "step": 5519, + "token_acc": 0.30241451573014694 + }, + { + "epoch": 3.23570800351803, + "grad_norm": 0.3155344145102964, + "learning_rate": 0.0001932451632681965, + "loss": 2.9290456771850586, + "step": 5520, + "token_acc": 0.3075735025694308 + }, + { + "epoch": 3.236294341835239, + "grad_norm": 0.2983294681981736, + "learning_rate": 0.00019324166112981593, + "loss": 2.9440011978149414, + "step": 5521, + "token_acc": 0.3061988580133509 + }, + { + "epoch": 3.236880680152448, + "grad_norm": 0.2628931205630277, + "learning_rate": 0.0001932381581155556, + "loss": 2.91029953956604, + "step": 5522, + "token_acc": 0.3116043847605972 + }, + { + "epoch": 3.237467018469657, + "grad_norm": 0.26116218606040925, + "learning_rate": 0.0001932346542254484, + "loss": 2.96248459815979, + "step": 5523, + "token_acc": 0.3037071203595103 + }, + { + "epoch": 3.238053356786866, + "grad_norm": 0.2949981135017047, + "learning_rate": 0.00019323114945952728, + "loss": 2.993692398071289, + "step": 5524, + "token_acc": 0.29872926872611244 + }, + { + "epoch": 3.2386396951040752, + "grad_norm": 0.27945935240377145, + "learning_rate": 0.00019322764381782511, + "loss": 2.9792943000793457, + "step": 5525, + "token_acc": 0.30059622349012755 + }, + { + "epoch": 3.239226033421284, + "grad_norm": 0.23254189415376042, + "learning_rate": 0.00019322413730037488, + "loss": 2.9363675117492676, + "step": 5526, + "token_acc": 0.30773914897881055 + }, + { + "epoch": 3.239812371738493, + "grad_norm": 0.28575895972987014, + "learning_rate": 0.0001932206299072095, + "loss": 2.975721836090088, + "step": 5527, + "token_acc": 0.30154219616443495 + }, + { + "epoch": 3.240398710055702, + "grad_norm": 0.2795863778793751, + "learning_rate": 0.00019321712163836193, + "loss": 2.957705497741699, + "step": 5528, + "token_acc": 0.30354256288207077 + }, + { + "epoch": 3.2409850483729112, + "grad_norm": 0.24182803777945375, + "learning_rate": 0.00019321361249386508, + "loss": 2.9397261142730713, + "step": 5529, + "token_acc": 0.30653341368331793 + }, + { + "epoch": 3.2415713866901203, + "grad_norm": 0.2953613344764257, + "learning_rate": 0.00019321010247375195, + "loss": 2.951720952987671, + "step": 5530, + "token_acc": 0.3036856786435161 + }, + { + "epoch": 3.2421577250073295, + "grad_norm": 0.2746381200976614, + "learning_rate": 0.00019320659157805555, + "loss": 2.8933863639831543, + "step": 5531, + "token_acc": 0.3124295133165611 + }, + { + "epoch": 3.242744063324538, + "grad_norm": 0.259522127627178, + "learning_rate": 0.00019320307980680879, + "loss": 2.92635440826416, + "step": 5532, + "token_acc": 0.30861736334405143 + }, + { + "epoch": 3.2433304016417472, + "grad_norm": 0.2856971625672185, + "learning_rate": 0.0001931995671600447, + "loss": 2.937255382537842, + "step": 5533, + "token_acc": 0.3067993821692447 + }, + { + "epoch": 3.2439167399589564, + "grad_norm": 0.3153184954543292, + "learning_rate": 0.00019319605363779624, + "loss": 2.92566180229187, + "step": 5534, + "token_acc": 0.3100002431374456 + }, + { + "epoch": 3.2445030782761655, + "grad_norm": 0.24304077661249268, + "learning_rate": 0.00019319253924009647, + "loss": 2.911578893661499, + "step": 5535, + "token_acc": 0.3114368255129598 + }, + { + "epoch": 3.2450894165933746, + "grad_norm": 0.35252001190129306, + "learning_rate": 0.00019318902396697833, + "loss": 2.974607229232788, + "step": 5536, + "token_acc": 0.3024739040203515 + }, + { + "epoch": 3.2456757549105832, + "grad_norm": 0.476780621648762, + "learning_rate": 0.00019318550781847492, + "loss": 2.9159560203552246, + "step": 5537, + "token_acc": 0.30912701926604347 + }, + { + "epoch": 3.2462620932277924, + "grad_norm": 0.28637009930706986, + "learning_rate": 0.00019318199079461923, + "loss": 2.9334349632263184, + "step": 5538, + "token_acc": 0.3071231293970545 + }, + { + "epoch": 3.2468484315450015, + "grad_norm": 0.3195328704025498, + "learning_rate": 0.0001931784728954443, + "loss": 2.9469428062438965, + "step": 5539, + "token_acc": 0.30526967766214896 + }, + { + "epoch": 3.2474347698622106, + "grad_norm": 0.26832654316894783, + "learning_rate": 0.00019317495412098315, + "loss": 2.948988914489746, + "step": 5540, + "token_acc": 0.3053295526577332 + }, + { + "epoch": 3.2480211081794197, + "grad_norm": 0.2908674552889672, + "learning_rate": 0.0001931714344712689, + "loss": 2.9275853633880615, + "step": 5541, + "token_acc": 0.30855676855329955 + }, + { + "epoch": 3.2486074464966284, + "grad_norm": 0.2742391942429433, + "learning_rate": 0.00019316791394633455, + "loss": 2.9371752738952637, + "step": 5542, + "token_acc": 0.3057878008252863 + }, + { + "epoch": 3.2491937848138375, + "grad_norm": 0.3203997047729603, + "learning_rate": 0.0001931643925462132, + "loss": 2.955483913421631, + "step": 5543, + "token_acc": 0.30583247242921796 + }, + { + "epoch": 3.2497801231310466, + "grad_norm": 0.3264253911083781, + "learning_rate": 0.00019316087027093794, + "loss": 2.9473843574523926, + "step": 5544, + "token_acc": 0.3046910805513191 + }, + { + "epoch": 3.2503664614482557, + "grad_norm": 0.2878037423961283, + "learning_rate": 0.00019315734712054182, + "loss": 2.9345297813415527, + "step": 5545, + "token_acc": 0.306335490288383 + }, + { + "epoch": 3.250952799765465, + "grad_norm": 0.33619404662806934, + "learning_rate": 0.00019315382309505793, + "loss": 2.929159641265869, + "step": 5546, + "token_acc": 0.30800573151046456 + }, + { + "epoch": 3.2515391380826735, + "grad_norm": 0.24730700509543874, + "learning_rate": 0.00019315029819451943, + "loss": 2.9884674549102783, + "step": 5547, + "token_acc": 0.2981429902618348 + }, + { + "epoch": 3.2521254763998826, + "grad_norm": 0.3368426692667576, + "learning_rate": 0.0001931467724189594, + "loss": 2.981721878051758, + "step": 5548, + "token_acc": 0.300923505386252 + }, + { + "epoch": 3.2527118147170917, + "grad_norm": 0.2496026186218317, + "learning_rate": 0.00019314324576841097, + "loss": 2.958937168121338, + "step": 5549, + "token_acc": 0.30521662382707254 + }, + { + "epoch": 3.253298153034301, + "grad_norm": 0.31512215306855007, + "learning_rate": 0.00019313971824290723, + "loss": 2.9585373401641846, + "step": 5550, + "token_acc": 0.30412078526188535 + }, + { + "epoch": 3.25388449135151, + "grad_norm": 0.2644606780569012, + "learning_rate": 0.00019313618984248136, + "loss": 2.952260971069336, + "step": 5551, + "token_acc": 0.30490610742329005 + }, + { + "epoch": 3.254470829668719, + "grad_norm": 0.2964840750623536, + "learning_rate": 0.00019313266056716647, + "loss": 2.997602939605713, + "step": 5552, + "token_acc": 0.2986626734592716 + }, + { + "epoch": 3.2550571679859277, + "grad_norm": 0.2826460479050315, + "learning_rate": 0.00019312913041699575, + "loss": 2.959132671356201, + "step": 5553, + "token_acc": 0.30406453928313276 + }, + { + "epoch": 3.255643506303137, + "grad_norm": 0.27683518655715794, + "learning_rate": 0.00019312559939200236, + "loss": 2.9416816234588623, + "step": 5554, + "token_acc": 0.30738426146784126 + }, + { + "epoch": 3.256229844620346, + "grad_norm": 0.2540551054228169, + "learning_rate": 0.00019312206749221944, + "loss": 2.924468517303467, + "step": 5555, + "token_acc": 0.3078947368421053 + }, + { + "epoch": 3.256816182937555, + "grad_norm": 0.31725721408448015, + "learning_rate": 0.00019311853471768017, + "loss": 2.9105281829833984, + "step": 5556, + "token_acc": 0.31106961084735063 + }, + { + "epoch": 3.257402521254764, + "grad_norm": 0.2889341434441898, + "learning_rate": 0.00019311500106841773, + "loss": 2.9713759422302246, + "step": 5557, + "token_acc": 0.30036918559479603 + }, + { + "epoch": 3.257988859571973, + "grad_norm": 0.29955933575191535, + "learning_rate": 0.00019311146654446537, + "loss": 2.941173791885376, + "step": 5558, + "token_acc": 0.30646073132674523 + }, + { + "epoch": 3.258575197889182, + "grad_norm": 0.32307366990645614, + "learning_rate": 0.0001931079311458562, + "loss": 2.998309373855591, + "step": 5559, + "token_acc": 0.2973397650449419 + }, + { + "epoch": 3.259161536206391, + "grad_norm": 0.2685595619034012, + "learning_rate": 0.00019310439487262352, + "loss": 2.9513182640075684, + "step": 5560, + "token_acc": 0.304503698244321 + }, + { + "epoch": 3.2597478745236, + "grad_norm": 0.32304434803990695, + "learning_rate": 0.0001931008577248005, + "loss": 2.977510929107666, + "step": 5561, + "token_acc": 0.3002807304160807 + }, + { + "epoch": 3.2603342128408093, + "grad_norm": 0.3582189781919564, + "learning_rate": 0.0001930973197024204, + "loss": 2.9453773498535156, + "step": 5562, + "token_acc": 0.3076506685681867 + }, + { + "epoch": 3.2609205511580184, + "grad_norm": 0.2791830581366631, + "learning_rate": 0.00019309378080551638, + "loss": 2.9175667762756348, + "step": 5563, + "token_acc": 0.30815402068878583 + }, + { + "epoch": 3.261506889475227, + "grad_norm": 0.3591353990952763, + "learning_rate": 0.00019309024103412176, + "loss": 2.954418659210205, + "step": 5564, + "token_acc": 0.30361485627066903 + }, + { + "epoch": 3.262093227792436, + "grad_norm": 0.327223338948454, + "learning_rate": 0.0001930867003882698, + "loss": 2.945157766342163, + "step": 5565, + "token_acc": 0.3074716964918688 + }, + { + "epoch": 3.2626795661096453, + "grad_norm": 0.315860252281925, + "learning_rate": 0.0001930831588679937, + "loss": 2.9620490074157715, + "step": 5566, + "token_acc": 0.3021040157585365 + }, + { + "epoch": 3.2632659044268544, + "grad_norm": 0.3552765522701381, + "learning_rate": 0.00019307961647332673, + "loss": 2.9642271995544434, + "step": 5567, + "token_acc": 0.3040826684641185 + }, + { + "epoch": 3.2638522427440635, + "grad_norm": 0.2906476224767005, + "learning_rate": 0.00019307607320430222, + "loss": 2.954462766647339, + "step": 5568, + "token_acc": 0.30464290095823293 + }, + { + "epoch": 3.264438581061272, + "grad_norm": 0.34450279972302605, + "learning_rate": 0.0001930725290609534, + "loss": 2.8958821296691895, + "step": 5569, + "token_acc": 0.3142881672426005 + }, + { + "epoch": 3.2650249193784813, + "grad_norm": 0.28448146226764515, + "learning_rate": 0.0001930689840433136, + "loss": 2.952406644821167, + "step": 5570, + "token_acc": 0.3042650046631563 + }, + { + "epoch": 3.2656112576956904, + "grad_norm": 0.36471392926214036, + "learning_rate": 0.00019306543815141608, + "loss": 2.965480089187622, + "step": 5571, + "token_acc": 0.3038153815512583 + }, + { + "epoch": 3.2661975960128995, + "grad_norm": 0.2928851335776934, + "learning_rate": 0.00019306189138529423, + "loss": 2.9288558959960938, + "step": 5572, + "token_acc": 0.30830668039970366 + }, + { + "epoch": 3.2667839343301086, + "grad_norm": 0.34877336445248747, + "learning_rate": 0.00019305834374498128, + "loss": 2.930997133255005, + "step": 5573, + "token_acc": 0.3078150489396411 + }, + { + "epoch": 3.2673702726473177, + "grad_norm": 0.2689034327724174, + "learning_rate": 0.00019305479523051058, + "loss": 2.950197458267212, + "step": 5574, + "token_acc": 0.3068243674085134 + }, + { + "epoch": 3.2679566109645264, + "grad_norm": 0.3272056172117976, + "learning_rate": 0.0001930512458419155, + "loss": 2.9441757202148438, + "step": 5575, + "token_acc": 0.30482656306307493 + }, + { + "epoch": 3.2685429492817355, + "grad_norm": 0.26113695234790235, + "learning_rate": 0.00019304769557922932, + "loss": 2.916006088256836, + "step": 5576, + "token_acc": 0.30992925325117365 + }, + { + "epoch": 3.2691292875989446, + "grad_norm": 0.3149577947348922, + "learning_rate": 0.00019304414444248544, + "loss": 2.9441823959350586, + "step": 5577, + "token_acc": 0.30545243415380824 + }, + { + "epoch": 3.2697156259161537, + "grad_norm": 0.28637643170327226, + "learning_rate": 0.00019304059243171722, + "loss": 3.007596492767334, + "step": 5578, + "token_acc": 0.29620854954026654 + }, + { + "epoch": 3.270301964233363, + "grad_norm": 0.26873875197723673, + "learning_rate": 0.00019303703954695798, + "loss": 2.925732135772705, + "step": 5579, + "token_acc": 0.3097641609963769 + }, + { + "epoch": 3.2708883025505715, + "grad_norm": 0.27131174680061176, + "learning_rate": 0.00019303348578824113, + "loss": 2.9266598224639893, + "step": 5580, + "token_acc": 0.3092149611778994 + }, + { + "epoch": 3.2714746408677806, + "grad_norm": 0.27173814004788016, + "learning_rate": 0.00019302993115560005, + "loss": 2.9532389640808105, + "step": 5581, + "token_acc": 0.3041657917513841 + }, + { + "epoch": 3.2720609791849897, + "grad_norm": 0.2535122313665315, + "learning_rate": 0.00019302637564906814, + "loss": 2.950535535812378, + "step": 5582, + "token_acc": 0.30556480490141336 + }, + { + "epoch": 3.272647317502199, + "grad_norm": 0.30450078721574747, + "learning_rate": 0.00019302281926867875, + "loss": 2.9460368156433105, + "step": 5583, + "token_acc": 0.3038447535881119 + }, + { + "epoch": 3.273233655819408, + "grad_norm": 0.27252593105743816, + "learning_rate": 0.00019301926201446533, + "loss": 2.964747905731201, + "step": 5584, + "token_acc": 0.30350385477974706 + }, + { + "epoch": 3.273819994136617, + "grad_norm": 0.27433829771297846, + "learning_rate": 0.00019301570388646132, + "loss": 2.9628806114196777, + "step": 5585, + "token_acc": 0.3040403093627057 + }, + { + "epoch": 3.2744063324538257, + "grad_norm": 0.34769133592402734, + "learning_rate": 0.00019301214488470008, + "loss": 3.0120959281921387, + "step": 5586, + "token_acc": 0.29608672798948754 + }, + { + "epoch": 3.274992670771035, + "grad_norm": 0.2858622620201363, + "learning_rate": 0.0001930085850092151, + "loss": 2.975733995437622, + "step": 5587, + "token_acc": 0.3004513742099638 + }, + { + "epoch": 3.275579009088244, + "grad_norm": 0.3226812869987855, + "learning_rate": 0.00019300502426003978, + "loss": 2.961763858795166, + "step": 5588, + "token_acc": 0.3029758664975722 + }, + { + "epoch": 3.276165347405453, + "grad_norm": 0.2349015610205357, + "learning_rate": 0.0001930014626372076, + "loss": 2.920170307159424, + "step": 5589, + "token_acc": 0.31051243294508163 + }, + { + "epoch": 3.2767516857226617, + "grad_norm": 0.32615475800750326, + "learning_rate": 0.00019299790014075193, + "loss": 2.913259983062744, + "step": 5590, + "token_acc": 0.309280436936712 + }, + { + "epoch": 3.277338024039871, + "grad_norm": 0.3181641601521066, + "learning_rate": 0.00019299433677070636, + "loss": 2.9367003440856934, + "step": 5591, + "token_acc": 0.3072254128067674 + }, + { + "epoch": 3.27792436235708, + "grad_norm": 0.2684538628402062, + "learning_rate": 0.00019299077252710433, + "loss": 2.9809985160827637, + "step": 5592, + "token_acc": 0.3012360351794628 + }, + { + "epoch": 3.278510700674289, + "grad_norm": 0.31631293400830973, + "learning_rate": 0.00019298720740997926, + "loss": 2.934690475463867, + "step": 5593, + "token_acc": 0.3066995180425111 + }, + { + "epoch": 3.279097038991498, + "grad_norm": 0.28334465338371556, + "learning_rate": 0.0001929836414193647, + "loss": 2.9570372104644775, + "step": 5594, + "token_acc": 0.3021197168782687 + }, + { + "epoch": 3.2796833773087073, + "grad_norm": 0.3073094621139582, + "learning_rate": 0.00019298007455529413, + "loss": 2.981383800506592, + "step": 5595, + "token_acc": 0.2994228275473159 + }, + { + "epoch": 3.280269715625916, + "grad_norm": 0.2673146351515476, + "learning_rate": 0.00019297650681780103, + "loss": 2.9781689643859863, + "step": 5596, + "token_acc": 0.3017087253436502 + }, + { + "epoch": 3.280856053943125, + "grad_norm": 0.32240127724091405, + "learning_rate": 0.00019297293820691894, + "loss": 2.968599557876587, + "step": 5597, + "token_acc": 0.3031788002575551 + }, + { + "epoch": 3.281442392260334, + "grad_norm": 0.27184758006611964, + "learning_rate": 0.0001929693687226814, + "loss": 2.9974708557128906, + "step": 5598, + "token_acc": 0.29767199760563195 + }, + { + "epoch": 3.2820287305775433, + "grad_norm": 0.30325717457172857, + "learning_rate": 0.0001929657983651219, + "loss": 2.932379722595215, + "step": 5599, + "token_acc": 0.30830088664113253 + }, + { + "epoch": 3.2826150688947524, + "grad_norm": 0.26450258926836745, + "learning_rate": 0.000192962227134274, + "loss": 2.9293246269226074, + "step": 5600, + "token_acc": 0.30824227780352365 + }, + { + "epoch": 3.283201407211961, + "grad_norm": 0.253994119682815, + "learning_rate": 0.00019295865503017124, + "loss": 2.9782087802886963, + "step": 5601, + "token_acc": 0.2998636960295485 + }, + { + "epoch": 3.28378774552917, + "grad_norm": 0.26392188798613786, + "learning_rate": 0.00019295508205284718, + "loss": 2.9324090480804443, + "step": 5602, + "token_acc": 0.3082858484130916 + }, + { + "epoch": 3.2843740838463793, + "grad_norm": 0.29120151706337566, + "learning_rate": 0.00019295150820233537, + "loss": 2.9344115257263184, + "step": 5603, + "token_acc": 0.3076634395735886 + }, + { + "epoch": 3.2849604221635884, + "grad_norm": 0.265867571543838, + "learning_rate": 0.00019294793347866942, + "loss": 2.958077907562256, + "step": 5604, + "token_acc": 0.30318789577703387 + }, + { + "epoch": 3.2855467604807975, + "grad_norm": 0.2903039371646153, + "learning_rate": 0.0001929443578818829, + "loss": 2.9747567176818848, + "step": 5605, + "token_acc": 0.30161072827852753 + }, + { + "epoch": 3.2861330987980066, + "grad_norm": 0.27459667584453373, + "learning_rate": 0.00019294078141200935, + "loss": 2.934018611907959, + "step": 5606, + "token_acc": 0.3075634179436999 + }, + { + "epoch": 3.2867194371152153, + "grad_norm": 0.2827367549242991, + "learning_rate": 0.0001929372040690824, + "loss": 2.91848087310791, + "step": 5607, + "token_acc": 0.30907608062449027 + }, + { + "epoch": 3.2873057754324244, + "grad_norm": 0.24622937538907255, + "learning_rate": 0.00019293362585313565, + "loss": 2.968585729598999, + "step": 5608, + "token_acc": 0.30346410948504915 + }, + { + "epoch": 3.2878921137496335, + "grad_norm": 0.30342375238963964, + "learning_rate": 0.0001929300467642027, + "loss": 2.969679355621338, + "step": 5609, + "token_acc": 0.30328407348954517 + }, + { + "epoch": 3.2884784520668426, + "grad_norm": 0.2921242190379111, + "learning_rate": 0.00019292646680231723, + "loss": 2.961977481842041, + "step": 5610, + "token_acc": 0.3045719373330626 + }, + { + "epoch": 3.2890647903840518, + "grad_norm": 0.22171135317389842, + "learning_rate": 0.0001929228859675128, + "loss": 2.9748570919036865, + "step": 5611, + "token_acc": 0.300853377828259 + }, + { + "epoch": 3.2896511287012604, + "grad_norm": 0.3267266266540225, + "learning_rate": 0.00019291930425982307, + "loss": 2.978346824645996, + "step": 5612, + "token_acc": 0.3013138367176197 + }, + { + "epoch": 3.2902374670184695, + "grad_norm": 0.35054062230463917, + "learning_rate": 0.0001929157216792817, + "loss": 2.972545623779297, + "step": 5613, + "token_acc": 0.30031920130856654 + }, + { + "epoch": 3.2908238053356786, + "grad_norm": 0.3098351010504053, + "learning_rate": 0.00019291213822592232, + "loss": 2.961137056350708, + "step": 5614, + "token_acc": 0.30347959381196277 + }, + { + "epoch": 3.2914101436528878, + "grad_norm": 0.26935602775795514, + "learning_rate": 0.00019290855389977857, + "loss": 2.9546191692352295, + "step": 5615, + "token_acc": 0.30577952418076315 + }, + { + "epoch": 3.291996481970097, + "grad_norm": 0.30172629714914667, + "learning_rate": 0.0001929049687008842, + "loss": 2.960115909576416, + "step": 5616, + "token_acc": 0.3040097262285351 + }, + { + "epoch": 3.292582820287306, + "grad_norm": 0.3059663164886852, + "learning_rate": 0.00019290138262927282, + "loss": 2.955659866333008, + "step": 5617, + "token_acc": 0.3050338389361736 + }, + { + "epoch": 3.2931691586045146, + "grad_norm": 0.29407980002025524, + "learning_rate": 0.00019289779568497814, + "loss": 2.986624002456665, + "step": 5618, + "token_acc": 0.2984275965809823 + }, + { + "epoch": 3.2937554969217238, + "grad_norm": 0.2993730763091169, + "learning_rate": 0.00019289420786803386, + "loss": 2.954629421234131, + "step": 5619, + "token_acc": 0.30345018811576435 + }, + { + "epoch": 3.294341835238933, + "grad_norm": 0.2617401469708947, + "learning_rate": 0.00019289061917847366, + "loss": 2.964477062225342, + "step": 5620, + "token_acc": 0.30397145713545165 + }, + { + "epoch": 3.294928173556142, + "grad_norm": 0.2790695180521545, + "learning_rate": 0.00019288702961633126, + "loss": 2.9307661056518555, + "step": 5621, + "token_acc": 0.3081485368024156 + }, + { + "epoch": 3.295514511873351, + "grad_norm": 0.24246693095933747, + "learning_rate": 0.0001928834391816404, + "loss": 2.954713821411133, + "step": 5622, + "token_acc": 0.3044471138321644 + }, + { + "epoch": 3.2961008501905598, + "grad_norm": 0.2835213509827598, + "learning_rate": 0.00019287984787443477, + "loss": 2.9860241413116455, + "step": 5623, + "token_acc": 0.3005023322569071 + }, + { + "epoch": 3.296687188507769, + "grad_norm": 0.26517372014441176, + "learning_rate": 0.00019287625569474815, + "loss": 2.9198975563049316, + "step": 5624, + "token_acc": 0.3090585236520724 + }, + { + "epoch": 3.297273526824978, + "grad_norm": 0.2534006383234535, + "learning_rate": 0.00019287266264261425, + "loss": 2.938803195953369, + "step": 5625, + "token_acc": 0.30819750066102414 + }, + { + "epoch": 3.297859865142187, + "grad_norm": 0.2787641537503187, + "learning_rate": 0.00019286906871806685, + "loss": 2.956223964691162, + "step": 5626, + "token_acc": 0.30441546164713706 + }, + { + "epoch": 3.298446203459396, + "grad_norm": 0.3168997539661053, + "learning_rate": 0.00019286547392113965, + "loss": 2.942523956298828, + "step": 5627, + "token_acc": 0.30676759384965113 + }, + { + "epoch": 3.2990325417766053, + "grad_norm": 0.32357189083828575, + "learning_rate": 0.00019286187825186648, + "loss": 2.9433374404907227, + "step": 5628, + "token_acc": 0.3071390098885641 + }, + { + "epoch": 3.299618880093814, + "grad_norm": 0.2944423668480439, + "learning_rate": 0.0001928582817102811, + "loss": 2.943603992462158, + "step": 5629, + "token_acc": 0.30697099270076894 + }, + { + "epoch": 3.300205218411023, + "grad_norm": 0.26484351491760716, + "learning_rate": 0.0001928546842964173, + "loss": 2.9256739616394043, + "step": 5630, + "token_acc": 0.3088785251118085 + }, + { + "epoch": 3.300791556728232, + "grad_norm": 0.2798517635077239, + "learning_rate": 0.00019285108601030886, + "loss": 2.976163864135742, + "step": 5631, + "token_acc": 0.3009628171162906 + }, + { + "epoch": 3.3013778950454413, + "grad_norm": 0.30906028537764757, + "learning_rate": 0.00019284748685198958, + "loss": 2.9555845260620117, + "step": 5632, + "token_acc": 0.3032182009856617 + }, + { + "epoch": 3.3019642333626504, + "grad_norm": 0.3416426534833998, + "learning_rate": 0.00019284388682149327, + "loss": 2.9616713523864746, + "step": 5633, + "token_acc": 0.3050974355752416 + }, + { + "epoch": 3.302550571679859, + "grad_norm": 0.26173136486004844, + "learning_rate": 0.00019284028591885378, + "loss": 2.919099807739258, + "step": 5634, + "token_acc": 0.3094568356738325 + }, + { + "epoch": 3.303136909997068, + "grad_norm": 0.32226516550980766, + "learning_rate": 0.00019283668414410486, + "loss": 2.9393768310546875, + "step": 5635, + "token_acc": 0.3071798452318613 + }, + { + "epoch": 3.3037232483142773, + "grad_norm": 0.3119222455225087, + "learning_rate": 0.00019283308149728044, + "loss": 2.9706835746765137, + "step": 5636, + "token_acc": 0.30226297570682364 + }, + { + "epoch": 3.3043095866314864, + "grad_norm": 0.2342311201566956, + "learning_rate": 0.00019282947797841427, + "loss": 2.9796180725097656, + "step": 5637, + "token_acc": 0.30093624609897457 + }, + { + "epoch": 3.3048959249486956, + "grad_norm": 0.3048411019843206, + "learning_rate": 0.0001928258735875403, + "loss": 2.979363441467285, + "step": 5638, + "token_acc": 0.30102966958518435 + }, + { + "epoch": 3.3054822632659047, + "grad_norm": 0.26769549451694696, + "learning_rate": 0.0001928222683246923, + "loss": 2.972560405731201, + "step": 5639, + "token_acc": 0.3018837698507533 + }, + { + "epoch": 3.3060686015831133, + "grad_norm": 0.30440198034641774, + "learning_rate": 0.00019281866218990413, + "loss": 2.968074321746826, + "step": 5640, + "token_acc": 0.3025897178150616 + }, + { + "epoch": 3.3066549399003224, + "grad_norm": 0.27054162135329285, + "learning_rate": 0.00019281505518320974, + "loss": 2.946080207824707, + "step": 5641, + "token_acc": 0.3049476529505286 + }, + { + "epoch": 3.3072412782175316, + "grad_norm": 0.24653796185609123, + "learning_rate": 0.000192811447304643, + "loss": 2.9238927364349365, + "step": 5642, + "token_acc": 0.3092461682498606 + }, + { + "epoch": 3.3078276165347407, + "grad_norm": 0.31189243648968873, + "learning_rate": 0.00019280783855423774, + "loss": 2.977013111114502, + "step": 5643, + "token_acc": 0.30164238902597573 + }, + { + "epoch": 3.3084139548519493, + "grad_norm": 0.25019451087710404, + "learning_rate": 0.0001928042289320279, + "loss": 2.986665964126587, + "step": 5644, + "token_acc": 0.29997862434634814 + }, + { + "epoch": 3.3090002931691584, + "grad_norm": 0.3206138942759848, + "learning_rate": 0.0001928006184380474, + "loss": 2.9827170372009277, + "step": 5645, + "token_acc": 0.30077514574341413 + }, + { + "epoch": 3.3095866314863676, + "grad_norm": 0.28398298834414215, + "learning_rate": 0.00019279700707233014, + "loss": 2.94273042678833, + "step": 5646, + "token_acc": 0.30576990831618067 + }, + { + "epoch": 3.3101729698035767, + "grad_norm": 0.27591748504945957, + "learning_rate": 0.00019279339483491004, + "loss": 2.9414987564086914, + "step": 5647, + "token_acc": 0.3056171627293948 + }, + { + "epoch": 3.310759308120786, + "grad_norm": 0.3155387188990721, + "learning_rate": 0.00019278978172582102, + "loss": 2.923828601837158, + "step": 5648, + "token_acc": 0.3106152832380272 + }, + { + "epoch": 3.311345646437995, + "grad_norm": 0.24464084938338984, + "learning_rate": 0.00019278616774509705, + "loss": 2.982725143432617, + "step": 5649, + "token_acc": 0.30034968533451006 + }, + { + "epoch": 3.3119319847552036, + "grad_norm": 0.297217869247182, + "learning_rate": 0.00019278255289277208, + "loss": 2.96604585647583, + "step": 5650, + "token_acc": 0.3025845777246981 + }, + { + "epoch": 3.3125183230724127, + "grad_norm": 0.3132693702015149, + "learning_rate": 0.00019277893716888005, + "loss": 2.9986701011657715, + "step": 5651, + "token_acc": 0.2988245872877655 + }, + { + "epoch": 3.313104661389622, + "grad_norm": 0.29853606790193493, + "learning_rate": 0.00019277532057345492, + "loss": 2.9537243843078613, + "step": 5652, + "token_acc": 0.304517963242967 + }, + { + "epoch": 3.313690999706831, + "grad_norm": 0.2529087482548198, + "learning_rate": 0.00019277170310653063, + "loss": 2.950713634490967, + "step": 5653, + "token_acc": 0.3051230624144939 + }, + { + "epoch": 3.31427733802404, + "grad_norm": 0.27861550005316726, + "learning_rate": 0.00019276808476814125, + "loss": 2.945302963256836, + "step": 5654, + "token_acc": 0.3042957610168398 + }, + { + "epoch": 3.3148636763412487, + "grad_norm": 0.28419898766177615, + "learning_rate": 0.00019276446555832068, + "loss": 2.987307071685791, + "step": 5655, + "token_acc": 0.2997105168686938 + }, + { + "epoch": 3.315450014658458, + "grad_norm": 0.2857214025897931, + "learning_rate": 0.000192760845477103, + "loss": 2.9232144355773926, + "step": 5656, + "token_acc": 0.3090301275625728 + }, + { + "epoch": 3.316036352975667, + "grad_norm": 0.3159537158519594, + "learning_rate": 0.00019275722452452215, + "loss": 2.994924783706665, + "step": 5657, + "token_acc": 0.29901491276854314 + }, + { + "epoch": 3.316622691292876, + "grad_norm": 0.30215029431133145, + "learning_rate": 0.00019275360270061217, + "loss": 2.9695663452148438, + "step": 5658, + "token_acc": 0.30233164273499274 + }, + { + "epoch": 3.317209029610085, + "grad_norm": 0.26074971198414304, + "learning_rate": 0.0001927499800054071, + "loss": 2.9536001682281494, + "step": 5659, + "token_acc": 0.3047785690729854 + }, + { + "epoch": 3.3177953679272942, + "grad_norm": 0.31576566812968443, + "learning_rate": 0.00019274635643894093, + "loss": 2.9794039726257324, + "step": 5660, + "token_acc": 0.3027099286318575 + }, + { + "epoch": 3.318381706244503, + "grad_norm": 0.2625850334677903, + "learning_rate": 0.00019274273200124773, + "loss": 2.9376702308654785, + "step": 5661, + "token_acc": 0.30692136837750145 + }, + { + "epoch": 3.318968044561712, + "grad_norm": 0.30368499839261853, + "learning_rate": 0.00019273910669236153, + "loss": 2.98230242729187, + "step": 5662, + "token_acc": 0.2988292109058776 + }, + { + "epoch": 3.319554382878921, + "grad_norm": 0.26906632448320994, + "learning_rate": 0.00019273548051231638, + "loss": 2.931710720062256, + "step": 5663, + "token_acc": 0.30789647410229215 + }, + { + "epoch": 3.3201407211961302, + "grad_norm": 0.30584429355847903, + "learning_rate": 0.00019273185346114637, + "loss": 2.9478423595428467, + "step": 5664, + "token_acc": 0.30599305419042 + }, + { + "epoch": 3.3207270595133394, + "grad_norm": 0.2770848654454532, + "learning_rate": 0.00019272822553888553, + "loss": 2.9587509632110596, + "step": 5665, + "token_acc": 0.30257670306812273 + }, + { + "epoch": 3.321313397830548, + "grad_norm": 0.2835828703361468, + "learning_rate": 0.00019272459674556797, + "loss": 2.9509706497192383, + "step": 5666, + "token_acc": 0.30482309143970876 + }, + { + "epoch": 3.321899736147757, + "grad_norm": 0.31926620541698986, + "learning_rate": 0.00019272096708122777, + "loss": 2.9831361770629883, + "step": 5667, + "token_acc": 0.30090741568735896 + }, + { + "epoch": 3.3224860744649662, + "grad_norm": 0.2715901724645933, + "learning_rate": 0.00019271733654589905, + "loss": 2.9557747840881348, + "step": 5668, + "token_acc": 0.30477269122487016 + }, + { + "epoch": 3.3230724127821754, + "grad_norm": 0.28682925621032807, + "learning_rate": 0.0001927137051396159, + "loss": 2.9165279865264893, + "step": 5669, + "token_acc": 0.30953384942198603 + }, + { + "epoch": 3.3236587510993845, + "grad_norm": 0.28960361202311696, + "learning_rate": 0.0001927100728624124, + "loss": 2.9681005477905273, + "step": 5670, + "token_acc": 0.30274776532487097 + }, + { + "epoch": 3.3242450894165936, + "grad_norm": 0.2774494656611944, + "learning_rate": 0.0001927064397143227, + "loss": 2.9485220909118652, + "step": 5671, + "token_acc": 0.3063155649991136 + }, + { + "epoch": 3.3248314277338022, + "grad_norm": 0.2772583363505963, + "learning_rate": 0.0001927028056953809, + "loss": 2.980656385421753, + "step": 5672, + "token_acc": 0.3017393848497708 + }, + { + "epoch": 3.3254177660510114, + "grad_norm": 0.2725767530220081, + "learning_rate": 0.00019269917080562117, + "loss": 2.9627394676208496, + "step": 5673, + "token_acc": 0.30403203254099403 + }, + { + "epoch": 3.3260041043682205, + "grad_norm": 0.28705112459275384, + "learning_rate": 0.00019269553504507766, + "loss": 2.9782228469848633, + "step": 5674, + "token_acc": 0.3018384561008148 + }, + { + "epoch": 3.3265904426854296, + "grad_norm": 0.3139674222373875, + "learning_rate": 0.0001926918984137845, + "loss": 2.983400344848633, + "step": 5675, + "token_acc": 0.30057985532015963 + }, + { + "epoch": 3.3271767810026387, + "grad_norm": 0.27784376111011244, + "learning_rate": 0.00019268826091177585, + "loss": 2.9108805656433105, + "step": 5676, + "token_acc": 0.3104204884716377 + }, + { + "epoch": 3.3277631193198474, + "grad_norm": 0.2770864500206743, + "learning_rate": 0.00019268462253908592, + "loss": 2.9188966751098633, + "step": 5677, + "token_acc": 0.3093889547481979 + }, + { + "epoch": 3.3283494576370565, + "grad_norm": 0.24885516142564773, + "learning_rate": 0.00019268098329574878, + "loss": 2.9553380012512207, + "step": 5678, + "token_acc": 0.30414661284816774 + }, + { + "epoch": 3.3289357959542656, + "grad_norm": 0.28378008337627975, + "learning_rate": 0.00019267734318179877, + "loss": 2.9638848304748535, + "step": 5679, + "token_acc": 0.3019457738587691 + }, + { + "epoch": 3.3295221342714747, + "grad_norm": 0.23798786202981287, + "learning_rate": 0.00019267370219726998, + "loss": 2.9493443965911865, + "step": 5680, + "token_acc": 0.3050334735891098 + }, + { + "epoch": 3.330108472588684, + "grad_norm": 0.27419168193009863, + "learning_rate": 0.00019267006034219664, + "loss": 2.965242624282837, + "step": 5681, + "token_acc": 0.3017251243004847 + }, + { + "epoch": 3.330694810905893, + "grad_norm": 0.27020308407268856, + "learning_rate": 0.00019266641761661295, + "loss": 2.968632221221924, + "step": 5682, + "token_acc": 0.3023887490052315 + }, + { + "epoch": 3.3312811492231016, + "grad_norm": 0.32882436917410596, + "learning_rate": 0.00019266277402055313, + "loss": 2.984539270401001, + "step": 5683, + "token_acc": 0.30110208429622565 + }, + { + "epoch": 3.3318674875403107, + "grad_norm": 0.3651465503270693, + "learning_rate": 0.0001926591295540514, + "loss": 2.952151298522949, + "step": 5684, + "token_acc": 0.3050717359936002 + }, + { + "epoch": 3.33245382585752, + "grad_norm": 0.3970605265907252, + "learning_rate": 0.00019265548421714207, + "loss": 2.983992099761963, + "step": 5685, + "token_acc": 0.30074702116516544 + }, + { + "epoch": 3.333040164174729, + "grad_norm": 0.29377583005913965, + "learning_rate": 0.00019265183800985924, + "loss": 2.9404726028442383, + "step": 5686, + "token_acc": 0.3084859199721697 + }, + { + "epoch": 3.333626502491938, + "grad_norm": 0.3917996169424354, + "learning_rate": 0.0001926481909322373, + "loss": 2.949791431427002, + "step": 5687, + "token_acc": 0.30525483663026204 + }, + { + "epoch": 3.3342128408091467, + "grad_norm": 0.38980390150261435, + "learning_rate": 0.00019264454298431044, + "loss": 2.9654550552368164, + "step": 5688, + "token_acc": 0.3034991380354423 + }, + { + "epoch": 3.334799179126356, + "grad_norm": 0.35401578722629806, + "learning_rate": 0.0001926408941661129, + "loss": 2.923727035522461, + "step": 5689, + "token_acc": 0.30827883769028475 + }, + { + "epoch": 3.335385517443565, + "grad_norm": 0.2937316671555973, + "learning_rate": 0.00019263724447767905, + "loss": 2.9253411293029785, + "step": 5690, + "token_acc": 0.30914679291322983 + }, + { + "epoch": 3.335971855760774, + "grad_norm": 0.3297657933079299, + "learning_rate": 0.00019263359391904307, + "loss": 2.914416551589966, + "step": 5691, + "token_acc": 0.31069873256506647 + }, + { + "epoch": 3.336558194077983, + "grad_norm": 0.29909100307888226, + "learning_rate": 0.00019262994249023932, + "loss": 2.9227499961853027, + "step": 5692, + "token_acc": 0.30859070643234354 + }, + { + "epoch": 3.3371445323951923, + "grad_norm": 0.2644057742779, + "learning_rate": 0.0001926262901913021, + "loss": 2.9637508392333984, + "step": 5693, + "token_acc": 0.30323828556009436 + }, + { + "epoch": 3.337730870712401, + "grad_norm": 0.2732700910977264, + "learning_rate": 0.00019262263702226568, + "loss": 2.969503402709961, + "step": 5694, + "token_acc": 0.30421621912500674 + }, + { + "epoch": 3.33831720902961, + "grad_norm": 0.27387498548337913, + "learning_rate": 0.00019261898298316438, + "loss": 2.981738328933716, + "step": 5695, + "token_acc": 0.3004588418109446 + }, + { + "epoch": 3.338903547346819, + "grad_norm": 0.27257578857082854, + "learning_rate": 0.0001926153280740326, + "loss": 3.010274887084961, + "step": 5696, + "token_acc": 0.29597389029881505 + }, + { + "epoch": 3.3394898856640283, + "grad_norm": 0.3091736036529012, + "learning_rate": 0.00019261167229490456, + "loss": 2.9579851627349854, + "step": 5697, + "token_acc": 0.30173400090268815 + }, + { + "epoch": 3.340076223981237, + "grad_norm": 0.27801056606265245, + "learning_rate": 0.00019260801564581468, + "loss": 2.948822021484375, + "step": 5698, + "token_acc": 0.304778498039919 + }, + { + "epoch": 3.340662562298446, + "grad_norm": 0.27984809975899383, + "learning_rate": 0.00019260435812679723, + "loss": 2.938413143157959, + "step": 5699, + "token_acc": 0.3070150335037526 + }, + { + "epoch": 3.341248900615655, + "grad_norm": 0.29096510335926856, + "learning_rate": 0.00019260069973788669, + "loss": 2.92311429977417, + "step": 5700, + "token_acc": 0.3099208204995888 + }, + { + "epoch": 3.3418352389328643, + "grad_norm": 0.2714718464622634, + "learning_rate": 0.00019259704047911732, + "loss": 2.9839437007904053, + "step": 5701, + "token_acc": 0.3000583029941827 + }, + { + "epoch": 3.3424215772500734, + "grad_norm": 0.29123218699299125, + "learning_rate": 0.00019259338035052356, + "loss": 2.973917007446289, + "step": 5702, + "token_acc": 0.30225141016178475 + }, + { + "epoch": 3.3430079155672825, + "grad_norm": 0.2787897996735095, + "learning_rate": 0.0001925897193521397, + "loss": 2.9662468433380127, + "step": 5703, + "token_acc": 0.30249452999483495 + }, + { + "epoch": 3.343594253884491, + "grad_norm": 0.31066528908859786, + "learning_rate": 0.00019258605748400024, + "loss": 2.9831621646881104, + "step": 5704, + "token_acc": 0.3000120086253257 + }, + { + "epoch": 3.3441805922017003, + "grad_norm": 0.24338094232582197, + "learning_rate": 0.00019258239474613954, + "loss": 2.9529409408569336, + "step": 5705, + "token_acc": 0.30383015884003645 + }, + { + "epoch": 3.3447669305189094, + "grad_norm": 0.2744231406061515, + "learning_rate": 0.000192578731138592, + "loss": 2.990333080291748, + "step": 5706, + "token_acc": 0.30015577665228926 + }, + { + "epoch": 3.3453532688361185, + "grad_norm": 0.2825590446945792, + "learning_rate": 0.000192575066661392, + "loss": 2.934597969055176, + "step": 5707, + "token_acc": 0.3078815262335871 + }, + { + "epoch": 3.3459396071533276, + "grad_norm": 0.30014556387197455, + "learning_rate": 0.00019257140131457402, + "loss": 2.9739320278167725, + "step": 5708, + "token_acc": 0.30023303112109817 + }, + { + "epoch": 3.3465259454705363, + "grad_norm": 0.2867699552732021, + "learning_rate": 0.00019256773509817245, + "loss": 2.948106288909912, + "step": 5709, + "token_acc": 0.3062908207049296 + }, + { + "epoch": 3.3471122837877454, + "grad_norm": 0.25611484686755054, + "learning_rate": 0.00019256406801222177, + "loss": 2.9990603923797607, + "step": 5710, + "token_acc": 0.2977181293769557 + }, + { + "epoch": 3.3476986221049545, + "grad_norm": 0.2963032919996877, + "learning_rate": 0.00019256040005675637, + "loss": 2.931915760040283, + "step": 5711, + "token_acc": 0.307949142652296 + }, + { + "epoch": 3.3482849604221636, + "grad_norm": 0.2656891019499225, + "learning_rate": 0.00019255673123181078, + "loss": 2.9224300384521484, + "step": 5712, + "token_acc": 0.3095964314380831 + }, + { + "epoch": 3.3488712987393727, + "grad_norm": 0.27327014798662047, + "learning_rate": 0.00019255306153741938, + "loss": 3.0212671756744385, + "step": 5713, + "token_acc": 0.2975475351883862 + }, + { + "epoch": 3.349457637056582, + "grad_norm": 0.28866700521472455, + "learning_rate": 0.0001925493909736167, + "loss": 2.9290785789489746, + "step": 5714, + "token_acc": 0.3085989534753317 + }, + { + "epoch": 3.3500439753737905, + "grad_norm": 0.2798457855291285, + "learning_rate": 0.0001925457195404372, + "loss": 2.998946189880371, + "step": 5715, + "token_acc": 0.2973970709002773 + }, + { + "epoch": 3.3506303136909996, + "grad_norm": 0.2555360480075001, + "learning_rate": 0.0001925420472379154, + "loss": 2.9664478302001953, + "step": 5716, + "token_acc": 0.3030438758561484 + }, + { + "epoch": 3.3512166520082087, + "grad_norm": 0.32201260277578936, + "learning_rate": 0.00019253837406608572, + "loss": 2.9732651710510254, + "step": 5717, + "token_acc": 0.3024322132907802 + }, + { + "epoch": 3.351802990325418, + "grad_norm": 0.3750234247351803, + "learning_rate": 0.00019253470002498276, + "loss": 2.9653706550598145, + "step": 5718, + "token_acc": 0.30171153081872426 + }, + { + "epoch": 3.352389328642627, + "grad_norm": 0.3045042616259551, + "learning_rate": 0.00019253102511464096, + "loss": 2.923147678375244, + "step": 5719, + "token_acc": 0.31095209885781394 + }, + { + "epoch": 3.3529756669598356, + "grad_norm": 0.31503768892191847, + "learning_rate": 0.00019252734933509485, + "loss": 2.9490671157836914, + "step": 5720, + "token_acc": 0.30544086299164463 + }, + { + "epoch": 3.3535620052770447, + "grad_norm": 0.417740447715003, + "learning_rate": 0.000192523672686379, + "loss": 2.97928524017334, + "step": 5721, + "token_acc": 0.3006874522237779 + }, + { + "epoch": 3.354148343594254, + "grad_norm": 0.4136862674080124, + "learning_rate": 0.00019251999516852792, + "loss": 2.99717378616333, + "step": 5722, + "token_acc": 0.2997853822676143 + }, + { + "epoch": 3.354734681911463, + "grad_norm": 0.3033397916968096, + "learning_rate": 0.00019251631678157612, + "loss": 2.9185729026794434, + "step": 5723, + "token_acc": 0.30994757999842154 + }, + { + "epoch": 3.355321020228672, + "grad_norm": 0.4411999003332541, + "learning_rate": 0.00019251263752555824, + "loss": 2.9884591102600098, + "step": 5724, + "token_acc": 0.30110112184884835 + }, + { + "epoch": 3.355907358545881, + "grad_norm": 0.34017388741115556, + "learning_rate": 0.0001925089574005088, + "loss": 2.950345993041992, + "step": 5725, + "token_acc": 0.3040848562138021 + }, + { + "epoch": 3.35649369686309, + "grad_norm": 0.3504643876493145, + "learning_rate": 0.00019250527640646232, + "loss": 2.9666855335235596, + "step": 5726, + "token_acc": 0.3017126753855008 + }, + { + "epoch": 3.357080035180299, + "grad_norm": 0.2962101506901891, + "learning_rate": 0.00019250159454345346, + "loss": 2.9271202087402344, + "step": 5727, + "token_acc": 0.30883555564420423 + }, + { + "epoch": 3.357666373497508, + "grad_norm": 0.3797009920046121, + "learning_rate": 0.00019249791181151675, + "loss": 3.001551628112793, + "step": 5728, + "token_acc": 0.29888918248709295 + }, + { + "epoch": 3.358252711814717, + "grad_norm": 0.2851303836353199, + "learning_rate": 0.0001924942282106868, + "loss": 2.9751412868499756, + "step": 5729, + "token_acc": 0.3013681532586585 + }, + { + "epoch": 3.3588390501319263, + "grad_norm": 0.33752683287973206, + "learning_rate": 0.00019249054374099819, + "loss": 2.960116386413574, + "step": 5730, + "token_acc": 0.30298590372129525 + }, + { + "epoch": 3.359425388449135, + "grad_norm": 0.28628521573140286, + "learning_rate": 0.00019248685840248558, + "loss": 2.959883213043213, + "step": 5731, + "token_acc": 0.30396260677313347 + }, + { + "epoch": 3.360011726766344, + "grad_norm": 0.2784896099867368, + "learning_rate": 0.00019248317219518356, + "loss": 2.9690403938293457, + "step": 5732, + "token_acc": 0.3037161590632962 + }, + { + "epoch": 3.360598065083553, + "grad_norm": 0.2910907024642491, + "learning_rate": 0.00019247948511912677, + "loss": 2.9849278926849365, + "step": 5733, + "token_acc": 0.301412987012987 + }, + { + "epoch": 3.3611844034007623, + "grad_norm": 0.28908645342418615, + "learning_rate": 0.00019247579717434984, + "loss": 2.9263014793395996, + "step": 5734, + "token_acc": 0.30818456446962866 + }, + { + "epoch": 3.3617707417179714, + "grad_norm": 0.2724714170784964, + "learning_rate": 0.00019247210836088736, + "loss": 2.9213266372680664, + "step": 5735, + "token_acc": 0.31018308640194525 + }, + { + "epoch": 3.3623570800351805, + "grad_norm": 0.3177989138583467, + "learning_rate": 0.00019246841867877405, + "loss": 2.96691632270813, + "step": 5736, + "token_acc": 0.3020921526690512 + }, + { + "epoch": 3.362943418352389, + "grad_norm": 0.24711114588005784, + "learning_rate": 0.00019246472812804459, + "loss": 2.9579567909240723, + "step": 5737, + "token_acc": 0.304283665972433 + }, + { + "epoch": 3.3635297566695983, + "grad_norm": 0.29805592520589924, + "learning_rate": 0.00019246103670873357, + "loss": 2.9577836990356445, + "step": 5738, + "token_acc": 0.3036515004027487 + }, + { + "epoch": 3.3641160949868074, + "grad_norm": 0.2952313865428316, + "learning_rate": 0.0001924573444208757, + "loss": 2.9634928703308105, + "step": 5739, + "token_acc": 0.3031465989548213 + }, + { + "epoch": 3.3647024333040165, + "grad_norm": 0.318628031202649, + "learning_rate": 0.00019245365126450569, + "loss": 2.961700916290283, + "step": 5740, + "token_acc": 0.3045488179982932 + }, + { + "epoch": 3.3652887716212256, + "grad_norm": 0.2624352806069112, + "learning_rate": 0.00019244995723965817, + "loss": 2.933213949203491, + "step": 5741, + "token_acc": 0.30751287108629566 + }, + { + "epoch": 3.3658751099384343, + "grad_norm": 0.3099060242222501, + "learning_rate": 0.00019244626234636792, + "loss": 2.9584293365478516, + "step": 5742, + "token_acc": 0.3024436744468745 + }, + { + "epoch": 3.3664614482556434, + "grad_norm": 0.28218955311983523, + "learning_rate": 0.0001924425665846696, + "loss": 2.940584182739258, + "step": 5743, + "token_acc": 0.30733539903992807 + }, + { + "epoch": 3.3670477865728525, + "grad_norm": 0.270669129816695, + "learning_rate": 0.00019243886995459793, + "loss": 2.911543846130371, + "step": 5744, + "token_acc": 0.3119433897134967 + }, + { + "epoch": 3.3676341248900616, + "grad_norm": 0.2838136507102207, + "learning_rate": 0.00019243517245618765, + "loss": 2.939208745956421, + "step": 5745, + "token_acc": 0.3045385939361843 + }, + { + "epoch": 3.3682204632072708, + "grad_norm": 0.2811213026772572, + "learning_rate": 0.00019243147408947345, + "loss": 2.954514980316162, + "step": 5746, + "token_acc": 0.303903032204919 + }, + { + "epoch": 3.36880680152448, + "grad_norm": 0.26376206148353515, + "learning_rate": 0.00019242777485449012, + "loss": 2.951934576034546, + "step": 5747, + "token_acc": 0.3051238011926733 + }, + { + "epoch": 3.3693931398416885, + "grad_norm": 0.2603075614460153, + "learning_rate": 0.0001924240747512724, + "loss": 2.971841335296631, + "step": 5748, + "token_acc": 0.30203551044604715 + }, + { + "epoch": 3.3699794781588976, + "grad_norm": 0.28425169389587956, + "learning_rate": 0.00019242037377985508, + "loss": 2.9374032020568848, + "step": 5749, + "token_acc": 0.30648413043928935 + }, + { + "epoch": 3.3705658164761068, + "grad_norm": 0.2975327074195619, + "learning_rate": 0.00019241667194027281, + "loss": 2.928086757659912, + "step": 5750, + "token_acc": 0.3076360087826299 + }, + { + "epoch": 3.371152154793316, + "grad_norm": 0.352366402367999, + "learning_rate": 0.0001924129692325605, + "loss": 2.9284329414367676, + "step": 5751, + "token_acc": 0.3086280717784176 + }, + { + "epoch": 3.3717384931105245, + "grad_norm": 0.2747248576501104, + "learning_rate": 0.00019240926565675283, + "loss": 2.9532876014709473, + "step": 5752, + "token_acc": 0.3050555873658247 + }, + { + "epoch": 3.3723248314277336, + "grad_norm": 0.2696115813869362, + "learning_rate": 0.00019240556121288463, + "loss": 2.990560531616211, + "step": 5753, + "token_acc": 0.2997854564729044 + }, + { + "epoch": 3.3729111697449428, + "grad_norm": 0.30870040806028043, + "learning_rate": 0.00019240185590099076, + "loss": 2.9389357566833496, + "step": 5754, + "token_acc": 0.30667883995147927 + }, + { + "epoch": 3.373497508062152, + "grad_norm": 0.24842278371776058, + "learning_rate": 0.0001923981497211059, + "loss": 2.94051194190979, + "step": 5755, + "token_acc": 0.3072416688460956 + }, + { + "epoch": 3.374083846379361, + "grad_norm": 0.31767891170471424, + "learning_rate": 0.000192394442673265, + "loss": 2.963165283203125, + "step": 5756, + "token_acc": 0.30041329162437075 + }, + { + "epoch": 3.37467018469657, + "grad_norm": 0.2904844746443188, + "learning_rate": 0.00019239073475750274, + "loss": 2.949331760406494, + "step": 5757, + "token_acc": 0.3057805075320999 + }, + { + "epoch": 3.3752565230137788, + "grad_norm": 0.2796993327438782, + "learning_rate": 0.00019238702597385406, + "loss": 2.9786510467529297, + "step": 5758, + "token_acc": 0.30180215383172604 + }, + { + "epoch": 3.375842861330988, + "grad_norm": 0.3004838477890508, + "learning_rate": 0.00019238331632235375, + "loss": 2.983963966369629, + "step": 5759, + "token_acc": 0.299472132932127 + }, + { + "epoch": 3.376429199648197, + "grad_norm": 0.32721998720120615, + "learning_rate": 0.0001923796058030367, + "loss": 2.961064338684082, + "step": 5760, + "token_acc": 0.30374782257721883 + }, + { + "epoch": 3.377015537965406, + "grad_norm": 0.27795233031942596, + "learning_rate": 0.00019237589441593772, + "loss": 2.971208333969116, + "step": 5761, + "token_acc": 0.3019162363740677 + }, + { + "epoch": 3.377601876282615, + "grad_norm": 0.2749830910531999, + "learning_rate": 0.0001923721821610917, + "loss": 2.9554057121276855, + "step": 5762, + "token_acc": 0.30349082477712713 + }, + { + "epoch": 3.378188214599824, + "grad_norm": 0.3333446360762304, + "learning_rate": 0.0001923684690385335, + "loss": 2.9824366569519043, + "step": 5763, + "token_acc": 0.30133124593406707 + }, + { + "epoch": 3.378774552917033, + "grad_norm": 0.34396770282222777, + "learning_rate": 0.00019236475504829796, + "loss": 2.964900016784668, + "step": 5764, + "token_acc": 0.3036453514499458 + }, + { + "epoch": 3.379360891234242, + "grad_norm": 0.2975301875118032, + "learning_rate": 0.00019236104019042008, + "loss": 2.9454212188720703, + "step": 5765, + "token_acc": 0.3052345129024124 + }, + { + "epoch": 3.379947229551451, + "grad_norm": 0.3437034142128457, + "learning_rate": 0.00019235732446493464, + "loss": 2.9291210174560547, + "step": 5766, + "token_acc": 0.3094125206630842 + }, + { + "epoch": 3.3805335678686603, + "grad_norm": 0.322747865861446, + "learning_rate": 0.00019235360787187657, + "loss": 2.9577784538269043, + "step": 5767, + "token_acc": 0.3048610260153332 + }, + { + "epoch": 3.3811199061858694, + "grad_norm": 0.2881548319795034, + "learning_rate": 0.00019234989041128084, + "loss": 2.9466099739074707, + "step": 5768, + "token_acc": 0.3054564643799472 + }, + { + "epoch": 3.381706244503078, + "grad_norm": 0.304900992340823, + "learning_rate": 0.00019234617208318232, + "loss": 2.944830894470215, + "step": 5769, + "token_acc": 0.30645207338944735 + }, + { + "epoch": 3.382292582820287, + "grad_norm": 0.2804240672635995, + "learning_rate": 0.00019234245288761597, + "loss": 2.94958758354187, + "step": 5770, + "token_acc": 0.3045614820049878 + }, + { + "epoch": 3.3828789211374963, + "grad_norm": 0.26921335215013226, + "learning_rate": 0.00019233873282461668, + "loss": 2.957679271697998, + "step": 5771, + "token_acc": 0.3034449476949062 + }, + { + "epoch": 3.3834652594547054, + "grad_norm": 0.2518484328120801, + "learning_rate": 0.00019233501189421946, + "loss": 2.9837541580200195, + "step": 5772, + "token_acc": 0.30064528744622604 + }, + { + "epoch": 3.3840515977719146, + "grad_norm": 0.2548911441272588, + "learning_rate": 0.00019233129009645918, + "loss": 2.93544340133667, + "step": 5773, + "token_acc": 0.30752351433958414 + }, + { + "epoch": 3.3846379360891232, + "grad_norm": 0.2908713318799221, + "learning_rate": 0.00019232756743137088, + "loss": 2.934262275695801, + "step": 5774, + "token_acc": 0.30866100649812644 + }, + { + "epoch": 3.3852242744063323, + "grad_norm": 0.30772870115115925, + "learning_rate": 0.00019232384389898947, + "loss": 2.949921131134033, + "step": 5775, + "token_acc": 0.3061487068403059 + }, + { + "epoch": 3.3858106127235414, + "grad_norm": 0.2562417150644748, + "learning_rate": 0.00019232011949934998, + "loss": 2.927000045776367, + "step": 5776, + "token_acc": 0.3079410927138847 + }, + { + "epoch": 3.3863969510407506, + "grad_norm": 0.3130537380528935, + "learning_rate": 0.00019231639423248736, + "loss": 2.9760966300964355, + "step": 5777, + "token_acc": 0.3021288733307403 + }, + { + "epoch": 3.3869832893579597, + "grad_norm": 0.284372304957134, + "learning_rate": 0.0001923126680984366, + "loss": 2.941946029663086, + "step": 5778, + "token_acc": 0.30766374647519845 + }, + { + "epoch": 3.387569627675169, + "grad_norm": 0.2855532747262567, + "learning_rate": 0.00019230894109723275, + "loss": 2.9145219326019287, + "step": 5779, + "token_acc": 0.3116067734233852 + }, + { + "epoch": 3.3881559659923774, + "grad_norm": 0.28712431488170337, + "learning_rate": 0.00019230521322891075, + "loss": 2.947572708129883, + "step": 5780, + "token_acc": 0.3063375718588172 + }, + { + "epoch": 3.3887423043095866, + "grad_norm": 0.31002664180415784, + "learning_rate": 0.0001923014844935057, + "loss": 2.9784586429595947, + "step": 5781, + "token_acc": 0.30071397054839233 + }, + { + "epoch": 3.3893286426267957, + "grad_norm": 0.29465181639612187, + "learning_rate": 0.00019229775489105255, + "loss": 2.93618106842041, + "step": 5782, + "token_acc": 0.30649953362357957 + }, + { + "epoch": 3.389914980944005, + "grad_norm": 0.23480621251682127, + "learning_rate": 0.00019229402442158636, + "loss": 2.9413909912109375, + "step": 5783, + "token_acc": 0.30645301201018854 + }, + { + "epoch": 3.390501319261214, + "grad_norm": 0.29320132725755377, + "learning_rate": 0.0001922902930851422, + "loss": 2.969813823699951, + "step": 5784, + "token_acc": 0.30218039141682196 + }, + { + "epoch": 3.3910876575784226, + "grad_norm": 0.2602273753746014, + "learning_rate": 0.00019228656088175506, + "loss": 2.9778590202331543, + "step": 5785, + "token_acc": 0.3013628458339997 + }, + { + "epoch": 3.3916739958956317, + "grad_norm": 0.2478350241891969, + "learning_rate": 0.00019228282781146009, + "loss": 2.996553897857666, + "step": 5786, + "token_acc": 0.29894062602690424 + }, + { + "epoch": 3.392260334212841, + "grad_norm": 0.2769157756672533, + "learning_rate": 0.00019227909387429227, + "loss": 2.968242645263672, + "step": 5787, + "token_acc": 0.3020055243823363 + }, + { + "epoch": 3.39284667253005, + "grad_norm": 0.3261816456990511, + "learning_rate": 0.0001922753590702867, + "loss": 2.9493539333343506, + "step": 5788, + "token_acc": 0.3049907240469285 + }, + { + "epoch": 3.393433010847259, + "grad_norm": 0.3323684665831343, + "learning_rate": 0.0001922716233994785, + "loss": 2.9691548347473145, + "step": 5789, + "token_acc": 0.30354636571863863 + }, + { + "epoch": 3.394019349164468, + "grad_norm": 0.2879545067866791, + "learning_rate": 0.00019226788686190274, + "loss": 2.9961557388305664, + "step": 5790, + "token_acc": 0.298993341315808 + }, + { + "epoch": 3.394605687481677, + "grad_norm": 0.2660184175729306, + "learning_rate": 0.0001922641494575945, + "loss": 2.9196767807006836, + "step": 5791, + "token_acc": 0.30901362459182524 + }, + { + "epoch": 3.395192025798886, + "grad_norm": 0.2848352009248185, + "learning_rate": 0.0001922604111865889, + "loss": 2.962125301361084, + "step": 5792, + "token_acc": 0.30390464990287924 + }, + { + "epoch": 3.395778364116095, + "grad_norm": 0.25878313032633177, + "learning_rate": 0.0001922566720489211, + "loss": 2.9233264923095703, + "step": 5793, + "token_acc": 0.3111245068542549 + }, + { + "epoch": 3.396364702433304, + "grad_norm": 0.29503334538679316, + "learning_rate": 0.00019225293204462615, + "loss": 2.9470572471618652, + "step": 5794, + "token_acc": 0.30614880406909645 + }, + { + "epoch": 3.3969510407505132, + "grad_norm": 0.3118013756149966, + "learning_rate": 0.0001922491911737392, + "loss": 2.9495840072631836, + "step": 5795, + "token_acc": 0.3060521248724235 + }, + { + "epoch": 3.397537379067722, + "grad_norm": 0.30792010718261004, + "learning_rate": 0.00019224544943629543, + "loss": 2.969783306121826, + "step": 5796, + "token_acc": 0.302569684299008 + }, + { + "epoch": 3.398123717384931, + "grad_norm": 0.27125341798012426, + "learning_rate": 0.00019224170683232995, + "loss": 2.9594855308532715, + "step": 5797, + "token_acc": 0.3033516194891068 + }, + { + "epoch": 3.39871005570214, + "grad_norm": 0.3132062802128377, + "learning_rate": 0.00019223796336187795, + "loss": 2.9931325912475586, + "step": 5798, + "token_acc": 0.29958113507568274 + }, + { + "epoch": 3.3992963940193492, + "grad_norm": 0.30425678226173, + "learning_rate": 0.00019223421902497454, + "loss": 2.995800018310547, + "step": 5799, + "token_acc": 0.29708317206661244 + }, + { + "epoch": 3.3998827323365584, + "grad_norm": 0.3202432698780432, + "learning_rate": 0.00019223047382165497, + "loss": 2.932878017425537, + "step": 5800, + "token_acc": 0.30660355557247215 + }, + { + "epoch": 3.4004690706537675, + "grad_norm": 0.3740212716289655, + "learning_rate": 0.00019222672775195436, + "loss": 2.957045078277588, + "step": 5801, + "token_acc": 0.30406507112468845 + }, + { + "epoch": 3.401055408970976, + "grad_norm": 0.3921320594737038, + "learning_rate": 0.00019222298081590796, + "loss": 2.984714984893799, + "step": 5802, + "token_acc": 0.30156723630742166 + }, + { + "epoch": 3.4016417472881852, + "grad_norm": 0.28843319638075454, + "learning_rate": 0.00019221923301355088, + "loss": 2.924182176589966, + "step": 5803, + "token_acc": 0.3099411092097278 + }, + { + "epoch": 3.4022280856053944, + "grad_norm": 0.3374818527928755, + "learning_rate": 0.0001922154843449184, + "loss": 2.970663070678711, + "step": 5804, + "token_acc": 0.3019865149536027 + }, + { + "epoch": 3.4028144239226035, + "grad_norm": 0.3342152905681232, + "learning_rate": 0.00019221173481004568, + "loss": 2.981661796569824, + "step": 5805, + "token_acc": 0.3004859724703163 + }, + { + "epoch": 3.403400762239812, + "grad_norm": 0.3224431405729731, + "learning_rate": 0.00019220798440896795, + "loss": 2.9281656742095947, + "step": 5806, + "token_acc": 0.3073039000510834 + }, + { + "epoch": 3.4039871005570213, + "grad_norm": 0.29048420364357325, + "learning_rate": 0.00019220423314172052, + "loss": 2.9836599826812744, + "step": 5807, + "token_acc": 0.30107193594214127 + }, + { + "epoch": 3.4045734388742304, + "grad_norm": 0.30278841862928607, + "learning_rate": 0.00019220048100833853, + "loss": 2.981579303741455, + "step": 5808, + "token_acc": 0.3015763518263512 + }, + { + "epoch": 3.4051597771914395, + "grad_norm": 0.3132937961294824, + "learning_rate": 0.00019219672800885727, + "loss": 2.940189838409424, + "step": 5809, + "token_acc": 0.3071596536665186 + }, + { + "epoch": 3.4057461155086486, + "grad_norm": 0.23417393552039273, + "learning_rate": 0.000192192974143312, + "loss": 2.9316890239715576, + "step": 5810, + "token_acc": 0.30723328386565235 + }, + { + "epoch": 3.4063324538258577, + "grad_norm": 0.26221674650673316, + "learning_rate": 0.00019218921941173794, + "loss": 2.941380023956299, + "step": 5811, + "token_acc": 0.30480195227934853 + }, + { + "epoch": 3.4069187921430664, + "grad_norm": 0.27577789523585744, + "learning_rate": 0.00019218546381417038, + "loss": 2.939434051513672, + "step": 5812, + "token_acc": 0.3080845653022677 + }, + { + "epoch": 3.4075051304602755, + "grad_norm": 0.28782867597397577, + "learning_rate": 0.00019218170735064465, + "loss": 2.9570720195770264, + "step": 5813, + "token_acc": 0.30452294257824786 + }, + { + "epoch": 3.4080914687774846, + "grad_norm": 0.27842571008664974, + "learning_rate": 0.000192177950021196, + "loss": 2.9782395362854004, + "step": 5814, + "token_acc": 0.30264572280592256 + }, + { + "epoch": 3.4086778070946937, + "grad_norm": 0.2566502665192402, + "learning_rate": 0.00019217419182585967, + "loss": 2.976616859436035, + "step": 5815, + "token_acc": 0.30056501588293494 + }, + { + "epoch": 3.409264145411903, + "grad_norm": 0.2576328191623782, + "learning_rate": 0.00019217043276467105, + "loss": 2.9728050231933594, + "step": 5816, + "token_acc": 0.3032339811588455 + }, + { + "epoch": 3.4098504837291115, + "grad_norm": 0.2612742936256722, + "learning_rate": 0.00019216667283766543, + "loss": 2.942906379699707, + "step": 5817, + "token_acc": 0.3060043720636889 + }, + { + "epoch": 3.4104368220463206, + "grad_norm": 0.2639981971791675, + "learning_rate": 0.00019216291204487808, + "loss": 2.9648547172546387, + "step": 5818, + "token_acc": 0.3018944188569666 + }, + { + "epoch": 3.4110231603635297, + "grad_norm": 0.24389115998631966, + "learning_rate": 0.00019215915038634437, + "loss": 2.9800033569335938, + "step": 5819, + "token_acc": 0.3015268600440512 + }, + { + "epoch": 3.411609498680739, + "grad_norm": 0.27209341880920535, + "learning_rate": 0.00019215538786209962, + "loss": 2.983247756958008, + "step": 5820, + "token_acc": 0.30063575032539425 + }, + { + "epoch": 3.412195836997948, + "grad_norm": 0.29210728793594526, + "learning_rate": 0.00019215162447217923, + "loss": 2.9713613986968994, + "step": 5821, + "token_acc": 0.30083496265905957 + }, + { + "epoch": 3.412782175315157, + "grad_norm": 0.3453638010395938, + "learning_rate": 0.00019214786021661847, + "loss": 2.9659583568573, + "step": 5822, + "token_acc": 0.3012512825979772 + }, + { + "epoch": 3.4133685136323657, + "grad_norm": 0.30386555011632155, + "learning_rate": 0.00019214409509545272, + "loss": 2.9835240840911865, + "step": 5823, + "token_acc": 0.29981537650006596 + }, + { + "epoch": 3.413954851949575, + "grad_norm": 0.24536556647429184, + "learning_rate": 0.00019214032910871737, + "loss": 2.965771436691284, + "step": 5824, + "token_acc": 0.30324765837020456 + }, + { + "epoch": 3.414541190266784, + "grad_norm": 0.28932881774061964, + "learning_rate": 0.0001921365622564478, + "loss": 2.914560317993164, + "step": 5825, + "token_acc": 0.30921828163416526 + }, + { + "epoch": 3.415127528583993, + "grad_norm": 0.2910470091151356, + "learning_rate": 0.0001921327945386794, + "loss": 2.975356340408325, + "step": 5826, + "token_acc": 0.3003853631532922 + }, + { + "epoch": 3.415713866901202, + "grad_norm": 0.25237221767907014, + "learning_rate": 0.00019212902595544754, + "loss": 2.9862780570983887, + "step": 5827, + "token_acc": 0.29866192632549493 + }, + { + "epoch": 3.416300205218411, + "grad_norm": 0.2653344306629598, + "learning_rate": 0.00019212525650678762, + "loss": 3.0299577713012695, + "step": 5828, + "token_acc": 0.2955012271295701 + }, + { + "epoch": 3.41688654353562, + "grad_norm": 0.28071154867753856, + "learning_rate": 0.000192121486192735, + "loss": 2.921548366546631, + "step": 5829, + "token_acc": 0.31006354572296635 + }, + { + "epoch": 3.417472881852829, + "grad_norm": 0.24848866927867, + "learning_rate": 0.00019211771501332522, + "loss": 2.9695606231689453, + "step": 5830, + "token_acc": 0.3027783784649639 + }, + { + "epoch": 3.418059220170038, + "grad_norm": 0.2717313738561597, + "learning_rate": 0.0001921139429685936, + "loss": 3.005025863647461, + "step": 5831, + "token_acc": 0.29865001135109737 + }, + { + "epoch": 3.4186455584872473, + "grad_norm": 0.3260257905875027, + "learning_rate": 0.00019211017005857565, + "loss": 2.9719338417053223, + "step": 5832, + "token_acc": 0.3020611739618521 + }, + { + "epoch": 3.4192318968044564, + "grad_norm": 0.26837644040684583, + "learning_rate": 0.00019210639628330673, + "loss": 2.9891929626464844, + "step": 5833, + "token_acc": 0.29983823775500373 + }, + { + "epoch": 3.419818235121665, + "grad_norm": 0.2785892155686165, + "learning_rate": 0.00019210262164282238, + "loss": 3.020864725112915, + "step": 5834, + "token_acc": 0.2962394602768927 + }, + { + "epoch": 3.420404573438874, + "grad_norm": 0.3181023193362203, + "learning_rate": 0.00019209884613715796, + "loss": 2.9873809814453125, + "step": 5835, + "token_acc": 0.30083260297984227 + }, + { + "epoch": 3.4209909117560833, + "grad_norm": 0.2937404926990121, + "learning_rate": 0.000192095069766349, + "loss": 2.955162525177002, + "step": 5836, + "token_acc": 0.3054529402197904 + }, + { + "epoch": 3.4215772500732924, + "grad_norm": 0.29893660694712293, + "learning_rate": 0.00019209129253043098, + "loss": 2.9809131622314453, + "step": 5837, + "token_acc": 0.3031430639521251 + }, + { + "epoch": 3.4221635883905015, + "grad_norm": 0.31912591219344894, + "learning_rate": 0.00019208751442943936, + "loss": 2.9565138816833496, + "step": 5838, + "token_acc": 0.30520688052068806 + }, + { + "epoch": 3.42274992670771, + "grad_norm": 0.2483414046933559, + "learning_rate": 0.0001920837354634096, + "loss": 2.964552879333496, + "step": 5839, + "token_acc": 0.30297436747120515 + }, + { + "epoch": 3.4233362650249193, + "grad_norm": 0.2792801130514059, + "learning_rate": 0.00019207995563237727, + "loss": 2.9554409980773926, + "step": 5840, + "token_acc": 0.30358711252972304 + }, + { + "epoch": 3.4239226033421284, + "grad_norm": 0.26189117801007433, + "learning_rate": 0.0001920761749363778, + "loss": 2.9567999839782715, + "step": 5841, + "token_acc": 0.3044390852731853 + }, + { + "epoch": 3.4245089416593375, + "grad_norm": 0.30009422800571467, + "learning_rate": 0.00019207239337544677, + "loss": 2.9635977745056152, + "step": 5842, + "token_acc": 0.30409488095207704 + }, + { + "epoch": 3.4250952799765466, + "grad_norm": 0.3435392422210084, + "learning_rate": 0.00019206861094961966, + "loss": 2.9902148246765137, + "step": 5843, + "token_acc": 0.2987323752891528 + }, + { + "epoch": 3.4256816182937557, + "grad_norm": 0.24571940208033008, + "learning_rate": 0.00019206482765893201, + "loss": 2.9179511070251465, + "step": 5844, + "token_acc": 0.3100790351129955 + }, + { + "epoch": 3.4262679566109644, + "grad_norm": 0.3143668913430003, + "learning_rate": 0.00019206104350341936, + "loss": 2.9577560424804688, + "step": 5845, + "token_acc": 0.30323600095520203 + }, + { + "epoch": 3.4268542949281735, + "grad_norm": 0.3531532322135951, + "learning_rate": 0.0001920572584831173, + "loss": 2.9534034729003906, + "step": 5846, + "token_acc": 0.3046614945261659 + }, + { + "epoch": 3.4274406332453826, + "grad_norm": 0.3843028960845201, + "learning_rate": 0.0001920534725980613, + "loss": 2.9844651222229004, + "step": 5847, + "token_acc": 0.30068123525729246 + }, + { + "epoch": 3.4280269715625917, + "grad_norm": 0.3755600065819227, + "learning_rate": 0.00019204968584828698, + "loss": 2.945474624633789, + "step": 5848, + "token_acc": 0.30626033846545364 + }, + { + "epoch": 3.4286133098798004, + "grad_norm": 0.25599193466339026, + "learning_rate": 0.00019204589823382988, + "loss": 2.9698615074157715, + "step": 5849, + "token_acc": 0.30182895494162143 + }, + { + "epoch": 3.4291996481970095, + "grad_norm": 0.34769139272050503, + "learning_rate": 0.00019204210975472564, + "loss": 2.931894302368164, + "step": 5850, + "token_acc": 0.3084536160894996 + }, + { + "epoch": 3.4297859865142186, + "grad_norm": 0.31159721794229345, + "learning_rate": 0.00019203832041100977, + "loss": 2.9453389644622803, + "step": 5851, + "token_acc": 0.30787213131139257 + }, + { + "epoch": 3.4303723248314277, + "grad_norm": 0.2579993036213398, + "learning_rate": 0.0001920345302027179, + "loss": 2.9483261108398438, + "step": 5852, + "token_acc": 0.3046747886292524 + }, + { + "epoch": 3.430958663148637, + "grad_norm": 0.2698694918074235, + "learning_rate": 0.00019203073912988568, + "loss": 2.9651904106140137, + "step": 5853, + "token_acc": 0.30332346131635135 + }, + { + "epoch": 3.431545001465846, + "grad_norm": 0.23754907248937054, + "learning_rate": 0.00019202694719254866, + "loss": 2.915735960006714, + "step": 5854, + "token_acc": 0.30981478338245355 + }, + { + "epoch": 3.432131339783055, + "grad_norm": 0.28564343428170835, + "learning_rate": 0.00019202315439074247, + "loss": 2.978209972381592, + "step": 5855, + "token_acc": 0.2997180403806455 + }, + { + "epoch": 3.4327176781002637, + "grad_norm": 0.2779476397410527, + "learning_rate": 0.00019201936072450274, + "loss": 2.941974639892578, + "step": 5856, + "token_acc": 0.30566578850835363 + }, + { + "epoch": 3.433304016417473, + "grad_norm": 0.26216289935941856, + "learning_rate": 0.00019201556619386515, + "loss": 2.9447860717773438, + "step": 5857, + "token_acc": 0.30583818274362057 + }, + { + "epoch": 3.433890354734682, + "grad_norm": 0.319434240545209, + "learning_rate": 0.0001920117707988653, + "loss": 2.9532337188720703, + "step": 5858, + "token_acc": 0.30565926066841576 + }, + { + "epoch": 3.434476693051891, + "grad_norm": 0.3189273311043452, + "learning_rate": 0.0001920079745395388, + "loss": 2.986436128616333, + "step": 5859, + "token_acc": 0.30093469577736165 + }, + { + "epoch": 3.4350630313690997, + "grad_norm": 0.24590686053583413, + "learning_rate": 0.0001920041774159214, + "loss": 2.943110227584839, + "step": 5860, + "token_acc": 0.30652719577934445 + }, + { + "epoch": 3.435649369686309, + "grad_norm": 0.2952660269152213, + "learning_rate": 0.00019200037942804875, + "loss": 2.9498395919799805, + "step": 5861, + "token_acc": 0.3048740687822666 + }, + { + "epoch": 3.436235708003518, + "grad_norm": 0.2289524545554295, + "learning_rate": 0.00019199658057595647, + "loss": 2.9807400703430176, + "step": 5862, + "token_acc": 0.30063000667556744 + }, + { + "epoch": 3.436822046320727, + "grad_norm": 0.28100380142177656, + "learning_rate": 0.0001919927808596803, + "loss": 2.934401512145996, + "step": 5863, + "token_acc": 0.30794818965495013 + }, + { + "epoch": 3.437408384637936, + "grad_norm": 0.23094904124217333, + "learning_rate": 0.00019198898027925591, + "loss": 2.9253382682800293, + "step": 5864, + "token_acc": 0.3080496356497714 + }, + { + "epoch": 3.4379947229551453, + "grad_norm": 0.266996533800148, + "learning_rate": 0.000191985178834719, + "loss": 2.948082685470581, + "step": 5865, + "token_acc": 0.3051358730984267 + }, + { + "epoch": 3.438581061272354, + "grad_norm": 0.2964391823174672, + "learning_rate": 0.0001919813765261053, + "loss": 2.9613304138183594, + "step": 5866, + "token_acc": 0.3037651005466537 + }, + { + "epoch": 3.439167399589563, + "grad_norm": 0.29947770849897243, + "learning_rate": 0.00019197757335345051, + "loss": 2.965766429901123, + "step": 5867, + "token_acc": 0.30272262836926805 + }, + { + "epoch": 3.439753737906772, + "grad_norm": 0.3322360800348083, + "learning_rate": 0.00019197376931679035, + "loss": 2.960585117340088, + "step": 5868, + "token_acc": 0.30543242247767505 + }, + { + "epoch": 3.4403400762239813, + "grad_norm": 0.30472365436020155, + "learning_rate": 0.00019196996441616057, + "loss": 2.949005603790283, + "step": 5869, + "token_acc": 0.30664137431263055 + }, + { + "epoch": 3.4409264145411904, + "grad_norm": 0.3266952442526886, + "learning_rate": 0.00019196615865159692, + "loss": 2.978978157043457, + "step": 5870, + "token_acc": 0.301803861985566 + }, + { + "epoch": 3.441512752858399, + "grad_norm": 0.330146266493025, + "learning_rate": 0.00019196235202313512, + "loss": 2.9840073585510254, + "step": 5871, + "token_acc": 0.2999031058230275 + }, + { + "epoch": 3.442099091175608, + "grad_norm": 0.2682976082927483, + "learning_rate": 0.00019195854453081095, + "loss": 2.951385498046875, + "step": 5872, + "token_acc": 0.3055874007252053 + }, + { + "epoch": 3.4426854294928173, + "grad_norm": 0.3721189871150996, + "learning_rate": 0.00019195473617466017, + "loss": 2.9825026988983154, + "step": 5873, + "token_acc": 0.2994223615158824 + }, + { + "epoch": 3.4432717678100264, + "grad_norm": 0.2998320128962938, + "learning_rate": 0.00019195092695471855, + "loss": 2.9249467849731445, + "step": 5874, + "token_acc": 0.3082879922426332 + }, + { + "epoch": 3.4438581061272355, + "grad_norm": 0.33987721764103, + "learning_rate": 0.00019194711687102188, + "loss": 2.962435483932495, + "step": 5875, + "token_acc": 0.3029061850402232 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 0.3808321135265712, + "learning_rate": 0.00019194330592360595, + "loss": 2.9305970668792725, + "step": 5876, + "token_acc": 0.3081130015709455 + }, + { + "epoch": 3.4450307827616533, + "grad_norm": 0.2838053016642811, + "learning_rate": 0.00019193949411250655, + "loss": 2.9481234550476074, + "step": 5877, + "token_acc": 0.30723581629513175 + }, + { + "epoch": 3.4456171210788624, + "grad_norm": 0.3145662793211528, + "learning_rate": 0.00019193568143775948, + "loss": 2.946491241455078, + "step": 5878, + "token_acc": 0.3062626314971239 + }, + { + "epoch": 3.4462034593960715, + "grad_norm": 0.2697320374695421, + "learning_rate": 0.0001919318678994006, + "loss": 2.966817617416382, + "step": 5879, + "token_acc": 0.3026582616760725 + }, + { + "epoch": 3.4467897977132806, + "grad_norm": 0.363552144063481, + "learning_rate": 0.00019192805349746566, + "loss": 2.9571943283081055, + "step": 5880, + "token_acc": 0.3042139293064968 + }, + { + "epoch": 3.4473761360304898, + "grad_norm": 0.27708434208363003, + "learning_rate": 0.00019192423823199056, + "loss": 2.964855194091797, + "step": 5881, + "token_acc": 0.30335577381029 + }, + { + "epoch": 3.4479624743476984, + "grad_norm": 0.3244346920691581, + "learning_rate": 0.0001919204221030111, + "loss": 2.9121336936950684, + "step": 5882, + "token_acc": 0.30956103582173994 + }, + { + "epoch": 3.4485488126649075, + "grad_norm": 0.2976823632115045, + "learning_rate": 0.00019191660511056315, + "loss": 2.9448509216308594, + "step": 5883, + "token_acc": 0.3066706904311585 + }, + { + "epoch": 3.4491351509821166, + "grad_norm": 0.3073529388971304, + "learning_rate": 0.00019191278725468256, + "loss": 2.990128755569458, + "step": 5884, + "token_acc": 0.2994873164081855 + }, + { + "epoch": 3.4497214892993258, + "grad_norm": 0.27464180728062915, + "learning_rate": 0.00019190896853540516, + "loss": 3.000974416732788, + "step": 5885, + "token_acc": 0.2983952753087156 + }, + { + "epoch": 3.450307827616535, + "grad_norm": 0.32274414349210445, + "learning_rate": 0.00019190514895276687, + "loss": 2.946810722351074, + "step": 5886, + "token_acc": 0.3046700788983221 + }, + { + "epoch": 3.450894165933744, + "grad_norm": 0.2500161355313248, + "learning_rate": 0.00019190132850680356, + "loss": 2.9476499557495117, + "step": 5887, + "token_acc": 0.30469603107243803 + }, + { + "epoch": 3.4514805042509527, + "grad_norm": 0.32535656992806533, + "learning_rate": 0.00019189750719755106, + "loss": 2.953047513961792, + "step": 5888, + "token_acc": 0.30638956270897527 + }, + { + "epoch": 3.4520668425681618, + "grad_norm": 0.2516392505890602, + "learning_rate": 0.00019189368502504537, + "loss": 3.0043559074401855, + "step": 5889, + "token_acc": 0.2995595420521008 + }, + { + "epoch": 3.452653180885371, + "grad_norm": 0.29238211454016416, + "learning_rate": 0.0001918898619893223, + "loss": 2.9752047061920166, + "step": 5890, + "token_acc": 0.30282363001947876 + }, + { + "epoch": 3.45323951920258, + "grad_norm": 0.2662407422602763, + "learning_rate": 0.0001918860380904178, + "loss": 2.956540822982788, + "step": 5891, + "token_acc": 0.30357039411435305 + }, + { + "epoch": 3.453825857519789, + "grad_norm": 0.2671895066997925, + "learning_rate": 0.00019188221332836782, + "loss": 2.9923410415649414, + "step": 5892, + "token_acc": 0.29959832367442735 + }, + { + "epoch": 3.4544121958369978, + "grad_norm": 0.24488016676535831, + "learning_rate": 0.00019187838770320825, + "loss": 2.9865169525146484, + "step": 5893, + "token_acc": 0.2999305994020069 + }, + { + "epoch": 3.454998534154207, + "grad_norm": 0.2798155865349904, + "learning_rate": 0.000191874561214975, + "loss": 2.9410288333892822, + "step": 5894, + "token_acc": 0.30575499385605914 + }, + { + "epoch": 3.455584872471416, + "grad_norm": 0.2265680719449728, + "learning_rate": 0.00019187073386370412, + "loss": 2.9648513793945312, + "step": 5895, + "token_acc": 0.3040691886510702 + }, + { + "epoch": 3.456171210788625, + "grad_norm": 0.28194767823713174, + "learning_rate": 0.0001918669056494314, + "loss": 2.98783802986145, + "step": 5896, + "token_acc": 0.30117085465315824 + }, + { + "epoch": 3.456757549105834, + "grad_norm": 0.25318336622534454, + "learning_rate": 0.00019186307657219297, + "loss": 2.9368224143981934, + "step": 5897, + "token_acc": 0.3067925409134528 + }, + { + "epoch": 3.4573438874230433, + "grad_norm": 0.28508462528369305, + "learning_rate": 0.00019185924663202468, + "loss": 2.9496960639953613, + "step": 5898, + "token_acc": 0.3055572795099772 + }, + { + "epoch": 3.457930225740252, + "grad_norm": 0.24727390420822423, + "learning_rate": 0.00019185541582896257, + "loss": 2.9856793880462646, + "step": 5899, + "token_acc": 0.29950375102035526 + }, + { + "epoch": 3.458516564057461, + "grad_norm": 0.30468635485115086, + "learning_rate": 0.0001918515841630426, + "loss": 2.961728811264038, + "step": 5900, + "token_acc": 0.30219640910766143 + }, + { + "epoch": 3.45910290237467, + "grad_norm": 0.3022250658130902, + "learning_rate": 0.0001918477516343008, + "loss": 3.014646291732788, + "step": 5901, + "token_acc": 0.2950308408458471 + }, + { + "epoch": 3.4596892406918793, + "grad_norm": 0.28444301797341726, + "learning_rate": 0.00019184391824277308, + "loss": 2.967391014099121, + "step": 5902, + "token_acc": 0.3036778113856859 + }, + { + "epoch": 3.460275579009088, + "grad_norm": 0.3159989168597951, + "learning_rate": 0.00019184008398849555, + "loss": 2.971372365951538, + "step": 5903, + "token_acc": 0.3022410567318604 + }, + { + "epoch": 3.460861917326297, + "grad_norm": 0.24972851251337871, + "learning_rate": 0.00019183624887150416, + "loss": 2.993523120880127, + "step": 5904, + "token_acc": 0.2995925671786597 + }, + { + "epoch": 3.4614482556435062, + "grad_norm": 0.22663752289101194, + "learning_rate": 0.000191832412891835, + "loss": 2.971950054168701, + "step": 5905, + "token_acc": 0.3025093106708384 + }, + { + "epoch": 3.4620345939607153, + "grad_norm": 0.2321702861276606, + "learning_rate": 0.00019182857604952403, + "loss": 2.9795827865600586, + "step": 5906, + "token_acc": 0.30172230437336683 + }, + { + "epoch": 3.4626209322779244, + "grad_norm": 0.2443493680222593, + "learning_rate": 0.00019182473834460735, + "loss": 3.0340938568115234, + "step": 5907, + "token_acc": 0.29362152555440846 + }, + { + "epoch": 3.4632072705951336, + "grad_norm": 0.25724197474042537, + "learning_rate": 0.00019182089977712096, + "loss": 3.009056806564331, + "step": 5908, + "token_acc": 0.29684896566791347 + }, + { + "epoch": 3.4637936089123427, + "grad_norm": 0.2406065997317154, + "learning_rate": 0.00019181706034710098, + "loss": 2.9492576122283936, + "step": 5909, + "token_acc": 0.30258969061448304 + }, + { + "epoch": 3.4643799472295513, + "grad_norm": 0.2362749359692757, + "learning_rate": 0.00019181322005458343, + "loss": 2.939565420150757, + "step": 5910, + "token_acc": 0.30689354389345275 + }, + { + "epoch": 3.4649662855467604, + "grad_norm": 0.25374939802019236, + "learning_rate": 0.0001918093788996044, + "loss": 2.971625328063965, + "step": 5911, + "token_acc": 0.30232774377284705 + }, + { + "epoch": 3.4655526238639696, + "grad_norm": 0.2439709356329249, + "learning_rate": 0.00019180553688219996, + "loss": 2.951507091522217, + "step": 5912, + "token_acc": 0.3063108774448814 + }, + { + "epoch": 3.4661389621811787, + "grad_norm": 0.268728123262901, + "learning_rate": 0.00019180169400240623, + "loss": 2.983198642730713, + "step": 5913, + "token_acc": 0.2990803356251058 + }, + { + "epoch": 3.4667253004983873, + "grad_norm": 0.2718096451312759, + "learning_rate": 0.00019179785026025926, + "loss": 2.978200912475586, + "step": 5914, + "token_acc": 0.3020013256588341 + }, + { + "epoch": 3.4673116388155965, + "grad_norm": 0.24381964506274523, + "learning_rate": 0.00019179400565579524, + "loss": 2.94586181640625, + "step": 5915, + "token_acc": 0.30540347839372567 + }, + { + "epoch": 3.4678979771328056, + "grad_norm": 0.24442445861638454, + "learning_rate": 0.00019179016018905018, + "loss": 3.0127201080322266, + "step": 5916, + "token_acc": 0.2972553067805314 + }, + { + "epoch": 3.4684843154500147, + "grad_norm": 0.28595755196948425, + "learning_rate": 0.00019178631386006028, + "loss": 2.966919422149658, + "step": 5917, + "token_acc": 0.3035462828743011 + }, + { + "epoch": 3.469070653767224, + "grad_norm": 0.28765299683926016, + "learning_rate": 0.0001917824666688616, + "loss": 3.0000627040863037, + "step": 5918, + "token_acc": 0.2962995553669129 + }, + { + "epoch": 3.469656992084433, + "grad_norm": 0.2480596497872555, + "learning_rate": 0.00019177861861549038, + "loss": 2.937091588973999, + "step": 5919, + "token_acc": 0.3074703205505898 + }, + { + "epoch": 3.4702433304016416, + "grad_norm": 0.30609569502955264, + "learning_rate": 0.0001917747696999827, + "loss": 2.99935245513916, + "step": 5920, + "token_acc": 0.29867321999301694 + }, + { + "epoch": 3.4708296687188507, + "grad_norm": 0.2981444080754157, + "learning_rate": 0.0001917709199223747, + "loss": 2.9981398582458496, + "step": 5921, + "token_acc": 0.29834112666814266 + }, + { + "epoch": 3.47141600703606, + "grad_norm": 0.3048733057883234, + "learning_rate": 0.0001917670692827026, + "loss": 2.9533164501190186, + "step": 5922, + "token_acc": 0.3048958453711592 + }, + { + "epoch": 3.472002345353269, + "grad_norm": 0.3315694913579945, + "learning_rate": 0.00019176321778100253, + "loss": 2.945148468017578, + "step": 5923, + "token_acc": 0.30529060275671244 + }, + { + "epoch": 3.472588683670478, + "grad_norm": 0.4083119145483531, + "learning_rate": 0.00019175936541731065, + "loss": 2.986443042755127, + "step": 5924, + "token_acc": 0.29981445173115784 + }, + { + "epoch": 3.4731750219876867, + "grad_norm": 0.5330171639928414, + "learning_rate": 0.00019175551219166324, + "loss": 2.99127197265625, + "step": 5925, + "token_acc": 0.29905280631321035 + }, + { + "epoch": 3.473761360304896, + "grad_norm": 0.36020582723180783, + "learning_rate": 0.00019175165810409638, + "loss": 2.9860711097717285, + "step": 5926, + "token_acc": 0.299273904057435 + }, + { + "epoch": 3.474347698622105, + "grad_norm": 0.3421320740531399, + "learning_rate": 0.00019174780315464637, + "loss": 2.9848809242248535, + "step": 5927, + "token_acc": 0.30097084811027097 + }, + { + "epoch": 3.474934036939314, + "grad_norm": 0.35924239148617054, + "learning_rate": 0.00019174394734334935, + "loss": 2.952925682067871, + "step": 5928, + "token_acc": 0.3055462229335591 + }, + { + "epoch": 3.475520375256523, + "grad_norm": 0.32831154255201883, + "learning_rate": 0.00019174009067024158, + "loss": 2.9695301055908203, + "step": 5929, + "token_acc": 0.3022690083716725 + }, + { + "epoch": 3.4761067135737322, + "grad_norm": 0.26454123890374015, + "learning_rate": 0.0001917362331353593, + "loss": 2.9432826042175293, + "step": 5930, + "token_acc": 0.306840718302589 + }, + { + "epoch": 3.476693051890941, + "grad_norm": 0.36257739541237993, + "learning_rate": 0.0001917323747387387, + "loss": 2.9671735763549805, + "step": 5931, + "token_acc": 0.302713078952146 + }, + { + "epoch": 3.47727939020815, + "grad_norm": 0.29407834638170904, + "learning_rate": 0.0001917285154804161, + "loss": 2.9727911949157715, + "step": 5932, + "token_acc": 0.30191120197640675 + }, + { + "epoch": 3.477865728525359, + "grad_norm": 0.3242559228594796, + "learning_rate": 0.00019172465536042762, + "loss": 2.987159013748169, + "step": 5933, + "token_acc": 0.30013832315651917 + }, + { + "epoch": 3.4784520668425682, + "grad_norm": 0.23961738964234222, + "learning_rate": 0.00019172079437880965, + "loss": 2.9654462337493896, + "step": 5934, + "token_acc": 0.3038325040293498 + }, + { + "epoch": 3.4790384051597774, + "grad_norm": 0.26835593058537643, + "learning_rate": 0.00019171693253559842, + "loss": 3.0013129711151123, + "step": 5935, + "token_acc": 0.29873808976422206 + }, + { + "epoch": 3.479624743476986, + "grad_norm": 0.2768649688424807, + "learning_rate": 0.00019171306983083018, + "loss": 2.9207587242126465, + "step": 5936, + "token_acc": 0.3091900190345786 + }, + { + "epoch": 3.480211081794195, + "grad_norm": 0.2579219523049475, + "learning_rate": 0.00019170920626454126, + "loss": 2.9746956825256348, + "step": 5937, + "token_acc": 0.3013603689964391 + }, + { + "epoch": 3.4807974201114043, + "grad_norm": 0.28624913096540316, + "learning_rate": 0.0001917053418367679, + "loss": 2.9514999389648438, + "step": 5938, + "token_acc": 0.30560697248870883 + }, + { + "epoch": 3.4813837584286134, + "grad_norm": 0.28017660361368973, + "learning_rate": 0.00019170147654754645, + "loss": 2.9392194747924805, + "step": 5939, + "token_acc": 0.3073414146542697 + }, + { + "epoch": 3.4819700967458225, + "grad_norm": 0.27024309058769097, + "learning_rate": 0.00019169761039691317, + "loss": 2.9616761207580566, + "step": 5940, + "token_acc": 0.30397879217728996 + }, + { + "epoch": 3.4825564350630316, + "grad_norm": 0.2906095967692996, + "learning_rate": 0.0001916937433849044, + "loss": 2.9783854484558105, + "step": 5941, + "token_acc": 0.3016020573287038 + }, + { + "epoch": 3.4831427733802403, + "grad_norm": 0.28464969221330594, + "learning_rate": 0.0001916898755115565, + "loss": 2.995286226272583, + "step": 5942, + "token_acc": 0.2989766216661179 + }, + { + "epoch": 3.4837291116974494, + "grad_norm": 0.2703042834407997, + "learning_rate": 0.00019168600677690574, + "loss": 2.960696220397949, + "step": 5943, + "token_acc": 0.3037984691860569 + }, + { + "epoch": 3.4843154500146585, + "grad_norm": 0.23950516711499512, + "learning_rate": 0.00019168213718098853, + "loss": 2.9468703269958496, + "step": 5944, + "token_acc": 0.30611996946750375 + }, + { + "epoch": 3.4849017883318676, + "grad_norm": 0.2650541051473338, + "learning_rate": 0.00019167826672384118, + "loss": 2.972216844558716, + "step": 5945, + "token_acc": 0.3019724566388194 + }, + { + "epoch": 3.4854881266490767, + "grad_norm": 0.23125605606574806, + "learning_rate": 0.00019167439540550003, + "loss": 3.006110668182373, + "step": 5946, + "token_acc": 0.29767965020489207 + }, + { + "epoch": 3.4860744649662854, + "grad_norm": 0.2648771795218388, + "learning_rate": 0.00019167052322600147, + "loss": 2.9724087715148926, + "step": 5947, + "token_acc": 0.3021266299233768 + }, + { + "epoch": 3.4866608032834945, + "grad_norm": 0.21692069966137825, + "learning_rate": 0.0001916666501853819, + "loss": 2.95220685005188, + "step": 5948, + "token_acc": 0.3044722180394519 + }, + { + "epoch": 3.4872471416007036, + "grad_norm": 0.2857513523779324, + "learning_rate": 0.00019166277628367766, + "loss": 2.9701128005981445, + "step": 5949, + "token_acc": 0.3019965204162137 + }, + { + "epoch": 3.4878334799179127, + "grad_norm": 0.24066926731927735, + "learning_rate": 0.00019165890152092515, + "loss": 2.9135375022888184, + "step": 5950, + "token_acc": 0.31098550489069027 + }, + { + "epoch": 3.488419818235122, + "grad_norm": 0.2797701130028548, + "learning_rate": 0.00019165502589716077, + "loss": 2.9858896732330322, + "step": 5951, + "token_acc": 0.2996186681093063 + }, + { + "epoch": 3.489006156552331, + "grad_norm": 0.26454483315812455, + "learning_rate": 0.00019165114941242092, + "loss": 2.9886817932128906, + "step": 5952, + "token_acc": 0.3003088406659224 + }, + { + "epoch": 3.4895924948695396, + "grad_norm": 0.2734291060499896, + "learning_rate": 0.00019164727206674205, + "loss": 2.9445204734802246, + "step": 5953, + "token_acc": 0.30565628358763675 + }, + { + "epoch": 3.4901788331867487, + "grad_norm": 0.27391631781102205, + "learning_rate": 0.00019164339386016054, + "loss": 2.954437732696533, + "step": 5954, + "token_acc": 0.30696444576448045 + }, + { + "epoch": 3.490765171503958, + "grad_norm": 0.2760405197135756, + "learning_rate": 0.00019163951479271284, + "loss": 2.947877883911133, + "step": 5955, + "token_acc": 0.30558461589619107 + }, + { + "epoch": 3.491351509821167, + "grad_norm": 0.2567047615665823, + "learning_rate": 0.00019163563486443536, + "loss": 2.9776127338409424, + "step": 5956, + "token_acc": 0.30010125640601754 + }, + { + "epoch": 3.4919378481383756, + "grad_norm": 0.2617237673759999, + "learning_rate": 0.00019163175407536456, + "loss": 2.944434642791748, + "step": 5957, + "token_acc": 0.30643962361322175 + }, + { + "epoch": 3.4925241864555847, + "grad_norm": 0.2596208956426614, + "learning_rate": 0.00019162787242553696, + "loss": 2.989205837249756, + "step": 5958, + "token_acc": 0.29953831852116103 + }, + { + "epoch": 3.493110524772794, + "grad_norm": 0.28449159274285146, + "learning_rate": 0.00019162398991498896, + "loss": 2.9705100059509277, + "step": 5959, + "token_acc": 0.3023345978121223 + }, + { + "epoch": 3.493696863090003, + "grad_norm": 0.3722119719732468, + "learning_rate": 0.000191620106543757, + "loss": 2.962167978286743, + "step": 5960, + "token_acc": 0.3022338679092662 + }, + { + "epoch": 3.494283201407212, + "grad_norm": 0.31678204837613766, + "learning_rate": 0.00019161622231187762, + "loss": 2.973284959793091, + "step": 5961, + "token_acc": 0.30227613870972697 + }, + { + "epoch": 3.494869539724421, + "grad_norm": 0.31562684752949294, + "learning_rate": 0.00019161233721938728, + "loss": 2.977548122406006, + "step": 5962, + "token_acc": 0.3009485836395382 + }, + { + "epoch": 3.49545587804163, + "grad_norm": 0.30816953952720527, + "learning_rate": 0.0001916084512663225, + "loss": 2.9610483646392822, + "step": 5963, + "token_acc": 0.3033335400324518 + }, + { + "epoch": 3.496042216358839, + "grad_norm": 0.29575441417277193, + "learning_rate": 0.00019160456445271976, + "loss": 2.94991397857666, + "step": 5964, + "token_acc": 0.3039774905924322 + }, + { + "epoch": 3.496628554676048, + "grad_norm": 0.3130404269373462, + "learning_rate": 0.00019160067677861557, + "loss": 2.925572156906128, + "step": 5965, + "token_acc": 0.30927930823231664 + }, + { + "epoch": 3.497214892993257, + "grad_norm": 0.2613943744192855, + "learning_rate": 0.00019159678824404646, + "loss": 2.9770150184631348, + "step": 5966, + "token_acc": 0.3013259851863771 + }, + { + "epoch": 3.4978012313104663, + "grad_norm": 0.28558824248420767, + "learning_rate": 0.00019159289884904893, + "loss": 2.9472172260284424, + "step": 5967, + "token_acc": 0.30584574848984625 + }, + { + "epoch": 3.498387569627675, + "grad_norm": 0.26391575478831697, + "learning_rate": 0.00019158900859365957, + "loss": 2.963912010192871, + "step": 5968, + "token_acc": 0.3035536272307017 + }, + { + "epoch": 3.498973907944884, + "grad_norm": 0.3059708096545487, + "learning_rate": 0.00019158511747791488, + "loss": 2.9502649307250977, + "step": 5969, + "token_acc": 0.3054786276535688 + }, + { + "epoch": 3.499560246262093, + "grad_norm": 0.2735953846247341, + "learning_rate": 0.00019158122550185143, + "loss": 2.949164867401123, + "step": 5970, + "token_acc": 0.3055238553378333 + }, + { + "epoch": 3.5001465845793023, + "grad_norm": 0.2966843000950639, + "learning_rate": 0.00019157733266550575, + "loss": 2.964095115661621, + "step": 5971, + "token_acc": 0.3029662707349402 + }, + { + "epoch": 3.5007329228965114, + "grad_norm": 0.2754068453937649, + "learning_rate": 0.00019157343896891447, + "loss": 2.966104507446289, + "step": 5972, + "token_acc": 0.3034383100359415 + }, + { + "epoch": 3.5013192612137205, + "grad_norm": 0.27796850215277635, + "learning_rate": 0.00019156954441211407, + "loss": 2.9578073024749756, + "step": 5973, + "token_acc": 0.3047558879884775 + }, + { + "epoch": 3.5019055995309296, + "grad_norm": 0.21251071188468645, + "learning_rate": 0.00019156564899514125, + "loss": 2.954092264175415, + "step": 5974, + "token_acc": 0.3045309407573559 + }, + { + "epoch": 3.5024919378481383, + "grad_norm": 0.27669042795923077, + "learning_rate": 0.0001915617527180325, + "loss": 2.9644598960876465, + "step": 5975, + "token_acc": 0.30210262335945426 + }, + { + "epoch": 3.5030782761653474, + "grad_norm": 0.22348088071516292, + "learning_rate": 0.00019155785558082447, + "loss": 2.977642297744751, + "step": 5976, + "token_acc": 0.30237052499113615 + }, + { + "epoch": 3.5036646144825565, + "grad_norm": 0.2345331764004061, + "learning_rate": 0.00019155395758355378, + "loss": 2.9618120193481445, + "step": 5977, + "token_acc": 0.30364657778486087 + }, + { + "epoch": 3.5042509527997656, + "grad_norm": 0.2457726618346513, + "learning_rate": 0.00019155005872625703, + "loss": 3.006176471710205, + "step": 5978, + "token_acc": 0.2977274147520153 + }, + { + "epoch": 3.5048372911169743, + "grad_norm": 0.24194895381652184, + "learning_rate": 0.00019154615900897082, + "loss": 3.0002450942993164, + "step": 5979, + "token_acc": 0.29823697119617626 + }, + { + "epoch": 3.5054236294341834, + "grad_norm": 0.2792292866424761, + "learning_rate": 0.00019154225843173186, + "loss": 2.9755301475524902, + "step": 5980, + "token_acc": 0.303581882585905 + }, + { + "epoch": 3.5060099677513925, + "grad_norm": 0.2471223514085659, + "learning_rate": 0.0001915383569945767, + "loss": 2.9741172790527344, + "step": 5981, + "token_acc": 0.3012835567072924 + }, + { + "epoch": 3.5065963060686016, + "grad_norm": 0.2707817599767387, + "learning_rate": 0.00019153445469754203, + "loss": 3.0109848976135254, + "step": 5982, + "token_acc": 0.297413759556689 + }, + { + "epoch": 3.5071826443858107, + "grad_norm": 0.28452447527035873, + "learning_rate": 0.00019153055154066452, + "loss": 2.976449489593506, + "step": 5983, + "token_acc": 0.3006461837274823 + }, + { + "epoch": 3.50776898270302, + "grad_norm": 0.23894854149430764, + "learning_rate": 0.00019152664752398077, + "loss": 2.9401111602783203, + "step": 5984, + "token_acc": 0.3053380672166478 + }, + { + "epoch": 3.5083553210202285, + "grad_norm": 0.2576243936857398, + "learning_rate": 0.00019152274264752755, + "loss": 2.944181203842163, + "step": 5985, + "token_acc": 0.30503817439895486 + }, + { + "epoch": 3.5089416593374376, + "grad_norm": 0.26725361499836425, + "learning_rate": 0.00019151883691134145, + "loss": 2.981978416442871, + "step": 5986, + "token_acc": 0.3019366355221683 + }, + { + "epoch": 3.5095279976546467, + "grad_norm": 0.2988927849403871, + "learning_rate": 0.00019151493031545921, + "loss": 2.9759116172790527, + "step": 5987, + "token_acc": 0.3009395901900414 + }, + { + "epoch": 3.510114335971856, + "grad_norm": 0.33410454060795103, + "learning_rate": 0.00019151102285991752, + "loss": 3.0012550354003906, + "step": 5988, + "token_acc": 0.296885525169378 + }, + { + "epoch": 3.5107006742890645, + "grad_norm": 0.3175448001381692, + "learning_rate": 0.0001915071145447531, + "loss": 2.9826645851135254, + "step": 5989, + "token_acc": 0.29973003505831564 + }, + { + "epoch": 3.5112870126062736, + "grad_norm": 0.3112057511099924, + "learning_rate": 0.00019150320537000265, + "loss": 2.92803955078125, + "step": 5990, + "token_acc": 0.3081505418881185 + }, + { + "epoch": 3.5118733509234827, + "grad_norm": 0.3075334359857697, + "learning_rate": 0.00019149929533570286, + "loss": 2.9625308513641357, + "step": 5991, + "token_acc": 0.3040020160512653 + }, + { + "epoch": 3.512459689240692, + "grad_norm": 0.2754022461201159, + "learning_rate": 0.0001914953844418905, + "loss": 2.977827548980713, + "step": 5992, + "token_acc": 0.3026272628492937 + }, + { + "epoch": 3.513046027557901, + "grad_norm": 0.2874550711683382, + "learning_rate": 0.0001914914726886023, + "loss": 2.95717191696167, + "step": 5993, + "token_acc": 0.3039518891412711 + }, + { + "epoch": 3.51363236587511, + "grad_norm": 0.29715397657401443, + "learning_rate": 0.00019148756007587498, + "loss": 2.951174259185791, + "step": 5994, + "token_acc": 0.3068908952351429 + }, + { + "epoch": 3.514218704192319, + "grad_norm": 0.31373214469219995, + "learning_rate": 0.00019148364660374534, + "loss": 2.9671683311462402, + "step": 5995, + "token_acc": 0.3023500025488097 + }, + { + "epoch": 3.514805042509528, + "grad_norm": 0.2800442659357902, + "learning_rate": 0.0001914797322722501, + "loss": 2.962876796722412, + "step": 5996, + "token_acc": 0.30265794677769825 + }, + { + "epoch": 3.515391380826737, + "grad_norm": 0.30863886341230845, + "learning_rate": 0.00019147581708142604, + "loss": 2.9880080223083496, + "step": 5997, + "token_acc": 0.2988402088882364 + }, + { + "epoch": 3.515977719143946, + "grad_norm": 0.2746478330876217, + "learning_rate": 0.00019147190103130997, + "loss": 2.992375373840332, + "step": 5998, + "token_acc": 0.300004164085801 + }, + { + "epoch": 3.516564057461155, + "grad_norm": 0.3000778096289024, + "learning_rate": 0.00019146798412193863, + "loss": 2.9545109272003174, + "step": 5999, + "token_acc": 0.3052149717260569 + }, + { + "epoch": 3.517150395778364, + "grad_norm": 0.2799244239958438, + "learning_rate": 0.00019146406635334884, + "loss": 2.93007230758667, + "step": 6000, + "token_acc": 0.30757477201709077 + }, + { + "epoch": 3.517736734095573, + "grad_norm": 0.2548022953685049, + "learning_rate": 0.0001914601477255774, + "loss": 2.9795355796813965, + "step": 6001, + "token_acc": 0.30046408201275904 + }, + { + "epoch": 3.518323072412782, + "grad_norm": 0.2783344742546853, + "learning_rate": 0.00019145622823866113, + "loss": 3.0100769996643066, + "step": 6002, + "token_acc": 0.29657779343089047 + }, + { + "epoch": 3.518909410729991, + "grad_norm": 0.2771245296191965, + "learning_rate": 0.00019145230789263678, + "loss": 2.9748661518096924, + "step": 6003, + "token_acc": 0.30046849246108753 + }, + { + "epoch": 3.5194957490472003, + "grad_norm": 0.32731669117424095, + "learning_rate": 0.00019144838668754127, + "loss": 2.959407329559326, + "step": 6004, + "token_acc": 0.30438826283074727 + }, + { + "epoch": 3.5200820873644094, + "grad_norm": 0.31399821279565865, + "learning_rate": 0.0001914444646234114, + "loss": 3.0213232040405273, + "step": 6005, + "token_acc": 0.2946626829274702 + }, + { + "epoch": 3.5206684256816185, + "grad_norm": 0.3044822240328191, + "learning_rate": 0.000191440541700284, + "loss": 2.960515022277832, + "step": 6006, + "token_acc": 0.30416292891507396 + }, + { + "epoch": 3.521254763998827, + "grad_norm": 0.2877993958307413, + "learning_rate": 0.00019143661791819593, + "loss": 2.9200243949890137, + "step": 6007, + "token_acc": 0.30947370600735613 + }, + { + "epoch": 3.5218411023160363, + "grad_norm": 0.3410656092491032, + "learning_rate": 0.00019143269327718404, + "loss": 3.0075554847717285, + "step": 6008, + "token_acc": 0.2958289740382674 + }, + { + "epoch": 3.5224274406332454, + "grad_norm": 0.26219905920051323, + "learning_rate": 0.00019142876777728521, + "loss": 2.9884064197540283, + "step": 6009, + "token_acc": 0.2993108186998548 + }, + { + "epoch": 3.5230137789504545, + "grad_norm": 0.3095821787519087, + "learning_rate": 0.00019142484141853632, + "loss": 2.9131128787994385, + "step": 6010, + "token_acc": 0.3106748877105509 + }, + { + "epoch": 3.523600117267663, + "grad_norm": 0.32243002260269915, + "learning_rate": 0.0001914209142009742, + "loss": 2.9589967727661133, + "step": 6011, + "token_acc": 0.30376123401313515 + }, + { + "epoch": 3.5241864555848723, + "grad_norm": 0.32067188911130157, + "learning_rate": 0.0001914169861246358, + "loss": 2.967515230178833, + "step": 6012, + "token_acc": 0.30306979216173263 + }, + { + "epoch": 3.5247727939020814, + "grad_norm": 0.2570945984495637, + "learning_rate": 0.00019141305718955805, + "loss": 2.975963830947876, + "step": 6013, + "token_acc": 0.30258381435792453 + }, + { + "epoch": 3.5253591322192905, + "grad_norm": 0.3187663465487429, + "learning_rate": 0.00019140912739577773, + "loss": 2.947457790374756, + "step": 6014, + "token_acc": 0.30632316241898344 + }, + { + "epoch": 3.5259454705364996, + "grad_norm": 0.24985528843312174, + "learning_rate": 0.0001914051967433319, + "loss": 3.013056993484497, + "step": 6015, + "token_acc": 0.29877900076414066 + }, + { + "epoch": 3.5265318088537088, + "grad_norm": 0.2663958130762031, + "learning_rate": 0.0001914012652322574, + "loss": 2.961904525756836, + "step": 6016, + "token_acc": 0.3045105082395942 + }, + { + "epoch": 3.527118147170918, + "grad_norm": 0.2587305019331022, + "learning_rate": 0.00019139733286259117, + "loss": 2.9744646549224854, + "step": 6017, + "token_acc": 0.30235633282448343 + }, + { + "epoch": 3.5277044854881265, + "grad_norm": 0.2420367072388504, + "learning_rate": 0.00019139339963437015, + "loss": 2.9683353900909424, + "step": 6018, + "token_acc": 0.30419167032346606 + }, + { + "epoch": 3.5282908238053357, + "grad_norm": 0.2531436895818353, + "learning_rate": 0.0001913894655476313, + "loss": 3.000499725341797, + "step": 6019, + "token_acc": 0.3002289928409032 + }, + { + "epoch": 3.5288771621225448, + "grad_norm": 0.23680796629105214, + "learning_rate": 0.0001913855306024116, + "loss": 2.92912220954895, + "step": 6020, + "token_acc": 0.30839090274004505 + }, + { + "epoch": 3.529463500439754, + "grad_norm": 0.24746751587969099, + "learning_rate": 0.0001913815947987479, + "loss": 2.9691295623779297, + "step": 6021, + "token_acc": 0.30369165351286925 + }, + { + "epoch": 3.5300498387569625, + "grad_norm": 0.25343285746236516, + "learning_rate": 0.00019137765813667735, + "loss": 2.951542854309082, + "step": 6022, + "token_acc": 0.3051863771262021 + }, + { + "epoch": 3.5306361770741717, + "grad_norm": 0.2395097438191023, + "learning_rate": 0.00019137372061623674, + "loss": 2.9944005012512207, + "step": 6023, + "token_acc": 0.29984529533459775 + }, + { + "epoch": 3.5312225153913808, + "grad_norm": 0.27779986217368247, + "learning_rate": 0.00019136978223746324, + "loss": 2.9297263622283936, + "step": 6024, + "token_acc": 0.3095740392719354 + }, + { + "epoch": 3.53180885370859, + "grad_norm": 0.2352336156518264, + "learning_rate": 0.00019136584300039373, + "loss": 2.963967800140381, + "step": 6025, + "token_acc": 0.3038155186456379 + }, + { + "epoch": 3.532395192025799, + "grad_norm": 0.24621379909754137, + "learning_rate": 0.00019136190290506525, + "loss": 2.945768356323242, + "step": 6026, + "token_acc": 0.30647433549777187 + }, + { + "epoch": 3.532981530343008, + "grad_norm": 0.27635081217937774, + "learning_rate": 0.00019135796195151477, + "loss": 2.9718778133392334, + "step": 6027, + "token_acc": 0.3021337747011262 + }, + { + "epoch": 3.533567868660217, + "grad_norm": 0.27390439039592546, + "learning_rate": 0.00019135402013977935, + "loss": 2.9786605834960938, + "step": 6028, + "token_acc": 0.3001400334206156 + }, + { + "epoch": 3.534154206977426, + "grad_norm": 0.26332098154671874, + "learning_rate": 0.00019135007746989605, + "loss": 2.932772636413574, + "step": 6029, + "token_acc": 0.3082523559287203 + }, + { + "epoch": 3.534740545294635, + "grad_norm": 0.3533609470335083, + "learning_rate": 0.00019134613394190182, + "loss": 2.961691379547119, + "step": 6030, + "token_acc": 0.30266378689704826 + }, + { + "epoch": 3.535326883611844, + "grad_norm": 0.30447870670353966, + "learning_rate": 0.00019134218955583378, + "loss": 2.9801721572875977, + "step": 6031, + "token_acc": 0.3017181579110844 + }, + { + "epoch": 3.535913221929053, + "grad_norm": 0.25792664374032487, + "learning_rate": 0.00019133824431172896, + "loss": 3.0026698112487793, + "step": 6032, + "token_acc": 0.29789519354906985 + }, + { + "epoch": 3.536499560246262, + "grad_norm": 0.29593564831233565, + "learning_rate": 0.0001913342982096244, + "loss": 2.9800641536712646, + "step": 6033, + "token_acc": 0.30277789390075666 + }, + { + "epoch": 3.537085898563471, + "grad_norm": 0.34209049206529407, + "learning_rate": 0.0001913303512495572, + "loss": 2.939664363861084, + "step": 6034, + "token_acc": 0.3076003974064003 + }, + { + "epoch": 3.53767223688068, + "grad_norm": 0.2896585574680046, + "learning_rate": 0.0001913264034315644, + "loss": 3.009415626525879, + "step": 6035, + "token_acc": 0.2963400360341404 + }, + { + "epoch": 3.538258575197889, + "grad_norm": 0.3105310238269969, + "learning_rate": 0.00019132245475568312, + "loss": 2.9269728660583496, + "step": 6036, + "token_acc": 0.30827511800404583 + }, + { + "epoch": 3.5388449135150983, + "grad_norm": 0.3147159700402227, + "learning_rate": 0.0001913185052219504, + "loss": 2.9490199089050293, + "step": 6037, + "token_acc": 0.3049728133470569 + }, + { + "epoch": 3.5394312518323074, + "grad_norm": 0.3072956211798509, + "learning_rate": 0.0001913145548304034, + "loss": 2.9548277854919434, + "step": 6038, + "token_acc": 0.30501826778422525 + }, + { + "epoch": 3.540017590149516, + "grad_norm": 0.2529286585513444, + "learning_rate": 0.00019131060358107922, + "loss": 2.991983413696289, + "step": 6039, + "token_acc": 0.29849948795583064 + }, + { + "epoch": 3.5406039284667252, + "grad_norm": 0.2806392070049623, + "learning_rate": 0.00019130665147401495, + "loss": 3.004472255706787, + "step": 6040, + "token_acc": 0.2967159994180587 + }, + { + "epoch": 3.5411902667839343, + "grad_norm": 0.2478941367237467, + "learning_rate": 0.00019130269850924772, + "loss": 2.9572386741638184, + "step": 6041, + "token_acc": 0.30560991093845113 + }, + { + "epoch": 3.5417766051011434, + "grad_norm": 0.2813861725821734, + "learning_rate": 0.0001912987446868147, + "loss": 2.97723388671875, + "step": 6042, + "token_acc": 0.2999278058253404 + }, + { + "epoch": 3.542362943418352, + "grad_norm": 0.26339464834253684, + "learning_rate": 0.00019129479000675294, + "loss": 2.9609756469726562, + "step": 6043, + "token_acc": 0.30324932208574645 + }, + { + "epoch": 3.5429492817355612, + "grad_norm": 0.28261798315166753, + "learning_rate": 0.0001912908344690997, + "loss": 2.9773683547973633, + "step": 6044, + "token_acc": 0.30172688375253487 + }, + { + "epoch": 3.5435356200527703, + "grad_norm": 0.296373810890885, + "learning_rate": 0.00019128687807389206, + "loss": 2.982858180999756, + "step": 6045, + "token_acc": 0.3007664228660713 + }, + { + "epoch": 3.5441219583699795, + "grad_norm": 0.2509418418327762, + "learning_rate": 0.00019128292082116723, + "loss": 2.9923007488250732, + "step": 6046, + "token_acc": 0.2995224830879427 + }, + { + "epoch": 3.5447082966871886, + "grad_norm": 0.3074253950779621, + "learning_rate": 0.00019127896271096235, + "loss": 2.970160961151123, + "step": 6047, + "token_acc": 0.302632838926574 + }, + { + "epoch": 3.5452946350043977, + "grad_norm": 0.2638720717452515, + "learning_rate": 0.00019127500374331463, + "loss": 2.9973959922790527, + "step": 6048, + "token_acc": 0.29808754960056416 + }, + { + "epoch": 3.545880973321607, + "grad_norm": 0.29083701292481673, + "learning_rate": 0.00019127104391826122, + "loss": 2.9632906913757324, + "step": 6049, + "token_acc": 0.3022998305278589 + }, + { + "epoch": 3.5464673116388155, + "grad_norm": 0.2731822796334414, + "learning_rate": 0.00019126708323583937, + "loss": 2.9164652824401855, + "step": 6050, + "token_acc": 0.30911774309410306 + }, + { + "epoch": 3.5470536499560246, + "grad_norm": 0.2826184794460728, + "learning_rate": 0.00019126312169608623, + "loss": 3.0000228881835938, + "step": 6051, + "token_acc": 0.2981384833181905 + }, + { + "epoch": 3.5476399882732337, + "grad_norm": 0.2741298787269225, + "learning_rate": 0.0001912591592990391, + "loss": 2.9693102836608887, + "step": 6052, + "token_acc": 0.30116591973744616 + }, + { + "epoch": 3.548226326590443, + "grad_norm": 0.2934656590170281, + "learning_rate": 0.00019125519604473506, + "loss": 2.961246967315674, + "step": 6053, + "token_acc": 0.3043683353639471 + }, + { + "epoch": 3.5488126649076515, + "grad_norm": 0.23547395084827352, + "learning_rate": 0.0001912512319332115, + "loss": 3.0001492500305176, + "step": 6054, + "token_acc": 0.2987342940426961 + }, + { + "epoch": 3.5493990032248606, + "grad_norm": 0.302082094485623, + "learning_rate": 0.00019124726696450554, + "loss": 2.9944260120391846, + "step": 6055, + "token_acc": 0.2987072569724506 + }, + { + "epoch": 3.5499853415420697, + "grad_norm": 0.29697082626082094, + "learning_rate": 0.00019124330113865442, + "loss": 2.9592125415802, + "step": 6056, + "token_acc": 0.30211237246730854 + }, + { + "epoch": 3.550571679859279, + "grad_norm": 0.2440005186383839, + "learning_rate": 0.00019123933445569548, + "loss": 2.954178810119629, + "step": 6057, + "token_acc": 0.3049324539886926 + }, + { + "epoch": 3.551158018176488, + "grad_norm": 0.33028213890965935, + "learning_rate": 0.00019123536691566595, + "loss": 2.952855110168457, + "step": 6058, + "token_acc": 0.30381315235098405 + }, + { + "epoch": 3.551744356493697, + "grad_norm": 0.30310146488583645, + "learning_rate": 0.00019123139851860309, + "loss": 2.974757671356201, + "step": 6059, + "token_acc": 0.30103614226414693 + }, + { + "epoch": 3.552330694810906, + "grad_norm": 0.242211841473373, + "learning_rate": 0.00019122742926454416, + "loss": 2.9565038681030273, + "step": 6060, + "token_acc": 0.30527864930377024 + }, + { + "epoch": 3.552917033128115, + "grad_norm": 0.2554172839321172, + "learning_rate": 0.00019122345915352647, + "loss": 2.9752566814422607, + "step": 6061, + "token_acc": 0.30418943533697634 + }, + { + "epoch": 3.553503371445324, + "grad_norm": 0.2513359516711458, + "learning_rate": 0.0001912194881855873, + "loss": 2.977457046508789, + "step": 6062, + "token_acc": 0.3022908379473136 + }, + { + "epoch": 3.554089709762533, + "grad_norm": 0.2221724307294777, + "learning_rate": 0.00019121551636076397, + "loss": 2.9891786575317383, + "step": 6063, + "token_acc": 0.30040666843516356 + }, + { + "epoch": 3.554676048079742, + "grad_norm": 0.27445306566602073, + "learning_rate": 0.00019121154367909374, + "loss": 2.951536178588867, + "step": 6064, + "token_acc": 0.3063825793129341 + }, + { + "epoch": 3.555262386396951, + "grad_norm": 0.23490876893988244, + "learning_rate": 0.00019120757014061402, + "loss": 2.9403176307678223, + "step": 6065, + "token_acc": 0.306334242107677 + }, + { + "epoch": 3.55584872471416, + "grad_norm": 0.2516118373617787, + "learning_rate": 0.00019120359574536204, + "loss": 2.9542317390441895, + "step": 6066, + "token_acc": 0.30294476735085724 + }, + { + "epoch": 3.556435063031369, + "grad_norm": 0.2679040148691342, + "learning_rate": 0.0001911996204933752, + "loss": 3.0217373371124268, + "step": 6067, + "token_acc": 0.29510657050626105 + }, + { + "epoch": 3.557021401348578, + "grad_norm": 0.24653047779150936, + "learning_rate": 0.00019119564438469083, + "loss": 2.995137929916382, + "step": 6068, + "token_acc": 0.29786876254734357 + }, + { + "epoch": 3.5576077396657872, + "grad_norm": 0.2484886990235106, + "learning_rate": 0.00019119166741934622, + "loss": 3.000593662261963, + "step": 6069, + "token_acc": 0.2986035738952966 + }, + { + "epoch": 3.5581940779829964, + "grad_norm": 0.24711761129930102, + "learning_rate": 0.00019118768959737882, + "loss": 2.954603672027588, + "step": 6070, + "token_acc": 0.3050378924199513 + }, + { + "epoch": 3.5587804163002055, + "grad_norm": 0.228907024447038, + "learning_rate": 0.00019118371091882594, + "loss": 2.9427480697631836, + "step": 6071, + "token_acc": 0.3055746658337221 + }, + { + "epoch": 3.559366754617414, + "grad_norm": 0.2305080698567949, + "learning_rate": 0.00019117973138372497, + "loss": 2.985908031463623, + "step": 6072, + "token_acc": 0.2990233761019049 + }, + { + "epoch": 3.5599530929346233, + "grad_norm": 0.27209814437291874, + "learning_rate": 0.0001911757509921133, + "loss": 2.967372417449951, + "step": 6073, + "token_acc": 0.30308258482089934 + }, + { + "epoch": 3.5605394312518324, + "grad_norm": 0.3554476935124714, + "learning_rate": 0.00019117176974402827, + "loss": 2.9692063331604004, + "step": 6074, + "token_acc": 0.30515848018391994 + }, + { + "epoch": 3.5611257695690415, + "grad_norm": 0.3855799269006361, + "learning_rate": 0.00019116778763950736, + "loss": 3.009383201599121, + "step": 6075, + "token_acc": 0.29771396399318156 + }, + { + "epoch": 3.56171210788625, + "grad_norm": 0.23904301525289712, + "learning_rate": 0.00019116380467858792, + "loss": 2.9908876419067383, + "step": 6076, + "token_acc": 0.3011338105439156 + }, + { + "epoch": 3.5622984462034593, + "grad_norm": 0.3575263181958856, + "learning_rate": 0.00019115982086130738, + "loss": 2.9455742835998535, + "step": 6077, + "token_acc": 0.30734467085492934 + }, + { + "epoch": 3.5628847845206684, + "grad_norm": 0.33921296331241757, + "learning_rate": 0.00019115583618770318, + "loss": 2.958333969116211, + "step": 6078, + "token_acc": 0.30518011687452984 + }, + { + "epoch": 3.5634711228378775, + "grad_norm": 0.2646770780071063, + "learning_rate": 0.00019115185065781272, + "loss": 2.951984405517578, + "step": 6079, + "token_acc": 0.3062160521807272 + }, + { + "epoch": 3.5640574611550866, + "grad_norm": 0.3218159409310935, + "learning_rate": 0.00019114786427167343, + "loss": 2.9511821269989014, + "step": 6080, + "token_acc": 0.3055288741601833 + }, + { + "epoch": 3.5646437994722957, + "grad_norm": 0.2776972995036099, + "learning_rate": 0.00019114387702932282, + "loss": 2.9740653038024902, + "step": 6081, + "token_acc": 0.30222206701659565 + }, + { + "epoch": 3.565230137789505, + "grad_norm": 0.3172573973687408, + "learning_rate": 0.00019113988893079825, + "loss": 2.9364614486694336, + "step": 6082, + "token_acc": 0.30756670628201216 + }, + { + "epoch": 3.5658164761067135, + "grad_norm": 0.3393852101247649, + "learning_rate": 0.0001911358999761373, + "loss": 2.982854127883911, + "step": 6083, + "token_acc": 0.30059094504676376 + }, + { + "epoch": 3.5664028144239226, + "grad_norm": 0.32023888074587303, + "learning_rate": 0.00019113191016537732, + "loss": 2.972132682800293, + "step": 6084, + "token_acc": 0.30243817034460185 + }, + { + "epoch": 3.5669891527411317, + "grad_norm": 0.30657874179084604, + "learning_rate": 0.00019112791949855588, + "loss": 2.9916868209838867, + "step": 6085, + "token_acc": 0.299861806279459 + }, + { + "epoch": 3.567575491058341, + "grad_norm": 0.288638301280867, + "learning_rate": 0.00019112392797571043, + "loss": 2.993163824081421, + "step": 6086, + "token_acc": 0.2995280981383094 + }, + { + "epoch": 3.5681618293755495, + "grad_norm": 0.2925492244708194, + "learning_rate": 0.00019111993559687846, + "loss": 3.0008814334869385, + "step": 6087, + "token_acc": 0.2995426849039327 + }, + { + "epoch": 3.5687481676927586, + "grad_norm": 0.2917733902157744, + "learning_rate": 0.00019111594236209748, + "loss": 2.942704200744629, + "step": 6088, + "token_acc": 0.3065314344016577 + }, + { + "epoch": 3.5693345060099677, + "grad_norm": 0.24658253107296965, + "learning_rate": 0.000191111948271405, + "loss": 2.9848532676696777, + "step": 6089, + "token_acc": 0.2996251544937048 + }, + { + "epoch": 3.569920844327177, + "grad_norm": 0.2869888638321483, + "learning_rate": 0.00019110795332483854, + "loss": 2.9550015926361084, + "step": 6090, + "token_acc": 0.30549792265408476 + }, + { + "epoch": 3.570507182644386, + "grad_norm": 0.27216799171897926, + "learning_rate": 0.00019110395752243564, + "loss": 2.979416847229004, + "step": 6091, + "token_acc": 0.3008749185545477 + }, + { + "epoch": 3.571093520961595, + "grad_norm": 0.2660240850290858, + "learning_rate": 0.00019109996086423382, + "loss": 2.9608914852142334, + "step": 6092, + "token_acc": 0.303288056206089 + }, + { + "epoch": 3.5716798592788037, + "grad_norm": 0.2926574999332826, + "learning_rate": 0.00019109596335027063, + "loss": 2.943258762359619, + "step": 6093, + "token_acc": 0.30629834533922445 + }, + { + "epoch": 3.572266197596013, + "grad_norm": 0.3341132726023575, + "learning_rate": 0.00019109196498058362, + "loss": 2.951904296875, + "step": 6094, + "token_acc": 0.3052081071394 + }, + { + "epoch": 3.572852535913222, + "grad_norm": 0.3000757445052186, + "learning_rate": 0.0001910879657552103, + "loss": 2.975396156311035, + "step": 6095, + "token_acc": 0.30034560559228973 + }, + { + "epoch": 3.573438874230431, + "grad_norm": 0.30953948851146373, + "learning_rate": 0.00019108396567418833, + "loss": 2.9296205043792725, + "step": 6096, + "token_acc": 0.3092009539756386 + }, + { + "epoch": 3.5740252125476397, + "grad_norm": 0.2546143620690671, + "learning_rate": 0.00019107996473755523, + "loss": 2.9269824028015137, + "step": 6097, + "token_acc": 0.3071438952733917 + }, + { + "epoch": 3.574611550864849, + "grad_norm": 0.26516970969182374, + "learning_rate": 0.0001910759629453486, + "loss": 2.9254884719848633, + "step": 6098, + "token_acc": 0.3090405444715539 + }, + { + "epoch": 3.575197889182058, + "grad_norm": 0.25674193175851545, + "learning_rate": 0.00019107196029760602, + "loss": 2.9549310207366943, + "step": 6099, + "token_acc": 0.3044577241734409 + }, + { + "epoch": 3.575784227499267, + "grad_norm": 0.27554819296089095, + "learning_rate": 0.0001910679567943651, + "loss": 2.983686685562134, + "step": 6100, + "token_acc": 0.2995202908809435 + }, + { + "epoch": 3.576370565816476, + "grad_norm": 0.26104134159790215, + "learning_rate": 0.00019106395243566343, + "loss": 2.94295334815979, + "step": 6101, + "token_acc": 0.30498578863143594 + }, + { + "epoch": 3.5769569041336853, + "grad_norm": 0.23322966523244412, + "learning_rate": 0.00019105994722153863, + "loss": 2.9615185260772705, + "step": 6102, + "token_acc": 0.3033024819574927 + }, + { + "epoch": 3.5775432424508944, + "grad_norm": 0.23826787140074537, + "learning_rate": 0.00019105594115202833, + "loss": 2.9407341480255127, + "step": 6103, + "token_acc": 0.3082021583284435 + }, + { + "epoch": 3.578129580768103, + "grad_norm": 0.23653677545330673, + "learning_rate": 0.0001910519342271702, + "loss": 2.9667861461639404, + "step": 6104, + "token_acc": 0.30217488813182825 + }, + { + "epoch": 3.578715919085312, + "grad_norm": 0.2659788321623413, + "learning_rate": 0.0001910479264470018, + "loss": 2.9569573402404785, + "step": 6105, + "token_acc": 0.30457583058081816 + }, + { + "epoch": 3.5793022574025213, + "grad_norm": 0.25319488649102495, + "learning_rate": 0.00019104391781156084, + "loss": 2.93888258934021, + "step": 6106, + "token_acc": 0.30696923564210765 + }, + { + "epoch": 3.5798885957197304, + "grad_norm": 0.2358698665775989, + "learning_rate": 0.00019103990832088498, + "loss": 3.006730556488037, + "step": 6107, + "token_acc": 0.2973490717525425 + }, + { + "epoch": 3.580474934036939, + "grad_norm": 0.24685626565436072, + "learning_rate": 0.0001910358979750118, + "loss": 3.016669273376465, + "step": 6108, + "token_acc": 0.2964768522411908 + }, + { + "epoch": 3.581061272354148, + "grad_norm": 0.25600265813579076, + "learning_rate": 0.00019103188677397904, + "loss": 2.954880714416504, + "step": 6109, + "token_acc": 0.3040211349756399 + }, + { + "epoch": 3.5816476106713573, + "grad_norm": 0.26135045808462115, + "learning_rate": 0.00019102787471782443, + "loss": 2.953213691711426, + "step": 6110, + "token_acc": 0.30506230849904376 + }, + { + "epoch": 3.5822339489885664, + "grad_norm": 0.2765286104288164, + "learning_rate": 0.00019102386180658556, + "loss": 2.936375379562378, + "step": 6111, + "token_acc": 0.3070043711314714 + }, + { + "epoch": 3.5828202873057755, + "grad_norm": 0.26587797202154273, + "learning_rate": 0.00019101984804030016, + "loss": 3.0120368003845215, + "step": 6112, + "token_acc": 0.29737433107710265 + }, + { + "epoch": 3.5834066256229846, + "grad_norm": 0.23921825868568486, + "learning_rate": 0.00019101583341900593, + "loss": 3.000037431716919, + "step": 6113, + "token_acc": 0.2975017080753566 + }, + { + "epoch": 3.5839929639401937, + "grad_norm": 0.25513573628948827, + "learning_rate": 0.0001910118179427406, + "loss": 2.968623399734497, + "step": 6114, + "token_acc": 0.30301765419895416 + }, + { + "epoch": 3.5845793022574024, + "grad_norm": 0.26355806188497044, + "learning_rate": 0.00019100780161154188, + "loss": 2.9707489013671875, + "step": 6115, + "token_acc": 0.30307186938041525 + }, + { + "epoch": 3.5851656405746115, + "grad_norm": 0.26008214800277046, + "learning_rate": 0.00019100378442544753, + "loss": 2.971597194671631, + "step": 6116, + "token_acc": 0.3011936886388191 + }, + { + "epoch": 3.5857519788918206, + "grad_norm": 0.2918888880575415, + "learning_rate": 0.00019099976638449522, + "loss": 2.97985577583313, + "step": 6117, + "token_acc": 0.3021927156346708 + }, + { + "epoch": 3.5863383172090297, + "grad_norm": 0.3258452177376403, + "learning_rate": 0.00019099574748872273, + "loss": 2.947326183319092, + "step": 6118, + "token_acc": 0.30692099531384065 + }, + { + "epoch": 3.5869246555262384, + "grad_norm": 0.3602005324913786, + "learning_rate": 0.00019099172773816782, + "loss": 2.9568371772766113, + "step": 6119, + "token_acc": 0.3052741370386073 + }, + { + "epoch": 3.5875109938434475, + "grad_norm": 0.31550800355097924, + "learning_rate": 0.00019098770713286823, + "loss": 2.994813919067383, + "step": 6120, + "token_acc": 0.2986731259109386 + }, + { + "epoch": 3.5880973321606566, + "grad_norm": 0.28033070210320354, + "learning_rate": 0.00019098368567286173, + "loss": 2.938511848449707, + "step": 6121, + "token_acc": 0.3075197295786934 + }, + { + "epoch": 3.5886836704778657, + "grad_norm": 0.2594408708124164, + "learning_rate": 0.00019097966335818615, + "loss": 2.9175825119018555, + "step": 6122, + "token_acc": 0.30976245131670577 + }, + { + "epoch": 3.589270008795075, + "grad_norm": 0.3366059890129282, + "learning_rate": 0.0001909756401888792, + "loss": 2.9433441162109375, + "step": 6123, + "token_acc": 0.3052279416435514 + }, + { + "epoch": 3.589856347112284, + "grad_norm": 0.30712621591065287, + "learning_rate": 0.0001909716161649787, + "loss": 2.9516336917877197, + "step": 6124, + "token_acc": 0.305059607960066 + }, + { + "epoch": 3.590442685429493, + "grad_norm": 0.26793115054346023, + "learning_rate": 0.00019096759128652243, + "loss": 2.967846393585205, + "step": 6125, + "token_acc": 0.3030857287018495 + }, + { + "epoch": 3.5910290237467017, + "grad_norm": 0.32182466652422687, + "learning_rate": 0.00019096356555354827, + "loss": 2.984340190887451, + "step": 6126, + "token_acc": 0.2993270314102532 + }, + { + "epoch": 3.591615362063911, + "grad_norm": 0.3041975856351586, + "learning_rate": 0.00019095953896609396, + "loss": 2.959488868713379, + "step": 6127, + "token_acc": 0.304567551544694 + }, + { + "epoch": 3.59220170038112, + "grad_norm": 0.2961943370722895, + "learning_rate": 0.00019095551152419735, + "loss": 2.9880523681640625, + "step": 6128, + "token_acc": 0.29803527268335966 + }, + { + "epoch": 3.592788038698329, + "grad_norm": 0.3028784154324309, + "learning_rate": 0.00019095148322789628, + "loss": 2.9437203407287598, + "step": 6129, + "token_acc": 0.3065974958638435 + }, + { + "epoch": 3.5933743770155377, + "grad_norm": 0.26249348856534693, + "learning_rate": 0.00019094745407722855, + "loss": 2.9660019874572754, + "step": 6130, + "token_acc": 0.3023608228901138 + }, + { + "epoch": 3.593960715332747, + "grad_norm": 0.2583792421501495, + "learning_rate": 0.0001909434240722321, + "loss": 2.9663591384887695, + "step": 6131, + "token_acc": 0.30201498125386467 + }, + { + "epoch": 3.594547053649956, + "grad_norm": 0.25411573785947217, + "learning_rate": 0.00019093939321294468, + "loss": 2.9547524452209473, + "step": 6132, + "token_acc": 0.3052255382385491 + }, + { + "epoch": 3.595133391967165, + "grad_norm": 0.25061990353708913, + "learning_rate": 0.00019093536149940424, + "loss": 2.969355583190918, + "step": 6133, + "token_acc": 0.3021969333791812 + }, + { + "epoch": 3.595719730284374, + "grad_norm": 0.22452123697237805, + "learning_rate": 0.00019093132893164858, + "loss": 2.9826812744140625, + "step": 6134, + "token_acc": 0.30174376165887157 + }, + { + "epoch": 3.5963060686015833, + "grad_norm": 0.2772123160710859, + "learning_rate": 0.00019092729550971565, + "loss": 2.9628429412841797, + "step": 6135, + "token_acc": 0.30360325275382805 + }, + { + "epoch": 3.5968924069187924, + "grad_norm": 0.3629147243429197, + "learning_rate": 0.0001909232612336433, + "loss": 3.001366138458252, + "step": 6136, + "token_acc": 0.2981142773950823 + }, + { + "epoch": 3.597478745236001, + "grad_norm": 0.34757445611169524, + "learning_rate": 0.0001909192261034694, + "loss": 2.9764175415039062, + "step": 6137, + "token_acc": 0.3019083597742569 + }, + { + "epoch": 3.59806508355321, + "grad_norm": 0.23084649595729798, + "learning_rate": 0.0001909151901192319, + "loss": 2.9358725547790527, + "step": 6138, + "token_acc": 0.3073871889415633 + }, + { + "epoch": 3.5986514218704193, + "grad_norm": 0.3076775667030008, + "learning_rate": 0.0001909111532809687, + "loss": 2.978053331375122, + "step": 6139, + "token_acc": 0.3014994269790495 + }, + { + "epoch": 3.5992377601876284, + "grad_norm": 0.25896592089346787, + "learning_rate": 0.00019090711558871775, + "loss": 2.988049030303955, + "step": 6140, + "token_acc": 0.2986257299901479 + }, + { + "epoch": 3.599824098504837, + "grad_norm": 0.2903077755864694, + "learning_rate": 0.0001909030770425169, + "loss": 2.9821600914001465, + "step": 6141, + "token_acc": 0.3007345066962317 + }, + { + "epoch": 3.600410436822046, + "grad_norm": 0.2999308038807419, + "learning_rate": 0.00019089903764240416, + "loss": 2.97176194190979, + "step": 6142, + "token_acc": 0.3023460100826655 + }, + { + "epoch": 3.6009967751392553, + "grad_norm": 0.28123317075303567, + "learning_rate": 0.00019089499738841745, + "loss": 2.9827146530151367, + "step": 6143, + "token_acc": 0.3005290512278958 + }, + { + "epoch": 3.6015831134564644, + "grad_norm": 0.2654358985731293, + "learning_rate": 0.00019089095628059473, + "loss": 2.9305708408355713, + "step": 6144, + "token_acc": 0.3084093706344549 + }, + { + "epoch": 3.6021694517736735, + "grad_norm": 0.23664756003134604, + "learning_rate": 0.00019088691431897394, + "loss": 2.964878559112549, + "step": 6145, + "token_acc": 0.3014300569989914 + }, + { + "epoch": 3.6027557900908826, + "grad_norm": 0.3236881071848014, + "learning_rate": 0.00019088287150359305, + "loss": 2.982977867126465, + "step": 6146, + "token_acc": 0.2999882917691137 + }, + { + "epoch": 3.6033421284080913, + "grad_norm": 0.2757575712527811, + "learning_rate": 0.00019087882783449004, + "loss": 2.931215286254883, + "step": 6147, + "token_acc": 0.3071265580689837 + }, + { + "epoch": 3.6039284667253004, + "grad_norm": 0.2550000582798702, + "learning_rate": 0.00019087478331170294, + "loss": 2.966728448867798, + "step": 6148, + "token_acc": 0.3024278262295418 + }, + { + "epoch": 3.6045148050425095, + "grad_norm": 0.26363615805449864, + "learning_rate": 0.00019087073793526971, + "loss": 2.9874563217163086, + "step": 6149, + "token_acc": 0.30007298146667927 + }, + { + "epoch": 3.6051011433597187, + "grad_norm": 0.2723891445238067, + "learning_rate": 0.00019086669170522832, + "loss": 3.021811008453369, + "step": 6150, + "token_acc": 0.29487909779308397 + }, + { + "epoch": 3.6056874816769273, + "grad_norm": 0.29245624329354564, + "learning_rate": 0.0001908626446216168, + "loss": 2.9864869117736816, + "step": 6151, + "token_acc": 0.29999313063420097 + }, + { + "epoch": 3.6062738199941364, + "grad_norm": 0.2869472107341945, + "learning_rate": 0.0001908585966844732, + "loss": 2.988417148590088, + "step": 6152, + "token_acc": 0.2997824874949941 + }, + { + "epoch": 3.6068601583113455, + "grad_norm": 0.3238962966920359, + "learning_rate": 0.0001908545478938355, + "loss": 2.980520248413086, + "step": 6153, + "token_acc": 0.30055756878825246 + }, + { + "epoch": 3.6074464966285547, + "grad_norm": 0.2584199792354374, + "learning_rate": 0.00019085049824974176, + "loss": 3.0198869705200195, + "step": 6154, + "token_acc": 0.2940510465702757 + }, + { + "epoch": 3.6080328349457638, + "grad_norm": 0.2815049814019709, + "learning_rate": 0.00019084644775222998, + "loss": 2.9583446979522705, + "step": 6155, + "token_acc": 0.30336045059536354 + }, + { + "epoch": 3.608619173262973, + "grad_norm": 0.2788215288643938, + "learning_rate": 0.00019084239640133828, + "loss": 2.998249053955078, + "step": 6156, + "token_acc": 0.2972175999380829 + }, + { + "epoch": 3.609205511580182, + "grad_norm": 0.23771260423215942, + "learning_rate": 0.00019083834419710466, + "loss": 2.96685791015625, + "step": 6157, + "token_acc": 0.30172644167802193 + }, + { + "epoch": 3.6097918498973907, + "grad_norm": 0.26435356443270297, + "learning_rate": 0.0001908342911395672, + "loss": 2.9485459327697754, + "step": 6158, + "token_acc": 0.30718357422666487 + }, + { + "epoch": 3.6103781882145998, + "grad_norm": 0.2518544107212388, + "learning_rate": 0.000190830237228764, + "loss": 2.939924716949463, + "step": 6159, + "token_acc": 0.3062974715991997 + }, + { + "epoch": 3.610964526531809, + "grad_norm": 0.2556012043150553, + "learning_rate": 0.0001908261824647331, + "loss": 2.9505722522735596, + "step": 6160, + "token_acc": 0.30520046032782805 + }, + { + "epoch": 3.611550864849018, + "grad_norm": 0.22653556991068524, + "learning_rate": 0.0001908221268475126, + "loss": 2.9202792644500732, + "step": 6161, + "token_acc": 0.3109134807363274 + }, + { + "epoch": 3.6121372031662267, + "grad_norm": 0.25990243821009557, + "learning_rate": 0.0001908180703771406, + "loss": 2.985656261444092, + "step": 6162, + "token_acc": 0.30095158783622505 + }, + { + "epoch": 3.6127235414834358, + "grad_norm": 0.29619555821171106, + "learning_rate": 0.00019081401305365522, + "loss": 2.97200870513916, + "step": 6163, + "token_acc": 0.3022150736258777 + }, + { + "epoch": 3.613309879800645, + "grad_norm": 0.2941226098620405, + "learning_rate": 0.00019080995487709456, + "loss": 2.9388225078582764, + "step": 6164, + "token_acc": 0.3075512075817793 + }, + { + "epoch": 3.613896218117854, + "grad_norm": 0.3468725655991898, + "learning_rate": 0.00019080589584749672, + "loss": 3.005979537963867, + "step": 6165, + "token_acc": 0.29630646676282035 + }, + { + "epoch": 3.614482556435063, + "grad_norm": 0.3220289794508712, + "learning_rate": 0.00019080183596489986, + "loss": 3.045868396759033, + "step": 6166, + "token_acc": 0.29053471018393784 + }, + { + "epoch": 3.615068894752272, + "grad_norm": 0.27399430655068224, + "learning_rate": 0.00019079777522934213, + "loss": 2.94460391998291, + "step": 6167, + "token_acc": 0.3046594322688484 + }, + { + "epoch": 3.6156552330694813, + "grad_norm": 0.30118627259141817, + "learning_rate": 0.00019079371364086166, + "loss": 2.9589529037475586, + "step": 6168, + "token_acc": 0.3045903235801868 + }, + { + "epoch": 3.61624157138669, + "grad_norm": 0.2930012446351192, + "learning_rate": 0.00019078965119949655, + "loss": 2.98988676071167, + "step": 6169, + "token_acc": 0.2992364991170486 + }, + { + "epoch": 3.616827909703899, + "grad_norm": 0.27856118291067317, + "learning_rate": 0.000190785587905285, + "loss": 2.945614814758301, + "step": 6170, + "token_acc": 0.3065683182858871 + }, + { + "epoch": 3.6174142480211082, + "grad_norm": 0.26361483267455377, + "learning_rate": 0.00019078152375826525, + "loss": 2.963853597640991, + "step": 6171, + "token_acc": 0.3052314062285192 + }, + { + "epoch": 3.6180005863383173, + "grad_norm": 0.26750377185294894, + "learning_rate": 0.00019077745875847537, + "loss": 2.9795773029327393, + "step": 6172, + "token_acc": 0.3012130103410026 + }, + { + "epoch": 3.618586924655526, + "grad_norm": 0.2663804956131301, + "learning_rate": 0.0001907733929059536, + "loss": 2.944855213165283, + "step": 6173, + "token_acc": 0.30556487865726195 + }, + { + "epoch": 3.619173262972735, + "grad_norm": 0.33296952111803024, + "learning_rate": 0.00019076932620073816, + "loss": 2.9526374340057373, + "step": 6174, + "token_acc": 0.30584491692972887 + }, + { + "epoch": 3.6197596012899442, + "grad_norm": 0.365944764407279, + "learning_rate": 0.00019076525864286715, + "loss": 3.0436816215515137, + "step": 6175, + "token_acc": 0.2919612503765356 + }, + { + "epoch": 3.6203459396071533, + "grad_norm": 0.3177944417820769, + "learning_rate": 0.00019076119023237888, + "loss": 2.9874870777130127, + "step": 6176, + "token_acc": 0.29948480996832805 + }, + { + "epoch": 3.6209322779243625, + "grad_norm": 0.2843870348356647, + "learning_rate": 0.00019075712096931153, + "loss": 2.984544515609741, + "step": 6177, + "token_acc": 0.30142010088487786 + }, + { + "epoch": 3.6215186162415716, + "grad_norm": 0.32361294931390544, + "learning_rate": 0.00019075305085370332, + "loss": 2.9647340774536133, + "step": 6178, + "token_acc": 0.3025995583722277 + }, + { + "epoch": 3.6221049545587807, + "grad_norm": 0.2889808386304798, + "learning_rate": 0.00019074897988559248, + "loss": 2.956777572631836, + "step": 6179, + "token_acc": 0.3052317645422657 + }, + { + "epoch": 3.6226912928759893, + "grad_norm": 0.27006920715000504, + "learning_rate": 0.00019074490806501727, + "loss": 2.9542698860168457, + "step": 6180, + "token_acc": 0.30556562654648517 + }, + { + "epoch": 3.6232776311931985, + "grad_norm": 0.2868417760748693, + "learning_rate": 0.00019074083539201593, + "loss": 2.9923036098480225, + "step": 6181, + "token_acc": 0.299751620681528 + }, + { + "epoch": 3.6238639695104076, + "grad_norm": 0.30061755622675707, + "learning_rate": 0.0001907367618666267, + "loss": 3.0001049041748047, + "step": 6182, + "token_acc": 0.29682224719160816 + }, + { + "epoch": 3.6244503078276167, + "grad_norm": 0.25275165826890705, + "learning_rate": 0.00019073268748888786, + "loss": 2.940582275390625, + "step": 6183, + "token_acc": 0.306372486131709 + }, + { + "epoch": 3.6250366461448253, + "grad_norm": 0.2962188534729802, + "learning_rate": 0.0001907286122588377, + "loss": 2.954368829727173, + "step": 6184, + "token_acc": 0.30511541993209335 + }, + { + "epoch": 3.6256229844620345, + "grad_norm": 0.2781403435619372, + "learning_rate": 0.00019072453617651448, + "loss": 2.9168784618377686, + "step": 6185, + "token_acc": 0.3099325113494031 + }, + { + "epoch": 3.6262093227792436, + "grad_norm": 0.26699117779111864, + "learning_rate": 0.00019072045924195652, + "loss": 2.9802467823028564, + "step": 6186, + "token_acc": 0.30024963029937185 + }, + { + "epoch": 3.6267956610964527, + "grad_norm": 0.3306638468825156, + "learning_rate": 0.00019071638145520204, + "loss": 2.9505598545074463, + "step": 6187, + "token_acc": 0.30659413384159256 + }, + { + "epoch": 3.627381999413662, + "grad_norm": 0.2860276604339937, + "learning_rate": 0.00019071230281628946, + "loss": 2.9598701000213623, + "step": 6188, + "token_acc": 0.3027730527730528 + }, + { + "epoch": 3.627968337730871, + "grad_norm": 0.2711790997606585, + "learning_rate": 0.000190708223325257, + "loss": 2.9696645736694336, + "step": 6189, + "token_acc": 0.3020161050772579 + }, + { + "epoch": 3.62855467604808, + "grad_norm": 0.33376570529357985, + "learning_rate": 0.000190704142982143, + "loss": 2.9799728393554688, + "step": 6190, + "token_acc": 0.30156798414972547 + }, + { + "epoch": 3.6291410143652887, + "grad_norm": 0.2705693947280847, + "learning_rate": 0.0001907000617869858, + "loss": 2.982224941253662, + "step": 6191, + "token_acc": 0.299917279388226 + }, + { + "epoch": 3.629727352682498, + "grad_norm": 0.2917094012425391, + "learning_rate": 0.00019069597973982378, + "loss": 2.9821066856384277, + "step": 6192, + "token_acc": 0.300698863369866 + }, + { + "epoch": 3.630313690999707, + "grad_norm": 0.2683510046575638, + "learning_rate": 0.0001906918968406952, + "loss": 2.9758899211883545, + "step": 6193, + "token_acc": 0.3019360667057791 + }, + { + "epoch": 3.630900029316916, + "grad_norm": 0.2586959415928914, + "learning_rate": 0.0001906878130896385, + "loss": 2.98081111907959, + "step": 6194, + "token_acc": 0.300010272741281 + }, + { + "epoch": 3.6314863676341247, + "grad_norm": 0.29747046318066234, + "learning_rate": 0.00019068372848669198, + "loss": 2.967271327972412, + "step": 6195, + "token_acc": 0.30302388583145884 + }, + { + "epoch": 3.632072705951334, + "grad_norm": 0.2655527102125555, + "learning_rate": 0.000190679643031894, + "loss": 2.9688143730163574, + "step": 6196, + "token_acc": 0.30188255708960304 + }, + { + "epoch": 3.632659044268543, + "grad_norm": 0.2539137250025021, + "learning_rate": 0.00019067555672528302, + "loss": 3.0024023056030273, + "step": 6197, + "token_acc": 0.29832192066918367 + }, + { + "epoch": 3.633245382585752, + "grad_norm": 0.25450568310885174, + "learning_rate": 0.00019067146956689733, + "loss": 2.974299192428589, + "step": 6198, + "token_acc": 0.30229438055909347 + }, + { + "epoch": 3.633831720902961, + "grad_norm": 0.2907435385167291, + "learning_rate": 0.00019066738155677537, + "loss": 3.0151805877685547, + "step": 6199, + "token_acc": 0.2955186169868517 + }, + { + "epoch": 3.6344180592201702, + "grad_norm": 0.3122053922969575, + "learning_rate": 0.00019066329269495555, + "loss": 2.9650015830993652, + "step": 6200, + "token_acc": 0.30378918611743977 + }, + { + "epoch": 3.635004397537379, + "grad_norm": 0.21337769052105093, + "learning_rate": 0.00019065920298147625, + "loss": 2.951798915863037, + "step": 6201, + "token_acc": 0.3062927930644751 + }, + { + "epoch": 3.635590735854588, + "grad_norm": 0.3442300718367786, + "learning_rate": 0.00019065511241637593, + "loss": 3.0073294639587402, + "step": 6202, + "token_acc": 0.2951863729478251 + }, + { + "epoch": 3.636177074171797, + "grad_norm": 0.2979926028903667, + "learning_rate": 0.00019065102099969297, + "loss": 2.9703011512756348, + "step": 6203, + "token_acc": 0.3023549708184979 + }, + { + "epoch": 3.6367634124890063, + "grad_norm": 0.284621434168284, + "learning_rate": 0.0001906469287314658, + "loss": 2.9915499687194824, + "step": 6204, + "token_acc": 0.29943188025783896 + }, + { + "epoch": 3.637349750806215, + "grad_norm": 0.3207872178620921, + "learning_rate": 0.0001906428356117329, + "loss": 2.975562572479248, + "step": 6205, + "token_acc": 0.3026583282799005 + }, + { + "epoch": 3.637936089123424, + "grad_norm": 0.2781573544098772, + "learning_rate": 0.00019063874164053273, + "loss": 2.973410129547119, + "step": 6206, + "token_acc": 0.30082479699573345 + }, + { + "epoch": 3.638522427440633, + "grad_norm": 0.2819400047466067, + "learning_rate": 0.00019063464681790367, + "loss": 2.9945993423461914, + "step": 6207, + "token_acc": 0.29973346190726424 + }, + { + "epoch": 3.6391087657578423, + "grad_norm": 0.29518411052627225, + "learning_rate": 0.00019063055114388428, + "loss": 2.9451169967651367, + "step": 6208, + "token_acc": 0.3058016239202031 + }, + { + "epoch": 3.6396951040750514, + "grad_norm": 0.315084499386678, + "learning_rate": 0.00019062645461851297, + "loss": 2.9967751502990723, + "step": 6209, + "token_acc": 0.2979920340471109 + }, + { + "epoch": 3.6402814423922605, + "grad_norm": 0.3202276478155651, + "learning_rate": 0.00019062235724182823, + "loss": 2.976663589477539, + "step": 6210, + "token_acc": 0.3009384899547007 + }, + { + "epoch": 3.6408677807094696, + "grad_norm": 0.3143574720813603, + "learning_rate": 0.00019061825901386858, + "loss": 2.9756641387939453, + "step": 6211, + "token_acc": 0.3021067168863779 + }, + { + "epoch": 3.6414541190266783, + "grad_norm": 0.2948584087582355, + "learning_rate": 0.00019061415993467247, + "loss": 2.9689507484436035, + "step": 6212, + "token_acc": 0.3009819320344924 + }, + { + "epoch": 3.6420404573438874, + "grad_norm": 0.2853401118200396, + "learning_rate": 0.00019061006000427845, + "loss": 2.9341278076171875, + "step": 6213, + "token_acc": 0.3084795649842232 + }, + { + "epoch": 3.6426267956610965, + "grad_norm": 0.30364684230206446, + "learning_rate": 0.000190605959222725, + "loss": 2.9727964401245117, + "step": 6214, + "token_acc": 0.3018801561240908 + }, + { + "epoch": 3.6432131339783056, + "grad_norm": 0.25438090983228817, + "learning_rate": 0.0001906018575900507, + "loss": 3.007850170135498, + "step": 6215, + "token_acc": 0.29763555839696426 + }, + { + "epoch": 3.6437994722955143, + "grad_norm": 0.30752962351742735, + "learning_rate": 0.00019059775510629399, + "loss": 3.0018739700317383, + "step": 6216, + "token_acc": 0.2978389363369506 + }, + { + "epoch": 3.6443858106127234, + "grad_norm": 0.2918374990459936, + "learning_rate": 0.00019059365177149346, + "loss": 2.941260814666748, + "step": 6217, + "token_acc": 0.30619276545142243 + }, + { + "epoch": 3.6449721489299325, + "grad_norm": 0.2812347764106457, + "learning_rate": 0.00019058954758568766, + "loss": 2.96766996383667, + "step": 6218, + "token_acc": 0.30441127038020466 + }, + { + "epoch": 3.6455584872471416, + "grad_norm": 0.27273104363229456, + "learning_rate": 0.00019058544254891514, + "loss": 3.00252628326416, + "step": 6219, + "token_acc": 0.2983004605526632 + }, + { + "epoch": 3.6461448255643507, + "grad_norm": 0.30926508385875995, + "learning_rate": 0.0001905813366612144, + "loss": 2.932182788848877, + "step": 6220, + "token_acc": 0.308145139894032 + }, + { + "epoch": 3.64673116388156, + "grad_norm": 0.33176825152152195, + "learning_rate": 0.00019057722992262411, + "loss": 2.997267961502075, + "step": 6221, + "token_acc": 0.29897303303116207 + }, + { + "epoch": 3.647317502198769, + "grad_norm": 0.31339646878857463, + "learning_rate": 0.0001905731223331828, + "loss": 2.9566590785980225, + "step": 6222, + "token_acc": 0.3038203311201907 + }, + { + "epoch": 3.6479038405159776, + "grad_norm": 0.3383958252657143, + "learning_rate": 0.00019056901389292901, + "loss": 2.9743757247924805, + "step": 6223, + "token_acc": 0.3006694976067325 + }, + { + "epoch": 3.6484901788331867, + "grad_norm": 0.29777665194718855, + "learning_rate": 0.00019056490460190144, + "loss": 2.971754312515259, + "step": 6224, + "token_acc": 0.30263735974639255 + }, + { + "epoch": 3.649076517150396, + "grad_norm": 0.2933986445680828, + "learning_rate": 0.0001905607944601386, + "loss": 2.9496264457702637, + "step": 6225, + "token_acc": 0.3050920278346798 + }, + { + "epoch": 3.649662855467605, + "grad_norm": 0.28903595672131344, + "learning_rate": 0.0001905566834676791, + "loss": 2.9644532203674316, + "step": 6226, + "token_acc": 0.3025563854485099 + }, + { + "epoch": 3.6502491937848136, + "grad_norm": 0.2844361000020329, + "learning_rate": 0.00019055257162456162, + "loss": 2.9940433502197266, + "step": 6227, + "token_acc": 0.2994151163910573 + }, + { + "epoch": 3.6508355321020227, + "grad_norm": 0.287571379422993, + "learning_rate": 0.00019054845893082476, + "loss": 2.975808620452881, + "step": 6228, + "token_acc": 0.3021555390176483 + }, + { + "epoch": 3.651421870419232, + "grad_norm": 0.292445357082647, + "learning_rate": 0.00019054434538650714, + "loss": 2.9213790893554688, + "step": 6229, + "token_acc": 0.3103839337250903 + }, + { + "epoch": 3.652008208736441, + "grad_norm": 0.3243218123384122, + "learning_rate": 0.00019054023099164736, + "loss": 3.005901336669922, + "step": 6230, + "token_acc": 0.29821217652743476 + }, + { + "epoch": 3.65259454705365, + "grad_norm": 0.3684401545528447, + "learning_rate": 0.00019053611574628416, + "loss": 2.9830000400543213, + "step": 6231, + "token_acc": 0.3002663565933397 + }, + { + "epoch": 3.653180885370859, + "grad_norm": 0.41392585147505484, + "learning_rate": 0.00019053199965045613, + "loss": 2.9981765747070312, + "step": 6232, + "token_acc": 0.2974693519906179 + }, + { + "epoch": 3.6537672236880683, + "grad_norm": 0.32182968111089044, + "learning_rate": 0.00019052788270420198, + "loss": 2.9905354976654053, + "step": 6233, + "token_acc": 0.3023598309422546 + }, + { + "epoch": 3.654353562005277, + "grad_norm": 0.3466503631600139, + "learning_rate": 0.00019052376490756034, + "loss": 2.979701519012451, + "step": 6234, + "token_acc": 0.3010690120392589 + }, + { + "epoch": 3.654939900322486, + "grad_norm": 0.41647482494672516, + "learning_rate": 0.00019051964626056993, + "loss": 2.991868019104004, + "step": 6235, + "token_acc": 0.29920602726842893 + }, + { + "epoch": 3.655526238639695, + "grad_norm": 0.2799565199133241, + "learning_rate": 0.0001905155267632694, + "loss": 3.036208391189575, + "step": 6236, + "token_acc": 0.29476869984388265 + }, + { + "epoch": 3.6561125769569043, + "grad_norm": 0.3771354109494663, + "learning_rate": 0.00019051140641569746, + "loss": 2.9412827491760254, + "step": 6237, + "token_acc": 0.3054329158892378 + }, + { + "epoch": 3.656698915274113, + "grad_norm": 0.2533824598350181, + "learning_rate": 0.00019050728521789284, + "loss": 2.956327199935913, + "step": 6238, + "token_acc": 0.30332765386204047 + }, + { + "epoch": 3.657285253591322, + "grad_norm": 0.3377729027095872, + "learning_rate": 0.00019050316316989422, + "loss": 2.952402114868164, + "step": 6239, + "token_acc": 0.30571273966233137 + }, + { + "epoch": 3.657871591908531, + "grad_norm": 0.22786917596176479, + "learning_rate": 0.00019049904027174038, + "loss": 2.9367804527282715, + "step": 6240, + "token_acc": 0.30655910679005366 + }, + { + "epoch": 3.6584579302257403, + "grad_norm": 0.3189700110669627, + "learning_rate": 0.00019049491652346995, + "loss": 2.9897208213806152, + "step": 6241, + "token_acc": 0.3009005006225492 + }, + { + "epoch": 3.6590442685429494, + "grad_norm": 0.22156996188183542, + "learning_rate": 0.00019049079192512175, + "loss": 2.9745535850524902, + "step": 6242, + "token_acc": 0.30223500300942957 + }, + { + "epoch": 3.6596306068601585, + "grad_norm": 0.2668414452185026, + "learning_rate": 0.0001904866664767345, + "loss": 2.995163917541504, + "step": 6243, + "token_acc": 0.2987265562471643 + }, + { + "epoch": 3.660216945177367, + "grad_norm": 0.22995724363482903, + "learning_rate": 0.00019048254017834694, + "loss": 2.9742980003356934, + "step": 6244, + "token_acc": 0.30336239733741666 + }, + { + "epoch": 3.6608032834945763, + "grad_norm": 0.24405452652073414, + "learning_rate": 0.00019047841302999785, + "loss": 2.9605932235717773, + "step": 6245, + "token_acc": 0.3040994121211644 + }, + { + "epoch": 3.6613896218117854, + "grad_norm": 0.24877708211516808, + "learning_rate": 0.000190474285031726, + "loss": 2.9449081420898438, + "step": 6246, + "token_acc": 0.3063109010666817 + }, + { + "epoch": 3.6619759601289945, + "grad_norm": 0.2419367217666246, + "learning_rate": 0.00019047015618357013, + "loss": 3.0069422721862793, + "step": 6247, + "token_acc": 0.2973931388347044 + }, + { + "epoch": 3.6625622984462036, + "grad_norm": 0.26894027616857596, + "learning_rate": 0.00019046602648556906, + "loss": 2.9843950271606445, + "step": 6248, + "token_acc": 0.3008007421263756 + }, + { + "epoch": 3.6631486367634123, + "grad_norm": 0.2126682485340138, + "learning_rate": 0.0001904618959377616, + "loss": 3.0006322860717773, + "step": 6249, + "token_acc": 0.29686951486064383 + }, + { + "epoch": 3.6637349750806214, + "grad_norm": 0.26636978955626106, + "learning_rate": 0.0001904577645401865, + "loss": 2.9362521171569824, + "step": 6250, + "token_acc": 0.3067324907846235 + }, + { + "epoch": 3.6643213133978305, + "grad_norm": 0.23079087184621802, + "learning_rate": 0.00019045363229288262, + "loss": 2.989394187927246, + "step": 6251, + "token_acc": 0.30040518206536226 + }, + { + "epoch": 3.6649076517150396, + "grad_norm": 0.2515445115068219, + "learning_rate": 0.00019044949919588873, + "loss": 2.9568722248077393, + "step": 6252, + "token_acc": 0.3043862412112347 + }, + { + "epoch": 3.6654939900322487, + "grad_norm": 0.25468672884628824, + "learning_rate": 0.0001904453652492437, + "loss": 3.000718593597412, + "step": 6253, + "token_acc": 0.29907100982090656 + }, + { + "epoch": 3.666080328349458, + "grad_norm": 0.25922166180503803, + "learning_rate": 0.0001904412304529863, + "loss": 2.9637246131896973, + "step": 6254, + "token_acc": 0.3036730334684558 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 0.24102015358890252, + "learning_rate": 0.00019043709480715543, + "loss": 2.9836018085479736, + "step": 6255, + "token_acc": 0.3009246035524242 + }, + { + "epoch": 3.6672530049838756, + "grad_norm": 0.22824511692856825, + "learning_rate": 0.00019043295831178993, + "loss": 2.927424669265747, + "step": 6256, + "token_acc": 0.30931591594123475 + }, + { + "epoch": 3.6678393433010847, + "grad_norm": 0.25916895475754387, + "learning_rate": 0.00019042882096692866, + "loss": 2.974609375, + "step": 6257, + "token_acc": 0.30105942329138125 + }, + { + "epoch": 3.668425681618294, + "grad_norm": 0.2382323014523667, + "learning_rate": 0.00019042468277261044, + "loss": 2.9911837577819824, + "step": 6258, + "token_acc": 0.3013075042693211 + }, + { + "epoch": 3.6690120199355025, + "grad_norm": 0.254529195370672, + "learning_rate": 0.0001904205437288742, + "loss": 2.969275951385498, + "step": 6259, + "token_acc": 0.30247830044056884 + }, + { + "epoch": 3.6695983582527116, + "grad_norm": 0.25586360118694773, + "learning_rate": 0.00019041640383575875, + "loss": 2.929520606994629, + "step": 6260, + "token_acc": 0.30856523796349683 + }, + { + "epoch": 3.6701846965699207, + "grad_norm": 0.288498697968749, + "learning_rate": 0.00019041226309330308, + "loss": 2.9930315017700195, + "step": 6261, + "token_acc": 0.2994312383030594 + }, + { + "epoch": 3.67077103488713, + "grad_norm": 0.22877322672817335, + "learning_rate": 0.000190408121501546, + "loss": 3.008330821990967, + "step": 6262, + "token_acc": 0.2982720718468046 + }, + { + "epoch": 3.671357373204339, + "grad_norm": 0.27063142259504636, + "learning_rate": 0.00019040397906052646, + "loss": 2.9880032539367676, + "step": 6263, + "token_acc": 0.298337150307002 + }, + { + "epoch": 3.671943711521548, + "grad_norm": 0.23368492651493533, + "learning_rate": 0.00019039983577028336, + "loss": 2.924875259399414, + "step": 6264, + "token_acc": 0.30932840960868047 + }, + { + "epoch": 3.672530049838757, + "grad_norm": 0.2582833283735504, + "learning_rate": 0.0001903956916308556, + "loss": 2.9381537437438965, + "step": 6265, + "token_acc": 0.3068796449636561 + }, + { + "epoch": 3.673116388155966, + "grad_norm": 0.27916407381449165, + "learning_rate": 0.00019039154664228213, + "loss": 2.987145185470581, + "step": 6266, + "token_acc": 0.2997504599465344 + }, + { + "epoch": 3.673702726473175, + "grad_norm": 0.3225457540657182, + "learning_rate": 0.0001903874008046019, + "loss": 2.986154317855835, + "step": 6267, + "token_acc": 0.2998957454469242 + }, + { + "epoch": 3.674289064790384, + "grad_norm": 0.27221288643563646, + "learning_rate": 0.00019038325411785382, + "loss": 2.9574015140533447, + "step": 6268, + "token_acc": 0.3035075450214472 + }, + { + "epoch": 3.674875403107593, + "grad_norm": 0.2452292547901702, + "learning_rate": 0.00019037910658207684, + "loss": 2.929011583328247, + "step": 6269, + "token_acc": 0.31004418463447136 + }, + { + "epoch": 3.675461741424802, + "grad_norm": 0.2717992141555569, + "learning_rate": 0.00019037495819731, + "loss": 2.9656410217285156, + "step": 6270, + "token_acc": 0.3026585443329815 + }, + { + "epoch": 3.676048079742011, + "grad_norm": 0.2559583773015556, + "learning_rate": 0.0001903708089635922, + "loss": 3.0026004314422607, + "step": 6271, + "token_acc": 0.29758137461147993 + }, + { + "epoch": 3.67663441805922, + "grad_norm": 0.23217987874185483, + "learning_rate": 0.00019036665888096238, + "loss": 2.9892964363098145, + "step": 6272, + "token_acc": 0.2996508453218524 + }, + { + "epoch": 3.677220756376429, + "grad_norm": 0.27741439275831536, + "learning_rate": 0.00019036250794945958, + "loss": 2.9899230003356934, + "step": 6273, + "token_acc": 0.2983738166600388 + }, + { + "epoch": 3.6778070946936383, + "grad_norm": 0.24622681211937247, + "learning_rate": 0.0001903583561691228, + "loss": 2.951610565185547, + "step": 6274, + "token_acc": 0.30356155001185425 + }, + { + "epoch": 3.6783934330108474, + "grad_norm": 0.2573566691372984, + "learning_rate": 0.00019035420353999101, + "loss": 2.962080478668213, + "step": 6275, + "token_acc": 0.3034800608291731 + }, + { + "epoch": 3.6789797713280565, + "grad_norm": 0.30408817406016164, + "learning_rate": 0.00019035005006210324, + "loss": 2.9670677185058594, + "step": 6276, + "token_acc": 0.3024133993945552 + }, + { + "epoch": 3.679566109645265, + "grad_norm": 0.2729999438704715, + "learning_rate": 0.00019034589573549852, + "loss": 2.9653844833374023, + "step": 6277, + "token_acc": 0.3023982672178505 + }, + { + "epoch": 3.6801524479624743, + "grad_norm": 0.2696680662045605, + "learning_rate": 0.00019034174056021584, + "loss": 2.952261447906494, + "step": 6278, + "token_acc": 0.304149359984298 + }, + { + "epoch": 3.6807387862796834, + "grad_norm": 0.2778391845593696, + "learning_rate": 0.0001903375845362942, + "loss": 2.941913604736328, + "step": 6279, + "token_acc": 0.30735129540547224 + }, + { + "epoch": 3.6813251245968925, + "grad_norm": 0.2644557967380305, + "learning_rate": 0.00019033342766377274, + "loss": 2.979827404022217, + "step": 6280, + "token_acc": 0.3015176524767281 + }, + { + "epoch": 3.681911462914101, + "grad_norm": 0.2543370120967468, + "learning_rate": 0.00019032926994269045, + "loss": 2.960798501968384, + "step": 6281, + "token_acc": 0.30410753752438796 + }, + { + "epoch": 3.6824978012313103, + "grad_norm": 0.25910844602861055, + "learning_rate": 0.00019032511137308635, + "loss": 2.9486069679260254, + "step": 6282, + "token_acc": 0.3048707411974316 + }, + { + "epoch": 3.6830841395485194, + "grad_norm": 0.257511453464599, + "learning_rate": 0.00019032095195499956, + "loss": 2.973449230194092, + "step": 6283, + "token_acc": 0.30220434374774513 + }, + { + "epoch": 3.6836704778657285, + "grad_norm": 0.2817837062344389, + "learning_rate": 0.00019031679168846917, + "loss": 3.031688690185547, + "step": 6284, + "token_acc": 0.29244978972605434 + }, + { + "epoch": 3.6842568161829377, + "grad_norm": 0.2817957970667045, + "learning_rate": 0.0001903126305735342, + "loss": 2.964170455932617, + "step": 6285, + "token_acc": 0.3031678524583648 + }, + { + "epoch": 3.6848431545001468, + "grad_norm": 0.24806487159675453, + "learning_rate": 0.00019030846861023374, + "loss": 2.95097017288208, + "step": 6286, + "token_acc": 0.3054087659309916 + }, + { + "epoch": 3.685429492817356, + "grad_norm": 0.24150324801622605, + "learning_rate": 0.00019030430579860692, + "loss": 2.984927177429199, + "step": 6287, + "token_acc": 0.2991145841297998 + }, + { + "epoch": 3.6860158311345645, + "grad_norm": 0.2544207190906043, + "learning_rate": 0.00019030014213869284, + "loss": 2.9702696800231934, + "step": 6288, + "token_acc": 0.30226896689816524 + }, + { + "epoch": 3.6866021694517737, + "grad_norm": 0.2566797661180451, + "learning_rate": 0.00019029597763053061, + "loss": 2.978369951248169, + "step": 6289, + "token_acc": 0.3022135487074114 + }, + { + "epoch": 3.6871885077689828, + "grad_norm": 0.28976969616958237, + "learning_rate": 0.00019029181227415935, + "loss": 3.0182032585144043, + "step": 6290, + "token_acc": 0.29523811995670785 + }, + { + "epoch": 3.687774846086192, + "grad_norm": 0.29241306248037163, + "learning_rate": 0.00019028764606961816, + "loss": 2.9459493160247803, + "step": 6291, + "token_acc": 0.30576064359144034 + }, + { + "epoch": 3.6883611844034006, + "grad_norm": 0.22933930935580515, + "learning_rate": 0.00019028347901694622, + "loss": 2.942089557647705, + "step": 6292, + "token_acc": 0.30683110725343626 + }, + { + "epoch": 3.6889475227206097, + "grad_norm": 0.31712030748613307, + "learning_rate": 0.00019027931111618266, + "loss": 2.9646778106689453, + "step": 6293, + "token_acc": 0.30280976342625465 + }, + { + "epoch": 3.6895338610378188, + "grad_norm": 0.4116757377235896, + "learning_rate": 0.00019027514236736662, + "loss": 2.9486870765686035, + "step": 6294, + "token_acc": 0.30627205346721803 + }, + { + "epoch": 3.690120199355028, + "grad_norm": 0.30100712642999755, + "learning_rate": 0.00019027097277053728, + "loss": 2.9905476570129395, + "step": 6295, + "token_acc": 0.3003432700065262 + }, + { + "epoch": 3.690706537672237, + "grad_norm": 0.32578474489889564, + "learning_rate": 0.00019026680232573376, + "loss": 2.9521169662475586, + "step": 6296, + "token_acc": 0.30554650329140326 + }, + { + "epoch": 3.691292875989446, + "grad_norm": 0.3326382296483218, + "learning_rate": 0.00019026263103299527, + "loss": 2.9799866676330566, + "step": 6297, + "token_acc": 0.30086036786733056 + }, + { + "epoch": 3.6918792143066548, + "grad_norm": 0.260117217522024, + "learning_rate": 0.000190258458892361, + "loss": 2.9289636611938477, + "step": 6298, + "token_acc": 0.3070492040970618 + }, + { + "epoch": 3.692465552623864, + "grad_norm": 0.3447819424850371, + "learning_rate": 0.00019025428590387016, + "loss": 2.992405891418457, + "step": 6299, + "token_acc": 0.30223858817438704 + }, + { + "epoch": 3.693051890941073, + "grad_norm": 0.22559533891626973, + "learning_rate": 0.0001902501120675619, + "loss": 2.9502062797546387, + "step": 6300, + "token_acc": 0.30567795447268514 + }, + { + "epoch": 3.693638229258282, + "grad_norm": 0.2684349205698122, + "learning_rate": 0.0001902459373834754, + "loss": 2.950061798095703, + "step": 6301, + "token_acc": 0.30570234811956287 + }, + { + "epoch": 3.6942245675754912, + "grad_norm": 0.25520331733917195, + "learning_rate": 0.00019024176185165, + "loss": 2.984220504760742, + "step": 6302, + "token_acc": 0.30230165205506504 + }, + { + "epoch": 3.6948109058927, + "grad_norm": 0.25501063768549975, + "learning_rate": 0.00019023758547212483, + "loss": 2.969302177429199, + "step": 6303, + "token_acc": 0.30262633316569687 + }, + { + "epoch": 3.695397244209909, + "grad_norm": 0.26438871768553796, + "learning_rate": 0.00019023340824493915, + "loss": 3.025841474533081, + "step": 6304, + "token_acc": 0.2960849748737193 + }, + { + "epoch": 3.695983582527118, + "grad_norm": 0.2669508649230447, + "learning_rate": 0.00019022923017013218, + "loss": 2.9550399780273438, + "step": 6305, + "token_acc": 0.30319367709045564 + }, + { + "epoch": 3.6965699208443272, + "grad_norm": 0.23609734712239372, + "learning_rate": 0.00019022505124774317, + "loss": 2.9517624378204346, + "step": 6306, + "token_acc": 0.30398864678126924 + }, + { + "epoch": 3.6971562591615363, + "grad_norm": 0.28346762018295474, + "learning_rate": 0.0001902208714778114, + "loss": 2.996804714202881, + "step": 6307, + "token_acc": 0.2986849089792562 + }, + { + "epoch": 3.6977425974787455, + "grad_norm": 0.2443855681911445, + "learning_rate": 0.0001902166908603761, + "loss": 3.019991397857666, + "step": 6308, + "token_acc": 0.29568681395949403 + }, + { + "epoch": 3.698328935795954, + "grad_norm": 0.23324056432401047, + "learning_rate": 0.00019021250939547658, + "loss": 2.955322742462158, + "step": 6309, + "token_acc": 0.3040921560658403 + }, + { + "epoch": 3.6989152741131632, + "grad_norm": 0.2493018923769049, + "learning_rate": 0.00019020832708315207, + "loss": 2.9597363471984863, + "step": 6310, + "token_acc": 0.3034445734878665 + }, + { + "epoch": 3.6995016124303723, + "grad_norm": 0.25336673496525297, + "learning_rate": 0.00019020414392344187, + "loss": 2.9652631282806396, + "step": 6311, + "token_acc": 0.3043450893173276 + }, + { + "epoch": 3.7000879507475815, + "grad_norm": 0.2930127425097025, + "learning_rate": 0.00019019995991638534, + "loss": 2.9235386848449707, + "step": 6312, + "token_acc": 0.3073756468089886 + }, + { + "epoch": 3.70067428906479, + "grad_norm": 0.25189305536060086, + "learning_rate": 0.0001901957750620217, + "loss": 2.9615228176116943, + "step": 6313, + "token_acc": 0.3040208610388837 + }, + { + "epoch": 3.7012606273819992, + "grad_norm": 0.2772555745158556, + "learning_rate": 0.00019019158936039028, + "loss": 2.986903429031372, + "step": 6314, + "token_acc": 0.2994557264525366 + }, + { + "epoch": 3.7018469656992083, + "grad_norm": 0.23084375307624813, + "learning_rate": 0.00019018740281153044, + "loss": 2.9422569274902344, + "step": 6315, + "token_acc": 0.3057512771988745 + }, + { + "epoch": 3.7024333040164175, + "grad_norm": 0.26245203025893526, + "learning_rate": 0.00019018321541548148, + "loss": 2.9992148876190186, + "step": 6316, + "token_acc": 0.29956117641630825 + }, + { + "epoch": 3.7030196423336266, + "grad_norm": 0.25214658305730453, + "learning_rate": 0.00019017902717228272, + "loss": 2.9766857624053955, + "step": 6317, + "token_acc": 0.3021283245809499 + }, + { + "epoch": 3.7036059806508357, + "grad_norm": 0.24707826165407104, + "learning_rate": 0.00019017483808197354, + "loss": 2.9763357639312744, + "step": 6318, + "token_acc": 0.3032729370672987 + }, + { + "epoch": 3.704192318968045, + "grad_norm": 0.25151762941648587, + "learning_rate": 0.00019017064814459326, + "loss": 2.965953826904297, + "step": 6319, + "token_acc": 0.3042194870570032 + }, + { + "epoch": 3.7047786572852535, + "grad_norm": 0.2381626682024101, + "learning_rate": 0.00019016645736018122, + "loss": 2.961422920227051, + "step": 6320, + "token_acc": 0.30404426863800443 + }, + { + "epoch": 3.7053649956024626, + "grad_norm": 0.2447539051919443, + "learning_rate": 0.00019016226572877683, + "loss": 2.9980249404907227, + "step": 6321, + "token_acc": 0.29899884842185703 + }, + { + "epoch": 3.7059513339196717, + "grad_norm": 0.2887434428940188, + "learning_rate": 0.00019015807325041946, + "loss": 2.98592209815979, + "step": 6322, + "token_acc": 0.2995074635632116 + }, + { + "epoch": 3.706537672236881, + "grad_norm": 0.3207331876905664, + "learning_rate": 0.00019015387992514846, + "loss": 2.929492473602295, + "step": 6323, + "token_acc": 0.30956660289666504 + }, + { + "epoch": 3.7071240105540895, + "grad_norm": 0.2956742793871334, + "learning_rate": 0.00019014968575300326, + "loss": 2.9600448608398438, + "step": 6324, + "token_acc": 0.30439749014265316 + }, + { + "epoch": 3.7077103488712986, + "grad_norm": 0.24755414157473432, + "learning_rate": 0.00019014549073402326, + "loss": 2.9625425338745117, + "step": 6325, + "token_acc": 0.30299008550595397 + }, + { + "epoch": 3.7082966871885077, + "grad_norm": 0.2950645471218155, + "learning_rate": 0.0001901412948682478, + "loss": 2.969426155090332, + "step": 6326, + "token_acc": 0.3027498252509133 + }, + { + "epoch": 3.708883025505717, + "grad_norm": 0.3099957167844313, + "learning_rate": 0.00019013709815571636, + "loss": 2.979088306427002, + "step": 6327, + "token_acc": 0.30084974355060695 + }, + { + "epoch": 3.709469363822926, + "grad_norm": 0.22653178367405352, + "learning_rate": 0.00019013290059646836, + "loss": 3.002145767211914, + "step": 6328, + "token_acc": 0.2977456204810919 + }, + { + "epoch": 3.710055702140135, + "grad_norm": 0.24421553966880818, + "learning_rate": 0.00019012870219054323, + "loss": 2.9862465858459473, + "step": 6329, + "token_acc": 0.29960457223360654 + }, + { + "epoch": 3.710642040457344, + "grad_norm": 0.24557155728156826, + "learning_rate": 0.00019012450293798034, + "loss": 2.96150279045105, + "step": 6330, + "token_acc": 0.3056087264336471 + }, + { + "epoch": 3.711228378774553, + "grad_norm": 0.22986727459819853, + "learning_rate": 0.0001901203028388192, + "loss": 2.9522314071655273, + "step": 6331, + "token_acc": 0.3056782453472256 + }, + { + "epoch": 3.711814717091762, + "grad_norm": 0.24288577944197778, + "learning_rate": 0.00019011610189309928, + "loss": 2.963533878326416, + "step": 6332, + "token_acc": 0.3037156456318133 + }, + { + "epoch": 3.712401055408971, + "grad_norm": 0.23093245480123537, + "learning_rate": 0.00019011190010086, + "loss": 2.990579128265381, + "step": 6333, + "token_acc": 0.2985225377586733 + }, + { + "epoch": 3.71298739372618, + "grad_norm": 0.27835846574597023, + "learning_rate": 0.00019010769746214087, + "loss": 2.9629178047180176, + "step": 6334, + "token_acc": 0.30335232529334744 + }, + { + "epoch": 3.713573732043389, + "grad_norm": 0.2595220382673158, + "learning_rate": 0.00019010349397698134, + "loss": 2.9336161613464355, + "step": 6335, + "token_acc": 0.30579483892587106 + }, + { + "epoch": 3.714160070360598, + "grad_norm": 0.28247727036855075, + "learning_rate": 0.00019009928964542087, + "loss": 2.983488082885742, + "step": 6336, + "token_acc": 0.3005971371448018 + }, + { + "epoch": 3.714746408677807, + "grad_norm": 0.30416202870394077, + "learning_rate": 0.00019009508446749898, + "loss": 2.945540428161621, + "step": 6337, + "token_acc": 0.3064674876448612 + }, + { + "epoch": 3.715332746995016, + "grad_norm": 0.2280390966040148, + "learning_rate": 0.00019009087844325522, + "loss": 2.9879589080810547, + "step": 6338, + "token_acc": 0.2984429929562138 + }, + { + "epoch": 3.7159190853122253, + "grad_norm": 0.36426380750709003, + "learning_rate": 0.00019008667157272902, + "loss": 3.0284194946289062, + "step": 6339, + "token_acc": 0.2947635266087478 + }, + { + "epoch": 3.7165054236294344, + "grad_norm": 0.27550992399242974, + "learning_rate": 0.00019008246385595996, + "loss": 2.9873099327087402, + "step": 6340, + "token_acc": 0.2999179434680626 + }, + { + "epoch": 3.7170917619466435, + "grad_norm": 0.27602068196707996, + "learning_rate": 0.0001900782552929875, + "loss": 2.959066152572632, + "step": 6341, + "token_acc": 0.30414989160730876 + }, + { + "epoch": 3.717678100263852, + "grad_norm": 0.24986176387896136, + "learning_rate": 0.00019007404588385125, + "loss": 2.9747049808502197, + "step": 6342, + "token_acc": 0.30231796666465843 + }, + { + "epoch": 3.7182644385810613, + "grad_norm": 0.2529792653980859, + "learning_rate": 0.0001900698356285907, + "loss": 3.000605821609497, + "step": 6343, + "token_acc": 0.2983727477331648 + }, + { + "epoch": 3.7188507768982704, + "grad_norm": 0.24879973772842484, + "learning_rate": 0.0001900656245272454, + "loss": 3.017080307006836, + "step": 6344, + "token_acc": 0.29679052696439573 + }, + { + "epoch": 3.7194371152154795, + "grad_norm": 0.2950510569138496, + "learning_rate": 0.00019006141257985496, + "loss": 2.984283447265625, + "step": 6345, + "token_acc": 0.2989423737279141 + }, + { + "epoch": 3.720023453532688, + "grad_norm": 0.31357008399101083, + "learning_rate": 0.00019005719978645887, + "loss": 2.958548069000244, + "step": 6346, + "token_acc": 0.3047582636566697 + }, + { + "epoch": 3.7206097918498973, + "grad_norm": 0.25101169288568986, + "learning_rate": 0.00019005298614709678, + "loss": 2.96866512298584, + "step": 6347, + "token_acc": 0.30325714089843614 + }, + { + "epoch": 3.7211961301671064, + "grad_norm": 0.3061100156676543, + "learning_rate": 0.00019004877166180822, + "loss": 2.9383187294006348, + "step": 6348, + "token_acc": 0.3062591864393838 + }, + { + "epoch": 3.7217824684843155, + "grad_norm": 0.23382063298811795, + "learning_rate": 0.0001900445563306328, + "loss": 2.9775500297546387, + "step": 6349, + "token_acc": 0.3015733569657971 + }, + { + "epoch": 3.7223688068015246, + "grad_norm": 0.3133245743169299, + "learning_rate": 0.00019004034015361008, + "loss": 2.994356155395508, + "step": 6350, + "token_acc": 0.29933396764985726 + }, + { + "epoch": 3.7229551451187337, + "grad_norm": 0.256536533640805, + "learning_rate": 0.00019003612313077972, + "loss": 2.9436895847320557, + "step": 6351, + "token_acc": 0.30521720341652503 + }, + { + "epoch": 3.7235414834359424, + "grad_norm": 0.3311811253093764, + "learning_rate": 0.00019003190526218128, + "loss": 3.029824733734131, + "step": 6352, + "token_acc": 0.29527110847912763 + }, + { + "epoch": 3.7241278217531515, + "grad_norm": 0.27837716692868525, + "learning_rate": 0.00019002768654785443, + "loss": 3.0153770446777344, + "step": 6353, + "token_acc": 0.2950725799524723 + }, + { + "epoch": 3.7247141600703606, + "grad_norm": 0.2688445741760561, + "learning_rate": 0.00019002346698783877, + "loss": 2.995077610015869, + "step": 6354, + "token_acc": 0.2994408817738827 + }, + { + "epoch": 3.7253004983875697, + "grad_norm": 0.2694000050206595, + "learning_rate": 0.00019001924658217396, + "loss": 2.9942402839660645, + "step": 6355, + "token_acc": 0.29896504121481976 + }, + { + "epoch": 3.7258868367047784, + "grad_norm": 0.25495557928438856, + "learning_rate": 0.00019001502533089963, + "loss": 2.9655890464782715, + "step": 6356, + "token_acc": 0.3023421060504822 + }, + { + "epoch": 3.7264731750219875, + "grad_norm": 0.2631673278622382, + "learning_rate": 0.00019001080323405542, + "loss": 2.9499459266662598, + "step": 6357, + "token_acc": 0.3062973812314915 + }, + { + "epoch": 3.7270595133391966, + "grad_norm": 0.3135742446004838, + "learning_rate": 0.00019000658029168102, + "loss": 2.9979248046875, + "step": 6358, + "token_acc": 0.2973516672146809 + }, + { + "epoch": 3.7276458516564057, + "grad_norm": 0.29819777289467625, + "learning_rate": 0.0001900023565038161, + "loss": 2.9760982990264893, + "step": 6359, + "token_acc": 0.30106373841456907 + }, + { + "epoch": 3.728232189973615, + "grad_norm": 0.26914305886366235, + "learning_rate": 0.00018999813187050028, + "loss": 2.9923925399780273, + "step": 6360, + "token_acc": 0.3007405436566006 + }, + { + "epoch": 3.728818528290824, + "grad_norm": 0.29489594525756213, + "learning_rate": 0.00018999390639177328, + "loss": 2.978440284729004, + "step": 6361, + "token_acc": 0.30089077996816455 + }, + { + "epoch": 3.729404866608033, + "grad_norm": 0.2482561434558423, + "learning_rate": 0.00018998968006767484, + "loss": 2.975919485092163, + "step": 6362, + "token_acc": 0.30286378137159087 + }, + { + "epoch": 3.7299912049252417, + "grad_norm": 0.2487304469717474, + "learning_rate": 0.0001899854528982446, + "loss": 2.970010280609131, + "step": 6363, + "token_acc": 0.30247099170900493 + }, + { + "epoch": 3.730577543242451, + "grad_norm": 0.26544232901861814, + "learning_rate": 0.00018998122488352227, + "loss": 3.005382537841797, + "step": 6364, + "token_acc": 0.29765255686792097 + }, + { + "epoch": 3.73116388155966, + "grad_norm": 0.2855807733494327, + "learning_rate": 0.00018997699602354761, + "loss": 2.9702000617980957, + "step": 6365, + "token_acc": 0.30279789984548644 + }, + { + "epoch": 3.731750219876869, + "grad_norm": 0.2773575380206403, + "learning_rate": 0.0001899727663183603, + "loss": 2.995187759399414, + "step": 6366, + "token_acc": 0.2986285533053349 + }, + { + "epoch": 3.7323365581940777, + "grad_norm": 0.27210417189701036, + "learning_rate": 0.0001899685357680001, + "loss": 2.962930679321289, + "step": 6367, + "token_acc": 0.3033415950079296 + }, + { + "epoch": 3.732922896511287, + "grad_norm": 0.252306545487768, + "learning_rate": 0.00018996430437250673, + "loss": 2.973048686981201, + "step": 6368, + "token_acc": 0.30247084445326494 + }, + { + "epoch": 3.733509234828496, + "grad_norm": 0.2785191343597969, + "learning_rate": 0.00018996007213191996, + "loss": 2.9834110736846924, + "step": 6369, + "token_acc": 0.30146891897469763 + }, + { + "epoch": 3.734095573145705, + "grad_norm": 0.2693615063767666, + "learning_rate": 0.00018995583904627954, + "loss": 3.0329649448394775, + "step": 6370, + "token_acc": 0.29366451774687324 + }, + { + "epoch": 3.734681911462914, + "grad_norm": 0.24196000562253117, + "learning_rate": 0.0001899516051156252, + "loss": 2.9376871585845947, + "step": 6371, + "token_acc": 0.3066253441694045 + }, + { + "epoch": 3.7352682497801233, + "grad_norm": 0.2751400566294294, + "learning_rate": 0.00018994737033999678, + "loss": 2.975109100341797, + "step": 6372, + "token_acc": 0.3030088402717662 + }, + { + "epoch": 3.7358545880973324, + "grad_norm": 0.23953798830569437, + "learning_rate": 0.000189943134719434, + "loss": 2.9184794425964355, + "step": 6373, + "token_acc": 0.3087879595305293 + }, + { + "epoch": 3.736440926414541, + "grad_norm": 0.28342537556308817, + "learning_rate": 0.00018993889825397666, + "loss": 2.9362549781799316, + "step": 6374, + "token_acc": 0.3070441740204276 + }, + { + "epoch": 3.73702726473175, + "grad_norm": 0.2572060120275675, + "learning_rate": 0.00018993466094366457, + "loss": 2.967146396636963, + "step": 6375, + "token_acc": 0.30267576115926564 + }, + { + "epoch": 3.7376136030489593, + "grad_norm": 0.2519424990729519, + "learning_rate": 0.00018993042278853754, + "loss": 2.959254026412964, + "step": 6376, + "token_acc": 0.30264331426127505 + }, + { + "epoch": 3.7381999413661684, + "grad_norm": 0.23522774289619797, + "learning_rate": 0.0001899261837886354, + "loss": 3.0247530937194824, + "step": 6377, + "token_acc": 0.29371664828819527 + }, + { + "epoch": 3.738786279683377, + "grad_norm": 0.25765752840441936, + "learning_rate": 0.00018992194394399788, + "loss": 2.9726128578186035, + "step": 6378, + "token_acc": 0.3022246164706007 + }, + { + "epoch": 3.739372618000586, + "grad_norm": 0.2581576400911658, + "learning_rate": 0.0001899177032546649, + "loss": 2.946310520172119, + "step": 6379, + "token_acc": 0.3055899968477461 + }, + { + "epoch": 3.7399589563177953, + "grad_norm": 0.296704095447177, + "learning_rate": 0.00018991346172067626, + "loss": 2.9851527214050293, + "step": 6380, + "token_acc": 0.3022891418137872 + }, + { + "epoch": 3.7405452946350044, + "grad_norm": 0.3897481978672039, + "learning_rate": 0.0001899092193420718, + "loss": 2.978149175643921, + "step": 6381, + "token_acc": 0.30136508490162195 + }, + { + "epoch": 3.7411316329522135, + "grad_norm": 0.39710884091290466, + "learning_rate": 0.0001899049761188914, + "loss": 3.0382485389709473, + "step": 6382, + "token_acc": 0.2937630054252156 + }, + { + "epoch": 3.7417179712694226, + "grad_norm": 0.23478985922368062, + "learning_rate": 0.00018990073205117487, + "loss": 2.9756479263305664, + "step": 6383, + "token_acc": 0.3007988004793859 + }, + { + "epoch": 3.7423043095866317, + "grad_norm": 0.292506791141633, + "learning_rate": 0.00018989648713896214, + "loss": 2.932119846343994, + "step": 6384, + "token_acc": 0.30870210694479555 + }, + { + "epoch": 3.7428906479038404, + "grad_norm": 0.3126245163646938, + "learning_rate": 0.000189892241382293, + "loss": 2.985936164855957, + "step": 6385, + "token_acc": 0.30068314764670623 + }, + { + "epoch": 3.7434769862210495, + "grad_norm": 0.2706567940782977, + "learning_rate": 0.00018988799478120743, + "loss": 2.994166374206543, + "step": 6386, + "token_acc": 0.2971665969432188 + }, + { + "epoch": 3.7440633245382586, + "grad_norm": 0.3286795833011848, + "learning_rate": 0.00018988374733574522, + "loss": 2.951742172241211, + "step": 6387, + "token_acc": 0.305133521610124 + }, + { + "epoch": 3.7446496628554677, + "grad_norm": 0.2675871955453939, + "learning_rate": 0.00018987949904594636, + "loss": 2.9892616271972656, + "step": 6388, + "token_acc": 0.30090340154373174 + }, + { + "epoch": 3.7452360011726764, + "grad_norm": 0.3078409773378117, + "learning_rate": 0.00018987524991185076, + "loss": 2.9753918647766113, + "step": 6389, + "token_acc": 0.3001743008338175 + }, + { + "epoch": 3.7458223394898855, + "grad_norm": 0.3104564901570339, + "learning_rate": 0.00018987099993349822, + "loss": 2.967794179916382, + "step": 6390, + "token_acc": 0.30155068497382914 + }, + { + "epoch": 3.7464086778070946, + "grad_norm": 0.3308259582628546, + "learning_rate": 0.0001898667491109288, + "loss": 2.9795639514923096, + "step": 6391, + "token_acc": 0.3012476546419125 + }, + { + "epoch": 3.7469950161243037, + "grad_norm": 0.3064452017340387, + "learning_rate": 0.00018986249744418231, + "loss": 3.0218162536621094, + "step": 6392, + "token_acc": 0.2959819570674865 + }, + { + "epoch": 3.747581354441513, + "grad_norm": 0.24555781944165342, + "learning_rate": 0.0001898582449332988, + "loss": 2.9798426628112793, + "step": 6393, + "token_acc": 0.3036490209116446 + }, + { + "epoch": 3.748167692758722, + "grad_norm": 0.2804345634201924, + "learning_rate": 0.00018985399157831813, + "loss": 2.9589853286743164, + "step": 6394, + "token_acc": 0.30474302861588454 + }, + { + "epoch": 3.748754031075931, + "grad_norm": 0.25378063817720764, + "learning_rate": 0.00018984973737928032, + "loss": 2.9575445652008057, + "step": 6395, + "token_acc": 0.3049566529927231 + }, + { + "epoch": 3.7493403693931397, + "grad_norm": 0.3229580742639388, + "learning_rate": 0.00018984548233622528, + "loss": 2.9705677032470703, + "step": 6396, + "token_acc": 0.3028504843205396 + }, + { + "epoch": 3.749926707710349, + "grad_norm": 0.22192204877831015, + "learning_rate": 0.000189841226449193, + "loss": 2.985295295715332, + "step": 6397, + "token_acc": 0.30107986171362977 + }, + { + "epoch": 3.750513046027558, + "grad_norm": 0.27086656075954574, + "learning_rate": 0.00018983696971822348, + "loss": 2.9959020614624023, + "step": 6398, + "token_acc": 0.3003767616365222 + }, + { + "epoch": 3.751099384344767, + "grad_norm": 0.2501935623215438, + "learning_rate": 0.00018983271214335665, + "loss": 2.9752509593963623, + "step": 6399, + "token_acc": 0.30150096392178466 + }, + { + "epoch": 3.7516857226619758, + "grad_norm": 0.2640993820224359, + "learning_rate": 0.00018982845372463259, + "loss": 2.9430179595947266, + "step": 6400, + "token_acc": 0.3080886789977699 + }, + { + "epoch": 3.752272060979185, + "grad_norm": 0.28871971653885753, + "learning_rate": 0.0001898241944620912, + "loss": 2.9612202644348145, + "step": 6401, + "token_acc": 0.3027732727016724 + }, + { + "epoch": 3.752858399296394, + "grad_norm": 0.2503679298147152, + "learning_rate": 0.00018981993435577258, + "loss": 2.988402843475342, + "step": 6402, + "token_acc": 0.3026130795622659 + }, + { + "epoch": 3.753444737613603, + "grad_norm": 0.29817020657949045, + "learning_rate": 0.00018981567340571668, + "loss": 2.988438606262207, + "step": 6403, + "token_acc": 0.29840501768550054 + }, + { + "epoch": 3.754031075930812, + "grad_norm": 0.2938869669411254, + "learning_rate": 0.00018981141161196358, + "loss": 2.9649300575256348, + "step": 6404, + "token_acc": 0.30339169256121795 + }, + { + "epoch": 3.7546174142480213, + "grad_norm": 0.26326831530161504, + "learning_rate": 0.0001898071489745533, + "loss": 2.9572858810424805, + "step": 6405, + "token_acc": 0.3050320950054461 + }, + { + "epoch": 3.75520375256523, + "grad_norm": 0.34522361980275706, + "learning_rate": 0.00018980288549352587, + "loss": 2.9809975624084473, + "step": 6406, + "token_acc": 0.30201390788991395 + }, + { + "epoch": 3.755790090882439, + "grad_norm": 0.26329738302473293, + "learning_rate": 0.00018979862116892134, + "loss": 2.970841407775879, + "step": 6407, + "token_acc": 0.30208955380247454 + }, + { + "epoch": 3.756376429199648, + "grad_norm": 0.3130808412389744, + "learning_rate": 0.00018979435600077974, + "loss": 2.9555368423461914, + "step": 6408, + "token_acc": 0.3052555489412365 + }, + { + "epoch": 3.7569627675168573, + "grad_norm": 0.29680289133861065, + "learning_rate": 0.00018979008998914118, + "loss": 2.9590911865234375, + "step": 6409, + "token_acc": 0.3046593304893298 + }, + { + "epoch": 3.757549105834066, + "grad_norm": 0.2601585311163044, + "learning_rate": 0.00018978582313404575, + "loss": 2.9795825481414795, + "step": 6410, + "token_acc": 0.3022048718100783 + }, + { + "epoch": 3.758135444151275, + "grad_norm": 0.2750601792237733, + "learning_rate": 0.00018978155543553345, + "loss": 2.9883341789245605, + "step": 6411, + "token_acc": 0.30147124844473516 + }, + { + "epoch": 3.758721782468484, + "grad_norm": 0.245527105222135, + "learning_rate": 0.00018977728689364444, + "loss": 2.947695016860962, + "step": 6412, + "token_acc": 0.3059318847805973 + }, + { + "epoch": 3.7593081207856933, + "grad_norm": 0.2585593028353297, + "learning_rate": 0.00018977301750841877, + "loss": 2.96058988571167, + "step": 6413, + "token_acc": 0.30481432930930347 + }, + { + "epoch": 3.7598944591029024, + "grad_norm": 0.2473124335993149, + "learning_rate": 0.00018976874727989662, + "loss": 2.96328067779541, + "step": 6414, + "token_acc": 0.30300524060048656 + }, + { + "epoch": 3.7604807974201115, + "grad_norm": 0.26193974612623955, + "learning_rate": 0.00018976447620811803, + "loss": 2.9938931465148926, + "step": 6415, + "token_acc": 0.30001367278295826 + }, + { + "epoch": 3.7610671357373207, + "grad_norm": 0.22710905408863505, + "learning_rate": 0.00018976020429312316, + "loss": 2.9814109802246094, + "step": 6416, + "token_acc": 0.30099770830329936 + }, + { + "epoch": 3.7616534740545293, + "grad_norm": 0.3062556298707376, + "learning_rate": 0.0001897559315349521, + "loss": 2.9744997024536133, + "step": 6417, + "token_acc": 0.3015961623238913 + }, + { + "epoch": 3.7622398123717384, + "grad_norm": 0.26644095334067547, + "learning_rate": 0.00018975165793364503, + "loss": 3.0003349781036377, + "step": 6418, + "token_acc": 0.29760157647729923 + }, + { + "epoch": 3.7628261506889475, + "grad_norm": 0.3045591780146438, + "learning_rate": 0.00018974738348924206, + "loss": 2.96622633934021, + "step": 6419, + "token_acc": 0.3031609271069775 + }, + { + "epoch": 3.7634124890061567, + "grad_norm": 0.32790874832235006, + "learning_rate": 0.00018974310820178336, + "loss": 3.014979839324951, + "step": 6420, + "token_acc": 0.297230867608195 + }, + { + "epoch": 3.7639988273233653, + "grad_norm": 0.2870746585624688, + "learning_rate": 0.0001897388320713091, + "loss": 2.9602577686309814, + "step": 6421, + "token_acc": 0.30392334550494415 + }, + { + "epoch": 3.7645851656405744, + "grad_norm": 0.26266519882543654, + "learning_rate": 0.00018973455509785944, + "loss": 2.984834671020508, + "step": 6422, + "token_acc": 0.3009348360406882 + }, + { + "epoch": 3.7651715039577835, + "grad_norm": 0.39968855288980115, + "learning_rate": 0.00018973027728147454, + "loss": 2.9502432346343994, + "step": 6423, + "token_acc": 0.3055117941749962 + }, + { + "epoch": 3.7657578422749927, + "grad_norm": 0.3037615600197763, + "learning_rate": 0.0001897259986221946, + "loss": 2.9382119178771973, + "step": 6424, + "token_acc": 0.3067722748796007 + }, + { + "epoch": 3.7663441805922018, + "grad_norm": 0.3041086263052896, + "learning_rate": 0.00018972171912005981, + "loss": 2.9570322036743164, + "step": 6425, + "token_acc": 0.3041972494140179 + }, + { + "epoch": 3.766930518909411, + "grad_norm": 0.3216288563934693, + "learning_rate": 0.0001897174387751104, + "loss": 2.9967215061187744, + "step": 6426, + "token_acc": 0.29831994347473373 + }, + { + "epoch": 3.76751685722662, + "grad_norm": 0.2681767360601132, + "learning_rate": 0.0001897131575873865, + "loss": 2.9416747093200684, + "step": 6427, + "token_acc": 0.3051988248872102 + }, + { + "epoch": 3.7681031955438287, + "grad_norm": 0.32238434255261966, + "learning_rate": 0.0001897088755569284, + "loss": 2.9535956382751465, + "step": 6428, + "token_acc": 0.30622803643778024 + }, + { + "epoch": 3.7686895338610378, + "grad_norm": 0.2597282288638976, + "learning_rate": 0.00018970459268377628, + "loss": 2.9602630138397217, + "step": 6429, + "token_acc": 0.3039485914342272 + }, + { + "epoch": 3.769275872178247, + "grad_norm": 0.3106684193517675, + "learning_rate": 0.00018970030896797043, + "loss": 2.9956607818603516, + "step": 6430, + "token_acc": 0.29824697031898956 + }, + { + "epoch": 3.769862210495456, + "grad_norm": 0.26526237475685877, + "learning_rate": 0.000189696024409551, + "loss": 2.9677610397338867, + "step": 6431, + "token_acc": 0.30389454861623766 + }, + { + "epoch": 3.7704485488126647, + "grad_norm": 0.31382890761195076, + "learning_rate": 0.0001896917390085583, + "loss": 3.0021324157714844, + "step": 6432, + "token_acc": 0.2979854548485575 + }, + { + "epoch": 3.771034887129874, + "grad_norm": 0.25340962827369945, + "learning_rate": 0.00018968745276503262, + "loss": 2.949416399002075, + "step": 6433, + "token_acc": 0.3042702034048257 + }, + { + "epoch": 3.771621225447083, + "grad_norm": 0.35338136637199524, + "learning_rate": 0.00018968316567901413, + "loss": 2.997436046600342, + "step": 6434, + "token_acc": 0.2983510323680739 + }, + { + "epoch": 3.772207563764292, + "grad_norm": 0.26437406847111955, + "learning_rate": 0.00018967887775054316, + "loss": 2.963125705718994, + "step": 6435, + "token_acc": 0.30427102931297956 + }, + { + "epoch": 3.772793902081501, + "grad_norm": 0.28555454071696934, + "learning_rate": 0.00018967458897966, + "loss": 2.9734902381896973, + "step": 6436, + "token_acc": 0.3030131674610227 + }, + { + "epoch": 3.7733802403987102, + "grad_norm": 0.2500844030724953, + "learning_rate": 0.00018967029936640487, + "loss": 3.0043625831604004, + "step": 6437, + "token_acc": 0.297483104473588 + }, + { + "epoch": 3.7739665787159193, + "grad_norm": 0.3040280860395201, + "learning_rate": 0.00018966600891081817, + "loss": 2.973471164703369, + "step": 6438, + "token_acc": 0.3003231264726502 + }, + { + "epoch": 3.774552917033128, + "grad_norm": 0.26987391042434655, + "learning_rate": 0.00018966171761294009, + "loss": 2.976778745651245, + "step": 6439, + "token_acc": 0.30086796927793685 + }, + { + "epoch": 3.775139255350337, + "grad_norm": 0.2729465586788869, + "learning_rate": 0.000189657425472811, + "loss": 2.9822378158569336, + "step": 6440, + "token_acc": 0.301755399802776 + }, + { + "epoch": 3.7757255936675462, + "grad_norm": 0.2553439874426675, + "learning_rate": 0.0001896531324904712, + "loss": 2.9639816284179688, + "step": 6441, + "token_acc": 0.30337592074719893 + }, + { + "epoch": 3.7763119319847553, + "grad_norm": 0.25992658858808765, + "learning_rate": 0.00018964883866596108, + "loss": 2.9855594635009766, + "step": 6442, + "token_acc": 0.2981479496183725 + }, + { + "epoch": 3.776898270301964, + "grad_norm": 0.23978903154784204, + "learning_rate": 0.0001896445439993209, + "loss": 2.9921417236328125, + "step": 6443, + "token_acc": 0.2989601606793278 + }, + { + "epoch": 3.777484608619173, + "grad_norm": 0.2367045496190604, + "learning_rate": 0.000189640248490591, + "loss": 2.972799777984619, + "step": 6444, + "token_acc": 0.301528904993437 + }, + { + "epoch": 3.7780709469363822, + "grad_norm": 0.2380341998977342, + "learning_rate": 0.0001896359521398118, + "loss": 3.000115394592285, + "step": 6445, + "token_acc": 0.29759096811809294 + }, + { + "epoch": 3.7786572852535913, + "grad_norm": 0.20778327035566968, + "learning_rate": 0.00018963165494702356, + "loss": 2.9575588703155518, + "step": 6446, + "token_acc": 0.3030098715038424 + }, + { + "epoch": 3.7792436235708005, + "grad_norm": 0.2616016860918608, + "learning_rate": 0.00018962735691226677, + "loss": 2.9937148094177246, + "step": 6447, + "token_acc": 0.2989232197460414 + }, + { + "epoch": 3.7798299618880096, + "grad_norm": 0.2439608996607693, + "learning_rate": 0.0001896230580355817, + "loss": 2.9846038818359375, + "step": 6448, + "token_acc": 0.30090735028462606 + }, + { + "epoch": 3.7804163002052187, + "grad_norm": 0.23812481452576162, + "learning_rate": 0.00018961875831700878, + "loss": 3.0058727264404297, + "step": 6449, + "token_acc": 0.2980010240218458 + }, + { + "epoch": 3.7810026385224274, + "grad_norm": 0.30612820256887985, + "learning_rate": 0.00018961445775658836, + "loss": 2.983628988265991, + "step": 6450, + "token_acc": 0.3002706528870084 + }, + { + "epoch": 3.7815889768396365, + "grad_norm": 0.28807737227148783, + "learning_rate": 0.0001896101563543609, + "loss": 2.942044496536255, + "step": 6451, + "token_acc": 0.3067467718067746 + }, + { + "epoch": 3.7821753151568456, + "grad_norm": 0.23091217902331396, + "learning_rate": 0.00018960585411036674, + "loss": 2.958925485610962, + "step": 6452, + "token_acc": 0.3048559344366662 + }, + { + "epoch": 3.7827616534740547, + "grad_norm": 0.3257970135816232, + "learning_rate": 0.00018960155102464637, + "loss": 2.9423441886901855, + "step": 6453, + "token_acc": 0.3065108591434528 + }, + { + "epoch": 3.7833479917912634, + "grad_norm": 0.27545340774214644, + "learning_rate": 0.0001895972470972401, + "loss": 2.960127115249634, + "step": 6454, + "token_acc": 0.30305548270498633 + }, + { + "epoch": 3.7839343301084725, + "grad_norm": 0.2465387068210143, + "learning_rate": 0.0001895929423281885, + "loss": 3.020472288131714, + "step": 6455, + "token_acc": 0.2950990207427338 + }, + { + "epoch": 3.7845206684256816, + "grad_norm": 0.2870415606059651, + "learning_rate": 0.00018958863671753192, + "loss": 2.9609787464141846, + "step": 6456, + "token_acc": 0.30344210951450784 + }, + { + "epoch": 3.7851070067428907, + "grad_norm": 0.26132612657468557, + "learning_rate": 0.00018958433026531078, + "loss": 2.9534366130828857, + "step": 6457, + "token_acc": 0.3039761161623015 + }, + { + "epoch": 3.7856933450601, + "grad_norm": 0.24280552585491177, + "learning_rate": 0.00018958002297156558, + "loss": 2.957118034362793, + "step": 6458, + "token_acc": 0.3029219110378913 + }, + { + "epoch": 3.786279683377309, + "grad_norm": 0.2607746553561123, + "learning_rate": 0.0001895757148363368, + "loss": 2.967437982559204, + "step": 6459, + "token_acc": 0.3028230361403427 + }, + { + "epoch": 3.7868660216945176, + "grad_norm": 0.21662138940769565, + "learning_rate": 0.0001895714058596649, + "loss": 2.9625139236450195, + "step": 6460, + "token_acc": 0.3039037875264817 + }, + { + "epoch": 3.7874523600117267, + "grad_norm": 0.2611983872616639, + "learning_rate": 0.0001895670960415903, + "loss": 2.960892915725708, + "step": 6461, + "token_acc": 0.3034370419478643 + }, + { + "epoch": 3.788038698328936, + "grad_norm": 0.2261393400770819, + "learning_rate": 0.00018956278538215354, + "loss": 2.978372097015381, + "step": 6462, + "token_acc": 0.2994066601054084 + }, + { + "epoch": 3.788625036646145, + "grad_norm": 0.27289552200721107, + "learning_rate": 0.0001895584738813951, + "loss": 3.006833791732788, + "step": 6463, + "token_acc": 0.2977787308114381 + }, + { + "epoch": 3.7892113749633536, + "grad_norm": 0.24511034184719233, + "learning_rate": 0.0001895541615393555, + "loss": 2.9368844032287598, + "step": 6464, + "token_acc": 0.30731900336975393 + }, + { + "epoch": 3.7897977132805627, + "grad_norm": 0.28370330594737736, + "learning_rate": 0.0001895498483560752, + "loss": 3.005736827850342, + "step": 6465, + "token_acc": 0.2987435863702144 + }, + { + "epoch": 3.790384051597772, + "grad_norm": 0.24985030356549393, + "learning_rate": 0.00018954553433159473, + "loss": 2.9910166263580322, + "step": 6466, + "token_acc": 0.3002912424788317 + }, + { + "epoch": 3.790970389914981, + "grad_norm": 0.26021952515960534, + "learning_rate": 0.00018954121946595468, + "loss": 2.954996347427368, + "step": 6467, + "token_acc": 0.30532725857267506 + }, + { + "epoch": 3.79155672823219, + "grad_norm": 0.2342053143968054, + "learning_rate": 0.00018953690375919551, + "loss": 2.9303388595581055, + "step": 6468, + "token_acc": 0.3076729335741725 + }, + { + "epoch": 3.792143066549399, + "grad_norm": 0.25630843671086245, + "learning_rate": 0.00018953258721135776, + "loss": 2.9153311252593994, + "step": 6469, + "token_acc": 0.3092236565532177 + }, + { + "epoch": 3.7927294048666083, + "grad_norm": 0.2310152075531785, + "learning_rate": 0.00018952826982248202, + "loss": 2.981039524078369, + "step": 6470, + "token_acc": 0.30252157236259636 + }, + { + "epoch": 3.793315743183817, + "grad_norm": 0.2816029721011824, + "learning_rate": 0.00018952395159260884, + "loss": 2.9622507095336914, + "step": 6471, + "token_acc": 0.3042772814124529 + }, + { + "epoch": 3.793902081501026, + "grad_norm": 0.25851978307745715, + "learning_rate": 0.00018951963252177874, + "loss": 2.975102663040161, + "step": 6472, + "token_acc": 0.3025784026481156 + }, + { + "epoch": 3.794488419818235, + "grad_norm": 0.29187641028021233, + "learning_rate": 0.00018951531261003233, + "loss": 2.9543423652648926, + "step": 6473, + "token_acc": 0.30443890325971845 + }, + { + "epoch": 3.7950747581354443, + "grad_norm": 0.3137516332363987, + "learning_rate": 0.0001895109918574102, + "loss": 2.984013557434082, + "step": 6474, + "token_acc": 0.30018195943735687 + }, + { + "epoch": 3.795661096452653, + "grad_norm": 0.4079429339667035, + "learning_rate": 0.00018950667026395289, + "loss": 2.9659347534179688, + "step": 6475, + "token_acc": 0.30197372485225377 + }, + { + "epoch": 3.796247434769862, + "grad_norm": 0.5110694433849108, + "learning_rate": 0.00018950234782970105, + "loss": 2.9980976581573486, + "step": 6476, + "token_acc": 0.2985276326288965 + }, + { + "epoch": 3.796833773087071, + "grad_norm": 0.3668991743382624, + "learning_rate": 0.00018949802455469524, + "loss": 2.9845781326293945, + "step": 6477, + "token_acc": 0.299582671162965 + }, + { + "epoch": 3.7974201114042803, + "grad_norm": 0.27596411300862606, + "learning_rate": 0.0001894937004389761, + "loss": 2.961146116256714, + "step": 6478, + "token_acc": 0.3037281218114748 + }, + { + "epoch": 3.7980064497214894, + "grad_norm": 0.31313731265377037, + "learning_rate": 0.00018948937548258422, + "loss": 2.973755359649658, + "step": 6479, + "token_acc": 0.302887353189119 + }, + { + "epoch": 3.7985927880386985, + "grad_norm": 0.24055250786731236, + "learning_rate": 0.00018948504968556028, + "loss": 2.991786241531372, + "step": 6480, + "token_acc": 0.299454233504549 + }, + { + "epoch": 3.7991791263559076, + "grad_norm": 0.2861633278163949, + "learning_rate": 0.00018948072304794488, + "loss": 2.9693217277526855, + "step": 6481, + "token_acc": 0.30120011582332806 + }, + { + "epoch": 3.7997654646731163, + "grad_norm": 0.25525817753095326, + "learning_rate": 0.00018947639556977862, + "loss": 2.9497933387756348, + "step": 6482, + "token_acc": 0.30518910861954696 + }, + { + "epoch": 3.8003518029903254, + "grad_norm": 0.2752159503143734, + "learning_rate": 0.0001894720672511022, + "loss": 2.9878454208374023, + "step": 6483, + "token_acc": 0.29990168705021825 + }, + { + "epoch": 3.8009381413075345, + "grad_norm": 0.26002350968562365, + "learning_rate": 0.0001894677380919563, + "loss": 2.958299160003662, + "step": 6484, + "token_acc": 0.3040312863765393 + }, + { + "epoch": 3.8015244796247436, + "grad_norm": 0.2818770964375362, + "learning_rate": 0.00018946340809238157, + "loss": 3.008742332458496, + "step": 6485, + "token_acc": 0.2971870042433544 + }, + { + "epoch": 3.8021108179419523, + "grad_norm": 0.27733767582467467, + "learning_rate": 0.00018945907725241866, + "loss": 3.0215892791748047, + "step": 6486, + "token_acc": 0.2938997270264858 + }, + { + "epoch": 3.8026971562591614, + "grad_norm": 0.30673458385011565, + "learning_rate": 0.00018945474557210826, + "loss": 2.960934638977051, + "step": 6487, + "token_acc": 0.3039904333584631 + }, + { + "epoch": 3.8032834945763705, + "grad_norm": 0.25723798977338125, + "learning_rate": 0.00018945041305149104, + "loss": 2.9898130893707275, + "step": 6488, + "token_acc": 0.30111156898173097 + }, + { + "epoch": 3.8038698328935796, + "grad_norm": 0.27094781658790884, + "learning_rate": 0.00018944607969060778, + "loss": 2.965735673904419, + "step": 6489, + "token_acc": 0.3030776445296858 + }, + { + "epoch": 3.8044561712107887, + "grad_norm": 0.27857574245158556, + "learning_rate": 0.00018944174548949912, + "loss": 2.979887008666992, + "step": 6490, + "token_acc": 0.301169279155887 + }, + { + "epoch": 3.805042509527998, + "grad_norm": 0.25951125385213214, + "learning_rate": 0.0001894374104482058, + "loss": 2.9727773666381836, + "step": 6491, + "token_acc": 0.3020376754173881 + }, + { + "epoch": 3.805628847845207, + "grad_norm": 0.26324424208473696, + "learning_rate": 0.00018943307456676848, + "loss": 2.9863975048065186, + "step": 6492, + "token_acc": 0.3003067614230351 + }, + { + "epoch": 3.8062151861624156, + "grad_norm": 0.26997291967709486, + "learning_rate": 0.00018942873784522795, + "loss": 2.9934046268463135, + "step": 6493, + "token_acc": 0.29805158072838056 + }, + { + "epoch": 3.8068015244796247, + "grad_norm": 0.23298918778922945, + "learning_rate": 0.00018942440028362493, + "loss": 2.9511237144470215, + "step": 6494, + "token_acc": 0.3046206307711768 + }, + { + "epoch": 3.807387862796834, + "grad_norm": 0.2575913067109522, + "learning_rate": 0.0001894200618820002, + "loss": 2.953953742980957, + "step": 6495, + "token_acc": 0.3045232908533289 + }, + { + "epoch": 3.807974201114043, + "grad_norm": 0.2522861284613284, + "learning_rate": 0.00018941572264039445, + "loss": 2.935549736022949, + "step": 6496, + "token_acc": 0.30819986644478126 + }, + { + "epoch": 3.8085605394312516, + "grad_norm": 0.2231331236893713, + "learning_rate": 0.00018941138255884848, + "loss": 2.9963440895080566, + "step": 6497, + "token_acc": 0.2985771398149729 + }, + { + "epoch": 3.8091468777484607, + "grad_norm": 0.25898359044725044, + "learning_rate": 0.00018940704163740308, + "loss": 2.9858288764953613, + "step": 6498, + "token_acc": 0.3017366932439177 + }, + { + "epoch": 3.80973321606567, + "grad_norm": 0.3227033621169905, + "learning_rate": 0.00018940269987609897, + "loss": 2.954547882080078, + "step": 6499, + "token_acc": 0.306346068916488 + }, + { + "epoch": 3.810319554382879, + "grad_norm": 0.330409234622868, + "learning_rate": 0.00018939835727497698, + "loss": 2.9800124168395996, + "step": 6500, + "token_acc": 0.29988499284399917 + }, + { + "epoch": 3.810905892700088, + "grad_norm": 0.2569895624660461, + "learning_rate": 0.0001893940138340779, + "loss": 2.991697311401367, + "step": 6501, + "token_acc": 0.29903958207755676 + }, + { + "epoch": 3.811492231017297, + "grad_norm": 0.2981652275263049, + "learning_rate": 0.00018938966955344251, + "loss": 2.9611032009124756, + "step": 6502, + "token_acc": 0.3036033248660356 + }, + { + "epoch": 3.8120785693345063, + "grad_norm": 0.3102511908111465, + "learning_rate": 0.00018938532443311165, + "loss": 2.98814058303833, + "step": 6503, + "token_acc": 0.29982911895592196 + }, + { + "epoch": 3.812664907651715, + "grad_norm": 0.23916674370112898, + "learning_rate": 0.0001893809784731261, + "loss": 2.9591097831726074, + "step": 6504, + "token_acc": 0.30400351064990144 + }, + { + "epoch": 3.813251245968924, + "grad_norm": 0.3320336812420841, + "learning_rate": 0.0001893766316735267, + "loss": 2.9880242347717285, + "step": 6505, + "token_acc": 0.30118081773636435 + }, + { + "epoch": 3.813837584286133, + "grad_norm": 0.27311381123343714, + "learning_rate": 0.00018937228403435427, + "loss": 2.999217987060547, + "step": 6506, + "token_acc": 0.29825480484294303 + }, + { + "epoch": 3.8144239226033423, + "grad_norm": 0.24613684233206592, + "learning_rate": 0.00018936793555564965, + "loss": 2.9370527267456055, + "step": 6507, + "token_acc": 0.30585877049946514 + }, + { + "epoch": 3.815010260920551, + "grad_norm": 0.2706709001681524, + "learning_rate": 0.00018936358623745375, + "loss": 2.950209617614746, + "step": 6508, + "token_acc": 0.3073964450626742 + }, + { + "epoch": 3.81559659923776, + "grad_norm": 0.22471046895308247, + "learning_rate": 0.00018935923607980732, + "loss": 2.9983434677124023, + "step": 6509, + "token_acc": 0.2981613456423738 + }, + { + "epoch": 3.816182937554969, + "grad_norm": 0.2968323557506492, + "learning_rate": 0.0001893548850827513, + "loss": 2.9787673950195312, + "step": 6510, + "token_acc": 0.3017991019728154 + }, + { + "epoch": 3.8167692758721783, + "grad_norm": 0.2679037393202564, + "learning_rate": 0.00018935053324632657, + "loss": 2.9783711433410645, + "step": 6511, + "token_acc": 0.3012353442541161 + }, + { + "epoch": 3.8173556141893874, + "grad_norm": 0.27883523188495396, + "learning_rate": 0.00018934618057057394, + "loss": 2.970696210861206, + "step": 6512, + "token_acc": 0.30121491091841357 + }, + { + "epoch": 3.8179419525065965, + "grad_norm": 0.26270425608838743, + "learning_rate": 0.00018934182705553437, + "loss": 2.968486785888672, + "step": 6513, + "token_acc": 0.3015280144837509 + }, + { + "epoch": 3.818528290823805, + "grad_norm": 0.2636634036711456, + "learning_rate": 0.00018933747270124873, + "loss": 2.965203285217285, + "step": 6514, + "token_acc": 0.3025961630719664 + }, + { + "epoch": 3.8191146291410143, + "grad_norm": 0.246383340857532, + "learning_rate": 0.0001893331175077579, + "loss": 2.963128089904785, + "step": 6515, + "token_acc": 0.3050427157749085 + }, + { + "epoch": 3.8197009674582234, + "grad_norm": 0.2822987575261475, + "learning_rate": 0.00018932876147510278, + "loss": 2.9310970306396484, + "step": 6516, + "token_acc": 0.30740909189353083 + }, + { + "epoch": 3.8202873057754325, + "grad_norm": 0.3008160734583212, + "learning_rate": 0.00018932440460332436, + "loss": 3.0081210136413574, + "step": 6517, + "token_acc": 0.2963012600547718 + }, + { + "epoch": 3.820873644092641, + "grad_norm": 0.2581158722056415, + "learning_rate": 0.0001893200468924635, + "loss": 2.963991641998291, + "step": 6518, + "token_acc": 0.3036314919331851 + }, + { + "epoch": 3.8214599824098503, + "grad_norm": 0.28318244703655115, + "learning_rate": 0.00018931568834256116, + "loss": 2.953887939453125, + "step": 6519, + "token_acc": 0.3028388813268869 + }, + { + "epoch": 3.8220463207270594, + "grad_norm": 0.2981496810028302, + "learning_rate": 0.00018931132895365832, + "loss": 2.9810752868652344, + "step": 6520, + "token_acc": 0.30094367171694586 + }, + { + "epoch": 3.8226326590442685, + "grad_norm": 0.2746602810359318, + "learning_rate": 0.00018930696872579588, + "loss": 2.973417282104492, + "step": 6521, + "token_acc": 0.3016037274839222 + }, + { + "epoch": 3.8232189973614776, + "grad_norm": 0.2931987574985435, + "learning_rate": 0.0001893026076590148, + "loss": 2.948890209197998, + "step": 6522, + "token_acc": 0.30440457024961315 + }, + { + "epoch": 3.8238053356786867, + "grad_norm": 0.22997746077521894, + "learning_rate": 0.00018929824575335605, + "loss": 2.9974405765533447, + "step": 6523, + "token_acc": 0.29686401236788 + }, + { + "epoch": 3.824391673995896, + "grad_norm": 0.29955618221972885, + "learning_rate": 0.00018929388300886063, + "loss": 2.976614236831665, + "step": 6524, + "token_acc": 0.30220428076243316 + }, + { + "epoch": 3.8249780123131045, + "grad_norm": 0.25745567293784866, + "learning_rate": 0.0001892895194255695, + "loss": 3.0095407962799072, + "step": 6525, + "token_acc": 0.29481090129204374 + }, + { + "epoch": 3.8255643506303136, + "grad_norm": 0.2713412391909302, + "learning_rate": 0.00018928515500352364, + "loss": 2.9895386695861816, + "step": 6526, + "token_acc": 0.30012694031698206 + }, + { + "epoch": 3.8261506889475227, + "grad_norm": 0.27907659983060323, + "learning_rate": 0.00018928078974276405, + "loss": 2.9679059982299805, + "step": 6527, + "token_acc": 0.3018858281800991 + }, + { + "epoch": 3.826737027264732, + "grad_norm": 0.32873247246054155, + "learning_rate": 0.00018927642364333175, + "loss": 3.030717611312866, + "step": 6528, + "token_acc": 0.29340850396728957 + }, + { + "epoch": 3.8273233655819405, + "grad_norm": 0.3217152890377281, + "learning_rate": 0.0001892720567052678, + "loss": 2.9552791118621826, + "step": 6529, + "token_acc": 0.3057435942161332 + }, + { + "epoch": 3.8279097038991496, + "grad_norm": 0.27239758691672594, + "learning_rate": 0.00018926768892861312, + "loss": 2.978858232498169, + "step": 6530, + "token_acc": 0.30147255203457995 + }, + { + "epoch": 3.8284960422163588, + "grad_norm": 0.389569836016819, + "learning_rate": 0.00018926332031340883, + "loss": 2.9666478633880615, + "step": 6531, + "token_acc": 0.300999751909505 + }, + { + "epoch": 3.829082380533568, + "grad_norm": 0.3982687088187171, + "learning_rate": 0.0001892589508596959, + "loss": 3.0066559314727783, + "step": 6532, + "token_acc": 0.2976260942708496 + }, + { + "epoch": 3.829668718850777, + "grad_norm": 0.23442871447609875, + "learning_rate": 0.00018925458056751545, + "loss": 2.9384467601776123, + "step": 6533, + "token_acc": 0.30749373063873725 + }, + { + "epoch": 3.830255057167986, + "grad_norm": 0.3720642887850602, + "learning_rate": 0.00018925020943690843, + "loss": 2.924694538116455, + "step": 6534, + "token_acc": 0.30937463864857195 + }, + { + "epoch": 3.830841395485195, + "grad_norm": 0.2238513332271215, + "learning_rate": 0.00018924583746791597, + "loss": 3.0158438682556152, + "step": 6535, + "token_acc": 0.2951865592888039 + }, + { + "epoch": 3.831427733802404, + "grad_norm": 0.3026993928634364, + "learning_rate": 0.00018924146466057918, + "loss": 2.9711694717407227, + "step": 6536, + "token_acc": 0.30340052032691245 + }, + { + "epoch": 3.832014072119613, + "grad_norm": 0.23077247873787882, + "learning_rate": 0.00018923709101493903, + "loss": 2.941213607788086, + "step": 6537, + "token_acc": 0.3052276991519675 + }, + { + "epoch": 3.832600410436822, + "grad_norm": 0.2850071633613535, + "learning_rate": 0.00018923271653103666, + "loss": 2.9446988105773926, + "step": 6538, + "token_acc": 0.3030086422015034 + }, + { + "epoch": 3.833186748754031, + "grad_norm": 0.23391634714757192, + "learning_rate": 0.00018922834120891317, + "loss": 3.051865816116333, + "step": 6539, + "token_acc": 0.2906703134840017 + }, + { + "epoch": 3.83377308707124, + "grad_norm": 0.31385137184852774, + "learning_rate": 0.00018922396504860966, + "loss": 3.0062785148620605, + "step": 6540, + "token_acc": 0.2961318779284322 + }, + { + "epoch": 3.834359425388449, + "grad_norm": 0.22039312273114434, + "learning_rate": 0.00018921958805016723, + "loss": 2.9394474029541016, + "step": 6541, + "token_acc": 0.30521860203245743 + }, + { + "epoch": 3.834945763705658, + "grad_norm": 0.2755406592956006, + "learning_rate": 0.00018921521021362698, + "loss": 2.989572048187256, + "step": 6542, + "token_acc": 0.2981256603831371 + }, + { + "epoch": 3.835532102022867, + "grad_norm": 0.21725046293351383, + "learning_rate": 0.00018921083153903006, + "loss": 3.013251304626465, + "step": 6543, + "token_acc": 0.2971545768338258 + }, + { + "epoch": 3.8361184403400763, + "grad_norm": 0.25457714487559724, + "learning_rate": 0.00018920645202641758, + "loss": 2.9969961643218994, + "step": 6544, + "token_acc": 0.2989139344262295 + }, + { + "epoch": 3.8367047786572854, + "grad_norm": 0.21417911366880765, + "learning_rate": 0.0001892020716758307, + "loss": 2.9853835105895996, + "step": 6545, + "token_acc": 0.3015421829254323 + }, + { + "epoch": 3.8372911169744945, + "grad_norm": 0.26402859028122605, + "learning_rate": 0.00018919769048731058, + "loss": 2.975426435470581, + "step": 6546, + "token_acc": 0.3027708668715547 + }, + { + "epoch": 3.837877455291703, + "grad_norm": 0.2388827238170951, + "learning_rate": 0.00018919330846089833, + "loss": 3.0057177543640137, + "step": 6547, + "token_acc": 0.29634797597123114 + }, + { + "epoch": 3.8384637936089123, + "grad_norm": 0.24731542545910584, + "learning_rate": 0.00018918892559663514, + "loss": 3.0001282691955566, + "step": 6548, + "token_acc": 0.2989893251911288 + }, + { + "epoch": 3.8390501319261214, + "grad_norm": 0.26255158036576626, + "learning_rate": 0.00018918454189456216, + "loss": 2.987797975540161, + "step": 6549, + "token_acc": 0.29973290393487456 + }, + { + "epoch": 3.8396364702433305, + "grad_norm": 0.23109189835235516, + "learning_rate": 0.00018918015735472062, + "loss": 2.954200267791748, + "step": 6550, + "token_acc": 0.3035390835439622 + }, + { + "epoch": 3.840222808560539, + "grad_norm": 0.24974193886076582, + "learning_rate": 0.00018917577197715164, + "loss": 2.9989688396453857, + "step": 6551, + "token_acc": 0.29938174795536143 + }, + { + "epoch": 3.8408091468777483, + "grad_norm": 0.24800466375299599, + "learning_rate": 0.00018917138576189646, + "loss": 2.956712245941162, + "step": 6552, + "token_acc": 0.30490720829605444 + }, + { + "epoch": 3.8413954851949574, + "grad_norm": 0.2910677173467684, + "learning_rate": 0.00018916699870899628, + "loss": 2.9893062114715576, + "step": 6553, + "token_acc": 0.30073180846986935 + }, + { + "epoch": 3.8419818235121665, + "grad_norm": 0.27300541051629335, + "learning_rate": 0.0001891626108184923, + "loss": 2.9990646839141846, + "step": 6554, + "token_acc": 0.29844519331665187 + }, + { + "epoch": 3.8425681618293757, + "grad_norm": 0.2777863842559474, + "learning_rate": 0.00018915822209042573, + "loss": 2.9732422828674316, + "step": 6555, + "token_acc": 0.3015398740609634 + }, + { + "epoch": 3.8431545001465848, + "grad_norm": 0.31920648038273947, + "learning_rate": 0.00018915383252483782, + "loss": 2.958232879638672, + "step": 6556, + "token_acc": 0.3027655832736273 + }, + { + "epoch": 3.843740838463794, + "grad_norm": 0.27735658715512135, + "learning_rate": 0.00018914944212176978, + "loss": 2.935795783996582, + "step": 6557, + "token_acc": 0.3069559171335533 + }, + { + "epoch": 3.8443271767810026, + "grad_norm": 0.27684169793614577, + "learning_rate": 0.00018914505088126285, + "loss": 2.9397130012512207, + "step": 6558, + "token_acc": 0.30711435226322736 + }, + { + "epoch": 3.8449135150982117, + "grad_norm": 0.24738124463911165, + "learning_rate": 0.00018914065880335832, + "loss": 3.0078787803649902, + "step": 6559, + "token_acc": 0.2976215159503569 + }, + { + "epoch": 3.8454998534154208, + "grad_norm": 0.28035206463663415, + "learning_rate": 0.0001891362658880974, + "loss": 2.9847705364227295, + "step": 6560, + "token_acc": 0.2998066283504324 + }, + { + "epoch": 3.84608619173263, + "grad_norm": 0.2634102832611171, + "learning_rate": 0.00018913187213552134, + "loss": 2.9645724296569824, + "step": 6561, + "token_acc": 0.30301350719490605 + }, + { + "epoch": 3.8466725300498386, + "grad_norm": 0.23484176992673733, + "learning_rate": 0.0001891274775456715, + "loss": 2.987703323364258, + "step": 6562, + "token_acc": 0.2986919689986353 + }, + { + "epoch": 3.8472588683670477, + "grad_norm": 0.25984591952909364, + "learning_rate": 0.0001891230821185891, + "loss": 2.9654252529144287, + "step": 6563, + "token_acc": 0.30525967344194654 + }, + { + "epoch": 3.847845206684257, + "grad_norm": 0.26629596386013177, + "learning_rate": 0.00018911868585431543, + "loss": 2.943603754043579, + "step": 6564, + "token_acc": 0.3052165743282676 + }, + { + "epoch": 3.848431545001466, + "grad_norm": 0.2391992298775766, + "learning_rate": 0.0001891142887528918, + "loss": 2.9723825454711914, + "step": 6565, + "token_acc": 0.2996885211914188 + }, + { + "epoch": 3.849017883318675, + "grad_norm": 0.2685897502324014, + "learning_rate": 0.0001891098908143595, + "loss": 3.0006003379821777, + "step": 6566, + "token_acc": 0.2978954978218709 + }, + { + "epoch": 3.849604221635884, + "grad_norm": 0.3312508644240777, + "learning_rate": 0.00018910549203875987, + "loss": 2.944103717803955, + "step": 6567, + "token_acc": 0.3060699601854717 + }, + { + "epoch": 3.850190559953093, + "grad_norm": 0.3269699735457189, + "learning_rate": 0.00018910109242613421, + "loss": 2.968503713607788, + "step": 6568, + "token_acc": 0.30211712342791863 + }, + { + "epoch": 3.850776898270302, + "grad_norm": 0.24664538726190666, + "learning_rate": 0.00018909669197652383, + "loss": 3.0190699100494385, + "step": 6569, + "token_acc": 0.29489689976475536 + }, + { + "epoch": 3.851363236587511, + "grad_norm": 0.2693928496394285, + "learning_rate": 0.0001890922906899701, + "loss": 2.9379968643188477, + "step": 6570, + "token_acc": 0.30698960337233383 + }, + { + "epoch": 3.85194957490472, + "grad_norm": 0.27658183039681894, + "learning_rate": 0.0001890878885665144, + "loss": 2.962949752807617, + "step": 6571, + "token_acc": 0.3037013900527682 + }, + { + "epoch": 3.852535913221929, + "grad_norm": 0.2693550361387959, + "learning_rate": 0.00018908348560619796, + "loss": 2.9985125064849854, + "step": 6572, + "token_acc": 0.2967676260299886 + }, + { + "epoch": 3.853122251539138, + "grad_norm": 0.2617317406431663, + "learning_rate": 0.00018907908180906225, + "loss": 2.9859962463378906, + "step": 6573, + "token_acc": 0.3013485742329281 + }, + { + "epoch": 3.853708589856347, + "grad_norm": 0.2340987878679894, + "learning_rate": 0.0001890746771751486, + "loss": 2.9504761695861816, + "step": 6574, + "token_acc": 0.3054859663071361 + }, + { + "epoch": 3.854294928173556, + "grad_norm": 0.23756963622205265, + "learning_rate": 0.00018907027170449837, + "loss": 2.9846694469451904, + "step": 6575, + "token_acc": 0.30020008593941533 + }, + { + "epoch": 3.8548812664907652, + "grad_norm": 0.2740573110573964, + "learning_rate": 0.00018906586539715298, + "loss": 2.985908031463623, + "step": 6576, + "token_acc": 0.3002133420178898 + }, + { + "epoch": 3.8554676048079743, + "grad_norm": 0.2351187850980175, + "learning_rate": 0.0001890614582531538, + "loss": 2.9653494358062744, + "step": 6577, + "token_acc": 0.3045679164231743 + }, + { + "epoch": 3.8560539431251835, + "grad_norm": 0.2516890280930331, + "learning_rate": 0.00018905705027254222, + "loss": 2.95635986328125, + "step": 6578, + "token_acc": 0.30444575924599593 + }, + { + "epoch": 3.856640281442392, + "grad_norm": 0.27630985197699487, + "learning_rate": 0.0001890526414553597, + "loss": 2.980644702911377, + "step": 6579, + "token_acc": 0.3010527659370787 + }, + { + "epoch": 3.8572266197596012, + "grad_norm": 0.22901415240074638, + "learning_rate": 0.00018904823180164755, + "loss": 2.9527535438537598, + "step": 6580, + "token_acc": 0.30582121990754013 + }, + { + "epoch": 3.8578129580768104, + "grad_norm": 0.22480247876338946, + "learning_rate": 0.00018904382131144728, + "loss": 2.9790101051330566, + "step": 6581, + "token_acc": 0.30085785966710726 + }, + { + "epoch": 3.8583992963940195, + "grad_norm": 0.24950726632856918, + "learning_rate": 0.00018903940998480032, + "loss": 2.9796957969665527, + "step": 6582, + "token_acc": 0.30032191233326627 + }, + { + "epoch": 3.858985634711228, + "grad_norm": 0.24375514582605803, + "learning_rate": 0.00018903499782174806, + "loss": 2.9565320014953613, + "step": 6583, + "token_acc": 0.3046423225611923 + }, + { + "epoch": 3.8595719730284372, + "grad_norm": 0.24682260971358824, + "learning_rate": 0.00018903058482233197, + "loss": 2.9611656665802, + "step": 6584, + "token_acc": 0.3038019269743901 + }, + { + "epoch": 3.8601583113456464, + "grad_norm": 0.27799446773964115, + "learning_rate": 0.00018902617098659355, + "loss": 2.993495464324951, + "step": 6585, + "token_acc": 0.29933959969746127 + }, + { + "epoch": 3.8607446496628555, + "grad_norm": 0.2763036721744753, + "learning_rate": 0.00018902175631457417, + "loss": 2.9699997901916504, + "step": 6586, + "token_acc": 0.3037248762961001 + }, + { + "epoch": 3.8613309879800646, + "grad_norm": 0.3033940454127729, + "learning_rate": 0.00018901734080631536, + "loss": 2.9681100845336914, + "step": 6587, + "token_acc": 0.303348460374843 + }, + { + "epoch": 3.8619173262972737, + "grad_norm": 0.2784206850543003, + "learning_rate": 0.00018901292446185859, + "loss": 2.981411933898926, + "step": 6588, + "token_acc": 0.3005357695800493 + }, + { + "epoch": 3.862503664614483, + "grad_norm": 0.2612354992627761, + "learning_rate": 0.00018900850728124536, + "loss": 3.0085861682891846, + "step": 6589, + "token_acc": 0.29694844130866305 + }, + { + "epoch": 3.8630900029316915, + "grad_norm": 0.3149991636738378, + "learning_rate": 0.0001890040892645171, + "loss": 2.9808101654052734, + "step": 6590, + "token_acc": 0.29982053141214526 + }, + { + "epoch": 3.8636763412489006, + "grad_norm": 0.40096297645010476, + "learning_rate": 0.0001889996704117154, + "loss": 2.9964959621429443, + "step": 6591, + "token_acc": 0.2990876897355222 + }, + { + "epoch": 3.8642626795661097, + "grad_norm": 0.3759961880757574, + "learning_rate": 0.00018899525072288168, + "loss": 2.9703972339630127, + "step": 6592, + "token_acc": 0.302446519323957 + }, + { + "epoch": 3.864849017883319, + "grad_norm": 0.30962284190031536, + "learning_rate": 0.00018899083019805754, + "loss": 2.9514222145080566, + "step": 6593, + "token_acc": 0.3035770473022216 + }, + { + "epoch": 3.8654353562005275, + "grad_norm": 0.2797327000604344, + "learning_rate": 0.00018898640883728446, + "loss": 2.94486665725708, + "step": 6594, + "token_acc": 0.3068007112204382 + }, + { + "epoch": 3.8660216945177366, + "grad_norm": 0.3165109061810221, + "learning_rate": 0.00018898198664060395, + "loss": 2.9469738006591797, + "step": 6595, + "token_acc": 0.30442420012298815 + }, + { + "epoch": 3.8666080328349457, + "grad_norm": 0.2686144491908403, + "learning_rate": 0.00018897756360805763, + "loss": 2.9870944023132324, + "step": 6596, + "token_acc": 0.3003847929015563 + }, + { + "epoch": 3.867194371152155, + "grad_norm": 0.27060693351158177, + "learning_rate": 0.00018897313973968697, + "loss": 2.9807047843933105, + "step": 6597, + "token_acc": 0.299737125732762 + }, + { + "epoch": 3.867780709469364, + "grad_norm": 0.2624501927585736, + "learning_rate": 0.00018896871503553355, + "loss": 2.962324380874634, + "step": 6598, + "token_acc": 0.30281936976349905 + }, + { + "epoch": 3.868367047786573, + "grad_norm": 0.22024054527219022, + "learning_rate": 0.00018896428949563896, + "loss": 2.9282186031341553, + "step": 6599, + "token_acc": 0.30940650583624907 + }, + { + "epoch": 3.868953386103782, + "grad_norm": 0.3003255056871707, + "learning_rate": 0.00018895986312004475, + "loss": 3.009282112121582, + "step": 6600, + "token_acc": 0.2969149529720053 + }, + { + "epoch": 3.869539724420991, + "grad_norm": 0.21971218301402018, + "learning_rate": 0.00018895543590879247, + "loss": 2.9734272956848145, + "step": 6601, + "token_acc": 0.30134884118155714 + }, + { + "epoch": 3.8701260627382, + "grad_norm": 0.29897030765720656, + "learning_rate": 0.00018895100786192373, + "loss": 2.9890599250793457, + "step": 6602, + "token_acc": 0.3013773111004432 + }, + { + "epoch": 3.870712401055409, + "grad_norm": 0.2455968303443717, + "learning_rate": 0.0001889465789794802, + "loss": 2.9506888389587402, + "step": 6603, + "token_acc": 0.30582732832007986 + }, + { + "epoch": 3.871298739372618, + "grad_norm": 0.2661505922192627, + "learning_rate": 0.00018894214926150338, + "loss": 2.9331350326538086, + "step": 6604, + "token_acc": 0.3072227088419545 + }, + { + "epoch": 3.871885077689827, + "grad_norm": 0.2764553014798297, + "learning_rate": 0.00018893771870803492, + "loss": 2.9683642387390137, + "step": 6605, + "token_acc": 0.30380053451324573 + }, + { + "epoch": 3.872471416007036, + "grad_norm": 0.24987557665629767, + "learning_rate": 0.0001889332873191164, + "loss": 2.952374219894409, + "step": 6606, + "token_acc": 0.3028951541896259 + }, + { + "epoch": 3.873057754324245, + "grad_norm": 0.3125239165562528, + "learning_rate": 0.00018892885509478954, + "loss": 2.9447720050811768, + "step": 6607, + "token_acc": 0.3066290928609769 + }, + { + "epoch": 3.873644092641454, + "grad_norm": 0.2447664245438412, + "learning_rate": 0.0001889244220350959, + "loss": 3.016688346862793, + "step": 6608, + "token_acc": 0.2944239226033421 + }, + { + "epoch": 3.8742304309586633, + "grad_norm": 0.3380046160889902, + "learning_rate": 0.00018891998814007715, + "loss": 2.9483423233032227, + "step": 6609, + "token_acc": 0.3063433998345527 + }, + { + "epoch": 3.8748167692758724, + "grad_norm": 0.29053080853390506, + "learning_rate": 0.0001889155534097749, + "loss": 2.9640774726867676, + "step": 6610, + "token_acc": 0.30336190224351495 + }, + { + "epoch": 3.875403107593081, + "grad_norm": 0.30449922224142834, + "learning_rate": 0.00018891111784423087, + "loss": 3.0250210762023926, + "step": 6611, + "token_acc": 0.2939440106646353 + }, + { + "epoch": 3.87598944591029, + "grad_norm": 0.23451655621413303, + "learning_rate": 0.00018890668144348668, + "loss": 3.0070528984069824, + "step": 6612, + "token_acc": 0.2973025568108105 + }, + { + "epoch": 3.8765757842274993, + "grad_norm": 0.2678397291227879, + "learning_rate": 0.00018890224420758407, + "loss": 3.0150537490844727, + "step": 6613, + "token_acc": 0.2967995867657425 + }, + { + "epoch": 3.8771621225447084, + "grad_norm": 0.26214380594391734, + "learning_rate": 0.00018889780613656464, + "loss": 2.9843482971191406, + "step": 6614, + "token_acc": 0.30008882087611627 + }, + { + "epoch": 3.8777484608619175, + "grad_norm": 0.32610892180228557, + "learning_rate": 0.00018889336723047008, + "loss": 3.0186028480529785, + "step": 6615, + "token_acc": 0.29446573504174245 + }, + { + "epoch": 3.878334799179126, + "grad_norm": 0.34055448049035714, + "learning_rate": 0.00018888892748934218, + "loss": 2.985410690307617, + "step": 6616, + "token_acc": 0.2982729575762252 + }, + { + "epoch": 3.8789211374963353, + "grad_norm": 0.26096876699023536, + "learning_rate": 0.00018888448691322253, + "loss": 2.9608302116394043, + "step": 6617, + "token_acc": 0.30279321940841397 + }, + { + "epoch": 3.8795074758135444, + "grad_norm": 0.2592938861485805, + "learning_rate": 0.00018888004550215293, + "loss": 2.9746205806732178, + "step": 6618, + "token_acc": 0.3006277431866898 + }, + { + "epoch": 3.8800938141307535, + "grad_norm": 0.29374199501137177, + "learning_rate": 0.00018887560325617507, + "loss": 2.993506669998169, + "step": 6619, + "token_acc": 0.2983843683473608 + }, + { + "epoch": 3.8806801524479626, + "grad_norm": 0.23343553620098012, + "learning_rate": 0.00018887116017533067, + "loss": 2.976466178894043, + "step": 6620, + "token_acc": 0.30247849061445076 + }, + { + "epoch": 3.8812664907651717, + "grad_norm": 0.28125736025367254, + "learning_rate": 0.0001888667162596615, + "loss": 2.96417498588562, + "step": 6621, + "token_acc": 0.3029697081564429 + }, + { + "epoch": 3.8818528290823804, + "grad_norm": 0.2538792065483055, + "learning_rate": 0.00018886227150920922, + "loss": 3.009347915649414, + "step": 6622, + "token_acc": 0.2963773363137403 + }, + { + "epoch": 3.8824391673995895, + "grad_norm": 0.26553775339762836, + "learning_rate": 0.0001888578259240157, + "loss": 2.9885189533233643, + "step": 6623, + "token_acc": 0.3004351655864218 + }, + { + "epoch": 3.8830255057167986, + "grad_norm": 0.23495111482500267, + "learning_rate": 0.0001888533795041226, + "loss": 2.9778409004211426, + "step": 6624, + "token_acc": 0.3005599203160312 + }, + { + "epoch": 3.8836118440340077, + "grad_norm": 0.23949151438282304, + "learning_rate": 0.00018884893224957176, + "loss": 2.9597413539886475, + "step": 6625, + "token_acc": 0.30309328520880435 + }, + { + "epoch": 3.8841981823512164, + "grad_norm": 0.23497873600284389, + "learning_rate": 0.00018884448416040493, + "loss": 2.998344898223877, + "step": 6626, + "token_acc": 0.2976496127914501 + }, + { + "epoch": 3.8847845206684255, + "grad_norm": 0.21587033977052048, + "learning_rate": 0.0001888400352366639, + "loss": 3.004761219024658, + "step": 6627, + "token_acc": 0.29682110343714446 + }, + { + "epoch": 3.8853708589856346, + "grad_norm": 0.24969744940200198, + "learning_rate": 0.00018883558547839042, + "loss": 3.0008902549743652, + "step": 6628, + "token_acc": 0.297086806206952 + }, + { + "epoch": 3.8859571973028437, + "grad_norm": 0.2412223863704887, + "learning_rate": 0.00018883113488562633, + "loss": 2.95741605758667, + "step": 6629, + "token_acc": 0.30567092028304804 + }, + { + "epoch": 3.886543535620053, + "grad_norm": 0.2660658390821338, + "learning_rate": 0.00018882668345841344, + "loss": 3.001864433288574, + "step": 6630, + "token_acc": 0.2982506174906853 + }, + { + "epoch": 3.887129873937262, + "grad_norm": 0.2463285791102486, + "learning_rate": 0.00018882223119679354, + "loss": 2.9841654300689697, + "step": 6631, + "token_acc": 0.299462497075899 + }, + { + "epoch": 3.887716212254471, + "grad_norm": 0.22612302055758363, + "learning_rate": 0.00018881777810080848, + "loss": 2.971144676208496, + "step": 6632, + "token_acc": 0.3019725294410264 + }, + { + "epoch": 3.8883025505716797, + "grad_norm": 0.2517708960294958, + "learning_rate": 0.0001888133241705001, + "loss": 2.954951286315918, + "step": 6633, + "token_acc": 0.3054076485986082 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.2653754782302431, + "learning_rate": 0.00018880886940591016, + "loss": 2.9475808143615723, + "step": 6634, + "token_acc": 0.30563719727251354 + }, + { + "epoch": 3.889475227206098, + "grad_norm": 0.2817371743030177, + "learning_rate": 0.0001888044138070806, + "loss": 3.0366148948669434, + "step": 6635, + "token_acc": 0.2947911516898253 + }, + { + "epoch": 3.890061565523307, + "grad_norm": 0.2836071043199749, + "learning_rate": 0.00018879995737405324, + "loss": 2.9504294395446777, + "step": 6636, + "token_acc": 0.30356567036228216 + }, + { + "epoch": 3.8906479038405157, + "grad_norm": 0.26358054784821444, + "learning_rate": 0.00018879550010686994, + "loss": 2.984335422515869, + "step": 6637, + "token_acc": 0.30244030424572416 + }, + { + "epoch": 3.891234242157725, + "grad_norm": 0.24424673486908102, + "learning_rate": 0.00018879104200557255, + "loss": 2.9907312393188477, + "step": 6638, + "token_acc": 0.2999737824909844 + }, + { + "epoch": 3.891820580474934, + "grad_norm": 0.2573698541705401, + "learning_rate": 0.000188786583070203, + "loss": 2.9807116985321045, + "step": 6639, + "token_acc": 0.3016705354085011 + }, + { + "epoch": 3.892406918792143, + "grad_norm": 0.2457970213698467, + "learning_rate": 0.0001887821233008031, + "loss": 2.9726996421813965, + "step": 6640, + "token_acc": 0.30085238733521596 + }, + { + "epoch": 3.892993257109352, + "grad_norm": 0.27429160107036615, + "learning_rate": 0.00018877766269741485, + "loss": 2.983583927154541, + "step": 6641, + "token_acc": 0.3002059745677536 + }, + { + "epoch": 3.8935795954265613, + "grad_norm": 0.286163928529811, + "learning_rate": 0.00018877320126008002, + "loss": 2.9630837440490723, + "step": 6642, + "token_acc": 0.30331265357381015 + }, + { + "epoch": 3.8941659337437704, + "grad_norm": 0.2767036903622396, + "learning_rate": 0.00018876873898884063, + "loss": 3.0131726264953613, + "step": 6643, + "token_acc": 0.29637767610157717 + }, + { + "epoch": 3.894752272060979, + "grad_norm": 0.29226693828062505, + "learning_rate": 0.00018876427588373855, + "loss": 2.938687801361084, + "step": 6644, + "token_acc": 0.3063164223898699 + }, + { + "epoch": 3.895338610378188, + "grad_norm": 0.2549602347961865, + "learning_rate": 0.0001887598119448157, + "loss": 2.9561405181884766, + "step": 6645, + "token_acc": 0.3039101709813707 + }, + { + "epoch": 3.8959249486953973, + "grad_norm": 0.2684099165668575, + "learning_rate": 0.000188755347172114, + "loss": 2.958974838256836, + "step": 6646, + "token_acc": 0.3040535567844902 + }, + { + "epoch": 3.8965112870126064, + "grad_norm": 0.28050509497263837, + "learning_rate": 0.00018875088156567547, + "loss": 2.975895881652832, + "step": 6647, + "token_acc": 0.3009517259514731 + }, + { + "epoch": 3.897097625329815, + "grad_norm": 0.3307004023140116, + "learning_rate": 0.00018874641512554193, + "loss": 2.9834132194519043, + "step": 6648, + "token_acc": 0.3017969558301977 + }, + { + "epoch": 3.897683963647024, + "grad_norm": 0.2978112259235787, + "learning_rate": 0.00018874194785175545, + "loss": 3.007200241088867, + "step": 6649, + "token_acc": 0.29563803614278095 + }, + { + "epoch": 3.8982703019642333, + "grad_norm": 0.2830289180697127, + "learning_rate": 0.00018873747974435795, + "loss": 2.954601287841797, + "step": 6650, + "token_acc": 0.3065585151677867 + }, + { + "epoch": 3.8988566402814424, + "grad_norm": 0.2730273361561211, + "learning_rate": 0.0001887330108033914, + "loss": 3.034597873687744, + "step": 6651, + "token_acc": 0.2936712306103552 + }, + { + "epoch": 3.8994429785986515, + "grad_norm": 0.2489265504183231, + "learning_rate": 0.00018872854102889778, + "loss": 2.9668939113616943, + "step": 6652, + "token_acc": 0.30229206581423684 + }, + { + "epoch": 3.9000293169158606, + "grad_norm": 0.28273924084914726, + "learning_rate": 0.00018872407042091907, + "loss": 2.983332872390747, + "step": 6653, + "token_acc": 0.2992593430202974 + }, + { + "epoch": 3.9006156552330697, + "grad_norm": 0.24252612102820797, + "learning_rate": 0.0001887195989794973, + "loss": 2.9591875076293945, + "step": 6654, + "token_acc": 0.3032617946535689 + }, + { + "epoch": 3.9012019935502784, + "grad_norm": 0.21934129097145666, + "learning_rate": 0.00018871512670467445, + "loss": 2.950644016265869, + "step": 6655, + "token_acc": 0.30633798271905166 + }, + { + "epoch": 3.9017883318674875, + "grad_norm": 0.2587007173024275, + "learning_rate": 0.00018871065359649252, + "loss": 2.9710941314697266, + "step": 6656, + "token_acc": 0.3023864611225206 + }, + { + "epoch": 3.9023746701846966, + "grad_norm": 0.22998476477185303, + "learning_rate": 0.0001887061796549935, + "loss": 3.008894920349121, + "step": 6657, + "token_acc": 0.2977257352115325 + }, + { + "epoch": 3.9029610085019057, + "grad_norm": 0.2740375265682306, + "learning_rate": 0.0001887017048802195, + "loss": 2.9860899448394775, + "step": 6658, + "token_acc": 0.29987601818399967 + }, + { + "epoch": 3.9035473468191144, + "grad_norm": 0.2331201957062955, + "learning_rate": 0.0001886972292722125, + "loss": 2.916191816329956, + "step": 6659, + "token_acc": 0.3096351209365017 + }, + { + "epoch": 3.9041336851363235, + "grad_norm": 0.25891759390324937, + "learning_rate": 0.00018869275283101456, + "loss": 2.936060905456543, + "step": 6660, + "token_acc": 0.3073071549338654 + }, + { + "epoch": 3.9047200234535326, + "grad_norm": 0.24260118343385476, + "learning_rate": 0.00018868827555666771, + "loss": 2.9556992053985596, + "step": 6661, + "token_acc": 0.3028853715952322 + }, + { + "epoch": 3.9053063617707418, + "grad_norm": 0.2729552877252817, + "learning_rate": 0.00018868379744921404, + "loss": 2.9791150093078613, + "step": 6662, + "token_acc": 0.3010581472521356 + }, + { + "epoch": 3.905892700087951, + "grad_norm": 0.3124095888779568, + "learning_rate": 0.00018867931850869555, + "loss": 2.9738237857818604, + "step": 6663, + "token_acc": 0.30141635610932693 + }, + { + "epoch": 3.90647903840516, + "grad_norm": 0.28528815077524716, + "learning_rate": 0.0001886748387351544, + "loss": 2.9784393310546875, + "step": 6664, + "token_acc": 0.302279112243603 + }, + { + "epoch": 3.9070653767223686, + "grad_norm": 0.26914262399552624, + "learning_rate": 0.00018867035812863262, + "loss": 2.9897382259368896, + "step": 6665, + "token_acc": 0.30008254153121217 + }, + { + "epoch": 3.9076517150395778, + "grad_norm": 0.2620991239098661, + "learning_rate": 0.00018866587668917232, + "loss": 2.9942331314086914, + "step": 6666, + "token_acc": 0.2996946078960036 + }, + { + "epoch": 3.908238053356787, + "grad_norm": 0.382617384140643, + "learning_rate": 0.00018866139441681558, + "loss": 2.9819374084472656, + "step": 6667, + "token_acc": 0.30054543678639545 + }, + { + "epoch": 3.908824391673996, + "grad_norm": 0.40462677720926665, + "learning_rate": 0.0001886569113116045, + "loss": 2.9759955406188965, + "step": 6668, + "token_acc": 0.3001475040757705 + }, + { + "epoch": 3.909410729991205, + "grad_norm": 0.2817685339523738, + "learning_rate": 0.00018865242737358122, + "loss": 2.9759418964385986, + "step": 6669, + "token_acc": 0.30132286924130025 + }, + { + "epoch": 3.9099970683084138, + "grad_norm": 0.28204952404280176, + "learning_rate": 0.00018864794260278785, + "loss": 3.0075559616088867, + "step": 6670, + "token_acc": 0.29701503553278086 + }, + { + "epoch": 3.910583406625623, + "grad_norm": 0.3088966121298939, + "learning_rate": 0.00018864345699926648, + "loss": 3.008556842803955, + "step": 6671, + "token_acc": 0.29775123271727777 + }, + { + "epoch": 3.911169744942832, + "grad_norm": 0.24020410682945567, + "learning_rate": 0.00018863897056305932, + "loss": 2.972257614135742, + "step": 6672, + "token_acc": 0.30153817537369343 + }, + { + "epoch": 3.911756083260041, + "grad_norm": 0.3318332743695136, + "learning_rate": 0.00018863448329420844, + "loss": 2.9934463500976562, + "step": 6673, + "token_acc": 0.30060680568890535 + }, + { + "epoch": 3.91234242157725, + "grad_norm": 0.24597747425358377, + "learning_rate": 0.00018862999519275606, + "loss": 2.955934524536133, + "step": 6674, + "token_acc": 0.3042511874458761 + }, + { + "epoch": 3.9129287598944593, + "grad_norm": 0.3181839379228489, + "learning_rate": 0.00018862550625874428, + "loss": 2.9687986373901367, + "step": 6675, + "token_acc": 0.3025857144670534 + }, + { + "epoch": 3.913515098211668, + "grad_norm": 0.23861709256364186, + "learning_rate": 0.00018862101649221532, + "loss": 2.9907801151275635, + "step": 6676, + "token_acc": 0.3006237324455669 + }, + { + "epoch": 3.914101436528877, + "grad_norm": 0.2739791161942881, + "learning_rate": 0.0001886165258932113, + "loss": 2.941749095916748, + "step": 6677, + "token_acc": 0.3073049247359243 + }, + { + "epoch": 3.914687774846086, + "grad_norm": 0.23549711403840526, + "learning_rate": 0.00018861203446177442, + "loss": 2.9294729232788086, + "step": 6678, + "token_acc": 0.30900740598249493 + }, + { + "epoch": 3.9152741131632953, + "grad_norm": 0.26645897253211187, + "learning_rate": 0.00018860754219794692, + "loss": 2.981844902038574, + "step": 6679, + "token_acc": 0.30028378338843215 + }, + { + "epoch": 3.915860451480504, + "grad_norm": 0.23248579314557088, + "learning_rate": 0.00018860304910177096, + "loss": 2.988368511199951, + "step": 6680, + "token_acc": 0.29872042760393996 + }, + { + "epoch": 3.916446789797713, + "grad_norm": 0.26265769508347514, + "learning_rate": 0.00018859855517328871, + "loss": 2.988191604614258, + "step": 6681, + "token_acc": 0.2993813847303027 + }, + { + "epoch": 3.917033128114922, + "grad_norm": 0.2721945776908509, + "learning_rate": 0.00018859406041254247, + "loss": 3.002068519592285, + "step": 6682, + "token_acc": 0.2966658763635985 + }, + { + "epoch": 3.9176194664321313, + "grad_norm": 0.23651154349850323, + "learning_rate": 0.0001885895648195744, + "loss": 3.011507034301758, + "step": 6683, + "token_acc": 0.29744212177747625 + }, + { + "epoch": 3.9182058047493404, + "grad_norm": 0.2857830774184962, + "learning_rate": 0.00018858506839442672, + "loss": 2.9982192516326904, + "step": 6684, + "token_acc": 0.29824278616795313 + }, + { + "epoch": 3.9187921430665495, + "grad_norm": 0.27910292689126925, + "learning_rate": 0.0001885805711371417, + "loss": 2.979325771331787, + "step": 6685, + "token_acc": 0.30059245157397585 + }, + { + "epoch": 3.9193784813837587, + "grad_norm": 0.23210769833271055, + "learning_rate": 0.00018857607304776158, + "loss": 2.954697608947754, + "step": 6686, + "token_acc": 0.3069179363806469 + }, + { + "epoch": 3.9199648197009673, + "grad_norm": 0.2426971957680676, + "learning_rate": 0.00018857157412632863, + "loss": 2.9683780670166016, + "step": 6687, + "token_acc": 0.30241355121976565 + }, + { + "epoch": 3.9205511580181764, + "grad_norm": 0.27815189472815216, + "learning_rate": 0.00018856707437288507, + "loss": 2.941561222076416, + "step": 6688, + "token_acc": 0.3067292128617168 + }, + { + "epoch": 3.9211374963353856, + "grad_norm": 0.2838777392455147, + "learning_rate": 0.0001885625737874732, + "loss": 2.9664716720581055, + "step": 6689, + "token_acc": 0.3022922711964925 + }, + { + "epoch": 3.9217238346525947, + "grad_norm": 0.24015118604739444, + "learning_rate": 0.0001885580723701353, + "loss": 2.997640371322632, + "step": 6690, + "token_acc": 0.29694977159500663 + }, + { + "epoch": 3.9223101729698033, + "grad_norm": 0.26932663321063127, + "learning_rate": 0.0001885535701209136, + "loss": 3.0059115886688232, + "step": 6691, + "token_acc": 0.2984698069105152 + }, + { + "epoch": 3.9228965112870124, + "grad_norm": 0.2450360482989737, + "learning_rate": 0.00018854906703985052, + "loss": 3.0054855346679688, + "step": 6692, + "token_acc": 0.2972920902264788 + }, + { + "epoch": 3.9234828496042216, + "grad_norm": 0.24135419153231888, + "learning_rate": 0.0001885445631269882, + "loss": 2.9718894958496094, + "step": 6693, + "token_acc": 0.30213827005763666 + }, + { + "epoch": 3.9240691879214307, + "grad_norm": 0.20733617101589707, + "learning_rate": 0.00018854005838236907, + "loss": 2.9810047149658203, + "step": 6694, + "token_acc": 0.30001229526423046 + }, + { + "epoch": 3.92465552623864, + "grad_norm": 0.2482746054414359, + "learning_rate": 0.00018853555280603536, + "loss": 3.0050950050354004, + "step": 6695, + "token_acc": 0.2962661080429125 + }, + { + "epoch": 3.925241864555849, + "grad_norm": 0.2437572470359346, + "learning_rate": 0.00018853104639802946, + "loss": 2.9590678215026855, + "step": 6696, + "token_acc": 0.30386676625129394 + }, + { + "epoch": 3.925828202873058, + "grad_norm": 0.22526899777181897, + "learning_rate": 0.0001885265391583937, + "loss": 2.9736289978027344, + "step": 6697, + "token_acc": 0.30161480834697085 + }, + { + "epoch": 3.9264145411902667, + "grad_norm": 0.2248672416308465, + "learning_rate": 0.00018852203108717035, + "loss": 2.9654767513275146, + "step": 6698, + "token_acc": 0.3020724940435782 + }, + { + "epoch": 3.927000879507476, + "grad_norm": 0.23801344476314878, + "learning_rate": 0.0001885175221844018, + "loss": 2.974339485168457, + "step": 6699, + "token_acc": 0.3018122724374139 + }, + { + "epoch": 3.927587217824685, + "grad_norm": 0.24412606760224878, + "learning_rate": 0.00018851301245013043, + "loss": 2.953859567642212, + "step": 6700, + "token_acc": 0.3060422800456221 + }, + { + "epoch": 3.928173556141894, + "grad_norm": 0.3448065097607578, + "learning_rate": 0.0001885085018843986, + "loss": 3.0057570934295654, + "step": 6701, + "token_acc": 0.29516110301725046 + }, + { + "epoch": 3.9287598944591027, + "grad_norm": 0.5342747009782947, + "learning_rate": 0.00018850399048724864, + "loss": 3.0397000312805176, + "step": 6702, + "token_acc": 0.2924180072658666 + }, + { + "epoch": 3.929346232776312, + "grad_norm": 0.29935721550933975, + "learning_rate": 0.00018849947825872295, + "loss": 3.025165557861328, + "step": 6703, + "token_acc": 0.2957742319957909 + }, + { + "epoch": 3.929932571093521, + "grad_norm": 0.35227942705131465, + "learning_rate": 0.0001884949651988639, + "loss": 2.9858784675598145, + "step": 6704, + "token_acc": 0.30065772591602175 + }, + { + "epoch": 3.93051890941073, + "grad_norm": 0.31267133984855744, + "learning_rate": 0.00018849045130771392, + "loss": 2.963439702987671, + "step": 6705, + "token_acc": 0.3041682771000267 + }, + { + "epoch": 3.931105247727939, + "grad_norm": 0.2753930172292787, + "learning_rate": 0.00018848593658531542, + "loss": 2.9809625148773193, + "step": 6706, + "token_acc": 0.299745481656683 + }, + { + "epoch": 3.9316915860451482, + "grad_norm": 0.22391815326384154, + "learning_rate": 0.00018848142103171074, + "loss": 2.9947967529296875, + "step": 6707, + "token_acc": 0.2974418261043125 + }, + { + "epoch": 3.9322779243623573, + "grad_norm": 0.29136291934432906, + "learning_rate": 0.00018847690464694235, + "loss": 3.0012121200561523, + "step": 6708, + "token_acc": 0.29734666065536813 + }, + { + "epoch": 3.932864262679566, + "grad_norm": 0.23693068982099624, + "learning_rate": 0.00018847238743105265, + "loss": 2.984525203704834, + "step": 6709, + "token_acc": 0.2996593308376834 + }, + { + "epoch": 3.933450600996775, + "grad_norm": 0.257219104597974, + "learning_rate": 0.00018846786938408412, + "loss": 3.01507830619812, + "step": 6710, + "token_acc": 0.2973228338167252 + }, + { + "epoch": 3.9340369393139842, + "grad_norm": 0.2712616398425281, + "learning_rate": 0.00018846335050607915, + "loss": 2.991961717605591, + "step": 6711, + "token_acc": 0.29910618836793335 + }, + { + "epoch": 3.9346232776311933, + "grad_norm": 0.2175791430664531, + "learning_rate": 0.0001884588307970802, + "loss": 2.942702531814575, + "step": 6712, + "token_acc": 0.3064930947913098 + }, + { + "epoch": 3.935209615948402, + "grad_norm": 0.27374914362069525, + "learning_rate": 0.00018845431025712976, + "loss": 2.9535255432128906, + "step": 6713, + "token_acc": 0.3045037567597339 + }, + { + "epoch": 3.935795954265611, + "grad_norm": 0.2146644429450761, + "learning_rate": 0.00018844978888627026, + "loss": 3.003509044647217, + "step": 6714, + "token_acc": 0.2983390016924991 + }, + { + "epoch": 3.9363822925828202, + "grad_norm": 0.2548154634490993, + "learning_rate": 0.00018844526668454416, + "loss": 2.9654130935668945, + "step": 6715, + "token_acc": 0.3020009611603827 + }, + { + "epoch": 3.9369686309000294, + "grad_norm": 0.23083478923313672, + "learning_rate": 0.00018844074365199397, + "loss": 2.995351791381836, + "step": 6716, + "token_acc": 0.29794829762044156 + }, + { + "epoch": 3.9375549692172385, + "grad_norm": 0.28509096399661604, + "learning_rate": 0.0001884362197886622, + "loss": 2.967449188232422, + "step": 6717, + "token_acc": 0.30224573251203896 + }, + { + "epoch": 3.9381413075344476, + "grad_norm": 0.23592641383866594, + "learning_rate": 0.00018843169509459129, + "loss": 3.0169780254364014, + "step": 6718, + "token_acc": 0.29644500362274295 + }, + { + "epoch": 3.9387276458516562, + "grad_norm": 0.2729500266007118, + "learning_rate": 0.00018842716956982375, + "loss": 2.928776741027832, + "step": 6719, + "token_acc": 0.3085829506698908 + }, + { + "epoch": 3.9393139841688654, + "grad_norm": 0.24984817604975365, + "learning_rate": 0.00018842264321440212, + "loss": 2.991912364959717, + "step": 6720, + "token_acc": 0.29745179425208845 + }, + { + "epoch": 3.9399003224860745, + "grad_norm": 0.2650022539419051, + "learning_rate": 0.00018841811602836894, + "loss": 3.01686429977417, + "step": 6721, + "token_acc": 0.29551065289161355 + }, + { + "epoch": 3.9404866608032836, + "grad_norm": 0.28380291512288974, + "learning_rate": 0.00018841358801176668, + "loss": 3.016180992126465, + "step": 6722, + "token_acc": 0.29580694303870114 + }, + { + "epoch": 3.9410729991204922, + "grad_norm": 0.24804405403461538, + "learning_rate": 0.0001884090591646379, + "loss": 2.9789557456970215, + "step": 6723, + "token_acc": 0.301797160440482 + }, + { + "epoch": 3.9416593374377014, + "grad_norm": 0.2399389473635261, + "learning_rate": 0.00018840452948702514, + "loss": 3.006746292114258, + "step": 6724, + "token_acc": 0.29700139451204216 + }, + { + "epoch": 3.9422456757549105, + "grad_norm": 0.2825573584632716, + "learning_rate": 0.00018839999897897093, + "loss": 2.9718286991119385, + "step": 6725, + "token_acc": 0.3027985607859583 + }, + { + "epoch": 3.9428320140721196, + "grad_norm": 0.2378724108255232, + "learning_rate": 0.00018839546764051786, + "loss": 2.9617390632629395, + "step": 6726, + "token_acc": 0.30351177763696247 + }, + { + "epoch": 3.9434183523893287, + "grad_norm": 0.2536960931711908, + "learning_rate": 0.0001883909354717085, + "loss": 3.0065436363220215, + "step": 6727, + "token_acc": 0.29690043880731243 + }, + { + "epoch": 3.944004690706538, + "grad_norm": 0.24768087383794138, + "learning_rate": 0.00018838640247258545, + "loss": 2.9516966342926025, + "step": 6728, + "token_acc": 0.3045352611691942 + }, + { + "epoch": 3.944591029023747, + "grad_norm": 0.24610096818774838, + "learning_rate": 0.0001883818686431912, + "loss": 2.943861961364746, + "step": 6729, + "token_acc": 0.30548240116403785 + }, + { + "epoch": 3.9451773673409556, + "grad_norm": 0.2977890103816516, + "learning_rate": 0.00018837733398356838, + "loss": 2.9800877571105957, + "step": 6730, + "token_acc": 0.3013265213501212 + }, + { + "epoch": 3.9457637056581647, + "grad_norm": 0.22772336913162497, + "learning_rate": 0.00018837279849375963, + "loss": 2.9788689613342285, + "step": 6731, + "token_acc": 0.3010798768067865 + }, + { + "epoch": 3.946350043975374, + "grad_norm": 0.28514128536165245, + "learning_rate": 0.00018836826217380752, + "loss": 2.997581958770752, + "step": 6732, + "token_acc": 0.29697987419489247 + }, + { + "epoch": 3.946936382292583, + "grad_norm": 0.2408616192881167, + "learning_rate": 0.00018836372502375467, + "loss": 3.000856399536133, + "step": 6733, + "token_acc": 0.29821835833699495 + }, + { + "epoch": 3.9475227206097916, + "grad_norm": 0.2810211890300591, + "learning_rate": 0.00018835918704364365, + "loss": 2.9607861042022705, + "step": 6734, + "token_acc": 0.3031808366901115 + }, + { + "epoch": 3.9481090589270007, + "grad_norm": 0.3721457343335993, + "learning_rate": 0.00018835464823351716, + "loss": 2.9481794834136963, + "step": 6735, + "token_acc": 0.30523759604028716 + }, + { + "epoch": 3.94869539724421, + "grad_norm": 0.3226434525145338, + "learning_rate": 0.0001883501085934178, + "loss": 2.965211868286133, + "step": 6736, + "token_acc": 0.3030965252925201 + }, + { + "epoch": 3.949281735561419, + "grad_norm": 0.2849766560330644, + "learning_rate": 0.00018834556812338823, + "loss": 2.985299587249756, + "step": 6737, + "token_acc": 0.3008067415555964 + }, + { + "epoch": 3.949868073878628, + "grad_norm": 0.34291130232988004, + "learning_rate": 0.00018834102682347112, + "loss": 3.019843816757202, + "step": 6738, + "token_acc": 0.2951466191718084 + }, + { + "epoch": 3.950454412195837, + "grad_norm": 0.34828503828766616, + "learning_rate": 0.00018833648469370907, + "loss": 2.9998016357421875, + "step": 6739, + "token_acc": 0.29816482091853563 + }, + { + "epoch": 3.9510407505130463, + "grad_norm": 0.26848142731771146, + "learning_rate": 0.0001883319417341448, + "loss": 3.0172119140625, + "step": 6740, + "token_acc": 0.2944113093576305 + }, + { + "epoch": 3.951627088830255, + "grad_norm": 0.3757257089541708, + "learning_rate": 0.00018832739794482096, + "loss": 3.0265755653381348, + "step": 6741, + "token_acc": 0.293521873528629 + }, + { + "epoch": 3.952213427147464, + "grad_norm": 0.21302904121821553, + "learning_rate": 0.00018832285332578023, + "loss": 2.987481117248535, + "step": 6742, + "token_acc": 0.30050292496107067 + }, + { + "epoch": 3.952799765464673, + "grad_norm": 0.2940721433710636, + "learning_rate": 0.00018831830787706535, + "loss": 2.992220401763916, + "step": 6743, + "token_acc": 0.3006512338892902 + }, + { + "epoch": 3.9533861037818823, + "grad_norm": 0.21356996805635348, + "learning_rate": 0.00018831376159871894, + "loss": 3.0146684646606445, + "step": 6744, + "token_acc": 0.29469122426868904 + }, + { + "epoch": 3.953972442099091, + "grad_norm": 0.2782581852332759, + "learning_rate": 0.00018830921449078373, + "loss": 2.963684558868408, + "step": 6745, + "token_acc": 0.3026602225858913 + }, + { + "epoch": 3.9545587804163, + "grad_norm": 0.2618278956787487, + "learning_rate": 0.00018830466655330247, + "loss": 2.9898948669433594, + "step": 6746, + "token_acc": 0.2990431303492121 + }, + { + "epoch": 3.955145118733509, + "grad_norm": 0.26411360515532745, + "learning_rate": 0.00018830011778631786, + "loss": 2.986818790435791, + "step": 6747, + "token_acc": 0.30089953258532726 + }, + { + "epoch": 3.9557314570507183, + "grad_norm": 0.27602320964989147, + "learning_rate": 0.00018829556818987265, + "loss": 2.994004726409912, + "step": 6748, + "token_acc": 0.2989589825878234 + }, + { + "epoch": 3.9563177953679274, + "grad_norm": 0.2914662116213642, + "learning_rate": 0.0001882910177640095, + "loss": 2.9968347549438477, + "step": 6749, + "token_acc": 0.29892500584235954 + }, + { + "epoch": 3.9569041336851365, + "grad_norm": 0.2707021838356128, + "learning_rate": 0.00018828646650877128, + "loss": 2.997046709060669, + "step": 6750, + "token_acc": 0.29701084302041614 + }, + { + "epoch": 3.9574904720023456, + "grad_norm": 0.2909293188522531, + "learning_rate": 0.00018828191442420063, + "loss": 3.0266470909118652, + "step": 6751, + "token_acc": 0.29594473590095915 + }, + { + "epoch": 3.9580768103195543, + "grad_norm": 0.24077010139663063, + "learning_rate": 0.00018827736151034037, + "loss": 2.9700400829315186, + "step": 6752, + "token_acc": 0.30128910991493 + }, + { + "epoch": 3.9586631486367634, + "grad_norm": 0.2707599993823994, + "learning_rate": 0.00018827280776723324, + "loss": 2.9679665565490723, + "step": 6753, + "token_acc": 0.303233648506837 + }, + { + "epoch": 3.9592494869539725, + "grad_norm": 0.27108208792957694, + "learning_rate": 0.00018826825319492204, + "loss": 2.9435768127441406, + "step": 6754, + "token_acc": 0.3064626511542446 + }, + { + "epoch": 3.9598358252711816, + "grad_norm": 0.2625951319089346, + "learning_rate": 0.00018826369779344955, + "loss": 3.0129075050354004, + "step": 6755, + "token_acc": 0.29509989870034703 + }, + { + "epoch": 3.9604221635883903, + "grad_norm": 0.2982722701120704, + "learning_rate": 0.00018825914156285855, + "loss": 2.9858531951904297, + "step": 6756, + "token_acc": 0.3011798745849476 + }, + { + "epoch": 3.9610085019055994, + "grad_norm": 0.3671582135871774, + "learning_rate": 0.00018825458450319184, + "loss": 2.972569465637207, + "step": 6757, + "token_acc": 0.30175144353905764 + }, + { + "epoch": 3.9615948402228085, + "grad_norm": 0.31822372583403363, + "learning_rate": 0.00018825002661449223, + "loss": 2.9502756595611572, + "step": 6758, + "token_acc": 0.3049355800898979 + }, + { + "epoch": 3.9621811785400176, + "grad_norm": 0.29943348317112517, + "learning_rate": 0.00018824546789680255, + "loss": 2.9721460342407227, + "step": 6759, + "token_acc": 0.3020162776848255 + }, + { + "epoch": 3.9627675168572267, + "grad_norm": 0.33220010175026204, + "learning_rate": 0.00018824090835016565, + "loss": 2.9790258407592773, + "step": 6760, + "token_acc": 0.3010301650244124 + }, + { + "epoch": 3.963353855174436, + "grad_norm": 0.24525993244889766, + "learning_rate": 0.00018823634797462426, + "loss": 2.9715735912323, + "step": 6761, + "token_acc": 0.3028212949762525 + }, + { + "epoch": 3.963940193491645, + "grad_norm": 0.2807718891394711, + "learning_rate": 0.0001882317867702213, + "loss": 2.9553141593933105, + "step": 6762, + "token_acc": 0.3049802690324625 + }, + { + "epoch": 3.9645265318088536, + "grad_norm": 0.23330554934775408, + "learning_rate": 0.00018822722473699958, + "loss": 2.979844093322754, + "step": 6763, + "token_acc": 0.3016093044829673 + }, + { + "epoch": 3.9651128701260627, + "grad_norm": 0.31647614568306526, + "learning_rate": 0.000188222661875002, + "loss": 2.9736900329589844, + "step": 6764, + "token_acc": 0.3008516382378206 + }, + { + "epoch": 3.965699208443272, + "grad_norm": 0.26130259170910786, + "learning_rate": 0.00018821809818427137, + "loss": 2.939664840698242, + "step": 6765, + "token_acc": 0.3059006457131091 + }, + { + "epoch": 3.966285546760481, + "grad_norm": 0.27701252227024686, + "learning_rate": 0.0001882135336648506, + "loss": 2.9174704551696777, + "step": 6766, + "token_acc": 0.3099732001144849 + }, + { + "epoch": 3.9668718850776896, + "grad_norm": 0.2698149330486192, + "learning_rate": 0.00018820896831678256, + "loss": 3.00138258934021, + "step": 6767, + "token_acc": 0.29681123783614105 + }, + { + "epoch": 3.9674582233948987, + "grad_norm": 0.30534898789471326, + "learning_rate": 0.00018820440214011011, + "loss": 3.01220703125, + "step": 6768, + "token_acc": 0.2964907440051913 + }, + { + "epoch": 3.968044561712108, + "grad_norm": 0.24498760661080554, + "learning_rate": 0.00018819983513487616, + "loss": 3.010448455810547, + "step": 6769, + "token_acc": 0.29770500492359975 + }, + { + "epoch": 3.968630900029317, + "grad_norm": 0.3043936681446897, + "learning_rate": 0.00018819526730112361, + "loss": 2.9761781692504883, + "step": 6770, + "token_acc": 0.30152095922738 + }, + { + "epoch": 3.969217238346526, + "grad_norm": 0.2663148257534456, + "learning_rate": 0.00018819069863889535, + "loss": 2.9302144050598145, + "step": 6771, + "token_acc": 0.3073616654082623 + }, + { + "epoch": 3.969803576663735, + "grad_norm": 0.237462570604375, + "learning_rate": 0.00018818612914823433, + "loss": 2.9407782554626465, + "step": 6772, + "token_acc": 0.30635882146062876 + }, + { + "epoch": 3.970389914980944, + "grad_norm": 0.32117142702392415, + "learning_rate": 0.0001881815588291835, + "loss": 2.980024814605713, + "step": 6773, + "token_acc": 0.3003881088330319 + }, + { + "epoch": 3.970976253298153, + "grad_norm": 0.23111150447555812, + "learning_rate": 0.0001881769876817857, + "loss": 2.933063507080078, + "step": 6774, + "token_acc": 0.3073785239932257 + }, + { + "epoch": 3.971562591615362, + "grad_norm": 0.2610491755144798, + "learning_rate": 0.00018817241570608394, + "loss": 2.9211087226867676, + "step": 6775, + "token_acc": 0.3102465377659004 + }, + { + "epoch": 3.972148929932571, + "grad_norm": 0.26054823907501207, + "learning_rate": 0.00018816784290212114, + "loss": 2.9758048057556152, + "step": 6776, + "token_acc": 0.3019210422587806 + }, + { + "epoch": 3.97273526824978, + "grad_norm": 0.2385174360994277, + "learning_rate": 0.00018816326926994026, + "loss": 3.026453733444214, + "step": 6777, + "token_acc": 0.29289401643560825 + }, + { + "epoch": 3.973321606566989, + "grad_norm": 0.25604700494317345, + "learning_rate": 0.00018815869480958428, + "loss": 2.9858181476593018, + "step": 6778, + "token_acc": 0.30215400428628575 + }, + { + "epoch": 3.973907944884198, + "grad_norm": 0.2282562617277907, + "learning_rate": 0.00018815411952109617, + "loss": 2.9625072479248047, + "step": 6779, + "token_acc": 0.3027356734036608 + }, + { + "epoch": 3.974494283201407, + "grad_norm": 0.25919321277160245, + "learning_rate": 0.00018814954340451884, + "loss": 3.0089173316955566, + "step": 6780, + "token_acc": 0.29615116777373046 + }, + { + "epoch": 3.9750806215186163, + "grad_norm": 0.2529452818591886, + "learning_rate": 0.00018814496645989536, + "loss": 2.9495909214019775, + "step": 6781, + "token_acc": 0.30609142243912413 + }, + { + "epoch": 3.9756669598358254, + "grad_norm": 0.24065454400460834, + "learning_rate": 0.00018814038868726873, + "loss": 3.055450439453125, + "step": 6782, + "token_acc": 0.2920182962570486 + }, + { + "epoch": 3.9762532981530345, + "grad_norm": 0.23490452149378216, + "learning_rate": 0.0001881358100866819, + "loss": 2.9737353324890137, + "step": 6783, + "token_acc": 0.3017115028189002 + }, + { + "epoch": 3.976839636470243, + "grad_norm": 0.22653160540067888, + "learning_rate": 0.0001881312306581779, + "loss": 2.9702277183532715, + "step": 6784, + "token_acc": 0.30145397322468903 + }, + { + "epoch": 3.9774259747874523, + "grad_norm": 0.24136439047655175, + "learning_rate": 0.00018812665040179974, + "loss": 3.010288715362549, + "step": 6785, + "token_acc": 0.29607907374409703 + }, + { + "epoch": 3.9780123131046614, + "grad_norm": 0.22942669704587268, + "learning_rate": 0.00018812206931759044, + "loss": 2.9914684295654297, + "step": 6786, + "token_acc": 0.2985744193001695 + }, + { + "epoch": 3.9785986514218705, + "grad_norm": 0.25289077789476483, + "learning_rate": 0.00018811748740559306, + "loss": 2.9701802730560303, + "step": 6787, + "token_acc": 0.301797229205895 + }, + { + "epoch": 3.979184989739079, + "grad_norm": 0.26288878477655336, + "learning_rate": 0.0001881129046658506, + "loss": 2.9892754554748535, + "step": 6788, + "token_acc": 0.2998869067547511 + }, + { + "epoch": 3.9797713280562883, + "grad_norm": 0.2707327154319935, + "learning_rate": 0.00018810832109840617, + "loss": 2.9541473388671875, + "step": 6789, + "token_acc": 0.3046050285649786 + }, + { + "epoch": 3.9803576663734974, + "grad_norm": 0.422053307629299, + "learning_rate": 0.00018810373670330278, + "loss": 2.9699063301086426, + "step": 6790, + "token_acc": 0.30276202456080503 + }, + { + "epoch": 3.9809440046907065, + "grad_norm": 0.5311450207612103, + "learning_rate": 0.00018809915148058353, + "loss": 2.97745680809021, + "step": 6791, + "token_acc": 0.3011652602294524 + }, + { + "epoch": 3.9815303430079156, + "grad_norm": 0.2636961692842945, + "learning_rate": 0.00018809456543029143, + "loss": 2.9863481521606445, + "step": 6792, + "token_acc": 0.2992351302215003 + }, + { + "epoch": 3.9821166813251248, + "grad_norm": 0.49579220722114775, + "learning_rate": 0.00018808997855246959, + "loss": 2.9743685722351074, + "step": 6793, + "token_acc": 0.30131669740585215 + }, + { + "epoch": 3.982703019642334, + "grad_norm": 0.25582776572123345, + "learning_rate": 0.0001880853908471611, + "loss": 3.0076003074645996, + "step": 6794, + "token_acc": 0.2989739461545342 + }, + { + "epoch": 3.9832893579595425, + "grad_norm": 0.4065192974116163, + "learning_rate": 0.0001880808023144091, + "loss": 2.9674878120422363, + "step": 6795, + "token_acc": 0.30174112389182334 + }, + { + "epoch": 3.9838756962767516, + "grad_norm": 0.3240933578073701, + "learning_rate": 0.00018807621295425663, + "loss": 2.9737026691436768, + "step": 6796, + "token_acc": 0.30184735564235193 + }, + { + "epoch": 3.9844620345939608, + "grad_norm": 0.3129066599396761, + "learning_rate": 0.00018807162276674683, + "loss": 2.9665474891662598, + "step": 6797, + "token_acc": 0.3029389763992932 + }, + { + "epoch": 3.98504837291117, + "grad_norm": 0.38436543022251884, + "learning_rate": 0.00018806703175192283, + "loss": 2.986518383026123, + "step": 6798, + "token_acc": 0.3019264275886969 + }, + { + "epoch": 3.9856347112283785, + "grad_norm": 0.26632490687678073, + "learning_rate": 0.0001880624399098277, + "loss": 3.0117480754852295, + "step": 6799, + "token_acc": 0.29685834657893273 + }, + { + "epoch": 3.9862210495455876, + "grad_norm": 0.27168005805669226, + "learning_rate": 0.0001880578472405046, + "loss": 2.971714496612549, + "step": 6800, + "token_acc": 0.3028493376103423 + }, + { + "epoch": 3.9868073878627968, + "grad_norm": 0.3007347093146597, + "learning_rate": 0.00018805325374399674, + "loss": 3.036787509918213, + "step": 6801, + "token_acc": 0.2940040021153279 + }, + { + "epoch": 3.987393726180006, + "grad_norm": 0.23480167232985188, + "learning_rate": 0.0001880486594203472, + "loss": 3.0148262977600098, + "step": 6802, + "token_acc": 0.29634861810402574 + }, + { + "epoch": 3.987980064497215, + "grad_norm": 0.28844957628243073, + "learning_rate": 0.00018804406426959914, + "loss": 2.9950225353240967, + "step": 6803, + "token_acc": 0.299117812174192 + }, + { + "epoch": 3.988566402814424, + "grad_norm": 0.3267480090876118, + "learning_rate": 0.00018803946829179573, + "loss": 2.959831714630127, + "step": 6804, + "token_acc": 0.3033361715303042 + }, + { + "epoch": 3.989152741131633, + "grad_norm": 0.2551761789983474, + "learning_rate": 0.00018803487148698016, + "loss": 2.8965282440185547, + "step": 6805, + "token_acc": 0.3137999390845125 + }, + { + "epoch": 3.989739079448842, + "grad_norm": 0.279856292192039, + "learning_rate": 0.0001880302738551956, + "loss": 2.984565019607544, + "step": 6806, + "token_acc": 0.30079113278380804 + }, + { + "epoch": 3.990325417766051, + "grad_norm": 0.2663470781715964, + "learning_rate": 0.00018802567539648524, + "loss": 2.9823896884918213, + "step": 6807, + "token_acc": 0.2989748397628311 + }, + { + "epoch": 3.99091175608326, + "grad_norm": 0.2623050758104646, + "learning_rate": 0.00018802107611089227, + "loss": 3.036341667175293, + "step": 6808, + "token_acc": 0.294514719471258 + }, + { + "epoch": 3.991498094400469, + "grad_norm": 0.2764828792927526, + "learning_rate": 0.0001880164759984599, + "loss": 2.972909927368164, + "step": 6809, + "token_acc": 0.30311552649841117 + }, + { + "epoch": 3.992084432717678, + "grad_norm": 0.27648808913692197, + "learning_rate": 0.00018801187505923135, + "loss": 2.9660797119140625, + "step": 6810, + "token_acc": 0.30340494363026005 + }, + { + "epoch": 3.992670771034887, + "grad_norm": 0.27046810527472004, + "learning_rate": 0.0001880072732932498, + "loss": 2.9569969177246094, + "step": 6811, + "token_acc": 0.3039939541418543 + }, + { + "epoch": 3.993257109352096, + "grad_norm": 0.31583071405922153, + "learning_rate": 0.00018800267070055856, + "loss": 2.9581847190856934, + "step": 6812, + "token_acc": 0.3065733972193214 + }, + { + "epoch": 3.993843447669305, + "grad_norm": 0.2450234315344251, + "learning_rate": 0.00018799806728120078, + "loss": 2.962909698486328, + "step": 6813, + "token_acc": 0.30253667695060865 + }, + { + "epoch": 3.9944297859865143, + "grad_norm": 0.2929593605116672, + "learning_rate": 0.00018799346303521977, + "loss": 2.9641213417053223, + "step": 6814, + "token_acc": 0.30367895811863693 + }, + { + "epoch": 3.9950161243037234, + "grad_norm": 0.2626130861557688, + "learning_rate": 0.0001879888579626587, + "loss": 2.970533847808838, + "step": 6815, + "token_acc": 0.30462212522345666 + }, + { + "epoch": 3.9956024626209325, + "grad_norm": 0.24576273767346254, + "learning_rate": 0.0001879842520635609, + "loss": 2.9424028396606445, + "step": 6816, + "token_acc": 0.3085727670617093 + }, + { + "epoch": 3.996188800938141, + "grad_norm": 0.27148725249277733, + "learning_rate": 0.00018797964533796962, + "loss": 2.9336466789245605, + "step": 6817, + "token_acc": 0.30644212834233137 + }, + { + "epoch": 3.9967751392553503, + "grad_norm": 0.23894465883073088, + "learning_rate": 0.00018797503778592812, + "loss": 2.9928646087646484, + "step": 6818, + "token_acc": 0.2979747386174724 + }, + { + "epoch": 3.9973614775725594, + "grad_norm": 0.300943096893216, + "learning_rate": 0.00018797042940747968, + "loss": 3.016228675842285, + "step": 6819, + "token_acc": 0.29471618136302663 + }, + { + "epoch": 3.9979478158897686, + "grad_norm": 0.24899592794644393, + "learning_rate": 0.0001879658202026676, + "loss": 2.9966681003570557, + "step": 6820, + "token_acc": 0.2958041834365423 + }, + { + "epoch": 3.998534154206977, + "grad_norm": 0.2587882244025811, + "learning_rate": 0.00018796121017153518, + "loss": 2.980031728744507, + "step": 6821, + "token_acc": 0.3012523419781087 + }, + { + "epoch": 3.9991204925241863, + "grad_norm": 0.296043718760882, + "learning_rate": 0.0001879565993141257, + "loss": 2.973658561706543, + "step": 6822, + "token_acc": 0.30175447784644394 + }, + { + "epoch": 3.9997068308413954, + "grad_norm": 0.2307610500363952, + "learning_rate": 0.00018795198763048253, + "loss": 2.9776337146759033, + "step": 6823, + "token_acc": 0.30236705249099516 + }, + { + "epoch": 4.0, + "grad_norm": 0.34804182990384036, + "learning_rate": 0.0001879473751206489, + "loss": 2.997058391571045, + "step": 6824, + "token_acc": 0.29656255541201204 + }, + { + "epoch": 4.0, + "eval_loss": 3.0742709636688232, + "eval_runtime": 16.6778, + "eval_samples_per_second": 15.35, + "eval_steps_per_second": 1.919, + "eval_token_acc": 0.2890191852159429, + "step": 6824 + }, + { + "epoch": 4.000586338317209, + "grad_norm": 0.36997597595215104, + "learning_rate": 0.00018794276178466825, + "loss": 2.809704542160034, + "step": 6825, + "token_acc": 0.32451676463325535 + }, + { + "epoch": 4.001172676634418, + "grad_norm": 0.33900744913162, + "learning_rate": 0.00018793814762258382, + "loss": 2.8247694969177246, + "step": 6826, + "token_acc": 0.32178274099865434 + }, + { + "epoch": 4.001759014951627, + "grad_norm": 0.31652232796080954, + "learning_rate": 0.00018793353263443901, + "loss": 2.7757062911987305, + "step": 6827, + "token_acc": 0.3302428767299934 + }, + { + "epoch": 4.0023453532688364, + "grad_norm": 0.3389172728475794, + "learning_rate": 0.00018792891682027713, + "loss": 2.8247079849243164, + "step": 6828, + "token_acc": 0.32160448064345776 + }, + { + "epoch": 4.002931691586046, + "grad_norm": 0.3544336107182365, + "learning_rate": 0.00018792430018014158, + "loss": 2.797395706176758, + "step": 6829, + "token_acc": 0.3270148923388244 + }, + { + "epoch": 4.003518029903254, + "grad_norm": 0.3383968223869068, + "learning_rate": 0.00018791968271407572, + "loss": 2.732421398162842, + "step": 6830, + "token_acc": 0.33748829666064717 + }, + { + "epoch": 4.004104368220463, + "grad_norm": 0.333691439209077, + "learning_rate": 0.0001879150644221229, + "loss": 2.7615413665771484, + "step": 6831, + "token_acc": 0.3321200475941273 + }, + { + "epoch": 4.004690706537672, + "grad_norm": 0.3381708553043849, + "learning_rate": 0.00018791044530432652, + "loss": 2.7796247005462646, + "step": 6832, + "token_acc": 0.3298306647345111 + }, + { + "epoch": 4.005277044854881, + "grad_norm": 0.3070489547109363, + "learning_rate": 0.00018790582536072994, + "loss": 2.7613229751586914, + "step": 6833, + "token_acc": 0.3317651491183794 + }, + { + "epoch": 4.00586338317209, + "grad_norm": 0.2667117412931318, + "learning_rate": 0.0001879012045913766, + "loss": 2.8310656547546387, + "step": 6834, + "token_acc": 0.3219197205906989 + }, + { + "epoch": 4.006449721489299, + "grad_norm": 0.30541554795689796, + "learning_rate": 0.00018789658299630992, + "loss": 2.844264507293701, + "step": 6835, + "token_acc": 0.32203951085799587 + }, + { + "epoch": 4.0070360598065085, + "grad_norm": 0.27519184963179677, + "learning_rate": 0.00018789196057557325, + "loss": 2.7469067573547363, + "step": 6836, + "token_acc": 0.33306382891346087 + }, + { + "epoch": 4.007622398123718, + "grad_norm": 0.3345346497857896, + "learning_rate": 0.00018788733732921008, + "loss": 2.790536403656006, + "step": 6837, + "token_acc": 0.3262795125278708 + }, + { + "epoch": 4.008208736440927, + "grad_norm": 0.29805289471656893, + "learning_rate": 0.0001878827132572638, + "loss": 2.7803330421447754, + "step": 6838, + "token_acc": 0.32960974379301644 + }, + { + "epoch": 4.008795074758136, + "grad_norm": 0.3529577082254483, + "learning_rate": 0.00018787808835977782, + "loss": 2.7568695545196533, + "step": 6839, + "token_acc": 0.3329231032656097 + }, + { + "epoch": 4.009381413075345, + "grad_norm": 0.31774521769599595, + "learning_rate": 0.00018787346263679565, + "loss": 2.805112600326538, + "step": 6840, + "token_acc": 0.32637249413845415 + }, + { + "epoch": 4.009967751392553, + "grad_norm": 0.31368219394733554, + "learning_rate": 0.0001878688360883607, + "loss": 2.780792236328125, + "step": 6841, + "token_acc": 0.32936063789550063 + }, + { + "epoch": 4.010554089709762, + "grad_norm": 0.28134436042772193, + "learning_rate": 0.00018786420871451642, + "loss": 2.787625312805176, + "step": 6842, + "token_acc": 0.32785053193411345 + }, + { + "epoch": 4.011140428026971, + "grad_norm": 0.3148655213326251, + "learning_rate": 0.0001878595805153063, + "loss": 2.8253822326660156, + "step": 6843, + "token_acc": 0.3229981791706366 + }, + { + "epoch": 4.0117267663441805, + "grad_norm": 0.3182534964493563, + "learning_rate": 0.00018785495149077383, + "loss": 2.7305078506469727, + "step": 6844, + "token_acc": 0.33757875365708057 + }, + { + "epoch": 4.01231310466139, + "grad_norm": 0.2958896345586758, + "learning_rate": 0.00018785032164096247, + "loss": 2.8016417026519775, + "step": 6845, + "token_acc": 0.3270356449674975 + }, + { + "epoch": 4.012899442978599, + "grad_norm": 0.3573213669477871, + "learning_rate": 0.00018784569096591574, + "loss": 2.7652981281280518, + "step": 6846, + "token_acc": 0.33177658142664873 + }, + { + "epoch": 4.013485781295808, + "grad_norm": 0.32646934065996, + "learning_rate": 0.00018784105946567713, + "loss": 2.782241106033325, + "step": 6847, + "token_acc": 0.3289520871735069 + }, + { + "epoch": 4.014072119613017, + "grad_norm": 0.3093142605955748, + "learning_rate": 0.00018783642714029005, + "loss": 2.8232805728912354, + "step": 6848, + "token_acc": 0.32506771520613736 + }, + { + "epoch": 4.014658457930226, + "grad_norm": 0.33437466731138826, + "learning_rate": 0.00018783179398979818, + "loss": 2.776484489440918, + "step": 6849, + "token_acc": 0.32847820399163774 + }, + { + "epoch": 4.015244796247435, + "grad_norm": 0.2737094633540436, + "learning_rate": 0.0001878271600142449, + "loss": 2.7839882373809814, + "step": 6850, + "token_acc": 0.32758526032339447 + }, + { + "epoch": 4.015831134564644, + "grad_norm": 0.29259197162976963, + "learning_rate": 0.00018782252521367388, + "loss": 2.774160861968994, + "step": 6851, + "token_acc": 0.32935152712477855 + }, + { + "epoch": 4.0164174728818525, + "grad_norm": 0.2718931648698179, + "learning_rate": 0.00018781788958812848, + "loss": 2.7623963356018066, + "step": 6852, + "token_acc": 0.33307464581970925 + }, + { + "epoch": 4.017003811199062, + "grad_norm": 0.3241439474375975, + "learning_rate": 0.0001878132531376524, + "loss": 2.8113932609558105, + "step": 6853, + "token_acc": 0.3252248794772176 + }, + { + "epoch": 4.017590149516271, + "grad_norm": 0.30259265680455233, + "learning_rate": 0.00018780861586228915, + "loss": 2.8035244941711426, + "step": 6854, + "token_acc": 0.32413013950772435 + }, + { + "epoch": 4.01817648783348, + "grad_norm": 0.2852199936990498, + "learning_rate": 0.00018780397776208224, + "loss": 2.7789130210876465, + "step": 6855, + "token_acc": 0.3299833948145579 + }, + { + "epoch": 4.018762826150689, + "grad_norm": 0.3158733558305754, + "learning_rate": 0.0001877993388370753, + "loss": 2.7633376121520996, + "step": 6856, + "token_acc": 0.3312243693459922 + }, + { + "epoch": 4.019349164467898, + "grad_norm": 0.25732623428031365, + "learning_rate": 0.00018779469908731188, + "loss": 2.788844347000122, + "step": 6857, + "token_acc": 0.32929831927082626 + }, + { + "epoch": 4.019935502785107, + "grad_norm": 0.31042140665164614, + "learning_rate": 0.00018779005851283554, + "loss": 2.7620437145233154, + "step": 6858, + "token_acc": 0.33377648859882364 + }, + { + "epoch": 4.020521841102316, + "grad_norm": 0.3008021889394991, + "learning_rate": 0.00018778541711368996, + "loss": 2.7931137084960938, + "step": 6859, + "token_acc": 0.3269916180569226 + }, + { + "epoch": 4.021108179419525, + "grad_norm": 0.28329654320018144, + "learning_rate": 0.0001877807748899186, + "loss": 2.7533888816833496, + "step": 6860, + "token_acc": 0.3354698925701905 + }, + { + "epoch": 4.0216945177367345, + "grad_norm": 0.2710676671682988, + "learning_rate": 0.0001877761318415652, + "loss": 2.782712697982788, + "step": 6861, + "token_acc": 0.32884026724160875 + }, + { + "epoch": 4.022280856053943, + "grad_norm": 0.28894613102882455, + "learning_rate": 0.00018777148796867332, + "loss": 2.7825112342834473, + "step": 6862, + "token_acc": 0.3281718801255604 + }, + { + "epoch": 4.022867194371152, + "grad_norm": 0.2863772677554496, + "learning_rate": 0.00018776684327128658, + "loss": 2.7521257400512695, + "step": 6863, + "token_acc": 0.33326026166785183 + }, + { + "epoch": 4.023453532688361, + "grad_norm": 0.2616965983665461, + "learning_rate": 0.00018776219774944858, + "loss": 2.7520627975463867, + "step": 6864, + "token_acc": 0.33269347640427177 + }, + { + "epoch": 4.02403987100557, + "grad_norm": 0.2745740760072004, + "learning_rate": 0.00018775755140320303, + "loss": 2.788012742996216, + "step": 6865, + "token_acc": 0.3275942636995192 + }, + { + "epoch": 4.024626209322779, + "grad_norm": 0.2866265639456835, + "learning_rate": 0.00018775290423259352, + "loss": 2.7775301933288574, + "step": 6866, + "token_acc": 0.3302292526604533 + }, + { + "epoch": 4.025212547639988, + "grad_norm": 0.2739144262284417, + "learning_rate": 0.00018774825623766374, + "loss": 2.8072848320007324, + "step": 6867, + "token_acc": 0.3249855449551894 + }, + { + "epoch": 4.025798885957197, + "grad_norm": 0.27953365424063953, + "learning_rate": 0.00018774360741845734, + "loss": 2.776688575744629, + "step": 6868, + "token_acc": 0.32906569904241434 + }, + { + "epoch": 4.0263852242744065, + "grad_norm": 0.2689891638017352, + "learning_rate": 0.00018773895777501794, + "loss": 2.7590856552124023, + "step": 6869, + "token_acc": 0.3310644620202492 + }, + { + "epoch": 4.026971562591616, + "grad_norm": 0.3166968437748137, + "learning_rate": 0.0001877343073073893, + "loss": 2.7364020347595215, + "step": 6870, + "token_acc": 0.33491128391295943 + }, + { + "epoch": 4.027557900908825, + "grad_norm": 0.3354108839453728, + "learning_rate": 0.00018772965601561507, + "loss": 2.75793194770813, + "step": 6871, + "token_acc": 0.33087312679159825 + }, + { + "epoch": 4.028144239226034, + "grad_norm": 0.28742281669057507, + "learning_rate": 0.00018772500389973893, + "loss": 2.7706661224365234, + "step": 6872, + "token_acc": 0.33081962238436086 + }, + { + "epoch": 4.028730577543242, + "grad_norm": 0.3782253887071229, + "learning_rate": 0.0001877203509598046, + "loss": 2.7687172889709473, + "step": 6873, + "token_acc": 0.33154544598679875 + }, + { + "epoch": 4.029316915860451, + "grad_norm": 0.373791524176523, + "learning_rate": 0.00018771569719585576, + "loss": 2.8063082695007324, + "step": 6874, + "token_acc": 0.32528751156289576 + }, + { + "epoch": 4.02990325417766, + "grad_norm": 0.2724946527080424, + "learning_rate": 0.00018771104260793613, + "loss": 2.7572009563446045, + "step": 6875, + "token_acc": 0.3323743177932232 + }, + { + "epoch": 4.030489592494869, + "grad_norm": 0.40798013950818385, + "learning_rate": 0.00018770638719608945, + "loss": 2.7910990715026855, + "step": 6876, + "token_acc": 0.32817945332618725 + }, + { + "epoch": 4.0310759308120785, + "grad_norm": 0.3344593076500049, + "learning_rate": 0.00018770173096035949, + "loss": 2.7497029304504395, + "step": 6877, + "token_acc": 0.33301280203742617 + }, + { + "epoch": 4.031662269129288, + "grad_norm": 0.2839827085573838, + "learning_rate": 0.0001876970739007899, + "loss": 2.772012710571289, + "step": 6878, + "token_acc": 0.3324896152800963 + }, + { + "epoch": 4.032248607446497, + "grad_norm": 0.3336546844688524, + "learning_rate": 0.0001876924160174245, + "loss": 2.7675232887268066, + "step": 6879, + "token_acc": 0.33169038455488326 + }, + { + "epoch": 4.032834945763706, + "grad_norm": 0.271889986164176, + "learning_rate": 0.00018768775731030704, + "loss": 2.7831106185913086, + "step": 6880, + "token_acc": 0.3283844546681325 + }, + { + "epoch": 4.033421284080915, + "grad_norm": 0.3064084182953931, + "learning_rate": 0.00018768309777948122, + "loss": 2.7696099281311035, + "step": 6881, + "token_acc": 0.33226359543236933 + }, + { + "epoch": 4.034007622398124, + "grad_norm": 0.2777333778623159, + "learning_rate": 0.00018767843742499088, + "loss": 2.799945831298828, + "step": 6882, + "token_acc": 0.3258161867032082 + }, + { + "epoch": 4.034593960715333, + "grad_norm": 0.2955154347868756, + "learning_rate": 0.00018767377624687975, + "loss": 2.760730266571045, + "step": 6883, + "token_acc": 0.33243573284904654 + }, + { + "epoch": 4.035180299032541, + "grad_norm": 0.2592371792402468, + "learning_rate": 0.00018766911424519163, + "loss": 2.7805352210998535, + "step": 6884, + "token_acc": 0.32892527343677463 + }, + { + "epoch": 4.0357666373497505, + "grad_norm": 0.2794037223135334, + "learning_rate": 0.00018766445141997032, + "loss": 2.718114137649536, + "step": 6885, + "token_acc": 0.3398542559185279 + }, + { + "epoch": 4.03635297566696, + "grad_norm": 0.2689090575269661, + "learning_rate": 0.00018765978777125962, + "loss": 2.778414249420166, + "step": 6886, + "token_acc": 0.33069485604990234 + }, + { + "epoch": 4.036939313984169, + "grad_norm": 0.30376376240548936, + "learning_rate": 0.00018765512329910333, + "loss": 2.797227621078491, + "step": 6887, + "token_acc": 0.32605230132618795 + }, + { + "epoch": 4.037525652301378, + "grad_norm": 0.2692298278942264, + "learning_rate": 0.00018765045800354528, + "loss": 2.7535789012908936, + "step": 6888, + "token_acc": 0.33440590795199787 + }, + { + "epoch": 4.038111990618587, + "grad_norm": 0.29456685096097757, + "learning_rate": 0.00018764579188462928, + "loss": 2.771760940551758, + "step": 6889, + "token_acc": 0.3302968926263973 + }, + { + "epoch": 4.038698328935796, + "grad_norm": 0.28142022945830475, + "learning_rate": 0.00018764112494239917, + "loss": 2.778557777404785, + "step": 6890, + "token_acc": 0.33037032015144485 + }, + { + "epoch": 4.039284667253005, + "grad_norm": 0.2974819980768383, + "learning_rate": 0.00018763645717689883, + "loss": 2.7787203788757324, + "step": 6891, + "token_acc": 0.3297246204265891 + }, + { + "epoch": 4.039871005570214, + "grad_norm": 0.27747217849801065, + "learning_rate": 0.00018763178858817204, + "loss": 2.759206771850586, + "step": 6892, + "token_acc": 0.33202810830875834 + }, + { + "epoch": 4.040457343887423, + "grad_norm": 0.2760489793461656, + "learning_rate": 0.00018762711917626266, + "loss": 2.7717132568359375, + "step": 6893, + "token_acc": 0.33075960523036363 + }, + { + "epoch": 4.0410436822046325, + "grad_norm": 0.28495269287229136, + "learning_rate": 0.00018762244894121458, + "loss": 2.781449556350708, + "step": 6894, + "token_acc": 0.3279262517347046 + }, + { + "epoch": 4.041630020521841, + "grad_norm": 0.2773741551301759, + "learning_rate": 0.00018761777788307168, + "loss": 2.7549259662628174, + "step": 6895, + "token_acc": 0.33305276378594745 + }, + { + "epoch": 4.04221635883905, + "grad_norm": 0.28396673775215203, + "learning_rate": 0.00018761310600187782, + "loss": 2.8119614124298096, + "step": 6896, + "token_acc": 0.3237608581907918 + }, + { + "epoch": 4.042802697156259, + "grad_norm": 0.2780904414635531, + "learning_rate": 0.0001876084332976769, + "loss": 2.788419723510742, + "step": 6897, + "token_acc": 0.32691259313101945 + }, + { + "epoch": 4.043389035473468, + "grad_norm": 0.3045459288620365, + "learning_rate": 0.00018760375977051278, + "loss": 2.7625670433044434, + "step": 6898, + "token_acc": 0.3312883435582822 + }, + { + "epoch": 4.043975373790677, + "grad_norm": 0.2592457588450355, + "learning_rate": 0.0001875990854204294, + "loss": 2.7478976249694824, + "step": 6899, + "token_acc": 0.33196353646480586 + }, + { + "epoch": 4.044561712107886, + "grad_norm": 0.28383236375360427, + "learning_rate": 0.00018759441024747064, + "loss": 2.7483723163604736, + "step": 6900, + "token_acc": 0.3351584893377991 + }, + { + "epoch": 4.045148050425095, + "grad_norm": 0.2723310043807201, + "learning_rate": 0.00018758973425168045, + "loss": 2.807433843612671, + "step": 6901, + "token_acc": 0.32599103866222284 + }, + { + "epoch": 4.0457343887423045, + "grad_norm": 0.28602153419177223, + "learning_rate": 0.0001875850574331027, + "loss": 2.739790439605713, + "step": 6902, + "token_acc": 0.33592011381105946 + }, + { + "epoch": 4.046320727059514, + "grad_norm": 0.3158398124093764, + "learning_rate": 0.0001875803797917814, + "loss": 2.7709250450134277, + "step": 6903, + "token_acc": 0.3304399245501452 + }, + { + "epoch": 4.046907065376723, + "grad_norm": 0.3339713907102065, + "learning_rate": 0.00018757570132776043, + "loss": 2.766171455383301, + "step": 6904, + "token_acc": 0.3309611355929176 + }, + { + "epoch": 4.047493403693931, + "grad_norm": 0.42387878975002297, + "learning_rate": 0.00018757102204108373, + "loss": 2.735865592956543, + "step": 6905, + "token_acc": 0.3346720284791351 + }, + { + "epoch": 4.04807974201114, + "grad_norm": 0.4886871588915225, + "learning_rate": 0.0001875663419317953, + "loss": 2.7668159008026123, + "step": 6906, + "token_acc": 0.3309696511326894 + }, + { + "epoch": 4.048666080328349, + "grad_norm": 0.37874510041362275, + "learning_rate": 0.00018756166099993913, + "loss": 2.796940565109253, + "step": 6907, + "token_acc": 0.32617878307580594 + }, + { + "epoch": 4.049252418645558, + "grad_norm": 0.314298446354948, + "learning_rate": 0.00018755697924555912, + "loss": 2.7579712867736816, + "step": 6908, + "token_acc": 0.3329552784915876 + }, + { + "epoch": 4.049838756962767, + "grad_norm": 0.3832672785050187, + "learning_rate": 0.00018755229666869925, + "loss": 2.752161741256714, + "step": 6909, + "token_acc": 0.33266478086927803 + }, + { + "epoch": 4.0504250952799765, + "grad_norm": 0.2936386376660296, + "learning_rate": 0.00018754761326940353, + "loss": 2.7634830474853516, + "step": 6910, + "token_acc": 0.33242755247083533 + }, + { + "epoch": 4.051011433597186, + "grad_norm": 0.32265135429292935, + "learning_rate": 0.00018754292904771597, + "loss": 2.7729527950286865, + "step": 6911, + "token_acc": 0.32992748762987056 + }, + { + "epoch": 4.051597771914395, + "grad_norm": 0.27939899591944534, + "learning_rate": 0.00018753824400368057, + "loss": 2.7908830642700195, + "step": 6912, + "token_acc": 0.32677308012748457 + }, + { + "epoch": 4.052184110231604, + "grad_norm": 0.3676753953235489, + "learning_rate": 0.0001875335581373413, + "loss": 2.792667865753174, + "step": 6913, + "token_acc": 0.32852630747769324 + }, + { + "epoch": 4.052770448548813, + "grad_norm": 0.2637688120432847, + "learning_rate": 0.00018752887144874223, + "loss": 2.7679219245910645, + "step": 6914, + "token_acc": 0.33158920024770533 + }, + { + "epoch": 4.053356786866022, + "grad_norm": 0.33093191401428684, + "learning_rate": 0.00018752418393792734, + "loss": 2.7562692165374756, + "step": 6915, + "token_acc": 0.33231292427476816 + }, + { + "epoch": 4.05394312518323, + "grad_norm": 0.2521829343560796, + "learning_rate": 0.0001875194956049407, + "loss": 2.7012040615081787, + "step": 6916, + "token_acc": 0.34112499763246684 + }, + { + "epoch": 4.054529463500439, + "grad_norm": 0.3000176500552398, + "learning_rate": 0.0001875148064498263, + "loss": 2.7751495838165283, + "step": 6917, + "token_acc": 0.3308576658285902 + }, + { + "epoch": 4.0551158018176485, + "grad_norm": 0.2792806399142487, + "learning_rate": 0.00018751011647262823, + "loss": 2.774353504180908, + "step": 6918, + "token_acc": 0.3294375727212765 + }, + { + "epoch": 4.055702140134858, + "grad_norm": 0.3258240461061615, + "learning_rate": 0.00018750542567339058, + "loss": 2.7712550163269043, + "step": 6919, + "token_acc": 0.33067063755266185 + }, + { + "epoch": 4.056288478452067, + "grad_norm": 0.26955602050407346, + "learning_rate": 0.00018750073405215733, + "loss": 2.7577781677246094, + "step": 6920, + "token_acc": 0.3331525659454999 + }, + { + "epoch": 4.056874816769276, + "grad_norm": 0.29942012602178675, + "learning_rate": 0.0001874960416089726, + "loss": 2.7662367820739746, + "step": 6921, + "token_acc": 0.33172079272909577 + }, + { + "epoch": 4.057461155086485, + "grad_norm": 0.2575745026519664, + "learning_rate": 0.00018749134834388049, + "loss": 2.789682388305664, + "step": 6922, + "token_acc": 0.3279884769216919 + }, + { + "epoch": 4.058047493403694, + "grad_norm": 0.3052660068945356, + "learning_rate": 0.00018748665425692503, + "loss": 2.7984015941619873, + "step": 6923, + "token_acc": 0.32599318023088614 + }, + { + "epoch": 4.058633831720903, + "grad_norm": 0.2699691755048279, + "learning_rate": 0.00018748195934815035, + "loss": 2.8098926544189453, + "step": 6924, + "token_acc": 0.32484791166095806 + }, + { + "epoch": 4.059220170038112, + "grad_norm": 0.3370475519299112, + "learning_rate": 0.0001874772636176005, + "loss": 2.802156925201416, + "step": 6925, + "token_acc": 0.3251754456723401 + }, + { + "epoch": 4.059806508355321, + "grad_norm": 0.26608284016538064, + "learning_rate": 0.0001874725670653197, + "loss": 2.753708839416504, + "step": 6926, + "token_acc": 0.3331864829763936 + }, + { + "epoch": 4.06039284667253, + "grad_norm": 0.31024061405733094, + "learning_rate": 0.00018746786969135197, + "loss": 2.733388900756836, + "step": 6927, + "token_acc": 0.3366816330235066 + }, + { + "epoch": 4.060979184989739, + "grad_norm": 0.2825549507571758, + "learning_rate": 0.00018746317149574148, + "loss": 2.7752132415771484, + "step": 6928, + "token_acc": 0.33001740168553534 + }, + { + "epoch": 4.061565523306948, + "grad_norm": 0.26218068159211266, + "learning_rate": 0.00018745847247853237, + "loss": 2.792555809020996, + "step": 6929, + "token_acc": 0.32729730804405743 + }, + { + "epoch": 4.062151861624157, + "grad_norm": 0.28565571409610485, + "learning_rate": 0.00018745377263976873, + "loss": 2.7336134910583496, + "step": 6930, + "token_acc": 0.33658813010791955 + }, + { + "epoch": 4.062738199941366, + "grad_norm": 0.25917494324516965, + "learning_rate": 0.00018744907197949475, + "loss": 2.78607177734375, + "step": 6931, + "token_acc": 0.32839207787848274 + }, + { + "epoch": 4.063324538258575, + "grad_norm": 0.275856600393436, + "learning_rate": 0.0001874443704977546, + "loss": 2.7626137733459473, + "step": 6932, + "token_acc": 0.3318768629764896 + }, + { + "epoch": 4.063910876575784, + "grad_norm": 0.24019919760464942, + "learning_rate": 0.00018743966819459237, + "loss": 2.732649326324463, + "step": 6933, + "token_acc": 0.33588962625890506 + }, + { + "epoch": 4.064497214892993, + "grad_norm": 0.27187415472866, + "learning_rate": 0.00018743496507005235, + "loss": 2.8132128715515137, + "step": 6934, + "token_acc": 0.3224202450757547 + }, + { + "epoch": 4.0650835532102025, + "grad_norm": 0.24414871822487383, + "learning_rate": 0.0001874302611241786, + "loss": 2.7322864532470703, + "step": 6935, + "token_acc": 0.335468319382995 + }, + { + "epoch": 4.065669891527412, + "grad_norm": 0.28580262242350185, + "learning_rate": 0.0001874255563570154, + "loss": 2.772169589996338, + "step": 6936, + "token_acc": 0.3297258726288874 + }, + { + "epoch": 4.066256229844621, + "grad_norm": 0.24595331072833043, + "learning_rate": 0.00018742085076860687, + "loss": 2.785900115966797, + "step": 6937, + "token_acc": 0.32830294757475303 + }, + { + "epoch": 4.066842568161829, + "grad_norm": 0.2994795502731689, + "learning_rate": 0.00018741614435899729, + "loss": 2.777104377746582, + "step": 6938, + "token_acc": 0.3310012743868956 + }, + { + "epoch": 4.067428906479038, + "grad_norm": 0.2541971012890222, + "learning_rate": 0.0001874114371282308, + "loss": 2.7946391105651855, + "step": 6939, + "token_acc": 0.3265326393046111 + }, + { + "epoch": 4.068015244796247, + "grad_norm": 0.2866739259589239, + "learning_rate": 0.00018740672907635163, + "loss": 2.78688645362854, + "step": 6940, + "token_acc": 0.3291741824170653 + }, + { + "epoch": 4.068601583113456, + "grad_norm": 0.318026047304951, + "learning_rate": 0.00018740202020340406, + "loss": 2.777865409851074, + "step": 6941, + "token_acc": 0.32845302694035317 + }, + { + "epoch": 4.069187921430665, + "grad_norm": 0.2636248194563254, + "learning_rate": 0.00018739731050943225, + "loss": 2.779660224914551, + "step": 6942, + "token_acc": 0.32937057545943443 + }, + { + "epoch": 4.0697742597478745, + "grad_norm": 0.34685037688241127, + "learning_rate": 0.00018739259999448052, + "loss": 2.770552158355713, + "step": 6943, + "token_acc": 0.3302495266536336 + }, + { + "epoch": 4.070360598065084, + "grad_norm": 0.28457131153826515, + "learning_rate": 0.00018738788865859304, + "loss": 2.735234260559082, + "step": 6944, + "token_acc": 0.3351677232327737 + }, + { + "epoch": 4.070946936382293, + "grad_norm": 0.2946063281682729, + "learning_rate": 0.00018738317650181412, + "loss": 2.737989902496338, + "step": 6945, + "token_acc": 0.336115510013973 + }, + { + "epoch": 4.071533274699502, + "grad_norm": 0.3107838949698725, + "learning_rate": 0.000187378463524188, + "loss": 2.735621929168701, + "step": 6946, + "token_acc": 0.33495310265864653 + }, + { + "epoch": 4.072119613016711, + "grad_norm": 0.2639847972785085, + "learning_rate": 0.00018737374972575897, + "loss": 2.7650980949401855, + "step": 6947, + "token_acc": 0.33240904191336135 + }, + { + "epoch": 4.07270595133392, + "grad_norm": 0.2953356426811721, + "learning_rate": 0.0001873690351065713, + "loss": 2.7613000869750977, + "step": 6948, + "token_acc": 0.3319508468947194 + }, + { + "epoch": 4.073292289651128, + "grad_norm": 0.2448617502391117, + "learning_rate": 0.00018736431966666925, + "loss": 2.796015739440918, + "step": 6949, + "token_acc": 0.32686305914351205 + }, + { + "epoch": 4.073878627968337, + "grad_norm": 0.3349701806695207, + "learning_rate": 0.00018735960340609715, + "loss": 2.750030994415283, + "step": 6950, + "token_acc": 0.3334959310585645 + }, + { + "epoch": 4.0744649662855466, + "grad_norm": 0.2972968835392099, + "learning_rate": 0.0001873548863248993, + "loss": 2.78411602973938, + "step": 6951, + "token_acc": 0.32904960520448046 + }, + { + "epoch": 4.075051304602756, + "grad_norm": 0.2862035605160103, + "learning_rate": 0.00018735016842312001, + "loss": 2.767188310623169, + "step": 6952, + "token_acc": 0.32981540192677405 + }, + { + "epoch": 4.075637642919965, + "grad_norm": 0.28264675652244914, + "learning_rate": 0.0001873454497008036, + "loss": 2.8102285861968994, + "step": 6953, + "token_acc": 0.32528572060388317 + }, + { + "epoch": 4.076223981237174, + "grad_norm": 0.2677781927569503, + "learning_rate": 0.00018734073015799435, + "loss": 2.7505927085876465, + "step": 6954, + "token_acc": 0.3341662497696095 + }, + { + "epoch": 4.076810319554383, + "grad_norm": 0.2866412242122134, + "learning_rate": 0.0001873360097947367, + "loss": 2.7501988410949707, + "step": 6955, + "token_acc": 0.3330945817123129 + }, + { + "epoch": 4.077396657871592, + "grad_norm": 0.2572213606705688, + "learning_rate": 0.00018733128861107487, + "loss": 2.7422337532043457, + "step": 6956, + "token_acc": 0.3351038158698162 + }, + { + "epoch": 4.077982996188801, + "grad_norm": 0.2781125837809364, + "learning_rate": 0.0001873265666070533, + "loss": 2.7785842418670654, + "step": 6957, + "token_acc": 0.3300830435500397 + }, + { + "epoch": 4.07856933450601, + "grad_norm": 0.28590266231449196, + "learning_rate": 0.00018732184378271626, + "loss": 2.801027774810791, + "step": 6958, + "token_acc": 0.32767509404876755 + }, + { + "epoch": 4.0791556728232194, + "grad_norm": 0.27665116877733775, + "learning_rate": 0.00018731712013810822, + "loss": 2.7490527629852295, + "step": 6959, + "token_acc": 0.33425858139552933 + }, + { + "epoch": 4.079742011140428, + "grad_norm": 0.2881567320608162, + "learning_rate": 0.00018731239567327347, + "loss": 2.750871181488037, + "step": 6960, + "token_acc": 0.3347949084223901 + }, + { + "epoch": 4.080328349457637, + "grad_norm": 0.29411699909789923, + "learning_rate": 0.00018730767038825644, + "loss": 2.7507994174957275, + "step": 6961, + "token_acc": 0.33386738969682034 + }, + { + "epoch": 4.080914687774846, + "grad_norm": 0.30938966460423356, + "learning_rate": 0.00018730294428310148, + "loss": 2.778989315032959, + "step": 6962, + "token_acc": 0.3300888019386679 + }, + { + "epoch": 4.081501026092055, + "grad_norm": 0.2716111641880414, + "learning_rate": 0.00018729821735785298, + "loss": 2.7616281509399414, + "step": 6963, + "token_acc": 0.33293458252337077 + }, + { + "epoch": 4.082087364409264, + "grad_norm": 0.286731706949621, + "learning_rate": 0.00018729348961255538, + "loss": 2.7621986865997314, + "step": 6964, + "token_acc": 0.33139352165042907 + }, + { + "epoch": 4.082673702726473, + "grad_norm": 0.2800254923304573, + "learning_rate": 0.00018728876104725308, + "loss": 2.776677131652832, + "step": 6965, + "token_acc": 0.32902117496713507 + }, + { + "epoch": 4.083260041043682, + "grad_norm": 0.31442177367564694, + "learning_rate": 0.0001872840316619905, + "loss": 2.73514986038208, + "step": 6966, + "token_acc": 0.3359620082410142 + }, + { + "epoch": 4.0838463793608915, + "grad_norm": 0.29508438486586264, + "learning_rate": 0.00018727930145681206, + "loss": 2.772282123565674, + "step": 6967, + "token_acc": 0.33024621876456084 + }, + { + "epoch": 4.084432717678101, + "grad_norm": 0.2886810982556486, + "learning_rate": 0.00018727457043176218, + "loss": 2.758009910583496, + "step": 6968, + "token_acc": 0.33227547467740054 + }, + { + "epoch": 4.08501905599531, + "grad_norm": 0.32037873471429956, + "learning_rate": 0.00018726983858688532, + "loss": 2.765078067779541, + "step": 6969, + "token_acc": 0.32996333889780516 + }, + { + "epoch": 4.085605394312518, + "grad_norm": 0.37409292450447285, + "learning_rate": 0.0001872651059222259, + "loss": 2.8005900382995605, + "step": 6970, + "token_acc": 0.3271375464684015 + }, + { + "epoch": 4.086191732629727, + "grad_norm": 0.2938886905099938, + "learning_rate": 0.00018726037243782843, + "loss": 2.7676639556884766, + "step": 6971, + "token_acc": 0.33005086735359335 + }, + { + "epoch": 4.086778070946936, + "grad_norm": 0.2893332732306068, + "learning_rate": 0.00018725563813373733, + "loss": 2.76513671875, + "step": 6972, + "token_acc": 0.33118179900902706 + }, + { + "epoch": 4.087364409264145, + "grad_norm": 0.33046676274236186, + "learning_rate": 0.0001872509030099971, + "loss": 2.7479000091552734, + "step": 6973, + "token_acc": 0.3332185105076237 + }, + { + "epoch": 4.087950747581354, + "grad_norm": 0.26955341360066104, + "learning_rate": 0.00018724616706665222, + "loss": 2.7698910236358643, + "step": 6974, + "token_acc": 0.33168206763410774 + }, + { + "epoch": 4.0885370858985635, + "grad_norm": 0.2604521758871385, + "learning_rate": 0.00018724143030374713, + "loss": 2.7822229862213135, + "step": 6975, + "token_acc": 0.3290214994272534 + }, + { + "epoch": 4.089123424215773, + "grad_norm": 0.25591471146964134, + "learning_rate": 0.0001872366927213264, + "loss": 2.774961471557617, + "step": 6976, + "token_acc": 0.3283634468471819 + }, + { + "epoch": 4.089709762532982, + "grad_norm": 0.2669908508235807, + "learning_rate": 0.00018723195431943448, + "loss": 2.7355546951293945, + "step": 6977, + "token_acc": 0.33567018884599886 + }, + { + "epoch": 4.090296100850191, + "grad_norm": 0.25196534934350184, + "learning_rate": 0.0001872272150981159, + "loss": 2.7441518306732178, + "step": 6978, + "token_acc": 0.33401965323600696 + }, + { + "epoch": 4.0908824391674, + "grad_norm": 0.29341116461069644, + "learning_rate": 0.00018722247505741514, + "loss": 2.806053638458252, + "step": 6979, + "token_acc": 0.3239189841276395 + }, + { + "epoch": 4.091468777484609, + "grad_norm": 0.269197395566141, + "learning_rate": 0.0001872177341973768, + "loss": 2.7758655548095703, + "step": 6980, + "token_acc": 0.3302535503534142 + }, + { + "epoch": 4.092055115801817, + "grad_norm": 0.26560612438553594, + "learning_rate": 0.0001872129925180454, + "loss": 2.784797430038452, + "step": 6981, + "token_acc": 0.32876715945786805 + }, + { + "epoch": 4.092641454119026, + "grad_norm": 0.3366844470860098, + "learning_rate": 0.0001872082500194654, + "loss": 2.7610507011413574, + "step": 6982, + "token_acc": 0.3336090752459284 + }, + { + "epoch": 4.0932277924362355, + "grad_norm": 0.34951875845322655, + "learning_rate": 0.00018720350670168144, + "loss": 2.770429849624634, + "step": 6983, + "token_acc": 0.33193197264900937 + }, + { + "epoch": 4.093814130753445, + "grad_norm": 0.2637241933805337, + "learning_rate": 0.00018719876256473802, + "loss": 2.766040325164795, + "step": 6984, + "token_acc": 0.3314911507425239 + }, + { + "epoch": 4.094400469070654, + "grad_norm": 0.3162434060030911, + "learning_rate": 0.00018719401760867972, + "loss": 2.773322582244873, + "step": 6985, + "token_acc": 0.3310106870393112 + }, + { + "epoch": 4.094986807387863, + "grad_norm": 0.28569884990697536, + "learning_rate": 0.00018718927183355115, + "loss": 2.77522611618042, + "step": 6986, + "token_acc": 0.329738195528433 + }, + { + "epoch": 4.095573145705072, + "grad_norm": 0.25913497284786324, + "learning_rate": 0.00018718452523939683, + "loss": 2.724515914916992, + "step": 6987, + "token_acc": 0.3359277507638022 + }, + { + "epoch": 4.096159484022281, + "grad_norm": 0.27935033919243435, + "learning_rate": 0.0001871797778262614, + "loss": 2.7765698432922363, + "step": 6988, + "token_acc": 0.328036729317973 + }, + { + "epoch": 4.09674582233949, + "grad_norm": 0.2719017625940981, + "learning_rate": 0.0001871750295941894, + "loss": 2.7635622024536133, + "step": 6989, + "token_acc": 0.3317244464566759 + }, + { + "epoch": 4.097332160656699, + "grad_norm": 0.25469946765134416, + "learning_rate": 0.00018717028054322552, + "loss": 2.7829761505126953, + "step": 6990, + "token_acc": 0.32703086983748936 + }, + { + "epoch": 4.097918498973908, + "grad_norm": 0.24891389017139048, + "learning_rate": 0.00018716553067341427, + "loss": 2.7539639472961426, + "step": 6991, + "token_acc": 0.3325885245150684 + }, + { + "epoch": 4.098504837291117, + "grad_norm": 0.2585031128140516, + "learning_rate": 0.00018716077998480034, + "loss": 2.7908847332000732, + "step": 6992, + "token_acc": 0.32657609541367644 + }, + { + "epoch": 4.099091175608326, + "grad_norm": 0.27687698176399467, + "learning_rate": 0.00018715602847742835, + "loss": 2.7853105068206787, + "step": 6993, + "token_acc": 0.32809481168263027 + }, + { + "epoch": 4.099677513925535, + "grad_norm": 0.2571919110684222, + "learning_rate": 0.00018715127615134288, + "loss": 2.7836523056030273, + "step": 6994, + "token_acc": 0.32807189730150194 + }, + { + "epoch": 4.100263852242744, + "grad_norm": 0.2656065161465735, + "learning_rate": 0.00018714652300658863, + "loss": 2.7827649116516113, + "step": 6995, + "token_acc": 0.32866491766204053 + }, + { + "epoch": 4.100850190559953, + "grad_norm": 0.2628347208721259, + "learning_rate": 0.00018714176904321023, + "loss": 2.765716075897217, + "step": 6996, + "token_acc": 0.3299386874732345 + }, + { + "epoch": 4.101436528877162, + "grad_norm": 0.2710803609148174, + "learning_rate": 0.00018713701426125234, + "loss": 2.760533332824707, + "step": 6997, + "token_acc": 0.3320192728865528 + }, + { + "epoch": 4.102022867194371, + "grad_norm": 0.27170442463488476, + "learning_rate": 0.0001871322586607596, + "loss": 2.7863659858703613, + "step": 6998, + "token_acc": 0.3277534069062211 + }, + { + "epoch": 4.10260920551158, + "grad_norm": 0.3079328478046814, + "learning_rate": 0.00018712750224177672, + "loss": 2.7903757095336914, + "step": 6999, + "token_acc": 0.3267958857935452 + }, + { + "epoch": 4.1031955438287895, + "grad_norm": 0.3578986661606624, + "learning_rate": 0.00018712274500434835, + "loss": 2.7802817821502686, + "step": 7000, + "token_acc": 0.3291383989145183 + }, + { + "epoch": 4.103781882145999, + "grad_norm": 0.3120518322693373, + "learning_rate": 0.00018711798694851916, + "loss": 2.7869412899017334, + "step": 7001, + "token_acc": 0.3285645192752186 + }, + { + "epoch": 4.104368220463208, + "grad_norm": 0.27673372665174956, + "learning_rate": 0.00018711322807433392, + "loss": 2.7826361656188965, + "step": 7002, + "token_acc": 0.32803046296916116 + }, + { + "epoch": 4.104954558780416, + "grad_norm": 0.2701067293435275, + "learning_rate": 0.0001871084683818373, + "loss": 2.7830686569213867, + "step": 7003, + "token_acc": 0.3277646773680463 + }, + { + "epoch": 4.105540897097625, + "grad_norm": 0.3061785059713317, + "learning_rate": 0.00018710370787107397, + "loss": 2.790168285369873, + "step": 7004, + "token_acc": 0.32784299851509324 + }, + { + "epoch": 4.106127235414834, + "grad_norm": 0.26051961441749666, + "learning_rate": 0.00018709894654208868, + "loss": 2.827219247817993, + "step": 7005, + "token_acc": 0.32449667320106185 + }, + { + "epoch": 4.106713573732043, + "grad_norm": 0.3090480869904867, + "learning_rate": 0.00018709418439492615, + "loss": 2.7633821964263916, + "step": 7006, + "token_acc": 0.33233672449013 + }, + { + "epoch": 4.107299912049252, + "grad_norm": 0.2593030729150849, + "learning_rate": 0.00018708942142963114, + "loss": 2.7726902961730957, + "step": 7007, + "token_acc": 0.3313726369432128 + }, + { + "epoch": 4.1078862503664615, + "grad_norm": 0.27487392902951996, + "learning_rate": 0.00018708465764624834, + "loss": 2.807840347290039, + "step": 7008, + "token_acc": 0.3253213105705568 + }, + { + "epoch": 4.108472588683671, + "grad_norm": 0.2801469133155097, + "learning_rate": 0.00018707989304482254, + "loss": 2.759533405303955, + "step": 7009, + "token_acc": 0.33206257968987785 + }, + { + "epoch": 4.10905892700088, + "grad_norm": 0.25645991403593255, + "learning_rate": 0.0001870751276253985, + "loss": 2.8053030967712402, + "step": 7010, + "token_acc": 0.32579624767881155 + }, + { + "epoch": 4.109645265318089, + "grad_norm": 0.2746323135299995, + "learning_rate": 0.00018707036138802097, + "loss": 2.771796941757202, + "step": 7011, + "token_acc": 0.3321467927526204 + }, + { + "epoch": 4.110231603635298, + "grad_norm": 0.265106029438647, + "learning_rate": 0.0001870655943327347, + "loss": 2.7741711139678955, + "step": 7012, + "token_acc": 0.32933837902995683 + }, + { + "epoch": 4.110817941952506, + "grad_norm": 0.26188098833628304, + "learning_rate": 0.0001870608264595845, + "loss": 2.744208812713623, + "step": 7013, + "token_acc": 0.33597084879555544 + }, + { + "epoch": 4.111404280269715, + "grad_norm": 0.29865889116801675, + "learning_rate": 0.0001870560577686152, + "loss": 2.811337471008301, + "step": 7014, + "token_acc": 0.32440194285966034 + }, + { + "epoch": 4.111990618586924, + "grad_norm": 0.2820099505103101, + "learning_rate": 0.0001870512882598715, + "loss": 2.795687675476074, + "step": 7015, + "token_acc": 0.32662213688556246 + }, + { + "epoch": 4.1125769569041335, + "grad_norm": 0.2822595527425229, + "learning_rate": 0.00018704651793339828, + "loss": 2.7699217796325684, + "step": 7016, + "token_acc": 0.3312767175277295 + }, + { + "epoch": 4.113163295221343, + "grad_norm": 0.3725045013551885, + "learning_rate": 0.0001870417467892403, + "loss": 2.792863368988037, + "step": 7017, + "token_acc": 0.3268953378775965 + }, + { + "epoch": 4.113749633538552, + "grad_norm": 0.3206722410831035, + "learning_rate": 0.0001870369748274424, + "loss": 2.7744576930999756, + "step": 7018, + "token_acc": 0.327900455660708 + }, + { + "epoch": 4.114335971855761, + "grad_norm": 0.2653342546786799, + "learning_rate": 0.0001870322020480494, + "loss": 2.773062229156494, + "step": 7019, + "token_acc": 0.33105562902044666 + }, + { + "epoch": 4.11492231017297, + "grad_norm": 0.29228210617629496, + "learning_rate": 0.00018702742845110612, + "loss": 2.748831033706665, + "step": 7020, + "token_acc": 0.33241421331329857 + }, + { + "epoch": 4.115508648490179, + "grad_norm": 0.2707063354424606, + "learning_rate": 0.00018702265403665745, + "loss": 2.7499032020568848, + "step": 7021, + "token_acc": 0.3344534779669922 + }, + { + "epoch": 4.116094986807388, + "grad_norm": 0.28093681581677316, + "learning_rate": 0.00018701787880474823, + "loss": 2.767742156982422, + "step": 7022, + "token_acc": 0.3306166629524689 + }, + { + "epoch": 4.116681325124597, + "grad_norm": 0.24431384247063698, + "learning_rate": 0.00018701310275542326, + "loss": 2.745388984680176, + "step": 7023, + "token_acc": 0.3357383016779703 + }, + { + "epoch": 4.1172676634418055, + "grad_norm": 0.27872708923673, + "learning_rate": 0.00018700832588872744, + "loss": 2.790595769882202, + "step": 7024, + "token_acc": 0.3283110022521354 + }, + { + "epoch": 4.117854001759015, + "grad_norm": 0.2726547496097954, + "learning_rate": 0.00018700354820470568, + "loss": 2.706279754638672, + "step": 7025, + "token_acc": 0.3406320619504428 + }, + { + "epoch": 4.118440340076224, + "grad_norm": 0.3107360314112489, + "learning_rate": 0.00018699876970340278, + "loss": 2.786653518676758, + "step": 7026, + "token_acc": 0.32710481381325457 + }, + { + "epoch": 4.119026678393433, + "grad_norm": 0.3335487111517807, + "learning_rate": 0.00018699399038486368, + "loss": 2.801717758178711, + "step": 7027, + "token_acc": 0.3264239103098651 + }, + { + "epoch": 4.119613016710642, + "grad_norm": 0.27672355674194155, + "learning_rate": 0.0001869892102491333, + "loss": 2.7825257778167725, + "step": 7028, + "token_acc": 0.32844359496710895 + }, + { + "epoch": 4.120199355027851, + "grad_norm": 0.2765096560489484, + "learning_rate": 0.00018698442929625646, + "loss": 2.8233089447021484, + "step": 7029, + "token_acc": 0.32240019752171967 + }, + { + "epoch": 4.12078569334506, + "grad_norm": 0.3059329729677058, + "learning_rate": 0.00018697964752627816, + "loss": 2.7617125511169434, + "step": 7030, + "token_acc": 0.3316954522333451 + }, + { + "epoch": 4.121372031662269, + "grad_norm": 0.2710815583715179, + "learning_rate": 0.00018697486493924326, + "loss": 2.760650634765625, + "step": 7031, + "token_acc": 0.33190080739989997 + }, + { + "epoch": 4.121958369979478, + "grad_norm": 0.31203436338182305, + "learning_rate": 0.00018697008153519673, + "loss": 2.757140636444092, + "step": 7032, + "token_acc": 0.33304438915804635 + }, + { + "epoch": 4.1225447082966875, + "grad_norm": 0.33578580087262255, + "learning_rate": 0.00018696529731418343, + "loss": 2.7520766258239746, + "step": 7033, + "token_acc": 0.3345742932724167 + }, + { + "epoch": 4.123131046613897, + "grad_norm": 0.289357131677349, + "learning_rate": 0.00018696051227624837, + "loss": 2.7894434928894043, + "step": 7034, + "token_acc": 0.3264634008401748 + }, + { + "epoch": 4.123717384931105, + "grad_norm": 0.27819199448952175, + "learning_rate": 0.00018695572642143647, + "loss": 2.777121067047119, + "step": 7035, + "token_acc": 0.3277851542335577 + }, + { + "epoch": 4.124303723248314, + "grad_norm": 0.3009842204012059, + "learning_rate": 0.00018695093974979273, + "loss": 2.8255021572113037, + "step": 7036, + "token_acc": 0.32206532372479424 + }, + { + "epoch": 4.124890061565523, + "grad_norm": 0.30974712396712883, + "learning_rate": 0.00018694615226136206, + "loss": 2.759551525115967, + "step": 7037, + "token_acc": 0.33152108818030795 + }, + { + "epoch": 4.125476399882732, + "grad_norm": 0.2953854844520762, + "learning_rate": 0.00018694136395618944, + "loss": 2.7843942642211914, + "step": 7038, + "token_acc": 0.32956415072844664 + }, + { + "epoch": 4.126062738199941, + "grad_norm": 0.28009635260859733, + "learning_rate": 0.0001869365748343199, + "loss": 2.77339243888855, + "step": 7039, + "token_acc": 0.3305213550539104 + }, + { + "epoch": 4.12664907651715, + "grad_norm": 0.32887105735005084, + "learning_rate": 0.00018693178489579833, + "loss": 2.757084369659424, + "step": 7040, + "token_acc": 0.33373386423311424 + }, + { + "epoch": 4.1272354148343595, + "grad_norm": 0.3053424996870601, + "learning_rate": 0.00018692699414066982, + "loss": 2.768772840499878, + "step": 7041, + "token_acc": 0.33037440812785485 + }, + { + "epoch": 4.127821753151569, + "grad_norm": 0.25962555383164393, + "learning_rate": 0.0001869222025689793, + "loss": 2.825148105621338, + "step": 7042, + "token_acc": 0.321526552242924 + }, + { + "epoch": 4.128408091468778, + "grad_norm": 0.3101808558394585, + "learning_rate": 0.00018691741018077185, + "loss": 2.7873921394348145, + "step": 7043, + "token_acc": 0.32956636344841844 + }, + { + "epoch": 4.128994429785987, + "grad_norm": 0.35146631458379474, + "learning_rate": 0.0001869126169760924, + "loss": 2.7699458599090576, + "step": 7044, + "token_acc": 0.3312729667155207 + }, + { + "epoch": 4.129580768103196, + "grad_norm": 0.32041795317132327, + "learning_rate": 0.0001869078229549861, + "loss": 2.78788423538208, + "step": 7045, + "token_acc": 0.32715159249750037 + }, + { + "epoch": 4.130167106420404, + "grad_norm": 0.2951197907994588, + "learning_rate": 0.0001869030281174979, + "loss": 2.7604000568389893, + "step": 7046, + "token_acc": 0.3330716538058618 + }, + { + "epoch": 4.130753444737613, + "grad_norm": 0.32871960030887004, + "learning_rate": 0.0001868982324636728, + "loss": 2.7620162963867188, + "step": 7047, + "token_acc": 0.3314547276534344 + }, + { + "epoch": 4.131339783054822, + "grad_norm": 0.3301648556863799, + "learning_rate": 0.00018689343599355597, + "loss": 2.786294460296631, + "step": 7048, + "token_acc": 0.3275438922680549 + }, + { + "epoch": 4.1319261213720315, + "grad_norm": 0.2906726232084152, + "learning_rate": 0.00018688863870719238, + "loss": 2.8451647758483887, + "step": 7049, + "token_acc": 0.3210807841701215 + }, + { + "epoch": 4.132512459689241, + "grad_norm": 0.4210967496802527, + "learning_rate": 0.0001868838406046271, + "loss": 2.7999956607818604, + "step": 7050, + "token_acc": 0.32510103395106016 + }, + { + "epoch": 4.13309879800645, + "grad_norm": 0.2935659433205372, + "learning_rate": 0.00018687904168590523, + "loss": 2.8028855323791504, + "step": 7051, + "token_acc": 0.3267167102507736 + }, + { + "epoch": 4.133685136323659, + "grad_norm": 0.33054437937100584, + "learning_rate": 0.00018687424195107182, + "loss": 2.783092498779297, + "step": 7052, + "token_acc": 0.3278107338763298 + }, + { + "epoch": 4.134271474640868, + "grad_norm": 0.2634381035047187, + "learning_rate": 0.00018686944140017196, + "loss": 2.8198819160461426, + "step": 7053, + "token_acc": 0.3240459873422462 + }, + { + "epoch": 4.134857812958077, + "grad_norm": 0.36315860437658876, + "learning_rate": 0.00018686464003325078, + "loss": 2.7515907287597656, + "step": 7054, + "token_acc": 0.3351889375094307 + }, + { + "epoch": 4.135444151275286, + "grad_norm": 0.27387714814099007, + "learning_rate": 0.00018685983785035335, + "loss": 2.7970261573791504, + "step": 7055, + "token_acc": 0.3256825478294115 + }, + { + "epoch": 4.136030489592494, + "grad_norm": 0.3545461519717396, + "learning_rate": 0.00018685503485152478, + "loss": 2.843562602996826, + "step": 7056, + "token_acc": 0.31898898459566655 + }, + { + "epoch": 4.1366168279097035, + "grad_norm": 0.2874434061880429, + "learning_rate": 0.00018685023103681022, + "loss": 2.745453357696533, + "step": 7057, + "token_acc": 0.3346944932283193 + }, + { + "epoch": 4.137203166226913, + "grad_norm": 0.33855619322856323, + "learning_rate": 0.00018684542640625475, + "loss": 2.795011043548584, + "step": 7058, + "token_acc": 0.32832476815405065 + }, + { + "epoch": 4.137789504544122, + "grad_norm": 0.2669482462179286, + "learning_rate": 0.00018684062095990353, + "loss": 2.8409528732299805, + "step": 7059, + "token_acc": 0.32114618553623014 + }, + { + "epoch": 4.138375842861331, + "grad_norm": 0.2861106352799615, + "learning_rate": 0.0001868358146978017, + "loss": 2.8036770820617676, + "step": 7060, + "token_acc": 0.3253860287853887 + }, + { + "epoch": 4.13896218117854, + "grad_norm": 0.2449597642082394, + "learning_rate": 0.00018683100761999438, + "loss": 2.733694076538086, + "step": 7061, + "token_acc": 0.33732241219902853 + }, + { + "epoch": 4.139548519495749, + "grad_norm": 0.29058488972340873, + "learning_rate": 0.00018682619972652678, + "loss": 2.77097749710083, + "step": 7062, + "token_acc": 0.33256993774977767 + }, + { + "epoch": 4.140134857812958, + "grad_norm": 0.2662685747523196, + "learning_rate": 0.00018682139101744403, + "loss": 2.7678170204162598, + "step": 7063, + "token_acc": 0.3309265884012697 + }, + { + "epoch": 4.140721196130167, + "grad_norm": 0.29599596641628567, + "learning_rate": 0.0001868165814927913, + "loss": 2.8359062671661377, + "step": 7064, + "token_acc": 0.32174010260110836 + }, + { + "epoch": 4.141307534447376, + "grad_norm": 0.25726594615956855, + "learning_rate": 0.0001868117711526138, + "loss": 2.80977725982666, + "step": 7065, + "token_acc": 0.32590732536672784 + }, + { + "epoch": 4.1418938727645855, + "grad_norm": 0.28490856373810053, + "learning_rate": 0.00018680695999695663, + "loss": 2.783968687057495, + "step": 7066, + "token_acc": 0.3279909294688005 + }, + { + "epoch": 4.142480211081795, + "grad_norm": 0.2520894605068028, + "learning_rate": 0.00018680214802586508, + "loss": 2.7627320289611816, + "step": 7067, + "token_acc": 0.33140180161577576 + }, + { + "epoch": 4.143066549399003, + "grad_norm": 0.2783301064219893, + "learning_rate": 0.00018679733523938432, + "loss": 2.7848260402679443, + "step": 7068, + "token_acc": 0.328845948180744 + }, + { + "epoch": 4.143652887716212, + "grad_norm": 0.2658360803771178, + "learning_rate": 0.00018679252163755953, + "loss": 2.7919535636901855, + "step": 7069, + "token_acc": 0.3271269463371918 + }, + { + "epoch": 4.144239226033421, + "grad_norm": 0.24183446199370118, + "learning_rate": 0.000186787707220436, + "loss": 2.79420804977417, + "step": 7070, + "token_acc": 0.32717455206054796 + }, + { + "epoch": 4.14482556435063, + "grad_norm": 0.26054339657663267, + "learning_rate": 0.00018678289198805888, + "loss": 2.8057875633239746, + "step": 7071, + "token_acc": 0.32339594019566253 + }, + { + "epoch": 4.145411902667839, + "grad_norm": 0.2630474990463091, + "learning_rate": 0.00018677807594047344, + "loss": 2.773744583129883, + "step": 7072, + "token_acc": 0.33090469261454625 + }, + { + "epoch": 4.145998240985048, + "grad_norm": 0.2595537313094888, + "learning_rate": 0.0001867732590777249, + "loss": 2.764838695526123, + "step": 7073, + "token_acc": 0.33055181480056434 + }, + { + "epoch": 4.1465845793022575, + "grad_norm": 0.26794172093144475, + "learning_rate": 0.00018676844139985853, + "loss": 2.7939529418945312, + "step": 7074, + "token_acc": 0.32792273957955637 + }, + { + "epoch": 4.147170917619467, + "grad_norm": 0.24037160431563642, + "learning_rate": 0.00018676362290691957, + "loss": 2.7875704765319824, + "step": 7075, + "token_acc": 0.32778470299563056 + }, + { + "epoch": 4.147757255936676, + "grad_norm": 0.2729734541177915, + "learning_rate": 0.00018675880359895328, + "loss": 2.8028979301452637, + "step": 7076, + "token_acc": 0.32618341656263006 + }, + { + "epoch": 4.148343594253885, + "grad_norm": 0.2630554088208072, + "learning_rate": 0.00018675398347600496, + "loss": 2.809323787689209, + "step": 7077, + "token_acc": 0.32371711534055364 + }, + { + "epoch": 4.148929932571093, + "grad_norm": 0.2730631031382186, + "learning_rate": 0.00018674916253811981, + "loss": 2.768653392791748, + "step": 7078, + "token_acc": 0.33009128337353405 + }, + { + "epoch": 4.149516270888302, + "grad_norm": 0.26453828104729843, + "learning_rate": 0.00018674434078534325, + "loss": 2.7943623065948486, + "step": 7079, + "token_acc": 0.3277035462876876 + }, + { + "epoch": 4.150102609205511, + "grad_norm": 0.3340931310258282, + "learning_rate": 0.00018673951821772047, + "loss": 2.79005765914917, + "step": 7080, + "token_acc": 0.3275406506731599 + }, + { + "epoch": 4.15068894752272, + "grad_norm": 0.3506987576465481, + "learning_rate": 0.00018673469483529678, + "loss": 2.8223509788513184, + "step": 7081, + "token_acc": 0.32267923932814147 + }, + { + "epoch": 4.1512752858399296, + "grad_norm": 0.24428524079193453, + "learning_rate": 0.00018672987063811754, + "loss": 2.7531051635742188, + "step": 7082, + "token_acc": 0.335091730446931 + }, + { + "epoch": 4.151861624157139, + "grad_norm": 0.32776096009618627, + "learning_rate": 0.000186725045626228, + "loss": 2.8050754070281982, + "step": 7083, + "token_acc": 0.3240860176140168 + }, + { + "epoch": 4.152447962474348, + "grad_norm": 0.3156337991430243, + "learning_rate": 0.00018672021979967353, + "loss": 2.7837018966674805, + "step": 7084, + "token_acc": 0.3277810283039567 + }, + { + "epoch": 4.153034300791557, + "grad_norm": 0.27443673184627987, + "learning_rate": 0.00018671539315849947, + "loss": 2.791717052459717, + "step": 7085, + "token_acc": 0.3283897708450138 + }, + { + "epoch": 4.153620639108766, + "grad_norm": 0.25510788259141437, + "learning_rate": 0.00018671056570275114, + "loss": 2.813810348510742, + "step": 7086, + "token_acc": 0.3245557943826832 + }, + { + "epoch": 4.154206977425975, + "grad_norm": 0.26251494317763374, + "learning_rate": 0.00018670573743247387, + "loss": 2.7571256160736084, + "step": 7087, + "token_acc": 0.3326289915694603 + }, + { + "epoch": 4.154793315743184, + "grad_norm": 0.2344264704345542, + "learning_rate": 0.00018670090834771306, + "loss": 2.7841124534606934, + "step": 7088, + "token_acc": 0.3269944643438619 + }, + { + "epoch": 4.1553796540603924, + "grad_norm": 0.2553680099230396, + "learning_rate": 0.00018669607844851402, + "loss": 2.748168468475342, + "step": 7089, + "token_acc": 0.33395510574528264 + }, + { + "epoch": 4.155965992377602, + "grad_norm": 0.23980217953221, + "learning_rate": 0.00018669124773492218, + "loss": 2.7773807048797607, + "step": 7090, + "token_acc": 0.32837252565183267 + }, + { + "epoch": 4.156552330694811, + "grad_norm": 0.27065010800507466, + "learning_rate": 0.00018668641620698287, + "loss": 2.76983642578125, + "step": 7091, + "token_acc": 0.33109400840962894 + }, + { + "epoch": 4.15713866901202, + "grad_norm": 0.2591167377906633, + "learning_rate": 0.00018668158386474145, + "loss": 2.7897448539733887, + "step": 7092, + "token_acc": 0.32789453142394626 + }, + { + "epoch": 4.157725007329229, + "grad_norm": 0.2548083500599125, + "learning_rate": 0.0001866767507082434, + "loss": 2.7587451934814453, + "step": 7093, + "token_acc": 0.3326336643003703 + }, + { + "epoch": 4.158311345646438, + "grad_norm": 0.26800751481464313, + "learning_rate": 0.00018667191673753407, + "loss": 2.8164479732513428, + "step": 7094, + "token_acc": 0.3234671255519549 + }, + { + "epoch": 4.158897683963647, + "grad_norm": 0.2874489839603957, + "learning_rate": 0.00018666708195265889, + "loss": 2.812061071395874, + "step": 7095, + "token_acc": 0.32480085875228043 + }, + { + "epoch": 4.159484022280856, + "grad_norm": 0.31640798387122043, + "learning_rate": 0.00018666224635366323, + "loss": 2.825373649597168, + "step": 7096, + "token_acc": 0.32279993296681314 + }, + { + "epoch": 4.160070360598065, + "grad_norm": 0.259717065218652, + "learning_rate": 0.00018665740994059258, + "loss": 2.7896132469177246, + "step": 7097, + "token_acc": 0.32720886105724245 + }, + { + "epoch": 4.1606566989152745, + "grad_norm": 0.27521394865798493, + "learning_rate": 0.0001866525727134923, + "loss": 2.77710223197937, + "step": 7098, + "token_acc": 0.33024111527149763 + }, + { + "epoch": 4.161243037232484, + "grad_norm": 0.26344504062103585, + "learning_rate": 0.00018664773467240786, + "loss": 2.795780658721924, + "step": 7099, + "token_acc": 0.32529523306000235 + }, + { + "epoch": 4.161829375549692, + "grad_norm": 0.2903025543494628, + "learning_rate": 0.00018664289581738476, + "loss": 2.7618629932403564, + "step": 7100, + "token_acc": 0.33281112484740244 + }, + { + "epoch": 4.162415713866901, + "grad_norm": 0.27150845033650894, + "learning_rate": 0.00018663805614846837, + "loss": 2.7867631912231445, + "step": 7101, + "token_acc": 0.327508163900762 + }, + { + "epoch": 4.16300205218411, + "grad_norm": 0.2830386813054436, + "learning_rate": 0.00018663321566570417, + "loss": 2.7832565307617188, + "step": 7102, + "token_acc": 0.32921536659854417 + }, + { + "epoch": 4.163588390501319, + "grad_norm": 0.27133229843551504, + "learning_rate": 0.00018662837436913768, + "loss": 2.800172805786133, + "step": 7103, + "token_acc": 0.32652028632262403 + }, + { + "epoch": 4.164174728818528, + "grad_norm": 0.2942079592602994, + "learning_rate": 0.00018662353225881435, + "loss": 2.800168514251709, + "step": 7104, + "token_acc": 0.32632758936706574 + }, + { + "epoch": 4.164761067135737, + "grad_norm": 0.29979963131332366, + "learning_rate": 0.00018661868933477963, + "loss": 2.8138575553894043, + "step": 7105, + "token_acc": 0.32249482707655924 + }, + { + "epoch": 4.1653474054529465, + "grad_norm": 0.33770100558570276, + "learning_rate": 0.0001866138455970791, + "loss": 2.7790517807006836, + "step": 7106, + "token_acc": 0.3301621644275525 + }, + { + "epoch": 4.165933743770156, + "grad_norm": 0.34295619293043955, + "learning_rate": 0.00018660900104575814, + "loss": 2.7915194034576416, + "step": 7107, + "token_acc": 0.3272568144007191 + }, + { + "epoch": 4.166520082087365, + "grad_norm": 0.2890581748443234, + "learning_rate": 0.00018660415568086234, + "loss": 2.779521942138672, + "step": 7108, + "token_acc": 0.33012738000844394 + }, + { + "epoch": 4.167106420404574, + "grad_norm": 0.2713896993391161, + "learning_rate": 0.0001865993095024372, + "loss": 2.782832145690918, + "step": 7109, + "token_acc": 0.32854373302368883 + }, + { + "epoch": 4.167692758721783, + "grad_norm": 0.25793955423588083, + "learning_rate": 0.00018659446251052824, + "loss": 2.7800774574279785, + "step": 7110, + "token_acc": 0.32776205535056324 + }, + { + "epoch": 4.168279097038991, + "grad_norm": 0.26444740157657376, + "learning_rate": 0.00018658961470518102, + "loss": 2.791886806488037, + "step": 7111, + "token_acc": 0.3275825984502213 + }, + { + "epoch": 4.1688654353562, + "grad_norm": 0.2607246005357545, + "learning_rate": 0.00018658476608644103, + "loss": 2.7907450199127197, + "step": 7112, + "token_acc": 0.32789147049594 + }, + { + "epoch": 4.169451773673409, + "grad_norm": 0.2673980628771356, + "learning_rate": 0.00018657991665435385, + "loss": 2.7650692462921143, + "step": 7113, + "token_acc": 0.33089376977046525 + }, + { + "epoch": 4.1700381119906185, + "grad_norm": 0.26173272002760334, + "learning_rate": 0.00018657506640896502, + "loss": 2.8083853721618652, + "step": 7114, + "token_acc": 0.3251694921621615 + }, + { + "epoch": 4.170624450307828, + "grad_norm": 0.2623387739758055, + "learning_rate": 0.0001865702153503201, + "loss": 2.7661795616149902, + "step": 7115, + "token_acc": 0.33181971427367357 + }, + { + "epoch": 4.171210788625037, + "grad_norm": 0.25236774338551904, + "learning_rate": 0.00018656536347846464, + "loss": 2.7927117347717285, + "step": 7116, + "token_acc": 0.3270040818101279 + }, + { + "epoch": 4.171797126942246, + "grad_norm": 0.2731925456740826, + "learning_rate": 0.00018656051079344425, + "loss": 2.803333044052124, + "step": 7117, + "token_acc": 0.3250152650929547 + }, + { + "epoch": 4.172383465259455, + "grad_norm": 0.40323974685428443, + "learning_rate": 0.0001865556572953045, + "loss": 2.806036949157715, + "step": 7118, + "token_acc": 0.3261204906435175 + }, + { + "epoch": 4.172969803576664, + "grad_norm": 0.4240528933155089, + "learning_rate": 0.000186550802984091, + "loss": 2.8030807971954346, + "step": 7119, + "token_acc": 0.3260309835994634 + }, + { + "epoch": 4.173556141893873, + "grad_norm": 0.26345208945850634, + "learning_rate": 0.00018654594785984932, + "loss": 2.8113174438476562, + "step": 7120, + "token_acc": 0.32495721338167743 + }, + { + "epoch": 4.174142480211081, + "grad_norm": 0.3025642129724335, + "learning_rate": 0.0001865410919226251, + "loss": 2.801701068878174, + "step": 7121, + "token_acc": 0.32611378359309856 + }, + { + "epoch": 4.1747288185282905, + "grad_norm": 0.27291186222328934, + "learning_rate": 0.0001865362351724639, + "loss": 2.759350299835205, + "step": 7122, + "token_acc": 0.33252460708152326 + }, + { + "epoch": 4.1753151568455, + "grad_norm": 0.32262170926153777, + "learning_rate": 0.0001865313776094114, + "loss": 2.805788040161133, + "step": 7123, + "token_acc": 0.32526669558808924 + }, + { + "epoch": 4.175901495162709, + "grad_norm": 0.27277281305017403, + "learning_rate": 0.00018652651923351324, + "loss": 2.7759857177734375, + "step": 7124, + "token_acc": 0.33016439653249025 + }, + { + "epoch": 4.176487833479918, + "grad_norm": 0.32194328171796716, + "learning_rate": 0.00018652166004481499, + "loss": 2.8123388290405273, + "step": 7125, + "token_acc": 0.32509168434648644 + }, + { + "epoch": 4.177074171797127, + "grad_norm": 0.31170966719355153, + "learning_rate": 0.00018651680004336236, + "loss": 2.7830138206481934, + "step": 7126, + "token_acc": 0.32659967105522025 + }, + { + "epoch": 4.177660510114336, + "grad_norm": 0.30129830320334333, + "learning_rate": 0.00018651193922920097, + "loss": 2.765509605407715, + "step": 7127, + "token_acc": 0.3314838423175852 + }, + { + "epoch": 4.178246848431545, + "grad_norm": 0.31861687277751904, + "learning_rate": 0.00018650707760237646, + "loss": 2.8066015243530273, + "step": 7128, + "token_acc": 0.3234356789983395 + }, + { + "epoch": 4.178833186748754, + "grad_norm": 0.29318449613580966, + "learning_rate": 0.00018650221516293455, + "loss": 2.73773455619812, + "step": 7129, + "token_acc": 0.3358902243756368 + }, + { + "epoch": 4.179419525065963, + "grad_norm": 0.3370932323580013, + "learning_rate": 0.00018649735191092088, + "loss": 2.776411533355713, + "step": 7130, + "token_acc": 0.3299775805246265 + }, + { + "epoch": 4.1800058633831725, + "grad_norm": 0.2710712040606177, + "learning_rate": 0.00018649248784638115, + "loss": 2.7904844284057617, + "step": 7131, + "token_acc": 0.32678880251889036 + }, + { + "epoch": 4.180592201700381, + "grad_norm": 0.3019398458232423, + "learning_rate": 0.00018648762296936104, + "loss": 2.808136463165283, + "step": 7132, + "token_acc": 0.3249455882278679 + }, + { + "epoch": 4.18117854001759, + "grad_norm": 0.26223806080520057, + "learning_rate": 0.00018648275727990628, + "loss": 2.79612135887146, + "step": 7133, + "token_acc": 0.32764721988314366 + }, + { + "epoch": 4.181764878334799, + "grad_norm": 0.27790552792001005, + "learning_rate": 0.00018647789077806253, + "loss": 2.7412309646606445, + "step": 7134, + "token_acc": 0.33470452285961444 + }, + { + "epoch": 4.182351216652008, + "grad_norm": 0.26892586305851907, + "learning_rate": 0.00018647302346387555, + "loss": 2.799913167953491, + "step": 7135, + "token_acc": 0.3247402396024174 + }, + { + "epoch": 4.182937554969217, + "grad_norm": 0.30003085111530087, + "learning_rate": 0.000186468155337391, + "loss": 2.7498412132263184, + "step": 7136, + "token_acc": 0.33326342281879195 + }, + { + "epoch": 4.183523893286426, + "grad_norm": 0.2754543435671535, + "learning_rate": 0.00018646328639865468, + "loss": 2.8077759742736816, + "step": 7137, + "token_acc": 0.32511101394845715 + }, + { + "epoch": 4.184110231603635, + "grad_norm": 0.3213513602377404, + "learning_rate": 0.00018645841664771226, + "loss": 2.797866106033325, + "step": 7138, + "token_acc": 0.3261113794474525 + }, + { + "epoch": 4.1846965699208445, + "grad_norm": 0.29160090776598685, + "learning_rate": 0.00018645354608460958, + "loss": 2.765488624572754, + "step": 7139, + "token_acc": 0.3322875692088821 + }, + { + "epoch": 4.185282908238054, + "grad_norm": 0.2781717504488947, + "learning_rate": 0.00018644867470939228, + "loss": 2.801461696624756, + "step": 7140, + "token_acc": 0.3251454172838077 + }, + { + "epoch": 4.185869246555263, + "grad_norm": 0.25709256601189306, + "learning_rate": 0.00018644380252210617, + "loss": 2.7947957515716553, + "step": 7141, + "token_acc": 0.3273180992696534 + }, + { + "epoch": 4.186455584872472, + "grad_norm": 0.3013285944673012, + "learning_rate": 0.00018643892952279704, + "loss": 2.7660555839538574, + "step": 7142, + "token_acc": 0.3319426296132438 + }, + { + "epoch": 4.18704192318968, + "grad_norm": 0.25667518933292005, + "learning_rate": 0.00018643405571151065, + "loss": 2.800262451171875, + "step": 7143, + "token_acc": 0.3261171906452439 + }, + { + "epoch": 4.187628261506889, + "grad_norm": 0.2966746727434648, + "learning_rate": 0.00018642918108829277, + "loss": 2.758972644805908, + "step": 7144, + "token_acc": 0.3331086964752205 + }, + { + "epoch": 4.188214599824098, + "grad_norm": 0.25814810188244625, + "learning_rate": 0.0001864243056531892, + "loss": 2.8075592517852783, + "step": 7145, + "token_acc": 0.32473101791510134 + }, + { + "epoch": 4.188800938141307, + "grad_norm": 0.26175778519986925, + "learning_rate": 0.00018641942940624576, + "loss": 2.7754759788513184, + "step": 7146, + "token_acc": 0.3302482488805841 + }, + { + "epoch": 4.1893872764585165, + "grad_norm": 0.2971018037879146, + "learning_rate": 0.00018641455234750818, + "loss": 2.837045192718506, + "step": 7147, + "token_acc": 0.3199036372128917 + }, + { + "epoch": 4.189973614775726, + "grad_norm": 0.26381660353703046, + "learning_rate": 0.00018640967447702237, + "loss": 2.773775577545166, + "step": 7148, + "token_acc": 0.3313390138115599 + }, + { + "epoch": 4.190559953092935, + "grad_norm": 0.2689216620119575, + "learning_rate": 0.00018640479579483407, + "loss": 2.7813847064971924, + "step": 7149, + "token_acc": 0.3274797618251154 + }, + { + "epoch": 4.191146291410144, + "grad_norm": 0.27646580120974623, + "learning_rate": 0.00018639991630098917, + "loss": 2.797849655151367, + "step": 7150, + "token_acc": 0.32580738170133955 + }, + { + "epoch": 4.191732629727353, + "grad_norm": 0.30033542856581474, + "learning_rate": 0.0001863950359955335, + "loss": 2.8148417472839355, + "step": 7151, + "token_acc": 0.32331430004336037 + }, + { + "epoch": 4.192318968044562, + "grad_norm": 0.25430068792565647, + "learning_rate": 0.00018639015487851283, + "loss": 2.7596330642700195, + "step": 7152, + "token_acc": 0.33350099038651 + }, + { + "epoch": 4.192905306361771, + "grad_norm": 0.3061969566003337, + "learning_rate": 0.00018638527294997313, + "loss": 2.8368749618530273, + "step": 7153, + "token_acc": 0.32023378895051274 + }, + { + "epoch": 4.193491644678979, + "grad_norm": 0.29360521065985223, + "learning_rate": 0.00018638039020996017, + "loss": 2.8334736824035645, + "step": 7154, + "token_acc": 0.3222612657836815 + }, + { + "epoch": 4.1940779829961885, + "grad_norm": 0.26469594283487363, + "learning_rate": 0.00018637550665851982, + "loss": 2.7973012924194336, + "step": 7155, + "token_acc": 0.32807673110388647 + }, + { + "epoch": 4.194664321313398, + "grad_norm": 0.3395100447109936, + "learning_rate": 0.000186370622295698, + "loss": 2.798107862472534, + "step": 7156, + "token_acc": 0.32652112847911813 + }, + { + "epoch": 4.195250659630607, + "grad_norm": 0.2584639905947706, + "learning_rate": 0.00018636573712154059, + "loss": 2.7745542526245117, + "step": 7157, + "token_acc": 0.32972449402041887 + }, + { + "epoch": 4.195836997947816, + "grad_norm": 0.3287655019874536, + "learning_rate": 0.00018636085113609343, + "loss": 2.75933837890625, + "step": 7158, + "token_acc": 0.33145545844348034 + }, + { + "epoch": 4.196423336265025, + "grad_norm": 0.37963036731744526, + "learning_rate": 0.00018635596433940244, + "loss": 2.8011245727539062, + "step": 7159, + "token_acc": 0.3259422999765324 + }, + { + "epoch": 4.197009674582234, + "grad_norm": 0.2791200974416275, + "learning_rate": 0.00018635107673151352, + "loss": 2.7943596839904785, + "step": 7160, + "token_acc": 0.32793067331933323 + }, + { + "epoch": 4.197596012899443, + "grad_norm": 0.3190420515211332, + "learning_rate": 0.00018634618831247262, + "loss": 2.7719130516052246, + "step": 7161, + "token_acc": 0.3298731796526338 + }, + { + "epoch": 4.198182351216652, + "grad_norm": 0.29710887785827383, + "learning_rate": 0.00018634129908232566, + "loss": 2.7999250888824463, + "step": 7162, + "token_acc": 0.32671175540466924 + }, + { + "epoch": 4.198768689533861, + "grad_norm": 0.34573787991532473, + "learning_rate": 0.00018633640904111852, + "loss": 2.755390167236328, + "step": 7163, + "token_acc": 0.3320836998336177 + }, + { + "epoch": 4.19935502785107, + "grad_norm": 0.28669619273580593, + "learning_rate": 0.00018633151818889712, + "loss": 2.805746078491211, + "step": 7164, + "token_acc": 0.3246945907466223 + }, + { + "epoch": 4.199941366168279, + "grad_norm": 0.3486843824690745, + "learning_rate": 0.00018632662652570749, + "loss": 2.7802281379699707, + "step": 7165, + "token_acc": 0.3293920071374216 + }, + { + "epoch": 4.200527704485488, + "grad_norm": 0.33456941539566115, + "learning_rate": 0.0001863217340515955, + "loss": 2.775449275970459, + "step": 7166, + "token_acc": 0.3302888286248387 + }, + { + "epoch": 4.201114042802697, + "grad_norm": 0.3177513995503537, + "learning_rate": 0.00018631684076660717, + "loss": 2.775282859802246, + "step": 7167, + "token_acc": 0.32994647999052196 + }, + { + "epoch": 4.201700381119906, + "grad_norm": 0.3074760862911338, + "learning_rate": 0.00018631194667078838, + "loss": 2.8411147594451904, + "step": 7168, + "token_acc": 0.32009838545658564 + }, + { + "epoch": 4.202286719437115, + "grad_norm": 0.28596671519668726, + "learning_rate": 0.0001863070517641852, + "loss": 2.749114513397217, + "step": 7169, + "token_acc": 0.3348188834945138 + }, + { + "epoch": 4.202873057754324, + "grad_norm": 0.34823567201266376, + "learning_rate": 0.00018630215604684356, + "loss": 2.768881320953369, + "step": 7170, + "token_acc": 0.33090829604484906 + }, + { + "epoch": 4.203459396071533, + "grad_norm": 0.24434087076948205, + "learning_rate": 0.00018629725951880945, + "loss": 2.7958014011383057, + "step": 7171, + "token_acc": 0.32703188216453133 + }, + { + "epoch": 4.2040457343887425, + "grad_norm": 0.29725275208560054, + "learning_rate": 0.00018629236218012886, + "loss": 2.7710518836975098, + "step": 7172, + "token_acc": 0.3319634872141823 + }, + { + "epoch": 4.204632072705952, + "grad_norm": 0.25219867571094534, + "learning_rate": 0.00018628746403084786, + "loss": 2.810288906097412, + "step": 7173, + "token_acc": 0.3252672054744146 + }, + { + "epoch": 4.205218411023161, + "grad_norm": 0.26667411249406076, + "learning_rate": 0.00018628256507101235, + "loss": 2.8047566413879395, + "step": 7174, + "token_acc": 0.32448908936733883 + }, + { + "epoch": 4.205804749340369, + "grad_norm": 0.26546597053969334, + "learning_rate": 0.00018627766530066847, + "loss": 2.778273105621338, + "step": 7175, + "token_acc": 0.3297123247701258 + }, + { + "epoch": 4.206391087657578, + "grad_norm": 0.24790662637772332, + "learning_rate": 0.00018627276471986214, + "loss": 2.7851388454437256, + "step": 7176, + "token_acc": 0.3281740470489675 + }, + { + "epoch": 4.206977425974787, + "grad_norm": 0.2624436848159625, + "learning_rate": 0.00018626786332863942, + "loss": 2.7707605361938477, + "step": 7177, + "token_acc": 0.3303622452013953 + }, + { + "epoch": 4.207563764291996, + "grad_norm": 0.24880874356177782, + "learning_rate": 0.0001862629611270464, + "loss": 2.7740938663482666, + "step": 7178, + "token_acc": 0.3298171950793904 + }, + { + "epoch": 4.208150102609205, + "grad_norm": 0.25626723427079173, + "learning_rate": 0.0001862580581151291, + "loss": 2.816868782043457, + "step": 7179, + "token_acc": 0.32295718813266483 + }, + { + "epoch": 4.2087364409264145, + "grad_norm": 0.23242575522505876, + "learning_rate": 0.0001862531542929336, + "loss": 2.7931060791015625, + "step": 7180, + "token_acc": 0.32802569041658763 + }, + { + "epoch": 4.209322779243624, + "grad_norm": 0.27803035382476776, + "learning_rate": 0.00018624824966050594, + "loss": 2.7752556800842285, + "step": 7181, + "token_acc": 0.32841583807689056 + }, + { + "epoch": 4.209909117560833, + "grad_norm": 0.26622353642183966, + "learning_rate": 0.00018624334421789217, + "loss": 2.7808239459991455, + "step": 7182, + "token_acc": 0.33086606999444407 + }, + { + "epoch": 4.210495455878042, + "grad_norm": 0.267544186442729, + "learning_rate": 0.00018623843796513841, + "loss": 2.799476146697998, + "step": 7183, + "token_acc": 0.3260908253828238 + }, + { + "epoch": 4.211081794195251, + "grad_norm": 0.32001193414538076, + "learning_rate": 0.00018623353090229072, + "loss": 2.7878870964050293, + "step": 7184, + "token_acc": 0.3279342931240509 + }, + { + "epoch": 4.21166813251246, + "grad_norm": 0.2954581995548513, + "learning_rate": 0.0001862286230293952, + "loss": 2.8171629905700684, + "step": 7185, + "token_acc": 0.3227697935719885 + }, + { + "epoch": 4.212254470829668, + "grad_norm": 0.3089045434576776, + "learning_rate": 0.00018622371434649798, + "loss": 2.8179779052734375, + "step": 7186, + "token_acc": 0.3237001465956799 + }, + { + "epoch": 4.212840809146877, + "grad_norm": 0.413154502755599, + "learning_rate": 0.00018621880485364517, + "loss": 2.820302963256836, + "step": 7187, + "token_acc": 0.3229706804302725 + }, + { + "epoch": 4.2134271474640865, + "grad_norm": 0.3036522662385077, + "learning_rate": 0.00018621389455088285, + "loss": 2.872323513031006, + "step": 7188, + "token_acc": 0.3141758138170849 + }, + { + "epoch": 4.214013485781296, + "grad_norm": 0.28155944066202293, + "learning_rate": 0.00018620898343825717, + "loss": 2.766209125518799, + "step": 7189, + "token_acc": 0.3312721272607853 + }, + { + "epoch": 4.214599824098505, + "grad_norm": 0.3016276611269739, + "learning_rate": 0.00018620407151581425, + "loss": 2.8263587951660156, + "step": 7190, + "token_acc": 0.3210977354143723 + }, + { + "epoch": 4.215186162415714, + "grad_norm": 0.2517209729682834, + "learning_rate": 0.00018619915878360024, + "loss": 2.751180648803711, + "step": 7191, + "token_acc": 0.3344086383992822 + }, + { + "epoch": 4.215772500732923, + "grad_norm": 0.30602552379123477, + "learning_rate": 0.00018619424524166127, + "loss": 2.7748265266418457, + "step": 7192, + "token_acc": 0.32968820513343516 + }, + { + "epoch": 4.216358839050132, + "grad_norm": 0.26846420456726744, + "learning_rate": 0.00018618933089004354, + "loss": 2.7991480827331543, + "step": 7193, + "token_acc": 0.32640017723599435 + }, + { + "epoch": 4.216945177367341, + "grad_norm": 0.27349423413890933, + "learning_rate": 0.00018618441572879318, + "loss": 2.759122371673584, + "step": 7194, + "token_acc": 0.33251756962971424 + }, + { + "epoch": 4.21753151568455, + "grad_norm": 0.29711759770444435, + "learning_rate": 0.00018617949975795637, + "loss": 2.8319473266601562, + "step": 7195, + "token_acc": 0.32171724992257666 + }, + { + "epoch": 4.218117854001759, + "grad_norm": 0.24924275123868372, + "learning_rate": 0.0001861745829775793, + "loss": 2.786811113357544, + "step": 7196, + "token_acc": 0.3278883653013114 + }, + { + "epoch": 4.218704192318968, + "grad_norm": 0.3314997228043431, + "learning_rate": 0.00018616966538770812, + "loss": 2.8435819149017334, + "step": 7197, + "token_acc": 0.3184776167927803 + }, + { + "epoch": 4.219290530636177, + "grad_norm": 0.24426536008233282, + "learning_rate": 0.00018616474698838906, + "loss": 2.7798056602478027, + "step": 7198, + "token_acc": 0.3280384656061454 + }, + { + "epoch": 4.219876868953386, + "grad_norm": 0.28261368987984764, + "learning_rate": 0.00018615982777966833, + "loss": 2.8509159088134766, + "step": 7199, + "token_acc": 0.32023487619956525 + }, + { + "epoch": 4.220463207270595, + "grad_norm": 0.259177407666592, + "learning_rate": 0.0001861549077615921, + "loss": 2.8020074367523193, + "step": 7200, + "token_acc": 0.3258015560764421 + }, + { + "epoch": 4.221049545587804, + "grad_norm": 0.2784499628625681, + "learning_rate": 0.0001861499869342066, + "loss": 2.786154270172119, + "step": 7201, + "token_acc": 0.3271836673446037 + }, + { + "epoch": 4.221635883905013, + "grad_norm": 0.2770141097004256, + "learning_rate": 0.00018614506529755808, + "loss": 2.827462911605835, + "step": 7202, + "token_acc": 0.32255528976526904 + }, + { + "epoch": 4.222222222222222, + "grad_norm": 0.2665143154133402, + "learning_rate": 0.00018614014285169274, + "loss": 2.839615821838379, + "step": 7203, + "token_acc": 0.3204658398191819 + }, + { + "epoch": 4.222808560539431, + "grad_norm": 0.3090169036702747, + "learning_rate": 0.00018613521959665686, + "loss": 2.7768468856811523, + "step": 7204, + "token_acc": 0.329127732103469 + }, + { + "epoch": 4.2233948988566405, + "grad_norm": 0.2612189611700686, + "learning_rate": 0.00018613029553249662, + "loss": 2.798933506011963, + "step": 7205, + "token_acc": 0.3266280567650722 + }, + { + "epoch": 4.22398123717385, + "grad_norm": 0.2633362058115629, + "learning_rate": 0.00018612537065925832, + "loss": 2.8057100772857666, + "step": 7206, + "token_acc": 0.3245074237816565 + }, + { + "epoch": 4.224567575491059, + "grad_norm": 0.2582251005796516, + "learning_rate": 0.00018612044497698824, + "loss": 2.784407138824463, + "step": 7207, + "token_acc": 0.3286531515272696 + }, + { + "epoch": 4.225153913808267, + "grad_norm": 0.2570518390077076, + "learning_rate": 0.0001861155184857326, + "loss": 2.789052963256836, + "step": 7208, + "token_acc": 0.32912614715520716 + }, + { + "epoch": 4.225740252125476, + "grad_norm": 0.24467996896239622, + "learning_rate": 0.00018611059118553774, + "loss": 2.810938596725464, + "step": 7209, + "token_acc": 0.32572292612829024 + }, + { + "epoch": 4.226326590442685, + "grad_norm": 0.29150848679983293, + "learning_rate": 0.0001861056630764499, + "loss": 2.8374547958374023, + "step": 7210, + "token_acc": 0.32059990991868975 + }, + { + "epoch": 4.226912928759894, + "grad_norm": 0.26915418106421807, + "learning_rate": 0.0001861007341585154, + "loss": 2.7820582389831543, + "step": 7211, + "token_acc": 0.32779640234278185 + }, + { + "epoch": 4.227499267077103, + "grad_norm": 0.2565536197103172, + "learning_rate": 0.0001860958044317805, + "loss": 2.828176498413086, + "step": 7212, + "token_acc": 0.32188629471147856 + }, + { + "epoch": 4.2280856053943126, + "grad_norm": 0.2410132143674873, + "learning_rate": 0.00018609087389629154, + "loss": 2.8507637977600098, + "step": 7213, + "token_acc": 0.3179332989661924 + }, + { + "epoch": 4.228671943711522, + "grad_norm": 0.2706676227223853, + "learning_rate": 0.00018608594255209484, + "loss": 2.782197952270508, + "step": 7214, + "token_acc": 0.32843443872935457 + }, + { + "epoch": 4.229258282028731, + "grad_norm": 0.30932351221382404, + "learning_rate": 0.00018608101039923667, + "loss": 2.826585531234741, + "step": 7215, + "token_acc": 0.3223195312480446 + }, + { + "epoch": 4.22984462034594, + "grad_norm": 0.35800619903714426, + "learning_rate": 0.00018607607743776345, + "loss": 2.7740697860717773, + "step": 7216, + "token_acc": 0.32999056668539767 + }, + { + "epoch": 4.230430958663149, + "grad_norm": 0.2669117455777673, + "learning_rate": 0.00018607114366772144, + "loss": 2.82993221282959, + "step": 7217, + "token_acc": 0.3216169075052518 + }, + { + "epoch": 4.231017296980358, + "grad_norm": 0.27008272214236756, + "learning_rate": 0.00018606620908915704, + "loss": 2.777475357055664, + "step": 7218, + "token_acc": 0.3301514410825541 + }, + { + "epoch": 4.231603635297566, + "grad_norm": 0.327636506943128, + "learning_rate": 0.00018606127370211656, + "loss": 2.7887344360351562, + "step": 7219, + "token_acc": 0.3277534730023452 + }, + { + "epoch": 4.2321899736147754, + "grad_norm": 0.25232826421323556, + "learning_rate": 0.00018605633750664642, + "loss": 2.766155242919922, + "step": 7220, + "token_acc": 0.3300219960100261 + }, + { + "epoch": 4.232776311931985, + "grad_norm": 0.29311034226063165, + "learning_rate": 0.00018605140050279292, + "loss": 2.773897647857666, + "step": 7221, + "token_acc": 0.33007722279781926 + }, + { + "epoch": 4.233362650249194, + "grad_norm": 0.2708945256233776, + "learning_rate": 0.00018604646269060248, + "loss": 2.8470466136932373, + "step": 7222, + "token_acc": 0.319114115354783 + }, + { + "epoch": 4.233948988566403, + "grad_norm": 0.2642143483572943, + "learning_rate": 0.00018604152407012146, + "loss": 2.7781643867492676, + "step": 7223, + "token_acc": 0.32962308847479854 + }, + { + "epoch": 4.234535326883612, + "grad_norm": 0.33031745855416683, + "learning_rate": 0.0001860365846413963, + "loss": 2.8202686309814453, + "step": 7224, + "token_acc": 0.3222385777740985 + }, + { + "epoch": 4.235121665200821, + "grad_norm": 0.2601389423295867, + "learning_rate": 0.00018603164440447333, + "loss": 2.8353850841522217, + "step": 7225, + "token_acc": 0.3215255722264812 + }, + { + "epoch": 4.23570800351803, + "grad_norm": 0.3191855904121981, + "learning_rate": 0.000186026703359399, + "loss": 2.8178486824035645, + "step": 7226, + "token_acc": 0.32175634001335296 + }, + { + "epoch": 4.236294341835239, + "grad_norm": 0.2818438012315105, + "learning_rate": 0.00018602176150621968, + "loss": 2.815585136413574, + "step": 7227, + "token_acc": 0.3240985656882833 + }, + { + "epoch": 4.236880680152448, + "grad_norm": 0.29816872044435727, + "learning_rate": 0.00018601681884498184, + "loss": 2.7867541313171387, + "step": 7228, + "token_acc": 0.3271028289857578 + }, + { + "epoch": 4.237467018469657, + "grad_norm": 0.333861646056676, + "learning_rate": 0.00018601187537573192, + "loss": 2.79691219329834, + "step": 7229, + "token_acc": 0.3261013303695242 + }, + { + "epoch": 4.238053356786866, + "grad_norm": 0.26166243408199225, + "learning_rate": 0.00018600693109851633, + "loss": 2.798318862915039, + "step": 7230, + "token_acc": 0.32501832442030004 + }, + { + "epoch": 4.238639695104075, + "grad_norm": 0.2868860559254532, + "learning_rate": 0.00018600198601338152, + "loss": 2.8321170806884766, + "step": 7231, + "token_acc": 0.3216889179026253 + }, + { + "epoch": 4.239226033421284, + "grad_norm": 0.28229931034310796, + "learning_rate": 0.00018599704012037393, + "loss": 2.828172206878662, + "step": 7232, + "token_acc": 0.3211865380197565 + }, + { + "epoch": 4.239812371738493, + "grad_norm": 0.2908369324851455, + "learning_rate": 0.00018599209341954, + "loss": 2.8463354110717773, + "step": 7233, + "token_acc": 0.31912504623014765 + }, + { + "epoch": 4.240398710055702, + "grad_norm": 0.26941372844602013, + "learning_rate": 0.00018598714591092628, + "loss": 2.7919979095458984, + "step": 7234, + "token_acc": 0.32792464441650354 + }, + { + "epoch": 4.240985048372911, + "grad_norm": 0.29940514025789516, + "learning_rate": 0.0001859821975945792, + "loss": 2.8361620903015137, + "step": 7235, + "token_acc": 0.3199322057459373 + }, + { + "epoch": 4.24157138669012, + "grad_norm": 0.2615773277327321, + "learning_rate": 0.00018597724847054518, + "loss": 2.7836971282958984, + "step": 7236, + "token_acc": 0.3276175636605546 + }, + { + "epoch": 4.2421577250073295, + "grad_norm": 0.33595114121691927, + "learning_rate": 0.0001859722985388708, + "loss": 2.817362070083618, + "step": 7237, + "token_acc": 0.32363001103346817 + }, + { + "epoch": 4.242744063324539, + "grad_norm": 0.27727074988570233, + "learning_rate": 0.00018596734779960253, + "loss": 2.8033151626586914, + "step": 7238, + "token_acc": 0.32579815387221067 + }, + { + "epoch": 4.243330401641748, + "grad_norm": 0.3036697913217882, + "learning_rate": 0.00018596239625278687, + "loss": 2.775413751602173, + "step": 7239, + "token_acc": 0.33128635202712853 + }, + { + "epoch": 4.243916739958956, + "grad_norm": 0.31217994949411854, + "learning_rate": 0.00018595744389847033, + "loss": 2.8296804428100586, + "step": 7240, + "token_acc": 0.3213631723087394 + }, + { + "epoch": 4.244503078276165, + "grad_norm": 0.29791173651230707, + "learning_rate": 0.0001859524907366994, + "loss": 2.777073383331299, + "step": 7241, + "token_acc": 0.3279179201176034 + }, + { + "epoch": 4.245089416593374, + "grad_norm": 0.29269367134351376, + "learning_rate": 0.00018594753676752068, + "loss": 2.8041634559631348, + "step": 7242, + "token_acc": 0.3250288615142457 + }, + { + "epoch": 4.245675754910583, + "grad_norm": 0.2683766588735151, + "learning_rate": 0.00018594258199098067, + "loss": 2.77238130569458, + "step": 7243, + "token_acc": 0.3301181165373822 + }, + { + "epoch": 4.246262093227792, + "grad_norm": 0.30190671688672743, + "learning_rate": 0.00018593762640712588, + "loss": 2.7948150634765625, + "step": 7244, + "token_acc": 0.32656787605332016 + }, + { + "epoch": 4.2468484315450015, + "grad_norm": 0.2611529307863619, + "learning_rate": 0.00018593267001600292, + "loss": 2.820159912109375, + "step": 7245, + "token_acc": 0.32378601958515046 + }, + { + "epoch": 4.247434769862211, + "grad_norm": 0.2729489508028632, + "learning_rate": 0.00018592771281765832, + "loss": 2.7711167335510254, + "step": 7246, + "token_acc": 0.33006193671851763 + }, + { + "epoch": 4.24802110817942, + "grad_norm": 0.2500548574256754, + "learning_rate": 0.00018592275481213864, + "loss": 2.7811269760131836, + "step": 7247, + "token_acc": 0.32828630240496104 + }, + { + "epoch": 4.248607446496629, + "grad_norm": 0.293296876868701, + "learning_rate": 0.00018591779599949043, + "loss": 2.8041152954101562, + "step": 7248, + "token_acc": 0.3249691702700843 + }, + { + "epoch": 4.249193784813838, + "grad_norm": 0.2591749448981494, + "learning_rate": 0.00018591283637976036, + "loss": 2.820497512817383, + "step": 7249, + "token_acc": 0.32207114879571763 + }, + { + "epoch": 4.249780123131047, + "grad_norm": 0.2573841579489027, + "learning_rate": 0.00018590787595299492, + "loss": 2.799330711364746, + "step": 7250, + "token_acc": 0.3264108412008062 + }, + { + "epoch": 4.250366461448255, + "grad_norm": 0.2934884045768928, + "learning_rate": 0.00018590291471924078, + "loss": 2.8133256435394287, + "step": 7251, + "token_acc": 0.3234112176359148 + }, + { + "epoch": 4.250952799765464, + "grad_norm": 0.2778698682354681, + "learning_rate": 0.00018589795267854448, + "loss": 2.808682441711426, + "step": 7252, + "token_acc": 0.32576857317978 + }, + { + "epoch": 4.2515391380826735, + "grad_norm": 0.2661434113633592, + "learning_rate": 0.00018589298983095266, + "loss": 2.769360065460205, + "step": 7253, + "token_acc": 0.330891473169317 + }, + { + "epoch": 4.252125476399883, + "grad_norm": 0.29665914738035704, + "learning_rate": 0.000185888026176512, + "loss": 2.795877695083618, + "step": 7254, + "token_acc": 0.3248432694349834 + }, + { + "epoch": 4.252711814717092, + "grad_norm": 0.2770992638822399, + "learning_rate": 0.000185883061715269, + "loss": 2.8054394721984863, + "step": 7255, + "token_acc": 0.32559387114931804 + }, + { + "epoch": 4.253298153034301, + "grad_norm": 0.24699655417277075, + "learning_rate": 0.0001858780964472704, + "loss": 2.833385944366455, + "step": 7256, + "token_acc": 0.32014055902240884 + }, + { + "epoch": 4.25388449135151, + "grad_norm": 0.26090048906427205, + "learning_rate": 0.0001858731303725628, + "loss": 2.7807912826538086, + "step": 7257, + "token_acc": 0.32841326471547794 + }, + { + "epoch": 4.254470829668719, + "grad_norm": 0.23758348226021472, + "learning_rate": 0.00018586816349119286, + "loss": 2.7943272590637207, + "step": 7258, + "token_acc": 0.3254815892445069 + }, + { + "epoch": 4.255057167985928, + "grad_norm": 0.24529113061316865, + "learning_rate": 0.00018586319580320723, + "loss": 2.762887954711914, + "step": 7259, + "token_acc": 0.3325062349839744 + }, + { + "epoch": 4.255643506303137, + "grad_norm": 0.25691253873043135, + "learning_rate": 0.00018585822730865257, + "loss": 2.8289601802825928, + "step": 7260, + "token_acc": 0.3222533395859149 + }, + { + "epoch": 4.256229844620346, + "grad_norm": 0.23877210021621345, + "learning_rate": 0.00018585325800757557, + "loss": 2.7962939739227295, + "step": 7261, + "token_acc": 0.32584877231849896 + }, + { + "epoch": 4.256816182937555, + "grad_norm": 0.25923808291081674, + "learning_rate": 0.0001858482879000229, + "loss": 2.794616937637329, + "step": 7262, + "token_acc": 0.3256992623577806 + }, + { + "epoch": 4.257402521254764, + "grad_norm": 0.2516689961425692, + "learning_rate": 0.00018584331698604122, + "loss": 2.761894702911377, + "step": 7263, + "token_acc": 0.33134821124790503 + }, + { + "epoch": 4.257988859571973, + "grad_norm": 0.25685691403779953, + "learning_rate": 0.0001858383452656773, + "loss": 2.8225340843200684, + "step": 7264, + "token_acc": 0.3221636380998098 + }, + { + "epoch": 4.258575197889182, + "grad_norm": 0.2614171815863196, + "learning_rate": 0.00018583337273897775, + "loss": 2.8659238815307617, + "step": 7265, + "token_acc": 0.31620199715887537 + }, + { + "epoch": 4.259161536206391, + "grad_norm": 0.28101959485441763, + "learning_rate": 0.00018582839940598934, + "loss": 2.8032522201538086, + "step": 7266, + "token_acc": 0.32640431307781387 + }, + { + "epoch": 4.2597478745236, + "grad_norm": 0.3260857270166502, + "learning_rate": 0.00018582342526675876, + "loss": 2.799434185028076, + "step": 7267, + "token_acc": 0.32405240094241883 + }, + { + "epoch": 4.260334212840809, + "grad_norm": 0.2669054420268166, + "learning_rate": 0.0001858184503213328, + "loss": 2.7674007415771484, + "step": 7268, + "token_acc": 0.331480740216153 + }, + { + "epoch": 4.260920551158018, + "grad_norm": 0.2665435458390013, + "learning_rate": 0.00018581347456975812, + "loss": 2.83184552192688, + "step": 7269, + "token_acc": 0.32219774482356295 + }, + { + "epoch": 4.2615068894752275, + "grad_norm": 0.31123896032413284, + "learning_rate": 0.00018580849801208148, + "loss": 2.767821788787842, + "step": 7270, + "token_acc": 0.3315526904787823 + }, + { + "epoch": 4.262093227792437, + "grad_norm": 0.3869402880382641, + "learning_rate": 0.00018580352064834958, + "loss": 2.814661741256714, + "step": 7271, + "token_acc": 0.32414627376844 + }, + { + "epoch": 4.262679566109645, + "grad_norm": 0.3118106787483905, + "learning_rate": 0.00018579854247860927, + "loss": 2.7867748737335205, + "step": 7272, + "token_acc": 0.32802088396230855 + }, + { + "epoch": 4.263265904426854, + "grad_norm": 0.28627442903720196, + "learning_rate": 0.00018579356350290727, + "loss": 2.8296127319335938, + "step": 7273, + "token_acc": 0.32161217612756515 + }, + { + "epoch": 4.263852242744063, + "grad_norm": 0.31924324436555945, + "learning_rate": 0.00018578858372129034, + "loss": 2.823333501815796, + "step": 7274, + "token_acc": 0.3227546079545739 + }, + { + "epoch": 4.264438581061272, + "grad_norm": 0.26054060664355555, + "learning_rate": 0.00018578360313380526, + "loss": 2.8126187324523926, + "step": 7275, + "token_acc": 0.32375245615742554 + }, + { + "epoch": 4.265024919378481, + "grad_norm": 0.307485784417134, + "learning_rate": 0.00018577862174049882, + "loss": 2.801429271697998, + "step": 7276, + "token_acc": 0.32372586439155815 + }, + { + "epoch": 4.26561125769569, + "grad_norm": 0.2671535147129377, + "learning_rate": 0.00018577363954141784, + "loss": 2.817614793777466, + "step": 7277, + "token_acc": 0.32333456405928584 + }, + { + "epoch": 4.2661975960128995, + "grad_norm": 0.29732471457292126, + "learning_rate": 0.0001857686565366091, + "loss": 2.8188486099243164, + "step": 7278, + "token_acc": 0.32449842180632565 + }, + { + "epoch": 4.266783934330109, + "grad_norm": 0.2661148400330736, + "learning_rate": 0.0001857636727261194, + "loss": 2.807769298553467, + "step": 7279, + "token_acc": 0.3259313607281571 + }, + { + "epoch": 4.267370272647318, + "grad_norm": 0.2825555667667071, + "learning_rate": 0.00018575868810999553, + "loss": 2.7600135803222656, + "step": 7280, + "token_acc": 0.33153001027141266 + }, + { + "epoch": 4.267956610964527, + "grad_norm": 0.28661788779829256, + "learning_rate": 0.00018575370268828439, + "loss": 2.8024230003356934, + "step": 7281, + "token_acc": 0.3269853735152646 + }, + { + "epoch": 4.268542949281736, + "grad_norm": 0.27240053835138306, + "learning_rate": 0.0001857487164610327, + "loss": 2.8084990978240967, + "step": 7282, + "token_acc": 0.32539578767893473 + }, + { + "epoch": 4.269129287598945, + "grad_norm": 0.2994600424082782, + "learning_rate": 0.00018574372942828745, + "loss": 2.8383166790008545, + "step": 7283, + "token_acc": 0.31961565124024666 + }, + { + "epoch": 4.269715625916153, + "grad_norm": 0.2741971250446781, + "learning_rate": 0.00018573874159009537, + "loss": 2.8139712810516357, + "step": 7284, + "token_acc": 0.3227656013517301 + }, + { + "epoch": 4.270301964233362, + "grad_norm": 0.27702364708963007, + "learning_rate": 0.00018573375294650334, + "loss": 2.7799324989318848, + "step": 7285, + "token_acc": 0.32980253004260707 + }, + { + "epoch": 4.2708883025505715, + "grad_norm": 0.2510879014431597, + "learning_rate": 0.00018572876349755822, + "loss": 2.778569221496582, + "step": 7286, + "token_acc": 0.33082968716377537 + }, + { + "epoch": 4.271474640867781, + "grad_norm": 0.291916278558268, + "learning_rate": 0.00018572377324330692, + "loss": 2.85839581489563, + "step": 7287, + "token_acc": 0.31741911270270695 + }, + { + "epoch": 4.27206097918499, + "grad_norm": 0.2683862718991664, + "learning_rate": 0.00018571878218379628, + "loss": 2.7897744178771973, + "step": 7288, + "token_acc": 0.32731188455824856 + }, + { + "epoch": 4.272647317502199, + "grad_norm": 0.2671692459182062, + "learning_rate": 0.00018571379031907315, + "loss": 2.786526918411255, + "step": 7289, + "token_acc": 0.3282105817429057 + }, + { + "epoch": 4.273233655819408, + "grad_norm": 0.2703502658305935, + "learning_rate": 0.00018570879764918453, + "loss": 2.7870378494262695, + "step": 7290, + "token_acc": 0.3289858008529233 + }, + { + "epoch": 4.273819994136617, + "grad_norm": 0.2728508612477497, + "learning_rate": 0.00018570380417417718, + "loss": 2.838200807571411, + "step": 7291, + "token_acc": 0.32023647281787015 + }, + { + "epoch": 4.274406332453826, + "grad_norm": 0.24749302082108923, + "learning_rate": 0.0001856988098940981, + "loss": 2.8244848251342773, + "step": 7292, + "token_acc": 0.3225096298023175 + }, + { + "epoch": 4.274992670771035, + "grad_norm": 0.2653348020573965, + "learning_rate": 0.0001856938148089942, + "loss": 2.789384365081787, + "step": 7293, + "token_acc": 0.32779352795443517 + }, + { + "epoch": 4.2755790090882435, + "grad_norm": 0.24862873926523077, + "learning_rate": 0.00018568881891891238, + "loss": 2.8305468559265137, + "step": 7294, + "token_acc": 0.3214331328963569 + }, + { + "epoch": 4.276165347405453, + "grad_norm": 0.2419331682356139, + "learning_rate": 0.00018568382222389955, + "loss": 2.819166660308838, + "step": 7295, + "token_acc": 0.3227081124414906 + }, + { + "epoch": 4.276751685722662, + "grad_norm": 0.24805843511143963, + "learning_rate": 0.00018567882472400268, + "loss": 2.806670904159546, + "step": 7296, + "token_acc": 0.325897966762131 + }, + { + "epoch": 4.277338024039871, + "grad_norm": 0.2547008504523437, + "learning_rate": 0.00018567382641926868, + "loss": 2.810605049133301, + "step": 7297, + "token_acc": 0.32482637478677606 + }, + { + "epoch": 4.27792436235708, + "grad_norm": 0.26407989280879574, + "learning_rate": 0.00018566882730974458, + "loss": 2.822115182876587, + "step": 7298, + "token_acc": 0.322613020437868 + }, + { + "epoch": 4.278510700674289, + "grad_norm": 0.2751150090347371, + "learning_rate": 0.00018566382739547725, + "loss": 2.8266501426696777, + "step": 7299, + "token_acc": 0.3211986427582 + }, + { + "epoch": 4.279097038991498, + "grad_norm": 0.2652782642016134, + "learning_rate": 0.00018565882667651373, + "loss": 2.8442234992980957, + "step": 7300, + "token_acc": 0.31990304652540813 + }, + { + "epoch": 4.279683377308707, + "grad_norm": 0.24070980062212655, + "learning_rate": 0.00018565382515290093, + "loss": 2.816871166229248, + "step": 7301, + "token_acc": 0.3222111779931683 + }, + { + "epoch": 4.280269715625916, + "grad_norm": 0.28090543592421724, + "learning_rate": 0.0001856488228246859, + "loss": 2.83585786819458, + "step": 7302, + "token_acc": 0.3214441835567457 + }, + { + "epoch": 4.2808560539431255, + "grad_norm": 0.2524501657756281, + "learning_rate": 0.00018564381969191556, + "loss": 2.823448657989502, + "step": 7303, + "token_acc": 0.3221172103571295 + }, + { + "epoch": 4.281442392260335, + "grad_norm": 0.2489536568136416, + "learning_rate": 0.00018563881575463692, + "loss": 2.7881476879119873, + "step": 7304, + "token_acc": 0.32695628115729664 + }, + { + "epoch": 4.282028730577543, + "grad_norm": 0.23986620733242398, + "learning_rate": 0.00018563381101289705, + "loss": 2.8279004096984863, + "step": 7305, + "token_acc": 0.32289972592239596 + }, + { + "epoch": 4.282615068894752, + "grad_norm": 0.2755080600773978, + "learning_rate": 0.0001856288054667429, + "loss": 2.8109140396118164, + "step": 7306, + "token_acc": 0.3242606053809373 + }, + { + "epoch": 4.283201407211961, + "grad_norm": 0.3120137641574296, + "learning_rate": 0.0001856237991162215, + "loss": 2.7446224689483643, + "step": 7307, + "token_acc": 0.33326992629565944 + }, + { + "epoch": 4.28378774552917, + "grad_norm": 0.3396846419683235, + "learning_rate": 0.0001856187919613799, + "loss": 2.7877445220947266, + "step": 7308, + "token_acc": 0.32822685874362345 + }, + { + "epoch": 4.284374083846379, + "grad_norm": 0.2963000468260124, + "learning_rate": 0.0001856137840022651, + "loss": 2.8122153282165527, + "step": 7309, + "token_acc": 0.32449347732274053 + }, + { + "epoch": 4.284960422163588, + "grad_norm": 0.2490675519726043, + "learning_rate": 0.00018560877523892422, + "loss": 2.823389768600464, + "step": 7310, + "token_acc": 0.3218357413926514 + }, + { + "epoch": 4.2855467604807975, + "grad_norm": 0.3128231230966118, + "learning_rate": 0.00018560376567140421, + "loss": 2.824934720993042, + "step": 7311, + "token_acc": 0.32309449386871847 + }, + { + "epoch": 4.286133098798007, + "grad_norm": 0.3339320736498481, + "learning_rate": 0.0001855987552997522, + "loss": 2.7988572120666504, + "step": 7312, + "token_acc": 0.3256756302477548 + }, + { + "epoch": 4.286719437115216, + "grad_norm": 0.25450277012409295, + "learning_rate": 0.0001855937441240152, + "loss": 2.8208513259887695, + "step": 7313, + "token_acc": 0.32296150437076393 + }, + { + "epoch": 4.287305775432425, + "grad_norm": 0.2991275782456912, + "learning_rate": 0.00018558873214424032, + "loss": 2.7601354122161865, + "step": 7314, + "token_acc": 0.3321531574757067 + }, + { + "epoch": 4.287892113749633, + "grad_norm": 0.36602093579034706, + "learning_rate": 0.00018558371936047463, + "loss": 2.794074773788452, + "step": 7315, + "token_acc": 0.329046180405697 + }, + { + "epoch": 4.288478452066842, + "grad_norm": 0.26590871957334616, + "learning_rate": 0.00018557870577276523, + "loss": 2.813007354736328, + "step": 7316, + "token_acc": 0.32435362611749113 + }, + { + "epoch": 4.289064790384051, + "grad_norm": 0.3401038590177105, + "learning_rate": 0.0001855736913811592, + "loss": 2.833259344100952, + "step": 7317, + "token_acc": 0.3220998791231413 + }, + { + "epoch": 4.28965112870126, + "grad_norm": 0.2672034063573335, + "learning_rate": 0.00018556867618570367, + "loss": 2.8263285160064697, + "step": 7318, + "token_acc": 0.32193144911027416 + }, + { + "epoch": 4.2902374670184695, + "grad_norm": 0.2851360671836541, + "learning_rate": 0.00018556366018644574, + "loss": 2.79093599319458, + "step": 7319, + "token_acc": 0.328227769629708 + }, + { + "epoch": 4.290823805335679, + "grad_norm": 0.2725928167778592, + "learning_rate": 0.0001855586433834325, + "loss": 2.8039135932922363, + "step": 7320, + "token_acc": 0.3249584160890039 + }, + { + "epoch": 4.291410143652888, + "grad_norm": 0.2757176245922568, + "learning_rate": 0.0001855536257767111, + "loss": 2.805896282196045, + "step": 7321, + "token_acc": 0.3260785716755733 + }, + { + "epoch": 4.291996481970097, + "grad_norm": 0.2767721173300752, + "learning_rate": 0.00018554860736632865, + "loss": 2.8147716522216797, + "step": 7322, + "token_acc": 0.3227741882458368 + }, + { + "epoch": 4.292582820287306, + "grad_norm": 0.2393160937746302, + "learning_rate": 0.00018554358815233235, + "loss": 2.8246254920959473, + "step": 7323, + "token_acc": 0.32186129563261073 + }, + { + "epoch": 4.293169158604515, + "grad_norm": 0.2555788052803914, + "learning_rate": 0.0001855385681347693, + "loss": 2.807732343673706, + "step": 7324, + "token_acc": 0.3241206852355838 + }, + { + "epoch": 4.293755496921724, + "grad_norm": 0.2471076321876937, + "learning_rate": 0.00018553354731368665, + "loss": 2.823533296585083, + "step": 7325, + "token_acc": 0.32330630743447575 + }, + { + "epoch": 4.294341835238933, + "grad_norm": 0.2761763041860472, + "learning_rate": 0.00018552852568913162, + "loss": 2.778254270553589, + "step": 7326, + "token_acc": 0.3290900482619586 + }, + { + "epoch": 4.2949281735561415, + "grad_norm": 0.23672794424754837, + "learning_rate": 0.0001855235032611513, + "loss": 2.802098274230957, + "step": 7327, + "token_acc": 0.32537915115655325 + }, + { + "epoch": 4.295514511873351, + "grad_norm": 0.27248740962549783, + "learning_rate": 0.00018551848002979293, + "loss": 2.783581018447876, + "step": 7328, + "token_acc": 0.32810237321536095 + }, + { + "epoch": 4.29610085019056, + "grad_norm": 0.2407725208862905, + "learning_rate": 0.00018551345599510368, + "loss": 2.8062942028045654, + "step": 7329, + "token_acc": 0.3237197628407883 + }, + { + "epoch": 4.296687188507769, + "grad_norm": 0.2637366434839126, + "learning_rate": 0.00018550843115713072, + "loss": 2.811466932296753, + "step": 7330, + "token_acc": 0.3243766642459453 + }, + { + "epoch": 4.297273526824978, + "grad_norm": 0.24973256606261507, + "learning_rate": 0.0001855034055159213, + "loss": 2.8131279945373535, + "step": 7331, + "token_acc": 0.32352479557190167 + }, + { + "epoch": 4.297859865142187, + "grad_norm": 0.28059564849282453, + "learning_rate": 0.0001854983790715226, + "loss": 2.8321104049682617, + "step": 7332, + "token_acc": 0.32301145330145026 + }, + { + "epoch": 4.298446203459396, + "grad_norm": 0.29650161031004485, + "learning_rate": 0.00018549335182398182, + "loss": 2.832314968109131, + "step": 7333, + "token_acc": 0.32127787667571456 + }, + { + "epoch": 4.299032541776605, + "grad_norm": 0.27229506990058105, + "learning_rate": 0.00018548832377334622, + "loss": 2.8226585388183594, + "step": 7334, + "token_acc": 0.32414593592460966 + }, + { + "epoch": 4.299618880093814, + "grad_norm": 0.26223986554659007, + "learning_rate": 0.000185483294919663, + "loss": 2.7886390686035156, + "step": 7335, + "token_acc": 0.32820848410193243 + }, + { + "epoch": 4.3002052184110235, + "grad_norm": 0.25132379075741357, + "learning_rate": 0.00018547826526297946, + "loss": 2.8314831256866455, + "step": 7336, + "token_acc": 0.32085945863281845 + }, + { + "epoch": 4.300791556728232, + "grad_norm": 0.24211622271724428, + "learning_rate": 0.00018547323480334272, + "loss": 2.808999538421631, + "step": 7337, + "token_acc": 0.3240410300245753 + }, + { + "epoch": 4.301377895045441, + "grad_norm": 0.2695695518499097, + "learning_rate": 0.0001854682035408002, + "loss": 2.78363299369812, + "step": 7338, + "token_acc": 0.3288140199066476 + }, + { + "epoch": 4.30196423336265, + "grad_norm": 0.288051327469357, + "learning_rate": 0.00018546317147539903, + "loss": 2.8311729431152344, + "step": 7339, + "token_acc": 0.32169637562903064 + }, + { + "epoch": 4.302550571679859, + "grad_norm": 0.27628220053742264, + "learning_rate": 0.00018545813860718652, + "loss": 2.8121166229248047, + "step": 7340, + "token_acc": 0.3246737337062339 + }, + { + "epoch": 4.303136909997068, + "grad_norm": 0.2704143247505957, + "learning_rate": 0.00018545310493620998, + "loss": 2.8320748805999756, + "step": 7341, + "token_acc": 0.3223838696559052 + }, + { + "epoch": 4.303723248314277, + "grad_norm": 0.3011045280377905, + "learning_rate": 0.00018544807046251663, + "loss": 2.7819314002990723, + "step": 7342, + "token_acc": 0.32712941586674343 + }, + { + "epoch": 4.304309586631486, + "grad_norm": 0.2991484339880676, + "learning_rate": 0.00018544303518615386, + "loss": 2.8504786491394043, + "step": 7343, + "token_acc": 0.31816578108959137 + }, + { + "epoch": 4.3048959249486956, + "grad_norm": 0.25622455690861545, + "learning_rate": 0.00018543799910716887, + "loss": 2.8148651123046875, + "step": 7344, + "token_acc": 0.32281214293478117 + }, + { + "epoch": 4.305482263265905, + "grad_norm": 0.29244727707376394, + "learning_rate": 0.00018543296222560903, + "loss": 2.822002410888672, + "step": 7345, + "token_acc": 0.32275306744387655 + }, + { + "epoch": 4.306068601583114, + "grad_norm": 0.24415635864595084, + "learning_rate": 0.0001854279245415216, + "loss": 2.7905263900756836, + "step": 7346, + "token_acc": 0.3285865667200852 + }, + { + "epoch": 4.306654939900323, + "grad_norm": 0.27899228508673524, + "learning_rate": 0.00018542288605495396, + "loss": 2.8284225463867188, + "step": 7347, + "token_acc": 0.32066951956214335 + }, + { + "epoch": 4.307241278217531, + "grad_norm": 0.2555604196620964, + "learning_rate": 0.0001854178467659534, + "loss": 2.7665023803710938, + "step": 7348, + "token_acc": 0.3302447858252647 + }, + { + "epoch": 4.30782761653474, + "grad_norm": 0.2632727410892724, + "learning_rate": 0.0001854128066745673, + "loss": 2.808840036392212, + "step": 7349, + "token_acc": 0.3248949502796346 + }, + { + "epoch": 4.308413954851949, + "grad_norm": 0.2844855579772716, + "learning_rate": 0.00018540776578084293, + "loss": 2.8241991996765137, + "step": 7350, + "token_acc": 0.32150459295979505 + }, + { + "epoch": 4.3090002931691584, + "grad_norm": 0.2717218439839742, + "learning_rate": 0.00018540272408482773, + "loss": 2.851439952850342, + "step": 7351, + "token_acc": 0.317790741144165 + }, + { + "epoch": 4.309586631486368, + "grad_norm": 0.2569263156480144, + "learning_rate": 0.00018539768158656902, + "loss": 2.8031463623046875, + "step": 7352, + "token_acc": 0.32549196387647894 + }, + { + "epoch": 4.310172969803577, + "grad_norm": 0.2913178961816923, + "learning_rate": 0.00018539263828611414, + "loss": 2.7898645401000977, + "step": 7353, + "token_acc": 0.3290096287189059 + }, + { + "epoch": 4.310759308120786, + "grad_norm": 0.25963484039673174, + "learning_rate": 0.0001853875941835105, + "loss": 2.787909984588623, + "step": 7354, + "token_acc": 0.32731684395798105 + }, + { + "epoch": 4.311345646437995, + "grad_norm": 0.2589206946766923, + "learning_rate": 0.00018538254927880549, + "loss": 2.8027899265289307, + "step": 7355, + "token_acc": 0.3247565641102924 + }, + { + "epoch": 4.311931984755204, + "grad_norm": 0.27180997580918637, + "learning_rate": 0.00018537750357204647, + "loss": 2.7803854942321777, + "step": 7356, + "token_acc": 0.3289767237148971 + }, + { + "epoch": 4.312518323072413, + "grad_norm": 0.2855486724334418, + "learning_rate": 0.00018537245706328083, + "loss": 2.786447048187256, + "step": 7357, + "token_acc": 0.32747780023024964 + }, + { + "epoch": 4.313104661389621, + "grad_norm": 0.2668825983689697, + "learning_rate": 0.00018536740975255603, + "loss": 2.8221518993377686, + "step": 7358, + "token_acc": 0.3229034836926329 + }, + { + "epoch": 4.3136909997068305, + "grad_norm": 0.2541301667346999, + "learning_rate": 0.00018536236163991943, + "loss": 2.838918685913086, + "step": 7359, + "token_acc": 0.3189109361583265 + }, + { + "epoch": 4.31427733802404, + "grad_norm": 0.24058252653276763, + "learning_rate": 0.00018535731272541849, + "loss": 2.820225715637207, + "step": 7360, + "token_acc": 0.3220257705637019 + }, + { + "epoch": 4.314863676341249, + "grad_norm": 0.2537094571498051, + "learning_rate": 0.0001853522630091006, + "loss": 2.837794542312622, + "step": 7361, + "token_acc": 0.32047502715825227 + }, + { + "epoch": 4.315450014658458, + "grad_norm": 0.2321610535595397, + "learning_rate": 0.00018534721249101321, + "loss": 2.796674966812134, + "step": 7362, + "token_acc": 0.32710979974319043 + }, + { + "epoch": 4.316036352975667, + "grad_norm": 0.2729601426540344, + "learning_rate": 0.00018534216117120376, + "loss": 2.8310489654541016, + "step": 7363, + "token_acc": 0.32185640290941153 + }, + { + "epoch": 4.316622691292876, + "grad_norm": 0.2928943374150621, + "learning_rate": 0.00018533710904971974, + "loss": 2.782020092010498, + "step": 7364, + "token_acc": 0.32851574919691706 + }, + { + "epoch": 4.317209029610085, + "grad_norm": 0.33186848199360436, + "learning_rate": 0.00018533205612660854, + "loss": 2.792886257171631, + "step": 7365, + "token_acc": 0.3267970771687235 + }, + { + "epoch": 4.317795367927294, + "grad_norm": 0.29396320143969734, + "learning_rate": 0.00018532700240191766, + "loss": 2.8050012588500977, + "step": 7366, + "token_acc": 0.32734517899137705 + }, + { + "epoch": 4.318381706244503, + "grad_norm": 0.242382467888204, + "learning_rate": 0.00018532194787569458, + "loss": 2.8372154235839844, + "step": 7367, + "token_acc": 0.32093334954809966 + }, + { + "epoch": 4.3189680445617125, + "grad_norm": 0.3018708021737018, + "learning_rate": 0.00018531689254798679, + "loss": 2.857952117919922, + "step": 7368, + "token_acc": 0.3182269720461962 + }, + { + "epoch": 4.319554382878922, + "grad_norm": 0.28574251410242824, + "learning_rate": 0.00018531183641884175, + "loss": 2.8318610191345215, + "step": 7369, + "token_acc": 0.3224281947024513 + }, + { + "epoch": 4.32014072119613, + "grad_norm": 0.24408509010222723, + "learning_rate": 0.00018530677948830695, + "loss": 2.8215036392211914, + "step": 7370, + "token_acc": 0.32163402299545685 + }, + { + "epoch": 4.320727059513339, + "grad_norm": 0.34518635217296834, + "learning_rate": 0.00018530172175642992, + "loss": 2.8243284225463867, + "step": 7371, + "token_acc": 0.32230077452224043 + }, + { + "epoch": 4.321313397830548, + "grad_norm": 0.3171144494115203, + "learning_rate": 0.00018529666322325816, + "loss": 2.8102946281433105, + "step": 7372, + "token_acc": 0.32534964241571607 + }, + { + "epoch": 4.321899736147757, + "grad_norm": 0.27451174702407827, + "learning_rate": 0.0001852916038888392, + "loss": 2.815958023071289, + "step": 7373, + "token_acc": 0.3258603070047708 + }, + { + "epoch": 4.322486074464966, + "grad_norm": 0.28523242562530465, + "learning_rate": 0.00018528654375322054, + "loss": 2.7794761657714844, + "step": 7374, + "token_acc": 0.32956259816483424 + }, + { + "epoch": 4.323072412782175, + "grad_norm": 0.2592134744682898, + "learning_rate": 0.00018528148281644972, + "loss": 2.801241874694824, + "step": 7375, + "token_acc": 0.32708814961487026 + }, + { + "epoch": 4.3236587510993845, + "grad_norm": 0.25679270863521686, + "learning_rate": 0.0001852764210785743, + "loss": 2.815281391143799, + "step": 7376, + "token_acc": 0.3238663564776118 + }, + { + "epoch": 4.324245089416594, + "grad_norm": 0.24667412760522206, + "learning_rate": 0.0001852713585396418, + "loss": 2.7800350189208984, + "step": 7377, + "token_acc": 0.3298651231305891 + }, + { + "epoch": 4.324831427733803, + "grad_norm": 0.2627684435486072, + "learning_rate": 0.00018526629519969982, + "loss": 2.783808946609497, + "step": 7378, + "token_acc": 0.32935931246150196 + }, + { + "epoch": 4.325417766051012, + "grad_norm": 0.2667008597177768, + "learning_rate": 0.00018526123105879586, + "loss": 2.8039867877960205, + "step": 7379, + "token_acc": 0.32489695716777567 + }, + { + "epoch": 4.32600410436822, + "grad_norm": 0.2590757655000024, + "learning_rate": 0.00018525616611697756, + "loss": 2.8031604290008545, + "step": 7380, + "token_acc": 0.32589459354336836 + }, + { + "epoch": 4.326590442685429, + "grad_norm": 0.26233243923293786, + "learning_rate": 0.00018525110037429245, + "loss": 2.827221632003784, + "step": 7381, + "token_acc": 0.3228146159296373 + }, + { + "epoch": 4.327176781002638, + "grad_norm": 0.26757584022367426, + "learning_rate": 0.0001852460338307881, + "loss": 2.7913246154785156, + "step": 7382, + "token_acc": 0.3276428237928304 + }, + { + "epoch": 4.327763119319847, + "grad_norm": 0.2717490114631852, + "learning_rate": 0.00018524096648651214, + "loss": 2.8125948905944824, + "step": 7383, + "token_acc": 0.32433331341220367 + }, + { + "epoch": 4.3283494576370565, + "grad_norm": 0.2616628626748493, + "learning_rate": 0.0001852358983415122, + "loss": 2.8318963050842285, + "step": 7384, + "token_acc": 0.3208337026578704 + }, + { + "epoch": 4.328935795954266, + "grad_norm": 0.2582633175302739, + "learning_rate": 0.00018523082939583585, + "loss": 2.7909657955169678, + "step": 7385, + "token_acc": 0.3286259779608608 + }, + { + "epoch": 4.329522134271475, + "grad_norm": 0.28136044849043373, + "learning_rate": 0.0001852257596495307, + "loss": 2.83647084236145, + "step": 7386, + "token_acc": 0.32002741569040577 + }, + { + "epoch": 4.330108472588684, + "grad_norm": 0.2650291408241554, + "learning_rate": 0.00018522068910264435, + "loss": 2.8075337409973145, + "step": 7387, + "token_acc": 0.3239179789704238 + }, + { + "epoch": 4.330694810905893, + "grad_norm": 0.25594984829641465, + "learning_rate": 0.0001852156177552245, + "loss": 2.763432502746582, + "step": 7388, + "token_acc": 0.3304623341650682 + }, + { + "epoch": 4.331281149223102, + "grad_norm": 0.25309265520875107, + "learning_rate": 0.0001852105456073187, + "loss": 2.833162546157837, + "step": 7389, + "token_acc": 0.3221918124239755 + }, + { + "epoch": 4.331867487540311, + "grad_norm": 0.28936436487018385, + "learning_rate": 0.0001852054726589747, + "loss": 2.852695941925049, + "step": 7390, + "token_acc": 0.3183785951263865 + }, + { + "epoch": 4.33245382585752, + "grad_norm": 0.3653333311441747, + "learning_rate": 0.00018520039891024007, + "loss": 2.846194267272949, + "step": 7391, + "token_acc": 0.3181990552136604 + }, + { + "epoch": 4.3330401641747285, + "grad_norm": 0.39009068238795525, + "learning_rate": 0.00018519532436116253, + "loss": 2.834728717803955, + "step": 7392, + "token_acc": 0.3199874391584236 + }, + { + "epoch": 4.333626502491938, + "grad_norm": 0.29169434278831785, + "learning_rate": 0.00018519024901178968, + "loss": 2.783836603164673, + "step": 7393, + "token_acc": 0.32890546496605333 + }, + { + "epoch": 4.334212840809147, + "grad_norm": 0.3328962412854287, + "learning_rate": 0.00018518517286216928, + "loss": 2.8078761100769043, + "step": 7394, + "token_acc": 0.32448227900267873 + }, + { + "epoch": 4.334799179126356, + "grad_norm": 0.3312694110445546, + "learning_rate": 0.00018518009591234893, + "loss": 2.8235490322113037, + "step": 7395, + "token_acc": 0.32306550191695566 + }, + { + "epoch": 4.335385517443565, + "grad_norm": 0.2649008689432459, + "learning_rate": 0.00018517501816237638, + "loss": 2.7996339797973633, + "step": 7396, + "token_acc": 0.3267675244649991 + }, + { + "epoch": 4.335971855760774, + "grad_norm": 0.33652924456117955, + "learning_rate": 0.00018516993961229932, + "loss": 2.8173112869262695, + "step": 7397, + "token_acc": 0.324001422306507 + }, + { + "epoch": 4.336558194077983, + "grad_norm": 0.2509395245866988, + "learning_rate": 0.0001851648602621654, + "loss": 2.867915391921997, + "step": 7398, + "token_acc": 0.31734970381884636 + }, + { + "epoch": 4.337144532395192, + "grad_norm": 0.31800161571113367, + "learning_rate": 0.00018515978011202243, + "loss": 2.8555874824523926, + "step": 7399, + "token_acc": 0.31784655305305715 + }, + { + "epoch": 4.337730870712401, + "grad_norm": 0.28991738048334936, + "learning_rate": 0.00018515469916191807, + "loss": 2.8549158573150635, + "step": 7400, + "token_acc": 0.3175896735892186 + }, + { + "epoch": 4.3383172090296105, + "grad_norm": 0.2549485136651108, + "learning_rate": 0.00018514961741190005, + "loss": 2.8160927295684814, + "step": 7401, + "token_acc": 0.3237614304726742 + }, + { + "epoch": 4.338903547346819, + "grad_norm": 0.3377181249613439, + "learning_rate": 0.0001851445348620161, + "loss": 2.8079957962036133, + "step": 7402, + "token_acc": 0.3247294763389721 + }, + { + "epoch": 4.339489885664028, + "grad_norm": 0.22616211111235662, + "learning_rate": 0.000185139451512314, + "loss": 2.8290390968322754, + "step": 7403, + "token_acc": 0.32114109136730706 + }, + { + "epoch": 4.340076223981237, + "grad_norm": 0.3518895634181134, + "learning_rate": 0.00018513436736284147, + "loss": 2.8103904724121094, + "step": 7404, + "token_acc": 0.32318866756115183 + }, + { + "epoch": 4.340662562298446, + "grad_norm": 0.24669901838192926, + "learning_rate": 0.00018512928241364626, + "loss": 2.7957215309143066, + "step": 7405, + "token_acc": 0.3255691394163288 + }, + { + "epoch": 4.341248900615655, + "grad_norm": 0.28820187430618854, + "learning_rate": 0.0001851241966647762, + "loss": 2.869537591934204, + "step": 7406, + "token_acc": 0.3178083371921474 + }, + { + "epoch": 4.341835238932864, + "grad_norm": 0.24891514803819764, + "learning_rate": 0.00018511911011627897, + "loss": 2.8168578147888184, + "step": 7407, + "token_acc": 0.32259477310082535 + }, + { + "epoch": 4.342421577250073, + "grad_norm": 0.2571528257986596, + "learning_rate": 0.00018511402276820245, + "loss": 2.7865803241729736, + "step": 7408, + "token_acc": 0.3291118935523456 + }, + { + "epoch": 4.3430079155672825, + "grad_norm": 0.25810184374558254, + "learning_rate": 0.00018510893462059437, + "loss": 2.827939987182617, + "step": 7409, + "token_acc": 0.32046704029890577 + }, + { + "epoch": 4.343594253884492, + "grad_norm": 0.2568506348988116, + "learning_rate": 0.0001851038456735025, + "loss": 2.822042942047119, + "step": 7410, + "token_acc": 0.3221971065742571 + }, + { + "epoch": 4.344180592201701, + "grad_norm": 0.29090885674597405, + "learning_rate": 0.0001850987559269747, + "loss": 2.7751927375793457, + "step": 7411, + "token_acc": 0.3301368927550849 + }, + { + "epoch": 4.34476693051891, + "grad_norm": 0.24339245370074164, + "learning_rate": 0.00018509366538105873, + "loss": 2.8235793113708496, + "step": 7412, + "token_acc": 0.3217509013590916 + }, + { + "epoch": 4.345353268836118, + "grad_norm": 0.31155069126324275, + "learning_rate": 0.0001850885740358025, + "loss": 2.815742015838623, + "step": 7413, + "token_acc": 0.32216966946441755 + }, + { + "epoch": 4.345939607153327, + "grad_norm": 0.2795590712630844, + "learning_rate": 0.00018508348189125374, + "loss": 2.818142890930176, + "step": 7414, + "token_acc": 0.322453706102401 + }, + { + "epoch": 4.346525945470536, + "grad_norm": 0.23436285630869977, + "learning_rate": 0.00018507838894746032, + "loss": 2.786525011062622, + "step": 7415, + "token_acc": 0.3279812394751842 + }, + { + "epoch": 4.347112283787745, + "grad_norm": 0.2849407475086039, + "learning_rate": 0.0001850732952044701, + "loss": 2.814694404602051, + "step": 7416, + "token_acc": 0.32312804674207535 + }, + { + "epoch": 4.3476986221049545, + "grad_norm": 0.24376330498475554, + "learning_rate": 0.00018506820066233087, + "loss": 2.7773947715759277, + "step": 7417, + "token_acc": 0.32969840478564305 + }, + { + "epoch": 4.348284960422164, + "grad_norm": 0.2764201381438405, + "learning_rate": 0.00018506310532109054, + "loss": 2.7853097915649414, + "step": 7418, + "token_acc": 0.3287138884556012 + }, + { + "epoch": 4.348871298739373, + "grad_norm": 0.2687160350868754, + "learning_rate": 0.00018505800918079695, + "loss": 2.812912940979004, + "step": 7419, + "token_acc": 0.32458040431917196 + }, + { + "epoch": 4.349457637056582, + "grad_norm": 0.2518304951226571, + "learning_rate": 0.000185052912241498, + "loss": 2.8121728897094727, + "step": 7420, + "token_acc": 0.32392841775142617 + }, + { + "epoch": 4.350043975373791, + "grad_norm": 0.30126866689190374, + "learning_rate": 0.00018504781450324155, + "loss": 2.8240442276000977, + "step": 7421, + "token_acc": 0.32356303222887617 + }, + { + "epoch": 4.350630313691, + "grad_norm": 0.25309129540613, + "learning_rate": 0.0001850427159660755, + "loss": 2.8050942420959473, + "step": 7422, + "token_acc": 0.32553090148110236 + }, + { + "epoch": 4.351216652008208, + "grad_norm": 0.2843011454637013, + "learning_rate": 0.0001850376166300477, + "loss": 2.805994987487793, + "step": 7423, + "token_acc": 0.32634665365843424 + }, + { + "epoch": 4.351802990325417, + "grad_norm": 0.24165485876858914, + "learning_rate": 0.0001850325164952061, + "loss": 2.84590482711792, + "step": 7424, + "token_acc": 0.31832573631207434 + }, + { + "epoch": 4.3523893286426265, + "grad_norm": 0.2914030097509735, + "learning_rate": 0.00018502741556159858, + "loss": 2.795292854309082, + "step": 7425, + "token_acc": 0.326193428234822 + }, + { + "epoch": 4.352975666959836, + "grad_norm": 0.24601242198914453, + "learning_rate": 0.00018502231382927308, + "loss": 2.8465681076049805, + "step": 7426, + "token_acc": 0.3204679409320463 + }, + { + "epoch": 4.353562005277045, + "grad_norm": 0.2792770905828105, + "learning_rate": 0.00018501721129827748, + "loss": 2.814530372619629, + "step": 7427, + "token_acc": 0.3249713402910222 + }, + { + "epoch": 4.354148343594254, + "grad_norm": 0.30287611203612164, + "learning_rate": 0.0001850121079686598, + "loss": 2.8049874305725098, + "step": 7428, + "token_acc": 0.3270176839980462 + }, + { + "epoch": 4.354734681911463, + "grad_norm": 0.24294831548838733, + "learning_rate": 0.00018500700384046787, + "loss": 2.8342690467834473, + "step": 7429, + "token_acc": 0.32040272588315216 + }, + { + "epoch": 4.355321020228672, + "grad_norm": 0.28869084411314827, + "learning_rate": 0.00018500189891374968, + "loss": 2.7839574813842773, + "step": 7430, + "token_acc": 0.3290280276155926 + }, + { + "epoch": 4.355907358545881, + "grad_norm": 0.26009503321293176, + "learning_rate": 0.00018499679318855324, + "loss": 2.8120007514953613, + "step": 7431, + "token_acc": 0.32519179583529045 + }, + { + "epoch": 4.35649369686309, + "grad_norm": 0.28434393827576804, + "learning_rate": 0.00018499168666492643, + "loss": 2.7974328994750977, + "step": 7432, + "token_acc": 0.32622335669359337 + }, + { + "epoch": 4.357080035180299, + "grad_norm": 0.28400574142718304, + "learning_rate": 0.00018498657934291725, + "loss": 2.8036394119262695, + "step": 7433, + "token_acc": 0.3253259812735579 + }, + { + "epoch": 4.3576663734975085, + "grad_norm": 0.24928521714247145, + "learning_rate": 0.00018498147122257368, + "loss": 2.775306224822998, + "step": 7434, + "token_acc": 0.3298007419489118 + }, + { + "epoch": 4.358252711814717, + "grad_norm": 0.2743040943339715, + "learning_rate": 0.00018497636230394374, + "loss": 2.824603796005249, + "step": 7435, + "token_acc": 0.3224029569175877 + }, + { + "epoch": 4.358839050131926, + "grad_norm": 0.24145861292711085, + "learning_rate": 0.0001849712525870753, + "loss": 2.842122793197632, + "step": 7436, + "token_acc": 0.3195661413718695 + }, + { + "epoch": 4.359425388449135, + "grad_norm": 0.2789911434682713, + "learning_rate": 0.00018496614207201654, + "loss": 2.8071703910827637, + "step": 7437, + "token_acc": 0.3244850783096543 + }, + { + "epoch": 4.360011726766344, + "grad_norm": 0.24424498794953525, + "learning_rate": 0.0001849610307588153, + "loss": 2.7769598960876465, + "step": 7438, + "token_acc": 0.32882933261893554 + }, + { + "epoch": 4.360598065083553, + "grad_norm": 0.2695181350328296, + "learning_rate": 0.0001849559186475197, + "loss": 2.861466884613037, + "step": 7439, + "token_acc": 0.3165495419257515 + }, + { + "epoch": 4.361184403400762, + "grad_norm": 0.24782700103932084, + "learning_rate": 0.0001849508057381777, + "loss": 2.7859151363372803, + "step": 7440, + "token_acc": 0.3271815286624204 + }, + { + "epoch": 4.361770741717971, + "grad_norm": 0.2659572036248464, + "learning_rate": 0.00018494569203083734, + "loss": 2.8244385719299316, + "step": 7441, + "token_acc": 0.32223399893235954 + }, + { + "epoch": 4.3623570800351805, + "grad_norm": 0.25477958971445763, + "learning_rate": 0.0001849405775255467, + "loss": 2.809879779815674, + "step": 7442, + "token_acc": 0.3260767079086042 + }, + { + "epoch": 4.36294341835239, + "grad_norm": 0.25182472903105746, + "learning_rate": 0.00018493546222235377, + "loss": 2.8057496547698975, + "step": 7443, + "token_acc": 0.3252531956478569 + }, + { + "epoch": 4.363529756669599, + "grad_norm": 0.25855156279286806, + "learning_rate": 0.00018493034612130664, + "loss": 2.859363079071045, + "step": 7444, + "token_acc": 0.3185364327271377 + }, + { + "epoch": 4.364116094986807, + "grad_norm": 0.24395031348275611, + "learning_rate": 0.00018492522922245334, + "loss": 2.8046674728393555, + "step": 7445, + "token_acc": 0.3258344240837696 + }, + { + "epoch": 4.364702433304016, + "grad_norm": 0.24759459699652248, + "learning_rate": 0.00018492011152584196, + "loss": 2.8137881755828857, + "step": 7446, + "token_acc": 0.32270133057026856 + }, + { + "epoch": 4.365288771621225, + "grad_norm": 0.23611623997166672, + "learning_rate": 0.00018491499303152056, + "loss": 2.813845634460449, + "step": 7447, + "token_acc": 0.32359776862022815 + }, + { + "epoch": 4.365875109938434, + "grad_norm": 0.3126461990164729, + "learning_rate": 0.00018490987373953724, + "loss": 2.8215651512145996, + "step": 7448, + "token_acc": 0.32253601494532297 + }, + { + "epoch": 4.366461448255643, + "grad_norm": 0.3186216936330115, + "learning_rate": 0.00018490475364994007, + "loss": 2.8012022972106934, + "step": 7449, + "token_acc": 0.3242361927144536 + }, + { + "epoch": 4.3670477865728525, + "grad_norm": 0.29457020427657277, + "learning_rate": 0.00018489963276277713, + "loss": 2.849219799041748, + "step": 7450, + "token_acc": 0.31825762803358043 + }, + { + "epoch": 4.367634124890062, + "grad_norm": 0.2501204606906152, + "learning_rate": 0.00018489451107809655, + "loss": 2.794825315475464, + "step": 7451, + "token_acc": 0.3262410617565401 + }, + { + "epoch": 4.368220463207271, + "grad_norm": 0.3406687686599865, + "learning_rate": 0.00018488938859594645, + "loss": 2.8325717449188232, + "step": 7452, + "token_acc": 0.32162174144351835 + }, + { + "epoch": 4.36880680152448, + "grad_norm": 0.27171352131296966, + "learning_rate": 0.00018488426531637492, + "loss": 2.804859161376953, + "step": 7453, + "token_acc": 0.3269292699210619 + }, + { + "epoch": 4.369393139841689, + "grad_norm": 0.28518367512576753, + "learning_rate": 0.00018487914123943008, + "loss": 2.740365982055664, + "step": 7454, + "token_acc": 0.3346309006129827 + }, + { + "epoch": 4.369979478158898, + "grad_norm": 0.29682105898632566, + "learning_rate": 0.00018487401636516011, + "loss": 2.821575164794922, + "step": 7455, + "token_acc": 0.3230612125363845 + }, + { + "epoch": 4.370565816476106, + "grad_norm": 0.2928318995174857, + "learning_rate": 0.00018486889069361314, + "loss": 2.8416078090667725, + "step": 7456, + "token_acc": 0.31991798322862175 + }, + { + "epoch": 4.371152154793315, + "grad_norm": 0.32878573778571785, + "learning_rate": 0.00018486376422483728, + "loss": 2.801227569580078, + "step": 7457, + "token_acc": 0.3238602219728676 + }, + { + "epoch": 4.3717384931105245, + "grad_norm": 0.26157052047857365, + "learning_rate": 0.00018485863695888072, + "loss": 2.8278298377990723, + "step": 7458, + "token_acc": 0.32205361306610775 + }, + { + "epoch": 4.372324831427734, + "grad_norm": 0.2961892536181837, + "learning_rate": 0.00018485350889579162, + "loss": 2.841789960861206, + "step": 7459, + "token_acc": 0.3195097709290404 + }, + { + "epoch": 4.372911169744943, + "grad_norm": 0.2443179757475132, + "learning_rate": 0.00018484838003561812, + "loss": 2.8435611724853516, + "step": 7460, + "token_acc": 0.3200068988405244 + }, + { + "epoch": 4.373497508062152, + "grad_norm": 0.30543071181233283, + "learning_rate": 0.00018484325037840845, + "loss": 2.8258790969848633, + "step": 7461, + "token_acc": 0.3233214646438883 + }, + { + "epoch": 4.374083846379361, + "grad_norm": 0.23444518822590776, + "learning_rate": 0.00018483811992421077, + "loss": 2.794872760772705, + "step": 7462, + "token_acc": 0.3262092148226592 + }, + { + "epoch": 4.37467018469657, + "grad_norm": 0.36072474751451244, + "learning_rate": 0.00018483298867307327, + "loss": 2.831958770751953, + "step": 7463, + "token_acc": 0.3214374162897071 + }, + { + "epoch": 4.375256523013779, + "grad_norm": 0.2805303492712956, + "learning_rate": 0.00018482785662504413, + "loss": 2.8262906074523926, + "step": 7464, + "token_acc": 0.32344541957330075 + }, + { + "epoch": 4.375842861330988, + "grad_norm": 0.2798396869313885, + "learning_rate": 0.00018482272378017163, + "loss": 2.860952377319336, + "step": 7465, + "token_acc": 0.3156788322708102 + }, + { + "epoch": 4.3764291996481965, + "grad_norm": 0.2589115684810431, + "learning_rate": 0.00018481759013850392, + "loss": 2.7820849418640137, + "step": 7466, + "token_acc": 0.32854825396246756 + }, + { + "epoch": 4.377015537965406, + "grad_norm": 0.2847340859494114, + "learning_rate": 0.00018481245570008925, + "loss": 2.839822292327881, + "step": 7467, + "token_acc": 0.3198406901019821 + }, + { + "epoch": 4.377601876282615, + "grad_norm": 0.2590622290856762, + "learning_rate": 0.0001848073204649758, + "loss": 2.7962377071380615, + "step": 7468, + "token_acc": 0.32662777969228274 + }, + { + "epoch": 4.378188214599824, + "grad_norm": 0.26939016228154095, + "learning_rate": 0.00018480218443321192, + "loss": 2.802985906600952, + "step": 7469, + "token_acc": 0.32542228837808074 + }, + { + "epoch": 4.378774552917033, + "grad_norm": 0.25696054590749384, + "learning_rate": 0.00018479704760484574, + "loss": 2.8410744667053223, + "step": 7470, + "token_acc": 0.3198787125351274 + }, + { + "epoch": 4.379360891234242, + "grad_norm": 0.28611742125613693, + "learning_rate": 0.00018479190997992557, + "loss": 2.8431754112243652, + "step": 7471, + "token_acc": 0.32072118756739215 + }, + { + "epoch": 4.379947229551451, + "grad_norm": 0.24632064668504558, + "learning_rate": 0.0001847867715584997, + "loss": 2.8539669513702393, + "step": 7472, + "token_acc": 0.31879176525901204 + }, + { + "epoch": 4.38053356786866, + "grad_norm": 0.270109463436155, + "learning_rate": 0.00018478163234061633, + "loss": 2.826313018798828, + "step": 7473, + "token_acc": 0.3217656450318911 + }, + { + "epoch": 4.381119906185869, + "grad_norm": 0.2294447079125213, + "learning_rate": 0.00018477649232632377, + "loss": 2.806281566619873, + "step": 7474, + "token_acc": 0.325930981200103 + }, + { + "epoch": 4.3817062445030786, + "grad_norm": 0.2721520430124895, + "learning_rate": 0.00018477135151567033, + "loss": 2.8207802772521973, + "step": 7475, + "token_acc": 0.3217781359967971 + }, + { + "epoch": 4.382292582820288, + "grad_norm": 0.24117047248464785, + "learning_rate": 0.00018476620990870424, + "loss": 2.812748908996582, + "step": 7476, + "token_acc": 0.32572373991191617 + }, + { + "epoch": 4.382878921137497, + "grad_norm": 0.2774552387321152, + "learning_rate": 0.00018476106750547384, + "loss": 2.846264123916626, + "step": 7477, + "token_acc": 0.31913908511257505 + }, + { + "epoch": 4.383465259454705, + "grad_norm": 0.2493480408202868, + "learning_rate": 0.00018475592430602743, + "loss": 2.8156681060791016, + "step": 7478, + "token_acc": 0.32345000321109757 + }, + { + "epoch": 4.384051597771914, + "grad_norm": 0.2356671289412216, + "learning_rate": 0.00018475078031041333, + "loss": 2.829155683517456, + "step": 7479, + "token_acc": 0.32324218485161593 + }, + { + "epoch": 4.384637936089123, + "grad_norm": 0.27505341445681825, + "learning_rate": 0.00018474563551867986, + "loss": 2.8232085704803467, + "step": 7480, + "token_acc": 0.3228202033405189 + }, + { + "epoch": 4.385224274406332, + "grad_norm": 0.24697200121043764, + "learning_rate": 0.00018474048993087533, + "loss": 2.812225580215454, + "step": 7481, + "token_acc": 0.3247289623128549 + }, + { + "epoch": 4.3858106127235414, + "grad_norm": 0.24605840924966021, + "learning_rate": 0.00018473534354704807, + "loss": 2.8080828189849854, + "step": 7482, + "token_acc": 0.3236353198371021 + }, + { + "epoch": 4.386396951040751, + "grad_norm": 0.2540521217304517, + "learning_rate": 0.00018473019636724644, + "loss": 2.8516621589660645, + "step": 7483, + "token_acc": 0.3180080738925843 + }, + { + "epoch": 4.38698328935796, + "grad_norm": 0.24968551555319693, + "learning_rate": 0.0001847250483915188, + "loss": 2.819540023803711, + "step": 7484, + "token_acc": 0.3239070946033658 + }, + { + "epoch": 4.387569627675169, + "grad_norm": 0.2577191060926478, + "learning_rate": 0.00018471989961991352, + "loss": 2.818208694458008, + "step": 7485, + "token_acc": 0.322424696114665 + }, + { + "epoch": 4.388155965992378, + "grad_norm": 0.24788598187225602, + "learning_rate": 0.00018471475005247894, + "loss": 2.822981357574463, + "step": 7486, + "token_acc": 0.32137172148826837 + }, + { + "epoch": 4.388742304309587, + "grad_norm": 0.23539170279259933, + "learning_rate": 0.0001847095996892634, + "loss": 2.7876975536346436, + "step": 7487, + "token_acc": 0.3276886966612163 + }, + { + "epoch": 4.389328642626795, + "grad_norm": 0.24338792878012794, + "learning_rate": 0.00018470444853031535, + "loss": 2.814864158630371, + "step": 7488, + "token_acc": 0.3226865579758472 + }, + { + "epoch": 4.389914980944004, + "grad_norm": 0.22651877835897277, + "learning_rate": 0.00018469929657568312, + "loss": 2.845851182937622, + "step": 7489, + "token_acc": 0.31883428374032036 + }, + { + "epoch": 4.3905013192612135, + "grad_norm": 0.2590390065359618, + "learning_rate": 0.00018469414382541518, + "loss": 2.8050179481506348, + "step": 7490, + "token_acc": 0.3262142053944146 + }, + { + "epoch": 4.391087657578423, + "grad_norm": 0.22230814204997276, + "learning_rate": 0.00018468899027955984, + "loss": 2.8113574981689453, + "step": 7491, + "token_acc": 0.3242332809787551 + }, + { + "epoch": 4.391673995895632, + "grad_norm": 0.2605781214041405, + "learning_rate": 0.00018468383593816555, + "loss": 2.823817729949951, + "step": 7492, + "token_acc": 0.32187318133026394 + }, + { + "epoch": 4.392260334212841, + "grad_norm": 0.2544185786793318, + "learning_rate": 0.0001846786808012808, + "loss": 2.8755605220794678, + "step": 7493, + "token_acc": 0.3159327000380823 + }, + { + "epoch": 4.39284667253005, + "grad_norm": 0.28288620277312204, + "learning_rate": 0.0001846735248689539, + "loss": 2.8294219970703125, + "step": 7494, + "token_acc": 0.32254745999128537 + }, + { + "epoch": 4.393433010847259, + "grad_norm": 0.258163487821477, + "learning_rate": 0.00018466836814123335, + "loss": 2.836784839630127, + "step": 7495, + "token_acc": 0.3206309693817391 + }, + { + "epoch": 4.394019349164468, + "grad_norm": 0.25159277235452043, + "learning_rate": 0.00018466321061816755, + "loss": 2.7986326217651367, + "step": 7496, + "token_acc": 0.32642096717135083 + }, + { + "epoch": 4.394605687481677, + "grad_norm": 0.2734817920851079, + "learning_rate": 0.00018465805229980498, + "loss": 2.877167224884033, + "step": 7497, + "token_acc": 0.3154312836221546 + }, + { + "epoch": 4.395192025798886, + "grad_norm": 0.35692770495698906, + "learning_rate": 0.0001846528931861941, + "loss": 2.8124876022338867, + "step": 7498, + "token_acc": 0.32456011989161315 + }, + { + "epoch": 4.395778364116095, + "grad_norm": 0.45778941110682464, + "learning_rate": 0.00018464773327738336, + "loss": 2.8079657554626465, + "step": 7499, + "token_acc": 0.3247841491163361 + }, + { + "epoch": 4.396364702433304, + "grad_norm": 0.3141493753615565, + "learning_rate": 0.00018464257257342118, + "loss": 2.8134732246398926, + "step": 7500, + "token_acc": 0.3246066780439981 + }, + { + "epoch": 4.396951040750513, + "grad_norm": 0.28180349341437266, + "learning_rate": 0.00018463741107435614, + "loss": 2.8101422786712646, + "step": 7501, + "token_acc": 0.32392081977354026 + }, + { + "epoch": 4.397537379067722, + "grad_norm": 0.3227499011081679, + "learning_rate": 0.00018463224878023668, + "loss": 2.824122905731201, + "step": 7502, + "token_acc": 0.3227849554834808 + }, + { + "epoch": 4.398123717384931, + "grad_norm": 0.2568344379646359, + "learning_rate": 0.00018462708569111128, + "loss": 2.850935459136963, + "step": 7503, + "token_acc": 0.3177146152055693 + }, + { + "epoch": 4.39871005570214, + "grad_norm": 0.32413419867836946, + "learning_rate": 0.0001846219218070284, + "loss": 2.834317684173584, + "step": 7504, + "token_acc": 0.32036655390079194 + }, + { + "epoch": 4.399296394019349, + "grad_norm": 0.23842504434135364, + "learning_rate": 0.00018461675712803667, + "loss": 2.854387044906616, + "step": 7505, + "token_acc": 0.31712410423026033 + }, + { + "epoch": 4.399882732336558, + "grad_norm": 0.28480909719674335, + "learning_rate": 0.00018461159165418447, + "loss": 2.7860805988311768, + "step": 7506, + "token_acc": 0.32932264989951315 + }, + { + "epoch": 4.4004690706537675, + "grad_norm": 0.2554176220314128, + "learning_rate": 0.0001846064253855204, + "loss": 2.8351945877075195, + "step": 7507, + "token_acc": 0.320398403225169 + }, + { + "epoch": 4.401055408970977, + "grad_norm": 0.25782240507866955, + "learning_rate": 0.000184601258322093, + "loss": 2.823171615600586, + "step": 7508, + "token_acc": 0.32194886457594296 + }, + { + "epoch": 4.401641747288186, + "grad_norm": 0.2608368634224289, + "learning_rate": 0.0001845960904639507, + "loss": 2.797649383544922, + "step": 7509, + "token_acc": 0.3266473962211777 + }, + { + "epoch": 4.402228085605394, + "grad_norm": 0.2952272562801867, + "learning_rate": 0.00018459092181114222, + "loss": 2.8523616790771484, + "step": 7510, + "token_acc": 0.31784478497281266 + }, + { + "epoch": 4.402814423922603, + "grad_norm": 0.2424137248369666, + "learning_rate": 0.00018458575236371595, + "loss": 2.8539533615112305, + "step": 7511, + "token_acc": 0.31906207900569294 + }, + { + "epoch": 4.403400762239812, + "grad_norm": 0.25429988737721926, + "learning_rate": 0.00018458058212172056, + "loss": 2.857726812362671, + "step": 7512, + "token_acc": 0.31731123880796475 + }, + { + "epoch": 4.403987100557021, + "grad_norm": 0.24527297593155334, + "learning_rate": 0.00018457541108520456, + "loss": 2.8450088500976562, + "step": 7513, + "token_acc": 0.3190523296590564 + }, + { + "epoch": 4.40457343887423, + "grad_norm": 0.25075864752553134, + "learning_rate": 0.00018457023925421654, + "loss": 2.8414525985717773, + "step": 7514, + "token_acc": 0.3192677027125418 + }, + { + "epoch": 4.4051597771914395, + "grad_norm": 0.2305568752809105, + "learning_rate": 0.00018456506662880507, + "loss": 2.79876446723938, + "step": 7515, + "token_acc": 0.32583267659380316 + }, + { + "epoch": 4.405746115508649, + "grad_norm": 0.2527565832974834, + "learning_rate": 0.00018455989320901876, + "loss": 2.76139497756958, + "step": 7516, + "token_acc": 0.33267204131099165 + }, + { + "epoch": 4.406332453825858, + "grad_norm": 0.23173111998779827, + "learning_rate": 0.0001845547189949062, + "loss": 2.8168601989746094, + "step": 7517, + "token_acc": 0.32405750923882526 + }, + { + "epoch": 4.406918792143067, + "grad_norm": 0.26359720632726735, + "learning_rate": 0.000184549543986516, + "loss": 2.836848497390747, + "step": 7518, + "token_acc": 0.3207381704909224 + }, + { + "epoch": 4.407505130460276, + "grad_norm": 0.24613130124419194, + "learning_rate": 0.00018454436818389676, + "loss": 2.829709529876709, + "step": 7519, + "token_acc": 0.3211919588273764 + }, + { + "epoch": 4.408091468777485, + "grad_norm": 0.259813917786715, + "learning_rate": 0.0001845391915870971, + "loss": 2.826183319091797, + "step": 7520, + "token_acc": 0.3205655251304783 + }, + { + "epoch": 4.408677807094693, + "grad_norm": 0.24254868245861122, + "learning_rate": 0.00018453401419616565, + "loss": 2.810854911804199, + "step": 7521, + "token_acc": 0.32620550372716767 + }, + { + "epoch": 4.409264145411902, + "grad_norm": 0.2513801498981925, + "learning_rate": 0.00018452883601115106, + "loss": 2.874814748764038, + "step": 7522, + "token_acc": 0.31515510924857015 + }, + { + "epoch": 4.4098504837291115, + "grad_norm": 0.25210140035107903, + "learning_rate": 0.00018452365703210195, + "loss": 2.8333945274353027, + "step": 7523, + "token_acc": 0.32058536383247055 + }, + { + "epoch": 4.410436822046321, + "grad_norm": 0.2526980781632405, + "learning_rate": 0.00018451847725906698, + "loss": 2.8420867919921875, + "step": 7524, + "token_acc": 0.3200683255620701 + }, + { + "epoch": 4.41102316036353, + "grad_norm": 0.26129697425638454, + "learning_rate": 0.00018451329669209483, + "loss": 2.8255414962768555, + "step": 7525, + "token_acc": 0.3221202341460315 + }, + { + "epoch": 4.411609498680739, + "grad_norm": 0.2608816802452931, + "learning_rate": 0.00018450811533123412, + "loss": 2.8358936309814453, + "step": 7526, + "token_acc": 0.32078285669431433 + }, + { + "epoch": 4.412195836997948, + "grad_norm": 0.27501187345812267, + "learning_rate": 0.00018450293317653354, + "loss": 2.823366165161133, + "step": 7527, + "token_acc": 0.32151920889122254 + }, + { + "epoch": 4.412782175315157, + "grad_norm": 0.2998283932480553, + "learning_rate": 0.00018449775022804176, + "loss": 2.7763662338256836, + "step": 7528, + "token_acc": 0.3320185584180403 + }, + { + "epoch": 4.413368513632366, + "grad_norm": 0.25235150930051264, + "learning_rate": 0.0001844925664858075, + "loss": 2.822849750518799, + "step": 7529, + "token_acc": 0.3229004208772192 + }, + { + "epoch": 4.413954851949575, + "grad_norm": 0.273642005806271, + "learning_rate": 0.00018448738194987944, + "loss": 2.8518691062927246, + "step": 7530, + "token_acc": 0.3196752728240618 + }, + { + "epoch": 4.4145411902667835, + "grad_norm": 0.3012883180946971, + "learning_rate": 0.00018448219662030625, + "loss": 2.824174642562866, + "step": 7531, + "token_acc": 0.3232736654775785 + }, + { + "epoch": 4.415127528583993, + "grad_norm": 0.27965091291508765, + "learning_rate": 0.0001844770104971367, + "loss": 2.803652763366699, + "step": 7532, + "token_acc": 0.3261475946141396 + }, + { + "epoch": 4.415713866901202, + "grad_norm": 0.237697541514221, + "learning_rate": 0.00018447182358041943, + "loss": 2.7950549125671387, + "step": 7533, + "token_acc": 0.32593421172430836 + }, + { + "epoch": 4.416300205218411, + "grad_norm": 0.2900524086476858, + "learning_rate": 0.00018446663587020322, + "loss": 2.835469961166382, + "step": 7534, + "token_acc": 0.32011954334233317 + }, + { + "epoch": 4.41688654353562, + "grad_norm": 0.2846375564624913, + "learning_rate": 0.0001844614473665368, + "loss": 2.808065176010132, + "step": 7535, + "token_acc": 0.32274238948857975 + }, + { + "epoch": 4.417472881852829, + "grad_norm": 0.2586241173064911, + "learning_rate": 0.00018445625806946887, + "loss": 2.8396291732788086, + "step": 7536, + "token_acc": 0.32136194395966394 + }, + { + "epoch": 4.418059220170038, + "grad_norm": 0.3049325154891526, + "learning_rate": 0.00018445106797904823, + "loss": 2.8562135696411133, + "step": 7537, + "token_acc": 0.31745504122573526 + }, + { + "epoch": 4.418645558487247, + "grad_norm": 0.35100214991079315, + "learning_rate": 0.0001844458770953236, + "loss": 2.843250274658203, + "step": 7538, + "token_acc": 0.3202434437362867 + }, + { + "epoch": 4.419231896804456, + "grad_norm": 0.27282960892861985, + "learning_rate": 0.0001844406854183437, + "loss": 2.8035359382629395, + "step": 7539, + "token_acc": 0.32677366645582967 + }, + { + "epoch": 4.4198182351216655, + "grad_norm": 0.2644995704039485, + "learning_rate": 0.0001844354929481574, + "loss": 2.8357295989990234, + "step": 7540, + "token_acc": 0.3197456769504127 + }, + { + "epoch": 4.420404573438875, + "grad_norm": 0.29052950665999944, + "learning_rate": 0.0001844302996848134, + "loss": 2.8587915897369385, + "step": 7541, + "token_acc": 0.31716682169887106 + }, + { + "epoch": 4.420990911756084, + "grad_norm": 0.24364408761410675, + "learning_rate": 0.00018442510562836052, + "loss": 2.8119564056396484, + "step": 7542, + "token_acc": 0.325652274762248 + }, + { + "epoch": 4.421577250073292, + "grad_norm": 0.2524167382271862, + "learning_rate": 0.00018441991077884753, + "loss": 2.8635408878326416, + "step": 7543, + "token_acc": 0.31652915766182316 + }, + { + "epoch": 4.422163588390501, + "grad_norm": 0.25292487422078413, + "learning_rate": 0.00018441471513632322, + "loss": 2.811081886291504, + "step": 7544, + "token_acc": 0.32360527482326956 + }, + { + "epoch": 4.42274992670771, + "grad_norm": 0.23421140131752005, + "learning_rate": 0.00018440951870083642, + "loss": 2.818826913833618, + "step": 7545, + "token_acc": 0.32250273864418005 + }, + { + "epoch": 4.423336265024919, + "grad_norm": 0.24485967734155728, + "learning_rate": 0.00018440432147243596, + "loss": 2.8264498710632324, + "step": 7546, + "token_acc": 0.32216061527454837 + }, + { + "epoch": 4.423922603342128, + "grad_norm": 0.24473415634227536, + "learning_rate": 0.0001843991234511706, + "loss": 2.8714261054992676, + "step": 7547, + "token_acc": 0.3162602423956264 + }, + { + "epoch": 4.4245089416593375, + "grad_norm": 0.24255771660243144, + "learning_rate": 0.00018439392463708923, + "loss": 2.793022632598877, + "step": 7548, + "token_acc": 0.32887164464389596 + }, + { + "epoch": 4.425095279976547, + "grad_norm": 0.256376106147076, + "learning_rate": 0.00018438872503024066, + "loss": 2.8813676834106445, + "step": 7549, + "token_acc": 0.3148656135198662 + }, + { + "epoch": 4.425681618293756, + "grad_norm": 0.24394831549153587, + "learning_rate": 0.00018438352463067372, + "loss": 2.8206028938293457, + "step": 7550, + "token_acc": 0.3236345916546808 + }, + { + "epoch": 4.426267956610965, + "grad_norm": 0.28263447895312116, + "learning_rate": 0.00018437832343843726, + "loss": 2.828641891479492, + "step": 7551, + "token_acc": 0.3197032168465391 + }, + { + "epoch": 4.426854294928174, + "grad_norm": 0.29927512465755, + "learning_rate": 0.00018437312145358018, + "loss": 2.8647751808166504, + "step": 7552, + "token_acc": 0.3148723176246112 + }, + { + "epoch": 4.427440633245382, + "grad_norm": 0.24584354832458374, + "learning_rate": 0.00018436791867615132, + "loss": 2.834036111831665, + "step": 7553, + "token_acc": 0.3193284545845889 + }, + { + "epoch": 4.428026971562591, + "grad_norm": 0.25984077818669643, + "learning_rate": 0.00018436271510619952, + "loss": 2.800201892852783, + "step": 7554, + "token_acc": 0.3248483932926171 + }, + { + "epoch": 4.4286133098798, + "grad_norm": 0.31749065808529503, + "learning_rate": 0.00018435751074377375, + "loss": 2.7959418296813965, + "step": 7555, + "token_acc": 0.3262571339408974 + }, + { + "epoch": 4.4291996481970095, + "grad_norm": 0.31148652057948895, + "learning_rate": 0.00018435230558892278, + "loss": 2.816371440887451, + "step": 7556, + "token_acc": 0.32256624304344217 + }, + { + "epoch": 4.429785986514219, + "grad_norm": 0.27053789502434866, + "learning_rate": 0.0001843470996416956, + "loss": 2.842940330505371, + "step": 7557, + "token_acc": 0.31853133218010377 + }, + { + "epoch": 4.430372324831428, + "grad_norm": 0.3012481775206665, + "learning_rate": 0.00018434189290214106, + "loss": 2.838082790374756, + "step": 7558, + "token_acc": 0.3200302128808309 + }, + { + "epoch": 4.430958663148637, + "grad_norm": 0.26973785392052835, + "learning_rate": 0.0001843366853703081, + "loss": 2.804936408996582, + "step": 7559, + "token_acc": 0.32386304810094324 + }, + { + "epoch": 4.431545001465846, + "grad_norm": 0.26781086810824756, + "learning_rate": 0.0001843314770462456, + "loss": 2.8340702056884766, + "step": 7560, + "token_acc": 0.3190427747904859 + }, + { + "epoch": 4.432131339783055, + "grad_norm": 0.3130702051821676, + "learning_rate": 0.00018432626793000255, + "loss": 2.84051513671875, + "step": 7561, + "token_acc": 0.3185650262246334 + }, + { + "epoch": 4.432717678100264, + "grad_norm": 0.2582564755805302, + "learning_rate": 0.00018432105802162783, + "loss": 2.8592031002044678, + "step": 7562, + "token_acc": 0.31721227749082914 + }, + { + "epoch": 4.433304016417473, + "grad_norm": 0.29265668627900765, + "learning_rate": 0.00018431584732117037, + "loss": 2.8405814170837402, + "step": 7563, + "token_acc": 0.3198559578162176 + }, + { + "epoch": 4.4338903547346815, + "grad_norm": 0.24068537032061196, + "learning_rate": 0.00018431063582867915, + "loss": 2.8366122245788574, + "step": 7564, + "token_acc": 0.3204174326524279 + }, + { + "epoch": 4.434476693051891, + "grad_norm": 0.30265062253321445, + "learning_rate": 0.00018430542354420313, + "loss": 2.8384690284729004, + "step": 7565, + "token_acc": 0.3201527841791029 + }, + { + "epoch": 4.4350630313691, + "grad_norm": 0.2882114115040358, + "learning_rate": 0.00018430021046779126, + "loss": 2.8192219734191895, + "step": 7566, + "token_acc": 0.3220837041309244 + }, + { + "epoch": 4.435649369686309, + "grad_norm": 0.2422760938544285, + "learning_rate": 0.0001842949965994925, + "loss": 2.8562135696411133, + "step": 7567, + "token_acc": 0.31603166974969726 + }, + { + "epoch": 4.436235708003518, + "grad_norm": 0.24261972065093299, + "learning_rate": 0.00018428978193935585, + "loss": 2.791912078857422, + "step": 7568, + "token_acc": 0.32633406271277177 + }, + { + "epoch": 4.436822046320727, + "grad_norm": 0.26858292217634006, + "learning_rate": 0.00018428456648743026, + "loss": 2.8356292247772217, + "step": 7569, + "token_acc": 0.3218397972047879 + }, + { + "epoch": 4.437408384637936, + "grad_norm": 0.26882968517475314, + "learning_rate": 0.00018427935024376474, + "loss": 2.8390235900878906, + "step": 7570, + "token_acc": 0.31889707394937417 + }, + { + "epoch": 4.437994722955145, + "grad_norm": 0.23885338800863146, + "learning_rate": 0.0001842741332084083, + "loss": 2.8280577659606934, + "step": 7571, + "token_acc": 0.32107144639484764 + }, + { + "epoch": 4.438581061272354, + "grad_norm": 0.2593815909380509, + "learning_rate": 0.00018426891538140999, + "loss": 2.8573594093322754, + "step": 7572, + "token_acc": 0.31738128816270983 + }, + { + "epoch": 4.4391673995895635, + "grad_norm": 0.234711161106046, + "learning_rate": 0.00018426369676281871, + "loss": 2.793976306915283, + "step": 7573, + "token_acc": 0.32644046144019206 + }, + { + "epoch": 4.439753737906772, + "grad_norm": 0.2576300328520498, + "learning_rate": 0.00018425847735268356, + "loss": 2.8758177757263184, + "step": 7574, + "token_acc": 0.314799037143869 + }, + { + "epoch": 4.440340076223981, + "grad_norm": 0.23774499876129487, + "learning_rate": 0.00018425325715105357, + "loss": 2.810102939605713, + "step": 7575, + "token_acc": 0.3242615734539399 + }, + { + "epoch": 4.44092641454119, + "grad_norm": 0.2408904460414873, + "learning_rate": 0.00018424803615797774, + "loss": 2.8378005027770996, + "step": 7576, + "token_acc": 0.31937275904566953 + }, + { + "epoch": 4.441512752858399, + "grad_norm": 0.25831445762506355, + "learning_rate": 0.00018424281437350514, + "loss": 2.8527908325195312, + "step": 7577, + "token_acc": 0.31651328026854775 + }, + { + "epoch": 4.442099091175608, + "grad_norm": 0.25854493025000974, + "learning_rate": 0.00018423759179768485, + "loss": 2.842883825302124, + "step": 7578, + "token_acc": 0.3202257238726486 + }, + { + "epoch": 4.442685429492817, + "grad_norm": 0.2607906446626564, + "learning_rate": 0.00018423236843056586, + "loss": 2.837601900100708, + "step": 7579, + "token_acc": 0.320505957040621 + }, + { + "epoch": 4.443271767810026, + "grad_norm": 0.2523269174841092, + "learning_rate": 0.00018422714427219727, + "loss": 2.8212642669677734, + "step": 7580, + "token_acc": 0.321689218759091 + }, + { + "epoch": 4.4438581061272355, + "grad_norm": 0.23376552188734268, + "learning_rate": 0.00018422191932262818, + "loss": 2.8117740154266357, + "step": 7581, + "token_acc": 0.3251956550058621 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.28522257517835853, + "learning_rate": 0.00018421669358190763, + "loss": 2.8422253131866455, + "step": 7582, + "token_acc": 0.3203974454397352 + }, + { + "epoch": 4.445030782761654, + "grad_norm": 0.2901075874254731, + "learning_rate": 0.00018421146705008474, + "loss": 2.833270788192749, + "step": 7583, + "token_acc": 0.32162403498845604 + }, + { + "epoch": 4.445617121078863, + "grad_norm": 0.2701571805264613, + "learning_rate": 0.0001842062397272086, + "loss": 2.821427822113037, + "step": 7584, + "token_acc": 0.32338791844794806 + }, + { + "epoch": 4.446203459396072, + "grad_norm": 0.2663567426397365, + "learning_rate": 0.0001842010116133283, + "loss": 2.8433446884155273, + "step": 7585, + "token_acc": 0.3204010438858117 + }, + { + "epoch": 4.44678979771328, + "grad_norm": 0.2685978192882868, + "learning_rate": 0.00018419578270849294, + "loss": 2.869859218597412, + "step": 7586, + "token_acc": 0.31602902497714935 + }, + { + "epoch": 4.447376136030489, + "grad_norm": 0.25702851703654667, + "learning_rate": 0.00018419055301275168, + "loss": 2.8544278144836426, + "step": 7587, + "token_acc": 0.3187604842242475 + }, + { + "epoch": 4.447962474347698, + "grad_norm": 0.24812503461958954, + "learning_rate": 0.0001841853225261536, + "loss": 2.8512682914733887, + "step": 7588, + "token_acc": 0.31857121003518435 + }, + { + "epoch": 4.4485488126649075, + "grad_norm": 0.2730652004481719, + "learning_rate": 0.00018418009124874789, + "loss": 2.8665080070495605, + "step": 7589, + "token_acc": 0.3148870455121688 + }, + { + "epoch": 4.449135150982117, + "grad_norm": 0.24112196148090861, + "learning_rate": 0.00018417485918058364, + "loss": 2.8224263191223145, + "step": 7590, + "token_acc": 0.3240234079103125 + }, + { + "epoch": 4.449721489299326, + "grad_norm": 0.25514301679841356, + "learning_rate": 0.00018416962632171, + "loss": 2.8792271614074707, + "step": 7591, + "token_acc": 0.31615314235641295 + }, + { + "epoch": 4.450307827616535, + "grad_norm": 0.249957691130563, + "learning_rate": 0.00018416439267217617, + "loss": 2.8368616104125977, + "step": 7592, + "token_acc": 0.32170399748626227 + }, + { + "epoch": 4.450894165933744, + "grad_norm": 0.24715512652708144, + "learning_rate": 0.00018415915823203127, + "loss": 2.829893112182617, + "step": 7593, + "token_acc": 0.32298647821948134 + }, + { + "epoch": 4.451480504250953, + "grad_norm": 0.24901244890256663, + "learning_rate": 0.00018415392300132446, + "loss": 2.8170974254608154, + "step": 7594, + "token_acc": 0.3252518729361072 + }, + { + "epoch": 4.452066842568162, + "grad_norm": 0.28268378554774404, + "learning_rate": 0.00018414868698010496, + "loss": 2.819002151489258, + "step": 7595, + "token_acc": 0.3228680407498418 + }, + { + "epoch": 4.45265318088537, + "grad_norm": 0.2456659097790723, + "learning_rate": 0.00018414345016842196, + "loss": 2.805337429046631, + "step": 7596, + "token_acc": 0.3270001268579403 + }, + { + "epoch": 4.4532395192025795, + "grad_norm": 0.25690610632220606, + "learning_rate": 0.0001841382125663246, + "loss": 2.821662425994873, + "step": 7597, + "token_acc": 0.32342470961378433 + }, + { + "epoch": 4.453825857519789, + "grad_norm": 0.29265604747385204, + "learning_rate": 0.00018413297417386212, + "loss": 2.8232250213623047, + "step": 7598, + "token_acc": 0.3229396851724518 + }, + { + "epoch": 4.454412195836998, + "grad_norm": 0.3453231541848402, + "learning_rate": 0.00018412773499108372, + "loss": 2.8285274505615234, + "step": 7599, + "token_acc": 0.3225101318331771 + }, + { + "epoch": 4.454998534154207, + "grad_norm": 0.2839377598121682, + "learning_rate": 0.00018412249501803863, + "loss": 2.845349073410034, + "step": 7600, + "token_acc": 0.3197847008736623 + }, + { + "epoch": 4.455584872471416, + "grad_norm": 0.2724549842334263, + "learning_rate": 0.00018411725425477603, + "loss": 2.8273396492004395, + "step": 7601, + "token_acc": 0.3214776865395716 + }, + { + "epoch": 4.456171210788625, + "grad_norm": 0.37296077424605156, + "learning_rate": 0.00018411201270134519, + "loss": 2.820012092590332, + "step": 7602, + "token_acc": 0.3242395146625757 + }, + { + "epoch": 4.456757549105834, + "grad_norm": 0.24039967189706796, + "learning_rate": 0.00018410677035779534, + "loss": 2.8533453941345215, + "step": 7603, + "token_acc": 0.31911072865836376 + }, + { + "epoch": 4.457343887423043, + "grad_norm": 0.30928955260272556, + "learning_rate": 0.00018410152722417569, + "loss": 2.789132833480835, + "step": 7604, + "token_acc": 0.3275837783155981 + }, + { + "epoch": 4.457930225740252, + "grad_norm": 0.27759620470150814, + "learning_rate": 0.00018409628330053553, + "loss": 2.8005530834198, + "step": 7605, + "token_acc": 0.3262782444959444 + }, + { + "epoch": 4.4585165640574616, + "grad_norm": 0.29330952398537463, + "learning_rate": 0.00018409103858692413, + "loss": 2.8571019172668457, + "step": 7606, + "token_acc": 0.3177122280789063 + }, + { + "epoch": 4.45910290237467, + "grad_norm": 0.2790118431679959, + "learning_rate": 0.00018408579308339072, + "loss": 2.820038318634033, + "step": 7607, + "token_acc": 0.32336490729625883 + }, + { + "epoch": 4.459689240691879, + "grad_norm": 0.2958092304985593, + "learning_rate": 0.00018408054678998461, + "loss": 2.868168830871582, + "step": 7608, + "token_acc": 0.315758896151053 + }, + { + "epoch": 4.460275579009088, + "grad_norm": 0.28362988914876425, + "learning_rate": 0.00018407529970675503, + "loss": 2.820218086242676, + "step": 7609, + "token_acc": 0.32304653551394485 + }, + { + "epoch": 4.460861917326297, + "grad_norm": 0.2924205538995259, + "learning_rate": 0.0001840700518337513, + "loss": 2.848245620727539, + "step": 7610, + "token_acc": 0.31917014907381364 + }, + { + "epoch": 4.461448255643506, + "grad_norm": 0.2685462146681838, + "learning_rate": 0.00018406480317102278, + "loss": 2.8176627159118652, + "step": 7611, + "token_acc": 0.323820666914854 + }, + { + "epoch": 4.462034593960715, + "grad_norm": 0.2725157207960877, + "learning_rate": 0.00018405955371861865, + "loss": 2.815708637237549, + "step": 7612, + "token_acc": 0.32371825659640946 + }, + { + "epoch": 4.4626209322779244, + "grad_norm": 0.2472827958635882, + "learning_rate": 0.0001840543034765883, + "loss": 2.827860116958618, + "step": 7613, + "token_acc": 0.32094730928397913 + }, + { + "epoch": 4.463207270595134, + "grad_norm": 0.27251339159499216, + "learning_rate": 0.00018404905244498104, + "loss": 2.7749600410461426, + "step": 7614, + "token_acc": 0.32916604551459666 + }, + { + "epoch": 4.463793608912343, + "grad_norm": 0.23876865879343587, + "learning_rate": 0.0001840438006238462, + "loss": 2.8428664207458496, + "step": 7615, + "token_acc": 0.32059586329586565 + }, + { + "epoch": 4.464379947229552, + "grad_norm": 0.318390904270576, + "learning_rate": 0.00018403854801323307, + "loss": 2.832314968109131, + "step": 7616, + "token_acc": 0.3212141358319656 + }, + { + "epoch": 4.464966285546761, + "grad_norm": 0.2634444733489264, + "learning_rate": 0.00018403329461319105, + "loss": 2.839538097381592, + "step": 7617, + "token_acc": 0.3198911390378612 + }, + { + "epoch": 4.465552623863969, + "grad_norm": 0.2934033645289138, + "learning_rate": 0.00018402804042376946, + "loss": 2.8125712871551514, + "step": 7618, + "token_acc": 0.3247531326482555 + }, + { + "epoch": 4.466138962181178, + "grad_norm": 0.2773547420284079, + "learning_rate": 0.00018402278544501764, + "loss": 2.8212928771972656, + "step": 7619, + "token_acc": 0.3231636522792019 + }, + { + "epoch": 4.466725300498387, + "grad_norm": 0.283115780973382, + "learning_rate": 0.000184017529676985, + "loss": 2.8487982749938965, + "step": 7620, + "token_acc": 0.31756186227341715 + }, + { + "epoch": 4.4673116388155965, + "grad_norm": 0.2525810592529912, + "learning_rate": 0.0001840122731197209, + "loss": 2.850487232208252, + "step": 7621, + "token_acc": 0.31851261427098937 + }, + { + "epoch": 4.467897977132806, + "grad_norm": 0.26651622606811115, + "learning_rate": 0.00018400701577327467, + "loss": 2.819931745529175, + "step": 7622, + "token_acc": 0.32462687575131355 + }, + { + "epoch": 4.468484315450015, + "grad_norm": 0.2513106816248567, + "learning_rate": 0.00018400175763769573, + "loss": 2.8364052772521973, + "step": 7623, + "token_acc": 0.31906281701729683 + }, + { + "epoch": 4.469070653767224, + "grad_norm": 0.3403004040400084, + "learning_rate": 0.00018399649871303348, + "loss": 2.807452440261841, + "step": 7624, + "token_acc": 0.3250064336347827 + }, + { + "epoch": 4.469656992084433, + "grad_norm": 0.28545288123916884, + "learning_rate": 0.0001839912389993373, + "loss": 2.80859375, + "step": 7625, + "token_acc": 0.3249975894446341 + }, + { + "epoch": 4.470243330401642, + "grad_norm": 0.28630422773219927, + "learning_rate": 0.00018398597849665662, + "loss": 2.865186929702759, + "step": 7626, + "token_acc": 0.31636102458408477 + }, + { + "epoch": 4.470829668718851, + "grad_norm": 0.2540941942448384, + "learning_rate": 0.00018398071720504084, + "loss": 2.817577362060547, + "step": 7627, + "token_acc": 0.3226896408574859 + }, + { + "epoch": 4.47141600703606, + "grad_norm": 0.29702887857342286, + "learning_rate": 0.0001839754551245394, + "loss": 2.8362388610839844, + "step": 7628, + "token_acc": 0.32158888945310515 + }, + { + "epoch": 4.4720023453532685, + "grad_norm": 0.25611099666619275, + "learning_rate": 0.00018397019225520168, + "loss": 2.8643293380737305, + "step": 7629, + "token_acc": 0.3151626564543723 + }, + { + "epoch": 4.472588683670478, + "grad_norm": 0.29086159210294393, + "learning_rate": 0.00018396492859707722, + "loss": 2.837221622467041, + "step": 7630, + "token_acc": 0.318861793227144 + }, + { + "epoch": 4.473175021987687, + "grad_norm": 0.2802747278749033, + "learning_rate": 0.00018395966415021535, + "loss": 2.8511292934417725, + "step": 7631, + "token_acc": 0.3168966633934354 + }, + { + "epoch": 4.473761360304896, + "grad_norm": 0.26831293554504737, + "learning_rate": 0.00018395439891466558, + "loss": 2.7913360595703125, + "step": 7632, + "token_acc": 0.32810912369968637 + }, + { + "epoch": 4.474347698622105, + "grad_norm": 0.31200261526343953, + "learning_rate": 0.00018394913289047736, + "loss": 2.8551840782165527, + "step": 7633, + "token_acc": 0.3164997506648936 + }, + { + "epoch": 4.474934036939314, + "grad_norm": 0.24670151769544466, + "learning_rate": 0.00018394386607770017, + "loss": 2.8199048042297363, + "step": 7634, + "token_acc": 0.3236031165443487 + }, + { + "epoch": 4.475520375256523, + "grad_norm": 0.2919775453483244, + "learning_rate": 0.00018393859847638347, + "loss": 2.8268532752990723, + "step": 7635, + "token_acc": 0.3230495517193157 + }, + { + "epoch": 4.476106713573732, + "grad_norm": 0.24387830948670028, + "learning_rate": 0.00018393333008657673, + "loss": 2.801215648651123, + "step": 7636, + "token_acc": 0.3256830738956026 + }, + { + "epoch": 4.476693051890941, + "grad_norm": 0.273144353056082, + "learning_rate": 0.0001839280609083295, + "loss": 2.812460422515869, + "step": 7637, + "token_acc": 0.32563834491714105 + }, + { + "epoch": 4.4772793902081505, + "grad_norm": 0.24173550853568615, + "learning_rate": 0.00018392279094169118, + "loss": 2.7972640991210938, + "step": 7638, + "token_acc": 0.3267057925873278 + }, + { + "epoch": 4.477865728525359, + "grad_norm": 0.27795020684064486, + "learning_rate": 0.00018391752018671133, + "loss": 2.8707380294799805, + "step": 7639, + "token_acc": 0.31554272966144786 + }, + { + "epoch": 4.478452066842568, + "grad_norm": 0.2603277479645157, + "learning_rate": 0.0001839122486434395, + "loss": 2.87839937210083, + "step": 7640, + "token_acc": 0.31514530680623803 + }, + { + "epoch": 4.479038405159777, + "grad_norm": 0.29210511045907017, + "learning_rate": 0.00018390697631192511, + "loss": 2.851154088973999, + "step": 7641, + "token_acc": 0.31763267635425035 + }, + { + "epoch": 4.479624743476986, + "grad_norm": 0.27213655322917163, + "learning_rate": 0.0001839017031922178, + "loss": 2.827998638153076, + "step": 7642, + "token_acc": 0.3203301497589253 + }, + { + "epoch": 4.480211081794195, + "grad_norm": 0.3049019968192061, + "learning_rate": 0.00018389642928436702, + "loss": 2.872258186340332, + "step": 7643, + "token_acc": 0.3156141225828676 + }, + { + "epoch": 4.480797420111404, + "grad_norm": 0.3101507805328225, + "learning_rate": 0.00018389115458842238, + "loss": 2.8185393810272217, + "step": 7644, + "token_acc": 0.3225825597362621 + }, + { + "epoch": 4.481383758428613, + "grad_norm": 0.2959430211857069, + "learning_rate": 0.00018388587910443332, + "loss": 2.8609461784362793, + "step": 7645, + "token_acc": 0.3172168108220871 + }, + { + "epoch": 4.4819700967458225, + "grad_norm": 0.268563893286785, + "learning_rate": 0.0001838806028324495, + "loss": 2.8676962852478027, + "step": 7646, + "token_acc": 0.3152370034904104 + }, + { + "epoch": 4.482556435063032, + "grad_norm": 0.34094091087210315, + "learning_rate": 0.00018387532577252043, + "loss": 2.8793482780456543, + "step": 7647, + "token_acc": 0.3144535516914936 + }, + { + "epoch": 4.483142773380241, + "grad_norm": 0.28140159307069845, + "learning_rate": 0.00018387004792469572, + "loss": 2.79286527633667, + "step": 7648, + "token_acc": 0.3260709474629547 + }, + { + "epoch": 4.48372911169745, + "grad_norm": 0.3250694021714327, + "learning_rate": 0.00018386476928902492, + "loss": 2.8382630348205566, + "step": 7649, + "token_acc": 0.3209516376456657 + }, + { + "epoch": 4.484315450014659, + "grad_norm": 0.27516948356377313, + "learning_rate": 0.00018385948986555763, + "loss": 2.820890426635742, + "step": 7650, + "token_acc": 0.3231284495801548 + }, + { + "epoch": 4.484901788331867, + "grad_norm": 0.31502497244087235, + "learning_rate": 0.00018385420965434342, + "loss": 2.835212230682373, + "step": 7651, + "token_acc": 0.3200458731747557 + }, + { + "epoch": 4.485488126649076, + "grad_norm": 0.24235914572527933, + "learning_rate": 0.0001838489286554319, + "loss": 2.842158079147339, + "step": 7652, + "token_acc": 0.3187563894096249 + }, + { + "epoch": 4.486074464966285, + "grad_norm": 0.2942956750675823, + "learning_rate": 0.00018384364686887267, + "loss": 2.856235980987549, + "step": 7653, + "token_acc": 0.31860349335189225 + }, + { + "epoch": 4.4866608032834945, + "grad_norm": 0.2289267153608233, + "learning_rate": 0.0001838383642947154, + "loss": 2.812288761138916, + "step": 7654, + "token_acc": 0.3251164079091425 + }, + { + "epoch": 4.487247141600704, + "grad_norm": 0.29317433239100943, + "learning_rate": 0.00018383308093300964, + "loss": 2.8690412044525146, + "step": 7655, + "token_acc": 0.3162895083292732 + }, + { + "epoch": 4.487833479917913, + "grad_norm": 0.2465923727972142, + "learning_rate": 0.00018382779678380507, + "loss": 2.8019704818725586, + "step": 7656, + "token_acc": 0.32623603228688036 + }, + { + "epoch": 4.488419818235122, + "grad_norm": 0.24356240021414, + "learning_rate": 0.00018382251184715132, + "loss": 2.811600923538208, + "step": 7657, + "token_acc": 0.3241353404450265 + }, + { + "epoch": 4.489006156552331, + "grad_norm": 0.24048811537882683, + "learning_rate": 0.00018381722612309797, + "loss": 2.8528389930725098, + "step": 7658, + "token_acc": 0.317128401275659 + }, + { + "epoch": 4.48959249486954, + "grad_norm": 0.24933375790915877, + "learning_rate": 0.00018381193961169477, + "loss": 2.8360793590545654, + "step": 7659, + "token_acc": 0.31974614499722104 + }, + { + "epoch": 4.490178833186749, + "grad_norm": 0.2555165924536825, + "learning_rate": 0.0001838066523129913, + "loss": 2.8652899265289307, + "step": 7660, + "token_acc": 0.3151818532054686 + }, + { + "epoch": 4.490765171503957, + "grad_norm": 0.252482794351262, + "learning_rate": 0.00018380136422703732, + "loss": 2.8721370697021484, + "step": 7661, + "token_acc": 0.3140247078185871 + }, + { + "epoch": 4.4913515098211665, + "grad_norm": 0.25200924397046026, + "learning_rate": 0.0001837960753538824, + "loss": 2.864933967590332, + "step": 7662, + "token_acc": 0.31629532778689806 + }, + { + "epoch": 4.491937848138376, + "grad_norm": 0.2658190953437351, + "learning_rate": 0.00018379078569357628, + "loss": 2.840512275695801, + "step": 7663, + "token_acc": 0.32234886157544557 + }, + { + "epoch": 4.492524186455585, + "grad_norm": 0.24935345999547168, + "learning_rate": 0.00018378549524616865, + "loss": 2.838761329650879, + "step": 7664, + "token_acc": 0.3208653955749468 + }, + { + "epoch": 4.493110524772794, + "grad_norm": 0.275530838764388, + "learning_rate": 0.0001837802040117092, + "loss": 2.793308734893799, + "step": 7665, + "token_acc": 0.32805589684810604 + }, + { + "epoch": 4.493696863090003, + "grad_norm": 0.25050665467110955, + "learning_rate": 0.00018377491199024758, + "loss": 2.8488712310791016, + "step": 7666, + "token_acc": 0.31815756111656335 + }, + { + "epoch": 4.494283201407212, + "grad_norm": 0.26769348010086874, + "learning_rate": 0.0001837696191818336, + "loss": 2.830003261566162, + "step": 7667, + "token_acc": 0.32257463616590887 + }, + { + "epoch": 4.494869539724421, + "grad_norm": 0.2926102681536656, + "learning_rate": 0.00018376432558651692, + "loss": 2.804551124572754, + "step": 7668, + "token_acc": 0.325306432935038 + }, + { + "epoch": 4.49545587804163, + "grad_norm": 0.25951568773353506, + "learning_rate": 0.00018375903120434727, + "loss": 2.7872252464294434, + "step": 7669, + "token_acc": 0.3274770784190246 + }, + { + "epoch": 4.496042216358839, + "grad_norm": 0.3362641735425992, + "learning_rate": 0.00018375373603537435, + "loss": 2.848722457885742, + "step": 7670, + "token_acc": 0.3180244797977276 + }, + { + "epoch": 4.4966285546760485, + "grad_norm": 0.2321194769328074, + "learning_rate": 0.000183748440079648, + "loss": 2.8219823837280273, + "step": 7671, + "token_acc": 0.3237619453218662 + }, + { + "epoch": 4.497214892993257, + "grad_norm": 0.33269834293338074, + "learning_rate": 0.00018374314333721787, + "loss": 2.801387310028076, + "step": 7672, + "token_acc": 0.3267991074195748 + }, + { + "epoch": 4.497801231310466, + "grad_norm": 0.26864861427734366, + "learning_rate": 0.00018373784580813377, + "loss": 2.8773651123046875, + "step": 7673, + "token_acc": 0.31324278218280566 + }, + { + "epoch": 4.498387569627675, + "grad_norm": 0.30152021613191005, + "learning_rate": 0.00018373254749244543, + "loss": 2.8401050567626953, + "step": 7674, + "token_acc": 0.3213943361985876 + }, + { + "epoch": 4.498973907944884, + "grad_norm": 0.31484038328458597, + "learning_rate": 0.00018372724839020265, + "loss": 2.840540885925293, + "step": 7675, + "token_acc": 0.3195842058891965 + }, + { + "epoch": 4.499560246262093, + "grad_norm": 0.2788085496247335, + "learning_rate": 0.00018372194850145522, + "loss": 2.8317971229553223, + "step": 7676, + "token_acc": 0.31956652147831305 + }, + { + "epoch": 4.500146584579302, + "grad_norm": 0.331859588550261, + "learning_rate": 0.00018371664782625287, + "loss": 2.7961292266845703, + "step": 7677, + "token_acc": 0.3262073908221557 + }, + { + "epoch": 4.500732922896511, + "grad_norm": 0.24874643438082666, + "learning_rate": 0.0001837113463646454, + "loss": 2.851634979248047, + "step": 7678, + "token_acc": 0.3199990560422474 + }, + { + "epoch": 4.5013192612137205, + "grad_norm": 0.27834895067658855, + "learning_rate": 0.0001837060441166827, + "loss": 2.8316891193389893, + "step": 7679, + "token_acc": 0.32185122790863147 + }, + { + "epoch": 4.50190559953093, + "grad_norm": 0.2689830235687572, + "learning_rate": 0.00018370074108241445, + "loss": 2.8524489402770996, + "step": 7680, + "token_acc": 0.3187062707684563 + }, + { + "epoch": 4.502491937848139, + "grad_norm": 0.27836591029607943, + "learning_rate": 0.00018369543726189056, + "loss": 2.8248372077941895, + "step": 7681, + "token_acc": 0.32205663189269745 + }, + { + "epoch": 4.503078276165347, + "grad_norm": 0.25176688596720914, + "learning_rate": 0.0001836901326551608, + "loss": 2.853407382965088, + "step": 7682, + "token_acc": 0.3180991711685238 + }, + { + "epoch": 4.503664614482556, + "grad_norm": 0.2734199076634955, + "learning_rate": 0.00018368482726227505, + "loss": 2.7773144245147705, + "step": 7683, + "token_acc": 0.3311217482899772 + }, + { + "epoch": 4.504250952799765, + "grad_norm": 0.25878350547295664, + "learning_rate": 0.0001836795210832831, + "loss": 2.787787675857544, + "step": 7684, + "token_acc": 0.32750612455073624 + }, + { + "epoch": 4.504837291116974, + "grad_norm": 0.26514168643647407, + "learning_rate": 0.00018367421411823477, + "loss": 2.8163645267486572, + "step": 7685, + "token_acc": 0.3240872691864917 + }, + { + "epoch": 4.505423629434183, + "grad_norm": 0.25164542393815403, + "learning_rate": 0.00018366890636717996, + "loss": 2.8686861991882324, + "step": 7686, + "token_acc": 0.3161483062149907 + }, + { + "epoch": 4.5060099677513925, + "grad_norm": 0.2526830508818113, + "learning_rate": 0.00018366359783016857, + "loss": 2.803330898284912, + "step": 7687, + "token_acc": 0.3258927285911367 + }, + { + "epoch": 4.506596306068602, + "grad_norm": 0.2351431618171929, + "learning_rate": 0.00018365828850725038, + "loss": 2.8135061264038086, + "step": 7688, + "token_acc": 0.32241193457807116 + }, + { + "epoch": 4.507182644385811, + "grad_norm": 0.24847770104026204, + "learning_rate": 0.0001836529783984753, + "loss": 2.8131935596466064, + "step": 7689, + "token_acc": 0.3239288675942136 + }, + { + "epoch": 4.50776898270302, + "grad_norm": 0.2425791739890838, + "learning_rate": 0.00018364766750389322, + "loss": 2.8346095085144043, + "step": 7690, + "token_acc": 0.32081788605583716 + }, + { + "epoch": 4.508355321020229, + "grad_norm": 0.2381367978570313, + "learning_rate": 0.00018364235582355403, + "loss": 2.7978782653808594, + "step": 7691, + "token_acc": 0.32510165705009725 + }, + { + "epoch": 4.508941659337438, + "grad_norm": 0.26345114827975474, + "learning_rate": 0.0001836370433575076, + "loss": 2.827767848968506, + "step": 7692, + "token_acc": 0.3228906957574771 + }, + { + "epoch": 4.509527997654647, + "grad_norm": 0.25325000226447536, + "learning_rate": 0.00018363173010580385, + "loss": 2.8251793384552, + "step": 7693, + "token_acc": 0.32250608655500984 + }, + { + "epoch": 4.510114335971855, + "grad_norm": 0.2363759963825418, + "learning_rate": 0.00018362641606849272, + "loss": 2.8705995082855225, + "step": 7694, + "token_acc": 0.3160190345715146 + }, + { + "epoch": 4.5107006742890645, + "grad_norm": 0.2510667041304358, + "learning_rate": 0.00018362110124562405, + "loss": 2.860729217529297, + "step": 7695, + "token_acc": 0.31565470167141313 + }, + { + "epoch": 4.511287012606274, + "grad_norm": 0.2469260663570348, + "learning_rate": 0.00018361578563724784, + "loss": 2.821065902709961, + "step": 7696, + "token_acc": 0.32349133977936745 + }, + { + "epoch": 4.511873350923483, + "grad_norm": 0.26322320319626746, + "learning_rate": 0.000183610469243414, + "loss": 2.7909674644470215, + "step": 7697, + "token_acc": 0.3268820714236826 + }, + { + "epoch": 4.512459689240692, + "grad_norm": 0.25041222761402426, + "learning_rate": 0.00018360515206417247, + "loss": 2.845888614654541, + "step": 7698, + "token_acc": 0.3183089665704809 + }, + { + "epoch": 4.513046027557901, + "grad_norm": 0.2189679603488117, + "learning_rate": 0.00018359983409957318, + "loss": 2.8282458782196045, + "step": 7699, + "token_acc": 0.32251330786407456 + }, + { + "epoch": 4.51363236587511, + "grad_norm": 0.2427598487040979, + "learning_rate": 0.00018359451534966613, + "loss": 2.8354454040527344, + "step": 7700, + "token_acc": 0.3202570102519683 + }, + { + "epoch": 4.514218704192319, + "grad_norm": 0.22610551529621634, + "learning_rate": 0.00018358919581450123, + "loss": 2.87906551361084, + "step": 7701, + "token_acc": 0.31367836167821395 + }, + { + "epoch": 4.514805042509528, + "grad_norm": 0.24671542054712703, + "learning_rate": 0.00018358387549412844, + "loss": 2.8574416637420654, + "step": 7702, + "token_acc": 0.3174234810415715 + }, + { + "epoch": 4.515391380826737, + "grad_norm": 0.2739724118830067, + "learning_rate": 0.00018357855438859782, + "loss": 2.8229784965515137, + "step": 7703, + "token_acc": 0.32224383175075316 + }, + { + "epoch": 4.515977719143946, + "grad_norm": 0.3096248697251086, + "learning_rate": 0.00018357323249795933, + "loss": 2.8115198612213135, + "step": 7704, + "token_acc": 0.32467439302396023 + }, + { + "epoch": 4.516564057461155, + "grad_norm": 0.3448268732013189, + "learning_rate": 0.0001835679098222629, + "loss": 2.8472132682800293, + "step": 7705, + "token_acc": 0.3194717052843045 + }, + { + "epoch": 4.517150395778364, + "grad_norm": 0.2784068835044052, + "learning_rate": 0.00018356258636155855, + "loss": 2.848871946334839, + "step": 7706, + "token_acc": 0.3195979794932086 + }, + { + "epoch": 4.517736734095573, + "grad_norm": 0.25972195249527863, + "learning_rate": 0.0001835572621158963, + "loss": 2.8457953929901123, + "step": 7707, + "token_acc": 0.3176148566353388 + }, + { + "epoch": 4.518323072412782, + "grad_norm": 0.36055509196714314, + "learning_rate": 0.0001835519370853262, + "loss": 2.824819564819336, + "step": 7708, + "token_acc": 0.3227502756495434 + }, + { + "epoch": 4.518909410729991, + "grad_norm": 0.25794553624486094, + "learning_rate": 0.00018354661126989823, + "loss": 2.8323235511779785, + "step": 7709, + "token_acc": 0.321746131320494 + }, + { + "epoch": 4.5194957490472, + "grad_norm": 0.31642442782176083, + "learning_rate": 0.00018354128466966242, + "loss": 2.84260892868042, + "step": 7710, + "token_acc": 0.31852300148992313 + }, + { + "epoch": 4.520082087364409, + "grad_norm": 0.2785551984835124, + "learning_rate": 0.00018353595728466885, + "loss": 2.8041605949401855, + "step": 7711, + "token_acc": 0.32649987978183564 + }, + { + "epoch": 4.5206684256816185, + "grad_norm": 0.3010482688901615, + "learning_rate": 0.00018353062911496745, + "loss": 2.843400478363037, + "step": 7712, + "token_acc": 0.3191584350381648 + }, + { + "epoch": 4.521254763998828, + "grad_norm": 0.3251697655403242, + "learning_rate": 0.0001835253001606084, + "loss": 2.8313210010528564, + "step": 7713, + "token_acc": 0.3208970412394739 + }, + { + "epoch": 4.521841102316037, + "grad_norm": 0.2538737971415092, + "learning_rate": 0.00018351997042164171, + "loss": 2.812047243118286, + "step": 7714, + "token_acc": 0.3255329524132594 + }, + { + "epoch": 4.522427440633246, + "grad_norm": 0.3606575559231632, + "learning_rate": 0.00018351463989811742, + "loss": 2.8564791679382324, + "step": 7715, + "token_acc": 0.3180157916006093 + }, + { + "epoch": 4.523013778950454, + "grad_norm": 0.24491645535846915, + "learning_rate": 0.00018350930859008563, + "loss": 2.848299026489258, + "step": 7716, + "token_acc": 0.3195786447335857 + }, + { + "epoch": 4.523600117267663, + "grad_norm": 0.3157460233640792, + "learning_rate": 0.00018350397649759644, + "loss": 2.8247528076171875, + "step": 7717, + "token_acc": 0.32257156738072545 + }, + { + "epoch": 4.524186455584872, + "grad_norm": 0.2294509113681746, + "learning_rate": 0.0001834986436206999, + "loss": 2.8300352096557617, + "step": 7718, + "token_acc": 0.3203935667950408 + }, + { + "epoch": 4.524772793902081, + "grad_norm": 0.2941809156530237, + "learning_rate": 0.00018349330995944612, + "loss": 2.823375940322876, + "step": 7719, + "token_acc": 0.3231691992797274 + }, + { + "epoch": 4.5253591322192905, + "grad_norm": 0.2599852444692932, + "learning_rate": 0.0001834879755138852, + "loss": 2.845916271209717, + "step": 7720, + "token_acc": 0.31844565877352765 + }, + { + "epoch": 4.5259454705365, + "grad_norm": 0.2623526449500155, + "learning_rate": 0.00018348264028406725, + "loss": 2.8408684730529785, + "step": 7721, + "token_acc": 0.32048369463776233 + }, + { + "epoch": 4.526531808853709, + "grad_norm": 0.2655947954033707, + "learning_rate": 0.00018347730427004238, + "loss": 2.847322702407837, + "step": 7722, + "token_acc": 0.3197789151540735 + }, + { + "epoch": 4.527118147170918, + "grad_norm": 0.2560615980751466, + "learning_rate": 0.00018347196747186075, + "loss": 2.8806638717651367, + "step": 7723, + "token_acc": 0.3148151623948583 + }, + { + "epoch": 4.527704485488127, + "grad_norm": 0.2760801221416481, + "learning_rate": 0.0001834666298895724, + "loss": 2.8188042640686035, + "step": 7724, + "token_acc": 0.32337401880445094 + }, + { + "epoch": 4.528290823805335, + "grad_norm": 0.2568655371583902, + "learning_rate": 0.00018346129152322762, + "loss": 2.818319320678711, + "step": 7725, + "token_acc": 0.3233568433953386 + }, + { + "epoch": 4.528877162122544, + "grad_norm": 0.25719504129035675, + "learning_rate": 0.00018345595237287643, + "loss": 2.8262627124786377, + "step": 7726, + "token_acc": 0.3199121214513116 + }, + { + "epoch": 4.529463500439753, + "grad_norm": 0.2508524629357222, + "learning_rate": 0.00018345061243856903, + "loss": 2.8120267391204834, + "step": 7727, + "token_acc": 0.32446483497136464 + }, + { + "epoch": 4.5300498387569625, + "grad_norm": 0.23786750855180663, + "learning_rate": 0.0001834452717203556, + "loss": 2.8563313484191895, + "step": 7728, + "token_acc": 0.318266097169212 + }, + { + "epoch": 4.530636177074172, + "grad_norm": 0.24631307969280639, + "learning_rate": 0.00018343993021828622, + "loss": 2.8671798706054688, + "step": 7729, + "token_acc": 0.31686509529494944 + }, + { + "epoch": 4.531222515391381, + "grad_norm": 0.23813098711120922, + "learning_rate": 0.0001834345879324112, + "loss": 2.817723274230957, + "step": 7730, + "token_acc": 0.32439479057027276 + }, + { + "epoch": 4.53180885370859, + "grad_norm": 0.268699102148224, + "learning_rate": 0.00018342924486278061, + "loss": 2.805649757385254, + "step": 7731, + "token_acc": 0.32546697038724376 + }, + { + "epoch": 4.532395192025799, + "grad_norm": 0.27066613649717747, + "learning_rate": 0.00018342390100944473, + "loss": 2.8722825050354004, + "step": 7732, + "token_acc": 0.3155248404873159 + }, + { + "epoch": 4.532981530343008, + "grad_norm": 0.2705891384284354, + "learning_rate": 0.0001834185563724537, + "loss": 2.828000068664551, + "step": 7733, + "token_acc": 0.322695175248412 + }, + { + "epoch": 4.533567868660217, + "grad_norm": 0.24087655041270514, + "learning_rate": 0.00018341321095185773, + "loss": 2.844444990158081, + "step": 7734, + "token_acc": 0.31993119086715144 + }, + { + "epoch": 4.534154206977426, + "grad_norm": 0.2503007659872924, + "learning_rate": 0.00018340786474770705, + "loss": 2.8256092071533203, + "step": 7735, + "token_acc": 0.32170008482762863 + }, + { + "epoch": 4.534740545294635, + "grad_norm": 0.24199362316045547, + "learning_rate": 0.00018340251776005186, + "loss": 2.8503260612487793, + "step": 7736, + "token_acc": 0.31915609225834923 + }, + { + "epoch": 4.535326883611844, + "grad_norm": 0.2398653466851469, + "learning_rate": 0.00018339716998894243, + "loss": 2.8577136993408203, + "step": 7737, + "token_acc": 0.3166500891096957 + }, + { + "epoch": 4.535913221929053, + "grad_norm": 0.24424923574770097, + "learning_rate": 0.00018339182143442895, + "loss": 2.8343915939331055, + "step": 7738, + "token_acc": 0.31977845030604696 + }, + { + "epoch": 4.536499560246262, + "grad_norm": 0.2428426928409791, + "learning_rate": 0.00018338647209656167, + "loss": 2.8432366847991943, + "step": 7739, + "token_acc": 0.32017942494199164 + }, + { + "epoch": 4.537085898563471, + "grad_norm": 0.2575030018794465, + "learning_rate": 0.00018338112197539085, + "loss": 2.796196937561035, + "step": 7740, + "token_acc": 0.3271116900032563 + }, + { + "epoch": 4.53767223688068, + "grad_norm": 0.2245588683961331, + "learning_rate": 0.00018337577107096676, + "loss": 2.8128504753112793, + "step": 7741, + "token_acc": 0.3253845651179507 + }, + { + "epoch": 4.538258575197889, + "grad_norm": 0.26632691472626097, + "learning_rate": 0.00018337041938333966, + "loss": 2.868323802947998, + "step": 7742, + "token_acc": 0.3162025963168528 + }, + { + "epoch": 4.538844913515098, + "grad_norm": 0.22042362667050988, + "learning_rate": 0.0001833650669125598, + "loss": 2.804248094558716, + "step": 7743, + "token_acc": 0.3243084074722617 + }, + { + "epoch": 4.5394312518323074, + "grad_norm": 0.27116714508763695, + "learning_rate": 0.00018335971365867745, + "loss": 2.870344638824463, + "step": 7744, + "token_acc": 0.3169775457448426 + }, + { + "epoch": 4.540017590149517, + "grad_norm": 0.2905417931458559, + "learning_rate": 0.00018335435962174297, + "loss": 2.859323024749756, + "step": 7745, + "token_acc": 0.3160662545370864 + }, + { + "epoch": 4.540603928466726, + "grad_norm": 0.32842308906063283, + "learning_rate": 0.00018334900480180654, + "loss": 2.852921485900879, + "step": 7746, + "token_acc": 0.31870050343756345 + }, + { + "epoch": 4.541190266783934, + "grad_norm": 0.3295010187610696, + "learning_rate": 0.00018334364919891856, + "loss": 2.863541841506958, + "step": 7747, + "token_acc": 0.316482854959829 + }, + { + "epoch": 4.541776605101143, + "grad_norm": 0.2990242865085673, + "learning_rate": 0.00018333829281312933, + "loss": 2.8801183700561523, + "step": 7748, + "token_acc": 0.314817130523191 + }, + { + "epoch": 4.542362943418352, + "grad_norm": 0.26277815322585135, + "learning_rate": 0.0001833329356444891, + "loss": 2.8537063598632812, + "step": 7749, + "token_acc": 0.31776040719869114 + }, + { + "epoch": 4.542949281735561, + "grad_norm": 0.2995046065951158, + "learning_rate": 0.00018332757769304824, + "loss": 2.8264999389648438, + "step": 7750, + "token_acc": 0.32131325236826636 + }, + { + "epoch": 4.54353562005277, + "grad_norm": 0.26877552571802543, + "learning_rate": 0.00018332221895885707, + "loss": 2.826925277709961, + "step": 7751, + "token_acc": 0.3233190381571242 + }, + { + "epoch": 4.5441219583699795, + "grad_norm": 0.23985361545519024, + "learning_rate": 0.00018331685944196594, + "loss": 2.84481143951416, + "step": 7752, + "token_acc": 0.3194187333169015 + }, + { + "epoch": 4.544708296687189, + "grad_norm": 0.26971939580553167, + "learning_rate": 0.0001833114991424252, + "loss": 2.833148956298828, + "step": 7753, + "token_acc": 0.3218081483366362 + }, + { + "epoch": 4.545294635004398, + "grad_norm": 0.24868414843816886, + "learning_rate": 0.00018330613806028515, + "loss": 2.8257768154144287, + "step": 7754, + "token_acc": 0.3216031205976738 + }, + { + "epoch": 4.545880973321607, + "grad_norm": 0.25962824104643906, + "learning_rate": 0.00018330077619559622, + "loss": 2.8251991271972656, + "step": 7755, + "token_acc": 0.3235306494968464 + }, + { + "epoch": 4.546467311638816, + "grad_norm": 0.27524757104128766, + "learning_rate": 0.00018329541354840875, + "loss": 2.828958034515381, + "step": 7756, + "token_acc": 0.32073924626884975 + }, + { + "epoch": 4.547053649956025, + "grad_norm": 0.24281601500884478, + "learning_rate": 0.0001832900501187731, + "loss": 2.8128223419189453, + "step": 7757, + "token_acc": 0.3229294589441851 + }, + { + "epoch": 4.547639988273234, + "grad_norm": 0.2454552967395895, + "learning_rate": 0.00018328468590673964, + "loss": 2.8386659622192383, + "step": 7758, + "token_acc": 0.32068397976032526 + }, + { + "epoch": 4.548226326590442, + "grad_norm": 0.2465237837874172, + "learning_rate": 0.0001832793209123588, + "loss": 2.809868812561035, + "step": 7759, + "token_acc": 0.3246127410102848 + }, + { + "epoch": 4.5488126649076515, + "grad_norm": 0.2657239968045349, + "learning_rate": 0.000183273955135681, + "loss": 2.806796073913574, + "step": 7760, + "token_acc": 0.3254946519450248 + }, + { + "epoch": 4.549399003224861, + "grad_norm": 0.24072194250558124, + "learning_rate": 0.00018326858857675655, + "loss": 2.8695144653320312, + "step": 7761, + "token_acc": 0.3147618632131327 + }, + { + "epoch": 4.54998534154207, + "grad_norm": 0.26340862936920906, + "learning_rate": 0.00018326322123563595, + "loss": 2.834646463394165, + "step": 7762, + "token_acc": 0.3201235014441701 + }, + { + "epoch": 4.550571679859279, + "grad_norm": 0.2657876876813026, + "learning_rate": 0.00018325785311236955, + "loss": 2.8300623893737793, + "step": 7763, + "token_acc": 0.3214318322189103 + }, + { + "epoch": 4.551158018176488, + "grad_norm": 0.2673887957125959, + "learning_rate": 0.00018325248420700784, + "loss": 2.8325138092041016, + "step": 7764, + "token_acc": 0.3220799824789601 + }, + { + "epoch": 4.551744356493697, + "grad_norm": 0.29585785907176415, + "learning_rate": 0.00018324711451960123, + "loss": 2.839362144470215, + "step": 7765, + "token_acc": 0.3210291555893481 + }, + { + "epoch": 4.552330694810906, + "grad_norm": 0.26568993539180763, + "learning_rate": 0.00018324174405020017, + "loss": 2.861663341522217, + "step": 7766, + "token_acc": 0.31811166543109887 + }, + { + "epoch": 4.552917033128115, + "grad_norm": 0.2411867977446255, + "learning_rate": 0.00018323637279885505, + "loss": 2.826723098754883, + "step": 7767, + "token_acc": 0.32242093109889325 + }, + { + "epoch": 4.5535033714453235, + "grad_norm": 0.25972251553506026, + "learning_rate": 0.0001832310007656164, + "loss": 2.822232246398926, + "step": 7768, + "token_acc": 0.32115589465343625 + }, + { + "epoch": 4.554089709762533, + "grad_norm": 0.29872736305051434, + "learning_rate": 0.0001832256279505346, + "loss": 2.8394222259521484, + "step": 7769, + "token_acc": 0.32147943801015216 + }, + { + "epoch": 4.554676048079742, + "grad_norm": 0.3222791630195521, + "learning_rate": 0.00018322025435366026, + "loss": 2.888413906097412, + "step": 7770, + "token_acc": 0.31269483690344424 + }, + { + "epoch": 4.555262386396951, + "grad_norm": 0.23816429935576924, + "learning_rate": 0.00018321487997504372, + "loss": 2.8502755165100098, + "step": 7771, + "token_acc": 0.3173263370978052 + }, + { + "epoch": 4.55584872471416, + "grad_norm": 0.2830132020626502, + "learning_rate": 0.00018320950481473552, + "loss": 2.855494499206543, + "step": 7772, + "token_acc": 0.3171771984594812 + }, + { + "epoch": 4.556435063031369, + "grad_norm": 0.3152414331914778, + "learning_rate": 0.00018320412887278616, + "loss": 2.852656841278076, + "step": 7773, + "token_acc": 0.31800564708868256 + }, + { + "epoch": 4.557021401348578, + "grad_norm": 0.28174293302466674, + "learning_rate": 0.0001831987521492461, + "loss": 2.819453001022339, + "step": 7774, + "token_acc": 0.3228402010780189 + }, + { + "epoch": 4.557607739665787, + "grad_norm": 0.23975331590030624, + "learning_rate": 0.0001831933746441659, + "loss": 2.7996208667755127, + "step": 7775, + "token_acc": 0.32663832589104913 + }, + { + "epoch": 4.558194077982996, + "grad_norm": 0.27299421877467245, + "learning_rate": 0.00018318799635759603, + "loss": 2.8425116539001465, + "step": 7776, + "token_acc": 0.3198681153750968 + }, + { + "epoch": 4.5587804163002055, + "grad_norm": 0.23405750301381278, + "learning_rate": 0.00018318261728958706, + "loss": 2.8368468284606934, + "step": 7777, + "token_acc": 0.3206944868782857 + }, + { + "epoch": 4.559366754617415, + "grad_norm": 0.23190484163000724, + "learning_rate": 0.0001831772374401895, + "loss": 2.8276829719543457, + "step": 7778, + "token_acc": 0.322621805138251 + }, + { + "epoch": 4.559953092934624, + "grad_norm": 0.2435189152224899, + "learning_rate": 0.00018317185680945383, + "loss": 2.8515148162841797, + "step": 7779, + "token_acc": 0.31801157734681357 + }, + { + "epoch": 4.560539431251832, + "grad_norm": 0.2218372995137078, + "learning_rate": 0.00018316647539743066, + "loss": 2.8129982948303223, + "step": 7780, + "token_acc": 0.32315392008066507 + }, + { + "epoch": 4.561125769569041, + "grad_norm": 0.24771568155657886, + "learning_rate": 0.00018316109320417053, + "loss": 2.863996744155884, + "step": 7781, + "token_acc": 0.31552516411378556 + }, + { + "epoch": 4.56171210788625, + "grad_norm": 0.22352530859501157, + "learning_rate": 0.00018315571022972397, + "loss": 2.8565673828125, + "step": 7782, + "token_acc": 0.3180355092399173 + }, + { + "epoch": 4.562298446203459, + "grad_norm": 0.265209352841456, + "learning_rate": 0.00018315032647414162, + "loss": 2.8475418090820312, + "step": 7783, + "token_acc": 0.31968437810740213 + }, + { + "epoch": 4.562884784520668, + "grad_norm": 0.2858027314490506, + "learning_rate": 0.00018314494193747395, + "loss": 2.8420217037200928, + "step": 7784, + "token_acc": 0.32071090335114133 + }, + { + "epoch": 4.5634711228378775, + "grad_norm": 0.22678790269619628, + "learning_rate": 0.0001831395566197716, + "loss": 2.8355610370635986, + "step": 7785, + "token_acc": 0.32029979588030233 + }, + { + "epoch": 4.564057461155087, + "grad_norm": 0.28791520626009, + "learning_rate": 0.00018313417052108513, + "loss": 2.849102020263672, + "step": 7786, + "token_acc": 0.31815035555129245 + }, + { + "epoch": 4.564643799472296, + "grad_norm": 0.274288373159216, + "learning_rate": 0.0001831287836414652, + "loss": 2.8657054901123047, + "step": 7787, + "token_acc": 0.3156505886974125 + }, + { + "epoch": 4.565230137789505, + "grad_norm": 0.23586249416258506, + "learning_rate": 0.0001831233959809623, + "loss": 2.8282546997070312, + "step": 7788, + "token_acc": 0.322474327967383 + }, + { + "epoch": 4.565816476106714, + "grad_norm": 0.32314642666883303, + "learning_rate": 0.00018311800753962717, + "loss": 2.8721020221710205, + "step": 7789, + "token_acc": 0.31546210369235006 + }, + { + "epoch": 4.566402814423922, + "grad_norm": 0.29037838166563834, + "learning_rate": 0.00018311261831751032, + "loss": 2.862663745880127, + "step": 7790, + "token_acc": 0.31788014633894984 + }, + { + "epoch": 4.566989152741131, + "grad_norm": 0.23265434731409984, + "learning_rate": 0.00018310722831466243, + "loss": 2.8437297344207764, + "step": 7791, + "token_acc": 0.31765811417413337 + }, + { + "epoch": 4.56757549105834, + "grad_norm": 0.2800196567931235, + "learning_rate": 0.00018310183753113415, + "loss": 2.8966193199157715, + "step": 7792, + "token_acc": 0.3117929985702088 + }, + { + "epoch": 4.5681618293755495, + "grad_norm": 0.23139904469358016, + "learning_rate": 0.00018309644596697605, + "loss": 2.8361258506774902, + "step": 7793, + "token_acc": 0.3207524775977687 + }, + { + "epoch": 4.568748167692759, + "grad_norm": 0.22594515254513886, + "learning_rate": 0.0001830910536222388, + "loss": 2.8370652198791504, + "step": 7794, + "token_acc": 0.31921325786846555 + }, + { + "epoch": 4.569334506009968, + "grad_norm": 0.24472308179898072, + "learning_rate": 0.0001830856604969731, + "loss": 2.8430404663085938, + "step": 7795, + "token_acc": 0.320558266045107 + }, + { + "epoch": 4.569920844327177, + "grad_norm": 0.27807011160822387, + "learning_rate": 0.00018308026659122958, + "loss": 2.8427541255950928, + "step": 7796, + "token_acc": 0.31951301449937514 + }, + { + "epoch": 4.570507182644386, + "grad_norm": 0.256299708169299, + "learning_rate": 0.00018307487190505887, + "loss": 2.8402578830718994, + "step": 7797, + "token_acc": 0.3198915327774388 + }, + { + "epoch": 4.571093520961595, + "grad_norm": 0.23425294686614173, + "learning_rate": 0.00018306947643851172, + "loss": 2.8055636882781982, + "step": 7798, + "token_acc": 0.32476934771742755 + }, + { + "epoch": 4.571679859278804, + "grad_norm": 0.27587340913572717, + "learning_rate": 0.00018306408019163876, + "loss": 2.829970598220825, + "step": 7799, + "token_acc": 0.3211069555588057 + }, + { + "epoch": 4.572266197596013, + "grad_norm": 0.25970224807584597, + "learning_rate": 0.0001830586831644907, + "loss": 2.854948043823242, + "step": 7800, + "token_acc": 0.3184034189286032 + }, + { + "epoch": 4.572852535913222, + "grad_norm": 0.23958864770343005, + "learning_rate": 0.00018305328535711822, + "loss": 2.846958637237549, + "step": 7801, + "token_acc": 0.3191086677937993 + }, + { + "epoch": 4.573438874230431, + "grad_norm": 0.2857315569416896, + "learning_rate": 0.00018304788676957206, + "loss": 2.862722396850586, + "step": 7802, + "token_acc": 0.3171784826656557 + }, + { + "epoch": 4.57402521254764, + "grad_norm": 0.23342086728346284, + "learning_rate": 0.0001830424874019029, + "loss": 2.8656482696533203, + "step": 7803, + "token_acc": 0.3162522688064393 + }, + { + "epoch": 4.574611550864849, + "grad_norm": 0.25177894095160985, + "learning_rate": 0.00018303708725416149, + "loss": 2.8239309787750244, + "step": 7804, + "token_acc": 0.32340771561155673 + }, + { + "epoch": 4.575197889182058, + "grad_norm": 0.27419890713268885, + "learning_rate": 0.00018303168632639852, + "loss": 2.8536462783813477, + "step": 7805, + "token_acc": 0.3174355331058873 + }, + { + "epoch": 4.575784227499267, + "grad_norm": 0.2721272329225644, + "learning_rate": 0.00018302628461866477, + "loss": 2.856517791748047, + "step": 7806, + "token_acc": 0.31823291010986926 + }, + { + "epoch": 4.576370565816476, + "grad_norm": 0.27017719850495087, + "learning_rate": 0.00018302088213101092, + "loss": 2.874941349029541, + "step": 7807, + "token_acc": 0.3150082049068856 + }, + { + "epoch": 4.576956904133685, + "grad_norm": 0.27348075464918403, + "learning_rate": 0.00018301547886348778, + "loss": 2.8055880069732666, + "step": 7808, + "token_acc": 0.32659511928361473 + }, + { + "epoch": 4.577543242450894, + "grad_norm": 0.3572565493349303, + "learning_rate": 0.00018301007481614606, + "loss": 2.8623769283294678, + "step": 7809, + "token_acc": 0.31739804473692534 + }, + { + "epoch": 4.5781295807681035, + "grad_norm": 0.30548058456911925, + "learning_rate": 0.00018300466998903657, + "loss": 2.8241465091705322, + "step": 7810, + "token_acc": 0.3217803614547543 + }, + { + "epoch": 4.578715919085313, + "grad_norm": 0.24111871402030108, + "learning_rate": 0.00018299926438221004, + "loss": 2.8353607654571533, + "step": 7811, + "token_acc": 0.32163103144036054 + }, + { + "epoch": 4.579302257402521, + "grad_norm": 0.37468889057899146, + "learning_rate": 0.00018299385799571728, + "loss": 2.821384906768799, + "step": 7812, + "token_acc": 0.3228421136732069 + }, + { + "epoch": 4.57988859571973, + "grad_norm": 0.27531522437819533, + "learning_rate": 0.00018298845082960905, + "loss": 2.8252944946289062, + "step": 7813, + "token_acc": 0.32101906020003773 + }, + { + "epoch": 4.580474934036939, + "grad_norm": 0.2463421751008095, + "learning_rate": 0.00018298304288393615, + "loss": 2.812098503112793, + "step": 7814, + "token_acc": 0.3242411284603719 + }, + { + "epoch": 4.581061272354148, + "grad_norm": 0.24738873897900018, + "learning_rate": 0.00018297763415874938, + "loss": 2.8685102462768555, + "step": 7815, + "token_acc": 0.3167015599706038 + }, + { + "epoch": 4.581647610671357, + "grad_norm": 0.2533642443646533, + "learning_rate": 0.00018297222465409955, + "loss": 2.8492283821105957, + "step": 7816, + "token_acc": 0.31875176892457174 + }, + { + "epoch": 4.582233948988566, + "grad_norm": 0.2630910394796865, + "learning_rate": 0.00018296681437003745, + "loss": 2.8641209602355957, + "step": 7817, + "token_acc": 0.31842938665605924 + }, + { + "epoch": 4.5828202873057755, + "grad_norm": 0.23626625670532087, + "learning_rate": 0.000182961403306614, + "loss": 2.8185791969299316, + "step": 7818, + "token_acc": 0.32204823704518354 + }, + { + "epoch": 4.583406625622985, + "grad_norm": 0.2577162684483696, + "learning_rate": 0.0001829559914638799, + "loss": 2.8485255241394043, + "step": 7819, + "token_acc": 0.31982671350371233 + }, + { + "epoch": 4.583992963940194, + "grad_norm": 0.24053110902988406, + "learning_rate": 0.00018295057884188607, + "loss": 2.8399159908294678, + "step": 7820, + "token_acc": 0.319246256094878 + }, + { + "epoch": 4.584579302257403, + "grad_norm": 0.2513793498290437, + "learning_rate": 0.00018294516544068332, + "loss": 2.83280086517334, + "step": 7821, + "token_acc": 0.32151699474263357 + }, + { + "epoch": 4.585165640574612, + "grad_norm": 0.23046525781071497, + "learning_rate": 0.0001829397512603225, + "loss": 2.8607687950134277, + "step": 7822, + "token_acc": 0.3169031854334047 + }, + { + "epoch": 4.585751978891821, + "grad_norm": 0.24966284611350234, + "learning_rate": 0.0001829343363008545, + "loss": 2.8428847789764404, + "step": 7823, + "token_acc": 0.32100159051071375 + }, + { + "epoch": 4.586338317209029, + "grad_norm": 0.24049942168834643, + "learning_rate": 0.00018292892056233015, + "loss": 2.8300681114196777, + "step": 7824, + "token_acc": 0.3207356734327689 + }, + { + "epoch": 4.586924655526238, + "grad_norm": 0.22481430650226586, + "learning_rate": 0.00018292350404480035, + "loss": 2.870157241821289, + "step": 7825, + "token_acc": 0.3145985618753535 + }, + { + "epoch": 4.5875109938434475, + "grad_norm": 0.23572608428657887, + "learning_rate": 0.00018291808674831595, + "loss": 2.8344593048095703, + "step": 7826, + "token_acc": 0.3206641519030262 + }, + { + "epoch": 4.588097332160657, + "grad_norm": 0.2481824877489451, + "learning_rate": 0.0001829126686729279, + "loss": 2.811589241027832, + "step": 7827, + "token_acc": 0.32411180998898026 + }, + { + "epoch": 4.588683670477866, + "grad_norm": 0.2450593350411226, + "learning_rate": 0.000182907249818687, + "loss": 2.85050106048584, + "step": 7828, + "token_acc": 0.3180259753501829 + }, + { + "epoch": 4.589270008795075, + "grad_norm": 0.24852448877468736, + "learning_rate": 0.00018290183018564426, + "loss": 2.8246614933013916, + "step": 7829, + "token_acc": 0.32319598721562603 + }, + { + "epoch": 4.589856347112284, + "grad_norm": 0.27086950756073297, + "learning_rate": 0.0001828964097738505, + "loss": 2.8780689239501953, + "step": 7830, + "token_acc": 0.31384846838143543 + }, + { + "epoch": 4.590442685429493, + "grad_norm": 0.24772652969747724, + "learning_rate": 0.0001828909885833567, + "loss": 2.8088693618774414, + "step": 7831, + "token_acc": 0.3244980862235561 + }, + { + "epoch": 4.591029023746702, + "grad_norm": 0.2796965972877741, + "learning_rate": 0.00018288556661421375, + "loss": 2.862424612045288, + "step": 7832, + "token_acc": 0.3149672024203946 + }, + { + "epoch": 4.59161536206391, + "grad_norm": 0.3445269304582732, + "learning_rate": 0.0001828801438664726, + "loss": 2.7970192432403564, + "step": 7833, + "token_acc": 0.32732064860009946 + }, + { + "epoch": 4.5922017003811195, + "grad_norm": 0.27887179214695224, + "learning_rate": 0.00018287472034018415, + "loss": 2.8342642784118652, + "step": 7834, + "token_acc": 0.32108564289538766 + }, + { + "epoch": 4.592788038698329, + "grad_norm": 0.2585063846205059, + "learning_rate": 0.0001828692960353994, + "loss": 2.8776583671569824, + "step": 7835, + "token_acc": 0.3147913803386666 + }, + { + "epoch": 4.593374377015538, + "grad_norm": 0.2831366553211178, + "learning_rate": 0.00018286387095216929, + "loss": 2.847933292388916, + "step": 7836, + "token_acc": 0.3188762085993271 + }, + { + "epoch": 4.593960715332747, + "grad_norm": 0.29325624175012105, + "learning_rate": 0.00018285844509054473, + "loss": 2.8458805084228516, + "step": 7837, + "token_acc": 0.319692704970949 + }, + { + "epoch": 4.594547053649956, + "grad_norm": 0.22571006618178352, + "learning_rate": 0.00018285301845057675, + "loss": 2.8271756172180176, + "step": 7838, + "token_acc": 0.32144297845553 + }, + { + "epoch": 4.595133391967165, + "grad_norm": 0.31784175217154903, + "learning_rate": 0.00018284759103231633, + "loss": 2.863452911376953, + "step": 7839, + "token_acc": 0.3160327320356369 + }, + { + "epoch": 4.595719730284374, + "grad_norm": 0.3037661600289063, + "learning_rate": 0.00018284216283581442, + "loss": 2.8247365951538086, + "step": 7840, + "token_acc": 0.32238351441571966 + }, + { + "epoch": 4.596306068601583, + "grad_norm": 0.2586601393515336, + "learning_rate": 0.000182836733861122, + "loss": 2.8592166900634766, + "step": 7841, + "token_acc": 0.3171403675778043 + }, + { + "epoch": 4.596892406918792, + "grad_norm": 0.29988819639009845, + "learning_rate": 0.00018283130410829012, + "loss": 2.8516881465911865, + "step": 7842, + "token_acc": 0.31908946692518825 + }, + { + "epoch": 4.5974787452360015, + "grad_norm": 0.2693751474874164, + "learning_rate": 0.00018282587357736974, + "loss": 2.8492956161499023, + "step": 7843, + "token_acc": 0.31783809356662673 + }, + { + "epoch": 4.598065083553211, + "grad_norm": 0.27202559174314184, + "learning_rate": 0.0001828204422684119, + "loss": 2.8375842571258545, + "step": 7844, + "token_acc": 0.3208790445028977 + }, + { + "epoch": 4.598651421870419, + "grad_norm": 0.2899618506522721, + "learning_rate": 0.0001828150101814676, + "loss": 2.8572168350219727, + "step": 7845, + "token_acc": 0.31727218325615 + }, + { + "epoch": 4.599237760187628, + "grad_norm": 0.24226520770534732, + "learning_rate": 0.00018280957731658788, + "loss": 2.850071907043457, + "step": 7846, + "token_acc": 0.317172711221743 + }, + { + "epoch": 4.599824098504837, + "grad_norm": 0.23559293701167766, + "learning_rate": 0.00018280414367382374, + "loss": 2.8359532356262207, + "step": 7847, + "token_acc": 0.31939518533630457 + }, + { + "epoch": 4.600410436822046, + "grad_norm": 0.24035691124530492, + "learning_rate": 0.00018279870925322632, + "loss": 2.893117904663086, + "step": 7848, + "token_acc": 0.31267529638132274 + }, + { + "epoch": 4.600996775139255, + "grad_norm": 0.2554469630008216, + "learning_rate": 0.00018279327405484652, + "loss": 2.811690330505371, + "step": 7849, + "token_acc": 0.324535231264522 + }, + { + "epoch": 4.601583113456464, + "grad_norm": 0.2396565782233264, + "learning_rate": 0.00018278783807873552, + "loss": 2.848278045654297, + "step": 7850, + "token_acc": 0.3203838251180919 + }, + { + "epoch": 4.6021694517736735, + "grad_norm": 0.25213005700708674, + "learning_rate": 0.00018278240132494432, + "loss": 2.8447041511535645, + "step": 7851, + "token_acc": 0.32032794122931757 + }, + { + "epoch": 4.602755790090883, + "grad_norm": 0.2947351926593888, + "learning_rate": 0.000182776963793524, + "loss": 2.866109848022461, + "step": 7852, + "token_acc": 0.31711083668276285 + }, + { + "epoch": 4.603342128408092, + "grad_norm": 0.28526032288419606, + "learning_rate": 0.0001827715254845257, + "loss": 2.8366386890411377, + "step": 7853, + "token_acc": 0.32065958237809034 + }, + { + "epoch": 4.603928466725301, + "grad_norm": 0.23936794135796188, + "learning_rate": 0.00018276608639800039, + "loss": 2.8710098266601562, + "step": 7854, + "token_acc": 0.31488979653772714 + }, + { + "epoch": 4.604514805042509, + "grad_norm": 0.2805361461361784, + "learning_rate": 0.00018276064653399926, + "loss": 2.8772459030151367, + "step": 7855, + "token_acc": 0.3143370061531657 + }, + { + "epoch": 4.605101143359718, + "grad_norm": 0.24174380212392402, + "learning_rate": 0.00018275520589257336, + "loss": 2.815774440765381, + "step": 7856, + "token_acc": 0.32284009340136643 + }, + { + "epoch": 4.605687481676927, + "grad_norm": 0.2833659999438425, + "learning_rate": 0.00018274976447377384, + "loss": 2.8373234272003174, + "step": 7857, + "token_acc": 0.32079275323695255 + }, + { + "epoch": 4.606273819994136, + "grad_norm": 0.27678413035499294, + "learning_rate": 0.0001827443222776518, + "loss": 2.8721837997436523, + "step": 7858, + "token_acc": 0.3146242712021359 + }, + { + "epoch": 4.6068601583113455, + "grad_norm": 0.2572475958811099, + "learning_rate": 0.00018273887930425828, + "loss": 2.8448867797851562, + "step": 7859, + "token_acc": 0.3174822167673039 + }, + { + "epoch": 4.607446496628555, + "grad_norm": 0.2870587087704763, + "learning_rate": 0.00018273343555364456, + "loss": 2.832009792327881, + "step": 7860, + "token_acc": 0.32387689709390727 + }, + { + "epoch": 4.608032834945764, + "grad_norm": 0.29934819039576477, + "learning_rate": 0.00018272799102586165, + "loss": 2.870086669921875, + "step": 7861, + "token_acc": 0.314192259224115 + }, + { + "epoch": 4.608619173262973, + "grad_norm": 0.26667296532610885, + "learning_rate": 0.00018272254572096076, + "loss": 2.8548755645751953, + "step": 7862, + "token_acc": 0.31737830458986327 + }, + { + "epoch": 4.609205511580182, + "grad_norm": 0.29723707616637146, + "learning_rate": 0.00018271709963899304, + "loss": 2.8359103202819824, + "step": 7863, + "token_acc": 0.3206878099817106 + }, + { + "epoch": 4.609791849897391, + "grad_norm": 0.2648519552224178, + "learning_rate": 0.00018271165278000958, + "loss": 2.8475544452667236, + "step": 7864, + "token_acc": 0.3190191384580127 + }, + { + "epoch": 4.6103781882146, + "grad_norm": 0.2656417297804327, + "learning_rate": 0.00018270620514406166, + "loss": 2.835358142852783, + "step": 7865, + "token_acc": 0.32036495936593334 + }, + { + "epoch": 4.610964526531809, + "grad_norm": 0.2559937316247502, + "learning_rate": 0.00018270075673120035, + "loss": 2.827864646911621, + "step": 7866, + "token_acc": 0.3214763251615773 + }, + { + "epoch": 4.6115508648490176, + "grad_norm": 0.27065655416297224, + "learning_rate": 0.00018269530754147688, + "loss": 2.8784027099609375, + "step": 7867, + "token_acc": 0.3158678361761053 + }, + { + "epoch": 4.612137203166227, + "grad_norm": 0.2734542264336458, + "learning_rate": 0.0001826898575749424, + "loss": 2.8656935691833496, + "step": 7868, + "token_acc": 0.31775990265596316 + }, + { + "epoch": 4.612723541483436, + "grad_norm": 0.29642634184843547, + "learning_rate": 0.0001826844068316482, + "loss": 2.90169620513916, + "step": 7869, + "token_acc": 0.31115029944285966 + }, + { + "epoch": 4.613309879800645, + "grad_norm": 0.24135977214080212, + "learning_rate": 0.00018267895531164538, + "loss": 2.8739449977874756, + "step": 7870, + "token_acc": 0.31650011243064596 + }, + { + "epoch": 4.613896218117854, + "grad_norm": 0.2681046065163028, + "learning_rate": 0.0001826735030149852, + "loss": 2.823215961456299, + "step": 7871, + "token_acc": 0.3232908578933598 + }, + { + "epoch": 4.614482556435063, + "grad_norm": 0.27730839712865574, + "learning_rate": 0.00018266804994171882, + "loss": 2.873626947402954, + "step": 7872, + "token_acc": 0.3131181404665526 + }, + { + "epoch": 4.615068894752272, + "grad_norm": 0.2833721416001539, + "learning_rate": 0.00018266259609189754, + "loss": 2.837857961654663, + "step": 7873, + "token_acc": 0.32025166947943334 + }, + { + "epoch": 4.615655233069481, + "grad_norm": 0.2444579688224471, + "learning_rate": 0.00018265714146557257, + "loss": 2.822655200958252, + "step": 7874, + "token_acc": 0.3246272132342961 + }, + { + "epoch": 4.6162415713866904, + "grad_norm": 0.26811158077423547, + "learning_rate": 0.00018265168606279515, + "loss": 2.8396034240722656, + "step": 7875, + "token_acc": 0.3194193122813502 + }, + { + "epoch": 4.616827909703899, + "grad_norm": 0.26225489752115166, + "learning_rate": 0.00018264622988361647, + "loss": 2.8712821006774902, + "step": 7876, + "token_acc": 0.31535483305678313 + }, + { + "epoch": 4.617414248021108, + "grad_norm": 0.25238118945757915, + "learning_rate": 0.00018264077292808785, + "loss": 2.782496452331543, + "step": 7877, + "token_acc": 0.3298738094008054 + }, + { + "epoch": 4.618000586338317, + "grad_norm": 0.24067269461754287, + "learning_rate": 0.0001826353151962605, + "loss": 2.842684745788574, + "step": 7878, + "token_acc": 0.3180024376926282 + }, + { + "epoch": 4.618586924655526, + "grad_norm": 0.25935330099613063, + "learning_rate": 0.00018262985668818574, + "loss": 2.813405752182007, + "step": 7879, + "token_acc": 0.3238514854036227 + }, + { + "epoch": 4.619173262972735, + "grad_norm": 0.23512847483886792, + "learning_rate": 0.00018262439740391483, + "loss": 2.8301210403442383, + "step": 7880, + "token_acc": 0.32151287372940424 + }, + { + "epoch": 4.619759601289944, + "grad_norm": 0.2379078941600059, + "learning_rate": 0.00018261893734349905, + "loss": 2.883781909942627, + "step": 7881, + "token_acc": 0.31364861028533475 + }, + { + "epoch": 4.620345939607153, + "grad_norm": 0.2312883308309407, + "learning_rate": 0.00018261347650698966, + "loss": 2.8516929149627686, + "step": 7882, + "token_acc": 0.3181142273426691 + }, + { + "epoch": 4.6209322779243625, + "grad_norm": 0.2499195545615533, + "learning_rate": 0.000182608014894438, + "loss": 2.8743200302124023, + "step": 7883, + "token_acc": 0.31432021464577464 + }, + { + "epoch": 4.621518616241572, + "grad_norm": 0.24105962534228284, + "learning_rate": 0.00018260255250589533, + "loss": 2.8688597679138184, + "step": 7884, + "token_acc": 0.3155860806807666 + }, + { + "epoch": 4.622104954558781, + "grad_norm": 0.26792981412727995, + "learning_rate": 0.000182597089341413, + "loss": 2.8817801475524902, + "step": 7885, + "token_acc": 0.31389339074203976 + }, + { + "epoch": 4.62269129287599, + "grad_norm": 0.29762353346825826, + "learning_rate": 0.00018259162540104233, + "loss": 2.8442492485046387, + "step": 7886, + "token_acc": 0.319871581387294 + }, + { + "epoch": 4.623277631193199, + "grad_norm": 0.32931802844472696, + "learning_rate": 0.00018258616068483465, + "loss": 2.8766069412231445, + "step": 7887, + "token_acc": 0.3144373313167399 + }, + { + "epoch": 4.623863969510407, + "grad_norm": 0.34291367122111555, + "learning_rate": 0.00018258069519284123, + "loss": 2.854952335357666, + "step": 7888, + "token_acc": 0.31881827893545295 + }, + { + "epoch": 4.624450307827616, + "grad_norm": 0.2918959051083099, + "learning_rate": 0.00018257522892511346, + "loss": 2.863406181335449, + "step": 7889, + "token_acc": 0.316234646268233 + }, + { + "epoch": 4.625036646144825, + "grad_norm": 0.26709298393070696, + "learning_rate": 0.00018256976188170274, + "loss": 2.8505213260650635, + "step": 7890, + "token_acc": 0.319156483522216 + }, + { + "epoch": 4.6256229844620345, + "grad_norm": 0.33236602466018683, + "learning_rate": 0.0001825642940626603, + "loss": 2.855293035507202, + "step": 7891, + "token_acc": 0.316943463782598 + }, + { + "epoch": 4.626209322779244, + "grad_norm": 0.27008123478044915, + "learning_rate": 0.00018255882546803763, + "loss": 2.812650680541992, + "step": 7892, + "token_acc": 0.32468300050895044 + }, + { + "epoch": 4.626795661096453, + "grad_norm": 0.2575234294273729, + "learning_rate": 0.00018255335609788605, + "loss": 2.818479061126709, + "step": 7893, + "token_acc": 0.323048706876538 + }, + { + "epoch": 4.627381999413662, + "grad_norm": 0.2473397339241833, + "learning_rate": 0.0001825478859522569, + "loss": 2.837080240249634, + "step": 7894, + "token_acc": 0.32048089154856396 + }, + { + "epoch": 4.627968337730871, + "grad_norm": 0.24936538176168208, + "learning_rate": 0.00018254241503120157, + "loss": 2.863457202911377, + "step": 7895, + "token_acc": 0.31589722775939666 + }, + { + "epoch": 4.62855467604808, + "grad_norm": 0.2491430871006636, + "learning_rate": 0.00018253694333477153, + "loss": 2.825207233428955, + "step": 7896, + "token_acc": 0.3221353063354448 + }, + { + "epoch": 4.629141014365289, + "grad_norm": 0.2658614065131225, + "learning_rate": 0.0001825314708630181, + "loss": 2.8497893810272217, + "step": 7897, + "token_acc": 0.3195753584047652 + }, + { + "epoch": 4.629727352682497, + "grad_norm": 0.24281292267718096, + "learning_rate": 0.00018252599761599272, + "loss": 2.8542709350585938, + "step": 7898, + "token_acc": 0.31626812068046234 + }, + { + "epoch": 4.6303136909997065, + "grad_norm": 0.2993708275189512, + "learning_rate": 0.00018252052359374682, + "loss": 2.8671603202819824, + "step": 7899, + "token_acc": 0.31535155774416573 + }, + { + "epoch": 4.630900029316916, + "grad_norm": 0.2554954225216052, + "learning_rate": 0.00018251504879633176, + "loss": 2.8506011962890625, + "step": 7900, + "token_acc": 0.3195963343752355 + }, + { + "epoch": 4.631486367634125, + "grad_norm": 0.2431903895642876, + "learning_rate": 0.00018250957322379902, + "loss": 2.8726491928100586, + "step": 7901, + "token_acc": 0.31569443627044463 + }, + { + "epoch": 4.632072705951334, + "grad_norm": 0.2820239402368769, + "learning_rate": 0.00018250409687620004, + "loss": 2.832711935043335, + "step": 7902, + "token_acc": 0.3218508196894364 + }, + { + "epoch": 4.632659044268543, + "grad_norm": 0.22484150997921395, + "learning_rate": 0.00018249861975358625, + "loss": 2.8249940872192383, + "step": 7903, + "token_acc": 0.32391055990253925 + }, + { + "epoch": 4.633245382585752, + "grad_norm": 0.2612888906469905, + "learning_rate": 0.00018249314185600905, + "loss": 2.8473429679870605, + "step": 7904, + "token_acc": 0.31836237656405375 + }, + { + "epoch": 4.633831720902961, + "grad_norm": 0.2417603188943553, + "learning_rate": 0.00018248766318351998, + "loss": 2.84458065032959, + "step": 7905, + "token_acc": 0.3188160774758404 + }, + { + "epoch": 4.63441805922017, + "grad_norm": 0.25230317882723874, + "learning_rate": 0.00018248218373617046, + "loss": 2.8415584564208984, + "step": 7906, + "token_acc": 0.3186439195550731 + }, + { + "epoch": 4.635004397537379, + "grad_norm": 0.24961419769172538, + "learning_rate": 0.00018247670351401199, + "loss": 2.835369348526001, + "step": 7907, + "token_acc": 0.32092409207343287 + }, + { + "epoch": 4.6355907358545885, + "grad_norm": 0.2365763623754291, + "learning_rate": 0.000182471222517096, + "loss": 2.781708240509033, + "step": 7908, + "token_acc": 0.32672511514384045 + }, + { + "epoch": 4.636177074171798, + "grad_norm": 0.25215445113088497, + "learning_rate": 0.000182465740745474, + "loss": 2.875487804412842, + "step": 7909, + "token_acc": 0.31556962297870506 + }, + { + "epoch": 4.636763412489006, + "grad_norm": 0.25069168278640086, + "learning_rate": 0.0001824602581991975, + "loss": 2.835012197494507, + "step": 7910, + "token_acc": 0.3215695379614581 + }, + { + "epoch": 4.637349750806215, + "grad_norm": 0.2693362697575483, + "learning_rate": 0.000182454774878318, + "loss": 2.8289010524749756, + "step": 7911, + "token_acc": 0.32309097475569604 + }, + { + "epoch": 4.637936089123424, + "grad_norm": 0.30904125073954924, + "learning_rate": 0.000182449290782887, + "loss": 2.811495065689087, + "step": 7912, + "token_acc": 0.324672267690302 + }, + { + "epoch": 4.638522427440633, + "grad_norm": 0.2766838901905144, + "learning_rate": 0.00018244380591295601, + "loss": 2.875392436981201, + "step": 7913, + "token_acc": 0.31481676103958023 + }, + { + "epoch": 4.639108765757842, + "grad_norm": 0.24824851438576034, + "learning_rate": 0.00018243832026857654, + "loss": 2.8296966552734375, + "step": 7914, + "token_acc": 0.32114433974454804 + }, + { + "epoch": 4.639695104075051, + "grad_norm": 0.262304411358143, + "learning_rate": 0.00018243283384980017, + "loss": 2.8442044258117676, + "step": 7915, + "token_acc": 0.3191658406482797 + }, + { + "epoch": 4.6402814423922605, + "grad_norm": 0.2363574121149564, + "learning_rate": 0.00018242734665667839, + "loss": 2.829318046569824, + "step": 7916, + "token_acc": 0.3225189605160676 + }, + { + "epoch": 4.64086778070947, + "grad_norm": 0.2613041675942796, + "learning_rate": 0.00018242185868926276, + "loss": 2.852980136871338, + "step": 7917, + "token_acc": 0.31839223213277634 + }, + { + "epoch": 4.641454119026679, + "grad_norm": 0.2523636579909328, + "learning_rate": 0.00018241636994760483, + "loss": 2.842521905899048, + "step": 7918, + "token_acc": 0.31982866508527474 + }, + { + "epoch": 4.642040457343887, + "grad_norm": 0.2607440257073755, + "learning_rate": 0.00018241088043175616, + "loss": 2.8563544750213623, + "step": 7919, + "token_acc": 0.31525375843964915 + }, + { + "epoch": 4.642626795661096, + "grad_norm": 0.2633314326850421, + "learning_rate": 0.00018240539014176832, + "loss": 2.8983917236328125, + "step": 7920, + "token_acc": 0.31031506038932516 + }, + { + "epoch": 4.643213133978305, + "grad_norm": 0.254956005658883, + "learning_rate": 0.00018239989907769288, + "loss": 2.848633289337158, + "step": 7921, + "token_acc": 0.31896490174975606 + }, + { + "epoch": 4.643799472295514, + "grad_norm": 0.26149952836772633, + "learning_rate": 0.00018239440723958144, + "loss": 2.869771718978882, + "step": 7922, + "token_acc": 0.31713107616657277 + }, + { + "epoch": 4.644385810612723, + "grad_norm": 0.23566169086452762, + "learning_rate": 0.00018238891462748555, + "loss": 2.8389174938201904, + "step": 7923, + "token_acc": 0.3185520788101055 + }, + { + "epoch": 4.6449721489299325, + "grad_norm": 0.24112410038996554, + "learning_rate": 0.00018238342124145686, + "loss": 2.8437905311584473, + "step": 7924, + "token_acc": 0.31976466473338155 + }, + { + "epoch": 4.645558487247142, + "grad_norm": 0.265799625799581, + "learning_rate": 0.0001823779270815469, + "loss": 2.88850736618042, + "step": 7925, + "token_acc": 0.312613298207202 + }, + { + "epoch": 4.646144825564351, + "grad_norm": 0.28357451765928265, + "learning_rate": 0.00018237243214780735, + "loss": 2.871522903442383, + "step": 7926, + "token_acc": 0.3164327401257655 + }, + { + "epoch": 4.64673116388156, + "grad_norm": 0.307720034347513, + "learning_rate": 0.00018236693644028978, + "loss": 2.8813347816467285, + "step": 7927, + "token_acc": 0.31313363453553056 + }, + { + "epoch": 4.647317502198769, + "grad_norm": 0.29712265757239714, + "learning_rate": 0.00018236143995904584, + "loss": 2.8462624549865723, + "step": 7928, + "token_acc": 0.3186442569075344 + }, + { + "epoch": 4.647903840515978, + "grad_norm": 0.2364171324817206, + "learning_rate": 0.00018235594270412717, + "loss": 2.8362178802490234, + "step": 7929, + "token_acc": 0.3209897319186674 + }, + { + "epoch": 4.648490178833187, + "grad_norm": 0.2493959058610231, + "learning_rate": 0.00018235044467558535, + "loss": 2.9147772789001465, + "step": 7930, + "token_acc": 0.3094253885958503 + }, + { + "epoch": 4.649076517150396, + "grad_norm": 0.2496013590745673, + "learning_rate": 0.0001823449458734721, + "loss": 2.905533790588379, + "step": 7931, + "token_acc": 0.31080001462072276 + }, + { + "epoch": 4.6496628554676045, + "grad_norm": 0.24588646192106783, + "learning_rate": 0.00018233944629783908, + "loss": 2.8537509441375732, + "step": 7932, + "token_acc": 0.3181794966223754 + }, + { + "epoch": 4.650249193784814, + "grad_norm": 0.24161723195144777, + "learning_rate": 0.00018233394594873787, + "loss": 2.844881534576416, + "step": 7933, + "token_acc": 0.31873395585262454 + }, + { + "epoch": 4.650835532102023, + "grad_norm": 0.250931657193241, + "learning_rate": 0.00018232844482622018, + "loss": 2.827566623687744, + "step": 7934, + "token_acc": 0.320910831689462 + }, + { + "epoch": 4.651421870419232, + "grad_norm": 0.23534196420030198, + "learning_rate": 0.0001823229429303377, + "loss": 2.8187108039855957, + "step": 7935, + "token_acc": 0.32303635387061164 + }, + { + "epoch": 4.652008208736441, + "grad_norm": 0.2385197962506422, + "learning_rate": 0.00018231744026114211, + "loss": 2.8507349491119385, + "step": 7936, + "token_acc": 0.3177959881964093 + }, + { + "epoch": 4.65259454705365, + "grad_norm": 0.3206774731811037, + "learning_rate": 0.0001823119368186851, + "loss": 2.866292953491211, + "step": 7937, + "token_acc": 0.31595817751659927 + }, + { + "epoch": 4.653180885370859, + "grad_norm": 0.36090073723247323, + "learning_rate": 0.00018230643260301838, + "loss": 2.8455991744995117, + "step": 7938, + "token_acc": 0.3206590519090519 + }, + { + "epoch": 4.653767223688068, + "grad_norm": 0.2658573355069035, + "learning_rate": 0.0001823009276141936, + "loss": 2.837843418121338, + "step": 7939, + "token_acc": 0.3206343609510606 + }, + { + "epoch": 4.654353562005277, + "grad_norm": 0.31477198523763134, + "learning_rate": 0.0001822954218522625, + "loss": 2.876812219619751, + "step": 7940, + "token_acc": 0.3146120220013915 + }, + { + "epoch": 4.654939900322486, + "grad_norm": 0.3228202901294854, + "learning_rate": 0.0001822899153172768, + "loss": 2.8511581420898438, + "step": 7941, + "token_acc": 0.31764272991968945 + }, + { + "epoch": 4.655526238639695, + "grad_norm": 0.2749991488934795, + "learning_rate": 0.00018228440800928825, + "loss": 2.876026153564453, + "step": 7942, + "token_acc": 0.31263049446785657 + }, + { + "epoch": 4.656112576956904, + "grad_norm": 0.4193431531418702, + "learning_rate": 0.0001822788999283486, + "loss": 2.8509178161621094, + "step": 7943, + "token_acc": 0.3180476616551776 + }, + { + "epoch": 4.656698915274113, + "grad_norm": 0.29646913743894887, + "learning_rate": 0.00018227339107450952, + "loss": 2.8604750633239746, + "step": 7944, + "token_acc": 0.3187382594003505 + }, + { + "epoch": 4.657285253591322, + "grad_norm": 0.34258409133184176, + "learning_rate": 0.00018226788144782278, + "loss": 2.8860936164855957, + "step": 7945, + "token_acc": 0.31379240080577975 + }, + { + "epoch": 4.657871591908531, + "grad_norm": 0.2778106906108733, + "learning_rate": 0.00018226237104834018, + "loss": 2.845273494720459, + "step": 7946, + "token_acc": 0.3196495404776459 + }, + { + "epoch": 4.65845793022574, + "grad_norm": 0.32539060250475726, + "learning_rate": 0.00018225685987611345, + "loss": 2.8268966674804688, + "step": 7947, + "token_acc": 0.32381397037844006 + }, + { + "epoch": 4.659044268542949, + "grad_norm": 0.26938212222598285, + "learning_rate": 0.00018225134793119438, + "loss": 2.860983371734619, + "step": 7948, + "token_acc": 0.315939866669127 + }, + { + "epoch": 4.6596306068601585, + "grad_norm": 0.2893828278782226, + "learning_rate": 0.0001822458352136347, + "loss": 2.861258029937744, + "step": 7949, + "token_acc": 0.31519825569416227 + }, + { + "epoch": 4.660216945177368, + "grad_norm": 0.25317295995541844, + "learning_rate": 0.00018224032172348625, + "loss": 2.852689743041992, + "step": 7950, + "token_acc": 0.31891356805822874 + }, + { + "epoch": 4.660803283494577, + "grad_norm": 0.3261909196885654, + "learning_rate": 0.00018223480746080078, + "loss": 2.905864953994751, + "step": 7951, + "token_acc": 0.3091468965886574 + }, + { + "epoch": 4.661389621811786, + "grad_norm": 0.25245319834731045, + "learning_rate": 0.0001822292924256301, + "loss": 2.8631138801574707, + "step": 7952, + "token_acc": 0.31756120465556725 + }, + { + "epoch": 4.661975960128994, + "grad_norm": 0.2844190801329634, + "learning_rate": 0.00018222377661802607, + "loss": 2.8484835624694824, + "step": 7953, + "token_acc": 0.31778582323278926 + }, + { + "epoch": 4.662562298446203, + "grad_norm": 0.23058552984987232, + "learning_rate": 0.00018221826003804039, + "loss": 2.840463638305664, + "step": 7954, + "token_acc": 0.3180210987734192 + }, + { + "epoch": 4.663148636763412, + "grad_norm": 0.2751777190666098, + "learning_rate": 0.00018221274268572497, + "loss": 2.8497517108917236, + "step": 7955, + "token_acc": 0.31913728840875194 + }, + { + "epoch": 4.663734975080621, + "grad_norm": 0.25836269470298967, + "learning_rate": 0.00018220722456113164, + "loss": 2.8627238273620605, + "step": 7956, + "token_acc": 0.3169352205981883 + }, + { + "epoch": 4.6643213133978305, + "grad_norm": 0.25395891382911834, + "learning_rate": 0.0001822017056643122, + "loss": 2.8270535469055176, + "step": 7957, + "token_acc": 0.32116043956043955 + }, + { + "epoch": 4.66490765171504, + "grad_norm": 0.2726792777379215, + "learning_rate": 0.0001821961859953185, + "loss": 2.887840747833252, + "step": 7958, + "token_acc": 0.3124549606764582 + }, + { + "epoch": 4.665493990032249, + "grad_norm": 0.24130748121970802, + "learning_rate": 0.00018219066555420237, + "loss": 2.8166680335998535, + "step": 7959, + "token_acc": 0.32191510284579533 + }, + { + "epoch": 4.666080328349458, + "grad_norm": 0.27491132128726337, + "learning_rate": 0.00018218514434101572, + "loss": 2.8271172046661377, + "step": 7960, + "token_acc": 0.32222906387942746 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 0.23130173787802005, + "learning_rate": 0.0001821796223558104, + "loss": 2.83827543258667, + "step": 7961, + "token_acc": 0.3205263938276968 + }, + { + "epoch": 4.667253004983876, + "grad_norm": 0.2608491043570111, + "learning_rate": 0.00018217409959863824, + "loss": 2.858046531677246, + "step": 7962, + "token_acc": 0.3169588723385103 + }, + { + "epoch": 4.667839343301084, + "grad_norm": 0.2362315929110999, + "learning_rate": 0.00018216857606955113, + "loss": 2.861868381500244, + "step": 7963, + "token_acc": 0.3168875723547992 + }, + { + "epoch": 4.668425681618293, + "grad_norm": 0.25824729934195145, + "learning_rate": 0.000182163051768601, + "loss": 2.842451572418213, + "step": 7964, + "token_acc": 0.319540054386884 + }, + { + "epoch": 4.6690120199355025, + "grad_norm": 0.22844543115350627, + "learning_rate": 0.0001821575266958397, + "loss": 2.842209577560425, + "step": 7965, + "token_acc": 0.32015692532748186 + }, + { + "epoch": 4.669598358252712, + "grad_norm": 0.22485787448105587, + "learning_rate": 0.00018215200085131916, + "loss": 2.877556800842285, + "step": 7966, + "token_acc": 0.3157010345379869 + }, + { + "epoch": 4.670184696569921, + "grad_norm": 0.23112491015744832, + "learning_rate": 0.00018214647423509125, + "loss": 2.8361425399780273, + "step": 7967, + "token_acc": 0.32119608040147857 + }, + { + "epoch": 4.67077103488713, + "grad_norm": 0.2923105287568493, + "learning_rate": 0.00018214094684720794, + "loss": 2.8411386013031006, + "step": 7968, + "token_acc": 0.3192867630054695 + }, + { + "epoch": 4.671357373204339, + "grad_norm": 0.23865625290820194, + "learning_rate": 0.0001821354186877211, + "loss": 2.817558765411377, + "step": 7969, + "token_acc": 0.3234574077424986 + }, + { + "epoch": 4.671943711521548, + "grad_norm": 0.26803140063182135, + "learning_rate": 0.00018212988975668267, + "loss": 2.806683301925659, + "step": 7970, + "token_acc": 0.3263003728263803 + }, + { + "epoch": 4.672530049838757, + "grad_norm": 0.258209703356879, + "learning_rate": 0.00018212436005414463, + "loss": 2.9041097164154053, + "step": 7971, + "token_acc": 0.3106649020892631 + }, + { + "epoch": 4.673116388155966, + "grad_norm": 0.28489373065528234, + "learning_rate": 0.00018211882958015885, + "loss": 2.8429460525512695, + "step": 7972, + "token_acc": 0.32182492674106866 + }, + { + "epoch": 4.673702726473175, + "grad_norm": 0.2792121475768161, + "learning_rate": 0.00018211329833477734, + "loss": 2.835817337036133, + "step": 7973, + "token_acc": 0.3199662123829095 + }, + { + "epoch": 4.6742890647903845, + "grad_norm": 0.25021435887536836, + "learning_rate": 0.00018210776631805207, + "loss": 2.8920738697052, + "step": 7974, + "token_acc": 0.31345546704501764 + }, + { + "epoch": 4.674875403107593, + "grad_norm": 0.27677434025971537, + "learning_rate": 0.00018210223353003495, + "loss": 2.8787167072296143, + "step": 7975, + "token_acc": 0.31589636285309863 + }, + { + "epoch": 4.675461741424802, + "grad_norm": 0.25272235555150385, + "learning_rate": 0.00018209669997077795, + "loss": 2.8661491870880127, + "step": 7976, + "token_acc": 0.31556929508912784 + }, + { + "epoch": 4.676048079742011, + "grad_norm": 0.2972781577093249, + "learning_rate": 0.00018209116564033316, + "loss": 2.835538387298584, + "step": 7977, + "token_acc": 0.3201033386327504 + }, + { + "epoch": 4.67663441805922, + "grad_norm": 0.2580242814336882, + "learning_rate": 0.00018208563053875244, + "loss": 2.847827911376953, + "step": 7978, + "token_acc": 0.32073199211749237 + }, + { + "epoch": 4.677220756376429, + "grad_norm": 0.2798228671940621, + "learning_rate": 0.00018208009466608779, + "loss": 2.9027786254882812, + "step": 7979, + "token_acc": 0.31051386341281867 + }, + { + "epoch": 4.677807094693638, + "grad_norm": 0.2733044828517619, + "learning_rate": 0.0001820745580223913, + "loss": 2.829592704772949, + "step": 7980, + "token_acc": 0.32280676458436736 + }, + { + "epoch": 4.678393433010847, + "grad_norm": 0.2651282841484067, + "learning_rate": 0.00018206902060771495, + "loss": 2.794020175933838, + "step": 7981, + "token_acc": 0.32606704622751154 + }, + { + "epoch": 4.6789797713280565, + "grad_norm": 0.25673741549563334, + "learning_rate": 0.00018206348242211072, + "loss": 2.8610496520996094, + "step": 7982, + "token_acc": 0.31692036519808187 + }, + { + "epoch": 4.679566109645266, + "grad_norm": 0.24341138039443314, + "learning_rate": 0.00018205794346563066, + "loss": 2.860891819000244, + "step": 7983, + "token_acc": 0.3173751504290842 + }, + { + "epoch": 4.680152447962474, + "grad_norm": 0.2458464416624701, + "learning_rate": 0.0001820524037383268, + "loss": 2.9053516387939453, + "step": 7984, + "token_acc": 0.3105147441211453 + }, + { + "epoch": 4.680738786279683, + "grad_norm": 0.2643695435385149, + "learning_rate": 0.00018204686324025117, + "loss": 2.8444507122039795, + "step": 7985, + "token_acc": 0.3188622120596206 + }, + { + "epoch": 4.681325124596892, + "grad_norm": 0.25700610613011127, + "learning_rate": 0.0001820413219714558, + "loss": 2.849257230758667, + "step": 7986, + "token_acc": 0.3185922312443828 + }, + { + "epoch": 4.681911462914101, + "grad_norm": 0.25570420504328395, + "learning_rate": 0.00018203577993199278, + "loss": 2.853827714920044, + "step": 7987, + "token_acc": 0.3177199841113875 + }, + { + "epoch": 4.68249780123131, + "grad_norm": 0.23093366489804387, + "learning_rate": 0.00018203023712191416, + "loss": 2.8710155487060547, + "step": 7988, + "token_acc": 0.3174503935716628 + }, + { + "epoch": 4.683084139548519, + "grad_norm": 0.259817989196454, + "learning_rate": 0.000182024693541272, + "loss": 2.8631575107574463, + "step": 7989, + "token_acc": 0.31644829199812824 + }, + { + "epoch": 4.6836704778657285, + "grad_norm": 0.22821289909175868, + "learning_rate": 0.00018201914919011838, + "loss": 2.8164682388305664, + "step": 7990, + "token_acc": 0.32182777879097846 + }, + { + "epoch": 4.684256816182938, + "grad_norm": 0.2545399035501483, + "learning_rate": 0.00018201360406850533, + "loss": 2.8166565895080566, + "step": 7991, + "token_acc": 0.32352871985885123 + }, + { + "epoch": 4.684843154500147, + "grad_norm": 0.22092961822369694, + "learning_rate": 0.00018200805817648503, + "loss": 2.805501699447632, + "step": 7992, + "token_acc": 0.32548701943312763 + }, + { + "epoch": 4.685429492817356, + "grad_norm": 0.24656958950551727, + "learning_rate": 0.0001820025115141095, + "loss": 2.8383491039276123, + "step": 7993, + "token_acc": 0.32129780199661845 + }, + { + "epoch": 4.686015831134565, + "grad_norm": 0.2263372661714545, + "learning_rate": 0.0001819969640814309, + "loss": 2.8347816467285156, + "step": 7994, + "token_acc": 0.32211190483683166 + }, + { + "epoch": 4.686602169451774, + "grad_norm": 0.24553631997631356, + "learning_rate": 0.00018199141587850131, + "loss": 2.874990940093994, + "step": 7995, + "token_acc": 0.3141520486607071 + }, + { + "epoch": 4.687188507768982, + "grad_norm": 0.2725009905957728, + "learning_rate": 0.00018198586690537286, + "loss": 2.856537342071533, + "step": 7996, + "token_acc": 0.3173330045946369 + }, + { + "epoch": 4.687774846086191, + "grad_norm": 0.2433171123430189, + "learning_rate": 0.00018198031716209765, + "loss": 2.849447250366211, + "step": 7997, + "token_acc": 0.3178343966168496 + }, + { + "epoch": 4.6883611844034006, + "grad_norm": 0.2370078966811938, + "learning_rate": 0.00018197476664872782, + "loss": 2.829416513442993, + "step": 7998, + "token_acc": 0.32200723773491796 + }, + { + "epoch": 4.68894752272061, + "grad_norm": 0.24459060585952624, + "learning_rate": 0.00018196921536531554, + "loss": 2.8380603790283203, + "step": 7999, + "token_acc": 0.3205729924282408 + }, + { + "epoch": 4.689533861037819, + "grad_norm": 0.2662025490636676, + "learning_rate": 0.00018196366331191293, + "loss": 2.876652717590332, + "step": 8000, + "token_acc": 0.31395115423218467 + }, + { + "epoch": 4.690120199355028, + "grad_norm": 0.27376208069123026, + "learning_rate": 0.00018195811048857214, + "loss": 2.9305503368377686, + "step": 8001, + "token_acc": 0.3075252778919278 + }, + { + "epoch": 4.690706537672237, + "grad_norm": 0.3289502120308831, + "learning_rate": 0.00018195255689534536, + "loss": 2.892932891845703, + "step": 8002, + "token_acc": 0.31132625383098833 + }, + { + "epoch": 4.691292875989446, + "grad_norm": 0.2990535550780179, + "learning_rate": 0.00018194700253228475, + "loss": 2.832838296890259, + "step": 8003, + "token_acc": 0.3212853103612293 + }, + { + "epoch": 4.691879214306655, + "grad_norm": 0.24510422661029935, + "learning_rate": 0.00018194144739944244, + "loss": 2.8240671157836914, + "step": 8004, + "token_acc": 0.32344140714154934 + }, + { + "epoch": 4.692465552623864, + "grad_norm": 0.28514575646441226, + "learning_rate": 0.0001819358914968707, + "loss": 2.8391356468200684, + "step": 8005, + "token_acc": 0.3203686249869096 + }, + { + "epoch": 4.693051890941073, + "grad_norm": 0.2800849462489954, + "learning_rate": 0.0001819303348246216, + "loss": 2.886533737182617, + "step": 8006, + "token_acc": 0.31420718360525646 + }, + { + "epoch": 4.693638229258282, + "grad_norm": 0.23530728231588485, + "learning_rate": 0.00018192477738274745, + "loss": 2.8834023475646973, + "step": 8007, + "token_acc": 0.31434186768645705 + }, + { + "epoch": 4.694224567575491, + "grad_norm": 0.2515115127843232, + "learning_rate": 0.00018191921917130042, + "loss": 2.850801467895508, + "step": 8008, + "token_acc": 0.3189156297972045 + }, + { + "epoch": 4.6948109058927, + "grad_norm": 0.23822270754618435, + "learning_rate": 0.0001819136601903327, + "loss": 2.8425331115722656, + "step": 8009, + "token_acc": 0.3206677865895743 + }, + { + "epoch": 4.695397244209909, + "grad_norm": 0.23888649121738686, + "learning_rate": 0.00018190810043989652, + "loss": 2.838322401046753, + "step": 8010, + "token_acc": 0.31950578352453546 + }, + { + "epoch": 4.695983582527118, + "grad_norm": 0.2742156585715057, + "learning_rate": 0.00018190253992004412, + "loss": 2.9043946266174316, + "step": 8011, + "token_acc": 0.31131339615360404 + }, + { + "epoch": 4.696569920844327, + "grad_norm": 0.24498139730584972, + "learning_rate": 0.0001818969786308277, + "loss": 2.9003829956054688, + "step": 8012, + "token_acc": 0.3109862688319833 + }, + { + "epoch": 4.697156259161536, + "grad_norm": 0.25583271990760836, + "learning_rate": 0.00018189141657229952, + "loss": 2.851210117340088, + "step": 8013, + "token_acc": 0.3185222489088608 + }, + { + "epoch": 4.6977425974787455, + "grad_norm": 0.26129160995062894, + "learning_rate": 0.0001818858537445119, + "loss": 2.88232159614563, + "step": 8014, + "token_acc": 0.31325655373696865 + }, + { + "epoch": 4.698328935795955, + "grad_norm": 0.2332040665447364, + "learning_rate": 0.00018188029014751695, + "loss": 2.8614609241485596, + "step": 8015, + "token_acc": 0.31674782259808404 + }, + { + "epoch": 4.698915274113164, + "grad_norm": 0.27007296395041647, + "learning_rate": 0.00018187472578136703, + "loss": 2.8396332263946533, + "step": 8016, + "token_acc": 0.32091937501519774 + }, + { + "epoch": 4.699501612430373, + "grad_norm": 0.25439380224047087, + "learning_rate": 0.0001818691606461144, + "loss": 2.8662452697753906, + "step": 8017, + "token_acc": 0.3161944471261099 + }, + { + "epoch": 4.700087950747581, + "grad_norm": 0.2464550817689056, + "learning_rate": 0.00018186359474181132, + "loss": 2.8305578231811523, + "step": 8018, + "token_acc": 0.3242475540907815 + }, + { + "epoch": 4.70067428906479, + "grad_norm": 0.2804523568929671, + "learning_rate": 0.0001818580280685101, + "loss": 2.808176040649414, + "step": 8019, + "token_acc": 0.3246752573041169 + }, + { + "epoch": 4.701260627381999, + "grad_norm": 0.24355875836021335, + "learning_rate": 0.00018185246062626297, + "loss": 2.8570289611816406, + "step": 8020, + "token_acc": 0.3192484513551142 + }, + { + "epoch": 4.701846965699208, + "grad_norm": 0.2591666605364451, + "learning_rate": 0.0001818468924151223, + "loss": 2.836561679840088, + "step": 8021, + "token_acc": 0.3195663998756702 + }, + { + "epoch": 4.7024333040164175, + "grad_norm": 0.3159520745470281, + "learning_rate": 0.00018184132343514035, + "loss": 2.87027645111084, + "step": 8022, + "token_acc": 0.31565142208501745 + }, + { + "epoch": 4.703019642333627, + "grad_norm": 0.22764517951130156, + "learning_rate": 0.00018183575368636948, + "loss": 2.8519668579101562, + "step": 8023, + "token_acc": 0.31714101958110236 + }, + { + "epoch": 4.703605980650836, + "grad_norm": 0.24133055707444134, + "learning_rate": 0.00018183018316886193, + "loss": 2.8013575077056885, + "step": 8024, + "token_acc": 0.3245651557846445 + }, + { + "epoch": 4.704192318968045, + "grad_norm": 0.2291506622631952, + "learning_rate": 0.0001818246118826701, + "loss": 2.8687868118286133, + "step": 8025, + "token_acc": 0.31608920655073053 + }, + { + "epoch": 4.704778657285254, + "grad_norm": 0.26539143969326395, + "learning_rate": 0.00018181903982784632, + "loss": 2.9064242839813232, + "step": 8026, + "token_acc": 0.31050965140902426 + }, + { + "epoch": 4.705364995602462, + "grad_norm": 0.28817198202862465, + "learning_rate": 0.0001818134670044429, + "loss": 2.854684829711914, + "step": 8027, + "token_acc": 0.31704691922346095 + }, + { + "epoch": 4.705951333919671, + "grad_norm": 0.24112896565155048, + "learning_rate": 0.00018180789341251216, + "loss": 2.8974506855010986, + "step": 8028, + "token_acc": 0.3102875919715961 + }, + { + "epoch": 4.70653767223688, + "grad_norm": 0.2642072136625414, + "learning_rate": 0.00018180231905210657, + "loss": 2.8863887786865234, + "step": 8029, + "token_acc": 0.31324339967982645 + }, + { + "epoch": 4.7071240105540895, + "grad_norm": 0.2404251366049508, + "learning_rate": 0.00018179674392327839, + "loss": 2.833858013153076, + "step": 8030, + "token_acc": 0.32072781459971 + }, + { + "epoch": 4.707710348871299, + "grad_norm": 0.276559978329488, + "learning_rate": 0.00018179116802608002, + "loss": 2.8274240493774414, + "step": 8031, + "token_acc": 0.3212447566503922 + }, + { + "epoch": 4.708296687188508, + "grad_norm": 0.23826030630660644, + "learning_rate": 0.00018178559136056382, + "loss": 2.833381414413452, + "step": 8032, + "token_acc": 0.3213932870890681 + }, + { + "epoch": 4.708883025505717, + "grad_norm": 0.26007341496577335, + "learning_rate": 0.00018178001392678224, + "loss": 2.8373050689697266, + "step": 8033, + "token_acc": 0.3197908683821939 + }, + { + "epoch": 4.709469363822926, + "grad_norm": 0.2529567529068354, + "learning_rate": 0.0001817744357247876, + "loss": 2.861542224884033, + "step": 8034, + "token_acc": 0.3165112263434989 + }, + { + "epoch": 4.710055702140135, + "grad_norm": 0.2431402630977329, + "learning_rate": 0.00018176885675463237, + "loss": 2.8428993225097656, + "step": 8035, + "token_acc": 0.32066344909024924 + }, + { + "epoch": 4.710642040457344, + "grad_norm": 0.27955260704489493, + "learning_rate": 0.00018176327701636887, + "loss": 2.838435173034668, + "step": 8036, + "token_acc": 0.32021783035199075 + }, + { + "epoch": 4.711228378774553, + "grad_norm": 0.2689898052707983, + "learning_rate": 0.00018175769651004956, + "loss": 2.928544044494629, + "step": 8037, + "token_acc": 0.3064314717224155 + }, + { + "epoch": 4.711814717091762, + "grad_norm": 0.23226460164563856, + "learning_rate": 0.0001817521152357269, + "loss": 2.873896837234497, + "step": 8038, + "token_acc": 0.31294827071692444 + }, + { + "epoch": 4.7124010554089715, + "grad_norm": 0.3125019805654407, + "learning_rate": 0.00018174653319345322, + "loss": 2.856449842453003, + "step": 8039, + "token_acc": 0.31791541685280916 + }, + { + "epoch": 4.71298739372618, + "grad_norm": 0.293705375638339, + "learning_rate": 0.00018174095038328108, + "loss": 2.878654718399048, + "step": 8040, + "token_acc": 0.3133308121885264 + }, + { + "epoch": 4.713573732043389, + "grad_norm": 0.2833306270756169, + "learning_rate": 0.00018173536680526282, + "loss": 2.928429126739502, + "step": 8041, + "token_acc": 0.3069624181485274 + }, + { + "epoch": 4.714160070360598, + "grad_norm": 0.3334948847758536, + "learning_rate": 0.00018172978245945096, + "loss": 2.797441005706787, + "step": 8042, + "token_acc": 0.32532701908418943 + }, + { + "epoch": 4.714746408677807, + "grad_norm": 0.2609469122965276, + "learning_rate": 0.0001817241973458979, + "loss": 2.8259265422821045, + "step": 8043, + "token_acc": 0.3216305792150534 + }, + { + "epoch": 4.715332746995016, + "grad_norm": 0.310934316469313, + "learning_rate": 0.00018171861146465613, + "loss": 2.833998203277588, + "step": 8044, + "token_acc": 0.32099941402957677 + }, + { + "epoch": 4.715919085312225, + "grad_norm": 0.296903504195043, + "learning_rate": 0.0001817130248157781, + "loss": 2.854832172393799, + "step": 8045, + "token_acc": 0.3185565057029898 + }, + { + "epoch": 4.716505423629434, + "grad_norm": 0.25370060665138006, + "learning_rate": 0.00018170743739931634, + "loss": 2.9049510955810547, + "step": 8046, + "token_acc": 0.3119570974237474 + }, + { + "epoch": 4.7170917619466435, + "grad_norm": 0.29004846654180894, + "learning_rate": 0.00018170184921532335, + "loss": 2.836297035217285, + "step": 8047, + "token_acc": 0.3192653619321492 + }, + { + "epoch": 4.717678100263853, + "grad_norm": 0.259727850682258, + "learning_rate": 0.0001816962602638515, + "loss": 2.886216640472412, + "step": 8048, + "token_acc": 0.31320203047809336 + }, + { + "epoch": 4.718264438581061, + "grad_norm": 0.28288949521918405, + "learning_rate": 0.00018169067054495344, + "loss": 2.846942901611328, + "step": 8049, + "token_acc": 0.3183509006147651 + }, + { + "epoch": 4.71885077689827, + "grad_norm": 0.23848079580669918, + "learning_rate": 0.00018168508005868156, + "loss": 2.868199348449707, + "step": 8050, + "token_acc": 0.31611411542562073 + }, + { + "epoch": 4.719437115215479, + "grad_norm": 0.28026393841099584, + "learning_rate": 0.00018167948880508844, + "loss": 2.8703441619873047, + "step": 8051, + "token_acc": 0.3159572697580561 + }, + { + "epoch": 4.720023453532688, + "grad_norm": 0.23050309804981484, + "learning_rate": 0.00018167389678422658, + "loss": 2.850451707839966, + "step": 8052, + "token_acc": 0.3186301081493566 + }, + { + "epoch": 4.720609791849897, + "grad_norm": 0.2830963211551632, + "learning_rate": 0.00018166830399614855, + "loss": 2.8724722862243652, + "step": 8053, + "token_acc": 0.31553533814138335 + }, + { + "epoch": 4.721196130167106, + "grad_norm": 0.22212479486255632, + "learning_rate": 0.00018166271044090684, + "loss": 2.855909824371338, + "step": 8054, + "token_acc": 0.3167231549354983 + }, + { + "epoch": 4.7217824684843155, + "grad_norm": 0.266277037788316, + "learning_rate": 0.00018165711611855398, + "loss": 2.8273510932922363, + "step": 8055, + "token_acc": 0.3225707329756305 + }, + { + "epoch": 4.722368806801525, + "grad_norm": 0.23221877059005042, + "learning_rate": 0.0001816515210291426, + "loss": 2.836549758911133, + "step": 8056, + "token_acc": 0.321211021609377 + }, + { + "epoch": 4.722955145118734, + "grad_norm": 0.28472583793168915, + "learning_rate": 0.00018164592517272516, + "loss": 2.8237316608428955, + "step": 8057, + "token_acc": 0.32335621103368783 + }, + { + "epoch": 4.723541483435943, + "grad_norm": 0.24309086585736395, + "learning_rate": 0.00018164032854935428, + "loss": 2.8503198623657227, + "step": 8058, + "token_acc": 0.3178483733525079 + }, + { + "epoch": 4.724127821753152, + "grad_norm": 0.2999907377752205, + "learning_rate": 0.00018163473115908254, + "loss": 2.8550238609313965, + "step": 8059, + "token_acc": 0.315762456680691 + }, + { + "epoch": 4.724714160070361, + "grad_norm": 0.2760338034534839, + "learning_rate": 0.0001816291330019625, + "loss": 2.8779211044311523, + "step": 8060, + "token_acc": 0.3149451594530054 + }, + { + "epoch": 4.725300498387569, + "grad_norm": 0.24798308936885677, + "learning_rate": 0.00018162353407804674, + "loss": 2.8319313526153564, + "step": 8061, + "token_acc": 0.32106564349063427 + }, + { + "epoch": 4.725886836704778, + "grad_norm": 0.2833270982505081, + "learning_rate": 0.00018161793438738788, + "loss": 2.8686540126800537, + "step": 8062, + "token_acc": 0.3167144369561288 + }, + { + "epoch": 4.7264731750219875, + "grad_norm": 0.23611917812134892, + "learning_rate": 0.00018161233393003848, + "loss": 2.8891663551330566, + "step": 8063, + "token_acc": 0.3137665913602004 + }, + { + "epoch": 4.727059513339197, + "grad_norm": 0.27098040325117845, + "learning_rate": 0.00018160673270605122, + "loss": 2.8479437828063965, + "step": 8064, + "token_acc": 0.3195982744251141 + }, + { + "epoch": 4.727645851656406, + "grad_norm": 0.24603809204183594, + "learning_rate": 0.00018160113071547865, + "loss": 2.8657913208007812, + "step": 8065, + "token_acc": 0.3162626321942782 + }, + { + "epoch": 4.728232189973615, + "grad_norm": 0.24323331262389142, + "learning_rate": 0.00018159552795837342, + "loss": 2.87576961517334, + "step": 8066, + "token_acc": 0.31376961614874727 + }, + { + "epoch": 4.728818528290824, + "grad_norm": 0.2523842135857536, + "learning_rate": 0.00018158992443478814, + "loss": 2.8928680419921875, + "step": 8067, + "token_acc": 0.3121880900337666 + }, + { + "epoch": 4.729404866608033, + "grad_norm": 0.2826819382569553, + "learning_rate": 0.00018158432014477548, + "loss": 2.848867654800415, + "step": 8068, + "token_acc": 0.3189370325405099 + }, + { + "epoch": 4.729991204925242, + "grad_norm": 0.2557982808002101, + "learning_rate": 0.00018157871508838808, + "loss": 2.86297607421875, + "step": 8069, + "token_acc": 0.3177658013903948 + }, + { + "epoch": 4.730577543242451, + "grad_norm": 0.25008369582151574, + "learning_rate": 0.00018157310926567857, + "loss": 2.8835794925689697, + "step": 8070, + "token_acc": 0.31495925817410103 + }, + { + "epoch": 4.7311638815596595, + "grad_norm": 0.2849336252487903, + "learning_rate": 0.00018156750267669963, + "loss": 2.834134340286255, + "step": 8071, + "token_acc": 0.31951847868974215 + }, + { + "epoch": 4.731750219876869, + "grad_norm": 0.26762375812623396, + "learning_rate": 0.00018156189532150387, + "loss": 2.849240303039551, + "step": 8072, + "token_acc": 0.31931949160056117 + }, + { + "epoch": 4.732336558194078, + "grad_norm": 0.24002315538425464, + "learning_rate": 0.00018155628720014407, + "loss": 2.8511886596679688, + "step": 8073, + "token_acc": 0.31729686880697583 + }, + { + "epoch": 4.732922896511287, + "grad_norm": 0.2635355740921285, + "learning_rate": 0.00018155067831267282, + "loss": 2.8659536838531494, + "step": 8074, + "token_acc": 0.3167993424476213 + }, + { + "epoch": 4.733509234828496, + "grad_norm": 0.2393455253509564, + "learning_rate": 0.00018154506865914285, + "loss": 2.8910579681396484, + "step": 8075, + "token_acc": 0.3137299845215431 + }, + { + "epoch": 4.734095573145705, + "grad_norm": 0.24560682147272686, + "learning_rate": 0.00018153945823960683, + "loss": 2.8448266983032227, + "step": 8076, + "token_acc": 0.318531694023175 + }, + { + "epoch": 4.734681911462914, + "grad_norm": 0.281710323013585, + "learning_rate": 0.00018153384705411747, + "loss": 2.8232245445251465, + "step": 8077, + "token_acc": 0.3231604776359348 + }, + { + "epoch": 4.735268249780123, + "grad_norm": 0.22888018458060053, + "learning_rate": 0.0001815282351027275, + "loss": 2.907013416290283, + "step": 8078, + "token_acc": 0.30971276606904946 + }, + { + "epoch": 4.735854588097332, + "grad_norm": 0.29966226721629974, + "learning_rate": 0.0001815226223854896, + "loss": 2.8319334983825684, + "step": 8079, + "token_acc": 0.3228889922068721 + }, + { + "epoch": 4.7364409264145415, + "grad_norm": 0.2616392494777424, + "learning_rate": 0.00018151700890245653, + "loss": 2.878106117248535, + "step": 8080, + "token_acc": 0.3144527511325691 + }, + { + "epoch": 4.737027264731751, + "grad_norm": 0.2718129825803422, + "learning_rate": 0.00018151139465368102, + "loss": 2.8504018783569336, + "step": 8081, + "token_acc": 0.319645171437559 + }, + { + "epoch": 4.73761360304896, + "grad_norm": 0.3660693104364194, + "learning_rate": 0.0001815057796392158, + "loss": 2.8621160984039307, + "step": 8082, + "token_acc": 0.3170808640059905 + }, + { + "epoch": 4.738199941366168, + "grad_norm": 0.25128602214835555, + "learning_rate": 0.00018150016385911358, + "loss": 2.853025436401367, + "step": 8083, + "token_acc": 0.3181481979558903 + }, + { + "epoch": 4.738786279683377, + "grad_norm": 0.400131418509681, + "learning_rate": 0.00018149454731342717, + "loss": 2.8442623615264893, + "step": 8084, + "token_acc": 0.3212362605393168 + }, + { + "epoch": 4.739372618000586, + "grad_norm": 0.2657749731207815, + "learning_rate": 0.00018148893000220927, + "loss": 2.859973430633545, + "step": 8085, + "token_acc": 0.3172285134347968 + }, + { + "epoch": 4.739958956317795, + "grad_norm": 0.33401079897119496, + "learning_rate": 0.0001814833119255127, + "loss": 2.8709328174591064, + "step": 8086, + "token_acc": 0.31419178755314375 + }, + { + "epoch": 4.740545294635004, + "grad_norm": 0.24594412668907412, + "learning_rate": 0.00018147769308339022, + "loss": 2.852072238922119, + "step": 8087, + "token_acc": 0.3178781432418798 + }, + { + "epoch": 4.7411316329522135, + "grad_norm": 0.41557281416006797, + "learning_rate": 0.0001814720734758946, + "loss": 2.8715970516204834, + "step": 8088, + "token_acc": 0.3145869188865366 + }, + { + "epoch": 4.741717971269423, + "grad_norm": 0.24645535176614186, + "learning_rate": 0.00018146645310307866, + "loss": 2.8728585243225098, + "step": 8089, + "token_acc": 0.314820069717615 + }, + { + "epoch": 4.742304309586632, + "grad_norm": 0.35845486988276215, + "learning_rate": 0.00018146083196499512, + "loss": 2.877509355545044, + "step": 8090, + "token_acc": 0.31447917524458696 + }, + { + "epoch": 4.742890647903841, + "grad_norm": 0.2273087996350966, + "learning_rate": 0.00018145521006169687, + "loss": 2.8547534942626953, + "step": 8091, + "token_acc": 0.31841609152621914 + }, + { + "epoch": 4.743476986221049, + "grad_norm": 0.3305647923242742, + "learning_rate": 0.0001814495873932367, + "loss": 2.8630473613739014, + "step": 8092, + "token_acc": 0.3179541476011058 + }, + { + "epoch": 4.744063324538258, + "grad_norm": 0.2580908344970165, + "learning_rate": 0.00018144396395966737, + "loss": 2.883364200592041, + "step": 8093, + "token_acc": 0.3130672089041096 + }, + { + "epoch": 4.744649662855467, + "grad_norm": 0.26081472447270754, + "learning_rate": 0.00018143833976104178, + "loss": 2.841121196746826, + "step": 8094, + "token_acc": 0.3202383984793332 + }, + { + "epoch": 4.745236001172676, + "grad_norm": 0.2897863667413356, + "learning_rate": 0.00018143271479741267, + "loss": 2.807852268218994, + "step": 8095, + "token_acc": 0.3253842203622233 + }, + { + "epoch": 4.7458223394898855, + "grad_norm": 0.24111356368258086, + "learning_rate": 0.000181427089068833, + "loss": 2.88679838180542, + "step": 8096, + "token_acc": 0.31266485359720153 + }, + { + "epoch": 4.746408677807095, + "grad_norm": 0.28143475111388694, + "learning_rate": 0.00018142146257535554, + "loss": 2.8864684104919434, + "step": 8097, + "token_acc": 0.31350402128057525 + }, + { + "epoch": 4.746995016124304, + "grad_norm": 0.2810418652779738, + "learning_rate": 0.0001814158353170331, + "loss": 2.8566131591796875, + "step": 8098, + "token_acc": 0.317571594524976 + }, + { + "epoch": 4.747581354441513, + "grad_norm": 0.2331550043979706, + "learning_rate": 0.0001814102072939187, + "loss": 2.8767473697662354, + "step": 8099, + "token_acc": 0.3145498247527879 + }, + { + "epoch": 4.748167692758722, + "grad_norm": 0.2753471014871126, + "learning_rate": 0.00018140457850606502, + "loss": 2.8601951599121094, + "step": 8100, + "token_acc": 0.3169262336799584 + }, + { + "epoch": 4.748754031075931, + "grad_norm": 0.238565844067442, + "learning_rate": 0.00018139894895352504, + "loss": 2.8498740196228027, + "step": 8101, + "token_acc": 0.31909064655363056 + }, + { + "epoch": 4.74934036939314, + "grad_norm": 0.28366941193718787, + "learning_rate": 0.00018139331863635164, + "loss": 2.8749842643737793, + "step": 8102, + "token_acc": 0.3138319682903494 + }, + { + "epoch": 4.749926707710349, + "grad_norm": 0.2606398332867438, + "learning_rate": 0.00018138768755459768, + "loss": 2.873274564743042, + "step": 8103, + "token_acc": 0.31175566302920843 + }, + { + "epoch": 4.7505130460275575, + "grad_norm": 0.23969999726021451, + "learning_rate": 0.00018138205570831603, + "loss": 2.8490233421325684, + "step": 8104, + "token_acc": 0.3197406748708543 + }, + { + "epoch": 4.751099384344767, + "grad_norm": 0.3193724962367442, + "learning_rate": 0.0001813764230975597, + "loss": 2.8728792667388916, + "step": 8105, + "token_acc": 0.31491849102883673 + }, + { + "epoch": 4.751685722661976, + "grad_norm": 0.24224678539709218, + "learning_rate": 0.0001813707897223815, + "loss": 2.849452495574951, + "step": 8106, + "token_acc": 0.3188687711217103 + }, + { + "epoch": 4.752272060979185, + "grad_norm": 0.34001969547209343, + "learning_rate": 0.00018136515558283436, + "loss": 2.8645622730255127, + "step": 8107, + "token_acc": 0.315676084723906 + }, + { + "epoch": 4.752858399296394, + "grad_norm": 0.26578636917641907, + "learning_rate": 0.00018135952067897123, + "loss": 2.843933582305908, + "step": 8108, + "token_acc": 0.32091989655908704 + }, + { + "epoch": 4.753444737613603, + "grad_norm": 0.26843087565298923, + "learning_rate": 0.00018135388501084503, + "loss": 2.8612165451049805, + "step": 8109, + "token_acc": 0.3179208117371677 + }, + { + "epoch": 4.754031075930812, + "grad_norm": 0.2972956692285099, + "learning_rate": 0.00018134824857850873, + "loss": 2.866142749786377, + "step": 8110, + "token_acc": 0.3174174524119551 + }, + { + "epoch": 4.754617414248021, + "grad_norm": 0.2507653572120258, + "learning_rate": 0.00018134261138201522, + "loss": 2.868525981903076, + "step": 8111, + "token_acc": 0.315902056682899 + }, + { + "epoch": 4.75520375256523, + "grad_norm": 0.291394311195834, + "learning_rate": 0.00018133697342141754, + "loss": 2.8417797088623047, + "step": 8112, + "token_acc": 0.3192974146437734 + }, + { + "epoch": 4.7557900908824395, + "grad_norm": 0.25840427164856955, + "learning_rate": 0.00018133133469676855, + "loss": 2.882445812225342, + "step": 8113, + "token_acc": 0.31343322085667996 + }, + { + "epoch": 4.756376429199648, + "grad_norm": 0.3042362722050162, + "learning_rate": 0.0001813256952081213, + "loss": 2.8996667861938477, + "step": 8114, + "token_acc": 0.31164511348735063 + }, + { + "epoch": 4.756962767516857, + "grad_norm": 0.2544756825536757, + "learning_rate": 0.00018132005495552869, + "loss": 2.839451789855957, + "step": 8115, + "token_acc": 0.31972897924079574 + }, + { + "epoch": 4.757549105834066, + "grad_norm": 0.3309493648361358, + "learning_rate": 0.0001813144139390438, + "loss": 2.836857795715332, + "step": 8116, + "token_acc": 0.3199251563397986 + }, + { + "epoch": 4.758135444151275, + "grad_norm": 0.23947347034792899, + "learning_rate": 0.0001813087721587195, + "loss": 2.8489277362823486, + "step": 8117, + "token_acc": 0.3200418476533531 + }, + { + "epoch": 4.758721782468484, + "grad_norm": 0.31324013845388626, + "learning_rate": 0.0001813031296146089, + "loss": 2.876556634902954, + "step": 8118, + "token_acc": 0.3150759275148082 + }, + { + "epoch": 4.759308120785693, + "grad_norm": 0.2164433247404135, + "learning_rate": 0.00018129748630676493, + "loss": 2.8348793983459473, + "step": 8119, + "token_acc": 0.32038794303010304 + }, + { + "epoch": 4.759894459102902, + "grad_norm": 0.2916040080351027, + "learning_rate": 0.00018129184223524063, + "loss": 2.8650968074798584, + "step": 8120, + "token_acc": 0.3159516280435683 + }, + { + "epoch": 4.7604807974201115, + "grad_norm": 0.21973361116769724, + "learning_rate": 0.00018128619740008902, + "loss": 2.8875927925109863, + "step": 8121, + "token_acc": 0.3125834794223224 + }, + { + "epoch": 4.761067135737321, + "grad_norm": 0.2531314638843724, + "learning_rate": 0.00018128055180136312, + "loss": 2.826516628265381, + "step": 8122, + "token_acc": 0.32237092760272695 + }, + { + "epoch": 4.76165347405453, + "grad_norm": 0.2454021434099326, + "learning_rate": 0.00018127490543911595, + "loss": 2.840968132019043, + "step": 8123, + "token_acc": 0.321548291508473 + }, + { + "epoch": 4.762239812371739, + "grad_norm": 0.24231611661373578, + "learning_rate": 0.00018126925831340058, + "loss": 2.8848531246185303, + "step": 8124, + "token_acc": 0.31293197738009354 + }, + { + "epoch": 4.762826150688948, + "grad_norm": 0.24081618550132425, + "learning_rate": 0.00018126361042427003, + "loss": 2.8327250480651855, + "step": 8125, + "token_acc": 0.32230493071238914 + }, + { + "epoch": 4.763412489006156, + "grad_norm": 0.23295577693478312, + "learning_rate": 0.00018125796177177736, + "loss": 2.877139091491699, + "step": 8126, + "token_acc": 0.3141819039633638 + }, + { + "epoch": 4.763998827323365, + "grad_norm": 0.2210529617814289, + "learning_rate": 0.00018125231235597563, + "loss": 2.8429958820343018, + "step": 8127, + "token_acc": 0.3206383941252591 + }, + { + "epoch": 4.764585165640574, + "grad_norm": 0.2358269984069406, + "learning_rate": 0.00018124666217691796, + "loss": 2.841744899749756, + "step": 8128, + "token_acc": 0.3189275583216902 + }, + { + "epoch": 4.7651715039577835, + "grad_norm": 0.250011870511674, + "learning_rate": 0.00018124101123465734, + "loss": 2.875566005706787, + "step": 8129, + "token_acc": 0.31670009449045955 + }, + { + "epoch": 4.765757842274993, + "grad_norm": 0.24336475462926818, + "learning_rate": 0.0001812353595292469, + "loss": 2.813002586364746, + "step": 8130, + "token_acc": 0.3259146891848563 + }, + { + "epoch": 4.766344180592202, + "grad_norm": 0.24246383004577401, + "learning_rate": 0.0001812297070607397, + "loss": 2.7973411083221436, + "step": 8131, + "token_acc": 0.3273682828205451 + }, + { + "epoch": 4.766930518909411, + "grad_norm": 0.24876944370997475, + "learning_rate": 0.00018122405382918887, + "loss": 2.8359577655792236, + "step": 8132, + "token_acc": 0.32116274075354484 + }, + { + "epoch": 4.76751685722662, + "grad_norm": 0.2485667718374598, + "learning_rate": 0.00018121839983464754, + "loss": 2.8358585834503174, + "step": 8133, + "token_acc": 0.3202875839639654 + }, + { + "epoch": 4.768103195543829, + "grad_norm": 0.2700892449211088, + "learning_rate": 0.00018121274507716876, + "loss": 2.8785624504089355, + "step": 8134, + "token_acc": 0.3127131256690775 + }, + { + "epoch": 4.768689533861037, + "grad_norm": 0.24662695940358334, + "learning_rate": 0.0001812070895568057, + "loss": 2.875436782836914, + "step": 8135, + "token_acc": 0.315440865348103 + }, + { + "epoch": 4.7692758721782464, + "grad_norm": 0.2642085880960896, + "learning_rate": 0.00018120143327361144, + "loss": 2.8548102378845215, + "step": 8136, + "token_acc": 0.31806089579778635 + }, + { + "epoch": 4.769862210495456, + "grad_norm": 0.28583099227997527, + "learning_rate": 0.0001811957762276391, + "loss": 2.9181082248687744, + "step": 8137, + "token_acc": 0.3095460629707205 + }, + { + "epoch": 4.770448548812665, + "grad_norm": 0.23415825729571144, + "learning_rate": 0.0001811901184189419, + "loss": 2.903862237930298, + "step": 8138, + "token_acc": 0.30974019332071806 + }, + { + "epoch": 4.771034887129874, + "grad_norm": 0.29193814350934794, + "learning_rate": 0.00018118445984757292, + "loss": 2.8313355445861816, + "step": 8139, + "token_acc": 0.32268198799954717 + }, + { + "epoch": 4.771621225447083, + "grad_norm": 0.2488021375262915, + "learning_rate": 0.00018117880051358537, + "loss": 2.858372926712036, + "step": 8140, + "token_acc": 0.31836297946800607 + }, + { + "epoch": 4.772207563764292, + "grad_norm": 0.25016353966451155, + "learning_rate": 0.00018117314041703238, + "loss": 2.821680784225464, + "step": 8141, + "token_acc": 0.32261071899880944 + }, + { + "epoch": 4.772793902081501, + "grad_norm": 0.2644863114110237, + "learning_rate": 0.00018116747955796708, + "loss": 2.8611650466918945, + "step": 8142, + "token_acc": 0.31577585760852694 + }, + { + "epoch": 4.77338024039871, + "grad_norm": 0.27077924963485445, + "learning_rate": 0.00018116181793644272, + "loss": 2.826604127883911, + "step": 8143, + "token_acc": 0.32209233480661464 + }, + { + "epoch": 4.773966578715919, + "grad_norm": 0.2902639271082748, + "learning_rate": 0.0001811561555525124, + "loss": 2.8524208068847656, + "step": 8144, + "token_acc": 0.31812241256524837 + }, + { + "epoch": 4.7745529170331285, + "grad_norm": 0.2338020160442722, + "learning_rate": 0.0001811504924062294, + "loss": 2.8528800010681152, + "step": 8145, + "token_acc": 0.31817058932133 + }, + { + "epoch": 4.775139255350338, + "grad_norm": 0.27317734578112846, + "learning_rate": 0.00018114482849764687, + "loss": 2.853255033493042, + "step": 8146, + "token_acc": 0.31783677342339123 + }, + { + "epoch": 4.775725593667546, + "grad_norm": 0.29942002214019947, + "learning_rate": 0.00018113916382681803, + "loss": 2.847193479537964, + "step": 8147, + "token_acc": 0.3209507347929856 + }, + { + "epoch": 4.776311931984755, + "grad_norm": 0.3152178153781653, + "learning_rate": 0.00018113349839379606, + "loss": 2.8592376708984375, + "step": 8148, + "token_acc": 0.3177396895605521 + }, + { + "epoch": 4.776898270301964, + "grad_norm": 0.2755407087772189, + "learning_rate": 0.00018112783219863417, + "loss": 2.8734045028686523, + "step": 8149, + "token_acc": 0.3146510640061333 + }, + { + "epoch": 4.777484608619173, + "grad_norm": 0.256599712003577, + "learning_rate": 0.00018112216524138568, + "loss": 2.8599419593811035, + "step": 8150, + "token_acc": 0.31715971107101276 + }, + { + "epoch": 4.778070946936382, + "grad_norm": 0.24107035830239307, + "learning_rate": 0.00018111649752210372, + "loss": 2.823293447494507, + "step": 8151, + "token_acc": 0.32303869274189645 + }, + { + "epoch": 4.778657285253591, + "grad_norm": 0.24240712571419468, + "learning_rate": 0.0001811108290408416, + "loss": 2.890720844268799, + "step": 8152, + "token_acc": 0.313693807477252 + }, + { + "epoch": 4.7792436235708005, + "grad_norm": 0.24125320199863215, + "learning_rate": 0.00018110515979765252, + "loss": 2.8172149658203125, + "step": 8153, + "token_acc": 0.3217824252180238 + }, + { + "epoch": 4.77982996188801, + "grad_norm": 0.2760180006122312, + "learning_rate": 0.00018109948979258978, + "loss": 2.868124485015869, + "step": 8154, + "token_acc": 0.3157876718641783 + }, + { + "epoch": 4.780416300205219, + "grad_norm": 0.2433756743712632, + "learning_rate": 0.00018109381902570659, + "loss": 2.891968250274658, + "step": 8155, + "token_acc": 0.31359143823345564 + }, + { + "epoch": 4.781002638522428, + "grad_norm": 0.2465303383082132, + "learning_rate": 0.00018108814749705625, + "loss": 2.857940912246704, + "step": 8156, + "token_acc": 0.3179065213933928 + }, + { + "epoch": 4.781588976839636, + "grad_norm": 0.28851049234808446, + "learning_rate": 0.000181082475206692, + "loss": 2.882906436920166, + "step": 8157, + "token_acc": 0.3131278791520403 + }, + { + "epoch": 4.782175315156845, + "grad_norm": 0.2770484626556144, + "learning_rate": 0.00018107680215466722, + "loss": 2.85306978225708, + "step": 8158, + "token_acc": 0.31886902278334955 + }, + { + "epoch": 4.782761653474054, + "grad_norm": 0.2662430167428787, + "learning_rate": 0.0001810711283410351, + "loss": 2.8797686100006104, + "step": 8159, + "token_acc": 0.3143202961322608 + }, + { + "epoch": 4.783347991791263, + "grad_norm": 0.2446041227574991, + "learning_rate": 0.00018106545376584898, + "loss": 2.9015705585479736, + "step": 8160, + "token_acc": 0.31088644525373677 + }, + { + "epoch": 4.7839343301084725, + "grad_norm": 0.2652140190043571, + "learning_rate": 0.00018105977842916216, + "loss": 2.868628740310669, + "step": 8161, + "token_acc": 0.31614125015538996 + }, + { + "epoch": 4.784520668425682, + "grad_norm": 0.27171634638394543, + "learning_rate": 0.00018105410233102795, + "loss": 2.8268423080444336, + "step": 8162, + "token_acc": 0.3212940239386894 + }, + { + "epoch": 4.785107006742891, + "grad_norm": 0.2435175237871007, + "learning_rate": 0.00018104842547149967, + "loss": 2.860830783843994, + "step": 8163, + "token_acc": 0.31832966892564535 + }, + { + "epoch": 4.7856933450601, + "grad_norm": 0.28195816557607195, + "learning_rate": 0.00018104274785063064, + "loss": 2.8669190406799316, + "step": 8164, + "token_acc": 0.3138442646000325 + }, + { + "epoch": 4.786279683377309, + "grad_norm": 0.2222731700532026, + "learning_rate": 0.0001810370694684742, + "loss": 2.8316445350646973, + "step": 8165, + "token_acc": 0.32005220588044125 + }, + { + "epoch": 4.786866021694518, + "grad_norm": 0.3236069300476552, + "learning_rate": 0.0001810313903250837, + "loss": 2.8675243854522705, + "step": 8166, + "token_acc": 0.3171432505865716 + }, + { + "epoch": 4.787452360011727, + "grad_norm": 0.36629393019166573, + "learning_rate": 0.0001810257104205125, + "loss": 2.8252575397491455, + "step": 8167, + "token_acc": 0.3230941692009805 + }, + { + "epoch": 4.788038698328936, + "grad_norm": 0.2340781079468918, + "learning_rate": 0.00018102002975481393, + "loss": 2.8369643688201904, + "step": 8168, + "token_acc": 0.3216903217519748 + }, + { + "epoch": 4.7886250366461445, + "grad_norm": 0.3923471445134877, + "learning_rate": 0.0001810143483280413, + "loss": 2.875633955001831, + "step": 8169, + "token_acc": 0.31395536869340235 + }, + { + "epoch": 4.789211374963354, + "grad_norm": 0.2586293568970939, + "learning_rate": 0.0001810086661402481, + "loss": 2.8146908283233643, + "step": 8170, + "token_acc": 0.32401377960671823 + }, + { + "epoch": 4.789797713280563, + "grad_norm": 0.3317381261266541, + "learning_rate": 0.00018100298319148757, + "loss": 2.8480870723724365, + "step": 8171, + "token_acc": 0.31824836298611914 + }, + { + "epoch": 4.790384051597772, + "grad_norm": 0.2489372070631155, + "learning_rate": 0.00018099729948181325, + "loss": 2.817063093185425, + "step": 8172, + "token_acc": 0.322509765625 + }, + { + "epoch": 4.790970389914981, + "grad_norm": 0.34581254920205595, + "learning_rate": 0.0001809916150112784, + "loss": 2.867819309234619, + "step": 8173, + "token_acc": 0.3156962615445209 + }, + { + "epoch": 4.79155672823219, + "grad_norm": 0.23278217159996728, + "learning_rate": 0.00018098592977993646, + "loss": 2.8557162284851074, + "step": 8174, + "token_acc": 0.3161999328239147 + }, + { + "epoch": 4.792143066549399, + "grad_norm": 0.3014441922280524, + "learning_rate": 0.00018098024378784087, + "loss": 2.84934663772583, + "step": 8175, + "token_acc": 0.3183370646534398 + }, + { + "epoch": 4.792729404866608, + "grad_norm": 0.25092992170602385, + "learning_rate": 0.000180974557035045, + "loss": 2.8727641105651855, + "step": 8176, + "token_acc": 0.3164565504603349 + }, + { + "epoch": 4.793315743183817, + "grad_norm": 0.2707209433815732, + "learning_rate": 0.00018096886952160226, + "loss": 2.8431830406188965, + "step": 8177, + "token_acc": 0.31908322624378727 + }, + { + "epoch": 4.7939020815010265, + "grad_norm": 0.24538217576471189, + "learning_rate": 0.00018096318124756613, + "loss": 2.8503355979919434, + "step": 8178, + "token_acc": 0.3182705029008561 + }, + { + "epoch": 4.794488419818235, + "grad_norm": 0.21997841638318055, + "learning_rate": 0.00018095749221299, + "loss": 2.8617210388183594, + "step": 8179, + "token_acc": 0.31850876194502714 + }, + { + "epoch": 4.795074758135444, + "grad_norm": 0.2965800898239356, + "learning_rate": 0.00018095180241792732, + "loss": 2.832221031188965, + "step": 8180, + "token_acc": 0.319807231373349 + }, + { + "epoch": 4.795661096452653, + "grad_norm": 0.23788493450420323, + "learning_rate": 0.0001809461118624315, + "loss": 2.9093899726867676, + "step": 8181, + "token_acc": 0.31061972680304367 + }, + { + "epoch": 4.796247434769862, + "grad_norm": 0.25070970081028615, + "learning_rate": 0.0001809404205465561, + "loss": 2.9116873741149902, + "step": 8182, + "token_acc": 0.31046585916692737 + }, + { + "epoch": 4.796833773087071, + "grad_norm": 0.28191506577997527, + "learning_rate": 0.00018093472847035449, + "loss": 2.9098711013793945, + "step": 8183, + "token_acc": 0.3081247404789029 + }, + { + "epoch": 4.79742011140428, + "grad_norm": 0.22509699417118398, + "learning_rate": 0.00018092903563388015, + "loss": 2.8509271144866943, + "step": 8184, + "token_acc": 0.31772347052512057 + }, + { + "epoch": 4.798006449721489, + "grad_norm": 0.29864058113816083, + "learning_rate": 0.00018092334203718662, + "loss": 2.8774447441101074, + "step": 8185, + "token_acc": 0.3149316025096222 + }, + { + "epoch": 4.7985927880386985, + "grad_norm": 0.2620555822535803, + "learning_rate": 0.0001809176476803273, + "loss": 2.8750672340393066, + "step": 8186, + "token_acc": 0.3139500111460899 + }, + { + "epoch": 4.799179126355908, + "grad_norm": 0.24996631011354264, + "learning_rate": 0.0001809119525633557, + "loss": 2.8366427421569824, + "step": 8187, + "token_acc": 0.3221414841258106 + }, + { + "epoch": 4.799765464673117, + "grad_norm": 0.30094249500738574, + "learning_rate": 0.00018090625668632537, + "loss": 2.855557441711426, + "step": 8188, + "token_acc": 0.31678514668893204 + }, + { + "epoch": 4.800351802990326, + "grad_norm": 0.2514099446714227, + "learning_rate": 0.00018090056004928977, + "loss": 2.8482189178466797, + "step": 8189, + "token_acc": 0.3199554933239986 + }, + { + "epoch": 4.800938141307535, + "grad_norm": 0.2526603848329596, + "learning_rate": 0.00018089486265230245, + "loss": 2.820018768310547, + "step": 8190, + "token_acc": 0.3227026264853294 + }, + { + "epoch": 4.801524479624743, + "grad_norm": 0.24290049582219164, + "learning_rate": 0.00018088916449541688, + "loss": 2.8114962577819824, + "step": 8191, + "token_acc": 0.323745375355197 + }, + { + "epoch": 4.802110817941952, + "grad_norm": 0.28825989497942844, + "learning_rate": 0.0001808834655786866, + "loss": 2.848357677459717, + "step": 8192, + "token_acc": 0.3189894573785611 + }, + { + "epoch": 4.802697156259161, + "grad_norm": 0.24082759248248137, + "learning_rate": 0.00018087776590216517, + "loss": 2.8723549842834473, + "step": 8193, + "token_acc": 0.3159647698299001 + }, + { + "epoch": 4.8032834945763705, + "grad_norm": 0.2668897304549686, + "learning_rate": 0.0001808720654659061, + "loss": 2.8832757472991943, + "step": 8194, + "token_acc": 0.3136895527165622 + }, + { + "epoch": 4.80386983289358, + "grad_norm": 0.219158410487806, + "learning_rate": 0.00018086636426996292, + "loss": 2.8898842334747314, + "step": 8195, + "token_acc": 0.31334483266914753 + }, + { + "epoch": 4.804456171210789, + "grad_norm": 0.2520977482353172, + "learning_rate": 0.00018086066231438929, + "loss": 2.867830514907837, + "step": 8196, + "token_acc": 0.3176000827445046 + }, + { + "epoch": 4.805042509527998, + "grad_norm": 0.2260520245413571, + "learning_rate": 0.00018085495959923863, + "loss": 2.820164203643799, + "step": 8197, + "token_acc": 0.32319370780107615 + }, + { + "epoch": 4.805628847845207, + "grad_norm": 0.24143640447731016, + "learning_rate": 0.00018084925612456463, + "loss": 2.8762216567993164, + "step": 8198, + "token_acc": 0.3132154620261992 + }, + { + "epoch": 4.806215186162416, + "grad_norm": 0.23377367250215608, + "learning_rate": 0.0001808435518904208, + "loss": 2.85361385345459, + "step": 8199, + "token_acc": 0.3176249882750211 + }, + { + "epoch": 4.806801524479624, + "grad_norm": 0.25714854874495435, + "learning_rate": 0.0001808378468968607, + "loss": 2.8985462188720703, + "step": 8200, + "token_acc": 0.3124442888068254 + }, + { + "epoch": 4.807387862796833, + "grad_norm": 0.25446293412333043, + "learning_rate": 0.000180832141143938, + "loss": 2.847727060317993, + "step": 8201, + "token_acc": 0.3188911693533571 + }, + { + "epoch": 4.8079742011140425, + "grad_norm": 0.22141394356354235, + "learning_rate": 0.00018082643463170626, + "loss": 2.8736815452575684, + "step": 8202, + "token_acc": 0.3144927806439756 + }, + { + "epoch": 4.808560539431252, + "grad_norm": 0.2388494738863037, + "learning_rate": 0.00018082072736021906, + "loss": 2.850316286087036, + "step": 8203, + "token_acc": 0.31842846313551926 + }, + { + "epoch": 4.809146877748461, + "grad_norm": 0.2416443549327943, + "learning_rate": 0.00018081501932953005, + "loss": 2.8625502586364746, + "step": 8204, + "token_acc": 0.3155276396614976 + }, + { + "epoch": 4.80973321606567, + "grad_norm": 0.22852947195773407, + "learning_rate": 0.00018080931053969282, + "loss": 2.8500723838806152, + "step": 8205, + "token_acc": 0.32011872666943175 + }, + { + "epoch": 4.810319554382879, + "grad_norm": 0.2640730853027657, + "learning_rate": 0.00018080360099076098, + "loss": 2.890326499938965, + "step": 8206, + "token_acc": 0.31345769654637956 + }, + { + "epoch": 4.810905892700088, + "grad_norm": 0.23338253996819766, + "learning_rate": 0.00018079789068278825, + "loss": 2.8558266162872314, + "step": 8207, + "token_acc": 0.31727379235516784 + }, + { + "epoch": 4.811492231017297, + "grad_norm": 0.26815192656086667, + "learning_rate": 0.0001807921796158282, + "loss": 2.851827621459961, + "step": 8208, + "token_acc": 0.3200605617410471 + }, + { + "epoch": 4.812078569334506, + "grad_norm": 0.2680973211187716, + "learning_rate": 0.00018078646778993447, + "loss": 2.8473615646362305, + "step": 8209, + "token_acc": 0.32104972505829915 + }, + { + "epoch": 4.812664907651715, + "grad_norm": 0.22936740359740326, + "learning_rate": 0.00018078075520516076, + "loss": 2.8656771183013916, + "step": 8210, + "token_acc": 0.315752506554218 + }, + { + "epoch": 4.8132512459689245, + "grad_norm": 0.2562788778282395, + "learning_rate": 0.0001807750418615607, + "loss": 2.872614860534668, + "step": 8211, + "token_acc": 0.31601683014809906 + }, + { + "epoch": 4.813837584286133, + "grad_norm": 0.23169224743021596, + "learning_rate": 0.00018076932775918796, + "loss": 2.8568992614746094, + "step": 8212, + "token_acc": 0.3179971944810837 + }, + { + "epoch": 4.814423922603342, + "grad_norm": 0.26156465953851854, + "learning_rate": 0.0001807636128980962, + "loss": 2.8486838340759277, + "step": 8213, + "token_acc": 0.3193243499321183 + }, + { + "epoch": 4.815010260920551, + "grad_norm": 0.24189732538383077, + "learning_rate": 0.00018075789727833917, + "loss": 2.8999156951904297, + "step": 8214, + "token_acc": 0.31061379097093383 + }, + { + "epoch": 4.81559659923776, + "grad_norm": 0.23957126575589463, + "learning_rate": 0.0001807521808999705, + "loss": 2.8752529621124268, + "step": 8215, + "token_acc": 0.3129721328658691 + }, + { + "epoch": 4.816182937554969, + "grad_norm": 0.26684385286455187, + "learning_rate": 0.0001807464637630439, + "loss": 2.8746471405029297, + "step": 8216, + "token_acc": 0.3151582578607831 + }, + { + "epoch": 4.816769275872178, + "grad_norm": 0.25725339910232115, + "learning_rate": 0.0001807407458676131, + "loss": 2.8593287467956543, + "step": 8217, + "token_acc": 0.31664378923961756 + }, + { + "epoch": 4.817355614189387, + "grad_norm": 0.27782955888054767, + "learning_rate": 0.00018073502721373177, + "loss": 2.883793830871582, + "step": 8218, + "token_acc": 0.31404211903062734 + }, + { + "epoch": 4.8179419525065965, + "grad_norm": 0.28777892804597327, + "learning_rate": 0.00018072930780145367, + "loss": 2.8663835525512695, + "step": 8219, + "token_acc": 0.3161315015619651 + }, + { + "epoch": 4.818528290823806, + "grad_norm": 0.3072605752691343, + "learning_rate": 0.00018072358763083251, + "loss": 2.862708568572998, + "step": 8220, + "token_acc": 0.31613593542834545 + }, + { + "epoch": 4.819114629141015, + "grad_norm": 0.25538760818729805, + "learning_rate": 0.00018071786670192198, + "loss": 2.8655238151550293, + "step": 8221, + "token_acc": 0.3167420582281144 + }, + { + "epoch": 4.819700967458223, + "grad_norm": 0.24224807924943095, + "learning_rate": 0.0001807121450147759, + "loss": 2.8618836402893066, + "step": 8222, + "token_acc": 0.3178090818952842 + }, + { + "epoch": 4.820287305775432, + "grad_norm": 0.2362867878307459, + "learning_rate": 0.00018070642256944795, + "loss": 2.843153715133667, + "step": 8223, + "token_acc": 0.3198684755187953 + }, + { + "epoch": 4.820873644092641, + "grad_norm": 0.22960889386403519, + "learning_rate": 0.00018070069936599196, + "loss": 2.8467721939086914, + "step": 8224, + "token_acc": 0.3194806081991808 + }, + { + "epoch": 4.82145998240985, + "grad_norm": 0.22110122169758384, + "learning_rate": 0.00018069497540446162, + "loss": 2.8222007751464844, + "step": 8225, + "token_acc": 0.3232364435438241 + }, + { + "epoch": 4.822046320727059, + "grad_norm": 0.2540249124375196, + "learning_rate": 0.00018068925068491073, + "loss": 2.8849048614501953, + "step": 8226, + "token_acc": 0.31421429030255016 + }, + { + "epoch": 4.8226326590442685, + "grad_norm": 0.2639050299425887, + "learning_rate": 0.00018068352520739304, + "loss": 2.9070823192596436, + "step": 8227, + "token_acc": 0.3109053157862116 + }, + { + "epoch": 4.823218997361478, + "grad_norm": 0.23078505436481792, + "learning_rate": 0.00018067779897196237, + "loss": 2.877955436706543, + "step": 8228, + "token_acc": 0.31353088852912886 + }, + { + "epoch": 4.823805335678687, + "grad_norm": 0.23496549610824452, + "learning_rate": 0.00018067207197867247, + "loss": 2.8433361053466797, + "step": 8229, + "token_acc": 0.3188719554139962 + }, + { + "epoch": 4.824391673995896, + "grad_norm": 0.25530349951740045, + "learning_rate": 0.0001806663442275772, + "loss": 2.8697896003723145, + "step": 8230, + "token_acc": 0.31594890132669984 + }, + { + "epoch": 4.824978012313105, + "grad_norm": 0.2694179582986932, + "learning_rate": 0.0001806606157187303, + "loss": 2.8228821754455566, + "step": 8231, + "token_acc": 0.32266536402935136 + }, + { + "epoch": 4.825564350630314, + "grad_norm": 0.2251553767729195, + "learning_rate": 0.0001806548864521856, + "loss": 2.8572213649749756, + "step": 8232, + "token_acc": 0.3171362264783538 + }, + { + "epoch": 4.826150688947523, + "grad_norm": 0.23551163610864667, + "learning_rate": 0.00018064915642799692, + "loss": 2.8482768535614014, + "step": 8233, + "token_acc": 0.3182484141168023 + }, + { + "epoch": 4.826737027264731, + "grad_norm": 0.2216757676955962, + "learning_rate": 0.0001806434256462181, + "loss": 2.842334270477295, + "step": 8234, + "token_acc": 0.31985196382496284 + }, + { + "epoch": 4.8273233655819405, + "grad_norm": 0.2566188644556497, + "learning_rate": 0.000180637694106903, + "loss": 2.879251718521118, + "step": 8235, + "token_acc": 0.3137808665962374 + }, + { + "epoch": 4.82790970389915, + "grad_norm": 0.3068648597098306, + "learning_rate": 0.00018063196181010538, + "loss": 2.8713197708129883, + "step": 8236, + "token_acc": 0.31493075513376 + }, + { + "epoch": 4.828496042216359, + "grad_norm": 0.28655843664163927, + "learning_rate": 0.00018062622875587915, + "loss": 2.8696413040161133, + "step": 8237, + "token_acc": 0.314733687501178 + }, + { + "epoch": 4.829082380533568, + "grad_norm": 0.20986594432373837, + "learning_rate": 0.00018062049494427815, + "loss": 2.869027614593506, + "step": 8238, + "token_acc": 0.3153961251311162 + }, + { + "epoch": 4.829668718850777, + "grad_norm": 0.298237725812062, + "learning_rate": 0.00018061476037535624, + "loss": 2.8465917110443115, + "step": 8239, + "token_acc": 0.320963109641183 + }, + { + "epoch": 4.830255057167986, + "grad_norm": 0.2268527458653415, + "learning_rate": 0.00018060902504916725, + "loss": 2.8734958171844482, + "step": 8240, + "token_acc": 0.31613198634769174 + }, + { + "epoch": 4.830841395485195, + "grad_norm": 0.2638291078469678, + "learning_rate": 0.00018060328896576513, + "loss": 2.8857178688049316, + "step": 8241, + "token_acc": 0.3135397334451528 + }, + { + "epoch": 4.831427733802404, + "grad_norm": 0.2696195367090812, + "learning_rate": 0.00018059755212520372, + "loss": 2.9038708209991455, + "step": 8242, + "token_acc": 0.30950549913615105 + }, + { + "epoch": 4.8320140721196125, + "grad_norm": 0.22436018974786645, + "learning_rate": 0.0001805918145275369, + "loss": 2.861236095428467, + "step": 8243, + "token_acc": 0.3171980968165824 + }, + { + "epoch": 4.832600410436822, + "grad_norm": 0.26679029840985113, + "learning_rate": 0.0001805860761728186, + "loss": 2.88057017326355, + "step": 8244, + "token_acc": 0.3141044921121205 + }, + { + "epoch": 4.833186748754031, + "grad_norm": 0.2226249569287407, + "learning_rate": 0.0001805803370611027, + "loss": 2.8849852085113525, + "step": 8245, + "token_acc": 0.3133791653015715 + }, + { + "epoch": 4.83377308707124, + "grad_norm": 0.2586694075073, + "learning_rate": 0.00018057459719244312, + "loss": 2.864267587661743, + "step": 8246, + "token_acc": 0.31807542262678806 + }, + { + "epoch": 4.834359425388449, + "grad_norm": 0.24423887136021102, + "learning_rate": 0.00018056885656689376, + "loss": 2.859689474105835, + "step": 8247, + "token_acc": 0.31816064309239034 + }, + { + "epoch": 4.834945763705658, + "grad_norm": 0.25830508285499704, + "learning_rate": 0.00018056311518450854, + "loss": 2.878535747528076, + "step": 8248, + "token_acc": 0.3135531759489391 + }, + { + "epoch": 4.835532102022867, + "grad_norm": 0.24118887532754674, + "learning_rate": 0.00018055737304534147, + "loss": 2.830040216445923, + "step": 8249, + "token_acc": 0.3210832161678282 + }, + { + "epoch": 4.836118440340076, + "grad_norm": 0.23108683381840714, + "learning_rate": 0.0001805516301494464, + "loss": 2.8736555576324463, + "step": 8250, + "token_acc": 0.3146463699258148 + }, + { + "epoch": 4.836704778657285, + "grad_norm": 0.2532612232141292, + "learning_rate": 0.0001805458864968773, + "loss": 2.892242431640625, + "step": 8251, + "token_acc": 0.3121503277524947 + }, + { + "epoch": 4.8372911169744945, + "grad_norm": 0.25013810059464525, + "learning_rate": 0.00018054014208768813, + "loss": 2.8634836673736572, + "step": 8252, + "token_acc": 0.31592235956090314 + }, + { + "epoch": 4.837877455291704, + "grad_norm": 0.2579046103567225, + "learning_rate": 0.0001805343969219329, + "loss": 2.83475399017334, + "step": 8253, + "token_acc": 0.32222596803436 + }, + { + "epoch": 4.838463793608913, + "grad_norm": 0.24573011345980175, + "learning_rate": 0.0001805286509996655, + "loss": 2.8755555152893066, + "step": 8254, + "token_acc": 0.31531732968397685 + }, + { + "epoch": 4.839050131926121, + "grad_norm": 0.2503673896450783, + "learning_rate": 0.00018052290432093993, + "loss": 2.8335912227630615, + "step": 8255, + "token_acc": 0.3232205639246548 + }, + { + "epoch": 4.83963647024333, + "grad_norm": 0.23326978696579043, + "learning_rate": 0.0001805171568858102, + "loss": 2.8716824054718018, + "step": 8256, + "token_acc": 0.3158763774722711 + }, + { + "epoch": 4.840222808560539, + "grad_norm": 0.2471043054527452, + "learning_rate": 0.00018051140869433026, + "loss": 2.8203985691070557, + "step": 8257, + "token_acc": 0.3210322026438726 + }, + { + "epoch": 4.840809146877748, + "grad_norm": 0.2425622949435863, + "learning_rate": 0.00018050565974655412, + "loss": 2.84857177734375, + "step": 8258, + "token_acc": 0.318638694353548 + }, + { + "epoch": 4.841395485194957, + "grad_norm": 0.2490927332105346, + "learning_rate": 0.0001804999100425358, + "loss": 2.8525524139404297, + "step": 8259, + "token_acc": 0.3186320091689553 + }, + { + "epoch": 4.8419818235121665, + "grad_norm": 0.23407265553562537, + "learning_rate": 0.00018049415958232932, + "loss": 2.8574249744415283, + "step": 8260, + "token_acc": 0.31782524417409186 + }, + { + "epoch": 4.842568161829376, + "grad_norm": 0.24844312271338953, + "learning_rate": 0.00018048840836598867, + "loss": 2.859438180923462, + "step": 8261, + "token_acc": 0.3174997722569982 + }, + { + "epoch": 4.843154500146585, + "grad_norm": 0.22160060909432833, + "learning_rate": 0.00018048265639356785, + "loss": 2.8656527996063232, + "step": 8262, + "token_acc": 0.3160205681183318 + }, + { + "epoch": 4.843740838463794, + "grad_norm": 0.22148171837567276, + "learning_rate": 0.00018047690366512095, + "loss": 2.854942798614502, + "step": 8263, + "token_acc": 0.31832615105466094 + }, + { + "epoch": 4.844327176781003, + "grad_norm": 0.22503645265857317, + "learning_rate": 0.00018047115018070197, + "loss": 2.8434882164001465, + "step": 8264, + "token_acc": 0.3183701004840454 + }, + { + "epoch": 4.844913515098211, + "grad_norm": 0.24966726732358113, + "learning_rate": 0.000180465395940365, + "loss": 2.876203775405884, + "step": 8265, + "token_acc": 0.3145159541595741 + }, + { + "epoch": 4.84549985341542, + "grad_norm": 0.27436033696318435, + "learning_rate": 0.00018045964094416406, + "loss": 2.8579604625701904, + "step": 8266, + "token_acc": 0.3177842341511406 + }, + { + "epoch": 4.8460861917326294, + "grad_norm": 0.34539052750169497, + "learning_rate": 0.0001804538851921532, + "loss": 2.8718719482421875, + "step": 8267, + "token_acc": 0.31489252667550405 + }, + { + "epoch": 4.846672530049839, + "grad_norm": 0.32237752319390384, + "learning_rate": 0.0001804481286843865, + "loss": 2.8554272651672363, + "step": 8268, + "token_acc": 0.3163815668712613 + }, + { + "epoch": 4.847258868367048, + "grad_norm": 0.24655865713050729, + "learning_rate": 0.00018044237142091803, + "loss": 2.898087501525879, + "step": 8269, + "token_acc": 0.31197622893000226 + }, + { + "epoch": 4.847845206684257, + "grad_norm": 0.3117384981293464, + "learning_rate": 0.0001804366134018019, + "loss": 2.846686601638794, + "step": 8270, + "token_acc": 0.32020691766466675 + }, + { + "epoch": 4.848431545001466, + "grad_norm": 0.3325263281692976, + "learning_rate": 0.00018043085462709216, + "loss": 2.870121479034424, + "step": 8271, + "token_acc": 0.315547755586316 + }, + { + "epoch": 4.849017883318675, + "grad_norm": 0.26219748966507395, + "learning_rate": 0.0001804250950968429, + "loss": 2.9362940788269043, + "step": 8272, + "token_acc": 0.3064223191332133 + }, + { + "epoch": 4.849604221635884, + "grad_norm": 0.26897445707930684, + "learning_rate": 0.00018041933481110828, + "loss": 2.8543896675109863, + "step": 8273, + "token_acc": 0.31782677425687333 + }, + { + "epoch": 4.850190559953093, + "grad_norm": 0.2553256807560734, + "learning_rate": 0.0001804135737699424, + "loss": 2.8379414081573486, + "step": 8274, + "token_acc": 0.3211551675253453 + }, + { + "epoch": 4.850776898270302, + "grad_norm": 0.3114082036782669, + "learning_rate": 0.00018040781197339932, + "loss": 2.8789148330688477, + "step": 8275, + "token_acc": 0.3135999916265877 + }, + { + "epoch": 4.8513632365875115, + "grad_norm": 0.2730560220550665, + "learning_rate": 0.0001804020494215332, + "loss": 2.8654305934906006, + "step": 8276, + "token_acc": 0.3151163156404961 + }, + { + "epoch": 4.85194957490472, + "grad_norm": 0.2470158961694744, + "learning_rate": 0.00018039628611439814, + "loss": 2.8697290420532227, + "step": 8277, + "token_acc": 0.3130341601108178 + }, + { + "epoch": 4.852535913221929, + "grad_norm": 0.2565056776298764, + "learning_rate": 0.00018039052205204837, + "loss": 2.837745189666748, + "step": 8278, + "token_acc": 0.32126750687596867 + }, + { + "epoch": 4.853122251539138, + "grad_norm": 0.21918859869281512, + "learning_rate": 0.00018038475723453792, + "loss": 2.843376636505127, + "step": 8279, + "token_acc": 0.3194130065729334 + }, + { + "epoch": 4.853708589856347, + "grad_norm": 0.24356065018212725, + "learning_rate": 0.000180378991661921, + "loss": 2.8585290908813477, + "step": 8280, + "token_acc": 0.31841282409689176 + }, + { + "epoch": 4.854294928173556, + "grad_norm": 0.2367040909788465, + "learning_rate": 0.00018037322533425178, + "loss": 2.901815414428711, + "step": 8281, + "token_acc": 0.31010659551705916 + }, + { + "epoch": 4.854881266490765, + "grad_norm": 0.2623951539818655, + "learning_rate": 0.0001803674582515844, + "loss": 2.885009527206421, + "step": 8282, + "token_acc": 0.31317041973990556 + }, + { + "epoch": 4.855467604807974, + "grad_norm": 0.24151255188138437, + "learning_rate": 0.0001803616904139731, + "loss": 2.8508739471435547, + "step": 8283, + "token_acc": 0.31892035402882596 + }, + { + "epoch": 4.8560539431251835, + "grad_norm": 0.24142413826738005, + "learning_rate": 0.00018035592182147193, + "loss": 2.8432466983795166, + "step": 8284, + "token_acc": 0.32071092316911376 + }, + { + "epoch": 4.856640281442393, + "grad_norm": 0.25038561087397676, + "learning_rate": 0.0001803501524741352, + "loss": 2.845156669616699, + "step": 8285, + "token_acc": 0.3183360282386619 + }, + { + "epoch": 4.857226619759601, + "grad_norm": 0.2688844123631264, + "learning_rate": 0.00018034438237201706, + "loss": 2.9217746257781982, + "step": 8286, + "token_acc": 0.3069184300314486 + }, + { + "epoch": 4.85781295807681, + "grad_norm": 0.26061201656189176, + "learning_rate": 0.0001803386115151717, + "loss": 2.879915952682495, + "step": 8287, + "token_acc": 0.3138979157554078 + }, + { + "epoch": 4.858399296394019, + "grad_norm": 0.2434241639900745, + "learning_rate": 0.00018033283990365336, + "loss": 2.8732175827026367, + "step": 8288, + "token_acc": 0.3150662841596854 + }, + { + "epoch": 4.858985634711228, + "grad_norm": 0.2636781934330705, + "learning_rate": 0.0001803270675375162, + "loss": 2.8783469200134277, + "step": 8289, + "token_acc": 0.3143435427363254 + }, + { + "epoch": 4.859571973028437, + "grad_norm": 0.2412022578869104, + "learning_rate": 0.00018032129441681455, + "loss": 2.8917031288146973, + "step": 8290, + "token_acc": 0.3122230485575284 + }, + { + "epoch": 4.860158311345646, + "grad_norm": 0.251784973091972, + "learning_rate": 0.0001803155205416025, + "loss": 2.8607735633850098, + "step": 8291, + "token_acc": 0.31629122311909796 + }, + { + "epoch": 4.8607446496628555, + "grad_norm": 0.31277765136428076, + "learning_rate": 0.0001803097459119344, + "loss": 2.8644590377807617, + "step": 8292, + "token_acc": 0.316609756480179 + }, + { + "epoch": 4.861330987980065, + "grad_norm": 0.23496105136535844, + "learning_rate": 0.00018030397052786444, + "loss": 2.8517866134643555, + "step": 8293, + "token_acc": 0.3193561599387224 + }, + { + "epoch": 4.861917326297274, + "grad_norm": 0.2514984031302117, + "learning_rate": 0.00018029819438944688, + "loss": 2.8404793739318848, + "step": 8294, + "token_acc": 0.321150203397199 + }, + { + "epoch": 4.862503664614483, + "grad_norm": 0.2345558256257672, + "learning_rate": 0.00018029241749673602, + "loss": 2.8161051273345947, + "step": 8295, + "token_acc": 0.324529119428893 + }, + { + "epoch": 4.863090002931692, + "grad_norm": 0.24003916718476534, + "learning_rate": 0.00018028663984978607, + "loss": 2.8749234676361084, + "step": 8296, + "token_acc": 0.3157562037722393 + }, + { + "epoch": 4.863676341248901, + "grad_norm": 0.22372755736968827, + "learning_rate": 0.00018028086144865132, + "loss": 2.8817644119262695, + "step": 8297, + "token_acc": 0.31306175947675097 + }, + { + "epoch": 4.86426267956611, + "grad_norm": 0.24911554432421226, + "learning_rate": 0.00018027508229338609, + "loss": 2.8640637397766113, + "step": 8298, + "token_acc": 0.3152502706489206 + }, + { + "epoch": 4.864849017883318, + "grad_norm": 0.2365267375596786, + "learning_rate": 0.0001802693023840446, + "loss": 2.862915515899658, + "step": 8299, + "token_acc": 0.3156112499933655 + }, + { + "epoch": 4.8654353562005275, + "grad_norm": 0.2465057550603483, + "learning_rate": 0.0001802635217206812, + "loss": 2.845059871673584, + "step": 8300, + "token_acc": 0.32128665342970675 + }, + { + "epoch": 4.866021694517737, + "grad_norm": 0.23719947427105276, + "learning_rate": 0.00018025774030335016, + "loss": 2.88051700592041, + "step": 8301, + "token_acc": 0.31389478687591243 + }, + { + "epoch": 4.866608032834946, + "grad_norm": 0.22523795178569206, + "learning_rate": 0.00018025195813210578, + "loss": 2.847609758377075, + "step": 8302, + "token_acc": 0.31831767605746764 + }, + { + "epoch": 4.867194371152155, + "grad_norm": 0.24587668926707795, + "learning_rate": 0.00018024617520700243, + "loss": 2.8776755332946777, + "step": 8303, + "token_acc": 0.31404801879583955 + }, + { + "epoch": 4.867780709469364, + "grad_norm": 0.2318335054382226, + "learning_rate": 0.00018024039152809436, + "loss": 2.867398738861084, + "step": 8304, + "token_acc": 0.3170485157085939 + }, + { + "epoch": 4.868367047786573, + "grad_norm": 0.24106518459400422, + "learning_rate": 0.00018023460709543597, + "loss": 2.8649697303771973, + "step": 8305, + "token_acc": 0.3154857752804883 + }, + { + "epoch": 4.868953386103782, + "grad_norm": 0.2692996784923357, + "learning_rate": 0.00018022882190908156, + "loss": 2.882925510406494, + "step": 8306, + "token_acc": 0.31291861699721385 + }, + { + "epoch": 4.869539724420991, + "grad_norm": 0.24226530823021974, + "learning_rate": 0.00018022303596908548, + "loss": 2.8986101150512695, + "step": 8307, + "token_acc": 0.3103310125558258 + }, + { + "epoch": 4.8701260627381995, + "grad_norm": 0.26374124541814953, + "learning_rate": 0.00018021724927550205, + "loss": 2.8671793937683105, + "step": 8308, + "token_acc": 0.315154242592974 + }, + { + "epoch": 4.870712401055409, + "grad_norm": 0.2436113019715859, + "learning_rate": 0.00018021146182838567, + "loss": 2.839139223098755, + "step": 8309, + "token_acc": 0.3204626156089057 + }, + { + "epoch": 4.871298739372618, + "grad_norm": 0.23228069491711262, + "learning_rate": 0.00018020567362779071, + "loss": 2.8331470489501953, + "step": 8310, + "token_acc": 0.3216310001430174 + }, + { + "epoch": 4.871885077689827, + "grad_norm": 0.2463145013589297, + "learning_rate": 0.0001801998846737715, + "loss": 2.858738422393799, + "step": 8311, + "token_acc": 0.3177758586188375 + }, + { + "epoch": 4.872471416007036, + "grad_norm": 0.23723800238275622, + "learning_rate": 0.00018019409496638247, + "loss": 2.8994221687316895, + "step": 8312, + "token_acc": 0.3099905309345034 + }, + { + "epoch": 4.873057754324245, + "grad_norm": 0.250814793198358, + "learning_rate": 0.00018018830450567793, + "loss": 2.9055447578430176, + "step": 8313, + "token_acc": 0.3108170054316789 + }, + { + "epoch": 4.873644092641454, + "grad_norm": 0.27398209307191984, + "learning_rate": 0.0001801825132917124, + "loss": 2.879744052886963, + "step": 8314, + "token_acc": 0.3138493930217388 + }, + { + "epoch": 4.874230430958663, + "grad_norm": 0.22359184827465498, + "learning_rate": 0.00018017672132454018, + "loss": 2.8784992694854736, + "step": 8315, + "token_acc": 0.3132211215806287 + }, + { + "epoch": 4.874816769275872, + "grad_norm": 0.2762913441351406, + "learning_rate": 0.00018017092860421566, + "loss": 2.8213250637054443, + "step": 8316, + "token_acc": 0.3228874011997838 + }, + { + "epoch": 4.8754031075930815, + "grad_norm": 0.27911287184463435, + "learning_rate": 0.00018016513513079334, + "loss": 2.8378982543945312, + "step": 8317, + "token_acc": 0.3210342899160099 + }, + { + "epoch": 4.875989445910291, + "grad_norm": 0.3467151263096626, + "learning_rate": 0.00018015934090432757, + "loss": 2.879188060760498, + "step": 8318, + "token_acc": 0.31426505794368764 + }, + { + "epoch": 4.8765757842275, + "grad_norm": 0.41651856958443784, + "learning_rate": 0.00018015354592487283, + "loss": 2.894289970397949, + "step": 8319, + "token_acc": 0.31249780912858593 + }, + { + "epoch": 4.877162122544708, + "grad_norm": 0.25184072688833115, + "learning_rate": 0.00018014775019248354, + "loss": 2.9014859199523926, + "step": 8320, + "token_acc": 0.3118942919352773 + }, + { + "epoch": 4.877748460861917, + "grad_norm": 0.4366234877037817, + "learning_rate": 0.00018014195370721412, + "loss": 2.869117021560669, + "step": 8321, + "token_acc": 0.3143454485283689 + }, + { + "epoch": 4.878334799179126, + "grad_norm": 0.3519327963883528, + "learning_rate": 0.00018013615646911903, + "loss": 2.82499361038208, + "step": 8322, + "token_acc": 0.3220378060179883 + }, + { + "epoch": 4.878921137496335, + "grad_norm": 0.33397018448864185, + "learning_rate": 0.00018013035847825278, + "loss": 2.8657753467559814, + "step": 8323, + "token_acc": 0.31717150526355425 + }, + { + "epoch": 4.879507475813544, + "grad_norm": 0.3009519254263361, + "learning_rate": 0.00018012455973466973, + "loss": 2.842866897583008, + "step": 8324, + "token_acc": 0.32038674467318284 + }, + { + "epoch": 4.8800938141307535, + "grad_norm": 0.3443367633797972, + "learning_rate": 0.00018011876023842443, + "loss": 2.871098041534424, + "step": 8325, + "token_acc": 0.31643414811547776 + }, + { + "epoch": 4.880680152447963, + "grad_norm": 0.24497043106893787, + "learning_rate": 0.00018011295998957136, + "loss": 2.9212915897369385, + "step": 8326, + "token_acc": 0.3088777353944991 + }, + { + "epoch": 4.881266490765172, + "grad_norm": 0.316984883994554, + "learning_rate": 0.00018010715898816496, + "loss": 2.8661885261535645, + "step": 8327, + "token_acc": 0.31717082662757545 + }, + { + "epoch": 4.881852829082381, + "grad_norm": 0.24788825320846958, + "learning_rate": 0.00018010135723425975, + "loss": 2.8803391456604004, + "step": 8328, + "token_acc": 0.3138902779346055 + }, + { + "epoch": 4.88243916739959, + "grad_norm": 0.2602679117072649, + "learning_rate": 0.00018009555472791024, + "loss": 2.87166690826416, + "step": 8329, + "token_acc": 0.315174309717123 + }, + { + "epoch": 4.883025505716798, + "grad_norm": 0.31036588749503774, + "learning_rate": 0.00018008975146917094, + "loss": 2.848505973815918, + "step": 8330, + "token_acc": 0.31965453121420234 + }, + { + "epoch": 4.883611844034007, + "grad_norm": 0.21788158773523006, + "learning_rate": 0.00018008394745809632, + "loss": 2.8551535606384277, + "step": 8331, + "token_acc": 0.31942335256912363 + }, + { + "epoch": 4.884198182351216, + "grad_norm": 0.26274104727704695, + "learning_rate": 0.00018007814269474092, + "loss": 2.8808937072753906, + "step": 8332, + "token_acc": 0.31517946944933367 + }, + { + "epoch": 4.8847845206684255, + "grad_norm": 0.25641079607513045, + "learning_rate": 0.00018007233717915927, + "loss": 2.819014549255371, + "step": 8333, + "token_acc": 0.3236232315436507 + }, + { + "epoch": 4.885370858985635, + "grad_norm": 0.23555712937131118, + "learning_rate": 0.00018006653091140595, + "loss": 2.8311595916748047, + "step": 8334, + "token_acc": 0.3205550882216173 + }, + { + "epoch": 4.885957197302844, + "grad_norm": 0.29136000202512463, + "learning_rate": 0.00018006072389153545, + "loss": 2.8847479820251465, + "step": 8335, + "token_acc": 0.31392240313407 + }, + { + "epoch": 4.886543535620053, + "grad_norm": 0.23515596007198872, + "learning_rate": 0.00018005491611960232, + "loss": 2.8684260845184326, + "step": 8336, + "token_acc": 0.31563893266020926 + }, + { + "epoch": 4.887129873937262, + "grad_norm": 0.23892637852319623, + "learning_rate": 0.00018004910759566114, + "loss": 2.8785219192504883, + "step": 8337, + "token_acc": 0.31416866447779945 + }, + { + "epoch": 4.887716212254471, + "grad_norm": 0.24738364982189454, + "learning_rate": 0.00018004329831976644, + "loss": 2.8637099266052246, + "step": 8338, + "token_acc": 0.3154155394301029 + }, + { + "epoch": 4.88830255057168, + "grad_norm": 0.2241361403139087, + "learning_rate": 0.00018003748829197285, + "loss": 2.840609073638916, + "step": 8339, + "token_acc": 0.3190341879260843 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 0.24769831876794188, + "learning_rate": 0.00018003167751233487, + "loss": 2.830899238586426, + "step": 8340, + "token_acc": 0.32139984662289806 + }, + { + "epoch": 4.889475227206098, + "grad_norm": 0.23949859887013486, + "learning_rate": 0.00018002586598090713, + "loss": 2.8124194145202637, + "step": 8341, + "token_acc": 0.3246290072423339 + }, + { + "epoch": 4.890061565523307, + "grad_norm": 0.22859410466642532, + "learning_rate": 0.00018002005369774425, + "loss": 2.8836545944213867, + "step": 8342, + "token_acc": 0.31433519146810085 + }, + { + "epoch": 4.890647903840516, + "grad_norm": 0.2505889991202275, + "learning_rate": 0.00018001424066290076, + "loss": 2.8768460750579834, + "step": 8343, + "token_acc": 0.3155217826476321 + }, + { + "epoch": 4.891234242157725, + "grad_norm": 0.22030788690810008, + "learning_rate": 0.0001800084268764313, + "loss": 2.869615077972412, + "step": 8344, + "token_acc": 0.31552486632613047 + }, + { + "epoch": 4.891820580474934, + "grad_norm": 0.24817859520213717, + "learning_rate": 0.0001800026123383905, + "loss": 2.893939971923828, + "step": 8345, + "token_acc": 0.31317855796643645 + }, + { + "epoch": 4.892406918792143, + "grad_norm": 0.2185594769012512, + "learning_rate": 0.00017999679704883297, + "loss": 2.810187816619873, + "step": 8346, + "token_acc": 0.32569696807155407 + }, + { + "epoch": 4.892993257109352, + "grad_norm": 0.24143949083851032, + "learning_rate": 0.0001799909810078133, + "loss": 2.9196364879608154, + "step": 8347, + "token_acc": 0.30799483145836964 + }, + { + "epoch": 4.893579595426561, + "grad_norm": 0.225846336782677, + "learning_rate": 0.00017998516421538615, + "loss": 2.890047073364258, + "step": 8348, + "token_acc": 0.31326497975655865 + }, + { + "epoch": 4.89416593374377, + "grad_norm": 0.23405898896626698, + "learning_rate": 0.0001799793466716062, + "loss": 2.9187352657318115, + "step": 8349, + "token_acc": 0.30899945336809354 + }, + { + "epoch": 4.8947522720609795, + "grad_norm": 0.23345743570027597, + "learning_rate": 0.00017997352837652804, + "loss": 2.862542152404785, + "step": 8350, + "token_acc": 0.3158329101066531 + }, + { + "epoch": 4.895338610378188, + "grad_norm": 0.2288312764465444, + "learning_rate": 0.00017996770933020634, + "loss": 2.874232530593872, + "step": 8351, + "token_acc": 0.3149623414307114 + }, + { + "epoch": 4.895924948695397, + "grad_norm": 0.25099056091449495, + "learning_rate": 0.00017996188953269576, + "loss": 2.8935036659240723, + "step": 8352, + "token_acc": 0.31226081970573205 + }, + { + "epoch": 4.896511287012606, + "grad_norm": 0.22608767067381705, + "learning_rate": 0.00017995606898405103, + "loss": 2.8853354454040527, + "step": 8353, + "token_acc": 0.3130003311316868 + }, + { + "epoch": 4.897097625329815, + "grad_norm": 0.2761988276982564, + "learning_rate": 0.00017995024768432673, + "loss": 2.8745832443237305, + "step": 8354, + "token_acc": 0.3148744664663632 + }, + { + "epoch": 4.897683963647024, + "grad_norm": 0.23446419449030897, + "learning_rate": 0.0001799444256335776, + "loss": 2.8428549766540527, + "step": 8355, + "token_acc": 0.3192100895769309 + }, + { + "epoch": 4.898270301964233, + "grad_norm": 0.26382609452481925, + "learning_rate": 0.0001799386028318583, + "loss": 2.8720126152038574, + "step": 8356, + "token_acc": 0.315113459402122 + }, + { + "epoch": 4.898856640281442, + "grad_norm": 0.23617571610873392, + "learning_rate": 0.00017993277927922356, + "loss": 2.889610528945923, + "step": 8357, + "token_acc": 0.31346922801624144 + }, + { + "epoch": 4.8994429785986515, + "grad_norm": 0.22977684025777068, + "learning_rate": 0.00017992695497572806, + "loss": 2.8615188598632812, + "step": 8358, + "token_acc": 0.3171974488538655 + }, + { + "epoch": 4.900029316915861, + "grad_norm": 0.24129330942038432, + "learning_rate": 0.00017992112992142655, + "loss": 2.806392192840576, + "step": 8359, + "token_acc": 0.3253658904919245 + }, + { + "epoch": 4.90061565523307, + "grad_norm": 0.23626231719995006, + "learning_rate": 0.0001799153041163737, + "loss": 2.8367161750793457, + "step": 8360, + "token_acc": 0.31884134132819025 + }, + { + "epoch": 4.901201993550279, + "grad_norm": 0.2542463727851749, + "learning_rate": 0.00017990947756062423, + "loss": 2.8421549797058105, + "step": 8361, + "token_acc": 0.320737015813615 + }, + { + "epoch": 4.901788331867488, + "grad_norm": 0.23367954446336806, + "learning_rate": 0.00017990365025423292, + "loss": 2.8626556396484375, + "step": 8362, + "token_acc": 0.3170893159541577 + }, + { + "epoch": 4.902374670184696, + "grad_norm": 0.24107456560833737, + "learning_rate": 0.0001798978221972545, + "loss": 2.89695143699646, + "step": 8363, + "token_acc": 0.3130193758162096 + }, + { + "epoch": 4.902961008501905, + "grad_norm": 0.22205478432796627, + "learning_rate": 0.0001798919933897437, + "loss": 2.9388632774353027, + "step": 8364, + "token_acc": 0.3057149181629969 + }, + { + "epoch": 4.903547346819114, + "grad_norm": 0.24583170317791017, + "learning_rate": 0.00017988616383175524, + "loss": 2.882878541946411, + "step": 8365, + "token_acc": 0.3132292641588874 + }, + { + "epoch": 4.9041336851363235, + "grad_norm": 0.22301532725874598, + "learning_rate": 0.00017988033352334397, + "loss": 2.8714468479156494, + "step": 8366, + "token_acc": 0.3151011658205903 + }, + { + "epoch": 4.904720023453533, + "grad_norm": 0.25011571479305245, + "learning_rate": 0.0001798745024645646, + "loss": 2.8726320266723633, + "step": 8367, + "token_acc": 0.31377489705738637 + }, + { + "epoch": 4.905306361770742, + "grad_norm": 0.29320008303632084, + "learning_rate": 0.00017986867065547188, + "loss": 2.8680896759033203, + "step": 8368, + "token_acc": 0.3152449922550408 + }, + { + "epoch": 4.905892700087951, + "grad_norm": 0.31867725285958265, + "learning_rate": 0.00017986283809612064, + "loss": 2.8753085136413574, + "step": 8369, + "token_acc": 0.31594593034543916 + }, + { + "epoch": 4.90647903840516, + "grad_norm": 0.25513645561589104, + "learning_rate": 0.00017985700478656563, + "loss": 2.8706867694854736, + "step": 8370, + "token_acc": 0.31672513385909845 + }, + { + "epoch": 4.907065376722369, + "grad_norm": 0.2539813547600626, + "learning_rate": 0.0001798511707268617, + "loss": 2.8828749656677246, + "step": 8371, + "token_acc": 0.31298156865500365 + }, + { + "epoch": 4.907651715039578, + "grad_norm": 0.23029249829891968, + "learning_rate": 0.00017984533591706363, + "loss": 2.866711139678955, + "step": 8372, + "token_acc": 0.3161076544338378 + }, + { + "epoch": 4.908238053356786, + "grad_norm": 0.26006752445352266, + "learning_rate": 0.0001798395003572262, + "loss": 2.8342995643615723, + "step": 8373, + "token_acc": 0.3208169752339252 + }, + { + "epoch": 4.9088243916739955, + "grad_norm": 0.22817906499243035, + "learning_rate": 0.00017983366404740426, + "loss": 2.8753013610839844, + "step": 8374, + "token_acc": 0.3165643232801957 + }, + { + "epoch": 4.909410729991205, + "grad_norm": 0.23486876239497756, + "learning_rate": 0.0001798278269876526, + "loss": 2.878453254699707, + "step": 8375, + "token_acc": 0.31363540814543356 + }, + { + "epoch": 4.909997068308414, + "grad_norm": 0.2253808898672687, + "learning_rate": 0.0001798219891780261, + "loss": 2.8972384929656982, + "step": 8376, + "token_acc": 0.312015641483405 + }, + { + "epoch": 4.910583406625623, + "grad_norm": 0.2379170768290947, + "learning_rate": 0.00017981615061857955, + "loss": 2.873216152191162, + "step": 8377, + "token_acc": 0.315789062683024 + }, + { + "epoch": 4.911169744942832, + "grad_norm": 0.29709065303173665, + "learning_rate": 0.00017981031130936785, + "loss": 2.8706111907958984, + "step": 8378, + "token_acc": 0.3143550165380375 + }, + { + "epoch": 4.911756083260041, + "grad_norm": 0.29621275029561955, + "learning_rate": 0.00017980447125044583, + "loss": 2.8557510375976562, + "step": 8379, + "token_acc": 0.3173814272458698 + }, + { + "epoch": 4.91234242157725, + "grad_norm": 0.2633445086959949, + "learning_rate": 0.00017979863044186828, + "loss": 2.860452651977539, + "step": 8380, + "token_acc": 0.3178431764336483 + }, + { + "epoch": 4.912928759894459, + "grad_norm": 0.2778937269590066, + "learning_rate": 0.0001797927888836902, + "loss": 2.8566553592681885, + "step": 8381, + "token_acc": 0.3170729862408569 + }, + { + "epoch": 4.913515098211668, + "grad_norm": 0.3038834088678048, + "learning_rate": 0.00017978694657596632, + "loss": 2.866913318634033, + "step": 8382, + "token_acc": 0.3164103019312951 + }, + { + "epoch": 4.9141014365288775, + "grad_norm": 0.21938384011073933, + "learning_rate": 0.00017978110351875163, + "loss": 2.8766119480133057, + "step": 8383, + "token_acc": 0.3162657461365683 + }, + { + "epoch": 4.914687774846087, + "grad_norm": 0.23538344192937924, + "learning_rate": 0.00017977525971210097, + "loss": 2.865816116333008, + "step": 8384, + "token_acc": 0.3175139161755075 + }, + { + "epoch": 4.915274113163295, + "grad_norm": 0.22459320113229408, + "learning_rate": 0.00017976941515606925, + "loss": 2.8219075202941895, + "step": 8385, + "token_acc": 0.3227600541023244 + }, + { + "epoch": 4.915860451480504, + "grad_norm": 0.22151718451969166, + "learning_rate": 0.00017976356985071137, + "loss": 2.8622217178344727, + "step": 8386, + "token_acc": 0.315430179801714 + }, + { + "epoch": 4.916446789797713, + "grad_norm": 0.22233044291478105, + "learning_rate": 0.0001797577237960822, + "loss": 2.915844678878784, + "step": 8387, + "token_acc": 0.3091933477977636 + }, + { + "epoch": 4.917033128114922, + "grad_norm": 0.21736400094775632, + "learning_rate": 0.00017975187699223675, + "loss": 2.8749098777770996, + "step": 8388, + "token_acc": 0.3158443814259652 + }, + { + "epoch": 4.917619466432131, + "grad_norm": 0.23973876173722167, + "learning_rate": 0.00017974602943922983, + "loss": 2.84849214553833, + "step": 8389, + "token_acc": 0.31763861390043624 + }, + { + "epoch": 4.91820580474934, + "grad_norm": 0.22841312207726472, + "learning_rate": 0.00017974018113711644, + "loss": 2.865987777709961, + "step": 8390, + "token_acc": 0.3173935988741524 + }, + { + "epoch": 4.9187921430665495, + "grad_norm": 0.23677695696859857, + "learning_rate": 0.0001797343320859515, + "loss": 2.889751434326172, + "step": 8391, + "token_acc": 0.312426859705264 + }, + { + "epoch": 4.919378481383759, + "grad_norm": 0.22697427161858655, + "learning_rate": 0.00017972848228578996, + "loss": 2.9122982025146484, + "step": 8392, + "token_acc": 0.30895203409335203 + }, + { + "epoch": 4.919964819700968, + "grad_norm": 0.22834015936063518, + "learning_rate": 0.00017972263173668678, + "loss": 2.8649489879608154, + "step": 8393, + "token_acc": 0.31620692809261847 + }, + { + "epoch": 4.920551158018176, + "grad_norm": 0.21463997859439704, + "learning_rate": 0.00017971678043869686, + "loss": 2.8691301345825195, + "step": 8394, + "token_acc": 0.3162274038298106 + }, + { + "epoch": 4.921137496335385, + "grad_norm": 0.2264994949144399, + "learning_rate": 0.00017971092839187526, + "loss": 2.8167340755462646, + "step": 8395, + "token_acc": 0.32415269732146684 + }, + { + "epoch": 4.921723834652594, + "grad_norm": 0.2257552581259529, + "learning_rate": 0.00017970507559627685, + "loss": 2.872645378112793, + "step": 8396, + "token_acc": 0.31694903346508424 + }, + { + "epoch": 4.922310172969803, + "grad_norm": 0.23328961657902544, + "learning_rate": 0.00017969922205195667, + "loss": 2.872405767440796, + "step": 8397, + "token_acc": 0.3151759810855178 + }, + { + "epoch": 4.9228965112870124, + "grad_norm": 0.2192088463765428, + "learning_rate": 0.0001796933677589697, + "loss": 2.832584857940674, + "step": 8398, + "token_acc": 0.3236575967595576 + }, + { + "epoch": 4.923482849604222, + "grad_norm": 0.24260608661816396, + "learning_rate": 0.0001796875127173709, + "loss": 2.893548011779785, + "step": 8399, + "token_acc": 0.31123765388631874 + }, + { + "epoch": 4.924069187921431, + "grad_norm": 0.27769296241341496, + "learning_rate": 0.00017968165692721535, + "loss": 2.824563980102539, + "step": 8400, + "token_acc": 0.321175642063783 + }, + { + "epoch": 4.92465552623864, + "grad_norm": 0.29811134098093284, + "learning_rate": 0.000179675800388558, + "loss": 2.9068756103515625, + "step": 8401, + "token_acc": 0.3093563886026312 + }, + { + "epoch": 4.925241864555849, + "grad_norm": 0.249000364576747, + "learning_rate": 0.00017966994310145384, + "loss": 2.878631114959717, + "step": 8402, + "token_acc": 0.31276594077405906 + }, + { + "epoch": 4.925828202873058, + "grad_norm": 0.2787394495925771, + "learning_rate": 0.00017966408506595792, + "loss": 2.867664337158203, + "step": 8403, + "token_acc": 0.31646316988851914 + }, + { + "epoch": 4.926414541190267, + "grad_norm": 0.30255537886443484, + "learning_rate": 0.0001796582262821253, + "loss": 2.871713161468506, + "step": 8404, + "token_acc": 0.3152598118525957 + }, + { + "epoch": 4.927000879507476, + "grad_norm": 0.24076229661336646, + "learning_rate": 0.00017965236675001096, + "loss": 2.8511435985565186, + "step": 8405, + "token_acc": 0.3198485288556914 + }, + { + "epoch": 4.927587217824685, + "grad_norm": 0.2755282150328066, + "learning_rate": 0.00017964650646967, + "loss": 2.8566789627075195, + "step": 8406, + "token_acc": 0.31816040007601654 + }, + { + "epoch": 4.928173556141894, + "grad_norm": 0.2126960649711986, + "learning_rate": 0.0001796406454411574, + "loss": 2.8522043228149414, + "step": 8407, + "token_acc": 0.3195865765629008 + }, + { + "epoch": 4.928759894459103, + "grad_norm": 0.31397777047049447, + "learning_rate": 0.00017963478366452828, + "loss": 2.8467164039611816, + "step": 8408, + "token_acc": 0.31936727804450593 + }, + { + "epoch": 4.929346232776312, + "grad_norm": 0.31244737987162124, + "learning_rate": 0.00017962892113983765, + "loss": 2.8915839195251465, + "step": 8409, + "token_acc": 0.3129559683344544 + }, + { + "epoch": 4.929932571093521, + "grad_norm": 0.2533921326833803, + "learning_rate": 0.0001796230578671406, + "loss": 2.893911361694336, + "step": 8410, + "token_acc": 0.3120797965016751 + }, + { + "epoch": 4.93051890941073, + "grad_norm": 0.2861462755250293, + "learning_rate": 0.00017961719384649224, + "loss": 2.869845151901245, + "step": 8411, + "token_acc": 0.3168968683968855 + }, + { + "epoch": 4.931105247727939, + "grad_norm": 0.25865957627690706, + "learning_rate": 0.00017961132907794763, + "loss": 2.8767058849334717, + "step": 8412, + "token_acc": 0.31476828809345864 + }, + { + "epoch": 4.931691586045148, + "grad_norm": 0.32565273995163213, + "learning_rate": 0.00017960546356156188, + "loss": 2.857372999191284, + "step": 8413, + "token_acc": 0.3187572705645626 + }, + { + "epoch": 4.932277924362357, + "grad_norm": 0.2552855438475177, + "learning_rate": 0.00017959959729739003, + "loss": 2.849339008331299, + "step": 8414, + "token_acc": 0.31983503128716967 + }, + { + "epoch": 4.9328642626795665, + "grad_norm": 0.2738336538961474, + "learning_rate": 0.00017959373028548722, + "loss": 2.884718179702759, + "step": 8415, + "token_acc": 0.3155745781014763 + }, + { + "epoch": 4.933450600996775, + "grad_norm": 0.24417145570321924, + "learning_rate": 0.0001795878625259086, + "loss": 2.891413927078247, + "step": 8416, + "token_acc": 0.3125787147391112 + }, + { + "epoch": 4.934036939313984, + "grad_norm": 0.24757218137037865, + "learning_rate": 0.00017958199401870925, + "loss": 2.8760080337524414, + "step": 8417, + "token_acc": 0.31498449208280016 + }, + { + "epoch": 4.934623277631193, + "grad_norm": 0.2595627487152961, + "learning_rate": 0.0001795761247639443, + "loss": 2.8534088134765625, + "step": 8418, + "token_acc": 0.3180557606346564 + }, + { + "epoch": 4.935209615948402, + "grad_norm": 0.2619144923431915, + "learning_rate": 0.0001795702547616689, + "loss": 2.832172155380249, + "step": 8419, + "token_acc": 0.3205300226327968 + }, + { + "epoch": 4.935795954265611, + "grad_norm": 0.26102097932421314, + "learning_rate": 0.00017956438401193814, + "loss": 2.8744659423828125, + "step": 8420, + "token_acc": 0.31579796297982493 + }, + { + "epoch": 4.93638229258282, + "grad_norm": 0.22910570857373616, + "learning_rate": 0.00017955851251480725, + "loss": 2.8690361976623535, + "step": 8421, + "token_acc": 0.3152141926262244 + }, + { + "epoch": 4.936968630900029, + "grad_norm": 0.2614473539724336, + "learning_rate": 0.0001795526402703313, + "loss": 2.8879055976867676, + "step": 8422, + "token_acc": 0.3136906731666895 + }, + { + "epoch": 4.9375549692172385, + "grad_norm": 0.23684964577174047, + "learning_rate": 0.00017954676727856552, + "loss": 2.886958122253418, + "step": 8423, + "token_acc": 0.312734063595016 + }, + { + "epoch": 4.938141307534448, + "grad_norm": 0.2490443299057755, + "learning_rate": 0.00017954089353956504, + "loss": 2.8789305686950684, + "step": 8424, + "token_acc": 0.31382580580441544 + }, + { + "epoch": 4.938727645851657, + "grad_norm": 0.2036345704375411, + "learning_rate": 0.00017953501905338507, + "loss": 2.8779430389404297, + "step": 8425, + "token_acc": 0.3141000326904217 + }, + { + "epoch": 4.939313984168866, + "grad_norm": 0.24906080426303961, + "learning_rate": 0.00017952914382008076, + "loss": 2.8541882038116455, + "step": 8426, + "token_acc": 0.31917883152960863 + }, + { + "epoch": 4.939900322486075, + "grad_norm": 0.2364522383358862, + "learning_rate": 0.00017952326783970732, + "loss": 2.869612693786621, + "step": 8427, + "token_acc": 0.31597029226021267 + }, + { + "epoch": 4.940486660803283, + "grad_norm": 0.23946703920374435, + "learning_rate": 0.00017951739111231994, + "loss": 2.8637654781341553, + "step": 8428, + "token_acc": 0.3156103529313132 + }, + { + "epoch": 4.941072999120492, + "grad_norm": 0.23108771649079995, + "learning_rate": 0.00017951151363797377, + "loss": 2.8711001873016357, + "step": 8429, + "token_acc": 0.3157841457716867 + }, + { + "epoch": 4.941659337437701, + "grad_norm": 0.3026029264217738, + "learning_rate": 0.00017950563541672412, + "loss": 2.8523011207580566, + "step": 8430, + "token_acc": 0.31890253244355815 + }, + { + "epoch": 4.9422456757549105, + "grad_norm": 0.2747685719009976, + "learning_rate": 0.00017949975644862616, + "loss": 2.83609676361084, + "step": 8431, + "token_acc": 0.31888526270645706 + }, + { + "epoch": 4.94283201407212, + "grad_norm": 0.23844616839270108, + "learning_rate": 0.0001794938767337351, + "loss": 2.894364356994629, + "step": 8432, + "token_acc": 0.3114953606689112 + }, + { + "epoch": 4.943418352389329, + "grad_norm": 0.3558042910109727, + "learning_rate": 0.0001794879962721062, + "loss": 2.909543991088867, + "step": 8433, + "token_acc": 0.30960725013793244 + }, + { + "epoch": 4.944004690706538, + "grad_norm": 0.2882064391548642, + "learning_rate": 0.0001794821150637947, + "loss": 2.8687398433685303, + "step": 8434, + "token_acc": 0.31630840641947633 + }, + { + "epoch": 4.944591029023747, + "grad_norm": 0.24634742643211946, + "learning_rate": 0.0001794762331088558, + "loss": 2.895592451095581, + "step": 8435, + "token_acc": 0.31194721216172105 + }, + { + "epoch": 4.945177367340956, + "grad_norm": 0.31088095814704864, + "learning_rate": 0.00017947035040734478, + "loss": 2.857819080352783, + "step": 8436, + "token_acc": 0.3178397192953762 + }, + { + "epoch": 4.945763705658165, + "grad_norm": 0.2160652368025357, + "learning_rate": 0.00017946446695931695, + "loss": 2.825084686279297, + "step": 8437, + "token_acc": 0.321468530905637 + }, + { + "epoch": 4.946350043975373, + "grad_norm": 0.2958035164682766, + "learning_rate": 0.00017945858276482749, + "loss": 2.8763270378112793, + "step": 8438, + "token_acc": 0.31381781670879066 + }, + { + "epoch": 4.9469363822925825, + "grad_norm": 0.241168953514544, + "learning_rate": 0.00017945269782393173, + "loss": 2.8931961059570312, + "step": 8439, + "token_acc": 0.31105080738332574 + }, + { + "epoch": 4.947522720609792, + "grad_norm": 0.2526440014816673, + "learning_rate": 0.00017944681213668493, + "loss": 2.8589744567871094, + "step": 8440, + "token_acc": 0.31835809013814836 + }, + { + "epoch": 4.948109058927001, + "grad_norm": 0.27369265265725334, + "learning_rate": 0.00017944092570314243, + "loss": 2.8843092918395996, + "step": 8441, + "token_acc": 0.3135505218982029 + }, + { + "epoch": 4.94869539724421, + "grad_norm": 0.24230636499144806, + "learning_rate": 0.00017943503852335942, + "loss": 2.8540053367614746, + "step": 8442, + "token_acc": 0.31693669449785866 + }, + { + "epoch": 4.949281735561419, + "grad_norm": 0.27081961341012084, + "learning_rate": 0.0001794291505973913, + "loss": 2.8884458541870117, + "step": 8443, + "token_acc": 0.3129586281265616 + }, + { + "epoch": 4.949868073878628, + "grad_norm": 0.22472966445766054, + "learning_rate": 0.00017942326192529334, + "loss": 2.8744518756866455, + "step": 8444, + "token_acc": 0.3157451186693931 + }, + { + "epoch": 4.950454412195837, + "grad_norm": 0.27107364537960604, + "learning_rate": 0.00017941737250712082, + "loss": 2.852600574493408, + "step": 8445, + "token_acc": 0.3170322417046412 + }, + { + "epoch": 4.951040750513046, + "grad_norm": 0.22211175928585705, + "learning_rate": 0.00017941148234292914, + "loss": 2.904472827911377, + "step": 8446, + "token_acc": 0.3110918589831365 + }, + { + "epoch": 4.951627088830255, + "grad_norm": 0.22791913004449532, + "learning_rate": 0.0001794055914327736, + "loss": 2.8633878231048584, + "step": 8447, + "token_acc": 0.3169164226131051 + }, + { + "epoch": 4.9522134271474645, + "grad_norm": 0.21537409102024568, + "learning_rate": 0.00017939969977670951, + "loss": 2.8988394737243652, + "step": 8448, + "token_acc": 0.3098271769590448 + }, + { + "epoch": 4.952799765464674, + "grad_norm": 0.22624407070317581, + "learning_rate": 0.00017939380737479223, + "loss": 2.922788619995117, + "step": 8449, + "token_acc": 0.3086223190323975 + }, + { + "epoch": 4.953386103781882, + "grad_norm": 0.24076528414204681, + "learning_rate": 0.00017938791422707713, + "loss": 2.867844581604004, + "step": 8450, + "token_acc": 0.31615657307135514 + }, + { + "epoch": 4.953972442099091, + "grad_norm": 0.2304792364918695, + "learning_rate": 0.00017938202033361954, + "loss": 2.8729777336120605, + "step": 8451, + "token_acc": 0.3142697275614504 + }, + { + "epoch": 4.9545587804163, + "grad_norm": 0.24467403822596323, + "learning_rate": 0.00017937612569447485, + "loss": 2.8968539237976074, + "step": 8452, + "token_acc": 0.31223510492049905 + }, + { + "epoch": 4.955145118733509, + "grad_norm": 0.22995302914132748, + "learning_rate": 0.0001793702303096984, + "loss": 2.873523712158203, + "step": 8453, + "token_acc": 0.31530301679677053 + }, + { + "epoch": 4.955731457050718, + "grad_norm": 0.2585564071769646, + "learning_rate": 0.00017936433417934563, + "loss": 2.8777623176574707, + "step": 8454, + "token_acc": 0.31463056509732606 + }, + { + "epoch": 4.956317795367927, + "grad_norm": 0.26014784343606256, + "learning_rate": 0.00017935843730347185, + "loss": 2.876589059829712, + "step": 8455, + "token_acc": 0.3132975738355324 + }, + { + "epoch": 4.9569041336851365, + "grad_norm": 0.23828140401566972, + "learning_rate": 0.00017935253968213245, + "loss": 2.8952813148498535, + "step": 8456, + "token_acc": 0.30933137023347596 + }, + { + "epoch": 4.957490472002346, + "grad_norm": 0.24482416436640467, + "learning_rate": 0.00017934664131538295, + "loss": 2.831240653991699, + "step": 8457, + "token_acc": 0.32196208624705663 + }, + { + "epoch": 4.958076810319555, + "grad_norm": 0.24906862139323488, + "learning_rate": 0.00017934074220327863, + "loss": 2.854583263397217, + "step": 8458, + "token_acc": 0.3174457628527493 + }, + { + "epoch": 4.958663148636763, + "grad_norm": 0.2149633932163617, + "learning_rate": 0.00017933484234587495, + "loss": 2.86299991607666, + "step": 8459, + "token_acc": 0.3183635436079501 + }, + { + "epoch": 4.959249486953972, + "grad_norm": 0.24466946577680426, + "learning_rate": 0.00017932894174322735, + "loss": 2.8612112998962402, + "step": 8460, + "token_acc": 0.31658846924874273 + }, + { + "epoch": 4.959835825271181, + "grad_norm": 0.2220752400546345, + "learning_rate": 0.00017932304039539122, + "loss": 2.8488755226135254, + "step": 8461, + "token_acc": 0.31957833941288866 + }, + { + "epoch": 4.96042216358839, + "grad_norm": 0.24354049725874863, + "learning_rate": 0.000179317138302422, + "loss": 2.86130428314209, + "step": 8462, + "token_acc": 0.3160565163239888 + }, + { + "epoch": 4.961008501905599, + "grad_norm": 0.220288200553323, + "learning_rate": 0.0001793112354643752, + "loss": 2.8464696407318115, + "step": 8463, + "token_acc": 0.31817335588066464 + }, + { + "epoch": 4.9615948402228085, + "grad_norm": 0.23742862660385866, + "learning_rate": 0.00017930533188130617, + "loss": 2.9064717292785645, + "step": 8464, + "token_acc": 0.3126001662420071 + }, + { + "epoch": 4.962181178540018, + "grad_norm": 0.2572505141016793, + "learning_rate": 0.00017929942755327044, + "loss": 2.84448504447937, + "step": 8465, + "token_acc": 0.3185073701447532 + }, + { + "epoch": 4.962767516857227, + "grad_norm": 0.24229069797076158, + "learning_rate": 0.00017929352248032343, + "loss": 2.9053921699523926, + "step": 8466, + "token_acc": 0.30928349844632513 + }, + { + "epoch": 4.963353855174436, + "grad_norm": 0.23707694155670558, + "learning_rate": 0.00017928761666252064, + "loss": 2.869821071624756, + "step": 8467, + "token_acc": 0.3155895990169047 + }, + { + "epoch": 4.963940193491645, + "grad_norm": 0.23993120982386987, + "learning_rate": 0.0001792817100999175, + "loss": 2.8758597373962402, + "step": 8468, + "token_acc": 0.31520305561996226 + }, + { + "epoch": 4.964526531808854, + "grad_norm": 0.21953219632247756, + "learning_rate": 0.00017927580279256954, + "loss": 2.866121768951416, + "step": 8469, + "token_acc": 0.3176150154250998 + }, + { + "epoch": 4.965112870126063, + "grad_norm": 0.21213184587429856, + "learning_rate": 0.00017926989474053223, + "loss": 2.8481290340423584, + "step": 8470, + "token_acc": 0.3193162721716748 + }, + { + "epoch": 4.965699208443271, + "grad_norm": 0.23950240981379395, + "learning_rate": 0.00017926398594386112, + "loss": 2.8559980392456055, + "step": 8471, + "token_acc": 0.3187008151112976 + }, + { + "epoch": 4.9662855467604805, + "grad_norm": 0.25205350140802885, + "learning_rate": 0.00017925807640261162, + "loss": 2.8302392959594727, + "step": 8472, + "token_acc": 0.32067642947722197 + }, + { + "epoch": 4.96687188507769, + "grad_norm": 0.3087549449807468, + "learning_rate": 0.00017925216611683933, + "loss": 2.9145708084106445, + "step": 8473, + "token_acc": 0.30979842020102183 + }, + { + "epoch": 4.967458223394899, + "grad_norm": 0.3097235146306694, + "learning_rate": 0.00017924625508659972, + "loss": 2.836543083190918, + "step": 8474, + "token_acc": 0.32098026693571324 + }, + { + "epoch": 4.968044561712108, + "grad_norm": 0.24463342483970554, + "learning_rate": 0.00017924034331194834, + "loss": 2.8545455932617188, + "step": 8475, + "token_acc": 0.3191926742664922 + }, + { + "epoch": 4.968630900029317, + "grad_norm": 0.2502253549220813, + "learning_rate": 0.00017923443079294072, + "loss": 2.8544154167175293, + "step": 8476, + "token_acc": 0.3188576140766107 + }, + { + "epoch": 4.969217238346526, + "grad_norm": 0.2987540742125401, + "learning_rate": 0.00017922851752963238, + "loss": 2.8679263591766357, + "step": 8477, + "token_acc": 0.3162451161834259 + }, + { + "epoch": 4.969803576663735, + "grad_norm": 0.33210370238558434, + "learning_rate": 0.00017922260352207884, + "loss": 2.8839192390441895, + "step": 8478, + "token_acc": 0.3128733221695333 + }, + { + "epoch": 4.970389914980944, + "grad_norm": 0.2609264166965845, + "learning_rate": 0.00017921668877033574, + "loss": 2.8879973888397217, + "step": 8479, + "token_acc": 0.3138327486581299 + }, + { + "epoch": 4.970976253298153, + "grad_norm": 0.277628001224917, + "learning_rate": 0.00017921077327445859, + "loss": 2.8682703971862793, + "step": 8480, + "token_acc": 0.31493226457873885 + }, + { + "epoch": 4.971562591615362, + "grad_norm": 0.27607377640150704, + "learning_rate": 0.00017920485703450296, + "loss": 2.846822500228882, + "step": 8481, + "token_acc": 0.3198889767548332 + }, + { + "epoch": 4.972148929932571, + "grad_norm": 0.23867597857746908, + "learning_rate": 0.00017919894005052442, + "loss": 2.8481671810150146, + "step": 8482, + "token_acc": 0.3195922858936277 + }, + { + "epoch": 4.97273526824978, + "grad_norm": 0.2557897899228066, + "learning_rate": 0.00017919302232257856, + "loss": 2.879589796066284, + "step": 8483, + "token_acc": 0.31282720475688913 + }, + { + "epoch": 4.973321606566989, + "grad_norm": 0.23800072107441336, + "learning_rate": 0.000179187103850721, + "loss": 2.861189842224121, + "step": 8484, + "token_acc": 0.3163650763050535 + }, + { + "epoch": 4.973907944884198, + "grad_norm": 0.24673257717820166, + "learning_rate": 0.00017918118463500725, + "loss": 2.9156742095947266, + "step": 8485, + "token_acc": 0.30818791742100377 + }, + { + "epoch": 4.974494283201407, + "grad_norm": 0.2389108879643748, + "learning_rate": 0.00017917526467549298, + "loss": 2.903752326965332, + "step": 8486, + "token_acc": 0.31066099278639764 + }, + { + "epoch": 4.975080621518616, + "grad_norm": 0.22200841917961095, + "learning_rate": 0.00017916934397223383, + "loss": 2.8764190673828125, + "step": 8487, + "token_acc": 0.31592107667979585 + }, + { + "epoch": 4.975666959835825, + "grad_norm": 0.24631251858129105, + "learning_rate": 0.00017916342252528535, + "loss": 2.8493995666503906, + "step": 8488, + "token_acc": 0.3204835589941973 + }, + { + "epoch": 4.9762532981530345, + "grad_norm": 0.2417674400421058, + "learning_rate": 0.00017915750033470319, + "loss": 2.840669631958008, + "step": 8489, + "token_acc": 0.3202059784758217 + }, + { + "epoch": 4.976839636470244, + "grad_norm": 0.22371602763045198, + "learning_rate": 0.000179151577400543, + "loss": 2.8537039756774902, + "step": 8490, + "token_acc": 0.3183531075421707 + }, + { + "epoch": 4.977425974787453, + "grad_norm": 0.24517790025225092, + "learning_rate": 0.00017914565372286037, + "loss": 2.8857383728027344, + "step": 8491, + "token_acc": 0.31477932810347925 + }, + { + "epoch": 4.978012313104662, + "grad_norm": 0.2303879716774955, + "learning_rate": 0.00017913972930171096, + "loss": 2.833596706390381, + "step": 8492, + "token_acc": 0.3208731919515907 + }, + { + "epoch": 4.97859865142187, + "grad_norm": 0.21952604240108176, + "learning_rate": 0.00017913380413715047, + "loss": 2.842550754547119, + "step": 8493, + "token_acc": 0.3202233053791323 + }, + { + "epoch": 4.979184989739079, + "grad_norm": 0.2364741868635869, + "learning_rate": 0.00017912787822923454, + "loss": 2.855903387069702, + "step": 8494, + "token_acc": 0.31731397366540776 + }, + { + "epoch": 4.979771328056288, + "grad_norm": 0.22864873124504112, + "learning_rate": 0.0001791219515780188, + "loss": 2.8198933601379395, + "step": 8495, + "token_acc": 0.3221293922170605 + }, + { + "epoch": 4.980357666373497, + "grad_norm": 0.21768791713483765, + "learning_rate": 0.0001791160241835589, + "loss": 2.8832879066467285, + "step": 8496, + "token_acc": 0.31440234389619276 + }, + { + "epoch": 4.9809440046907065, + "grad_norm": 0.2262613718135984, + "learning_rate": 0.0001791100960459106, + "loss": 2.8716835975646973, + "step": 8497, + "token_acc": 0.31583811089801006 + }, + { + "epoch": 4.981530343007916, + "grad_norm": 0.22683619389339266, + "learning_rate": 0.00017910416716512956, + "loss": 2.891462802886963, + "step": 8498, + "token_acc": 0.3127765486725664 + }, + { + "epoch": 4.982116681325125, + "grad_norm": 0.22339521389395867, + "learning_rate": 0.00017909823754127144, + "loss": 2.864078998565674, + "step": 8499, + "token_acc": 0.31737311127806994 + }, + { + "epoch": 4.982703019642334, + "grad_norm": 0.21080402882781651, + "learning_rate": 0.000179092307174392, + "loss": 2.932692527770996, + "step": 8500, + "token_acc": 0.3065689085808342 + }, + { + "epoch": 4.983289357959543, + "grad_norm": 0.22892508220793364, + "learning_rate": 0.00017908637606454687, + "loss": 2.857616901397705, + "step": 8501, + "token_acc": 0.31639023347591605 + }, + { + "epoch": 4.983875696276751, + "grad_norm": 0.2229368963671994, + "learning_rate": 0.00017908044421179183, + "loss": 2.8588857650756836, + "step": 8502, + "token_acc": 0.31685495846735895 + }, + { + "epoch": 4.98446203459396, + "grad_norm": 0.22444567030341322, + "learning_rate": 0.0001790745116161826, + "loss": 2.9332284927368164, + "step": 8503, + "token_acc": 0.3062027184304597 + }, + { + "epoch": 4.985048372911169, + "grad_norm": 0.21454941745025957, + "learning_rate": 0.00017906857827777484, + "loss": 2.924492597579956, + "step": 8504, + "token_acc": 0.30870076843493577 + }, + { + "epoch": 4.9856347112283785, + "grad_norm": 0.22136202269508384, + "learning_rate": 0.00017906264419662436, + "loss": 2.8622660636901855, + "step": 8505, + "token_acc": 0.31729759709019617 + }, + { + "epoch": 4.986221049545588, + "grad_norm": 0.23783258256232354, + "learning_rate": 0.00017905670937278686, + "loss": 2.856257438659668, + "step": 8506, + "token_acc": 0.3188564712107952 + }, + { + "epoch": 4.986807387862797, + "grad_norm": 0.2361640521773867, + "learning_rate": 0.0001790507738063181, + "loss": 2.9143805503845215, + "step": 8507, + "token_acc": 0.3091109254383891 + }, + { + "epoch": 4.987393726180006, + "grad_norm": 0.23269977030368877, + "learning_rate": 0.00017904483749727387, + "loss": 2.8381435871124268, + "step": 8508, + "token_acc": 0.31944902693813504 + }, + { + "epoch": 4.987980064497215, + "grad_norm": 0.22367574763668133, + "learning_rate": 0.0001790389004457099, + "loss": 2.915377378463745, + "step": 8509, + "token_acc": 0.30884221800094314 + }, + { + "epoch": 4.988566402814424, + "grad_norm": 0.2348708422805005, + "learning_rate": 0.00017903296265168198, + "loss": 2.8656444549560547, + "step": 8510, + "token_acc": 0.31724034063736073 + }, + { + "epoch": 4.989152741131633, + "grad_norm": 0.26292205820699793, + "learning_rate": 0.00017902702411524586, + "loss": 2.8722805976867676, + "step": 8511, + "token_acc": 0.31745583734262767 + }, + { + "epoch": 4.989739079448842, + "grad_norm": 0.21981841451273185, + "learning_rate": 0.00017902108483645735, + "loss": 2.853205680847168, + "step": 8512, + "token_acc": 0.3180597554688642 + }, + { + "epoch": 4.990325417766051, + "grad_norm": 0.2410120888170991, + "learning_rate": 0.0001790151448153722, + "loss": 2.8568387031555176, + "step": 8513, + "token_acc": 0.3163631315234814 + }, + { + "epoch": 4.99091175608326, + "grad_norm": 0.22686685412599306, + "learning_rate": 0.00017900920405204625, + "loss": 2.843190908432007, + "step": 8514, + "token_acc": 0.31800718704159325 + }, + { + "epoch": 4.991498094400469, + "grad_norm": 0.26593674023775077, + "learning_rate": 0.0001790032625465353, + "loss": 2.8981356620788574, + "step": 8515, + "token_acc": 0.3115141115009113 + }, + { + "epoch": 4.992084432717678, + "grad_norm": 0.27346868142257325, + "learning_rate": 0.00017899732029889515, + "loss": 2.8425588607788086, + "step": 8516, + "token_acc": 0.3186945150668945 + }, + { + "epoch": 4.992670771034887, + "grad_norm": 0.25475811112702484, + "learning_rate": 0.00017899137730918163, + "loss": 2.87961483001709, + "step": 8517, + "token_acc": 0.3149162699935498 + }, + { + "epoch": 4.993257109352096, + "grad_norm": 0.22113435032695664, + "learning_rate": 0.00017898543357745058, + "loss": 2.889873504638672, + "step": 8518, + "token_acc": 0.31485450657801634 + }, + { + "epoch": 4.993843447669305, + "grad_norm": 0.2862201794274265, + "learning_rate": 0.00017897948910375777, + "loss": 2.9130234718322754, + "step": 8519, + "token_acc": 0.30897796339593564 + }, + { + "epoch": 4.994429785986514, + "grad_norm": 0.38162279243154706, + "learning_rate": 0.00017897354388815914, + "loss": 2.885221242904663, + "step": 8520, + "token_acc": 0.3136089243663102 + }, + { + "epoch": 4.995016124303723, + "grad_norm": 0.30584510964405864, + "learning_rate": 0.00017896759793071046, + "loss": 2.899193525314331, + "step": 8521, + "token_acc": 0.31160074602868354 + }, + { + "epoch": 4.9956024626209325, + "grad_norm": 0.2549578283332126, + "learning_rate": 0.0001789616512314676, + "loss": 2.8359553813934326, + "step": 8522, + "token_acc": 0.32050980883473756 + }, + { + "epoch": 4.996188800938142, + "grad_norm": 0.2853655740064938, + "learning_rate": 0.00017895570379048643, + "loss": 2.8635950088500977, + "step": 8523, + "token_acc": 0.3154783371878066 + }, + { + "epoch": 4.99677513925535, + "grad_norm": 0.24804008976000905, + "learning_rate": 0.00017894975560782284, + "loss": 2.880258083343506, + "step": 8524, + "token_acc": 0.31414378998963766 + }, + { + "epoch": 4.997361477572559, + "grad_norm": 0.2827029743125223, + "learning_rate": 0.00017894380668353265, + "loss": 2.8957061767578125, + "step": 8525, + "token_acc": 0.3110220092605236 + }, + { + "epoch": 4.997947815889768, + "grad_norm": 0.2388138794035119, + "learning_rate": 0.00017893785701767178, + "loss": 2.8899502754211426, + "step": 8526, + "token_acc": 0.313573874509014 + }, + { + "epoch": 4.998534154206977, + "grad_norm": 0.2946185368856333, + "learning_rate": 0.00017893190661029613, + "loss": 2.8956902027130127, + "step": 8527, + "token_acc": 0.31157830606702114 + }, + { + "epoch": 4.999120492524186, + "grad_norm": 0.24657747915649159, + "learning_rate": 0.00017892595546146155, + "loss": 2.89015793800354, + "step": 8528, + "token_acc": 0.31426744274271345 + }, + { + "epoch": 4.999706830841395, + "grad_norm": 0.27516565593646075, + "learning_rate": 0.000178920003571224, + "loss": 2.901970148086548, + "step": 8529, + "token_acc": 0.31077075614465377 + }, + { + "epoch": 5.0, + "grad_norm": 0.31457004306043534, + "learning_rate": 0.00017891405093963938, + "loss": 2.883554458618164, + "step": 8530, + "token_acc": 0.31363783959302954 + }, + { + "epoch": 5.0, + "eval_loss": 3.110069751739502, + "eval_runtime": 16.7889, + "eval_samples_per_second": 15.248, + "eval_steps_per_second": 1.906, + "eval_token_acc": 0.28490550317586005, + "step": 8530 + }, + { + "epoch": 5.000586338317209, + "grad_norm": 0.5728609097865577, + "learning_rate": 0.00017890809756676354, + "loss": 2.6137020587921143, + "step": 8531, + "token_acc": 0.35633464052287583 + }, + { + "epoch": 5.001172676634418, + "grad_norm": 0.48984419222165093, + "learning_rate": 0.0001789021434526525, + "loss": 2.6457982063293457, + "step": 8532, + "token_acc": 0.3505687640110476 + }, + { + "epoch": 5.001759014951627, + "grad_norm": 0.32993140654176834, + "learning_rate": 0.00017889618859736212, + "loss": 2.6383183002471924, + "step": 8533, + "token_acc": 0.3525478246148637 + }, + { + "epoch": 5.0023453532688364, + "grad_norm": 0.4905807086283444, + "learning_rate": 0.00017889023300094836, + "loss": 2.6150574684143066, + "step": 8534, + "token_acc": 0.3546726494610107 + }, + { + "epoch": 5.002931691586046, + "grad_norm": 0.479577916950639, + "learning_rate": 0.00017888427666346718, + "loss": 2.6390600204467773, + "step": 8535, + "token_acc": 0.3519423687815845 + }, + { + "epoch": 5.003518029903254, + "grad_norm": 0.4011623420757777, + "learning_rate": 0.0001788783195849745, + "loss": 2.5844533443450928, + "step": 8536, + "token_acc": 0.36083323525950334 + }, + { + "epoch": 5.004104368220463, + "grad_norm": 0.3927691188525506, + "learning_rate": 0.0001788723617655263, + "loss": 2.583338499069214, + "step": 8537, + "token_acc": 0.36307227177435214 + }, + { + "epoch": 5.004690706537672, + "grad_norm": 0.3590091098008287, + "learning_rate": 0.00017886640320517855, + "loss": 2.593388795852661, + "step": 8538, + "token_acc": 0.3605190318959975 + }, + { + "epoch": 5.005277044854881, + "grad_norm": 0.32283351775874786, + "learning_rate": 0.00017886044390398725, + "loss": 2.531052589416504, + "step": 8539, + "token_acc": 0.3719200999544699 + }, + { + "epoch": 5.00586338317209, + "grad_norm": 0.3543052571384308, + "learning_rate": 0.0001788544838620083, + "loss": 2.5684967041015625, + "step": 8540, + "token_acc": 0.363936745090672 + }, + { + "epoch": 5.006449721489299, + "grad_norm": 0.34444625581534105, + "learning_rate": 0.00017884852307929774, + "loss": 2.6395487785339355, + "step": 8541, + "token_acc": 0.35311911163804965 + }, + { + "epoch": 5.0070360598065085, + "grad_norm": 0.3173895031108944, + "learning_rate": 0.00017884256155591157, + "loss": 2.5861763954162598, + "step": 8542, + "token_acc": 0.3615610123450149 + }, + { + "epoch": 5.007622398123718, + "grad_norm": 0.3378104142526924, + "learning_rate": 0.00017883659929190574, + "loss": 2.571347713470459, + "step": 8543, + "token_acc": 0.3641806503433498 + }, + { + "epoch": 5.008208736440927, + "grad_norm": 0.2951923512088088, + "learning_rate": 0.00017883063628733634, + "loss": 2.5819907188415527, + "step": 8544, + "token_acc": 0.36197110489288087 + }, + { + "epoch": 5.008795074758136, + "grad_norm": 0.3176423457440417, + "learning_rate": 0.00017882467254225933, + "loss": 2.5647072792053223, + "step": 8545, + "token_acc": 0.364050285877353 + }, + { + "epoch": 5.009381413075345, + "grad_norm": 0.31497685666115965, + "learning_rate": 0.0001788187080567307, + "loss": 2.520672082901001, + "step": 8546, + "token_acc": 0.3703714562139025 + }, + { + "epoch": 5.009967751392553, + "grad_norm": 0.32198843957165213, + "learning_rate": 0.00017881274283080656, + "loss": 2.5758109092712402, + "step": 8547, + "token_acc": 0.3640082017132113 + }, + { + "epoch": 5.010554089709762, + "grad_norm": 0.30913375868976933, + "learning_rate": 0.00017880677686454288, + "loss": 2.5549397468566895, + "step": 8548, + "token_acc": 0.365419957284061 + }, + { + "epoch": 5.011140428026971, + "grad_norm": 0.3625104583515587, + "learning_rate": 0.00017880081015799574, + "loss": 2.539494514465332, + "step": 8549, + "token_acc": 0.36929375006444826 + }, + { + "epoch": 5.0117267663441805, + "grad_norm": 0.32693962659638726, + "learning_rate": 0.00017879484271122117, + "loss": 2.5783021450042725, + "step": 8550, + "token_acc": 0.36188182326225726 + }, + { + "epoch": 5.01231310466139, + "grad_norm": 0.3178441155562443, + "learning_rate": 0.00017878887452427522, + "loss": 2.600250244140625, + "step": 8551, + "token_acc": 0.36005084972567913 + }, + { + "epoch": 5.012899442978599, + "grad_norm": 0.3882108261774225, + "learning_rate": 0.00017878290559721397, + "loss": 2.6201887130737305, + "step": 8552, + "token_acc": 0.3541683938585958 + }, + { + "epoch": 5.013485781295808, + "grad_norm": 0.4052815662621842, + "learning_rate": 0.00017877693593009347, + "loss": 2.5477776527404785, + "step": 8553, + "token_acc": 0.3685016961076129 + }, + { + "epoch": 5.014072119613017, + "grad_norm": 0.3056425027642402, + "learning_rate": 0.00017877096552296981, + "loss": 2.5621085166931152, + "step": 8554, + "token_acc": 0.36622811220787754 + }, + { + "epoch": 5.014658457930226, + "grad_norm": 0.4570553853885685, + "learning_rate": 0.0001787649943758991, + "loss": 2.5541880130767822, + "step": 8555, + "token_acc": 0.36701937786843447 + }, + { + "epoch": 5.015244796247435, + "grad_norm": 0.3727821164379915, + "learning_rate": 0.00017875902248893738, + "loss": 2.574573516845703, + "step": 8556, + "token_acc": 0.3636656700896919 + }, + { + "epoch": 5.015831134564644, + "grad_norm": 0.3657681865774627, + "learning_rate": 0.00017875304986214078, + "loss": 2.542238473892212, + "step": 8557, + "token_acc": 0.36913995063051736 + }, + { + "epoch": 5.0164174728818525, + "grad_norm": 0.39731488657565095, + "learning_rate": 0.0001787470764955654, + "loss": 2.5839080810546875, + "step": 8558, + "token_acc": 0.3638529012748418 + }, + { + "epoch": 5.017003811199062, + "grad_norm": 0.33660857904428493, + "learning_rate": 0.00017874110238926737, + "loss": 2.544034004211426, + "step": 8559, + "token_acc": 0.367727235739279 + }, + { + "epoch": 5.017590149516271, + "grad_norm": 0.41833580572489215, + "learning_rate": 0.00017873512754330279, + "loss": 2.579230785369873, + "step": 8560, + "token_acc": 0.3633607306921452 + }, + { + "epoch": 5.01817648783348, + "grad_norm": 0.3237486028640243, + "learning_rate": 0.00017872915195772773, + "loss": 2.568049430847168, + "step": 8561, + "token_acc": 0.3659882754861951 + }, + { + "epoch": 5.018762826150689, + "grad_norm": 0.42757355794515894, + "learning_rate": 0.0001787231756325984, + "loss": 2.5780301094055176, + "step": 8562, + "token_acc": 0.36309642822354493 + }, + { + "epoch": 5.019349164467898, + "grad_norm": 0.3081906038258494, + "learning_rate": 0.00017871719856797093, + "loss": 2.5651447772979736, + "step": 8563, + "token_acc": 0.36332374183373456 + }, + { + "epoch": 5.019935502785107, + "grad_norm": 0.35387805977751463, + "learning_rate": 0.00017871122076390145, + "loss": 2.54543399810791, + "step": 8564, + "token_acc": 0.3679917701521052 + }, + { + "epoch": 5.020521841102316, + "grad_norm": 0.30383619501387427, + "learning_rate": 0.00017870524222044612, + "loss": 2.562528133392334, + "step": 8565, + "token_acc": 0.36429510454488545 + }, + { + "epoch": 5.021108179419525, + "grad_norm": 0.3420187628771937, + "learning_rate": 0.00017869926293766108, + "loss": 2.5655436515808105, + "step": 8566, + "token_acc": 0.36507346482573905 + }, + { + "epoch": 5.0216945177367345, + "grad_norm": 0.29282344403323773, + "learning_rate": 0.0001786932829156025, + "loss": 2.5570621490478516, + "step": 8567, + "token_acc": 0.3657955982797875 + }, + { + "epoch": 5.022280856053943, + "grad_norm": 0.32013063447416545, + "learning_rate": 0.00017868730215432662, + "loss": 2.598223924636841, + "step": 8568, + "token_acc": 0.3594857061823453 + }, + { + "epoch": 5.022867194371152, + "grad_norm": 0.30598285640345363, + "learning_rate": 0.00017868132065388954, + "loss": 2.520106792449951, + "step": 8569, + "token_acc": 0.3725451929106915 + }, + { + "epoch": 5.023453532688361, + "grad_norm": 0.3073116277605857, + "learning_rate": 0.00017867533841434745, + "loss": 2.5541653633117676, + "step": 8570, + "token_acc": 0.36786315199102637 + }, + { + "epoch": 5.02403987100557, + "grad_norm": 0.29053430852209117, + "learning_rate": 0.0001786693554357566, + "loss": 2.5661065578460693, + "step": 8571, + "token_acc": 0.3645743720601509 + }, + { + "epoch": 5.024626209322779, + "grad_norm": 0.29984573422101274, + "learning_rate": 0.00017866337171817316, + "loss": 2.585926055908203, + "step": 8572, + "token_acc": 0.3621052465450031 + }, + { + "epoch": 5.025212547639988, + "grad_norm": 0.28891046101653295, + "learning_rate": 0.00017865738726165336, + "loss": 2.5682687759399414, + "step": 8573, + "token_acc": 0.36491833424589604 + }, + { + "epoch": 5.025798885957197, + "grad_norm": 0.3021542829649355, + "learning_rate": 0.00017865140206625336, + "loss": 2.53190279006958, + "step": 8574, + "token_acc": 0.3712162155336569 + }, + { + "epoch": 5.0263852242744065, + "grad_norm": 0.312655598616484, + "learning_rate": 0.00017864541613202945, + "loss": 2.587083578109741, + "step": 8575, + "token_acc": 0.3616598341611508 + }, + { + "epoch": 5.026971562591616, + "grad_norm": 0.3079385900072447, + "learning_rate": 0.00017863942945903785, + "loss": 2.570455551147461, + "step": 8576, + "token_acc": 0.364129285413986 + }, + { + "epoch": 5.027557900908825, + "grad_norm": 0.2999212972382153, + "learning_rate": 0.00017863344204733473, + "loss": 2.5599498748779297, + "step": 8577, + "token_acc": 0.3676264649122575 + }, + { + "epoch": 5.028144239226034, + "grad_norm": 0.2942709482203419, + "learning_rate": 0.00017862745389697642, + "loss": 2.5489354133605957, + "step": 8578, + "token_acc": 0.36726496863519614 + }, + { + "epoch": 5.028730577543242, + "grad_norm": 0.3145912647504632, + "learning_rate": 0.0001786214650080191, + "loss": 2.5287702083587646, + "step": 8579, + "token_acc": 0.3714973994552239 + }, + { + "epoch": 5.029316915860451, + "grad_norm": 0.30320402319450795, + "learning_rate": 0.00017861547538051907, + "loss": 2.5303704738616943, + "step": 8580, + "token_acc": 0.3704164391319297 + }, + { + "epoch": 5.02990325417766, + "grad_norm": 0.33014856000935666, + "learning_rate": 0.00017860948501453262, + "loss": 2.4881644248962402, + "step": 8581, + "token_acc": 0.3780443920603298 + }, + { + "epoch": 5.030489592494869, + "grad_norm": 0.31657615073264067, + "learning_rate": 0.00017860349391011596, + "loss": 2.5896806716918945, + "step": 8582, + "token_acc": 0.36262776159341104 + }, + { + "epoch": 5.0310759308120785, + "grad_norm": 0.3184797158939612, + "learning_rate": 0.00017859750206732536, + "loss": 2.5678181648254395, + "step": 8583, + "token_acc": 0.36518014721708064 + }, + { + "epoch": 5.031662269129288, + "grad_norm": 0.31608551573702154, + "learning_rate": 0.00017859150948621716, + "loss": 2.550880193710327, + "step": 8584, + "token_acc": 0.36687682308722147 + }, + { + "epoch": 5.032248607446497, + "grad_norm": 0.3069700936872596, + "learning_rate": 0.00017858551616684767, + "loss": 2.5597996711730957, + "step": 8585, + "token_acc": 0.36549414089328924 + }, + { + "epoch": 5.032834945763706, + "grad_norm": 0.291911346593309, + "learning_rate": 0.0001785795221092731, + "loss": 2.5623950958251953, + "step": 8586, + "token_acc": 0.3652140187207813 + }, + { + "epoch": 5.033421284080915, + "grad_norm": 0.30898162104127186, + "learning_rate": 0.00017857352731354985, + "loss": 2.579223871231079, + "step": 8587, + "token_acc": 0.36094642168509833 + }, + { + "epoch": 5.034007622398124, + "grad_norm": 0.30719860280750216, + "learning_rate": 0.00017856753177973418, + "loss": 2.569281816482544, + "step": 8588, + "token_acc": 0.36469552760325946 + }, + { + "epoch": 5.034593960715333, + "grad_norm": 0.36774744395893666, + "learning_rate": 0.00017856153550788238, + "loss": 2.5600554943084717, + "step": 8589, + "token_acc": 0.3651454555415548 + }, + { + "epoch": 5.035180299032541, + "grad_norm": 0.4711026081836451, + "learning_rate": 0.0001785555384980509, + "loss": 2.5066380500793457, + "step": 8590, + "token_acc": 0.3751918659426043 + }, + { + "epoch": 5.0357666373497505, + "grad_norm": 0.46413117270997145, + "learning_rate": 0.0001785495407502959, + "loss": 2.570627212524414, + "step": 8591, + "token_acc": 0.3641189212980271 + }, + { + "epoch": 5.03635297566696, + "grad_norm": 0.30770741546561603, + "learning_rate": 0.00017854354226467387, + "loss": 2.547548294067383, + "step": 8592, + "token_acc": 0.36684605095343265 + }, + { + "epoch": 5.036939313984169, + "grad_norm": 0.45656417698670165, + "learning_rate": 0.00017853754304124109, + "loss": 2.5522119998931885, + "step": 8593, + "token_acc": 0.3670502440033844 + }, + { + "epoch": 5.037525652301378, + "grad_norm": 0.35576314169194634, + "learning_rate": 0.00017853154308005388, + "loss": 2.5898637771606445, + "step": 8594, + "token_acc": 0.3613680664462577 + }, + { + "epoch": 5.038111990618587, + "grad_norm": 0.35255739567077987, + "learning_rate": 0.0001785255423811687, + "loss": 2.5680136680603027, + "step": 8595, + "token_acc": 0.36405543015573627 + }, + { + "epoch": 5.038698328935796, + "grad_norm": 0.31752485866672336, + "learning_rate": 0.00017851954094464184, + "loss": 2.5736083984375, + "step": 8596, + "token_acc": 0.3617938429137447 + }, + { + "epoch": 5.039284667253005, + "grad_norm": 0.3395376046681421, + "learning_rate": 0.0001785135387705297, + "loss": 2.560234546661377, + "step": 8597, + "token_acc": 0.3658538539607927 + }, + { + "epoch": 5.039871005570214, + "grad_norm": 0.28802426219906957, + "learning_rate": 0.00017850753585888865, + "loss": 2.5121490955352783, + "step": 8598, + "token_acc": 0.3743807047117836 + }, + { + "epoch": 5.040457343887423, + "grad_norm": 0.36636400170871986, + "learning_rate": 0.00017850153220977513, + "loss": 2.5468177795410156, + "step": 8599, + "token_acc": 0.36772802947669475 + }, + { + "epoch": 5.0410436822046325, + "grad_norm": 0.31131145818896805, + "learning_rate": 0.00017849552782324546, + "loss": 2.536012649536133, + "step": 8600, + "token_acc": 0.3699290127170291 + }, + { + "epoch": 5.041630020521841, + "grad_norm": 0.3760346193784607, + "learning_rate": 0.0001784895226993561, + "loss": 2.5302774906158447, + "step": 8601, + "token_acc": 0.37090719553026724 + }, + { + "epoch": 5.04221635883905, + "grad_norm": 0.30104487128374796, + "learning_rate": 0.00017848351683816342, + "loss": 2.52775239944458, + "step": 8602, + "token_acc": 0.3723042518885429 + }, + { + "epoch": 5.042802697156259, + "grad_norm": 0.31366689481289206, + "learning_rate": 0.00017847751023972386, + "loss": 2.5727272033691406, + "step": 8603, + "token_acc": 0.36326769226234484 + }, + { + "epoch": 5.043389035473468, + "grad_norm": 0.2980917869380582, + "learning_rate": 0.00017847150290409384, + "loss": 2.5655388832092285, + "step": 8604, + "token_acc": 0.36521995090592757 + }, + { + "epoch": 5.043975373790677, + "grad_norm": 0.3176571924594715, + "learning_rate": 0.00017846549483132982, + "loss": 2.550281524658203, + "step": 8605, + "token_acc": 0.36648691729400135 + }, + { + "epoch": 5.044561712107886, + "grad_norm": 0.2863134135082776, + "learning_rate": 0.0001784594860214882, + "loss": 2.5486721992492676, + "step": 8606, + "token_acc": 0.36800546518522337 + }, + { + "epoch": 5.045148050425095, + "grad_norm": 0.34134556344303274, + "learning_rate": 0.00017845347647462543, + "loss": 2.544363021850586, + "step": 8607, + "token_acc": 0.3675474307178717 + }, + { + "epoch": 5.0457343887423045, + "grad_norm": 0.29889869901782373, + "learning_rate": 0.00017844746619079794, + "loss": 2.5942368507385254, + "step": 8608, + "token_acc": 0.3584032202211559 + }, + { + "epoch": 5.046320727059514, + "grad_norm": 0.3895689990148726, + "learning_rate": 0.00017844145517006225, + "loss": 2.5189552307128906, + "step": 8609, + "token_acc": 0.3722706538266599 + }, + { + "epoch": 5.046907065376723, + "grad_norm": 0.33741442406630434, + "learning_rate": 0.00017843544341247477, + "loss": 2.5026745796203613, + "step": 8610, + "token_acc": 0.3759538437846822 + }, + { + "epoch": 5.047493403693931, + "grad_norm": 0.3264569704237778, + "learning_rate": 0.00017842943091809198, + "loss": 2.549659013748169, + "step": 8611, + "token_acc": 0.367681353191754 + }, + { + "epoch": 5.04807974201114, + "grad_norm": 0.32571507145597456, + "learning_rate": 0.0001784234176869704, + "loss": 2.5807695388793945, + "step": 8612, + "token_acc": 0.36172193549864046 + }, + { + "epoch": 5.048666080328349, + "grad_norm": 0.3175754922077994, + "learning_rate": 0.0001784174037191665, + "loss": 2.5562191009521484, + "step": 8613, + "token_acc": 0.36770156359722256 + }, + { + "epoch": 5.049252418645558, + "grad_norm": 0.3352716692092544, + "learning_rate": 0.00017841138901473672, + "loss": 2.545306921005249, + "step": 8614, + "token_acc": 0.36915870930364886 + }, + { + "epoch": 5.049838756962767, + "grad_norm": 0.3342022595469454, + "learning_rate": 0.00017840537357373762, + "loss": 2.5136969089508057, + "step": 8615, + "token_acc": 0.373111929260103 + }, + { + "epoch": 5.0504250952799765, + "grad_norm": 0.3698572166437671, + "learning_rate": 0.0001783993573962257, + "loss": 2.524839401245117, + "step": 8616, + "token_acc": 0.37144148097751994 + }, + { + "epoch": 5.051011433597186, + "grad_norm": 0.3496972254293288, + "learning_rate": 0.00017839334048225743, + "loss": 2.544816732406616, + "step": 8617, + "token_acc": 0.3684737768090188 + }, + { + "epoch": 5.051597771914395, + "grad_norm": 0.323435957493382, + "learning_rate": 0.00017838732283188938, + "loss": 2.5452558994293213, + "step": 8618, + "token_acc": 0.36836540278792274 + }, + { + "epoch": 5.052184110231604, + "grad_norm": 0.3186981183050785, + "learning_rate": 0.00017838130444517808, + "loss": 2.5675978660583496, + "step": 8619, + "token_acc": 0.3624714464477968 + }, + { + "epoch": 5.052770448548813, + "grad_norm": 0.338347275583856, + "learning_rate": 0.00017837528532218, + "loss": 2.5589938163757324, + "step": 8620, + "token_acc": 0.3647268723153395 + }, + { + "epoch": 5.053356786866022, + "grad_norm": 0.2964264313375623, + "learning_rate": 0.00017836926546295175, + "loss": 2.507209300994873, + "step": 8621, + "token_acc": 0.3749760574827484 + }, + { + "epoch": 5.05394312518323, + "grad_norm": 0.3702286715287716, + "learning_rate": 0.00017836324486754986, + "loss": 2.546562910079956, + "step": 8622, + "token_acc": 0.3686370070416226 + }, + { + "epoch": 5.054529463500439, + "grad_norm": 0.32738272997183826, + "learning_rate": 0.0001783572235360309, + "loss": 2.550058126449585, + "step": 8623, + "token_acc": 0.3673816680056365 + }, + { + "epoch": 5.0551158018176485, + "grad_norm": 0.3489811917549848, + "learning_rate": 0.0001783512014684514, + "loss": 2.575343370437622, + "step": 8624, + "token_acc": 0.36435302500254474 + }, + { + "epoch": 5.055702140134858, + "grad_norm": 0.3272732798578958, + "learning_rate": 0.0001783451786648679, + "loss": 2.59786057472229, + "step": 8625, + "token_acc": 0.36059966259922793 + }, + { + "epoch": 5.056288478452067, + "grad_norm": 0.3351515724565755, + "learning_rate": 0.00017833915512533704, + "loss": 2.536472797393799, + "step": 8626, + "token_acc": 0.36967911959708966 + }, + { + "epoch": 5.056874816769276, + "grad_norm": 0.3490249113357389, + "learning_rate": 0.00017833313084991543, + "loss": 2.577605724334717, + "step": 8627, + "token_acc": 0.3643636419277674 + }, + { + "epoch": 5.057461155086485, + "grad_norm": 0.31281084362226147, + "learning_rate": 0.00017832710583865955, + "loss": 2.552957773208618, + "step": 8628, + "token_acc": 0.36781288939453743 + }, + { + "epoch": 5.058047493403694, + "grad_norm": 0.38519994362273724, + "learning_rate": 0.0001783210800916261, + "loss": 2.5694937705993652, + "step": 8629, + "token_acc": 0.3632950524154789 + }, + { + "epoch": 5.058633831720903, + "grad_norm": 0.3170205677129762, + "learning_rate": 0.00017831505360887162, + "loss": 2.5812439918518066, + "step": 8630, + "token_acc": 0.36106454892693246 + }, + { + "epoch": 5.059220170038112, + "grad_norm": 0.3221587995706993, + "learning_rate": 0.00017830902639045273, + "loss": 2.587827444076538, + "step": 8631, + "token_acc": 0.36089932258233914 + }, + { + "epoch": 5.059806508355321, + "grad_norm": 0.3127017424375763, + "learning_rate": 0.0001783029984364261, + "loss": 2.5396838188171387, + "step": 8632, + "token_acc": 0.3683606510661558 + }, + { + "epoch": 5.06039284667253, + "grad_norm": 0.3092481145452716, + "learning_rate": 0.00017829696974684827, + "loss": 2.5201659202575684, + "step": 8633, + "token_acc": 0.3729968371112283 + }, + { + "epoch": 5.060979184989739, + "grad_norm": 0.31156518954728046, + "learning_rate": 0.00017829094032177593, + "loss": 2.5538747310638428, + "step": 8634, + "token_acc": 0.3665296579810702 + }, + { + "epoch": 5.061565523306948, + "grad_norm": 0.3265156067724412, + "learning_rate": 0.0001782849101612657, + "loss": 2.5675978660583496, + "step": 8635, + "token_acc": 0.3631898249996081 + }, + { + "epoch": 5.062151861624157, + "grad_norm": 0.3201273192078522, + "learning_rate": 0.00017827887926537424, + "loss": 2.55073881149292, + "step": 8636, + "token_acc": 0.36664392640455706 + }, + { + "epoch": 5.062738199941366, + "grad_norm": 0.3125211010127739, + "learning_rate": 0.0001782728476341582, + "loss": 2.5284056663513184, + "step": 8637, + "token_acc": 0.36952214647906245 + }, + { + "epoch": 5.063324538258575, + "grad_norm": 0.2988053014494796, + "learning_rate": 0.0001782668152676742, + "loss": 2.556939125061035, + "step": 8638, + "token_acc": 0.3656246057301466 + }, + { + "epoch": 5.063910876575784, + "grad_norm": 0.2985445757361666, + "learning_rate": 0.00017826078216597898, + "loss": 2.5389156341552734, + "step": 8639, + "token_acc": 0.3697985923318158 + }, + { + "epoch": 5.064497214892993, + "grad_norm": 0.3145072900062383, + "learning_rate": 0.0001782547483291291, + "loss": 2.5682849884033203, + "step": 8640, + "token_acc": 0.36383810159093655 + }, + { + "epoch": 5.0650835532102025, + "grad_norm": 0.33045525848793245, + "learning_rate": 0.00017824871375718136, + "loss": 2.604620933532715, + "step": 8641, + "token_acc": 0.3586817901801636 + }, + { + "epoch": 5.065669891527412, + "grad_norm": 0.2974922110161493, + "learning_rate": 0.0001782426784501924, + "loss": 2.598081111907959, + "step": 8642, + "token_acc": 0.35895287524857966 + }, + { + "epoch": 5.066256229844621, + "grad_norm": 0.33630052166895597, + "learning_rate": 0.00017823664240821893, + "loss": 2.5367367267608643, + "step": 8643, + "token_acc": 0.369738694642249 + }, + { + "epoch": 5.066842568161829, + "grad_norm": 0.30004110187418354, + "learning_rate": 0.00017823060563131756, + "loss": 2.5762908458709717, + "step": 8644, + "token_acc": 0.3645476514627047 + }, + { + "epoch": 5.067428906479038, + "grad_norm": 0.3910657069692267, + "learning_rate": 0.00017822456811954513, + "loss": 2.5280518531799316, + "step": 8645, + "token_acc": 0.370581977225265 + }, + { + "epoch": 5.068015244796247, + "grad_norm": 0.32994213495225766, + "learning_rate": 0.00017821852987295826, + "loss": 2.533412456512451, + "step": 8646, + "token_acc": 0.36998954478582774 + }, + { + "epoch": 5.068601583113456, + "grad_norm": 0.3347223141051749, + "learning_rate": 0.0001782124908916137, + "loss": 2.6010608673095703, + "step": 8647, + "token_acc": 0.35866107576633893 + }, + { + "epoch": 5.069187921430665, + "grad_norm": 0.38568755970071855, + "learning_rate": 0.0001782064511755682, + "loss": 2.5580105781555176, + "step": 8648, + "token_acc": 0.36583961760377187 + }, + { + "epoch": 5.0697742597478745, + "grad_norm": 0.28649006945836664, + "learning_rate": 0.00017820041072487845, + "loss": 2.530031204223633, + "step": 8649, + "token_acc": 0.3707557090370909 + }, + { + "epoch": 5.070360598065084, + "grad_norm": 0.3972715558454441, + "learning_rate": 0.00017819436953960124, + "loss": 2.551689624786377, + "step": 8650, + "token_acc": 0.36756097747601835 + }, + { + "epoch": 5.070946936382293, + "grad_norm": 0.3177109451791595, + "learning_rate": 0.0001781883276197933, + "loss": 2.5293281078338623, + "step": 8651, + "token_acc": 0.3711145039800849 + }, + { + "epoch": 5.071533274699502, + "grad_norm": 0.3297786870337855, + "learning_rate": 0.00017818228496551135, + "loss": 2.5782697200775146, + "step": 8652, + "token_acc": 0.3618049174760221 + }, + { + "epoch": 5.072119613016711, + "grad_norm": 0.3028362519003831, + "learning_rate": 0.0001781762415768122, + "loss": 2.5293474197387695, + "step": 8653, + "token_acc": 0.3712482075024053 + }, + { + "epoch": 5.07270595133392, + "grad_norm": 0.33525613729127424, + "learning_rate": 0.00017817019745375263, + "loss": 2.538595199584961, + "step": 8654, + "token_acc": 0.3687129390891122 + }, + { + "epoch": 5.073292289651128, + "grad_norm": 0.2934633273275737, + "learning_rate": 0.00017816415259638938, + "loss": 2.625231981277466, + "step": 8655, + "token_acc": 0.3549068410762937 + }, + { + "epoch": 5.073878627968337, + "grad_norm": 0.408788390016289, + "learning_rate": 0.0001781581070047792, + "loss": 2.5804429054260254, + "step": 8656, + "token_acc": 0.36280263362538234 + }, + { + "epoch": 5.0744649662855466, + "grad_norm": 0.37506886856840405, + "learning_rate": 0.00017815206067897898, + "loss": 2.5608997344970703, + "step": 8657, + "token_acc": 0.3667965521313024 + }, + { + "epoch": 5.075051304602756, + "grad_norm": 0.3487901006393067, + "learning_rate": 0.00017814601361904544, + "loss": 2.5515003204345703, + "step": 8658, + "token_acc": 0.3668162192892153 + }, + { + "epoch": 5.075637642919965, + "grad_norm": 0.3611737601651554, + "learning_rate": 0.0001781399658250354, + "loss": 2.596212387084961, + "step": 8659, + "token_acc": 0.3593859509709155 + }, + { + "epoch": 5.076223981237174, + "grad_norm": 0.3177250888477601, + "learning_rate": 0.00017813391729700568, + "loss": 2.595973014831543, + "step": 8660, + "token_acc": 0.3601860020585208 + }, + { + "epoch": 5.076810319554383, + "grad_norm": 0.32189341550380085, + "learning_rate": 0.0001781278680350131, + "loss": 2.5558865070343018, + "step": 8661, + "token_acc": 0.36595420601315753 + }, + { + "epoch": 5.077396657871592, + "grad_norm": 0.30434027519695955, + "learning_rate": 0.00017812181803911447, + "loss": 2.570974349975586, + "step": 8662, + "token_acc": 0.3638679735987597 + }, + { + "epoch": 5.077982996188801, + "grad_norm": 0.3656942903457676, + "learning_rate": 0.00017811576730936664, + "loss": 2.564648151397705, + "step": 8663, + "token_acc": 0.36480222829243114 + }, + { + "epoch": 5.07856933450601, + "grad_norm": 0.2980314602520659, + "learning_rate": 0.00017810971584582643, + "loss": 2.528481960296631, + "step": 8664, + "token_acc": 0.3704219930510654 + }, + { + "epoch": 5.0791556728232194, + "grad_norm": 0.3214652020003212, + "learning_rate": 0.00017810366364855068, + "loss": 2.565074920654297, + "step": 8665, + "token_acc": 0.36418697708257547 + }, + { + "epoch": 5.079742011140428, + "grad_norm": 0.3085935181115178, + "learning_rate": 0.00017809761071759629, + "loss": 2.5683186054229736, + "step": 8666, + "token_acc": 0.36621876838292206 + }, + { + "epoch": 5.080328349457637, + "grad_norm": 0.3585548289825254, + "learning_rate": 0.00017809155705302007, + "loss": 2.55531907081604, + "step": 8667, + "token_acc": 0.3646511154638651 + }, + { + "epoch": 5.080914687774846, + "grad_norm": 0.34396413090760825, + "learning_rate": 0.0001780855026548789, + "loss": 2.5789380073547363, + "step": 8668, + "token_acc": 0.36284301187108964 + }, + { + "epoch": 5.081501026092055, + "grad_norm": 0.3115431730015309, + "learning_rate": 0.00017807944752322964, + "loss": 2.5804362297058105, + "step": 8669, + "token_acc": 0.3636655055243306 + }, + { + "epoch": 5.082087364409264, + "grad_norm": 0.32452169809234144, + "learning_rate": 0.0001780733916581292, + "loss": 2.555271863937378, + "step": 8670, + "token_acc": 0.3665360597526632 + }, + { + "epoch": 5.082673702726473, + "grad_norm": 0.30057334771173677, + "learning_rate": 0.00017806733505963443, + "loss": 2.573197841644287, + "step": 8671, + "token_acc": 0.36405204673202785 + }, + { + "epoch": 5.083260041043682, + "grad_norm": 0.3087657974315778, + "learning_rate": 0.00017806127772780226, + "loss": 2.5896239280700684, + "step": 8672, + "token_acc": 0.3589368227145959 + }, + { + "epoch": 5.0838463793608915, + "grad_norm": 0.29099831044817276, + "learning_rate": 0.00017805521966268958, + "loss": 2.536803722381592, + "step": 8673, + "token_acc": 0.37012067586433034 + }, + { + "epoch": 5.084432717678101, + "grad_norm": 0.33520015533913267, + "learning_rate": 0.0001780491608643533, + "loss": 2.573042392730713, + "step": 8674, + "token_acc": 0.36412847443242097 + }, + { + "epoch": 5.08501905599531, + "grad_norm": 0.3224108045187601, + "learning_rate": 0.0001780431013328503, + "loss": 2.5509583950042725, + "step": 8675, + "token_acc": 0.36753521671929934 + }, + { + "epoch": 5.085605394312518, + "grad_norm": 0.3413275571569484, + "learning_rate": 0.00017803704106823755, + "loss": 2.621654510498047, + "step": 8676, + "token_acc": 0.35722091581062326 + }, + { + "epoch": 5.086191732629727, + "grad_norm": 0.3586229044437792, + "learning_rate": 0.00017803098007057195, + "loss": 2.5988285541534424, + "step": 8677, + "token_acc": 0.36091511131343346 + }, + { + "epoch": 5.086778070946936, + "grad_norm": 0.30801630044444706, + "learning_rate": 0.00017802491833991045, + "loss": 2.528641700744629, + "step": 8678, + "token_acc": 0.36859016142019096 + }, + { + "epoch": 5.087364409264145, + "grad_norm": 0.31741918840181327, + "learning_rate": 0.00017801885587630996, + "loss": 2.580700397491455, + "step": 8679, + "token_acc": 0.36228125132735844 + }, + { + "epoch": 5.087950747581354, + "grad_norm": 0.31135155478888293, + "learning_rate": 0.00017801279267982745, + "loss": 2.5882132053375244, + "step": 8680, + "token_acc": 0.3621735633592015 + }, + { + "epoch": 5.0885370858985635, + "grad_norm": 0.3061114253666468, + "learning_rate": 0.0001780067287505199, + "loss": 2.566697120666504, + "step": 8681, + "token_acc": 0.3645383560254693 + }, + { + "epoch": 5.089123424215773, + "grad_norm": 0.3515887983679542, + "learning_rate": 0.00017800066408844422, + "loss": 2.5126171112060547, + "step": 8682, + "token_acc": 0.3735695560423256 + }, + { + "epoch": 5.089709762532982, + "grad_norm": 0.3014504110772663, + "learning_rate": 0.00017799459869365745, + "loss": 2.5652589797973633, + "step": 8683, + "token_acc": 0.3654255599472991 + }, + { + "epoch": 5.090296100850191, + "grad_norm": 0.37302211570990335, + "learning_rate": 0.00017798853256621649, + "loss": 2.5386734008789062, + "step": 8684, + "token_acc": 0.369287898762999 + }, + { + "epoch": 5.0908824391674, + "grad_norm": 0.3070045874687679, + "learning_rate": 0.00017798246570617832, + "loss": 2.5354185104370117, + "step": 8685, + "token_acc": 0.3696374773786095 + }, + { + "epoch": 5.091468777484609, + "grad_norm": 0.30846844724496125, + "learning_rate": 0.00017797639811360005, + "loss": 2.5793769359588623, + "step": 8686, + "token_acc": 0.3612555666139245 + }, + { + "epoch": 5.092055115801817, + "grad_norm": 0.335196762448663, + "learning_rate": 0.00017797032978853852, + "loss": 2.539492607116699, + "step": 8687, + "token_acc": 0.3669445766183255 + }, + { + "epoch": 5.092641454119026, + "grad_norm": 0.27926329461377664, + "learning_rate": 0.0001779642607310509, + "loss": 2.5332818031311035, + "step": 8688, + "token_acc": 0.3695707471344244 + }, + { + "epoch": 5.0932277924362355, + "grad_norm": 0.3304190403700646, + "learning_rate": 0.00017795819094119404, + "loss": 2.592761278152466, + "step": 8689, + "token_acc": 0.359917911521154 + }, + { + "epoch": 5.093814130753445, + "grad_norm": 0.3199046823182537, + "learning_rate": 0.000177952120419025, + "loss": 2.544893503189087, + "step": 8690, + "token_acc": 0.3676867722832576 + }, + { + "epoch": 5.094400469070654, + "grad_norm": 0.3338379341983684, + "learning_rate": 0.0001779460491646009, + "loss": 2.569002628326416, + "step": 8691, + "token_acc": 0.3664240572519387 + }, + { + "epoch": 5.094986807387863, + "grad_norm": 0.29806898084584, + "learning_rate": 0.00017793997717797865, + "loss": 2.5985050201416016, + "step": 8692, + "token_acc": 0.359388820654344 + }, + { + "epoch": 5.095573145705072, + "grad_norm": 0.2974730596146133, + "learning_rate": 0.0001779339044592154, + "loss": 2.564746379852295, + "step": 8693, + "token_acc": 0.3647777427285075 + }, + { + "epoch": 5.096159484022281, + "grad_norm": 0.3227208190439189, + "learning_rate": 0.00017792783100836808, + "loss": 2.56030011177063, + "step": 8694, + "token_acc": 0.3664799773793805 + }, + { + "epoch": 5.09674582233949, + "grad_norm": 0.28585466777550367, + "learning_rate": 0.0001779217568254938, + "loss": 2.5541584491729736, + "step": 8695, + "token_acc": 0.366164249293557 + }, + { + "epoch": 5.097332160656699, + "grad_norm": 0.34662690247403183, + "learning_rate": 0.00017791568191064964, + "loss": 2.5726568698883057, + "step": 8696, + "token_acc": 0.3624830807611667 + }, + { + "epoch": 5.097918498973908, + "grad_norm": 0.33546528285006866, + "learning_rate": 0.00017790960626389262, + "loss": 2.605684280395508, + "step": 8697, + "token_acc": 0.35868411284817303 + }, + { + "epoch": 5.098504837291117, + "grad_norm": 0.33887671212101234, + "learning_rate": 0.00017790352988527984, + "loss": 2.556511163711548, + "step": 8698, + "token_acc": 0.3666915958963187 + }, + { + "epoch": 5.099091175608326, + "grad_norm": 0.30233365861987865, + "learning_rate": 0.00017789745277486837, + "loss": 2.5629310607910156, + "step": 8699, + "token_acc": 0.3647650137312814 + }, + { + "epoch": 5.099677513925535, + "grad_norm": 0.3652666452882459, + "learning_rate": 0.0001778913749327153, + "loss": 2.596416473388672, + "step": 8700, + "token_acc": 0.3605429436862847 + }, + { + "epoch": 5.100263852242744, + "grad_norm": 0.32776413258426074, + "learning_rate": 0.00017788529635887773, + "loss": 2.598938226699829, + "step": 8701, + "token_acc": 0.3587423341398699 + }, + { + "epoch": 5.100850190559953, + "grad_norm": 0.3117711264051812, + "learning_rate": 0.00017787921705341274, + "loss": 2.5848286151885986, + "step": 8702, + "token_acc": 0.36257501189820984 + }, + { + "epoch": 5.101436528877162, + "grad_norm": 0.3002138322325676, + "learning_rate": 0.0001778731370163775, + "loss": 2.5925133228302, + "step": 8703, + "token_acc": 0.36055757188401166 + }, + { + "epoch": 5.102022867194371, + "grad_norm": 0.3153448951658653, + "learning_rate": 0.00017786705624782902, + "loss": 2.578900098800659, + "step": 8704, + "token_acc": 0.36218541938166493 + }, + { + "epoch": 5.10260920551158, + "grad_norm": 0.30419673902434613, + "learning_rate": 0.00017786097474782446, + "loss": 2.5859649181365967, + "step": 8705, + "token_acc": 0.3618694623141846 + }, + { + "epoch": 5.1031955438287895, + "grad_norm": 0.31647094422421734, + "learning_rate": 0.000177854892516421, + "loss": 2.566540479660034, + "step": 8706, + "token_acc": 0.3630619323802391 + }, + { + "epoch": 5.103781882145999, + "grad_norm": 0.3161751532697215, + "learning_rate": 0.0001778488095536757, + "loss": 2.6177268028259277, + "step": 8707, + "token_acc": 0.35553262048334794 + }, + { + "epoch": 5.104368220463208, + "grad_norm": 0.29940245202558324, + "learning_rate": 0.0001778427258596458, + "loss": 2.6232423782348633, + "step": 8708, + "token_acc": 0.35296298846799695 + }, + { + "epoch": 5.104954558780416, + "grad_norm": 0.3142022718993205, + "learning_rate": 0.00017783664143438833, + "loss": 2.5876476764678955, + "step": 8709, + "token_acc": 0.3624279574029616 + }, + { + "epoch": 5.105540897097625, + "grad_norm": 0.30494911699308014, + "learning_rate": 0.0001778305562779605, + "loss": 2.563300609588623, + "step": 8710, + "token_acc": 0.3638596314222979 + }, + { + "epoch": 5.106127235414834, + "grad_norm": 0.3663480910734714, + "learning_rate": 0.0001778244703904195, + "loss": 2.5931406021118164, + "step": 8711, + "token_acc": 0.3606730973883842 + }, + { + "epoch": 5.106713573732043, + "grad_norm": 0.4010463540965783, + "learning_rate": 0.00017781838377182245, + "loss": 2.560307502746582, + "step": 8712, + "token_acc": 0.36487027841909403 + }, + { + "epoch": 5.107299912049252, + "grad_norm": 0.3690635820645635, + "learning_rate": 0.00017781229642222657, + "loss": 2.5877881050109863, + "step": 8713, + "token_acc": 0.36101509339496446 + }, + { + "epoch": 5.1078862503664615, + "grad_norm": 0.2849674264066636, + "learning_rate": 0.00017780620834168898, + "loss": 2.5788164138793945, + "step": 8714, + "token_acc": 0.3633517976588629 + }, + { + "epoch": 5.108472588683671, + "grad_norm": 0.3776444929012261, + "learning_rate": 0.00017780011953026694, + "loss": 2.54604434967041, + "step": 8715, + "token_acc": 0.36879391116277954 + }, + { + "epoch": 5.10905892700088, + "grad_norm": 0.32689300791328374, + "learning_rate": 0.0001777940299880176, + "loss": 2.5166871547698975, + "step": 8716, + "token_acc": 0.37301462699845617 + }, + { + "epoch": 5.109645265318089, + "grad_norm": 0.31670023753219606, + "learning_rate": 0.0001777879397149982, + "loss": 2.5536739826202393, + "step": 8717, + "token_acc": 0.36645080288112075 + }, + { + "epoch": 5.110231603635298, + "grad_norm": 0.36461519502981615, + "learning_rate": 0.0001777818487112659, + "loss": 2.583094835281372, + "step": 8718, + "token_acc": 0.3624467264262818 + }, + { + "epoch": 5.110817941952506, + "grad_norm": 0.35026827218716855, + "learning_rate": 0.00017777575697687793, + "loss": 2.5645411014556885, + "step": 8719, + "token_acc": 0.3659068151027861 + }, + { + "epoch": 5.111404280269715, + "grad_norm": 0.302309715251391, + "learning_rate": 0.00017776966451189157, + "loss": 2.5638270378112793, + "step": 8720, + "token_acc": 0.36238349316096446 + }, + { + "epoch": 5.111990618586924, + "grad_norm": 0.3741235231061588, + "learning_rate": 0.00017776357131636398, + "loss": 2.5800561904907227, + "step": 8721, + "token_acc": 0.361576339351697 + }, + { + "epoch": 5.1125769569041335, + "grad_norm": 0.3209821568203822, + "learning_rate": 0.00017775747739035241, + "loss": 2.579256057739258, + "step": 8722, + "token_acc": 0.3625130713531254 + }, + { + "epoch": 5.113163295221343, + "grad_norm": 0.2948416511263344, + "learning_rate": 0.00017775138273391417, + "loss": 2.537633180618286, + "step": 8723, + "token_acc": 0.3691275516973161 + }, + { + "epoch": 5.113749633538552, + "grad_norm": 0.34246840772290155, + "learning_rate": 0.00017774528734710644, + "loss": 2.5742411613464355, + "step": 8724, + "token_acc": 0.3615608684284888 + }, + { + "epoch": 5.114335971855761, + "grad_norm": 0.30519530059903244, + "learning_rate": 0.0001777391912299865, + "loss": 2.603969097137451, + "step": 8725, + "token_acc": 0.35845489878674464 + }, + { + "epoch": 5.11492231017297, + "grad_norm": 0.3383554662467581, + "learning_rate": 0.00017773309438261158, + "loss": 2.5867156982421875, + "step": 8726, + "token_acc": 0.3613432339021906 + }, + { + "epoch": 5.115508648490179, + "grad_norm": 0.3114679358121218, + "learning_rate": 0.00017772699680503902, + "loss": 2.608220100402832, + "step": 8727, + "token_acc": 0.3577511502671921 + }, + { + "epoch": 5.116094986807388, + "grad_norm": 0.30103817547590217, + "learning_rate": 0.00017772089849732602, + "loss": 2.5776171684265137, + "step": 8728, + "token_acc": 0.36263352858922393 + }, + { + "epoch": 5.116681325124597, + "grad_norm": 0.31370309327297613, + "learning_rate": 0.00017771479945952995, + "loss": 2.6162428855895996, + "step": 8729, + "token_acc": 0.3558424654350257 + }, + { + "epoch": 5.1172676634418055, + "grad_norm": 0.29058125739962826, + "learning_rate": 0.00017770869969170806, + "loss": 2.5584490299224854, + "step": 8730, + "token_acc": 0.3662462359205193 + }, + { + "epoch": 5.117854001759015, + "grad_norm": 0.31493409034253317, + "learning_rate": 0.00017770259919391764, + "loss": 2.5770998001098633, + "step": 8731, + "token_acc": 0.3623192660158731 + }, + { + "epoch": 5.118440340076224, + "grad_norm": 0.3038403243885083, + "learning_rate": 0.00017769649796621598, + "loss": 2.5806236267089844, + "step": 8732, + "token_acc": 0.3616266885963227 + }, + { + "epoch": 5.119026678393433, + "grad_norm": 0.29478892089536085, + "learning_rate": 0.00017769039600866048, + "loss": 2.545267105102539, + "step": 8733, + "token_acc": 0.3689115357848509 + }, + { + "epoch": 5.119613016710642, + "grad_norm": 0.3171312202486041, + "learning_rate": 0.00017768429332130835, + "loss": 2.564260482788086, + "step": 8734, + "token_acc": 0.36449064385236885 + }, + { + "epoch": 5.120199355027851, + "grad_norm": 0.33122265835743825, + "learning_rate": 0.000177678189904217, + "loss": 2.5714006423950195, + "step": 8735, + "token_acc": 0.3638228935891728 + }, + { + "epoch": 5.12078569334506, + "grad_norm": 0.3101457697606461, + "learning_rate": 0.00017767208575744368, + "loss": 2.5785865783691406, + "step": 8736, + "token_acc": 0.3613113670413507 + }, + { + "epoch": 5.121372031662269, + "grad_norm": 0.2995124932122777, + "learning_rate": 0.00017766598088104582, + "loss": 2.5694262981414795, + "step": 8737, + "token_acc": 0.3641655666357351 + }, + { + "epoch": 5.121958369979478, + "grad_norm": 0.35917739514183444, + "learning_rate": 0.0001776598752750807, + "loss": 2.6244006156921387, + "step": 8738, + "token_acc": 0.3561718949247641 + }, + { + "epoch": 5.1225447082966875, + "grad_norm": 0.4305897999245486, + "learning_rate": 0.00017765376893960573, + "loss": 2.578237533569336, + "step": 8739, + "token_acc": 0.36265919693563503 + }, + { + "epoch": 5.123131046613897, + "grad_norm": 0.33349056553527523, + "learning_rate": 0.00017764766187467824, + "loss": 2.5885679721832275, + "step": 8740, + "token_acc": 0.3598215051917961 + }, + { + "epoch": 5.123717384931105, + "grad_norm": 0.33115078912184726, + "learning_rate": 0.00017764155408035557, + "loss": 2.5867807865142822, + "step": 8741, + "token_acc": 0.35987577606834437 + }, + { + "epoch": 5.124303723248314, + "grad_norm": 0.3572592378302158, + "learning_rate": 0.00017763544555669512, + "loss": 2.542055130004883, + "step": 8742, + "token_acc": 0.3676556543508734 + }, + { + "epoch": 5.124890061565523, + "grad_norm": 0.3053746428260833, + "learning_rate": 0.0001776293363037543, + "loss": 2.561729907989502, + "step": 8743, + "token_acc": 0.36527112461216915 + }, + { + "epoch": 5.125476399882732, + "grad_norm": 0.3921018654877444, + "learning_rate": 0.00017762322632159045, + "loss": 2.5706028938293457, + "step": 8744, + "token_acc": 0.3639261434277703 + }, + { + "epoch": 5.126062738199941, + "grad_norm": 0.2972537049353717, + "learning_rate": 0.00017761711561026103, + "loss": 2.562030792236328, + "step": 8745, + "token_acc": 0.3640063079556931 + }, + { + "epoch": 5.12664907651715, + "grad_norm": 0.31986615885829506, + "learning_rate": 0.00017761100416982336, + "loss": 2.5650229454040527, + "step": 8746, + "token_acc": 0.3642552491329643 + }, + { + "epoch": 5.1272354148343595, + "grad_norm": 0.28885815264670817, + "learning_rate": 0.0001776048920003349, + "loss": 2.5682590007781982, + "step": 8747, + "token_acc": 0.3644066170524787 + }, + { + "epoch": 5.127821753151569, + "grad_norm": 0.32369732605998486, + "learning_rate": 0.00017759877910185302, + "loss": 2.581594228744507, + "step": 8748, + "token_acc": 0.3608302674264765 + }, + { + "epoch": 5.128408091468778, + "grad_norm": 0.3195838715460639, + "learning_rate": 0.0001775926654744352, + "loss": 2.5679264068603516, + "step": 8749, + "token_acc": 0.3627362075743677 + }, + { + "epoch": 5.128994429785987, + "grad_norm": 0.3012263462150962, + "learning_rate": 0.00017758655111813887, + "loss": 2.5967321395874023, + "step": 8750, + "token_acc": 0.36043907906773 + }, + { + "epoch": 5.129580768103196, + "grad_norm": 0.32407003391363237, + "learning_rate": 0.00017758043603302142, + "loss": 2.563599109649658, + "step": 8751, + "token_acc": 0.36445512926400964 + }, + { + "epoch": 5.130167106420404, + "grad_norm": 0.3267574191454591, + "learning_rate": 0.0001775743202191403, + "loss": 2.5634918212890625, + "step": 8752, + "token_acc": 0.36589412014304085 + }, + { + "epoch": 5.130753444737613, + "grad_norm": 0.3287773192051845, + "learning_rate": 0.000177568203676553, + "loss": 2.5854454040527344, + "step": 8753, + "token_acc": 0.36014038985961017 + }, + { + "epoch": 5.131339783054822, + "grad_norm": 0.3249219670465738, + "learning_rate": 0.00017756208640531696, + "loss": 2.585137367248535, + "step": 8754, + "token_acc": 0.3610645870407202 + }, + { + "epoch": 5.1319261213720315, + "grad_norm": 0.2962890653235747, + "learning_rate": 0.0001775559684054896, + "loss": 2.542698621749878, + "step": 8755, + "token_acc": 0.3692114295252303 + }, + { + "epoch": 5.132512459689241, + "grad_norm": 0.32202942764946535, + "learning_rate": 0.00017754984967712845, + "loss": 2.552659034729004, + "step": 8756, + "token_acc": 0.3645587925957089 + }, + { + "epoch": 5.13309879800645, + "grad_norm": 0.3033781641562856, + "learning_rate": 0.00017754373022029095, + "loss": 2.561004161834717, + "step": 8757, + "token_acc": 0.3646738485815804 + }, + { + "epoch": 5.133685136323659, + "grad_norm": 0.295879427398369, + "learning_rate": 0.0001775376100350346, + "loss": 2.5950076580047607, + "step": 8758, + "token_acc": 0.3604670113146048 + }, + { + "epoch": 5.134271474640868, + "grad_norm": 0.2853022923052908, + "learning_rate": 0.00017753148912141685, + "loss": 2.5576281547546387, + "step": 8759, + "token_acc": 0.3652652020682325 + }, + { + "epoch": 5.134857812958077, + "grad_norm": 0.32569828329035605, + "learning_rate": 0.0001775253674794953, + "loss": 2.5961480140686035, + "step": 8760, + "token_acc": 0.35876917647436274 + }, + { + "epoch": 5.135444151275286, + "grad_norm": 0.3056343109718411, + "learning_rate": 0.00017751924510932737, + "loss": 2.54693865776062, + "step": 8761, + "token_acc": 0.36730257827270274 + }, + { + "epoch": 5.136030489592494, + "grad_norm": 0.31537849159211484, + "learning_rate": 0.00017751312201097057, + "loss": 2.595202922821045, + "step": 8762, + "token_acc": 0.36051313660528544 + }, + { + "epoch": 5.1366168279097035, + "grad_norm": 0.3241377418695354, + "learning_rate": 0.00017750699818448244, + "loss": 2.5486183166503906, + "step": 8763, + "token_acc": 0.3653852168473728 + }, + { + "epoch": 5.137203166226913, + "grad_norm": 0.29797362908082, + "learning_rate": 0.00017750087362992053, + "loss": 2.559046506881714, + "step": 8764, + "token_acc": 0.36490117648260817 + }, + { + "epoch": 5.137789504544122, + "grad_norm": 0.32222572259453336, + "learning_rate": 0.0001774947483473423, + "loss": 2.5956671237945557, + "step": 8765, + "token_acc": 0.35944647230051285 + }, + { + "epoch": 5.138375842861331, + "grad_norm": 0.30616928295049667, + "learning_rate": 0.00017748862233680539, + "loss": 2.5945863723754883, + "step": 8766, + "token_acc": 0.3596494927979417 + }, + { + "epoch": 5.13896218117854, + "grad_norm": 0.285442120245395, + "learning_rate": 0.00017748249559836724, + "loss": 2.583724021911621, + "step": 8767, + "token_acc": 0.36120989528158454 + }, + { + "epoch": 5.139548519495749, + "grad_norm": 0.31688485292108787, + "learning_rate": 0.0001774763681320855, + "loss": 2.5831120014190674, + "step": 8768, + "token_acc": 0.36177994192633817 + }, + { + "epoch": 5.140134857812958, + "grad_norm": 0.2938573762736049, + "learning_rate": 0.00017747023993801766, + "loss": 2.5536818504333496, + "step": 8769, + "token_acc": 0.3677388268997609 + }, + { + "epoch": 5.140721196130167, + "grad_norm": 0.2894950733824776, + "learning_rate": 0.00017746411101622132, + "loss": 2.6125760078430176, + "step": 8770, + "token_acc": 0.35678827392169044 + }, + { + "epoch": 5.141307534447376, + "grad_norm": 0.31760653282216933, + "learning_rate": 0.00017745798136675403, + "loss": 2.5950801372528076, + "step": 8771, + "token_acc": 0.35999667768050186 + }, + { + "epoch": 5.1418938727645855, + "grad_norm": 0.2935906730979065, + "learning_rate": 0.00017745185098967336, + "loss": 2.6049084663391113, + "step": 8772, + "token_acc": 0.35716771095640787 + }, + { + "epoch": 5.142480211081795, + "grad_norm": 0.2913134922172613, + "learning_rate": 0.00017744571988503692, + "loss": 2.5761826038360596, + "step": 8773, + "token_acc": 0.361777729049813 + }, + { + "epoch": 5.143066549399003, + "grad_norm": 0.34141863448886867, + "learning_rate": 0.00017743958805290232, + "loss": 2.5550057888031006, + "step": 8774, + "token_acc": 0.365270122979476 + }, + { + "epoch": 5.143652887716212, + "grad_norm": 0.3533291356073863, + "learning_rate": 0.00017743345549332715, + "loss": 2.564237594604492, + "step": 8775, + "token_acc": 0.3638222203292374 + }, + { + "epoch": 5.144239226033421, + "grad_norm": 0.30645873015559016, + "learning_rate": 0.00017742732220636903, + "loss": 2.5772275924682617, + "step": 8776, + "token_acc": 0.36307858683837363 + }, + { + "epoch": 5.14482556435063, + "grad_norm": 0.3115476745810554, + "learning_rate": 0.00017742118819208549, + "loss": 2.5988450050354004, + "step": 8777, + "token_acc": 0.3596710495986723 + }, + { + "epoch": 5.145411902667839, + "grad_norm": 0.30881392874079755, + "learning_rate": 0.00017741505345053425, + "loss": 2.556070327758789, + "step": 8778, + "token_acc": 0.3657164291072768 + }, + { + "epoch": 5.145998240985048, + "grad_norm": 0.2916190589256628, + "learning_rate": 0.0001774089179817729, + "loss": 2.571113109588623, + "step": 8779, + "token_acc": 0.36330131284864725 + }, + { + "epoch": 5.1465845793022575, + "grad_norm": 0.31259452400825366, + "learning_rate": 0.00017740278178585904, + "loss": 2.5893185138702393, + "step": 8780, + "token_acc": 0.36085242992117766 + }, + { + "epoch": 5.147170917619467, + "grad_norm": 0.33231795934897307, + "learning_rate": 0.0001773966448628504, + "loss": 2.555454969406128, + "step": 8781, + "token_acc": 0.3668878168061756 + }, + { + "epoch": 5.147757255936676, + "grad_norm": 0.3117555494702881, + "learning_rate": 0.00017739050721280453, + "loss": 2.598276138305664, + "step": 8782, + "token_acc": 0.3588670225604107 + }, + { + "epoch": 5.148343594253885, + "grad_norm": 0.3163486423367551, + "learning_rate": 0.00017738436883577916, + "loss": 2.582850694656372, + "step": 8783, + "token_acc": 0.36111750928454045 + }, + { + "epoch": 5.148929932571093, + "grad_norm": 0.330811759561692, + "learning_rate": 0.00017737822973183193, + "loss": 2.556408166885376, + "step": 8784, + "token_acc": 0.36490600189230327 + }, + { + "epoch": 5.149516270888302, + "grad_norm": 0.30144330573473926, + "learning_rate": 0.0001773720899010205, + "loss": 2.577692985534668, + "step": 8785, + "token_acc": 0.36193092025086554 + }, + { + "epoch": 5.150102609205511, + "grad_norm": 0.33176113659805917, + "learning_rate": 0.00017736594934340252, + "loss": 2.580148220062256, + "step": 8786, + "token_acc": 0.3612634119467982 + }, + { + "epoch": 5.15068894752272, + "grad_norm": 0.32186644720849794, + "learning_rate": 0.00017735980805903568, + "loss": 2.6092586517333984, + "step": 8787, + "token_acc": 0.35700732474799035 + }, + { + "epoch": 5.1512752858399296, + "grad_norm": 0.33291530727762836, + "learning_rate": 0.00017735366604797772, + "loss": 2.551642417907715, + "step": 8788, + "token_acc": 0.3665850532278243 + }, + { + "epoch": 5.151861624157139, + "grad_norm": 0.2965315835537992, + "learning_rate": 0.00017734752331028633, + "loss": 2.6178078651428223, + "step": 8789, + "token_acc": 0.3554844437963959 + }, + { + "epoch": 5.152447962474348, + "grad_norm": 0.30189433523374826, + "learning_rate": 0.00017734137984601914, + "loss": 2.5936150550842285, + "step": 8790, + "token_acc": 0.3599041554693602 + }, + { + "epoch": 5.153034300791557, + "grad_norm": 0.29855256049129747, + "learning_rate": 0.00017733523565523392, + "loss": 2.5673632621765137, + "step": 8791, + "token_acc": 0.36487070575554154 + }, + { + "epoch": 5.153620639108766, + "grad_norm": 0.30893572873219743, + "learning_rate": 0.00017732909073798835, + "loss": 2.5712413787841797, + "step": 8792, + "token_acc": 0.3629407022308646 + }, + { + "epoch": 5.154206977425975, + "grad_norm": 0.30286228945309135, + "learning_rate": 0.0001773229450943402, + "loss": 2.5810327529907227, + "step": 8793, + "token_acc": 0.36093151619455166 + }, + { + "epoch": 5.154793315743184, + "grad_norm": 0.306931201183645, + "learning_rate": 0.0001773167987243472, + "loss": 2.6005775928497314, + "step": 8794, + "token_acc": 0.35822223638879813 + }, + { + "epoch": 5.1553796540603924, + "grad_norm": 0.29258145459147067, + "learning_rate": 0.000177310651628067, + "loss": 2.5784718990325928, + "step": 8795, + "token_acc": 0.36182931297791593 + }, + { + "epoch": 5.155965992377602, + "grad_norm": 0.3141084331489179, + "learning_rate": 0.00017730450380555742, + "loss": 2.6128878593444824, + "step": 8796, + "token_acc": 0.35605835786299217 + }, + { + "epoch": 5.156552330694811, + "grad_norm": 0.31054693928880134, + "learning_rate": 0.00017729835525687624, + "loss": 2.5845413208007812, + "step": 8797, + "token_acc": 0.36081458367548036 + }, + { + "epoch": 5.15713866901202, + "grad_norm": 0.3728608191400777, + "learning_rate": 0.00017729220598208115, + "loss": 2.5960464477539062, + "step": 8798, + "token_acc": 0.3593628723862373 + }, + { + "epoch": 5.157725007329229, + "grad_norm": 0.3280141891409224, + "learning_rate": 0.0001772860559812299, + "loss": 2.5909268856048584, + "step": 8799, + "token_acc": 0.3592768122673003 + }, + { + "epoch": 5.158311345646438, + "grad_norm": 0.29396126943774803, + "learning_rate": 0.00017727990525438035, + "loss": 2.581202507019043, + "step": 8800, + "token_acc": 0.35971804380666594 + }, + { + "epoch": 5.158897683963647, + "grad_norm": 0.36120109212497215, + "learning_rate": 0.0001772737538015902, + "loss": 2.5689618587493896, + "step": 8801, + "token_acc": 0.36294901671409696 + }, + { + "epoch": 5.159484022280856, + "grad_norm": 0.36765344780659887, + "learning_rate": 0.00017726760162291728, + "loss": 2.5400218963623047, + "step": 8802, + "token_acc": 0.3680063062402447 + }, + { + "epoch": 5.160070360598065, + "grad_norm": 0.3100161329390006, + "learning_rate": 0.00017726144871841934, + "loss": 2.6082160472869873, + "step": 8803, + "token_acc": 0.3576020079403022 + }, + { + "epoch": 5.1606566989152745, + "grad_norm": 0.4177507104184087, + "learning_rate": 0.0001772552950881542, + "loss": 2.605971336364746, + "step": 8804, + "token_acc": 0.35833385595750655 + }, + { + "epoch": 5.161243037232484, + "grad_norm": 0.3990429071196709, + "learning_rate": 0.0001772491407321797, + "loss": 2.6020851135253906, + "step": 8805, + "token_acc": 0.3592093295133146 + }, + { + "epoch": 5.161829375549692, + "grad_norm": 0.33071246389774106, + "learning_rate": 0.00017724298565055356, + "loss": 2.573862075805664, + "step": 8806, + "token_acc": 0.36170981117565754 + }, + { + "epoch": 5.162415713866901, + "grad_norm": 0.450611524325082, + "learning_rate": 0.0001772368298433337, + "loss": 2.611372947692871, + "step": 8807, + "token_acc": 0.3566550355592254 + }, + { + "epoch": 5.16300205218411, + "grad_norm": 0.3021141315810496, + "learning_rate": 0.00017723067331057787, + "loss": 2.5904293060302734, + "step": 8808, + "token_acc": 0.36050947390329197 + }, + { + "epoch": 5.163588390501319, + "grad_norm": 0.430433773307704, + "learning_rate": 0.00017722451605234397, + "loss": 2.6565680503845215, + "step": 8809, + "token_acc": 0.34969225114791097 + }, + { + "epoch": 5.164174728818528, + "grad_norm": 0.2891182835747003, + "learning_rate": 0.00017721835806868978, + "loss": 2.6297552585601807, + "step": 8810, + "token_acc": 0.3548627483564077 + }, + { + "epoch": 5.164761067135737, + "grad_norm": 0.3943084610503009, + "learning_rate": 0.0001772121993596732, + "loss": 2.5754218101501465, + "step": 8811, + "token_acc": 0.3622248607178738 + }, + { + "epoch": 5.1653474054529465, + "grad_norm": 0.29385971396078264, + "learning_rate": 0.00017720603992535204, + "loss": 2.607400417327881, + "step": 8812, + "token_acc": 0.3582610137780738 + }, + { + "epoch": 5.165933743770156, + "grad_norm": 0.3803703001390518, + "learning_rate": 0.00017719987976578413, + "loss": 2.5949554443359375, + "step": 8813, + "token_acc": 0.3599474234100637 + }, + { + "epoch": 5.166520082087365, + "grad_norm": 0.2817534171566281, + "learning_rate": 0.0001771937188810274, + "loss": 2.5874056816101074, + "step": 8814, + "token_acc": 0.35904198157695777 + }, + { + "epoch": 5.167106420404574, + "grad_norm": 0.37286566017244677, + "learning_rate": 0.00017718755727113973, + "loss": 2.584026575088501, + "step": 8815, + "token_acc": 0.36073057010817544 + }, + { + "epoch": 5.167692758721783, + "grad_norm": 0.2767348433203727, + "learning_rate": 0.00017718139493617894, + "loss": 2.5618467330932617, + "step": 8816, + "token_acc": 0.3646959025242918 + }, + { + "epoch": 5.168279097038991, + "grad_norm": 0.31617510034402707, + "learning_rate": 0.00017717523187620295, + "loss": 2.576890230178833, + "step": 8817, + "token_acc": 0.36311764720596956 + }, + { + "epoch": 5.1688654353562, + "grad_norm": 0.2993267402690325, + "learning_rate": 0.00017716906809126965, + "loss": 2.6144163608551025, + "step": 8818, + "token_acc": 0.35484430319468313 + }, + { + "epoch": 5.169451773673409, + "grad_norm": 0.30884772192954396, + "learning_rate": 0.00017716290358143696, + "loss": 2.640666961669922, + "step": 8819, + "token_acc": 0.35258329384934994 + }, + { + "epoch": 5.1700381119906185, + "grad_norm": 0.30624398544873566, + "learning_rate": 0.00017715673834676275, + "loss": 2.5840091705322266, + "step": 8820, + "token_acc": 0.3616628905632541 + }, + { + "epoch": 5.170624450307828, + "grad_norm": 0.29040072899697433, + "learning_rate": 0.000177150572387305, + "loss": 2.5670838356018066, + "step": 8821, + "token_acc": 0.3621655671736207 + }, + { + "epoch": 5.171210788625037, + "grad_norm": 0.2942055327510666, + "learning_rate": 0.00017714440570312153, + "loss": 2.6134395599365234, + "step": 8822, + "token_acc": 0.3565820197345011 + }, + { + "epoch": 5.171797126942246, + "grad_norm": 0.28897145725059237, + "learning_rate": 0.00017713823829427035, + "loss": 2.556884765625, + "step": 8823, + "token_acc": 0.3666879684270477 + }, + { + "epoch": 5.172383465259455, + "grad_norm": 0.312512093815707, + "learning_rate": 0.00017713207016080933, + "loss": 2.616426944732666, + "step": 8824, + "token_acc": 0.35600277226084076 + }, + { + "epoch": 5.172969803576664, + "grad_norm": 0.2907733524965378, + "learning_rate": 0.00017712590130279646, + "loss": 2.5543813705444336, + "step": 8825, + "token_acc": 0.3653676071271561 + }, + { + "epoch": 5.173556141893873, + "grad_norm": 0.3118605923579699, + "learning_rate": 0.00017711973172028972, + "loss": 2.571045398712158, + "step": 8826, + "token_acc": 0.36339021488882 + }, + { + "epoch": 5.174142480211081, + "grad_norm": 0.29930764117352127, + "learning_rate": 0.00017711356141334697, + "loss": 2.593981981277466, + "step": 8827, + "token_acc": 0.3587100040703273 + }, + { + "epoch": 5.1747288185282905, + "grad_norm": 0.31468243985456906, + "learning_rate": 0.00017710739038202624, + "loss": 2.589202880859375, + "step": 8828, + "token_acc": 0.35976129798661804 + }, + { + "epoch": 5.1753151568455, + "grad_norm": 0.28822486603463027, + "learning_rate": 0.00017710121862638548, + "loss": 2.5864434242248535, + "step": 8829, + "token_acc": 0.3585260387290844 + }, + { + "epoch": 5.175901495162709, + "grad_norm": 0.2916709461688628, + "learning_rate": 0.00017709504614648268, + "loss": 2.575636148452759, + "step": 8830, + "token_acc": 0.3625404574772155 + }, + { + "epoch": 5.176487833479918, + "grad_norm": 0.29313010955158697, + "learning_rate": 0.0001770888729423758, + "loss": 2.638294219970703, + "step": 8831, + "token_acc": 0.35295063811834193 + }, + { + "epoch": 5.177074171797127, + "grad_norm": 0.2863528101969556, + "learning_rate": 0.00017708269901412283, + "loss": 2.594176769256592, + "step": 8832, + "token_acc": 0.3589286928149004 + }, + { + "epoch": 5.177660510114336, + "grad_norm": 0.3018080878559346, + "learning_rate": 0.00017707652436178178, + "loss": 2.571104049682617, + "step": 8833, + "token_acc": 0.36333263841001545 + }, + { + "epoch": 5.178246848431545, + "grad_norm": 0.27555627140482153, + "learning_rate": 0.00017707034898541065, + "loss": 2.596005916595459, + "step": 8834, + "token_acc": 0.35710715563555123 + }, + { + "epoch": 5.178833186748754, + "grad_norm": 0.29011786870315326, + "learning_rate": 0.0001770641728850674, + "loss": 2.590555191040039, + "step": 8835, + "token_acc": 0.360935813807727 + }, + { + "epoch": 5.179419525065963, + "grad_norm": 0.2978453480192051, + "learning_rate": 0.00017705799606081016, + "loss": 2.5733537673950195, + "step": 8836, + "token_acc": 0.3622193171331898 + }, + { + "epoch": 5.1800058633831725, + "grad_norm": 0.2882035133423909, + "learning_rate": 0.00017705181851269687, + "loss": 2.589637279510498, + "step": 8837, + "token_acc": 0.3615395762220648 + }, + { + "epoch": 5.180592201700381, + "grad_norm": 0.2946360984321885, + "learning_rate": 0.00017704564024078554, + "loss": 2.6022963523864746, + "step": 8838, + "token_acc": 0.3582858418195219 + }, + { + "epoch": 5.18117854001759, + "grad_norm": 0.27955000171512157, + "learning_rate": 0.00017703946124513425, + "loss": 2.5820934772491455, + "step": 8839, + "token_acc": 0.3621006415868494 + }, + { + "epoch": 5.181764878334799, + "grad_norm": 0.3066909011153294, + "learning_rate": 0.00017703328152580104, + "loss": 2.5948734283447266, + "step": 8840, + "token_acc": 0.3582165529612342 + }, + { + "epoch": 5.182351216652008, + "grad_norm": 0.30894457038666073, + "learning_rate": 0.00017702710108284396, + "loss": 2.569955825805664, + "step": 8841, + "token_acc": 0.36253553005579536 + }, + { + "epoch": 5.182937554969217, + "grad_norm": 0.3286767495803109, + "learning_rate": 0.00017702091991632102, + "loss": 2.62277889251709, + "step": 8842, + "token_acc": 0.35459366717591206 + }, + { + "epoch": 5.183523893286426, + "grad_norm": 0.28186909976088925, + "learning_rate": 0.00017701473802629036, + "loss": 2.647202968597412, + "step": 8843, + "token_acc": 0.35108093479368313 + }, + { + "epoch": 5.184110231603635, + "grad_norm": 0.33795708720267514, + "learning_rate": 0.00017700855541281002, + "loss": 2.604660987854004, + "step": 8844, + "token_acc": 0.3571048115050009 + }, + { + "epoch": 5.1846965699208445, + "grad_norm": 0.35268808542920493, + "learning_rate": 0.0001770023720759381, + "loss": 2.6267058849334717, + "step": 8845, + "token_acc": 0.35550304476913713 + }, + { + "epoch": 5.185282908238054, + "grad_norm": 0.3063645011330831, + "learning_rate": 0.0001769961880157326, + "loss": 2.568591356277466, + "step": 8846, + "token_acc": 0.36352294739779245 + }, + { + "epoch": 5.185869246555263, + "grad_norm": 0.30271917779320245, + "learning_rate": 0.0001769900032322517, + "loss": 2.573857307434082, + "step": 8847, + "token_acc": 0.3631904065127848 + }, + { + "epoch": 5.186455584872472, + "grad_norm": 0.33362757904837365, + "learning_rate": 0.00017698381772555344, + "loss": 2.576220989227295, + "step": 8848, + "token_acc": 0.3620993745386737 + }, + { + "epoch": 5.18704192318968, + "grad_norm": 0.36497869626235757, + "learning_rate": 0.00017697763149569594, + "loss": 2.564157009124756, + "step": 8849, + "token_acc": 0.36568953268392107 + }, + { + "epoch": 5.187628261506889, + "grad_norm": 0.2918568341756316, + "learning_rate": 0.00017697144454273736, + "loss": 2.6090922355651855, + "step": 8850, + "token_acc": 0.3575259054652645 + }, + { + "epoch": 5.188214599824098, + "grad_norm": 0.34911531373807103, + "learning_rate": 0.00017696525686673576, + "loss": 2.5933568477630615, + "step": 8851, + "token_acc": 0.35864319742622036 + }, + { + "epoch": 5.188800938141307, + "grad_norm": 0.35652312535143926, + "learning_rate": 0.0001769590684677493, + "loss": 2.5977320671081543, + "step": 8852, + "token_acc": 0.35852609281221803 + }, + { + "epoch": 5.1893872764585165, + "grad_norm": 0.2866128622186048, + "learning_rate": 0.00017695287934583605, + "loss": 2.5706992149353027, + "step": 8853, + "token_acc": 0.36342019509511414 + }, + { + "epoch": 5.189973614775726, + "grad_norm": 0.3137836019576745, + "learning_rate": 0.00017694668950105427, + "loss": 2.5903472900390625, + "step": 8854, + "token_acc": 0.36089131872686003 + }, + { + "epoch": 5.190559953092935, + "grad_norm": 0.2860848218727202, + "learning_rate": 0.00017694049893346198, + "loss": 2.6254711151123047, + "step": 8855, + "token_acc": 0.35468721538936715 + }, + { + "epoch": 5.191146291410144, + "grad_norm": 0.30521531759588744, + "learning_rate": 0.00017693430764311737, + "loss": 2.6389904022216797, + "step": 8856, + "token_acc": 0.3527080231245728 + }, + { + "epoch": 5.191732629727353, + "grad_norm": 0.2848142272217059, + "learning_rate": 0.00017692811563007862, + "loss": 2.5652620792388916, + "step": 8857, + "token_acc": 0.36416562497490573 + }, + { + "epoch": 5.192318968044562, + "grad_norm": 0.29476194500343117, + "learning_rate": 0.0001769219228944039, + "loss": 2.583707809448242, + "step": 8858, + "token_acc": 0.361143427393411 + }, + { + "epoch": 5.192905306361771, + "grad_norm": 0.28541014284583993, + "learning_rate": 0.00017691572943615138, + "loss": 2.606368064880371, + "step": 8859, + "token_acc": 0.35718983510170443 + }, + { + "epoch": 5.193491644678979, + "grad_norm": 0.2955340961949114, + "learning_rate": 0.0001769095352553792, + "loss": 2.570476531982422, + "step": 8860, + "token_acc": 0.36220984633551995 + }, + { + "epoch": 5.1940779829961885, + "grad_norm": 0.2969794728079262, + "learning_rate": 0.00017690334035214557, + "loss": 2.607548475265503, + "step": 8861, + "token_acc": 0.357132392816846 + }, + { + "epoch": 5.194664321313398, + "grad_norm": 0.28862586173203547, + "learning_rate": 0.0001768971447265087, + "loss": 2.635871648788452, + "step": 8862, + "token_acc": 0.3509638447188416 + }, + { + "epoch": 5.195250659630607, + "grad_norm": 0.32862825027999903, + "learning_rate": 0.00017689094837852677, + "loss": 2.5731353759765625, + "step": 8863, + "token_acc": 0.36267274232784913 + }, + { + "epoch": 5.195836997947816, + "grad_norm": 0.29088187435086743, + "learning_rate": 0.000176884751308258, + "loss": 2.6089284420013428, + "step": 8864, + "token_acc": 0.35545232261749066 + }, + { + "epoch": 5.196423336265025, + "grad_norm": 0.2874926762518735, + "learning_rate": 0.00017687855351576057, + "loss": 2.601158618927002, + "step": 8865, + "token_acc": 0.356766678138264 + }, + { + "epoch": 5.197009674582234, + "grad_norm": 0.31700415639942564, + "learning_rate": 0.00017687235500109277, + "loss": 2.6067328453063965, + "step": 8866, + "token_acc": 0.35643780853915474 + }, + { + "epoch": 5.197596012899443, + "grad_norm": 0.30873287677401, + "learning_rate": 0.00017686615576431274, + "loss": 2.59761905670166, + "step": 8867, + "token_acc": 0.3590050524268561 + }, + { + "epoch": 5.198182351216652, + "grad_norm": 0.2928075524909576, + "learning_rate": 0.00017685995580547879, + "loss": 2.578472375869751, + "step": 8868, + "token_acc": 0.36317113871745293 + }, + { + "epoch": 5.198768689533861, + "grad_norm": 0.28916122797735777, + "learning_rate": 0.00017685375512464907, + "loss": 2.6061248779296875, + "step": 8869, + "token_acc": 0.3570543892905847 + }, + { + "epoch": 5.19935502785107, + "grad_norm": 0.2872368567456281, + "learning_rate": 0.00017684755372188193, + "loss": 2.603356122970581, + "step": 8870, + "token_acc": 0.35796114028741405 + }, + { + "epoch": 5.199941366168279, + "grad_norm": 0.2870500951548041, + "learning_rate": 0.00017684135159723555, + "loss": 2.618990421295166, + "step": 8871, + "token_acc": 0.35474290243958934 + }, + { + "epoch": 5.200527704485488, + "grad_norm": 0.2864276564424642, + "learning_rate": 0.00017683514875076824, + "loss": 2.593994140625, + "step": 8872, + "token_acc": 0.3575860782527263 + }, + { + "epoch": 5.201114042802697, + "grad_norm": 0.30050104589767135, + "learning_rate": 0.00017682894518253824, + "loss": 2.6247172355651855, + "step": 8873, + "token_acc": 0.3544306401713776 + }, + { + "epoch": 5.201700381119906, + "grad_norm": 0.31646686185320194, + "learning_rate": 0.00017682274089260381, + "loss": 2.574474334716797, + "step": 8874, + "token_acc": 0.36330769007307406 + }, + { + "epoch": 5.202286719437115, + "grad_norm": 0.3041857573712074, + "learning_rate": 0.00017681653588102328, + "loss": 2.611551523208618, + "step": 8875, + "token_acc": 0.3572504070717219 + }, + { + "epoch": 5.202873057754324, + "grad_norm": 0.2862174831405846, + "learning_rate": 0.0001768103301478549, + "loss": 2.610865354537964, + "step": 8876, + "token_acc": 0.3547432254344782 + }, + { + "epoch": 5.203459396071533, + "grad_norm": 0.3188465053380257, + "learning_rate": 0.00017680412369315692, + "loss": 2.581068277359009, + "step": 8877, + "token_acc": 0.361868049689842 + }, + { + "epoch": 5.2040457343887425, + "grad_norm": 0.3717357775555192, + "learning_rate": 0.00017679791651698776, + "loss": 2.5799295902252197, + "step": 8878, + "token_acc": 0.3610729059350797 + }, + { + "epoch": 5.204632072705952, + "grad_norm": 0.3597770921147461, + "learning_rate": 0.00017679170861940562, + "loss": 2.5837433338165283, + "step": 8879, + "token_acc": 0.3607960022719101 + }, + { + "epoch": 5.205218411023161, + "grad_norm": 0.296606164147987, + "learning_rate": 0.00017678550000046887, + "loss": 2.591031074523926, + "step": 8880, + "token_acc": 0.3598082188928337 + }, + { + "epoch": 5.205804749340369, + "grad_norm": 0.3167138416859688, + "learning_rate": 0.00017677929066023583, + "loss": 2.581468105316162, + "step": 8881, + "token_acc": 0.36107384298758644 + }, + { + "epoch": 5.206391087657578, + "grad_norm": 0.3184406849672734, + "learning_rate": 0.0001767730805987648, + "loss": 2.59531569480896, + "step": 8882, + "token_acc": 0.3606766693542849 + }, + { + "epoch": 5.206977425974787, + "grad_norm": 0.29010467295338416, + "learning_rate": 0.00017676686981611415, + "loss": 2.60360050201416, + "step": 8883, + "token_acc": 0.35864521591871296 + }, + { + "epoch": 5.207563764291996, + "grad_norm": 0.3588415896014432, + "learning_rate": 0.00017676065831234217, + "loss": 2.6245317459106445, + "step": 8884, + "token_acc": 0.3544902608373391 + }, + { + "epoch": 5.208150102609205, + "grad_norm": 0.32009161546581305, + "learning_rate": 0.00017675444608750723, + "loss": 2.6115505695343018, + "step": 8885, + "token_acc": 0.35673445774794477 + }, + { + "epoch": 5.2087364409264145, + "grad_norm": 0.29866476570579265, + "learning_rate": 0.00017674823314166776, + "loss": 2.5770421028137207, + "step": 8886, + "token_acc": 0.3622060242799817 + }, + { + "epoch": 5.209322779243624, + "grad_norm": 0.3093734670709904, + "learning_rate": 0.00017674201947488202, + "loss": 2.6146249771118164, + "step": 8887, + "token_acc": 0.3581273560657985 + }, + { + "epoch": 5.209909117560833, + "grad_norm": 0.30152675815007357, + "learning_rate": 0.00017673580508720843, + "loss": 2.6202807426452637, + "step": 8888, + "token_acc": 0.35608605830429246 + }, + { + "epoch": 5.210495455878042, + "grad_norm": 0.37814267926313366, + "learning_rate": 0.00017672958997870533, + "loss": 2.572105884552002, + "step": 8889, + "token_acc": 0.3629826109470191 + }, + { + "epoch": 5.211081794195251, + "grad_norm": 0.3292304444912722, + "learning_rate": 0.00017672337414943113, + "loss": 2.6440553665161133, + "step": 8890, + "token_acc": 0.35229318054058517 + }, + { + "epoch": 5.21166813251246, + "grad_norm": 0.3034593574859272, + "learning_rate": 0.00017671715759944422, + "loss": 2.6002702713012695, + "step": 8891, + "token_acc": 0.3573376683410891 + }, + { + "epoch": 5.212254470829668, + "grad_norm": 0.32872101784817487, + "learning_rate": 0.00017671094032880303, + "loss": 2.6020944118499756, + "step": 8892, + "token_acc": 0.35633485014670335 + }, + { + "epoch": 5.212840809146877, + "grad_norm": 0.30040987379298434, + "learning_rate": 0.00017670472233756587, + "loss": 2.6131582260131836, + "step": 8893, + "token_acc": 0.35709080695208845 + }, + { + "epoch": 5.2134271474640865, + "grad_norm": 0.27812740170405953, + "learning_rate": 0.00017669850362579123, + "loss": 2.6491641998291016, + "step": 8894, + "token_acc": 0.35002195191239893 + }, + { + "epoch": 5.214013485781296, + "grad_norm": 0.3040233825752894, + "learning_rate": 0.0001766922841935375, + "loss": 2.606353521347046, + "step": 8895, + "token_acc": 0.35625909656073784 + }, + { + "epoch": 5.214599824098505, + "grad_norm": 0.2886028500828905, + "learning_rate": 0.00017668606404086312, + "loss": 2.6165714263916016, + "step": 8896, + "token_acc": 0.353889548075522 + }, + { + "epoch": 5.215186162415714, + "grad_norm": 0.351504967624042, + "learning_rate": 0.0001766798431678265, + "loss": 2.586872100830078, + "step": 8897, + "token_acc": 0.36083460106100385 + }, + { + "epoch": 5.215772500732923, + "grad_norm": 0.3114049150986594, + "learning_rate": 0.00017667362157448605, + "loss": 2.5593318939208984, + "step": 8898, + "token_acc": 0.3648464233338765 + }, + { + "epoch": 5.216358839050132, + "grad_norm": 0.32620783118979163, + "learning_rate": 0.00017666739926090028, + "loss": 2.607499122619629, + "step": 8899, + "token_acc": 0.35629975221315047 + }, + { + "epoch": 5.216945177367341, + "grad_norm": 0.30508239708023666, + "learning_rate": 0.00017666117622712758, + "loss": 2.579393148422241, + "step": 8900, + "token_acc": 0.36147835898746034 + }, + { + "epoch": 5.21753151568455, + "grad_norm": 0.31512461675205666, + "learning_rate": 0.00017665495247322642, + "loss": 2.58910870552063, + "step": 8901, + "token_acc": 0.3610934162221625 + }, + { + "epoch": 5.218117854001759, + "grad_norm": 0.3331883047453537, + "learning_rate": 0.00017664872799925534, + "loss": 2.5961084365844727, + "step": 8902, + "token_acc": 0.3587714874122641 + }, + { + "epoch": 5.218704192318968, + "grad_norm": 0.3050923596830108, + "learning_rate": 0.00017664250280527267, + "loss": 2.617009162902832, + "step": 8903, + "token_acc": 0.35557874902549336 + }, + { + "epoch": 5.219290530636177, + "grad_norm": 0.36307336990497646, + "learning_rate": 0.000176636276891337, + "loss": 2.5787949562072754, + "step": 8904, + "token_acc": 0.36222543912020566 + }, + { + "epoch": 5.219876868953386, + "grad_norm": 0.29439756799236233, + "learning_rate": 0.00017663005025750676, + "loss": 2.621690273284912, + "step": 8905, + "token_acc": 0.35518129927726677 + }, + { + "epoch": 5.220463207270595, + "grad_norm": 0.3206589952322598, + "learning_rate": 0.00017662382290384047, + "loss": 2.614262104034424, + "step": 8906, + "token_acc": 0.35550557153984325 + }, + { + "epoch": 5.221049545587804, + "grad_norm": 0.290530713150827, + "learning_rate": 0.00017661759483039664, + "loss": 2.612776756286621, + "step": 8907, + "token_acc": 0.35589317746435584 + }, + { + "epoch": 5.221635883905013, + "grad_norm": 0.34661912581747684, + "learning_rate": 0.00017661136603723372, + "loss": 2.6172826290130615, + "step": 8908, + "token_acc": 0.3540361247947455 + }, + { + "epoch": 5.222222222222222, + "grad_norm": 0.29911326096454727, + "learning_rate": 0.00017660513652441027, + "loss": 2.6312520503997803, + "step": 8909, + "token_acc": 0.3538855231229024 + }, + { + "epoch": 5.222808560539431, + "grad_norm": 0.35431828256202935, + "learning_rate": 0.00017659890629198477, + "loss": 2.629631757736206, + "step": 8910, + "token_acc": 0.35414256984171644 + }, + { + "epoch": 5.2233948988566405, + "grad_norm": 0.3186983119913448, + "learning_rate": 0.00017659267534001578, + "loss": 2.5746936798095703, + "step": 8911, + "token_acc": 0.3635192314630136 + }, + { + "epoch": 5.22398123717385, + "grad_norm": 0.3242624801407607, + "learning_rate": 0.00017658644366856178, + "loss": 2.6207313537597656, + "step": 8912, + "token_acc": 0.35678353936952406 + }, + { + "epoch": 5.224567575491059, + "grad_norm": 0.3494445124773035, + "learning_rate": 0.00017658021127768135, + "loss": 2.6116943359375, + "step": 8913, + "token_acc": 0.3559525710740102 + }, + { + "epoch": 5.225153913808267, + "grad_norm": 0.2867130796835485, + "learning_rate": 0.00017657397816743306, + "loss": 2.574814558029175, + "step": 8914, + "token_acc": 0.3633493914144188 + }, + { + "epoch": 5.225740252125476, + "grad_norm": 0.3653283472460645, + "learning_rate": 0.00017656774433787544, + "loss": 2.633619546890259, + "step": 8915, + "token_acc": 0.35268917986190523 + }, + { + "epoch": 5.226326590442685, + "grad_norm": 0.31533882348416115, + "learning_rate": 0.000176561509789067, + "loss": 2.603806972503662, + "step": 8916, + "token_acc": 0.35738972272445113 + }, + { + "epoch": 5.226912928759894, + "grad_norm": 0.3077074753000578, + "learning_rate": 0.00017655527452106634, + "loss": 2.5943570137023926, + "step": 8917, + "token_acc": 0.35786196600948844 + }, + { + "epoch": 5.227499267077103, + "grad_norm": 0.29959809495308665, + "learning_rate": 0.0001765490385339321, + "loss": 2.6193180084228516, + "step": 8918, + "token_acc": 0.35545160294751554 + }, + { + "epoch": 5.2280856053943126, + "grad_norm": 0.3205902709839929, + "learning_rate": 0.00017654280182772273, + "loss": 2.6344871520996094, + "step": 8919, + "token_acc": 0.35142441415224385 + }, + { + "epoch": 5.228671943711522, + "grad_norm": 0.30876965397672074, + "learning_rate": 0.0001765365644024969, + "loss": 2.6367740631103516, + "step": 8920, + "token_acc": 0.3536425476682808 + }, + { + "epoch": 5.229258282028731, + "grad_norm": 0.2831767469074054, + "learning_rate": 0.00017653032625831316, + "loss": 2.608541488647461, + "step": 8921, + "token_acc": 0.3571820719483057 + }, + { + "epoch": 5.22984462034594, + "grad_norm": 0.2817990266436051, + "learning_rate": 0.00017652408739523016, + "loss": 2.6234054565429688, + "step": 8922, + "token_acc": 0.3560192886773779 + }, + { + "epoch": 5.230430958663149, + "grad_norm": 0.2711107440558047, + "learning_rate": 0.00017651784781330646, + "loss": 2.607964038848877, + "step": 8923, + "token_acc": 0.35711216597782053 + }, + { + "epoch": 5.231017296980358, + "grad_norm": 0.3003357545094554, + "learning_rate": 0.0001765116075126007, + "loss": 2.638132333755493, + "step": 8924, + "token_acc": 0.35305141192722445 + }, + { + "epoch": 5.231603635297566, + "grad_norm": 0.2898127458216129, + "learning_rate": 0.00017650536649317148, + "loss": 2.6328983306884766, + "step": 8925, + "token_acc": 0.3540484888741282 + }, + { + "epoch": 5.2321899736147754, + "grad_norm": 0.3086776949497406, + "learning_rate": 0.00017649912475507744, + "loss": 2.5847232341766357, + "step": 8926, + "token_acc": 0.36117247465967744 + }, + { + "epoch": 5.232776311931985, + "grad_norm": 0.2773391206690136, + "learning_rate": 0.00017649288229837722, + "loss": 2.593325614929199, + "step": 8927, + "token_acc": 0.35931855840408705 + }, + { + "epoch": 5.233362650249194, + "grad_norm": 0.3068693167303299, + "learning_rate": 0.00017648663912312942, + "loss": 2.5795040130615234, + "step": 8928, + "token_acc": 0.36141258824166167 + }, + { + "epoch": 5.233948988566403, + "grad_norm": 0.31083938762054275, + "learning_rate": 0.00017648039522939272, + "loss": 2.614588737487793, + "step": 8929, + "token_acc": 0.3560164822692763 + }, + { + "epoch": 5.234535326883612, + "grad_norm": 0.2985515531873977, + "learning_rate": 0.00017647415061722575, + "loss": 2.598916530609131, + "step": 8930, + "token_acc": 0.3579850153377618 + }, + { + "epoch": 5.235121665200821, + "grad_norm": 0.2796026369684912, + "learning_rate": 0.00017646790528668722, + "loss": 2.585221290588379, + "step": 8931, + "token_acc": 0.36120348832645627 + }, + { + "epoch": 5.23570800351803, + "grad_norm": 0.28737463006291303, + "learning_rate": 0.00017646165923783572, + "loss": 2.5958662033081055, + "step": 8932, + "token_acc": 0.3610374016617763 + }, + { + "epoch": 5.236294341835239, + "grad_norm": 0.29868012936335064, + "learning_rate": 0.00017645541247073002, + "loss": 2.647989511489868, + "step": 8933, + "token_acc": 0.35169541810963306 + }, + { + "epoch": 5.236880680152448, + "grad_norm": 0.28741195022807337, + "learning_rate": 0.00017644916498542868, + "loss": 2.6246585845947266, + "step": 8934, + "token_acc": 0.3543490921853422 + }, + { + "epoch": 5.237467018469657, + "grad_norm": 0.3024716302951172, + "learning_rate": 0.0001764429167819905, + "loss": 2.6007468700408936, + "step": 8935, + "token_acc": 0.3576647541763084 + }, + { + "epoch": 5.238053356786866, + "grad_norm": 0.31823492549558297, + "learning_rate": 0.00017643666786047412, + "loss": 2.591785430908203, + "step": 8936, + "token_acc": 0.35987065225964193 + }, + { + "epoch": 5.238639695104075, + "grad_norm": 0.29093448477113826, + "learning_rate": 0.00017643041822093823, + "loss": 2.5966320037841797, + "step": 8937, + "token_acc": 0.35865443847313977 + }, + { + "epoch": 5.239226033421284, + "grad_norm": 0.3602176128493178, + "learning_rate": 0.00017642416786344155, + "loss": 2.60294508934021, + "step": 8938, + "token_acc": 0.35823805012264603 + }, + { + "epoch": 5.239812371738493, + "grad_norm": 0.2863918895449606, + "learning_rate": 0.00017641791678804284, + "loss": 2.643946409225464, + "step": 8939, + "token_acc": 0.35186155596170804 + }, + { + "epoch": 5.240398710055702, + "grad_norm": 0.3250231607642417, + "learning_rate": 0.0001764116649948007, + "loss": 2.5882511138916016, + "step": 8940, + "token_acc": 0.361252967463573 + }, + { + "epoch": 5.240985048372911, + "grad_norm": 0.3373707245106521, + "learning_rate": 0.000176405412483774, + "loss": 2.576939344406128, + "step": 8941, + "token_acc": 0.36355262645788194 + }, + { + "epoch": 5.24157138669012, + "grad_norm": 0.28231517263118694, + "learning_rate": 0.00017639915925502138, + "loss": 2.6037158966064453, + "step": 8942, + "token_acc": 0.35765800556300653 + }, + { + "epoch": 5.2421577250073295, + "grad_norm": 0.44315721684516396, + "learning_rate": 0.00017639290530860162, + "loss": 2.593046188354492, + "step": 8943, + "token_acc": 0.36015892780453285 + }, + { + "epoch": 5.242744063324539, + "grad_norm": 0.3456474183860975, + "learning_rate": 0.00017638665064457343, + "loss": 2.6263513565063477, + "step": 8944, + "token_acc": 0.35402143321065166 + }, + { + "epoch": 5.243330401641748, + "grad_norm": 0.33997836123243064, + "learning_rate": 0.0001763803952629956, + "loss": 2.622840404510498, + "step": 8945, + "token_acc": 0.3549798662726088 + }, + { + "epoch": 5.243916739958956, + "grad_norm": 0.39243144263794494, + "learning_rate": 0.00017637413916392689, + "loss": 2.602046012878418, + "step": 8946, + "token_acc": 0.3579908845589954 + }, + { + "epoch": 5.244503078276165, + "grad_norm": 0.28812177340128425, + "learning_rate": 0.00017636788234742605, + "loss": 2.586763858795166, + "step": 8947, + "token_acc": 0.3614600878751822 + }, + { + "epoch": 5.245089416593374, + "grad_norm": 0.31665677809463044, + "learning_rate": 0.00017636162481355188, + "loss": 2.6205544471740723, + "step": 8948, + "token_acc": 0.35475883624407617 + }, + { + "epoch": 5.245675754910583, + "grad_norm": 0.28631333361964334, + "learning_rate": 0.00017635536656236312, + "loss": 2.600924491882324, + "step": 8949, + "token_acc": 0.35767624254346186 + }, + { + "epoch": 5.246262093227792, + "grad_norm": 0.3416097768998052, + "learning_rate": 0.00017634910759391857, + "loss": 2.590449333190918, + "step": 8950, + "token_acc": 0.35946761632975355 + }, + { + "epoch": 5.2468484315450015, + "grad_norm": 0.2864851848347396, + "learning_rate": 0.00017634284790827705, + "loss": 2.591109275817871, + "step": 8951, + "token_acc": 0.3590155204883434 + }, + { + "epoch": 5.247434769862211, + "grad_norm": 0.32009030196872096, + "learning_rate": 0.00017633658750549738, + "loss": 2.631446123123169, + "step": 8952, + "token_acc": 0.3518361376765615 + }, + { + "epoch": 5.24802110817942, + "grad_norm": 0.30715287919072826, + "learning_rate": 0.00017633032638563828, + "loss": 2.626551628112793, + "step": 8953, + "token_acc": 0.35466651493952034 + }, + { + "epoch": 5.248607446496629, + "grad_norm": 0.30191879835284363, + "learning_rate": 0.00017632406454875867, + "loss": 2.6372604370117188, + "step": 8954, + "token_acc": 0.3513582966226138 + }, + { + "epoch": 5.249193784813838, + "grad_norm": 0.285357260078069, + "learning_rate": 0.0001763178019949173, + "loss": 2.6059539318084717, + "step": 8955, + "token_acc": 0.3571936905836741 + }, + { + "epoch": 5.249780123131047, + "grad_norm": 0.298223730356294, + "learning_rate": 0.000176311538724173, + "loss": 2.639087677001953, + "step": 8956, + "token_acc": 0.35286021741216467 + }, + { + "epoch": 5.250366461448255, + "grad_norm": 0.2866783997004973, + "learning_rate": 0.00017630527473658464, + "loss": 2.562894344329834, + "step": 8957, + "token_acc": 0.36472233722942343 + }, + { + "epoch": 5.250952799765464, + "grad_norm": 0.2944375517389073, + "learning_rate": 0.00017629901003221105, + "loss": 2.5701937675476074, + "step": 8958, + "token_acc": 0.36303990255988505 + }, + { + "epoch": 5.2515391380826735, + "grad_norm": 0.28913217326901264, + "learning_rate": 0.00017629274461111106, + "loss": 2.6354010105133057, + "step": 8959, + "token_acc": 0.3527658874318681 + }, + { + "epoch": 5.252125476399883, + "grad_norm": 0.287896308732696, + "learning_rate": 0.00017628647847334357, + "loss": 2.5816168785095215, + "step": 8960, + "token_acc": 0.36249897605329984 + }, + { + "epoch": 5.252711814717092, + "grad_norm": 0.2836656923185974, + "learning_rate": 0.00017628021161896743, + "loss": 2.6121528148651123, + "step": 8961, + "token_acc": 0.3566316127229255 + }, + { + "epoch": 5.253298153034301, + "grad_norm": 0.28745988300989717, + "learning_rate": 0.00017627394404804144, + "loss": 2.6017661094665527, + "step": 8962, + "token_acc": 0.3586734639633414 + }, + { + "epoch": 5.25388449135151, + "grad_norm": 0.2831945492417219, + "learning_rate": 0.00017626767576062454, + "loss": 2.594233274459839, + "step": 8963, + "token_acc": 0.35934348160625534 + }, + { + "epoch": 5.254470829668719, + "grad_norm": 0.27586738894755025, + "learning_rate": 0.0001762614067567756, + "loss": 2.6412806510925293, + "step": 8964, + "token_acc": 0.3505909246886417 + }, + { + "epoch": 5.255057167985928, + "grad_norm": 0.2944956123430574, + "learning_rate": 0.0001762551370365535, + "loss": 2.5941548347473145, + "step": 8965, + "token_acc": 0.35867293424936464 + }, + { + "epoch": 5.255643506303137, + "grad_norm": 0.29989593621063293, + "learning_rate": 0.00017624886660001717, + "loss": 2.596522569656372, + "step": 8966, + "token_acc": 0.35902004214583105 + }, + { + "epoch": 5.256229844620346, + "grad_norm": 0.3287599683252191, + "learning_rate": 0.00017624259544722545, + "loss": 2.6046249866485596, + "step": 8967, + "token_acc": 0.3566307727366786 + }, + { + "epoch": 5.256816182937555, + "grad_norm": 0.29633392261532654, + "learning_rate": 0.00017623632357823728, + "loss": 2.6302382946014404, + "step": 8968, + "token_acc": 0.35277176887591544 + }, + { + "epoch": 5.257402521254764, + "grad_norm": 0.2964524953036852, + "learning_rate": 0.00017623005099311163, + "loss": 2.611330986022949, + "step": 8969, + "token_acc": 0.3556174189699742 + }, + { + "epoch": 5.257988859571973, + "grad_norm": 0.277834700365497, + "learning_rate": 0.00017622377769190736, + "loss": 2.6310195922851562, + "step": 8970, + "token_acc": 0.35374237459568236 + }, + { + "epoch": 5.258575197889182, + "grad_norm": 0.28835244474603333, + "learning_rate": 0.00017621750367468337, + "loss": 2.627258539199829, + "step": 8971, + "token_acc": 0.3537943932833242 + }, + { + "epoch": 5.259161536206391, + "grad_norm": 0.29280553245944724, + "learning_rate": 0.0001762112289414987, + "loss": 2.5902390480041504, + "step": 8972, + "token_acc": 0.3596100316008875 + }, + { + "epoch": 5.2597478745236, + "grad_norm": 0.3354383506640203, + "learning_rate": 0.00017620495349241218, + "loss": 2.6468722820281982, + "step": 8973, + "token_acc": 0.34950138412614484 + }, + { + "epoch": 5.260334212840809, + "grad_norm": 0.3140280351005481, + "learning_rate": 0.0001761986773274828, + "loss": 2.637523651123047, + "step": 8974, + "token_acc": 0.3513550086135558 + }, + { + "epoch": 5.260920551158018, + "grad_norm": 0.3033298037475534, + "learning_rate": 0.00017619240044676953, + "loss": 2.5959901809692383, + "step": 8975, + "token_acc": 0.3579761280376357 + }, + { + "epoch": 5.2615068894752275, + "grad_norm": 0.2963409896929546, + "learning_rate": 0.00017618612285033136, + "loss": 2.604379177093506, + "step": 8976, + "token_acc": 0.35653769317558404 + }, + { + "epoch": 5.262093227792437, + "grad_norm": 0.35757397159455806, + "learning_rate": 0.00017617984453822722, + "loss": 2.6063408851623535, + "step": 8977, + "token_acc": 0.35741926350065284 + }, + { + "epoch": 5.262679566109645, + "grad_norm": 0.3774046014096985, + "learning_rate": 0.00017617356551051608, + "loss": 2.6097888946533203, + "step": 8978, + "token_acc": 0.3566665036536368 + }, + { + "epoch": 5.263265904426854, + "grad_norm": 0.2951899144061072, + "learning_rate": 0.00017616728576725694, + "loss": 2.6205573081970215, + "step": 8979, + "token_acc": 0.3563465972327535 + }, + { + "epoch": 5.263852242744063, + "grad_norm": 0.3615111545493343, + "learning_rate": 0.0001761610053085088, + "loss": 2.6179280281066895, + "step": 8980, + "token_acc": 0.35593678770626974 + }, + { + "epoch": 5.264438581061272, + "grad_norm": 0.3531519598512695, + "learning_rate": 0.00017615472413433063, + "loss": 2.628978967666626, + "step": 8981, + "token_acc": 0.35328794595735696 + }, + { + "epoch": 5.265024919378481, + "grad_norm": 0.30846183604250915, + "learning_rate": 0.00017614844224478145, + "loss": 2.5970637798309326, + "step": 8982, + "token_acc": 0.3594998813009417 + }, + { + "epoch": 5.26561125769569, + "grad_norm": 0.37615389290053786, + "learning_rate": 0.00017614215963992027, + "loss": 2.6035749912261963, + "step": 8983, + "token_acc": 0.357612877233912 + }, + { + "epoch": 5.2661975960128995, + "grad_norm": 0.3173799132123656, + "learning_rate": 0.0001761358763198061, + "loss": 2.6065468788146973, + "step": 8984, + "token_acc": 0.35802802353374735 + }, + { + "epoch": 5.266783934330109, + "grad_norm": 0.3552028029603691, + "learning_rate": 0.00017612959228449797, + "loss": 2.6264853477478027, + "step": 8985, + "token_acc": 0.35360447363096226 + }, + { + "epoch": 5.267370272647318, + "grad_norm": 0.31518225251382354, + "learning_rate": 0.00017612330753405487, + "loss": 2.6327805519104004, + "step": 8986, + "token_acc": 0.3528630884923337 + }, + { + "epoch": 5.267956610964527, + "grad_norm": 0.3565223622397148, + "learning_rate": 0.00017611702206853593, + "loss": 2.595665216445923, + "step": 8987, + "token_acc": 0.35884508795627884 + }, + { + "epoch": 5.268542949281736, + "grad_norm": 0.305227863163778, + "learning_rate": 0.0001761107358880001, + "loss": 2.6159262657165527, + "step": 8988, + "token_acc": 0.355600270017655 + }, + { + "epoch": 5.269129287598945, + "grad_norm": 0.32285729463743773, + "learning_rate": 0.0001761044489925065, + "loss": 2.6152122020721436, + "step": 8989, + "token_acc": 0.3556207182211469 + }, + { + "epoch": 5.269715625916153, + "grad_norm": 0.3217858554922761, + "learning_rate": 0.00017609816138211413, + "loss": 2.6016697883605957, + "step": 8990, + "token_acc": 0.3580617573739469 + }, + { + "epoch": 5.270301964233362, + "grad_norm": 0.2884168319953974, + "learning_rate": 0.00017609187305688207, + "loss": 2.605201482772827, + "step": 8991, + "token_acc": 0.35689749502227175 + }, + { + "epoch": 5.2708883025505715, + "grad_norm": 0.31755308185400694, + "learning_rate": 0.0001760855840168694, + "loss": 2.611504077911377, + "step": 8992, + "token_acc": 0.3567775736828922 + }, + { + "epoch": 5.271474640867781, + "grad_norm": 0.28539060509850916, + "learning_rate": 0.0001760792942621352, + "loss": 2.585408926010132, + "step": 8993, + "token_acc": 0.3617779878176854 + }, + { + "epoch": 5.27206097918499, + "grad_norm": 0.3050278777883416, + "learning_rate": 0.0001760730037927386, + "loss": 2.6180243492126465, + "step": 8994, + "token_acc": 0.3544958361310706 + }, + { + "epoch": 5.272647317502199, + "grad_norm": 0.29544386523781624, + "learning_rate": 0.00017606671260873857, + "loss": 2.6446807384490967, + "step": 8995, + "token_acc": 0.35140982071504107 + }, + { + "epoch": 5.273233655819408, + "grad_norm": 0.2896486183124645, + "learning_rate": 0.0001760604207101943, + "loss": 2.667173385620117, + "step": 8996, + "token_acc": 0.34786844783164095 + }, + { + "epoch": 5.273819994136617, + "grad_norm": 0.28967403018239507, + "learning_rate": 0.00017605412809716485, + "loss": 2.61602783203125, + "step": 8997, + "token_acc": 0.3556349381653477 + }, + { + "epoch": 5.274406332453826, + "grad_norm": 0.31953608384234616, + "learning_rate": 0.0001760478347697094, + "loss": 2.628969192504883, + "step": 8998, + "token_acc": 0.3541925571688506 + }, + { + "epoch": 5.274992670771035, + "grad_norm": 0.29758483212025405, + "learning_rate": 0.00017604154072788694, + "loss": 2.5840134620666504, + "step": 8999, + "token_acc": 0.36045895383963117 + }, + { + "epoch": 5.2755790090882435, + "grad_norm": 0.27759254190097504, + "learning_rate": 0.00017603524597175675, + "loss": 2.6010642051696777, + "step": 9000, + "token_acc": 0.359265724787752 + }, + { + "epoch": 5.276165347405453, + "grad_norm": 0.28777283060260406, + "learning_rate": 0.00017602895050137783, + "loss": 2.607245445251465, + "step": 9001, + "token_acc": 0.3548003349707944 + }, + { + "epoch": 5.276751685722662, + "grad_norm": 0.31652789205743814, + "learning_rate": 0.0001760226543168094, + "loss": 2.6195006370544434, + "step": 9002, + "token_acc": 0.3547273552486546 + }, + { + "epoch": 5.277338024039871, + "grad_norm": 0.3276983808823185, + "learning_rate": 0.00017601635741811057, + "loss": 2.6375725269317627, + "step": 9003, + "token_acc": 0.3525123331747703 + }, + { + "epoch": 5.27792436235708, + "grad_norm": 0.27027298646974746, + "learning_rate": 0.0001760100598053405, + "loss": 2.6157901287078857, + "step": 9004, + "token_acc": 0.35428544064870215 + }, + { + "epoch": 5.278510700674289, + "grad_norm": 0.31325341227957815, + "learning_rate": 0.00017600376147855834, + "loss": 2.646467685699463, + "step": 9005, + "token_acc": 0.35055022757034515 + }, + { + "epoch": 5.279097038991498, + "grad_norm": 0.26973140277126073, + "learning_rate": 0.00017599746243782325, + "loss": 2.626941204071045, + "step": 9006, + "token_acc": 0.35555930591141316 + }, + { + "epoch": 5.279683377308707, + "grad_norm": 0.3393729213971131, + "learning_rate": 0.0001759911626831944, + "loss": 2.6132426261901855, + "step": 9007, + "token_acc": 0.35756513952053565 + }, + { + "epoch": 5.280269715625916, + "grad_norm": 0.3879644988459256, + "learning_rate": 0.000175984862214731, + "loss": 2.6410703659057617, + "step": 9008, + "token_acc": 0.3516117852759617 + }, + { + "epoch": 5.2808560539431255, + "grad_norm": 0.2739956475897817, + "learning_rate": 0.00017597856103249217, + "loss": 2.638608932495117, + "step": 9009, + "token_acc": 0.351092523798769 + }, + { + "epoch": 5.281442392260335, + "grad_norm": 0.3440551800518427, + "learning_rate": 0.0001759722591365372, + "loss": 2.6260015964508057, + "step": 9010, + "token_acc": 0.35322916558734235 + }, + { + "epoch": 5.282028730577543, + "grad_norm": 0.28232847572962244, + "learning_rate": 0.00017596595652692514, + "loss": 2.6265738010406494, + "step": 9011, + "token_acc": 0.353564243010994 + }, + { + "epoch": 5.282615068894752, + "grad_norm": 0.301102947064147, + "learning_rate": 0.00017595965320371537, + "loss": 2.6153030395507812, + "step": 9012, + "token_acc": 0.35561550384270907 + }, + { + "epoch": 5.283201407211961, + "grad_norm": 0.28628984389884327, + "learning_rate": 0.00017595334916696697, + "loss": 2.6205005645751953, + "step": 9013, + "token_acc": 0.35565197716050206 + }, + { + "epoch": 5.28378774552917, + "grad_norm": 0.29263521241375834, + "learning_rate": 0.00017594704441673925, + "loss": 2.651933431625366, + "step": 9014, + "token_acc": 0.35072530916417 + }, + { + "epoch": 5.284374083846379, + "grad_norm": 0.2801368062592007, + "learning_rate": 0.00017594073895309134, + "loss": 2.6311397552490234, + "step": 9015, + "token_acc": 0.3532900146805838 + }, + { + "epoch": 5.284960422163588, + "grad_norm": 0.27754873020141085, + "learning_rate": 0.00017593443277608254, + "loss": 2.642279863357544, + "step": 9016, + "token_acc": 0.3517929435227335 + }, + { + "epoch": 5.2855467604807975, + "grad_norm": 0.28289431222166866, + "learning_rate": 0.00017592812588577205, + "loss": 2.6196844577789307, + "step": 9017, + "token_acc": 0.35567085050235026 + }, + { + "epoch": 5.286133098798007, + "grad_norm": 0.29657072140753443, + "learning_rate": 0.00017592181828221914, + "loss": 2.58072566986084, + "step": 9018, + "token_acc": 0.36213364015712574 + }, + { + "epoch": 5.286719437115216, + "grad_norm": 0.293469074036933, + "learning_rate": 0.00017591550996548305, + "loss": 2.6210427284240723, + "step": 9019, + "token_acc": 0.35408826461717086 + }, + { + "epoch": 5.287305775432425, + "grad_norm": 0.28211628913550935, + "learning_rate": 0.00017590920093562307, + "loss": 2.6029324531555176, + "step": 9020, + "token_acc": 0.3568860765875917 + }, + { + "epoch": 5.287892113749633, + "grad_norm": 0.295151984323409, + "learning_rate": 0.0001759028911926984, + "loss": 2.6245193481445312, + "step": 9021, + "token_acc": 0.3544909120238114 + }, + { + "epoch": 5.288478452066842, + "grad_norm": 0.28160898763138664, + "learning_rate": 0.0001758965807367684, + "loss": 2.5861737728118896, + "step": 9022, + "token_acc": 0.3603082851637765 + }, + { + "epoch": 5.289064790384051, + "grad_norm": 0.30717280860225477, + "learning_rate": 0.00017589026956789223, + "loss": 2.6251678466796875, + "step": 9023, + "token_acc": 0.35370535962373006 + }, + { + "epoch": 5.28965112870126, + "grad_norm": 0.35208221781005705, + "learning_rate": 0.00017588395768612926, + "loss": 2.6634230613708496, + "step": 9024, + "token_acc": 0.3467819529319984 + }, + { + "epoch": 5.2902374670184695, + "grad_norm": 0.30862952141206784, + "learning_rate": 0.0001758776450915388, + "loss": 2.6463637351989746, + "step": 9025, + "token_acc": 0.3503440806244964 + }, + { + "epoch": 5.290823805335679, + "grad_norm": 0.27931257112576735, + "learning_rate": 0.00017587133178418007, + "loss": 2.624399185180664, + "step": 9026, + "token_acc": 0.35419862397925567 + }, + { + "epoch": 5.291410143652888, + "grad_norm": 0.31866322950055115, + "learning_rate": 0.00017586501776411243, + "loss": 2.618483066558838, + "step": 9027, + "token_acc": 0.35496736765772297 + }, + { + "epoch": 5.291996481970097, + "grad_norm": 0.2848243918570421, + "learning_rate": 0.00017585870303139518, + "loss": 2.662550449371338, + "step": 9028, + "token_acc": 0.3484362061444181 + }, + { + "epoch": 5.292582820287306, + "grad_norm": 0.29604395182398446, + "learning_rate": 0.00017585238758608762, + "loss": 2.621995687484741, + "step": 9029, + "token_acc": 0.35358926316757106 + }, + { + "epoch": 5.293169158604515, + "grad_norm": 0.34917699169332056, + "learning_rate": 0.0001758460714282491, + "loss": 2.635432243347168, + "step": 9030, + "token_acc": 0.35027430118507613 + }, + { + "epoch": 5.293755496921724, + "grad_norm": 0.2881902595411632, + "learning_rate": 0.00017583975455793895, + "loss": 2.6020326614379883, + "step": 9031, + "token_acc": 0.358450350723655 + }, + { + "epoch": 5.294341835238933, + "grad_norm": 0.31148134282562767, + "learning_rate": 0.0001758334369752165, + "loss": 2.5915191173553467, + "step": 9032, + "token_acc": 0.3604079549656045 + }, + { + "epoch": 5.2949281735561415, + "grad_norm": 0.27771926793373564, + "learning_rate": 0.0001758271186801411, + "loss": 2.643895149230957, + "step": 9033, + "token_acc": 0.3527766679984019 + }, + { + "epoch": 5.295514511873351, + "grad_norm": 0.32030550269509334, + "learning_rate": 0.00017582079967277207, + "loss": 2.5994625091552734, + "step": 9034, + "token_acc": 0.3572646364162483 + }, + { + "epoch": 5.29610085019056, + "grad_norm": 0.3265707128987528, + "learning_rate": 0.0001758144799531688, + "loss": 2.588143825531006, + "step": 9035, + "token_acc": 0.36146950164053016 + }, + { + "epoch": 5.296687188507769, + "grad_norm": 0.28077270424230305, + "learning_rate": 0.00017580815952139068, + "loss": 2.620697259902954, + "step": 9036, + "token_acc": 0.3550741338249981 + }, + { + "epoch": 5.297273526824978, + "grad_norm": 0.34808878941063703, + "learning_rate": 0.00017580183837749707, + "loss": 2.6298627853393555, + "step": 9037, + "token_acc": 0.3528846986235608 + }, + { + "epoch": 5.297859865142187, + "grad_norm": 0.26652860326729394, + "learning_rate": 0.0001757955165215473, + "loss": 2.6268820762634277, + "step": 9038, + "token_acc": 0.3527528842594765 + }, + { + "epoch": 5.298446203459396, + "grad_norm": 0.3660975438354635, + "learning_rate": 0.00017578919395360077, + "loss": 2.6217548847198486, + "step": 9039, + "token_acc": 0.35419827724012504 + }, + { + "epoch": 5.299032541776605, + "grad_norm": 0.31493022441359575, + "learning_rate": 0.0001757828706737169, + "loss": 2.593721866607666, + "step": 9040, + "token_acc": 0.3602060906290686 + }, + { + "epoch": 5.299618880093814, + "grad_norm": 0.31305821828760433, + "learning_rate": 0.00017577654668195512, + "loss": 2.630213499069214, + "step": 9041, + "token_acc": 0.3531029871647731 + }, + { + "epoch": 5.3002052184110235, + "grad_norm": 0.3340149637039047, + "learning_rate": 0.0001757702219783748, + "loss": 2.638047695159912, + "step": 9042, + "token_acc": 0.3519916828020987 + }, + { + "epoch": 5.300791556728232, + "grad_norm": 0.2750714503117414, + "learning_rate": 0.0001757638965630353, + "loss": 2.5797386169433594, + "step": 9043, + "token_acc": 0.3608497268199049 + }, + { + "epoch": 5.301377895045441, + "grad_norm": 0.33370012640943, + "learning_rate": 0.00017575757043599606, + "loss": 2.6306817531585693, + "step": 9044, + "token_acc": 0.35539415636360755 + }, + { + "epoch": 5.30196423336265, + "grad_norm": 0.27329696910101586, + "learning_rate": 0.0001757512435973166, + "loss": 2.581338405609131, + "step": 9045, + "token_acc": 0.360382205678035 + }, + { + "epoch": 5.302550571679859, + "grad_norm": 0.30484529874474836, + "learning_rate": 0.00017574491604705625, + "loss": 2.6212024688720703, + "step": 9046, + "token_acc": 0.3539755196492368 + }, + { + "epoch": 5.303136909997068, + "grad_norm": 0.2870395136087814, + "learning_rate": 0.0001757385877852745, + "loss": 2.637681484222412, + "step": 9047, + "token_acc": 0.3514013819663498 + }, + { + "epoch": 5.303723248314277, + "grad_norm": 0.31714505008839566, + "learning_rate": 0.00017573225881203076, + "loss": 2.640303611755371, + "step": 9048, + "token_acc": 0.35329424902289225 + }, + { + "epoch": 5.304309586631486, + "grad_norm": 0.31009667999003915, + "learning_rate": 0.0001757259291273845, + "loss": 2.61525297164917, + "step": 9049, + "token_acc": 0.3559364376223344 + }, + { + "epoch": 5.3048959249486956, + "grad_norm": 0.2943865623688959, + "learning_rate": 0.00017571959873139515, + "loss": 2.6052048206329346, + "step": 9050, + "token_acc": 0.3557980157210806 + }, + { + "epoch": 5.305482263265905, + "grad_norm": 0.3208321714668972, + "learning_rate": 0.00017571326762412225, + "loss": 2.6432807445526123, + "step": 9051, + "token_acc": 0.35159205527596776 + }, + { + "epoch": 5.306068601583114, + "grad_norm": 0.275562669732982, + "learning_rate": 0.00017570693580562518, + "loss": 2.6014018058776855, + "step": 9052, + "token_acc": 0.3576564444551962 + }, + { + "epoch": 5.306654939900323, + "grad_norm": 0.297559007425737, + "learning_rate": 0.00017570060327596351, + "loss": 2.6268725395202637, + "step": 9053, + "token_acc": 0.35392623963296277 + }, + { + "epoch": 5.307241278217531, + "grad_norm": 0.27938043566252896, + "learning_rate": 0.00017569427003519668, + "loss": 2.6344408988952637, + "step": 9054, + "token_acc": 0.35130987699983474 + }, + { + "epoch": 5.30782761653474, + "grad_norm": 0.2897252668071001, + "learning_rate": 0.00017568793608338415, + "loss": 2.617241144180298, + "step": 9055, + "token_acc": 0.3550784081044273 + }, + { + "epoch": 5.308413954851949, + "grad_norm": 0.2716098922223578, + "learning_rate": 0.00017568160142058544, + "loss": 2.6318421363830566, + "step": 9056, + "token_acc": 0.3539247415010161 + }, + { + "epoch": 5.3090002931691584, + "grad_norm": 0.26897198022148067, + "learning_rate": 0.0001756752660468601, + "loss": 2.6266794204711914, + "step": 9057, + "token_acc": 0.3541892040499518 + }, + { + "epoch": 5.309586631486368, + "grad_norm": 0.2718371279179874, + "learning_rate": 0.0001756689299622676, + "loss": 2.622577667236328, + "step": 9058, + "token_acc": 0.3538454749116736 + }, + { + "epoch": 5.310172969803577, + "grad_norm": 0.2758080083095387, + "learning_rate": 0.00017566259316686747, + "loss": 2.6642274856567383, + "step": 9059, + "token_acc": 0.3476271654505093 + }, + { + "epoch": 5.310759308120786, + "grad_norm": 0.2939317034099069, + "learning_rate": 0.00017565625566071924, + "loss": 2.626450777053833, + "step": 9060, + "token_acc": 0.35529531939732223 + }, + { + "epoch": 5.311345646437995, + "grad_norm": 0.3036499410210893, + "learning_rate": 0.00017564991744388242, + "loss": 2.6162467002868652, + "step": 9061, + "token_acc": 0.3570325117618145 + }, + { + "epoch": 5.311931984755204, + "grad_norm": 0.28791743717506507, + "learning_rate": 0.00017564357851641656, + "loss": 2.6299383640289307, + "step": 9062, + "token_acc": 0.35365745980526725 + }, + { + "epoch": 5.312518323072413, + "grad_norm": 0.2750684280734211, + "learning_rate": 0.00017563723887838124, + "loss": 2.641218900680542, + "step": 9063, + "token_acc": 0.35090036451373036 + }, + { + "epoch": 5.313104661389621, + "grad_norm": 0.2983133291420359, + "learning_rate": 0.00017563089852983595, + "loss": 2.6711513996124268, + "step": 9064, + "token_acc": 0.34700824696755733 + }, + { + "epoch": 5.3136909997068305, + "grad_norm": 0.2695695138576655, + "learning_rate": 0.0001756245574708403, + "loss": 2.6378631591796875, + "step": 9065, + "token_acc": 0.3514974975505124 + }, + { + "epoch": 5.31427733802404, + "grad_norm": 0.28383433350963405, + "learning_rate": 0.00017561821570145385, + "loss": 2.6156296730041504, + "step": 9066, + "token_acc": 0.35636258719120756 + }, + { + "epoch": 5.314863676341249, + "grad_norm": 0.30452957685698323, + "learning_rate": 0.00017561187322173615, + "loss": 2.6259407997131348, + "step": 9067, + "token_acc": 0.3533663272445812 + }, + { + "epoch": 5.315450014658458, + "grad_norm": 0.2800858644776289, + "learning_rate": 0.00017560553003174677, + "loss": 2.6577627658843994, + "step": 9068, + "token_acc": 0.3499114688957289 + }, + { + "epoch": 5.316036352975667, + "grad_norm": 0.2766345611127507, + "learning_rate": 0.00017559918613154537, + "loss": 2.6276133060455322, + "step": 9069, + "token_acc": 0.35325496468883927 + }, + { + "epoch": 5.316622691292876, + "grad_norm": 0.3333545721389573, + "learning_rate": 0.00017559284152119143, + "loss": 2.6134591102600098, + "step": 9070, + "token_acc": 0.35598660377915003 + }, + { + "epoch": 5.317209029610085, + "grad_norm": 0.343137750382585, + "learning_rate": 0.00017558649620074462, + "loss": 2.605302572250366, + "step": 9071, + "token_acc": 0.356458748979191 + }, + { + "epoch": 5.317795367927294, + "grad_norm": 0.2941133336109503, + "learning_rate": 0.00017558015017026454, + "loss": 2.6478724479675293, + "step": 9072, + "token_acc": 0.35041021154476454 + }, + { + "epoch": 5.318381706244503, + "grad_norm": 0.35598740922745375, + "learning_rate": 0.0001755738034298108, + "loss": 2.6021063327789307, + "step": 9073, + "token_acc": 0.3586416105267305 + }, + { + "epoch": 5.3189680445617125, + "grad_norm": 0.40048671090041144, + "learning_rate": 0.000175567455979443, + "loss": 2.6390910148620605, + "step": 9074, + "token_acc": 0.3510536877755363 + }, + { + "epoch": 5.319554382878922, + "grad_norm": 0.27526154393841834, + "learning_rate": 0.0001755611078192208, + "loss": 2.616978645324707, + "step": 9075, + "token_acc": 0.35403253757365066 + }, + { + "epoch": 5.32014072119613, + "grad_norm": 0.39565548713407417, + "learning_rate": 0.0001755547589492038, + "loss": 2.6247684955596924, + "step": 9076, + "token_acc": 0.35327722375704795 + }, + { + "epoch": 5.320727059513339, + "grad_norm": 0.3201350921509172, + "learning_rate": 0.00017554840936945162, + "loss": 2.6282029151916504, + "step": 9077, + "token_acc": 0.3526081396656372 + }, + { + "epoch": 5.321313397830548, + "grad_norm": 0.33885618428675307, + "learning_rate": 0.000175542059080024, + "loss": 2.6283724308013916, + "step": 9078, + "token_acc": 0.35313902759659566 + }, + { + "epoch": 5.321899736147757, + "grad_norm": 0.3092494431919273, + "learning_rate": 0.00017553570808098048, + "loss": 2.665473699569702, + "step": 9079, + "token_acc": 0.3471552432746068 + }, + { + "epoch": 5.322486074464966, + "grad_norm": 0.359909528450709, + "learning_rate": 0.00017552935637238078, + "loss": 2.6382510662078857, + "step": 9080, + "token_acc": 0.3515944399018806 + }, + { + "epoch": 5.323072412782175, + "grad_norm": 0.3237204262063517, + "learning_rate": 0.00017552300395428457, + "loss": 2.65928316116333, + "step": 9081, + "token_acc": 0.3492521419483987 + }, + { + "epoch": 5.3236587510993845, + "grad_norm": 0.34288327082963693, + "learning_rate": 0.00017551665082675148, + "loss": 2.5832948684692383, + "step": 9082, + "token_acc": 0.3611499473424258 + }, + { + "epoch": 5.324245089416594, + "grad_norm": 0.31979352849713677, + "learning_rate": 0.00017551029698984123, + "loss": 2.6473593711853027, + "step": 9083, + "token_acc": 0.35192766571586476 + }, + { + "epoch": 5.324831427733803, + "grad_norm": 0.32135892375115627, + "learning_rate": 0.00017550394244361347, + "loss": 2.578754425048828, + "step": 9084, + "token_acc": 0.36233885080353845 + }, + { + "epoch": 5.325417766051012, + "grad_norm": 0.3133324635558715, + "learning_rate": 0.00017549758718812797, + "loss": 2.6548657417297363, + "step": 9085, + "token_acc": 0.34870031911054106 + }, + { + "epoch": 5.32600410436822, + "grad_norm": 0.31439023892076773, + "learning_rate": 0.00017549123122344434, + "loss": 2.635324478149414, + "step": 9086, + "token_acc": 0.3519035612327825 + }, + { + "epoch": 5.326590442685429, + "grad_norm": 0.305983733806818, + "learning_rate": 0.0001754848745496223, + "loss": 2.6060590744018555, + "step": 9087, + "token_acc": 0.35775526319311535 + }, + { + "epoch": 5.327176781002638, + "grad_norm": 0.32409872895944036, + "learning_rate": 0.0001754785171667216, + "loss": 2.6450953483581543, + "step": 9088, + "token_acc": 0.35085297126432247 + }, + { + "epoch": 5.327763119319847, + "grad_norm": 0.31079307818170143, + "learning_rate": 0.00017547215907480195, + "loss": 2.630849838256836, + "step": 9089, + "token_acc": 0.35314364621177763 + }, + { + "epoch": 5.3283494576370565, + "grad_norm": 0.31678481605695397, + "learning_rate": 0.00017546580027392303, + "loss": 2.6259191036224365, + "step": 9090, + "token_acc": 0.35333945077790047 + }, + { + "epoch": 5.328935795954266, + "grad_norm": 0.3264392061279435, + "learning_rate": 0.00017545944076414465, + "loss": 2.6482725143432617, + "step": 9091, + "token_acc": 0.35105781643131884 + }, + { + "epoch": 5.329522134271475, + "grad_norm": 0.28540610736057953, + "learning_rate": 0.00017545308054552647, + "loss": 2.6152689456939697, + "step": 9092, + "token_acc": 0.3557318939821155 + }, + { + "epoch": 5.330108472588684, + "grad_norm": 0.2869917194884141, + "learning_rate": 0.0001754467196181283, + "loss": 2.6385960578918457, + "step": 9093, + "token_acc": 0.3507612409876581 + }, + { + "epoch": 5.330694810905893, + "grad_norm": 0.30336076629311287, + "learning_rate": 0.00017544035798200983, + "loss": 2.5872960090637207, + "step": 9094, + "token_acc": 0.3585177957486373 + }, + { + "epoch": 5.331281149223102, + "grad_norm": 0.2950112938142556, + "learning_rate": 0.00017543399563723088, + "loss": 2.660673141479492, + "step": 9095, + "token_acc": 0.3475361438845908 + }, + { + "epoch": 5.331867487540311, + "grad_norm": 0.3417830081148275, + "learning_rate": 0.00017542763258385119, + "loss": 2.6628928184509277, + "step": 9096, + "token_acc": 0.34686441574936494 + }, + { + "epoch": 5.33245382585752, + "grad_norm": 0.31772020093596137, + "learning_rate": 0.00017542126882193052, + "loss": 2.6203958988189697, + "step": 9097, + "token_acc": 0.354867817649194 + }, + { + "epoch": 5.3330401641747285, + "grad_norm": 0.29091211494358055, + "learning_rate": 0.00017541490435152868, + "loss": 2.5979137420654297, + "step": 9098, + "token_acc": 0.35750370854469443 + }, + { + "epoch": 5.333626502491938, + "grad_norm": 0.335148388390116, + "learning_rate": 0.00017540853917270542, + "loss": 2.6149709224700928, + "step": 9099, + "token_acc": 0.3542441280493335 + }, + { + "epoch": 5.334212840809147, + "grad_norm": 0.298773400512402, + "learning_rate": 0.00017540217328552055, + "loss": 2.6258180141448975, + "step": 9100, + "token_acc": 0.35318468684259857 + }, + { + "epoch": 5.334799179126356, + "grad_norm": 0.28273270044005383, + "learning_rate": 0.00017539580669003387, + "loss": 2.6134450435638428, + "step": 9101, + "token_acc": 0.35420800836832955 + }, + { + "epoch": 5.335385517443565, + "grad_norm": 0.27791619021303676, + "learning_rate": 0.0001753894393863052, + "loss": 2.590485095977783, + "step": 9102, + "token_acc": 0.3588154178479099 + }, + { + "epoch": 5.335971855760774, + "grad_norm": 0.2779319489382343, + "learning_rate": 0.00017538307137439428, + "loss": 2.603686809539795, + "step": 9103, + "token_acc": 0.35787717952545933 + }, + { + "epoch": 5.336558194077983, + "grad_norm": 0.2966877083747229, + "learning_rate": 0.00017537670265436106, + "loss": 2.6256093978881836, + "step": 9104, + "token_acc": 0.35351301693758097 + }, + { + "epoch": 5.337144532395192, + "grad_norm": 0.29528362625209525, + "learning_rate": 0.00017537033322626524, + "loss": 2.64054012298584, + "step": 9105, + "token_acc": 0.3512576453794296 + }, + { + "epoch": 5.337730870712401, + "grad_norm": 0.28817935184415744, + "learning_rate": 0.00017536396309016675, + "loss": 2.6372482776641846, + "step": 9106, + "token_acc": 0.3522232134109554 + }, + { + "epoch": 5.3383172090296105, + "grad_norm": 0.281638366014676, + "learning_rate": 0.00017535759224612533, + "loss": 2.6294965744018555, + "step": 9107, + "token_acc": 0.35383972285364573 + }, + { + "epoch": 5.338903547346819, + "grad_norm": 0.300906129084363, + "learning_rate": 0.00017535122069420092, + "loss": 2.6844325065612793, + "step": 9108, + "token_acc": 0.34291447293858246 + }, + { + "epoch": 5.339489885664028, + "grad_norm": 0.2845672837736264, + "learning_rate": 0.00017534484843445327, + "loss": 2.6173362731933594, + "step": 9109, + "token_acc": 0.3567033563555202 + }, + { + "epoch": 5.340076223981237, + "grad_norm": 0.29336173147617856, + "learning_rate": 0.00017533847546694233, + "loss": 2.589667558670044, + "step": 9110, + "token_acc": 0.35926793215724323 + }, + { + "epoch": 5.340662562298446, + "grad_norm": 0.29478278718906475, + "learning_rate": 0.00017533210179172795, + "loss": 2.658782720565796, + "step": 9111, + "token_acc": 0.3493949479106895 + }, + { + "epoch": 5.341248900615655, + "grad_norm": 0.27874069023201703, + "learning_rate": 0.00017532572740886995, + "loss": 2.6278762817382812, + "step": 9112, + "token_acc": 0.35338075550191317 + }, + { + "epoch": 5.341835238932864, + "grad_norm": 0.31119150716155614, + "learning_rate": 0.00017531935231842826, + "loss": 2.6399271488189697, + "step": 9113, + "token_acc": 0.35126385740458965 + }, + { + "epoch": 5.342421577250073, + "grad_norm": 0.286739443796918, + "learning_rate": 0.00017531297652046278, + "loss": 2.6200504302978516, + "step": 9114, + "token_acc": 0.35347253466943895 + }, + { + "epoch": 5.3430079155672825, + "grad_norm": 0.391878898213163, + "learning_rate": 0.0001753066000150333, + "loss": 2.65126371383667, + "step": 9115, + "token_acc": 0.34753103826317494 + }, + { + "epoch": 5.343594253884492, + "grad_norm": 0.2940297680977322, + "learning_rate": 0.00017530022280219987, + "loss": 2.6300222873687744, + "step": 9116, + "token_acc": 0.3532876537891534 + }, + { + "epoch": 5.344180592201701, + "grad_norm": 0.29281241244573347, + "learning_rate": 0.00017529384488202228, + "loss": 2.587777614593506, + "step": 9117, + "token_acc": 0.3595239475362151 + }, + { + "epoch": 5.34476693051891, + "grad_norm": 0.29254328966189197, + "learning_rate": 0.00017528746625456045, + "loss": 2.636072874069214, + "step": 9118, + "token_acc": 0.35251633582751013 + }, + { + "epoch": 5.345353268836118, + "grad_norm": 0.31486354698924723, + "learning_rate": 0.00017528108691987438, + "loss": 2.6129727363586426, + "step": 9119, + "token_acc": 0.35666382885233294 + }, + { + "epoch": 5.345939607153327, + "grad_norm": 0.31731883285600876, + "learning_rate": 0.00017527470687802386, + "loss": 2.657259464263916, + "step": 9120, + "token_acc": 0.3482393867164852 + }, + { + "epoch": 5.346525945470536, + "grad_norm": 0.3041777261364634, + "learning_rate": 0.00017526832612906897, + "loss": 2.661468267440796, + "step": 9121, + "token_acc": 0.3480633190897753 + }, + { + "epoch": 5.347112283787745, + "grad_norm": 0.3327609475708628, + "learning_rate": 0.00017526194467306957, + "loss": 2.6475026607513428, + "step": 9122, + "token_acc": 0.35100722158874953 + }, + { + "epoch": 5.3476986221049545, + "grad_norm": 0.30706185795630264, + "learning_rate": 0.0001752555625100856, + "loss": 2.616863250732422, + "step": 9123, + "token_acc": 0.35492202244958776 + }, + { + "epoch": 5.348284960422164, + "grad_norm": 0.3296483534679435, + "learning_rate": 0.00017524917964017707, + "loss": 2.6088900566101074, + "step": 9124, + "token_acc": 0.35612530290252675 + }, + { + "epoch": 5.348871298739373, + "grad_norm": 0.3066028286091517, + "learning_rate": 0.00017524279606340385, + "loss": 2.660628080368042, + "step": 9125, + "token_acc": 0.34801751994731767 + }, + { + "epoch": 5.349457637056582, + "grad_norm": 0.3264146615310536, + "learning_rate": 0.00017523641177982598, + "loss": 2.61086106300354, + "step": 9126, + "token_acc": 0.3552697631056821 + }, + { + "epoch": 5.350043975373791, + "grad_norm": 0.30092092145481386, + "learning_rate": 0.00017523002678950338, + "loss": 2.6382458209991455, + "step": 9127, + "token_acc": 0.3517462627507323 + }, + { + "epoch": 5.350630313691, + "grad_norm": 0.34773964828063153, + "learning_rate": 0.00017522364109249608, + "loss": 2.6637630462646484, + "step": 9128, + "token_acc": 0.34718433305943575 + }, + { + "epoch": 5.351216652008208, + "grad_norm": 0.2811897977512063, + "learning_rate": 0.00017521725468886402, + "loss": 2.6443536281585693, + "step": 9129, + "token_acc": 0.3521953438217866 + }, + { + "epoch": 5.351802990325417, + "grad_norm": 0.33117984361285163, + "learning_rate": 0.00017521086757866722, + "loss": 2.631499767303467, + "step": 9130, + "token_acc": 0.35433478876273955 + }, + { + "epoch": 5.3523893286426265, + "grad_norm": 0.29510787992485477, + "learning_rate": 0.00017520447976196565, + "loss": 2.671140670776367, + "step": 9131, + "token_acc": 0.3471141362080497 + }, + { + "epoch": 5.352975666959836, + "grad_norm": 0.3217863253605579, + "learning_rate": 0.00017519809123881931, + "loss": 2.6426329612731934, + "step": 9132, + "token_acc": 0.3510991423156407 + }, + { + "epoch": 5.353562005277045, + "grad_norm": 0.30524592704227443, + "learning_rate": 0.0001751917020092883, + "loss": 2.6281991004943848, + "step": 9133, + "token_acc": 0.35266112432837016 + }, + { + "epoch": 5.354148343594254, + "grad_norm": 0.27425010820647794, + "learning_rate": 0.0001751853120734325, + "loss": 2.6257667541503906, + "step": 9134, + "token_acc": 0.3532135294149024 + }, + { + "epoch": 5.354734681911463, + "grad_norm": 0.2958922525868725, + "learning_rate": 0.00017517892143131207, + "loss": 2.6101741790771484, + "step": 9135, + "token_acc": 0.3577594849734921 + }, + { + "epoch": 5.355321020228672, + "grad_norm": 0.2779829006150963, + "learning_rate": 0.00017517253008298694, + "loss": 2.611966609954834, + "step": 9136, + "token_acc": 0.3557986630252915 + }, + { + "epoch": 5.355907358545881, + "grad_norm": 0.283422800707739, + "learning_rate": 0.0001751661380285172, + "loss": 2.6100351810455322, + "step": 9137, + "token_acc": 0.3557722399679284 + }, + { + "epoch": 5.35649369686309, + "grad_norm": 0.28855634578453615, + "learning_rate": 0.00017515974526796288, + "loss": 2.6458120346069336, + "step": 9138, + "token_acc": 0.3503066378537702 + }, + { + "epoch": 5.357080035180299, + "grad_norm": 0.2935697920836456, + "learning_rate": 0.00017515335180138403, + "loss": 2.67828369140625, + "step": 9139, + "token_acc": 0.34568130648827416 + }, + { + "epoch": 5.3576663734975085, + "grad_norm": 0.3043977162626109, + "learning_rate": 0.0001751469576288407, + "loss": 2.6350231170654297, + "step": 9140, + "token_acc": 0.35185208991641365 + }, + { + "epoch": 5.358252711814717, + "grad_norm": 0.27760535331149716, + "learning_rate": 0.00017514056275039298, + "loss": 2.6588568687438965, + "step": 9141, + "token_acc": 0.34780973623466555 + }, + { + "epoch": 5.358839050131926, + "grad_norm": 0.2835811473968491, + "learning_rate": 0.00017513416716610092, + "loss": 2.6256415843963623, + "step": 9142, + "token_acc": 0.35355226656496686 + }, + { + "epoch": 5.359425388449135, + "grad_norm": 0.2802774411176032, + "learning_rate": 0.0001751277708760246, + "loss": 2.661647319793701, + "step": 9143, + "token_acc": 0.34815232362122234 + }, + { + "epoch": 5.360011726766344, + "grad_norm": 0.2875446633216819, + "learning_rate": 0.00017512137388022412, + "loss": 2.667041778564453, + "step": 9144, + "token_acc": 0.3486363529586795 + }, + { + "epoch": 5.360598065083553, + "grad_norm": 0.2787932426554914, + "learning_rate": 0.00017511497617875955, + "loss": 2.6451539993286133, + "step": 9145, + "token_acc": 0.3512874160209435 + }, + { + "epoch": 5.361184403400762, + "grad_norm": 0.3123570563453856, + "learning_rate": 0.000175108577771691, + "loss": 2.6678757667541504, + "step": 9146, + "token_acc": 0.34696859021183346 + }, + { + "epoch": 5.361770741717971, + "grad_norm": 0.3009270211128707, + "learning_rate": 0.00017510217865907856, + "loss": 2.6367340087890625, + "step": 9147, + "token_acc": 0.3511767143747084 + }, + { + "epoch": 5.3623570800351805, + "grad_norm": 0.32113995381240895, + "learning_rate": 0.00017509577884098238, + "loss": 2.6606087684631348, + "step": 9148, + "token_acc": 0.3486898332283091 + }, + { + "epoch": 5.36294341835239, + "grad_norm": 0.27959369168445586, + "learning_rate": 0.00017508937831746253, + "loss": 2.662008762359619, + "step": 9149, + "token_acc": 0.3475869246064931 + }, + { + "epoch": 5.363529756669599, + "grad_norm": 0.3230672578386095, + "learning_rate": 0.00017508297708857917, + "loss": 2.6738786697387695, + "step": 9150, + "token_acc": 0.3471830675996469 + }, + { + "epoch": 5.364116094986807, + "grad_norm": 0.2909542830772192, + "learning_rate": 0.0001750765751543924, + "loss": 2.656966209411621, + "step": 9151, + "token_acc": 0.3502919952625911 + }, + { + "epoch": 5.364702433304016, + "grad_norm": 0.3172404502030238, + "learning_rate": 0.00017507017251496237, + "loss": 2.6545541286468506, + "step": 9152, + "token_acc": 0.3506357541172131 + }, + { + "epoch": 5.365288771621225, + "grad_norm": 0.2720645153299547, + "learning_rate": 0.00017506376917034925, + "loss": 2.5917651653289795, + "step": 9153, + "token_acc": 0.35840305808762596 + }, + { + "epoch": 5.365875109938434, + "grad_norm": 0.3459479560443342, + "learning_rate": 0.00017505736512061316, + "loss": 2.611440658569336, + "step": 9154, + "token_acc": 0.35605213363085614 + }, + { + "epoch": 5.366461448255643, + "grad_norm": 0.3249418903877057, + "learning_rate": 0.00017505096036581424, + "loss": 2.658864974975586, + "step": 9155, + "token_acc": 0.34792251231310123 + }, + { + "epoch": 5.3670477865728525, + "grad_norm": 0.2922989062209861, + "learning_rate": 0.0001750445549060127, + "loss": 2.6486916542053223, + "step": 9156, + "token_acc": 0.3512991499518446 + }, + { + "epoch": 5.367634124890062, + "grad_norm": 0.35274622803534633, + "learning_rate": 0.0001750381487412687, + "loss": 2.6241095066070557, + "step": 9157, + "token_acc": 0.35361047579670174 + }, + { + "epoch": 5.368220463207271, + "grad_norm": 0.3763943354075912, + "learning_rate": 0.0001750317418716424, + "loss": 2.6429853439331055, + "step": 9158, + "token_acc": 0.3505435793838548 + }, + { + "epoch": 5.36880680152448, + "grad_norm": 0.30612400513216764, + "learning_rate": 0.00017502533429719397, + "loss": 2.6591031551361084, + "step": 9159, + "token_acc": 0.3490009937758251 + }, + { + "epoch": 5.369393139841689, + "grad_norm": 0.3203736641308976, + "learning_rate": 0.00017501892601798366, + "loss": 2.6291561126708984, + "step": 9160, + "token_acc": 0.3533349034743168 + }, + { + "epoch": 5.369979478158898, + "grad_norm": 0.34192120109428303, + "learning_rate": 0.0001750125170340716, + "loss": 2.627112865447998, + "step": 9161, + "token_acc": 0.35359891740969945 + }, + { + "epoch": 5.370565816476106, + "grad_norm": 0.2843544638963652, + "learning_rate": 0.00017500610734551804, + "loss": 2.5898313522338867, + "step": 9162, + "token_acc": 0.3585044281002691 + }, + { + "epoch": 5.371152154793315, + "grad_norm": 0.31517302840536654, + "learning_rate": 0.00017499969695238319, + "loss": 2.665757894515991, + "step": 9163, + "token_acc": 0.34857948876356887 + }, + { + "epoch": 5.3717384931105245, + "grad_norm": 0.27688638128701154, + "learning_rate": 0.0001749932858547272, + "loss": 2.628725528717041, + "step": 9164, + "token_acc": 0.3535559827006247 + }, + { + "epoch": 5.372324831427734, + "grad_norm": 0.30913899690604996, + "learning_rate": 0.0001749868740526104, + "loss": 2.634352922439575, + "step": 9165, + "token_acc": 0.3510590597119113 + }, + { + "epoch": 5.372911169744943, + "grad_norm": 0.2748201231281132, + "learning_rate": 0.0001749804615460929, + "loss": 2.678978204727173, + "step": 9166, + "token_acc": 0.3464749152822547 + }, + { + "epoch": 5.373497508062152, + "grad_norm": 0.31381779824559786, + "learning_rate": 0.00017497404833523506, + "loss": 2.647855758666992, + "step": 9167, + "token_acc": 0.34964007561834853 + }, + { + "epoch": 5.374083846379361, + "grad_norm": 0.29518899338823756, + "learning_rate": 0.00017496763442009704, + "loss": 2.631199598312378, + "step": 9168, + "token_acc": 0.35215448535368055 + }, + { + "epoch": 5.37467018469657, + "grad_norm": 0.27892144909398875, + "learning_rate": 0.0001749612198007391, + "loss": 2.715101718902588, + "step": 9169, + "token_acc": 0.3407833961346785 + }, + { + "epoch": 5.375256523013779, + "grad_norm": 0.2946675782950637, + "learning_rate": 0.00017495480447722158, + "loss": 2.627732276916504, + "step": 9170, + "token_acc": 0.3533859124286103 + }, + { + "epoch": 5.375842861330988, + "grad_norm": 0.2692378569610981, + "learning_rate": 0.0001749483884496046, + "loss": 2.6475040912628174, + "step": 9171, + "token_acc": 0.3514483510656923 + }, + { + "epoch": 5.3764291996481965, + "grad_norm": 0.30928815350488864, + "learning_rate": 0.00017494197171794853, + "loss": 2.6394259929656982, + "step": 9172, + "token_acc": 0.3512110400030205 + }, + { + "epoch": 5.377015537965406, + "grad_norm": 0.27381692864354157, + "learning_rate": 0.0001749355542823136, + "loss": 2.655602216720581, + "step": 9173, + "token_acc": 0.348442514002392 + }, + { + "epoch": 5.377601876282615, + "grad_norm": 0.2770754440448715, + "learning_rate": 0.00017492913614276014, + "loss": 2.6748037338256836, + "step": 9174, + "token_acc": 0.34641980703082526 + }, + { + "epoch": 5.378188214599824, + "grad_norm": 0.2821182147745218, + "learning_rate": 0.00017492271729934843, + "loss": 2.640951633453369, + "step": 9175, + "token_acc": 0.34960844868471846 + }, + { + "epoch": 5.378774552917033, + "grad_norm": 0.29134355449895866, + "learning_rate": 0.0001749162977521387, + "loss": 2.694399356842041, + "step": 9176, + "token_acc": 0.34526181158829117 + }, + { + "epoch": 5.379360891234242, + "grad_norm": 0.28339730088405507, + "learning_rate": 0.00017490987750119134, + "loss": 2.653134822845459, + "step": 9177, + "token_acc": 0.3489772441312866 + }, + { + "epoch": 5.379947229551451, + "grad_norm": 0.2903999014350154, + "learning_rate": 0.00017490345654656663, + "loss": 2.6606860160827637, + "step": 9178, + "token_acc": 0.34796152273915043 + }, + { + "epoch": 5.38053356786866, + "grad_norm": 0.29677740599003216, + "learning_rate": 0.00017489703488832484, + "loss": 2.679461717605591, + "step": 9179, + "token_acc": 0.3452262418409116 + }, + { + "epoch": 5.381119906185869, + "grad_norm": 0.2697579475166165, + "learning_rate": 0.00017489061252652638, + "loss": 2.6446759700775146, + "step": 9180, + "token_acc": 0.34955150217394143 + }, + { + "epoch": 5.3817062445030786, + "grad_norm": 0.3030978500479128, + "learning_rate": 0.0001748841894612315, + "loss": 2.638965129852295, + "step": 9181, + "token_acc": 0.3517012569203565 + }, + { + "epoch": 5.382292582820288, + "grad_norm": 0.3051325901575654, + "learning_rate": 0.0001748777656925006, + "loss": 2.6479573249816895, + "step": 9182, + "token_acc": 0.35169829974893 + }, + { + "epoch": 5.382878921137497, + "grad_norm": 0.2875584734640436, + "learning_rate": 0.00017487134122039395, + "loss": 2.6258015632629395, + "step": 9183, + "token_acc": 0.3537485405759508 + }, + { + "epoch": 5.383465259454705, + "grad_norm": 0.2757945525569109, + "learning_rate": 0.00017486491604497197, + "loss": 2.664182186126709, + "step": 9184, + "token_acc": 0.348831485738523 + }, + { + "epoch": 5.384051597771914, + "grad_norm": 0.2729210743588234, + "learning_rate": 0.000174858490166295, + "loss": 2.6397876739501953, + "step": 9185, + "token_acc": 0.34964197619560794 + }, + { + "epoch": 5.384637936089123, + "grad_norm": 0.28739859326512823, + "learning_rate": 0.00017485206358442332, + "loss": 2.645949125289917, + "step": 9186, + "token_acc": 0.3506532427034973 + }, + { + "epoch": 5.385224274406332, + "grad_norm": 0.308235423230451, + "learning_rate": 0.00017484563629941745, + "loss": 2.6778860092163086, + "step": 9187, + "token_acc": 0.3448710712451502 + }, + { + "epoch": 5.3858106127235414, + "grad_norm": 0.2967589316678, + "learning_rate": 0.0001748392083113376, + "loss": 2.657205104827881, + "step": 9188, + "token_acc": 0.35011086180113915 + }, + { + "epoch": 5.386396951040751, + "grad_norm": 0.2979831071707743, + "learning_rate": 0.0001748327796202443, + "loss": 2.6488137245178223, + "step": 9189, + "token_acc": 0.35024664526135063 + }, + { + "epoch": 5.38698328935796, + "grad_norm": 0.2877212544030338, + "learning_rate": 0.00017482635022619784, + "loss": 2.6232125759124756, + "step": 9190, + "token_acc": 0.35373985534400715 + }, + { + "epoch": 5.387569627675169, + "grad_norm": 0.2828034670223659, + "learning_rate": 0.00017481992012925864, + "loss": 2.6502652168273926, + "step": 9191, + "token_acc": 0.35097181208755857 + }, + { + "epoch": 5.388155965992378, + "grad_norm": 0.3247823988867913, + "learning_rate": 0.00017481348932948712, + "loss": 2.652045726776123, + "step": 9192, + "token_acc": 0.3506194135173279 + }, + { + "epoch": 5.388742304309587, + "grad_norm": 0.2877578392348327, + "learning_rate": 0.00017480705782694372, + "loss": 2.6619410514831543, + "step": 9193, + "token_acc": 0.34804089737192284 + }, + { + "epoch": 5.389328642626795, + "grad_norm": 0.2945741136560786, + "learning_rate": 0.00017480062562168878, + "loss": 2.6558499336242676, + "step": 9194, + "token_acc": 0.34909756667618874 + }, + { + "epoch": 5.389914980944004, + "grad_norm": 0.28568789621593405, + "learning_rate": 0.00017479419271378274, + "loss": 2.672414541244507, + "step": 9195, + "token_acc": 0.3464891982450619 + }, + { + "epoch": 5.3905013192612135, + "grad_norm": 0.3033980329969103, + "learning_rate": 0.0001747877591032861, + "loss": 2.6636180877685547, + "step": 9196, + "token_acc": 0.3466803847208385 + }, + { + "epoch": 5.391087657578423, + "grad_norm": 0.27552698443000784, + "learning_rate": 0.00017478132479025921, + "loss": 2.668168067932129, + "step": 9197, + "token_acc": 0.3466289108869471 + }, + { + "epoch": 5.391673995895632, + "grad_norm": 0.29275583402266914, + "learning_rate": 0.00017477488977476254, + "loss": 2.6359057426452637, + "step": 9198, + "token_acc": 0.3519196673561569 + }, + { + "epoch": 5.392260334212841, + "grad_norm": 0.3150720530420571, + "learning_rate": 0.00017476845405685654, + "loss": 2.6451005935668945, + "step": 9199, + "token_acc": 0.35023056274437914 + }, + { + "epoch": 5.39284667253005, + "grad_norm": 0.30543353534881174, + "learning_rate": 0.00017476201763660167, + "loss": 2.648590564727783, + "step": 9200, + "token_acc": 0.3488169901422295 + }, + { + "epoch": 5.393433010847259, + "grad_norm": 0.29511651739550887, + "learning_rate": 0.0001747555805140584, + "loss": 2.66976261138916, + "step": 9201, + "token_acc": 0.34743473079534476 + }, + { + "epoch": 5.394019349164468, + "grad_norm": 0.2944459553185582, + "learning_rate": 0.00017474914268928715, + "loss": 2.6645655632019043, + "step": 9202, + "token_acc": 0.3474706713315747 + }, + { + "epoch": 5.394605687481677, + "grad_norm": 0.26957972039691674, + "learning_rate": 0.00017474270416234847, + "loss": 2.6345341205596924, + "step": 9203, + "token_acc": 0.35123961613551313 + }, + { + "epoch": 5.395192025798886, + "grad_norm": 0.2929112440455568, + "learning_rate": 0.00017473626493330277, + "loss": 2.6242728233337402, + "step": 9204, + "token_acc": 0.35428383916238676 + }, + { + "epoch": 5.395778364116095, + "grad_norm": 0.3372180127903838, + "learning_rate": 0.0001747298250022106, + "loss": 2.6255886554718018, + "step": 9205, + "token_acc": 0.35243933887549284 + }, + { + "epoch": 5.396364702433304, + "grad_norm": 0.34854658561618224, + "learning_rate": 0.00017472338436913242, + "loss": 2.641961097717285, + "step": 9206, + "token_acc": 0.35080886527646826 + }, + { + "epoch": 5.396951040750513, + "grad_norm": 0.27998655527515354, + "learning_rate": 0.0001747169430341287, + "loss": 2.6474790573120117, + "step": 9207, + "token_acc": 0.3503120698385074 + }, + { + "epoch": 5.397537379067722, + "grad_norm": 0.3138349883677517, + "learning_rate": 0.00017471050099726, + "loss": 2.633378028869629, + "step": 9208, + "token_acc": 0.3529273344590489 + }, + { + "epoch": 5.398123717384931, + "grad_norm": 0.3128681099397522, + "learning_rate": 0.0001747040582585868, + "loss": 2.6790719032287598, + "step": 9209, + "token_acc": 0.3445141809016383 + }, + { + "epoch": 5.39871005570214, + "grad_norm": 0.29269220154742487, + "learning_rate": 0.00017469761481816968, + "loss": 2.6462650299072266, + "step": 9210, + "token_acc": 0.3506086582511453 + }, + { + "epoch": 5.399296394019349, + "grad_norm": 0.28290830017447266, + "learning_rate": 0.00017469117067606913, + "loss": 2.626572608947754, + "step": 9211, + "token_acc": 0.3522772618793235 + }, + { + "epoch": 5.399882732336558, + "grad_norm": 0.303152004723051, + "learning_rate": 0.00017468472583234563, + "loss": 2.6611132621765137, + "step": 9212, + "token_acc": 0.34688854276790976 + }, + { + "epoch": 5.4004690706537675, + "grad_norm": 0.2744499388078628, + "learning_rate": 0.0001746782802870598, + "loss": 2.6674013137817383, + "step": 9213, + "token_acc": 0.34850612153206784 + }, + { + "epoch": 5.401055408970977, + "grad_norm": 0.27660877961609887, + "learning_rate": 0.00017467183404027217, + "loss": 2.639521837234497, + "step": 9214, + "token_acc": 0.352064184693467 + }, + { + "epoch": 5.401641747288186, + "grad_norm": 0.28652071307511084, + "learning_rate": 0.00017466538709204324, + "loss": 2.6406121253967285, + "step": 9215, + "token_acc": 0.3526194316151607 + }, + { + "epoch": 5.402228085605394, + "grad_norm": 0.2633453024280151, + "learning_rate": 0.00017465893944243363, + "loss": 2.647779941558838, + "step": 9216, + "token_acc": 0.349663027328002 + }, + { + "epoch": 5.402814423922603, + "grad_norm": 0.29036724444051026, + "learning_rate": 0.0001746524910915039, + "loss": 2.668825149536133, + "step": 9217, + "token_acc": 0.345658357035302 + }, + { + "epoch": 5.403400762239812, + "grad_norm": 0.2838923432302892, + "learning_rate": 0.0001746460420393146, + "loss": 2.64522647857666, + "step": 9218, + "token_acc": 0.35120988785784296 + }, + { + "epoch": 5.403987100557021, + "grad_norm": 0.2779513825513963, + "learning_rate": 0.0001746395922859263, + "loss": 2.645008087158203, + "step": 9219, + "token_acc": 0.34980507598058713 + }, + { + "epoch": 5.40457343887423, + "grad_norm": 0.3255499891434415, + "learning_rate": 0.00017463314183139965, + "loss": 2.6132619380950928, + "step": 9220, + "token_acc": 0.35555567416959794 + }, + { + "epoch": 5.4051597771914395, + "grad_norm": 0.3152259872423518, + "learning_rate": 0.00017462669067579517, + "loss": 2.639146089553833, + "step": 9221, + "token_acc": 0.35209544046491387 + }, + { + "epoch": 5.405746115508649, + "grad_norm": 0.2865576663914704, + "learning_rate": 0.0001746202388191735, + "loss": 2.634516954421997, + "step": 9222, + "token_acc": 0.3531703777937845 + }, + { + "epoch": 5.406332453825858, + "grad_norm": 0.30907224675001005, + "learning_rate": 0.00017461378626159525, + "loss": 2.6406731605529785, + "step": 9223, + "token_acc": 0.350268175622392 + }, + { + "epoch": 5.406918792143067, + "grad_norm": 0.3200922583515522, + "learning_rate": 0.00017460733300312105, + "loss": 2.6428892612457275, + "step": 9224, + "token_acc": 0.3525032308816093 + }, + { + "epoch": 5.407505130460276, + "grad_norm": 0.284691396576186, + "learning_rate": 0.00017460087904381144, + "loss": 2.6719746589660645, + "step": 9225, + "token_acc": 0.34598874391790385 + }, + { + "epoch": 5.408091468777485, + "grad_norm": 0.33064610000445716, + "learning_rate": 0.0001745944243837271, + "loss": 2.6362950801849365, + "step": 9226, + "token_acc": 0.35206593739645914 + }, + { + "epoch": 5.408677807094693, + "grad_norm": 0.27921331806323674, + "learning_rate": 0.00017458796902292869, + "loss": 2.6359457969665527, + "step": 9227, + "token_acc": 0.35187015678024336 + }, + { + "epoch": 5.409264145411902, + "grad_norm": 0.3324757094536556, + "learning_rate": 0.0001745815129614768, + "loss": 2.678595542907715, + "step": 9228, + "token_acc": 0.3445675759454433 + }, + { + "epoch": 5.4098504837291115, + "grad_norm": 0.35578066812042636, + "learning_rate": 0.0001745750561994321, + "loss": 2.646449565887451, + "step": 9229, + "token_acc": 0.35135618511706623 + }, + { + "epoch": 5.410436822046321, + "grad_norm": 0.2974071114269231, + "learning_rate": 0.00017456859873685523, + "loss": 2.6521496772766113, + "step": 9230, + "token_acc": 0.35014047486011535 + }, + { + "epoch": 5.41102316036353, + "grad_norm": 0.35828511272411984, + "learning_rate": 0.0001745621405738069, + "loss": 2.663882255554199, + "step": 9231, + "token_acc": 0.3487180554724316 + }, + { + "epoch": 5.411609498680739, + "grad_norm": 0.35791225373617613, + "learning_rate": 0.00017455568171034766, + "loss": 2.641292095184326, + "step": 9232, + "token_acc": 0.35025245010264455 + }, + { + "epoch": 5.412195836997948, + "grad_norm": 0.2985744761471861, + "learning_rate": 0.0001745492221465383, + "loss": 2.6272144317626953, + "step": 9233, + "token_acc": 0.3530285236031804 + }, + { + "epoch": 5.412782175315157, + "grad_norm": 0.31776533100372834, + "learning_rate": 0.00017454276188243946, + "loss": 2.648611545562744, + "step": 9234, + "token_acc": 0.35067204660235685 + }, + { + "epoch": 5.413368513632366, + "grad_norm": 0.2949566010130329, + "learning_rate": 0.00017453630091811178, + "loss": 2.645766258239746, + "step": 9235, + "token_acc": 0.351200272016321 + }, + { + "epoch": 5.413954851949575, + "grad_norm": 0.3484661470153019, + "learning_rate": 0.00017452983925361605, + "loss": 2.6655569076538086, + "step": 9236, + "token_acc": 0.3486717620545965 + }, + { + "epoch": 5.4145411902667835, + "grad_norm": 0.2723768563723372, + "learning_rate": 0.0001745233768890129, + "loss": 2.6693434715270996, + "step": 9237, + "token_acc": 0.34740341964780486 + }, + { + "epoch": 5.415127528583993, + "grad_norm": 0.3039809968566393, + "learning_rate": 0.000174516913824363, + "loss": 2.644955635070801, + "step": 9238, + "token_acc": 0.35083827424407427 + }, + { + "epoch": 5.415713866901202, + "grad_norm": 0.2890490171994882, + "learning_rate": 0.00017451045005972712, + "loss": 2.6863882541656494, + "step": 9239, + "token_acc": 0.3436282148731337 + }, + { + "epoch": 5.416300205218411, + "grad_norm": 0.31801458186699505, + "learning_rate": 0.00017450398559516598, + "loss": 2.676280975341797, + "step": 9240, + "token_acc": 0.3459335303070087 + }, + { + "epoch": 5.41688654353562, + "grad_norm": 0.283411162699875, + "learning_rate": 0.0001744975204307403, + "loss": 2.6563234329223633, + "step": 9241, + "token_acc": 0.34891630663707474 + }, + { + "epoch": 5.417472881852829, + "grad_norm": 0.34295498007173897, + "learning_rate": 0.00017449105456651077, + "loss": 2.679187297821045, + "step": 9242, + "token_acc": 0.344854150723208 + }, + { + "epoch": 5.418059220170038, + "grad_norm": 0.2777695500015634, + "learning_rate": 0.0001744845880025382, + "loss": 2.655790328979492, + "step": 9243, + "token_acc": 0.3501764448756357 + }, + { + "epoch": 5.418645558487247, + "grad_norm": 0.38541906609184623, + "learning_rate": 0.00017447812073888327, + "loss": 2.63789701461792, + "step": 9244, + "token_acc": 0.350186677297434 + }, + { + "epoch": 5.419231896804456, + "grad_norm": 0.29375542977778046, + "learning_rate": 0.00017447165277560678, + "loss": 2.6485838890075684, + "step": 9245, + "token_acc": 0.3507300258268766 + }, + { + "epoch": 5.4198182351216655, + "grad_norm": 0.34715775652158476, + "learning_rate": 0.00017446518411276944, + "loss": 2.653679370880127, + "step": 9246, + "token_acc": 0.34857971879820143 + }, + { + "epoch": 5.420404573438875, + "grad_norm": 0.27434429599254373, + "learning_rate": 0.00017445871475043205, + "loss": 2.607123851776123, + "step": 9247, + "token_acc": 0.3574174956896203 + }, + { + "epoch": 5.420990911756084, + "grad_norm": 0.37457703237735224, + "learning_rate": 0.00017445224468865534, + "loss": 2.6612343788146973, + "step": 9248, + "token_acc": 0.3488917267871721 + }, + { + "epoch": 5.421577250073292, + "grad_norm": 0.2763339075962531, + "learning_rate": 0.00017444577392750017, + "loss": 2.676593065261841, + "step": 9249, + "token_acc": 0.34444216544993683 + }, + { + "epoch": 5.422163588390501, + "grad_norm": 0.36263552114490627, + "learning_rate": 0.00017443930246702723, + "loss": 2.6488003730773926, + "step": 9250, + "token_acc": 0.34999337684701554 + }, + { + "epoch": 5.42274992670771, + "grad_norm": 0.2829850224343466, + "learning_rate": 0.0001744328303072974, + "loss": 2.6467456817626953, + "step": 9251, + "token_acc": 0.34968082362549974 + }, + { + "epoch": 5.423336265024919, + "grad_norm": 0.41973991451105946, + "learning_rate": 0.00017442635744837137, + "loss": 2.6746253967285156, + "step": 9252, + "token_acc": 0.34592618972728617 + }, + { + "epoch": 5.423922603342128, + "grad_norm": 0.28772496248680346, + "learning_rate": 0.00017441988389031002, + "loss": 2.6486639976501465, + "step": 9253, + "token_acc": 0.3498523973146647 + }, + { + "epoch": 5.4245089416593375, + "grad_norm": 0.3431228478030812, + "learning_rate": 0.00017441340963317414, + "loss": 2.6319870948791504, + "step": 9254, + "token_acc": 0.35211869069264184 + }, + { + "epoch": 5.425095279976547, + "grad_norm": 0.2812869360074288, + "learning_rate": 0.00017440693467702455, + "loss": 2.66316556930542, + "step": 9255, + "token_acc": 0.3480773353848482 + }, + { + "epoch": 5.425681618293756, + "grad_norm": 0.3348143176497172, + "learning_rate": 0.0001744004590219221, + "loss": 2.653688430786133, + "step": 9256, + "token_acc": 0.3481771252429468 + }, + { + "epoch": 5.426267956610965, + "grad_norm": 0.284678250997968, + "learning_rate": 0.00017439398266792756, + "loss": 2.6870598793029785, + "step": 9257, + "token_acc": 0.3440134776129074 + }, + { + "epoch": 5.426854294928174, + "grad_norm": 0.3019696756347643, + "learning_rate": 0.0001743875056151018, + "loss": 2.6434805393218994, + "step": 9258, + "token_acc": 0.3494060712714474 + }, + { + "epoch": 5.427440633245382, + "grad_norm": 0.31617571155889557, + "learning_rate": 0.0001743810278635057, + "loss": 2.6683263778686523, + "step": 9259, + "token_acc": 0.3476538800857325 + }, + { + "epoch": 5.428026971562591, + "grad_norm": 0.2925789315656586, + "learning_rate": 0.00017437454941320003, + "loss": 2.684692144393921, + "step": 9260, + "token_acc": 0.3438412805818197 + }, + { + "epoch": 5.4286133098798, + "grad_norm": 0.3456299784149186, + "learning_rate": 0.00017436807026424572, + "loss": 2.6515817642211914, + "step": 9261, + "token_acc": 0.34929095764686147 + }, + { + "epoch": 5.4291996481970095, + "grad_norm": 0.2778919608164676, + "learning_rate": 0.0001743615904167036, + "loss": 2.665006637573242, + "step": 9262, + "token_acc": 0.34846154450817374 + }, + { + "epoch": 5.429785986514219, + "grad_norm": 0.3215575733331176, + "learning_rate": 0.00017435510987063454, + "loss": 2.6541085243225098, + "step": 9263, + "token_acc": 0.34982349894839193 + }, + { + "epoch": 5.430372324831428, + "grad_norm": 0.28342284044312094, + "learning_rate": 0.0001743486286260994, + "loss": 2.625, + "step": 9264, + "token_acc": 0.35392923128023085 + }, + { + "epoch": 5.430958663148637, + "grad_norm": 0.32584422131979335, + "learning_rate": 0.0001743421466831591, + "loss": 2.653529644012451, + "step": 9265, + "token_acc": 0.3503565585074619 + }, + { + "epoch": 5.431545001465846, + "grad_norm": 0.2827144264662771, + "learning_rate": 0.0001743356640418745, + "loss": 2.666945219039917, + "step": 9266, + "token_acc": 0.34683041339782933 + }, + { + "epoch": 5.432131339783055, + "grad_norm": 0.32468473548878996, + "learning_rate": 0.0001743291807023065, + "loss": 2.6752285957336426, + "step": 9267, + "token_acc": 0.345463508355032 + }, + { + "epoch": 5.432717678100264, + "grad_norm": 0.27251853685135174, + "learning_rate": 0.00017432269666451604, + "loss": 2.630253791809082, + "step": 9268, + "token_acc": 0.3528215984594391 + }, + { + "epoch": 5.433304016417473, + "grad_norm": 0.3045888525804336, + "learning_rate": 0.00017431621192856396, + "loss": 2.66154408454895, + "step": 9269, + "token_acc": 0.34941003709810387 + }, + { + "epoch": 5.4338903547346815, + "grad_norm": 0.260799069106681, + "learning_rate": 0.0001743097264945112, + "loss": 2.64109468460083, + "step": 9270, + "token_acc": 0.35025388340151153 + }, + { + "epoch": 5.434476693051891, + "grad_norm": 0.2869325668708683, + "learning_rate": 0.00017430324036241872, + "loss": 2.6270158290863037, + "step": 9271, + "token_acc": 0.3544345655571141 + }, + { + "epoch": 5.4350630313691, + "grad_norm": 0.2834136362404422, + "learning_rate": 0.0001742967535323474, + "loss": 2.6951050758361816, + "step": 9272, + "token_acc": 0.3438157365738519 + }, + { + "epoch": 5.435649369686309, + "grad_norm": 0.32005213712083863, + "learning_rate": 0.00017429026600435823, + "loss": 2.6767215728759766, + "step": 9273, + "token_acc": 0.34603496805630474 + }, + { + "epoch": 5.436235708003518, + "grad_norm": 0.26341614561565535, + "learning_rate": 0.0001742837777785121, + "loss": 2.6697981357574463, + "step": 9274, + "token_acc": 0.34766043121463885 + }, + { + "epoch": 5.436822046320727, + "grad_norm": 0.28202438830085746, + "learning_rate": 0.00017427728885486995, + "loss": 2.678168535232544, + "step": 9275, + "token_acc": 0.34597784507898244 + }, + { + "epoch": 5.437408384637936, + "grad_norm": 0.26470526030329666, + "learning_rate": 0.0001742707992334928, + "loss": 2.657444953918457, + "step": 9276, + "token_acc": 0.3496569787264819 + }, + { + "epoch": 5.437994722955145, + "grad_norm": 0.27381762834454226, + "learning_rate": 0.00017426430891444156, + "loss": 2.6749205589294434, + "step": 9277, + "token_acc": 0.3448349142175919 + }, + { + "epoch": 5.438581061272354, + "grad_norm": 0.2975967795276183, + "learning_rate": 0.0001742578178977772, + "loss": 2.658967971801758, + "step": 9278, + "token_acc": 0.34780798407653013 + }, + { + "epoch": 5.4391673995895635, + "grad_norm": 0.34075639558908916, + "learning_rate": 0.00017425132618356073, + "loss": 2.638697385787964, + "step": 9279, + "token_acc": 0.352623083570198 + }, + { + "epoch": 5.439753737906772, + "grad_norm": 0.26368360885623476, + "learning_rate": 0.0001742448337718531, + "loss": 2.617581367492676, + "step": 9280, + "token_acc": 0.3542658314487145 + }, + { + "epoch": 5.440340076223981, + "grad_norm": 0.3120488755638987, + "learning_rate": 0.00017423834066271528, + "loss": 2.6785593032836914, + "step": 9281, + "token_acc": 0.34476466457655447 + }, + { + "epoch": 5.44092641454119, + "grad_norm": 0.2744581679399676, + "learning_rate": 0.0001742318468562083, + "loss": 2.6564488410949707, + "step": 9282, + "token_acc": 0.3481688195862942 + }, + { + "epoch": 5.441512752858399, + "grad_norm": 0.28282940281946123, + "learning_rate": 0.00017422535235239313, + "loss": 2.651916742324829, + "step": 9283, + "token_acc": 0.3484321390255043 + }, + { + "epoch": 5.442099091175608, + "grad_norm": 0.2756694635126806, + "learning_rate": 0.00017421885715133083, + "loss": 2.637572765350342, + "step": 9284, + "token_acc": 0.3511623204907871 + }, + { + "epoch": 5.442685429492817, + "grad_norm": 0.2802514613693682, + "learning_rate": 0.00017421236125308236, + "loss": 2.6657588481903076, + "step": 9285, + "token_acc": 0.3459594587250693 + }, + { + "epoch": 5.443271767810026, + "grad_norm": 0.2806904346102108, + "learning_rate": 0.00017420586465770877, + "loss": 2.6574153900146484, + "step": 9286, + "token_acc": 0.34741721103131107 + }, + { + "epoch": 5.4438581061272355, + "grad_norm": 0.2821907706139861, + "learning_rate": 0.00017419936736527106, + "loss": 2.6239233016967773, + "step": 9287, + "token_acc": 0.3539010996388244 + }, + { + "epoch": 5.444444444444445, + "grad_norm": 0.2848098389449877, + "learning_rate": 0.00017419286937583027, + "loss": 2.653468608856201, + "step": 9288, + "token_acc": 0.3501569375918989 + }, + { + "epoch": 5.445030782761654, + "grad_norm": 0.2893317588870503, + "learning_rate": 0.00017418637068944746, + "loss": 2.6941704750061035, + "step": 9289, + "token_acc": 0.34371686517536365 + }, + { + "epoch": 5.445617121078863, + "grad_norm": 0.28883291122830984, + "learning_rate": 0.00017417987130618364, + "loss": 2.672786235809326, + "step": 9290, + "token_acc": 0.34521196470841403 + }, + { + "epoch": 5.446203459396072, + "grad_norm": 0.28333587453210113, + "learning_rate": 0.00017417337122609992, + "loss": 2.6625771522521973, + "step": 9291, + "token_acc": 0.3490096465458008 + }, + { + "epoch": 5.44678979771328, + "grad_norm": 0.2837438943154363, + "learning_rate": 0.00017416687044925734, + "loss": 2.691338539123535, + "step": 9292, + "token_acc": 0.34333685539777853 + }, + { + "epoch": 5.447376136030489, + "grad_norm": 0.27935800101214703, + "learning_rate": 0.00017416036897571692, + "loss": 2.6151061058044434, + "step": 9293, + "token_acc": 0.35436849670623627 + }, + { + "epoch": 5.447962474347698, + "grad_norm": 0.3639009126483607, + "learning_rate": 0.0001741538668055398, + "loss": 2.640374183654785, + "step": 9294, + "token_acc": 0.3509067727164434 + }, + { + "epoch": 5.4485488126649075, + "grad_norm": 0.3250992497972495, + "learning_rate": 0.000174147363938787, + "loss": 2.6559290885925293, + "step": 9295, + "token_acc": 0.34717715452097203 + }, + { + "epoch": 5.449135150982117, + "grad_norm": 0.2695904108673803, + "learning_rate": 0.00017414086037551962, + "loss": 2.6337718963623047, + "step": 9296, + "token_acc": 0.3511304002050992 + }, + { + "epoch": 5.449721489299326, + "grad_norm": 0.29978425485416504, + "learning_rate": 0.0001741343561157988, + "loss": 2.6178317070007324, + "step": 9297, + "token_acc": 0.35389719985660417 + }, + { + "epoch": 5.450307827616535, + "grad_norm": 0.3177734866705486, + "learning_rate": 0.00017412785115968556, + "loss": 2.631922721862793, + "step": 9298, + "token_acc": 0.35220932871149224 + }, + { + "epoch": 5.450894165933744, + "grad_norm": 0.267781707337024, + "learning_rate": 0.00017412134550724107, + "loss": 2.683915376663208, + "step": 9299, + "token_acc": 0.34389442076152665 + }, + { + "epoch": 5.451480504250953, + "grad_norm": 0.27595763555867653, + "learning_rate": 0.00017411483915852642, + "loss": 2.6174488067626953, + "step": 9300, + "token_acc": 0.3539585595853808 + }, + { + "epoch": 5.452066842568162, + "grad_norm": 0.2999665626197228, + "learning_rate": 0.0001741083321136027, + "loss": 2.655979633331299, + "step": 9301, + "token_acc": 0.3489852183662747 + }, + { + "epoch": 5.45265318088537, + "grad_norm": 0.26052848551400154, + "learning_rate": 0.00017410182437253111, + "loss": 2.6884846687316895, + "step": 9302, + "token_acc": 0.34258274094976693 + }, + { + "epoch": 5.4532395192025795, + "grad_norm": 0.2922634219412777, + "learning_rate": 0.0001740953159353727, + "loss": 2.6818859577178955, + "step": 9303, + "token_acc": 0.3448977944641597 + }, + { + "epoch": 5.453825857519789, + "grad_norm": 0.27134106517901835, + "learning_rate": 0.00017408880680218865, + "loss": 2.6419026851654053, + "step": 9304, + "token_acc": 0.3523951655172592 + }, + { + "epoch": 5.454412195836998, + "grad_norm": 0.2876124856859591, + "learning_rate": 0.0001740822969730401, + "loss": 2.6196064949035645, + "step": 9305, + "token_acc": 0.35319417848611223 + }, + { + "epoch": 5.454998534154207, + "grad_norm": 0.29044534669936084, + "learning_rate": 0.00017407578644798818, + "loss": 2.68595552444458, + "step": 9306, + "token_acc": 0.3448662334102014 + }, + { + "epoch": 5.455584872471416, + "grad_norm": 0.26973562649463134, + "learning_rate": 0.00017406927522709408, + "loss": 2.630375385284424, + "step": 9307, + "token_acc": 0.35358303421617143 + }, + { + "epoch": 5.456171210788625, + "grad_norm": 0.3151115208262226, + "learning_rate": 0.00017406276331041894, + "loss": 2.6562623977661133, + "step": 9308, + "token_acc": 0.34878196178131576 + }, + { + "epoch": 5.456757549105834, + "grad_norm": 0.26480537698557466, + "learning_rate": 0.00017405625069802393, + "loss": 2.6395699977874756, + "step": 9309, + "token_acc": 0.3518182029057225 + }, + { + "epoch": 5.457343887423043, + "grad_norm": 0.28618944384003336, + "learning_rate": 0.00017404973738997028, + "loss": 2.6706836223602295, + "step": 9310, + "token_acc": 0.3468155455858324 + }, + { + "epoch": 5.457930225740252, + "grad_norm": 0.29773744590181683, + "learning_rate": 0.0001740432233863191, + "loss": 2.6483659744262695, + "step": 9311, + "token_acc": 0.3489965044577982 + }, + { + "epoch": 5.4585165640574616, + "grad_norm": 0.2662053080072157, + "learning_rate": 0.0001740367086871316, + "loss": 2.6510047912597656, + "step": 9312, + "token_acc": 0.3508390813577518 + }, + { + "epoch": 5.45910290237467, + "grad_norm": 0.27678042396670544, + "learning_rate": 0.000174030193292469, + "loss": 2.663435459136963, + "step": 9313, + "token_acc": 0.3465382490344035 + }, + { + "epoch": 5.459689240691879, + "grad_norm": 0.3203594893846758, + "learning_rate": 0.00017402367720239248, + "loss": 2.660707950592041, + "step": 9314, + "token_acc": 0.3481814761146114 + }, + { + "epoch": 5.460275579009088, + "grad_norm": 0.30332311298228415, + "learning_rate": 0.00017401716041696327, + "loss": 2.6377854347229004, + "step": 9315, + "token_acc": 0.35267191155917776 + }, + { + "epoch": 5.460861917326297, + "grad_norm": 0.26408385550656044, + "learning_rate": 0.0001740106429362426, + "loss": 2.6434619426727295, + "step": 9316, + "token_acc": 0.35168904275983115 + }, + { + "epoch": 5.461448255643506, + "grad_norm": 0.30589332025149496, + "learning_rate": 0.00017400412476029165, + "loss": 2.6600189208984375, + "step": 9317, + "token_acc": 0.3477878397227271 + }, + { + "epoch": 5.462034593960715, + "grad_norm": 0.30668118452255516, + "learning_rate": 0.00017399760588917163, + "loss": 2.668876886367798, + "step": 9318, + "token_acc": 0.3469260871155323 + }, + { + "epoch": 5.4626209322779244, + "grad_norm": 0.276483070883018, + "learning_rate": 0.00017399108632294388, + "loss": 2.6713247299194336, + "step": 9319, + "token_acc": 0.3473461495792193 + }, + { + "epoch": 5.463207270595134, + "grad_norm": 0.2838596765542573, + "learning_rate": 0.00017398456606166956, + "loss": 2.6146204471588135, + "step": 9320, + "token_acc": 0.3545785389349746 + }, + { + "epoch": 5.463793608912343, + "grad_norm": 0.2645734110421507, + "learning_rate": 0.00017397804510540992, + "loss": 2.661116123199463, + "step": 9321, + "token_acc": 0.34743536493928917 + }, + { + "epoch": 5.464379947229552, + "grad_norm": 0.278720188378057, + "learning_rate": 0.00017397152345422626, + "loss": 2.6144638061523438, + "step": 9322, + "token_acc": 0.35589518521964786 + }, + { + "epoch": 5.464966285546761, + "grad_norm": 0.28977271362060236, + "learning_rate": 0.00017396500110817978, + "loss": 2.6785597801208496, + "step": 9323, + "token_acc": 0.34519601328903654 + }, + { + "epoch": 5.465552623863969, + "grad_norm": 0.25313250889276745, + "learning_rate": 0.0001739584780673318, + "loss": 2.640286445617676, + "step": 9324, + "token_acc": 0.35178047487947056 + }, + { + "epoch": 5.466138962181178, + "grad_norm": 0.28833440127614024, + "learning_rate": 0.0001739519543317436, + "loss": 2.63649582862854, + "step": 9325, + "token_acc": 0.3517091972793604 + }, + { + "epoch": 5.466725300498387, + "grad_norm": 0.26294784887766226, + "learning_rate": 0.0001739454299014764, + "loss": 2.6604275703430176, + "step": 9326, + "token_acc": 0.34682292904145534 + }, + { + "epoch": 5.4673116388155965, + "grad_norm": 0.28017460519453047, + "learning_rate": 0.00017393890477659157, + "loss": 2.646897554397583, + "step": 9327, + "token_acc": 0.350433971141374 + }, + { + "epoch": 5.467897977132806, + "grad_norm": 0.3146127454848998, + "learning_rate": 0.00017393237895715034, + "loss": 2.6719017028808594, + "step": 9328, + "token_acc": 0.3449838354435271 + }, + { + "epoch": 5.468484315450015, + "grad_norm": 0.26860868264912446, + "learning_rate": 0.00017392585244321404, + "loss": 2.6594743728637695, + "step": 9329, + "token_acc": 0.3485607175614855 + }, + { + "epoch": 5.469070653767224, + "grad_norm": 0.34455698944111873, + "learning_rate": 0.00017391932523484397, + "loss": 2.6771316528320312, + "step": 9330, + "token_acc": 0.3464481451644854 + }, + { + "epoch": 5.469656992084433, + "grad_norm": 0.32099053548764633, + "learning_rate": 0.00017391279733210144, + "loss": 2.6617016792297363, + "step": 9331, + "token_acc": 0.3493789360613537 + }, + { + "epoch": 5.470243330401642, + "grad_norm": 0.30945263570789366, + "learning_rate": 0.00017390626873504782, + "loss": 2.6670072078704834, + "step": 9332, + "token_acc": 0.3460895678958384 + }, + { + "epoch": 5.470829668718851, + "grad_norm": 0.3973434159286546, + "learning_rate": 0.00017389973944374434, + "loss": 2.6650776863098145, + "step": 9333, + "token_acc": 0.3475084473917773 + }, + { + "epoch": 5.47141600703606, + "grad_norm": 0.303254980588608, + "learning_rate": 0.00017389320945825242, + "loss": 2.680159330368042, + "step": 9334, + "token_acc": 0.344187926458935 + }, + { + "epoch": 5.4720023453532685, + "grad_norm": 0.29341717875090767, + "learning_rate": 0.00017388667877863334, + "loss": 2.6701245307922363, + "step": 9335, + "token_acc": 0.3480163284128007 + }, + { + "epoch": 5.472588683670478, + "grad_norm": 0.2710885264019792, + "learning_rate": 0.0001738801474049485, + "loss": 2.6102588176727295, + "step": 9336, + "token_acc": 0.3562948487618327 + }, + { + "epoch": 5.473175021987687, + "grad_norm": 0.28160601701267257, + "learning_rate": 0.00017387361533725924, + "loss": 2.6840109825134277, + "step": 9337, + "token_acc": 0.3446664531006346 + }, + { + "epoch": 5.473761360304896, + "grad_norm": 0.2762566658130932, + "learning_rate": 0.00017386708257562686, + "loss": 2.6841378211975098, + "step": 9338, + "token_acc": 0.34368588927656896 + }, + { + "epoch": 5.474347698622105, + "grad_norm": 0.2701273096117695, + "learning_rate": 0.00017386054912011284, + "loss": 2.6066830158233643, + "step": 9339, + "token_acc": 0.35550053738676496 + }, + { + "epoch": 5.474934036939314, + "grad_norm": 0.2763901194200535, + "learning_rate": 0.00017385401497077845, + "loss": 2.6395537853240967, + "step": 9340, + "token_acc": 0.35032762445374577 + }, + { + "epoch": 5.475520375256523, + "grad_norm": 0.26932033810523065, + "learning_rate": 0.00017384748012768512, + "loss": 2.65915584564209, + "step": 9341, + "token_acc": 0.34760120814591083 + }, + { + "epoch": 5.476106713573732, + "grad_norm": 0.28250410614893195, + "learning_rate": 0.00017384094459089422, + "loss": 2.6433002948760986, + "step": 9342, + "token_acc": 0.351272283036762 + }, + { + "epoch": 5.476693051890941, + "grad_norm": 0.27790802438107404, + "learning_rate": 0.00017383440836046715, + "loss": 2.687185049057007, + "step": 9343, + "token_acc": 0.34443859083534495 + }, + { + "epoch": 5.4772793902081505, + "grad_norm": 0.2715486113615593, + "learning_rate": 0.0001738278714364653, + "loss": 2.680638074874878, + "step": 9344, + "token_acc": 0.3454077365394668 + }, + { + "epoch": 5.477865728525359, + "grad_norm": 0.28518037759277326, + "learning_rate": 0.0001738213338189501, + "loss": 2.663626194000244, + "step": 9345, + "token_acc": 0.34835818368448346 + }, + { + "epoch": 5.478452066842568, + "grad_norm": 0.2646345518178395, + "learning_rate": 0.00017381479550798297, + "loss": 2.64632511138916, + "step": 9346, + "token_acc": 0.3509986747072985 + }, + { + "epoch": 5.479038405159777, + "grad_norm": 0.2592955110522507, + "learning_rate": 0.00017380825650362524, + "loss": 2.6906065940856934, + "step": 9347, + "token_acc": 0.3428210290938414 + }, + { + "epoch": 5.479624743476986, + "grad_norm": 0.2659796909760562, + "learning_rate": 0.00017380171680593843, + "loss": 2.6582560539245605, + "step": 9348, + "token_acc": 0.34921183177041404 + }, + { + "epoch": 5.480211081794195, + "grad_norm": 0.2705188844914245, + "learning_rate": 0.00017379517641498394, + "loss": 2.6619350910186768, + "step": 9349, + "token_acc": 0.3481304266154617 + }, + { + "epoch": 5.480797420111404, + "grad_norm": 0.2909725148900551, + "learning_rate": 0.0001737886353308232, + "loss": 2.670841693878174, + "step": 9350, + "token_acc": 0.3489178012369487 + }, + { + "epoch": 5.481383758428613, + "grad_norm": 0.2621433360463305, + "learning_rate": 0.00017378209355351767, + "loss": 2.6771442890167236, + "step": 9351, + "token_acc": 0.3460719034308029 + }, + { + "epoch": 5.4819700967458225, + "grad_norm": 0.28425192352323025, + "learning_rate": 0.0001737755510831288, + "loss": 2.6732242107391357, + "step": 9352, + "token_acc": 0.34619621031123127 + }, + { + "epoch": 5.482556435063032, + "grad_norm": 0.25994564849115154, + "learning_rate": 0.00017376900791971802, + "loss": 2.684951066970825, + "step": 9353, + "token_acc": 0.34395369532313635 + }, + { + "epoch": 5.483142773380241, + "grad_norm": 0.278889134498189, + "learning_rate": 0.00017376246406334685, + "loss": 2.6220078468322754, + "step": 9354, + "token_acc": 0.3528867340123999 + }, + { + "epoch": 5.48372911169745, + "grad_norm": 0.2749130315885006, + "learning_rate": 0.0001737559195140767, + "loss": 2.666330099105835, + "step": 9355, + "token_acc": 0.3477057448640669 + }, + { + "epoch": 5.484315450014659, + "grad_norm": 0.2858989730591188, + "learning_rate": 0.00017374937427196905, + "loss": 2.68009614944458, + "step": 9356, + "token_acc": 0.34429530370898104 + }, + { + "epoch": 5.484901788331867, + "grad_norm": 0.4196010261050356, + "learning_rate": 0.00017374282833708545, + "loss": 2.6617136001586914, + "step": 9357, + "token_acc": 0.3476075065479039 + }, + { + "epoch": 5.485488126649076, + "grad_norm": 0.34987514280049253, + "learning_rate": 0.00017373628170948733, + "loss": 2.66796875, + "step": 9358, + "token_acc": 0.3478785191987481 + }, + { + "epoch": 5.486074464966285, + "grad_norm": 0.32453463211446626, + "learning_rate": 0.0001737297343892362, + "loss": 2.635098457336426, + "step": 9359, + "token_acc": 0.3532302638091136 + }, + { + "epoch": 5.4866608032834945, + "grad_norm": 0.3479434236447583, + "learning_rate": 0.00017372318637639357, + "loss": 2.6387076377868652, + "step": 9360, + "token_acc": 0.351332601177082 + }, + { + "epoch": 5.487247141600704, + "grad_norm": 0.2791611738606666, + "learning_rate": 0.00017371663767102094, + "loss": 2.654923439025879, + "step": 9361, + "token_acc": 0.3498700764105071 + }, + { + "epoch": 5.487833479917913, + "grad_norm": 0.32531752834081396, + "learning_rate": 0.00017371008827317988, + "loss": 2.6721725463867188, + "step": 9362, + "token_acc": 0.34630661957042574 + }, + { + "epoch": 5.488419818235122, + "grad_norm": 0.2856942808112665, + "learning_rate": 0.0001737035381829318, + "loss": 2.610436201095581, + "step": 9363, + "token_acc": 0.35533196938050543 + }, + { + "epoch": 5.489006156552331, + "grad_norm": 0.38209206980466504, + "learning_rate": 0.00017369698740033832, + "loss": 2.6744191646575928, + "step": 9364, + "token_acc": 0.345857606546631 + }, + { + "epoch": 5.48959249486954, + "grad_norm": 0.2956029594238154, + "learning_rate": 0.00017369043592546098, + "loss": 2.654921531677246, + "step": 9365, + "token_acc": 0.3492039670978242 + }, + { + "epoch": 5.490178833186749, + "grad_norm": 0.29935619187503226, + "learning_rate": 0.00017368388375836128, + "loss": 2.6563568115234375, + "step": 9366, + "token_acc": 0.348972087893895 + }, + { + "epoch": 5.490765171503957, + "grad_norm": 0.2758632275787339, + "learning_rate": 0.00017367733089910075, + "loss": 2.6672253608703613, + "step": 9367, + "token_acc": 0.3474619321412776 + }, + { + "epoch": 5.4913515098211665, + "grad_norm": 0.37015481638140496, + "learning_rate": 0.000173670777347741, + "loss": 2.683981418609619, + "step": 9368, + "token_acc": 0.34426463994630624 + }, + { + "epoch": 5.491937848138376, + "grad_norm": 0.2819303900260238, + "learning_rate": 0.0001736642231043436, + "loss": 2.675234317779541, + "step": 9369, + "token_acc": 0.345387927134943 + }, + { + "epoch": 5.492524186455585, + "grad_norm": 0.3457482070183949, + "learning_rate": 0.00017365766816897002, + "loss": 2.7000961303710938, + "step": 9370, + "token_acc": 0.3416620720111003 + }, + { + "epoch": 5.493110524772794, + "grad_norm": 0.2744326752659587, + "learning_rate": 0.00017365111254168196, + "loss": 2.659615993499756, + "step": 9371, + "token_acc": 0.347717599489253 + }, + { + "epoch": 5.493696863090003, + "grad_norm": 0.33447248091314147, + "learning_rate": 0.00017364455622254093, + "loss": 2.68723726272583, + "step": 9372, + "token_acc": 0.3446416068615438 + }, + { + "epoch": 5.494283201407212, + "grad_norm": 0.26341586037490133, + "learning_rate": 0.00017363799921160853, + "loss": 2.6986615657806396, + "step": 9373, + "token_acc": 0.34228354284166385 + }, + { + "epoch": 5.494869539724421, + "grad_norm": 0.337235905292035, + "learning_rate": 0.00017363144150894634, + "loss": 2.6360843181610107, + "step": 9374, + "token_acc": 0.35132004580450765 + }, + { + "epoch": 5.49545587804163, + "grad_norm": 0.26558554176260174, + "learning_rate": 0.00017362488311461598, + "loss": 2.6861143112182617, + "step": 9375, + "token_acc": 0.3443222490427834 + }, + { + "epoch": 5.496042216358839, + "grad_norm": 0.3533698885257246, + "learning_rate": 0.00017361832402867905, + "loss": 2.659644603729248, + "step": 9376, + "token_acc": 0.34875447667888704 + }, + { + "epoch": 5.4966285546760485, + "grad_norm": 0.2612639177592154, + "learning_rate": 0.00017361176425119721, + "loss": 2.692948341369629, + "step": 9377, + "token_acc": 0.3423559764572804 + }, + { + "epoch": 5.497214892993257, + "grad_norm": 0.33401237573003445, + "learning_rate": 0.000173605203782232, + "loss": 2.6597068309783936, + "step": 9378, + "token_acc": 0.3485873744926298 + }, + { + "epoch": 5.497801231310466, + "grad_norm": 0.26197886707808343, + "learning_rate": 0.00017359864262184507, + "loss": 2.6893529891967773, + "step": 9379, + "token_acc": 0.34534118191773555 + }, + { + "epoch": 5.498387569627675, + "grad_norm": 0.34224790625120116, + "learning_rate": 0.0001735920807700981, + "loss": 2.710928440093994, + "step": 9380, + "token_acc": 0.34055076729030903 + }, + { + "epoch": 5.498973907944884, + "grad_norm": 0.2582218248274261, + "learning_rate": 0.00017358551822705271, + "loss": 2.6384365558624268, + "step": 9381, + "token_acc": 0.3501637432736574 + }, + { + "epoch": 5.499560246262093, + "grad_norm": 0.2876180452219583, + "learning_rate": 0.00017357895499277052, + "loss": 2.653571128845215, + "step": 9382, + "token_acc": 0.3489325296519541 + }, + { + "epoch": 5.500146584579302, + "grad_norm": 0.25048842783702785, + "learning_rate": 0.00017357239106731317, + "loss": 2.6648755073547363, + "step": 9383, + "token_acc": 0.34534254943964404 + }, + { + "epoch": 5.500732922896511, + "grad_norm": 0.2854732012402806, + "learning_rate": 0.00017356582645074235, + "loss": 2.671459674835205, + "step": 9384, + "token_acc": 0.3473649194265138 + }, + { + "epoch": 5.5013192612137205, + "grad_norm": 0.26130353309652177, + "learning_rate": 0.00017355926114311977, + "loss": 2.663386821746826, + "step": 9385, + "token_acc": 0.3481534262782162 + }, + { + "epoch": 5.50190559953093, + "grad_norm": 0.29852216946881543, + "learning_rate": 0.000173552695144507, + "loss": 2.7103805541992188, + "step": 9386, + "token_acc": 0.3390252826167655 + }, + { + "epoch": 5.502491937848139, + "grad_norm": 0.2595333761191849, + "learning_rate": 0.0001735461284549658, + "loss": 2.664442539215088, + "step": 9387, + "token_acc": 0.34683414442352706 + }, + { + "epoch": 5.503078276165347, + "grad_norm": 0.2693743804237651, + "learning_rate": 0.00017353956107455783, + "loss": 2.653409481048584, + "step": 9388, + "token_acc": 0.34893336592935875 + }, + { + "epoch": 5.503664614482556, + "grad_norm": 0.25750135133069596, + "learning_rate": 0.00017353299300334476, + "loss": 2.6595613956451416, + "step": 9389, + "token_acc": 0.3484983539231286 + }, + { + "epoch": 5.504250952799765, + "grad_norm": 0.2683014170752076, + "learning_rate": 0.00017352642424138832, + "loss": 2.653118371963501, + "step": 9390, + "token_acc": 0.3494822208036884 + }, + { + "epoch": 5.504837291116974, + "grad_norm": 0.26262004819717516, + "learning_rate": 0.00017351985478875022, + "loss": 2.6844658851623535, + "step": 9391, + "token_acc": 0.3438432073544433 + }, + { + "epoch": 5.505423629434183, + "grad_norm": 0.2739852468252056, + "learning_rate": 0.0001735132846454921, + "loss": 2.6895031929016113, + "step": 9392, + "token_acc": 0.34403324921151945 + }, + { + "epoch": 5.5060099677513925, + "grad_norm": 0.2644354186008587, + "learning_rate": 0.00017350671381167579, + "loss": 2.665036916732788, + "step": 9393, + "token_acc": 0.3457236859674516 + }, + { + "epoch": 5.506596306068602, + "grad_norm": 0.27149964756133255, + "learning_rate": 0.0001735001422873629, + "loss": 2.706864833831787, + "step": 9394, + "token_acc": 0.3411302337528898 + }, + { + "epoch": 5.507182644385811, + "grad_norm": 0.2874202777434705, + "learning_rate": 0.00017349357007261527, + "loss": 2.661686420440674, + "step": 9395, + "token_acc": 0.3484246552446779 + }, + { + "epoch": 5.50776898270302, + "grad_norm": 0.26934167645066354, + "learning_rate": 0.00017348699716749456, + "loss": 2.66831111907959, + "step": 9396, + "token_acc": 0.34639575020708074 + }, + { + "epoch": 5.508355321020229, + "grad_norm": 0.27264006958204745, + "learning_rate": 0.00017348042357206254, + "loss": 2.6596550941467285, + "step": 9397, + "token_acc": 0.34834885473754135 + }, + { + "epoch": 5.508941659337438, + "grad_norm": 0.2837479217510576, + "learning_rate": 0.00017347384928638097, + "loss": 2.705252170562744, + "step": 9398, + "token_acc": 0.3407398966879368 + }, + { + "epoch": 5.509527997654647, + "grad_norm": 0.26123979435716504, + "learning_rate": 0.00017346727431051155, + "loss": 2.6580193042755127, + "step": 9399, + "token_acc": 0.34781223001875383 + }, + { + "epoch": 5.510114335971855, + "grad_norm": 0.2854790579676716, + "learning_rate": 0.0001734606986445161, + "loss": 2.656141757965088, + "step": 9400, + "token_acc": 0.3483431670081494 + }, + { + "epoch": 5.5107006742890645, + "grad_norm": 0.3105751990942194, + "learning_rate": 0.00017345412228845638, + "loss": 2.7000811100006104, + "step": 9401, + "token_acc": 0.34148358749030755 + }, + { + "epoch": 5.511287012606274, + "grad_norm": 0.261410517279633, + "learning_rate": 0.0001734475452423942, + "loss": 2.7129154205322266, + "step": 9402, + "token_acc": 0.3402588420999839 + }, + { + "epoch": 5.511873350923483, + "grad_norm": 0.26749809848006917, + "learning_rate": 0.00017344096750639127, + "loss": 2.649630069732666, + "step": 9403, + "token_acc": 0.35005843936283787 + }, + { + "epoch": 5.512459689240692, + "grad_norm": 0.26545801573768124, + "learning_rate": 0.0001734343890805094, + "loss": 2.6510887145996094, + "step": 9404, + "token_acc": 0.3502283363915634 + }, + { + "epoch": 5.513046027557901, + "grad_norm": 0.2811816875724293, + "learning_rate": 0.00017342780996481042, + "loss": 2.6524534225463867, + "step": 9405, + "token_acc": 0.3476690624729005 + }, + { + "epoch": 5.51363236587511, + "grad_norm": 0.27071182188601506, + "learning_rate": 0.0001734212301593561, + "loss": 2.6667261123657227, + "step": 9406, + "token_acc": 0.34701281033896386 + }, + { + "epoch": 5.514218704192319, + "grad_norm": 0.2694667105565443, + "learning_rate": 0.00017341464966420827, + "loss": 2.653271436691284, + "step": 9407, + "token_acc": 0.3485127423231119 + }, + { + "epoch": 5.514805042509528, + "grad_norm": 0.2660609369878042, + "learning_rate": 0.00017340806847942876, + "loss": 2.6883373260498047, + "step": 9408, + "token_acc": 0.34254216648962355 + }, + { + "epoch": 5.515391380826737, + "grad_norm": 0.27916671670689874, + "learning_rate": 0.0001734014866050793, + "loss": 2.6706161499023438, + "step": 9409, + "token_acc": 0.34544597578311675 + }, + { + "epoch": 5.515977719143946, + "grad_norm": 0.26366535787870504, + "learning_rate": 0.00017339490404122182, + "loss": 2.641026020050049, + "step": 9410, + "token_acc": 0.35069148165440406 + }, + { + "epoch": 5.516564057461155, + "grad_norm": 0.28878699951034664, + "learning_rate": 0.0001733883207879181, + "loss": 2.6849069595336914, + "step": 9411, + "token_acc": 0.34376355591544 + }, + { + "epoch": 5.517150395778364, + "grad_norm": 0.34832723643607066, + "learning_rate": 0.00017338173684523005, + "loss": 2.67995023727417, + "step": 9412, + "token_acc": 0.344861109276553 + }, + { + "epoch": 5.517736734095573, + "grad_norm": 0.3315500763755568, + "learning_rate": 0.00017337515221321943, + "loss": 2.6788172721862793, + "step": 9413, + "token_acc": 0.34414141122567066 + }, + { + "epoch": 5.518323072412782, + "grad_norm": 0.2765005512790981, + "learning_rate": 0.00017336856689194812, + "loss": 2.6608054637908936, + "step": 9414, + "token_acc": 0.34776972755260294 + }, + { + "epoch": 5.518909410729991, + "grad_norm": 0.2777264683142074, + "learning_rate": 0.00017336198088147798, + "loss": 2.640443801879883, + "step": 9415, + "token_acc": 0.3515483213073279 + }, + { + "epoch": 5.5194957490472, + "grad_norm": 0.2965145179763992, + "learning_rate": 0.0001733553941818709, + "loss": 2.632626533508301, + "step": 9416, + "token_acc": 0.35146126692696145 + }, + { + "epoch": 5.520082087364409, + "grad_norm": 0.27930860725616957, + "learning_rate": 0.00017334880679318877, + "loss": 2.6832375526428223, + "step": 9417, + "token_acc": 0.3442544793034473 + }, + { + "epoch": 5.5206684256816185, + "grad_norm": 0.26379131911943793, + "learning_rate": 0.00017334221871549338, + "loss": 2.672139883041382, + "step": 9418, + "token_acc": 0.34649010565458316 + }, + { + "epoch": 5.521254763998828, + "grad_norm": 0.29427344276677414, + "learning_rate": 0.00017333562994884674, + "loss": 2.7112064361572266, + "step": 9419, + "token_acc": 0.340765790110303 + }, + { + "epoch": 5.521841102316037, + "grad_norm": 0.26301003316732136, + "learning_rate": 0.00017332904049331064, + "loss": 2.666936159133911, + "step": 9420, + "token_acc": 0.3495162704266164 + }, + { + "epoch": 5.522427440633246, + "grad_norm": 0.26199707296416314, + "learning_rate": 0.000173322450348947, + "loss": 2.6565511226654053, + "step": 9421, + "token_acc": 0.3478055325195307 + }, + { + "epoch": 5.523013778950454, + "grad_norm": 0.26870919858202064, + "learning_rate": 0.0001733158595158178, + "loss": 2.697734832763672, + "step": 9422, + "token_acc": 0.34291472490967856 + }, + { + "epoch": 5.523600117267663, + "grad_norm": 0.24970956702514127, + "learning_rate": 0.00017330926799398482, + "loss": 2.693329334259033, + "step": 9423, + "token_acc": 0.34346936983924525 + }, + { + "epoch": 5.524186455584872, + "grad_norm": 0.2627069105990618, + "learning_rate": 0.0001733026757835101, + "loss": 2.671926736831665, + "step": 9424, + "token_acc": 0.34480162424929917 + }, + { + "epoch": 5.524772793902081, + "grad_norm": 0.24951018807561823, + "learning_rate": 0.0001732960828844555, + "loss": 2.6325125694274902, + "step": 9425, + "token_acc": 0.352132381699868 + }, + { + "epoch": 5.5253591322192905, + "grad_norm": 0.2663978983076645, + "learning_rate": 0.000173289489296883, + "loss": 2.6883883476257324, + "step": 9426, + "token_acc": 0.34452838186846974 + }, + { + "epoch": 5.5259454705365, + "grad_norm": 0.2831414413177394, + "learning_rate": 0.0001732828950208545, + "loss": 2.667560577392578, + "step": 9427, + "token_acc": 0.34693367807668346 + }, + { + "epoch": 5.526531808853709, + "grad_norm": 0.32866410144456365, + "learning_rate": 0.00017327630005643192, + "loss": 2.6780049800872803, + "step": 9428, + "token_acc": 0.3454617177321824 + }, + { + "epoch": 5.527118147170918, + "grad_norm": 0.3054373271427708, + "learning_rate": 0.00017326970440367724, + "loss": 2.675140857696533, + "step": 9429, + "token_acc": 0.3470856344823208 + }, + { + "epoch": 5.527704485488127, + "grad_norm": 0.2834955353259913, + "learning_rate": 0.00017326310806265244, + "loss": 2.661965847015381, + "step": 9430, + "token_acc": 0.34824867668218695 + }, + { + "epoch": 5.528290823805335, + "grad_norm": 0.265505641370291, + "learning_rate": 0.00017325651103341943, + "loss": 2.654348850250244, + "step": 9431, + "token_acc": 0.3488534911279428 + }, + { + "epoch": 5.528877162122544, + "grad_norm": 0.2773785376453433, + "learning_rate": 0.00017324991331604025, + "loss": 2.7341766357421875, + "step": 9432, + "token_acc": 0.33629781744464626 + }, + { + "epoch": 5.529463500439753, + "grad_norm": 0.29842901379287545, + "learning_rate": 0.00017324331491057687, + "loss": 2.6686980724334717, + "step": 9433, + "token_acc": 0.34697090161645866 + }, + { + "epoch": 5.5300498387569625, + "grad_norm": 0.31064705131243536, + "learning_rate": 0.0001732367158170912, + "loss": 2.6444926261901855, + "step": 9434, + "token_acc": 0.3505938126191291 + }, + { + "epoch": 5.530636177074172, + "grad_norm": 0.30745288018032235, + "learning_rate": 0.00017323011603564528, + "loss": 2.6743435859680176, + "step": 9435, + "token_acc": 0.34515766357398786 + }, + { + "epoch": 5.531222515391381, + "grad_norm": 0.26186769836133367, + "learning_rate": 0.00017322351556630107, + "loss": 2.6852824687957764, + "step": 9436, + "token_acc": 0.34505779709939904 + }, + { + "epoch": 5.53180885370859, + "grad_norm": 0.3359407176988424, + "learning_rate": 0.00017321691440912065, + "loss": 2.6544008255004883, + "step": 9437, + "token_acc": 0.3501308380555491 + }, + { + "epoch": 5.532395192025799, + "grad_norm": 0.29643486899031, + "learning_rate": 0.00017321031256416596, + "loss": 2.672698497772217, + "step": 9438, + "token_acc": 0.34539805625810427 + }, + { + "epoch": 5.532981530343008, + "grad_norm": 0.27589776096650775, + "learning_rate": 0.000173203710031499, + "loss": 2.6893701553344727, + "step": 9439, + "token_acc": 0.3441357619545185 + }, + { + "epoch": 5.533567868660217, + "grad_norm": 0.3194298934422611, + "learning_rate": 0.00017319710681118188, + "loss": 2.6448814868927, + "step": 9440, + "token_acc": 0.3502327235925108 + }, + { + "epoch": 5.534154206977426, + "grad_norm": 0.2936641899372622, + "learning_rate": 0.00017319050290327657, + "loss": 2.67972469329834, + "step": 9441, + "token_acc": 0.34374282315479276 + }, + { + "epoch": 5.534740545294635, + "grad_norm": 0.2661527357511759, + "learning_rate": 0.0001731838983078451, + "loss": 2.6797404289245605, + "step": 9442, + "token_acc": 0.3439920602305945 + }, + { + "epoch": 5.535326883611844, + "grad_norm": 0.32119209545532196, + "learning_rate": 0.00017317729302494952, + "loss": 2.6530847549438477, + "step": 9443, + "token_acc": 0.34917912788873784 + }, + { + "epoch": 5.535913221929053, + "grad_norm": 0.3233764906251096, + "learning_rate": 0.00017317068705465188, + "loss": 2.6515626907348633, + "step": 9444, + "token_acc": 0.3489255751301931 + }, + { + "epoch": 5.536499560246262, + "grad_norm": 0.26871531157327827, + "learning_rate": 0.00017316408039701423, + "loss": 2.690389633178711, + "step": 9445, + "token_acc": 0.342866602347062 + }, + { + "epoch": 5.537085898563471, + "grad_norm": 0.3626831467148151, + "learning_rate": 0.00017315747305209861, + "loss": 2.6333746910095215, + "step": 9446, + "token_acc": 0.3533416608167816 + }, + { + "epoch": 5.53767223688068, + "grad_norm": 0.3319070200460502, + "learning_rate": 0.00017315086501996715, + "loss": 2.6874165534973145, + "step": 9447, + "token_acc": 0.3426787954511979 + }, + { + "epoch": 5.538258575197889, + "grad_norm": 0.30166759164789586, + "learning_rate": 0.0001731442563006819, + "loss": 2.6765732765197754, + "step": 9448, + "token_acc": 0.3440254903762479 + }, + { + "epoch": 5.538844913515098, + "grad_norm": 0.34795391035649303, + "learning_rate": 0.0001731376468943049, + "loss": 2.6530773639678955, + "step": 9449, + "token_acc": 0.34677114127599507 + }, + { + "epoch": 5.5394312518323074, + "grad_norm": 0.29991835282214735, + "learning_rate": 0.00017313103680089825, + "loss": 2.6525368690490723, + "step": 9450, + "token_acc": 0.3490909836443688 + }, + { + "epoch": 5.540017590149517, + "grad_norm": 0.43125454174346906, + "learning_rate": 0.00017312442602052407, + "loss": 2.6716771125793457, + "step": 9451, + "token_acc": 0.34605374952090723 + }, + { + "epoch": 5.540603928466726, + "grad_norm": 0.2830518989538011, + "learning_rate": 0.00017311781455324444, + "loss": 2.661417007446289, + "step": 9452, + "token_acc": 0.3483629029885279 + }, + { + "epoch": 5.541190266783934, + "grad_norm": 0.3569299427087462, + "learning_rate": 0.00017311120239912146, + "loss": 2.7349114418029785, + "step": 9453, + "token_acc": 0.3362143474503025 + }, + { + "epoch": 5.541776605101143, + "grad_norm": 0.26659600000792616, + "learning_rate": 0.00017310458955821726, + "loss": 2.652721405029297, + "step": 9454, + "token_acc": 0.34900500376152216 + }, + { + "epoch": 5.542362943418352, + "grad_norm": 0.3379262724026496, + "learning_rate": 0.00017309797603059398, + "loss": 2.699692487716675, + "step": 9455, + "token_acc": 0.34169497933656456 + }, + { + "epoch": 5.542949281735561, + "grad_norm": 0.27987420423582204, + "learning_rate": 0.00017309136181631364, + "loss": 2.7045164108276367, + "step": 9456, + "token_acc": 0.3400708845642041 + }, + { + "epoch": 5.54353562005277, + "grad_norm": 0.2956594661251451, + "learning_rate": 0.00017308474691543851, + "loss": 2.6820507049560547, + "step": 9457, + "token_acc": 0.34294393477125673 + }, + { + "epoch": 5.5441219583699795, + "grad_norm": 0.2931623918506195, + "learning_rate": 0.00017307813132803066, + "loss": 2.6876039505004883, + "step": 9458, + "token_acc": 0.34183480377815295 + }, + { + "epoch": 5.544708296687189, + "grad_norm": 0.25690523799219306, + "learning_rate": 0.00017307151505415222, + "loss": 2.661341428756714, + "step": 9459, + "token_acc": 0.34912238896642744 + }, + { + "epoch": 5.545294635004398, + "grad_norm": 0.3406428122190264, + "learning_rate": 0.0001730648980938654, + "loss": 2.6577606201171875, + "step": 9460, + "token_acc": 0.3475338844099026 + }, + { + "epoch": 5.545880973321607, + "grad_norm": 0.2861391411221247, + "learning_rate": 0.00017305828044723227, + "loss": 2.697904109954834, + "step": 9461, + "token_acc": 0.3418006114320227 + }, + { + "epoch": 5.546467311638816, + "grad_norm": 0.2989303338782182, + "learning_rate": 0.00017305166211431508, + "loss": 2.689767360687256, + "step": 9462, + "token_acc": 0.3434398505195293 + }, + { + "epoch": 5.547053649956025, + "grad_norm": 0.25128811966664044, + "learning_rate": 0.00017304504309517593, + "loss": 2.6647088527679443, + "step": 9463, + "token_acc": 0.348135078940041 + }, + { + "epoch": 5.547639988273234, + "grad_norm": 0.3064542146997696, + "learning_rate": 0.00017303842338987706, + "loss": 2.6758477687835693, + "step": 9464, + "token_acc": 0.34454232773689114 + }, + { + "epoch": 5.548226326590442, + "grad_norm": 0.2546947441397591, + "learning_rate": 0.0001730318029984806, + "loss": 2.6483469009399414, + "step": 9465, + "token_acc": 0.3496044153812058 + }, + { + "epoch": 5.5488126649076515, + "grad_norm": 0.29504332566230695, + "learning_rate": 0.00017302518192104877, + "loss": 2.665048599243164, + "step": 9466, + "token_acc": 0.3482107701215981 + }, + { + "epoch": 5.549399003224861, + "grad_norm": 0.256925386255931, + "learning_rate": 0.00017301856015764378, + "loss": 2.6136746406555176, + "step": 9467, + "token_acc": 0.3534764349043397 + }, + { + "epoch": 5.54998534154207, + "grad_norm": 0.2963430006465843, + "learning_rate": 0.00017301193770832778, + "loss": 2.71746563911438, + "step": 9468, + "token_acc": 0.33966461106819645 + }, + { + "epoch": 5.550571679859279, + "grad_norm": 0.25643853564385827, + "learning_rate": 0.000173005314573163, + "loss": 2.6924004554748535, + "step": 9469, + "token_acc": 0.34381214123466364 + }, + { + "epoch": 5.551158018176488, + "grad_norm": 0.2866427943281811, + "learning_rate": 0.0001729986907522117, + "loss": 2.665832996368408, + "step": 9470, + "token_acc": 0.34674378997432154 + }, + { + "epoch": 5.551744356493697, + "grad_norm": 0.2522961575135783, + "learning_rate": 0.00017299206624553606, + "loss": 2.6874403953552246, + "step": 9471, + "token_acc": 0.34224498332933173 + }, + { + "epoch": 5.552330694810906, + "grad_norm": 0.2701326536425539, + "learning_rate": 0.00017298544105319832, + "loss": 2.682722806930542, + "step": 9472, + "token_acc": 0.3458279798149079 + }, + { + "epoch": 5.552917033128115, + "grad_norm": 0.29517384853147194, + "learning_rate": 0.00017297881517526066, + "loss": 2.6634249687194824, + "step": 9473, + "token_acc": 0.3478224212883678 + }, + { + "epoch": 5.5535033714453235, + "grad_norm": 0.2893749885504846, + "learning_rate": 0.00017297218861178545, + "loss": 2.666928291320801, + "step": 9474, + "token_acc": 0.34810414181628535 + }, + { + "epoch": 5.554089709762533, + "grad_norm": 0.2927214316628923, + "learning_rate": 0.0001729655613628348, + "loss": 2.6892919540405273, + "step": 9475, + "token_acc": 0.3437058444871216 + }, + { + "epoch": 5.554676048079742, + "grad_norm": 0.28272498642890176, + "learning_rate": 0.00017295893342847104, + "loss": 2.6703104972839355, + "step": 9476, + "token_acc": 0.34774146610871554 + }, + { + "epoch": 5.555262386396951, + "grad_norm": 0.2874734528475493, + "learning_rate": 0.00017295230480875642, + "loss": 2.698310136795044, + "step": 9477, + "token_acc": 0.34064780552647056 + }, + { + "epoch": 5.55584872471416, + "grad_norm": 0.2736366698793052, + "learning_rate": 0.0001729456755037532, + "loss": 2.6916580200195312, + "step": 9478, + "token_acc": 0.34195693542381994 + }, + { + "epoch": 5.556435063031369, + "grad_norm": 0.3152978451387112, + "learning_rate": 0.00017293904551352366, + "loss": 2.65950870513916, + "step": 9479, + "token_acc": 0.34799432140532144 + }, + { + "epoch": 5.557021401348578, + "grad_norm": 0.3052567184440223, + "learning_rate": 0.00017293241483813006, + "loss": 2.6542787551879883, + "step": 9480, + "token_acc": 0.3490028223271492 + }, + { + "epoch": 5.557607739665787, + "grad_norm": 0.2816850458264859, + "learning_rate": 0.0001729257834776347, + "loss": 2.6838974952697754, + "step": 9481, + "token_acc": 0.3435410684719049 + }, + { + "epoch": 5.558194077982996, + "grad_norm": 0.30439044435788276, + "learning_rate": 0.00017291915143209988, + "loss": 2.681971788406372, + "step": 9482, + "token_acc": 0.34243155837733924 + }, + { + "epoch": 5.5587804163002055, + "grad_norm": 0.27808374529967367, + "learning_rate": 0.00017291251870158792, + "loss": 2.695809841156006, + "step": 9483, + "token_acc": 0.3422477135165316 + }, + { + "epoch": 5.559366754617415, + "grad_norm": 0.31562240032589156, + "learning_rate": 0.00017290588528616105, + "loss": 2.686605453491211, + "step": 9484, + "token_acc": 0.34430643485201473 + }, + { + "epoch": 5.559953092934624, + "grad_norm": 0.28587776119346625, + "learning_rate": 0.00017289925118588165, + "loss": 2.684319019317627, + "step": 9485, + "token_acc": 0.3443590536610468 + }, + { + "epoch": 5.560539431251832, + "grad_norm": 0.3000043704006392, + "learning_rate": 0.000172892616400812, + "loss": 2.6316676139831543, + "step": 9486, + "token_acc": 0.35381603846581977 + }, + { + "epoch": 5.561125769569041, + "grad_norm": 0.27938466482531554, + "learning_rate": 0.00017288598093101446, + "loss": 2.6674113273620605, + "step": 9487, + "token_acc": 0.34602405284611176 + }, + { + "epoch": 5.56171210788625, + "grad_norm": 0.28520319300142155, + "learning_rate": 0.00017287934477655135, + "loss": 2.689854621887207, + "step": 9488, + "token_acc": 0.3427489862222329 + }, + { + "epoch": 5.562298446203459, + "grad_norm": 0.2856135209529232, + "learning_rate": 0.000172872707937485, + "loss": 2.6866321563720703, + "step": 9489, + "token_acc": 0.34463522945645353 + }, + { + "epoch": 5.562884784520668, + "grad_norm": 0.27251541740450586, + "learning_rate": 0.00017286607041387778, + "loss": 2.6591038703918457, + "step": 9490, + "token_acc": 0.34761383855024713 + }, + { + "epoch": 5.5634711228378775, + "grad_norm": 0.2911640657192143, + "learning_rate": 0.00017285943220579197, + "loss": 2.6778178215026855, + "step": 9491, + "token_acc": 0.34360578115127133 + }, + { + "epoch": 5.564057461155087, + "grad_norm": 0.2924233365135463, + "learning_rate": 0.00017285279331329, + "loss": 2.6685585975646973, + "step": 9492, + "token_acc": 0.3461153794864956 + }, + { + "epoch": 5.564643799472296, + "grad_norm": 0.2883716303319102, + "learning_rate": 0.0001728461537364342, + "loss": 2.697382688522339, + "step": 9493, + "token_acc": 0.34341927768728997 + }, + { + "epoch": 5.565230137789505, + "grad_norm": 0.3257203366526236, + "learning_rate": 0.00017283951347528694, + "loss": 2.674328327178955, + "step": 9494, + "token_acc": 0.34434078489043124 + }, + { + "epoch": 5.565816476106714, + "grad_norm": 0.25778236660646237, + "learning_rate": 0.00017283287252991062, + "loss": 2.68906307220459, + "step": 9495, + "token_acc": 0.34408718699111457 + }, + { + "epoch": 5.566402814423922, + "grad_norm": 0.29116442122375863, + "learning_rate": 0.0001728262309003676, + "loss": 2.6713616847991943, + "step": 9496, + "token_acc": 0.3460609594712976 + }, + { + "epoch": 5.566989152741131, + "grad_norm": 0.26458123442862924, + "learning_rate": 0.00017281958858672027, + "loss": 2.676656723022461, + "step": 9497, + "token_acc": 0.34633244894995974 + }, + { + "epoch": 5.56757549105834, + "grad_norm": 0.2831581877661786, + "learning_rate": 0.000172812945589031, + "loss": 2.6890816688537598, + "step": 9498, + "token_acc": 0.3423424142740038 + }, + { + "epoch": 5.5681618293755495, + "grad_norm": 0.293515002596814, + "learning_rate": 0.0001728063019073623, + "loss": 2.668801784515381, + "step": 9499, + "token_acc": 0.3469486218754812 + }, + { + "epoch": 5.568748167692759, + "grad_norm": 0.2607978546093702, + "learning_rate": 0.00017279965754177644, + "loss": 2.6777501106262207, + "step": 9500, + "token_acc": 0.34447816734726394 + }, + { + "epoch": 5.569334506009968, + "grad_norm": 0.288181359846888, + "learning_rate": 0.0001727930124923359, + "loss": 2.6453592777252197, + "step": 9501, + "token_acc": 0.35060994708072435 + }, + { + "epoch": 5.569920844327177, + "grad_norm": 0.26250521713133224, + "learning_rate": 0.00017278636675910312, + "loss": 2.6453347206115723, + "step": 9502, + "token_acc": 0.3506943804061157 + }, + { + "epoch": 5.570507182644386, + "grad_norm": 0.2565523396765342, + "learning_rate": 0.0001727797203421405, + "loss": 2.6938037872314453, + "step": 9503, + "token_acc": 0.3417875008878652 + }, + { + "epoch": 5.571093520961595, + "grad_norm": 0.27540527025358624, + "learning_rate": 0.0001727730732415105, + "loss": 2.683742046356201, + "step": 9504, + "token_acc": 0.34517230336962584 + }, + { + "epoch": 5.571679859278804, + "grad_norm": 0.26661861473737386, + "learning_rate": 0.00017276642545727548, + "loss": 2.6859798431396484, + "step": 9505, + "token_acc": 0.34400222800044766 + }, + { + "epoch": 5.572266197596013, + "grad_norm": 0.2615321347479978, + "learning_rate": 0.000172759776989498, + "loss": 2.6967639923095703, + "step": 9506, + "token_acc": 0.3403088967675528 + }, + { + "epoch": 5.572852535913222, + "grad_norm": 0.2695278979954013, + "learning_rate": 0.00017275312783824042, + "loss": 2.6598520278930664, + "step": 9507, + "token_acc": 0.34723685348885736 + }, + { + "epoch": 5.573438874230431, + "grad_norm": 0.2592194044143957, + "learning_rate": 0.00017274647800356527, + "loss": 2.6663970947265625, + "step": 9508, + "token_acc": 0.3466866813802804 + }, + { + "epoch": 5.57402521254764, + "grad_norm": 0.28867783525241614, + "learning_rate": 0.000172739827485535, + "loss": 2.6888844966888428, + "step": 9509, + "token_acc": 0.34336087483579075 + }, + { + "epoch": 5.574611550864849, + "grad_norm": 0.27420306398915206, + "learning_rate": 0.00017273317628421203, + "loss": 2.656083106994629, + "step": 9510, + "token_acc": 0.3491038651447048 + }, + { + "epoch": 5.575197889182058, + "grad_norm": 0.2745292375796734, + "learning_rate": 0.0001727265243996589, + "loss": 2.6714773178100586, + "step": 9511, + "token_acc": 0.3463119540964415 + }, + { + "epoch": 5.575784227499267, + "grad_norm": 0.2918980168788795, + "learning_rate": 0.00017271987183193807, + "loss": 2.6668505668640137, + "step": 9512, + "token_acc": 0.3480056643851782 + }, + { + "epoch": 5.576370565816476, + "grad_norm": 0.30481901122545135, + "learning_rate": 0.00017271321858111202, + "loss": 2.6504790782928467, + "step": 9513, + "token_acc": 0.3496241589571068 + }, + { + "epoch": 5.576956904133685, + "grad_norm": 0.2692296966426547, + "learning_rate": 0.0001727065646472433, + "loss": 2.682041645050049, + "step": 9514, + "token_acc": 0.34455677801772205 + }, + { + "epoch": 5.577543242450894, + "grad_norm": 0.28949548646926637, + "learning_rate": 0.00017269991003039436, + "loss": 2.6522960662841797, + "step": 9515, + "token_acc": 0.34984124949597717 + }, + { + "epoch": 5.5781295807681035, + "grad_norm": 0.3150092720675681, + "learning_rate": 0.00017269325473062773, + "loss": 2.689664125442505, + "step": 9516, + "token_acc": 0.3421989177125709 + }, + { + "epoch": 5.578715919085313, + "grad_norm": 0.2962501861272474, + "learning_rate": 0.00017268659874800592, + "loss": 2.6928863525390625, + "step": 9517, + "token_acc": 0.3433965796194526 + }, + { + "epoch": 5.579302257402521, + "grad_norm": 0.29964019541642506, + "learning_rate": 0.00017267994208259143, + "loss": 2.6890602111816406, + "step": 9518, + "token_acc": 0.3422720331872358 + }, + { + "epoch": 5.57988859571973, + "grad_norm": 0.3385485587414542, + "learning_rate": 0.00017267328473444688, + "loss": 2.7093987464904785, + "step": 9519, + "token_acc": 0.34061969784851226 + }, + { + "epoch": 5.580474934036939, + "grad_norm": 0.26641709907040595, + "learning_rate": 0.0001726666267036347, + "loss": 2.69830322265625, + "step": 9520, + "token_acc": 0.3423059869303126 + }, + { + "epoch": 5.581061272354148, + "grad_norm": 0.27748577719155926, + "learning_rate": 0.00017265996799021752, + "loss": 2.6623306274414062, + "step": 9521, + "token_acc": 0.34782296383400996 + }, + { + "epoch": 5.581647610671357, + "grad_norm": 0.2739024857547672, + "learning_rate": 0.00017265330859425783, + "loss": 2.6953396797180176, + "step": 9522, + "token_acc": 0.34351493483775036 + }, + { + "epoch": 5.582233948988566, + "grad_norm": 0.31848067354307735, + "learning_rate": 0.0001726466485158182, + "loss": 2.6923258304595947, + "step": 9523, + "token_acc": 0.34257672200361045 + }, + { + "epoch": 5.5828202873057755, + "grad_norm": 0.34115839876893417, + "learning_rate": 0.00017263998775496125, + "loss": 2.674044609069824, + "step": 9524, + "token_acc": 0.3451939927682513 + }, + { + "epoch": 5.583406625622985, + "grad_norm": 0.25283423836727303, + "learning_rate": 0.00017263332631174946, + "loss": 2.7193233966827393, + "step": 9525, + "token_acc": 0.33926552006903166 + }, + { + "epoch": 5.583992963940194, + "grad_norm": 0.3231957621995199, + "learning_rate": 0.00017262666418624544, + "loss": 2.6371898651123047, + "step": 9526, + "token_acc": 0.3502690381305339 + }, + { + "epoch": 5.584579302257403, + "grad_norm": 0.27162916626609257, + "learning_rate": 0.0001726200013785118, + "loss": 2.6481292247772217, + "step": 9527, + "token_acc": 0.3483629885063339 + }, + { + "epoch": 5.585165640574612, + "grad_norm": 0.3040474833045279, + "learning_rate": 0.00017261333788861106, + "loss": 2.6644997596740723, + "step": 9528, + "token_acc": 0.3463392718741215 + }, + { + "epoch": 5.585751978891821, + "grad_norm": 0.31590883761611954, + "learning_rate": 0.0001726066737166059, + "loss": 2.6718826293945312, + "step": 9529, + "token_acc": 0.3455741056040383 + }, + { + "epoch": 5.586338317209029, + "grad_norm": 0.26545033238765453, + "learning_rate": 0.00017260000886255887, + "loss": 2.6872425079345703, + "step": 9530, + "token_acc": 0.3450030769070982 + }, + { + "epoch": 5.586924655526238, + "grad_norm": 0.28795215125051715, + "learning_rate": 0.0001725933433265326, + "loss": 2.645616054534912, + "step": 9531, + "token_acc": 0.3495080317160347 + }, + { + "epoch": 5.5875109938434475, + "grad_norm": 0.25399968073061185, + "learning_rate": 0.00017258667710858965, + "loss": 2.6257543563842773, + "step": 9532, + "token_acc": 0.353845174595496 + }, + { + "epoch": 5.588097332160657, + "grad_norm": 0.3300004459222821, + "learning_rate": 0.0001725800102087927, + "loss": 2.729137659072876, + "step": 9533, + "token_acc": 0.3357266200127238 + }, + { + "epoch": 5.588683670477866, + "grad_norm": 0.3026773629599954, + "learning_rate": 0.00017257334262720436, + "loss": 2.6408743858337402, + "step": 9534, + "token_acc": 0.3509868395681918 + }, + { + "epoch": 5.589270008795075, + "grad_norm": 0.29936761228932973, + "learning_rate": 0.00017256667436388722, + "loss": 2.680730104446411, + "step": 9535, + "token_acc": 0.3439705690135936 + }, + { + "epoch": 5.589856347112284, + "grad_norm": 0.30545073651369886, + "learning_rate": 0.00017256000541890403, + "loss": 2.688145399093628, + "step": 9536, + "token_acc": 0.3436648889349143 + }, + { + "epoch": 5.590442685429493, + "grad_norm": 0.28566245336509705, + "learning_rate": 0.00017255333579231733, + "loss": 2.6684179306030273, + "step": 9537, + "token_acc": 0.3462927970189093 + }, + { + "epoch": 5.591029023746702, + "grad_norm": 0.3153970582488031, + "learning_rate": 0.00017254666548418982, + "loss": 2.6944031715393066, + "step": 9538, + "token_acc": 0.3428997630973876 + }, + { + "epoch": 5.59161536206391, + "grad_norm": 0.27455213831192754, + "learning_rate": 0.0001725399944945841, + "loss": 2.686351776123047, + "step": 9539, + "token_acc": 0.345387926656823 + }, + { + "epoch": 5.5922017003811195, + "grad_norm": 0.31689319639101826, + "learning_rate": 0.0001725333228235629, + "loss": 2.670151710510254, + "step": 9540, + "token_acc": 0.3468669018045342 + }, + { + "epoch": 5.592788038698329, + "grad_norm": 0.2651920529823943, + "learning_rate": 0.00017252665047118892, + "loss": 2.702652931213379, + "step": 9541, + "token_acc": 0.34046762317626456 + }, + { + "epoch": 5.593374377015538, + "grad_norm": 0.3082296243265281, + "learning_rate": 0.00017251997743752474, + "loss": 2.6784915924072266, + "step": 9542, + "token_acc": 0.34402279156063387 + }, + { + "epoch": 5.593960715332747, + "grad_norm": 0.26460348720071364, + "learning_rate": 0.00017251330372263312, + "loss": 2.665478229522705, + "step": 9543, + "token_acc": 0.3483158284425782 + }, + { + "epoch": 5.594547053649956, + "grad_norm": 0.2979342262535822, + "learning_rate": 0.00017250662932657672, + "loss": 2.69720458984375, + "step": 9544, + "token_acc": 0.34080049971412524 + }, + { + "epoch": 5.595133391967165, + "grad_norm": 0.26510563711093627, + "learning_rate": 0.00017249995424941823, + "loss": 2.6349191665649414, + "step": 9545, + "token_acc": 0.35066568989680214 + }, + { + "epoch": 5.595719730284374, + "grad_norm": 0.2970555271125922, + "learning_rate": 0.00017249327849122036, + "loss": 2.66591215133667, + "step": 9546, + "token_acc": 0.3466287077822815 + }, + { + "epoch": 5.596306068601583, + "grad_norm": 0.3138910758786903, + "learning_rate": 0.00017248660205204585, + "loss": 2.671900749206543, + "step": 9547, + "token_acc": 0.34628694164909907 + }, + { + "epoch": 5.596892406918792, + "grad_norm": 0.24128026291554897, + "learning_rate": 0.0001724799249319574, + "loss": 2.6586809158325195, + "step": 9548, + "token_acc": 0.3483696264267911 + }, + { + "epoch": 5.5974787452360015, + "grad_norm": 0.29391646535329724, + "learning_rate": 0.00017247324713101766, + "loss": 2.6854958534240723, + "step": 9549, + "token_acc": 0.3436215880275134 + }, + { + "epoch": 5.598065083553211, + "grad_norm": 0.24681472867351084, + "learning_rate": 0.00017246656864928947, + "loss": 2.678588390350342, + "step": 9550, + "token_acc": 0.3446805159703911 + }, + { + "epoch": 5.598651421870419, + "grad_norm": 0.28132837032821467, + "learning_rate": 0.00017245988948683553, + "loss": 2.668708324432373, + "step": 9551, + "token_acc": 0.3481025574925617 + }, + { + "epoch": 5.599237760187628, + "grad_norm": 0.272817763183024, + "learning_rate": 0.00017245320964371855, + "loss": 2.706871509552002, + "step": 9552, + "token_acc": 0.34029273212360717 + }, + { + "epoch": 5.599824098504837, + "grad_norm": 0.2625142910426835, + "learning_rate": 0.0001724465291200013, + "loss": 2.6884288787841797, + "step": 9553, + "token_acc": 0.34264260343362046 + }, + { + "epoch": 5.600410436822046, + "grad_norm": 0.2537382693024419, + "learning_rate": 0.00017243984791574652, + "loss": 2.6718735694885254, + "step": 9554, + "token_acc": 0.34656953389019957 + }, + { + "epoch": 5.600996775139255, + "grad_norm": 0.2642069352103558, + "learning_rate": 0.00017243316603101697, + "loss": 2.7170276641845703, + "step": 9555, + "token_acc": 0.3410430213847748 + }, + { + "epoch": 5.601583113456464, + "grad_norm": 0.24868884253586906, + "learning_rate": 0.00017242648346587545, + "loss": 2.703139305114746, + "step": 9556, + "token_acc": 0.3410795339858084 + }, + { + "epoch": 5.6021694517736735, + "grad_norm": 0.27178682092858986, + "learning_rate": 0.00017241980022038474, + "loss": 2.6944518089294434, + "step": 9557, + "token_acc": 0.34216920066626305 + }, + { + "epoch": 5.602755790090883, + "grad_norm": 0.27424919814975207, + "learning_rate": 0.00017241311629460757, + "loss": 2.6754965782165527, + "step": 9558, + "token_acc": 0.3455730184137227 + }, + { + "epoch": 5.603342128408092, + "grad_norm": 0.2685413706781158, + "learning_rate": 0.00017240643168860674, + "loss": 2.713334798812866, + "step": 9559, + "token_acc": 0.3409522537562604 + }, + { + "epoch": 5.603928466725301, + "grad_norm": 0.25971886386869275, + "learning_rate": 0.00017239974640244506, + "loss": 2.654721736907959, + "step": 9560, + "token_acc": 0.34911859544904333 + }, + { + "epoch": 5.604514805042509, + "grad_norm": 0.27804199803844065, + "learning_rate": 0.00017239306043618534, + "loss": 2.6440765857696533, + "step": 9561, + "token_acc": 0.3507363307768283 + }, + { + "epoch": 5.605101143359718, + "grad_norm": 0.2846340218616572, + "learning_rate": 0.0001723863737898903, + "loss": 2.6840686798095703, + "step": 9562, + "token_acc": 0.34385098189511926 + }, + { + "epoch": 5.605687481676927, + "grad_norm": 0.2585214699981028, + "learning_rate": 0.0001723796864636229, + "loss": 2.7015018463134766, + "step": 9563, + "token_acc": 0.34110689844290243 + }, + { + "epoch": 5.606273819994136, + "grad_norm": 0.2839966679831815, + "learning_rate": 0.00017237299845744585, + "loss": 2.708827257156372, + "step": 9564, + "token_acc": 0.3394794960404281 + }, + { + "epoch": 5.6068601583113455, + "grad_norm": 0.27633458431248276, + "learning_rate": 0.000172366309771422, + "loss": 2.705092668533325, + "step": 9565, + "token_acc": 0.34158589248930854 + }, + { + "epoch": 5.607446496628555, + "grad_norm": 0.26731375037233446, + "learning_rate": 0.0001723596204056142, + "loss": 2.703127384185791, + "step": 9566, + "token_acc": 0.3413327552297048 + }, + { + "epoch": 5.608032834945764, + "grad_norm": 0.2628737009413341, + "learning_rate": 0.00017235293036008524, + "loss": 2.7016396522521973, + "step": 9567, + "token_acc": 0.34079449921887656 + }, + { + "epoch": 5.608619173262973, + "grad_norm": 0.271931918458244, + "learning_rate": 0.00017234623963489803, + "loss": 2.6606388092041016, + "step": 9568, + "token_acc": 0.3474572043033341 + }, + { + "epoch": 5.609205511580182, + "grad_norm": 0.2742698171333254, + "learning_rate": 0.0001723395482301154, + "loss": 2.711544990539551, + "step": 9569, + "token_acc": 0.3401948679407344 + }, + { + "epoch": 5.609791849897391, + "grad_norm": 0.3018361748449417, + "learning_rate": 0.00017233285614580014, + "loss": 2.6753997802734375, + "step": 9570, + "token_acc": 0.3458021778493862 + }, + { + "epoch": 5.6103781882146, + "grad_norm": 0.29775006466249143, + "learning_rate": 0.00017232616338201518, + "loss": 2.672884702682495, + "step": 9571, + "token_acc": 0.34581213096024704 + }, + { + "epoch": 5.610964526531809, + "grad_norm": 0.2674109045338595, + "learning_rate": 0.00017231946993882338, + "loss": 2.681978464126587, + "step": 9572, + "token_acc": 0.3447729336243011 + }, + { + "epoch": 5.6115508648490176, + "grad_norm": 0.2657367020654729, + "learning_rate": 0.00017231277581628764, + "loss": 2.6371989250183105, + "step": 9573, + "token_acc": 0.3520868223596587 + }, + { + "epoch": 5.612137203166227, + "grad_norm": 0.29888779535527715, + "learning_rate": 0.00017230608101447077, + "loss": 2.6924996376037598, + "step": 9574, + "token_acc": 0.34226447030217916 + }, + { + "epoch": 5.612723541483436, + "grad_norm": 0.26593816707227347, + "learning_rate": 0.00017229938553343574, + "loss": 2.7103960514068604, + "step": 9575, + "token_acc": 0.34020578019529457 + }, + { + "epoch": 5.613309879800645, + "grad_norm": 0.28000804429137094, + "learning_rate": 0.00017229268937324541, + "loss": 2.6699142456054688, + "step": 9576, + "token_acc": 0.34684881022913233 + }, + { + "epoch": 5.613896218117854, + "grad_norm": 0.3280636946451515, + "learning_rate": 0.00017228599253396265, + "loss": 2.684858798980713, + "step": 9577, + "token_acc": 0.3441450143963026 + }, + { + "epoch": 5.614482556435063, + "grad_norm": 0.29497026079725164, + "learning_rate": 0.0001722792950156504, + "loss": 2.7349681854248047, + "step": 9578, + "token_acc": 0.3373895929970979 + }, + { + "epoch": 5.615068894752272, + "grad_norm": 0.27888967947162163, + "learning_rate": 0.00017227259681837159, + "loss": 2.6370911598205566, + "step": 9579, + "token_acc": 0.3508704183839928 + }, + { + "epoch": 5.615655233069481, + "grad_norm": 0.4086271425555681, + "learning_rate": 0.00017226589794218911, + "loss": 2.653137683868408, + "step": 9580, + "token_acc": 0.34714875678837415 + }, + { + "epoch": 5.6162415713866904, + "grad_norm": 0.35028685706137985, + "learning_rate": 0.0001722591983871659, + "loss": 2.6893744468688965, + "step": 9581, + "token_acc": 0.3429903224312769 + }, + { + "epoch": 5.616827909703899, + "grad_norm": 0.27977161103939074, + "learning_rate": 0.00017225249815336488, + "loss": 2.7082693576812744, + "step": 9582, + "token_acc": 0.3391929049796378 + }, + { + "epoch": 5.617414248021108, + "grad_norm": 0.3407775971940978, + "learning_rate": 0.000172245797240849, + "loss": 2.7055768966674805, + "step": 9583, + "token_acc": 0.34166554092670914 + }, + { + "epoch": 5.618000586338317, + "grad_norm": 0.2600046419542586, + "learning_rate": 0.00017223909564968124, + "loss": 2.654989242553711, + "step": 9584, + "token_acc": 0.35002241458442296 + }, + { + "epoch": 5.618586924655526, + "grad_norm": 0.3054417166482799, + "learning_rate": 0.0001722323933799245, + "loss": 2.710479259490967, + "step": 9585, + "token_acc": 0.33896057703875343 + }, + { + "epoch": 5.619173262972735, + "grad_norm": 0.25609859261555895, + "learning_rate": 0.00017222569043164176, + "loss": 2.6899924278259277, + "step": 9586, + "token_acc": 0.342729484898677 + }, + { + "epoch": 5.619759601289944, + "grad_norm": 0.30319158262514767, + "learning_rate": 0.00017221898680489596, + "loss": 2.671921968460083, + "step": 9587, + "token_acc": 0.348451389778404 + }, + { + "epoch": 5.620345939607153, + "grad_norm": 0.27243401289824437, + "learning_rate": 0.00017221228249975014, + "loss": 2.683389902114868, + "step": 9588, + "token_acc": 0.34456922859978345 + }, + { + "epoch": 5.6209322779243625, + "grad_norm": 0.30901186004536363, + "learning_rate": 0.0001722055775162672, + "loss": 2.6792807579040527, + "step": 9589, + "token_acc": 0.3444609148509137 + }, + { + "epoch": 5.621518616241572, + "grad_norm": 0.26434804750524393, + "learning_rate": 0.00017219887185451017, + "loss": 2.6610584259033203, + "step": 9590, + "token_acc": 0.3453238309890159 + }, + { + "epoch": 5.622104954558781, + "grad_norm": 0.3343600033096775, + "learning_rate": 0.00017219216551454203, + "loss": 2.6997694969177246, + "step": 9591, + "token_acc": 0.341328741910586 + }, + { + "epoch": 5.62269129287599, + "grad_norm": 0.29944898718648144, + "learning_rate": 0.00017218545849642577, + "loss": 2.6932477951049805, + "step": 9592, + "token_acc": 0.3417195902788799 + }, + { + "epoch": 5.623277631193199, + "grad_norm": 0.29199629429564067, + "learning_rate": 0.00017217875080022442, + "loss": 2.698925256729126, + "step": 9593, + "token_acc": 0.34316761928397954 + }, + { + "epoch": 5.623863969510407, + "grad_norm": 0.27236689540699016, + "learning_rate": 0.00017217204242600092, + "loss": 2.6657328605651855, + "step": 9594, + "token_acc": 0.3469391548652474 + }, + { + "epoch": 5.624450307827616, + "grad_norm": 0.30446357084770387, + "learning_rate": 0.00017216533337381838, + "loss": 2.675774574279785, + "step": 9595, + "token_acc": 0.3456307751726765 + }, + { + "epoch": 5.625036646144825, + "grad_norm": 0.2667273900288917, + "learning_rate": 0.00017215862364373976, + "loss": 2.720653533935547, + "step": 9596, + "token_acc": 0.3394340658158104 + }, + { + "epoch": 5.6256229844620345, + "grad_norm": 0.3135891993616265, + "learning_rate": 0.0001721519132358281, + "loss": 2.73087477684021, + "step": 9597, + "token_acc": 0.3373571856917002 + }, + { + "epoch": 5.626209322779244, + "grad_norm": 0.2750544075417309, + "learning_rate": 0.00017214520215014643, + "loss": 2.6480722427368164, + "step": 9598, + "token_acc": 0.3507457487231431 + }, + { + "epoch": 5.626795661096453, + "grad_norm": 0.2902383142752462, + "learning_rate": 0.00017213849038675782, + "loss": 2.6900877952575684, + "step": 9599, + "token_acc": 0.34327257401749023 + }, + { + "epoch": 5.627381999413662, + "grad_norm": 0.29442137581977007, + "learning_rate": 0.00017213177794572532, + "loss": 2.6956427097320557, + "step": 9600, + "token_acc": 0.3416669934256101 + }, + { + "epoch": 5.627968337730871, + "grad_norm": 0.275834745398906, + "learning_rate": 0.00017212506482711194, + "loss": 2.662900447845459, + "step": 9601, + "token_acc": 0.34729395425545767 + }, + { + "epoch": 5.62855467604808, + "grad_norm": 0.2617486056192483, + "learning_rate": 0.00017211835103098078, + "loss": 2.7033324241638184, + "step": 9602, + "token_acc": 0.34047504537908996 + }, + { + "epoch": 5.629141014365289, + "grad_norm": 0.2527867342072555, + "learning_rate": 0.0001721116365573949, + "loss": 2.665907859802246, + "step": 9603, + "token_acc": 0.34680471605994334 + }, + { + "epoch": 5.629727352682497, + "grad_norm": 0.2615780198377606, + "learning_rate": 0.00017210492140641733, + "loss": 2.701897144317627, + "step": 9604, + "token_acc": 0.3413744386179302 + }, + { + "epoch": 5.6303136909997065, + "grad_norm": 0.2586832242263612, + "learning_rate": 0.0001720982055781112, + "loss": 2.6905431747436523, + "step": 9605, + "token_acc": 0.3426048157144366 + }, + { + "epoch": 5.630900029316916, + "grad_norm": 0.258771689622604, + "learning_rate": 0.00017209148907253958, + "loss": 2.6716136932373047, + "step": 9606, + "token_acc": 0.34494342843462783 + }, + { + "epoch": 5.631486367634125, + "grad_norm": 0.2672388769846871, + "learning_rate": 0.00017208477188976557, + "loss": 2.6875696182250977, + "step": 9607, + "token_acc": 0.34482784092028285 + }, + { + "epoch": 5.632072705951334, + "grad_norm": 0.24877714299573475, + "learning_rate": 0.00017207805402985228, + "loss": 2.657158851623535, + "step": 9608, + "token_acc": 0.3474502657250928 + }, + { + "epoch": 5.632659044268543, + "grad_norm": 0.26040353690542056, + "learning_rate": 0.00017207133549286278, + "loss": 2.6807336807250977, + "step": 9609, + "token_acc": 0.3426333170807719 + }, + { + "epoch": 5.633245382585752, + "grad_norm": 0.2750706571856512, + "learning_rate": 0.0001720646162788602, + "loss": 2.6953413486480713, + "step": 9610, + "token_acc": 0.3441295224350982 + }, + { + "epoch": 5.633831720902961, + "grad_norm": 0.27217154412555883, + "learning_rate": 0.00017205789638790768, + "loss": 2.677666664123535, + "step": 9611, + "token_acc": 0.3437205925141484 + }, + { + "epoch": 5.63441805922017, + "grad_norm": 0.27063261896767504, + "learning_rate": 0.0001720511758200683, + "loss": 2.6994733810424805, + "step": 9612, + "token_acc": 0.34223900958007675 + }, + { + "epoch": 5.635004397537379, + "grad_norm": 0.2668747280725409, + "learning_rate": 0.0001720444545754052, + "loss": 2.700747013092041, + "step": 9613, + "token_acc": 0.341882276349987 + }, + { + "epoch": 5.6355907358545885, + "grad_norm": 0.3074591598718835, + "learning_rate": 0.00017203773265398158, + "loss": 2.731694221496582, + "step": 9614, + "token_acc": 0.33585005770072557 + }, + { + "epoch": 5.636177074171798, + "grad_norm": 0.2810924054234089, + "learning_rate": 0.0001720310100558605, + "loss": 2.720261573791504, + "step": 9615, + "token_acc": 0.3396750025550114 + }, + { + "epoch": 5.636763412489006, + "grad_norm": 0.2615150538293546, + "learning_rate": 0.00017202428678110513, + "loss": 2.728724479675293, + "step": 9616, + "token_acc": 0.33753262936971207 + }, + { + "epoch": 5.637349750806215, + "grad_norm": 0.32455518003444894, + "learning_rate": 0.00017201756282977866, + "loss": 2.685471534729004, + "step": 9617, + "token_acc": 0.3442908791848462 + }, + { + "epoch": 5.637936089123424, + "grad_norm": 0.28631815065560834, + "learning_rate": 0.00017201083820194422, + "loss": 2.686674118041992, + "step": 9618, + "token_acc": 0.34382407614929267 + }, + { + "epoch": 5.638522427440633, + "grad_norm": 0.2795183824407709, + "learning_rate": 0.000172004112897665, + "loss": 2.7081964015960693, + "step": 9619, + "token_acc": 0.34199075405575263 + }, + { + "epoch": 5.639108765757842, + "grad_norm": 0.35445840211209584, + "learning_rate": 0.00017199738691700417, + "loss": 2.6855039596557617, + "step": 9620, + "token_acc": 0.34333955859977305 + }, + { + "epoch": 5.639695104075051, + "grad_norm": 0.3257199841356022, + "learning_rate": 0.00017199066026002492, + "loss": 2.724978446960449, + "step": 9621, + "token_acc": 0.33752105450218656 + }, + { + "epoch": 5.6402814423922605, + "grad_norm": 0.26070852774632136, + "learning_rate": 0.0001719839329267904, + "loss": 2.683920383453369, + "step": 9622, + "token_acc": 0.3445674770274 + }, + { + "epoch": 5.64086778070947, + "grad_norm": 0.3233129408396068, + "learning_rate": 0.00017197720491736384, + "loss": 2.668210506439209, + "step": 9623, + "token_acc": 0.34627317362784016 + }, + { + "epoch": 5.641454119026679, + "grad_norm": 0.2746717369620902, + "learning_rate": 0.00017197047623180843, + "loss": 2.656402587890625, + "step": 9624, + "token_acc": 0.3484665826931706 + }, + { + "epoch": 5.642040457343887, + "grad_norm": 0.28546700046892254, + "learning_rate": 0.00017196374687018738, + "loss": 2.6803884506225586, + "step": 9625, + "token_acc": 0.3434194138531584 + }, + { + "epoch": 5.642626795661096, + "grad_norm": 0.29501524391772743, + "learning_rate": 0.0001719570168325639, + "loss": 2.6659979820251465, + "step": 9626, + "token_acc": 0.3477107296734918 + }, + { + "epoch": 5.643213133978305, + "grad_norm": 0.258006246091174, + "learning_rate": 0.00017195028611900122, + "loss": 2.708184242248535, + "step": 9627, + "token_acc": 0.33988353454610115 + }, + { + "epoch": 5.643799472295514, + "grad_norm": 0.2789750490267383, + "learning_rate": 0.00017194355472956253, + "loss": 2.6745424270629883, + "step": 9628, + "token_acc": 0.34688089486926454 + }, + { + "epoch": 5.644385810612723, + "grad_norm": 0.26013898401880814, + "learning_rate": 0.00017193682266431108, + "loss": 2.672896146774292, + "step": 9629, + "token_acc": 0.3469715571925517 + }, + { + "epoch": 5.6449721489299325, + "grad_norm": 0.30409176665869486, + "learning_rate": 0.00017193008992331013, + "loss": 2.6519622802734375, + "step": 9630, + "token_acc": 0.3495431657260984 + }, + { + "epoch": 5.645558487247142, + "grad_norm": 0.2748934565335675, + "learning_rate": 0.00017192335650662296, + "loss": 2.709531307220459, + "step": 9631, + "token_acc": 0.33962980185900826 + }, + { + "epoch": 5.646144825564351, + "grad_norm": 0.28745747670992294, + "learning_rate": 0.00017191662241431273, + "loss": 2.6950721740722656, + "step": 9632, + "token_acc": 0.3426986680393797 + }, + { + "epoch": 5.64673116388156, + "grad_norm": 0.29364886080963093, + "learning_rate": 0.00017190988764644271, + "loss": 2.657607078552246, + "step": 9633, + "token_acc": 0.3474349142171143 + }, + { + "epoch": 5.647317502198769, + "grad_norm": 0.28820697744904045, + "learning_rate": 0.00017190315220307626, + "loss": 2.705496311187744, + "step": 9634, + "token_acc": 0.34098763392583553 + }, + { + "epoch": 5.647903840515978, + "grad_norm": 0.2809696773094621, + "learning_rate": 0.00017189641608427657, + "loss": 2.6582584381103516, + "step": 9635, + "token_acc": 0.3495842122235791 + }, + { + "epoch": 5.648490178833187, + "grad_norm": 0.2943127476028719, + "learning_rate": 0.00017188967929010688, + "loss": 2.687264919281006, + "step": 9636, + "token_acc": 0.3433402161036356 + }, + { + "epoch": 5.649076517150396, + "grad_norm": 0.2732125172876398, + "learning_rate": 0.0001718829418206306, + "loss": 2.672905445098877, + "step": 9637, + "token_acc": 0.3473263969736054 + }, + { + "epoch": 5.6496628554676045, + "grad_norm": 0.33251542886324703, + "learning_rate": 0.0001718762036759109, + "loss": 2.7080190181732178, + "step": 9638, + "token_acc": 0.3407064001350952 + }, + { + "epoch": 5.650249193784814, + "grad_norm": 0.29275527323922623, + "learning_rate": 0.0001718694648560111, + "loss": 2.6641740798950195, + "step": 9639, + "token_acc": 0.3485184433442355 + }, + { + "epoch": 5.650835532102023, + "grad_norm": 0.2720275855721091, + "learning_rate": 0.00017186272536099458, + "loss": 2.677889823913574, + "step": 9640, + "token_acc": 0.3453799311622981 + }, + { + "epoch": 5.651421870419232, + "grad_norm": 0.28025747483633473, + "learning_rate": 0.00017185598519092455, + "loss": 2.695666790008545, + "step": 9641, + "token_acc": 0.34273902661722466 + }, + { + "epoch": 5.652008208736441, + "grad_norm": 0.2687714962643324, + "learning_rate": 0.0001718492443458644, + "loss": 2.664212703704834, + "step": 9642, + "token_acc": 0.34570813380120763 + }, + { + "epoch": 5.65259454705365, + "grad_norm": 0.28614792822812724, + "learning_rate": 0.00017184250282587738, + "loss": 2.708156108856201, + "step": 9643, + "token_acc": 0.33888891767131735 + }, + { + "epoch": 5.653180885370859, + "grad_norm": 0.28021189524005335, + "learning_rate": 0.0001718357606310269, + "loss": 2.6956028938293457, + "step": 9644, + "token_acc": 0.3421257377577404 + }, + { + "epoch": 5.653767223688068, + "grad_norm": 0.26468917323063723, + "learning_rate": 0.00017182901776137622, + "loss": 2.700101375579834, + "step": 9645, + "token_acc": 0.3430033549588279 + }, + { + "epoch": 5.654353562005277, + "grad_norm": 0.27955813134907753, + "learning_rate": 0.00017182227421698868, + "loss": 2.6796088218688965, + "step": 9646, + "token_acc": 0.34540723675294754 + }, + { + "epoch": 5.654939900322486, + "grad_norm": 0.27865219821203446, + "learning_rate": 0.0001718155299979277, + "loss": 2.6931333541870117, + "step": 9647, + "token_acc": 0.34297121344096415 + }, + { + "epoch": 5.655526238639695, + "grad_norm": 0.2548991478352482, + "learning_rate": 0.00017180878510425658, + "loss": 2.6627683639526367, + "step": 9648, + "token_acc": 0.34634670303471543 + }, + { + "epoch": 5.656112576956904, + "grad_norm": 0.28179119724857904, + "learning_rate": 0.00017180203953603867, + "loss": 2.6768953800201416, + "step": 9649, + "token_acc": 0.346237124050323 + }, + { + "epoch": 5.656698915274113, + "grad_norm": 0.28296715775712505, + "learning_rate": 0.00017179529329333737, + "loss": 2.6644973754882812, + "step": 9650, + "token_acc": 0.34607728142736305 + }, + { + "epoch": 5.657285253591322, + "grad_norm": 0.2536074287064516, + "learning_rate": 0.00017178854637621603, + "loss": 2.682227849960327, + "step": 9651, + "token_acc": 0.3434657927079066 + }, + { + "epoch": 5.657871591908531, + "grad_norm": 0.2743872489109879, + "learning_rate": 0.00017178179878473806, + "loss": 2.6914095878601074, + "step": 9652, + "token_acc": 0.3436564545097958 + }, + { + "epoch": 5.65845793022574, + "grad_norm": 0.2756830047285492, + "learning_rate": 0.00017177505051896676, + "loss": 2.653563976287842, + "step": 9653, + "token_acc": 0.34873480608529106 + }, + { + "epoch": 5.659044268542949, + "grad_norm": 0.26370904273242785, + "learning_rate": 0.00017176830157896564, + "loss": 2.6928234100341797, + "step": 9654, + "token_acc": 0.3407285117868705 + }, + { + "epoch": 5.6596306068601585, + "grad_norm": 0.2844213816246217, + "learning_rate": 0.00017176155196479796, + "loss": 2.6598589420318604, + "step": 9655, + "token_acc": 0.34840487318673136 + }, + { + "epoch": 5.660216945177368, + "grad_norm": 0.25334053750060304, + "learning_rate": 0.00017175480167652725, + "loss": 2.6874332427978516, + "step": 9656, + "token_acc": 0.34339116836696704 + }, + { + "epoch": 5.660803283494577, + "grad_norm": 0.2604208831574149, + "learning_rate": 0.0001717480507142169, + "loss": 2.7017476558685303, + "step": 9657, + "token_acc": 0.3409253267218557 + }, + { + "epoch": 5.661389621811786, + "grad_norm": 0.26336914932303757, + "learning_rate": 0.00017174129907793025, + "loss": 2.699188470840454, + "step": 9658, + "token_acc": 0.3424822419414038 + }, + { + "epoch": 5.661975960128994, + "grad_norm": 0.25451341195512084, + "learning_rate": 0.0001717345467677308, + "loss": 2.7013988494873047, + "step": 9659, + "token_acc": 0.34254907455596034 + }, + { + "epoch": 5.662562298446203, + "grad_norm": 0.2512169594366114, + "learning_rate": 0.00017172779378368192, + "loss": 2.6777291297912598, + "step": 9660, + "token_acc": 0.34638555013071604 + }, + { + "epoch": 5.663148636763412, + "grad_norm": 0.2678853539599262, + "learning_rate": 0.00017172104012584707, + "loss": 2.6957106590270996, + "step": 9661, + "token_acc": 0.3419926040723635 + }, + { + "epoch": 5.663734975080621, + "grad_norm": 0.24524049835171582, + "learning_rate": 0.00017171428579428969, + "loss": 2.6461286544799805, + "step": 9662, + "token_acc": 0.35020916211575825 + }, + { + "epoch": 5.6643213133978305, + "grad_norm": 0.313378698430541, + "learning_rate": 0.00017170753078907326, + "loss": 2.7188878059387207, + "step": 9663, + "token_acc": 0.3385147126021487 + }, + { + "epoch": 5.66490765171504, + "grad_norm": 0.37109935330587024, + "learning_rate": 0.00017170077511026116, + "loss": 2.728109359741211, + "step": 9664, + "token_acc": 0.33765468473777255 + }, + { + "epoch": 5.665493990032249, + "grad_norm": 0.3130998597121793, + "learning_rate": 0.00017169401875791692, + "loss": 2.698941230773926, + "step": 9665, + "token_acc": 0.34041382289861183 + }, + { + "epoch": 5.666080328349458, + "grad_norm": 0.26219306768097944, + "learning_rate": 0.000171687261732104, + "loss": 2.699347496032715, + "step": 9666, + "token_acc": 0.34244095114386447 + }, + { + "epoch": 5.666666666666667, + "grad_norm": 0.3112212318369894, + "learning_rate": 0.00017168050403288583, + "loss": 2.6614480018615723, + "step": 9667, + "token_acc": 0.34581196269187947 + }, + { + "epoch": 5.667253004983876, + "grad_norm": 0.26897563477935615, + "learning_rate": 0.0001716737456603259, + "loss": 2.654895305633545, + "step": 9668, + "token_acc": 0.3492307573427708 + }, + { + "epoch": 5.667839343301084, + "grad_norm": 0.2663202729538439, + "learning_rate": 0.00017166698661448776, + "loss": 2.6832804679870605, + "step": 9669, + "token_acc": 0.34296849714100763 + }, + { + "epoch": 5.668425681618293, + "grad_norm": 0.2740789366429989, + "learning_rate": 0.0001716602268954348, + "loss": 2.6734683513641357, + "step": 9670, + "token_acc": 0.3465400805150867 + }, + { + "epoch": 5.6690120199355025, + "grad_norm": 0.25878683053469054, + "learning_rate": 0.00017165346650323065, + "loss": 2.738891124725342, + "step": 9671, + "token_acc": 0.33742612697543817 + }, + { + "epoch": 5.669598358252712, + "grad_norm": 0.25625375861972927, + "learning_rate": 0.00017164670543793866, + "loss": 2.6904873847961426, + "step": 9672, + "token_acc": 0.34311797057574517 + }, + { + "epoch": 5.670184696569921, + "grad_norm": 0.27341181256757546, + "learning_rate": 0.00017163994369962245, + "loss": 2.7016983032226562, + "step": 9673, + "token_acc": 0.3420264647248498 + }, + { + "epoch": 5.67077103488713, + "grad_norm": 0.26570693964556785, + "learning_rate": 0.0001716331812883455, + "loss": 2.6520755290985107, + "step": 9674, + "token_acc": 0.3481397923256737 + }, + { + "epoch": 5.671357373204339, + "grad_norm": 0.26346167444512103, + "learning_rate": 0.00017162641820417133, + "loss": 2.6935617923736572, + "step": 9675, + "token_acc": 0.3432988108730943 + }, + { + "epoch": 5.671943711521548, + "grad_norm": 0.27757711180086914, + "learning_rate": 0.0001716196544471635, + "loss": 2.7076027393341064, + "step": 9676, + "token_acc": 0.3399895236432324 + }, + { + "epoch": 5.672530049838757, + "grad_norm": 0.25614865138210385, + "learning_rate": 0.0001716128900173855, + "loss": 2.6771514415740967, + "step": 9677, + "token_acc": 0.34508505144955154 + }, + { + "epoch": 5.673116388155966, + "grad_norm": 0.26100486460293554, + "learning_rate": 0.00017160612491490092, + "loss": 2.670776844024658, + "step": 9678, + "token_acc": 0.34649800731390085 + }, + { + "epoch": 5.673702726473175, + "grad_norm": 0.29769488433209373, + "learning_rate": 0.00017159935913977327, + "loss": 2.6809566020965576, + "step": 9679, + "token_acc": 0.34354621279111547 + }, + { + "epoch": 5.6742890647903845, + "grad_norm": 0.27357565229498243, + "learning_rate": 0.00017159259269206614, + "loss": 2.723518133163452, + "step": 9680, + "token_acc": 0.3390090773537029 + }, + { + "epoch": 5.674875403107593, + "grad_norm": 0.27292475923486215, + "learning_rate": 0.00017158582557184307, + "loss": 2.72519588470459, + "step": 9681, + "token_acc": 0.3355027972844923 + }, + { + "epoch": 5.675461741424802, + "grad_norm": 0.2992115192055746, + "learning_rate": 0.00017157905777916762, + "loss": 2.6943047046661377, + "step": 9682, + "token_acc": 0.3425933866370722 + }, + { + "epoch": 5.676048079742011, + "grad_norm": 0.29030520350441075, + "learning_rate": 0.00017157228931410337, + "loss": 2.7105729579925537, + "step": 9683, + "token_acc": 0.3399942208679206 + }, + { + "epoch": 5.67663441805922, + "grad_norm": 0.29053278857007825, + "learning_rate": 0.0001715655201767139, + "loss": 2.734339475631714, + "step": 9684, + "token_acc": 0.3366634537040489 + }, + { + "epoch": 5.677220756376429, + "grad_norm": 0.4211042765428095, + "learning_rate": 0.00017155875036706285, + "loss": 2.7287237644195557, + "step": 9685, + "token_acc": 0.33740072129021803 + }, + { + "epoch": 5.677807094693638, + "grad_norm": 0.33259880997898006, + "learning_rate": 0.00017155197988521375, + "loss": 2.7345170974731445, + "step": 9686, + "token_acc": 0.3353360258226222 + }, + { + "epoch": 5.678393433010847, + "grad_norm": 0.2961986663595038, + "learning_rate": 0.0001715452087312302, + "loss": 2.699075222015381, + "step": 9687, + "token_acc": 0.3412668754440906 + }, + { + "epoch": 5.6789797713280565, + "grad_norm": 0.3012429298451997, + "learning_rate": 0.00017153843690517583, + "loss": 2.6976025104522705, + "step": 9688, + "token_acc": 0.34229070415421065 + }, + { + "epoch": 5.679566109645266, + "grad_norm": 0.2793880479123014, + "learning_rate": 0.00017153166440711423, + "loss": 2.6744914054870605, + "step": 9689, + "token_acc": 0.3469209457544942 + }, + { + "epoch": 5.680152447962474, + "grad_norm": 0.2802968624648089, + "learning_rate": 0.00017152489123710904, + "loss": 2.6746459007263184, + "step": 9690, + "token_acc": 0.3463219663303863 + }, + { + "epoch": 5.680738786279683, + "grad_norm": 0.28662892548268065, + "learning_rate": 0.00017151811739522387, + "loss": 2.6628518104553223, + "step": 9691, + "token_acc": 0.3467135959379254 + }, + { + "epoch": 5.681325124596892, + "grad_norm": 0.27934699048564315, + "learning_rate": 0.0001715113428815224, + "loss": 2.712088108062744, + "step": 9692, + "token_acc": 0.3404275393573431 + }, + { + "epoch": 5.681911462914101, + "grad_norm": 0.2671031181331191, + "learning_rate": 0.00017150456769606816, + "loss": 2.7613983154296875, + "step": 9693, + "token_acc": 0.3318236219625351 + }, + { + "epoch": 5.68249780123131, + "grad_norm": 0.2657156241642082, + "learning_rate": 0.00017149779183892492, + "loss": 2.6949009895324707, + "step": 9694, + "token_acc": 0.3420149259675197 + }, + { + "epoch": 5.683084139548519, + "grad_norm": 0.27283830017725214, + "learning_rate": 0.00017149101531015623, + "loss": 2.6977996826171875, + "step": 9695, + "token_acc": 0.34328876314463863 + }, + { + "epoch": 5.6836704778657285, + "grad_norm": 0.27890756220096297, + "learning_rate": 0.00017148423810982584, + "loss": 2.7115440368652344, + "step": 9696, + "token_acc": 0.3387540434125231 + }, + { + "epoch": 5.684256816182938, + "grad_norm": 0.2960220158246693, + "learning_rate": 0.0001714774602379973, + "loss": 2.6821229457855225, + "step": 9697, + "token_acc": 0.3448735055867804 + }, + { + "epoch": 5.684843154500147, + "grad_norm": 0.2945279182128348, + "learning_rate": 0.00017147068169473436, + "loss": 2.696906328201294, + "step": 9698, + "token_acc": 0.341732442825339 + }, + { + "epoch": 5.685429492817356, + "grad_norm": 0.26122457169313246, + "learning_rate": 0.00017146390248010067, + "loss": 2.706333875656128, + "step": 9699, + "token_acc": 0.3401145108662193 + }, + { + "epoch": 5.686015831134565, + "grad_norm": 0.28358980148016516, + "learning_rate": 0.00017145712259415994, + "loss": 2.7144322395324707, + "step": 9700, + "token_acc": 0.3390273770729986 + }, + { + "epoch": 5.686602169451774, + "grad_norm": 0.27764826409851145, + "learning_rate": 0.0001714503420369758, + "loss": 2.6760425567626953, + "step": 9701, + "token_acc": 0.34607765451664024 + }, + { + "epoch": 5.687188507768982, + "grad_norm": 0.25429064158741754, + "learning_rate": 0.00017144356080861202, + "loss": 2.6748547554016113, + "step": 9702, + "token_acc": 0.3450052164840897 + }, + { + "epoch": 5.687774846086191, + "grad_norm": 0.2747682412834148, + "learning_rate": 0.00017143677890913222, + "loss": 2.711700439453125, + "step": 9703, + "token_acc": 0.3383512622187198 + }, + { + "epoch": 5.6883611844034006, + "grad_norm": 0.25800833645683646, + "learning_rate": 0.00017142999633860013, + "loss": 2.72167706489563, + "step": 9704, + "token_acc": 0.3387947603727071 + }, + { + "epoch": 5.68894752272061, + "grad_norm": 0.2680073352135934, + "learning_rate": 0.0001714232130970795, + "loss": 2.716604232788086, + "step": 9705, + "token_acc": 0.33833822923451146 + }, + { + "epoch": 5.689533861037819, + "grad_norm": 0.26353410286587386, + "learning_rate": 0.00017141642918463404, + "loss": 2.70736026763916, + "step": 9706, + "token_acc": 0.3398236522994486 + }, + { + "epoch": 5.690120199355028, + "grad_norm": 0.27581644818499984, + "learning_rate": 0.00017140964460132745, + "loss": 2.6875500679016113, + "step": 9707, + "token_acc": 0.3437701502240838 + }, + { + "epoch": 5.690706537672237, + "grad_norm": 0.25279342391570686, + "learning_rate": 0.00017140285934722348, + "loss": 2.7416634559631348, + "step": 9708, + "token_acc": 0.33541525475941103 + }, + { + "epoch": 5.691292875989446, + "grad_norm": 0.2567513742149731, + "learning_rate": 0.00017139607342238583, + "loss": 2.6870784759521484, + "step": 9709, + "token_acc": 0.34341012405472005 + }, + { + "epoch": 5.691879214306655, + "grad_norm": 0.2662546011804505, + "learning_rate": 0.0001713892868268783, + "loss": 2.698129177093506, + "step": 9710, + "token_acc": 0.33983693030223067 + }, + { + "epoch": 5.692465552623864, + "grad_norm": 0.2520978973721482, + "learning_rate": 0.00017138249956076464, + "loss": 2.720951557159424, + "step": 9711, + "token_acc": 0.33784967541778554 + }, + { + "epoch": 5.693051890941073, + "grad_norm": 0.25435649613726424, + "learning_rate": 0.0001713757116241086, + "loss": 2.6916401386260986, + "step": 9712, + "token_acc": 0.34335120521677337 + }, + { + "epoch": 5.693638229258282, + "grad_norm": 0.27114417000353586, + "learning_rate": 0.0001713689230169739, + "loss": 2.6933059692382812, + "step": 9713, + "token_acc": 0.3416384367953664 + }, + { + "epoch": 5.694224567575491, + "grad_norm": 0.26859230660058864, + "learning_rate": 0.0001713621337394243, + "loss": 2.6944518089294434, + "step": 9714, + "token_acc": 0.34360502885093047 + }, + { + "epoch": 5.6948109058927, + "grad_norm": 0.2745675136113956, + "learning_rate": 0.00017135534379152367, + "loss": 2.678412437438965, + "step": 9715, + "token_acc": 0.3448146412634852 + }, + { + "epoch": 5.695397244209909, + "grad_norm": 0.2977936494330076, + "learning_rate": 0.00017134855317333574, + "loss": 2.744518280029297, + "step": 9716, + "token_acc": 0.3354496491371167 + }, + { + "epoch": 5.695983582527118, + "grad_norm": 0.2862108879482743, + "learning_rate": 0.00017134176188492425, + "loss": 2.6785643100738525, + "step": 9717, + "token_acc": 0.3455358361598945 + }, + { + "epoch": 5.696569920844327, + "grad_norm": 0.28035757868718536, + "learning_rate": 0.00017133496992635308, + "loss": 2.7197721004486084, + "step": 9718, + "token_acc": 0.33951212645715595 + }, + { + "epoch": 5.697156259161536, + "grad_norm": 0.2587638836193722, + "learning_rate": 0.00017132817729768597, + "loss": 2.7041447162628174, + "step": 9719, + "token_acc": 0.34045063888621596 + }, + { + "epoch": 5.6977425974787455, + "grad_norm": 0.26726880007916765, + "learning_rate": 0.00017132138399898676, + "loss": 2.6794629096984863, + "step": 9720, + "token_acc": 0.3459427585292821 + }, + { + "epoch": 5.698328935795955, + "grad_norm": 0.25885507650634726, + "learning_rate": 0.00017131459003031927, + "loss": 2.676908016204834, + "step": 9721, + "token_acc": 0.345653594857722 + }, + { + "epoch": 5.698915274113164, + "grad_norm": 0.25569052988181756, + "learning_rate": 0.00017130779539174728, + "loss": 2.683712959289551, + "step": 9722, + "token_acc": 0.3451104756122889 + }, + { + "epoch": 5.699501612430373, + "grad_norm": 0.26017009619750575, + "learning_rate": 0.00017130100008333466, + "loss": 2.6609983444213867, + "step": 9723, + "token_acc": 0.3469309854698322 + }, + { + "epoch": 5.700087950747581, + "grad_norm": 0.2783159908329817, + "learning_rate": 0.0001712942041051452, + "loss": 2.670192241668701, + "step": 9724, + "token_acc": 0.34642788114030326 + }, + { + "epoch": 5.70067428906479, + "grad_norm": 0.28967877623041083, + "learning_rate": 0.00017128740745724278, + "loss": 2.7244067192077637, + "step": 9725, + "token_acc": 0.33750685862095997 + }, + { + "epoch": 5.701260627381999, + "grad_norm": 0.28872852257096737, + "learning_rate": 0.00017128061013969124, + "loss": 2.7011876106262207, + "step": 9726, + "token_acc": 0.3427534897712038 + }, + { + "epoch": 5.701846965699208, + "grad_norm": 0.26385900821646313, + "learning_rate": 0.00017127381215255438, + "loss": 2.7199716567993164, + "step": 9727, + "token_acc": 0.33854522119116254 + }, + { + "epoch": 5.7024333040164175, + "grad_norm": 0.26415495525422983, + "learning_rate": 0.00017126701349589614, + "loss": 2.6752676963806152, + "step": 9728, + "token_acc": 0.3442394700270629 + }, + { + "epoch": 5.703019642333627, + "grad_norm": 0.25377827077903775, + "learning_rate": 0.00017126021416978034, + "loss": 2.6789393424987793, + "step": 9729, + "token_acc": 0.3440548303614595 + }, + { + "epoch": 5.703605980650836, + "grad_norm": 0.2999311219193406, + "learning_rate": 0.00017125341417427082, + "loss": 2.721673011779785, + "step": 9730, + "token_acc": 0.33771657041314973 + }, + { + "epoch": 5.704192318968045, + "grad_norm": 0.26782542896093176, + "learning_rate": 0.0001712466135094315, + "loss": 2.7176051139831543, + "step": 9731, + "token_acc": 0.33998301874469333 + }, + { + "epoch": 5.704778657285254, + "grad_norm": 0.2688848674073152, + "learning_rate": 0.00017123981217532626, + "loss": 2.6988096237182617, + "step": 9732, + "token_acc": 0.34087736916369 + }, + { + "epoch": 5.705364995602462, + "grad_norm": 0.37106934319896084, + "learning_rate": 0.00017123301017201898, + "loss": 2.6864659786224365, + "step": 9733, + "token_acc": 0.3436038096890127 + }, + { + "epoch": 5.705951333919671, + "grad_norm": 0.3260039128527722, + "learning_rate": 0.00017122620749957353, + "loss": 2.680455207824707, + "step": 9734, + "token_acc": 0.34424102712537213 + }, + { + "epoch": 5.70653767223688, + "grad_norm": 0.27620516224952646, + "learning_rate": 0.00017121940415805388, + "loss": 2.650160312652588, + "step": 9735, + "token_acc": 0.35079507272810423 + }, + { + "epoch": 5.7071240105540895, + "grad_norm": 0.2949262597828128, + "learning_rate": 0.00017121260014752387, + "loss": 2.671657085418701, + "step": 9736, + "token_acc": 0.34582348305752564 + }, + { + "epoch": 5.707710348871299, + "grad_norm": 0.2990210719743637, + "learning_rate": 0.00017120579546804745, + "loss": 2.6990580558776855, + "step": 9737, + "token_acc": 0.3421709303343136 + }, + { + "epoch": 5.708296687188508, + "grad_norm": 0.29328931084556414, + "learning_rate": 0.00017119899011968856, + "loss": 2.6936371326446533, + "step": 9738, + "token_acc": 0.34231551268949073 + }, + { + "epoch": 5.708883025505717, + "grad_norm": 0.3198841559198466, + "learning_rate": 0.00017119218410251106, + "loss": 2.6864168643951416, + "step": 9739, + "token_acc": 0.34291262638685704 + }, + { + "epoch": 5.709469363822926, + "grad_norm": 0.26424598711522645, + "learning_rate": 0.00017118537741657894, + "loss": 2.6947760581970215, + "step": 9740, + "token_acc": 0.34041564058220497 + }, + { + "epoch": 5.710055702140135, + "grad_norm": 0.30728248812990044, + "learning_rate": 0.00017117857006195613, + "loss": 2.7045998573303223, + "step": 9741, + "token_acc": 0.33979316465325193 + }, + { + "epoch": 5.710642040457344, + "grad_norm": 0.260746561086653, + "learning_rate": 0.00017117176203870654, + "loss": 2.7167842388153076, + "step": 9742, + "token_acc": 0.3384944081029753 + }, + { + "epoch": 5.711228378774553, + "grad_norm": 0.32745000709729316, + "learning_rate": 0.0001711649533468942, + "loss": 2.714959144592285, + "step": 9743, + "token_acc": 0.33814542908369494 + }, + { + "epoch": 5.711814717091762, + "grad_norm": 0.30974940885090324, + "learning_rate": 0.000171158143986583, + "loss": 2.6906955242156982, + "step": 9744, + "token_acc": 0.3432065079817636 + }, + { + "epoch": 5.7124010554089715, + "grad_norm": 0.2601905798155636, + "learning_rate": 0.0001711513339578369, + "loss": 2.7093000411987305, + "step": 9745, + "token_acc": 0.3405824285362115 + }, + { + "epoch": 5.71298739372618, + "grad_norm": 0.3375232613722728, + "learning_rate": 0.00017114452326071988, + "loss": 2.690463066101074, + "step": 9746, + "token_acc": 0.34313885053367704 + }, + { + "epoch": 5.713573732043389, + "grad_norm": 0.2563378741457845, + "learning_rate": 0.00017113771189529597, + "loss": 2.7091119289398193, + "step": 9747, + "token_acc": 0.34089345848877617 + }, + { + "epoch": 5.714160070360598, + "grad_norm": 0.30036645834518505, + "learning_rate": 0.0001711308998616291, + "loss": 2.665311336517334, + "step": 9748, + "token_acc": 0.34789655522812724 + }, + { + "epoch": 5.714746408677807, + "grad_norm": 0.26170582974491674, + "learning_rate": 0.0001711240871597833, + "loss": 2.680868625640869, + "step": 9749, + "token_acc": 0.3446827133479212 + }, + { + "epoch": 5.715332746995016, + "grad_norm": 0.3111825179256955, + "learning_rate": 0.00017111727378982253, + "loss": 2.700237989425659, + "step": 9750, + "token_acc": 0.34122715632489464 + }, + { + "epoch": 5.715919085312225, + "grad_norm": 0.2925940683568707, + "learning_rate": 0.0001711104597518108, + "loss": 2.724099636077881, + "step": 9751, + "token_acc": 0.3383845768234606 + }, + { + "epoch": 5.716505423629434, + "grad_norm": 0.26024852222650807, + "learning_rate": 0.0001711036450458121, + "loss": 2.6854677200317383, + "step": 9752, + "token_acc": 0.3426606006027891 + }, + { + "epoch": 5.7170917619466435, + "grad_norm": 0.2692403325356353, + "learning_rate": 0.0001710968296718905, + "loss": 2.6941633224487305, + "step": 9753, + "token_acc": 0.3423151643422362 + }, + { + "epoch": 5.717678100263853, + "grad_norm": 0.2545522749257339, + "learning_rate": 0.00017109001363011, + "loss": 2.68007230758667, + "step": 9754, + "token_acc": 0.34384698853459883 + }, + { + "epoch": 5.718264438581061, + "grad_norm": 0.25431640576537073, + "learning_rate": 0.00017108319692053458, + "loss": 2.690493106842041, + "step": 9755, + "token_acc": 0.3428599166176764 + }, + { + "epoch": 5.71885077689827, + "grad_norm": 0.25750593221008694, + "learning_rate": 0.00017107637954322832, + "loss": 2.720430850982666, + "step": 9756, + "token_acc": 0.33831440849506944 + }, + { + "epoch": 5.719437115215479, + "grad_norm": 0.2599397867838589, + "learning_rate": 0.0001710695614982553, + "loss": 2.700011730194092, + "step": 9757, + "token_acc": 0.3420546074908482 + }, + { + "epoch": 5.720023453532688, + "grad_norm": 0.2607872752975199, + "learning_rate": 0.00017106274278567944, + "loss": 2.686519145965576, + "step": 9758, + "token_acc": 0.34319085167414876 + }, + { + "epoch": 5.720609791849897, + "grad_norm": 0.2560634124270843, + "learning_rate": 0.0001710559234055649, + "loss": 2.705650806427002, + "step": 9759, + "token_acc": 0.34117915557246103 + }, + { + "epoch": 5.721196130167106, + "grad_norm": 0.2555389294052096, + "learning_rate": 0.00017104910335797574, + "loss": 2.6971700191497803, + "step": 9760, + "token_acc": 0.3409017863071917 + }, + { + "epoch": 5.7217824684843155, + "grad_norm": 0.2737229160664518, + "learning_rate": 0.00017104228264297597, + "loss": 2.6999640464782715, + "step": 9761, + "token_acc": 0.34035474600371723 + }, + { + "epoch": 5.722368806801525, + "grad_norm": 0.25775683000244204, + "learning_rate": 0.00017103546126062967, + "loss": 2.729640245437622, + "step": 9762, + "token_acc": 0.3371763485098539 + }, + { + "epoch": 5.722955145118734, + "grad_norm": 0.28364063335595124, + "learning_rate": 0.00017102863921100093, + "loss": 2.706819772720337, + "step": 9763, + "token_acc": 0.34029474149803474 + }, + { + "epoch": 5.723541483435943, + "grad_norm": 0.28267700268868706, + "learning_rate": 0.00017102181649415385, + "loss": 2.6976799964904785, + "step": 9764, + "token_acc": 0.3414142371310949 + }, + { + "epoch": 5.724127821753152, + "grad_norm": 0.26026206756766684, + "learning_rate": 0.00017101499311015253, + "loss": 2.7080278396606445, + "step": 9765, + "token_acc": 0.3390354585272451 + }, + { + "epoch": 5.724714160070361, + "grad_norm": 0.29403574871222393, + "learning_rate": 0.00017100816905906102, + "loss": 2.7065558433532715, + "step": 9766, + "token_acc": 0.3404565158493633 + }, + { + "epoch": 5.725300498387569, + "grad_norm": 0.2504377363742552, + "learning_rate": 0.00017100134434094345, + "loss": 2.715836524963379, + "step": 9767, + "token_acc": 0.33924575827770165 + }, + { + "epoch": 5.725886836704778, + "grad_norm": 0.30351948374689247, + "learning_rate": 0.00017099451895586393, + "loss": 2.6740026473999023, + "step": 9768, + "token_acc": 0.34603050218850384 + }, + { + "epoch": 5.7264731750219875, + "grad_norm": 0.2960985384998379, + "learning_rate": 0.00017098769290388655, + "loss": 2.71990966796875, + "step": 9769, + "token_acc": 0.33975473491434266 + }, + { + "epoch": 5.727059513339197, + "grad_norm": 0.2660476293423822, + "learning_rate": 0.00017098086618507548, + "loss": 2.687426805496216, + "step": 9770, + "token_acc": 0.34411978403136323 + }, + { + "epoch": 5.727645851656406, + "grad_norm": 0.30950077527573744, + "learning_rate": 0.00017097403879949482, + "loss": 2.662525177001953, + "step": 9771, + "token_acc": 0.34718101685467034 + }, + { + "epoch": 5.728232189973615, + "grad_norm": 0.25575448276451124, + "learning_rate": 0.00017096721074720867, + "loss": 2.699496269226074, + "step": 9772, + "token_acc": 0.3423390608915987 + }, + { + "epoch": 5.728818528290824, + "grad_norm": 0.29161144072794143, + "learning_rate": 0.00017096038202828125, + "loss": 2.699843406677246, + "step": 9773, + "token_acc": 0.34085409271370054 + }, + { + "epoch": 5.729404866608033, + "grad_norm": 0.31530960157301263, + "learning_rate": 0.00017095355264277665, + "loss": 2.6798179149627686, + "step": 9774, + "token_acc": 0.34392669285997673 + }, + { + "epoch": 5.729991204925242, + "grad_norm": 0.2700875857523055, + "learning_rate": 0.000170946722590759, + "loss": 2.711947202682495, + "step": 9775, + "token_acc": 0.33865903634129824 + }, + { + "epoch": 5.730577543242451, + "grad_norm": 0.2672616730425358, + "learning_rate": 0.00017093989187229251, + "loss": 2.7120652198791504, + "step": 9776, + "token_acc": 0.34098240276940023 + }, + { + "epoch": 5.7311638815596595, + "grad_norm": 0.2626277186236491, + "learning_rate": 0.00017093306048744132, + "loss": 2.700963020324707, + "step": 9777, + "token_acc": 0.34114509173752083 + }, + { + "epoch": 5.731750219876869, + "grad_norm": 0.26941788536500166, + "learning_rate": 0.00017092622843626964, + "loss": 2.689690113067627, + "step": 9778, + "token_acc": 0.34305546134888876 + }, + { + "epoch": 5.732336558194078, + "grad_norm": 0.2651415603443734, + "learning_rate": 0.0001709193957188416, + "loss": 2.729811429977417, + "step": 9779, + "token_acc": 0.3366144356303663 + }, + { + "epoch": 5.732922896511287, + "grad_norm": 0.26929007793912, + "learning_rate": 0.00017091256233522142, + "loss": 2.665167808532715, + "step": 9780, + "token_acc": 0.34690240559879965 + }, + { + "epoch": 5.733509234828496, + "grad_norm": 0.3010938273143518, + "learning_rate": 0.00017090572828547327, + "loss": 2.699704647064209, + "step": 9781, + "token_acc": 0.33939044948943564 + }, + { + "epoch": 5.734095573145705, + "grad_norm": 0.263358523281644, + "learning_rate": 0.00017089889356966133, + "loss": 2.6912002563476562, + "step": 9782, + "token_acc": 0.34348387295840704 + }, + { + "epoch": 5.734681911462914, + "grad_norm": 0.2853994119846367, + "learning_rate": 0.00017089205818784985, + "loss": 2.7092292308807373, + "step": 9783, + "token_acc": 0.34015680676902305 + }, + { + "epoch": 5.735268249780123, + "grad_norm": 0.26808640613941365, + "learning_rate": 0.00017088522214010299, + "loss": 2.707169532775879, + "step": 9784, + "token_acc": 0.3403616927087603 + }, + { + "epoch": 5.735854588097332, + "grad_norm": 0.27839189640848083, + "learning_rate": 0.00017087838542648496, + "loss": 2.6989340782165527, + "step": 9785, + "token_acc": 0.34183965974694347 + }, + { + "epoch": 5.7364409264145415, + "grad_norm": 0.31492998215987655, + "learning_rate": 0.00017087154804706006, + "loss": 2.708911895751953, + "step": 9786, + "token_acc": 0.3408630807808951 + }, + { + "epoch": 5.737027264731751, + "grad_norm": 0.2736303969955324, + "learning_rate": 0.00017086471000189244, + "loss": 2.7278456687927246, + "step": 9787, + "token_acc": 0.3365573596902869 + }, + { + "epoch": 5.73761360304896, + "grad_norm": 0.2805829287957194, + "learning_rate": 0.00017085787129104634, + "loss": 2.737631320953369, + "step": 9788, + "token_acc": 0.33693952540368693 + }, + { + "epoch": 5.738199941366168, + "grad_norm": 0.27965463383293826, + "learning_rate": 0.00017085103191458604, + "loss": 2.6989285945892334, + "step": 9789, + "token_acc": 0.34236492451050077 + }, + { + "epoch": 5.738786279683377, + "grad_norm": 0.2596622286561764, + "learning_rate": 0.0001708441918725758, + "loss": 2.730696439743042, + "step": 9790, + "token_acc": 0.33648783135800236 + }, + { + "epoch": 5.739372618000586, + "grad_norm": 0.2997292460840302, + "learning_rate": 0.0001708373511650798, + "loss": 2.6817541122436523, + "step": 9791, + "token_acc": 0.34421246869645444 + }, + { + "epoch": 5.739958956317795, + "grad_norm": 0.24675660635945773, + "learning_rate": 0.00017083050979216236, + "loss": 2.673198699951172, + "step": 9792, + "token_acc": 0.346139544187454 + }, + { + "epoch": 5.740545294635004, + "grad_norm": 0.36060202645340544, + "learning_rate": 0.00017082366775388773, + "loss": 2.6794328689575195, + "step": 9793, + "token_acc": 0.345360091281294 + }, + { + "epoch": 5.7411316329522135, + "grad_norm": 0.3101345877382126, + "learning_rate": 0.00017081682505032015, + "loss": 2.724865198135376, + "step": 9794, + "token_acc": 0.33718523399698686 + }, + { + "epoch": 5.741717971269423, + "grad_norm": 0.2750404062721753, + "learning_rate": 0.00017080998168152395, + "loss": 2.691318988800049, + "step": 9795, + "token_acc": 0.3415126596882801 + }, + { + "epoch": 5.742304309586632, + "grad_norm": 0.2820934091914795, + "learning_rate": 0.0001708031376475634, + "loss": 2.693655014038086, + "step": 9796, + "token_acc": 0.3416504559037426 + }, + { + "epoch": 5.742890647903841, + "grad_norm": 0.29047740790181464, + "learning_rate": 0.00017079629294850275, + "loss": 2.7151474952697754, + "step": 9797, + "token_acc": 0.3401318369206965 + }, + { + "epoch": 5.743476986221049, + "grad_norm": 0.30857586091775796, + "learning_rate": 0.00017078944758440633, + "loss": 2.689760208129883, + "step": 9798, + "token_acc": 0.3425099349190808 + }, + { + "epoch": 5.744063324538258, + "grad_norm": 0.26762364101303837, + "learning_rate": 0.00017078260155533847, + "loss": 2.6821975708007812, + "step": 9799, + "token_acc": 0.34278142893178654 + }, + { + "epoch": 5.744649662855467, + "grad_norm": 0.29230952279561856, + "learning_rate": 0.00017077575486136343, + "loss": 2.6854281425476074, + "step": 9800, + "token_acc": 0.34561163212767687 + }, + { + "epoch": 5.745236001172676, + "grad_norm": 0.2571663120802006, + "learning_rate": 0.00017076890750254554, + "loss": 2.706923007965088, + "step": 9801, + "token_acc": 0.34170859588696834 + }, + { + "epoch": 5.7458223394898855, + "grad_norm": 0.2767398307788116, + "learning_rate": 0.00017076205947894914, + "loss": 2.7301294803619385, + "step": 9802, + "token_acc": 0.33542818815067293 + }, + { + "epoch": 5.746408677807095, + "grad_norm": 0.26326954399113556, + "learning_rate": 0.00017075521079063855, + "loss": 2.675654888153076, + "step": 9803, + "token_acc": 0.3440135714897886 + }, + { + "epoch": 5.746995016124304, + "grad_norm": 0.2535708109225153, + "learning_rate": 0.00017074836143767807, + "loss": 2.6660447120666504, + "step": 9804, + "token_acc": 0.34559922416043964 + }, + { + "epoch": 5.747581354441513, + "grad_norm": 0.2703662907696137, + "learning_rate": 0.00017074151142013207, + "loss": 2.6732985973358154, + "step": 9805, + "token_acc": 0.3448076226619127 + }, + { + "epoch": 5.748167692758722, + "grad_norm": 0.2635740837579622, + "learning_rate": 0.00017073466073806492, + "loss": 2.7102131843566895, + "step": 9806, + "token_acc": 0.34080677858981123 + }, + { + "epoch": 5.748754031075931, + "grad_norm": 0.284722146492488, + "learning_rate": 0.00017072780939154093, + "loss": 2.704298973083496, + "step": 9807, + "token_acc": 0.342929684253423 + }, + { + "epoch": 5.74934036939314, + "grad_norm": 0.27315126737049256, + "learning_rate": 0.0001707209573806245, + "loss": 2.666435718536377, + "step": 9808, + "token_acc": 0.3470690326557032 + }, + { + "epoch": 5.749926707710349, + "grad_norm": 0.28322056175773214, + "learning_rate": 0.00017071410470537995, + "loss": 2.6925525665283203, + "step": 9809, + "token_acc": 0.342310037873335 + }, + { + "epoch": 5.7505130460275575, + "grad_norm": 0.2637783080790276, + "learning_rate": 0.0001707072513658717, + "loss": 2.71258544921875, + "step": 9810, + "token_acc": 0.3395110985040282 + }, + { + "epoch": 5.751099384344767, + "grad_norm": 0.2588024379429364, + "learning_rate": 0.0001707003973621641, + "loss": 2.7056877613067627, + "step": 9811, + "token_acc": 0.3412268778595766 + }, + { + "epoch": 5.751685722661976, + "grad_norm": 0.27747871925939843, + "learning_rate": 0.00017069354269432148, + "loss": 2.689424514770508, + "step": 9812, + "token_acc": 0.3426340561264247 + }, + { + "epoch": 5.752272060979185, + "grad_norm": 0.2491068919037751, + "learning_rate": 0.00017068668736240836, + "loss": 2.695323944091797, + "step": 9813, + "token_acc": 0.339852237058096 + }, + { + "epoch": 5.752858399296394, + "grad_norm": 0.27572638773769415, + "learning_rate": 0.00017067983136648902, + "loss": 2.7250115871429443, + "step": 9814, + "token_acc": 0.3369874991903621 + }, + { + "epoch": 5.753444737613603, + "grad_norm": 0.26063019077469046, + "learning_rate": 0.00017067297470662793, + "loss": 2.705073356628418, + "step": 9815, + "token_acc": 0.34066050211772103 + }, + { + "epoch": 5.754031075930812, + "grad_norm": 0.26848478906821854, + "learning_rate": 0.00017066611738288944, + "loss": 2.6968345642089844, + "step": 9816, + "token_acc": 0.3423200932976607 + }, + { + "epoch": 5.754617414248021, + "grad_norm": 0.24301836934732673, + "learning_rate": 0.00017065925939533803, + "loss": 2.6776082515716553, + "step": 9817, + "token_acc": 0.3452482763318222 + }, + { + "epoch": 5.75520375256523, + "grad_norm": 0.2698772694944979, + "learning_rate": 0.0001706524007440381, + "loss": 2.678867816925049, + "step": 9818, + "token_acc": 0.345342908861112 + }, + { + "epoch": 5.7557900908824395, + "grad_norm": 0.27588484694928056, + "learning_rate": 0.00017064554142905407, + "loss": 2.706106185913086, + "step": 9819, + "token_acc": 0.3416944742199619 + }, + { + "epoch": 5.756376429199648, + "grad_norm": 0.26777047948125227, + "learning_rate": 0.00017063868145045038, + "loss": 2.7054548263549805, + "step": 9820, + "token_acc": 0.3411221484550492 + }, + { + "epoch": 5.756962767516857, + "grad_norm": 0.2669185983870583, + "learning_rate": 0.00017063182080829143, + "loss": 2.68900728225708, + "step": 9821, + "token_acc": 0.3419860445740471 + }, + { + "epoch": 5.757549105834066, + "grad_norm": 0.2491126689060044, + "learning_rate": 0.0001706249595026417, + "loss": 2.675116777420044, + "step": 9822, + "token_acc": 0.34582792805133206 + }, + { + "epoch": 5.758135444151275, + "grad_norm": 0.28586494416290215, + "learning_rate": 0.00017061809753356565, + "loss": 2.6776599884033203, + "step": 9823, + "token_acc": 0.3445602446732178 + }, + { + "epoch": 5.758721782468484, + "grad_norm": 0.2812164791649811, + "learning_rate": 0.00017061123490112777, + "loss": 2.6965208053588867, + "step": 9824, + "token_acc": 0.3430281166992415 + }, + { + "epoch": 5.759308120785693, + "grad_norm": 0.2676566619814792, + "learning_rate": 0.00017060437160539246, + "loss": 2.75732159614563, + "step": 9825, + "token_acc": 0.3325398585811589 + }, + { + "epoch": 5.759894459102902, + "grad_norm": 0.2628373444888273, + "learning_rate": 0.00017059750764642423, + "loss": 2.7129392623901367, + "step": 9826, + "token_acc": 0.33887337544362645 + }, + { + "epoch": 5.7604807974201115, + "grad_norm": 0.26176939565665425, + "learning_rate": 0.00017059064302428752, + "loss": 2.695472240447998, + "step": 9827, + "token_acc": 0.3412850065051976 + }, + { + "epoch": 5.761067135737321, + "grad_norm": 0.24716713038309338, + "learning_rate": 0.0001705837777390469, + "loss": 2.687631130218506, + "step": 9828, + "token_acc": 0.34257091983113447 + }, + { + "epoch": 5.76165347405453, + "grad_norm": 0.2634440661237052, + "learning_rate": 0.00017057691179076672, + "loss": 2.705292224884033, + "step": 9829, + "token_acc": 0.33907477239109146 + }, + { + "epoch": 5.762239812371739, + "grad_norm": 0.2591503987369195, + "learning_rate": 0.00017057004517951162, + "loss": 2.658693552017212, + "step": 9830, + "token_acc": 0.3477375141795627 + }, + { + "epoch": 5.762826150688948, + "grad_norm": 0.2851792219357145, + "learning_rate": 0.00017056317790534604, + "loss": 2.7019896507263184, + "step": 9831, + "token_acc": 0.340577612919157 + }, + { + "epoch": 5.763412489006156, + "grad_norm": 0.2708704863624883, + "learning_rate": 0.00017055630996833446, + "loss": 2.6829514503479004, + "step": 9832, + "token_acc": 0.34385945637701515 + }, + { + "epoch": 5.763998827323365, + "grad_norm": 0.25477104906801956, + "learning_rate": 0.00017054944136854144, + "loss": 2.7061400413513184, + "step": 9833, + "token_acc": 0.340193029688949 + }, + { + "epoch": 5.764585165640574, + "grad_norm": 0.25157506377940575, + "learning_rate": 0.00017054257210603148, + "loss": 2.6914963722229004, + "step": 9834, + "token_acc": 0.3424424841302894 + }, + { + "epoch": 5.7651715039577835, + "grad_norm": 0.2795433526164303, + "learning_rate": 0.00017053570218086907, + "loss": 2.682927131652832, + "step": 9835, + "token_acc": 0.34317964118362804 + }, + { + "epoch": 5.765757842274993, + "grad_norm": 0.2971328736056097, + "learning_rate": 0.00017052883159311883, + "loss": 2.7140562534332275, + "step": 9836, + "token_acc": 0.33813859782014055 + }, + { + "epoch": 5.766344180592202, + "grad_norm": 0.34355356910324225, + "learning_rate": 0.00017052196034284523, + "loss": 2.721029758453369, + "step": 9837, + "token_acc": 0.338979455296095 + }, + { + "epoch": 5.766930518909411, + "grad_norm": 0.35846365049813533, + "learning_rate": 0.0001705150884301129, + "loss": 2.707603931427002, + "step": 9838, + "token_acc": 0.34089033018867926 + }, + { + "epoch": 5.76751685722662, + "grad_norm": 0.28590922033111105, + "learning_rate": 0.0001705082158549863, + "loss": 2.6835408210754395, + "step": 9839, + "token_acc": 0.34474857230044903 + }, + { + "epoch": 5.768103195543829, + "grad_norm": 0.2912358074407059, + "learning_rate": 0.00017050134261753, + "loss": 2.6697421073913574, + "step": 9840, + "token_acc": 0.34671373891300006 + }, + { + "epoch": 5.768689533861037, + "grad_norm": 0.3085099938409532, + "learning_rate": 0.00017049446871780864, + "loss": 2.683354616165161, + "step": 9841, + "token_acc": 0.34385701552751324 + }, + { + "epoch": 5.7692758721782464, + "grad_norm": 0.2823658427206369, + "learning_rate": 0.0001704875941558867, + "loss": 2.718597888946533, + "step": 9842, + "token_acc": 0.33956534593432586 + }, + { + "epoch": 5.769862210495456, + "grad_norm": 0.3818686591329294, + "learning_rate": 0.0001704807189318288, + "loss": 2.686283588409424, + "step": 9843, + "token_acc": 0.34476382320772386 + }, + { + "epoch": 5.770448548812665, + "grad_norm": 0.2947384138222875, + "learning_rate": 0.0001704738430456995, + "loss": 2.6760177612304688, + "step": 9844, + "token_acc": 0.34464518616242307 + }, + { + "epoch": 5.771034887129874, + "grad_norm": 0.3036931938079206, + "learning_rate": 0.00017046696649756344, + "loss": 2.722147226333618, + "step": 9845, + "token_acc": 0.3384175743855767 + }, + { + "epoch": 5.771621225447083, + "grad_norm": 0.3153643447246259, + "learning_rate": 0.0001704600892874852, + "loss": 2.6861658096313477, + "step": 9846, + "token_acc": 0.3426058422646939 + }, + { + "epoch": 5.772207563764292, + "grad_norm": 0.28455942714804217, + "learning_rate": 0.00017045321141552933, + "loss": 2.7050533294677734, + "step": 9847, + "token_acc": 0.3418295680674236 + }, + { + "epoch": 5.772793902081501, + "grad_norm": 0.32415554933072555, + "learning_rate": 0.00017044633288176047, + "loss": 2.6833689212799072, + "step": 9848, + "token_acc": 0.34444718023380294 + }, + { + "epoch": 5.77338024039871, + "grad_norm": 0.28234730029784844, + "learning_rate": 0.00017043945368624326, + "loss": 2.7180192470550537, + "step": 9849, + "token_acc": 0.3387732670048456 + }, + { + "epoch": 5.773966578715919, + "grad_norm": 0.32880208754215906, + "learning_rate": 0.0001704325738290423, + "loss": 2.7026286125183105, + "step": 9850, + "token_acc": 0.34178133465043953 + }, + { + "epoch": 5.7745529170331285, + "grad_norm": 0.27976745585884905, + "learning_rate": 0.00017042569331022223, + "loss": 2.7076733112335205, + "step": 9851, + "token_acc": 0.3388434852825453 + }, + { + "epoch": 5.775139255350338, + "grad_norm": 0.28837813967273035, + "learning_rate": 0.00017041881212984765, + "loss": 2.714110851287842, + "step": 9852, + "token_acc": 0.3383640967191671 + }, + { + "epoch": 5.775725593667546, + "grad_norm": 0.27822257280391227, + "learning_rate": 0.00017041193028798324, + "loss": 2.693833112716675, + "step": 9853, + "token_acc": 0.34195815364957677 + }, + { + "epoch": 5.776311931984755, + "grad_norm": 0.3113184000401301, + "learning_rate": 0.00017040504778469362, + "loss": 2.73563289642334, + "step": 9854, + "token_acc": 0.33658639559618725 + }, + { + "epoch": 5.776898270301964, + "grad_norm": 0.270158787339092, + "learning_rate": 0.00017039816462004344, + "loss": 2.7067089080810547, + "step": 9855, + "token_acc": 0.34094095416124837 + }, + { + "epoch": 5.777484608619173, + "grad_norm": 0.2700713912768026, + "learning_rate": 0.00017039128079409735, + "loss": 2.7130002975463867, + "step": 9856, + "token_acc": 0.3384421104015833 + }, + { + "epoch": 5.778070946936382, + "grad_norm": 0.2715597860272181, + "learning_rate": 0.00017038439630692007, + "loss": 2.677408218383789, + "step": 9857, + "token_acc": 0.3453836053673794 + }, + { + "epoch": 5.778657285253591, + "grad_norm": 0.2809980535825738, + "learning_rate": 0.00017037751115857623, + "loss": 2.6988730430603027, + "step": 9858, + "token_acc": 0.3413678155737596 + }, + { + "epoch": 5.7792436235708005, + "grad_norm": 0.271100893965222, + "learning_rate": 0.00017037062534913048, + "loss": 2.742131471633911, + "step": 9859, + "token_acc": 0.33461016949152544 + }, + { + "epoch": 5.77982996188801, + "grad_norm": 0.26731614449294994, + "learning_rate": 0.00017036373887864754, + "loss": 2.694150924682617, + "step": 9860, + "token_acc": 0.3423007208633693 + }, + { + "epoch": 5.780416300205219, + "grad_norm": 0.29240572023505085, + "learning_rate": 0.00017035685174719207, + "loss": 2.702456474304199, + "step": 9861, + "token_acc": 0.3415720270172978 + }, + { + "epoch": 5.781002638522428, + "grad_norm": 0.27560362088919893, + "learning_rate": 0.0001703499639548288, + "loss": 2.712031126022339, + "step": 9862, + "token_acc": 0.34105246174944204 + }, + { + "epoch": 5.781588976839636, + "grad_norm": 0.2740949900561151, + "learning_rate": 0.00017034307550162244, + "loss": 2.7155933380126953, + "step": 9863, + "token_acc": 0.34084302131221633 + }, + { + "epoch": 5.782175315156845, + "grad_norm": 0.26057453892939514, + "learning_rate": 0.00017033618638763765, + "loss": 2.7052063941955566, + "step": 9864, + "token_acc": 0.3396987698454421 + }, + { + "epoch": 5.782761653474054, + "grad_norm": 0.2650309871599445, + "learning_rate": 0.0001703292966129392, + "loss": 2.7079169750213623, + "step": 9865, + "token_acc": 0.3395972780573755 + }, + { + "epoch": 5.783347991791263, + "grad_norm": 0.25919450461879134, + "learning_rate": 0.00017032240617759174, + "loss": 2.7104878425598145, + "step": 9866, + "token_acc": 0.3389209678175163 + }, + { + "epoch": 5.7839343301084725, + "grad_norm": 0.2640726158263265, + "learning_rate": 0.00017031551508166003, + "loss": 2.694451332092285, + "step": 9867, + "token_acc": 0.341696492756642 + }, + { + "epoch": 5.784520668425682, + "grad_norm": 0.26199249057539475, + "learning_rate": 0.00017030862332520881, + "loss": 2.7151966094970703, + "step": 9868, + "token_acc": 0.3399060614479747 + }, + { + "epoch": 5.785107006742891, + "grad_norm": 0.27173103939122334, + "learning_rate": 0.00017030173090830286, + "loss": 2.6787304878234863, + "step": 9869, + "token_acc": 0.34536717800096606 + }, + { + "epoch": 5.7856933450601, + "grad_norm": 0.2508454059759158, + "learning_rate": 0.00017029483783100684, + "loss": 2.6575913429260254, + "step": 9870, + "token_acc": 0.3473974721348195 + }, + { + "epoch": 5.786279683377309, + "grad_norm": 0.2637645934097564, + "learning_rate": 0.00017028794409338556, + "loss": 2.7062621116638184, + "step": 9871, + "token_acc": 0.3410778986662043 + }, + { + "epoch": 5.786866021694518, + "grad_norm": 0.24851569103866536, + "learning_rate": 0.00017028104969550375, + "loss": 2.6707816123962402, + "step": 9872, + "token_acc": 0.3466608989412104 + }, + { + "epoch": 5.787452360011727, + "grad_norm": 0.27884252514212116, + "learning_rate": 0.0001702741546374262, + "loss": 2.7049195766448975, + "step": 9873, + "token_acc": 0.3398031907676837 + }, + { + "epoch": 5.788038698328936, + "grad_norm": 0.2611104713806163, + "learning_rate": 0.00017026725891921765, + "loss": 2.710439682006836, + "step": 9874, + "token_acc": 0.3409179999325038 + }, + { + "epoch": 5.7886250366461445, + "grad_norm": 0.271528142839158, + "learning_rate": 0.00017026036254094286, + "loss": 2.711056709289551, + "step": 9875, + "token_acc": 0.33907462039940583 + }, + { + "epoch": 5.789211374963354, + "grad_norm": 0.261896450168623, + "learning_rate": 0.00017025346550266667, + "loss": 2.7395944595336914, + "step": 9876, + "token_acc": 0.3364757878156703 + }, + { + "epoch": 5.789797713280563, + "grad_norm": 0.2747974194373344, + "learning_rate": 0.00017024656780445385, + "loss": 2.6984763145446777, + "step": 9877, + "token_acc": 0.3416516993182384 + }, + { + "epoch": 5.790384051597772, + "grad_norm": 0.25256219589867474, + "learning_rate": 0.00017023966944636917, + "loss": 2.6613640785217285, + "step": 9878, + "token_acc": 0.34767067763067383 + }, + { + "epoch": 5.790970389914981, + "grad_norm": 0.2653499963420842, + "learning_rate": 0.00017023277042847745, + "loss": 2.7117462158203125, + "step": 9879, + "token_acc": 0.33886905983590554 + }, + { + "epoch": 5.79155672823219, + "grad_norm": 0.2897111629229459, + "learning_rate": 0.00017022587075084348, + "loss": 2.7103726863861084, + "step": 9880, + "token_acc": 0.3379442390451047 + }, + { + "epoch": 5.792143066549399, + "grad_norm": 0.3922365650863327, + "learning_rate": 0.00017021897041353209, + "loss": 2.7264010906219482, + "step": 9881, + "token_acc": 0.3369276610429645 + }, + { + "epoch": 5.792729404866608, + "grad_norm": 0.3657571921914574, + "learning_rate": 0.0001702120694166081, + "loss": 2.7064368724823, + "step": 9882, + "token_acc": 0.34111021925348983 + }, + { + "epoch": 5.793315743183817, + "grad_norm": 0.2606322539847341, + "learning_rate": 0.00017020516776013634, + "loss": 2.677274465560913, + "step": 9883, + "token_acc": 0.34396318702535716 + }, + { + "epoch": 5.7939020815010265, + "grad_norm": 0.3540055089079396, + "learning_rate": 0.0001701982654441816, + "loss": 2.7213034629821777, + "step": 9884, + "token_acc": 0.3380639183438106 + }, + { + "epoch": 5.794488419818235, + "grad_norm": 0.2871227717703711, + "learning_rate": 0.00017019136246880878, + "loss": 2.724184513092041, + "step": 9885, + "token_acc": 0.3383453071447403 + }, + { + "epoch": 5.795074758135444, + "grad_norm": 0.2882191398309349, + "learning_rate": 0.00017018445883408266, + "loss": 2.739137649536133, + "step": 9886, + "token_acc": 0.3352867506321038 + }, + { + "epoch": 5.795661096452653, + "grad_norm": 0.2855845744317301, + "learning_rate": 0.00017017755454006817, + "loss": 2.6894781589508057, + "step": 9887, + "token_acc": 0.3415025231108473 + }, + { + "epoch": 5.796247434769862, + "grad_norm": 0.25232864042816594, + "learning_rate": 0.00017017064958683008, + "loss": 2.704000949859619, + "step": 9888, + "token_acc": 0.3417113026874007 + }, + { + "epoch": 5.796833773087071, + "grad_norm": 0.286912625387369, + "learning_rate": 0.0001701637439744333, + "loss": 2.708357095718384, + "step": 9889, + "token_acc": 0.3411382288752959 + }, + { + "epoch": 5.79742011140428, + "grad_norm": 0.2673600945824778, + "learning_rate": 0.00017015683770294274, + "loss": 2.692500591278076, + "step": 9890, + "token_acc": 0.3421621259237485 + }, + { + "epoch": 5.798006449721489, + "grad_norm": 0.35247089279727145, + "learning_rate": 0.00017014993077242317, + "loss": 2.66996693611145, + "step": 9891, + "token_acc": 0.3469176194545721 + }, + { + "epoch": 5.7985927880386985, + "grad_norm": 0.25670137465137927, + "learning_rate": 0.00017014302318293952, + "loss": 2.709150791168213, + "step": 9892, + "token_acc": 0.3407032013812327 + }, + { + "epoch": 5.799179126355908, + "grad_norm": 0.3004955298700424, + "learning_rate": 0.00017013611493455673, + "loss": 2.71744966506958, + "step": 9893, + "token_acc": 0.3381570329130506 + }, + { + "epoch": 5.799765464673117, + "grad_norm": 0.2776637363342749, + "learning_rate": 0.00017012920602733962, + "loss": 2.6978535652160645, + "step": 9894, + "token_acc": 0.34159674548690566 + }, + { + "epoch": 5.800351802990326, + "grad_norm": 0.3105306906853624, + "learning_rate": 0.00017012229646135314, + "loss": 2.7267956733703613, + "step": 9895, + "token_acc": 0.33800561086627434 + }, + { + "epoch": 5.800938141307535, + "grad_norm": 0.2742074383091318, + "learning_rate": 0.00017011538623666215, + "loss": 2.720053195953369, + "step": 9896, + "token_acc": 0.3393743319840173 + }, + { + "epoch": 5.801524479624743, + "grad_norm": 0.2991120024134006, + "learning_rate": 0.00017010847535333163, + "loss": 2.7433085441589355, + "step": 9897, + "token_acc": 0.3354463130659767 + }, + { + "epoch": 5.802110817941952, + "grad_norm": 0.26908350350116056, + "learning_rate": 0.00017010156381142642, + "loss": 2.7164466381073, + "step": 9898, + "token_acc": 0.33720555609397057 + }, + { + "epoch": 5.802697156259161, + "grad_norm": 0.2791132245236656, + "learning_rate": 0.00017009465161101151, + "loss": 2.695408344268799, + "step": 9899, + "token_acc": 0.3414637453591836 + }, + { + "epoch": 5.8032834945763705, + "grad_norm": 0.256986655377693, + "learning_rate": 0.0001700877387521518, + "loss": 2.6765737533569336, + "step": 9900, + "token_acc": 0.3441346546684964 + }, + { + "epoch": 5.80386983289358, + "grad_norm": 0.2816778481906404, + "learning_rate": 0.00017008082523491217, + "loss": 2.744688034057617, + "step": 9901, + "token_acc": 0.3353278808474659 + }, + { + "epoch": 5.804456171210789, + "grad_norm": 0.26803354233762583, + "learning_rate": 0.00017007391105935767, + "loss": 2.674678325653076, + "step": 9902, + "token_acc": 0.3462757118262842 + }, + { + "epoch": 5.805042509527998, + "grad_norm": 0.24871430935658076, + "learning_rate": 0.00017006699622555322, + "loss": 2.7315614223480225, + "step": 9903, + "token_acc": 0.3355871997823733 + }, + { + "epoch": 5.805628847845207, + "grad_norm": 0.26032667024581124, + "learning_rate": 0.00017006008073356372, + "loss": 2.664966106414795, + "step": 9904, + "token_acc": 0.3475562591997599 + }, + { + "epoch": 5.806215186162416, + "grad_norm": 0.2634571288378092, + "learning_rate": 0.00017005316458345419, + "loss": 2.6923460960388184, + "step": 9905, + "token_acc": 0.3433181736426938 + }, + { + "epoch": 5.806801524479624, + "grad_norm": 0.2540756801350001, + "learning_rate": 0.00017004624777528955, + "loss": 2.7298476696014404, + "step": 9906, + "token_acc": 0.337559711552304 + }, + { + "epoch": 5.807387862796833, + "grad_norm": 0.27032998428841026, + "learning_rate": 0.00017003933030913484, + "loss": 2.7230639457702637, + "step": 9907, + "token_acc": 0.3412245006516131 + }, + { + "epoch": 5.8079742011140425, + "grad_norm": 0.2619893067194834, + "learning_rate": 0.000170032412185055, + "loss": 2.7009029388427734, + "step": 9908, + "token_acc": 0.34153084480622337 + }, + { + "epoch": 5.808560539431252, + "grad_norm": 0.2624514054167554, + "learning_rate": 0.00017002549340311497, + "loss": 2.7039709091186523, + "step": 9909, + "token_acc": 0.3404080535687573 + }, + { + "epoch": 5.809146877748461, + "grad_norm": 0.27098018640290145, + "learning_rate": 0.0001700185739633798, + "loss": 2.6965699195861816, + "step": 9910, + "token_acc": 0.34230634845883906 + }, + { + "epoch": 5.80973321606567, + "grad_norm": 0.27992902575083634, + "learning_rate": 0.00017001165386591453, + "loss": 2.725646495819092, + "step": 9911, + "token_acc": 0.33803956764552534 + }, + { + "epoch": 5.810319554382879, + "grad_norm": 0.25422863433920795, + "learning_rate": 0.0001700047331107841, + "loss": 2.706587076187134, + "step": 9912, + "token_acc": 0.34110687834483183 + }, + { + "epoch": 5.810905892700088, + "grad_norm": 0.2772500181655618, + "learning_rate": 0.0001699978116980535, + "loss": 2.7238001823425293, + "step": 9913, + "token_acc": 0.3383483360101595 + }, + { + "epoch": 5.811492231017297, + "grad_norm": 0.42270189565024113, + "learning_rate": 0.0001699908896277878, + "loss": 2.7148947715759277, + "step": 9914, + "token_acc": 0.33979248875088464 + }, + { + "epoch": 5.812078569334506, + "grad_norm": 0.36910019612221867, + "learning_rate": 0.00016998396690005198, + "loss": 2.703819513320923, + "step": 9915, + "token_acc": 0.34196862816684703 + }, + { + "epoch": 5.812664907651715, + "grad_norm": 0.2897915684067254, + "learning_rate": 0.00016997704351491113, + "loss": 2.7244815826416016, + "step": 9916, + "token_acc": 0.3383307858250133 + }, + { + "epoch": 5.8132512459689245, + "grad_norm": 0.3777776631188572, + "learning_rate": 0.00016997011947243024, + "loss": 2.7135095596313477, + "step": 9917, + "token_acc": 0.33834990455082053 + }, + { + "epoch": 5.813837584286133, + "grad_norm": 0.2662816191919359, + "learning_rate": 0.00016996319477267436, + "loss": 2.7268238067626953, + "step": 9918, + "token_acc": 0.33744900061817434 + }, + { + "epoch": 5.814423922603342, + "grad_norm": 0.32684631391700236, + "learning_rate": 0.00016995626941570854, + "loss": 2.711203098297119, + "step": 9919, + "token_acc": 0.33953106430091223 + }, + { + "epoch": 5.815010260920551, + "grad_norm": 0.25419712176027265, + "learning_rate": 0.00016994934340159784, + "loss": 2.6893577575683594, + "step": 9920, + "token_acc": 0.34266725542046556 + }, + { + "epoch": 5.81559659923776, + "grad_norm": 0.3398682523196575, + "learning_rate": 0.00016994241673040734, + "loss": 2.6690673828125, + "step": 9921, + "token_acc": 0.3466630618882148 + }, + { + "epoch": 5.816182937554969, + "grad_norm": 0.2456883262681673, + "learning_rate": 0.00016993548940220205, + "loss": 2.7188563346862793, + "step": 9922, + "token_acc": 0.3393908666972976 + }, + { + "epoch": 5.816769275872178, + "grad_norm": 0.28520728015909846, + "learning_rate": 0.00016992856141704712, + "loss": 2.708329677581787, + "step": 9923, + "token_acc": 0.34072562601800643 + }, + { + "epoch": 5.817355614189387, + "grad_norm": 0.2456178031467423, + "learning_rate": 0.00016992163277500754, + "loss": 2.7370846271514893, + "step": 9924, + "token_acc": 0.33466575926114245 + }, + { + "epoch": 5.8179419525065965, + "grad_norm": 0.2933964984982332, + "learning_rate": 0.00016991470347614844, + "loss": 2.7135021686553955, + "step": 9925, + "token_acc": 0.3398556927968693 + }, + { + "epoch": 5.818528290823806, + "grad_norm": 0.25145191091752794, + "learning_rate": 0.00016990777352053494, + "loss": 2.730299949645996, + "step": 9926, + "token_acc": 0.3364658442401668 + }, + { + "epoch": 5.819114629141015, + "grad_norm": 0.2654714925566974, + "learning_rate": 0.0001699008429082321, + "loss": 2.7233567237854004, + "step": 9927, + "token_acc": 0.33760656225218083 + }, + { + "epoch": 5.819700967458223, + "grad_norm": 0.26987655787105874, + "learning_rate": 0.00016989391163930502, + "loss": 2.669539213180542, + "step": 9928, + "token_acc": 0.34442568316991884 + }, + { + "epoch": 5.820287305775432, + "grad_norm": 0.2672032742127561, + "learning_rate": 0.00016988697971381884, + "loss": 2.7079505920410156, + "step": 9929, + "token_acc": 0.341373858308432 + }, + { + "epoch": 5.820873644092641, + "grad_norm": 0.28123575078024077, + "learning_rate": 0.00016988004713183865, + "loss": 2.7293198108673096, + "step": 9930, + "token_acc": 0.3373321056477932 + }, + { + "epoch": 5.82145998240985, + "grad_norm": 0.26603945586842404, + "learning_rate": 0.00016987311389342956, + "loss": 2.7408835887908936, + "step": 9931, + "token_acc": 0.3354048940304004 + }, + { + "epoch": 5.822046320727059, + "grad_norm": 0.2644369511745641, + "learning_rate": 0.00016986617999865678, + "loss": 2.685732126235962, + "step": 9932, + "token_acc": 0.3445139327668244 + }, + { + "epoch": 5.8226326590442685, + "grad_norm": 0.27624884793473636, + "learning_rate": 0.00016985924544758534, + "loss": 2.718756914138794, + "step": 9933, + "token_acc": 0.33832097923483073 + }, + { + "epoch": 5.823218997361478, + "grad_norm": 0.2536518600650178, + "learning_rate": 0.00016985231024028045, + "loss": 2.6872506141662598, + "step": 9934, + "token_acc": 0.34382720539700246 + }, + { + "epoch": 5.823805335678687, + "grad_norm": 0.25480957616751143, + "learning_rate": 0.00016984537437680718, + "loss": 2.7489430904388428, + "step": 9935, + "token_acc": 0.33409342461260966 + }, + { + "epoch": 5.824391673995896, + "grad_norm": 0.27108110740992875, + "learning_rate": 0.0001698384378572308, + "loss": 2.7308692932128906, + "step": 9936, + "token_acc": 0.33691821590229964 + }, + { + "epoch": 5.824978012313105, + "grad_norm": 0.24807992885136756, + "learning_rate": 0.00016983150068161637, + "loss": 2.72170352935791, + "step": 9937, + "token_acc": 0.33933277191164085 + }, + { + "epoch": 5.825564350630314, + "grad_norm": 0.2716620070621618, + "learning_rate": 0.0001698245628500291, + "loss": 2.775378704071045, + "step": 9938, + "token_acc": 0.3298786653185035 + }, + { + "epoch": 5.826150688947523, + "grad_norm": 0.24440034382546813, + "learning_rate": 0.00016981762436253414, + "loss": 2.703197479248047, + "step": 9939, + "token_acc": 0.3406854688492977 + }, + { + "epoch": 5.826737027264731, + "grad_norm": 0.28210599664211694, + "learning_rate": 0.0001698106852191967, + "loss": 2.730391025543213, + "step": 9940, + "token_acc": 0.3370422022798933 + }, + { + "epoch": 5.8273233655819405, + "grad_norm": 0.25155715021008596, + "learning_rate": 0.00016980374542008194, + "loss": 2.709158420562744, + "step": 9941, + "token_acc": 0.3411392787386988 + }, + { + "epoch": 5.82790970389915, + "grad_norm": 0.2869697838097068, + "learning_rate": 0.00016979680496525504, + "loss": 2.7391624450683594, + "step": 9942, + "token_acc": 0.33503271972412424 + }, + { + "epoch": 5.828496042216359, + "grad_norm": 0.29240338891864864, + "learning_rate": 0.00016978986385478122, + "loss": 2.706935405731201, + "step": 9943, + "token_acc": 0.34227092028659284 + }, + { + "epoch": 5.829082380533568, + "grad_norm": 0.2664310569058118, + "learning_rate": 0.00016978292208872565, + "loss": 2.6844325065612793, + "step": 9944, + "token_acc": 0.3443434690242699 + }, + { + "epoch": 5.829668718850777, + "grad_norm": 0.40606653446626256, + "learning_rate": 0.0001697759796671536, + "loss": 2.7427282333374023, + "step": 9945, + "token_acc": 0.33529680488001506 + }, + { + "epoch": 5.830255057167986, + "grad_norm": 0.2888228317100483, + "learning_rate": 0.0001697690365901302, + "loss": 2.7218165397644043, + "step": 9946, + "token_acc": 0.3391144759156344 + }, + { + "epoch": 5.830841395485195, + "grad_norm": 0.3483322131078382, + "learning_rate": 0.00016976209285772076, + "loss": 2.7196950912475586, + "step": 9947, + "token_acc": 0.3384283955683217 + }, + { + "epoch": 5.831427733802404, + "grad_norm": 0.3082320085449344, + "learning_rate": 0.00016975514846999046, + "loss": 2.698625087738037, + "step": 9948, + "token_acc": 0.3413796194735066 + }, + { + "epoch": 5.8320140721196125, + "grad_norm": 0.3108255610072256, + "learning_rate": 0.0001697482034270045, + "loss": 2.724987030029297, + "step": 9949, + "token_acc": 0.33773047402541856 + }, + { + "epoch": 5.832600410436822, + "grad_norm": 0.31028656233283125, + "learning_rate": 0.00016974125772882816, + "loss": 2.6671595573425293, + "step": 9950, + "token_acc": 0.34660720554392627 + }, + { + "epoch": 5.833186748754031, + "grad_norm": 0.30857944762494294, + "learning_rate": 0.0001697343113755267, + "loss": 2.712918996810913, + "step": 9951, + "token_acc": 0.339410552652881 + }, + { + "epoch": 5.83377308707124, + "grad_norm": 0.2868327620466821, + "learning_rate": 0.00016972736436716537, + "loss": 2.6995151042938232, + "step": 9952, + "token_acc": 0.3424719020889493 + }, + { + "epoch": 5.834359425388449, + "grad_norm": 0.3081804607465968, + "learning_rate": 0.0001697204167038094, + "loss": 2.6873269081115723, + "step": 9953, + "token_acc": 0.34373103926497356 + }, + { + "epoch": 5.834945763705658, + "grad_norm": 0.28268646342874776, + "learning_rate": 0.00016971346838552402, + "loss": 2.7061896324157715, + "step": 9954, + "token_acc": 0.3408886752283497 + }, + { + "epoch": 5.835532102022867, + "grad_norm": 0.29450958473737765, + "learning_rate": 0.0001697065194123746, + "loss": 2.6924009323120117, + "step": 9955, + "token_acc": 0.3427805519610256 + }, + { + "epoch": 5.836118440340076, + "grad_norm": 0.2552245376143469, + "learning_rate": 0.00016969956978442634, + "loss": 2.6997978687286377, + "step": 9956, + "token_acc": 0.34066715930537944 + }, + { + "epoch": 5.836704778657285, + "grad_norm": 0.3009136038326612, + "learning_rate": 0.00016969261950174454, + "loss": 2.691918134689331, + "step": 9957, + "token_acc": 0.34179415764593485 + }, + { + "epoch": 5.8372911169744945, + "grad_norm": 0.2513591545575658, + "learning_rate": 0.0001696856685643945, + "loss": 2.7007393836975098, + "step": 9958, + "token_acc": 0.34120910993972103 + }, + { + "epoch": 5.837877455291704, + "grad_norm": 0.2739958395930539, + "learning_rate": 0.0001696787169724415, + "loss": 2.6600186824798584, + "step": 9959, + "token_acc": 0.3460854370143851 + }, + { + "epoch": 5.838463793608913, + "grad_norm": 0.2594616963930268, + "learning_rate": 0.00016967176472595084, + "loss": 2.6728415489196777, + "step": 9960, + "token_acc": 0.3455509469503189 + }, + { + "epoch": 5.839050131926121, + "grad_norm": 0.26462913258508086, + "learning_rate": 0.00016966481182498786, + "loss": 2.7204184532165527, + "step": 9961, + "token_acc": 0.3384629283087049 + }, + { + "epoch": 5.83963647024333, + "grad_norm": 0.25803996151663977, + "learning_rate": 0.00016965785826961782, + "loss": 2.7037971019744873, + "step": 9962, + "token_acc": 0.33892441026839293 + }, + { + "epoch": 5.840222808560539, + "grad_norm": 0.25859270697650855, + "learning_rate": 0.0001696509040599061, + "loss": 2.6715481281280518, + "step": 9963, + "token_acc": 0.34756979939726246 + }, + { + "epoch": 5.840809146877748, + "grad_norm": 0.2646490504791317, + "learning_rate": 0.00016964394919591794, + "loss": 2.7166812419891357, + "step": 9964, + "token_acc": 0.3395637322623688 + }, + { + "epoch": 5.841395485194957, + "grad_norm": 0.25410632111166526, + "learning_rate": 0.00016963699367771878, + "loss": 2.7142887115478516, + "step": 9965, + "token_acc": 0.3400764870107413 + }, + { + "epoch": 5.8419818235121665, + "grad_norm": 0.2475568789473016, + "learning_rate": 0.00016963003750537387, + "loss": 2.6944870948791504, + "step": 9966, + "token_acc": 0.3415110015543641 + }, + { + "epoch": 5.842568161829376, + "grad_norm": 0.25241725539995225, + "learning_rate": 0.0001696230806789486, + "loss": 2.714154005050659, + "step": 9967, + "token_acc": 0.339983186426861 + }, + { + "epoch": 5.843154500146585, + "grad_norm": 0.2563744322372722, + "learning_rate": 0.0001696161231985083, + "loss": 2.7427749633789062, + "step": 9968, + "token_acc": 0.3356262223286319 + }, + { + "epoch": 5.843740838463794, + "grad_norm": 0.2683617216159031, + "learning_rate": 0.00016960916506411828, + "loss": 2.735119104385376, + "step": 9969, + "token_acc": 0.3357239921485732 + }, + { + "epoch": 5.844327176781003, + "grad_norm": 0.24855856901659903, + "learning_rate": 0.000169602206275844, + "loss": 2.7303712368011475, + "step": 9970, + "token_acc": 0.33612715341787647 + }, + { + "epoch": 5.844913515098211, + "grad_norm": 0.27671665516563054, + "learning_rate": 0.00016959524683375082, + "loss": 2.665950298309326, + "step": 9971, + "token_acc": 0.3482199889281608 + }, + { + "epoch": 5.84549985341542, + "grad_norm": 0.24415923777402596, + "learning_rate": 0.00016958828673790404, + "loss": 2.7043566703796387, + "step": 9972, + "token_acc": 0.34074640421590613 + }, + { + "epoch": 5.8460861917326294, + "grad_norm": 0.2706878373713526, + "learning_rate": 0.00016958132598836906, + "loss": 2.6884937286376953, + "step": 9973, + "token_acc": 0.3417396286322187 + }, + { + "epoch": 5.846672530049839, + "grad_norm": 0.2533230871594401, + "learning_rate": 0.0001695743645852113, + "loss": 2.7115750312805176, + "step": 9974, + "token_acc": 0.3400953443554113 + }, + { + "epoch": 5.847258868367048, + "grad_norm": 0.2886379767740009, + "learning_rate": 0.00016956740252849613, + "loss": 2.74137806892395, + "step": 9975, + "token_acc": 0.33615073029779263 + }, + { + "epoch": 5.847845206684257, + "grad_norm": 0.2957519840778072, + "learning_rate": 0.00016956043981828896, + "loss": 2.708195209503174, + "step": 9976, + "token_acc": 0.3398092890503373 + }, + { + "epoch": 5.848431545001466, + "grad_norm": 0.26677935705571937, + "learning_rate": 0.00016955347645465524, + "loss": 2.727712631225586, + "step": 9977, + "token_acc": 0.33780282109811055 + }, + { + "epoch": 5.849017883318675, + "grad_norm": 0.3323888400127302, + "learning_rate": 0.00016954651243766028, + "loss": 2.678679943084717, + "step": 9978, + "token_acc": 0.3435169085694535 + }, + { + "epoch": 5.849604221635884, + "grad_norm": 0.297354970618711, + "learning_rate": 0.00016953954776736954, + "loss": 2.673854351043701, + "step": 9979, + "token_acc": 0.3457612313601875 + }, + { + "epoch": 5.850190559953093, + "grad_norm": 0.27558008240987675, + "learning_rate": 0.00016953258244384846, + "loss": 2.71500825881958, + "step": 9980, + "token_acc": 0.33866103029462774 + }, + { + "epoch": 5.850776898270302, + "grad_norm": 0.35320214782461534, + "learning_rate": 0.0001695256164671625, + "loss": 2.7390122413635254, + "step": 9981, + "token_acc": 0.3352152970052937 + }, + { + "epoch": 5.8513632365875115, + "grad_norm": 0.26727435911858827, + "learning_rate": 0.00016951864983737704, + "loss": 2.690431833267212, + "step": 9982, + "token_acc": 0.3430393781030196 + }, + { + "epoch": 5.85194957490472, + "grad_norm": 0.2961654439002612, + "learning_rate": 0.00016951168255455754, + "loss": 2.705667018890381, + "step": 9983, + "token_acc": 0.33995198184717407 + }, + { + "epoch": 5.852535913221929, + "grad_norm": 0.2803647409541711, + "learning_rate": 0.00016950471461876944, + "loss": 2.7075843811035156, + "step": 9984, + "token_acc": 0.34105333510864655 + }, + { + "epoch": 5.853122251539138, + "grad_norm": 0.27612137691321464, + "learning_rate": 0.00016949774603007822, + "loss": 2.705247402191162, + "step": 9985, + "token_acc": 0.3412376437147342 + }, + { + "epoch": 5.853708589856347, + "grad_norm": 0.28171585377805697, + "learning_rate": 0.00016949077678854931, + "loss": 2.730396270751953, + "step": 9986, + "token_acc": 0.3373874340130421 + }, + { + "epoch": 5.854294928173556, + "grad_norm": 0.2523313080847772, + "learning_rate": 0.00016948380689424823, + "loss": 2.735898017883301, + "step": 9987, + "token_acc": 0.33607392004123515 + }, + { + "epoch": 5.854881266490765, + "grad_norm": 0.2761524101196835, + "learning_rate": 0.00016947683634724035, + "loss": 2.6894724369049072, + "step": 9988, + "token_acc": 0.34331600689621405 + }, + { + "epoch": 5.855467604807974, + "grad_norm": 0.25534915392967134, + "learning_rate": 0.00016946986514759126, + "loss": 2.7613301277160645, + "step": 9989, + "token_acc": 0.33098989056588424 + }, + { + "epoch": 5.8560539431251835, + "grad_norm": 0.2957055430775944, + "learning_rate": 0.00016946289329536641, + "loss": 2.7702126502990723, + "step": 9990, + "token_acc": 0.3319583209163961 + }, + { + "epoch": 5.856640281442393, + "grad_norm": 0.27198599490216757, + "learning_rate": 0.00016945592079063127, + "loss": 2.721731185913086, + "step": 9991, + "token_acc": 0.338326440732585 + }, + { + "epoch": 5.857226619759601, + "grad_norm": 0.27316199541981495, + "learning_rate": 0.00016944894763345136, + "loss": 2.7360892295837402, + "step": 9992, + "token_acc": 0.33559566421714326 + }, + { + "epoch": 5.85781295807681, + "grad_norm": 0.2771587595203273, + "learning_rate": 0.00016944197382389212, + "loss": 2.681577205657959, + "step": 9993, + "token_acc": 0.3427863588187181 + }, + { + "epoch": 5.858399296394019, + "grad_norm": 0.2656120639492622, + "learning_rate": 0.00016943499936201915, + "loss": 2.6937918663024902, + "step": 9994, + "token_acc": 0.34134429748060724 + }, + { + "epoch": 5.858985634711228, + "grad_norm": 0.25648093204243366, + "learning_rate": 0.0001694280242478979, + "loss": 2.706294536590576, + "step": 9995, + "token_acc": 0.3403471884847471 + }, + { + "epoch": 5.859571973028437, + "grad_norm": 0.25068124282515175, + "learning_rate": 0.00016942104848159396, + "loss": 2.7461438179016113, + "step": 9996, + "token_acc": 0.33354783186454157 + }, + { + "epoch": 5.860158311345646, + "grad_norm": 0.2555126375074528, + "learning_rate": 0.00016941407206317276, + "loss": 2.7627153396606445, + "step": 9997, + "token_acc": 0.3310113704368641 + }, + { + "epoch": 5.8607446496628555, + "grad_norm": 0.2781497098931414, + "learning_rate": 0.00016940709499269993, + "loss": 2.72983980178833, + "step": 9998, + "token_acc": 0.33620312193601937 + }, + { + "epoch": 5.861330987980065, + "grad_norm": 0.25106612709367165, + "learning_rate": 0.00016940011727024096, + "loss": 2.732506036758423, + "step": 9999, + "token_acc": 0.33766975922302483 + }, + { + "epoch": 5.861917326297274, + "grad_norm": 0.24493323068107956, + "learning_rate": 0.00016939313889586142, + "loss": 2.6832520961761475, + "step": 10000, + "token_acc": 0.3431363208096463 + }, + { + "epoch": 5.862503664614483, + "grad_norm": 0.25396668754230994, + "learning_rate": 0.00016938615986962684, + "loss": 2.7155680656433105, + "step": 10001, + "token_acc": 0.33868345608028366 + }, + { + "epoch": 5.863090002931692, + "grad_norm": 0.24431532469658523, + "learning_rate": 0.00016937918019160276, + "loss": 2.7020833492279053, + "step": 10002, + "token_acc": 0.3406546046804314 + }, + { + "epoch": 5.863676341248901, + "grad_norm": 0.2559684887359459, + "learning_rate": 0.00016937219986185479, + "loss": 2.723104476928711, + "step": 10003, + "token_acc": 0.33706968703800755 + }, + { + "epoch": 5.86426267956611, + "grad_norm": 0.24635514179198958, + "learning_rate": 0.00016936521888044848, + "loss": 2.747711420059204, + "step": 10004, + "token_acc": 0.33622377476917636 + }, + { + "epoch": 5.864849017883318, + "grad_norm": 0.24555119175792478, + "learning_rate": 0.00016935823724744943, + "loss": 2.745075225830078, + "step": 10005, + "token_acc": 0.3349257612183621 + }, + { + "epoch": 5.8654353562005275, + "grad_norm": 0.246204847647822, + "learning_rate": 0.00016935125496292318, + "loss": 2.679321765899658, + "step": 10006, + "token_acc": 0.34444394563176534 + }, + { + "epoch": 5.866021694517737, + "grad_norm": 0.2526062642049166, + "learning_rate": 0.00016934427202693536, + "loss": 2.736929178237915, + "step": 10007, + "token_acc": 0.33564534710523747 + }, + { + "epoch": 5.866608032834946, + "grad_norm": 0.2551705697262054, + "learning_rate": 0.00016933728843955152, + "loss": 2.717261791229248, + "step": 10008, + "token_acc": 0.3356517813827152 + }, + { + "epoch": 5.867194371152155, + "grad_norm": 0.3028303010112887, + "learning_rate": 0.0001693303042008373, + "loss": 2.682924747467041, + "step": 10009, + "token_acc": 0.3436227512378863 + }, + { + "epoch": 5.867780709469364, + "grad_norm": 0.3131615954871314, + "learning_rate": 0.00016932331931085833, + "loss": 2.7356791496276855, + "step": 10010, + "token_acc": 0.3361654145196607 + }, + { + "epoch": 5.868367047786573, + "grad_norm": 0.25116871468897767, + "learning_rate": 0.00016931633376968014, + "loss": 2.677077293395996, + "step": 10011, + "token_acc": 0.3451714131028838 + }, + { + "epoch": 5.868953386103782, + "grad_norm": 0.27282656042662845, + "learning_rate": 0.00016930934757736842, + "loss": 2.7018814086914062, + "step": 10012, + "token_acc": 0.33883248070527866 + }, + { + "epoch": 5.869539724420991, + "grad_norm": 0.2839502227954031, + "learning_rate": 0.00016930236073398874, + "loss": 2.7132701873779297, + "step": 10013, + "token_acc": 0.33983370285311654 + }, + { + "epoch": 5.8701260627381995, + "grad_norm": 0.2631069401271334, + "learning_rate": 0.00016929537323960684, + "loss": 2.7463746070861816, + "step": 10014, + "token_acc": 0.334955290107709 + }, + { + "epoch": 5.870712401055409, + "grad_norm": 0.2950921774695979, + "learning_rate": 0.00016928838509428824, + "loss": 2.71547269821167, + "step": 10015, + "token_acc": 0.33972507768959803 + }, + { + "epoch": 5.871298739372618, + "grad_norm": 0.2735020683664484, + "learning_rate": 0.00016928139629809863, + "loss": 2.677398204803467, + "step": 10016, + "token_acc": 0.34326303631997873 + }, + { + "epoch": 5.871885077689827, + "grad_norm": 0.27052708383697827, + "learning_rate": 0.00016927440685110366, + "loss": 2.700289011001587, + "step": 10017, + "token_acc": 0.34301772869150243 + }, + { + "epoch": 5.872471416007036, + "grad_norm": 0.2738451345925023, + "learning_rate": 0.000169267416753369, + "loss": 2.721689462661743, + "step": 10018, + "token_acc": 0.3367934651111415 + }, + { + "epoch": 5.873057754324245, + "grad_norm": 0.2570570651585135, + "learning_rate": 0.00016926042600496025, + "loss": 2.6989293098449707, + "step": 10019, + "token_acc": 0.3429785904384273 + }, + { + "epoch": 5.873644092641454, + "grad_norm": 0.3042203856212929, + "learning_rate": 0.0001692534346059432, + "loss": 2.7226452827453613, + "step": 10020, + "token_acc": 0.3396675457860062 + }, + { + "epoch": 5.874230430958663, + "grad_norm": 0.29444838585426586, + "learning_rate": 0.00016924644255638342, + "loss": 2.726085662841797, + "step": 10021, + "token_acc": 0.3390521908250893 + }, + { + "epoch": 5.874816769275872, + "grad_norm": 0.26004467931772224, + "learning_rate": 0.0001692394498563466, + "loss": 2.700791835784912, + "step": 10022, + "token_acc": 0.34140784365335264 + }, + { + "epoch": 5.8754031075930815, + "grad_norm": 0.35153222034173465, + "learning_rate": 0.00016923245650589847, + "loss": 2.727527618408203, + "step": 10023, + "token_acc": 0.33672599572259115 + }, + { + "epoch": 5.875989445910291, + "grad_norm": 0.3663739715983905, + "learning_rate": 0.00016922546250510472, + "loss": 2.7064261436462402, + "step": 10024, + "token_acc": 0.34125639229685867 + }, + { + "epoch": 5.8765757842275, + "grad_norm": 0.25233248188302204, + "learning_rate": 0.00016921846785403102, + "loss": 2.7502636909484863, + "step": 10025, + "token_acc": 0.3335853920951666 + }, + { + "epoch": 5.877162122544708, + "grad_norm": 0.3661230261285711, + "learning_rate": 0.0001692114725527431, + "loss": 2.691458225250244, + "step": 10026, + "token_acc": 0.34192705826649566 + }, + { + "epoch": 5.877748460861917, + "grad_norm": 0.24771104007834868, + "learning_rate": 0.0001692044766013066, + "loss": 2.7181143760681152, + "step": 10027, + "token_acc": 0.33770751312125863 + }, + { + "epoch": 5.878334799179126, + "grad_norm": 0.30941000830892607, + "learning_rate": 0.00016919747999978734, + "loss": 2.7243120670318604, + "step": 10028, + "token_acc": 0.33832667903213975 + }, + { + "epoch": 5.878921137496335, + "grad_norm": 0.24378793971991228, + "learning_rate": 0.000169190482748251, + "loss": 2.744317054748535, + "step": 10029, + "token_acc": 0.3343541830222851 + }, + { + "epoch": 5.879507475813544, + "grad_norm": 0.29450228183821714, + "learning_rate": 0.00016918348484676332, + "loss": 2.727663993835449, + "step": 10030, + "token_acc": 0.337220224875248 + }, + { + "epoch": 5.8800938141307535, + "grad_norm": 0.24575377786513727, + "learning_rate": 0.00016917648629539002, + "loss": 2.7202882766723633, + "step": 10031, + "token_acc": 0.33815590356273006 + }, + { + "epoch": 5.880680152447963, + "grad_norm": 0.278284442438251, + "learning_rate": 0.00016916948709419684, + "loss": 2.690361976623535, + "step": 10032, + "token_acc": 0.34312333466690725 + }, + { + "epoch": 5.881266490765172, + "grad_norm": 0.24946675331438417, + "learning_rate": 0.00016916248724324954, + "loss": 2.710378408432007, + "step": 10033, + "token_acc": 0.3392705978161868 + }, + { + "epoch": 5.881852829082381, + "grad_norm": 0.2964444978003064, + "learning_rate": 0.00016915548674261387, + "loss": 2.6898016929626465, + "step": 10034, + "token_acc": 0.3432944167092522 + }, + { + "epoch": 5.88243916739959, + "grad_norm": 0.25498506091700623, + "learning_rate": 0.0001691484855923556, + "loss": 2.7475600242614746, + "step": 10035, + "token_acc": 0.3359395931658457 + }, + { + "epoch": 5.883025505716798, + "grad_norm": 0.2624255268620245, + "learning_rate": 0.00016914148379254047, + "loss": 2.6807546615600586, + "step": 10036, + "token_acc": 0.3458657612120471 + }, + { + "epoch": 5.883611844034007, + "grad_norm": 0.25074544928686404, + "learning_rate": 0.00016913448134323427, + "loss": 2.700540781021118, + "step": 10037, + "token_acc": 0.3412262433363459 + }, + { + "epoch": 5.884198182351216, + "grad_norm": 0.30626560982390727, + "learning_rate": 0.00016912747824450276, + "loss": 2.7350263595581055, + "step": 10038, + "token_acc": 0.3358779440968589 + }, + { + "epoch": 5.8847845206684255, + "grad_norm": 0.2683487849631449, + "learning_rate": 0.0001691204744964118, + "loss": 2.7285869121551514, + "step": 10039, + "token_acc": 0.336587305430735 + }, + { + "epoch": 5.885370858985635, + "grad_norm": 0.2541068544379682, + "learning_rate": 0.00016911347009902707, + "loss": 2.7220938205718994, + "step": 10040, + "token_acc": 0.33699998427829514 + }, + { + "epoch": 5.885957197302844, + "grad_norm": 0.26701081712120683, + "learning_rate": 0.00016910646505241444, + "loss": 2.714755058288574, + "step": 10041, + "token_acc": 0.3411365128444058 + }, + { + "epoch": 5.886543535620053, + "grad_norm": 0.2660290823455849, + "learning_rate": 0.0001690994593566397, + "loss": 2.7057929039001465, + "step": 10042, + "token_acc": 0.3422482336111616 + }, + { + "epoch": 5.887129873937262, + "grad_norm": 0.27057891232764697, + "learning_rate": 0.00016909245301176861, + "loss": 2.7170567512512207, + "step": 10043, + "token_acc": 0.33949803925677746 + }, + { + "epoch": 5.887716212254471, + "grad_norm": 0.253691041291685, + "learning_rate": 0.00016908544601786706, + "loss": 2.713286876678467, + "step": 10044, + "token_acc": 0.33850873429202005 + }, + { + "epoch": 5.88830255057168, + "grad_norm": 0.26975142116684775, + "learning_rate": 0.00016907843837500085, + "loss": 2.698566436767578, + "step": 10045, + "token_acc": 0.34244160536843465 + }, + { + "epoch": 5.888888888888889, + "grad_norm": 0.25655403369670665, + "learning_rate": 0.0001690714300832358, + "loss": 2.7141926288604736, + "step": 10046, + "token_acc": 0.33841881063634144 + }, + { + "epoch": 5.889475227206098, + "grad_norm": 0.2775470679048955, + "learning_rate": 0.0001690644211426377, + "loss": 2.6909971237182617, + "step": 10047, + "token_acc": 0.34290730653221085 + }, + { + "epoch": 5.890061565523307, + "grad_norm": 0.24186288697184866, + "learning_rate": 0.00016905741155327246, + "loss": 2.728443145751953, + "step": 10048, + "token_acc": 0.3360793724703607 + }, + { + "epoch": 5.890647903840516, + "grad_norm": 0.2658494822318302, + "learning_rate": 0.00016905040131520588, + "loss": 2.6868700981140137, + "step": 10049, + "token_acc": 0.34324313822001734 + }, + { + "epoch": 5.891234242157725, + "grad_norm": 0.24731428669705158, + "learning_rate": 0.00016904339042850386, + "loss": 2.6995363235473633, + "step": 10050, + "token_acc": 0.3438812381419344 + }, + { + "epoch": 5.891820580474934, + "grad_norm": 0.26846177166257823, + "learning_rate": 0.00016903637889323218, + "loss": 2.7287354469299316, + "step": 10051, + "token_acc": 0.33603048879726577 + }, + { + "epoch": 5.892406918792143, + "grad_norm": 0.2503188336907777, + "learning_rate": 0.00016902936670945678, + "loss": 2.719649076461792, + "step": 10052, + "token_acc": 0.3387959422137242 + }, + { + "epoch": 5.892993257109352, + "grad_norm": 0.28499320101761116, + "learning_rate": 0.00016902235387724346, + "loss": 2.723842144012451, + "step": 10053, + "token_acc": 0.3391266773270406 + }, + { + "epoch": 5.893579595426561, + "grad_norm": 0.26114462898386326, + "learning_rate": 0.00016901534039665816, + "loss": 2.697676658630371, + "step": 10054, + "token_acc": 0.34146936334931344 + }, + { + "epoch": 5.89416593374377, + "grad_norm": 0.2582907228033285, + "learning_rate": 0.00016900832626776672, + "loss": 2.697214126586914, + "step": 10055, + "token_acc": 0.3404737483688385 + }, + { + "epoch": 5.8947522720609795, + "grad_norm": 0.2775114090668468, + "learning_rate": 0.00016900131149063507, + "loss": 2.7080087661743164, + "step": 10056, + "token_acc": 0.3397652943938934 + }, + { + "epoch": 5.895338610378188, + "grad_norm": 0.26195127181940703, + "learning_rate": 0.00016899429606532905, + "loss": 2.7213408946990967, + "step": 10057, + "token_acc": 0.3380230583148901 + }, + { + "epoch": 5.895924948695397, + "grad_norm": 0.24399284525978315, + "learning_rate": 0.0001689872799919146, + "loss": 2.7128515243530273, + "step": 10058, + "token_acc": 0.34011451895774175 + }, + { + "epoch": 5.896511287012606, + "grad_norm": 0.2527571299463814, + "learning_rate": 0.00016898026327045765, + "loss": 2.6993370056152344, + "step": 10059, + "token_acc": 0.34091895163666164 + }, + { + "epoch": 5.897097625329815, + "grad_norm": 0.2455955128465418, + "learning_rate": 0.00016897324590102404, + "loss": 2.7303175926208496, + "step": 10060, + "token_acc": 0.3350498483335134 + }, + { + "epoch": 5.897683963647024, + "grad_norm": 0.2581313722605992, + "learning_rate": 0.00016896622788367975, + "loss": 2.6940836906433105, + "step": 10061, + "token_acc": 0.34191245064570297 + }, + { + "epoch": 5.898270301964233, + "grad_norm": 0.28236584875064186, + "learning_rate": 0.00016895920921849067, + "loss": 2.711979627609253, + "step": 10062, + "token_acc": 0.33907035010219555 + }, + { + "epoch": 5.898856640281442, + "grad_norm": 0.2548364117502969, + "learning_rate": 0.00016895218990552272, + "loss": 2.715607166290283, + "step": 10063, + "token_acc": 0.3397585628993401 + }, + { + "epoch": 5.8994429785986515, + "grad_norm": 0.2546606514810528, + "learning_rate": 0.0001689451699448419, + "loss": 2.724278211593628, + "step": 10064, + "token_acc": 0.33668514693626117 + }, + { + "epoch": 5.900029316915861, + "grad_norm": 0.3172609746865497, + "learning_rate": 0.00016893814933651412, + "loss": 2.7009623050689697, + "step": 10065, + "token_acc": 0.34205782922508426 + }, + { + "epoch": 5.90061565523307, + "grad_norm": 0.28790419764338393, + "learning_rate": 0.00016893112808060527, + "loss": 2.723097801208496, + "step": 10066, + "token_acc": 0.3378764518449995 + }, + { + "epoch": 5.901201993550279, + "grad_norm": 0.25280529544607333, + "learning_rate": 0.0001689241061771814, + "loss": 2.6836447715759277, + "step": 10067, + "token_acc": 0.34226113233922384 + }, + { + "epoch": 5.901788331867488, + "grad_norm": 0.33438953330389093, + "learning_rate": 0.00016891708362630842, + "loss": 2.741642475128174, + "step": 10068, + "token_acc": 0.33421431069998836 + }, + { + "epoch": 5.902374670184696, + "grad_norm": 0.25299918839933627, + "learning_rate": 0.0001689100604280523, + "loss": 2.743349075317383, + "step": 10069, + "token_acc": 0.3329881576945539 + }, + { + "epoch": 5.902961008501905, + "grad_norm": 0.2690771408670462, + "learning_rate": 0.00016890303658247902, + "loss": 2.7648682594299316, + "step": 10070, + "token_acc": 0.3312713595172364 + }, + { + "epoch": 5.903547346819114, + "grad_norm": 0.2610014029532568, + "learning_rate": 0.00016889601208965456, + "loss": 2.7135047912597656, + "step": 10071, + "token_acc": 0.33981160886858663 + }, + { + "epoch": 5.9041336851363235, + "grad_norm": 0.24999015687568105, + "learning_rate": 0.00016888898694964488, + "loss": 2.664006233215332, + "step": 10072, + "token_acc": 0.34749988943894816 + }, + { + "epoch": 5.904720023453533, + "grad_norm": 0.257429646063742, + "learning_rate": 0.00016888196116251604, + "loss": 2.719818353652954, + "step": 10073, + "token_acc": 0.3377865245978108 + }, + { + "epoch": 5.905306361770742, + "grad_norm": 0.26387080191662715, + "learning_rate": 0.00016887493472833397, + "loss": 2.7447457313537598, + "step": 10074, + "token_acc": 0.33444293757448734 + }, + { + "epoch": 5.905892700087951, + "grad_norm": 0.24807202646982135, + "learning_rate": 0.0001688679076471647, + "loss": 2.721515655517578, + "step": 10075, + "token_acc": 0.33652087884699017 + }, + { + "epoch": 5.90647903840516, + "grad_norm": 0.27162421528252745, + "learning_rate": 0.00016886087991907424, + "loss": 2.7095324993133545, + "step": 10076, + "token_acc": 0.3397269967298922 + }, + { + "epoch": 5.907065376722369, + "grad_norm": 0.25609099850570555, + "learning_rate": 0.00016885385154412862, + "loss": 2.680759906768799, + "step": 10077, + "token_acc": 0.3448195958154769 + }, + { + "epoch": 5.907651715039578, + "grad_norm": 0.24784251886823802, + "learning_rate": 0.00016884682252239384, + "loss": 2.741856098175049, + "step": 10078, + "token_acc": 0.3345811934347349 + }, + { + "epoch": 5.908238053356786, + "grad_norm": 0.2555031980846938, + "learning_rate": 0.00016883979285393593, + "loss": 2.675039291381836, + "step": 10079, + "token_acc": 0.34487673989602546 + }, + { + "epoch": 5.9088243916739955, + "grad_norm": 0.24819935423417258, + "learning_rate": 0.00016883276253882093, + "loss": 2.685575485229492, + "step": 10080, + "token_acc": 0.34459982854959154 + }, + { + "epoch": 5.909410729991205, + "grad_norm": 0.2676498754293093, + "learning_rate": 0.00016882573157711486, + "loss": 2.6943202018737793, + "step": 10081, + "token_acc": 0.3405362134491183 + }, + { + "epoch": 5.909997068308414, + "grad_norm": 0.2686884666601504, + "learning_rate": 0.0001688186999688838, + "loss": 2.768827199935913, + "step": 10082, + "token_acc": 0.3319729039286235 + }, + { + "epoch": 5.910583406625623, + "grad_norm": 0.2523587822239195, + "learning_rate": 0.0001688116677141938, + "loss": 2.7474024295806885, + "step": 10083, + "token_acc": 0.3340068013062398 + }, + { + "epoch": 5.911169744942832, + "grad_norm": 0.2654585892112961, + "learning_rate": 0.0001688046348131109, + "loss": 2.7422313690185547, + "step": 10084, + "token_acc": 0.3355551605640479 + }, + { + "epoch": 5.911756083260041, + "grad_norm": 0.2584848805150892, + "learning_rate": 0.0001687976012657012, + "loss": 2.7703118324279785, + "step": 10085, + "token_acc": 0.3311910221908282 + }, + { + "epoch": 5.91234242157725, + "grad_norm": 0.2544015870909013, + "learning_rate": 0.00016879056707203068, + "loss": 2.7154178619384766, + "step": 10086, + "token_acc": 0.3379070265079357 + }, + { + "epoch": 5.912928759894459, + "grad_norm": 0.2611113798033157, + "learning_rate": 0.0001687835322321655, + "loss": 2.7297258377075195, + "step": 10087, + "token_acc": 0.3378544941492403 + }, + { + "epoch": 5.913515098211668, + "grad_norm": 0.24706921966583278, + "learning_rate": 0.00016877649674617174, + "loss": 2.7041428089141846, + "step": 10088, + "token_acc": 0.34071838353377365 + }, + { + "epoch": 5.9141014365288775, + "grad_norm": 0.2561260248175171, + "learning_rate": 0.00016876946061411546, + "loss": 2.724003314971924, + "step": 10089, + "token_acc": 0.33762091019609913 + }, + { + "epoch": 5.914687774846087, + "grad_norm": 0.24707000149635414, + "learning_rate": 0.00016876242383606277, + "loss": 2.734863519668579, + "step": 10090, + "token_acc": 0.33636622268201216 + }, + { + "epoch": 5.915274113163295, + "grad_norm": 0.249752896429239, + "learning_rate": 0.00016875538641207975, + "loss": 2.721538543701172, + "step": 10091, + "token_acc": 0.33632850752984145 + }, + { + "epoch": 5.915860451480504, + "grad_norm": 0.2612057511735482, + "learning_rate": 0.00016874834834223256, + "loss": 2.6812188625335693, + "step": 10092, + "token_acc": 0.3451552663297347 + }, + { + "epoch": 5.916446789797713, + "grad_norm": 0.2843336172093094, + "learning_rate": 0.00016874130962658726, + "loss": 2.7335152626037598, + "step": 10093, + "token_acc": 0.33540852236246455 + }, + { + "epoch": 5.917033128114922, + "grad_norm": 0.28123713903239245, + "learning_rate": 0.00016873427026520998, + "loss": 2.7167530059814453, + "step": 10094, + "token_acc": 0.33933930053012074 + }, + { + "epoch": 5.917619466432131, + "grad_norm": 0.25441452740695014, + "learning_rate": 0.00016872723025816683, + "loss": 2.6968724727630615, + "step": 10095, + "token_acc": 0.34167201687608917 + }, + { + "epoch": 5.91820580474934, + "grad_norm": 0.25176451116859444, + "learning_rate": 0.000168720189605524, + "loss": 2.705535650253296, + "step": 10096, + "token_acc": 0.34005191385106875 + }, + { + "epoch": 5.9187921430665495, + "grad_norm": 0.26072492490265464, + "learning_rate": 0.00016871314830734757, + "loss": 2.7111315727233887, + "step": 10097, + "token_acc": 0.3392982410389475 + }, + { + "epoch": 5.919378481383759, + "grad_norm": 0.2767946720655263, + "learning_rate": 0.0001687061063637037, + "loss": 2.727653980255127, + "step": 10098, + "token_acc": 0.3377626509118931 + }, + { + "epoch": 5.919964819700968, + "grad_norm": 0.26826580300472896, + "learning_rate": 0.00016869906377465856, + "loss": 2.7062578201293945, + "step": 10099, + "token_acc": 0.34077583669557016 + }, + { + "epoch": 5.920551158018176, + "grad_norm": 0.2633998971499819, + "learning_rate": 0.0001686920205402783, + "loss": 2.7585015296936035, + "step": 10100, + "token_acc": 0.33199755428495886 + }, + { + "epoch": 5.921137496335385, + "grad_norm": 0.252971193379832, + "learning_rate": 0.00016868497666062903, + "loss": 2.749950885772705, + "step": 10101, + "token_acc": 0.3347576951642185 + }, + { + "epoch": 5.921723834652594, + "grad_norm": 0.3218610297108547, + "learning_rate": 0.00016867793213577698, + "loss": 2.6981067657470703, + "step": 10102, + "token_acc": 0.341605800992275 + }, + { + "epoch": 5.922310172969803, + "grad_norm": 0.38947634568837347, + "learning_rate": 0.0001686708869657883, + "loss": 2.733358860015869, + "step": 10103, + "token_acc": 0.33697598914481636 + }, + { + "epoch": 5.9228965112870124, + "grad_norm": 0.3371059251436706, + "learning_rate": 0.00016866384115072917, + "loss": 2.7359580993652344, + "step": 10104, + "token_acc": 0.336489905845813 + }, + { + "epoch": 5.923482849604222, + "grad_norm": 0.2743430984718241, + "learning_rate": 0.0001686567946906658, + "loss": 2.7520432472229004, + "step": 10105, + "token_acc": 0.33413284558810324 + }, + { + "epoch": 5.924069187921431, + "grad_norm": 0.3126853672817783, + "learning_rate": 0.00016864974758566434, + "loss": 2.7216532230377197, + "step": 10106, + "token_acc": 0.3373539012993663 + }, + { + "epoch": 5.92465552623864, + "grad_norm": 0.26192901266594687, + "learning_rate": 0.000168642699835791, + "loss": 2.7034573554992676, + "step": 10107, + "token_acc": 0.3406948537836108 + }, + { + "epoch": 5.925241864555849, + "grad_norm": 0.2706106850519992, + "learning_rate": 0.000168635651441112, + "loss": 2.7299089431762695, + "step": 10108, + "token_acc": 0.3362670817104227 + }, + { + "epoch": 5.925828202873058, + "grad_norm": 0.2611442056301714, + "learning_rate": 0.00016862860240169356, + "loss": 2.738332509994507, + "step": 10109, + "token_acc": 0.33614985293918326 + }, + { + "epoch": 5.926414541190267, + "grad_norm": 0.2541051735472812, + "learning_rate": 0.00016862155271760187, + "loss": 2.7286152839660645, + "step": 10110, + "token_acc": 0.3371387562449971 + }, + { + "epoch": 5.927000879507476, + "grad_norm": 0.27289537558375987, + "learning_rate": 0.00016861450238890314, + "loss": 2.6953213214874268, + "step": 10111, + "token_acc": 0.3430040074420237 + }, + { + "epoch": 5.927587217824685, + "grad_norm": 0.2504841583694696, + "learning_rate": 0.00016860745141566365, + "loss": 2.7200679779052734, + "step": 10112, + "token_acc": 0.33947174996804297 + }, + { + "epoch": 5.928173556141894, + "grad_norm": 0.23989532943745695, + "learning_rate": 0.00016860039979794958, + "loss": 2.68721866607666, + "step": 10113, + "token_acc": 0.34244721169463993 + }, + { + "epoch": 5.928759894459103, + "grad_norm": 0.24303221104789188, + "learning_rate": 0.00016859334753582724, + "loss": 2.7521440982818604, + "step": 10114, + "token_acc": 0.3336711281567576 + }, + { + "epoch": 5.929346232776312, + "grad_norm": 0.24885470575374802, + "learning_rate": 0.00016858629462936277, + "loss": 2.745771884918213, + "step": 10115, + "token_acc": 0.336010983460049 + }, + { + "epoch": 5.929932571093521, + "grad_norm": 0.2581521681386411, + "learning_rate": 0.00016857924107862248, + "loss": 2.704254150390625, + "step": 10116, + "token_acc": 0.3404411349918204 + }, + { + "epoch": 5.93051890941073, + "grad_norm": 0.26578524238338724, + "learning_rate": 0.00016857218688367268, + "loss": 2.7237706184387207, + "step": 10117, + "token_acc": 0.33910852425349725 + }, + { + "epoch": 5.931105247727939, + "grad_norm": 0.2522166406505705, + "learning_rate": 0.00016856513204457959, + "loss": 2.7263214588165283, + "step": 10118, + "token_acc": 0.3374403567678826 + }, + { + "epoch": 5.931691586045148, + "grad_norm": 0.2530443232199545, + "learning_rate": 0.00016855807656140941, + "loss": 2.722933769226074, + "step": 10119, + "token_acc": 0.33871392072044726 + }, + { + "epoch": 5.932277924362357, + "grad_norm": 0.27053339127944986, + "learning_rate": 0.00016855102043422852, + "loss": 2.7333338260650635, + "step": 10120, + "token_acc": 0.33559873377390875 + }, + { + "epoch": 5.9328642626795665, + "grad_norm": 0.2537423696113559, + "learning_rate": 0.00016854396366310317, + "loss": 2.7365479469299316, + "step": 10121, + "token_acc": 0.3346153238255299 + }, + { + "epoch": 5.933450600996775, + "grad_norm": 0.25255995469474407, + "learning_rate": 0.00016853690624809965, + "loss": 2.692580461502075, + "step": 10122, + "token_acc": 0.3423083300383914 + }, + { + "epoch": 5.934036939313984, + "grad_norm": 0.25241324414005767, + "learning_rate": 0.00016852984818928427, + "loss": 2.702218532562256, + "step": 10123, + "token_acc": 0.34203271930954526 + }, + { + "epoch": 5.934623277631193, + "grad_norm": 0.24392493538590038, + "learning_rate": 0.00016852278948672328, + "loss": 2.733661413192749, + "step": 10124, + "token_acc": 0.33667951084032816 + }, + { + "epoch": 5.935209615948402, + "grad_norm": 0.24434229797482795, + "learning_rate": 0.00016851573014048304, + "loss": 2.6804187297821045, + "step": 10125, + "token_acc": 0.34484381146813875 + }, + { + "epoch": 5.935795954265611, + "grad_norm": 0.24714498283598646, + "learning_rate": 0.0001685086701506298, + "loss": 2.6839091777801514, + "step": 10126, + "token_acc": 0.34314231005417956 + }, + { + "epoch": 5.93638229258282, + "grad_norm": 0.2604206156321687, + "learning_rate": 0.00016850160951722995, + "loss": 2.6907825469970703, + "step": 10127, + "token_acc": 0.3426993880640688 + }, + { + "epoch": 5.936968630900029, + "grad_norm": 0.25348794327519003, + "learning_rate": 0.0001684945482403498, + "loss": 2.722682476043701, + "step": 10128, + "token_acc": 0.33785346672063843 + }, + { + "epoch": 5.9375549692172385, + "grad_norm": 0.2663427991611868, + "learning_rate": 0.00016848748632005567, + "loss": 2.717118263244629, + "step": 10129, + "token_acc": 0.3382822151796745 + }, + { + "epoch": 5.938141307534448, + "grad_norm": 0.2592244041794843, + "learning_rate": 0.00016848042375641387, + "loss": 2.698159694671631, + "step": 10130, + "token_acc": 0.34121336067256647 + }, + { + "epoch": 5.938727645851657, + "grad_norm": 0.26502909876679615, + "learning_rate": 0.00016847336054949077, + "loss": 2.7254021167755127, + "step": 10131, + "token_acc": 0.33753504314402666 + }, + { + "epoch": 5.939313984168866, + "grad_norm": 0.3027665784760766, + "learning_rate": 0.0001684662966993527, + "loss": 2.7336504459381104, + "step": 10132, + "token_acc": 0.33650704678567445 + }, + { + "epoch": 5.939900322486075, + "grad_norm": 0.2943076997509595, + "learning_rate": 0.00016845923220606607, + "loss": 2.738840341567993, + "step": 10133, + "token_acc": 0.33701679740275453 + }, + { + "epoch": 5.940486660803283, + "grad_norm": 0.23510233266352326, + "learning_rate": 0.0001684521670696972, + "loss": 2.6735754013061523, + "step": 10134, + "token_acc": 0.3450651931211783 + }, + { + "epoch": 5.941072999120492, + "grad_norm": 0.28858344536940916, + "learning_rate": 0.00016844510129031243, + "loss": 2.7069296836853027, + "step": 10135, + "token_acc": 0.3413103807813627 + }, + { + "epoch": 5.941659337437701, + "grad_norm": 0.29836664348950076, + "learning_rate": 0.0001684380348679782, + "loss": 2.6930034160614014, + "step": 10136, + "token_acc": 0.342329442658203 + }, + { + "epoch": 5.9422456757549105, + "grad_norm": 0.24096952179874076, + "learning_rate": 0.00016843096780276082, + "loss": 2.750548839569092, + "step": 10137, + "token_acc": 0.33375187697405895 + }, + { + "epoch": 5.94283201407212, + "grad_norm": 0.2648365452386403, + "learning_rate": 0.00016842390009472674, + "loss": 2.7154617309570312, + "step": 10138, + "token_acc": 0.33912107303884664 + }, + { + "epoch": 5.943418352389329, + "grad_norm": 0.2731096608202849, + "learning_rate": 0.00016841683174394228, + "loss": 2.6951241493225098, + "step": 10139, + "token_acc": 0.3432894149173738 + }, + { + "epoch": 5.944004690706538, + "grad_norm": 0.2468093132454965, + "learning_rate": 0.0001684097627504739, + "loss": 2.680095911026001, + "step": 10140, + "token_acc": 0.3441169368227233 + }, + { + "epoch": 5.944591029023747, + "grad_norm": 0.23566015728538875, + "learning_rate": 0.000168402693114388, + "loss": 2.727893352508545, + "step": 10141, + "token_acc": 0.3377867698626128 + }, + { + "epoch": 5.945177367340956, + "grad_norm": 0.2659399442860299, + "learning_rate": 0.00016839562283575097, + "loss": 2.7441470623016357, + "step": 10142, + "token_acc": 0.3349252364561787 + }, + { + "epoch": 5.945763705658165, + "grad_norm": 0.2714657672663293, + "learning_rate": 0.00016838855191462918, + "loss": 2.703430414199829, + "step": 10143, + "token_acc": 0.3400894624091793 + }, + { + "epoch": 5.946350043975373, + "grad_norm": 0.24435987325458336, + "learning_rate": 0.00016838148035108917, + "loss": 2.699824333190918, + "step": 10144, + "token_acc": 0.3393396688442198 + }, + { + "epoch": 5.9469363822925825, + "grad_norm": 0.24733412993940038, + "learning_rate": 0.00016837440814519724, + "loss": 2.717195510864258, + "step": 10145, + "token_acc": 0.3383077331861708 + }, + { + "epoch": 5.947522720609792, + "grad_norm": 0.2515290675540613, + "learning_rate": 0.0001683673352970199, + "loss": 2.7203683853149414, + "step": 10146, + "token_acc": 0.3375116699467514 + }, + { + "epoch": 5.948109058927001, + "grad_norm": 0.2646140643081045, + "learning_rate": 0.00016836026180662357, + "loss": 2.728562355041504, + "step": 10147, + "token_acc": 0.33753807721901913 + }, + { + "epoch": 5.94869539724421, + "grad_norm": 0.2498993709438596, + "learning_rate": 0.0001683531876740747, + "loss": 2.753882884979248, + "step": 10148, + "token_acc": 0.3342149123512168 + }, + { + "epoch": 5.949281735561419, + "grad_norm": 0.2594515677619726, + "learning_rate": 0.0001683461128994398, + "loss": 2.735823631286621, + "step": 10149, + "token_acc": 0.335089879340064 + }, + { + "epoch": 5.949868073878628, + "grad_norm": 0.23983166408656292, + "learning_rate": 0.00016833903748278517, + "loss": 2.699150562286377, + "step": 10150, + "token_acc": 0.34081424230580104 + }, + { + "epoch": 5.950454412195837, + "grad_norm": 0.26590181742368757, + "learning_rate": 0.0001683319614241774, + "loss": 2.707749128341675, + "step": 10151, + "token_acc": 0.34069754168958255 + }, + { + "epoch": 5.951040750513046, + "grad_norm": 0.2631816576989613, + "learning_rate": 0.00016832488472368296, + "loss": 2.7378439903259277, + "step": 10152, + "token_acc": 0.3353515922696478 + }, + { + "epoch": 5.951627088830255, + "grad_norm": 0.2568349334883304, + "learning_rate": 0.00016831780738136827, + "loss": 2.724374294281006, + "step": 10153, + "token_acc": 0.33815388002893904 + }, + { + "epoch": 5.9522134271474645, + "grad_norm": 0.25290173775604213, + "learning_rate": 0.00016831072939729985, + "loss": 2.75616455078125, + "step": 10154, + "token_acc": 0.3337582123407342 + }, + { + "epoch": 5.952799765464674, + "grad_norm": 0.2510819112805984, + "learning_rate": 0.00016830365077154415, + "loss": 2.746598720550537, + "step": 10155, + "token_acc": 0.3346535841027553 + }, + { + "epoch": 5.953386103781882, + "grad_norm": 0.2570786773194706, + "learning_rate": 0.0001682965715041677, + "loss": 2.715606927871704, + "step": 10156, + "token_acc": 0.3382761914033209 + }, + { + "epoch": 5.953972442099091, + "grad_norm": 0.25310745334052026, + "learning_rate": 0.00016828949159523703, + "loss": 2.718245029449463, + "step": 10157, + "token_acc": 0.33891849918916495 + }, + { + "epoch": 5.9545587804163, + "grad_norm": 0.2643569262687907, + "learning_rate": 0.00016828241104481858, + "loss": 2.7268900871276855, + "step": 10158, + "token_acc": 0.3371385420599685 + }, + { + "epoch": 5.955145118733509, + "grad_norm": 0.27702447603389446, + "learning_rate": 0.0001682753298529789, + "loss": 2.727816581726074, + "step": 10159, + "token_acc": 0.33719018124850864 + }, + { + "epoch": 5.955731457050718, + "grad_norm": 0.295007505266152, + "learning_rate": 0.0001682682480197845, + "loss": 2.742077589035034, + "step": 10160, + "token_acc": 0.3336946370081951 + }, + { + "epoch": 5.956317795367927, + "grad_norm": 0.2786256298859136, + "learning_rate": 0.00016826116554530187, + "loss": 2.7045798301696777, + "step": 10161, + "token_acc": 0.3411555187192144 + }, + { + "epoch": 5.9569041336851365, + "grad_norm": 0.2542391678454758, + "learning_rate": 0.0001682540824295976, + "loss": 2.7173380851745605, + "step": 10162, + "token_acc": 0.33700387382108304 + }, + { + "epoch": 5.957490472002346, + "grad_norm": 0.2677545372647092, + "learning_rate": 0.0001682469986727382, + "loss": 2.689178943634033, + "step": 10163, + "token_acc": 0.3434429090345015 + }, + { + "epoch": 5.958076810319555, + "grad_norm": 0.3021554761012849, + "learning_rate": 0.00016823991427479017, + "loss": 2.7516028881073, + "step": 10164, + "token_acc": 0.3331225143910727 + }, + { + "epoch": 5.958663148636763, + "grad_norm": 0.27981576317063545, + "learning_rate": 0.00016823282923582019, + "loss": 2.751741647720337, + "step": 10165, + "token_acc": 0.3335085809696918 + }, + { + "epoch": 5.959249486953972, + "grad_norm": 0.26273494458440705, + "learning_rate": 0.00016822574355589465, + "loss": 2.7154407501220703, + "step": 10166, + "token_acc": 0.33911059166832846 + }, + { + "epoch": 5.959835825271181, + "grad_norm": 0.2986589720688196, + "learning_rate": 0.00016821865723508022, + "loss": 2.7141833305358887, + "step": 10167, + "token_acc": 0.33851671780974774 + }, + { + "epoch": 5.96042216358839, + "grad_norm": 0.3029207539838716, + "learning_rate": 0.0001682115702734434, + "loss": 2.704883575439453, + "step": 10168, + "token_acc": 0.3398980263934775 + }, + { + "epoch": 5.961008501905599, + "grad_norm": 0.25979838496000174, + "learning_rate": 0.0001682044826710508, + "loss": 2.704939126968384, + "step": 10169, + "token_acc": 0.34141642824132973 + }, + { + "epoch": 5.9615948402228085, + "grad_norm": 0.395455216654233, + "learning_rate": 0.00016819739442796902, + "loss": 2.756603717803955, + "step": 10170, + "token_acc": 0.3303579438413923 + }, + { + "epoch": 5.962181178540018, + "grad_norm": 0.29430979921994643, + "learning_rate": 0.00016819030554426459, + "loss": 2.71714448928833, + "step": 10171, + "token_acc": 0.33845494645307483 + }, + { + "epoch": 5.962767516857227, + "grad_norm": 0.2950663206671342, + "learning_rate": 0.00016818321602000413, + "loss": 2.7282814979553223, + "step": 10172, + "token_acc": 0.3377189158696377 + }, + { + "epoch": 5.963353855174436, + "grad_norm": 0.32582751939322346, + "learning_rate": 0.00016817612585525425, + "loss": 2.7257494926452637, + "step": 10173, + "token_acc": 0.338313277181387 + }, + { + "epoch": 5.963940193491645, + "grad_norm": 0.26229508108503635, + "learning_rate": 0.0001681690350500815, + "loss": 2.739900827407837, + "step": 10174, + "token_acc": 0.3351960028580282 + }, + { + "epoch": 5.964526531808854, + "grad_norm": 0.316062294610996, + "learning_rate": 0.00016816194360455255, + "loss": 2.7042412757873535, + "step": 10175, + "token_acc": 0.33954629191566515 + }, + { + "epoch": 5.965112870126063, + "grad_norm": 0.24812786199407416, + "learning_rate": 0.000168154851518734, + "loss": 2.7209177017211914, + "step": 10176, + "token_acc": 0.3394989617487773 + }, + { + "epoch": 5.965699208443271, + "grad_norm": 0.3730941020562639, + "learning_rate": 0.00016814775879269247, + "loss": 2.7558434009552, + "step": 10177, + "token_acc": 0.3322239895591174 + }, + { + "epoch": 5.9662855467604805, + "grad_norm": 0.2582665561513139, + "learning_rate": 0.00016814066542649453, + "loss": 2.7265844345092773, + "step": 10178, + "token_acc": 0.3356349214731572 + }, + { + "epoch": 5.96687188507769, + "grad_norm": 0.3161310886194945, + "learning_rate": 0.00016813357142020689, + "loss": 2.7031235694885254, + "step": 10179, + "token_acc": 0.3423281820868433 + }, + { + "epoch": 5.967458223394899, + "grad_norm": 0.2545084193008516, + "learning_rate": 0.00016812647677389616, + "loss": 2.649871826171875, + "step": 10180, + "token_acc": 0.3487021071794765 + }, + { + "epoch": 5.968044561712108, + "grad_norm": 0.32581494917799214, + "learning_rate": 0.00016811938148762897, + "loss": 2.7154979705810547, + "step": 10181, + "token_acc": 0.3380478176340511 + }, + { + "epoch": 5.968630900029317, + "grad_norm": 0.2659309235677399, + "learning_rate": 0.00016811228556147198, + "loss": 2.7338480949401855, + "step": 10182, + "token_acc": 0.3361925209763037 + }, + { + "epoch": 5.969217238346526, + "grad_norm": 0.3053709233765362, + "learning_rate": 0.00016810518899549188, + "loss": 2.7382726669311523, + "step": 10183, + "token_acc": 0.335295700285298 + }, + { + "epoch": 5.969803576663735, + "grad_norm": 0.24617108457007933, + "learning_rate": 0.00016809809178975526, + "loss": 2.712703227996826, + "step": 10184, + "token_acc": 0.34116178867473895 + }, + { + "epoch": 5.970389914980944, + "grad_norm": 0.2840253339153727, + "learning_rate": 0.00016809099394432883, + "loss": 2.7629711627960205, + "step": 10185, + "token_acc": 0.3304271246147072 + }, + { + "epoch": 5.970976253298153, + "grad_norm": 0.2578011520710332, + "learning_rate": 0.0001680838954592793, + "loss": 2.7712347507476807, + "step": 10186, + "token_acc": 0.3316614953032686 + }, + { + "epoch": 5.971562591615362, + "grad_norm": 0.2797721129236242, + "learning_rate": 0.0001680767963346733, + "loss": 2.7358903884887695, + "step": 10187, + "token_acc": 0.3354225150676748 + }, + { + "epoch": 5.972148929932571, + "grad_norm": 0.26448160579548713, + "learning_rate": 0.00016806969657057755, + "loss": 2.7082791328430176, + "step": 10188, + "token_acc": 0.34017997585689724 + }, + { + "epoch": 5.97273526824978, + "grad_norm": 0.28639745722024423, + "learning_rate": 0.00016806259616705872, + "loss": 2.6926002502441406, + "step": 10189, + "token_acc": 0.34232629509111445 + }, + { + "epoch": 5.973321606566989, + "grad_norm": 0.25117330132637294, + "learning_rate": 0.00016805549512418348, + "loss": 2.7002205848693848, + "step": 10190, + "token_acc": 0.3402950698007703 + }, + { + "epoch": 5.973907944884198, + "grad_norm": 0.26488006377207257, + "learning_rate": 0.0001680483934420186, + "loss": 2.7291829586029053, + "step": 10191, + "token_acc": 0.33692652684342433 + }, + { + "epoch": 5.974494283201407, + "grad_norm": 0.2484832530770182, + "learning_rate": 0.00016804129112063076, + "loss": 2.7229056358337402, + "step": 10192, + "token_acc": 0.33814306863511123 + }, + { + "epoch": 5.975080621518616, + "grad_norm": 0.2508066362067838, + "learning_rate": 0.00016803418816008667, + "loss": 2.7284324169158936, + "step": 10193, + "token_acc": 0.3374094362961654 + }, + { + "epoch": 5.975666959835825, + "grad_norm": 0.24051638550463983, + "learning_rate": 0.00016802708456045305, + "loss": 2.7369537353515625, + "step": 10194, + "token_acc": 0.3355770618210907 + }, + { + "epoch": 5.9762532981530345, + "grad_norm": 0.24410167997947352, + "learning_rate": 0.00016801998032179663, + "loss": 2.72306752204895, + "step": 10195, + "token_acc": 0.33823716420282945 + }, + { + "epoch": 5.976839636470244, + "grad_norm": 0.24563474651349018, + "learning_rate": 0.00016801287544418418, + "loss": 2.742371082305908, + "step": 10196, + "token_acc": 0.33539072633367345 + }, + { + "epoch": 5.977425974787453, + "grad_norm": 0.25293949088856804, + "learning_rate": 0.00016800576992768242, + "loss": 2.742272138595581, + "step": 10197, + "token_acc": 0.33645290090167707 + }, + { + "epoch": 5.978012313104662, + "grad_norm": 0.2694217609201809, + "learning_rate": 0.00016799866377235808, + "loss": 2.7645301818847656, + "step": 10198, + "token_acc": 0.3323620306724383 + }, + { + "epoch": 5.97859865142187, + "grad_norm": 0.24555958465568534, + "learning_rate": 0.0001679915569782779, + "loss": 2.727877140045166, + "step": 10199, + "token_acc": 0.33814461196160267 + }, + { + "epoch": 5.979184989739079, + "grad_norm": 0.31705340735516746, + "learning_rate": 0.00016798444954550868, + "loss": 2.757765054702759, + "step": 10200, + "token_acc": 0.33307569338559995 + }, + { + "epoch": 5.979771328056288, + "grad_norm": 0.3158558866135394, + "learning_rate": 0.0001679773414741172, + "loss": 2.6718242168426514, + "step": 10201, + "token_acc": 0.3452937102394126 + }, + { + "epoch": 5.980357666373497, + "grad_norm": 0.24459783158543644, + "learning_rate": 0.00016797023276417017, + "loss": 2.762197494506836, + "step": 10202, + "token_acc": 0.33071679405081594 + }, + { + "epoch": 5.9809440046907065, + "grad_norm": 0.35245369449167063, + "learning_rate": 0.00016796312341573434, + "loss": 2.7274842262268066, + "step": 10203, + "token_acc": 0.33664886359811835 + }, + { + "epoch": 5.981530343007916, + "grad_norm": 0.3059113994481763, + "learning_rate": 0.00016795601342887664, + "loss": 2.7295093536376953, + "step": 10204, + "token_acc": 0.3367064850066305 + }, + { + "epoch": 5.982116681325125, + "grad_norm": 0.2800803667709282, + "learning_rate": 0.0001679489028036637, + "loss": 2.7052507400512695, + "step": 10205, + "token_acc": 0.3412273052645279 + }, + { + "epoch": 5.982703019642334, + "grad_norm": 0.3617578018394989, + "learning_rate": 0.00016794179154016242, + "loss": 2.7270894050598145, + "step": 10206, + "token_acc": 0.33678452723231583 + }, + { + "epoch": 5.983289357959543, + "grad_norm": 0.2446705153648211, + "learning_rate": 0.00016793467963843953, + "loss": 2.7204036712646484, + "step": 10207, + "token_acc": 0.3384542896407303 + }, + { + "epoch": 5.983875696276751, + "grad_norm": 0.2962658203937414, + "learning_rate": 0.00016792756709856188, + "loss": 2.753791570663452, + "step": 10208, + "token_acc": 0.33443397290771926 + }, + { + "epoch": 5.98446203459396, + "grad_norm": 0.2392133190442609, + "learning_rate": 0.00016792045392059627, + "loss": 2.73746919631958, + "step": 10209, + "token_acc": 0.3362744608298081 + }, + { + "epoch": 5.985048372911169, + "grad_norm": 0.27775266005779287, + "learning_rate": 0.00016791334010460952, + "loss": 2.734940528869629, + "step": 10210, + "token_acc": 0.3354403208942055 + }, + { + "epoch": 5.9856347112283785, + "grad_norm": 0.26276167013246793, + "learning_rate": 0.00016790622565066848, + "loss": 2.7292447090148926, + "step": 10211, + "token_acc": 0.3374857438949148 + }, + { + "epoch": 5.986221049545588, + "grad_norm": 0.2687631395065838, + "learning_rate": 0.00016789911055883994, + "loss": 2.7581610679626465, + "step": 10212, + "token_acc": 0.3334221451944592 + }, + { + "epoch": 5.986807387862797, + "grad_norm": 0.2501641565992109, + "learning_rate": 0.00016789199482919071, + "loss": 2.7086219787597656, + "step": 10213, + "token_acc": 0.339484948785041 + }, + { + "epoch": 5.987393726180006, + "grad_norm": 0.2799095182632129, + "learning_rate": 0.00016788487846178772, + "loss": 2.7407708168029785, + "step": 10214, + "token_acc": 0.33759326194651984 + }, + { + "epoch": 5.987980064497215, + "grad_norm": 0.26749523677276577, + "learning_rate": 0.00016787776145669775, + "loss": 2.722445011138916, + "step": 10215, + "token_acc": 0.33732403250733306 + }, + { + "epoch": 5.988566402814424, + "grad_norm": 0.2486040432261087, + "learning_rate": 0.0001678706438139877, + "loss": 2.7272770404815674, + "step": 10216, + "token_acc": 0.337614079660969 + }, + { + "epoch": 5.989152741131633, + "grad_norm": 0.27012289743636514, + "learning_rate": 0.0001678635255337244, + "loss": 2.744518995285034, + "step": 10217, + "token_acc": 0.3362216102449189 + }, + { + "epoch": 5.989739079448842, + "grad_norm": 0.25075233346649883, + "learning_rate": 0.00016785640661597467, + "loss": 2.710411548614502, + "step": 10218, + "token_acc": 0.34014015896523714 + }, + { + "epoch": 5.990325417766051, + "grad_norm": 0.2751869613546627, + "learning_rate": 0.00016784928706080552, + "loss": 2.7205638885498047, + "step": 10219, + "token_acc": 0.3381181954079432 + }, + { + "epoch": 5.99091175608326, + "grad_norm": 0.2510417853898608, + "learning_rate": 0.0001678421668682837, + "loss": 2.706225872039795, + "step": 10220, + "token_acc": 0.34136128655702164 + }, + { + "epoch": 5.991498094400469, + "grad_norm": 0.2627004683909609, + "learning_rate": 0.00016783504603847614, + "loss": 2.705753803253174, + "step": 10221, + "token_acc": 0.3402494442180625 + }, + { + "epoch": 5.992084432717678, + "grad_norm": 0.2561212305433454, + "learning_rate": 0.0001678279245714498, + "loss": 2.7705225944519043, + "step": 10222, + "token_acc": 0.3310022175901504 + }, + { + "epoch": 5.992670771034887, + "grad_norm": 0.268456607013711, + "learning_rate": 0.00016782080246727143, + "loss": 2.6961536407470703, + "step": 10223, + "token_acc": 0.34109681752023185 + }, + { + "epoch": 5.993257109352096, + "grad_norm": 0.2481218322257767, + "learning_rate": 0.00016781367972600804, + "loss": 2.6990301609039307, + "step": 10224, + "token_acc": 0.34103218169247207 + }, + { + "epoch": 5.993843447669305, + "grad_norm": 0.2596926162538201, + "learning_rate": 0.0001678065563477265, + "loss": 2.711716413497925, + "step": 10225, + "token_acc": 0.3403301953832597 + }, + { + "epoch": 5.994429785986514, + "grad_norm": 0.264815682982942, + "learning_rate": 0.00016779943233249372, + "loss": 2.745940685272217, + "step": 10226, + "token_acc": 0.33574427455955635 + }, + { + "epoch": 5.995016124303723, + "grad_norm": 0.25475044965603727, + "learning_rate": 0.00016779230768037667, + "loss": 2.7270400524139404, + "step": 10227, + "token_acc": 0.33782008491095034 + }, + { + "epoch": 5.9956024626209325, + "grad_norm": 0.2866190072061432, + "learning_rate": 0.00016778518239144222, + "loss": 2.698831081390381, + "step": 10228, + "token_acc": 0.3416493743318871 + }, + { + "epoch": 5.996188800938142, + "grad_norm": 0.27933860036912594, + "learning_rate": 0.0001677780564657573, + "loss": 2.727341651916504, + "step": 10229, + "token_acc": 0.33891827585854767 + }, + { + "epoch": 5.99677513925535, + "grad_norm": 0.2560384106467328, + "learning_rate": 0.00016777092990338888, + "loss": 2.720071792602539, + "step": 10230, + "token_acc": 0.33778281933362697 + }, + { + "epoch": 5.997361477572559, + "grad_norm": 0.26400750208288487, + "learning_rate": 0.00016776380270440395, + "loss": 2.7431836128234863, + "step": 10231, + "token_acc": 0.33481992703110036 + }, + { + "epoch": 5.997947815889768, + "grad_norm": 0.26428567522865265, + "learning_rate": 0.00016775667486886936, + "loss": 2.7432212829589844, + "step": 10232, + "token_acc": 0.33454362518865244 + }, + { + "epoch": 5.998534154206977, + "grad_norm": 0.2566944884692843, + "learning_rate": 0.00016774954639685213, + "loss": 2.712904453277588, + "step": 10233, + "token_acc": 0.3389582481905791 + }, + { + "epoch": 5.999120492524186, + "grad_norm": 0.24888684863633628, + "learning_rate": 0.00016774241728841917, + "loss": 2.7438740730285645, + "step": 10234, + "token_acc": 0.3357275316071155 + }, + { + "epoch": 5.999706830841395, + "grad_norm": 0.28585631802949457, + "learning_rate": 0.00016773528754363755, + "loss": 2.7697067260742188, + "step": 10235, + "token_acc": 0.33190997664064525 + }, + { + "epoch": 6.0, + "grad_norm": 0.33256047208254136, + "learning_rate": 0.00016772815716257412, + "loss": 2.7731783390045166, + "step": 10236, + "token_acc": 0.3298436325828933 + }, + { + "epoch": 6.0, + "eval_loss": 3.1927804946899414, + "eval_runtime": 16.7898, + "eval_samples_per_second": 15.247, + "eval_steps_per_second": 1.906, + "eval_token_acc": 0.2776745537640563, + "step": 10236 + } + ], + "logging_steps": 1, + "max_steps": 34120, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": -34120, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7329911390339072.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}