diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,17494 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.547446153846154, + "eval_steps": 500, + "global_step": 19400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.32, + "grad_norm": 0.48755398392677307, + "learning_rate": 9e-06, + "loss": 2.9601, + "mean_token_accuracy": 0.4284199851565063, + "num_tokens": 15763.0, + "step": 10 + }, + { + "epoch": 0.64, + "grad_norm": 0.7431631088256836, + "learning_rate": 1.9e-05, + "loss": 3.1197, + "mean_token_accuracy": 0.42047689845785496, + "num_tokens": 30510.0, + "step": 20 + }, + { + "epoch": 0.96, + "grad_norm": 0.9281144142150879, + "learning_rate": 1.9999959867760483e-05, + "loss": 3.0392, + "mean_token_accuracy": 0.41553077606949956, + "num_tokens": 44066.0, + "step": 30 + }, + { + "epoch": 1.256, + "grad_norm": 1.2322081327438354, + "learning_rate": 1.999982113944484e-05, + "loss": 3.3331, + "mean_token_accuracy": 0.4135566469583962, + "num_tokens": 58784.0, + "step": 40 + }, + { + "epoch": 1.576, + "grad_norm": 7.1544270515441895, + "learning_rate": 1.9999583320967683e-05, + "loss": 3.092, + "mean_token_accuracy": 0.42304785093292596, + "num_tokens": 74305.0, + "step": 50 + }, + { + "epoch": 1.896, + "grad_norm": 0.8600693345069885, + "learning_rate": 1.99992464146856e-05, + "loss": 2.8575, + "mean_token_accuracy": 0.4428128655999899, + "num_tokens": 88582.0, + "step": 60 + }, + { + "epoch": 2.192, + "grad_norm": 1.3089476823806763, + "learning_rate": 1.999881042393706e-05, + "loss": 3.2266, + "mean_token_accuracy": 0.43124929654437144, + "num_tokens": 101701.0, + "step": 70 + }, + { + "epoch": 2.512, + "grad_norm": 0.6948946714401245, + "learning_rate": 1.9998275353042377e-05, + "loss": 2.6626, + "mean_token_accuracy": 0.45563504602760074, + "num_tokens": 116845.0, + "step": 80 + }, + { + "epoch": 2.832, + "grad_norm": 1.0951130390167236, + "learning_rate": 1.999764120730368e-05, + "loss": 2.6061, + "mean_token_accuracy": 0.4799388902261853, + "num_tokens": 131642.0, + "step": 90 + }, + { + "epoch": 3.128, + "grad_norm": 2.2256667613983154, + "learning_rate": 1.9996907993004836e-05, + "loss": 2.5437, + "mean_token_accuracy": 0.4792116602530351, + "num_tokens": 144071.0, + "step": 100 + }, + { + "epoch": 3.448, + "grad_norm": 1.3083274364471436, + "learning_rate": 1.9996075717411405e-05, + "loss": 2.3134, + "mean_token_accuracy": 0.49251377964392307, + "num_tokens": 160671.0, + "step": 110 + }, + { + "epoch": 3.768, + "grad_norm": 1.0546499490737915, + "learning_rate": 1.9995144388770577e-05, + "loss": 2.5219, + "mean_token_accuracy": 0.49237193521112205, + "num_tokens": 174208.0, + "step": 120 + }, + { + "epoch": 4.064, + "grad_norm": 1.3395127058029175, + "learning_rate": 1.9994114016311053e-05, + "loss": 2.6405, + "mean_token_accuracy": 0.494740814977401, + "num_tokens": 189174.0, + "step": 130 + }, + { + "epoch": 4.384, + "grad_norm": 0.9926703572273254, + "learning_rate": 1.9992984610243006e-05, + "loss": 2.2636, + "mean_token_accuracy": 0.5148700190708041, + "num_tokens": 205614.0, + "step": 140 + }, + { + "epoch": 4.704, + "grad_norm": 2.0030810832977295, + "learning_rate": 1.9991756181757936e-05, + "loss": 2.2209, + "mean_token_accuracy": 0.5199183862656355, + "num_tokens": 220053.0, + "step": 150 + }, + { + "epoch": 5.0, + "grad_norm": 3.6073784828186035, + "learning_rate": 1.999042874302857e-05, + "loss": 2.3257, + "mean_token_accuracy": 0.5087871091389978, + "num_tokens": 232090.0, + "step": 160 + }, + { + "epoch": 5.32, + "grad_norm": 0.9835549592971802, + "learning_rate": 1.9989002307208767e-05, + "loss": 2.0461, + "mean_token_accuracy": 0.5411571308970451, + "num_tokens": 247494.0, + "step": 170 + }, + { + "epoch": 5.64, + "grad_norm": 1.3350774049758911, + "learning_rate": 1.998747688843335e-05, + "loss": 2.1222, + "mean_token_accuracy": 0.5401662968099117, + "num_tokens": 261394.0, + "step": 180 + }, + { + "epoch": 5.96, + "grad_norm": 1.4606289863586426, + "learning_rate": 1.9985852501817985e-05, + "loss": 2.0423, + "mean_token_accuracy": 0.5442505508661271, + "num_tokens": 276061.0, + "step": 190 + }, + { + "epoch": 6.256, + "grad_norm": 2.1242387294769287, + "learning_rate": 1.998412916345904e-05, + "loss": 2.1295, + "mean_token_accuracy": 0.5391830210951535, + "num_tokens": 289693.0, + "step": 200 + }, + { + "epoch": 6.576, + "grad_norm": 1.4135829210281372, + "learning_rate": 1.99823068904334e-05, + "loss": 1.9505, + "mean_token_accuracy": 0.5633615963160992, + "num_tokens": 304290.0, + "step": 210 + }, + { + "epoch": 6.896, + "grad_norm": 1.05637526512146, + "learning_rate": 1.998038570079833e-05, + "loss": 2.0059, + "mean_token_accuracy": 0.569461640715599, + "num_tokens": 319354.0, + "step": 220 + }, + { + "epoch": 7.192, + "grad_norm": 1.409026026725769, + "learning_rate": 1.9978365613591263e-05, + "loss": 1.8998, + "mean_token_accuracy": 0.5667525610408267, + "num_tokens": 333444.0, + "step": 230 + }, + { + "epoch": 7.5120000000000005, + "grad_norm": 0.9068111181259155, + "learning_rate": 1.9976246648829636e-05, + "loss": 1.6992, + "mean_token_accuracy": 0.6070717331022024, + "num_tokens": 349035.0, + "step": 240 + }, + { + "epoch": 7.832, + "grad_norm": 1.7179079055786133, + "learning_rate": 1.997402882751068e-05, + "loss": 1.898, + "mean_token_accuracy": 0.5741809576749801, + "num_tokens": 363648.0, + "step": 250 + }, + { + "epoch": 8.128, + "grad_norm": 2.5260181427001953, + "learning_rate": 1.997171217161122e-05, + "loss": 1.8262, + "mean_token_accuracy": 0.6014102785973936, + "num_tokens": 377215.0, + "step": 260 + }, + { + "epoch": 8.448, + "grad_norm": 1.0469582080841064, + "learning_rate": 1.996929670408744e-05, + "loss": 1.7423, + "mean_token_accuracy": 0.6009339291602374, + "num_tokens": 394237.0, + "step": 270 + }, + { + "epoch": 8.768, + "grad_norm": 2.0109641551971436, + "learning_rate": 1.996678244887468e-05, + "loss": 1.844, + "mean_token_accuracy": 0.5853462919592858, + "num_tokens": 407239.0, + "step": 280 + }, + { + "epoch": 9.064, + "grad_norm": 2.3061084747314453, + "learning_rate": 1.9964169430887174e-05, + "loss": 1.7935, + "mean_token_accuracy": 0.5961252535517151, + "num_tokens": 420770.0, + "step": 290 + }, + { + "epoch": 9.384, + "grad_norm": 1.5985437631607056, + "learning_rate": 1.9961457676017833e-05, + "loss": 1.703, + "mean_token_accuracy": 0.6001830734312534, + "num_tokens": 435005.0, + "step": 300 + }, + { + "epoch": 9.704, + "grad_norm": 1.0775606632232666, + "learning_rate": 1.9958647211137952e-05, + "loss": 1.7104, + "mean_token_accuracy": 0.6070772130973637, + "num_tokens": 449192.0, + "step": 310 + }, + { + "epoch": 10.0, + "grad_norm": 4.900282859802246, + "learning_rate": 1.9955738064096975e-05, + "loss": 1.7131, + "mean_token_accuracy": 0.6122549969602276, + "num_tokens": 464180.0, + "step": 320 + }, + { + "epoch": 10.32, + "grad_norm": 4.266648769378662, + "learning_rate": 1.9952730263722205e-05, + "loss": 1.6526, + "mean_token_accuracy": 0.6316778633743525, + "num_tokens": 480765.0, + "step": 330 + }, + { + "epoch": 10.64, + "grad_norm": 1.3835431337356567, + "learning_rate": 1.994962383981851e-05, + "loss": 1.5473, + "mean_token_accuracy": 0.6267588481307029, + "num_tokens": 495929.0, + "step": 340 + }, + { + "epoch": 10.96, + "grad_norm": 0.9430285096168518, + "learning_rate": 1.9946418823168053e-05, + "loss": 1.7158, + "mean_token_accuracy": 0.6107403030619025, + "num_tokens": 508555.0, + "step": 350 + }, + { + "epoch": 11.256, + "grad_norm": 1.7163220643997192, + "learning_rate": 1.994311524552996e-05, + "loss": 1.557, + "mean_token_accuracy": 0.6259392479787002, + "num_tokens": 522011.0, + "step": 360 + }, + { + "epoch": 11.576, + "grad_norm": 1.8388807773590088, + "learning_rate": 1.993971313964002e-05, + "loss": 1.6451, + "mean_token_accuracy": 0.6252246461808681, + "num_tokens": 535811.0, + "step": 370 + }, + { + "epoch": 11.896, + "grad_norm": 2.878235101699829, + "learning_rate": 1.993621253921036e-05, + "loss": 1.6015, + "mean_token_accuracy": 0.6341628909111023, + "num_tokens": 551848.0, + "step": 380 + }, + { + "epoch": 12.192, + "grad_norm": 8.73257064819336, + "learning_rate": 1.9932613478929103e-05, + "loss": 1.6087, + "mean_token_accuracy": 0.6341016663087381, + "num_tokens": 566187.0, + "step": 390 + }, + { + "epoch": 12.512, + "grad_norm": 1.6711736917495728, + "learning_rate": 1.9928915994460037e-05, + "loss": 1.4662, + "mean_token_accuracy": 0.641170359775424, + "num_tokens": 579340.0, + "step": 400 + }, + { + "epoch": 12.832, + "grad_norm": 1.2355554103851318, + "learning_rate": 1.9925120122442253e-05, + "loss": 1.4234, + "mean_token_accuracy": 0.6438105596229434, + "num_tokens": 595449.0, + "step": 410 + }, + { + "epoch": 13.128, + "grad_norm": 1.1492340564727783, + "learning_rate": 1.9921225900489776e-05, + "loss": 1.5106, + "mean_token_accuracy": 0.6468948456081184, + "num_tokens": 610726.0, + "step": 420 + }, + { + "epoch": 13.448, + "grad_norm": 2.4454128742218018, + "learning_rate": 1.9917233367191205e-05, + "loss": 1.4301, + "mean_token_accuracy": 0.6485198132693768, + "num_tokens": 624770.0, + "step": 430 + }, + { + "epoch": 13.768, + "grad_norm": 1.809606909751892, + "learning_rate": 1.9913142562109328e-05, + "loss": 1.4744, + "mean_token_accuracy": 0.656716751307249, + "num_tokens": 640635.0, + "step": 440 + }, + { + "epoch": 14.064, + "grad_norm": 1.8572089672088623, + "learning_rate": 1.990895352578072e-05, + "loss": 1.4747, + "mean_token_accuracy": 0.6568980690192532, + "num_tokens": 652969.0, + "step": 450 + }, + { + "epoch": 14.384, + "grad_norm": 1.4431779384613037, + "learning_rate": 1.9904666299715357e-05, + "loss": 1.3779, + "mean_token_accuracy": 0.6651028156280517, + "num_tokens": 667518.0, + "step": 460 + }, + { + "epoch": 14.704, + "grad_norm": 1.1659435033798218, + "learning_rate": 1.9900280926396186e-05, + "loss": 1.4853, + "mean_token_accuracy": 0.6542218446731567, + "num_tokens": 682093.0, + "step": 470 + }, + { + "epoch": 15.0, + "grad_norm": 2.4632649421691895, + "learning_rate": 1.989579744927872e-05, + "loss": 1.5292, + "mean_token_accuracy": 0.6373760857292123, + "num_tokens": 696270.0, + "step": 480 + }, + { + "epoch": 15.32, + "grad_norm": 1.2648308277130127, + "learning_rate": 1.98912159127906e-05, + "loss": 1.5282, + "mean_token_accuracy": 0.6369021199643612, + "num_tokens": 711645.0, + "step": 490 + }, + { + "epoch": 15.64, + "grad_norm": 1.6591901779174805, + "learning_rate": 1.988653636233116e-05, + "loss": 1.3674, + "mean_token_accuracy": 0.6757966171950102, + "num_tokens": 724986.0, + "step": 500 + }, + { + "epoch": 15.96, + "grad_norm": 1.3664172887802124, + "learning_rate": 1.988175884427097e-05, + "loss": 1.3619, + "mean_token_accuracy": 0.6697750940918923, + "num_tokens": 740314.0, + "step": 510 + }, + { + "epoch": 16.256, + "grad_norm": 1.3944075107574463, + "learning_rate": 1.9876883405951378e-05, + "loss": 1.4199, + "mean_token_accuracy": 0.6569653801821373, + "num_tokens": 754126.0, + "step": 520 + }, + { + "epoch": 16.576, + "grad_norm": 1.846074104309082, + "learning_rate": 1.987191009568405e-05, + "loss": 1.3542, + "mean_token_accuracy": 0.6702545773237943, + "num_tokens": 770447.0, + "step": 530 + }, + { + "epoch": 16.896, + "grad_norm": 1.4818017482757568, + "learning_rate": 1.9866838962750473e-05, + "loss": 1.3385, + "mean_token_accuracy": 0.673089163005352, + "num_tokens": 784145.0, + "step": 540 + }, + { + "epoch": 17.192, + "grad_norm": 1.6503247022628784, + "learning_rate": 1.986167005740149e-05, + "loss": 1.4058, + "mean_token_accuracy": 0.6697660020074329, + "num_tokens": 797632.0, + "step": 550 + }, + { + "epoch": 17.512, + "grad_norm": 1.7759121656417847, + "learning_rate": 1.985640343085678e-05, + "loss": 1.485, + "mean_token_accuracy": 0.6630586348474026, + "num_tokens": 812748.0, + "step": 560 + }, + { + "epoch": 17.832, + "grad_norm": 1.6328964233398438, + "learning_rate": 1.9851039135304366e-05, + "loss": 1.3464, + "mean_token_accuracy": 0.673164501786232, + "num_tokens": 827388.0, + "step": 570 + }, + { + "epoch": 18.128, + "grad_norm": 1.394505500793457, + "learning_rate": 1.9845577223900087e-05, + "loss": 1.3223, + "mean_token_accuracy": 0.6847520865298606, + "num_tokens": 842219.0, + "step": 580 + }, + { + "epoch": 18.448, + "grad_norm": 1.541831374168396, + "learning_rate": 1.984001775076708e-05, + "loss": 1.3222, + "mean_token_accuracy": 0.6771992217749357, + "num_tokens": 857904.0, + "step": 590 + }, + { + "epoch": 18.768, + "grad_norm": 1.4715123176574707, + "learning_rate": 1.983436077099524e-05, + "loss": 1.3623, + "mean_token_accuracy": 0.6771474566310645, + "num_tokens": 871758.0, + "step": 600 + }, + { + "epoch": 19.064, + "grad_norm": 1.244395136833191, + "learning_rate": 1.9828606340640678e-05, + "loss": 1.3194, + "mean_token_accuracy": 0.6701785076308895, + "num_tokens": 885177.0, + "step": 610 + }, + { + "epoch": 19.384, + "grad_norm": 3.2594940662384033, + "learning_rate": 1.9822754516725148e-05, + "loss": 1.3986, + "mean_token_accuracy": 0.6804742000997066, + "num_tokens": 900412.0, + "step": 620 + }, + { + "epoch": 19.704, + "grad_norm": 1.9484755992889404, + "learning_rate": 1.9816805357235512e-05, + "loss": 1.3087, + "mean_token_accuracy": 0.675427176989615, + "num_tokens": 914923.0, + "step": 630 + }, + { + "epoch": 20.0, + "grad_norm": 4.795617580413818, + "learning_rate": 1.981075892112314e-05, + "loss": 1.3261, + "mean_token_accuracy": 0.6869603467954172, + "num_tokens": 928360.0, + "step": 640 + }, + { + "epoch": 20.32, + "grad_norm": 2.1296019554138184, + "learning_rate": 1.980461526830334e-05, + "loss": 1.3365, + "mean_token_accuracy": 0.6707998286932707, + "num_tokens": 942431.0, + "step": 650 + }, + { + "epoch": 20.64, + "grad_norm": 2.041980743408203, + "learning_rate": 1.979837445965475e-05, + "loss": 1.4148, + "mean_token_accuracy": 0.6774610493332147, + "num_tokens": 957665.0, + "step": 660 + }, + { + "epoch": 20.96, + "grad_norm": 2.0277955532073975, + "learning_rate": 1.979203655701875e-05, + "loss": 1.1934, + "mean_token_accuracy": 0.7059706412255764, + "num_tokens": 972341.0, + "step": 670 + }, + { + "epoch": 21.256, + "grad_norm": 1.4342715740203857, + "learning_rate": 1.978560162319885e-05, + "loss": 1.2645, + "mean_token_accuracy": 0.6948015895244237, + "num_tokens": 987917.0, + "step": 680 + }, + { + "epoch": 21.576, + "grad_norm": 2.365342140197754, + "learning_rate": 1.9779069721960046e-05, + "loss": 1.356, + "mean_token_accuracy": 0.6770768724381924, + "num_tokens": 1001846.0, + "step": 690 + }, + { + "epoch": 21.896, + "grad_norm": 1.4183971881866455, + "learning_rate": 1.9772440918028217e-05, + "loss": 1.3372, + "mean_token_accuracy": 0.6928307216614484, + "num_tokens": 1016810.0, + "step": 700 + }, + { + "epoch": 22.192, + "grad_norm": 1.5157005786895752, + "learning_rate": 1.9765715277089458e-05, + "loss": 1.2262, + "mean_token_accuracy": 0.6972452486689026, + "num_tokens": 1032507.0, + "step": 710 + }, + { + "epoch": 22.512, + "grad_norm": 1.4448522329330444, + "learning_rate": 1.9758892865789445e-05, + "loss": 1.2261, + "mean_token_accuracy": 0.6949771210551262, + "num_tokens": 1047386.0, + "step": 720 + }, + { + "epoch": 22.832, + "grad_norm": 2.33046817779541, + "learning_rate": 1.9751973751732775e-05, + "loss": 1.2348, + "mean_token_accuracy": 0.6982233498245478, + "num_tokens": 1061351.0, + "step": 730 + }, + { + "epoch": 23.128, + "grad_norm": 1.8830664157867432, + "learning_rate": 1.9744958003482285e-05, + "loss": 1.2979, + "mean_token_accuracy": 0.6971497769291336, + "num_tokens": 1073148.0, + "step": 740 + }, + { + "epoch": 23.448, + "grad_norm": 1.466878056526184, + "learning_rate": 1.9737845690558385e-05, + "loss": 1.3683, + "mean_token_accuracy": 0.680212589353323, + "num_tokens": 1088218.0, + "step": 750 + }, + { + "epoch": 23.768, + "grad_norm": 1.5701245069503784, + "learning_rate": 1.973063688343835e-05, + "loss": 1.1505, + "mean_token_accuracy": 0.7072769150137901, + "num_tokens": 1102836.0, + "step": 760 + }, + { + "epoch": 24.064, + "grad_norm": 1.6687356233596802, + "learning_rate": 1.9723331653555653e-05, + "loss": 1.2474, + "mean_token_accuracy": 0.6967680285105834, + "num_tokens": 1116942.0, + "step": 770 + }, + { + "epoch": 24.384, + "grad_norm": 1.3728556632995605, + "learning_rate": 1.9715930073299227e-05, + "loss": 1.2448, + "mean_token_accuracy": 0.7040290288627148, + "num_tokens": 1132054.0, + "step": 780 + }, + { + "epoch": 24.704, + "grad_norm": 1.4181838035583496, + "learning_rate": 1.970843221601276e-05, + "loss": 1.1969, + "mean_token_accuracy": 0.6944498892873525, + "num_tokens": 1148041.0, + "step": 790 + }, + { + "epoch": 25.0, + "grad_norm": 4.3919596672058105, + "learning_rate": 1.9700838155993972e-05, + "loss": 1.1934, + "mean_token_accuracy": 0.7042354522524653, + "num_tokens": 1160450.0, + "step": 800 + }, + { + "epoch": 25.32, + "grad_norm": 1.5123074054718018, + "learning_rate": 1.9693147968493872e-05, + "loss": 1.2369, + "mean_token_accuracy": 0.692409698665142, + "num_tokens": 1174351.0, + "step": 810 + }, + { + "epoch": 25.64, + "grad_norm": 1.278221845626831, + "learning_rate": 1.9685361729716014e-05, + "loss": 1.1829, + "mean_token_accuracy": 0.7174848213791847, + "num_tokens": 1190213.0, + "step": 820 + }, + { + "epoch": 25.96, + "grad_norm": 2.224332094192505, + "learning_rate": 1.967747951681575e-05, + "loss": 1.2224, + "mean_token_accuracy": 0.7052119519561529, + "num_tokens": 1205508.0, + "step": 830 + }, + { + "epoch": 26.256, + "grad_norm": 1.548086166381836, + "learning_rate": 1.966950140789944e-05, + "loss": 1.2254, + "mean_token_accuracy": 0.7041690679820808, + "num_tokens": 1219047.0, + "step": 840 + }, + { + "epoch": 26.576, + "grad_norm": 2.0900254249572754, + "learning_rate": 1.9661427482023718e-05, + "loss": 1.1557, + "mean_token_accuracy": 0.7088660508394241, + "num_tokens": 1231738.0, + "step": 850 + }, + { + "epoch": 26.896, + "grad_norm": 1.9919354915618896, + "learning_rate": 1.965325781919467e-05, + "loss": 1.1962, + "mean_token_accuracy": 0.7142665989696979, + "num_tokens": 1248062.0, + "step": 860 + }, + { + "epoch": 27.192, + "grad_norm": 2.2158303260803223, + "learning_rate": 1.9644992500367072e-05, + "loss": 1.2078, + "mean_token_accuracy": 0.7049629523141964, + "num_tokens": 1261738.0, + "step": 870 + }, + { + "epoch": 27.512, + "grad_norm": 1.830531358718872, + "learning_rate": 1.9636631607443565e-05, + "loss": 1.2142, + "mean_token_accuracy": 0.7097026702016592, + "num_tokens": 1278012.0, + "step": 880 + }, + { + "epoch": 27.832, + "grad_norm": 2.0944063663482666, + "learning_rate": 1.9628175223273847e-05, + "loss": 1.1368, + "mean_token_accuracy": 0.7265028398483991, + "num_tokens": 1292725.0, + "step": 890 + }, + { + "epoch": 28.128, + "grad_norm": 1.4445384740829468, + "learning_rate": 1.9619623431653872e-05, + "loss": 1.2329, + "mean_token_accuracy": 0.6941638359346906, + "num_tokens": 1305912.0, + "step": 900 + }, + { + "epoch": 28.448, + "grad_norm": 2.084064245223999, + "learning_rate": 1.9610976317324993e-05, + "loss": 1.1324, + "mean_token_accuracy": 0.7086500860750675, + "num_tokens": 1320269.0, + "step": 910 + }, + { + "epoch": 28.768, + "grad_norm": 1.5166538953781128, + "learning_rate": 1.9602233965973145e-05, + "loss": 1.215, + "mean_token_accuracy": 0.7056132420897484, + "num_tokens": 1336877.0, + "step": 920 + }, + { + "epoch": 29.064, + "grad_norm": 1.324559211730957, + "learning_rate": 1.9593396464227964e-05, + "loss": 1.1762, + "mean_token_accuracy": 0.7244789052653957, + "num_tokens": 1349855.0, + "step": 930 + }, + { + "epoch": 29.384, + "grad_norm": 1.3715434074401855, + "learning_rate": 1.9584463899661975e-05, + "loss": 1.1323, + "mean_token_accuracy": 0.7216422040015459, + "num_tokens": 1364729.0, + "step": 940 + }, + { + "epoch": 29.704, + "grad_norm": 1.782844066619873, + "learning_rate": 1.9575436360789687e-05, + "loss": 1.2588, + "mean_token_accuracy": 0.7018849883228541, + "num_tokens": 1378903.0, + "step": 950 + }, + { + "epoch": 30.0, + "grad_norm": 3.4414260387420654, + "learning_rate": 1.9566313937066727e-05, + "loss": 1.1545, + "mean_token_accuracy": 0.7196269961627754, + "num_tokens": 1392540.0, + "step": 960 + }, + { + "epoch": 30.32, + "grad_norm": 3.570629835128784, + "learning_rate": 1.9557096718888956e-05, + "loss": 1.1217, + "mean_token_accuracy": 0.7212486552074552, + "num_tokens": 1406295.0, + "step": 970 + }, + { + "epoch": 30.64, + "grad_norm": 1.5852808952331543, + "learning_rate": 1.9547784797591565e-05, + "loss": 1.1959, + "mean_token_accuracy": 0.7164284475147724, + "num_tokens": 1422592.0, + "step": 980 + }, + { + "epoch": 30.96, + "grad_norm": 1.5355671644210815, + "learning_rate": 1.9538378265448195e-05, + "loss": 1.1813, + "mean_token_accuracy": 0.710675698518753, + "num_tokens": 1437502.0, + "step": 990 + }, + { + "epoch": 31.256, + "grad_norm": 1.5741212368011475, + "learning_rate": 1.9528877215669983e-05, + "loss": 1.1143, + "mean_token_accuracy": 0.7233139457734855, + "num_tokens": 1452056.0, + "step": 1000 + }, + { + "epoch": 32.576, + "grad_norm": 1.7357654571533203, + "learning_rate": 1.8116046949409032e-05, + "loss": 1.2445, + "mean_token_accuracy": 0.7005614548921585, + "num_tokens": 14368.0, + "step": 1010 + }, + { + "epoch": 32.896, + "grad_norm": 2.1830084323883057, + "learning_rate": 1.807903147537074e-05, + "loss": 1.1743, + "mean_token_accuracy": 0.7173698712140322, + "num_tokens": 31135.0, + "step": 1020 + }, + { + "epoch": 33.224, + "grad_norm": 2.0015718936920166, + "learning_rate": 1.8041694488049716e-05, + "loss": 1.2619, + "mean_token_accuracy": 0.715624163063561, + "num_tokens": 46347.0, + "step": 1030 + }, + { + "epoch": 33.544, + "grad_norm": 1.7311397790908813, + "learning_rate": 1.8004037473309373e-05, + "loss": 1.2331, + "mean_token_accuracy": 0.7106888771057129, + "num_tokens": 63240.0, + "step": 1040 + }, + { + "epoch": 33.864, + "grad_norm": 1.8815584182739258, + "learning_rate": 1.7966061929748968e-05, + "loss": 1.2194, + "mean_token_accuracy": 0.7109542470425367, + "num_tokens": 79655.0, + "step": 1050 + }, + { + "epoch": 34.16, + "grad_norm": 1.684423565864563, + "learning_rate": 1.7927769368643904e-05, + "loss": 1.0667, + "mean_token_accuracy": 0.7348488770626687, + "num_tokens": 95318.0, + "step": 1060 + }, + { + "epoch": 34.48, + "grad_norm": 1.6687734127044678, + "learning_rate": 1.788916131388564e-05, + "loss": 1.1796, + "mean_token_accuracy": 0.7278237771242857, + "num_tokens": 111213.0, + "step": 1070 + }, + { + "epoch": 34.8, + "grad_norm": 1.7268950939178467, + "learning_rate": 1.785023930192103e-05, + "loss": 1.1723, + "mean_token_accuracy": 0.7138827528804541, + "num_tokens": 126943.0, + "step": 1080 + }, + { + "epoch": 35.096, + "grad_norm": 1.5416665077209473, + "learning_rate": 1.781100488169115e-05, + "loss": 1.0286, + "mean_token_accuracy": 0.7333241834028347, + "num_tokens": 142073.0, + "step": 1090 + }, + { + "epoch": 35.416, + "grad_norm": 1.7402383089065552, + "learning_rate": 1.777145961456971e-05, + "loss": 1.0884, + "mean_token_accuracy": 0.7265842445194721, + "num_tokens": 158263.0, + "step": 1100 + }, + { + "epoch": 35.736, + "grad_norm": 1.4736402034759521, + "learning_rate": 1.773160507430087e-05, + "loss": 1.1012, + "mean_token_accuracy": 0.727820971608162, + "num_tokens": 172637.0, + "step": 1110 + }, + { + "epoch": 36.032, + "grad_norm": 2.027437448501587, + "learning_rate": 1.7691442846936643e-05, + "loss": 1.1525, + "mean_token_accuracy": 0.7281997264237017, + "num_tokens": 189288.0, + "step": 1120 + }, + { + "epoch": 36.352, + "grad_norm": 2.058610439300537, + "learning_rate": 1.7650974530773745e-05, + "loss": 1.147, + "mean_token_accuracy": 0.7228171911090613, + "num_tokens": 204429.0, + "step": 1130 + }, + { + "epoch": 36.672, + "grad_norm": 1.47328519821167, + "learning_rate": 1.7610201736290022e-05, + "loss": 1.1293, + "mean_token_accuracy": 0.7266111556440592, + "num_tokens": 220109.0, + "step": 1140 + }, + { + "epoch": 36.992, + "grad_norm": 1.4244815111160278, + "learning_rate": 1.7569126086080342e-05, + "loss": 1.0312, + "mean_token_accuracy": 0.7415647856891155, + "num_tokens": 236961.0, + "step": 1150 + }, + { + "epoch": 37.288, + "grad_norm": 1.6087596416473389, + "learning_rate": 1.7527749214792023e-05, + "loss": 1.1148, + "mean_token_accuracy": 0.722566624348228, + "num_tokens": 251116.0, + "step": 1160 + }, + { + "epoch": 37.608, + "grad_norm": 1.6909428834915161, + "learning_rate": 1.7486072769059785e-05, + "loss": 1.1283, + "mean_token_accuracy": 0.7359607569873333, + "num_tokens": 267570.0, + "step": 1170 + }, + { + "epoch": 37.928, + "grad_norm": 1.646548867225647, + "learning_rate": 1.7444098407440218e-05, + "loss": 1.0572, + "mean_token_accuracy": 0.7344494730234146, + "num_tokens": 282974.0, + "step": 1180 + }, + { + "epoch": 38.224, + "grad_norm": 1.5249629020690918, + "learning_rate": 1.740182780034577e-05, + "loss": 0.9779, + "mean_token_accuracy": 0.7474351501142656, + "num_tokens": 298664.0, + "step": 1190 + }, + { + "epoch": 38.544, + "grad_norm": 1.8203458786010742, + "learning_rate": 1.7359262629978286e-05, + "loss": 1.044, + "mean_token_accuracy": 0.7267404418438673, + "num_tokens": 313932.0, + "step": 1200 + }, + { + "epoch": 38.864, + "grad_norm": 1.960335612297058, + "learning_rate": 1.731640459026206e-05, + "loss": 1.0537, + "mean_token_accuracy": 0.7449462197721004, + "num_tokens": 330427.0, + "step": 1210 + }, + { + "epoch": 39.16, + "grad_norm": 2.152423620223999, + "learning_rate": 1.727325538677642e-05, + "loss": 1.1988, + "mean_token_accuracy": 0.7341888015334671, + "num_tokens": 344595.0, + "step": 1220 + }, + { + "epoch": 39.48, + "grad_norm": 1.9269284009933472, + "learning_rate": 1.722981673668784e-05, + "loss": 1.0929, + "mean_token_accuracy": 0.7354621075093746, + "num_tokens": 361903.0, + "step": 1230 + }, + { + "epoch": 39.8, + "grad_norm": 2.627488374710083, + "learning_rate": 1.7186090368681625e-05, + "loss": 1.0304, + "mean_token_accuracy": 0.7406851584091783, + "num_tokens": 378158.0, + "step": 1240 + }, + { + "epoch": 40.096, + "grad_norm": 1.340135931968689, + "learning_rate": 1.714207802289311e-05, + "loss": 0.9831, + "mean_token_accuracy": 0.7508459779861811, + "num_tokens": 393086.0, + "step": 1250 + }, + { + "epoch": 40.416, + "grad_norm": 1.5764344930648804, + "learning_rate": 1.7097781450838408e-05, + "loss": 1.0411, + "mean_token_accuracy": 0.7428241446614265, + "num_tokens": 408865.0, + "step": 1260 + }, + { + "epoch": 40.736, + "grad_norm": 2.27480149269104, + "learning_rate": 1.7053202415344693e-05, + "loss": 1.1553, + "mean_token_accuracy": 0.7261891044676304, + "num_tokens": 422941.0, + "step": 1270 + }, + { + "epoch": 41.032, + "grad_norm": 2.0869431495666504, + "learning_rate": 1.7008342690480075e-05, + "loss": 1.0776, + "mean_token_accuracy": 0.7442273002218556, + "num_tokens": 438615.0, + "step": 1280 + }, + { + "epoch": 41.352, + "grad_norm": 1.6138980388641357, + "learning_rate": 1.6963204061482972e-05, + "loss": 0.9933, + "mean_token_accuracy": 0.7366263665258884, + "num_tokens": 454742.0, + "step": 1290 + }, + { + "epoch": 41.672, + "grad_norm": 2.201198101043701, + "learning_rate": 1.6917788324691083e-05, + "loss": 1.12, + "mean_token_accuracy": 0.7349841587245465, + "num_tokens": 471732.0, + "step": 1300 + }, + { + "epoch": 41.992, + "grad_norm": 2.3492226600646973, + "learning_rate": 1.687209728746989e-05, + "loss": 1.0594, + "mean_token_accuracy": 0.745047352835536, + "num_tokens": 487349.0, + "step": 1310 + }, + { + "epoch": 42.288, + "grad_norm": 1.7862104177474976, + "learning_rate": 1.6826132768140735e-05, + "loss": 0.9756, + "mean_token_accuracy": 0.7570219249338717, + "num_tokens": 502115.0, + "step": 1320 + }, + { + "epoch": 42.608, + "grad_norm": 2.4716343879699707, + "learning_rate": 1.6779896595908462e-05, + "loss": 1.0208, + "mean_token_accuracy": 0.7443521052598954, + "num_tokens": 517825.0, + "step": 1330 + }, + { + "epoch": 42.928, + "grad_norm": 2.661140203475952, + "learning_rate": 1.6733390610788622e-05, + "loss": 1.0313, + "mean_token_accuracy": 0.7418102856725455, + "num_tokens": 534561.0, + "step": 1340 + }, + { + "epoch": 43.224, + "grad_norm": 1.9998219013214111, + "learning_rate": 1.668661666353423e-05, + "loss": 1.0699, + "mean_token_accuracy": 0.7479387578126546, + "num_tokens": 548327.0, + "step": 1350 + }, + { + "epoch": 43.544, + "grad_norm": 2.4526405334472656, + "learning_rate": 1.6639576615562143e-05, + "loss": 0.9673, + "mean_token_accuracy": 0.7542693041265011, + "num_tokens": 565164.0, + "step": 1360 + }, + { + "epoch": 43.864, + "grad_norm": 1.7199647426605225, + "learning_rate": 1.6592272338878963e-05, + "loss": 1.0644, + "mean_token_accuracy": 0.743690374866128, + "num_tokens": 580754.0, + "step": 1370 + }, + { + "epoch": 44.16, + "grad_norm": 1.7065895795822144, + "learning_rate": 1.6544705716006537e-05, + "loss": 0.9511, + "mean_token_accuracy": 0.7495483123772854, + "num_tokens": 595953.0, + "step": 1380 + }, + { + "epoch": 44.48, + "grad_norm": 1.5984984636306763, + "learning_rate": 1.649687863990705e-05, + "loss": 1.0901, + "mean_token_accuracy": 0.7480962604284287, + "num_tokens": 611850.0, + "step": 1390 + }, + { + "epoch": 44.8, + "grad_norm": 2.719882011413574, + "learning_rate": 1.644879301390769e-05, + "loss": 0.9664, + "mean_token_accuracy": 0.7527227349579334, + "num_tokens": 627428.0, + "step": 1400 + }, + { + "epoch": 45.096, + "grad_norm": 2.04146409034729, + "learning_rate": 1.6400450751624897e-05, + "loss": 0.9673, + "mean_token_accuracy": 0.7482488421169488, + "num_tokens": 641538.0, + "step": 1410 + }, + { + "epoch": 45.416, + "grad_norm": 2.128373384475708, + "learning_rate": 1.6351853776888214e-05, + "loss": 0.9908, + "mean_token_accuracy": 0.7453075967729091, + "num_tokens": 658145.0, + "step": 1420 + }, + { + "epoch": 45.736, + "grad_norm": 1.9842469692230225, + "learning_rate": 1.630300402366373e-05, + "loss": 1.0387, + "mean_token_accuracy": 0.7478526467457414, + "num_tokens": 675926.0, + "step": 1430 + }, + { + "epoch": 46.032, + "grad_norm": 1.5507521629333496, + "learning_rate": 1.6253903435977103e-05, + "loss": 0.959, + "mean_token_accuracy": 0.7599469971012425, + "num_tokens": 689837.0, + "step": 1440 + }, + { + "epoch": 46.352, + "grad_norm": 2.250763416290283, + "learning_rate": 1.6204553967836216e-05, + "loss": 1.0544, + "mean_token_accuracy": 0.7468490976840257, + "num_tokens": 705912.0, + "step": 1450 + }, + { + "epoch": 46.672, + "grad_norm": 1.7809251546859741, + "learning_rate": 1.6154957583153388e-05, + "loss": 1.034, + "mean_token_accuracy": 0.7534692898392678, + "num_tokens": 722631.0, + "step": 1460 + }, + { + "epoch": 46.992, + "grad_norm": 2.4856886863708496, + "learning_rate": 1.6105116255667246e-05, + "loss": 0.9083, + "mean_token_accuracy": 0.7516257427632809, + "num_tokens": 737649.0, + "step": 1470 + }, + { + "epoch": 47.288, + "grad_norm": 2.1131696701049805, + "learning_rate": 1.605503196886416e-05, + "loss": 0.9908, + "mean_token_accuracy": 0.7506888621562237, + "num_tokens": 754612.0, + "step": 1480 + }, + { + "epoch": 47.608, + "grad_norm": 1.3065401315689087, + "learning_rate": 1.600470671589931e-05, + "loss": 0.9346, + "mean_token_accuracy": 0.757453129440546, + "num_tokens": 771515.0, + "step": 1490 + }, + { + "epoch": 47.928, + "grad_norm": 2.0020365715026855, + "learning_rate": 1.5954142499517377e-05, + "loss": 1.0396, + "mean_token_accuracy": 0.7508561560884118, + "num_tokens": 785665.0, + "step": 1500 + }, + { + "epoch": 48.224, + "grad_norm": 1.8463741540908813, + "learning_rate": 1.5903341331972832e-05, + "loss": 0.9151, + "mean_token_accuracy": 0.7590098671011023, + "num_tokens": 799293.0, + "step": 1510 + }, + { + "epoch": 48.544, + "grad_norm": 1.851616382598877, + "learning_rate": 1.585230523494985e-05, + "loss": 0.9102, + "mean_token_accuracy": 0.7564024582505227, + "num_tokens": 813555.0, + "step": 1520 + }, + { + "epoch": 48.864, + "grad_norm": 1.4981343746185303, + "learning_rate": 1.580103623948188e-05, + "loss": 1.0654, + "mean_token_accuracy": 0.748985405266285, + "num_tokens": 831868.0, + "step": 1530 + }, + { + "epoch": 49.16, + "grad_norm": 1.8819829225540161, + "learning_rate": 1.574953638587079e-05, + "loss": 0.993, + "mean_token_accuracy": 0.7556418059645472, + "num_tokens": 846798.0, + "step": 1540 + }, + { + "epoch": 49.48, + "grad_norm": 2.24092960357666, + "learning_rate": 1.569780772360568e-05, + "loss": 0.9818, + "mean_token_accuracy": 0.7535504069179296, + "num_tokens": 862063.0, + "step": 1550 + }, + { + "epoch": 49.8, + "grad_norm": 1.7873568534851074, + "learning_rate": 1.5645852311281343e-05, + "loss": 1.0086, + "mean_token_accuracy": 0.7555014498531818, + "num_tokens": 878215.0, + "step": 1560 + }, + { + "epoch": 50.096, + "grad_norm": 2.5300111770629883, + "learning_rate": 1.559367221651629e-05, + "loss": 0.8826, + "mean_token_accuracy": 0.7630251637987189, + "num_tokens": 893320.0, + "step": 1570 + }, + { + "epoch": 50.416, + "grad_norm": 1.9504714012145996, + "learning_rate": 1.554126951587053e-05, + "loss": 0.9572, + "mean_token_accuracy": 0.7577113211154938, + "num_tokens": 908230.0, + "step": 1580 + }, + { + "epoch": 50.736, + "grad_norm": 1.8482609987258911, + "learning_rate": 1.548864629476288e-05, + "loss": 0.9715, + "mean_token_accuracy": 0.7632556769996881, + "num_tokens": 925533.0, + "step": 1590 + }, + { + "epoch": 51.032, + "grad_norm": 1.7342660427093506, + "learning_rate": 1.5435804647388003e-05, + "loss": 1.0049, + "mean_token_accuracy": 0.753706334410487, + "num_tokens": 940557.0, + "step": 1600 + }, + { + "epoch": 51.352, + "grad_norm": 1.7231630086898804, + "learning_rate": 1.5382746676633053e-05, + "loss": 0.9577, + "mean_token_accuracy": 0.7602146591991186, + "num_tokens": 955898.0, + "step": 1610 + }, + { + "epoch": 51.672, + "grad_norm": 1.9401224851608276, + "learning_rate": 1.5329474493993984e-05, + "loss": 0.9607, + "mean_token_accuracy": 0.7621455781161786, + "num_tokens": 972435.0, + "step": 1620 + }, + { + "epoch": 51.992, + "grad_norm": 2.089966297149658, + "learning_rate": 1.5275990219491553e-05, + "loss": 0.9482, + "mean_token_accuracy": 0.762396826967597, + "num_tokens": 988434.0, + "step": 1630 + }, + { + "epoch": 52.288, + "grad_norm": 1.7538946866989136, + "learning_rate": 1.522229598158691e-05, + "loss": 0.9943, + "mean_token_accuracy": 0.7541912862577954, + "num_tokens": 1001536.0, + "step": 1640 + }, + { + "epoch": 52.608, + "grad_norm": 1.8982934951782227, + "learning_rate": 1.5168393917096917e-05, + "loss": 0.9258, + "mean_token_accuracy": 0.7704043008387089, + "num_tokens": 1018633.0, + "step": 1650 + }, + { + "epoch": 52.928, + "grad_norm": 2.195676803588867, + "learning_rate": 1.5114286171109109e-05, + "loss": 0.9363, + "mean_token_accuracy": 0.7571658097207546, + "num_tokens": 1035378.0, + "step": 1660 + }, + { + "epoch": 53.224, + "grad_norm": 1.6514253616333008, + "learning_rate": 1.5059974896896324e-05, + "loss": 1.0186, + "mean_token_accuracy": 0.7523992178407876, + "num_tokens": 1050349.0, + "step": 1670 + }, + { + "epoch": 53.544, + "grad_norm": 2.6811511516571045, + "learning_rate": 1.5005462255831014e-05, + "loss": 1.0254, + "mean_token_accuracy": 0.7484087854623794, + "num_tokens": 1064873.0, + "step": 1680 + }, + { + "epoch": 53.864, + "grad_norm": 2.0554141998291016, + "learning_rate": 1.4950750417299227e-05, + "loss": 0.878, + "mean_token_accuracy": 0.7748427361249923, + "num_tokens": 1082105.0, + "step": 1690 + }, + { + "epoch": 54.16, + "grad_norm": 1.614017128944397, + "learning_rate": 1.489584155861428e-05, + "loss": 0.9688, + "mean_token_accuracy": 0.7686513742885074, + "num_tokens": 1097886.0, + "step": 1700 + }, + { + "epoch": 54.48, + "grad_norm": 1.7928838729858398, + "learning_rate": 1.4840737864930106e-05, + "loss": 0.8874, + "mean_token_accuracy": 0.7716922122985125, + "num_tokens": 1112624.0, + "step": 1710 + }, + { + "epoch": 54.8, + "grad_norm": 1.688085675239563, + "learning_rate": 1.4785441529154294e-05, + "loss": 0.9361, + "mean_token_accuracy": 0.767570473998785, + "num_tokens": 1129549.0, + "step": 1720 + }, + { + "epoch": 55.096, + "grad_norm": 1.3455687761306763, + "learning_rate": 1.4729954751860827e-05, + "loss": 1.0524, + "mean_token_accuracy": 0.7470491971518542, + "num_tokens": 1145039.0, + "step": 1730 + }, + { + "epoch": 55.416, + "grad_norm": 1.7406009435653687, + "learning_rate": 1.4674279741202495e-05, + "loss": 0.8839, + "mean_token_accuracy": 0.7727594949305058, + "num_tokens": 1159810.0, + "step": 1740 + }, + { + "epoch": 55.736, + "grad_norm": 2.1520540714263916, + "learning_rate": 1.4618418712823028e-05, + "loss": 0.9652, + "mean_token_accuracy": 0.7532628539949655, + "num_tokens": 1176245.0, + "step": 1750 + }, + { + "epoch": 56.032, + "grad_norm": 1.581739902496338, + "learning_rate": 1.4562373889768927e-05, + "loss": 0.9332, + "mean_token_accuracy": 0.7696672396079914, + "num_tokens": 1191008.0, + "step": 1760 + }, + { + "epoch": 56.352, + "grad_norm": 1.6474453210830688, + "learning_rate": 1.4506147502400977e-05, + "loss": 0.8376, + "mean_token_accuracy": 0.772033654898405, + "num_tokens": 1205755.0, + "step": 1770 + }, + { + "epoch": 56.672, + "grad_norm": 1.8299458026885986, + "learning_rate": 1.4449741788305514e-05, + "loss": 0.9889, + "mean_token_accuracy": 0.760890544205904, + "num_tokens": 1221863.0, + "step": 1780 + }, + { + "epoch": 56.992, + "grad_norm": 1.6759440898895264, + "learning_rate": 1.4393158992205348e-05, + "loss": 0.9799, + "mean_token_accuracy": 0.7623420935124159, + "num_tokens": 1238647.0, + "step": 1790 + }, + { + "epoch": 57.288, + "grad_norm": 2.1239564418792725, + "learning_rate": 1.4336401365870466e-05, + "loss": 0.9944, + "mean_token_accuracy": 0.7618524045557589, + "num_tokens": 1253030.0, + "step": 1800 + }, + { + "epoch": 57.608, + "grad_norm": 2.75298810005188, + "learning_rate": 1.4279471168028382e-05, + "loss": 0.9822, + "mean_token_accuracy": 0.7654153741896152, + "num_tokens": 1269147.0, + "step": 1810 + }, + { + "epoch": 57.928, + "grad_norm": 1.8775372505187988, + "learning_rate": 1.422237066427429e-05, + "loss": 0.8866, + "mean_token_accuracy": 0.7653848383575678, + "num_tokens": 1285368.0, + "step": 1820 + }, + { + "epoch": 58.224, + "grad_norm": 1.6810104846954346, + "learning_rate": 1.416510212698086e-05, + "loss": 0.9072, + "mean_token_accuracy": 0.7690872151303936, + "num_tokens": 1300660.0, + "step": 1830 + }, + { + "epoch": 58.544, + "grad_norm": 1.914070725440979, + "learning_rate": 1.4107667835207844e-05, + "loss": 1.0272, + "mean_token_accuracy": 0.7550359651446342, + "num_tokens": 1317143.0, + "step": 1840 + }, + { + "epoch": 58.864, + "grad_norm": 2.164189338684082, + "learning_rate": 1.4050070074611355e-05, + "loss": 0.9304, + "mean_token_accuracy": 0.7650556772947311, + "num_tokens": 1332705.0, + "step": 1850 + }, + { + "epoch": 59.16, + "grad_norm": 2.7804877758026123, + "learning_rate": 1.3992311137352918e-05, + "loss": 0.8424, + "mean_token_accuracy": 0.7625659327652003, + "num_tokens": 1345993.0, + "step": 1860 + }, + { + "epoch": 59.48, + "grad_norm": 1.7922106981277466, + "learning_rate": 1.3934393322008241e-05, + "loss": 0.8732, + "mean_token_accuracy": 0.7774093203246594, + "num_tokens": 1362688.0, + "step": 1870 + }, + { + "epoch": 59.8, + "grad_norm": 1.39845609664917, + "learning_rate": 1.387631893347575e-05, + "loss": 0.8986, + "mean_token_accuracy": 0.7775574192404747, + "num_tokens": 1379021.0, + "step": 1880 + }, + { + "epoch": 60.096, + "grad_norm": 2.3520147800445557, + "learning_rate": 1.3818090282884869e-05, + "loss": 0.9055, + "mean_token_accuracy": 0.7666742781529555, + "num_tokens": 1394388.0, + "step": 1890 + }, + { + "epoch": 60.416, + "grad_norm": 1.9911949634552002, + "learning_rate": 1.3759709687504022e-05, + "loss": 0.9495, + "mean_token_accuracy": 0.7690058574080467, + "num_tokens": 1410943.0, + "step": 1900 + }, + { + "epoch": 60.736, + "grad_norm": 2.0429329872131348, + "learning_rate": 1.3701179470648444e-05, + "loss": 0.9081, + "mean_token_accuracy": 0.764681476354599, + "num_tokens": 1428993.0, + "step": 1910 + }, + { + "epoch": 61.032, + "grad_norm": 1.7519456148147583, + "learning_rate": 1.36425019615877e-05, + "loss": 0.9026, + "mean_token_accuracy": 0.7673927166977444, + "num_tokens": 1441530.0, + "step": 1920 + }, + { + "epoch": 61.352, + "grad_norm": 2.105077028274536, + "learning_rate": 1.3583679495453e-05, + "loss": 0.8834, + "mean_token_accuracy": 0.7748925991356372, + "num_tokens": 1459071.0, + "step": 1930 + }, + { + "epoch": 61.672, + "grad_norm": 1.9322600364685059, + "learning_rate": 1.3524714413144282e-05, + "loss": 0.91, + "mean_token_accuracy": 0.7671246759593486, + "num_tokens": 1474214.0, + "step": 1940 + }, + { + "epoch": 61.992, + "grad_norm": 2.1808035373687744, + "learning_rate": 1.346560906123702e-05, + "loss": 0.915, + "mean_token_accuracy": 0.7675775479525327, + "num_tokens": 1489457.0, + "step": 1950 + }, + { + "epoch": 62.288, + "grad_norm": 1.765626311302185, + "learning_rate": 1.3406365791888865e-05, + "loss": 1.0076, + "mean_token_accuracy": 0.7589444365050342, + "num_tokens": 1504842.0, + "step": 1960 + }, + { + "epoch": 62.608, + "grad_norm": 2.268444061279297, + "learning_rate": 1.3346986962746038e-05, + "loss": 0.8381, + "mean_token_accuracy": 0.780813368782401, + "num_tokens": 1519276.0, + "step": 1970 + }, + { + "epoch": 62.928, + "grad_norm": 1.4477503299713135, + "learning_rate": 1.32874749368495e-05, + "loss": 0.925, + "mean_token_accuracy": 0.7658030860126018, + "num_tokens": 1535446.0, + "step": 1980 + }, + { + "epoch": 63.224, + "grad_norm": 2.732478618621826, + "learning_rate": 1.3227832082540908e-05, + "loss": 0.9051, + "mean_token_accuracy": 0.7650254467451895, + "num_tokens": 1550674.0, + "step": 1990 + }, + { + "epoch": 63.544, + "grad_norm": 2.2961671352386475, + "learning_rate": 1.3168060773368375e-05, + "loss": 0.9873, + "mean_token_accuracy": 0.7675742536783219, + "num_tokens": 1564485.0, + "step": 2000 + }, + { + "epoch": 64.832, + "grad_norm": 2.223515272140503, + "learning_rate": 1.3108163387991993e-05, + "loss": 0.8791, + "mean_token_accuracy": 0.7629961850121617, + "num_tokens": 16127.0, + "step": 2010 + }, + { + "epoch": 65.16, + "grad_norm": 2.646225690841675, + "learning_rate": 1.30481423100892e-05, + "loss": 0.9661, + "mean_token_accuracy": 0.768963757811523, + "num_tokens": 31851.0, + "step": 2020 + }, + { + "epoch": 65.48, + "grad_norm": 2.350883722305298, + "learning_rate": 1.2987999928259897e-05, + "loss": 0.9412, + "mean_token_accuracy": 0.7736104667186737, + "num_tokens": 47921.0, + "step": 2030 + }, + { + "epoch": 65.8, + "grad_norm": 1.8255304098129272, + "learning_rate": 1.2927738635931402e-05, + "loss": 0.9436, + "mean_token_accuracy": 0.7672818608582019, + "num_tokens": 64830.0, + "step": 2040 + }, + { + "epoch": 66.096, + "grad_norm": 1.562624454498291, + "learning_rate": 1.2867360831263191e-05, + "loss": 0.8713, + "mean_token_accuracy": 0.7805772717740085, + "num_tokens": 79811.0, + "step": 2050 + }, + { + "epoch": 66.416, + "grad_norm": 2.139047145843506, + "learning_rate": 1.280686891705147e-05, + "loss": 0.9338, + "mean_token_accuracy": 0.7658140640705824, + "num_tokens": 95253.0, + "step": 2060 + }, + { + "epoch": 66.736, + "grad_norm": 1.71339750289917, + "learning_rate": 1.2746265300633556e-05, + "loss": 0.8785, + "mean_token_accuracy": 0.7795989379286766, + "num_tokens": 110479.0, + "step": 2070 + }, + { + "epoch": 67.032, + "grad_norm": 2.284088134765625, + "learning_rate": 1.268555239379206e-05, + "loss": 0.9184, + "mean_token_accuracy": 0.7648406020692877, + "num_tokens": 125681.0, + "step": 2080 + }, + { + "epoch": 67.352, + "grad_norm": 1.8913801908493042, + "learning_rate": 1.2624732612658923e-05, + "loss": 0.8725, + "mean_token_accuracy": 0.7707512844353914, + "num_tokens": 141796.0, + "step": 2090 + }, + { + "epoch": 67.672, + "grad_norm": 1.9446955919265747, + "learning_rate": 1.2563808377619253e-05, + "loss": 1.0064, + "mean_token_accuracy": 0.7678989730775356, + "num_tokens": 158724.0, + "step": 2100 + }, + { + "epoch": 67.992, + "grad_norm": 2.2045912742614746, + "learning_rate": 1.250278211321501e-05, + "loss": 0.7989, + "mean_token_accuracy": 0.7780600219964982, + "num_tokens": 173629.0, + "step": 2110 + }, + { + "epoch": 68.288, + "grad_norm": 1.9509689807891846, + "learning_rate": 1.244165624804852e-05, + "loss": 0.8634, + "mean_token_accuracy": 0.7794965231740797, + "num_tokens": 188518.0, + "step": 2120 + }, + { + "epoch": 68.608, + "grad_norm": 2.1073553562164307, + "learning_rate": 1.2380433214685813e-05, + "loss": 0.8981, + "mean_token_accuracy": 0.7774934440851211, + "num_tokens": 205654.0, + "step": 2130 + }, + { + "epoch": 68.928, + "grad_norm": 1.652787208557129, + "learning_rate": 1.2319115449559835e-05, + "loss": 0.8801, + "mean_token_accuracy": 0.7723641652613878, + "num_tokens": 220311.0, + "step": 2140 + }, + { + "epoch": 69.224, + "grad_norm": 2.534707546234131, + "learning_rate": 1.2257705392873476e-05, + "loss": 0.8723, + "mean_token_accuracy": 0.7854163485604364, + "num_tokens": 236282.0, + "step": 2150 + }, + { + "epoch": 69.544, + "grad_norm": 1.578347086906433, + "learning_rate": 1.2196205488502463e-05, + "loss": 0.8169, + "mean_token_accuracy": 0.7866261303424835, + "num_tokens": 252837.0, + "step": 2160 + }, + { + "epoch": 69.864, + "grad_norm": 2.228119373321533, + "learning_rate": 1.2134618183898105e-05, + "loss": 0.9254, + "mean_token_accuracy": 0.7748822212219239, + "num_tokens": 267785.0, + "step": 2170 + }, + { + "epoch": 70.16, + "grad_norm": 2.410616159439087, + "learning_rate": 1.2072945929989888e-05, + "loss": 0.8046, + "mean_token_accuracy": 0.7787431329488754, + "num_tokens": 281535.0, + "step": 2180 + }, + { + "epoch": 70.48, + "grad_norm": 1.7590594291687012, + "learning_rate": 1.201119118108794e-05, + "loss": 0.8912, + "mean_token_accuracy": 0.7787077182903885, + "num_tokens": 298775.0, + "step": 2190 + }, + { + "epoch": 70.8, + "grad_norm": 3.3293755054473877, + "learning_rate": 1.1949356394785373e-05, + "loss": 0.9112, + "mean_token_accuracy": 0.7765318274497985, + "num_tokens": 314484.0, + "step": 2200 + }, + { + "epoch": 71.096, + "grad_norm": 2.363255739212036, + "learning_rate": 1.1887444031860456e-05, + "loss": 0.9063, + "mean_token_accuracy": 0.776000738546655, + "num_tokens": 327608.0, + "step": 2210 + }, + { + "epoch": 71.416, + "grad_norm": 1.7942370176315308, + "learning_rate": 1.1825456556178705e-05, + "loss": 0.8095, + "mean_token_accuracy": 0.7899976089596749, + "num_tokens": 345798.0, + "step": 2220 + }, + { + "epoch": 71.736, + "grad_norm": 1.9774558544158936, + "learning_rate": 1.1763396434594823e-05, + "loss": 0.9154, + "mean_token_accuracy": 0.7691428020596505, + "num_tokens": 361462.0, + "step": 2230 + }, + { + "epoch": 72.032, + "grad_norm": 1.6556707620620728, + "learning_rate": 1.1701266136854532e-05, + "loss": 0.8829, + "mean_token_accuracy": 0.7704721173724612, + "num_tokens": 376304.0, + "step": 2240 + }, + { + "epoch": 72.352, + "grad_norm": 2.80587100982666, + "learning_rate": 1.1639068135496285e-05, + "loss": 0.9485, + "mean_token_accuracy": 0.7709558174014092, + "num_tokens": 390379.0, + "step": 2250 + }, + { + "epoch": 72.672, + "grad_norm": 2.0841872692108154, + "learning_rate": 1.1576804905752873e-05, + "loss": 0.9589, + "mean_token_accuracy": 0.7605574566870927, + "num_tokens": 408237.0, + "step": 2260 + }, + { + "epoch": 72.992, + "grad_norm": 2.8403215408325195, + "learning_rate": 1.1514478925452905e-05, + "loss": 0.7252, + "mean_token_accuracy": 0.7972878247499466, + "num_tokens": 423763.0, + "step": 2270 + }, + { + "epoch": 73.288, + "grad_norm": 6.484622955322266, + "learning_rate": 1.1452092674922224e-05, + "loss": 0.9519, + "mean_token_accuracy": 0.7691420135465828, + "num_tokens": 437835.0, + "step": 2280 + }, + { + "epoch": 73.608, + "grad_norm": 2.27260160446167, + "learning_rate": 1.1389648636885186e-05, + "loss": 0.8394, + "mean_token_accuracy": 0.7912575013935566, + "num_tokens": 455397.0, + "step": 2290 + }, + { + "epoch": 73.928, + "grad_norm": 1.805159091949463, + "learning_rate": 1.132714929636586e-05, + "loss": 0.8545, + "mean_token_accuracy": 0.7838539175689221, + "num_tokens": 471371.0, + "step": 2300 + }, + { + "epoch": 74.224, + "grad_norm": 2.250121593475342, + "learning_rate": 1.1264597140589127e-05, + "loss": 0.8243, + "mean_token_accuracy": 0.7824344949142353, + "num_tokens": 486629.0, + "step": 2310 + }, + { + "epoch": 74.544, + "grad_norm": 3.2095444202423096, + "learning_rate": 1.120199465888171e-05, + "loss": 0.8556, + "mean_token_accuracy": 0.7762523703277111, + "num_tokens": 501461.0, + "step": 2320 + }, + { + "epoch": 74.864, + "grad_norm": 2.3047547340393066, + "learning_rate": 1.1139344342573106e-05, + "loss": 0.8754, + "mean_token_accuracy": 0.7786926485598087, + "num_tokens": 516976.0, + "step": 2330 + }, + { + "epoch": 75.16, + "grad_norm": 2.0419108867645264, + "learning_rate": 1.1076648684896441e-05, + "loss": 0.8008, + "mean_token_accuracy": 0.7848166005836951, + "num_tokens": 532021.0, + "step": 2340 + }, + { + "epoch": 75.48, + "grad_norm": 2.602372646331787, + "learning_rate": 1.101391018088923e-05, + "loss": 0.9487, + "mean_token_accuracy": 0.7746396526694298, + "num_tokens": 546596.0, + "step": 2350 + }, + { + "epoch": 75.8, + "grad_norm": 1.9730421304702759, + "learning_rate": 1.0951131327294123e-05, + "loss": 0.8744, + "mean_token_accuracy": 0.7984356313943863, + "num_tokens": 563545.0, + "step": 2360 + }, + { + "epoch": 76.096, + "grad_norm": 2.331416130065918, + "learning_rate": 1.0888314622459509e-05, + "loss": 0.8102, + "mean_token_accuracy": 0.7831854063111383, + "num_tokens": 578977.0, + "step": 2370 + }, + { + "epoch": 76.416, + "grad_norm": 2.8027427196502686, + "learning_rate": 1.082546256624011e-05, + "loss": 0.8598, + "mean_token_accuracy": 0.7751214955002069, + "num_tokens": 594479.0, + "step": 2380 + }, + { + "epoch": 76.736, + "grad_norm": 1.8376470804214478, + "learning_rate": 1.0762577659897495e-05, + "loss": 0.8722, + "mean_token_accuracy": 0.7737262137234211, + "num_tokens": 611581.0, + "step": 2390 + }, + { + "epoch": 77.032, + "grad_norm": 2.3731982707977295, + "learning_rate": 1.0699662406000533e-05, + "loss": 0.8581, + "mean_token_accuracy": 0.7886427938938141, + "num_tokens": 626188.0, + "step": 2400 + }, + { + "epoch": 77.352, + "grad_norm": 1.711204171180725, + "learning_rate": 1.0636719308325803e-05, + "loss": 0.9216, + "mean_token_accuracy": 0.7730351705104113, + "num_tokens": 643408.0, + "step": 2410 + }, + { + "epoch": 77.672, + "grad_norm": 1.7660971879959106, + "learning_rate": 1.0573750871757965e-05, + "loss": 0.7626, + "mean_token_accuracy": 0.7915604203939438, + "num_tokens": 657604.0, + "step": 2420 + }, + { + "epoch": 77.992, + "grad_norm": 2.0509514808654785, + "learning_rate": 1.0510759602190055e-05, + "loss": 0.8603, + "mean_token_accuracy": 0.784786606580019, + "num_tokens": 674373.0, + "step": 2430 + }, + { + "epoch": 78.288, + "grad_norm": 2.348026752471924, + "learning_rate": 1.0447748006423775e-05, + "loss": 0.8823, + "mean_token_accuracy": 0.7760254515183939, + "num_tokens": 690196.0, + "step": 2440 + }, + { + "epoch": 78.608, + "grad_norm": 2.094943046569824, + "learning_rate": 1.0384718592069733e-05, + "loss": 0.8474, + "mean_token_accuracy": 0.7716075176373124, + "num_tokens": 706149.0, + "step": 2450 + }, + { + "epoch": 78.928, + "grad_norm": 2.465407609939575, + "learning_rate": 1.0321673867447642e-05, + "loss": 0.8644, + "mean_token_accuracy": 0.786153320223093, + "num_tokens": 721536.0, + "step": 2460 + }, + { + "epoch": 79.224, + "grad_norm": 2.3234193325042725, + "learning_rate": 1.0258616341486505e-05, + "loss": 0.9199, + "mean_token_accuracy": 0.7744305520444303, + "num_tokens": 737605.0, + "step": 2470 + }, + { + "epoch": 79.544, + "grad_norm": 1.9042166471481323, + "learning_rate": 1.019554852362476e-05, + "loss": 0.8054, + "mean_token_accuracy": 0.7926479011774064, + "num_tokens": 753913.0, + "step": 2480 + }, + { + "epoch": 79.864, + "grad_norm": 2.5160131454467773, + "learning_rate": 1.0132472923710437e-05, + "loss": 0.8329, + "mean_token_accuracy": 0.7762512426823378, + "num_tokens": 769204.0, + "step": 2490 + }, + { + "epoch": 80.16, + "grad_norm": 2.8922526836395264, + "learning_rate": 1.0069392051901241e-05, + "loss": 0.8492, + "mean_token_accuracy": 0.7814656487993292, + "num_tokens": 784216.0, + "step": 2500 + }, + { + "epoch": 80.48, + "grad_norm": 2.763730049133301, + "learning_rate": 1.0006308418564697e-05, + "loss": 0.8454, + "mean_token_accuracy": 0.7843520522117615, + "num_tokens": 800421.0, + "step": 2510 + }, + { + "epoch": 80.8, + "grad_norm": 2.41654372215271, + "learning_rate": 9.94322453417821e-06, + "loss": 0.7333, + "mean_token_accuracy": 0.8014784809201956, + "num_tokens": 815977.0, + "step": 2520 + }, + { + "epoch": 81.096, + "grad_norm": 2.7866134643554688, + "learning_rate": 9.880142909229188e-06, + "loss": 0.8167, + "mean_token_accuracy": 0.7932786933473639, + "num_tokens": 829350.0, + "step": 2530 + }, + { + "epoch": 81.416, + "grad_norm": 1.8219573497772217, + "learning_rate": 9.817066054115117e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7871743485331535, + "num_tokens": 845363.0, + "step": 2540 + }, + { + "epoch": 81.736, + "grad_norm": 1.9417917728424072, + "learning_rate": 9.753996479043672e-06, + "loss": 0.8828, + "mean_token_accuracy": 0.7732684839516878, + "num_tokens": 862288.0, + "step": 2550 + }, + { + "epoch": 82.032, + "grad_norm": 1.784688949584961, + "learning_rate": 9.690936693932793e-06, + "loss": 0.8167, + "mean_token_accuracy": 0.791683446716618, + "num_tokens": 876947.0, + "step": 2560 + }, + { + "epoch": 82.352, + "grad_norm": 2.9179623126983643, + "learning_rate": 9.627889208310831e-06, + "loss": 0.8395, + "mean_token_accuracy": 0.7839712589979172, + "num_tokens": 891614.0, + "step": 2570 + }, + { + "epoch": 82.672, + "grad_norm": 1.7839528322219849, + "learning_rate": 9.564856531216666e-06, + "loss": 0.8015, + "mean_token_accuracy": 0.8007228754460811, + "num_tokens": 909761.0, + "step": 2580 + }, + { + "epoch": 82.992, + "grad_norm": 2.202512741088867, + "learning_rate": 9.50184117109986e-06, + "loss": 0.8684, + "mean_token_accuracy": 0.7817719358950853, + "num_tokens": 925239.0, + "step": 2590 + }, + { + "epoch": 83.288, + "grad_norm": 2.0895087718963623, + "learning_rate": 9.438845635720817e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.7858564757012032, + "num_tokens": 940941.0, + "step": 2600 + }, + { + "epoch": 84.192, + "grad_norm": 1.9542677402496338, + "learning_rate": 9.375872432051006e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.780240989312893, + "num_tokens": 17808.0, + "step": 2610 + }, + { + "epoch": 84.512, + "grad_norm": 2.257493257522583, + "learning_rate": 9.312924066173178e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7828688979148865, + "num_tokens": 33397.0, + "step": 2620 + }, + { + "epoch": 84.832, + "grad_norm": 1.9773480892181396, + "learning_rate": 9.25000304318164e-06, + "loss": 0.8219, + "mean_token_accuracy": 0.7853579100221395, + "num_tokens": 49526.0, + "step": 2630 + }, + { + "epoch": 85.128, + "grad_norm": 2.3270950317382812, + "learning_rate": 9.187111867082568e-06, + "loss": 0.8709, + "mean_token_accuracy": 0.7843060026297698, + "num_tokens": 64196.0, + "step": 2640 + }, + { + "epoch": 85.448, + "grad_norm": 1.751816749572754, + "learning_rate": 9.124253040694334e-06, + "loss": 0.8058, + "mean_token_accuracy": 0.7842564310878515, + "num_tokens": 82640.0, + "step": 2650 + }, + { + "epoch": 85.768, + "grad_norm": 2.2109670639038086, + "learning_rate": 9.061429065547933e-06, + "loss": 0.7968, + "mean_token_accuracy": 0.7830525517463685, + "num_tokens": 96760.0, + "step": 2660 + }, + { + "epoch": 86.064, + "grad_norm": 1.9558287858963013, + "learning_rate": 8.998642441787417e-06, + "loss": 0.7927, + "mean_token_accuracy": 0.7956994892777624, + "num_tokens": 110140.0, + "step": 2670 + }, + { + "epoch": 86.384, + "grad_norm": 2.14022159576416, + "learning_rate": 8.935895668070405e-06, + "loss": 0.8324, + "mean_token_accuracy": 0.7832688026130199, + "num_tokens": 125468.0, + "step": 2680 + }, + { + "epoch": 86.704, + "grad_norm": 2.9838459491729736, + "learning_rate": 8.873191241468631e-06, + "loss": 0.8433, + "mean_token_accuracy": 0.7785748850554228, + "num_tokens": 141001.0, + "step": 2690 + }, + { + "epoch": 87.0, + "grad_norm": 3.1861932277679443, + "learning_rate": 8.810531657368594e-06, + "loss": 0.8058, + "mean_token_accuracy": 0.794649675891206, + "num_tokens": 156865.0, + "step": 2700 + }, + { + "epoch": 87.32, + "grad_norm": 1.9827327728271484, + "learning_rate": 8.747919409372236e-06, + "loss": 0.8459, + "mean_token_accuracy": 0.7892976485192775, + "num_tokens": 173221.0, + "step": 2710 + }, + { + "epoch": 87.64, + "grad_norm": 2.972670078277588, + "learning_rate": 8.685356989197717e-06, + "loss": 0.8232, + "mean_token_accuracy": 0.7798152294009923, + "num_tokens": 188954.0, + "step": 2720 + }, + { + "epoch": 87.96, + "grad_norm": 2.67842173576355, + "learning_rate": 8.62284688658023e-06, + "loss": 0.8046, + "mean_token_accuracy": 0.79255036637187, + "num_tokens": 205013.0, + "step": 2730 + }, + { + "epoch": 88.256, + "grad_norm": 1.817650556564331, + "learning_rate": 8.56039158917296e-06, + "loss": 0.8276, + "mean_token_accuracy": 0.7901485256246619, + "num_tokens": 219723.0, + "step": 2740 + }, + { + "epoch": 88.576, + "grad_norm": 1.7845501899719238, + "learning_rate": 8.497993582448044e-06, + "loss": 0.8554, + "mean_token_accuracy": 0.7930382348597049, + "num_tokens": 235112.0, + "step": 2750 + }, + { + "epoch": 88.896, + "grad_norm": 2.3108439445495605, + "learning_rate": 8.43565534959769e-06, + "loss": 0.8519, + "mean_token_accuracy": 0.7890813775360584, + "num_tokens": 252361.0, + "step": 2760 + }, + { + "epoch": 89.192, + "grad_norm": 1.8335771560668945, + "learning_rate": 8.373379371435346e-06, + "loss": 0.7812, + "mean_token_accuracy": 0.7960183253964862, + "num_tokens": 265617.0, + "step": 2770 + }, + { + "epoch": 89.512, + "grad_norm": 2.717653512954712, + "learning_rate": 8.31116812629696e-06, + "loss": 0.8319, + "mean_token_accuracy": 0.7901519671082496, + "num_tokens": 281770.0, + "step": 2780 + }, + { + "epoch": 89.832, + "grad_norm": 1.9572986364364624, + "learning_rate": 8.249024089942364e-06, + "loss": 0.7733, + "mean_token_accuracy": 0.7931222733110189, + "num_tokens": 298511.0, + "step": 2790 + }, + { + "epoch": 90.128, + "grad_norm": 1.8655132055282593, + "learning_rate": 8.186949735456758e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7922740490049929, + "num_tokens": 312957.0, + "step": 2800 + }, + { + "epoch": 90.448, + "grad_norm": 2.0918149948120117, + "learning_rate": 8.12494753315228e-06, + "loss": 0.8428, + "mean_token_accuracy": 0.7884520322084427, + "num_tokens": 330412.0, + "step": 2810 + }, + { + "epoch": 90.768, + "grad_norm": 1.7944889068603516, + "learning_rate": 8.063019950469688e-06, + "loss": 0.8145, + "mean_token_accuracy": 0.7932636447250843, + "num_tokens": 345474.0, + "step": 2820 + }, + { + "epoch": 91.064, + "grad_norm": 1.7774523496627808, + "learning_rate": 8.001169451880186e-06, + "loss": 0.7867, + "mean_token_accuracy": 0.7842674186906299, + "num_tokens": 360670.0, + "step": 2830 + }, + { + "epoch": 91.384, + "grad_norm": 2.441330909729004, + "learning_rate": 7.939398498787332e-06, + "loss": 0.835, + "mean_token_accuracy": 0.7940668806433677, + "num_tokens": 375578.0, + "step": 2840 + }, + { + "epoch": 91.704, + "grad_norm": 1.986222505569458, + "learning_rate": 7.877709549429092e-06, + "loss": 0.8162, + "mean_token_accuracy": 0.7950244933366776, + "num_tokens": 392683.0, + "step": 2850 + }, + { + "epoch": 92.0, + "grad_norm": 4.878885269165039, + "learning_rate": 7.816105058780019e-06, + "loss": 0.788, + "mean_token_accuracy": 0.7837782482037673, + "num_tokens": 407330.0, + "step": 2860 + }, + { + "epoch": 92.32, + "grad_norm": 2.343815326690674, + "learning_rate": 7.754587478453528e-06, + "loss": 0.7753, + "mean_token_accuracy": 0.7878943778574466, + "num_tokens": 420579.0, + "step": 2870 + }, + { + "epoch": 92.64, + "grad_norm": 2.5471577644348145, + "learning_rate": 7.69315925660436e-06, + "loss": 0.88, + "mean_token_accuracy": 0.791867159307003, + "num_tokens": 438517.0, + "step": 2880 + }, + { + "epoch": 92.96, + "grad_norm": 2.2550160884857178, + "learning_rate": 7.631822837831143e-06, + "loss": 0.8228, + "mean_token_accuracy": 0.7879139900207519, + "num_tokens": 455707.0, + "step": 2890 + }, + { + "epoch": 93.256, + "grad_norm": 2.0642154216766357, + "learning_rate": 7.570580663079114e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.7856367556629954, + "num_tokens": 469780.0, + "step": 2900 + }, + { + "epoch": 93.576, + "grad_norm": 2.1604714393615723, + "learning_rate": 7.509435169542961e-06, + "loss": 0.7849, + "mean_token_accuracy": 0.7887919537723065, + "num_tokens": 484586.0, + "step": 2910 + }, + { + "epoch": 93.896, + "grad_norm": 2.2268590927124023, + "learning_rate": 7.448388790569851e-06, + "loss": 0.8657, + "mean_token_accuracy": 0.7843763899058104, + "num_tokens": 502557.0, + "step": 2920 + }, + { + "epoch": 94.192, + "grad_norm": 1.8110442161560059, + "learning_rate": 7.387443955562586e-06, + "loss": 0.7889, + "mean_token_accuracy": 0.7898652823390188, + "num_tokens": 516331.0, + "step": 2930 + }, + { + "epoch": 94.512, + "grad_norm": 2.456662178039551, + "learning_rate": 7.326603089882925e-06, + "loss": 0.7788, + "mean_token_accuracy": 0.7980688564479351, + "num_tokens": 532511.0, + "step": 2940 + }, + { + "epoch": 94.832, + "grad_norm": 2.060681104660034, + "learning_rate": 7.26586861475506e-06, + "loss": 0.7585, + "mean_token_accuracy": 0.7954543896019459, + "num_tokens": 549222.0, + "step": 2950 + }, + { + "epoch": 95.128, + "grad_norm": 2.5429089069366455, + "learning_rate": 7.205242947169258e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.7921945170776264, + "num_tokens": 563980.0, + "step": 2960 + }, + { + "epoch": 95.448, + "grad_norm": 2.3039979934692383, + "learning_rate": 7.144728499785683e-06, + "loss": 0.7492, + "mean_token_accuracy": 0.801618828624487, + "num_tokens": 579326.0, + "step": 2970 + }, + { + "epoch": 95.768, + "grad_norm": 1.8464511632919312, + "learning_rate": 7.0843276808383785e-06, + "loss": 0.8439, + "mean_token_accuracy": 0.7837361056357622, + "num_tokens": 596726.0, + "step": 2980 + }, + { + "epoch": 96.064, + "grad_norm": 2.409407377243042, + "learning_rate": 7.024042894039434e-06, + "loss": 0.7315, + "mean_token_accuracy": 0.7905531976674054, + "num_tokens": 611478.0, + "step": 2990 + }, + { + "epoch": 96.384, + "grad_norm": 3.4677658081054688, + "learning_rate": 6.963876538483305e-06, + "loss": 0.7926, + "mean_token_accuracy": 0.7856792386621236, + "num_tokens": 626726.0, + "step": 3000 + }, + { + "epoch": 96.704, + "grad_norm": 2.2152152061462402, + "learning_rate": 6.9038310085513716e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.781861812621355, + "num_tokens": 641499.0, + "step": 3010 + }, + { + "epoch": 97.0, + "grad_norm": 2.4535887241363525, + "learning_rate": 6.843908693816627e-06, + "loss": 0.8416, + "mean_token_accuracy": 0.8028259905608924, + "num_tokens": 657795.0, + "step": 3020 + }, + { + "epoch": 97.32, + "grad_norm": 1.937121033668518, + "learning_rate": 6.784111978948596e-06, + "loss": 0.746, + "mean_token_accuracy": 0.7986438237130642, + "num_tokens": 673802.0, + "step": 3030 + }, + { + "epoch": 97.64, + "grad_norm": 1.616132140159607, + "learning_rate": 6.724443243618421e-06, + "loss": 0.8305, + "mean_token_accuracy": 0.7848228823393584, + "num_tokens": 690896.0, + "step": 3040 + }, + { + "epoch": 97.96, + "grad_norm": 2.3996787071228027, + "learning_rate": 6.664904862404175e-06, + "loss": 0.8508, + "mean_token_accuracy": 0.7884074129164219, + "num_tokens": 705680.0, + "step": 3050 + }, + { + "epoch": 98.256, + "grad_norm": 3.018188714981079, + "learning_rate": 6.605499204696351e-06, + "loss": 0.8035, + "mean_token_accuracy": 0.801042732354757, + "num_tokens": 720238.0, + "step": 3060 + }, + { + "epoch": 98.576, + "grad_norm": 2.550436496734619, + "learning_rate": 6.546228634603578e-06, + "loss": 0.7711, + "mean_token_accuracy": 0.798908605799079, + "num_tokens": 735457.0, + "step": 3070 + }, + { + "epoch": 98.896, + "grad_norm": 3.060084819793701, + "learning_rate": 6.487095510858543e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7785589572042226, + "num_tokens": 752742.0, + "step": 3080 + }, + { + "epoch": 99.192, + "grad_norm": 2.1915123462677, + "learning_rate": 6.428102186724101e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7807568505003646, + "num_tokens": 765549.0, + "step": 3090 + }, + { + "epoch": 99.512, + "grad_norm": 2.3755106925964355, + "learning_rate": 6.369251009899644e-06, + "loss": 0.7954, + "mean_token_accuracy": 0.788112024590373, + "num_tokens": 782597.0, + "step": 3100 + }, + { + "epoch": 99.832, + "grad_norm": 1.9347033500671387, + "learning_rate": 6.310544322427674e-06, + "loss": 0.8488, + "mean_token_accuracy": 0.8023913279175758, + "num_tokens": 799203.0, + "step": 3110 + }, + { + "epoch": 100.128, + "grad_norm": 2.046133279800415, + "learning_rate": 6.251984460600588e-06, + "loss": 0.7156, + "mean_token_accuracy": 0.7995543536302205, + "num_tokens": 813931.0, + "step": 3120 + }, + { + "epoch": 100.448, + "grad_norm": 2.557436943054199, + "learning_rate": 6.193573754867708e-06, + "loss": 0.7914, + "mean_token_accuracy": 0.8036689855158329, + "num_tokens": 830433.0, + "step": 3130 + }, + { + "epoch": 100.768, + "grad_norm": 2.666550636291504, + "learning_rate": 6.135314529742529e-06, + "loss": 0.8148, + "mean_token_accuracy": 0.79065520465374, + "num_tokens": 846129.0, + "step": 3140 + }, + { + "epoch": 101.064, + "grad_norm": 2.4647037982940674, + "learning_rate": 6.077209103710232e-06, + "loss": 0.8138, + "mean_token_accuracy": 0.7805173554130502, + "num_tokens": 860395.0, + "step": 3150 + }, + { + "epoch": 101.384, + "grad_norm": 1.9933632612228394, + "learning_rate": 6.019259789135404e-06, + "loss": 0.7916, + "mean_token_accuracy": 0.7982403136789799, + "num_tokens": 878034.0, + "step": 3160 + }, + { + "epoch": 101.704, + "grad_norm": 2.3307456970214844, + "learning_rate": 5.961468892170016e-06, + "loss": 0.7931, + "mean_token_accuracy": 0.7907839316874743, + "num_tokens": 892819.0, + "step": 3170 + }, + { + "epoch": 102.0, + "grad_norm": 4.047188758850098, + "learning_rate": 5.903838712661647e-06, + "loss": 0.7685, + "mean_token_accuracy": 0.7972375758596368, + "num_tokens": 908260.0, + "step": 3180 + }, + { + "epoch": 102.32, + "grad_norm": 1.9516690969467163, + "learning_rate": 5.846371544061962e-06, + "loss": 0.7943, + "mean_token_accuracy": 0.7980046071112156, + "num_tokens": 924521.0, + "step": 3190 + }, + { + "epoch": 102.64, + "grad_norm": 2.3500168323516846, + "learning_rate": 5.789069673335446e-06, + "loss": 0.7704, + "mean_token_accuracy": 0.8008730575442314, + "num_tokens": 940805.0, + "step": 3200 + }, + { + "epoch": 102.96, + "grad_norm": 1.9596396684646606, + "learning_rate": 5.731935380868381e-06, + "loss": 0.816, + "mean_token_accuracy": 0.7914111088961363, + "num_tokens": 957150.0, + "step": 3210 + }, + { + "epoch": 103.256, + "grad_norm": 2.2512779235839844, + "learning_rate": 5.674970940378102e-06, + "loss": 0.7422, + "mean_token_accuracy": 0.80112284542741, + "num_tokens": 970896.0, + "step": 3220 + }, + { + "epoch": 103.576, + "grad_norm": 2.6935369968414307, + "learning_rate": 5.618178618822512e-06, + "loss": 0.856, + "mean_token_accuracy": 0.7918466597795486, + "num_tokens": 986051.0, + "step": 3230 + }, + { + "epoch": 103.896, + "grad_norm": 2.1991372108459473, + "learning_rate": 5.561560676309874e-06, + "loss": 0.7392, + "mean_token_accuracy": 0.7981615476310253, + "num_tokens": 1001657.0, + "step": 3240 + }, + { + "epoch": 104.192, + "grad_norm": 2.4802706241607666, + "learning_rate": 5.505119366008847e-06, + "loss": 0.8709, + "mean_token_accuracy": 0.7797639261226397, + "num_tokens": 1018539.0, + "step": 3250 + }, + { + "epoch": 104.512, + "grad_norm": 2.416335105895996, + "learning_rate": 5.448856934058837e-06, + "loss": 0.7811, + "mean_token_accuracy": 0.802381145209074, + "num_tokens": 1035770.0, + "step": 3260 + }, + { + "epoch": 104.832, + "grad_norm": 1.9266993999481201, + "learning_rate": 5.392775619480606e-06, + "loss": 0.7597, + "mean_token_accuracy": 0.801979061216116, + "num_tokens": 1050287.0, + "step": 3270 + }, + { + "epoch": 105.128, + "grad_norm": 3.1635992527008057, + "learning_rate": 5.336877654087161e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.7894677262048464, + "num_tokens": 1063888.0, + "step": 3280 + }, + { + "epoch": 105.448, + "grad_norm": 2.317321300506592, + "learning_rate": 5.281165262394938e-06, + "loss": 0.8313, + "mean_token_accuracy": 0.7858642000705004, + "num_tokens": 1080743.0, + "step": 3290 + }, + { + "epoch": 105.768, + "grad_norm": 1.9168007373809814, + "learning_rate": 5.2256406615353015e-06, + "loss": 0.8397, + "mean_token_accuracy": 0.7893419295549393, + "num_tokens": 1097525.0, + "step": 3300 + }, + { + "epoch": 106.064, + "grad_norm": 1.7733817100524902, + "learning_rate": 5.170306061166254e-06, + "loss": 0.6765, + "mean_token_accuracy": 0.8171853680868406, + "num_tokens": 1112336.0, + "step": 3310 + }, + { + "epoch": 106.384, + "grad_norm": 2.349670648574829, + "learning_rate": 5.115163663384563e-06, + "loss": 0.7588, + "mean_token_accuracy": 0.789124884083867, + "num_tokens": 1126428.0, + "step": 3320 + }, + { + "epoch": 106.704, + "grad_norm": 1.7135353088378906, + "learning_rate": 5.060215662638084e-06, + "loss": 0.7926, + "mean_token_accuracy": 0.7968744553625584, + "num_tokens": 1142993.0, + "step": 3330 + }, + { + "epoch": 107.0, + "grad_norm": 6.969696044921875, + "learning_rate": 5.005464245638447e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.790745651399767, + "num_tokens": 1158725.0, + "step": 3340 + }, + { + "epoch": 107.32, + "grad_norm": 2.188507080078125, + "learning_rate": 4.9509115912740445e-06, + "loss": 0.7252, + "mean_token_accuracy": 0.8085566960275173, + "num_tokens": 1174330.0, + "step": 3350 + }, + { + "epoch": 107.64, + "grad_norm": 2.8108561038970947, + "learning_rate": 4.896559870523279e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7786224085837603, + "num_tokens": 1188830.0, + "step": 3360 + }, + { + "epoch": 107.96, + "grad_norm": 2.438131093978882, + "learning_rate": 4.842411246368226e-06, + "loss": 0.795, + "mean_token_accuracy": 0.8030483074486255, + "num_tokens": 1207364.0, + "step": 3370 + }, + { + "epoch": 108.256, + "grad_norm": 2.4335777759552, + "learning_rate": 4.788467873708508e-06, + "loss": 0.8032, + "mean_token_accuracy": 0.7993817514664417, + "num_tokens": 1223655.0, + "step": 3380 + }, + { + "epoch": 108.576, + "grad_norm": 2.748537540435791, + "learning_rate": 4.734731899275557e-06, + "loss": 0.8288, + "mean_token_accuracy": 0.7918653458356857, + "num_tokens": 1238999.0, + "step": 3390 + }, + { + "epoch": 108.896, + "grad_norm": 2.6951160430908203, + "learning_rate": 4.681205461547187e-06, + "loss": 0.7515, + "mean_token_accuracy": 0.8007099393755197, + "num_tokens": 1253439.0, + "step": 3400 + }, + { + "epoch": 109.992, + "grad_norm": 2.926764726638794, + "learning_rate": 4.62789069066248e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.7860011033713817, + "num_tokens": 16823.0, + "step": 3410 + }, + { + "epoch": 110.32, + "grad_norm": 2.0920846462249756, + "learning_rate": 4.574789708337018e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7867580187029954, + "num_tokens": 31381.0, + "step": 3420 + }, + { + "epoch": 110.64, + "grad_norm": 2.0056655406951904, + "learning_rate": 4.521904627778463e-06, + "loss": 0.7652, + "mean_token_accuracy": 0.801980373263359, + "num_tokens": 48922.0, + "step": 3430 + }, + { + "epoch": 110.96, + "grad_norm": 2.2784852981567383, + "learning_rate": 4.469237553602433e-06, + "loss": 0.7703, + "mean_token_accuracy": 0.7932860311120749, + "num_tokens": 65112.0, + "step": 3440 + }, + { + "epoch": 111.256, + "grad_norm": 2.1628239154815674, + "learning_rate": 4.416790581748766e-06, + "loss": 0.7087, + "mean_token_accuracy": 0.8054195183354456, + "num_tokens": 80534.0, + "step": 3450 + }, + { + "epoch": 111.576, + "grad_norm": 2.2064616680145264, + "learning_rate": 4.364565799398102e-06, + "loss": 0.7648, + "mean_token_accuracy": 0.7998192355036735, + "num_tokens": 97437.0, + "step": 3460 + }, + { + "epoch": 111.896, + "grad_norm": 2.387873888015747, + "learning_rate": 4.312565284888819e-06, + "loss": 0.81, + "mean_token_accuracy": 0.7902990553528071, + "num_tokens": 111663.0, + "step": 3470 + }, + { + "epoch": 112.192, + "grad_norm": 2.14197039604187, + "learning_rate": 4.2607911076343455e-06, + "loss": 0.7631, + "mean_token_accuracy": 0.7986521447027052, + "num_tokens": 127399.0, + "step": 3480 + }, + { + "epoch": 112.512, + "grad_norm": 2.459407091140747, + "learning_rate": 4.2092453280407605e-06, + "loss": 0.7647, + "mean_token_accuracy": 0.7940516691654921, + "num_tokens": 143237.0, + "step": 3490 + }, + { + "epoch": 112.832, + "grad_norm": 2.606563091278076, + "learning_rate": 4.157929997424853e-06, + "loss": 0.7958, + "mean_token_accuracy": 0.8024938710033893, + "num_tokens": 159484.0, + "step": 3500 + }, + { + "epoch": 113.128, + "grad_norm": 2.4822981357574463, + "learning_rate": 4.106847157932445e-06, + "loss": 0.7599, + "mean_token_accuracy": 0.8050464319216238, + "num_tokens": 172111.0, + "step": 3510 + }, + { + "epoch": 113.448, + "grad_norm": 2.0524537563323975, + "learning_rate": 4.0559988424571365e-06, + "loss": 0.7676, + "mean_token_accuracy": 0.797309584543109, + "num_tokens": 189542.0, + "step": 3520 + }, + { + "epoch": 113.768, + "grad_norm": 2.3543918132781982, + "learning_rate": 4.005387074559421e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.783655048161745, + "num_tokens": 207117.0, + "step": 3530 + }, + { + "epoch": 114.064, + "grad_norm": 2.1995062828063965, + "learning_rate": 3.9550138683861184e-06, + "loss": 0.7502, + "mean_token_accuracy": 0.8102532826565407, + "num_tokens": 220475.0, + "step": 3540 + }, + { + "epoch": 114.384, + "grad_norm": 2.4602413177490234, + "learning_rate": 3.904881228590253e-06, + "loss": 0.7807, + "mean_token_accuracy": 0.7911395899951458, + "num_tokens": 235641.0, + "step": 3550 + }, + { + "epoch": 114.704, + "grad_norm": 1.7863750457763672, + "learning_rate": 3.854991150251271e-06, + "loss": 0.8174, + "mean_token_accuracy": 0.7878696542233229, + "num_tokens": 252557.0, + "step": 3560 + }, + { + "epoch": 115.0, + "grad_norm": 2.476003885269165, + "learning_rate": 3.8053456187956315e-06, + "loss": 0.8084, + "mean_token_accuracy": 0.7975163347012287, + "num_tokens": 267658.0, + "step": 3570 + }, + { + "epoch": 115.32, + "grad_norm": 2.536450147628784, + "learning_rate": 3.7559466099178e-06, + "loss": 0.8019, + "mean_token_accuracy": 0.7918349288403987, + "num_tokens": 283710.0, + "step": 3580 + }, + { + "epoch": 115.64, + "grad_norm": 1.9077178239822388, + "learning_rate": 3.7067960895016277e-06, + "loss": 0.8318, + "mean_token_accuracy": 0.7928959406912327, + "num_tokens": 299210.0, + "step": 3590 + }, + { + "epoch": 115.96, + "grad_norm": 1.9973210096359253, + "learning_rate": 3.6578960135421117e-06, + "loss": 0.7243, + "mean_token_accuracy": 0.8063468877226114, + "num_tokens": 315732.0, + "step": 3600 + }, + { + "epoch": 116.256, + "grad_norm": 2.727466344833374, + "learning_rate": 3.6092483280675683e-06, + "loss": 0.7522, + "mean_token_accuracy": 0.814963810347222, + "num_tokens": 329521.0, + "step": 3610 + }, + { + "epoch": 116.576, + "grad_norm": 5.467685222625732, + "learning_rate": 3.5608549690621562e-06, + "loss": 0.8147, + "mean_token_accuracy": 0.7913577631115913, + "num_tokens": 346404.0, + "step": 3620 + }, + { + "epoch": 116.896, + "grad_norm": 2.106499433517456, + "learning_rate": 3.512717862388876e-06, + "loss": 0.8025, + "mean_token_accuracy": 0.7868743006139993, + "num_tokens": 362461.0, + "step": 3630 + }, + { + "epoch": 117.192, + "grad_norm": 1.8872570991516113, + "learning_rate": 3.464838923712891e-06, + "loss": 0.8173, + "mean_token_accuracy": 0.7879228088501338, + "num_tokens": 378441.0, + "step": 3640 + }, + { + "epoch": 117.512, + "grad_norm": 2.7338979244232178, + "learning_rate": 3.4172200584253077e-06, + "loss": 0.7152, + "mean_token_accuracy": 0.8083719074726105, + "num_tokens": 392976.0, + "step": 3650 + }, + { + "epoch": 117.832, + "grad_norm": 2.1551365852355957, + "learning_rate": 3.369863161567363e-06, + "loss": 0.8131, + "mean_token_accuracy": 0.7942818276584148, + "num_tokens": 409377.0, + "step": 3660 + }, + { + "epoch": 118.128, + "grad_norm": 2.206017255783081, + "learning_rate": 3.322770117754963e-06, + "loss": 0.7659, + "mean_token_accuracy": 0.8010066038853413, + "num_tokens": 423629.0, + "step": 3670 + }, + { + "epoch": 118.448, + "grad_norm": 2.915071964263916, + "learning_rate": 3.2759428011037454e-06, + "loss": 0.7366, + "mean_token_accuracy": 0.7961924949660897, + "num_tokens": 438934.0, + "step": 3680 + }, + { + "epoch": 118.768, + "grad_norm": 2.040217876434326, + "learning_rate": 3.229383075154445e-06, + "loss": 0.7838, + "mean_token_accuracy": 0.8073360413312912, + "num_tokens": 456515.0, + "step": 3690 + }, + { + "epoch": 119.064, + "grad_norm": 2.2024896144866943, + "learning_rate": 3.18309279279876e-06, + "loss": 0.8328, + "mean_token_accuracy": 0.7914658515034495, + "num_tokens": 471476.0, + "step": 3700 + }, + { + "epoch": 119.384, + "grad_norm": 2.4495859146118164, + "learning_rate": 3.137073796205601e-06, + "loss": 0.8175, + "mean_token_accuracy": 0.8003936596214771, + "num_tokens": 486665.0, + "step": 3710 + }, + { + "epoch": 119.704, + "grad_norm": 2.1028919219970703, + "learning_rate": 3.0913279167477916e-06, + "loss": 0.8224, + "mean_token_accuracy": 0.8022113911807537, + "num_tokens": 504053.0, + "step": 3720 + }, + { + "epoch": 120.0, + "grad_norm": 4.113481521606445, + "learning_rate": 3.0458569749291743e-06, + "loss": 0.7272, + "mean_token_accuracy": 0.8031640326654589, + "num_tokens": 518123.0, + "step": 3730 + }, + { + "epoch": 120.32, + "grad_norm": 2.537456750869751, + "learning_rate": 3.000662780312178e-06, + "loss": 0.7771, + "mean_token_accuracy": 0.7974813230335712, + "num_tokens": 534453.0, + "step": 3740 + }, + { + "epoch": 120.64, + "grad_norm": 2.442348003387451, + "learning_rate": 2.9557471314457866e-06, + "loss": 0.7646, + "mean_token_accuracy": 0.8017565876245498, + "num_tokens": 549839.0, + "step": 3750 + }, + { + "epoch": 120.96, + "grad_norm": 1.9900853633880615, + "learning_rate": 2.9111118157939745e-06, + "loss": 0.8154, + "mean_token_accuracy": 0.8000222038477659, + "num_tokens": 566121.0, + "step": 3760 + }, + { + "epoch": 121.256, + "grad_norm": 2.138803005218506, + "learning_rate": 2.866758609664572e-06, + "loss": 0.7823, + "mean_token_accuracy": 0.8003743641279839, + "num_tokens": 581873.0, + "step": 3770 + }, + { + "epoch": 121.576, + "grad_norm": 2.2369134426116943, + "learning_rate": 2.8226892781385673e-06, + "loss": 0.8084, + "mean_token_accuracy": 0.7941499546170234, + "num_tokens": 597192.0, + "step": 3780 + }, + { + "epoch": 121.896, + "grad_norm": 2.110830783843994, + "learning_rate": 2.7789055749998863e-06, + "loss": 0.8422, + "mean_token_accuracy": 0.7905905708670616, + "num_tokens": 612868.0, + "step": 3790 + }, + { + "epoch": 122.192, + "grad_norm": 1.9145809412002563, + "learning_rate": 2.7354092426655565e-06, + "loss": 0.737, + "mean_token_accuracy": 0.8060021779021701, + "num_tokens": 628187.0, + "step": 3800 + }, + { + "epoch": 122.512, + "grad_norm": 2.281447649002075, + "learning_rate": 2.6922020121164182e-06, + "loss": 0.7426, + "mean_token_accuracy": 0.8067140795290471, + "num_tokens": 645037.0, + "step": 3810 + }, + { + "epoch": 122.832, + "grad_norm": 2.424180030822754, + "learning_rate": 2.6492856028281956e-06, + "loss": 0.7792, + "mean_token_accuracy": 0.8041324406862259, + "num_tokens": 660729.0, + "step": 3820 + }, + { + "epoch": 123.128, + "grad_norm": 2.1080105304718018, + "learning_rate": 2.606661722703084e-06, + "loss": 0.8608, + "mean_token_accuracy": 0.7844757274598688, + "num_tokens": 675782.0, + "step": 3830 + }, + { + "epoch": 123.448, + "grad_norm": 1.995034098625183, + "learning_rate": 2.5643320680018012e-06, + "loss": 0.7897, + "mean_token_accuracy": 0.8024365931749344, + "num_tokens": 693109.0, + "step": 3840 + }, + { + "epoch": 123.768, + "grad_norm": 2.0973665714263916, + "learning_rate": 2.522298323276039e-06, + "loss": 0.7599, + "mean_token_accuracy": 0.7944286055862904, + "num_tokens": 707242.0, + "step": 3850 + }, + { + "epoch": 124.064, + "grad_norm": 2.0112831592559814, + "learning_rate": 2.480562161301464e-06, + "loss": 0.7615, + "mean_token_accuracy": 0.8035188607267432, + "num_tokens": 721297.0, + "step": 3860 + }, + { + "epoch": 124.384, + "grad_norm": 2.4737842082977295, + "learning_rate": 2.4391252430111388e-06, + "loss": 0.8339, + "mean_token_accuracy": 0.7951025106012821, + "num_tokens": 738725.0, + "step": 3870 + }, + { + "epoch": 124.704, + "grad_norm": 2.4330711364746094, + "learning_rate": 2.3979892174294105e-06, + "loss": 0.8477, + "mean_token_accuracy": 0.7900912150740623, + "num_tokens": 754563.0, + "step": 3880 + }, + { + "epoch": 125.0, + "grad_norm": 5.609644412994385, + "learning_rate": 2.3571557216062967e-06, + "loss": 0.659, + "mean_token_accuracy": 0.8116728329175228, + "num_tokens": 768588.0, + "step": 3890 + }, + { + "epoch": 125.32, + "grad_norm": 2.646641254425049, + "learning_rate": 2.316626380552337e-06, + "loss": 0.7614, + "mean_token_accuracy": 0.8065689295530319, + "num_tokens": 785761.0, + "step": 3900 + }, + { + "epoch": 125.64, + "grad_norm": 2.1917035579681396, + "learning_rate": 2.2764028071739162e-06, + "loss": 0.7887, + "mean_token_accuracy": 0.7956257075071335, + "num_tokens": 800456.0, + "step": 3910 + }, + { + "epoch": 125.96, + "grad_norm": 2.0703041553497314, + "learning_rate": 2.236486602209097e-06, + "loss": 0.7823, + "mean_token_accuracy": 0.7975949931889772, + "num_tokens": 817034.0, + "step": 3920 + }, + { + "epoch": 126.256, + "grad_norm": 3.7488138675689697, + "learning_rate": 2.1968793541638877e-06, + "loss": 0.7804, + "mean_token_accuracy": 0.7973396097486084, + "num_tokens": 831010.0, + "step": 3930 + }, + { + "epoch": 126.576, + "grad_norm": 1.8520132303237915, + "learning_rate": 2.1575826392490507e-06, + "loss": 0.8349, + "mean_token_accuracy": 0.7970767199993134, + "num_tokens": 847034.0, + "step": 3940 + }, + { + "epoch": 126.896, + "grad_norm": 2.562690258026123, + "learning_rate": 2.118598021317362e-06, + "loss": 0.7205, + "mean_token_accuracy": 0.8056481756269932, + "num_tokens": 862562.0, + "step": 3950 + }, + { + "epoch": 127.192, + "grad_norm": 2.3542673587799072, + "learning_rate": 2.07992705180138e-06, + "loss": 0.7441, + "mean_token_accuracy": 0.7991788826278738, + "num_tokens": 876729.0, + "step": 3960 + }, + { + "epoch": 127.512, + "grad_norm": 2.78625750541687, + "learning_rate": 2.0415712696517155e-06, + "loss": 0.8173, + "mean_token_accuracy": 0.7964733302593231, + "num_tokens": 895143.0, + "step": 3970 + }, + { + "epoch": 127.832, + "grad_norm": 2.283376455307007, + "learning_rate": 2.00353220127576e-06, + "loss": 0.7863, + "mean_token_accuracy": 0.8018839418888092, + "num_tokens": 911917.0, + "step": 3980 + }, + { + "epoch": 128.128, + "grad_norm": 2.6434993743896484, + "learning_rate": 1.965811360476967e-06, + "loss": 0.784, + "mean_token_accuracy": 0.7953068952302675, + "num_tokens": 924458.0, + "step": 3990 + }, + { + "epoch": 128.448, + "grad_norm": 2.869091033935547, + "learning_rate": 1.9284102483946042e-06, + "loss": 0.7912, + "mean_token_accuracy": 0.7926684629172087, + "num_tokens": 939111.0, + "step": 4000 + }, + { + "epoch": 128.768, + "grad_norm": 1.8905816078186035, + "learning_rate": 1.8913303534440019e-06, + "loss": 0.7592, + "mean_token_accuracy": 0.8064320608973503, + "num_tokens": 956948.0, + "step": 4010 + }, + { + "epoch": 129.064, + "grad_norm": 2.6615140438079834, + "learning_rate": 1.8545731512573317e-06, + "loss": 0.725, + "mean_token_accuracy": 0.8079512683120934, + "num_tokens": 973125.0, + "step": 4020 + }, + { + "epoch": 129.384, + "grad_norm": 1.934066653251648, + "learning_rate": 1.8181401046248748e-06, + "loss": 0.8115, + "mean_token_accuracy": 0.7953744523227215, + "num_tokens": 988824.0, + "step": 4030 + }, + { + "epoch": 129.704, + "grad_norm": 2.660801410675049, + "learning_rate": 1.7820326634368124e-06, + "loss": 0.7548, + "mean_token_accuracy": 0.8010114066302776, + "num_tokens": 1003868.0, + "step": 4040 + }, + { + "epoch": 130.0, + "grad_norm": 5.5373663902282715, + "learning_rate": 1.7462522646255319e-06, + "loss": 0.8302, + "mean_token_accuracy": 0.7884616557810757, + "num_tokens": 1019053.0, + "step": 4050 + }, + { + "epoch": 130.32, + "grad_norm": 2.5071728229522705, + "learning_rate": 1.7108003321084299e-06, + "loss": 0.7475, + "mean_token_accuracy": 0.80089957639575, + "num_tokens": 1035776.0, + "step": 4060 + }, + { + "epoch": 130.64, + "grad_norm": 3.022608518600464, + "learning_rate": 1.675678276731253e-06, + "loss": 0.7242, + "mean_token_accuracy": 0.7966047372668982, + "num_tokens": 1049970.0, + "step": 4070 + }, + { + "epoch": 130.96, + "grad_norm": 2.2591309547424316, + "learning_rate": 1.6408874962119526e-06, + "loss": 0.8193, + "mean_token_accuracy": 0.7990051701664924, + "num_tokens": 1066724.0, + "step": 4080 + }, + { + "epoch": 131.256, + "grad_norm": 1.9211347103118896, + "learning_rate": 1.606429375085058e-06, + "loss": 0.8557, + "mean_token_accuracy": 0.7852608090316927, + "num_tokens": 1081511.0, + "step": 4090 + }, + { + "epoch": 131.576, + "grad_norm": 2.843675136566162, + "learning_rate": 1.572305284646587e-06, + "loss": 0.744, + "mean_token_accuracy": 0.8146931059658528, + "num_tokens": 1097216.0, + "step": 4100 + }, + { + "epoch": 131.896, + "grad_norm": 1.981909990310669, + "learning_rate": 1.538516582899453e-06, + "loss": 0.7557, + "mean_token_accuracy": 0.80064931884408, + "num_tokens": 1114422.0, + "step": 4110 + }, + { + "epoch": 132.192, + "grad_norm": 2.5102710723876953, + "learning_rate": 1.505064614499443e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7895874824072864, + "num_tokens": 1130322.0, + "step": 4120 + }, + { + "epoch": 132.512, + "grad_norm": 2.168246269226074, + "learning_rate": 1.4719507107017005e-06, + "loss": 0.7341, + "mean_token_accuracy": 0.8004750736057759, + "num_tokens": 1145434.0, + "step": 4130 + }, + { + "epoch": 132.832, + "grad_norm": 2.0837020874023438, + "learning_rate": 1.439176189307735e-06, + "loss": 0.7871, + "mean_token_accuracy": 0.8010189373046159, + "num_tokens": 1161082.0, + "step": 4140 + }, + { + "epoch": 133.128, + "grad_norm": 2.729126453399658, + "learning_rate": 1.406742354613e-06, + "loss": 0.7355, + "mean_token_accuracy": 0.8093010356297364, + "num_tokens": 1176663.0, + "step": 4150 + }, + { + "epoch": 133.448, + "grad_norm": 2.5754497051239014, + "learning_rate": 1.3746504973549613e-06, + "loss": 0.7523, + "mean_token_accuracy": 0.8098891712725163, + "num_tokens": 1192788.0, + "step": 4160 + }, + { + "epoch": 133.768, + "grad_norm": 3.8127379417419434, + "learning_rate": 1.34290189466175e-06, + "loss": 0.7298, + "mean_token_accuracy": 0.7972260743379593, + "num_tokens": 1207918.0, + "step": 4170 + }, + { + "epoch": 134.064, + "grad_norm": 2.3844289779663086, + "learning_rate": 1.3114978100013376e-06, + "loss": 0.8014, + "mean_token_accuracy": 0.7910445159351503, + "num_tokens": 1222595.0, + "step": 4180 + }, + { + "epoch": 134.384, + "grad_norm": 2.5769917964935303, + "learning_rate": 1.2804394931312446e-06, + "loss": 0.6971, + "mean_token_accuracy": 0.8134579740464687, + "num_tokens": 1238530.0, + "step": 4190 + }, + { + "epoch": 134.704, + "grad_norm": 2.293607234954834, + "learning_rate": 1.2497281800488092e-06, + "loss": 0.8715, + "mean_token_accuracy": 0.7885241828858852, + "num_tokens": 1255860.0, + "step": 4200 + }, + { + "epoch": 135.0, + "grad_norm": 9.035529136657715, + "learning_rate": 1.219365092942003e-06, + "loss": 0.7438, + "mean_token_accuracy": 0.7974654679765573, + "num_tokens": 1269518.0, + "step": 4210 + }, + { + "epoch": 135.32, + "grad_norm": 2.2957992553710938, + "learning_rate": 1.189351440140788e-06, + "loss": 0.8218, + "mean_token_accuracy": 0.7884160943329335, + "num_tokens": 1285778.0, + "step": 4220 + }, + { + "epoch": 135.64, + "grad_norm": 2.6251449584960938, + "learning_rate": 1.159688416069038e-06, + "loss": 0.7602, + "mean_token_accuracy": 0.8091854326426983, + "num_tokens": 1301508.0, + "step": 4230 + }, + { + "epoch": 135.96, + "grad_norm": 2.2512965202331543, + "learning_rate": 1.1303772011969928e-06, + "loss": 0.7572, + "mean_token_accuracy": 0.8062782268971205, + "num_tokens": 1317471.0, + "step": 4240 + }, + { + "epoch": 136.256, + "grad_norm": 2.4756977558135986, + "learning_rate": 1.1014189619942905e-06, + "loss": 0.7439, + "mean_token_accuracy": 0.8019913362490164, + "num_tokens": 1333505.0, + "step": 4250 + }, + { + "epoch": 136.576, + "grad_norm": 3.272853374481201, + "learning_rate": 1.0728148508835424e-06, + "loss": 0.8706, + "mean_token_accuracy": 0.7980553403496742, + "num_tokens": 1349049.0, + "step": 4260 + }, + { + "epoch": 136.896, + "grad_norm": 2.3439319133758545, + "learning_rate": 1.0445660061944684e-06, + "loss": 0.7531, + "mean_token_accuracy": 0.7939603064209223, + "num_tokens": 1365036.0, + "step": 4270 + }, + { + "epoch": 137.192, + "grad_norm": 2.6307787895202637, + "learning_rate": 1.01667355211861e-06, + "loss": 0.7414, + "mean_token_accuracy": 0.8034155070781708, + "num_tokens": 1379248.0, + "step": 4280 + }, + { + "epoch": 137.512, + "grad_norm": 2.3606624603271484, + "learning_rate": 9.891385986645675e-07, + "loss": 0.6851, + "mean_token_accuracy": 0.8086177695542573, + "num_tokens": 1394888.0, + "step": 4290 + }, + { + "epoch": 137.832, + "grad_norm": 2.0615415573120117, + "learning_rate": 9.619622416138475e-07, + "loss": 0.8681, + "mean_token_accuracy": 0.7964320003986358, + "num_tokens": 1410073.0, + "step": 4300 + }, + { + "epoch": 138.128, + "grad_norm": 2.324324131011963, + "learning_rate": 9.351455624772487e-07, + "loss": 0.7584, + "mean_token_accuracy": 0.8120040643859554, + "num_tokens": 1427006.0, + "step": 4310 + }, + { + "epoch": 138.448, + "grad_norm": 2.1296613216400146, + "learning_rate": 9.086896284518198e-07, + "loss": 0.7939, + "mean_token_accuracy": 0.802574060857296, + "num_tokens": 1442836.0, + "step": 4320 + }, + { + "epoch": 138.768, + "grad_norm": 2.3658032417297363, + "learning_rate": 8.825954923783875e-07, + "loss": 0.8059, + "mean_token_accuracy": 0.7980705320835113, + "num_tokens": 1457625.0, + "step": 4330 + }, + { + "epoch": 139.064, + "grad_norm": 2.4179608821868896, + "learning_rate": 8.568641926996646e-07, + "loss": 0.8155, + "mean_token_accuracy": 0.7932797682446402, + "num_tokens": 1473259.0, + "step": 4340 + }, + { + "epoch": 139.384, + "grad_norm": 2.3579256534576416, + "learning_rate": 8.314967534189166e-07, + "loss": 0.8503, + "mean_token_accuracy": 0.7952963810414075, + "num_tokens": 1490309.0, + "step": 4350 + }, + { + "epoch": 139.704, + "grad_norm": 2.3982760906219482, + "learning_rate": 8.064941840592178e-07, + "loss": 0.6937, + "mean_token_accuracy": 0.8161114897578955, + "num_tokens": 1505580.0, + "step": 4360 + }, + { + "epoch": 140.0, + "grad_norm": 3.710239887237549, + "learning_rate": 7.818574796232714e-07, + "loss": 0.776, + "mean_token_accuracy": 0.789946156579095, + "num_tokens": 1519983.0, + "step": 4370 + }, + { + "epoch": 140.32, + "grad_norm": 2.348114013671875, + "learning_rate": 7.575876205538113e-07, + "loss": 0.8371, + "mean_token_accuracy": 0.7868118450045586, + "num_tokens": 1535154.0, + "step": 4380 + }, + { + "epoch": 140.64, + "grad_norm": 1.7529124021530151, + "learning_rate": 7.336855726945891e-07, + "loss": 0.8106, + "mean_token_accuracy": 0.7890769924968482, + "num_tokens": 1552288.0, + "step": 4390 + }, + { + "epoch": 140.96, + "grad_norm": 2.1464691162109375, + "learning_rate": 7.101522872519306e-07, + "loss": 0.7481, + "mean_token_accuracy": 0.8147139415144921, + "num_tokens": 1567833.0, + "step": 4400 + }, + { + "epoch": 8.304941176470589, + "grad_norm": 3.626028537750244, + "learning_rate": 1.6054562751771983e-05, + "loss": 1.8624, + "mean_token_accuracy": 0.42639462910592557, + "num_tokens": 12548.0, + "step": 4410 + }, + { + "epoch": 8.323764705882352, + "grad_norm": 2.1800546646118164, + "learning_rate": 1.6037858352792722e-05, + "loss": 1.5835, + "mean_token_accuracy": 0.4775951974093914, + "num_tokens": 25755.0, + "step": 4420 + }, + { + "epoch": 8.342588235294118, + "grad_norm": 1.870821475982666, + "learning_rate": 1.602112739804461e-05, + "loss": 1.4872, + "mean_token_accuracy": 0.48393381759524345, + "num_tokens": 38667.0, + "step": 4430 + }, + { + "epoch": 8.361411764705883, + "grad_norm": 1.9865084886550903, + "learning_rate": 1.6004369961113897e-05, + "loss": 1.4383, + "mean_token_accuracy": 0.4954090975224972, + "num_tokens": 51649.0, + "step": 4440 + }, + { + "epoch": 8.380235294117647, + "grad_norm": 1.759982943534851, + "learning_rate": 1.5987586115703306e-05, + "loss": 1.4358, + "mean_token_accuracy": 0.5117561783641577, + "num_tokens": 66035.0, + "step": 4450 + }, + { + "epoch": 8.399058823529412, + "grad_norm": 1.9075089693069458, + "learning_rate": 1.5970775935631717e-05, + "loss": 1.3555, + "mean_token_accuracy": 0.5182104598730802, + "num_tokens": 79576.0, + "step": 4460 + }, + { + "epoch": 8.417882352941177, + "grad_norm": 1.3230409622192383, + "learning_rate": 1.5953939494833832e-05, + "loss": 1.3668, + "mean_token_accuracy": 0.519798369705677, + "num_tokens": 92225.0, + "step": 4470 + }, + { + "epoch": 8.43670588235294, + "grad_norm": 2.239945650100708, + "learning_rate": 1.5937076867359852e-05, + "loss": 1.3048, + "mean_token_accuracy": 0.5348641883581877, + "num_tokens": 105922.0, + "step": 4480 + }, + { + "epoch": 8.455529411764706, + "grad_norm": 1.7611688375473022, + "learning_rate": 1.5920188127375152e-05, + "loss": 1.3466, + "mean_token_accuracy": 0.521543862298131, + "num_tokens": 119527.0, + "step": 4490 + }, + { + "epoch": 8.47435294117647, + "grad_norm": 1.486075520515442, + "learning_rate": 1.5903273349159958e-05, + "loss": 1.3115, + "mean_token_accuracy": 0.5352868799120187, + "num_tokens": 133451.0, + "step": 4500 + }, + { + "epoch": 8.493176470588235, + "grad_norm": 1.6955538988113403, + "learning_rate": 1.5886332607109017e-05, + "loss": 1.3427, + "mean_token_accuracy": 0.5247942265123129, + "num_tokens": 147565.0, + "step": 4510 + }, + { + "epoch": 8.512, + "grad_norm": 1.5570602416992188, + "learning_rate": 1.5869365975731267e-05, + "loss": 1.2547, + "mean_token_accuracy": 0.5451988846063613, + "num_tokens": 160377.0, + "step": 4520 + }, + { + "epoch": 8.530823529411764, + "grad_norm": 1.4915376901626587, + "learning_rate": 1.585237352964952e-05, + "loss": 1.358, + "mean_token_accuracy": 0.526292197033763, + "num_tokens": 174242.0, + "step": 4530 + }, + { + "epoch": 8.54964705882353, + "grad_norm": 1.603037714958191, + "learning_rate": 1.583535534360012e-05, + "loss": 1.2699, + "mean_token_accuracy": 0.5363341204822063, + "num_tokens": 187399.0, + "step": 4540 + }, + { + "epoch": 8.568470588235295, + "grad_norm": 2.1536943912506104, + "learning_rate": 1.581831149243262e-05, + "loss": 1.2976, + "mean_token_accuracy": 0.5274909067898989, + "num_tokens": 200947.0, + "step": 4550 + }, + { + "epoch": 8.587294117647058, + "grad_norm": 1.611542820930481, + "learning_rate": 1.580124205110946e-05, + "loss": 1.2672, + "mean_token_accuracy": 0.5402051657438278, + "num_tokens": 214010.0, + "step": 4560 + }, + { + "epoch": 8.606117647058824, + "grad_norm": 1.5133346319198608, + "learning_rate": 1.578414709470562e-05, + "loss": 1.3097, + "mean_token_accuracy": 0.5325882468372584, + "num_tokens": 227829.0, + "step": 4570 + }, + { + "epoch": 8.624941176470589, + "grad_norm": 1.4743294715881348, + "learning_rate": 1.576702669840832e-05, + "loss": 1.2504, + "mean_token_accuracy": 0.5380570895969867, + "num_tokens": 240838.0, + "step": 4580 + }, + { + "epoch": 8.643764705882353, + "grad_norm": 1.4610170125961304, + "learning_rate": 1.5749880937516647e-05, + "loss": 1.2727, + "mean_token_accuracy": 0.5317132595926524, + "num_tokens": 253845.0, + "step": 4590 + }, + { + "epoch": 8.662588235294118, + "grad_norm": 1.8520996570587158, + "learning_rate": 1.573270988744126e-05, + "loss": 1.2746, + "mean_token_accuracy": 0.5325201127678156, + "num_tokens": 266058.0, + "step": 4600 + }, + { + "epoch": 8.681411764705881, + "grad_norm": 2.241778612136841, + "learning_rate": 1.5715513623704052e-05, + "loss": 1.2703, + "mean_token_accuracy": 0.5311647448688745, + "num_tokens": 279149.0, + "step": 4610 + }, + { + "epoch": 8.700235294117647, + "grad_norm": 1.921618938446045, + "learning_rate": 1.5698292221937787e-05, + "loss": 1.2823, + "mean_token_accuracy": 0.5341210236772895, + "num_tokens": 293451.0, + "step": 4620 + }, + { + "epoch": 8.719058823529412, + "grad_norm": 1.5892717838287354, + "learning_rate": 1.5681045757885817e-05, + "loss": 1.2531, + "mean_token_accuracy": 0.5422347262501717, + "num_tokens": 306138.0, + "step": 4630 + }, + { + "epoch": 8.737882352941176, + "grad_norm": 1.6042686700820923, + "learning_rate": 1.566377430740171e-05, + "loss": 1.2764, + "mean_token_accuracy": 0.5341788738965988, + "num_tokens": 319399.0, + "step": 4640 + }, + { + "epoch": 8.756705882352941, + "grad_norm": 2.2580060958862305, + "learning_rate": 1.5646477946448927e-05, + "loss": 1.2348, + "mean_token_accuracy": 0.5448929745703935, + "num_tokens": 332882.0, + "step": 4650 + }, + { + "epoch": 8.775529411764706, + "grad_norm": 1.2103066444396973, + "learning_rate": 1.5629156751100502e-05, + "loss": 1.2542, + "mean_token_accuracy": 0.545468046143651, + "num_tokens": 345343.0, + "step": 4660 + }, + { + "epoch": 8.79435294117647, + "grad_norm": 0.8362689018249512, + "learning_rate": 1.561181079753868e-05, + "loss": 1.2827, + "mean_token_accuracy": 0.5429604861885309, + "num_tokens": 358912.0, + "step": 4670 + }, + { + "epoch": 8.813176470588235, + "grad_norm": 1.158046841621399, + "learning_rate": 1.5594440162054615e-05, + "loss": 1.2471, + "mean_token_accuracy": 0.5337832469493151, + "num_tokens": 372248.0, + "step": 4680 + }, + { + "epoch": 8.832, + "grad_norm": 1.3598729372024536, + "learning_rate": 1.557704492104801e-05, + "loss": 1.3124, + "mean_token_accuracy": 0.5272687204182148, + "num_tokens": 386263.0, + "step": 4690 + }, + { + "epoch": 8.850823529411764, + "grad_norm": 1.7355713844299316, + "learning_rate": 1.5559625151026785e-05, + "loss": 1.3023, + "mean_token_accuracy": 0.524540626257658, + "num_tokens": 399314.0, + "step": 4700 + }, + { + "epoch": 8.86964705882353, + "grad_norm": 1.342244267463684, + "learning_rate": 1.5542180928606747e-05, + "loss": 1.2199, + "mean_token_accuracy": 0.5468841027468443, + "num_tokens": 413612.0, + "step": 4710 + }, + { + "epoch": 8.888470588235293, + "grad_norm": 1.322409987449646, + "learning_rate": 1.5524712330511246e-05, + "loss": 1.2383, + "mean_token_accuracy": 0.5588106140494347, + "num_tokens": 427389.0, + "step": 4720 + }, + { + "epoch": 8.907294117647059, + "grad_norm": 1.3516113758087158, + "learning_rate": 1.5507219433570848e-05, + "loss": 1.2482, + "mean_token_accuracy": 0.5358951542526483, + "num_tokens": 440751.0, + "step": 4730 + }, + { + "epoch": 8.926117647058824, + "grad_norm": 1.5260019302368164, + "learning_rate": 1.5489702314722986e-05, + "loss": 1.2168, + "mean_token_accuracy": 0.5595146797597408, + "num_tokens": 453892.0, + "step": 4740 + }, + { + "epoch": 8.944941176470588, + "grad_norm": 1.5382399559020996, + "learning_rate": 1.547216105101162e-05, + "loss": 1.2772, + "mean_token_accuracy": 0.531356817483902, + "num_tokens": 468069.0, + "step": 4750 + }, + { + "epoch": 8.963764705882353, + "grad_norm": 1.362877368927002, + "learning_rate": 1.5454595719586926e-05, + "loss": 1.2325, + "mean_token_accuracy": 0.5457029201090335, + "num_tokens": 480208.0, + "step": 4760 + }, + { + "epoch": 8.982588235294118, + "grad_norm": 1.0237706899642944, + "learning_rate": 1.543700639770491e-05, + "loss": 1.2282, + "mean_token_accuracy": 0.542092502117157, + "num_tokens": 493653.0, + "step": 4770 + }, + { + "epoch": 9.001882352941177, + "grad_norm": 3.654766082763672, + "learning_rate": 1.5419393162727105e-05, + "loss": 1.3508, + "mean_token_accuracy": 0.5442763832284183, + "num_tokens": 507301.0, + "step": 4780 + }, + { + "epoch": 9.02070588235294, + "grad_norm": 1.20900297164917, + "learning_rate": 1.5401756092120215e-05, + "loss": 1.2509, + "mean_token_accuracy": 0.5424667615443468, + "num_tokens": 520131.0, + "step": 4790 + }, + { + "epoch": 9.039529411764706, + "grad_norm": 1.2988379001617432, + "learning_rate": 1.5384095263455782e-05, + "loss": 1.2669, + "mean_token_accuracy": 0.5415636003017426, + "num_tokens": 533609.0, + "step": 4800 + }, + { + "epoch": 9.058352941176471, + "grad_norm": 2.150287628173828, + "learning_rate": 1.5366410754409837e-05, + "loss": 1.2693, + "mean_token_accuracy": 0.5377780050039291, + "num_tokens": 547255.0, + "step": 4810 + }, + { + "epoch": 9.077176470588235, + "grad_norm": 1.0066241025924683, + "learning_rate": 1.5348702642762563e-05, + "loss": 1.2117, + "mean_token_accuracy": 0.5584665209054946, + "num_tokens": 560812.0, + "step": 4820 + }, + { + "epoch": 9.096, + "grad_norm": 1.0327008962631226, + "learning_rate": 1.5330971006397962e-05, + "loss": 1.1753, + "mean_token_accuracy": 0.5651697292923927, + "num_tokens": 574553.0, + "step": 4830 + }, + { + "epoch": 9.114823529411765, + "grad_norm": 1.200286865234375, + "learning_rate": 1.5313215923303482e-05, + "loss": 1.2833, + "mean_token_accuracy": 0.5195233155041933, + "num_tokens": 587992.0, + "step": 4840 + }, + { + "epoch": 9.133647058823529, + "grad_norm": 0.9596078991889954, + "learning_rate": 1.5295437471569714e-05, + "loss": 1.2403, + "mean_token_accuracy": 0.538974242284894, + "num_tokens": 602053.0, + "step": 4850 + }, + { + "epoch": 9.152470588235294, + "grad_norm": 1.0736156702041626, + "learning_rate": 1.5277635729390022e-05, + "loss": 1.2346, + "mean_token_accuracy": 0.5428169660270215, + "num_tokens": 616596.0, + "step": 4860 + }, + { + "epoch": 9.171294117647058, + "grad_norm": 1.773108959197998, + "learning_rate": 1.5259810775060202e-05, + "loss": 1.2516, + "mean_token_accuracy": 0.5292404495179653, + "num_tokens": 629154.0, + "step": 4870 + }, + { + "epoch": 9.190117647058823, + "grad_norm": 1.4007513523101807, + "learning_rate": 1.524196268697815e-05, + "loss": 1.1987, + "mean_token_accuracy": 0.5568405143916607, + "num_tokens": 641946.0, + "step": 4880 + }, + { + "epoch": 9.208941176470589, + "grad_norm": 1.3242895603179932, + "learning_rate": 1.5224091543643504e-05, + "loss": 1.2781, + "mean_token_accuracy": 0.5264579340815544, + "num_tokens": 655771.0, + "step": 4890 + }, + { + "epoch": 9.227764705882352, + "grad_norm": 1.3015270233154297, + "learning_rate": 1.52061974236573e-05, + "loss": 1.1972, + "mean_token_accuracy": 0.5521455116569995, + "num_tokens": 669074.0, + "step": 4900 + }, + { + "epoch": 9.246588235294118, + "grad_norm": 1.4676063060760498, + "learning_rate": 1.5188280405721643e-05, + "loss": 1.2169, + "mean_token_accuracy": 0.5410921085625887, + "num_tokens": 682391.0, + "step": 4910 + }, + { + "epoch": 9.265411764705883, + "grad_norm": 1.505129098892212, + "learning_rate": 1.5170340568639335e-05, + "loss": 1.2445, + "mean_token_accuracy": 0.5468276925384998, + "num_tokens": 695279.0, + "step": 4920 + }, + { + "epoch": 9.284235294117646, + "grad_norm": 1.4586368799209595, + "learning_rate": 1.5152377991313547e-05, + "loss": 1.2183, + "mean_token_accuracy": 0.5493371106684208, + "num_tokens": 709036.0, + "step": 4930 + }, + { + "epoch": 9.303058823529412, + "grad_norm": 1.3103828430175781, + "learning_rate": 1.5134392752747469e-05, + "loss": 1.2207, + "mean_token_accuracy": 0.5371036138385534, + "num_tokens": 721600.0, + "step": 4940 + }, + { + "epoch": 9.321882352941177, + "grad_norm": 1.406219720840454, + "learning_rate": 1.5116384932043953e-05, + "loss": 1.2197, + "mean_token_accuracy": 0.5394637104123831, + "num_tokens": 734972.0, + "step": 4950 + }, + { + "epoch": 9.34070588235294, + "grad_norm": 1.3175715208053589, + "learning_rate": 1.5098354608405177e-05, + "loss": 1.3009, + "mean_token_accuracy": 0.5217017080634833, + "num_tokens": 749524.0, + "step": 4960 + }, + { + "epoch": 9.359529411764706, + "grad_norm": 1.1799266338348389, + "learning_rate": 1.5080301861132291e-05, + "loss": 1.233, + "mean_token_accuracy": 0.5553332667797803, + "num_tokens": 763976.0, + "step": 4970 + }, + { + "epoch": 9.378352941176471, + "grad_norm": 1.2330571413040161, + "learning_rate": 1.5062226769625068e-05, + "loss": 1.2127, + "mean_token_accuracy": 0.5426539558917284, + "num_tokens": 777548.0, + "step": 4980 + }, + { + "epoch": 9.397176470588235, + "grad_norm": 1.3530794382095337, + "learning_rate": 1.5044129413381551e-05, + "loss": 1.2137, + "mean_token_accuracy": 0.5432845208793878, + "num_tokens": 791104.0, + "step": 4990 + }, + { + "epoch": 9.416, + "grad_norm": 1.174985647201538, + "learning_rate": 1.5026009871997725e-05, + "loss": 1.1936, + "mean_token_accuracy": 0.5486832950264215, + "num_tokens": 804784.0, + "step": 5000 + }, + { + "epoch": 9.434823529411764, + "grad_norm": 0.9708495140075684, + "learning_rate": 1.5007868225167124e-05, + "loss": 1.2447, + "mean_token_accuracy": 0.5287159774452448, + "num_tokens": 817605.0, + "step": 5010 + }, + { + "epoch": 9.45364705882353, + "grad_norm": 1.4748586416244507, + "learning_rate": 1.4989704552680527e-05, + "loss": 1.1782, + "mean_token_accuracy": 0.5548595078289509, + "num_tokens": 830334.0, + "step": 5020 + }, + { + "epoch": 9.472470588235295, + "grad_norm": 1.4649749994277954, + "learning_rate": 1.497151893442558e-05, + "loss": 1.1558, + "mean_token_accuracy": 0.5786185275763274, + "num_tokens": 843520.0, + "step": 5030 + }, + { + "epoch": 9.491294117647058, + "grad_norm": 1.3614012002944946, + "learning_rate": 1.4953311450386447e-05, + "loss": 1.2294, + "mean_token_accuracy": 0.5436280608177185, + "num_tokens": 856605.0, + "step": 5040 + }, + { + "epoch": 9.510117647058824, + "grad_norm": 0.8162552714347839, + "learning_rate": 1.493508218064347e-05, + "loss": 1.1795, + "mean_token_accuracy": 0.5606917165219784, + "num_tokens": 869281.0, + "step": 5050 + }, + { + "epoch": 9.528941176470589, + "grad_norm": 1.1542294025421143, + "learning_rate": 1.4916831205372803e-05, + "loss": 1.283, + "mean_token_accuracy": 0.539304967597127, + "num_tokens": 883498.0, + "step": 5060 + }, + { + "epoch": 9.547764705882352, + "grad_norm": 1.3006714582443237, + "learning_rate": 1.4898558604846067e-05, + "loss": 1.2342, + "mean_token_accuracy": 0.5408715981990099, + "num_tokens": 897313.0, + "step": 5070 + }, + { + "epoch": 9.566588235294118, + "grad_norm": 0.9996142983436584, + "learning_rate": 1.488026445943e-05, + "loss": 1.2156, + "mean_token_accuracy": 0.5489041075110436, + "num_tokens": 910640.0, + "step": 5080 + }, + { + "epoch": 9.585411764705881, + "grad_norm": 2.1211931705474854, + "learning_rate": 1.486194884958609e-05, + "loss": 1.1633, + "mean_token_accuracy": 0.5579564660787583, + "num_tokens": 923363.0, + "step": 5090 + }, + { + "epoch": 9.604235294117647, + "grad_norm": 1.2634146213531494, + "learning_rate": 1.4843611855870235e-05, + "loss": 1.2593, + "mean_token_accuracy": 0.5273831244558096, + "num_tokens": 936250.0, + "step": 5100 + }, + { + "epoch": 9.623058823529412, + "grad_norm": 1.7456119060516357, + "learning_rate": 1.4825253558932386e-05, + "loss": 1.2228, + "mean_token_accuracy": 0.5505132492631674, + "num_tokens": 949552.0, + "step": 5110 + }, + { + "epoch": 9.641882352941176, + "grad_norm": 1.605895757675171, + "learning_rate": 1.480687403951619e-05, + "loss": 1.1788, + "mean_token_accuracy": 0.5624800592660903, + "num_tokens": 963342.0, + "step": 5120 + }, + { + "epoch": 9.660705882352941, + "grad_norm": 1.3311768770217896, + "learning_rate": 1.4788473378458626e-05, + "loss": 1.2062, + "mean_token_accuracy": 0.5582063946872949, + "num_tokens": 976717.0, + "step": 5130 + }, + { + "epoch": 9.679529411764706, + "grad_norm": 1.4497061967849731, + "learning_rate": 1.4770051656689672e-05, + "loss": 1.228, + "mean_token_accuracy": 0.5460193831473589, + "num_tokens": 989772.0, + "step": 5140 + }, + { + "epoch": 9.69835294117647, + "grad_norm": 1.1696816682815552, + "learning_rate": 1.4751608955231924e-05, + "loss": 1.1884, + "mean_token_accuracy": 0.5445575587451458, + "num_tokens": 1003123.0, + "step": 5150 + }, + { + "epoch": 9.717176470588235, + "grad_norm": 0.9232364892959595, + "learning_rate": 1.4733145355200255e-05, + "loss": 1.152, + "mean_token_accuracy": 0.5746063582599163, + "num_tokens": 1016187.0, + "step": 5160 + }, + { + "epoch": 9.736, + "grad_norm": 1.6106712818145752, + "learning_rate": 1.4714660937801461e-05, + "loss": 1.1799, + "mean_token_accuracy": 0.5663762982934714, + "num_tokens": 1029873.0, + "step": 5170 + }, + { + "epoch": 9.754823529411764, + "grad_norm": 1.334657073020935, + "learning_rate": 1.4696155784333885e-05, + "loss": 1.1942, + "mean_token_accuracy": 0.5546817529946566, + "num_tokens": 1043425.0, + "step": 5180 + }, + { + "epoch": 9.77364705882353, + "grad_norm": 0.8071675896644592, + "learning_rate": 1.467762997618708e-05, + "loss": 1.2465, + "mean_token_accuracy": 0.535656175762415, + "num_tokens": 1057319.0, + "step": 5190 + }, + { + "epoch": 9.792470588235295, + "grad_norm": 1.1653850078582764, + "learning_rate": 1.465908359484144e-05, + "loss": 1.2336, + "mean_token_accuracy": 0.5504809945821763, + "num_tokens": 1070725.0, + "step": 5200 + }, + { + "epoch": 9.811294117647058, + "grad_norm": 1.1270978450775146, + "learning_rate": 1.4640516721867843e-05, + "loss": 1.1989, + "mean_token_accuracy": 0.5558116808533669, + "num_tokens": 13834.0, + "step": 5210 + }, + { + "epoch": 9.830117647058824, + "grad_norm": 1.6317771673202515, + "learning_rate": 1.4621929438927299e-05, + "loss": 1.2151, + "mean_token_accuracy": 0.5420542072504759, + "num_tokens": 27298.0, + "step": 5220 + }, + { + "epoch": 9.848941176470587, + "grad_norm": 1.1831214427947998, + "learning_rate": 1.4603321827770578e-05, + "loss": 1.2075, + "mean_token_accuracy": 0.5548371035605669, + "num_tokens": 40750.0, + "step": 5230 + }, + { + "epoch": 9.867764705882353, + "grad_norm": 1.4046541452407837, + "learning_rate": 1.458469397023786e-05, + "loss": 1.2385, + "mean_token_accuracy": 0.5390195321291685, + "num_tokens": 54449.0, + "step": 5240 + }, + { + "epoch": 9.886588235294118, + "grad_norm": 1.1588149070739746, + "learning_rate": 1.4566045948258376e-05, + "loss": 1.2143, + "mean_token_accuracy": 0.5551448825746774, + "num_tokens": 67816.0, + "step": 5250 + }, + { + "epoch": 9.905411764705882, + "grad_norm": 1.3225456476211548, + "learning_rate": 1.4547377843850044e-05, + "loss": 1.2199, + "mean_token_accuracy": 0.5484016731381416, + "num_tokens": 81192.0, + "step": 5260 + }, + { + "epoch": 9.924235294117647, + "grad_norm": 0.9215822219848633, + "learning_rate": 1.45286897391191e-05, + "loss": 1.2136, + "mean_token_accuracy": 0.5466381188482046, + "num_tokens": 94106.0, + "step": 5270 + }, + { + "epoch": 9.943058823529412, + "grad_norm": 1.9844329357147217, + "learning_rate": 1.4509981716259762e-05, + "loss": 1.2251, + "mean_token_accuracy": 0.5436500191688538, + "num_tokens": 107211.0, + "step": 5280 + }, + { + "epoch": 9.961882352941176, + "grad_norm": 0.7866172194480896, + "learning_rate": 1.4491253857553838e-05, + "loss": 1.1928, + "mean_token_accuracy": 0.5536798264831304, + "num_tokens": 120603.0, + "step": 5290 + }, + { + "epoch": 9.980705882352941, + "grad_norm": 1.3284730911254883, + "learning_rate": 1.4472506245370382e-05, + "loss": 1.2201, + "mean_token_accuracy": 0.551696864143014, + "num_tokens": 135253.0, + "step": 5300 + }, + { + "epoch": 9.999529411764707, + "grad_norm": 0.8189272880554199, + "learning_rate": 1.445373896216533e-05, + "loss": 1.2535, + "mean_token_accuracy": 0.5395314753055572, + "num_tokens": 148070.0, + "step": 5310 + }, + { + "epoch": 10.018823529411765, + "grad_norm": 0.9590490460395813, + "learning_rate": 1.4434952090481135e-05, + "loss": 1.3331, + "mean_token_accuracy": 0.5544926153450478, + "num_tokens": 162263.0, + "step": 5320 + }, + { + "epoch": 10.037647058823529, + "grad_norm": 1.4627238512039185, + "learning_rate": 1.4416145712946406e-05, + "loss": 1.2488, + "mean_token_accuracy": 0.5324025351554156, + "num_tokens": 175371.0, + "step": 5330 + }, + { + "epoch": 10.056470588235294, + "grad_norm": 0.6929643154144287, + "learning_rate": 1.4397319912275535e-05, + "loss": 1.2071, + "mean_token_accuracy": 0.5509116105735302, + "num_tokens": 188794.0, + "step": 5340 + }, + { + "epoch": 10.07529411764706, + "grad_norm": 1.5115923881530762, + "learning_rate": 1.437847477126835e-05, + "loss": 1.1505, + "mean_token_accuracy": 0.5615889120846986, + "num_tokens": 201733.0, + "step": 5350 + }, + { + "epoch": 10.094117647058823, + "grad_norm": 1.651714563369751, + "learning_rate": 1.4359610372809739e-05, + "loss": 1.2233, + "mean_token_accuracy": 0.5453080747276544, + "num_tokens": 214934.0, + "step": 5360 + }, + { + "epoch": 10.112941176470589, + "grad_norm": 1.2535176277160645, + "learning_rate": 1.4340726799869283e-05, + "loss": 1.1925, + "mean_token_accuracy": 0.5584179207682609, + "num_tokens": 227831.0, + "step": 5370 + }, + { + "epoch": 10.131764705882352, + "grad_norm": 1.8965996503829956, + "learning_rate": 1.4321824135500904e-05, + "loss": 1.2347, + "mean_token_accuracy": 0.5445710398256779, + "num_tokens": 242053.0, + "step": 5380 + }, + { + "epoch": 10.150588235294117, + "grad_norm": 1.9367871284484863, + "learning_rate": 1.430290246284249e-05, + "loss": 1.2115, + "mean_token_accuracy": 0.5574517220258712, + "num_tokens": 256086.0, + "step": 5390 + }, + { + "epoch": 10.169411764705883, + "grad_norm": 0.6884622573852539, + "learning_rate": 1.4283961865115528e-05, + "loss": 1.2457, + "mean_token_accuracy": 0.5295402128249407, + "num_tokens": 269977.0, + "step": 5400 + }, + { + "epoch": 10.188235294117646, + "grad_norm": 0.7671216726303101, + "learning_rate": 1.426500242562474e-05, + "loss": 1.1288, + "mean_token_accuracy": 0.5702939372509718, + "num_tokens": 283412.0, + "step": 5410 + }, + { + "epoch": 10.207058823529412, + "grad_norm": 1.1199065446853638, + "learning_rate": 1.4246024227757735e-05, + "loss": 1.2184, + "mean_token_accuracy": 0.5337141178548336, + "num_tokens": 296574.0, + "step": 5420 + }, + { + "epoch": 10.225882352941177, + "grad_norm": 0.8241312503814697, + "learning_rate": 1.4227027354984602e-05, + "loss": 1.1945, + "mean_token_accuracy": 0.5481650296598672, + "num_tokens": 310305.0, + "step": 5430 + }, + { + "epoch": 10.24470588235294, + "grad_norm": 1.6059694290161133, + "learning_rate": 1.4208011890857577e-05, + "loss": 1.1322, + "mean_token_accuracy": 0.5755776699632407, + "num_tokens": 323670.0, + "step": 5440 + }, + { + "epoch": 10.263529411764706, + "grad_norm": 1.0941455364227295, + "learning_rate": 1.4188977919010664e-05, + "loss": 1.1634, + "mean_token_accuracy": 0.5623828198760747, + "num_tokens": 336409.0, + "step": 5450 + }, + { + "epoch": 10.282352941176471, + "grad_norm": 0.760979175567627, + "learning_rate": 1.4169925523159274e-05, + "loss": 1.2111, + "mean_token_accuracy": 0.5577297646552324, + "num_tokens": 349680.0, + "step": 5460 + }, + { + "epoch": 10.301176470588235, + "grad_norm": 1.41929292678833, + "learning_rate": 1.4150854787099836e-05, + "loss": 1.1846, + "mean_token_accuracy": 0.5624632347375155, + "num_tokens": 363183.0, + "step": 5470 + }, + { + "epoch": 10.32, + "grad_norm": 0.7982503771781921, + "learning_rate": 1.413176579470946e-05, + "loss": 1.2039, + "mean_token_accuracy": 0.5504359491169453, + "num_tokens": 376390.0, + "step": 5480 + }, + { + "epoch": 10.338823529411764, + "grad_norm": 1.3889517784118652, + "learning_rate": 1.4112658629945535e-05, + "loss": 1.1928, + "mean_token_accuracy": 0.5593543030321598, + "num_tokens": 389745.0, + "step": 5490 + }, + { + "epoch": 10.35764705882353, + "grad_norm": 1.3614208698272705, + "learning_rate": 1.409353337684539e-05, + "loss": 1.2334, + "mean_token_accuracy": 0.5366870552301407, + "num_tokens": 404220.0, + "step": 5500 + }, + { + "epoch": 10.376470588235295, + "grad_norm": 0.9981026649475098, + "learning_rate": 1.4074390119525898e-05, + "loss": 1.1739, + "mean_token_accuracy": 0.5642281893640757, + "num_tokens": 417700.0, + "step": 5510 + }, + { + "epoch": 10.395294117647058, + "grad_norm": 1.0381234884262085, + "learning_rate": 1.4055228942183128e-05, + "loss": 1.1977, + "mean_token_accuracy": 0.5563704077154398, + "num_tokens": 430901.0, + "step": 5520 + }, + { + "epoch": 10.414117647058823, + "grad_norm": 0.8158124089241028, + "learning_rate": 1.4036049929091964e-05, + "loss": 1.1914, + "mean_token_accuracy": 0.5571797143667936, + "num_tokens": 445094.0, + "step": 5530 + }, + { + "epoch": 10.432941176470589, + "grad_norm": 0.7652572393417358, + "learning_rate": 1.4016853164605728e-05, + "loss": 1.2376, + "mean_token_accuracy": 0.5498634003102779, + "num_tokens": 459543.0, + "step": 5540 + }, + { + "epoch": 10.451764705882352, + "grad_norm": 0.7951592206954956, + "learning_rate": 1.3997638733155822e-05, + "loss": 1.1997, + "mean_token_accuracy": 0.5588535733520985, + "num_tokens": 473275.0, + "step": 5550 + }, + { + "epoch": 10.470588235294118, + "grad_norm": 1.2788842916488647, + "learning_rate": 1.3978406719251352e-05, + "loss": 1.204, + "mean_token_accuracy": 0.5432504419237375, + "num_tokens": 485574.0, + "step": 5560 + }, + { + "epoch": 10.489411764705883, + "grad_norm": 1.9643447399139404, + "learning_rate": 1.3959157207478753e-05, + "loss": 1.1918, + "mean_token_accuracy": 0.5582812011241913, + "num_tokens": 498349.0, + "step": 5570 + }, + { + "epoch": 10.508235294117647, + "grad_norm": 1.2677149772644043, + "learning_rate": 1.3939890282501418e-05, + "loss": 1.2043, + "mean_token_accuracy": 0.5601174239069223, + "num_tokens": 511915.0, + "step": 5580 + }, + { + "epoch": 10.527058823529412, + "grad_norm": 1.0180656909942627, + "learning_rate": 1.3920606029059332e-05, + "loss": 1.2173, + "mean_token_accuracy": 0.5526633080095052, + "num_tokens": 524995.0, + "step": 5590 + }, + { + "epoch": 10.545882352941177, + "grad_norm": 1.1644375324249268, + "learning_rate": 1.3901304531968684e-05, + "loss": 1.1837, + "mean_token_accuracy": 0.5520532440394164, + "num_tokens": 537557.0, + "step": 5600 + }, + { + "epoch": 10.564705882352941, + "grad_norm": 1.3104006052017212, + "learning_rate": 1.388198587612152e-05, + "loss": 1.2209, + "mean_token_accuracy": 0.5339883405715227, + "num_tokens": 551827.0, + "step": 5610 + }, + { + "epoch": 10.583529411764706, + "grad_norm": 1.103053331375122, + "learning_rate": 1.386265014648534e-05, + "loss": 1.154, + "mean_token_accuracy": 0.5668028537184, + "num_tokens": 565218.0, + "step": 5620 + }, + { + "epoch": 10.60235294117647, + "grad_norm": 0.8747602105140686, + "learning_rate": 1.3843297428102742e-05, + "loss": 1.2476, + "mean_token_accuracy": 0.5371836949139833, + "num_tokens": 578938.0, + "step": 5630 + }, + { + "epoch": 10.621176470588235, + "grad_norm": 0.8349719047546387, + "learning_rate": 1.382392780609105e-05, + "loss": 1.1669, + "mean_token_accuracy": 0.5530536573380231, + "num_tokens": 592617.0, + "step": 5640 + }, + { + "epoch": 10.64, + "grad_norm": 1.4140478372573853, + "learning_rate": 1.3804541365641923e-05, + "loss": 1.2016, + "mean_token_accuracy": 0.5596294030547142, + "num_tokens": 606658.0, + "step": 5650 + }, + { + "epoch": 10.658823529411764, + "grad_norm": 1.2245830297470093, + "learning_rate": 1.3785138192021002e-05, + "loss": 1.1768, + "mean_token_accuracy": 0.5598421145230532, + "num_tokens": 619930.0, + "step": 5660 + }, + { + "epoch": 10.67764705882353, + "grad_norm": 1.3025885820388794, + "learning_rate": 1.3765718370567514e-05, + "loss": 1.1994, + "mean_token_accuracy": 0.5509582087397575, + "num_tokens": 633099.0, + "step": 5670 + }, + { + "epoch": 10.696470588235295, + "grad_norm": 0.9705594778060913, + "learning_rate": 1.3746281986693917e-05, + "loss": 1.1976, + "mean_token_accuracy": 0.5644174017012119, + "num_tokens": 647248.0, + "step": 5680 + }, + { + "epoch": 10.715294117647058, + "grad_norm": 1.8273649215698242, + "learning_rate": 1.3726829125885501e-05, + "loss": 1.1895, + "mean_token_accuracy": 0.5520309090614319, + "num_tokens": 660733.0, + "step": 5690 + }, + { + "epoch": 10.734117647058824, + "grad_norm": 0.9448793530464172, + "learning_rate": 1.370735987370004e-05, + "loss": 1.1612, + "mean_token_accuracy": 0.5580623522400856, + "num_tokens": 674331.0, + "step": 5700 + }, + { + "epoch": 10.75294117647059, + "grad_norm": 0.7382903099060059, + "learning_rate": 1.3687874315767388e-05, + "loss": 1.1906, + "mean_token_accuracy": 0.5549033779650927, + "num_tokens": 687329.0, + "step": 5710 + }, + { + "epoch": 10.771764705882353, + "grad_norm": 0.9180198907852173, + "learning_rate": 1.3668372537789122e-05, + "loss": 1.1708, + "mean_token_accuracy": 0.554550190269947, + "num_tokens": 701188.0, + "step": 5720 + }, + { + "epoch": 10.790588235294118, + "grad_norm": 1.3416258096694946, + "learning_rate": 1.3648854625538161e-05, + "loss": 1.2009, + "mean_token_accuracy": 0.5456226222217083, + "num_tokens": 715055.0, + "step": 5730 + }, + { + "epoch": 10.809411764705882, + "grad_norm": 0.9519694447517395, + "learning_rate": 1.3629320664858373e-05, + "loss": 1.188, + "mean_token_accuracy": 0.5580568216741085, + "num_tokens": 728299.0, + "step": 5740 + }, + { + "epoch": 10.828235294117647, + "grad_norm": 0.9768867492675781, + "learning_rate": 1.3609770741664225e-05, + "loss": 1.1748, + "mean_token_accuracy": 0.5568192675709724, + "num_tokens": 740400.0, + "step": 5750 + }, + { + "epoch": 10.847058823529412, + "grad_norm": 1.2277079820632935, + "learning_rate": 1.3590204941940384e-05, + "loss": 1.1883, + "mean_token_accuracy": 0.5541429404169321, + "num_tokens": 753926.0, + "step": 5760 + }, + { + "epoch": 10.865882352941176, + "grad_norm": 0.912382960319519, + "learning_rate": 1.3570623351741343e-05, + "loss": 1.2201, + "mean_token_accuracy": 0.5405797265470028, + "num_tokens": 767363.0, + "step": 5770 + }, + { + "epoch": 10.884705882352941, + "grad_norm": 1.2783665657043457, + "learning_rate": 1.3551026057191045e-05, + "loss": 1.2285, + "mean_token_accuracy": 0.5442549273371696, + "num_tokens": 780491.0, + "step": 5780 + }, + { + "epoch": 10.903529411764707, + "grad_norm": 0.789916455745697, + "learning_rate": 1.3531413144482512e-05, + "loss": 1.23, + "mean_token_accuracy": 0.5449609015136957, + "num_tokens": 793793.0, + "step": 5790 + }, + { + "epoch": 10.92235294117647, + "grad_norm": 1.2650339603424072, + "learning_rate": 1.351178469987745e-05, + "loss": 1.2049, + "mean_token_accuracy": 0.543266024813056, + "num_tokens": 807792.0, + "step": 5800 + }, + { + "epoch": 10.941176470588236, + "grad_norm": 0.9021736979484558, + "learning_rate": 1.3492140809705881e-05, + "loss": 1.1796, + "mean_token_accuracy": 0.5581005875021219, + "num_tokens": 821010.0, + "step": 5810 + }, + { + "epoch": 10.96, + "grad_norm": 0.8924301266670227, + "learning_rate": 1.3472481560365758e-05, + "loss": 1.1796, + "mean_token_accuracy": 0.5598813854157925, + "num_tokens": 833101.0, + "step": 5820 + }, + { + "epoch": 10.978823529411764, + "grad_norm": 1.8735415935516357, + "learning_rate": 1.3452807038322585e-05, + "loss": 1.1992, + "mean_token_accuracy": 0.5387950103729964, + "num_tokens": 845747.0, + "step": 5830 + }, + { + "epoch": 10.99764705882353, + "grad_norm": 0.8486454486846924, + "learning_rate": 1.3433117330109045e-05, + "loss": 1.1955, + "mean_token_accuracy": 0.550658929720521, + "num_tokens": 859896.0, + "step": 5840 + }, + { + "epoch": 11.015058823529412, + "grad_norm": 0.8041768670082092, + "learning_rate": 1.3413412522324609e-05, + "loss": 1.1699, + "mean_token_accuracy": 0.5523232479353208, + "num_tokens": 871831.0, + "step": 5850 + }, + { + "epoch": 11.033882352941177, + "grad_norm": 1.445483922958374, + "learning_rate": 1.3393692701635154e-05, + "loss": 1.214, + "mean_token_accuracy": 0.5537016060203314, + "num_tokens": 885406.0, + "step": 5860 + }, + { + "epoch": 11.05270588235294, + "grad_norm": 0.697123110294342, + "learning_rate": 1.33739579547726e-05, + "loss": 1.1622, + "mean_token_accuracy": 0.5540354669094085, + "num_tokens": 898421.0, + "step": 5870 + }, + { + "epoch": 11.071529411764706, + "grad_norm": 1.465420126914978, + "learning_rate": 1.3354208368534503e-05, + "loss": 1.2069, + "mean_token_accuracy": 0.551685893163085, + "num_tokens": 912642.0, + "step": 5880 + }, + { + "epoch": 11.090352941176471, + "grad_norm": 0.859109103679657, + "learning_rate": 1.333444402978369e-05, + "loss": 1.1826, + "mean_token_accuracy": 0.5513388890773058, + "num_tokens": 926366.0, + "step": 5890 + }, + { + "epoch": 11.109176470588235, + "grad_norm": 1.5829471349716187, + "learning_rate": 1.3314665025447876e-05, + "loss": 1.2038, + "mean_token_accuracy": 0.5505582805722952, + "num_tokens": 939684.0, + "step": 5900 + }, + { + "epoch": 11.128, + "grad_norm": 0.8772886991500854, + "learning_rate": 1.3294871442519271e-05, + "loss": 1.1629, + "mean_token_accuracy": 0.5473615158349275, + "num_tokens": 953085.0, + "step": 5910 + }, + { + "epoch": 11.146823529411765, + "grad_norm": 0.9280370473861694, + "learning_rate": 1.3275063368054208e-05, + "loss": 1.1943, + "mean_token_accuracy": 0.5425072379410267, + "num_tokens": 966596.0, + "step": 5920 + }, + { + "epoch": 11.165647058823529, + "grad_norm": 1.6158865690231323, + "learning_rate": 1.3255240889172764e-05, + "loss": 1.1669, + "mean_token_accuracy": 0.5687259271740913, + "num_tokens": 981302.0, + "step": 5930 + }, + { + "epoch": 11.184470588235294, + "grad_norm": 0.9575416445732117, + "learning_rate": 1.323540409305836e-05, + "loss": 1.1828, + "mean_token_accuracy": 0.5527924958616495, + "num_tokens": 993437.0, + "step": 5940 + }, + { + "epoch": 11.203294117647058, + "grad_norm": 0.8492655158042908, + "learning_rate": 1.3215553066957391e-05, + "loss": 1.1352, + "mean_token_accuracy": 0.5696950633078813, + "num_tokens": 1006394.0, + "step": 5950 + }, + { + "epoch": 11.222117647058823, + "grad_norm": 1.3289772272109985, + "learning_rate": 1.3195687898178837e-05, + "loss": 1.198, + "mean_token_accuracy": 0.5483724296092987, + "num_tokens": 1019972.0, + "step": 5960 + }, + { + "epoch": 11.240941176470589, + "grad_norm": 0.8151838779449463, + "learning_rate": 1.3175808674093882e-05, + "loss": 1.2229, + "mean_token_accuracy": 0.5452193580567837, + "num_tokens": 1033578.0, + "step": 5970 + }, + { + "epoch": 11.259764705882352, + "grad_norm": 1.242629051208496, + "learning_rate": 1.3155915482135528e-05, + "loss": 1.2035, + "mean_token_accuracy": 0.5487495046108961, + "num_tokens": 1046758.0, + "step": 5980 + }, + { + "epoch": 11.278588235294118, + "grad_norm": 1.238258719444275, + "learning_rate": 1.3136008409798214e-05, + "loss": 1.1993, + "mean_token_accuracy": 0.5489524565637112, + "num_tokens": 1061219.0, + "step": 5990 + }, + { + "epoch": 11.297411764705883, + "grad_norm": 1.2245213985443115, + "learning_rate": 1.3116087544637415e-05, + "loss": 1.1616, + "mean_token_accuracy": 0.5596596848219633, + "num_tokens": 1074213.0, + "step": 6000 + }, + { + "epoch": 11.316235294117647, + "grad_norm": 0.8657311797142029, + "learning_rate": 1.3096152974269289e-05, + "loss": 1.179, + "mean_token_accuracy": 0.5485074911266565, + "num_tokens": 1088151.0, + "step": 6010 + }, + { + "epoch": 11.335058823529412, + "grad_norm": 0.8435884714126587, + "learning_rate": 1.3076204786370256e-05, + "loss": 1.1862, + "mean_token_accuracy": 0.5667649589478969, + "num_tokens": 1101736.0, + "step": 6020 + }, + { + "epoch": 11.353882352941177, + "grad_norm": 4.664355278015137, + "learning_rate": 1.3056243068676637e-05, + "loss": 1.1899, + "mean_token_accuracy": 0.5602201897650957, + "num_tokens": 1115368.0, + "step": 6030 + }, + { + "epoch": 11.37270588235294, + "grad_norm": 0.7196553945541382, + "learning_rate": 1.3036267908984257e-05, + "loss": 1.2337, + "mean_token_accuracy": 0.5351801011711359, + "num_tokens": 1128875.0, + "step": 6040 + }, + { + "epoch": 11.391529411764706, + "grad_norm": 0.8050165772438049, + "learning_rate": 1.3016279395148067e-05, + "loss": 1.2082, + "mean_token_accuracy": 0.5454613540321589, + "num_tokens": 1141185.0, + "step": 6050 + }, + { + "epoch": 11.41035294117647, + "grad_norm": 1.6665176153182983, + "learning_rate": 1.2996277615081738e-05, + "loss": 1.1941, + "mean_token_accuracy": 0.5567255288362503, + "num_tokens": 1154738.0, + "step": 6060 + }, + { + "epoch": 11.429176470588235, + "grad_norm": 1.7246068716049194, + "learning_rate": 1.297626265675731e-05, + "loss": 1.19, + "mean_token_accuracy": 0.5438788242638111, + "num_tokens": 1168172.0, + "step": 6070 + }, + { + "epoch": 11.448, + "grad_norm": 0.8024677038192749, + "learning_rate": 1.2956234608204765e-05, + "loss": 1.2072, + "mean_token_accuracy": 0.5483981113880873, + "num_tokens": 1181363.0, + "step": 6080 + }, + { + "epoch": 11.466823529411764, + "grad_norm": 1.0496245622634888, + "learning_rate": 1.293619355751167e-05, + "loss": 1.1437, + "mean_token_accuracy": 0.5686488572508097, + "num_tokens": 1195397.0, + "step": 6090 + }, + { + "epoch": 11.48564705882353, + "grad_norm": 0.6598522067070007, + "learning_rate": 1.2916139592822776e-05, + "loss": 1.2051, + "mean_token_accuracy": 0.5419987261295318, + "num_tokens": 1208020.0, + "step": 6100 + }, + { + "epoch": 11.504470588235295, + "grad_norm": 1.8896702527999878, + "learning_rate": 1.2896072802339623e-05, + "loss": 1.1603, + "mean_token_accuracy": 0.571081367880106, + "num_tokens": 1222044.0, + "step": 6110 + }, + { + "epoch": 11.523294117647058, + "grad_norm": 0.8881447911262512, + "learning_rate": 1.2875993274320173e-05, + "loss": 1.1703, + "mean_token_accuracy": 0.5585772100836038, + "num_tokens": 1236218.0, + "step": 6120 + }, + { + "epoch": 11.542117647058824, + "grad_norm": 1.263449788093567, + "learning_rate": 1.2855901097078412e-05, + "loss": 1.1544, + "mean_token_accuracy": 0.5590313211083412, + "num_tokens": 1249412.0, + "step": 6130 + }, + { + "epoch": 11.560941176470589, + "grad_norm": 1.120104432106018, + "learning_rate": 1.2835796358983943e-05, + "loss": 1.2265, + "mean_token_accuracy": 0.5422938629984856, + "num_tokens": 1262521.0, + "step": 6140 + }, + { + "epoch": 11.579764705882353, + "grad_norm": 1.0734158754348755, + "learning_rate": 1.2815679148461636e-05, + "loss": 1.2199, + "mean_token_accuracy": 0.5645121570676566, + "num_tokens": 1276508.0, + "step": 6150 + }, + { + "epoch": 11.598588235294118, + "grad_norm": 0.7284833192825317, + "learning_rate": 1.2795549553991202e-05, + "loss": 1.1995, + "mean_token_accuracy": 0.55781021527946, + "num_tokens": 1289814.0, + "step": 6160 + }, + { + "epoch": 11.617411764705881, + "grad_norm": 0.9633259773254395, + "learning_rate": 1.2775407664106825e-05, + "loss": 1.1882, + "mean_token_accuracy": 0.555243044346571, + "num_tokens": 1303074.0, + "step": 6170 + }, + { + "epoch": 11.636235294117647, + "grad_norm": 0.6576571464538574, + "learning_rate": 1.2755253567396766e-05, + "loss": 1.208, + "mean_token_accuracy": 0.5453934874385595, + "num_tokens": 1316357.0, + "step": 6180 + }, + { + "epoch": 11.655058823529412, + "grad_norm": 2.550999879837036, + "learning_rate": 1.2735087352502977e-05, + "loss": 1.2179, + "mean_token_accuracy": 0.5549823541194201, + "num_tokens": 1329683.0, + "step": 6190 + }, + { + "epoch": 11.673882352941176, + "grad_norm": 1.061109185218811, + "learning_rate": 1.2714909108120698e-05, + "loss": 1.1632, + "mean_token_accuracy": 0.5776005409657955, + "num_tokens": 1342884.0, + "step": 6200 + }, + { + "epoch": 11.694588235294118, + "grad_norm": 2.3584885597229004, + "learning_rate": 1.2694718922998097e-05, + "loss": 1.1887, + "mean_token_accuracy": 0.5592446334660053, + "num_tokens": 13595.0, + "step": 6210 + }, + { + "epoch": 11.713411764705882, + "grad_norm": 0.8795002698898315, + "learning_rate": 1.2674516885935835e-05, + "loss": 1.1999, + "mean_token_accuracy": 0.5442189387977123, + "num_tokens": 26488.0, + "step": 6220 + }, + { + "epoch": 11.732235294117647, + "grad_norm": 0.8854806423187256, + "learning_rate": 1.2654303085786723e-05, + "loss": 1.1295, + "mean_token_accuracy": 0.5750894896686077, + "num_tokens": 40270.0, + "step": 6230 + }, + { + "epoch": 11.751058823529412, + "grad_norm": 0.8246656656265259, + "learning_rate": 1.2634077611455294e-05, + "loss": 1.1946, + "mean_token_accuracy": 0.5488846648484469, + "num_tokens": 54551.0, + "step": 6240 + }, + { + "epoch": 11.769882352941176, + "grad_norm": 1.1240957975387573, + "learning_rate": 1.2613840551897428e-05, + "loss": 1.2098, + "mean_token_accuracy": 0.5464079327881336, + "num_tokens": 68016.0, + "step": 6250 + }, + { + "epoch": 11.788705882352941, + "grad_norm": 0.672888994216919, + "learning_rate": 1.2593591996119965e-05, + "loss": 1.2383, + "mean_token_accuracy": 0.5364337969571352, + "num_tokens": 82740.0, + "step": 6260 + }, + { + "epoch": 11.807529411764706, + "grad_norm": 0.9078545570373535, + "learning_rate": 1.257333203318031e-05, + "loss": 1.1736, + "mean_token_accuracy": 0.5599259410053492, + "num_tokens": 95371.0, + "step": 6270 + }, + { + "epoch": 11.82635294117647, + "grad_norm": 1.5212457180023193, + "learning_rate": 1.2553060752186024e-05, + "loss": 1.1655, + "mean_token_accuracy": 0.5664675917476416, + "num_tokens": 108447.0, + "step": 6280 + }, + { + "epoch": 11.845176470588235, + "grad_norm": 0.6960899829864502, + "learning_rate": 1.2532778242294467e-05, + "loss": 1.184, + "mean_token_accuracy": 0.559162225574255, + "num_tokens": 121975.0, + "step": 6290 + }, + { + "epoch": 11.864, + "grad_norm": 1.1340759992599487, + "learning_rate": 1.2512484592712373e-05, + "loss": 1.1874, + "mean_token_accuracy": 0.5441106397658586, + "num_tokens": 135823.0, + "step": 6300 + }, + { + "epoch": 11.882823529411764, + "grad_norm": 0.7761991024017334, + "learning_rate": 1.2492179892695473e-05, + "loss": 1.1632, + "mean_token_accuracy": 0.5639519464224577, + "num_tokens": 147887.0, + "step": 6310 + }, + { + "epoch": 11.90164705882353, + "grad_norm": 1.0354701280593872, + "learning_rate": 1.24718642315481e-05, + "loss": 1.1432, + "mean_token_accuracy": 0.5634768087416887, + "num_tokens": 161408.0, + "step": 6320 + }, + { + "epoch": 11.920470588235293, + "grad_norm": 0.9502993822097778, + "learning_rate": 1.2451537698622799e-05, + "loss": 1.134, + "mean_token_accuracy": 0.5664606466889381, + "num_tokens": 173813.0, + "step": 6330 + }, + { + "epoch": 11.939294117647059, + "grad_norm": 0.7904968857765198, + "learning_rate": 1.2431200383319931e-05, + "loss": 1.1782, + "mean_token_accuracy": 0.5518010076135397, + "num_tokens": 186645.0, + "step": 6340 + }, + { + "epoch": 11.958117647058824, + "grad_norm": 0.6425819993019104, + "learning_rate": 1.2410852375087279e-05, + "loss": 1.1928, + "mean_token_accuracy": 0.5603426963090896, + "num_tokens": 201324.0, + "step": 6350 + }, + { + "epoch": 11.976941176470588, + "grad_norm": 1.029788851737976, + "learning_rate": 1.2390493763419654e-05, + "loss": 1.2296, + "mean_token_accuracy": 0.530813605338335, + "num_tokens": 213844.0, + "step": 6360 + }, + { + "epoch": 11.995764705882353, + "grad_norm": 1.0189554691314697, + "learning_rate": 1.2370124637858508e-05, + "loss": 1.125, + "mean_token_accuracy": 0.5650646161288023, + "num_tokens": 227343.0, + "step": 6370 + }, + { + "epoch": 12.015058823529412, + "grad_norm": 1.3754558563232422, + "learning_rate": 1.2349745087991529e-05, + "loss": 1.2719, + "mean_token_accuracy": 0.5659251939959642, + "num_tokens": 241245.0, + "step": 6380 + }, + { + "epoch": 12.033882352941177, + "grad_norm": 1.338383436203003, + "learning_rate": 1.2329355203452258e-05, + "loss": 1.2056, + "mean_token_accuracy": 0.5403179809451103, + "num_tokens": 254997.0, + "step": 6390 + }, + { + "epoch": 12.05270588235294, + "grad_norm": 0.7401157021522522, + "learning_rate": 1.2308955073919688e-05, + "loss": 1.2027, + "mean_token_accuracy": 0.5495895497500897, + "num_tokens": 268506.0, + "step": 6400 + }, + { + "epoch": 12.071529411764706, + "grad_norm": 1.382067084312439, + "learning_rate": 1.2288544789117876e-05, + "loss": 1.1167, + "mean_token_accuracy": 0.5725257787853479, + "num_tokens": 281845.0, + "step": 6410 + }, + { + "epoch": 12.090352941176471, + "grad_norm": 1.1322115659713745, + "learning_rate": 1.2268124438815531e-05, + "loss": 1.1649, + "mean_token_accuracy": 0.5655132979154587, + "num_tokens": 294981.0, + "step": 6420 + }, + { + "epoch": 12.109176470588235, + "grad_norm": 2.3306519985198975, + "learning_rate": 1.2247694112825654e-05, + "loss": 1.1841, + "mean_token_accuracy": 0.5553363788872957, + "num_tokens": 308385.0, + "step": 6430 + }, + { + "epoch": 12.128, + "grad_norm": 1.036372423171997, + "learning_rate": 1.2227253901005101e-05, + "loss": 1.1893, + "mean_token_accuracy": 0.550970122590661, + "num_tokens": 321685.0, + "step": 6440 + }, + { + "epoch": 12.146823529411765, + "grad_norm": 1.155049443244934, + "learning_rate": 1.2206803893254215e-05, + "loss": 1.1504, + "mean_token_accuracy": 0.5654265254735946, + "num_tokens": 334803.0, + "step": 6450 + }, + { + "epoch": 12.165647058823529, + "grad_norm": 0.8562523126602173, + "learning_rate": 1.2186344179516425e-05, + "loss": 1.1566, + "mean_token_accuracy": 0.5620875429362059, + "num_tokens": 347350.0, + "step": 6460 + }, + { + "epoch": 12.184470588235294, + "grad_norm": 1.5091642141342163, + "learning_rate": 1.2165874849777853e-05, + "loss": 1.1878, + "mean_token_accuracy": 0.5486861743032933, + "num_tokens": 361251.0, + "step": 6470 + }, + { + "epoch": 12.203294117647058, + "grad_norm": 1.0562283992767334, + "learning_rate": 1.21453959940669e-05, + "loss": 1.2455, + "mean_token_accuracy": 0.5419348709285259, + "num_tokens": 374607.0, + "step": 6480 + }, + { + "epoch": 12.222117647058823, + "grad_norm": 2.187586545944214, + "learning_rate": 1.2124907702453883e-05, + "loss": 1.1733, + "mean_token_accuracy": 0.5536637313663959, + "num_tokens": 388714.0, + "step": 6490 + }, + { + "epoch": 12.240941176470589, + "grad_norm": 1.4512325525283813, + "learning_rate": 1.2104410065050605e-05, + "loss": 1.2252, + "mean_token_accuracy": 0.5447334434837103, + "num_tokens": 402108.0, + "step": 6500 + }, + { + "epoch": 12.259764705882352, + "grad_norm": 1.3915634155273438, + "learning_rate": 1.208390317200998e-05, + "loss": 1.1606, + "mean_token_accuracy": 0.5588117640465498, + "num_tokens": 417457.0, + "step": 6510 + }, + { + "epoch": 12.278588235294118, + "grad_norm": 1.3058298826217651, + "learning_rate": 1.2063387113525635e-05, + "loss": 1.1624, + "mean_token_accuracy": 0.567823113501072, + "num_tokens": 429732.0, + "step": 6520 + }, + { + "epoch": 12.297411764705883, + "grad_norm": 1.4169563055038452, + "learning_rate": 1.2042861979831496e-05, + "loss": 1.1758, + "mean_token_accuracy": 0.5708753641694784, + "num_tokens": 444198.0, + "step": 6530 + }, + { + "epoch": 12.316235294117647, + "grad_norm": 0.916401207447052, + "learning_rate": 1.202232786120141e-05, + "loss": 1.1952, + "mean_token_accuracy": 0.5533534411340952, + "num_tokens": 457447.0, + "step": 6540 + }, + { + "epoch": 12.335058823529412, + "grad_norm": 1.6477797031402588, + "learning_rate": 1.200178484794875e-05, + "loss": 1.1731, + "mean_token_accuracy": 0.5523608162999153, + "num_tokens": 471188.0, + "step": 6550 + }, + { + "epoch": 12.353882352941177, + "grad_norm": 0.7916552424430847, + "learning_rate": 1.1981233030425996e-05, + "loss": 1.1525, + "mean_token_accuracy": 0.5627595514059067, + "num_tokens": 483973.0, + "step": 6560 + }, + { + "epoch": 12.37270588235294, + "grad_norm": 1.6025060415267944, + "learning_rate": 1.1960672499024359e-05, + "loss": 1.1149, + "mean_token_accuracy": 0.5760308355093002, + "num_tokens": 497279.0, + "step": 6570 + }, + { + "epoch": 12.391529411764706, + "grad_norm": 1.0256032943725586, + "learning_rate": 1.1940103344173375e-05, + "loss": 1.1646, + "mean_token_accuracy": 0.5561530087143183, + "num_tokens": 511615.0, + "step": 6580 + }, + { + "epoch": 12.41035294117647, + "grad_norm": 1.3151596784591675, + "learning_rate": 1.1919525656340503e-05, + "loss": 1.1609, + "mean_token_accuracy": 0.5567661169916391, + "num_tokens": 524047.0, + "step": 6590 + }, + { + "epoch": 12.429176470588235, + "grad_norm": 0.9940578937530518, + "learning_rate": 1.1898939526030732e-05, + "loss": 1.199, + "mean_token_accuracy": 0.5474131718277931, + "num_tokens": 537761.0, + "step": 6600 + }, + { + "epoch": 12.448, + "grad_norm": 0.7173454165458679, + "learning_rate": 1.1878345043786195e-05, + "loss": 1.1698, + "mean_token_accuracy": 0.5660860728472471, + "num_tokens": 551598.0, + "step": 6610 + }, + { + "epoch": 12.466823529411764, + "grad_norm": 1.1414166688919067, + "learning_rate": 1.1857742300185739e-05, + "loss": 1.2336, + "mean_token_accuracy": 0.5513837717473506, + "num_tokens": 566797.0, + "step": 6620 + }, + { + "epoch": 12.48564705882353, + "grad_norm": 0.6648653745651245, + "learning_rate": 1.1837131385844567e-05, + "loss": 1.1919, + "mean_token_accuracy": 0.5441902942955494, + "num_tokens": 580339.0, + "step": 6630 + }, + { + "epoch": 12.504470588235295, + "grad_norm": 1.2969242334365845, + "learning_rate": 1.1816512391413798e-05, + "loss": 1.1584, + "mean_token_accuracy": 0.5687514644116163, + "num_tokens": 592369.0, + "step": 6640 + }, + { + "epoch": 12.523294117647058, + "grad_norm": 0.6251775622367859, + "learning_rate": 1.179588540758011e-05, + "loss": 1.2068, + "mean_token_accuracy": 0.5384650267660618, + "num_tokens": 605961.0, + "step": 6650 + }, + { + "epoch": 12.542117647058824, + "grad_norm": 1.3755369186401367, + "learning_rate": 1.1775250525065297e-05, + "loss": 1.1859, + "mean_token_accuracy": 0.5518178451806307, + "num_tokens": 618337.0, + "step": 6660 + }, + { + "epoch": 12.560941176470589, + "grad_norm": 1.2308052778244019, + "learning_rate": 1.1754607834625915e-05, + "loss": 1.2075, + "mean_token_accuracy": 0.5420106790959835, + "num_tokens": 632237.0, + "step": 6670 + }, + { + "epoch": 12.579764705882353, + "grad_norm": 0.7645729184150696, + "learning_rate": 1.1733957427052842e-05, + "loss": 1.1931, + "mean_token_accuracy": 0.562155156955123, + "num_tokens": 645494.0, + "step": 6680 + }, + { + "epoch": 12.598588235294118, + "grad_norm": 0.6689856052398682, + "learning_rate": 1.1713299393170916e-05, + "loss": 1.1567, + "mean_token_accuracy": 0.5580319032073021, + "num_tokens": 658861.0, + "step": 6690 + }, + { + "epoch": 12.617411764705881, + "grad_norm": 1.2952977418899536, + "learning_rate": 1.1692633823838503e-05, + "loss": 1.1983, + "mean_token_accuracy": 0.5488288260996341, + "num_tokens": 671873.0, + "step": 6700 + }, + { + "epoch": 12.636235294117647, + "grad_norm": 0.988854169845581, + "learning_rate": 1.1671960809947116e-05, + "loss": 1.2001, + "mean_token_accuracy": 0.5581530544906854, + "num_tokens": 684288.0, + "step": 6710 + }, + { + "epoch": 12.655058823529412, + "grad_norm": 0.9140803813934326, + "learning_rate": 1.165128044242101e-05, + "loss": 1.1754, + "mean_token_accuracy": 0.5484160725027323, + "num_tokens": 696428.0, + "step": 6720 + }, + { + "epoch": 12.673882352941176, + "grad_norm": 1.194382905960083, + "learning_rate": 1.163059281221679e-05, + "loss": 1.16, + "mean_token_accuracy": 0.5650255784392357, + "num_tokens": 709841.0, + "step": 6730 + }, + { + "epoch": 12.692705882352941, + "grad_norm": 0.7279021143913269, + "learning_rate": 1.1609898010322989e-05, + "loss": 1.1799, + "mean_token_accuracy": 0.5441335134208203, + "num_tokens": 724299.0, + "step": 6740 + }, + { + "epoch": 12.711529411764706, + "grad_norm": 0.7829269766807556, + "learning_rate": 1.1589196127759697e-05, + "loss": 1.1982, + "mean_token_accuracy": 0.5436731087043881, + "num_tokens": 737467.0, + "step": 6750 + }, + { + "epoch": 12.73035294117647, + "grad_norm": 0.908854603767395, + "learning_rate": 1.1568487255578135e-05, + "loss": 1.1589, + "mean_token_accuracy": 0.5564702823758125, + "num_tokens": 751035.0, + "step": 6760 + }, + { + "epoch": 12.749176470588235, + "grad_norm": 0.8606781363487244, + "learning_rate": 1.1547771484860282e-05, + "loss": 1.1811, + "mean_token_accuracy": 0.5530305828899145, + "num_tokens": 764012.0, + "step": 6770 + }, + { + "epoch": 12.768, + "grad_norm": 0.8715227246284485, + "learning_rate": 1.1527048906718434e-05, + "loss": 1.1731, + "mean_token_accuracy": 0.5534448944032192, + "num_tokens": 777823.0, + "step": 6780 + }, + { + "epoch": 12.786823529411764, + "grad_norm": 1.383436918258667, + "learning_rate": 1.1506319612294855e-05, + "loss": 1.2038, + "mean_token_accuracy": 0.5430160872638226, + "num_tokens": 791112.0, + "step": 6790 + }, + { + "epoch": 12.80564705882353, + "grad_norm": 0.6807175278663635, + "learning_rate": 1.148558369276132e-05, + "loss": 1.1325, + "mean_token_accuracy": 0.5736443504691124, + "num_tokens": 804227.0, + "step": 6800 + }, + { + "epoch": 12.824470588235293, + "grad_norm": 1.107948660850525, + "learning_rate": 1.1464841239318764e-05, + "loss": 1.1518, + "mean_token_accuracy": 0.5673416070640087, + "num_tokens": 817620.0, + "step": 6810 + }, + { + "epoch": 12.843294117647059, + "grad_norm": 0.7133264541625977, + "learning_rate": 1.1444092343196855e-05, + "loss": 1.1768, + "mean_token_accuracy": 0.5533497478812933, + "num_tokens": 831699.0, + "step": 6820 + }, + { + "epoch": 12.862117647058824, + "grad_norm": 0.7470325231552124, + "learning_rate": 1.1423337095653595e-05, + "loss": 1.1794, + "mean_token_accuracy": 0.556913785263896, + "num_tokens": 845041.0, + "step": 6830 + }, + { + "epoch": 12.880941176470587, + "grad_norm": 0.7599585056304932, + "learning_rate": 1.1402575587974915e-05, + "loss": 1.1831, + "mean_token_accuracy": 0.5495749611407519, + "num_tokens": 858034.0, + "step": 6840 + }, + { + "epoch": 12.899764705882353, + "grad_norm": 0.9152631163597107, + "learning_rate": 1.1381807911474291e-05, + "loss": 1.1693, + "mean_token_accuracy": 0.5672723963856697, + "num_tokens": 871960.0, + "step": 6850 + }, + { + "epoch": 12.918588235294118, + "grad_norm": 1.0719937086105347, + "learning_rate": 1.1361034157492324e-05, + "loss": 1.2041, + "mean_token_accuracy": 0.5518028371036052, + "num_tokens": 884661.0, + "step": 6860 + }, + { + "epoch": 12.937411764705882, + "grad_norm": 1.084991455078125, + "learning_rate": 1.1340254417396343e-05, + "loss": 1.2011, + "mean_token_accuracy": 0.5481019847095012, + "num_tokens": 897816.0, + "step": 6870 + }, + { + "epoch": 12.956235294117647, + "grad_norm": 1.3787931203842163, + "learning_rate": 1.131946878258001e-05, + "loss": 1.1315, + "mean_token_accuracy": 0.5697043187916279, + "num_tokens": 910552.0, + "step": 6880 + }, + { + "epoch": 12.975058823529412, + "grad_norm": 1.2762988805770874, + "learning_rate": 1.1298677344462914e-05, + "loss": 1.1643, + "mean_token_accuracy": 0.5592548452317715, + "num_tokens": 924705.0, + "step": 6890 + }, + { + "epoch": 12.993882352941176, + "grad_norm": 0.8996446132659912, + "learning_rate": 1.127788019449016e-05, + "loss": 1.2202, + "mean_token_accuracy": 0.5417749028652906, + "num_tokens": 938010.0, + "step": 6900 + }, + { + "epoch": 13.01129411764706, + "grad_norm": 1.195081114768982, + "learning_rate": 1.1257077424131985e-05, + "loss": 1.1559, + "mean_token_accuracy": 0.5514025462640298, + "num_tokens": 951050.0, + "step": 6910 + }, + { + "epoch": 13.030117647058823, + "grad_norm": 1.7555843591690063, + "learning_rate": 1.1236269124883339e-05, + "loss": 1.1524, + "mean_token_accuracy": 0.5562022086232901, + "num_tokens": 964411.0, + "step": 6920 + }, + { + "epoch": 13.048941176470588, + "grad_norm": 1.3604152202606201, + "learning_rate": 1.1215455388263496e-05, + "loss": 1.1602, + "mean_token_accuracy": 0.5531352117657662, + "num_tokens": 976752.0, + "step": 6930 + }, + { + "epoch": 13.067764705882352, + "grad_norm": 1.0296913385391235, + "learning_rate": 1.1194636305815635e-05, + "loss": 1.1608, + "mean_token_accuracy": 0.5703556634485721, + "num_tokens": 989410.0, + "step": 6940 + }, + { + "epoch": 13.086588235294117, + "grad_norm": 1.0703682899475098, + "learning_rate": 1.1173811969106451e-05, + "loss": 1.1665, + "mean_token_accuracy": 0.5600442342460156, + "num_tokens": 1003038.0, + "step": 6950 + }, + { + "epoch": 13.105411764705883, + "grad_norm": 0.9015535712242126, + "learning_rate": 1.1152982469725755e-05, + "loss": 1.1816, + "mean_token_accuracy": 0.5589367963373661, + "num_tokens": 1017162.0, + "step": 6960 + }, + { + "epoch": 13.124235294117646, + "grad_norm": 0.7695736885070801, + "learning_rate": 1.1132147899286054e-05, + "loss": 1.2044, + "mean_token_accuracy": 0.5554168112576008, + "num_tokens": 1030524.0, + "step": 6970 + }, + { + "epoch": 13.143058823529412, + "grad_norm": 0.6659622192382812, + "learning_rate": 1.1111308349422165e-05, + "loss": 1.1464, + "mean_token_accuracy": 0.5591688379645348, + "num_tokens": 1043594.0, + "step": 6980 + }, + { + "epoch": 13.161882352941177, + "grad_norm": 0.7469462156295776, + "learning_rate": 1.1090463911790807e-05, + "loss": 1.1107, + "mean_token_accuracy": 0.5700595445930958, + "num_tokens": 1057281.0, + "step": 6990 + }, + { + "epoch": 13.18070588235294, + "grad_norm": 0.7391088008880615, + "learning_rate": 1.1069614678070193e-05, + "loss": 1.1526, + "mean_token_accuracy": 0.5636973019689322, + "num_tokens": 1071037.0, + "step": 7000 + }, + { + "epoch": 13.199529411764706, + "grad_norm": 1.2832854986190796, + "learning_rate": 1.1048760739959628e-05, + "loss": 1.1978, + "mean_token_accuracy": 0.5526088491082192, + "num_tokens": 1084747.0, + "step": 7010 + }, + { + "epoch": 13.218352941176471, + "grad_norm": 0.7462761402130127, + "learning_rate": 1.1027902189179107e-05, + "loss": 1.1735, + "mean_token_accuracy": 0.5575515639036894, + "num_tokens": 1098389.0, + "step": 7020 + }, + { + "epoch": 13.237176470588235, + "grad_norm": 0.6917766332626343, + "learning_rate": 1.1007039117468928e-05, + "loss": 1.1831, + "mean_token_accuracy": 0.5607794526964426, + "num_tokens": 1111600.0, + "step": 7030 + }, + { + "epoch": 13.256, + "grad_norm": 0.8664299845695496, + "learning_rate": 1.0986171616589247e-05, + "loss": 1.1687, + "mean_token_accuracy": 0.5606127306818962, + "num_tokens": 1125368.0, + "step": 7040 + }, + { + "epoch": 13.274823529411766, + "grad_norm": 0.7263596653938293, + "learning_rate": 1.0965299778319728e-05, + "loss": 1.2441, + "mean_token_accuracy": 0.5351598154753446, + "num_tokens": 1139128.0, + "step": 7050 + }, + { + "epoch": 13.29364705882353, + "grad_norm": 1.76611328125, + "learning_rate": 1.0944423694459087e-05, + "loss": 1.1559, + "mean_token_accuracy": 0.567909749224782, + "num_tokens": 1152782.0, + "step": 7060 + }, + { + "epoch": 13.312470588235294, + "grad_norm": 1.1992835998535156, + "learning_rate": 1.0923543456824737e-05, + "loss": 1.1802, + "mean_token_accuracy": 0.5539811603724957, + "num_tokens": 1166674.0, + "step": 7070 + }, + { + "epoch": 13.331294117647058, + "grad_norm": 1.3675031661987305, + "learning_rate": 1.0902659157252333e-05, + "loss": 1.1545, + "mean_token_accuracy": 0.5604531057178974, + "num_tokens": 1180331.0, + "step": 7080 + }, + { + "epoch": 13.350117647058823, + "grad_norm": 0.7396109700202942, + "learning_rate": 1.088177088759542e-05, + "loss": 1.158, + "mean_token_accuracy": 0.5598292458802462, + "num_tokens": 1193986.0, + "step": 7090 + }, + { + "epoch": 13.368941176470589, + "grad_norm": 0.7911150455474854, + "learning_rate": 1.0860878739724989e-05, + "loss": 1.1662, + "mean_token_accuracy": 0.5609062645584345, + "num_tokens": 1207593.0, + "step": 7100 + }, + { + "epoch": 13.387764705882352, + "grad_norm": 0.7450502514839172, + "learning_rate": 1.0839982805529097e-05, + "loss": 1.1734, + "mean_token_accuracy": 0.5542735267430544, + "num_tokens": 1221421.0, + "step": 7110 + }, + { + "epoch": 13.406588235294118, + "grad_norm": 0.695760190486908, + "learning_rate": 1.0819083176912446e-05, + "loss": 1.203, + "mean_token_accuracy": 0.5460193280130625, + "num_tokens": 1235153.0, + "step": 7120 + }, + { + "epoch": 13.425411764705883, + "grad_norm": 0.8482743501663208, + "learning_rate": 1.0798179945795996e-05, + "loss": 1.2084, + "mean_token_accuracy": 0.5523406885564327, + "num_tokens": 1248129.0, + "step": 7130 + }, + { + "epoch": 13.444235294117647, + "grad_norm": 0.7389087677001953, + "learning_rate": 1.0777273204116541e-05, + "loss": 1.1817, + "mean_token_accuracy": 0.5641430784016848, + "num_tokens": 1261697.0, + "step": 7140 + }, + { + "epoch": 13.463058823529412, + "grad_norm": 0.7677028179168701, + "learning_rate": 1.0756363043826328e-05, + "loss": 1.2031, + "mean_token_accuracy": 0.5505106158554554, + "num_tokens": 1274933.0, + "step": 7150 + }, + { + "epoch": 13.481882352941177, + "grad_norm": 0.8022538423538208, + "learning_rate": 1.0735449556892622e-05, + "loss": 1.1603, + "mean_token_accuracy": 0.5579750452190637, + "num_tokens": 1288707.0, + "step": 7160 + }, + { + "epoch": 13.500705882352941, + "grad_norm": 0.6393579840660095, + "learning_rate": 1.0714532835297344e-05, + "loss": 1.1945, + "mean_token_accuracy": 0.556298240274191, + "num_tokens": 1302937.0, + "step": 7170 + }, + { + "epoch": 13.519529411764706, + "grad_norm": 0.8472998142242432, + "learning_rate": 1.0693612971036616e-05, + "loss": 1.2097, + "mean_token_accuracy": 0.5355463117361069, + "num_tokens": 1316118.0, + "step": 7180 + }, + { + "epoch": 13.53835294117647, + "grad_norm": 0.5993279218673706, + "learning_rate": 1.0672690056120398e-05, + "loss": 1.1842, + "mean_token_accuracy": 0.5584869582206011, + "num_tokens": 1329672.0, + "step": 7190 + }, + { + "epoch": 13.557176470588235, + "grad_norm": 1.1063508987426758, + "learning_rate": 1.0651764182572063e-05, + "loss": 1.1652, + "mean_token_accuracy": 0.5537949342280626, + "num_tokens": 1342869.0, + "step": 7200 + }, + { + "epoch": 13.576, + "grad_norm": 0.9842997789382935, + "learning_rate": 1.0630835442428001e-05, + "loss": 1.2162, + "mean_token_accuracy": 0.5468976002186536, + "num_tokens": 1356452.0, + "step": 7210 + }, + { + "epoch": 13.594823529411764, + "grad_norm": 0.7008135914802551, + "learning_rate": 1.0609903927737196e-05, + "loss": 1.174, + "mean_token_accuracy": 0.5521068956702948, + "num_tokens": 1370184.0, + "step": 7220 + }, + { + "epoch": 13.61364705882353, + "grad_norm": 1.510910987854004, + "learning_rate": 1.0588969730560852e-05, + "loss": 1.2074, + "mean_token_accuracy": 0.5425486758351326, + "num_tokens": 1383485.0, + "step": 7230 + }, + { + "epoch": 13.632470588235295, + "grad_norm": 0.8757747411727905, + "learning_rate": 1.0568032942971962e-05, + "loss": 1.209, + "mean_token_accuracy": 0.5415149603039027, + "num_tokens": 1398031.0, + "step": 7240 + }, + { + "epoch": 13.651294117647058, + "grad_norm": 1.3053663969039917, + "learning_rate": 1.0547093657054914e-05, + "loss": 1.1542, + "mean_token_accuracy": 0.5658162008970976, + "num_tokens": 1410853.0, + "step": 7250 + }, + { + "epoch": 13.670117647058824, + "grad_norm": 1.395501971244812, + "learning_rate": 1.0526151964905085e-05, + "loss": 1.1775, + "mean_token_accuracy": 0.5551408220082521, + "num_tokens": 1423509.0, + "step": 7260 + }, + { + "epoch": 13.688941176470589, + "grad_norm": 1.4833544492721558, + "learning_rate": 1.0505207958628438e-05, + "loss": 1.0948, + "mean_token_accuracy": 0.5793862946331501, + "num_tokens": 1437175.0, + "step": 7270 + }, + { + "epoch": 13.707764705882353, + "grad_norm": 0.8894456624984741, + "learning_rate": 1.0484261730341101e-05, + "loss": 1.1577, + "mean_token_accuracy": 0.5599946200847625, + "num_tokens": 1449621.0, + "step": 7280 + }, + { + "epoch": 13.726588235294118, + "grad_norm": 0.9780417680740356, + "learning_rate": 1.0463313372168993e-05, + "loss": 1.2138, + "mean_token_accuracy": 0.5368953734636307, + "num_tokens": 1463044.0, + "step": 7290 + }, + { + "epoch": 13.745411764705882, + "grad_norm": 1.0154612064361572, + "learning_rate": 1.0442362976247384e-05, + "loss": 1.2187, + "mean_token_accuracy": 0.5392611972987652, + "num_tokens": 1476500.0, + "step": 7300 + }, + { + "epoch": 13.764235294117647, + "grad_norm": 0.8700633645057678, + "learning_rate": 1.0421410634720523e-05, + "loss": 1.1487, + "mean_token_accuracy": 0.5718079563230276, + "num_tokens": 1489072.0, + "step": 7310 + }, + { + "epoch": 13.783058823529412, + "grad_norm": 1.255520224571228, + "learning_rate": 1.0400456439741203e-05, + "loss": 1.1511, + "mean_token_accuracy": 0.5543713182210922, + "num_tokens": 1502885.0, + "step": 7320 + }, + { + "epoch": 13.801882352941176, + "grad_norm": 0.9495011568069458, + "learning_rate": 1.0379500483470373e-05, + "loss": 1.1583, + "mean_token_accuracy": 0.5580469910055399, + "num_tokens": 1515640.0, + "step": 7330 + }, + { + "epoch": 13.820705882352941, + "grad_norm": 0.7675338387489319, + "learning_rate": 1.035854285807673e-05, + "loss": 1.1739, + "mean_token_accuracy": 0.5525693718343974, + "num_tokens": 1529604.0, + "step": 7340 + }, + { + "epoch": 13.839529411764707, + "grad_norm": 1.0846037864685059, + "learning_rate": 1.0337583655736312e-05, + "loss": 1.2115, + "mean_token_accuracy": 0.5463377732783556, + "num_tokens": 1543205.0, + "step": 7350 + }, + { + "epoch": 13.85835294117647, + "grad_norm": 0.711812436580658, + "learning_rate": 1.0316622968632088e-05, + "loss": 1.2121, + "mean_token_accuracy": 0.5422242067754268, + "num_tokens": 1556017.0, + "step": 7360 + }, + { + "epoch": 13.877176470588235, + "grad_norm": 1.1142287254333496, + "learning_rate": 1.029566088895357e-05, + "loss": 1.1561, + "mean_token_accuracy": 0.5748393829911947, + "num_tokens": 1570116.0, + "step": 7370 + }, + { + "epoch": 13.896, + "grad_norm": 0.8056155443191528, + "learning_rate": 1.0274697508896372e-05, + "loss": 1.1411, + "mean_token_accuracy": 0.5808346830308437, + "num_tokens": 1582496.0, + "step": 7380 + }, + { + "epoch": 13.914823529411764, + "grad_norm": 1.1729984283447266, + "learning_rate": 1.0253732920661856e-05, + "loss": 1.1881, + "mean_token_accuracy": 0.5547745041549206, + "num_tokens": 1596198.0, + "step": 7390 + }, + { + "epoch": 13.93364705882353, + "grad_norm": 1.339120626449585, + "learning_rate": 1.0232767216456672e-05, + "loss": 1.1534, + "mean_token_accuracy": 0.5604519348591566, + "num_tokens": 1609177.0, + "step": 7400 + }, + { + "epoch": 13.952470588235293, + "grad_norm": 0.6790438294410706, + "learning_rate": 1.0211800488492401e-05, + "loss": 1.1662, + "mean_token_accuracy": 0.5518874824047089, + "num_tokens": 1622444.0, + "step": 7410 + }, + { + "epoch": 13.971294117647059, + "grad_norm": 1.8246535062789917, + "learning_rate": 1.01908328289851e-05, + "loss": 1.1406, + "mean_token_accuracy": 0.5649749383330345, + "num_tokens": 1635759.0, + "step": 7420 + }, + { + "epoch": 13.990117647058824, + "grad_norm": 1.3896183967590332, + "learning_rate": 1.0169864330154951e-05, + "loss": 1.1608, + "mean_token_accuracy": 0.5683747977018356, + "num_tokens": 1649592.0, + "step": 7430 + }, + { + "epoch": 14.007529411764706, + "grad_norm": 0.9073938131332397, + "learning_rate": 1.0148895084225807e-05, + "loss": 1.1792, + "mean_token_accuracy": 0.5546492849652832, + "num_tokens": 1661557.0, + "step": 7440 + }, + { + "epoch": 14.026352941176471, + "grad_norm": 0.770523190498352, + "learning_rate": 1.012792518342482e-05, + "loss": 1.1737, + "mean_token_accuracy": 0.5552055723965168, + "num_tokens": 1675410.0, + "step": 7450 + }, + { + "epoch": 14.045176470588235, + "grad_norm": 1.5216394662857056, + "learning_rate": 1.0106954719982014e-05, + "loss": 1.1718, + "mean_token_accuracy": 0.5549504559487104, + "num_tokens": 1689244.0, + "step": 7460 + }, + { + "epoch": 14.064, + "grad_norm": 0.9542890787124634, + "learning_rate": 1.0085983786129894e-05, + "loss": 1.1549, + "mean_token_accuracy": 0.5636588338762522, + "num_tokens": 1702377.0, + "step": 7470 + }, + { + "epoch": 14.082823529411765, + "grad_norm": 0.9031827449798584, + "learning_rate": 1.0065012474103027e-05, + "loss": 1.195, + "mean_token_accuracy": 0.5450264655053616, + "num_tokens": 1714838.0, + "step": 7480 + }, + { + "epoch": 14.101647058823529, + "grad_norm": 1.3046774864196777, + "learning_rate": 1.0044040876137647e-05, + "loss": 1.1586, + "mean_token_accuracy": 0.5547593403607607, + "num_tokens": 1728323.0, + "step": 7490 + }, + { + "epoch": 14.120470588235294, + "grad_norm": 0.7626868486404419, + "learning_rate": 1.0023069084471244e-05, + "loss": 1.193, + "mean_token_accuracy": 0.539740389585495, + "num_tokens": 1741948.0, + "step": 7500 + }, + { + "epoch": 14.13929411764706, + "grad_norm": 0.7623440027236938, + "learning_rate": 1.0002097191342167e-05, + "loss": 1.2001, + "mean_token_accuracy": 0.5466340240091085, + "num_tokens": 1754976.0, + "step": 7510 + }, + { + "epoch": 14.158117647058823, + "grad_norm": 0.5901451706886292, + "learning_rate": 9.981125288989197e-06, + "loss": 1.1834, + "mean_token_accuracy": 0.5578321043401957, + "num_tokens": 1769202.0, + "step": 7520 + }, + { + "epoch": 14.176941176470589, + "grad_norm": 1.7071183919906616, + "learning_rate": 9.960153469651173e-06, + "loss": 1.1479, + "mean_token_accuracy": 0.5668721627444029, + "num_tokens": 1782731.0, + "step": 7530 + }, + { + "epoch": 14.195764705882352, + "grad_norm": 0.84836345911026, + "learning_rate": 9.939181825566555e-06, + "loss": 1.1664, + "mean_token_accuracy": 0.5470958970487118, + "num_tokens": 1795962.0, + "step": 7540 + }, + { + "epoch": 14.214588235294118, + "grad_norm": 0.6625200510025024, + "learning_rate": 9.918210448973041e-06, + "loss": 1.167, + "mean_token_accuracy": 0.5696456581354141, + "num_tokens": 1809352.0, + "step": 7550 + }, + { + "epoch": 14.233411764705883, + "grad_norm": 1.065662145614624, + "learning_rate": 9.897239432107144e-06, + "loss": 1.2344, + "mean_token_accuracy": 0.5390154391527175, + "num_tokens": 1823306.0, + "step": 7560 + }, + { + "epoch": 14.252235294117646, + "grad_norm": 1.492067813873291, + "learning_rate": 9.876268867203803e-06, + "loss": 1.1505, + "mean_token_accuracy": 0.5786493707448244, + "num_tokens": 1836619.0, + "step": 7570 + }, + { + "epoch": 14.271058823529412, + "grad_norm": 0.8435829281806946, + "learning_rate": 9.855298846495964e-06, + "loss": 1.1985, + "mean_token_accuracy": 0.5462658539414406, + "num_tokens": 1850730.0, + "step": 7580 + }, + { + "epoch": 14.289882352941177, + "grad_norm": 0.8233511447906494, + "learning_rate": 9.834329462214186e-06, + "loss": 1.205, + "mean_token_accuracy": 0.5534395117312669, + "num_tokens": 1864080.0, + "step": 7590 + }, + { + "epoch": 14.30870588235294, + "grad_norm": 0.9905659556388855, + "learning_rate": 9.813360806586218e-06, + "loss": 1.223, + "mean_token_accuracy": 0.5520945586264133, + "num_tokens": 1877561.0, + "step": 7600 + }, + { + "epoch": 14.331294117647058, + "grad_norm": 0.8363655805587769, + "learning_rate": 9.792392971836614e-06, + "loss": 1.2066, + "mean_token_accuracy": 0.5393288742750884, + "num_tokens": 14227.0, + "step": 7610 + }, + { + "epoch": 14.350117647058823, + "grad_norm": 1.2111597061157227, + "learning_rate": 9.77142605018631e-06, + "loss": 1.1475, + "mean_token_accuracy": 0.5616458028554916, + "num_tokens": 26809.0, + "step": 7620 + }, + { + "epoch": 14.368941176470589, + "grad_norm": 1.3134557008743286, + "learning_rate": 9.750460133852234e-06, + "loss": 1.1937, + "mean_token_accuracy": 0.5489944957196713, + "num_tokens": 40285.0, + "step": 7630 + }, + { + "epoch": 14.387764705882352, + "grad_norm": 1.0377771854400635, + "learning_rate": 9.729495315046886e-06, + "loss": 1.1611, + "mean_token_accuracy": 0.5607926532626152, + "num_tokens": 53708.0, + "step": 7640 + }, + { + "epoch": 14.406588235294118, + "grad_norm": 0.5833343267440796, + "learning_rate": 9.708531685977945e-06, + "loss": 1.1491, + "mean_token_accuracy": 0.5668897565454245, + "num_tokens": 66990.0, + "step": 7650 + }, + { + "epoch": 14.425411764705883, + "grad_norm": 0.6283827424049377, + "learning_rate": 9.687569338847848e-06, + "loss": 1.1669, + "mean_token_accuracy": 0.5548735350370407, + "num_tokens": 80285.0, + "step": 7660 + }, + { + "epoch": 14.444235294117647, + "grad_norm": 1.9695147275924683, + "learning_rate": 9.666608365853405e-06, + "loss": 1.0941, + "mean_token_accuracy": 0.586016109958291, + "num_tokens": 94238.0, + "step": 7670 + }, + { + "epoch": 14.463058823529412, + "grad_norm": 0.762913703918457, + "learning_rate": 9.645648859185372e-06, + "loss": 1.241, + "mean_token_accuracy": 0.5349636357277632, + "num_tokens": 109038.0, + "step": 7680 + }, + { + "epoch": 14.481882352941177, + "grad_norm": 1.1660629510879517, + "learning_rate": 9.624690911028062e-06, + "loss": 1.2031, + "mean_token_accuracy": 0.5475806038826704, + "num_tokens": 122828.0, + "step": 7690 + }, + { + "epoch": 14.500705882352941, + "grad_norm": 0.7873107194900513, + "learning_rate": 9.603734613558933e-06, + "loss": 1.1112, + "mean_token_accuracy": 0.5692310906946659, + "num_tokens": 136009.0, + "step": 7700 + }, + { + "epoch": 14.519529411764706, + "grad_norm": 0.7429578900337219, + "learning_rate": 9.582780058948182e-06, + "loss": 1.2031, + "mean_token_accuracy": 0.5352565940469504, + "num_tokens": 148439.0, + "step": 7710 + }, + { + "epoch": 14.53835294117647, + "grad_norm": 0.9231197834014893, + "learning_rate": 9.56182733935834e-06, + "loss": 1.1313, + "mean_token_accuracy": 0.5766253888607025, + "num_tokens": 162003.0, + "step": 7720 + }, + { + "epoch": 14.557176470588235, + "grad_norm": 0.9091134071350098, + "learning_rate": 9.540876546943863e-06, + "loss": 1.1224, + "mean_token_accuracy": 0.565272556990385, + "num_tokens": 174711.0, + "step": 7730 + }, + { + "epoch": 14.576, + "grad_norm": 0.7547168731689453, + "learning_rate": 9.51992777385074e-06, + "loss": 1.1618, + "mean_token_accuracy": 0.5629419464617967, + "num_tokens": 188030.0, + "step": 7740 + }, + { + "epoch": 14.594823529411764, + "grad_norm": 0.7168717980384827, + "learning_rate": 9.498981112216073e-06, + "loss": 1.1665, + "mean_token_accuracy": 0.562981392070651, + "num_tokens": 201691.0, + "step": 7750 + }, + { + "epoch": 14.61364705882353, + "grad_norm": 1.3017321825027466, + "learning_rate": 9.478036654167673e-06, + "loss": 1.1908, + "mean_token_accuracy": 0.5513414010405541, + "num_tokens": 215908.0, + "step": 7760 + }, + { + "epoch": 14.632470588235295, + "grad_norm": 0.7235598564147949, + "learning_rate": 9.457094491823674e-06, + "loss": 1.1668, + "mean_token_accuracy": 0.5548354998230934, + "num_tokens": 229890.0, + "step": 7770 + }, + { + "epoch": 14.651294117647058, + "grad_norm": 1.014742136001587, + "learning_rate": 9.436154717292095e-06, + "loss": 1.1381, + "mean_token_accuracy": 0.5685541749000549, + "num_tokens": 242083.0, + "step": 7780 + }, + { + "epoch": 14.670117647058824, + "grad_norm": 1.7762614488601685, + "learning_rate": 9.415217422670465e-06, + "loss": 1.2049, + "mean_token_accuracy": 0.5436010017991066, + "num_tokens": 255157.0, + "step": 7790 + }, + { + "epoch": 14.688941176470589, + "grad_norm": 1.2956479787826538, + "learning_rate": 9.3942827000454e-06, + "loss": 1.1581, + "mean_token_accuracy": 0.5617427326738834, + "num_tokens": 268163.0, + "step": 7800 + }, + { + "epoch": 14.707764705882353, + "grad_norm": 1.3141613006591797, + "learning_rate": 9.37335064149221e-06, + "loss": 1.2012, + "mean_token_accuracy": 0.5483662486076355, + "num_tokens": 281261.0, + "step": 7810 + }, + { + "epoch": 14.726588235294118, + "grad_norm": 0.9441389441490173, + "learning_rate": 9.352421339074481e-06, + "loss": 1.1539, + "mean_token_accuracy": 0.5657226879149675, + "num_tokens": 295835.0, + "step": 7820 + }, + { + "epoch": 14.745411764705882, + "grad_norm": 0.6852191686630249, + "learning_rate": 9.331494884843682e-06, + "loss": 1.1109, + "mean_token_accuracy": 0.5754560541361571, + "num_tokens": 308216.0, + "step": 7830 + }, + { + "epoch": 14.764235294117647, + "grad_norm": 1.1635503768920898, + "learning_rate": 9.310571370838747e-06, + "loss": 1.1794, + "mean_token_accuracy": 0.5661456611007452, + "num_tokens": 321205.0, + "step": 7840 + }, + { + "epoch": 14.783058823529412, + "grad_norm": 0.6046936511993408, + "learning_rate": 9.28965088908569e-06, + "loss": 1.1862, + "mean_token_accuracy": 0.5497476685792207, + "num_tokens": 336086.0, + "step": 7850 + }, + { + "epoch": 14.801882352941176, + "grad_norm": 1.6876798868179321, + "learning_rate": 9.268733531597185e-06, + "loss": 1.149, + "mean_token_accuracy": 0.5605622876435519, + "num_tokens": 348960.0, + "step": 7860 + }, + { + "epoch": 14.820705882352941, + "grad_norm": 0.6555068492889404, + "learning_rate": 9.24781939037215e-06, + "loss": 1.1488, + "mean_token_accuracy": 0.562832348421216, + "num_tokens": 361287.0, + "step": 7870 + }, + { + "epoch": 14.839529411764707, + "grad_norm": 1.4029412269592285, + "learning_rate": 9.226908557395384e-06, + "loss": 1.1503, + "mean_token_accuracy": 0.5643013104796409, + "num_tokens": 373901.0, + "step": 7880 + }, + { + "epoch": 14.85835294117647, + "grad_norm": 0.6774556636810303, + "learning_rate": 9.206001124637113e-06, + "loss": 1.118, + "mean_token_accuracy": 0.5655334409326315, + "num_tokens": 388185.0, + "step": 7890 + }, + { + "epoch": 14.877176470588235, + "grad_norm": 1.3459422588348389, + "learning_rate": 9.185097184052615e-06, + "loss": 1.2017, + "mean_token_accuracy": 0.547975680232048, + "num_tokens": 402096.0, + "step": 7900 + }, + { + "epoch": 14.896, + "grad_norm": 1.6914349794387817, + "learning_rate": 9.164196827581817e-06, + "loss": 1.1513, + "mean_token_accuracy": 0.5657749876379967, + "num_tokens": 415109.0, + "step": 7910 + }, + { + "epoch": 14.914823529411764, + "grad_norm": 0.5575766563415527, + "learning_rate": 9.143300147148869e-06, + "loss": 1.1707, + "mean_token_accuracy": 0.5562089808285237, + "num_tokens": 428081.0, + "step": 7920 + }, + { + "epoch": 14.93364705882353, + "grad_norm": 1.4414509534835815, + "learning_rate": 9.122407234661764e-06, + "loss": 1.225, + "mean_token_accuracy": 0.5390350338071584, + "num_tokens": 441704.0, + "step": 7930 + }, + { + "epoch": 14.952470588235293, + "grad_norm": 1.122767448425293, + "learning_rate": 9.101518182011914e-06, + "loss": 1.1717, + "mean_token_accuracy": 0.5577954012900591, + "num_tokens": 454263.0, + "step": 7940 + }, + { + "epoch": 14.971294117647059, + "grad_norm": 0.7421872615814209, + "learning_rate": 9.080633081073763e-06, + "loss": 1.1535, + "mean_token_accuracy": 0.5636604465544224, + "num_tokens": 467572.0, + "step": 7950 + }, + { + "epoch": 14.990117647058824, + "grad_norm": 0.7315053939819336, + "learning_rate": 9.059752023704367e-06, + "loss": 1.1306, + "mean_token_accuracy": 0.5692378722131253, + "num_tokens": 481113.0, + "step": 7960 + }, + { + "epoch": 15.009411764705883, + "grad_norm": 0.8332544565200806, + "learning_rate": 9.038875101743003e-06, + "loss": 1.2939, + "mean_token_accuracy": 0.5470707878106977, + "num_tokens": 494860.0, + "step": 7970 + }, + { + "epoch": 15.028235294117646, + "grad_norm": 0.7542789578437805, + "learning_rate": 9.018002407010755e-06, + "loss": 1.2465, + "mean_token_accuracy": 0.5324202172458172, + "num_tokens": 509027.0, + "step": 7980 + }, + { + "epoch": 15.047058823529412, + "grad_norm": 0.6684849858283997, + "learning_rate": 8.997134031310123e-06, + "loss": 1.1783, + "mean_token_accuracy": 0.5569613084197045, + "num_tokens": 522094.0, + "step": 7990 + }, + { + "epoch": 15.065882352941177, + "grad_norm": 0.860490620136261, + "learning_rate": 8.976270066424602e-06, + "loss": 1.1533, + "mean_token_accuracy": 0.5639694180339575, + "num_tokens": 535686.0, + "step": 8000 + }, + { + "epoch": 15.08470588235294, + "grad_norm": 1.2201600074768066, + "learning_rate": 8.955410604118287e-06, + "loss": 1.1514, + "mean_token_accuracy": 0.5624887187033891, + "num_tokens": 547438.0, + "step": 8010 + }, + { + "epoch": 15.103529411764706, + "grad_norm": 0.7185003757476807, + "learning_rate": 8.934555736135475e-06, + "loss": 1.1916, + "mean_token_accuracy": 0.5531281687319278, + "num_tokens": 562299.0, + "step": 8020 + }, + { + "epoch": 15.122352941176471, + "grad_norm": 0.9599730372428894, + "learning_rate": 8.913705554200257e-06, + "loss": 1.2061, + "mean_token_accuracy": 0.5452544983476401, + "num_tokens": 575630.0, + "step": 8030 + }, + { + "epoch": 15.141176470588235, + "grad_norm": 1.0057822465896606, + "learning_rate": 8.892860150016108e-06, + "loss": 1.1375, + "mean_token_accuracy": 0.5605553191155195, + "num_tokens": 588679.0, + "step": 8040 + }, + { + "epoch": 15.16, + "grad_norm": 0.7954553961753845, + "learning_rate": 8.872019615265494e-06, + "loss": 1.164, + "mean_token_accuracy": 0.5492696654051542, + "num_tokens": 601254.0, + "step": 8050 + }, + { + "epoch": 15.178823529411765, + "grad_norm": 0.6151052713394165, + "learning_rate": 8.851184041609464e-06, + "loss": 1.2193, + "mean_token_accuracy": 0.5489672936499119, + "num_tokens": 614369.0, + "step": 8060 + }, + { + "epoch": 15.197647058823529, + "grad_norm": 0.8546686768531799, + "learning_rate": 8.830353520687245e-06, + "loss": 1.1938, + "mean_token_accuracy": 0.5609996184706688, + "num_tokens": 628665.0, + "step": 8070 + }, + { + "epoch": 15.216470588235294, + "grad_norm": 0.6473791003227234, + "learning_rate": 8.809528144115842e-06, + "loss": 1.1396, + "mean_token_accuracy": 0.5705398332327605, + "num_tokens": 643134.0, + "step": 8080 + }, + { + "epoch": 15.235294117647058, + "grad_norm": 1.5007941722869873, + "learning_rate": 8.788708003489636e-06, + "loss": 1.1456, + "mean_token_accuracy": 0.5757338788360358, + "num_tokens": 655759.0, + "step": 8090 + }, + { + "epoch": 15.254117647058823, + "grad_norm": 0.6832001209259033, + "learning_rate": 8.767893190379974e-06, + "loss": 1.1718, + "mean_token_accuracy": 0.5538515329360962, + "num_tokens": 669100.0, + "step": 8100 + }, + { + "epoch": 15.272941176470589, + "grad_norm": 0.7107806205749512, + "learning_rate": 8.747083796334776e-06, + "loss": 1.1645, + "mean_token_accuracy": 0.5559101283550263, + "num_tokens": 683005.0, + "step": 8110 + }, + { + "epoch": 15.291764705882352, + "grad_norm": 0.6784150004386902, + "learning_rate": 8.726279912878126e-06, + "loss": 1.1888, + "mean_token_accuracy": 0.5452313166111707, + "num_tokens": 696921.0, + "step": 8120 + }, + { + "epoch": 15.310588235294118, + "grad_norm": 1.1530226469039917, + "learning_rate": 8.705481631509876e-06, + "loss": 1.1809, + "mean_token_accuracy": 0.561325515806675, + "num_tokens": 710780.0, + "step": 8130 + }, + { + "epoch": 15.329411764705883, + "grad_norm": 0.6158255934715271, + "learning_rate": 8.684689043705231e-06, + "loss": 1.1597, + "mean_token_accuracy": 0.5495276678353548, + "num_tokens": 724228.0, + "step": 8140 + }, + { + "epoch": 15.348235294117647, + "grad_norm": 0.6823238730430603, + "learning_rate": 8.663902240914357e-06, + "loss": 1.1347, + "mean_token_accuracy": 0.568655128031969, + "num_tokens": 737696.0, + "step": 8150 + }, + { + "epoch": 15.367058823529412, + "grad_norm": 0.6831355690956116, + "learning_rate": 8.643121314561976e-06, + "loss": 1.1547, + "mean_token_accuracy": 0.5554495759308338, + "num_tokens": 751387.0, + "step": 8160 + }, + { + "epoch": 15.385882352941177, + "grad_norm": 0.7401413321495056, + "learning_rate": 8.622346356046972e-06, + "loss": 1.1248, + "mean_token_accuracy": 0.5787703268229961, + "num_tokens": 764116.0, + "step": 8170 + }, + { + "epoch": 15.40470588235294, + "grad_norm": 1.320250391960144, + "learning_rate": 8.601577456741967e-06, + "loss": 1.1582, + "mean_token_accuracy": 0.5558317702263593, + "num_tokens": 777306.0, + "step": 8180 + }, + { + "epoch": 15.423529411764706, + "grad_norm": 1.5581915378570557, + "learning_rate": 8.580814707992949e-06, + "loss": 1.1475, + "mean_token_accuracy": 0.5548811592161655, + "num_tokens": 790684.0, + "step": 8190 + }, + { + "epoch": 15.44235294117647, + "grad_norm": 0.6925750970840454, + "learning_rate": 8.560058201118842e-06, + "loss": 1.19, + "mean_token_accuracy": 0.5531508523970843, + "num_tokens": 804244.0, + "step": 8200 + }, + { + "epoch": 15.461176470588235, + "grad_norm": 1.275739073753357, + "learning_rate": 8.539308027411123e-06, + "loss": 1.1916, + "mean_token_accuracy": 0.5411147933453322, + "num_tokens": 818070.0, + "step": 8210 + }, + { + "epoch": 15.48, + "grad_norm": 1.308816909790039, + "learning_rate": 8.51856427813341e-06, + "loss": 1.1389, + "mean_token_accuracy": 0.5609162572771311, + "num_tokens": 831998.0, + "step": 8220 + }, + { + "epoch": 15.498823529411764, + "grad_norm": 1.4086875915527344, + "learning_rate": 8.497827044521074e-06, + "loss": 1.1806, + "mean_token_accuracy": 0.5614006619900465, + "num_tokens": 846346.0, + "step": 8230 + }, + { + "epoch": 15.51764705882353, + "grad_norm": 1.5172885656356812, + "learning_rate": 8.477096417780818e-06, + "loss": 1.1423, + "mean_token_accuracy": 0.5598192039877177, + "num_tokens": 859606.0, + "step": 8240 + }, + { + "epoch": 15.536470588235295, + "grad_norm": 0.635094404220581, + "learning_rate": 8.456372489090294e-06, + "loss": 1.195, + "mean_token_accuracy": 0.5515242625027895, + "num_tokens": 872352.0, + "step": 8250 + }, + { + "epoch": 15.555294117647058, + "grad_norm": 1.049091100692749, + "learning_rate": 8.43565534959769e-06, + "loss": 1.1708, + "mean_token_accuracy": 0.5665956649929285, + "num_tokens": 886434.0, + "step": 8260 + }, + { + "epoch": 15.574117647058824, + "grad_norm": 0.6422486901283264, + "learning_rate": 8.414945090421337e-06, + "loss": 1.1199, + "mean_token_accuracy": 0.5681115534156561, + "num_tokens": 899435.0, + "step": 8270 + }, + { + "epoch": 15.592941176470589, + "grad_norm": 1.1809626817703247, + "learning_rate": 8.394241802649307e-06, + "loss": 1.1553, + "mean_token_accuracy": 0.5631851524114608, + "num_tokens": 912350.0, + "step": 8280 + }, + { + "epoch": 15.611764705882353, + "grad_norm": 0.8651266694068909, + "learning_rate": 8.373545577339002e-06, + "loss": 1.1419, + "mean_token_accuracy": 0.5651818908751011, + "num_tokens": 925480.0, + "step": 8290 + }, + { + "epoch": 15.630588235294118, + "grad_norm": 0.7373852729797363, + "learning_rate": 8.352856505516765e-06, + "loss": 1.1959, + "mean_token_accuracy": 0.5542501173913479, + "num_tokens": 938863.0, + "step": 8300 + }, + { + "epoch": 15.649411764705881, + "grad_norm": 1.3229117393493652, + "learning_rate": 8.33217467817748e-06, + "loss": 1.1445, + "mean_token_accuracy": 0.5683686885982752, + "num_tokens": 952271.0, + "step": 8310 + }, + { + "epoch": 15.668235294117647, + "grad_norm": 0.9950488805770874, + "learning_rate": 8.311500186284166e-06, + "loss": 1.1469, + "mean_token_accuracy": 0.5671338357031346, + "num_tokens": 966154.0, + "step": 8320 + }, + { + "epoch": 15.687058823529412, + "grad_norm": 1.0574768781661987, + "learning_rate": 8.290833120767585e-06, + "loss": 1.1745, + "mean_token_accuracy": 0.5554843176156282, + "num_tokens": 978470.0, + "step": 8330 + }, + { + "epoch": 15.705882352941176, + "grad_norm": 0.9360321760177612, + "learning_rate": 8.270173572525824e-06, + "loss": 1.1932, + "mean_token_accuracy": 0.5552597276866436, + "num_tokens": 992540.0, + "step": 8340 + }, + { + "epoch": 15.724705882352941, + "grad_norm": 1.4928091764450073, + "learning_rate": 8.249521632423918e-06, + "loss": 1.1648, + "mean_token_accuracy": 0.5653361968696118, + "num_tokens": 1006353.0, + "step": 8350 + }, + { + "epoch": 15.743529411764706, + "grad_norm": 0.7848607897758484, + "learning_rate": 8.228877391293432e-06, + "loss": 1.1971, + "mean_token_accuracy": 0.5555432129651308, + "num_tokens": 1020378.0, + "step": 8360 + }, + { + "epoch": 15.76235294117647, + "grad_norm": 0.813613772392273, + "learning_rate": 8.20824093993208e-06, + "loss": 1.1464, + "mean_token_accuracy": 0.5570184625685215, + "num_tokens": 1032073.0, + "step": 8370 + }, + { + "epoch": 15.781176470588235, + "grad_norm": 0.6320846080780029, + "learning_rate": 8.1876123691033e-06, + "loss": 1.1486, + "mean_token_accuracy": 0.5596757929772138, + "num_tokens": 1045292.0, + "step": 8380 + }, + { + "epoch": 15.8, + "grad_norm": 1.1251544952392578, + "learning_rate": 8.166991769535886e-06, + "loss": 1.1581, + "mean_token_accuracy": 0.5615175377577544, + "num_tokens": 1058620.0, + "step": 8390 + }, + { + "epoch": 15.818823529411764, + "grad_norm": 1.5427863597869873, + "learning_rate": 8.146379231923558e-06, + "loss": 1.2204, + "mean_token_accuracy": 0.5381950225681067, + "num_tokens": 1072099.0, + "step": 8400 + }, + { + "epoch": 15.83764705882353, + "grad_norm": 0.9844633936882019, + "learning_rate": 8.12577484692459e-06, + "loss": 1.1673, + "mean_token_accuracy": 0.554484510794282, + "num_tokens": 1085409.0, + "step": 8410 + }, + { + "epoch": 15.856470588235293, + "grad_norm": 1.1419299840927124, + "learning_rate": 8.105178705161395e-06, + "loss": 1.1713, + "mean_token_accuracy": 0.5534321576356888, + "num_tokens": 1098803.0, + "step": 8420 + }, + { + "epoch": 15.875294117647059, + "grad_norm": 0.8007948994636536, + "learning_rate": 8.084590897220122e-06, + "loss": 1.1394, + "mean_token_accuracy": 0.562013290822506, + "num_tokens": 1111518.0, + "step": 8430 + }, + { + "epoch": 15.894117647058824, + "grad_norm": 0.7455958724021912, + "learning_rate": 8.064011513650276e-06, + "loss": 1.1577, + "mean_token_accuracy": 0.5672158092260361, + "num_tokens": 1126619.0, + "step": 8440 + }, + { + "epoch": 15.912941176470587, + "grad_norm": 1.5687180757522583, + "learning_rate": 8.04344064496431e-06, + "loss": 1.1668, + "mean_token_accuracy": 0.5503279969096184, + "num_tokens": 1139657.0, + "step": 8450 + }, + { + "epoch": 15.931764705882353, + "grad_norm": 0.9860045909881592, + "learning_rate": 8.022878381637219e-06, + "loss": 1.1937, + "mean_token_accuracy": 0.54759371727705, + "num_tokens": 1153370.0, + "step": 8460 + }, + { + "epoch": 15.950588235294118, + "grad_norm": 1.7666656970977783, + "learning_rate": 8.002324814106161e-06, + "loss": 1.2289, + "mean_token_accuracy": 0.5447251949459314, + "num_tokens": 1166420.0, + "step": 8470 + }, + { + "epoch": 15.969411764705882, + "grad_norm": 1.6037918329238892, + "learning_rate": 7.981780032770035e-06, + "loss": 1.1054, + "mean_token_accuracy": 0.5815329641103745, + "num_tokens": 1179026.0, + "step": 8480 + }, + { + "epoch": 15.988235294117647, + "grad_norm": 1.0090439319610596, + "learning_rate": 7.961244127989112e-06, + "loss": 1.181, + "mean_token_accuracy": 0.5504204016178846, + "num_tokens": 1192954.0, + "step": 8490 + }, + { + "epoch": 16.00564705882353, + "grad_norm": 1.271785855293274, + "learning_rate": 7.940717190084603e-06, + "loss": 1.2075, + "mean_token_accuracy": 0.5598280276801135, + "num_tokens": 1206256.0, + "step": 8500 + }, + { + "epoch": 16.024470588235292, + "grad_norm": 1.0442641973495483, + "learning_rate": 7.9201993093383e-06, + "loss": 1.1703, + "mean_token_accuracy": 0.5635019179433585, + "num_tokens": 1219916.0, + "step": 8510 + }, + { + "epoch": 16.043294117647058, + "grad_norm": 0.7755882143974304, + "learning_rate": 7.899690575992144e-06, + "loss": 1.2, + "mean_token_accuracy": 0.5463937662541867, + "num_tokens": 1233382.0, + "step": 8520 + }, + { + "epoch": 16.062117647058823, + "grad_norm": 0.8577190041542053, + "learning_rate": 7.879191080247857e-06, + "loss": 1.1861, + "mean_token_accuracy": 0.5470962207764387, + "num_tokens": 1248208.0, + "step": 8530 + }, + { + "epoch": 16.08094117647059, + "grad_norm": 0.7220326662063599, + "learning_rate": 7.85870091226652e-06, + "loss": 1.2134, + "mean_token_accuracy": 0.5384833466261625, + "num_tokens": 1262112.0, + "step": 8540 + }, + { + "epoch": 16.099764705882354, + "grad_norm": 0.8119881749153137, + "learning_rate": 7.838220162168199e-06, + "loss": 1.1882, + "mean_token_accuracy": 0.5573807552456855, + "num_tokens": 1274841.0, + "step": 8550 + }, + { + "epoch": 16.11858823529412, + "grad_norm": 0.7537893056869507, + "learning_rate": 7.817748920031533e-06, + "loss": 1.1632, + "mean_token_accuracy": 0.5611206289380789, + "num_tokens": 1289035.0, + "step": 8560 + }, + { + "epoch": 16.13741176470588, + "grad_norm": 1.9209939241409302, + "learning_rate": 7.797287275893339e-06, + "loss": 1.1986, + "mean_token_accuracy": 0.5489944905042649, + "num_tokens": 1302422.0, + "step": 8570 + }, + { + "epoch": 16.156235294117646, + "grad_norm": 0.7403332591056824, + "learning_rate": 7.776835319748226e-06, + "loss": 1.1926, + "mean_token_accuracy": 0.544366030395031, + "num_tokens": 1316218.0, + "step": 8580 + }, + { + "epoch": 16.17505882352941, + "grad_norm": 0.7108265161514282, + "learning_rate": 7.756393141548196e-06, + "loss": 1.188, + "mean_token_accuracy": 0.5483727026730776, + "num_tokens": 1329307.0, + "step": 8590 + }, + { + "epoch": 16.193882352941177, + "grad_norm": 0.9194144606590271, + "learning_rate": 7.735960831202233e-06, + "loss": 1.1304, + "mean_token_accuracy": 0.5695245500653983, + "num_tokens": 1341639.0, + "step": 8600 + }, + { + "epoch": 16.212705882352942, + "grad_norm": 0.9176170825958252, + "learning_rate": 7.715538478575938e-06, + "loss": 1.1746, + "mean_token_accuracy": 0.5415426712483168, + "num_tokens": 1355385.0, + "step": 8610 + }, + { + "epoch": 16.231529411764704, + "grad_norm": 0.9545559883117676, + "learning_rate": 7.695126173491096e-06, + "loss": 1.1516, + "mean_token_accuracy": 0.5630953580141067, + "num_tokens": 1369295.0, + "step": 8620 + }, + { + "epoch": 16.25035294117647, + "grad_norm": 0.5547646284103394, + "learning_rate": 7.67472400572532e-06, + "loss": 1.1848, + "mean_token_accuracy": 0.5485550325363875, + "num_tokens": 1382440.0, + "step": 8630 + }, + { + "epoch": 16.269176470588235, + "grad_norm": 1.0427489280700684, + "learning_rate": 7.65433206501162e-06, + "loss": 1.1646, + "mean_token_accuracy": 0.5695793781429529, + "num_tokens": 1396411.0, + "step": 8640 + }, + { + "epoch": 16.288, + "grad_norm": 1.417345643043518, + "learning_rate": 7.633950441038041e-06, + "loss": 1.1358, + "mean_token_accuracy": 0.5730958338826895, + "num_tokens": 1409619.0, + "step": 8650 + }, + { + "epoch": 16.306823529411766, + "grad_norm": 0.6368844509124756, + "learning_rate": 7.613579223447238e-06, + "loss": 1.1379, + "mean_token_accuracy": 0.5624699790030718, + "num_tokens": 1422290.0, + "step": 8660 + }, + { + "epoch": 16.32564705882353, + "grad_norm": 0.7046752572059631, + "learning_rate": 7.593218501836108e-06, + "loss": 1.1817, + "mean_token_accuracy": 0.5441745646297932, + "num_tokens": 1435116.0, + "step": 8670 + }, + { + "epoch": 16.344470588235293, + "grad_norm": 0.9974550604820251, + "learning_rate": 7.572868365755377e-06, + "loss": 1.1776, + "mean_token_accuracy": 0.5540152471512556, + "num_tokens": 1449169.0, + "step": 8680 + }, + { + "epoch": 16.363294117647058, + "grad_norm": 0.9207789897918701, + "learning_rate": 7.552528904709224e-06, + "loss": 1.1139, + "mean_token_accuracy": 0.5721325032413006, + "num_tokens": 1461392.0, + "step": 8690 + }, + { + "epoch": 16.382117647058823, + "grad_norm": 0.7715643644332886, + "learning_rate": 7.532200208154856e-06, + "loss": 1.1484, + "mean_token_accuracy": 0.5646240394562483, + "num_tokens": 1475085.0, + "step": 8700 + }, + { + "epoch": 16.40094117647059, + "grad_norm": 0.6167107224464417, + "learning_rate": 7.511882365502161e-06, + "loss": 1.113, + "mean_token_accuracy": 0.568938347697258, + "num_tokens": 1488403.0, + "step": 8710 + }, + { + "epoch": 16.419764705882354, + "grad_norm": 1.0627272129058838, + "learning_rate": 7.491575466113269e-06, + "loss": 1.1889, + "mean_token_accuracy": 0.5542673517018557, + "num_tokens": 1501007.0, + "step": 8720 + }, + { + "epoch": 16.438588235294116, + "grad_norm": 0.8951665759086609, + "learning_rate": 7.4712795993021936e-06, + "loss": 1.1568, + "mean_token_accuracy": 0.5628596622496843, + "num_tokens": 1513501.0, + "step": 8730 + }, + { + "epoch": 16.45741176470588, + "grad_norm": 1.0367953777313232, + "learning_rate": 7.450994854334414e-06, + "loss": 1.1873, + "mean_token_accuracy": 0.5458543870598078, + "num_tokens": 1526907.0, + "step": 8740 + }, + { + "epoch": 16.476235294117647, + "grad_norm": 0.9993324875831604, + "learning_rate": 7.430721320426502e-06, + "loss": 1.1875, + "mean_token_accuracy": 0.5544692728668451, + "num_tokens": 1540310.0, + "step": 8750 + }, + { + "epoch": 16.495058823529412, + "grad_norm": 0.7184901237487793, + "learning_rate": 7.410459086745715e-06, + "loss": 1.1688, + "mean_token_accuracy": 0.5600904107093811, + "num_tokens": 1554564.0, + "step": 8760 + }, + { + "epoch": 16.513882352941177, + "grad_norm": 1.4242663383483887, + "learning_rate": 7.390208242409611e-06, + "loss": 1.1422, + "mean_token_accuracy": 0.5547851927578449, + "num_tokens": 1568019.0, + "step": 8770 + }, + { + "epoch": 16.532705882352943, + "grad_norm": 0.606593906879425, + "learning_rate": 7.3699688764856556e-06, + "loss": 1.1774, + "mean_token_accuracy": 0.5609697885811329, + "num_tokens": 1581881.0, + "step": 8780 + }, + { + "epoch": 16.551529411764704, + "grad_norm": 0.9548640847206116, + "learning_rate": 7.349741077990833e-06, + "loss": 1.1215, + "mean_token_accuracy": 0.5721657130867243, + "num_tokens": 1594281.0, + "step": 8790 + }, + { + "epoch": 16.57035294117647, + "grad_norm": 1.281101942062378, + "learning_rate": 7.3295249358912415e-06, + "loss": 1.1452, + "mean_token_accuracy": 0.5627951502799988, + "num_tokens": 1607907.0, + "step": 8800 + }, + { + "epoch": 16.591058823529412, + "grad_norm": 1.2261985540390015, + "learning_rate": 7.3093205391017275e-06, + "loss": 1.1948, + "mean_token_accuracy": 0.5499283254146576, + "num_tokens": 13007.0, + "step": 8810 + }, + { + "epoch": 16.609882352941177, + "grad_norm": 0.959309458732605, + "learning_rate": 7.289127976485462e-06, + "loss": 1.1569, + "mean_token_accuracy": 0.5632215116173029, + "num_tokens": 27121.0, + "step": 8820 + }, + { + "epoch": 16.628705882352943, + "grad_norm": 0.8404517769813538, + "learning_rate": 7.268947336853588e-06, + "loss": 1.2085, + "mean_token_accuracy": 0.5531386416405439, + "num_tokens": 40179.0, + "step": 8830 + }, + { + "epoch": 16.647529411764705, + "grad_norm": 1.2052935361862183, + "learning_rate": 7.248778708964781e-06, + "loss": 1.1325, + "mean_token_accuracy": 0.5616716485470533, + "num_tokens": 52387.0, + "step": 8840 + }, + { + "epoch": 16.66635294117647, + "grad_norm": 1.1935890913009644, + "learning_rate": 7.228622181524909e-06, + "loss": 1.1662, + "mean_token_accuracy": 0.5652685184031725, + "num_tokens": 66527.0, + "step": 8850 + }, + { + "epoch": 16.685176470588235, + "grad_norm": 1.4688079357147217, + "learning_rate": 7.20847784318661e-06, + "loss": 1.1733, + "mean_token_accuracy": 0.5583682101219892, + "num_tokens": 79512.0, + "step": 8860 + }, + { + "epoch": 16.704, + "grad_norm": 0.9977661967277527, + "learning_rate": 7.188345782548918e-06, + "loss": 1.1196, + "mean_token_accuracy": 0.5758439347147941, + "num_tokens": 92443.0, + "step": 8870 + }, + { + "epoch": 16.722823529411766, + "grad_norm": 1.6382378339767456, + "learning_rate": 7.168226088156858e-06, + "loss": 1.1558, + "mean_token_accuracy": 0.563961322978139, + "num_tokens": 107011.0, + "step": 8880 + }, + { + "epoch": 16.741647058823528, + "grad_norm": 0.7158175110816956, + "learning_rate": 7.148118848501073e-06, + "loss": 1.2003, + "mean_token_accuracy": 0.5421418201178312, + "num_tokens": 120340.0, + "step": 8890 + }, + { + "epoch": 16.760470588235293, + "grad_norm": 0.7682539224624634, + "learning_rate": 7.128024152017426e-06, + "loss": 1.1337, + "mean_token_accuracy": 0.5682530965656042, + "num_tokens": 133870.0, + "step": 8900 + }, + { + "epoch": 16.77929411764706, + "grad_norm": 1.2490408420562744, + "learning_rate": 7.10794208708661e-06, + "loss": 1.1464, + "mean_token_accuracy": 0.5654782570898533, + "num_tokens": 147737.0, + "step": 8910 + }, + { + "epoch": 16.798117647058824, + "grad_norm": 1.0072635412216187, + "learning_rate": 7.087872742033761e-06, + "loss": 1.1675, + "mean_token_accuracy": 0.5675601534545421, + "num_tokens": 160861.0, + "step": 8920 + }, + { + "epoch": 16.81694117647059, + "grad_norm": 0.9989560842514038, + "learning_rate": 7.0678162051280796e-06, + "loss": 1.1504, + "mean_token_accuracy": 0.5777845904231071, + "num_tokens": 173818.0, + "step": 8930 + }, + { + "epoch": 16.835764705882355, + "grad_norm": 0.7746507525444031, + "learning_rate": 7.04777256458242e-06, + "loss": 1.2331, + "mean_token_accuracy": 0.5357637394219636, + "num_tokens": 187606.0, + "step": 8940 + }, + { + "epoch": 16.854588235294116, + "grad_norm": 0.5496880412101746, + "learning_rate": 7.0277419085529275e-06, + "loss": 1.1534, + "mean_token_accuracy": 0.5625104811042547, + "num_tokens": 200788.0, + "step": 8950 + }, + { + "epoch": 16.87341176470588, + "grad_norm": 0.7524011731147766, + "learning_rate": 7.007724325138626e-06, + "loss": 1.1731, + "mean_token_accuracy": 0.5571359943598508, + "num_tokens": 214193.0, + "step": 8960 + }, + { + "epoch": 16.892235294117647, + "grad_norm": 1.898985743522644, + "learning_rate": 6.987719902381063e-06, + "loss": 1.1823, + "mean_token_accuracy": 0.546281049400568, + "num_tokens": 227004.0, + "step": 8970 + }, + { + "epoch": 16.911058823529412, + "grad_norm": 1.2188752889633179, + "learning_rate": 6.967728728263875e-06, + "loss": 1.2082, + "mean_token_accuracy": 0.5488316975533962, + "num_tokens": 240725.0, + "step": 8980 + }, + { + "epoch": 16.929882352941178, + "grad_norm": 1.4341834783554077, + "learning_rate": 6.947750890712452e-06, + "loss": 1.1383, + "mean_token_accuracy": 0.566087681055069, + "num_tokens": 255280.0, + "step": 8990 + }, + { + "epoch": 16.94870588235294, + "grad_norm": 1.4695709943771362, + "learning_rate": 6.927786477593517e-06, + "loss": 1.1297, + "mean_token_accuracy": 0.571322912350297, + "num_tokens": 268707.0, + "step": 9000 + }, + { + "epoch": 16.967529411764705, + "grad_norm": 1.0631098747253418, + "learning_rate": 6.907835576714752e-06, + "loss": 1.1401, + "mean_token_accuracy": 0.5591850385069848, + "num_tokens": 282374.0, + "step": 9010 + }, + { + "epoch": 16.98635294117647, + "grad_norm": 0.7683926820755005, + "learning_rate": 6.887898275824405e-06, + "loss": 1.1538, + "mean_token_accuracy": 0.5545760612934828, + "num_tokens": 295895.0, + "step": 9020 + }, + { + "epoch": 17.00564705882353, + "grad_norm": 0.5843003392219543, + "learning_rate": 6.8679746626109165e-06, + "loss": 1.304, + "mean_token_accuracy": 0.5468519330024719, + "num_tokens": 309096.0, + "step": 9030 + }, + { + "epoch": 17.024470588235292, + "grad_norm": 0.5110841393470764, + "learning_rate": 6.848064824702518e-06, + "loss": 1.1689, + "mean_token_accuracy": 0.5539047036319971, + "num_tokens": 322843.0, + "step": 9040 + }, + { + "epoch": 17.043294117647058, + "grad_norm": 0.681012749671936, + "learning_rate": 6.828168849666859e-06, + "loss": 1.1473, + "mean_token_accuracy": 0.5699756104499102, + "num_tokens": 335834.0, + "step": 9050 + }, + { + "epoch": 17.062117647058823, + "grad_norm": 0.6035940647125244, + "learning_rate": 6.808286825010611e-06, + "loss": 1.1957, + "mean_token_accuracy": 0.5480252616107464, + "num_tokens": 349415.0, + "step": 9060 + }, + { + "epoch": 17.08094117647059, + "grad_norm": 0.793001651763916, + "learning_rate": 6.788418838179101e-06, + "loss": 1.1495, + "mean_token_accuracy": 0.5695446979254484, + "num_tokens": 362782.0, + "step": 9070 + }, + { + "epoch": 17.099764705882354, + "grad_norm": 0.5845211148262024, + "learning_rate": 6.768564976555898e-06, + "loss": 1.2018, + "mean_token_accuracy": 0.5484800077974796, + "num_tokens": 375606.0, + "step": 9080 + }, + { + "epoch": 17.11858823529412, + "grad_norm": 0.7158066630363464, + "learning_rate": 6.748725327462462e-06, + "loss": 1.1601, + "mean_token_accuracy": 0.5678265064954757, + "num_tokens": 388427.0, + "step": 9090 + }, + { + "epoch": 17.13741176470588, + "grad_norm": 1.2140324115753174, + "learning_rate": 6.728899978157729e-06, + "loss": 1.2314, + "mean_token_accuracy": 0.5344064626842737, + "num_tokens": 402111.0, + "step": 9100 + }, + { + "epoch": 17.156235294117646, + "grad_norm": 1.0139904022216797, + "learning_rate": 6.709089015837758e-06, + "loss": 1.1831, + "mean_token_accuracy": 0.5626831982284785, + "num_tokens": 416419.0, + "step": 9110 + }, + { + "epoch": 17.17505882352941, + "grad_norm": 0.5783360600471497, + "learning_rate": 6.68929252763531e-06, + "loss": 1.1888, + "mean_token_accuracy": 0.5566362496465445, + "num_tokens": 430433.0, + "step": 9120 + }, + { + "epoch": 17.193882352941177, + "grad_norm": 1.0979998111724854, + "learning_rate": 6.669510600619502e-06, + "loss": 1.1366, + "mean_token_accuracy": 0.572005919739604, + "num_tokens": 442507.0, + "step": 9130 + }, + { + "epoch": 17.212705882352942, + "grad_norm": 1.239842414855957, + "learning_rate": 6.649743321795401e-06, + "loss": 1.1488, + "mean_token_accuracy": 0.5650555603206158, + "num_tokens": 455301.0, + "step": 9140 + }, + { + "epoch": 17.231529411764704, + "grad_norm": 0.9120736718177795, + "learning_rate": 6.629990778103652e-06, + "loss": 1.1347, + "mean_token_accuracy": 0.5705232992768288, + "num_tokens": 468128.0, + "step": 9150 + }, + { + "epoch": 17.25035294117647, + "grad_norm": 1.5878956317901611, + "learning_rate": 6.6102530564200885e-06, + "loss": 1.1428, + "mean_token_accuracy": 0.5688801523298025, + "num_tokens": 482382.0, + "step": 9160 + }, + { + "epoch": 17.269176470588235, + "grad_norm": 1.3523510694503784, + "learning_rate": 6.5905302435553575e-06, + "loss": 1.1501, + "mean_token_accuracy": 0.5716863550245762, + "num_tokens": 495560.0, + "step": 9170 + }, + { + "epoch": 17.288, + "grad_norm": 1.0103275775909424, + "learning_rate": 6.570822426254526e-06, + "loss": 1.1479, + "mean_token_accuracy": 0.5623312875628471, + "num_tokens": 509660.0, + "step": 9180 + }, + { + "epoch": 17.306823529411766, + "grad_norm": 0.7961730360984802, + "learning_rate": 6.55112969119672e-06, + "loss": 1.177, + "mean_token_accuracy": 0.5561908625066281, + "num_tokens": 523652.0, + "step": 9190 + }, + { + "epoch": 17.32564705882353, + "grad_norm": 1.049294114112854, + "learning_rate": 6.531452124994716e-06, + "loss": 1.1729, + "mean_token_accuracy": 0.553871612995863, + "num_tokens": 536343.0, + "step": 9200 + }, + { + "epoch": 17.344470588235293, + "grad_norm": 0.9252281188964844, + "learning_rate": 6.511789814194588e-06, + "loss": 1.1302, + "mean_token_accuracy": 0.567984351888299, + "num_tokens": 549308.0, + "step": 9210 + }, + { + "epoch": 17.363294117647058, + "grad_norm": 0.8509281277656555, + "learning_rate": 6.492142845275302e-06, + "loss": 1.1896, + "mean_token_accuracy": 0.5457706928253174, + "num_tokens": 562695.0, + "step": 9220 + }, + { + "epoch": 17.382117647058823, + "grad_norm": 0.8771809935569763, + "learning_rate": 6.472511304648359e-06, + "loss": 1.1732, + "mean_token_accuracy": 0.5531365133821964, + "num_tokens": 575073.0, + "step": 9230 + }, + { + "epoch": 17.40094117647059, + "grad_norm": 1.814473032951355, + "learning_rate": 6.4528952786573904e-06, + "loss": 1.1541, + "mean_token_accuracy": 0.5633249927312136, + "num_tokens": 588911.0, + "step": 9240 + }, + { + "epoch": 17.419764705882354, + "grad_norm": 0.7689526081085205, + "learning_rate": 6.4332948535778075e-06, + "loss": 1.2086, + "mean_token_accuracy": 0.5450298830866813, + "num_tokens": 603178.0, + "step": 9250 + }, + { + "epoch": 17.438588235294116, + "grad_norm": 0.8878546357154846, + "learning_rate": 6.413710115616383e-06, + "loss": 1.1557, + "mean_token_accuracy": 0.5638493042439222, + "num_tokens": 616690.0, + "step": 9260 + }, + { + "epoch": 17.45741176470588, + "grad_norm": 0.610453188419342, + "learning_rate": 6.394141150910913e-06, + "loss": 1.1544, + "mean_token_accuracy": 0.5664511952549219, + "num_tokens": 629868.0, + "step": 9270 + }, + { + "epoch": 17.476235294117647, + "grad_norm": 0.7785117626190186, + "learning_rate": 6.37458804552981e-06, + "loss": 1.1758, + "mean_token_accuracy": 0.5591502383351326, + "num_tokens": 643658.0, + "step": 9280 + }, + { + "epoch": 17.495058823529412, + "grad_norm": 0.694078803062439, + "learning_rate": 6.355050885471743e-06, + "loss": 1.1698, + "mean_token_accuracy": 0.5497753735631704, + "num_tokens": 656038.0, + "step": 9290 + }, + { + "epoch": 17.513882352941177, + "grad_norm": 0.9329729676246643, + "learning_rate": 6.33552975666524e-06, + "loss": 1.1292, + "mean_token_accuracy": 0.5673054289072752, + "num_tokens": 670286.0, + "step": 9300 + }, + { + "epoch": 17.532705882352943, + "grad_norm": 1.1342458724975586, + "learning_rate": 6.316024744968327e-06, + "loss": 1.2161, + "mean_token_accuracy": 0.5357775934040546, + "num_tokens": 683493.0, + "step": 9310 + }, + { + "epoch": 17.551529411764704, + "grad_norm": 0.8364800810813904, + "learning_rate": 6.296535936168137e-06, + "loss": 1.1663, + "mean_token_accuracy": 0.5568131286650896, + "num_tokens": 697575.0, + "step": 9320 + }, + { + "epoch": 17.57035294117647, + "grad_norm": 1.625592589378357, + "learning_rate": 6.277063415980549e-06, + "loss": 1.1174, + "mean_token_accuracy": 0.5752797372639179, + "num_tokens": 710207.0, + "step": 9330 + }, + { + "epoch": 17.589176470588235, + "grad_norm": 1.3862090110778809, + "learning_rate": 6.257607270049791e-06, + "loss": 1.142, + "mean_token_accuracy": 0.5722228426486253, + "num_tokens": 724438.0, + "step": 9340 + }, + { + "epoch": 17.608, + "grad_norm": 1.26033616065979, + "learning_rate": 6.238167583948082e-06, + "loss": 1.1907, + "mean_token_accuracy": 0.5385926622897387, + "num_tokens": 739319.0, + "step": 9350 + }, + { + "epoch": 17.626823529411766, + "grad_norm": 1.0322513580322266, + "learning_rate": 6.218744443175237e-06, + "loss": 1.1304, + "mean_token_accuracy": 0.5683851022273302, + "num_tokens": 751914.0, + "step": 9360 + }, + { + "epoch": 17.645647058823528, + "grad_norm": 0.7326356172561646, + "learning_rate": 6.199337933158316e-06, + "loss": 1.1813, + "mean_token_accuracy": 0.5477977491915226, + "num_tokens": 766447.0, + "step": 9370 + }, + { + "epoch": 17.664470588235293, + "grad_norm": 0.9041365385055542, + "learning_rate": 6.179948139251218e-06, + "loss": 1.1652, + "mean_token_accuracy": 0.55347336307168, + "num_tokens": 779625.0, + "step": 9380 + }, + { + "epoch": 17.68329411764706, + "grad_norm": 1.576574683189392, + "learning_rate": 6.160575146734338e-06, + "loss": 1.1529, + "mean_token_accuracy": 0.5669393539428711, + "num_tokens": 793737.0, + "step": 9390 + }, + { + "epoch": 17.702117647058824, + "grad_norm": 1.3531404733657837, + "learning_rate": 6.1412190408141505e-06, + "loss": 1.1197, + "mean_token_accuracy": 0.5701812230050564, + "num_tokens": 807112.0, + "step": 9400 + }, + { + "epoch": 17.72094117647059, + "grad_norm": 1.6633743047714233, + "learning_rate": 6.121879906622883e-06, + "loss": 1.1761, + "mean_token_accuracy": 0.560976068302989, + "num_tokens": 820492.0, + "step": 9410 + }, + { + "epoch": 17.739764705882354, + "grad_norm": 1.1381210088729858, + "learning_rate": 6.102557829218105e-06, + "loss": 1.1562, + "mean_token_accuracy": 0.5558978658169508, + "num_tokens": 834186.0, + "step": 9420 + }, + { + "epoch": 17.758588235294116, + "grad_norm": 1.8115606307983398, + "learning_rate": 6.083252893582374e-06, + "loss": 1.1872, + "mean_token_accuracy": 0.554209940135479, + "num_tokens": 847318.0, + "step": 9430 + }, + { + "epoch": 17.77741176470588, + "grad_norm": 1.288480520248413, + "learning_rate": 6.063965184622845e-06, + "loss": 1.1726, + "mean_token_accuracy": 0.5530060395598412, + "num_tokens": 860095.0, + "step": 9440 + }, + { + "epoch": 17.796235294117647, + "grad_norm": 1.2023969888687134, + "learning_rate": 6.0446947871709174e-06, + "loss": 1.1904, + "mean_token_accuracy": 0.5426197368651628, + "num_tokens": 873256.0, + "step": 9450 + }, + { + "epoch": 17.815058823529412, + "grad_norm": 1.5823273658752441, + "learning_rate": 6.025441785981843e-06, + "loss": 1.1334, + "mean_token_accuracy": 0.5691535335034132, + "num_tokens": 886435.0, + "step": 9460 + }, + { + "epoch": 17.833882352941178, + "grad_norm": 0.8472403883934021, + "learning_rate": 6.006206265734364e-06, + "loss": 1.1382, + "mean_token_accuracy": 0.5657643742859364, + "num_tokens": 899127.0, + "step": 9470 + }, + { + "epoch": 17.852705882352943, + "grad_norm": 0.931440532207489, + "learning_rate": 5.9869883110303366e-06, + "loss": 1.1718, + "mean_token_accuracy": 0.5716207943856716, + "num_tokens": 913094.0, + "step": 9480 + }, + { + "epoch": 17.871529411764705, + "grad_norm": 0.7743551135063171, + "learning_rate": 5.967788006394364e-06, + "loss": 1.1778, + "mean_token_accuracy": 0.5500955499708653, + "num_tokens": 927506.0, + "step": 9490 + }, + { + "epoch": 17.89035294117647, + "grad_norm": 3.367818593978882, + "learning_rate": 5.948605436273411e-06, + "loss": 1.1036, + "mean_token_accuracy": 0.5776654280722141, + "num_tokens": 940411.0, + "step": 9500 + }, + { + "epoch": 17.909176470588235, + "grad_norm": 0.621562659740448, + "learning_rate": 5.9294406850364584e-06, + "loss": 1.2119, + "mean_token_accuracy": 0.5432645879685879, + "num_tokens": 954352.0, + "step": 9510 + }, + { + "epoch": 17.928, + "grad_norm": 0.5852854251861572, + "learning_rate": 5.910293836974099e-06, + "loss": 1.1967, + "mean_token_accuracy": 0.5400953222066164, + "num_tokens": 967263.0, + "step": 9520 + }, + { + "epoch": 17.946823529411766, + "grad_norm": 0.6211656332015991, + "learning_rate": 5.891164976298198e-06, + "loss": 1.1627, + "mean_token_accuracy": 0.5562442850321532, + "num_tokens": 980662.0, + "step": 9530 + }, + { + "epoch": 17.965647058823528, + "grad_norm": 0.6909055709838867, + "learning_rate": 5.872054187141492e-06, + "loss": 1.1726, + "mean_token_accuracy": 0.5591957967728376, + "num_tokens": 993499.0, + "step": 9540 + }, + { + "epoch": 17.984470588235293, + "grad_norm": 1.064255714416504, + "learning_rate": 5.852961553557251e-06, + "loss": 1.188, + "mean_token_accuracy": 0.5609757989645004, + "num_tokens": 1007775.0, + "step": 9550 + }, + { + "epoch": 18.001882352941177, + "grad_norm": 1.33067786693573, + "learning_rate": 5.833887159518882e-06, + "loss": 1.1521, + "mean_token_accuracy": 0.5590104452661566, + "num_tokens": 1019534.0, + "step": 9560 + }, + { + "epoch": 18.020705882352942, + "grad_norm": 1.1214513778686523, + "learning_rate": 5.8148310889195795e-06, + "loss": 1.1953, + "mean_token_accuracy": 0.5516563657671213, + "num_tokens": 1032963.0, + "step": 9570 + }, + { + "epoch": 18.039529411764708, + "grad_norm": 1.491575837135315, + "learning_rate": 5.795793425571943e-06, + "loss": 1.1595, + "mean_token_accuracy": 0.5553607545793057, + "num_tokens": 1045960.0, + "step": 9580 + }, + { + "epoch": 18.05835294117647, + "grad_norm": 0.6637427806854248, + "learning_rate": 5.776774253207607e-06, + "loss": 1.1874, + "mean_token_accuracy": 0.5493495386093855, + "num_tokens": 1060210.0, + "step": 9590 + }, + { + "epoch": 18.077176470588235, + "grad_norm": 0.5911340117454529, + "learning_rate": 5.757773655476895e-06, + "loss": 1.1127, + "mean_token_accuracy": 0.57906415425241, + "num_tokens": 1072162.0, + "step": 9600 + }, + { + "epoch": 18.097882352941177, + "grad_norm": 1.7098009586334229, + "learning_rate": 5.738791715948421e-06, + "loss": 1.1165, + "mean_token_accuracy": 0.5782070815563202, + "num_tokens": 13939.0, + "step": 9610 + }, + { + "epoch": 18.116705882352942, + "grad_norm": 0.6690590977668762, + "learning_rate": 5.7198285181087406e-06, + "loss": 1.1392, + "mean_token_accuracy": 0.5584286205470562, + "num_tokens": 28443.0, + "step": 9620 + }, + { + "epoch": 18.135529411764704, + "grad_norm": 1.1516035795211792, + "learning_rate": 5.700884145361976e-06, + "loss": 1.202, + "mean_token_accuracy": 0.543058916553855, + "num_tokens": 43005.0, + "step": 9630 + }, + { + "epoch": 18.15435294117647, + "grad_norm": 0.6750898957252502, + "learning_rate": 5.6819586810294635e-06, + "loss": 1.0982, + "mean_token_accuracy": 0.5858326137065888, + "num_tokens": 55756.0, + "step": 9640 + }, + { + "epoch": 18.173176470588235, + "grad_norm": 1.525680661201477, + "learning_rate": 5.663052208349367e-06, + "loss": 1.0754, + "mean_token_accuracy": 0.5846003469079732, + "num_tokens": 68605.0, + "step": 9650 + }, + { + "epoch": 18.192, + "grad_norm": 1.5765116214752197, + "learning_rate": 5.6441648104763215e-06, + "loss": 1.1771, + "mean_token_accuracy": 0.5558013815432787, + "num_tokens": 82077.0, + "step": 9660 + }, + { + "epoch": 18.210823529411766, + "grad_norm": 0.8816690444946289, + "learning_rate": 5.625296570481069e-06, + "loss": 1.1803, + "mean_token_accuracy": 0.5606073562055827, + "num_tokens": 94758.0, + "step": 9670 + }, + { + "epoch": 18.22964705882353, + "grad_norm": 1.345479130744934, + "learning_rate": 5.606447571350093e-06, + "loss": 1.2028, + "mean_token_accuracy": 0.5379578843712807, + "num_tokens": 109010.0, + "step": 9680 + }, + { + "epoch": 18.248470588235293, + "grad_norm": 1.374245524406433, + "learning_rate": 5.587617895985247e-06, + "loss": 1.196, + "mean_token_accuracy": 0.5491275552660226, + "num_tokens": 122939.0, + "step": 9690 + }, + { + "epoch": 18.267294117647058, + "grad_norm": 0.7323598265647888, + "learning_rate": 5.568807627203399e-06, + "loss": 1.1414, + "mean_token_accuracy": 0.5601202577352524, + "num_tokens": 137029.0, + "step": 9700 + }, + { + "epoch": 18.286117647058823, + "grad_norm": 1.2711374759674072, + "learning_rate": 5.550016847736055e-06, + "loss": 1.1124, + "mean_token_accuracy": 0.5777692060917616, + "num_tokens": 149183.0, + "step": 9710 + }, + { + "epoch": 18.30494117647059, + "grad_norm": 0.8101398944854736, + "learning_rate": 5.5312456402290174e-06, + "loss": 1.1478, + "mean_token_accuracy": 0.5615578092634678, + "num_tokens": 163147.0, + "step": 9720 + }, + { + "epoch": 18.323764705882354, + "grad_norm": 1.201725721359253, + "learning_rate": 5.512494087241995e-06, + "loss": 1.1889, + "mean_token_accuracy": 0.5410687677562237, + "num_tokens": 176934.0, + "step": 9730 + }, + { + "epoch": 18.342588235294116, + "grad_norm": 1.2040985822677612, + "learning_rate": 5.493762271248255e-06, + "loss": 1.0963, + "mean_token_accuracy": 0.5771806977689267, + "num_tokens": 189676.0, + "step": 9740 + }, + { + "epoch": 18.36141176470588, + "grad_norm": 1.194765567779541, + "learning_rate": 5.475050274634255e-06, + "loss": 1.2101, + "mean_token_accuracy": 0.5370388999581337, + "num_tokens": 202706.0, + "step": 9750 + }, + { + "epoch": 18.380235294117647, + "grad_norm": 1.3350589275360107, + "learning_rate": 5.456358179699289e-06, + "loss": 1.17, + "mean_token_accuracy": 0.5458086933940649, + "num_tokens": 216179.0, + "step": 9760 + }, + { + "epoch": 18.399058823529412, + "grad_norm": 1.164698839187622, + "learning_rate": 5.437686068655115e-06, + "loss": 1.2626, + "mean_token_accuracy": 0.534201942011714, + "num_tokens": 229633.0, + "step": 9770 + }, + { + "epoch": 18.417882352941177, + "grad_norm": 0.6664723753929138, + "learning_rate": 5.419034023625597e-06, + "loss": 1.1409, + "mean_token_accuracy": 0.5639401733875274, + "num_tokens": 242540.0, + "step": 9780 + }, + { + "epoch": 18.436705882352943, + "grad_norm": 0.5169602036476135, + "learning_rate": 5.4004021266463415e-06, + "loss": 1.1817, + "mean_token_accuracy": 0.5450482603162528, + "num_tokens": 254975.0, + "step": 9790 + }, + { + "epoch": 18.455529411764704, + "grad_norm": 0.5377786755561829, + "learning_rate": 5.381790459664355e-06, + "loss": 1.1443, + "mean_token_accuracy": 0.5668651383370161, + "num_tokens": 269635.0, + "step": 9800 + }, + { + "epoch": 18.47435294117647, + "grad_norm": 1.7214852571487427, + "learning_rate": 5.363199104537649e-06, + "loss": 1.1384, + "mean_token_accuracy": 0.5759254258126021, + "num_tokens": 282552.0, + "step": 9810 + }, + { + "epoch": 18.493176470588235, + "grad_norm": 1.1212029457092285, + "learning_rate": 5.344628143034904e-06, + "loss": 1.1671, + "mean_token_accuracy": 0.5748541194945573, + "num_tokens": 296715.0, + "step": 9820 + }, + { + "epoch": 18.512, + "grad_norm": 0.8291766047477722, + "learning_rate": 5.32607765683511e-06, + "loss": 1.171, + "mean_token_accuracy": 0.5621029295027256, + "num_tokens": 310741.0, + "step": 9830 + }, + { + "epoch": 18.530823529411766, + "grad_norm": 1.666212558746338, + "learning_rate": 5.307547727527207e-06, + "loss": 1.1493, + "mean_token_accuracy": 0.5714134465903044, + "num_tokens": 324377.0, + "step": 9840 + }, + { + "epoch": 18.54964705882353, + "grad_norm": 0.6212410926818848, + "learning_rate": 5.28903843660971e-06, + "loss": 1.182, + "mean_token_accuracy": 0.5523442510515452, + "num_tokens": 336585.0, + "step": 9850 + }, + { + "epoch": 18.568470588235293, + "grad_norm": 0.763521134853363, + "learning_rate": 5.2705498654903666e-06, + "loss": 1.2182, + "mean_token_accuracy": 0.5365333639085292, + "num_tokens": 351773.0, + "step": 9860 + }, + { + "epoch": 18.58729411764706, + "grad_norm": 1.7621654272079468, + "learning_rate": 5.252082095485793e-06, + "loss": 1.134, + "mean_token_accuracy": 0.5687922302633523, + "num_tokens": 364135.0, + "step": 9870 + }, + { + "epoch": 18.606117647058824, + "grad_norm": 0.8109177350997925, + "learning_rate": 5.233635207821126e-06, + "loss": 1.1571, + "mean_token_accuracy": 0.5557177890092134, + "num_tokens": 377314.0, + "step": 9880 + }, + { + "epoch": 18.62494117647059, + "grad_norm": 1.2423245906829834, + "learning_rate": 5.215209283629647e-06, + "loss": 1.1754, + "mean_token_accuracy": 0.5624301459640264, + "num_tokens": 391043.0, + "step": 9890 + }, + { + "epoch": 18.643764705882354, + "grad_norm": 1.4668940305709839, + "learning_rate": 5.19680440395244e-06, + "loss": 1.1433, + "mean_token_accuracy": 0.5692973904311657, + "num_tokens": 404155.0, + "step": 9900 + }, + { + "epoch": 18.662588235294116, + "grad_norm": 0.6444953083992004, + "learning_rate": 5.1784206497380275e-06, + "loss": 1.1656, + "mean_token_accuracy": 0.5535128690302372, + "num_tokens": 417083.0, + "step": 9910 + }, + { + "epoch": 18.68141176470588, + "grad_norm": 0.9833362102508545, + "learning_rate": 5.160058101842025e-06, + "loss": 1.1301, + "mean_token_accuracy": 0.5652093205600976, + "num_tokens": 430739.0, + "step": 9920 + }, + { + "epoch": 18.700235294117647, + "grad_norm": 1.3160921335220337, + "learning_rate": 5.141716841026774e-06, + "loss": 1.2158, + "mean_token_accuracy": 0.5528298642486333, + "num_tokens": 444108.0, + "step": 9930 + }, + { + "epoch": 18.719058823529412, + "grad_norm": 0.6429352164268494, + "learning_rate": 5.123396947960993e-06, + "loss": 1.1509, + "mean_token_accuracy": 0.5627094566822052, + "num_tokens": 457365.0, + "step": 9940 + }, + { + "epoch": 18.737882352941178, + "grad_norm": 0.57741379737854, + "learning_rate": 5.105098503219408e-06, + "loss": 1.1677, + "mean_token_accuracy": 0.5417445503175259, + "num_tokens": 470389.0, + "step": 9950 + }, + { + "epoch": 18.756705882352943, + "grad_norm": 1.4769562482833862, + "learning_rate": 5.08682158728243e-06, + "loss": 1.1693, + "mean_token_accuracy": 0.5543987430632115, + "num_tokens": 483516.0, + "step": 9960 + }, + { + "epoch": 18.775529411764705, + "grad_norm": 1.122862696647644, + "learning_rate": 5.068566280535772e-06, + "loss": 1.1676, + "mean_token_accuracy": 0.5597089301794768, + "num_tokens": 497211.0, + "step": 9970 + }, + { + "epoch": 18.79435294117647, + "grad_norm": 1.3088434934616089, + "learning_rate": 5.050332663270105e-06, + "loss": 1.1703, + "mean_token_accuracy": 0.5574114482849837, + "num_tokens": 511246.0, + "step": 9980 + }, + { + "epoch": 18.813176470588235, + "grad_norm": 1.146748661994934, + "learning_rate": 5.032120815680703e-06, + "loss": 1.1415, + "mean_token_accuracy": 0.5681348893791437, + "num_tokens": 524961.0, + "step": 9990 + }, + { + "epoch": 18.832, + "grad_norm": 0.6369304060935974, + "learning_rate": 5.013930817867103e-06, + "loss": 1.1355, + "mean_token_accuracy": 0.5745254665613174, + "num_tokens": 537543.0, + "step": 10000 + }, + { + "epoch": 18.850823529411766, + "grad_norm": 0.804693341255188, + "learning_rate": 4.995762749832731e-06, + "loss": 1.1858, + "mean_token_accuracy": 0.5416501805186271, + "num_tokens": 550396.0, + "step": 10010 + }, + { + "epoch": 18.869647058823528, + "grad_norm": 1.5857802629470825, + "learning_rate": 4.977616691484567e-06, + "loss": 1.1571, + "mean_token_accuracy": 0.5618045397102833, + "num_tokens": 563953.0, + "step": 10020 + }, + { + "epoch": 18.888470588235293, + "grad_norm": 1.1062195301055908, + "learning_rate": 4.9594927226327795e-06, + "loss": 1.2112, + "mean_token_accuracy": 0.5421402599662543, + "num_tokens": 577786.0, + "step": 10030 + }, + { + "epoch": 18.90729411764706, + "grad_norm": 0.7114964127540588, + "learning_rate": 4.941390922990398e-06, + "loss": 1.1818, + "mean_token_accuracy": 0.5595896728336811, + "num_tokens": 592052.0, + "step": 10040 + }, + { + "epoch": 18.926117647058824, + "grad_norm": 1.7100679874420166, + "learning_rate": 4.923311372172935e-06, + "loss": 1.1518, + "mean_token_accuracy": 0.5808280512690545, + "num_tokens": 605121.0, + "step": 10050 + }, + { + "epoch": 18.94494117647059, + "grad_norm": 1.4883623123168945, + "learning_rate": 4.905254149698049e-06, + "loss": 1.1205, + "mean_token_accuracy": 0.564001039788127, + "num_tokens": 618937.0, + "step": 10060 + }, + { + "epoch": 18.963764705882355, + "grad_norm": 0.5710100531578064, + "learning_rate": 4.8872193349852e-06, + "loss": 1.193, + "mean_token_accuracy": 0.5475729245692491, + "num_tokens": 631403.0, + "step": 10070 + }, + { + "epoch": 18.982588235294116, + "grad_norm": 0.82412189245224, + "learning_rate": 4.869207007355286e-06, + "loss": 1.1769, + "mean_token_accuracy": 0.549387214705348, + "num_tokens": 644809.0, + "step": 10080 + }, + { + "epoch": 19.001882352941177, + "grad_norm": 4.334903717041016, + "learning_rate": 4.851217246030307e-06, + "loss": 1.2232, + "mean_token_accuracy": 0.5745812316493291, + "num_tokens": 658742.0, + "step": 10090 + }, + { + "epoch": 19.020705882352942, + "grad_norm": 0.7227234244346619, + "learning_rate": 4.833250130133014e-06, + "loss": 1.1446, + "mean_token_accuracy": 0.5578329466283322, + "num_tokens": 672376.0, + "step": 10100 + }, + { + "epoch": 19.039529411764708, + "grad_norm": 1.063941240310669, + "learning_rate": 4.815305738686548e-06, + "loss": 1.1323, + "mean_token_accuracy": 0.5669731423258781, + "num_tokens": 684972.0, + "step": 10110 + }, + { + "epoch": 19.05835294117647, + "grad_norm": 1.1839574575424194, + "learning_rate": 4.7973841506141195e-06, + "loss": 1.178, + "mean_token_accuracy": 0.5547245424240828, + "num_tokens": 699346.0, + "step": 10120 + }, + { + "epoch": 19.077176470588235, + "grad_norm": 0.8287972807884216, + "learning_rate": 4.779485444738632e-06, + "loss": 1.1305, + "mean_token_accuracy": 0.5627703540027141, + "num_tokens": 711760.0, + "step": 10130 + }, + { + "epoch": 19.096, + "grad_norm": 1.2000936269760132, + "learning_rate": 4.761609699782351e-06, + "loss": 1.1206, + "mean_token_accuracy": 0.5821688748896122, + "num_tokens": 724959.0, + "step": 10140 + }, + { + "epoch": 19.114823529411765, + "grad_norm": 1.1382743120193481, + "learning_rate": 4.743756994366555e-06, + "loss": 1.199, + "mean_token_accuracy": 0.5427775271236897, + "num_tokens": 738570.0, + "step": 10150 + }, + { + "epoch": 19.13364705882353, + "grad_norm": 0.6352145075798035, + "learning_rate": 4.7259274070111986e-06, + "loss": 1.1679, + "mean_token_accuracy": 0.5518446248024702, + "num_tokens": 751688.0, + "step": 10160 + }, + { + "epoch": 19.152470588235293, + "grad_norm": 1.043312668800354, + "learning_rate": 4.708121016134545e-06, + "loss": 1.1412, + "mean_token_accuracy": 0.56727832891047, + "num_tokens": 765261.0, + "step": 10170 + }, + { + "epoch": 19.171294117647058, + "grad_norm": 2.1281962394714355, + "learning_rate": 4.69033790005284e-06, + "loss": 1.1316, + "mean_token_accuracy": 0.5734411317855119, + "num_tokens": 777972.0, + "step": 10180 + }, + { + "epoch": 19.190117647058823, + "grad_norm": 1.2191262245178223, + "learning_rate": 4.672578136979961e-06, + "loss": 1.2033, + "mean_token_accuracy": 0.5502295974642039, + "num_tokens": 792540.0, + "step": 10190 + }, + { + "epoch": 19.20894117647059, + "grad_norm": 2.241875648498535, + "learning_rate": 4.65484180502708e-06, + "loss": 1.2114, + "mean_token_accuracy": 0.5526003040373325, + "num_tokens": 805628.0, + "step": 10200 + }, + { + "epoch": 19.227764705882354, + "grad_norm": 1.0254322290420532, + "learning_rate": 4.637128982202308e-06, + "loss": 1.1448, + "mean_token_accuracy": 0.566441947594285, + "num_tokens": 818605.0, + "step": 10210 + }, + { + "epoch": 19.24658823529412, + "grad_norm": 0.5974338054656982, + "learning_rate": 4.619439746410361e-06, + "loss": 1.1663, + "mean_token_accuracy": 0.5573429156094789, + "num_tokens": 831744.0, + "step": 10220 + }, + { + "epoch": 19.26541176470588, + "grad_norm": 1.1514984369277954, + "learning_rate": 4.601774175452203e-06, + "loss": 1.1816, + "mean_token_accuracy": 0.5479875948280096, + "num_tokens": 844511.0, + "step": 10230 + }, + { + "epoch": 19.284235294117646, + "grad_norm": 1.9380877017974854, + "learning_rate": 4.584132347024732e-06, + "loss": 1.1513, + "mean_token_accuracy": 0.5600051417946815, + "num_tokens": 857034.0, + "step": 10240 + }, + { + "epoch": 19.303058823529412, + "grad_norm": 1.8556410074234009, + "learning_rate": 4.566514338720414e-06, + "loss": 1.2121, + "mean_token_accuracy": 0.5544085066765547, + "num_tokens": 870895.0, + "step": 10250 + }, + { + "epoch": 19.321882352941177, + "grad_norm": 0.7411885857582092, + "learning_rate": 4.5489202280269465e-06, + "loss": 1.1471, + "mean_token_accuracy": 0.5642319560050965, + "num_tokens": 883794.0, + "step": 10260 + }, + { + "epoch": 19.340705882352943, + "grad_norm": 0.6563217043876648, + "learning_rate": 4.53135009232692e-06, + "loss": 1.119, + "mean_token_accuracy": 0.571989681199193, + "num_tokens": 896094.0, + "step": 10270 + }, + { + "epoch": 19.359529411764704, + "grad_norm": 0.717928946018219, + "learning_rate": 4.513804008897487e-06, + "loss": 1.1896, + "mean_token_accuracy": 0.5477908588945866, + "num_tokens": 909067.0, + "step": 10280 + }, + { + "epoch": 19.37835294117647, + "grad_norm": 1.7051725387573242, + "learning_rate": 4.496282054910006e-06, + "loss": 1.2038, + "mean_token_accuracy": 0.5528531819581985, + "num_tokens": 922861.0, + "step": 10290 + }, + { + "epoch": 19.397176470588235, + "grad_norm": 0.5672712922096252, + "learning_rate": 4.478784307429707e-06, + "loss": 1.1883, + "mean_token_accuracy": 0.5424028813838959, + "num_tokens": 935977.0, + "step": 10300 + }, + { + "epoch": 19.416, + "grad_norm": 1.4658303260803223, + "learning_rate": 4.461310843415354e-06, + "loss": 1.1138, + "mean_token_accuracy": 0.5855666678398848, + "num_tokens": 950190.0, + "step": 10310 + }, + { + "epoch": 19.434823529411766, + "grad_norm": 0.8565905690193176, + "learning_rate": 4.4438617397189185e-06, + "loss": 1.1637, + "mean_token_accuracy": 0.5575710866600275, + "num_tokens": 964649.0, + "step": 10320 + }, + { + "epoch": 19.45364705882353, + "grad_norm": 1.2235567569732666, + "learning_rate": 4.42643707308522e-06, + "loss": 1.1326, + "mean_token_accuracy": 0.5670348349958658, + "num_tokens": 978559.0, + "step": 10330 + }, + { + "epoch": 19.472470588235293, + "grad_norm": 0.79115229845047, + "learning_rate": 4.4090369201516e-06, + "loss": 1.1965, + "mean_token_accuracy": 0.5441572275012732, + "num_tokens": 992276.0, + "step": 10340 + }, + { + "epoch": 19.491294117647058, + "grad_norm": 1.4369901418685913, + "learning_rate": 4.391661357447585e-06, + "loss": 1.1701, + "mean_token_accuracy": 0.5550823096185923, + "num_tokens": 1006379.0, + "step": 10350 + }, + { + "epoch": 19.510117647058824, + "grad_norm": 1.175675868988037, + "learning_rate": 4.374310461394548e-06, + "loss": 1.1332, + "mean_token_accuracy": 0.5736968379467726, + "num_tokens": 1019668.0, + "step": 10360 + }, + { + "epoch": 19.52894117647059, + "grad_norm": 0.5965217351913452, + "learning_rate": 4.356984308305374e-06, + "loss": 1.1563, + "mean_token_accuracy": 0.5608095470815897, + "num_tokens": 1032049.0, + "step": 10370 + }, + { + "epoch": 19.547764705882354, + "grad_norm": 1.406221628189087, + "learning_rate": 4.3396829743841205e-06, + "loss": 1.1749, + "mean_token_accuracy": 0.5496669236570597, + "num_tokens": 1045211.0, + "step": 10380 + }, + { + "epoch": 19.566588235294116, + "grad_norm": 1.215728759765625, + "learning_rate": 4.322406535725686e-06, + "loss": 1.1748, + "mean_token_accuracy": 0.5576162055134773, + "num_tokens": 1058179.0, + "step": 10390 + }, + { + "epoch": 19.58541176470588, + "grad_norm": 1.223363995552063, + "learning_rate": 4.305155068315481e-06, + "loss": 1.1467, + "mean_token_accuracy": 0.5632787074893713, + "num_tokens": 1071797.0, + "step": 10400 + }, + { + "epoch": 19.604235294117647, + "grad_norm": 1.32563054561615, + "learning_rate": 4.2879286480290784e-06, + "loss": 1.1665, + "mean_token_accuracy": 0.543903386592865, + "num_tokens": 1085172.0, + "step": 10410 + }, + { + "epoch": 19.623058823529412, + "grad_norm": 1.0964701175689697, + "learning_rate": 4.270727350631892e-06, + "loss": 1.1368, + "mean_token_accuracy": 0.5769836001098156, + "num_tokens": 1098617.0, + "step": 10420 + }, + { + "epoch": 19.641882352941177, + "grad_norm": 0.8849780559539795, + "learning_rate": 4.253551251778835e-06, + "loss": 1.238, + "mean_token_accuracy": 0.5346022747457028, + "num_tokens": 1111860.0, + "step": 10430 + }, + { + "epoch": 19.660705882352943, + "grad_norm": 1.4776290655136108, + "learning_rate": 4.236400427014005e-06, + "loss": 1.2089, + "mean_token_accuracy": 0.5553506713360548, + "num_tokens": 1125874.0, + "step": 10440 + }, + { + "epoch": 19.679529411764705, + "grad_norm": 0.6762340068817139, + "learning_rate": 4.2192749517703255e-06, + "loss": 1.1319, + "mean_token_accuracy": 0.572966867312789, + "num_tokens": 1139009.0, + "step": 10450 + }, + { + "epoch": 19.69835294117647, + "grad_norm": 0.9607488512992859, + "learning_rate": 4.202174901369236e-06, + "loss": 1.1342, + "mean_token_accuracy": 0.5701036512851715, + "num_tokens": 1151825.0, + "step": 10460 + }, + { + "epoch": 19.717176470588235, + "grad_norm": 0.7690389156341553, + "learning_rate": 4.1851003510203416e-06, + "loss": 1.1599, + "mean_token_accuracy": 0.554409109801054, + "num_tokens": 1165331.0, + "step": 10470 + }, + { + "epoch": 19.736, + "grad_norm": 0.6242837905883789, + "learning_rate": 4.168051375821108e-06, + "loss": 1.1945, + "mean_token_accuracy": 0.5393414959311486, + "num_tokens": 1178787.0, + "step": 10480 + }, + { + "epoch": 19.754823529411766, + "grad_norm": 0.8885065317153931, + "learning_rate": 4.151028050756507e-06, + "loss": 1.1456, + "mean_token_accuracy": 0.5653862472623586, + "num_tokens": 1191650.0, + "step": 10490 + }, + { + "epoch": 19.773647058823528, + "grad_norm": 0.7802302837371826, + "learning_rate": 4.134030450698697e-06, + "loss": 1.1645, + "mean_token_accuracy": 0.5542376168072224, + "num_tokens": 1205371.0, + "step": 10500 + }, + { + "epoch": 19.792470588235293, + "grad_norm": 0.8476331233978271, + "learning_rate": 4.117058650406683e-06, + "loss": 1.1996, + "mean_token_accuracy": 0.5366521954536438, + "num_tokens": 1218885.0, + "step": 10510 + }, + { + "epoch": 19.81129411764706, + "grad_norm": 0.7439809441566467, + "learning_rate": 4.1001127245260175e-06, + "loss": 1.162, + "mean_token_accuracy": 0.5500502925366163, + "num_tokens": 1232409.0, + "step": 10520 + }, + { + "epoch": 19.830117647058824, + "grad_norm": 0.8466014266014099, + "learning_rate": 4.083192747588436e-06, + "loss": 1.2165, + "mean_token_accuracy": 0.547482916712761, + "num_tokens": 1245876.0, + "step": 10530 + }, + { + "epoch": 19.84894117647059, + "grad_norm": 0.9549068808555603, + "learning_rate": 4.066298794011551e-06, + "loss": 1.1552, + "mean_token_accuracy": 0.567984651774168, + "num_tokens": 1260603.0, + "step": 10540 + }, + { + "epoch": 19.867764705882355, + "grad_norm": 0.8882144689559937, + "learning_rate": 4.049430938098513e-06, + "loss": 1.1424, + "mean_token_accuracy": 0.566171682626009, + "num_tokens": 1274404.0, + "step": 10550 + }, + { + "epoch": 19.886588235294116, + "grad_norm": 1.1163575649261475, + "learning_rate": 4.0325892540377035e-06, + "loss": 1.1986, + "mean_token_accuracy": 0.54889883287251, + "num_tokens": 1288135.0, + "step": 10560 + }, + { + "epoch": 19.90541176470588, + "grad_norm": 0.5996796488761902, + "learning_rate": 4.01577381590238e-06, + "loss": 1.1317, + "mean_token_accuracy": 0.5590782940387726, + "num_tokens": 1301565.0, + "step": 10570 + }, + { + "epoch": 19.924235294117647, + "grad_norm": 0.5613903999328613, + "learning_rate": 3.998984697650369e-06, + "loss": 1.144, + "mean_token_accuracy": 0.5581843961030245, + "num_tokens": 1315363.0, + "step": 10580 + }, + { + "epoch": 19.943058823529412, + "grad_norm": 1.517250895500183, + "learning_rate": 3.982221973123738e-06, + "loss": 1.1585, + "mean_token_accuracy": 0.5547402266412973, + "num_tokens": 1328940.0, + "step": 10590 + }, + { + "epoch": 19.961882352941178, + "grad_norm": 1.6663577556610107, + "learning_rate": 3.965485716048473e-06, + "loss": 1.1706, + "mean_token_accuracy": 0.5520875003188849, + "num_tokens": 1342451.0, + "step": 10600 + }, + { + "epoch": 19.98070588235294, + "grad_norm": 1.2554893493652344, + "learning_rate": 3.948776000034144e-06, + "loss": 1.1016, + "mean_token_accuracy": 0.5831372920423746, + "num_tokens": 1355912.0, + "step": 10610 + }, + { + "epoch": 19.999529411764705, + "grad_norm": 1.3978781700134277, + "learning_rate": 3.932092898573593e-06, + "loss": 1.0993, + "mean_token_accuracy": 0.5891566134989261, + "num_tokens": 1370301.0, + "step": 10620 + }, + { + "epoch": 20.01694117647059, + "grad_norm": 1.0220972299575806, + "learning_rate": 3.915436485042602e-06, + "loss": 1.1514, + "mean_token_accuracy": 0.5695925415367693, + "num_tokens": 1382512.0, + "step": 10630 + }, + { + "epoch": 20.035764705882354, + "grad_norm": 0.9992367625236511, + "learning_rate": 3.898806832699574e-06, + "loss": 1.2, + "mean_token_accuracy": 0.5552287392318249, + "num_tokens": 1395877.0, + "step": 10640 + }, + { + "epoch": 20.05458823529412, + "grad_norm": 0.8924506306648254, + "learning_rate": 3.882204014685213e-06, + "loss": 1.1325, + "mean_token_accuracy": 0.562846252322197, + "num_tokens": 1408762.0, + "step": 10650 + }, + { + "epoch": 20.07341176470588, + "grad_norm": 1.2033953666687012, + "learning_rate": 3.8656281040221975e-06, + "loss": 1.1699, + "mean_token_accuracy": 0.5635105472058057, + "num_tokens": 1421599.0, + "step": 10660 + }, + { + "epoch": 20.092235294117646, + "grad_norm": 0.999156653881073, + "learning_rate": 3.849079173614863e-06, + "loss": 1.1869, + "mean_token_accuracy": 0.5541719019412994, + "num_tokens": 1434720.0, + "step": 10670 + }, + { + "epoch": 20.11105882352941, + "grad_norm": 1.1332379579544067, + "learning_rate": 3.832557296248883e-06, + "loss": 1.1769, + "mean_token_accuracy": 0.5474078699946403, + "num_tokens": 1448113.0, + "step": 10680 + }, + { + "epoch": 20.129882352941177, + "grad_norm": 1.046257734298706, + "learning_rate": 3.816062544590944e-06, + "loss": 1.119, + "mean_token_accuracy": 0.5713853165507317, + "num_tokens": 1462113.0, + "step": 10690 + }, + { + "epoch": 20.148705882352942, + "grad_norm": 1.346682071685791, + "learning_rate": 3.7995949911884235e-06, + "loss": 1.1687, + "mean_token_accuracy": 0.5563855923712253, + "num_tokens": 1474782.0, + "step": 10700 + }, + { + "epoch": 20.167529411764704, + "grad_norm": 1.4376060962677002, + "learning_rate": 3.783154708469079e-06, + "loss": 1.1921, + "mean_token_accuracy": 0.5529118336737155, + "num_tokens": 1488150.0, + "step": 10710 + }, + { + "epoch": 20.18635294117647, + "grad_norm": 1.298230767250061, + "learning_rate": 3.7667417687407305e-06, + "loss": 1.1748, + "mean_token_accuracy": 0.553018931671977, + "num_tokens": 1501896.0, + "step": 10720 + }, + { + "epoch": 20.205176470588235, + "grad_norm": 1.2680264711380005, + "learning_rate": 3.750356244190931e-06, + "loss": 1.1694, + "mean_token_accuracy": 0.5553711723536253, + "num_tokens": 1515310.0, + "step": 10730 + }, + { + "epoch": 20.224, + "grad_norm": 1.5268313884735107, + "learning_rate": 3.7339982068866586e-06, + "loss": 1.1437, + "mean_token_accuracy": 0.566285153850913, + "num_tokens": 1528249.0, + "step": 10740 + }, + { + "epoch": 20.242823529411766, + "grad_norm": 1.163407802581787, + "learning_rate": 3.717667728773995e-06, + "loss": 1.1316, + "mean_token_accuracy": 0.5630121100693941, + "num_tokens": 1541606.0, + "step": 10750 + }, + { + "epoch": 20.26164705882353, + "grad_norm": 0.8959663510322571, + "learning_rate": 3.701364881677809e-06, + "loss": 1.163, + "mean_token_accuracy": 0.5470004346221685, + "num_tokens": 1554546.0, + "step": 10760 + }, + { + "epoch": 20.280470588235293, + "grad_norm": 0.8293361067771912, + "learning_rate": 3.6850897373014514e-06, + "loss": 1.2158, + "mean_token_accuracy": 0.5398020602762699, + "num_tokens": 1567389.0, + "step": 10770 + }, + { + "epoch": 20.299294117647058, + "grad_norm": 0.7178218364715576, + "learning_rate": 3.668842367226427e-06, + "loss": 1.1527, + "mean_token_accuracy": 0.5576813716441393, + "num_tokens": 1580197.0, + "step": 10780 + }, + { + "epoch": 20.318117647058823, + "grad_norm": 0.8142568469047546, + "learning_rate": 3.652622842912068e-06, + "loss": 1.167, + "mean_token_accuracy": 0.5699529372155666, + "num_tokens": 1593797.0, + "step": 10790 + }, + { + "epoch": 20.33694117647059, + "grad_norm": 1.293581247329712, + "learning_rate": 3.6364312356952603e-06, + "loss": 1.1763, + "mean_token_accuracy": 0.5648769486695528, + "num_tokens": 1607332.0, + "step": 10800 + }, + { + "epoch": 20.35764705882353, + "grad_norm": 1.6430315971374512, + "learning_rate": 5.595460614152204e-06, + "loss": 1.1903, + "mean_token_accuracy": 0.5535745773464441, + "num_tokens": 14114.0, + "step": 10810 + }, + { + "epoch": 20.376470588235293, + "grad_norm": 0.9296208620071411, + "learning_rate": 5.582109479305742e-06, + "loss": 1.1451, + "mean_token_accuracy": 0.5736374389380217, + "num_tokens": 26719.0, + "step": 10820 + }, + { + "epoch": 20.395294117647058, + "grad_norm": 0.7295175790786743, + "learning_rate": 5.5687583444592795e-06, + "loss": 1.1719, + "mean_token_accuracy": 0.5615855868905782, + "num_tokens": 40788.0, + "step": 10830 + }, + { + "epoch": 20.414117647058823, + "grad_norm": 0.5272361636161804, + "learning_rate": 5.555407209612818e-06, + "loss": 1.1707, + "mean_token_accuracy": 0.5535146549344063, + "num_tokens": 55000.0, + "step": 10840 + }, + { + "epoch": 20.43294117647059, + "grad_norm": 1.6679742336273193, + "learning_rate": 5.542056074766355e-06, + "loss": 1.1919, + "mean_token_accuracy": 0.5483698755502701, + "num_tokens": 68277.0, + "step": 10850 + }, + { + "epoch": 20.451764705882354, + "grad_norm": 1.3171534538269043, + "learning_rate": 5.528704939919893e-06, + "loss": 1.1023, + "mean_token_accuracy": 0.5720774076879025, + "num_tokens": 81749.0, + "step": 10860 + }, + { + "epoch": 20.470588235294116, + "grad_norm": 0.6171587705612183, + "learning_rate": 5.515353805073432e-06, + "loss": 1.1495, + "mean_token_accuracy": 0.5558391027152538, + "num_tokens": 95748.0, + "step": 10870 + }, + { + "epoch": 20.48941176470588, + "grad_norm": 1.8184794187545776, + "learning_rate": 5.50200267022697e-06, + "loss": 1.1554, + "mean_token_accuracy": 0.5604365076869726, + "num_tokens": 109182.0, + "step": 10880 + }, + { + "epoch": 20.508235294117647, + "grad_norm": 0.6223208904266357, + "learning_rate": 5.488651535380508e-06, + "loss": 1.1084, + "mean_token_accuracy": 0.5673012807965279, + "num_tokens": 122093.0, + "step": 10890 + }, + { + "epoch": 20.527058823529412, + "grad_norm": 0.6511589288711548, + "learning_rate": 5.475300400534046e-06, + "loss": 1.1891, + "mean_token_accuracy": 0.5500082913786173, + "num_tokens": 135801.0, + "step": 10900 + }, + { + "epoch": 20.545882352941177, + "grad_norm": 1.1379867792129517, + "learning_rate": 5.461949265687584e-06, + "loss": 1.1577, + "mean_token_accuracy": 0.5615630965679884, + "num_tokens": 149528.0, + "step": 10910 + }, + { + "epoch": 20.564705882352943, + "grad_norm": 0.6646468043327332, + "learning_rate": 5.448598130841122e-06, + "loss": 1.1693, + "mean_token_accuracy": 0.5596156906336546, + "num_tokens": 162347.0, + "step": 10920 + }, + { + "epoch": 20.583529411764705, + "grad_norm": 0.6345205903053284, + "learning_rate": 5.435246995994659e-06, + "loss": 1.1393, + "mean_token_accuracy": 0.5636776462197304, + "num_tokens": 174228.0, + "step": 10930 + }, + { + "epoch": 20.60235294117647, + "grad_norm": 1.0836670398712158, + "learning_rate": 5.4218958611481976e-06, + "loss": 1.1253, + "mean_token_accuracy": 0.574345787242055, + "num_tokens": 188052.0, + "step": 10940 + }, + { + "epoch": 20.621176470588235, + "grad_norm": 0.7239655256271362, + "learning_rate": 5.408544726301737e-06, + "loss": 1.1691, + "mean_token_accuracy": 0.5582763768732548, + "num_tokens": 202984.0, + "step": 10950 + }, + { + "epoch": 20.64, + "grad_norm": 0.6753464937210083, + "learning_rate": 5.395193591455274e-06, + "loss": 1.1513, + "mean_token_accuracy": 0.5622932318598032, + "num_tokens": 216292.0, + "step": 10960 + }, + { + "epoch": 20.658823529411766, + "grad_norm": 0.5746181607246399, + "learning_rate": 5.381842456608812e-06, + "loss": 1.1928, + "mean_token_accuracy": 0.5508632536977529, + "num_tokens": 229204.0, + "step": 10970 + }, + { + "epoch": 20.677647058823528, + "grad_norm": 1.1452544927597046, + "learning_rate": 5.3684913217623505e-06, + "loss": 1.1549, + "mean_token_accuracy": 0.5678117204457521, + "num_tokens": 242540.0, + "step": 10980 + }, + { + "epoch": 20.696470588235293, + "grad_norm": 0.6321762800216675, + "learning_rate": 5.355140186915888e-06, + "loss": 1.1235, + "mean_token_accuracy": 0.575124978646636, + "num_tokens": 256825.0, + "step": 10990 + }, + { + "epoch": 20.71529411764706, + "grad_norm": 0.5946145057678223, + "learning_rate": 5.341789052069426e-06, + "loss": 1.1731, + "mean_token_accuracy": 0.5547100655734539, + "num_tokens": 270675.0, + "step": 11000 + }, + { + "epoch": 20.734117647058824, + "grad_norm": 1.3031941652297974, + "learning_rate": 5.3284379172229635e-06, + "loss": 1.1891, + "mean_token_accuracy": 0.5470674268901348, + "num_tokens": 282696.0, + "step": 11010 + }, + { + "epoch": 20.75294117647059, + "grad_norm": 0.5822432637214661, + "learning_rate": 5.315086782376503e-06, + "loss": 1.2153, + "mean_token_accuracy": 0.5374420482665301, + "num_tokens": 296789.0, + "step": 11020 + }, + { + "epoch": 20.771764705882354, + "grad_norm": 1.1034314632415771, + "learning_rate": 5.301735647530041e-06, + "loss": 1.1166, + "mean_token_accuracy": 0.5685049999505282, + "num_tokens": 310298.0, + "step": 11030 + }, + { + "epoch": 20.790588235294116, + "grad_norm": 1.798014521598816, + "learning_rate": 5.288384512683579e-06, + "loss": 1.1764, + "mean_token_accuracy": 0.5581833314150572, + "num_tokens": 323136.0, + "step": 11040 + }, + { + "epoch": 20.80941176470588, + "grad_norm": 1.2033790349960327, + "learning_rate": 5.2750333778371165e-06, + "loss": 1.1509, + "mean_token_accuracy": 0.5552062794566155, + "num_tokens": 336592.0, + "step": 11050 + }, + { + "epoch": 20.828235294117647, + "grad_norm": 0.9958351850509644, + "learning_rate": 5.261682242990655e-06, + "loss": 1.1421, + "mean_token_accuracy": 0.5717320717871189, + "num_tokens": 350031.0, + "step": 11060 + }, + { + "epoch": 20.847058823529412, + "grad_norm": 0.9751930832862854, + "learning_rate": 5.248331108144192e-06, + "loss": 1.2123, + "mean_token_accuracy": 0.5408148296177387, + "num_tokens": 364779.0, + "step": 11070 + }, + { + "epoch": 20.865882352941178, + "grad_norm": 0.6778987646102905, + "learning_rate": 5.23497997329773e-06, + "loss": 1.1929, + "mean_token_accuracy": 0.547919350117445, + "num_tokens": 379331.0, + "step": 11080 + }, + { + "epoch": 20.88470588235294, + "grad_norm": 0.85933518409729, + "learning_rate": 5.221628838451269e-06, + "loss": 1.119, + "mean_token_accuracy": 0.5630400247871876, + "num_tokens": 392787.0, + "step": 11090 + }, + { + "epoch": 20.903529411764705, + "grad_norm": 0.6913843750953674, + "learning_rate": 5.208277703604807e-06, + "loss": 1.1649, + "mean_token_accuracy": 0.5535217590630055, + "num_tokens": 406067.0, + "step": 11100 + }, + { + "epoch": 20.92235294117647, + "grad_norm": 2.0229623317718506, + "learning_rate": 5.194926568758345e-06, + "loss": 1.12, + "mean_token_accuracy": 0.5807195238769054, + "num_tokens": 420283.0, + "step": 11110 + }, + { + "epoch": 20.941176470588236, + "grad_norm": 1.6949794292449951, + "learning_rate": 5.181575433911883e-06, + "loss": 1.1508, + "mean_token_accuracy": 0.5568496011197567, + "num_tokens": 433084.0, + "step": 11120 + }, + { + "epoch": 20.96, + "grad_norm": 0.9725853204727173, + "learning_rate": 5.168224299065421e-06, + "loss": 1.2174, + "mean_token_accuracy": 0.5451920755207539, + "num_tokens": 447263.0, + "step": 11130 + }, + { + "epoch": 20.978823529411766, + "grad_norm": 1.1975111961364746, + "learning_rate": 5.154873164218959e-06, + "loss": 1.1483, + "mean_token_accuracy": 0.5572316914796829, + "num_tokens": 460142.0, + "step": 11140 + }, + { + "epoch": 20.997647058823528, + "grad_norm": 1.078298807144165, + "learning_rate": 5.141522029372496e-06, + "loss": 1.1615, + "mean_token_accuracy": 0.5542684197425842, + "num_tokens": 473192.0, + "step": 11150 + }, + { + "epoch": 21.01694117647059, + "grad_norm": 1.2463195323944092, + "learning_rate": 5.128170894526035e-06, + "loss": 1.2856, + "mean_token_accuracy": 0.5541327973691429, + "num_tokens": 486372.0, + "step": 11160 + }, + { + "epoch": 21.035764705882354, + "grad_norm": 1.2360081672668457, + "learning_rate": 5.114819759679574e-06, + "loss": 1.1388, + "mean_token_accuracy": 0.5694611296057701, + "num_tokens": 499191.0, + "step": 11170 + }, + { + "epoch": 21.05458823529412, + "grad_norm": 0.9578425288200378, + "learning_rate": 5.101468624833111e-06, + "loss": 1.1818, + "mean_token_accuracy": 0.548756854981184, + "num_tokens": 512720.0, + "step": 11180 + }, + { + "epoch": 21.07341176470588, + "grad_norm": 0.6617890000343323, + "learning_rate": 5.088117489986649e-06, + "loss": 1.1515, + "mean_token_accuracy": 0.5633066941052676, + "num_tokens": 525141.0, + "step": 11190 + }, + { + "epoch": 21.092235294117646, + "grad_norm": 0.5509127974510193, + "learning_rate": 5.0747663551401875e-06, + "loss": 1.1534, + "mean_token_accuracy": 0.5600371100008488, + "num_tokens": 539326.0, + "step": 11200 + }, + { + "epoch": 21.11105882352941, + "grad_norm": 0.7871003150939941, + "learning_rate": 5.061415220293725e-06, + "loss": 1.0991, + "mean_token_accuracy": 0.5748973291367292, + "num_tokens": 553010.0, + "step": 11210 + }, + { + "epoch": 21.129882352941177, + "grad_norm": 1.2488114833831787, + "learning_rate": 5.048064085447263e-06, + "loss": 1.1, + "mean_token_accuracy": 0.575805452466011, + "num_tokens": 566192.0, + "step": 11220 + }, + { + "epoch": 21.148705882352942, + "grad_norm": 0.6213930249214172, + "learning_rate": 5.034712950600802e-06, + "loss": 1.1342, + "mean_token_accuracy": 0.5743603181093931, + "num_tokens": 579897.0, + "step": 11230 + }, + { + "epoch": 21.167529411764704, + "grad_norm": 0.6450327634811401, + "learning_rate": 5.0213618157543396e-06, + "loss": 1.197, + "mean_token_accuracy": 0.5464676439762115, + "num_tokens": 594375.0, + "step": 11240 + }, + { + "epoch": 21.18635294117647, + "grad_norm": 1.1492058038711548, + "learning_rate": 5.008010680907878e-06, + "loss": 1.1822, + "mean_token_accuracy": 0.5506179232150317, + "num_tokens": 607333.0, + "step": 11250 + }, + { + "epoch": 21.205176470588235, + "grad_norm": 1.0695908069610596, + "learning_rate": 4.994659546061415e-06, + "loss": 1.1359, + "mean_token_accuracy": 0.5681717403233051, + "num_tokens": 620561.0, + "step": 11260 + }, + { + "epoch": 21.224, + "grad_norm": 0.608024001121521, + "learning_rate": 4.9813084112149534e-06, + "loss": 1.1696, + "mean_token_accuracy": 0.5604459267109633, + "num_tokens": 634382.0, + "step": 11270 + }, + { + "epoch": 21.242823529411766, + "grad_norm": 0.6441075801849365, + "learning_rate": 4.967957276368492e-06, + "loss": 1.144, + "mean_token_accuracy": 0.5593028951436281, + "num_tokens": 646437.0, + "step": 11280 + }, + { + "epoch": 21.26164705882353, + "grad_norm": 1.208881139755249, + "learning_rate": 4.95460614152203e-06, + "loss": 1.1171, + "mean_token_accuracy": 0.5644662406295538, + "num_tokens": 659651.0, + "step": 11290 + }, + { + "epoch": 21.280470588235293, + "grad_norm": 1.3741132020950317, + "learning_rate": 4.941255006675567e-06, + "loss": 1.1408, + "mean_token_accuracy": 0.5656964641064406, + "num_tokens": 673643.0, + "step": 11300 + }, + { + "epoch": 21.299294117647058, + "grad_norm": 1.710774302482605, + "learning_rate": 4.927903871829106e-06, + "loss": 1.1746, + "mean_token_accuracy": 0.5554309643805027, + "num_tokens": 686757.0, + "step": 11310 + }, + { + "epoch": 21.318117647058823, + "grad_norm": 0.5914443731307983, + "learning_rate": 4.914552736982644e-06, + "loss": 1.1724, + "mean_token_accuracy": 0.5584981873631477, + "num_tokens": 701156.0, + "step": 11320 + }, + { + "epoch": 21.33694117647059, + "grad_norm": 0.6047216653823853, + "learning_rate": 4.901201602136182e-06, + "loss": 1.1493, + "mean_token_accuracy": 0.570811814814806, + "num_tokens": 714373.0, + "step": 11330 + }, + { + "epoch": 21.355764705882354, + "grad_norm": 1.1371484994888306, + "learning_rate": 4.887850467289719e-06, + "loss": 1.1611, + "mean_token_accuracy": 0.5554365783929824, + "num_tokens": 727980.0, + "step": 11340 + }, + { + "epoch": 21.37458823529412, + "grad_norm": 0.6046891212463379, + "learning_rate": 4.8744993324432585e-06, + "loss": 1.1768, + "mean_token_accuracy": 0.5487030290067196, + "num_tokens": 741661.0, + "step": 11350 + }, + { + "epoch": 21.39341176470588, + "grad_norm": 1.0406830310821533, + "learning_rate": 4.861148197596796e-06, + "loss": 1.103, + "mean_token_accuracy": 0.5742270287126303, + "num_tokens": 754818.0, + "step": 11360 + }, + { + "epoch": 21.412235294117647, + "grad_norm": 1.8457794189453125, + "learning_rate": 4.847797062750334e-06, + "loss": 1.1727, + "mean_token_accuracy": 0.5553502965718508, + "num_tokens": 767996.0, + "step": 11370 + }, + { + "epoch": 21.431058823529412, + "grad_norm": 1.229186773300171, + "learning_rate": 4.834445927903872e-06, + "loss": 1.1148, + "mean_token_accuracy": 0.5711336594074965, + "num_tokens": 781314.0, + "step": 11380 + }, + { + "epoch": 21.449882352941177, + "grad_norm": 2.4556403160095215, + "learning_rate": 4.8210947930574106e-06, + "loss": 1.2005, + "mean_token_accuracy": 0.5530955422669649, + "num_tokens": 794540.0, + "step": 11390 + }, + { + "epoch": 21.468705882352943, + "grad_norm": 0.9994168281555176, + "learning_rate": 4.807743658210948e-06, + "loss": 1.1588, + "mean_token_accuracy": 0.5520365055650472, + "num_tokens": 807577.0, + "step": 11400 + }, + { + "epoch": 11.06690909090909, + "grad_norm": 2.4615001678466797, + "learning_rate": 2.7042869240445714e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.6851526271551848, + "num_tokens": 10722.0, + "step": 11410 + }, + { + "epoch": 11.07660606060606, + "grad_norm": 2.1516900062561035, + "learning_rate": 2.689960187285652e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.6662691086530685, + "num_tokens": 21293.0, + "step": 11420 + }, + { + "epoch": 11.08630303030303, + "grad_norm": 1.547285556793213, + "learning_rate": 2.675665601616777e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.6725459590554237, + "num_tokens": 32220.0, + "step": 11430 + }, + { + "epoch": 11.096, + "grad_norm": 1.2127219438552856, + "learning_rate": 2.6614032299085324e-06, + "loss": 1.0357, + "mean_token_accuracy": 0.6421669337898492, + "num_tokens": 43405.0, + "step": 11440 + }, + { + "epoch": 11.10569696969697, + "grad_norm": 2.609590530395508, + "learning_rate": 2.647173134889831e-06, + "loss": 0.955, + "mean_token_accuracy": 0.6785120502114296, + "num_tokens": 53503.0, + "step": 11450 + }, + { + "epoch": 11.11539393939394, + "grad_norm": 2.0895426273345947, + "learning_rate": 2.6329753791476143e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.6950253710150719, + "num_tokens": 62932.0, + "step": 11460 + }, + { + "epoch": 11.12509090909091, + "grad_norm": 1.0643393993377686, + "learning_rate": 2.6188100251265947e-06, + "loss": 0.965, + "mean_token_accuracy": 0.6781762517988682, + "num_tokens": 73725.0, + "step": 11470 + }, + { + "epoch": 11.13478787878788, + "grad_norm": 0.9910470843315125, + "learning_rate": 2.604677135128972e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.6687733806669712, + "num_tokens": 84995.0, + "step": 11480 + }, + { + "epoch": 11.144484848484849, + "grad_norm": 1.3115955591201782, + "learning_rate": 2.590576771314166e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.6800978854298592, + "num_tokens": 95231.0, + "step": 11490 + }, + { + "epoch": 11.154181818181819, + "grad_norm": 1.7872004508972168, + "learning_rate": 2.5765089956985357e-06, + "loss": 0.857, + "mean_token_accuracy": 0.7096298310905695, + "num_tokens": 104730.0, + "step": 11500 + }, + { + "epoch": 11.163878787878788, + "grad_norm": 1.7525864839553833, + "learning_rate": 2.56247387015511e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.7024530675262213, + "num_tokens": 114691.0, + "step": 11510 + }, + { + "epoch": 11.173575757575758, + "grad_norm": 1.5869275331497192, + "learning_rate": 2.5484714564133237e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.6953748039901256, + "num_tokens": 124358.0, + "step": 11520 + }, + { + "epoch": 11.183272727272728, + "grad_norm": 1.2352665662765503, + "learning_rate": 2.534501816058731e-06, + "loss": 0.9839, + "mean_token_accuracy": 0.6811668451875448, + "num_tokens": 135440.0, + "step": 11530 + }, + { + "epoch": 11.192969696969698, + "grad_norm": 0.9480632543563843, + "learning_rate": 2.5205650105327405e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.6818428047001361, + "num_tokens": 145555.0, + "step": 11540 + }, + { + "epoch": 11.202666666666667, + "grad_norm": 1.0198525190353394, + "learning_rate": 2.5066611011323505e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.6706477042287589, + "num_tokens": 157378.0, + "step": 11550 + }, + { + "epoch": 11.212363636363637, + "grad_norm": 0.9845815896987915, + "learning_rate": 2.4927901490098762e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.6655347641557455, + "num_tokens": 168558.0, + "step": 11560 + }, + { + "epoch": 11.222060606060605, + "grad_norm": 1.9387283325195312, + "learning_rate": 2.4789522151726764e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.6992917202413083, + "num_tokens": 179836.0, + "step": 11570 + }, + { + "epoch": 11.231757575757575, + "grad_norm": 1.2270708084106445, + "learning_rate": 2.4651473604828903e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.675117377564311, + "num_tokens": 190256.0, + "step": 11580 + }, + { + "epoch": 11.241454545454545, + "grad_norm": 1.0312174558639526, + "learning_rate": 2.4513756456571667e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.6721729058772326, + "num_tokens": 201487.0, + "step": 11590 + }, + { + "epoch": 11.251151515151514, + "grad_norm": 1.4611775875091553, + "learning_rate": 2.437637131266396e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.6916485130786896, + "num_tokens": 210683.0, + "step": 11600 + }, + { + "epoch": 11.260848484848484, + "grad_norm": 1.251896858215332, + "learning_rate": 2.4239318777354593e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.6964191533625126, + "num_tokens": 220616.0, + "step": 11610 + }, + { + "epoch": 11.270545454545454, + "grad_norm": 1.1485376358032227, + "learning_rate": 2.410259945342929e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.6866546850651503, + "num_tokens": 231221.0, + "step": 11620 + }, + { + "epoch": 11.280242424242424, + "grad_norm": 1.1417745351791382, + "learning_rate": 2.3966213942208363e-06, + "loss": 0.8653, + "mean_token_accuracy": 0.7093327675014734, + "num_tokens": 241191.0, + "step": 11630 + }, + { + "epoch": 11.289939393939393, + "grad_norm": 1.0089123249053955, + "learning_rate": 2.383016284354397e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.6817425429821015, + "num_tokens": 250917.0, + "step": 11640 + }, + { + "epoch": 11.299636363636363, + "grad_norm": 1.3917380571365356, + "learning_rate": 2.369444675581738e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.6756290566176176, + "num_tokens": 261760.0, + "step": 11650 + }, + { + "epoch": 11.309333333333333, + "grad_norm": 1.3879166841506958, + "learning_rate": 2.355906627593647e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.6554592750966549, + "num_tokens": 272414.0, + "step": 11660 + }, + { + "epoch": 11.319030303030303, + "grad_norm": 1.2711937427520752, + "learning_rate": 2.342402199933296e-06, + "loss": 0.8541, + "mean_token_accuracy": 0.7178003009408712, + "num_tokens": 282396.0, + "step": 11670 + }, + { + "epoch": 11.328727272727273, + "grad_norm": 0.8704134821891785, + "learning_rate": 2.3289314519960016e-06, + "loss": 0.9997, + "mean_token_accuracy": 0.6554147530347109, + "num_tokens": 292871.0, + "step": 11680 + }, + { + "epoch": 11.338424242424242, + "grad_norm": 1.9305732250213623, + "learning_rate": 2.315494443028937e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.6782477792352438, + "num_tokens": 303864.0, + "step": 11690 + }, + { + "epoch": 11.348121212121212, + "grad_norm": 1.0496376752853394, + "learning_rate": 2.30209123213089e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.6711665719747544, + "num_tokens": 314081.0, + "step": 11700 + }, + { + "epoch": 11.357818181818182, + "grad_norm": 1.2640137672424316, + "learning_rate": 2.288721878251996e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.6923185490071774, + "num_tokens": 324585.0, + "step": 11710 + }, + { + "epoch": 11.367515151515152, + "grad_norm": 1.2028977870941162, + "learning_rate": 2.275386440193479e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.70025773383677, + "num_tokens": 334337.0, + "step": 11720 + }, + { + "epoch": 11.377212121212121, + "grad_norm": 1.366431713104248, + "learning_rate": 2.2620849766073993e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.6744892597198486, + "num_tokens": 344863.0, + "step": 11730 + }, + { + "epoch": 11.386909090909091, + "grad_norm": 1.1718578338623047, + "learning_rate": 2.248817545996387e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7153472680598497, + "num_tokens": 354780.0, + "step": 11740 + }, + { + "epoch": 11.39660606060606, + "grad_norm": 1.7113317251205444, + "learning_rate": 2.235584206713385e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.6829107455909252, + "num_tokens": 365921.0, + "step": 11750 + }, + { + "epoch": 11.40630303030303, + "grad_norm": 0.9357189536094666, + "learning_rate": 2.2223850169613993e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.6629223726689816, + "num_tokens": 376384.0, + "step": 11760 + }, + { + "epoch": 11.416, + "grad_norm": 1.9502002000808716, + "learning_rate": 2.209220034793237e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.6891988046467304, + "num_tokens": 385616.0, + "step": 11770 + }, + { + "epoch": 11.42569696969697, + "grad_norm": 0.9912474751472473, + "learning_rate": 2.1960893181112553e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.6538973189890385, + "num_tokens": 396111.0, + "step": 11780 + }, + { + "epoch": 11.43539393939394, + "grad_norm": 1.6034260988235474, + "learning_rate": 2.182992924667101e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.658538245409727, + "num_tokens": 407225.0, + "step": 11790 + }, + { + "epoch": 11.44509090909091, + "grad_norm": 0.7665310502052307, + "learning_rate": 2.1699309120614663e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.6684748906642198, + "num_tokens": 417932.0, + "step": 11800 + }, + { + "epoch": 11.45478787878788, + "grad_norm": 1.4279521703720093, + "learning_rate": 2.1569033377438243e-06, + "loss": 1.0331, + "mean_token_accuracy": 0.6504704430699348, + "num_tokens": 429943.0, + "step": 11810 + }, + { + "epoch": 11.46448484848485, + "grad_norm": 1.4924397468566895, + "learning_rate": 2.1439102590121807e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.6776580080389977, + "num_tokens": 440423.0, + "step": 11820 + }, + { + "epoch": 11.474181818181819, + "grad_norm": 1.0187861919403076, + "learning_rate": 2.1309517330128217e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.6710641365498304, + "num_tokens": 451528.0, + "step": 11830 + }, + { + "epoch": 11.483878787878789, + "grad_norm": 1.2168591022491455, + "learning_rate": 2.1180278167400726e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.6835467047989369, + "num_tokens": 461950.0, + "step": 11840 + }, + { + "epoch": 11.493575757575758, + "grad_norm": 0.7139029502868652, + "learning_rate": 2.105138567036026e-06, + "loss": 0.984, + "mean_token_accuracy": 0.6716462299227715, + "num_tokens": 472203.0, + "step": 11850 + }, + { + "epoch": 11.503272727272726, + "grad_norm": 0.9237242341041565, + "learning_rate": 2.09228404059031e-06, + "loss": 1.0324, + "mean_token_accuracy": 0.653912478685379, + "num_tokens": 484302.0, + "step": 11860 + }, + { + "epoch": 11.512969696969696, + "grad_norm": 1.539085865020752, + "learning_rate": 2.0794642939398315e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.6944774236530066, + "num_tokens": 494362.0, + "step": 11870 + }, + { + "epoch": 11.522666666666666, + "grad_norm": 0.7662031054496765, + "learning_rate": 2.066679383468524e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.6761994324624538, + "num_tokens": 505137.0, + "step": 11880 + }, + { + "epoch": 11.532363636363636, + "grad_norm": 1.0061405897140503, + "learning_rate": 2.0539293654071167e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.669762023538351, + "num_tokens": 515456.0, + "step": 11890 + }, + { + "epoch": 11.542060606060605, + "grad_norm": 1.5532357692718506, + "learning_rate": 2.0412142958328586e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.7063564002513886, + "num_tokens": 525883.0, + "step": 11900 + }, + { + "epoch": 11.551757575757575, + "grad_norm": 0.8483320474624634, + "learning_rate": 2.028534230669296e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.6627003367990255, + "num_tokens": 537384.0, + "step": 11910 + }, + { + "epoch": 11.561454545454545, + "grad_norm": 0.933795154094696, + "learning_rate": 2.015889225686022e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.6608807422220707, + "num_tokens": 548906.0, + "step": 11920 + }, + { + "epoch": 11.571151515151515, + "grad_norm": 2.9793217182159424, + "learning_rate": 2.0032793364984225e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.6697162009775639, + "num_tokens": 559505.0, + "step": 11930 + }, + { + "epoch": 11.580848484848485, + "grad_norm": 1.162315845489502, + "learning_rate": 1.9907046185674374e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7048578035086394, + "num_tokens": 569706.0, + "step": 11940 + }, + { + "epoch": 11.590545454545454, + "grad_norm": 0.6796969175338745, + "learning_rate": 1.978165127199313e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.6569722048938275, + "num_tokens": 580920.0, + "step": 11950 + }, + { + "epoch": 11.600242424242424, + "grad_norm": 0.8585827946662903, + "learning_rate": 1.9656609175453724e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.6617213696241379, + "num_tokens": 591801.0, + "step": 11960 + }, + { + "epoch": 11.609939393939394, + "grad_norm": 2.407949924468994, + "learning_rate": 1.9531920446017514e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.7031644247472286, + "num_tokens": 601494.0, + "step": 11970 + }, + { + "epoch": 11.619636363636364, + "grad_norm": 1.2153403759002686, + "learning_rate": 1.940758563209172e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.667534577473998, + "num_tokens": 612759.0, + "step": 11980 + }, + { + "epoch": 11.629333333333333, + "grad_norm": 1.6503994464874268, + "learning_rate": 1.928360528052695e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.6706000864505768, + "num_tokens": 623931.0, + "step": 11990 + }, + { + "epoch": 11.639030303030303, + "grad_norm": 2.4140963554382324, + "learning_rate": 1.9159979936614813e-06, + "loss": 0.927, + "mean_token_accuracy": 0.6892816323786974, + "num_tokens": 634238.0, + "step": 12000 + }, + { + "epoch": 11.648727272727273, + "grad_norm": 1.6788582801818848, + "learning_rate": 1.9036710144085568e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.697110791504383, + "num_tokens": 644919.0, + "step": 12010 + }, + { + "epoch": 11.658424242424243, + "grad_norm": 1.9367320537567139, + "learning_rate": 1.891379644510566e-06, + "loss": 1.038, + "mean_token_accuracy": 0.6408150486648083, + "num_tokens": 655529.0, + "step": 12020 + }, + { + "epoch": 11.668121212121212, + "grad_norm": 1.541839361190796, + "learning_rate": 1.8791239380275262e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.6883293610066176, + "num_tokens": 665483.0, + "step": 12030 + }, + { + "epoch": 11.677818181818182, + "grad_norm": 0.8563993573188782, + "learning_rate": 1.8669039488626162e-06, + "loss": 0.928, + "mean_token_accuracy": 0.6791775230318308, + "num_tokens": 676255.0, + "step": 12040 + }, + { + "epoch": 11.687515151515152, + "grad_norm": 1.097931981086731, + "learning_rate": 1.8547197307619102e-06, + "loss": 1.0107, + "mean_token_accuracy": 0.645743177831173, + "num_tokens": 687475.0, + "step": 12050 + }, + { + "epoch": 11.697212121212122, + "grad_norm": 1.8921289443969727, + "learning_rate": 1.8425713373141597e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.689141795784235, + "num_tokens": 697582.0, + "step": 12060 + }, + { + "epoch": 11.706909090909091, + "grad_norm": 1.5931812524795532, + "learning_rate": 1.830458821950546e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.6943042069673538, + "num_tokens": 707522.0, + "step": 12070 + }, + { + "epoch": 11.716606060606061, + "grad_norm": 1.7966309785842896, + "learning_rate": 1.8183822379444604e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.6829646300524473, + "num_tokens": 718288.0, + "step": 12080 + }, + { + "epoch": 11.726303030303031, + "grad_norm": 1.5532159805297852, + "learning_rate": 1.8063416384112532e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.6941913302987814, + "num_tokens": 727672.0, + "step": 12090 + }, + { + "epoch": 11.736, + "grad_norm": 0.9339669942855835, + "learning_rate": 1.7943370763080093e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.652100894600153, + "num_tokens": 738798.0, + "step": 12100 + }, + { + "epoch": 11.74569696969697, + "grad_norm": 1.5431864261627197, + "learning_rate": 1.7823686044333134e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.6828689679503441, + "num_tokens": 749829.0, + "step": 12110 + }, + { + "epoch": 11.75539393939394, + "grad_norm": 1.6083624362945557, + "learning_rate": 1.7704362754270143e-06, + "loss": 0.8534, + "mean_token_accuracy": 0.7035974383354187, + "num_tokens": 759547.0, + "step": 12120 + }, + { + "epoch": 11.765090909090908, + "grad_norm": 1.150327444076538, + "learning_rate": 1.7585401417700076e-06, + "loss": 0.8699, + "mean_token_accuracy": 0.6958384934812785, + "num_tokens": 769350.0, + "step": 12130 + }, + { + "epoch": 11.77478787878788, + "grad_norm": 1.8415894508361816, + "learning_rate": 1.7466802557839834e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.6662904676049948, + "num_tokens": 780015.0, + "step": 12140 + }, + { + "epoch": 11.784484848484848, + "grad_norm": 2.885213851928711, + "learning_rate": 1.7348566696312108e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.6689011044800282, + "num_tokens": 790176.0, + "step": 12150 + }, + { + "epoch": 11.794181818181817, + "grad_norm": 1.567074179649353, + "learning_rate": 1.7230694353143041e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.69071399345994, + "num_tokens": 801125.0, + "step": 12160 + }, + { + "epoch": 11.803878787878787, + "grad_norm": 0.8478710651397705, + "learning_rate": 1.7113186046759956e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.6843322183936834, + "num_tokens": 811193.0, + "step": 12170 + }, + { + "epoch": 11.813575757575757, + "grad_norm": 1.2154415845870972, + "learning_rate": 1.6996042293989046e-06, + "loss": 0.8842, + "mean_token_accuracy": 0.7175424035638571, + "num_tokens": 822012.0, + "step": 12180 + }, + { + "epoch": 11.823272727272727, + "grad_norm": 1.4030102491378784, + "learning_rate": 1.6879263610053109e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.6776260420680046, + "num_tokens": 832565.0, + "step": 12190 + }, + { + "epoch": 11.832969696969696, + "grad_norm": 0.6021126508712769, + "learning_rate": 1.6762850508569383e-06, + "loss": 0.9, + "mean_token_accuracy": 0.6897207599133253, + "num_tokens": 843002.0, + "step": 12200 + }, + { + "epoch": 11.842666666666666, + "grad_norm": 1.1585458517074585, + "learning_rate": 1.6646803501547104e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.6929288487881422, + "num_tokens": 853533.0, + "step": 12210 + }, + { + "epoch": 11.852363636363636, + "grad_norm": 0.7529911398887634, + "learning_rate": 1.653112309938537e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.6742060914635658, + "num_tokens": 864186.0, + "step": 12220 + }, + { + "epoch": 11.862060606060606, + "grad_norm": 1.9934836626052856, + "learning_rate": 1.6415809810870854e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.6902973093092442, + "num_tokens": 875834.0, + "step": 12230 + }, + { + "epoch": 11.871757575757576, + "grad_norm": 1.0597401857376099, + "learning_rate": 1.6300864143175665e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.6818343084305525, + "num_tokens": 886161.0, + "step": 12240 + }, + { + "epoch": 11.881454545454545, + "grad_norm": 1.6065829992294312, + "learning_rate": 1.6186286601854962e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.6795123651623726, + "num_tokens": 895476.0, + "step": 12250 + }, + { + "epoch": 11.891151515151515, + "grad_norm": 1.7515671253204346, + "learning_rate": 1.6072077690844824e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.6692648060619831, + "num_tokens": 906427.0, + "step": 12260 + }, + { + "epoch": 11.900848484848485, + "grad_norm": 1.2341034412384033, + "learning_rate": 1.5958237912460028e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.6872673355042934, + "num_tokens": 916474.0, + "step": 12270 + }, + { + "epoch": 11.910545454545455, + "grad_norm": 1.444577693939209, + "learning_rate": 1.5844767767391799e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.6981671530753374, + "num_tokens": 926428.0, + "step": 12280 + }, + { + "epoch": 11.920242424242424, + "grad_norm": 1.0555857419967651, + "learning_rate": 1.5731667754705716e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.6644821926951409, + "num_tokens": 936011.0, + "step": 12290 + }, + { + "epoch": 11.929939393939394, + "grad_norm": 1.8039774894714355, + "learning_rate": 1.5618938371839366e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.6748083829879761, + "num_tokens": 946502.0, + "step": 12300 + }, + { + "epoch": 11.939636363636364, + "grad_norm": 1.572402834892273, + "learning_rate": 1.550658011460019e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7099429033696651, + "num_tokens": 956754.0, + "step": 12310 + }, + { + "epoch": 11.949333333333334, + "grad_norm": 0.8282158374786377, + "learning_rate": 1.5394593477163456e-06, + "loss": 0.8845, + "mean_token_accuracy": 0.7014227926731109, + "num_tokens": 966536.0, + "step": 12320 + }, + { + "epoch": 11.959030303030303, + "grad_norm": 1.0083385705947876, + "learning_rate": 1.5282978952069904e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.670442745834589, + "num_tokens": 976841.0, + "step": 12330 + }, + { + "epoch": 11.968727272727273, + "grad_norm": 2.277254581451416, + "learning_rate": 1.5171737030223632e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.6729221884161234, + "num_tokens": 987549.0, + "step": 12340 + }, + { + "epoch": 11.978424242424243, + "grad_norm": 0.8625606894493103, + "learning_rate": 1.5060868200889955e-06, + "loss": 0.8779, + "mean_token_accuracy": 0.7104882929474116, + "num_tokens": 996977.0, + "step": 12350 + }, + { + "epoch": 11.988121212121213, + "grad_norm": 1.0558991432189941, + "learning_rate": 1.4950372951693316e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.678413325548172, + "num_tokens": 1007009.0, + "step": 12360 + }, + { + "epoch": 11.997818181818182, + "grad_norm": 1.0509843826293945, + "learning_rate": 1.4840251768614987e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.7061600238084793, + "num_tokens": 1016337.0, + "step": 12370 + }, + { + "epoch": 12.007757575757577, + "grad_norm": 1.150305986404419, + "learning_rate": 1.473050513599107e-06, + "loss": 1.0264, + "mean_token_accuracy": 0.6960650755137932, + "num_tokens": 1027200.0, + "step": 12380 + }, + { + "epoch": 12.017454545454546, + "grad_norm": 0.8161555528640747, + "learning_rate": 1.462113353651029e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.6727097641676665, + "num_tokens": 1037572.0, + "step": 12390 + }, + { + "epoch": 12.027151515151516, + "grad_norm": 1.7541327476501465, + "learning_rate": 1.4512137451211884e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.6972331315279007, + "num_tokens": 1046891.0, + "step": 12400 + }, + { + "epoch": 12.036848484848484, + "grad_norm": 0.9530600309371948, + "learning_rate": 1.4403517359483577e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.68136284686625, + "num_tokens": 10434.0, + "step": 12410 + }, + { + "epoch": 12.046545454545454, + "grad_norm": 1.3567638397216797, + "learning_rate": 1.42952737390593e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.676530422642827, + "num_tokens": 22038.0, + "step": 12420 + }, + { + "epoch": 12.056242424242424, + "grad_norm": 1.3663750886917114, + "learning_rate": 1.4187407066017245e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.6606147531419992, + "num_tokens": 32741.0, + "step": 12430 + }, + { + "epoch": 12.065939393939393, + "grad_norm": 1.086794376373291, + "learning_rate": 1.4079917814777667e-06, + "loss": 0.8693, + "mean_token_accuracy": 0.7050681680440902, + "num_tokens": 41500.0, + "step": 12440 + }, + { + "epoch": 12.075636363636363, + "grad_norm": 0.9989749193191528, + "learning_rate": 1.3972806458100885e-06, + "loss": 0.8294, + "mean_token_accuracy": 0.7239202216267586, + "num_tokens": 50782.0, + "step": 12450 + }, + { + "epoch": 12.085333333333333, + "grad_norm": 1.2325557470321655, + "learning_rate": 1.3866073467085127e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.6962772708386182, + "num_tokens": 60816.0, + "step": 12460 + }, + { + "epoch": 12.095030303030303, + "grad_norm": 1.5396286249160767, + "learning_rate": 1.3759719311164477e-06, + "loss": 0.995, + "mean_token_accuracy": 0.6646735660731793, + "num_tokens": 71812.0, + "step": 12470 + }, + { + "epoch": 12.104727272727272, + "grad_norm": 1.008445382118225, + "learning_rate": 1.3653744458106876e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.6767258770763874, + "num_tokens": 83843.0, + "step": 12480 + }, + { + "epoch": 12.114424242424242, + "grad_norm": 1.6044663190841675, + "learning_rate": 1.3548149374011986e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.701893288269639, + "num_tokens": 94582.0, + "step": 12490 + }, + { + "epoch": 12.124121212121212, + "grad_norm": 1.4867864847183228, + "learning_rate": 1.3442934523309137e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.678819801285863, + "num_tokens": 106037.0, + "step": 12500 + }, + { + "epoch": 12.133818181818182, + "grad_norm": 1.6262177228927612, + "learning_rate": 1.3338100368755346e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.6883869960904121, + "num_tokens": 117393.0, + "step": 12510 + }, + { + "epoch": 12.143515151515151, + "grad_norm": 2.58561635017395, + "learning_rate": 1.3233647371433222e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.6864805597811937, + "num_tokens": 127326.0, + "step": 12520 + }, + { + "epoch": 12.153212121212121, + "grad_norm": 1.6916279792785645, + "learning_rate": 1.3129575990749e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.6888086255639791, + "num_tokens": 137539.0, + "step": 12530 + }, + { + "epoch": 12.162909090909091, + "grad_norm": 1.5663442611694336, + "learning_rate": 1.3025886684430467e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.6744012456387282, + "num_tokens": 148648.0, + "step": 12540 + }, + { + "epoch": 12.17260606060606, + "grad_norm": 1.4812220335006714, + "learning_rate": 1.2922579908524946e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7043029896914959, + "num_tokens": 158590.0, + "step": 12550 + }, + { + "epoch": 12.18230303030303, + "grad_norm": 1.7226941585540771, + "learning_rate": 1.2819656117397328e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.6753247026354074, + "num_tokens": 170030.0, + "step": 12560 + }, + { + "epoch": 12.192, + "grad_norm": 0.7470999956130981, + "learning_rate": 1.2717115763728083e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.6908956177532672, + "num_tokens": 179668.0, + "step": 12570 + }, + { + "epoch": 12.20169696969697, + "grad_norm": 1.0085124969482422, + "learning_rate": 1.2614959298511231e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.6647142685949803, + "num_tokens": 190351.0, + "step": 12580 + }, + { + "epoch": 12.21139393939394, + "grad_norm": 0.801249623298645, + "learning_rate": 1.2513187171052288e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.6591165266931057, + "num_tokens": 200784.0, + "step": 12590 + }, + { + "epoch": 12.22109090909091, + "grad_norm": 1.1452405452728271, + "learning_rate": 1.2411799828966497e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.6782014291733504, + "num_tokens": 210672.0, + "step": 12600 + }, + { + "epoch": 12.23078787878788, + "grad_norm": 1.4320217370986938, + "learning_rate": 1.2310797718176658e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.6874732073396445, + "num_tokens": 220175.0, + "step": 12610 + }, + { + "epoch": 12.240484848484849, + "grad_norm": 1.0549358129501343, + "learning_rate": 1.221018128291127e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.6930529691278935, + "num_tokens": 230511.0, + "step": 12620 + }, + { + "epoch": 12.250181818181819, + "grad_norm": 0.7888785004615784, + "learning_rate": 1.2109950965702532e-06, + "loss": 0.962, + "mean_token_accuracy": 0.6716390445828437, + "num_tokens": 240893.0, + "step": 12630 + }, + { + "epoch": 12.259878787878788, + "grad_norm": 2.5039796829223633, + "learning_rate": 1.2010107207384437e-06, + "loss": 0.8335, + "mean_token_accuracy": 0.7178800087422132, + "num_tokens": 250554.0, + "step": 12640 + }, + { + "epoch": 12.269575757575758, + "grad_norm": 1.5427664518356323, + "learning_rate": 1.1910650447090798e-06, + "loss": 1.0129, + "mean_token_accuracy": 0.6641611870378256, + "num_tokens": 261026.0, + "step": 12650 + }, + { + "epoch": 12.279272727272728, + "grad_norm": 1.7952816486358643, + "learning_rate": 1.1811581122253335e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.6731622900813818, + "num_tokens": 271855.0, + "step": 12660 + }, + { + "epoch": 12.288969696969698, + "grad_norm": 1.4959173202514648, + "learning_rate": 1.171289966859973e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.6812974836677312, + "num_tokens": 281878.0, + "step": 12670 + }, + { + "epoch": 12.298666666666668, + "grad_norm": 0.7014359831809998, + "learning_rate": 1.1614606520151716e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.6856089878827334, + "num_tokens": 292658.0, + "step": 12680 + }, + { + "epoch": 12.308363636363636, + "grad_norm": 0.6972899436950684, + "learning_rate": 1.1516702109223243e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.6989801757037639, + "num_tokens": 302011.0, + "step": 12690 + }, + { + "epoch": 12.318060606060605, + "grad_norm": 1.2687288522720337, + "learning_rate": 1.1419186866418452e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.6923393607139587, + "num_tokens": 312147.0, + "step": 12700 + }, + { + "epoch": 12.327757575757575, + "grad_norm": 1.3525540828704834, + "learning_rate": 1.1322061220629855e-06, + "loss": 0.8962, + "mean_token_accuracy": 0.684671938046813, + "num_tokens": 323474.0, + "step": 12710 + }, + { + "epoch": 12.337454545454545, + "grad_norm": 1.2294106483459473, + "learning_rate": 1.122532559903644e-06, + "loss": 1.0468, + "mean_token_accuracy": 0.628922751918435, + "num_tokens": 334923.0, + "step": 12720 + }, + { + "epoch": 12.347151515151515, + "grad_norm": 1.096246600151062, + "learning_rate": 1.1128980427101766e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.683203124627471, + "num_tokens": 344506.0, + "step": 12730 + }, + { + "epoch": 12.356848484848484, + "grad_norm": 1.3699408769607544, + "learning_rate": 1.1033026128572156e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.6612563081085682, + "num_tokens": 355007.0, + "step": 12740 + }, + { + "epoch": 12.366545454545454, + "grad_norm": 1.7355482578277588, + "learning_rate": 1.0937463125474724e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.6721325762569904, + "num_tokens": 365829.0, + "step": 12750 + }, + { + "epoch": 12.376242424242424, + "grad_norm": 2.603883981704712, + "learning_rate": 1.084229183811566e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.6898716945201159, + "num_tokens": 376436.0, + "step": 12760 + }, + { + "epoch": 12.385939393939394, + "grad_norm": 1.0586647987365723, + "learning_rate": 1.0747512685078264e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.6626970659941435, + "num_tokens": 387389.0, + "step": 12770 + }, + { + "epoch": 12.395636363636363, + "grad_norm": 1.6182094812393188, + "learning_rate": 1.0653126083221143e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.6970617674291134, + "num_tokens": 397693.0, + "step": 12780 + }, + { + "epoch": 12.405333333333333, + "grad_norm": 1.9159958362579346, + "learning_rate": 1.05591324476764e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.6668812599033117, + "num_tokens": 408904.0, + "step": 12790 + }, + { + "epoch": 12.415030303030303, + "grad_norm": 1.2994496822357178, + "learning_rate": 1.046553219184776e-06, + "loss": 0.8753, + "mean_token_accuracy": 0.7094687633216381, + "num_tokens": 418860.0, + "step": 12800 + }, + { + "epoch": 12.424727272727273, + "grad_norm": 1.3715529441833496, + "learning_rate": 1.0372325727408838e-06, + "loss": 0.9217, + "mean_token_accuracy": 0.6802921980619431, + "num_tokens": 429236.0, + "step": 12810 + }, + { + "epoch": 12.434424242424242, + "grad_norm": 0.981478750705719, + "learning_rate": 1.0279513464301204e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.6767802778631449, + "num_tokens": 439169.0, + "step": 12820 + }, + { + "epoch": 12.444121212121212, + "grad_norm": 0.9200496077537537, + "learning_rate": 1.0187095810732705e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.6698127511888743, + "num_tokens": 450423.0, + "step": 12830 + }, + { + "epoch": 12.453818181818182, + "grad_norm": 1.1707184314727783, + "learning_rate": 1.0095073173175552e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.6921768002212048, + "num_tokens": 461570.0, + "step": 12840 + }, + { + "epoch": 12.463515151515152, + "grad_norm": 0.8096593022346497, + "learning_rate": 1.0003445956364666e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.6814159829169512, + "num_tokens": 471981.0, + "step": 12850 + }, + { + "epoch": 12.473212121212121, + "grad_norm": 1.0456788539886475, + "learning_rate": 9.912214563295787e-07, + "loss": 0.9224, + "mean_token_accuracy": 0.68552374728024, + "num_tokens": 482091.0, + "step": 12860 + }, + { + "epoch": 12.482909090909091, + "grad_norm": 1.2879787683486938, + "learning_rate": 9.821379395223684e-07, + "loss": 0.9833, + "mean_token_accuracy": 0.6662912800908088, + "num_tokens": 493252.0, + "step": 12870 + }, + { + "epoch": 12.492606060606061, + "grad_norm": 1.3891626596450806, + "learning_rate": 9.730940851660554e-07, + "loss": 0.9407, + "mean_token_accuracy": 0.7015823908150196, + "num_tokens": 504067.0, + "step": 12880 + }, + { + "epoch": 12.50230303030303, + "grad_norm": 1.663533329963684, + "learning_rate": 9.640899330374088e-07, + "loss": 0.8911, + "mean_token_accuracy": 0.6906427904963494, + "num_tokens": 514270.0, + "step": 12890 + }, + { + "epoch": 12.512, + "grad_norm": 1.9871175289154053, + "learning_rate": 9.55125522738579e-07, + "loss": 0.9259, + "mean_token_accuracy": 0.6906178455799818, + "num_tokens": 524617.0, + "step": 12900 + }, + { + "epoch": 12.52169696969697, + "grad_norm": 0.9362130165100098, + "learning_rate": 9.462008936969258e-07, + "loss": 0.9653, + "mean_token_accuracy": 0.6761426538228988, + "num_tokens": 536008.0, + "step": 12910 + }, + { + "epoch": 12.53139393939394, + "grad_norm": 1.086140513420105, + "learning_rate": 9.373160851648422e-07, + "loss": 0.8916, + "mean_token_accuracy": 0.6984883040189743, + "num_tokens": 545747.0, + "step": 12920 + }, + { + "epoch": 12.54109090909091, + "grad_norm": 1.05403470993042, + "learning_rate": 9.28471136219582e-07, + "loss": 0.9704, + "mean_token_accuracy": 0.6681053042411804, + "num_tokens": 556761.0, + "step": 12930 + }, + { + "epoch": 12.55078787878788, + "grad_norm": 0.9770132303237915, + "learning_rate": 9.196660857630857e-07, + "loss": 0.9625, + "mean_token_accuracy": 0.6729031853377819, + "num_tokens": 566793.0, + "step": 12940 + }, + { + "epoch": 12.56048484848485, + "grad_norm": 2.408095598220825, + "learning_rate": 9.109009725218165e-07, + "loss": 0.9268, + "mean_token_accuracy": 0.6797478631138801, + "num_tokens": 577467.0, + "step": 12950 + }, + { + "epoch": 12.570181818181819, + "grad_norm": 1.0821237564086914, + "learning_rate": 9.021758350465804e-07, + "loss": 1.0222, + "mean_token_accuracy": 0.6477519739419222, + "num_tokens": 588108.0, + "step": 12960 + }, + { + "epoch": 12.579878787878787, + "grad_norm": 0.7974284887313843, + "learning_rate": 8.93490711712367e-07, + "loss": 0.9717, + "mean_token_accuracy": 0.6615799587219954, + "num_tokens": 598348.0, + "step": 12970 + }, + { + "epoch": 12.589575757575757, + "grad_norm": 0.9920361638069153, + "learning_rate": 8.848456407181715e-07, + "loss": 0.9195, + "mean_token_accuracy": 0.6796383894979954, + "num_tokens": 607847.0, + "step": 12980 + }, + { + "epoch": 12.599272727272727, + "grad_norm": 1.929929494857788, + "learning_rate": 8.762406600868301e-07, + "loss": 0.9424, + "mean_token_accuracy": 0.6750466857105494, + "num_tokens": 618641.0, + "step": 12990 + }, + { + "epoch": 12.608969696969696, + "grad_norm": 0.9798093438148499, + "learning_rate": 8.676758076648562e-07, + "loss": 0.9802, + "mean_token_accuracy": 0.6532435789704323, + "num_tokens": 629445.0, + "step": 13000 + }, + { + "epoch": 12.618666666666666, + "grad_norm": 1.7001301050186157, + "learning_rate": 8.59151121122268e-07, + "loss": 0.9055, + "mean_token_accuracy": 0.6827256765216589, + "num_tokens": 640368.0, + "step": 13010 + }, + { + "epoch": 12.628363636363636, + "grad_norm": 1.0197906494140625, + "learning_rate": 8.506666379524275e-07, + "loss": 0.9016, + "mean_token_accuracy": 0.6827419150620699, + "num_tokens": 650484.0, + "step": 13020 + }, + { + "epoch": 12.638060606060606, + "grad_norm": 2.6649887561798096, + "learning_rate": 8.4222239547187e-07, + "loss": 0.892, + "mean_token_accuracy": 0.6925595041364432, + "num_tokens": 660702.0, + "step": 13030 + }, + { + "epoch": 12.647757575757575, + "grad_norm": 1.020989179611206, + "learning_rate": 8.338184308201535e-07, + "loss": 0.9017, + "mean_token_accuracy": 0.6905462071299553, + "num_tokens": 671682.0, + "step": 13040 + }, + { + "epoch": 12.657454545454545, + "grad_norm": 1.4303945302963257, + "learning_rate": 8.254547809596747e-07, + "loss": 0.9703, + "mean_token_accuracy": 0.6805687319487334, + "num_tokens": 682100.0, + "step": 13050 + }, + { + "epoch": 12.667151515151515, + "grad_norm": 1.8320350646972656, + "learning_rate": 8.171314826755228e-07, + "loss": 0.9739, + "mean_token_accuracy": 0.66879703104496, + "num_tokens": 692660.0, + "step": 13060 + }, + { + "epoch": 12.676848484848485, + "grad_norm": 0.9438029527664185, + "learning_rate": 8.088485725753114e-07, + "loss": 0.9212, + "mean_token_accuracy": 0.6848585486412049, + "num_tokens": 702875.0, + "step": 13070 + }, + { + "epoch": 12.686545454545454, + "grad_norm": 2.9450020790100098, + "learning_rate": 8.006060870890165e-07, + "loss": 0.876, + "mean_token_accuracy": 0.6980018597096205, + "num_tokens": 712292.0, + "step": 13080 + }, + { + "epoch": 12.696242424242424, + "grad_norm": 1.4857258796691895, + "learning_rate": 7.924040624688245e-07, + "loss": 0.8641, + "mean_token_accuracy": 0.7006300635635853, + "num_tokens": 722244.0, + "step": 13090 + }, + { + "epoch": 12.705939393939394, + "grad_norm": 1.02292799949646, + "learning_rate": 7.842425347889582e-07, + "loss": 0.9976, + "mean_token_accuracy": 0.6541789300739765, + "num_tokens": 733252.0, + "step": 13100 + }, + { + "epoch": 12.715636363636364, + "grad_norm": 1.1031875610351562, + "learning_rate": 7.761215399455324e-07, + "loss": 0.9232, + "mean_token_accuracy": 0.6899745035916567, + "num_tokens": 744027.0, + "step": 13110 + }, + { + "epoch": 12.725333333333333, + "grad_norm": 1.4371963739395142, + "learning_rate": 7.680411136563837e-07, + "loss": 0.9818, + "mean_token_accuracy": 0.6558696981519461, + "num_tokens": 754156.0, + "step": 13120 + }, + { + "epoch": 12.735030303030303, + "grad_norm": 1.3838204145431519, + "learning_rate": 7.600012914609301e-07, + "loss": 0.912, + "mean_token_accuracy": 0.7075278196483851, + "num_tokens": 763732.0, + "step": 13130 + }, + { + "epoch": 12.744727272727273, + "grad_norm": 0.7802479267120361, + "learning_rate": 7.520021087199925e-07, + "loss": 0.916, + "mean_token_accuracy": 0.6960792735219001, + "num_tokens": 773456.0, + "step": 13140 + }, + { + "epoch": 12.754424242424243, + "grad_norm": 1.3201979398727417, + "learning_rate": 7.440436006156559e-07, + "loss": 0.9347, + "mean_token_accuracy": 0.6893177561461925, + "num_tokens": 784266.0, + "step": 13150 + }, + { + "epoch": 12.764121212121212, + "grad_norm": 0.9860504269599915, + "learning_rate": 7.361258021511142e-07, + "loss": 0.9249, + "mean_token_accuracy": 0.6765072204172611, + "num_tokens": 794396.0, + "step": 13160 + }, + { + "epoch": 12.773818181818182, + "grad_norm": 1.6493189334869385, + "learning_rate": 7.282487481505041e-07, + "loss": 0.9379, + "mean_token_accuracy": 0.671536460146308, + "num_tokens": 804843.0, + "step": 13170 + }, + { + "epoch": 12.783515151515152, + "grad_norm": 0.8871903419494629, + "learning_rate": 7.204124732587659e-07, + "loss": 0.8677, + "mean_token_accuracy": 0.7121831141412258, + "num_tokens": 815821.0, + "step": 13180 + }, + { + "epoch": 12.793212121212122, + "grad_norm": 1.6710381507873535, + "learning_rate": 7.126170119414799e-07, + "loss": 0.9455, + "mean_token_accuracy": 0.6882101558148861, + "num_tokens": 826418.0, + "step": 13190 + }, + { + "epoch": 12.802909090909091, + "grad_norm": 1.0449455976486206, + "learning_rate": 7.048623984847203e-07, + "loss": 0.9237, + "mean_token_accuracy": 0.6743796251714229, + "num_tokens": 837180.0, + "step": 13200 + }, + { + "epoch": 12.812606060606061, + "grad_norm": 1.153255581855774, + "learning_rate": 6.971486669949102e-07, + "loss": 0.9745, + "mean_token_accuracy": 0.6699652068316937, + "num_tokens": 847602.0, + "step": 13210 + }, + { + "epoch": 12.822303030303031, + "grad_norm": 1.069661021232605, + "learning_rate": 6.894758513986566e-07, + "loss": 0.9217, + "mean_token_accuracy": 0.6804017089307308, + "num_tokens": 857486.0, + "step": 13220 + }, + { + "epoch": 12.832, + "grad_norm": 1.011649489402771, + "learning_rate": 6.818439854426151e-07, + "loss": 0.9386, + "mean_token_accuracy": 0.6823414113372565, + "num_tokens": 868972.0, + "step": 13230 + }, + { + "epoch": 12.84169696969697, + "grad_norm": 0.7872369885444641, + "learning_rate": 6.74253102693333e-07, + "loss": 0.9409, + "mean_token_accuracy": 0.6847406111657619, + "num_tokens": 879178.0, + "step": 13240 + }, + { + "epoch": 12.85139393939394, + "grad_norm": 1.3302205801010132, + "learning_rate": 6.667032365371095e-07, + "loss": 0.9514, + "mean_token_accuracy": 0.6746706318110227, + "num_tokens": 890112.0, + "step": 13250 + }, + { + "epoch": 12.861090909090908, + "grad_norm": 0.7299315333366394, + "learning_rate": 6.591944201798394e-07, + "loss": 0.8983, + "mean_token_accuracy": 0.6949192993342876, + "num_tokens": 900105.0, + "step": 13260 + }, + { + "epoch": 12.870787878787878, + "grad_norm": 0.9053242206573486, + "learning_rate": 6.517266866468741e-07, + "loss": 0.9662, + "mean_token_accuracy": 0.6785097420215607, + "num_tokens": 909781.0, + "step": 13270 + }, + { + "epoch": 12.880484848484848, + "grad_norm": 1.5465375185012817, + "learning_rate": 6.443000687828737e-07, + "loss": 0.9076, + "mean_token_accuracy": 0.6935414470732212, + "num_tokens": 920332.0, + "step": 13280 + }, + { + "epoch": 12.890181818181818, + "grad_norm": 0.9741002917289734, + "learning_rate": 6.369145992516635e-07, + "loss": 0.9533, + "mean_token_accuracy": 0.6718010984361171, + "num_tokens": 930800.0, + "step": 13290 + }, + { + "epoch": 12.899878787878787, + "grad_norm": 1.4398901462554932, + "learning_rate": 6.295703105360884e-07, + "loss": 0.9613, + "mean_token_accuracy": 0.6741296485066414, + "num_tokens": 942822.0, + "step": 13300 + }, + { + "epoch": 12.909575757575757, + "grad_norm": 0.8408631086349487, + "learning_rate": 6.222672349378711e-07, + "loss": 0.8839, + "mean_token_accuracy": 0.6957414381206035, + "num_tokens": 953151.0, + "step": 13310 + }, + { + "epoch": 12.919272727272727, + "grad_norm": 1.185342788696289, + "learning_rate": 6.150054045774745e-07, + "loss": 0.9431, + "mean_token_accuracy": 0.6786404684185982, + "num_tokens": 963817.0, + "step": 13320 + }, + { + "epoch": 12.928969696969697, + "grad_norm": 1.5377130508422852, + "learning_rate": 6.07784851393951e-07, + "loss": 0.9263, + "mean_token_accuracy": 0.6862830605357886, + "num_tokens": 974618.0, + "step": 13330 + }, + { + "epoch": 12.938666666666666, + "grad_norm": 2.0658161640167236, + "learning_rate": 6.006056071448119e-07, + "loss": 0.8625, + "mean_token_accuracy": 0.7110202703624964, + "num_tokens": 984540.0, + "step": 13340 + }, + { + "epoch": 12.948363636363636, + "grad_norm": 1.0002696514129639, + "learning_rate": 5.934677034058789e-07, + "loss": 0.9983, + "mean_token_accuracy": 0.6710415873676538, + "num_tokens": 995538.0, + "step": 13350 + }, + { + "epoch": 12.958060606060606, + "grad_norm": 0.6808292269706726, + "learning_rate": 5.863711715711507e-07, + "loss": 0.9357, + "mean_token_accuracy": 0.6868117332458497, + "num_tokens": 1005955.0, + "step": 13360 + }, + { + "epoch": 12.967757575757576, + "grad_norm": 1.43692946434021, + "learning_rate": 5.793160428526678e-07, + "loss": 0.9581, + "mean_token_accuracy": 0.6872004386037588, + "num_tokens": 1017901.0, + "step": 13370 + }, + { + "epoch": 12.977454545454545, + "grad_norm": 1.1382737159729004, + "learning_rate": 5.723023482803658e-07, + "loss": 0.8893, + "mean_token_accuracy": 0.6952810846269131, + "num_tokens": 1027791.0, + "step": 13380 + }, + { + "epoch": 12.987151515151515, + "grad_norm": 1.5918898582458496, + "learning_rate": 5.653301187019455e-07, + "loss": 0.8051, + "mean_token_accuracy": 0.7371663119643926, + "num_tokens": 1037438.0, + "step": 13390 + }, + { + "epoch": 12.996848484848485, + "grad_norm": 1.294746994972229, + "learning_rate": 5.583993847827363e-07, + "loss": 0.9868, + "mean_token_accuracy": 0.6542905114591122, + "num_tokens": 1048425.0, + "step": 13400 + }, + { + "epoch": 13.006787878787879, + "grad_norm": 1.0259826183319092, + "learning_rate": 5.515101770055653e-07, + "loss": 1.1464, + "mean_token_accuracy": 0.6516239614021487, + "num_tokens": 1059946.0, + "step": 13410 + }, + { + "epoch": 13.016484848484849, + "grad_norm": 1.570686936378479, + "learning_rate": 5.446625256706095e-07, + "loss": 0.9854, + "mean_token_accuracy": 0.6571170825511217, + "num_tokens": 1071512.0, + "step": 13420 + }, + { + "epoch": 13.026181818181819, + "grad_norm": 1.4056403636932373, + "learning_rate": 5.378564608952786e-07, + "loss": 1.0107, + "mean_token_accuracy": 0.6513097662478685, + "num_tokens": 1082669.0, + "step": 13430 + }, + { + "epoch": 13.035878787878788, + "grad_norm": 1.19424307346344, + "learning_rate": 5.310920126140773e-07, + "loss": 0.9449, + "mean_token_accuracy": 0.6799818322062492, + "num_tokens": 1093470.0, + "step": 13440 + }, + { + "epoch": 13.045575757575758, + "grad_norm": 1.1541939973831177, + "learning_rate": 5.243692105784682e-07, + "loss": 1.0241, + "mean_token_accuracy": 0.6430629625916481, + "num_tokens": 1105089.0, + "step": 13450 + }, + { + "epoch": 13.055272727272728, + "grad_norm": 0.825744092464447, + "learning_rate": 5.176880843567455e-07, + "loss": 0.9137, + "mean_token_accuracy": 0.6785864185541868, + "num_tokens": 1115643.0, + "step": 13460 + }, + { + "epoch": 13.064969696969698, + "grad_norm": 1.2951405048370361, + "learning_rate": 5.110486633339062e-07, + "loss": 0.972, + "mean_token_accuracy": 0.6606432240456342, + "num_tokens": 1126575.0, + "step": 13470 + }, + { + "epoch": 13.074666666666667, + "grad_norm": 0.8548156023025513, + "learning_rate": 5.044509767115158e-07, + "loss": 0.9143, + "mean_token_accuracy": 0.6806200005114078, + "num_tokens": 1137317.0, + "step": 13480 + }, + { + "epoch": 13.084363636363637, + "grad_norm": 1.2886772155761719, + "learning_rate": 4.978950535075878e-07, + "loss": 0.8903, + "mean_token_accuracy": 0.7035336244851351, + "num_tokens": 1148065.0, + "step": 13490 + }, + { + "epoch": 13.094060606060607, + "grad_norm": 1.966200351715088, + "learning_rate": 4.913809225564492e-07, + "loss": 0.9073, + "mean_token_accuracy": 0.701976515352726, + "num_tokens": 1158621.0, + "step": 13500 + }, + { + "epoch": 13.103757575757577, + "grad_norm": 0.8389899134635925, + "learning_rate": 4.849086125086156e-07, + "loss": 0.9414, + "mean_token_accuracy": 0.6894888635724783, + "num_tokens": 1168890.0, + "step": 13510 + }, + { + "epoch": 13.113454545454545, + "grad_norm": 0.9758931994438171, + "learning_rate": 4.784781518306624e-07, + "loss": 0.9384, + "mean_token_accuracy": 0.6737278677523136, + "num_tokens": 1178584.0, + "step": 13520 + }, + { + "epoch": 13.123151515151514, + "grad_norm": 1.0330685377120972, + "learning_rate": 4.720895688051108e-07, + "loss": 0.9524, + "mean_token_accuracy": 0.6780954591929913, + "num_tokens": 1189454.0, + "step": 13530 + }, + { + "epoch": 13.132848484848484, + "grad_norm": 1.9264168739318848, + "learning_rate": 4.657428915302875e-07, + "loss": 0.8363, + "mean_token_accuracy": 0.720489464327693, + "num_tokens": 1199557.0, + "step": 13540 + }, + { + "epoch": 13.142545454545454, + "grad_norm": 0.7110128402709961, + "learning_rate": 4.594381479202137e-07, + "loss": 0.9138, + "mean_token_accuracy": 0.6820375476032495, + "num_tokens": 1210005.0, + "step": 13550 + }, + { + "epoch": 13.152242424242424, + "grad_norm": 1.9913625717163086, + "learning_rate": 4.531753657044735e-07, + "loss": 0.9352, + "mean_token_accuracy": 0.6837764341384173, + "num_tokens": 1220507.0, + "step": 13560 + }, + { + "epoch": 13.161939393939393, + "grad_norm": 1.4394137859344482, + "learning_rate": 4.469545724280988e-07, + "loss": 0.9389, + "mean_token_accuracy": 0.6835528288036585, + "num_tokens": 1231088.0, + "step": 13570 + }, + { + "epoch": 13.171636363636363, + "grad_norm": 1.118189811706543, + "learning_rate": 4.407757954514458e-07, + "loss": 0.9182, + "mean_token_accuracy": 0.69982905164361, + "num_tokens": 1241297.0, + "step": 13580 + }, + { + "epoch": 13.181333333333333, + "grad_norm": 0.6542367935180664, + "learning_rate": 4.3463906195007066e-07, + "loss": 0.8837, + "mean_token_accuracy": 0.700026823580265, + "num_tokens": 1250943.0, + "step": 13590 + }, + { + "epoch": 13.191030303030303, + "grad_norm": 0.5948226451873779, + "learning_rate": 4.285443989146176e-07, + "loss": 1.0363, + "mean_token_accuracy": 0.6513338401913643, + "num_tokens": 1262487.0, + "step": 13600 + }, + { + "epoch": 13.200727272727272, + "grad_norm": 1.0918562412261963, + "learning_rate": 4.5e-05, + "loss": 0.9926, + "mean_token_accuracy": 0.6507623802870512, + "num_tokens": 11060.0, + "step": 13610 + }, + { + "epoch": 13.210424242424242, + "grad_norm": 1.453194499015808, + "learning_rate": 9.5e-05, + "loss": 0.8771, + "mean_token_accuracy": 0.6994029752910137, + "num_tokens": 21506.0, + "step": 13620 + }, + { + "epoch": 13.220121212121212, + "grad_norm": 2.374359130859375, + "learning_rate": 9.995495495495496e-05, + "loss": 0.9635, + "mean_token_accuracy": 0.6764267832040787, + "num_tokens": 32669.0, + "step": 13630 + }, + { + "epoch": 13.229818181818182, + "grad_norm": 1.6310795545578003, + "learning_rate": 9.990490490490491e-05, + "loss": 0.8954, + "mean_token_accuracy": 0.7067163821309805, + "num_tokens": 42938.0, + "step": 13640 + }, + { + "epoch": 13.239515151515151, + "grad_norm": 0.8453378677368164, + "learning_rate": 9.985485485485487e-05, + "loss": 0.9392, + "mean_token_accuracy": 0.68135135024786, + "num_tokens": 53420.0, + "step": 13650 + }, + { + "epoch": 13.249212121212121, + "grad_norm": 2.5701301097869873, + "learning_rate": 9.980480480480481e-05, + "loss": 0.8713, + "mean_token_accuracy": 0.7125500839203596, + "num_tokens": 62912.0, + "step": 13660 + }, + { + "epoch": 13.258909090909091, + "grad_norm": 1.7641572952270508, + "learning_rate": 9.975475475475477e-05, + "loss": 0.9947, + "mean_token_accuracy": 0.6635019164532423, + "num_tokens": 73599.0, + "step": 13670 + }, + { + "epoch": 13.26860606060606, + "grad_norm": 2.168328046798706, + "learning_rate": 9.970470470470471e-05, + "loss": 0.9706, + "mean_token_accuracy": 0.688429095223546, + "num_tokens": 83750.0, + "step": 13680 + }, + { + "epoch": 13.27830303030303, + "grad_norm": 1.4071749448776245, + "learning_rate": 9.965465465465466e-05, + "loss": 0.9676, + "mean_token_accuracy": 0.6768725138157606, + "num_tokens": 93823.0, + "step": 13690 + }, + { + "epoch": 13.288, + "grad_norm": 1.1499977111816406, + "learning_rate": 9.960460460460461e-05, + "loss": 0.9889, + "mean_token_accuracy": 0.6712037593126297, + "num_tokens": 103911.0, + "step": 13700 + }, + { + "epoch": 13.29769696969697, + "grad_norm": 1.2642593383789062, + "learning_rate": 9.955455455455456e-05, + "loss": 0.9786, + "mean_token_accuracy": 0.6803277429193259, + "num_tokens": 114296.0, + "step": 13710 + }, + { + "epoch": 13.30739393939394, + "grad_norm": 0.9675585627555847, + "learning_rate": 9.950450450450451e-05, + "loss": 0.8952, + "mean_token_accuracy": 0.6976213902235031, + "num_tokens": 123697.0, + "step": 13720 + }, + { + "epoch": 13.31709090909091, + "grad_norm": 1.5083271265029907, + "learning_rate": 9.945445445445446e-05, + "loss": 0.9518, + "mean_token_accuracy": 0.6812848944216967, + "num_tokens": 133976.0, + "step": 13730 + }, + { + "epoch": 13.32678787878788, + "grad_norm": 1.0912386178970337, + "learning_rate": 9.94044044044044e-05, + "loss": 0.9224, + "mean_token_accuracy": 0.6897901255637408, + "num_tokens": 143868.0, + "step": 13740 + }, + { + "epoch": 13.336484848484849, + "grad_norm": 1.7375333309173584, + "learning_rate": 9.935435435435436e-05, + "loss": 0.9107, + "mean_token_accuracy": 0.706351314485073, + "num_tokens": 154106.0, + "step": 13750 + }, + { + "epoch": 13.346181818181819, + "grad_norm": 1.1665840148925781, + "learning_rate": 9.930430430430431e-05, + "loss": 0.941, + "mean_token_accuracy": 0.6972976390272378, + "num_tokens": 164042.0, + "step": 13760 + }, + { + "epoch": 13.355878787878789, + "grad_norm": 1.7706063985824585, + "learning_rate": 9.925425425425427e-05, + "loss": 0.8943, + "mean_token_accuracy": 0.7054846830666065, + "num_tokens": 174506.0, + "step": 13770 + }, + { + "epoch": 13.365575757575758, + "grad_norm": 0.5767163038253784, + "learning_rate": 9.920420420420421e-05, + "loss": 0.9656, + "mean_token_accuracy": 0.6823426600545645, + "num_tokens": 185338.0, + "step": 13780 + }, + { + "epoch": 13.375272727272728, + "grad_norm": 1.4523296356201172, + "learning_rate": 9.915415415415416e-05, + "loss": 0.9469, + "mean_token_accuracy": 0.6681080140173435, + "num_tokens": 195763.0, + "step": 13790 + }, + { + "epoch": 13.384969696969698, + "grad_norm": 0.7047093510627747, + "learning_rate": 9.910410410410411e-05, + "loss": 0.9216, + "mean_token_accuracy": 0.6868221748620271, + "num_tokens": 205612.0, + "step": 13800 + }, + { + "epoch": 13.394666666666666, + "grad_norm": 0.7028587460517883, + "learning_rate": 9.905405405405406e-05, + "loss": 0.9799, + "mean_token_accuracy": 0.6751956883817911, + "num_tokens": 215645.0, + "step": 13810 + }, + { + "epoch": 13.404363636363636, + "grad_norm": 0.9091927409172058, + "learning_rate": 9.900400400400401e-05, + "loss": 0.9413, + "mean_token_accuracy": 0.6909396957606078, + "num_tokens": 225530.0, + "step": 13820 + }, + { + "epoch": 13.414060606060605, + "grad_norm": 1.0086578130722046, + "learning_rate": 9.895395395395396e-05, + "loss": 0.964, + "mean_token_accuracy": 0.6786475393921136, + "num_tokens": 236313.0, + "step": 13830 + }, + { + "epoch": 13.423757575757575, + "grad_norm": 1.5697195529937744, + "learning_rate": 9.89039039039039e-05, + "loss": 0.9573, + "mean_token_accuracy": 0.6727604184299707, + "num_tokens": 246884.0, + "step": 13840 + }, + { + "epoch": 13.433454545454545, + "grad_norm": 0.8102120161056519, + "learning_rate": 9.885385385385386e-05, + "loss": 0.9226, + "mean_token_accuracy": 0.6831782024353743, + "num_tokens": 256990.0, + "step": 13850 + }, + { + "epoch": 13.443151515151515, + "grad_norm": 0.9028761982917786, + "learning_rate": 9.880380380380381e-05, + "loss": 0.8753, + "mean_token_accuracy": 0.7016171887516975, + "num_tokens": 266786.0, + "step": 13860 + }, + { + "epoch": 13.452848484848484, + "grad_norm": 1.2319331169128418, + "learning_rate": 9.875375375375377e-05, + "loss": 0.9452, + "mean_token_accuracy": 0.6777403865009546, + "num_tokens": 278066.0, + "step": 13870 + }, + { + "epoch": 13.462545454545454, + "grad_norm": 1.340330719947815, + "learning_rate": 9.870370370370371e-05, + "loss": 0.8887, + "mean_token_accuracy": 0.6937030091881752, + "num_tokens": 287644.0, + "step": 13880 + }, + { + "epoch": 13.472242424242424, + "grad_norm": 2.107584238052368, + "learning_rate": 9.865365365365366e-05, + "loss": 0.9805, + "mean_token_accuracy": 0.6581023618578911, + "num_tokens": 299167.0, + "step": 13890 + }, + { + "epoch": 13.481939393939394, + "grad_norm": 1.3416616916656494, + "learning_rate": 9.860360360360361e-05, + "loss": 0.9588, + "mean_token_accuracy": 0.6862040366977453, + "num_tokens": 310325.0, + "step": 13900 + }, + { + "epoch": 13.491636363636363, + "grad_norm": 0.7638229727745056, + "learning_rate": 9.855355355355356e-05, + "loss": 0.9199, + "mean_token_accuracy": 0.6865271601825953, + "num_tokens": 320799.0, + "step": 13910 + }, + { + "epoch": 13.501333333333333, + "grad_norm": 1.8613024950027466, + "learning_rate": 9.850350350350351e-05, + "loss": 0.9029, + "mean_token_accuracy": 0.7134368922561407, + "num_tokens": 331742.0, + "step": 13920 + }, + { + "epoch": 13.511030303030303, + "grad_norm": 0.8470885753631592, + "learning_rate": 9.845345345345346e-05, + "loss": 0.9985, + "mean_token_accuracy": 0.6461464431136846, + "num_tokens": 342008.0, + "step": 13930 + }, + { + "epoch": 13.520727272727273, + "grad_norm": 1.4289556741714478, + "learning_rate": 9.84034034034034e-05, + "loss": 1.0229, + "mean_token_accuracy": 0.6607601415365935, + "num_tokens": 352783.0, + "step": 13940 + }, + { + "epoch": 13.530424242424242, + "grad_norm": 1.1315350532531738, + "learning_rate": 9.835335335335336e-05, + "loss": 0.8668, + "mean_token_accuracy": 0.7064531348645687, + "num_tokens": 362703.0, + "step": 13950 + }, + { + "epoch": 13.540121212121212, + "grad_norm": 0.8690136671066284, + "learning_rate": 9.83033033033033e-05, + "loss": 0.8763, + "mean_token_accuracy": 0.7114055767655373, + "num_tokens": 372732.0, + "step": 13960 + }, + { + "epoch": 13.549818181818182, + "grad_norm": 0.9560481905937195, + "learning_rate": 9.825325325325326e-05, + "loss": 0.9223, + "mean_token_accuracy": 0.6863605052232742, + "num_tokens": 382785.0, + "step": 13970 + }, + { + "epoch": 13.559515151515152, + "grad_norm": 1.053054928779602, + "learning_rate": 9.820320320320321e-05, + "loss": 0.9598, + "mean_token_accuracy": 0.6758723571896553, + "num_tokens": 393871.0, + "step": 13980 + }, + { + "epoch": 13.569212121212122, + "grad_norm": 0.4731355905532837, + "learning_rate": 9.815315315315316e-05, + "loss": 0.9456, + "mean_token_accuracy": 0.6892194643616676, + "num_tokens": 404378.0, + "step": 13990 + }, + { + "epoch": 13.578909090909091, + "grad_norm": 1.9100712537765503, + "learning_rate": 9.810310310310311e-05, + "loss": 1.011, + "mean_token_accuracy": 0.6585861250758172, + "num_tokens": 415762.0, + "step": 14000 + }, + { + "epoch": 13.588606060606061, + "grad_norm": 0.987190842628479, + "learning_rate": 9.805305305305306e-05, + "loss": 0.8616, + "mean_token_accuracy": 0.7012909840792417, + "num_tokens": 425425.0, + "step": 14010 + }, + { + "epoch": 13.59830303030303, + "grad_norm": 0.8835279941558838, + "learning_rate": 9.8003003003003e-05, + "loss": 0.9801, + "mean_token_accuracy": 0.6575286597013473, + "num_tokens": 435861.0, + "step": 14020 + }, + { + "epoch": 13.608, + "grad_norm": 0.9478653073310852, + "learning_rate": 9.795295295295296e-05, + "loss": 0.9283, + "mean_token_accuracy": 0.6895153563469648, + "num_tokens": 446411.0, + "step": 14030 + }, + { + "epoch": 13.61769696969697, + "grad_norm": 0.8801679015159607, + "learning_rate": 9.79029029029029e-05, + "loss": 0.9621, + "mean_token_accuracy": 0.68089236356318, + "num_tokens": 457521.0, + "step": 14040 + }, + { + "epoch": 13.62739393939394, + "grad_norm": 0.7246169447898865, + "learning_rate": 9.785285285285286e-05, + "loss": 0.915, + "mean_token_accuracy": 0.6914402432739735, + "num_tokens": 467230.0, + "step": 14050 + }, + { + "epoch": 13.63709090909091, + "grad_norm": 1.023116946220398, + "learning_rate": 9.78028028028028e-05, + "loss": 1.0017, + "mean_token_accuracy": 0.6633546780794859, + "num_tokens": 478815.0, + "step": 14060 + }, + { + "epoch": 13.64678787878788, + "grad_norm": 1.2296099662780762, + "learning_rate": 9.775275275275276e-05, + "loss": 0.9853, + "mean_token_accuracy": 0.6748053282499313, + "num_tokens": 488901.0, + "step": 14070 + }, + { + "epoch": 13.656484848484848, + "grad_norm": 0.9308061003684998, + "learning_rate": 9.770270270270272e-05, + "loss": 0.872, + "mean_token_accuracy": 0.7030160129070282, + "num_tokens": 499156.0, + "step": 14080 + }, + { + "epoch": 13.666181818181819, + "grad_norm": 1.4838083982467651, + "learning_rate": 9.765265265265266e-05, + "loss": 0.918, + "mean_token_accuracy": 0.6922544561326504, + "num_tokens": 508618.0, + "step": 14090 + }, + { + "epoch": 13.675878787878787, + "grad_norm": 0.6036433577537537, + "learning_rate": 9.760260260260262e-05, + "loss": 0.9253, + "mean_token_accuracy": 0.6886366963386535, + "num_tokens": 519918.0, + "step": 14100 + }, + { + "epoch": 13.685575757575757, + "grad_norm": 0.848430871963501, + "learning_rate": 9.755255255255256e-05, + "loss": 0.9634, + "mean_token_accuracy": 0.6708800371736288, + "num_tokens": 529716.0, + "step": 14110 + }, + { + "epoch": 13.695272727272727, + "grad_norm": 0.7561900019645691, + "learning_rate": 9.75025025025025e-05, + "loss": 0.8676, + "mean_token_accuracy": 0.6998249750584364, + "num_tokens": 539041.0, + "step": 14120 + }, + { + "epoch": 13.704969696969696, + "grad_norm": 0.8211101293563843, + "learning_rate": 9.745245245245246e-05, + "loss": 0.9797, + "mean_token_accuracy": 0.6581344068050384, + "num_tokens": 549883.0, + "step": 14130 + }, + { + "epoch": 13.714666666666666, + "grad_norm": 1.2751184701919556, + "learning_rate": 9.74024024024024e-05, + "loss": 0.9024, + "mean_token_accuracy": 0.6988375499844551, + "num_tokens": 560364.0, + "step": 14140 + }, + { + "epoch": 13.724363636363636, + "grad_norm": 0.7292294502258301, + "learning_rate": 9.735235235235236e-05, + "loss": 0.8688, + "mean_token_accuracy": 0.6985570065677166, + "num_tokens": 570146.0, + "step": 14150 + }, + { + "epoch": 13.734060606060606, + "grad_norm": 1.0787569284439087, + "learning_rate": 9.73023023023023e-05, + "loss": 0.9108, + "mean_token_accuracy": 0.6816088363528252, + "num_tokens": 580097.0, + "step": 14160 + }, + { + "epoch": 13.743757575757575, + "grad_norm": 0.5591951012611389, + "learning_rate": 9.725225225225225e-05, + "loss": 1.0228, + "mean_token_accuracy": 0.6424524009227752, + "num_tokens": 591054.0, + "step": 14170 + }, + { + "epoch": 13.753454545454545, + "grad_norm": 0.950010359287262, + "learning_rate": 9.72022022022022e-05, + "loss": 0.8811, + "mean_token_accuracy": 0.7032374102622271, + "num_tokens": 600799.0, + "step": 14180 + }, + { + "epoch": 13.763151515151515, + "grad_norm": 0.4867992103099823, + "learning_rate": 9.715215215215216e-05, + "loss": 0.9002, + "mean_token_accuracy": 0.6991371564567089, + "num_tokens": 611008.0, + "step": 14190 + }, + { + "epoch": 13.772848484848485, + "grad_norm": 0.5358482003211975, + "learning_rate": 9.710210210210212e-05, + "loss": 0.9305, + "mean_token_accuracy": 0.6951459109783172, + "num_tokens": 621483.0, + "step": 14200 + }, + { + "epoch": 13.782545454545454, + "grad_norm": 0.8481453657150269, + "learning_rate": 9.705205205205206e-05, + "loss": 0.9505, + "mean_token_accuracy": 0.6874804452061654, + "num_tokens": 633074.0, + "step": 14210 + }, + { + "epoch": 13.792242424242424, + "grad_norm": 0.664574146270752, + "learning_rate": 9.7002002002002e-05, + "loss": 0.971, + "mean_token_accuracy": 0.6761138528585434, + "num_tokens": 644108.0, + "step": 14220 + }, + { + "epoch": 13.801939393939394, + "grad_norm": 0.6939647793769836, + "learning_rate": 9.695195195195196e-05, + "loss": 0.9171, + "mean_token_accuracy": 0.6944379203021527, + "num_tokens": 654249.0, + "step": 14230 + }, + { + "epoch": 13.811636363636364, + "grad_norm": 0.6086325645446777, + "learning_rate": 9.69019019019019e-05, + "loss": 0.9794, + "mean_token_accuracy": 0.6574176583439112, + "num_tokens": 664735.0, + "step": 14240 + }, + { + "epoch": 13.821333333333333, + "grad_norm": 2.137354612350464, + "learning_rate": 9.685185185185186e-05, + "loss": 0.9202, + "mean_token_accuracy": 0.6800346210598945, + "num_tokens": 675580.0, + "step": 14250 + }, + { + "epoch": 13.831030303030303, + "grad_norm": 1.0914839506149292, + "learning_rate": 9.68018018018018e-05, + "loss": 0.9407, + "mean_token_accuracy": 0.6815901666879653, + "num_tokens": 685042.0, + "step": 14260 + }, + { + "epoch": 13.840727272727273, + "grad_norm": 0.9622077345848083, + "learning_rate": 9.675175175175175e-05, + "loss": 0.9412, + "mean_token_accuracy": 0.6896888021379709, + "num_tokens": 695452.0, + "step": 14270 + }, + { + "epoch": 13.850424242424243, + "grad_norm": 0.5911729335784912, + "learning_rate": 9.67017017017017e-05, + "loss": 0.9476, + "mean_token_accuracy": 0.6852936699986458, + "num_tokens": 706283.0, + "step": 14280 + }, + { + "epoch": 13.860121212121213, + "grad_norm": 1.0763121843338013, + "learning_rate": 9.665165165165166e-05, + "loss": 0.8593, + "mean_token_accuracy": 0.70830412581563, + "num_tokens": 715851.0, + "step": 14290 + }, + { + "epoch": 13.869818181818182, + "grad_norm": 0.7274637818336487, + "learning_rate": 9.660160160160162e-05, + "loss": 1.0047, + "mean_token_accuracy": 0.6792124062776566, + "num_tokens": 727035.0, + "step": 14300 + }, + { + "epoch": 13.879515151515152, + "grad_norm": 0.6750665903091431, + "learning_rate": 9.655155155155156e-05, + "loss": 0.9299, + "mean_token_accuracy": 0.6856705665588378, + "num_tokens": 737554.0, + "step": 14310 + }, + { + "epoch": 13.889212121212122, + "grad_norm": 0.6934303641319275, + "learning_rate": 9.65015015015015e-05, + "loss": 0.903, + "mean_token_accuracy": 0.6835582558065653, + "num_tokens": 747852.0, + "step": 14320 + }, + { + "epoch": 13.898909090909092, + "grad_norm": 0.5132259726524353, + "learning_rate": 9.645145145145146e-05, + "loss": 0.9779, + "mean_token_accuracy": 0.6827262349426746, + "num_tokens": 757970.0, + "step": 14330 + }, + { + "epoch": 13.908606060606061, + "grad_norm": 0.46078333258628845, + "learning_rate": 9.64014014014014e-05, + "loss": 0.9377, + "mean_token_accuracy": 0.684358200058341, + "num_tokens": 768757.0, + "step": 14340 + }, + { + "epoch": 13.918303030303031, + "grad_norm": 0.555814266204834, + "learning_rate": 9.635135135135136e-05, + "loss": 0.9762, + "mean_token_accuracy": 0.6606349345296622, + "num_tokens": 780206.0, + "step": 14350 + }, + { + "epoch": 13.928, + "grad_norm": 0.8341594338417053, + "learning_rate": 9.63013013013013e-05, + "loss": 0.9645, + "mean_token_accuracy": 0.6715805854648351, + "num_tokens": 790961.0, + "step": 14360 + }, + { + "epoch": 13.937696969696969, + "grad_norm": 0.6067021489143372, + "learning_rate": 9.625125125125125e-05, + "loss": 0.9145, + "mean_token_accuracy": 0.6793199084699154, + "num_tokens": 800987.0, + "step": 14370 + }, + { + "epoch": 13.947393939393939, + "grad_norm": 0.7952314019203186, + "learning_rate": 9.62012012012012e-05, + "loss": 0.9162, + "mean_token_accuracy": 0.69475242421031, + "num_tokens": 811828.0, + "step": 14380 + }, + { + "epoch": 13.957090909090908, + "grad_norm": 0.8746843934059143, + "learning_rate": 9.615115115115115e-05, + "loss": 0.8681, + "mean_token_accuracy": 0.7030756626278162, + "num_tokens": 822958.0, + "step": 14390 + }, + { + "epoch": 13.966787878787878, + "grad_norm": 0.4334689974784851, + "learning_rate": 9.61011011011011e-05, + "loss": 0.9797, + "mean_token_accuracy": 0.6570104032754898, + "num_tokens": 834206.0, + "step": 14400 + }, + { + "epoch": 13.976484848484848, + "grad_norm": 0.5802099108695984, + "learning_rate": 9.605105105105106e-05, + "loss": 0.9076, + "mean_token_accuracy": 0.6986728705465793, + "num_tokens": 845031.0, + "step": 14410 + }, + { + "epoch": 13.986181818181818, + "grad_norm": 0.41924917697906494, + "learning_rate": 9.6001001001001e-05, + "loss": 0.9134, + "mean_token_accuracy": 0.6965976521372795, + "num_tokens": 854691.0, + "step": 14420 + }, + { + "epoch": 13.995878787878787, + "grad_norm": 0.4162426292896271, + "learning_rate": 9.595095095095096e-05, + "loss": 1.0075, + "mean_token_accuracy": 0.6658117517828941, + "num_tokens": 865141.0, + "step": 14430 + }, + { + "epoch": 14.005818181818182, + "grad_norm": 0.6385387182235718, + "learning_rate": 9.59009009009009e-05, + "loss": 1.0001, + "mean_token_accuracy": 0.6936045238157598, + "num_tokens": 876494.0, + "step": 14440 + }, + { + "epoch": 14.015515151515151, + "grad_norm": 0.6041902303695679, + "learning_rate": 9.585085085085086e-05, + "loss": 0.9238, + "mean_token_accuracy": 0.6878648042678833, + "num_tokens": 886603.0, + "step": 14450 + }, + { + "epoch": 14.025212121212121, + "grad_norm": 0.9639670252799988, + "learning_rate": 9.58008008008008e-05, + "loss": 1.0331, + "mean_token_accuracy": 0.6559940252453089, + "num_tokens": 898033.0, + "step": 14460 + }, + { + "epoch": 14.03490909090909, + "grad_norm": 0.5883612036705017, + "learning_rate": 9.575075075075075e-05, + "loss": 0.9764, + "mean_token_accuracy": 0.6728239644318819, + "num_tokens": 909064.0, + "step": 14470 + }, + { + "epoch": 14.04460606060606, + "grad_norm": 0.8372961282730103, + "learning_rate": 9.57007007007007e-05, + "loss": 0.9061, + "mean_token_accuracy": 0.6874277569353581, + "num_tokens": 919744.0, + "step": 14480 + }, + { + "epoch": 14.05430303030303, + "grad_norm": 1.7760860919952393, + "learning_rate": 9.565065065065065e-05, + "loss": 0.9198, + "mean_token_accuracy": 0.6859086826443672, + "num_tokens": 930349.0, + "step": 14490 + }, + { + "epoch": 14.064, + "grad_norm": 0.5744428634643555, + "learning_rate": 9.56006006006006e-05, + "loss": 0.9518, + "mean_token_accuracy": 0.6734505753964186, + "num_tokens": 940791.0, + "step": 14500 + }, + { + "epoch": 14.07369696969697, + "grad_norm": 0.9980311989784241, + "learning_rate": 9.555055055055056e-05, + "loss": 0.903, + "mean_token_accuracy": 0.6935466017574072, + "num_tokens": 951362.0, + "step": 14510 + }, + { + "epoch": 14.08339393939394, + "grad_norm": 0.6623931527137756, + "learning_rate": 9.55005005005005e-05, + "loss": 0.8918, + "mean_token_accuracy": 0.6993258882313966, + "num_tokens": 962286.0, + "step": 14520 + }, + { + "epoch": 14.09309090909091, + "grad_norm": 0.4992653429508209, + "learning_rate": 9.545045045045046e-05, + "loss": 0.9309, + "mean_token_accuracy": 0.6649952068924904, + "num_tokens": 972644.0, + "step": 14530 + }, + { + "epoch": 14.102787878787879, + "grad_norm": 0.4818670153617859, + "learning_rate": 9.54004004004004e-05, + "loss": 0.9225, + "mean_token_accuracy": 0.6798055626451969, + "num_tokens": 983056.0, + "step": 14540 + }, + { + "epoch": 14.112484848484849, + "grad_norm": 0.9694674015045166, + "learning_rate": 9.535035035035036e-05, + "loss": 0.8962, + "mean_token_accuracy": 0.6953587524592877, + "num_tokens": 992490.0, + "step": 14550 + }, + { + "epoch": 14.122181818181819, + "grad_norm": 0.8076632618904114, + "learning_rate": 9.53003003003003e-05, + "loss": 0.9096, + "mean_token_accuracy": 0.6695175170898438, + "num_tokens": 1003185.0, + "step": 14560 + }, + { + "epoch": 14.131878787878788, + "grad_norm": 0.39989814162254333, + "learning_rate": 9.525025025025025e-05, + "loss": 0.8959, + "mean_token_accuracy": 0.6875695057213307, + "num_tokens": 1013510.0, + "step": 14570 + }, + { + "epoch": 14.141575757575758, + "grad_norm": 0.5998600721359253, + "learning_rate": 9.52002002002002e-05, + "loss": 0.9311, + "mean_token_accuracy": 0.6833312470465899, + "num_tokens": 1023703.0, + "step": 14580 + }, + { + "epoch": 14.151272727272728, + "grad_norm": 1.0913785696029663, + "learning_rate": 9.515015015015015e-05, + "loss": 1.0003, + "mean_token_accuracy": 0.6780395913869143, + "num_tokens": 1035344.0, + "step": 14590 + }, + { + "epoch": 14.160969696969698, + "grad_norm": 0.891591489315033, + "learning_rate": 9.51001001001001e-05, + "loss": 0.9419, + "mean_token_accuracy": 0.6847637005150318, + "num_tokens": 1045341.0, + "step": 14600 + }, + { + "epoch": 14.170666666666667, + "grad_norm": 0.8624415397644043, + "learning_rate": 9.505005005005005e-05, + "loss": 0.8532, + "mean_token_accuracy": 0.7248432952910662, + "num_tokens": 1055979.0, + "step": 14610 + }, + { + "epoch": 14.180363636363637, + "grad_norm": 0.9150317311286926, + "learning_rate": 9.5e-05, + "loss": 0.9745, + "mean_token_accuracy": 0.6698940627276897, + "num_tokens": 1067319.0, + "step": 14620 + }, + { + "epoch": 14.190060606060607, + "grad_norm": 0.41908109188079834, + "learning_rate": 9.494994994994996e-05, + "loss": 0.9568, + "mean_token_accuracy": 0.678890322521329, + "num_tokens": 1078431.0, + "step": 14630 + }, + { + "epoch": 14.199757575757575, + "grad_norm": 0.878993809223175, + "learning_rate": 9.48998998998999e-05, + "loss": 0.9256, + "mean_token_accuracy": 0.6974173996597528, + "num_tokens": 1088662.0, + "step": 14640 + }, + { + "epoch": 14.209454545454545, + "grad_norm": 0.3703934848308563, + "learning_rate": 9.484984984984986e-05, + "loss": 0.9434, + "mean_token_accuracy": 0.6732165481895208, + "num_tokens": 1099362.0, + "step": 14650 + }, + { + "epoch": 14.219151515151514, + "grad_norm": 0.4467850625514984, + "learning_rate": 9.47997997997998e-05, + "loss": 0.9459, + "mean_token_accuracy": 0.6609635852277279, + "num_tokens": 1110047.0, + "step": 14660 + }, + { + "epoch": 14.228848484848484, + "grad_norm": 1.2241610288619995, + "learning_rate": 9.474974974974975e-05, + "loss": 0.9469, + "mean_token_accuracy": 0.6950714159756899, + "num_tokens": 1120542.0, + "step": 14670 + }, + { + "epoch": 14.238545454545454, + "grad_norm": 0.6757529973983765, + "learning_rate": 9.46996996996997e-05, + "loss": 0.9628, + "mean_token_accuracy": 0.6720464017242194, + "num_tokens": 1131343.0, + "step": 14680 + }, + { + "epoch": 14.248242424242424, + "grad_norm": 0.9918266534805298, + "learning_rate": 9.464964964964965e-05, + "loss": 0.9084, + "mean_token_accuracy": 0.6795904841274023, + "num_tokens": 1142007.0, + "step": 14690 + }, + { + "epoch": 14.257939393939393, + "grad_norm": 0.9975070953369141, + "learning_rate": 9.45995995995996e-05, + "loss": 0.806, + "mean_token_accuracy": 0.734311144053936, + "num_tokens": 1151794.0, + "step": 14700 + }, + { + "epoch": 14.267636363636363, + "grad_norm": 0.6164572238922119, + "learning_rate": 9.454954954954955e-05, + "loss": 0.8116, + "mean_token_accuracy": 0.7311961345374585, + "num_tokens": 1161276.0, + "step": 14710 + }, + { + "epoch": 14.277333333333333, + "grad_norm": 0.8973527550697327, + "learning_rate": 9.44994994994995e-05, + "loss": 0.8702, + "mean_token_accuracy": 0.7167054928839207, + "num_tokens": 1171332.0, + "step": 14720 + }, + { + "epoch": 14.287030303030303, + "grad_norm": 0.6523808240890503, + "learning_rate": 9.444944944944946e-05, + "loss": 0.9112, + "mean_token_accuracy": 0.6947918102145195, + "num_tokens": 1181393.0, + "step": 14730 + }, + { + "epoch": 14.296727272727273, + "grad_norm": 0.41433241963386536, + "learning_rate": 9.43993993993994e-05, + "loss": 0.9264, + "mean_token_accuracy": 0.6836030226200819, + "num_tokens": 1191806.0, + "step": 14740 + }, + { + "epoch": 14.306424242424242, + "grad_norm": 0.7625298500061035, + "learning_rate": 9.434934934934936e-05, + "loss": 0.8197, + "mean_token_accuracy": 0.7187630910426378, + "num_tokens": 1200839.0, + "step": 14750 + }, + { + "epoch": 14.316121212121212, + "grad_norm": 0.5743375420570374, + "learning_rate": 9.42992992992993e-05, + "loss": 0.9071, + "mean_token_accuracy": 0.6910372313112021, + "num_tokens": 1211129.0, + "step": 14760 + }, + { + "epoch": 14.325818181818182, + "grad_norm": 1.0408577919006348, + "learning_rate": 9.424924924924925e-05, + "loss": 0.9313, + "mean_token_accuracy": 0.687668776512146, + "num_tokens": 1221591.0, + "step": 14770 + }, + { + "epoch": 14.335515151515152, + "grad_norm": 0.8543786406517029, + "learning_rate": 9.41991991991992e-05, + "loss": 0.9029, + "mean_token_accuracy": 0.7062688145786524, + "num_tokens": 1231723.0, + "step": 14780 + }, + { + "epoch": 14.345212121212121, + "grad_norm": 0.5075017809867859, + "learning_rate": 9.414914914914915e-05, + "loss": 0.8447, + "mean_token_accuracy": 0.7158392701297999, + "num_tokens": 1241710.0, + "step": 14790 + }, + { + "epoch": 14.354909090909091, + "grad_norm": 1.1220818758010864, + "learning_rate": 9.40990990990991e-05, + "loss": 0.9342, + "mean_token_accuracy": 0.6817990552634001, + "num_tokens": 1251618.0, + "step": 14800 + }, + { + "epoch": 7.288492307692308, + "grad_norm": 0.6032423377037048, + "learning_rate": 9.845357679969794e-05, + "loss": 0.6822, + "mean_token_accuracy": 0.7867337457835675, + "num_tokens": 9312.0, + "step": 14810 + }, + { + "epoch": 7.293415384615384, + "grad_norm": 2.23929762840271, + "learning_rate": 9.842759302218645e-05, + "loss": 0.8016, + "mean_token_accuracy": 0.750348436832428, + "num_tokens": 18778.0, + "step": 14820 + }, + { + "epoch": 7.298338461538462, + "grad_norm": 0.8688719868659973, + "learning_rate": 9.840139624995212e-05, + "loss": 0.6881, + "mean_token_accuracy": 0.7714763689786196, + "num_tokens": 27387.0, + "step": 14830 + }, + { + "epoch": 7.3032615384615385, + "grad_norm": 0.6268885135650635, + "learning_rate": 9.837498659821384e-05, + "loss": 0.7321, + "mean_token_accuracy": 0.7611544221639633, + "num_tokens": 36938.0, + "step": 14840 + }, + { + "epoch": 7.308184615384615, + "grad_norm": 0.822592556476593, + "learning_rate": 9.834836418312681e-05, + "loss": 0.744, + "mean_token_accuracy": 0.7452987994998693, + "num_tokens": 45571.0, + "step": 14850 + }, + { + "epoch": 7.3131076923076925, + "grad_norm": 0.4749494194984436, + "learning_rate": 9.8321529121782e-05, + "loss": 0.7403, + "mean_token_accuracy": 0.750314911454916, + "num_tokens": 54765.0, + "step": 14860 + }, + { + "epoch": 7.318030769230769, + "grad_norm": 1.3120962381362915, + "learning_rate": 9.829448153220566e-05, + "loss": 0.761, + "mean_token_accuracy": 0.7358665529638528, + "num_tokens": 63751.0, + "step": 14870 + }, + { + "epoch": 7.322953846153846, + "grad_norm": 0.7016109228134155, + "learning_rate": 9.826722153335877e-05, + "loss": 0.7017, + "mean_token_accuracy": 0.7645948387682437, + "num_tokens": 71817.0, + "step": 14880 + }, + { + "epoch": 7.327876923076923, + "grad_norm": 0.5037406086921692, + "learning_rate": 9.82397492451365e-05, + "loss": 0.7157, + "mean_token_accuracy": 0.7650218937546015, + "num_tokens": 80510.0, + "step": 14890 + }, + { + "epoch": 7.3328, + "grad_norm": 0.6709319353103638, + "learning_rate": 9.821206478836775e-05, + "loss": 0.7248, + "mean_token_accuracy": 0.7560942731797695, + "num_tokens": 89412.0, + "step": 14900 + }, + { + "epoch": 7.337723076923077, + "grad_norm": 1.4935665130615234, + "learning_rate": 9.81841682848146e-05, + "loss": 0.7503, + "mean_token_accuracy": 0.7548914663493633, + "num_tokens": 99256.0, + "step": 14910 + }, + { + "epoch": 7.342646153846154, + "grad_norm": 0.44451966881752014, + "learning_rate": 9.815605985717171e-05, + "loss": 0.7185, + "mean_token_accuracy": 0.7600229732692242, + "num_tokens": 107641.0, + "step": 14920 + }, + { + "epoch": 7.34756923076923, + "grad_norm": 0.5159631371498108, + "learning_rate": 9.812773962906586e-05, + "loss": 0.7593, + "mean_token_accuracy": 0.7515256915241479, + "num_tokens": 116291.0, + "step": 14930 + }, + { + "epoch": 7.352492307692308, + "grad_norm": 1.3890159130096436, + "learning_rate": 9.809920772505532e-05, + "loss": 0.8097, + "mean_token_accuracy": 0.7170861914753914, + "num_tokens": 126012.0, + "step": 14940 + }, + { + "epoch": 7.3574153846153845, + "grad_norm": 1.5582915544509888, + "learning_rate": 9.807046427062944e-05, + "loss": 0.7585, + "mean_token_accuracy": 0.7490797568112612, + "num_tokens": 135364.0, + "step": 14950 + }, + { + "epoch": 7.362338461538462, + "grad_norm": 0.3708029091358185, + "learning_rate": 9.804150939220796e-05, + "loss": 0.7772, + "mean_token_accuracy": 0.7339643765240907, + "num_tokens": 143997.0, + "step": 14960 + }, + { + "epoch": 7.3672615384615385, + "grad_norm": 0.7632699012756348, + "learning_rate": 9.80123432171405e-05, + "loss": 0.7651, + "mean_token_accuracy": 0.7485666394233703, + "num_tokens": 153574.0, + "step": 14970 + }, + { + "epoch": 7.372184615384615, + "grad_norm": 1.4701080322265625, + "learning_rate": 9.798296587370603e-05, + "loss": 0.7292, + "mean_token_accuracy": 0.7644454840570688, + "num_tokens": 162637.0, + "step": 14980 + }, + { + "epoch": 7.377107692307693, + "grad_norm": 0.7957881093025208, + "learning_rate": 9.795337749111229e-05, + "loss": 0.8468, + "mean_token_accuracy": 0.7276211023330689, + "num_tokens": 173011.0, + "step": 14990 + }, + { + "epoch": 7.382030769230769, + "grad_norm": 0.41769880056381226, + "learning_rate": 9.792357819949518e-05, + "loss": 0.7238, + "mean_token_accuracy": 0.7681386031210422, + "num_tokens": 181909.0, + "step": 15000 + }, + { + "epoch": 7.386953846153846, + "grad_norm": 0.7403699159622192, + "learning_rate": 9.881224657674156e-05, + "loss": 0.7858, + "mean_token_accuracy": 0.7396802183240652, + "num_tokens": 8895.0, + "step": 15010 + }, + { + "epoch": 7.391876923076923, + "grad_norm": 0.6027011275291443, + "learning_rate": 9.879515199721796e-05, + "loss": 0.7274, + "mean_token_accuracy": 0.7529193755239248, + "num_tokens": 17998.0, + "step": 15020 + }, + { + "epoch": 7.3968, + "grad_norm": 0.4352588951587677, + "learning_rate": 9.87779367793514e-05, + "loss": 0.7908, + "mean_token_accuracy": 0.7491613268852234, + "num_tokens": 27846.0, + "step": 15030 + }, + { + "epoch": 7.401723076923077, + "grad_norm": 0.39712125062942505, + "learning_rate": 9.87606009657038e-05, + "loss": 0.7353, + "mean_token_accuracy": 0.7678960163146258, + "num_tokens": 36104.0, + "step": 15040 + }, + { + "epoch": 7.406646153846154, + "grad_norm": 0.5739689469337463, + "learning_rate": 9.874314459913522e-05, + "loss": 0.6803, + "mean_token_accuracy": 0.772222863510251, + "num_tokens": 44607.0, + "step": 15050 + }, + { + "epoch": 7.4115692307692305, + "grad_norm": 0.5296592116355896, + "learning_rate": 9.872556772280379e-05, + "loss": 0.6219, + "mean_token_accuracy": 0.7882269717752933, + "num_tokens": 52426.0, + "step": 15060 + }, + { + "epoch": 7.416492307692308, + "grad_norm": 0.7450407147407532, + "learning_rate": 9.870787038016557e-05, + "loss": 0.7046, + "mean_token_accuracy": 0.7562790676951409, + "num_tokens": 60835.0, + "step": 15070 + }, + { + "epoch": 7.4214153846153845, + "grad_norm": 0.8603422045707703, + "learning_rate": 9.869005261497446e-05, + "loss": 0.7464, + "mean_token_accuracy": 0.7453157220035791, + "num_tokens": 70309.0, + "step": 15080 + }, + { + "epoch": 7.426338461538462, + "grad_norm": 0.4799814820289612, + "learning_rate": 9.867211447128208e-05, + "loss": 0.8564, + "mean_token_accuracy": 0.7118423756211996, + "num_tokens": 80831.0, + "step": 15090 + }, + { + "epoch": 7.431261538461539, + "grad_norm": 0.38809236884117126, + "learning_rate": 9.865405599343768e-05, + "loss": 0.778, + "mean_token_accuracy": 0.729843546077609, + "num_tokens": 89878.0, + "step": 15100 + }, + { + "epoch": 7.436184615384615, + "grad_norm": 0.561546802520752, + "learning_rate": 9.863587722608799e-05, + "loss": 0.766, + "mean_token_accuracy": 0.736732891574502, + "num_tokens": 98413.0, + "step": 15110 + }, + { + "epoch": 7.441107692307693, + "grad_norm": 0.4035409986972809, + "learning_rate": 9.861757821417718e-05, + "loss": 0.6529, + "mean_token_accuracy": 0.7860307555645705, + "num_tokens": 106310.0, + "step": 15120 + }, + { + "epoch": 7.446030769230769, + "grad_norm": 1.2474324703216553, + "learning_rate": 9.859915900294666e-05, + "loss": 0.6801, + "mean_token_accuracy": 0.7747167505323886, + "num_tokens": 114567.0, + "step": 15130 + }, + { + "epoch": 7.450953846153846, + "grad_norm": 1.240290880203247, + "learning_rate": 9.858061963793503e-05, + "loss": 0.6493, + "mean_token_accuracy": 0.7812603395432234, + "num_tokens": 123149.0, + "step": 15140 + }, + { + "epoch": 7.455876923076923, + "grad_norm": 0.9319782853126526, + "learning_rate": 9.856196016497798e-05, + "loss": 0.8078, + "mean_token_accuracy": 0.7315979212522506, + "num_tokens": 132265.0, + "step": 15150 + }, + { + "epoch": 7.4608, + "grad_norm": 0.35292956233024597, + "learning_rate": 9.85431806302081e-05, + "loss": 0.7718, + "mean_token_accuracy": 0.7364303342998028, + "num_tokens": 141098.0, + "step": 15160 + }, + { + "epoch": 7.4657230769230765, + "grad_norm": 0.5348508358001709, + "learning_rate": 9.852428108005487e-05, + "loss": 0.7324, + "mean_token_accuracy": 0.7685822080820799, + "num_tokens": 150742.0, + "step": 15170 + }, + { + "epoch": 7.470646153846154, + "grad_norm": 0.9570394158363342, + "learning_rate": 9.850526156124442e-05, + "loss": 0.6739, + "mean_token_accuracy": 0.7785952746868133, + "num_tokens": 159095.0, + "step": 15180 + }, + { + "epoch": 7.4755692307692305, + "grad_norm": 1.611872673034668, + "learning_rate": 9.848612212079955e-05, + "loss": 0.7185, + "mean_token_accuracy": 0.7705470208078623, + "num_tokens": 167922.0, + "step": 15190 + }, + { + "epoch": 7.480492307692308, + "grad_norm": 0.5744756460189819, + "learning_rate": 9.846686280603948e-05, + "loss": 0.8469, + "mean_token_accuracy": 0.724059621617198, + "num_tokens": 177884.0, + "step": 15200 + }, + { + "epoch": 7.485415384615385, + "grad_norm": 0.42839816212654114, + "learning_rate": 9.844748366457988e-05, + "loss": 0.7499, + "mean_token_accuracy": 0.7528812907636165, + "num_tokens": 187133.0, + "step": 15210 + }, + { + "epoch": 7.490338461538461, + "grad_norm": 2.1280364990234375, + "learning_rate": 9.84279847443326e-05, + "loss": 0.7742, + "mean_token_accuracy": 0.7489132527261972, + "num_tokens": 196413.0, + "step": 15220 + }, + { + "epoch": 7.495261538461539, + "grad_norm": 0.35753366351127625, + "learning_rate": 9.840836609350567e-05, + "loss": 0.835, + "mean_token_accuracy": 0.7175555892288685, + "num_tokens": 206238.0, + "step": 15230 + }, + { + "epoch": 7.500184615384615, + "grad_norm": 0.925093412399292, + "learning_rate": 9.838862776060312e-05, + "loss": 0.7501, + "mean_token_accuracy": 0.7446019750088453, + "num_tokens": 215620.0, + "step": 15240 + }, + { + "epoch": 7.505107692307693, + "grad_norm": 0.6911622881889343, + "learning_rate": 9.836876979442489e-05, + "loss": 0.7261, + "mean_token_accuracy": 0.7689918410032988, + "num_tokens": 224928.0, + "step": 15250 + }, + { + "epoch": 7.510030769230769, + "grad_norm": 0.7005440592765808, + "learning_rate": 9.834879224406663e-05, + "loss": 0.7894, + "mean_token_accuracy": 0.741933236643672, + "num_tokens": 235020.0, + "step": 15260 + }, + { + "epoch": 7.514953846153846, + "grad_norm": 0.5132576823234558, + "learning_rate": 9.832869515891975e-05, + "loss": 0.7629, + "mean_token_accuracy": 0.7501115497201681, + "num_tokens": 244901.0, + "step": 15270 + }, + { + "epoch": 7.519876923076923, + "grad_norm": 0.4901637136936188, + "learning_rate": 9.83084785886711e-05, + "loss": 0.7618, + "mean_token_accuracy": 0.7478879150003195, + "num_tokens": 254306.0, + "step": 15280 + }, + { + "epoch": 7.5248, + "grad_norm": 0.6824623346328735, + "learning_rate": 9.828814258330298e-05, + "loss": 0.7023, + "mean_token_accuracy": 0.7611722193658352, + "num_tokens": 263006.0, + "step": 15290 + }, + { + "epoch": 7.5297230769230765, + "grad_norm": 0.4069543480873108, + "learning_rate": 9.826768719309298e-05, + "loss": 0.7126, + "mean_token_accuracy": 0.7572260867804289, + "num_tokens": 271687.0, + "step": 15300 + }, + { + "epoch": 7.534646153846154, + "grad_norm": 0.7551083564758301, + "learning_rate": 9.824711246861382e-05, + "loss": 0.8352, + "mean_token_accuracy": 0.718966668099165, + "num_tokens": 281372.0, + "step": 15310 + }, + { + "epoch": 7.539569230769231, + "grad_norm": 0.8479435443878174, + "learning_rate": 9.822641846073329e-05, + "loss": 0.8138, + "mean_token_accuracy": 0.752262394875288, + "num_tokens": 290553.0, + "step": 15320 + }, + { + "epoch": 7.544492307692308, + "grad_norm": 0.38278627395629883, + "learning_rate": 9.820560522061403e-05, + "loss": 0.7287, + "mean_token_accuracy": 0.7666766557842493, + "num_tokens": 299428.0, + "step": 15330 + }, + { + "epoch": 7.549415384615385, + "grad_norm": 0.7417807579040527, + "learning_rate": 9.818467279971355e-05, + "loss": 0.6453, + "mean_token_accuracy": 0.7891027696430684, + "num_tokens": 308217.0, + "step": 15340 + }, + { + "epoch": 7.554338461538461, + "grad_norm": 0.41675281524658203, + "learning_rate": 9.816362124978396e-05, + "loss": 0.703, + "mean_token_accuracy": 0.7679217629134655, + "num_tokens": 316520.0, + "step": 15350 + }, + { + "epoch": 7.559261538461539, + "grad_norm": 0.8314495086669922, + "learning_rate": 9.814245062287189e-05, + "loss": 0.6985, + "mean_token_accuracy": 0.7756699241697789, + "num_tokens": 325247.0, + "step": 15360 + }, + { + "epoch": 7.564184615384615, + "grad_norm": 0.5109190344810486, + "learning_rate": 9.812116097131839e-05, + "loss": 0.6479, + "mean_token_accuracy": 0.7915467619895935, + "num_tokens": 333857.0, + "step": 15370 + }, + { + "epoch": 7.569107692307693, + "grad_norm": 0.8507750630378723, + "learning_rate": 9.80997523477588e-05, + "loss": 0.6877, + "mean_token_accuracy": 0.7707390915602446, + "num_tokens": 343201.0, + "step": 15380 + }, + { + "epoch": 7.574030769230769, + "grad_norm": 0.4465511739253998, + "learning_rate": 9.807822480512256e-05, + "loss": 0.7341, + "mean_token_accuracy": 0.7457791332155466, + "num_tokens": 352232.0, + "step": 15390 + }, + { + "epoch": 7.578953846153846, + "grad_norm": 0.7074446082115173, + "learning_rate": 9.805657839663313e-05, + "loss": 0.5786, + "mean_token_accuracy": 0.7954732224345207, + "num_tokens": 360362.0, + "step": 15400 + }, + { + "epoch": 7.583876923076923, + "grad_norm": 0.4567805826663971, + "learning_rate": 9.803481317580788e-05, + "loss": 0.7394, + "mean_token_accuracy": 0.7533329404890537, + "num_tokens": 369312.0, + "step": 15410 + }, + { + "epoch": 7.5888, + "grad_norm": 0.4720822274684906, + "learning_rate": 9.801292919645786e-05, + "loss": 0.7422, + "mean_token_accuracy": 0.7545758258551359, + "num_tokens": 378787.0, + "step": 15420 + }, + { + "epoch": 7.593723076923077, + "grad_norm": 0.6811593770980835, + "learning_rate": 9.799092651268778e-05, + "loss": 0.7089, + "mean_token_accuracy": 0.755854606255889, + "num_tokens": 387085.0, + "step": 15430 + }, + { + "epoch": 7.598646153846154, + "grad_norm": 0.520389974117279, + "learning_rate": 9.796880517889583e-05, + "loss": 0.7357, + "mean_token_accuracy": 0.7585709065198898, + "num_tokens": 395607.0, + "step": 15440 + }, + { + "epoch": 7.603569230769231, + "grad_norm": 0.4988136291503906, + "learning_rate": 9.794656524977353e-05, + "loss": 0.7718, + "mean_token_accuracy": 0.7427222758531571, + "num_tokens": 404335.0, + "step": 15450 + }, + { + "epoch": 7.608492307692307, + "grad_norm": 0.4840397834777832, + "learning_rate": 9.792420678030559e-05, + "loss": 0.7027, + "mean_token_accuracy": 0.7715373657643795, + "num_tokens": 412789.0, + "step": 15460 + }, + { + "epoch": 7.613415384615385, + "grad_norm": 0.48264145851135254, + "learning_rate": 9.790172982576982e-05, + "loss": 0.7478, + "mean_token_accuracy": 0.7376698384061455, + "num_tokens": 421957.0, + "step": 15470 + }, + { + "epoch": 7.618338461538461, + "grad_norm": 0.5378937125205994, + "learning_rate": 9.787913444173696e-05, + "loss": 0.7276, + "mean_token_accuracy": 0.7619619213044644, + "num_tokens": 431082.0, + "step": 15480 + }, + { + "epoch": 7.623261538461539, + "grad_norm": 0.775534451007843, + "learning_rate": 9.785642068407055e-05, + "loss": 0.6669, + "mean_token_accuracy": 0.7788964670151473, + "num_tokens": 439416.0, + "step": 15490 + }, + { + "epoch": 7.628184615384615, + "grad_norm": 0.705254077911377, + "learning_rate": 9.783358860892679e-05, + "loss": 0.7338, + "mean_token_accuracy": 0.7540049366652966, + "num_tokens": 447426.0, + "step": 15500 + }, + { + "epoch": 7.633107692307692, + "grad_norm": 0.5129554271697998, + "learning_rate": 9.781063827275437e-05, + "loss": 0.7533, + "mean_token_accuracy": 0.747262655198574, + "num_tokens": 456215.0, + "step": 15510 + }, + { + "epoch": 7.638030769230769, + "grad_norm": 0.546363890171051, + "learning_rate": 9.778756973229441e-05, + "loss": 0.7179, + "mean_token_accuracy": 0.767508839443326, + "num_tokens": 465873.0, + "step": 15520 + }, + { + "epoch": 7.642953846153846, + "grad_norm": 0.5648319125175476, + "learning_rate": 9.776438304458025e-05, + "loss": 0.6624, + "mean_token_accuracy": 0.7714706733822823, + "num_tokens": 474390.0, + "step": 15530 + }, + { + "epoch": 7.6478769230769235, + "grad_norm": 0.5035734176635742, + "learning_rate": 9.774107826693731e-05, + "loss": 0.6713, + "mean_token_accuracy": 0.7714920256286859, + "num_tokens": 482861.0, + "step": 15540 + }, + { + "epoch": 7.6528, + "grad_norm": 0.5865426659584045, + "learning_rate": 9.771765545698303e-05, + "loss": 0.6718, + "mean_token_accuracy": 0.7740787465125323, + "num_tokens": 492284.0, + "step": 15550 + }, + { + "epoch": 7.657723076923077, + "grad_norm": 0.39083245396614075, + "learning_rate": 9.769411467262658e-05, + "loss": 0.6844, + "mean_token_accuracy": 0.7694638129323721, + "num_tokens": 501852.0, + "step": 15560 + }, + { + "epoch": 7.662646153846154, + "grad_norm": 0.5364481806755066, + "learning_rate": 9.767045597206888e-05, + "loss": 0.8126, + "mean_token_accuracy": 0.7328575398772955, + "num_tokens": 511967.0, + "step": 15570 + }, + { + "epoch": 7.667569230769231, + "grad_norm": 0.8996931910514832, + "learning_rate": 9.764667941380234e-05, + "loss": 0.731, + "mean_token_accuracy": 0.7515979178249836, + "num_tokens": 520643.0, + "step": 15580 + }, + { + "epoch": 7.672492307692307, + "grad_norm": 0.5694324374198914, + "learning_rate": 9.762278505661074e-05, + "loss": 0.7069, + "mean_token_accuracy": 0.7669325869530439, + "num_tokens": 529583.0, + "step": 15590 + }, + { + "epoch": 7.677415384615385, + "grad_norm": 0.7352235317230225, + "learning_rate": 9.759877295956916e-05, + "loss": 0.7426, + "mean_token_accuracy": 0.7516727082431316, + "num_tokens": 538607.0, + "step": 15600 + }, + { + "epoch": 7.682338461538461, + "grad_norm": 0.32605522871017456, + "learning_rate": 9.757464318204373e-05, + "loss": 0.7449, + "mean_token_accuracy": 0.7565869923681021, + "num_tokens": 8860.0, + "step": 15610 + }, + { + "epoch": 7.687261538461539, + "grad_norm": 1.012762188911438, + "learning_rate": 9.755039578369149e-05, + "loss": 0.771, + "mean_token_accuracy": 0.7340531777590513, + "num_tokens": 18651.0, + "step": 15620 + }, + { + "epoch": 7.692184615384615, + "grad_norm": 0.8538568615913391, + "learning_rate": 9.752603082446036e-05, + "loss": 0.7248, + "mean_token_accuracy": 0.7604099120944738, + "num_tokens": 27363.0, + "step": 15630 + }, + { + "epoch": 7.697107692307692, + "grad_norm": 0.6249682903289795, + "learning_rate": 9.750154836458887e-05, + "loss": 0.6874, + "mean_token_accuracy": 0.7692125029861927, + "num_tokens": 35912.0, + "step": 15640 + }, + { + "epoch": 7.7020307692307695, + "grad_norm": 0.8196687698364258, + "learning_rate": 9.747694846460605e-05, + "loss": 0.64, + "mean_token_accuracy": 0.7777061153203249, + "num_tokens": 44561.0, + "step": 15650 + }, + { + "epoch": 7.706953846153846, + "grad_norm": 0.4318842887878418, + "learning_rate": 9.745223118533127e-05, + "loss": 0.6814, + "mean_token_accuracy": 0.7579683996737003, + "num_tokens": 53007.0, + "step": 15660 + }, + { + "epoch": 7.7118769230769235, + "grad_norm": 0.3699759542942047, + "learning_rate": 9.742739658787414e-05, + "loss": 0.6928, + "mean_token_accuracy": 0.7668475016951561, + "num_tokens": 62417.0, + "step": 15670 + }, + { + "epoch": 7.7168, + "grad_norm": 0.4260357618331909, + "learning_rate": 9.740244473363426e-05, + "loss": 0.7704, + "mean_token_accuracy": 0.7583841320127249, + "num_tokens": 71808.0, + "step": 15680 + }, + { + "epoch": 7.721723076923077, + "grad_norm": 0.8471167087554932, + "learning_rate": 9.737737568430123e-05, + "loss": 0.6393, + "mean_token_accuracy": 0.7948539689183235, + "num_tokens": 80627.0, + "step": 15690 + }, + { + "epoch": 7.726646153846154, + "grad_norm": 0.3456837236881256, + "learning_rate": 9.735218950185428e-05, + "loss": 0.7253, + "mean_token_accuracy": 0.7614364203065633, + "num_tokens": 89544.0, + "step": 15700 + }, + { + "epoch": 7.731569230769231, + "grad_norm": 0.5443410277366638, + "learning_rate": 9.732688624856231e-05, + "loss": 0.6766, + "mean_token_accuracy": 0.774466859921813, + "num_tokens": 98452.0, + "step": 15710 + }, + { + "epoch": 7.736492307692307, + "grad_norm": 0.36821064352989197, + "learning_rate": 9.730146598698363e-05, + "loss": 0.7503, + "mean_token_accuracy": 0.7423054609447718, + "num_tokens": 108409.0, + "step": 15720 + }, + { + "epoch": 7.741415384615385, + "grad_norm": 0.8412238955497742, + "learning_rate": 9.727592877996585e-05, + "loss": 0.6721, + "mean_token_accuracy": 0.7684708528220654, + "num_tokens": 116974.0, + "step": 15730 + }, + { + "epoch": 7.746338461538461, + "grad_norm": 0.709597647190094, + "learning_rate": 9.725027469064568e-05, + "loss": 0.6988, + "mean_token_accuracy": 0.769633786380291, + "num_tokens": 125747.0, + "step": 15740 + }, + { + "epoch": 7.751261538461538, + "grad_norm": 0.3418969213962555, + "learning_rate": 9.722450378244884e-05, + "loss": 0.739, + "mean_token_accuracy": 0.7599714059382677, + "num_tokens": 134645.0, + "step": 15750 + }, + { + "epoch": 7.7561846153846155, + "grad_norm": 0.6444127559661865, + "learning_rate": 9.719861611908984e-05, + "loss": 0.7256, + "mean_token_accuracy": 0.7679268248379231, + "num_tokens": 144249.0, + "step": 15760 + }, + { + "epoch": 7.761107692307692, + "grad_norm": 0.303688108921051, + "learning_rate": 9.717261176457187e-05, + "loss": 0.8164, + "mean_token_accuracy": 0.7411212358623743, + "num_tokens": 153958.0, + "step": 15770 + }, + { + "epoch": 7.7660307692307695, + "grad_norm": 0.3446861505508423, + "learning_rate": 9.71464907831866e-05, + "loss": 0.7577, + "mean_token_accuracy": 0.7447476647794247, + "num_tokens": 163223.0, + "step": 15780 + }, + { + "epoch": 7.770953846153846, + "grad_norm": 1.021315097808838, + "learning_rate": 9.712025323951405e-05, + "loss": 0.8067, + "mean_token_accuracy": 0.7387041725218296, + "num_tokens": 172836.0, + "step": 15790 + }, + { + "epoch": 7.775876923076924, + "grad_norm": 0.47042426466941833, + "learning_rate": 9.709389919842244e-05, + "loss": 0.644, + "mean_token_accuracy": 0.7863428425043821, + "num_tokens": 180923.0, + "step": 15800 + }, + { + "epoch": 7.7808, + "grad_norm": 0.4498484432697296, + "learning_rate": 9.706742872506796e-05, + "loss": 0.7652, + "mean_token_accuracy": 0.7459516085684299, + "num_tokens": 189045.0, + "step": 15810 + }, + { + "epoch": 7.785723076923077, + "grad_norm": 0.3966211974620819, + "learning_rate": 9.704084188489473e-05, + "loss": 0.7547, + "mean_token_accuracy": 0.7628035910427571, + "num_tokens": 197908.0, + "step": 15820 + }, + { + "epoch": 7.790646153846154, + "grad_norm": 0.5024072527885437, + "learning_rate": 9.701413874363449e-05, + "loss": 0.6979, + "mean_token_accuracy": 0.7664535760879516, + "num_tokens": 207676.0, + "step": 15830 + }, + { + "epoch": 7.795569230769231, + "grad_norm": 0.9534441232681274, + "learning_rate": 9.698731936730662e-05, + "loss": 0.6927, + "mean_token_accuracy": 0.7581623613834381, + "num_tokens": 216190.0, + "step": 15840 + }, + { + "epoch": 7.800492307692307, + "grad_norm": 0.3889976143836975, + "learning_rate": 9.696038382221775e-05, + "loss": 0.7342, + "mean_token_accuracy": 0.758062494546175, + "num_tokens": 224885.0, + "step": 15850 + }, + { + "epoch": 7.805415384615385, + "grad_norm": 0.3749610185623169, + "learning_rate": 9.693333217496183e-05, + "loss": 0.7733, + "mean_token_accuracy": 0.7525778859853745, + "num_tokens": 234675.0, + "step": 15860 + }, + { + "epoch": 7.8103384615384615, + "grad_norm": 1.377591848373413, + "learning_rate": 9.690616449241976e-05, + "loss": 0.7902, + "mean_token_accuracy": 0.7485566444694995, + "num_tokens": 243966.0, + "step": 15870 + }, + { + "epoch": 7.815261538461538, + "grad_norm": 0.4818339943885803, + "learning_rate": 9.68788808417594e-05, + "loss": 0.7403, + "mean_token_accuracy": 0.7505327112972736, + "num_tokens": 253324.0, + "step": 15880 + }, + { + "epoch": 7.8201846153846155, + "grad_norm": 0.6426275372505188, + "learning_rate": 9.685148129043528e-05, + "loss": 0.7431, + "mean_token_accuracy": 0.7493322882801294, + "num_tokens": 261869.0, + "step": 15890 + }, + { + "epoch": 7.825107692307692, + "grad_norm": 0.6179720163345337, + "learning_rate": 9.682396590618848e-05, + "loss": 0.8594, + "mean_token_accuracy": 0.726296653598547, + "num_tokens": 271518.0, + "step": 15900 + }, + { + "epoch": 7.83003076923077, + "grad_norm": 0.7233896851539612, + "learning_rate": 9.679633475704645e-05, + "loss": 0.7503, + "mean_token_accuracy": 0.7537887316197157, + "num_tokens": 279856.0, + "step": 15910 + }, + { + "epoch": 7.834953846153846, + "grad_norm": 0.4409298002719879, + "learning_rate": 9.676858791132289e-05, + "loss": 0.6689, + "mean_token_accuracy": 0.7718529254198074, + "num_tokens": 288033.0, + "step": 15920 + }, + { + "epoch": 7.839876923076923, + "grad_norm": 1.0393766164779663, + "learning_rate": 9.674072543761747e-05, + "loss": 0.7102, + "mean_token_accuracy": 0.7685550011694431, + "num_tokens": 296825.0, + "step": 15930 + }, + { + "epoch": 7.8448, + "grad_norm": 0.4495505094528198, + "learning_rate": 9.671274740481584e-05, + "loss": 0.8089, + "mean_token_accuracy": 0.7236046094447375, + "num_tokens": 305764.0, + "step": 15940 + }, + { + "epoch": 7.849723076923077, + "grad_norm": 0.378525048494339, + "learning_rate": 9.668465388208923e-05, + "loss": 0.7541, + "mean_token_accuracy": 0.7619457546621561, + "num_tokens": 315045.0, + "step": 15950 + }, + { + "epoch": 7.854646153846154, + "grad_norm": 0.3276619613170624, + "learning_rate": 9.66564449388945e-05, + "loss": 0.7818, + "mean_token_accuracy": 0.7495603717863559, + "num_tokens": 324401.0, + "step": 15960 + }, + { + "epoch": 7.859569230769231, + "grad_norm": 0.6946415901184082, + "learning_rate": 9.66281206449738e-05, + "loss": 0.6496, + "mean_token_accuracy": 0.786292115598917, + "num_tokens": 332409.0, + "step": 15970 + }, + { + "epoch": 7.8644923076923074, + "grad_norm": 0.6143710613250732, + "learning_rate": 9.659968107035449e-05, + "loss": 0.7024, + "mean_token_accuracy": 0.7785773172974586, + "num_tokens": 341622.0, + "step": 15980 + }, + { + "epoch": 7.869415384615385, + "grad_norm": 1.626356601715088, + "learning_rate": 9.657112628534898e-05, + "loss": 0.7933, + "mean_token_accuracy": 0.7386716432869435, + "num_tokens": 350779.0, + "step": 15990 + }, + { + "epoch": 7.8743384615384615, + "grad_norm": 0.8346491456031799, + "learning_rate": 9.654245636055447e-05, + "loss": 0.6961, + "mean_token_accuracy": 0.7623608373105526, + "num_tokens": 360097.0, + "step": 16000 + }, + { + "epoch": 7.879261538461538, + "grad_norm": 0.5154821872711182, + "learning_rate": 9.651367136685283e-05, + "loss": 0.7421, + "mean_token_accuracy": 0.7435356438159942, + "num_tokens": 368657.0, + "step": 16010 + }, + { + "epoch": 7.884184615384616, + "grad_norm": 0.40335017442703247, + "learning_rate": 9.648477137541045e-05, + "loss": 0.7217, + "mean_token_accuracy": 0.7617222603410483, + "num_tokens": 376972.0, + "step": 16020 + }, + { + "epoch": 7.889107692307692, + "grad_norm": 0.3109897971153259, + "learning_rate": 9.645575645767802e-05, + "loss": 0.7785, + "mean_token_accuracy": 0.7252940777689219, + "num_tokens": 386691.0, + "step": 16030 + }, + { + "epoch": 7.894030769230769, + "grad_norm": 0.7454068064689636, + "learning_rate": 9.642662668539034e-05, + "loss": 0.7545, + "mean_token_accuracy": 0.7471344050019979, + "num_tokens": 395505.0, + "step": 16040 + }, + { + "epoch": 7.898953846153846, + "grad_norm": 0.37064892053604126, + "learning_rate": 9.63973821305662e-05, + "loss": 0.6351, + "mean_token_accuracy": 0.7928381565958261, + "num_tokens": 404112.0, + "step": 16050 + }, + { + "epoch": 7.903876923076923, + "grad_norm": 0.36622127890586853, + "learning_rate": 9.636802286550816e-05, + "loss": 0.7544, + "mean_token_accuracy": 0.7577709004282951, + "num_tokens": 413357.0, + "step": 16060 + }, + { + "epoch": 7.9088, + "grad_norm": 0.6665292978286743, + "learning_rate": 9.633854896280243e-05, + "loss": 0.7774, + "mean_token_accuracy": 0.743139598891139, + "num_tokens": 423588.0, + "step": 16070 + }, + { + "epoch": 7.913723076923077, + "grad_norm": 0.8473738431930542, + "learning_rate": 9.630896049531855e-05, + "loss": 0.7409, + "mean_token_accuracy": 0.7385849550366401, + "num_tokens": 432217.0, + "step": 16080 + }, + { + "epoch": 7.918646153846154, + "grad_norm": 1.1277004480361938, + "learning_rate": 9.627925753620939e-05, + "loss": 0.6382, + "mean_token_accuracy": 0.7997437328100204, + "num_tokens": 440454.0, + "step": 16090 + }, + { + "epoch": 7.923569230769231, + "grad_norm": 0.5827834010124207, + "learning_rate": 9.62494401589108e-05, + "loss": 0.7146, + "mean_token_accuracy": 0.7843764916062355, + "num_tokens": 449378.0, + "step": 16100 + }, + { + "epoch": 7.9284923076923075, + "grad_norm": 0.45561665296554565, + "learning_rate": 9.621950843714163e-05, + "loss": 0.7489, + "mean_token_accuracy": 0.755017938092351, + "num_tokens": 458985.0, + "step": 16110 + }, + { + "epoch": 7.933415384615385, + "grad_norm": 0.45355021953582764, + "learning_rate": 9.618946244490328e-05, + "loss": 0.7944, + "mean_token_accuracy": 0.7366019859910011, + "num_tokens": 467822.0, + "step": 16120 + }, + { + "epoch": 7.938338461538462, + "grad_norm": 0.45162439346313477, + "learning_rate": 9.61593022564798e-05, + "loss": 0.7063, + "mean_token_accuracy": 0.7567742951214314, + "num_tokens": 476100.0, + "step": 16130 + }, + { + "epoch": 7.943261538461538, + "grad_norm": 0.9754898548126221, + "learning_rate": 9.612902794643748e-05, + "loss": 0.6584, + "mean_token_accuracy": 0.780465978384018, + "num_tokens": 484368.0, + "step": 16140 + }, + { + "epoch": 7.948184615384616, + "grad_norm": 0.3318362832069397, + "learning_rate": 9.609863958962482e-05, + "loss": 0.6997, + "mean_token_accuracy": 0.7755364947021007, + "num_tokens": 493961.0, + "step": 16150 + }, + { + "epoch": 7.953107692307692, + "grad_norm": 0.435249388217926, + "learning_rate": 9.606813726117223e-05, + "loss": 0.5637, + "mean_token_accuracy": 0.7991742443293333, + "num_tokens": 501913.0, + "step": 16160 + }, + { + "epoch": 7.958030769230769, + "grad_norm": 0.43728408217430115, + "learning_rate": 9.603752103649194e-05, + "loss": 0.7412, + "mean_token_accuracy": 0.7628684055060149, + "num_tokens": 510392.0, + "step": 16170 + }, + { + "epoch": 7.962953846153846, + "grad_norm": 0.46618780493736267, + "learning_rate": 9.600679099127774e-05, + "loss": 0.7086, + "mean_token_accuracy": 0.7639894340187311, + "num_tokens": 519632.0, + "step": 16180 + }, + { + "epoch": 7.967876923076923, + "grad_norm": 0.35183632373809814, + "learning_rate": 9.597594720150485e-05, + "loss": 0.6746, + "mean_token_accuracy": 0.7732372462749482, + "num_tokens": 528385.0, + "step": 16190 + }, + { + "epoch": 7.9728, + "grad_norm": 0.42351534962654114, + "learning_rate": 9.59449897434297e-05, + "loss": 0.74, + "mean_token_accuracy": 0.7574392698705197, + "num_tokens": 537139.0, + "step": 16200 + }, + { + "epoch": 7.977723076923077, + "grad_norm": 0.451408326625824, + "learning_rate": 8.704204204204205e-05, + "loss": 0.739, + "mean_token_accuracy": 0.7462680261582136, + "num_tokens": 9224.0, + "step": 16210 + }, + { + "epoch": 7.9826461538461535, + "grad_norm": 0.519805908203125, + "learning_rate": 8.699199199199199e-05, + "loss": 0.727, + "mean_token_accuracy": 0.7574180524796248, + "num_tokens": 18701.0, + "step": 16220 + }, + { + "epoch": 7.987569230769231, + "grad_norm": 0.36464399099349976, + "learning_rate": 8.694194194194195e-05, + "loss": 0.6993, + "mean_token_accuracy": 0.7700857035815716, + "num_tokens": 27224.0, + "step": 16230 + }, + { + "epoch": 7.992492307692308, + "grad_norm": 0.2717822790145874, + "learning_rate": 8.68918918918919e-05, + "loss": 0.7204, + "mean_token_accuracy": 0.7610609702765941, + "num_tokens": 35640.0, + "step": 16240 + }, + { + "epoch": 7.997415384615385, + "grad_norm": 0.3014907240867615, + "learning_rate": 8.684184184184185e-05, + "loss": 0.8007, + "mean_token_accuracy": 0.7250883210450411, + "num_tokens": 45152.0, + "step": 16250 + }, + { + "epoch": 8.002461538461539, + "grad_norm": 0.4179680049419403, + "learning_rate": 8.67917917917918e-05, + "loss": 0.7608, + "mean_token_accuracy": 0.7735646199889299, + "num_tokens": 54246.0, + "step": 16260 + }, + { + "epoch": 8.007384615384616, + "grad_norm": 0.506325900554657, + "learning_rate": 8.674174174174175e-05, + "loss": 0.8325, + "mean_token_accuracy": 0.7233378864824772, + "num_tokens": 63913.0, + "step": 16270 + }, + { + "epoch": 8.012307692307692, + "grad_norm": 0.6368007063865662, + "learning_rate": 8.66916916916917e-05, + "loss": 0.7237, + "mean_token_accuracy": 0.7582856122404337, + "num_tokens": 72805.0, + "step": 16280 + }, + { + "epoch": 8.01723076923077, + "grad_norm": 0.45158663392066956, + "learning_rate": 8.664164164164165e-05, + "loss": 0.744, + "mean_token_accuracy": 0.7568928249180317, + "num_tokens": 81952.0, + "step": 16290 + }, + { + "epoch": 8.022153846153847, + "grad_norm": 0.8606657981872559, + "learning_rate": 8.659159159159159e-05, + "loss": 0.6465, + "mean_token_accuracy": 0.78743049018085, + "num_tokens": 90446.0, + "step": 16300 + }, + { + "epoch": 8.027076923076923, + "grad_norm": 0.8622094392776489, + "learning_rate": 8.654154154154155e-05, + "loss": 0.646, + "mean_token_accuracy": 0.7958425115793943, + "num_tokens": 99305.0, + "step": 16310 + }, + { + "epoch": 8.032, + "grad_norm": 0.37887170910835266, + "learning_rate": 8.649149149149149e-05, + "loss": 0.7942, + "mean_token_accuracy": 0.7280906450003386, + "num_tokens": 108733.0, + "step": 16320 + }, + { + "epoch": 8.036923076923078, + "grad_norm": 0.4614126980304718, + "learning_rate": 8.644144144144145e-05, + "loss": 0.7874, + "mean_token_accuracy": 0.7451971229165792, + "num_tokens": 118370.0, + "step": 16330 + }, + { + "epoch": 8.041846153846153, + "grad_norm": 0.5304930210113525, + "learning_rate": 8.639139139139139e-05, + "loss": 0.7829, + "mean_token_accuracy": 0.7486832808703184, + "num_tokens": 128043.0, + "step": 16340 + }, + { + "epoch": 8.04676923076923, + "grad_norm": 0.7120644450187683, + "learning_rate": 8.634134134134135e-05, + "loss": 0.8206, + "mean_token_accuracy": 0.7201329939067364, + "num_tokens": 138112.0, + "step": 16350 + }, + { + "epoch": 8.051692307692308, + "grad_norm": 0.40515926480293274, + "learning_rate": 8.62912912912913e-05, + "loss": 0.648, + "mean_token_accuracy": 0.7795034911483526, + "num_tokens": 146211.0, + "step": 16360 + }, + { + "epoch": 8.056615384615384, + "grad_norm": 0.5807082653045654, + "learning_rate": 8.624124124124125e-05, + "loss": 0.7214, + "mean_token_accuracy": 0.7683356497436762, + "num_tokens": 155212.0, + "step": 16370 + }, + { + "epoch": 8.061538461538461, + "grad_norm": 0.8227428793907166, + "learning_rate": 8.61911911911912e-05, + "loss": 0.7255, + "mean_token_accuracy": 0.7549647618085146, + "num_tokens": 164218.0, + "step": 16380 + }, + { + "epoch": 8.066461538461539, + "grad_norm": 0.3668994903564453, + "learning_rate": 8.614114114114115e-05, + "loss": 0.6535, + "mean_token_accuracy": 0.7847654249519109, + "num_tokens": 173078.0, + "step": 16390 + }, + { + "epoch": 8.071384615384616, + "grad_norm": 0.28024813532829285, + "learning_rate": 8.609109109109109e-05, + "loss": 0.6654, + "mean_token_accuracy": 0.7698544282466173, + "num_tokens": 181153.0, + "step": 16400 + }, + { + "epoch": 8.076307692307692, + "grad_norm": 0.328283429145813, + "learning_rate": 8.604104104104105e-05, + "loss": 0.6977, + "mean_token_accuracy": 0.7609763164073229, + "num_tokens": 189338.0, + "step": 16410 + }, + { + "epoch": 8.08123076923077, + "grad_norm": 0.7588334083557129, + "learning_rate": 8.599099099099099e-05, + "loss": 0.7558, + "mean_token_accuracy": 0.742162485793233, + "num_tokens": 198089.0, + "step": 16420 + }, + { + "epoch": 8.086153846153847, + "grad_norm": 0.3376314342021942, + "learning_rate": 8.594094094094095e-05, + "loss": 0.6729, + "mean_token_accuracy": 0.768299813196063, + "num_tokens": 206389.0, + "step": 16430 + }, + { + "epoch": 8.091076923076923, + "grad_norm": 0.5634762048721313, + "learning_rate": 8.589089089089089e-05, + "loss": 0.7448, + "mean_token_accuracy": 0.7404033329337836, + "num_tokens": 215926.0, + "step": 16440 + }, + { + "epoch": 8.096, + "grad_norm": 0.3723192811012268, + "learning_rate": 8.584084084084085e-05, + "loss": 0.7675, + "mean_token_accuracy": 0.7391767490655183, + "num_tokens": 225284.0, + "step": 16450 + }, + { + "epoch": 8.100923076923078, + "grad_norm": 0.3543316721916199, + "learning_rate": 8.57907907907908e-05, + "loss": 0.6199, + "mean_token_accuracy": 0.7883176296949387, + "num_tokens": 233644.0, + "step": 16460 + }, + { + "epoch": 8.105846153846153, + "grad_norm": 1.3809056282043457, + "learning_rate": 8.574074074074075e-05, + "loss": 0.6823, + "mean_token_accuracy": 0.7546458698809146, + "num_tokens": 242102.0, + "step": 16470 + }, + { + "epoch": 8.11076923076923, + "grad_norm": 0.4195917248725891, + "learning_rate": 8.56906906906907e-05, + "loss": 0.7027, + "mean_token_accuracy": 0.7585832923650742, + "num_tokens": 250978.0, + "step": 16480 + }, + { + "epoch": 8.115692307692308, + "grad_norm": 0.5387942790985107, + "learning_rate": 8.564064064064065e-05, + "loss": 0.785, + "mean_token_accuracy": 0.7469440281391144, + "num_tokens": 259569.0, + "step": 16490 + }, + { + "epoch": 8.120615384615384, + "grad_norm": 0.3012475371360779, + "learning_rate": 8.559059059059059e-05, + "loss": 0.8153, + "mean_token_accuracy": 0.7393761333078146, + "num_tokens": 269415.0, + "step": 16500 + }, + { + "epoch": 8.125538461538461, + "grad_norm": 0.8275740742683411, + "learning_rate": 8.554054054054055e-05, + "loss": 0.7829, + "mean_token_accuracy": 0.742312715575099, + "num_tokens": 278571.0, + "step": 16510 + }, + { + "epoch": 8.130461538461539, + "grad_norm": 1.3069651126861572, + "learning_rate": 8.549049049049049e-05, + "loss": 0.6394, + "mean_token_accuracy": 0.7816533345729113, + "num_tokens": 286512.0, + "step": 16520 + }, + { + "epoch": 8.135384615384615, + "grad_norm": 0.45634856820106506, + "learning_rate": 8.544044044044043e-05, + "loss": 0.7042, + "mean_token_accuracy": 0.765632963180542, + "num_tokens": 295790.0, + "step": 16530 + }, + { + "epoch": 8.140307692307692, + "grad_norm": 0.37332087755203247, + "learning_rate": 8.539039039039039e-05, + "loss": 0.6812, + "mean_token_accuracy": 0.7715237192809582, + "num_tokens": 304612.0, + "step": 16540 + }, + { + "epoch": 8.14523076923077, + "grad_norm": 0.3229140043258667, + "learning_rate": 8.534034034034033e-05, + "loss": 0.7512, + "mean_token_accuracy": 0.7543146207928657, + "num_tokens": 312878.0, + "step": 16550 + }, + { + "epoch": 8.150153846153847, + "grad_norm": 0.46332916617393494, + "learning_rate": 8.529029029029029e-05, + "loss": 0.7706, + "mean_token_accuracy": 0.7599172580987215, + "num_tokens": 322455.0, + "step": 16560 + }, + { + "epoch": 8.155076923076923, + "grad_norm": 0.3571588695049286, + "learning_rate": 8.524024024024025e-05, + "loss": 0.6311, + "mean_token_accuracy": 0.7764216579496861, + "num_tokens": 330287.0, + "step": 16570 + }, + { + "epoch": 8.16, + "grad_norm": 0.33986207842826843, + "learning_rate": 8.519019019019019e-05, + "loss": 0.7048, + "mean_token_accuracy": 0.7667416296899319, + "num_tokens": 339451.0, + "step": 16580 + }, + { + "epoch": 8.164923076923078, + "grad_norm": 0.4668309688568115, + "learning_rate": 8.514014014014015e-05, + "loss": 0.764, + "mean_token_accuracy": 0.757263046503067, + "num_tokens": 348412.0, + "step": 16590 + }, + { + "epoch": 8.169846153846153, + "grad_norm": 0.6498896479606628, + "learning_rate": 8.509009009009009e-05, + "loss": 0.7584, + "mean_token_accuracy": 0.7656078919768333, + "num_tokens": 357039.0, + "step": 16600 + }, + { + "epoch": 8.17476923076923, + "grad_norm": 0.7268086075782776, + "learning_rate": 8.504004004004005e-05, + "loss": 0.6747, + "mean_token_accuracy": 0.774444717913866, + "num_tokens": 365355.0, + "step": 16610 + }, + { + "epoch": 8.179692307692308, + "grad_norm": 0.29098740220069885, + "learning_rate": 8.498998998998999e-05, + "loss": 0.7868, + "mean_token_accuracy": 0.736617112159729, + "num_tokens": 374565.0, + "step": 16620 + }, + { + "epoch": 8.184615384615384, + "grad_norm": 0.453988254070282, + "learning_rate": 8.493993993993994e-05, + "loss": 0.7035, + "mean_token_accuracy": 0.7616805218160152, + "num_tokens": 383369.0, + "step": 16630 + }, + { + "epoch": 8.189538461538461, + "grad_norm": 0.5355010032653809, + "learning_rate": 8.488988988988989e-05, + "loss": 0.8054, + "mean_token_accuracy": 0.7372288048267365, + "num_tokens": 393060.0, + "step": 16640 + }, + { + "epoch": 8.194461538461539, + "grad_norm": 0.25265973806381226, + "learning_rate": 8.483983983983984e-05, + "loss": 0.6868, + "mean_token_accuracy": 0.7586379230022431, + "num_tokens": 401408.0, + "step": 16650 + }, + { + "epoch": 8.199384615384615, + "grad_norm": 0.3654129207134247, + "learning_rate": 8.478978978978979e-05, + "loss": 0.6787, + "mean_token_accuracy": 0.7611869160085917, + "num_tokens": 409918.0, + "step": 16660 + }, + { + "epoch": 8.204307692307692, + "grad_norm": 0.4879061281681061, + "learning_rate": 8.473973973973975e-05, + "loss": 0.6378, + "mean_token_accuracy": 0.7808506272733211, + "num_tokens": 418552.0, + "step": 16670 + }, + { + "epoch": 8.20923076923077, + "grad_norm": 0.32871031761169434, + "learning_rate": 8.468968968968969e-05, + "loss": 0.7656, + "mean_token_accuracy": 0.7485352344810963, + "num_tokens": 427532.0, + "step": 16680 + }, + { + "epoch": 8.214153846153847, + "grad_norm": 0.4512389600276947, + "learning_rate": 8.463963963963965e-05, + "loss": 0.6777, + "mean_token_accuracy": 0.7757456459105014, + "num_tokens": 435987.0, + "step": 16690 + }, + { + "epoch": 8.219076923076923, + "grad_norm": 0.698094367980957, + "learning_rate": 8.458958958958959e-05, + "loss": 0.5981, + "mean_token_accuracy": 0.7841473259031773, + "num_tokens": 444271.0, + "step": 16700 + }, + { + "epoch": 8.224, + "grad_norm": 0.5681586265563965, + "learning_rate": 8.453953953953955e-05, + "loss": 0.7459, + "mean_token_accuracy": 0.7463509045541287, + "num_tokens": 453153.0, + "step": 16710 + }, + { + "epoch": 8.228923076923078, + "grad_norm": 0.3863551914691925, + "learning_rate": 8.448948948948949e-05, + "loss": 0.7184, + "mean_token_accuracy": 0.7525065660476684, + "num_tokens": 462182.0, + "step": 16720 + }, + { + "epoch": 8.233846153846153, + "grad_norm": 1.4121780395507812, + "learning_rate": 8.443943943943944e-05, + "loss": 0.7869, + "mean_token_accuracy": 0.7282758131623268, + "num_tokens": 471829.0, + "step": 16730 + }, + { + "epoch": 8.23876923076923, + "grad_norm": 0.4115709662437439, + "learning_rate": 8.438938938938939e-05, + "loss": 0.6067, + "mean_token_accuracy": 0.7922206796705723, + "num_tokens": 479732.0, + "step": 16740 + }, + { + "epoch": 8.243692307692308, + "grad_norm": 0.35427096486091614, + "learning_rate": 8.433933933933934e-05, + "loss": 0.731, + "mean_token_accuracy": 0.7598383821547031, + "num_tokens": 488481.0, + "step": 16750 + }, + { + "epoch": 8.248615384615384, + "grad_norm": 0.4847518801689148, + "learning_rate": 8.428928928928929e-05, + "loss": 0.8492, + "mean_token_accuracy": 0.7316956970840692, + "num_tokens": 498612.0, + "step": 16760 + }, + { + "epoch": 8.253538461538461, + "grad_norm": 0.35778024792671204, + "learning_rate": 8.423923923923924e-05, + "loss": 0.8278, + "mean_token_accuracy": 0.7407127279788256, + "num_tokens": 508317.0, + "step": 16770 + }, + { + "epoch": 8.258461538461539, + "grad_norm": 0.4900796413421631, + "learning_rate": 8.418918918918919e-05, + "loss": 0.7169, + "mean_token_accuracy": 0.7744144190102815, + "num_tokens": 517266.0, + "step": 16780 + }, + { + "epoch": 8.263384615384615, + "grad_norm": 0.7427136898040771, + "learning_rate": 8.413913913913915e-05, + "loss": 0.721, + "mean_token_accuracy": 0.7743976633995772, + "num_tokens": 526031.0, + "step": 16790 + }, + { + "epoch": 8.268307692307692, + "grad_norm": 0.3626040518283844, + "learning_rate": 8.40890890890891e-05, + "loss": 0.7644, + "mean_token_accuracy": 0.7453425768762827, + "num_tokens": 535101.0, + "step": 16800 + }, + { + "epoch": 8.27323076923077, + "grad_norm": 0.3192290961742401, + "learning_rate": 8.403903903903905e-05, + "loss": 0.7191, + "mean_token_accuracy": 0.7587394848465919, + "num_tokens": 543850.0, + "step": 16810 + }, + { + "epoch": 8.278153846153845, + "grad_norm": 0.29766783118247986, + "learning_rate": 8.3988988988989e-05, + "loss": 0.7147, + "mean_token_accuracy": 0.7568968750536442, + "num_tokens": 552551.0, + "step": 16820 + }, + { + "epoch": 8.283076923076923, + "grad_norm": 0.42623892426490784, + "learning_rate": 8.393893893893894e-05, + "loss": 0.742, + "mean_token_accuracy": 0.752581474930048, + "num_tokens": 561647.0, + "step": 16830 + }, + { + "epoch": 8.288, + "grad_norm": 0.5091580152511597, + "learning_rate": 8.38888888888889e-05, + "loss": 0.74, + "mean_token_accuracy": 0.7456575892865658, + "num_tokens": 570203.0, + "step": 16840 + }, + { + "epoch": 8.292923076923078, + "grad_norm": 0.8799173831939697, + "learning_rate": 8.383883883883884e-05, + "loss": 0.7269, + "mean_token_accuracy": 0.7564969882369041, + "num_tokens": 578387.0, + "step": 16850 + }, + { + "epoch": 8.297846153846153, + "grad_norm": 0.4507330060005188, + "learning_rate": 8.37887887887888e-05, + "loss": 0.6896, + "mean_token_accuracy": 0.7675476286560297, + "num_tokens": 586819.0, + "step": 16860 + }, + { + "epoch": 8.302769230769231, + "grad_norm": 0.5583937168121338, + "learning_rate": 8.373873873873874e-05, + "loss": 0.7218, + "mean_token_accuracy": 0.7538196977227927, + "num_tokens": 595719.0, + "step": 16870 + }, + { + "epoch": 8.307692307692308, + "grad_norm": 0.30985692143440247, + "learning_rate": 8.36886886886887e-05, + "loss": 0.7591, + "mean_token_accuracy": 0.7350195806473494, + "num_tokens": 605157.0, + "step": 16880 + }, + { + "epoch": 8.312615384615384, + "grad_norm": 0.29996439814567566, + "learning_rate": 8.363863863863865e-05, + "loss": 0.8303, + "mean_token_accuracy": 0.72854442037642, + "num_tokens": 615318.0, + "step": 16890 + }, + { + "epoch": 8.317538461538462, + "grad_norm": 0.29050928354263306, + "learning_rate": 8.35885885885886e-05, + "loss": 0.6958, + "mean_token_accuracy": 0.7601312138140202, + "num_tokens": 623747.0, + "step": 16900 + }, + { + "epoch": 8.322461538461539, + "grad_norm": 0.7444137334823608, + "learning_rate": 8.353853853853855e-05, + "loss": 0.7091, + "mean_token_accuracy": 0.7593867909163237, + "num_tokens": 632061.0, + "step": 16910 + }, + { + "epoch": 8.327384615384615, + "grad_norm": 0.27352163195610046, + "learning_rate": 8.34884884884885e-05, + "loss": 0.5748, + "mean_token_accuracy": 0.8001094650477171, + "num_tokens": 639693.0, + "step": 16920 + }, + { + "epoch": 8.332307692307692, + "grad_norm": 0.31675222516059875, + "learning_rate": 8.343843843843844e-05, + "loss": 0.7142, + "mean_token_accuracy": 0.7502024855464697, + "num_tokens": 648616.0, + "step": 16930 + }, + { + "epoch": 8.33723076923077, + "grad_norm": 0.24953658878803253, + "learning_rate": 8.33883883883884e-05, + "loss": 0.7179, + "mean_token_accuracy": 0.7595278985798359, + "num_tokens": 658331.0, + "step": 16940 + }, + { + "epoch": 8.342153846153845, + "grad_norm": 0.29154184460639954, + "learning_rate": 8.333833833833834e-05, + "loss": 0.7491, + "mean_token_accuracy": 0.7431470949202776, + "num_tokens": 666914.0, + "step": 16950 + }, + { + "epoch": 8.347076923076923, + "grad_norm": 0.46732550859451294, + "learning_rate": 8.32882882882883e-05, + "loss": 0.6553, + "mean_token_accuracy": 0.7763445932418108, + "num_tokens": 675189.0, + "step": 16960 + }, + { + "epoch": 8.352, + "grad_norm": 0.6667472720146179, + "learning_rate": 8.323823823823824e-05, + "loss": 0.7915, + "mean_token_accuracy": 0.7494681358337403, + "num_tokens": 684818.0, + "step": 16970 + }, + { + "epoch": 8.356923076923078, + "grad_norm": 0.7695476412773132, + "learning_rate": 8.318818818818818e-05, + "loss": 0.6424, + "mean_token_accuracy": 0.7838353902101517, + "num_tokens": 693101.0, + "step": 16980 + }, + { + "epoch": 8.361846153846153, + "grad_norm": 0.48481637239456177, + "learning_rate": 8.313813813813814e-05, + "loss": 0.7594, + "mean_token_accuracy": 0.7688646581023931, + "num_tokens": 702267.0, + "step": 16990 + }, + { + "epoch": 8.366769230769231, + "grad_norm": 0.295489102602005, + "learning_rate": 8.30880880880881e-05, + "loss": 0.7447, + "mean_token_accuracy": 0.741809818893671, + "num_tokens": 710894.0, + "step": 17000 + }, + { + "epoch": 8.3712, + "grad_norm": 0.34440332651138306, + "learning_rate": 9.306748584382252e-05, + "loss": 0.6617, + "mean_token_accuracy": 0.7728524345904588, + "num_tokens": 8282.0, + "step": 17010 + }, + { + "epoch": 8.376123076923077, + "grad_norm": 0.46622994542121887, + "learning_rate": 9.302749347659147e-05, + "loss": 0.7989, + "mean_token_accuracy": 0.7468677569180727, + "num_tokens": 18349.0, + "step": 17020 + }, + { + "epoch": 8.381046153846153, + "grad_norm": 0.679972231388092, + "learning_rate": 9.298739473064651e-05, + "loss": 0.774, + "mean_token_accuracy": 0.748094291985035, + "num_tokens": 28331.0, + "step": 17030 + }, + { + "epoch": 8.38596923076923, + "grad_norm": 0.3294457495212555, + "learning_rate": 9.294718970512545e-05, + "loss": 0.7299, + "mean_token_accuracy": 0.755809823796153, + "num_tokens": 37071.0, + "step": 17040 + }, + { + "epoch": 8.390892307692308, + "grad_norm": 0.3379822373390198, + "learning_rate": 9.290687849942893e-05, + "loss": 0.7451, + "mean_token_accuracy": 0.7431266129016876, + "num_tokens": 45861.0, + "step": 17050 + }, + { + "epoch": 8.395815384615384, + "grad_norm": 0.3134821355342865, + "learning_rate": 9.286646121322004e-05, + "loss": 0.734, + "mean_token_accuracy": 0.7659897316247225, + "num_tokens": 55778.0, + "step": 17060 + }, + { + "epoch": 8.400738461538461, + "grad_norm": 1.9757238626480103, + "learning_rate": 9.282593794642423e-05, + "loss": 0.741, + "mean_token_accuracy": 0.760085154697299, + "num_tokens": 64336.0, + "step": 17070 + }, + { + "epoch": 8.405661538461539, + "grad_norm": 0.27846866846084595, + "learning_rate": 9.278530879922882e-05, + "loss": 0.6565, + "mean_token_accuracy": 0.7792624596506357, + "num_tokens": 72489.0, + "step": 17080 + }, + { + "epoch": 8.410584615384616, + "grad_norm": 0.30453068017959595, + "learning_rate": 9.274457387208305e-05, + "loss": 0.6348, + "mean_token_accuracy": 0.7794241864234209, + "num_tokens": 80730.0, + "step": 17090 + }, + { + "epoch": 8.415507692307692, + "grad_norm": 0.4652600884437561, + "learning_rate": 9.270373326569762e-05, + "loss": 0.6505, + "mean_token_accuracy": 0.7694964144378901, + "num_tokens": 88615.0, + "step": 17100 + }, + { + "epoch": 8.42043076923077, + "grad_norm": 0.28283053636550903, + "learning_rate": 9.266278708104448e-05, + "loss": 0.7432, + "mean_token_accuracy": 0.7504150871187448, + "num_tokens": 98314.0, + "step": 17110 + }, + { + "epoch": 8.425353846153847, + "grad_norm": 0.7553939819335938, + "learning_rate": 9.262173541935663e-05, + "loss": 0.818, + "mean_token_accuracy": 0.722613125666976, + "num_tokens": 108341.0, + "step": 17120 + }, + { + "epoch": 8.430276923076923, + "grad_norm": 0.4199792444705963, + "learning_rate": 9.25805783821279e-05, + "loss": 0.7948, + "mean_token_accuracy": 0.728472213447094, + "num_tokens": 118063.0, + "step": 17130 + }, + { + "epoch": 8.4352, + "grad_norm": 0.3130192756652832, + "learning_rate": 9.253931607111256e-05, + "loss": 0.773, + "mean_token_accuracy": 0.7356539122760296, + "num_tokens": 126793.0, + "step": 17140 + }, + { + "epoch": 8.440123076923078, + "grad_norm": 0.3275775909423828, + "learning_rate": 9.249794858832522e-05, + "loss": 0.6469, + "mean_token_accuracy": 0.7861156791448594, + "num_tokens": 134697.0, + "step": 17150 + }, + { + "epoch": 8.445046153846153, + "grad_norm": 0.3588021695613861, + "learning_rate": 9.245647603604042e-05, + "loss": 0.6775, + "mean_token_accuracy": 0.7681175690144301, + "num_tokens": 142697.0, + "step": 17160 + }, + { + "epoch": 8.44996923076923, + "grad_norm": 1.7960641384124756, + "learning_rate": 9.241489851679256e-05, + "loss": 0.6297, + "mean_token_accuracy": 0.787098852545023, + "num_tokens": 150829.0, + "step": 17170 + }, + { + "epoch": 8.454892307692308, + "grad_norm": 0.361743688583374, + "learning_rate": 9.237321613337552e-05, + "loss": 0.7567, + "mean_token_accuracy": 0.7540980920195579, + "num_tokens": 160250.0, + "step": 17180 + }, + { + "epoch": 8.459815384615384, + "grad_norm": 0.35826051235198975, + "learning_rate": 9.233142898884245e-05, + "loss": 0.7855, + "mean_token_accuracy": 0.7355491202324629, + "num_tokens": 169094.0, + "step": 17190 + }, + { + "epoch": 8.464738461538461, + "grad_norm": 0.46428659558296204, + "learning_rate": 9.228953718650548e-05, + "loss": 0.6995, + "mean_token_accuracy": 0.7721015859395266, + "num_tokens": 178014.0, + "step": 17200 + }, + { + "epoch": 8.469661538461539, + "grad_norm": 0.39374762773513794, + "learning_rate": 9.224754082993552e-05, + "loss": 0.6739, + "mean_token_accuracy": 0.7837085586041213, + "num_tokens": 187225.0, + "step": 17210 + }, + { + "epoch": 8.474584615384614, + "grad_norm": 0.5131353139877319, + "learning_rate": 9.220544002296194e-05, + "loss": 0.6725, + "mean_token_accuracy": 0.7798098236322403, + "num_tokens": 195918.0, + "step": 17220 + }, + { + "epoch": 8.479507692307692, + "grad_norm": 0.2942337095737457, + "learning_rate": 9.216323486967238e-05, + "loss": 0.8587, + "mean_token_accuracy": 0.7184354912489652, + "num_tokens": 205933.0, + "step": 17230 + }, + { + "epoch": 8.48443076923077, + "grad_norm": 0.6050145030021667, + "learning_rate": 9.212092547441246e-05, + "loss": 0.7515, + "mean_token_accuracy": 0.7520445462316274, + "num_tokens": 215316.0, + "step": 17240 + }, + { + "epoch": 8.489353846153847, + "grad_norm": 0.34459981322288513, + "learning_rate": 9.207851194178548e-05, + "loss": 0.7056, + "mean_token_accuracy": 0.7616019807755947, + "num_tokens": 224106.0, + "step": 17250 + }, + { + "epoch": 8.494276923076923, + "grad_norm": 0.3424946963787079, + "learning_rate": 9.203599437665226e-05, + "loss": 0.8357, + "mean_token_accuracy": 0.7299704484641552, + "num_tokens": 234099.0, + "step": 17260 + }, + { + "epoch": 8.4992, + "grad_norm": 0.3472613990306854, + "learning_rate": 9.19933728841308e-05, + "loss": 0.7371, + "mean_token_accuracy": 0.739522896334529, + "num_tokens": 243567.0, + "step": 17270 + }, + { + "epoch": 8.504123076923078, + "grad_norm": 0.4420841634273529, + "learning_rate": 9.1950647569596e-05, + "loss": 0.7009, + "mean_token_accuracy": 0.775427482649684, + "num_tokens": 252398.0, + "step": 17280 + }, + { + "epoch": 8.509046153846153, + "grad_norm": 0.42884570360183716, + "learning_rate": 9.19078185386795e-05, + "loss": 0.7443, + "mean_token_accuracy": 0.7564741510897874, + "num_tokens": 262701.0, + "step": 17290 + }, + { + "epoch": 8.51396923076923, + "grad_norm": 0.31011125445365906, + "learning_rate": 9.186488589726937e-05, + "loss": 0.7856, + "mean_token_accuracy": 0.7426570508629083, + "num_tokens": 272557.0, + "step": 17300 + }, + { + "epoch": 8.518892307692308, + "grad_norm": 0.3713008463382721, + "learning_rate": 9.18218497515098e-05, + "loss": 0.7374, + "mean_token_accuracy": 0.7558617364615202, + "num_tokens": 282227.0, + "step": 17310 + }, + { + "epoch": 8.523815384615384, + "grad_norm": 0.3813200891017914, + "learning_rate": 9.17787102078009e-05, + "loss": 0.7099, + "mean_token_accuracy": 0.761194471269846, + "num_tokens": 291044.0, + "step": 17320 + }, + { + "epoch": 8.528738461538461, + "grad_norm": 0.4389830529689789, + "learning_rate": 9.17354673727984e-05, + "loss": 0.6955, + "mean_token_accuracy": 0.7634035963565111, + "num_tokens": 299331.0, + "step": 17330 + }, + { + "epoch": 8.533661538461539, + "grad_norm": 0.26211830973625183, + "learning_rate": 9.169212135341343e-05, + "loss": 0.8433, + "mean_token_accuracy": 0.7042404491454363, + "num_tokens": 309303.0, + "step": 17340 + }, + { + "epoch": 8.538584615384615, + "grad_norm": 0.3813954293727875, + "learning_rate": 9.164867225681219e-05, + "loss": 0.7424, + "mean_token_accuracy": 0.7665748696774244, + "num_tokens": 318409.0, + "step": 17350 + }, + { + "epoch": 8.543507692307692, + "grad_norm": 0.2562119960784912, + "learning_rate": 9.160512019041577e-05, + "loss": 0.7056, + "mean_token_accuracy": 0.7714785143733025, + "num_tokens": 326915.0, + "step": 17360 + }, + { + "epoch": 8.54843076923077, + "grad_norm": 0.7329946160316467, + "learning_rate": 9.156146526189975e-05, + "loss": 0.6707, + "mean_token_accuracy": 0.7758157294243574, + "num_tokens": 335886.0, + "step": 17370 + }, + { + "epoch": 8.553353846153847, + "grad_norm": 0.7640717029571533, + "learning_rate": 9.151770757919414e-05, + "loss": 0.6965, + "mean_token_accuracy": 0.7744528673589229, + "num_tokens": 344820.0, + "step": 17380 + }, + { + "epoch": 8.558276923076923, + "grad_norm": 0.6143106818199158, + "learning_rate": 9.147384725048292e-05, + "loss": 0.6567, + "mean_token_accuracy": 0.7768244970589876, + "num_tokens": 353154.0, + "step": 17390 + }, + { + "epoch": 8.5632, + "grad_norm": 0.7920124530792236, + "learning_rate": 9.142988438420383e-05, + "loss": 0.6259, + "mean_token_accuracy": 0.8012331046164036, + "num_tokens": 361761.0, + "step": 17400 + }, + { + "epoch": 8.568123076923078, + "grad_norm": 0.6481062769889832, + "learning_rate": 9.138581908904818e-05, + "loss": 0.7013, + "mean_token_accuracy": 0.7641973450779915, + "num_tokens": 371035.0, + "step": 17410 + }, + { + "epoch": 8.573046153846153, + "grad_norm": 1.1524738073349, + "learning_rate": 9.134165147396045e-05, + "loss": 0.7164, + "mean_token_accuracy": 0.7598775941878557, + "num_tokens": 380223.0, + "step": 17420 + }, + { + "epoch": 8.57796923076923, + "grad_norm": 0.4807913601398468, + "learning_rate": 9.129738164813814e-05, + "loss": 0.5843, + "mean_token_accuracy": 0.7955730833113194, + "num_tokens": 388359.0, + "step": 17430 + }, + { + "epoch": 8.582892307692308, + "grad_norm": 0.35724836587905884, + "learning_rate": 9.125300972103146e-05, + "loss": 0.6942, + "mean_token_accuracy": 0.7595485664904118, + "num_tokens": 396973.0, + "step": 17440 + }, + { + "epoch": 8.587815384615384, + "grad_norm": 0.42613887786865234, + "learning_rate": 9.120853580234299e-05, + "loss": 0.7747, + "mean_token_accuracy": 0.7444280967116356, + "num_tokens": 406644.0, + "step": 17450 + }, + { + "epoch": 8.592738461538461, + "grad_norm": 0.44058769941329956, + "learning_rate": 9.116396000202752e-05, + "loss": 0.6742, + "mean_token_accuracy": 0.771963307633996, + "num_tokens": 415159.0, + "step": 17460 + }, + { + "epoch": 8.597661538461539, + "grad_norm": 0.8332029581069946, + "learning_rate": 9.111928243029171e-05, + "loss": 0.7305, + "mean_token_accuracy": 0.7594617635011673, + "num_tokens": 423770.0, + "step": 17470 + }, + { + "epoch": 8.602584615384615, + "grad_norm": 0.4663828909397125, + "learning_rate": 9.107450319759382e-05, + "loss": 0.7572, + "mean_token_accuracy": 0.7509522173553705, + "num_tokens": 432396.0, + "step": 17480 + }, + { + "epoch": 8.607507692307692, + "grad_norm": 0.34183964133262634, + "learning_rate": 9.102962241464348e-05, + "loss": 0.7106, + "mean_token_accuracy": 0.7614024080336094, + "num_tokens": 441108.0, + "step": 17490 + }, + { + "epoch": 8.61243076923077, + "grad_norm": 0.42849647998809814, + "learning_rate": 9.098464019240138e-05, + "loss": 0.6806, + "mean_token_accuracy": 0.7648319080471992, + "num_tokens": 449634.0, + "step": 17500 + }, + { + "epoch": 8.617353846153847, + "grad_norm": 0.3230085074901581, + "learning_rate": 9.093955664207895e-05, + "loss": 0.7568, + "mean_token_accuracy": 0.7501963946968317, + "num_tokens": 459276.0, + "step": 17510 + }, + { + "epoch": 8.622276923076923, + "grad_norm": 0.46625813841819763, + "learning_rate": 9.089437187513821e-05, + "loss": 0.6697, + "mean_token_accuracy": 0.7733147449791431, + "num_tokens": 467354.0, + "step": 17520 + }, + { + "epoch": 8.6272, + "grad_norm": 0.34277865290641785, + "learning_rate": 9.08490860032914e-05, + "loss": 0.6949, + "mean_token_accuracy": 0.7638208650052547, + "num_tokens": 475322.0, + "step": 17530 + }, + { + "epoch": 8.632123076923078, + "grad_norm": 0.7064818739891052, + "learning_rate": 9.080369913850072e-05, + "loss": 0.7679, + "mean_token_accuracy": 0.7412798043340445, + "num_tokens": 484199.0, + "step": 17540 + }, + { + "epoch": 8.637046153846153, + "grad_norm": 0.3127409815788269, + "learning_rate": 9.075821139297805e-05, + "loss": 0.7316, + "mean_token_accuracy": 0.7602858003228903, + "num_tokens": 493997.0, + "step": 17550 + }, + { + "epoch": 8.64196923076923, + "grad_norm": 0.7649181485176086, + "learning_rate": 9.071262287918467e-05, + "loss": 0.6458, + "mean_token_accuracy": 0.7748285502195358, + "num_tokens": 502572.0, + "step": 17560 + }, + { + "epoch": 8.646892307692308, + "grad_norm": 0.3664408326148987, + "learning_rate": 9.066693370983105e-05, + "loss": 0.639, + "mean_token_accuracy": 0.7795850615948439, + "num_tokens": 511053.0, + "step": 17570 + }, + { + "epoch": 8.651815384615384, + "grad_norm": 0.8689625263214111, + "learning_rate": 9.062114399787647e-05, + "loss": 0.6433, + "mean_token_accuracy": 0.7788681592792273, + "num_tokens": 519715.0, + "step": 17580 + }, + { + "epoch": 8.656738461538461, + "grad_norm": 0.5213949084281921, + "learning_rate": 9.057525385652878e-05, + "loss": 0.6952, + "mean_token_accuracy": 0.7548884745687247, + "num_tokens": 529725.0, + "step": 17590 + }, + { + "epoch": 8.661661538461539, + "grad_norm": 0.3532446622848511, + "learning_rate": 9.052926339924413e-05, + "loss": 0.7587, + "mean_token_accuracy": 0.746376433596015, + "num_tokens": 539374.0, + "step": 17600 + }, + { + "epoch": 8.666584615384615, + "grad_norm": 0.36963027715682983, + "learning_rate": 9.048317273972675e-05, + "loss": 0.7293, + "mean_token_accuracy": 0.758986271545291, + "num_tokens": 548541.0, + "step": 17610 + }, + { + "epoch": 8.671507692307692, + "grad_norm": 0.3961709141731262, + "learning_rate": 9.043698199192849e-05, + "loss": 0.7, + "mean_token_accuracy": 0.767149792611599, + "num_tokens": 557495.0, + "step": 17620 + }, + { + "epoch": 8.67643076923077, + "grad_norm": 0.24584902822971344, + "learning_rate": 9.039069127004875e-05, + "loss": 0.7539, + "mean_token_accuracy": 0.7440818291157484, + "num_tokens": 566503.0, + "step": 17630 + }, + { + "epoch": 8.681353846153847, + "grad_norm": 0.2721676230430603, + "learning_rate": 9.034430068853405e-05, + "loss": 0.7589, + "mean_token_accuracy": 0.7534482311457396, + "num_tokens": 575670.0, + "step": 17640 + }, + { + "epoch": 8.686276923076923, + "grad_norm": 0.38591504096984863, + "learning_rate": 9.029781036207781e-05, + "loss": 0.6774, + "mean_token_accuracy": 0.7606659393757582, + "num_tokens": 585086.0, + "step": 17650 + }, + { + "epoch": 8.6912, + "grad_norm": 0.49064919352531433, + "learning_rate": 9.025122040562007e-05, + "loss": 0.7503, + "mean_token_accuracy": 0.7477210737764836, + "num_tokens": 593883.0, + "step": 17660 + }, + { + "epoch": 8.696123076923078, + "grad_norm": 0.36603498458862305, + "learning_rate": 9.020453093434714e-05, + "loss": 0.6822, + "mean_token_accuracy": 0.767727042734623, + "num_tokens": 602034.0, + "step": 17670 + }, + { + "epoch": 8.701046153846153, + "grad_norm": 0.5092780590057373, + "learning_rate": 9.015774206369143e-05, + "loss": 0.6706, + "mean_token_accuracy": 0.7680165067315101, + "num_tokens": 611455.0, + "step": 17680 + }, + { + "epoch": 8.705969230769231, + "grad_norm": 0.33634933829307556, + "learning_rate": 9.011085390933105e-05, + "loss": 0.6266, + "mean_token_accuracy": 0.7781741376966238, + "num_tokens": 619532.0, + "step": 17690 + }, + { + "epoch": 8.710892307692308, + "grad_norm": 0.35559213161468506, + "learning_rate": 9.00638665871896e-05, + "loss": 0.6879, + "mean_token_accuracy": 0.7744422752410174, + "num_tokens": 628842.0, + "step": 17700 + }, + { + "epoch": 8.715815384615384, + "grad_norm": 0.2505728602409363, + "learning_rate": 9.001678021343586e-05, + "loss": 0.7787, + "mean_token_accuracy": 0.7485674019902945, + "num_tokens": 638729.0, + "step": 17710 + }, + { + "epoch": 8.720738461538462, + "grad_norm": 0.6747182011604309, + "learning_rate": 8.996959490448346e-05, + "loss": 0.6124, + "mean_token_accuracy": 0.7967943239957094, + "num_tokens": 647220.0, + "step": 17720 + }, + { + "epoch": 8.725661538461539, + "grad_norm": 0.24621552228927612, + "learning_rate": 8.992231077699067e-05, + "loss": 0.6561, + "mean_token_accuracy": 0.7866641227155924, + "num_tokens": 655677.0, + "step": 17730 + }, + { + "epoch": 8.730584615384615, + "grad_norm": 0.26347342133522034, + "learning_rate": 8.987492794786006e-05, + "loss": 0.7491, + "mean_token_accuracy": 0.7501132309436798, + "num_tokens": 665125.0, + "step": 17740 + }, + { + "epoch": 8.735507692307692, + "grad_norm": 0.32913926243782043, + "learning_rate": 8.982744653423825e-05, + "loss": 0.7115, + "mean_token_accuracy": 0.7552514169365168, + "num_tokens": 674914.0, + "step": 17750 + }, + { + "epoch": 8.74043076923077, + "grad_norm": 0.37196052074432373, + "learning_rate": 8.977986665351552e-05, + "loss": 0.6786, + "mean_token_accuracy": 0.7645082645118236, + "num_tokens": 683568.0, + "step": 17760 + }, + { + "epoch": 8.745353846153845, + "grad_norm": 0.7010545134544373, + "learning_rate": 8.97321884233257e-05, + "loss": 0.7045, + "mean_token_accuracy": 0.7673761691898108, + "num_tokens": 692549.0, + "step": 17770 + }, + { + "epoch": 8.750276923076923, + "grad_norm": 0.39983436465263367, + "learning_rate": 8.96844119615457e-05, + "loss": 0.6805, + "mean_token_accuracy": 0.7654839035123586, + "num_tokens": 700969.0, + "step": 17780 + }, + { + "epoch": 8.7552, + "grad_norm": 0.3482830822467804, + "learning_rate": 8.96365373862953e-05, + "loss": 0.7268, + "mean_token_accuracy": 0.7699950773268938, + "num_tokens": 710766.0, + "step": 17790 + }, + { + "epoch": 8.760123076923076, + "grad_norm": 0.7171288132667542, + "learning_rate": 8.958856481593687e-05, + "loss": 0.7709, + "mean_token_accuracy": 0.7531832829117775, + "num_tokens": 720097.0, + "step": 17800 + }, + { + "epoch": 8.765046153846153, + "grad_norm": 0.3223002552986145, + "learning_rate": 8.954049436907506e-05, + "loss": 0.7723, + "mean_token_accuracy": 0.7442610811442136, + "num_tokens": 9539.0, + "step": 17810 + }, + { + "epoch": 8.769969230769231, + "grad_norm": 0.2629016935825348, + "learning_rate": 8.949232616455647e-05, + "loss": 0.7714, + "mean_token_accuracy": 0.7383145179599524, + "num_tokens": 19114.0, + "step": 17820 + }, + { + "epoch": 8.774892307692308, + "grad_norm": 0.6586378216743469, + "learning_rate": 8.944406032146944e-05, + "loss": 0.642, + "mean_token_accuracy": 0.7885478623211384, + "num_tokens": 27435.0, + "step": 17830 + }, + { + "epoch": 8.779815384615384, + "grad_norm": 0.2907133400440216, + "learning_rate": 8.939569695914367e-05, + "loss": 0.7391, + "mean_token_accuracy": 0.748593881353736, + "num_tokens": 35485.0, + "step": 17840 + }, + { + "epoch": 8.784738461538462, + "grad_norm": 0.28475967049598694, + "learning_rate": 8.934723619714996e-05, + "loss": 0.7719, + "mean_token_accuracy": 0.7494703732430935, + "num_tokens": 44542.0, + "step": 17850 + }, + { + "epoch": 8.789661538461539, + "grad_norm": 0.47861045598983765, + "learning_rate": 8.929867815529993e-05, + "loss": 0.6828, + "mean_token_accuracy": 0.7696808248758316, + "num_tokens": 53560.0, + "step": 17860 + }, + { + "epoch": 8.794584615384615, + "grad_norm": 0.25639232993125916, + "learning_rate": 8.925002295364571e-05, + "loss": 0.7176, + "mean_token_accuracy": 0.7549582026898861, + "num_tokens": 62836.0, + "step": 17870 + }, + { + "epoch": 8.799507692307692, + "grad_norm": 0.27395716309547424, + "learning_rate": 8.920127071247963e-05, + "loss": 0.7167, + "mean_token_accuracy": 0.7506252504885197, + "num_tokens": 71377.0, + "step": 17880 + }, + { + "epoch": 8.80443076923077, + "grad_norm": 0.26782044768333435, + "learning_rate": 8.915242155233396e-05, + "loss": 0.7433, + "mean_token_accuracy": 0.7559556499123573, + "num_tokens": 80539.0, + "step": 17890 + }, + { + "epoch": 8.809353846153845, + "grad_norm": 0.31977975368499756, + "learning_rate": 8.910347559398056e-05, + "loss": 0.7916, + "mean_token_accuracy": 0.7560835804790258, + "num_tokens": 90708.0, + "step": 17900 + }, + { + "epoch": 8.814276923076923, + "grad_norm": 0.492887407541275, + "learning_rate": 8.905443295843061e-05, + "loss": 0.6752, + "mean_token_accuracy": 0.7661668874323369, + "num_tokens": 99271.0, + "step": 17910 + }, + { + "epoch": 8.8192, + "grad_norm": 0.32388588786125183, + "learning_rate": 8.900529376693434e-05, + "loss": 0.7657, + "mean_token_accuracy": 0.7424514323472977, + "num_tokens": 107993.0, + "step": 17920 + }, + { + "epoch": 8.824123076923076, + "grad_norm": 0.4508485496044159, + "learning_rate": 8.895605814098064e-05, + "loss": 0.8702, + "mean_token_accuracy": 0.7194077134132385, + "num_tokens": 118063.0, + "step": 17930 + }, + { + "epoch": 8.829046153846154, + "grad_norm": 0.3053992986679077, + "learning_rate": 8.89067262022969e-05, + "loss": 0.6951, + "mean_token_accuracy": 0.7752657104283571, + "num_tokens": 126208.0, + "step": 17940 + }, + { + "epoch": 8.833969230769231, + "grad_norm": 0.3835429251194, + "learning_rate": 8.885729807284856e-05, + "loss": 0.7114, + "mean_token_accuracy": 0.7549926679581404, + "num_tokens": 134733.0, + "step": 17950 + }, + { + "epoch": 8.838892307692308, + "grad_norm": 0.21046239137649536, + "learning_rate": 8.880777387483888e-05, + "loss": 0.7411, + "mean_token_accuracy": 0.7572793487459422, + "num_tokens": 143481.0, + "step": 17960 + }, + { + "epoch": 8.843815384615384, + "grad_norm": 0.2608044445514679, + "learning_rate": 8.875815373070868e-05, + "loss": 0.7923, + "mean_token_accuracy": 0.7279406886547803, + "num_tokens": 152666.0, + "step": 17970 + }, + { + "epoch": 8.848738461538462, + "grad_norm": 0.35140207409858704, + "learning_rate": 8.870843776313598e-05, + "loss": 0.7014, + "mean_token_accuracy": 0.7720128271728754, + "num_tokens": 161509.0, + "step": 17980 + }, + { + "epoch": 8.85366153846154, + "grad_norm": 0.2577463984489441, + "learning_rate": 8.865862609503566e-05, + "loss": 0.7647, + "mean_token_accuracy": 0.7505464531481266, + "num_tokens": 170704.0, + "step": 17990 + }, + { + "epoch": 8.858584615384615, + "grad_norm": 0.4267882704734802, + "learning_rate": 8.860871884955925e-05, + "loss": 0.694, + "mean_token_accuracy": 0.7794535614550113, + "num_tokens": 179267.0, + "step": 18000 + }, + { + "epoch": 8.863507692307692, + "grad_norm": 0.3592469394207001, + "learning_rate": 8.855871615009459e-05, + "loss": 0.6028, + "mean_token_accuracy": 0.8042227383702993, + "num_tokens": 187517.0, + "step": 18010 + }, + { + "epoch": 8.86843076923077, + "grad_norm": 0.30718037486076355, + "learning_rate": 8.850861812026548e-05, + "loss": 0.8283, + "mean_token_accuracy": 0.7271805927157402, + "num_tokens": 197279.0, + "step": 18020 + }, + { + "epoch": 8.873353846153845, + "grad_norm": 0.429610937833786, + "learning_rate": 8.845842488393141e-05, + "loss": 0.7049, + "mean_token_accuracy": 0.7623873326927424, + "num_tokens": 206305.0, + "step": 18030 + }, + { + "epoch": 8.878276923076923, + "grad_norm": 0.3396170139312744, + "learning_rate": 8.840813656518728e-05, + "loss": 0.7685, + "mean_token_accuracy": 0.7294337477535009, + "num_tokens": 215493.0, + "step": 18040 + }, + { + "epoch": 8.8832, + "grad_norm": 0.2978787124156952, + "learning_rate": 8.835775328836306e-05, + "loss": 0.658, + "mean_token_accuracy": 0.7802751030772924, + "num_tokens": 223108.0, + "step": 18050 + }, + { + "epoch": 8.888123076923076, + "grad_norm": 0.3851058781147003, + "learning_rate": 8.830727517802347e-05, + "loss": 0.7847, + "mean_token_accuracy": 0.7197605889290571, + "num_tokens": 233031.0, + "step": 18060 + }, + { + "epoch": 8.893046153846154, + "grad_norm": 0.8238245248794556, + "learning_rate": 8.82567023589677e-05, + "loss": 0.7422, + "mean_token_accuracy": 0.7518959946930408, + "num_tokens": 241782.0, + "step": 18070 + }, + { + "epoch": 8.897969230769231, + "grad_norm": 0.8435314297676086, + "learning_rate": 8.820603495622912e-05, + "loss": 0.6705, + "mean_token_accuracy": 0.7846408020704985, + "num_tokens": 250884.0, + "step": 18080 + }, + { + "epoch": 8.902892307692309, + "grad_norm": 0.6095793843269348, + "learning_rate": 8.81552730950749e-05, + "loss": 0.6823, + "mean_token_accuracy": 0.775208180397749, + "num_tokens": 259293.0, + "step": 18090 + }, + { + "epoch": 8.907815384615384, + "grad_norm": 0.27715322375297546, + "learning_rate": 8.810441690100575e-05, + "loss": 0.8093, + "mean_token_accuracy": 0.7308334667235613, + "num_tokens": 269988.0, + "step": 18100 + }, + { + "epoch": 8.912738461538462, + "grad_norm": 0.39033225178718567, + "learning_rate": 8.805346649975565e-05, + "loss": 0.7432, + "mean_token_accuracy": 0.7343447051942349, + "num_tokens": 279011.0, + "step": 18110 + }, + { + "epoch": 8.91766153846154, + "grad_norm": 0.3180443048477173, + "learning_rate": 8.800242201729141e-05, + "loss": 0.6186, + "mean_token_accuracy": 0.7984683159738779, + "num_tokens": 287209.0, + "step": 18120 + }, + { + "epoch": 8.922584615384615, + "grad_norm": 0.46220842003822327, + "learning_rate": 8.795128357981253e-05, + "loss": 0.7108, + "mean_token_accuracy": 0.7846879895776511, + "num_tokens": 296052.0, + "step": 18130 + }, + { + "epoch": 8.927507692307692, + "grad_norm": 0.5963321328163147, + "learning_rate": 8.790005131375074e-05, + "loss": 0.6761, + "mean_token_accuracy": 0.7786654643714428, + "num_tokens": 304656.0, + "step": 18140 + }, + { + "epoch": 8.93243076923077, + "grad_norm": 1.1265039443969727, + "learning_rate": 8.784872534576978e-05, + "loss": 0.8073, + "mean_token_accuracy": 0.7344447121024131, + "num_tokens": 314434.0, + "step": 18150 + }, + { + "epoch": 8.937353846153846, + "grad_norm": 0.4689125120639801, + "learning_rate": 8.779730580276501e-05, + "loss": 0.7303, + "mean_token_accuracy": 0.7458708386868238, + "num_tokens": 322607.0, + "step": 18160 + }, + { + "epoch": 8.942276923076923, + "grad_norm": 0.2908729612827301, + "learning_rate": 8.774579281186319e-05, + "loss": 0.6007, + "mean_token_accuracy": 0.7946537002921105, + "num_tokens": 330677.0, + "step": 18170 + }, + { + "epoch": 8.9472, + "grad_norm": 0.3566810190677643, + "learning_rate": 8.76941865004221e-05, + "loss": 0.7285, + "mean_token_accuracy": 0.7698224943131209, + "num_tokens": 340641.0, + "step": 18180 + }, + { + "epoch": 8.952123076923076, + "grad_norm": 0.35960811376571655, + "learning_rate": 8.76424869960302e-05, + "loss": 0.5989, + "mean_token_accuracy": 0.7901786677539349, + "num_tokens": 348726.0, + "step": 18190 + }, + { + "epoch": 8.957046153846154, + "grad_norm": 0.38176608085632324, + "learning_rate": 8.75906944265064e-05, + "loss": 0.7193, + "mean_token_accuracy": 0.7635401219129563, + "num_tokens": 357104.0, + "step": 18200 + }, + { + "epoch": 8.961969230769231, + "grad_norm": 0.2969922423362732, + "learning_rate": 8.753880891989972e-05, + "loss": 0.6534, + "mean_token_accuracy": 0.7754518780857325, + "num_tokens": 8926.0, + "step": 18210 + }, + { + "epoch": 8.966892307692309, + "grad_norm": 0.4824042022228241, + "learning_rate": 8.748683060448886e-05, + "loss": 0.6409, + "mean_token_accuracy": 0.7802204493433237, + "num_tokens": 17538.0, + "step": 18220 + }, + { + "epoch": 8.971815384615384, + "grad_norm": 0.28852856159210205, + "learning_rate": 8.743475960878209e-05, + "loss": 0.7454, + "mean_token_accuracy": 0.7477544978260994, + "num_tokens": 26410.0, + "step": 18230 + }, + { + "epoch": 8.976738461538462, + "grad_norm": 0.32177209854125977, + "learning_rate": 8.738259606151672e-05, + "loss": 0.7582, + "mean_token_accuracy": 0.7423262905329466, + "num_tokens": 35455.0, + "step": 18240 + }, + { + "epoch": 8.98166153846154, + "grad_norm": 0.2885516285896301, + "learning_rate": 8.733034009165894e-05, + "loss": 0.7031, + "mean_token_accuracy": 0.7598079223185777, + "num_tokens": 45252.0, + "step": 18250 + }, + { + "epoch": 8.986584615384615, + "grad_norm": 0.2479109764099121, + "learning_rate": 8.727799182840344e-05, + "loss": 0.7091, + "mean_token_accuracy": 0.7633897583931685, + "num_tokens": 53904.0, + "step": 18260 + }, + { + "epoch": 8.991507692307692, + "grad_norm": 0.4614357650279999, + "learning_rate": 8.722555140117303e-05, + "loss": 0.6807, + "mean_token_accuracy": 0.7741472873836756, + "num_tokens": 62135.0, + "step": 18270 + }, + { + "epoch": 8.99643076923077, + "grad_norm": 0.24129466712474823, + "learning_rate": 8.717301893961844e-05, + "loss": 0.7596, + "mean_token_accuracy": 0.7397197656333446, + "num_tokens": 70828.0, + "step": 18280 + }, + { + "epoch": 9.001476923076924, + "grad_norm": 0.3193853497505188, + "learning_rate": 8.712039457361795e-05, + "loss": 0.8814, + "mean_token_accuracy": 0.7301918130095412, + "num_tokens": 80962.0, + "step": 18290 + }, + { + "epoch": 9.0064, + "grad_norm": 0.5235174894332886, + "learning_rate": 8.7067678433277e-05, + "loss": 0.6238, + "mean_token_accuracy": 0.7859724014997482, + "num_tokens": 88659.0, + "step": 18300 + }, + { + "epoch": 9.011323076923077, + "grad_norm": 0.6154448986053467, + "learning_rate": 8.701487064892797e-05, + "loss": 0.6343, + "mean_token_accuracy": 0.7893827341496944, + "num_tokens": 97611.0, + "step": 18310 + }, + { + "epoch": 9.016246153846154, + "grad_norm": 0.4159059226512909, + "learning_rate": 8.69619713511298e-05, + "loss": 0.7506, + "mean_token_accuracy": 0.73648741543293, + "num_tokens": 107105.0, + "step": 18320 + }, + { + "epoch": 9.02116923076923, + "grad_norm": 0.3411385118961334, + "learning_rate": 8.690898067066771e-05, + "loss": 0.7644, + "mean_token_accuracy": 0.7501700416207313, + "num_tokens": 116559.0, + "step": 18330 + }, + { + "epoch": 9.026092307692307, + "grad_norm": 0.36136379837989807, + "learning_rate": 8.68558987385528e-05, + "loss": 0.8201, + "mean_token_accuracy": 0.716056851670146, + "num_tokens": 125666.0, + "step": 18340 + }, + { + "epoch": 9.031015384615385, + "grad_norm": 0.5617722272872925, + "learning_rate": 8.680272568602181e-05, + "loss": 0.7773, + "mean_token_accuracy": 0.7351651962846517, + "num_tokens": 135346.0, + "step": 18350 + }, + { + "epoch": 9.035938461538462, + "grad_norm": 0.9331738948822021, + "learning_rate": 8.674946164453677e-05, + "loss": 0.6327, + "mean_token_accuracy": 0.7781557217240334, + "num_tokens": 144168.0, + "step": 18360 + }, + { + "epoch": 9.040861538461538, + "grad_norm": 0.7562224268913269, + "learning_rate": 8.669610674578463e-05, + "loss": 0.699, + "mean_token_accuracy": 0.7659288670867681, + "num_tokens": 153162.0, + "step": 18370 + }, + { + "epoch": 9.045784615384616, + "grad_norm": 0.2653519809246063, + "learning_rate": 8.664266112167702e-05, + "loss": 0.7789, + "mean_token_accuracy": 0.7399741619825363, + "num_tokens": 162494.0, + "step": 18380 + }, + { + "epoch": 9.050707692307693, + "grad_norm": 0.4635138213634491, + "learning_rate": 8.658912490434981e-05, + "loss": 0.7132, + "mean_token_accuracy": 0.7637255847454071, + "num_tokens": 171751.0, + "step": 18390 + }, + { + "epoch": 9.055630769230769, + "grad_norm": 1.0278699398040771, + "learning_rate": 8.653549822616289e-05, + "loss": 0.6387, + "mean_token_accuracy": 0.7829495001584291, + "num_tokens": 180110.0, + "step": 18400 + }, + { + "epoch": 9.060553846153846, + "grad_norm": 0.44372498989105225, + "learning_rate": 8.648178121969978e-05, + "loss": 0.6717, + "mean_token_accuracy": 0.7774417765438557, + "num_tokens": 188293.0, + "step": 18410 + }, + { + "epoch": 9.065476923076924, + "grad_norm": 0.5009580254554749, + "learning_rate": 8.642797401776739e-05, + "loss": 0.7577, + "mean_token_accuracy": 0.7463389489799738, + "num_tokens": 197442.0, + "step": 18420 + }, + { + "epoch": 9.0704, + "grad_norm": 0.3624105751514435, + "learning_rate": 8.63740767533955e-05, + "loss": 0.7365, + "mean_token_accuracy": 0.7567340433597565, + "num_tokens": 206170.0, + "step": 18430 + }, + { + "epoch": 9.075323076923077, + "grad_norm": 0.40718990564346313, + "learning_rate": 8.632008955983667e-05, + "loss": 0.7613, + "mean_token_accuracy": 0.7605375040322542, + "num_tokens": 215198.0, + "step": 18440 + }, + { + "epoch": 9.080246153846154, + "grad_norm": 0.8007605075836182, + "learning_rate": 8.626601257056573e-05, + "loss": 0.6795, + "mean_token_accuracy": 0.7678759694099426, + "num_tokens": 223760.0, + "step": 18450 + }, + { + "epoch": 9.08516923076923, + "grad_norm": 0.6228090524673462, + "learning_rate": 8.621184591927953e-05, + "loss": 0.7174, + "mean_token_accuracy": 0.7554727476090193, + "num_tokens": 232476.0, + "step": 18460 + }, + { + "epoch": 9.090092307692307, + "grad_norm": 0.36368465423583984, + "learning_rate": 8.61575897398966e-05, + "loss": 0.7264, + "mean_token_accuracy": 0.7487952623516321, + "num_tokens": 242567.0, + "step": 18470 + }, + { + "epoch": 9.095015384615385, + "grad_norm": 0.6309615969657898, + "learning_rate": 8.610324416655684e-05, + "loss": 0.6797, + "mean_token_accuracy": 0.7755587588995695, + "num_tokens": 251964.0, + "step": 18480 + }, + { + "epoch": 9.09993846153846, + "grad_norm": 0.5346474647521973, + "learning_rate": 8.604880933362113e-05, + "loss": 0.6778, + "mean_token_accuracy": 0.7865086987614631, + "num_tokens": 260748.0, + "step": 18490 + }, + { + "epoch": 9.104861538461538, + "grad_norm": 0.7138431668281555, + "learning_rate": 8.599428537567101e-05, + "loss": 0.681, + "mean_token_accuracy": 0.7724569093436002, + "num_tokens": 269283.0, + "step": 18500 + }, + { + "epoch": 9.109784615384616, + "grad_norm": 0.2846992611885071, + "learning_rate": 8.593967242750843e-05, + "loss": 0.7066, + "mean_token_accuracy": 0.7593025963753461, + "num_tokens": 278494.0, + "step": 18510 + }, + { + "epoch": 9.114707692307693, + "grad_norm": 0.2537672817707062, + "learning_rate": 8.588497062415528e-05, + "loss": 0.7057, + "mean_token_accuracy": 0.7501687645912171, + "num_tokens": 288579.0, + "step": 18520 + }, + { + "epoch": 9.119630769230769, + "grad_norm": 0.3954591453075409, + "learning_rate": 8.583018010085321e-05, + "loss": 0.7496, + "mean_token_accuracy": 0.7489098712801934, + "num_tokens": 298019.0, + "step": 18530 + }, + { + "epoch": 9.124553846153846, + "grad_norm": 1.1767851114273071, + "learning_rate": 8.577530099306317e-05, + "loss": 0.6797, + "mean_token_accuracy": 0.7796317916363478, + "num_tokens": 307575.0, + "step": 18540 + }, + { + "epoch": 9.129476923076924, + "grad_norm": 0.7717283964157104, + "learning_rate": 8.57203334364651e-05, + "loss": 0.7018, + "mean_token_accuracy": 0.763766011595726, + "num_tokens": 316091.0, + "step": 18550 + }, + { + "epoch": 9.1344, + "grad_norm": 0.366485595703125, + "learning_rate": 8.566527756695766e-05, + "loss": 0.6554, + "mean_token_accuracy": 0.7748038172721863, + "num_tokens": 324292.0, + "step": 18560 + }, + { + "epoch": 9.139323076923077, + "grad_norm": 0.44988542795181274, + "learning_rate": 8.561013352065783e-05, + "loss": 0.7434, + "mean_token_accuracy": 0.7497165717184544, + "num_tokens": 332960.0, + "step": 18570 + }, + { + "epoch": 9.144246153846154, + "grad_norm": 0.27599868178367615, + "learning_rate": 8.555490143390062e-05, + "loss": 0.6943, + "mean_token_accuracy": 0.7611289013177156, + "num_tokens": 341446.0, + "step": 18580 + }, + { + "epoch": 9.14916923076923, + "grad_norm": 0.29391446709632874, + "learning_rate": 8.549958144323862e-05, + "loss": 0.6971, + "mean_token_accuracy": 0.7690398130565882, + "num_tokens": 349712.0, + "step": 18590 + }, + { + "epoch": 9.154092307692308, + "grad_norm": 0.30475255846977234, + "learning_rate": 8.544417368544189e-05, + "loss": 0.7287, + "mean_token_accuracy": 0.7482383538037538, + "num_tokens": 359021.0, + "step": 18600 + }, + { + "epoch": 9.158523076923077, + "grad_norm": 0.5902156829833984, + "learning_rate": 8.538867829749734e-05, + "loss": 0.7084, + "mean_token_accuracy": 0.7611913044005633, + "num_tokens": 9586.0, + "step": 18610 + }, + { + "epoch": 9.163446153846154, + "grad_norm": 0.38383767008781433, + "learning_rate": 8.533309541660863e-05, + "loss": 0.7506, + "mean_token_accuracy": 0.7360954392701388, + "num_tokens": 18015.0, + "step": 18620 + }, + { + "epoch": 9.168369230769231, + "grad_norm": 0.27133414149284363, + "learning_rate": 8.527742518019567e-05, + "loss": 0.6913, + "mean_token_accuracy": 0.7697105508297681, + "num_tokens": 27033.0, + "step": 18630 + }, + { + "epoch": 9.173292307692307, + "grad_norm": 0.3312305510044098, + "learning_rate": 8.52216677258944e-05, + "loss": 0.7035, + "mean_token_accuracy": 0.7611101619899273, + "num_tokens": 36045.0, + "step": 18640 + }, + { + "epoch": 9.178215384615385, + "grad_norm": 0.7461434602737427, + "learning_rate": 8.516582319155633e-05, + "loss": 0.7247, + "mean_token_accuracy": 0.7545670151710511, + "num_tokens": 44409.0, + "step": 18650 + }, + { + "epoch": 9.183138461538462, + "grad_norm": 0.4726799726486206, + "learning_rate": 8.51098917152483e-05, + "loss": 0.7093, + "mean_token_accuracy": 0.7689370591193437, + "num_tokens": 53497.0, + "step": 18660 + }, + { + "epoch": 9.188061538461538, + "grad_norm": 0.26350924372673035, + "learning_rate": 8.505387343525209e-05, + "loss": 0.7406, + "mean_token_accuracy": 0.766264171525836, + "num_tokens": 61939.0, + "step": 18670 + }, + { + "epoch": 9.192984615384615, + "grad_norm": 0.20984847843647003, + "learning_rate": 8.49977684900641e-05, + "loss": 0.6724, + "mean_token_accuracy": 0.7731727968901396, + "num_tokens": 70391.0, + "step": 18680 + }, + { + "epoch": 9.197907692307693, + "grad_norm": 0.2626084089279175, + "learning_rate": 8.4941577018395e-05, + "loss": 0.6348, + "mean_token_accuracy": 0.7817917808890342, + "num_tokens": 78312.0, + "step": 18690 + }, + { + "epoch": 9.202830769230768, + "grad_norm": 0.29489243030548096, + "learning_rate": 8.488529915916936e-05, + "loss": 0.746, + "mean_token_accuracy": 0.7493366193026304, + "num_tokens": 87869.0, + "step": 18700 + }, + { + "epoch": 9.207753846153846, + "grad_norm": 0.4739731550216675, + "learning_rate": 8.482893505152533e-05, + "loss": 0.7485, + "mean_token_accuracy": 0.751647999510169, + "num_tokens": 96885.0, + "step": 18710 + }, + { + "epoch": 9.212676923076923, + "grad_norm": 0.8138965368270874, + "learning_rate": 8.47724848348143e-05, + "loss": 0.6989, + "mean_token_accuracy": 0.7752762287855148, + "num_tokens": 106453.0, + "step": 18720 + }, + { + "epoch": 9.2176, + "grad_norm": 0.3375241160392761, + "learning_rate": 8.471594864860058e-05, + "loss": 0.7631, + "mean_token_accuracy": 0.7568930108100176, + "num_tokens": 116549.0, + "step": 18730 + }, + { + "epoch": 9.222523076923077, + "grad_norm": 0.2639356851577759, + "learning_rate": 8.4659326632661e-05, + "loss": 0.6026, + "mean_token_accuracy": 0.7832478541880846, + "num_tokens": 124510.0, + "step": 18740 + }, + { + "epoch": 9.227446153846154, + "grad_norm": 0.915088951587677, + "learning_rate": 8.460261892698457e-05, + "loss": 0.6784, + "mean_token_accuracy": 0.772139797359705, + "num_tokens": 132722.0, + "step": 18750 + }, + { + "epoch": 9.232369230769232, + "grad_norm": 0.45034798979759216, + "learning_rate": 8.454582567177223e-05, + "loss": 0.5967, + "mean_token_accuracy": 0.8007356438785791, + "num_tokens": 141020.0, + "step": 18760 + }, + { + "epoch": 9.237292307692307, + "grad_norm": 0.23716862499713898, + "learning_rate": 8.44889470074363e-05, + "loss": 0.6551, + "mean_token_accuracy": 0.7765100870281458, + "num_tokens": 149320.0, + "step": 18770 + }, + { + "epoch": 9.242215384615385, + "grad_norm": 0.2853967845439911, + "learning_rate": 8.443198307460041e-05, + "loss": 0.7346, + "mean_token_accuracy": 0.7537438083440066, + "num_tokens": 158038.0, + "step": 18780 + }, + { + "epoch": 9.247138461538462, + "grad_norm": 0.40558820962905884, + "learning_rate": 8.437493401409888e-05, + "loss": 0.6459, + "mean_token_accuracy": 0.7761574640870095, + "num_tokens": 166488.0, + "step": 18790 + }, + { + "epoch": 9.252061538461538, + "grad_norm": 0.3173389434814453, + "learning_rate": 8.431779996697656e-05, + "loss": 0.6979, + "mean_token_accuracy": 0.7614830315113068, + "num_tokens": 175398.0, + "step": 18800 + }, + { + "epoch": 9.256984615384615, + "grad_norm": 0.31110990047454834, + "learning_rate": 8.426058107448841e-05, + "loss": 0.7819, + "mean_token_accuracy": 0.7374692268669605, + "num_tokens": 184697.0, + "step": 18810 + }, + { + "epoch": 9.261907692307693, + "grad_norm": 0.23878473043441772, + "learning_rate": 8.420327747809913e-05, + "loss": 0.732, + "mean_token_accuracy": 0.7535562068223953, + "num_tokens": 193847.0, + "step": 18820 + }, + { + "epoch": 9.266830769230769, + "grad_norm": 0.28896352648735046, + "learning_rate": 8.414588931948287e-05, + "loss": 0.6439, + "mean_token_accuracy": 0.7868727888911963, + "num_tokens": 202991.0, + "step": 18830 + }, + { + "epoch": 9.271753846153846, + "grad_norm": 0.43781089782714844, + "learning_rate": 8.408841674052284e-05, + "loss": 0.8251, + "mean_token_accuracy": 0.7127380024641752, + "num_tokens": 212459.0, + "step": 18840 + }, + { + "epoch": 9.276676923076923, + "grad_norm": 0.3533414900302887, + "learning_rate": 8.403085988331092e-05, + "loss": 0.7372, + "mean_token_accuracy": 0.7454831000417471, + "num_tokens": 221274.0, + "step": 18850 + }, + { + "epoch": 9.2816, + "grad_norm": 0.35645604133605957, + "learning_rate": 8.397321889014743e-05, + "loss": 0.7142, + "mean_token_accuracy": 0.7632349513471126, + "num_tokens": 229612.0, + "step": 18860 + }, + { + "epoch": 9.286523076923077, + "grad_norm": 0.4839099645614624, + "learning_rate": 8.391549390354061e-05, + "loss": 0.685, + "mean_token_accuracy": 0.7794561486691236, + "num_tokens": 239028.0, + "step": 18870 + }, + { + "epoch": 9.291446153846154, + "grad_norm": 0.8532965183258057, + "learning_rate": 8.385768506620649e-05, + "loss": 0.7402, + "mean_token_accuracy": 0.7652015954256057, + "num_tokens": 248482.0, + "step": 18880 + }, + { + "epoch": 9.296369230769232, + "grad_norm": 0.2782347798347473, + "learning_rate": 8.379979252106829e-05, + "loss": 0.6769, + "mean_token_accuracy": 0.7735106501728296, + "num_tokens": 256626.0, + "step": 18890 + }, + { + "epoch": 9.301292307692307, + "grad_norm": 0.2272525280714035, + "learning_rate": 8.374181641125622e-05, + "loss": 0.7279, + "mean_token_accuracy": 0.7602387875318527, + "num_tokens": 265897.0, + "step": 18900 + }, + { + "epoch": 9.306215384615385, + "grad_norm": 0.3278202414512634, + "learning_rate": 8.368375688010712e-05, + "loss": 0.7268, + "mean_token_accuracy": 0.7507894467562437, + "num_tokens": 275027.0, + "step": 18910 + }, + { + "epoch": 9.311138461538462, + "grad_norm": 0.26526904106140137, + "learning_rate": 8.362561407116405e-05, + "loss": 0.6761, + "mean_token_accuracy": 0.7650868054479361, + "num_tokens": 284258.0, + "step": 18920 + }, + { + "epoch": 9.316061538461538, + "grad_norm": 0.4439482092857361, + "learning_rate": 8.356738812817596e-05, + "loss": 0.7357, + "mean_token_accuracy": 0.7472224164754152, + "num_tokens": 293076.0, + "step": 18930 + }, + { + "epoch": 9.320984615384615, + "grad_norm": 0.27552875876426697, + "learning_rate": 8.350907919509734e-05, + "loss": 0.6793, + "mean_token_accuracy": 0.7668122231960297, + "num_tokens": 301359.0, + "step": 18940 + }, + { + "epoch": 9.325907692307693, + "grad_norm": 0.268410325050354, + "learning_rate": 8.345068741608786e-05, + "loss": 0.7784, + "mean_token_accuracy": 0.7451492633670569, + "num_tokens": 310342.0, + "step": 18950 + }, + { + "epoch": 9.330830769230769, + "grad_norm": 0.3308853209018707, + "learning_rate": 8.339221293551203e-05, + "loss": 0.6681, + "mean_token_accuracy": 0.7704876314848661, + "num_tokens": 318452.0, + "step": 18960 + }, + { + "epoch": 9.335753846153846, + "grad_norm": 0.25952640175819397, + "learning_rate": 8.33336558979388e-05, + "loss": 0.7042, + "mean_token_accuracy": 0.7535649377852678, + "num_tokens": 328195.0, + "step": 18970 + }, + { + "epoch": 9.340676923076924, + "grad_norm": 0.2396654337644577, + "learning_rate": 8.327501644814122e-05, + "loss": 0.6969, + "mean_token_accuracy": 0.7682115890085697, + "num_tokens": 337053.0, + "step": 18980 + }, + { + "epoch": 9.3456, + "grad_norm": 0.32446226477622986, + "learning_rate": 8.321629473109615e-05, + "loss": 0.6289, + "mean_token_accuracy": 0.7851255543529987, + "num_tokens": 345015.0, + "step": 18990 + }, + { + "epoch": 9.350523076923077, + "grad_norm": 0.2200402021408081, + "learning_rate": 8.315749089198378e-05, + "loss": 0.849, + "mean_token_accuracy": 0.714352885633707, + "num_tokens": 355063.0, + "step": 19000 + }, + { + "epoch": 9.355446153846154, + "grad_norm": 0.40607473254203796, + "learning_rate": 8.309860507618737e-05, + "loss": 0.7187, + "mean_token_accuracy": 0.7435123972594738, + "num_tokens": 364147.0, + "step": 19010 + }, + { + "epoch": 9.36036923076923, + "grad_norm": 0.2307461053133011, + "learning_rate": 8.303963742929284e-05, + "loss": 0.7594, + "mean_token_accuracy": 0.7431487880647183, + "num_tokens": 373084.0, + "step": 19020 + }, + { + "epoch": 9.365292307692307, + "grad_norm": 0.36096903681755066, + "learning_rate": 8.298058809708842e-05, + "loss": 0.8165, + "mean_token_accuracy": 0.7338841069489718, + "num_tokens": 383053.0, + "step": 19030 + }, + { + "epoch": 9.370215384615385, + "grad_norm": 0.4592624604701996, + "learning_rate": 8.292145722556431e-05, + "loss": 0.6305, + "mean_token_accuracy": 0.7788219083100557, + "num_tokens": 391397.0, + "step": 19040 + }, + { + "epoch": 9.375138461538462, + "grad_norm": 0.24138374626636505, + "learning_rate": 8.286224496091228e-05, + "loss": 0.7563, + "mean_token_accuracy": 0.7563010204583407, + "num_tokens": 401057.0, + "step": 19050 + }, + { + "epoch": 9.380061538461538, + "grad_norm": 1.3317973613739014, + "learning_rate": 8.280295144952536e-05, + "loss": 0.874, + "mean_token_accuracy": 0.7216722797602415, + "num_tokens": 411806.0, + "step": 19060 + }, + { + "epoch": 9.384984615384615, + "grad_norm": 0.24880658090114594, + "learning_rate": 8.274357683799744e-05, + "loss": 0.6631, + "mean_token_accuracy": 0.7717092610895634, + "num_tokens": 420346.0, + "step": 19070 + }, + { + "epoch": 9.389907692307693, + "grad_norm": 0.28920841217041016, + "learning_rate": 8.268412127312293e-05, + "loss": 0.7502, + "mean_token_accuracy": 0.7458052407950163, + "num_tokens": 429329.0, + "step": 19080 + }, + { + "epoch": 9.394830769230769, + "grad_norm": 0.8361232280731201, + "learning_rate": 8.262458490189633e-05, + "loss": 0.7201, + "mean_token_accuracy": 0.7693693403154611, + "num_tokens": 438637.0, + "step": 19090 + }, + { + "epoch": 9.399753846153846, + "grad_norm": 1.081518292427063, + "learning_rate": 8.256496787151197e-05, + "loss": 0.73, + "mean_token_accuracy": 0.758531778678298, + "num_tokens": 447629.0, + "step": 19100 + }, + { + "epoch": 9.404676923076924, + "grad_norm": 0.3594396412372589, + "learning_rate": 8.250527032936359e-05, + "loss": 0.6957, + "mean_token_accuracy": 0.7701297465711832, + "num_tokens": 455960.0, + "step": 19110 + }, + { + "epoch": 9.4096, + "grad_norm": 0.2965713143348694, + "learning_rate": 8.244549242304399e-05, + "loss": 0.6313, + "mean_token_accuracy": 0.7778208505362272, + "num_tokens": 464243.0, + "step": 19120 + }, + { + "epoch": 9.414523076923077, + "grad_norm": 0.44778725504875183, + "learning_rate": 8.238563430034463e-05, + "loss": 0.6234, + "mean_token_accuracy": 0.7768757071346044, + "num_tokens": 472177.0, + "step": 19130 + }, + { + "epoch": 9.419446153846154, + "grad_norm": 0.4347788393497467, + "learning_rate": 8.232569610925533e-05, + "loss": 0.7254, + "mean_token_accuracy": 0.7469818860292434, + "num_tokens": 481557.0, + "step": 19140 + }, + { + "epoch": 9.42436923076923, + "grad_norm": 0.2500903010368347, + "learning_rate": 8.226567799796383e-05, + "loss": 0.782, + "mean_token_accuracy": 0.7323465205729007, + "num_tokens": 491491.0, + "step": 19150 + }, + { + "epoch": 9.429292307692307, + "grad_norm": 0.2921452820301056, + "learning_rate": 8.220558011485546e-05, + "loss": 0.7998, + "mean_token_accuracy": 0.7310435988008976, + "num_tokens": 501074.0, + "step": 19160 + }, + { + "epoch": 9.434215384615385, + "grad_norm": 0.2585694491863251, + "learning_rate": 8.21454026085128e-05, + "loss": 0.7696, + "mean_token_accuracy": 0.7234194416552782, + "num_tokens": 509884.0, + "step": 19170 + }, + { + "epoch": 9.439138461538462, + "grad_norm": 0.24239717423915863, + "learning_rate": 8.208514562771532e-05, + "loss": 0.671, + "mean_token_accuracy": 0.7750304438173771, + "num_tokens": 518146.0, + "step": 19180 + }, + { + "epoch": 9.444061538461538, + "grad_norm": 0.21721267700195312, + "learning_rate": 8.202480932143887e-05, + "loss": 0.6366, + "mean_token_accuracy": 0.7861224085092544, + "num_tokens": 526055.0, + "step": 19190 + }, + { + "epoch": 9.448984615384616, + "grad_norm": 0.8764368891716003, + "learning_rate": 8.19643938388555e-05, + "loss": 0.6603, + "mean_token_accuracy": 0.7765530787408352, + "num_tokens": 534409.0, + "step": 19200 + }, + { + "epoch": 9.453907692307693, + "grad_norm": 0.28634384274482727, + "learning_rate": 8.190389932933301e-05, + "loss": 0.6941, + "mean_token_accuracy": 0.7651392992585897, + "num_tokens": 543311.0, + "step": 19210 + }, + { + "epoch": 9.458830769230769, + "grad_norm": 0.30995234847068787, + "learning_rate": 8.184332594243455e-05, + "loss": 0.7696, + "mean_token_accuracy": 0.7483407512307167, + "num_tokens": 552293.0, + "step": 19220 + }, + { + "epoch": 9.463753846153846, + "grad_norm": 0.26820695400238037, + "learning_rate": 8.17826738279183e-05, + "loss": 0.7136, + "mean_token_accuracy": 0.7572588924318552, + "num_tokens": 560875.0, + "step": 19230 + }, + { + "epoch": 9.468676923076924, + "grad_norm": 0.3167503774166107, + "learning_rate": 8.172194313573711e-05, + "loss": 0.6687, + "mean_token_accuracy": 0.78301134519279, + "num_tokens": 570337.0, + "step": 19240 + }, + { + "epoch": 9.4736, + "grad_norm": 0.38579657673835754, + "learning_rate": 8.166113401603802e-05, + "loss": 0.6541, + "mean_token_accuracy": 0.7858757961541414, + "num_tokens": 578355.0, + "step": 19250 + }, + { + "epoch": 9.478523076923077, + "grad_norm": 0.2878792881965637, + "learning_rate": 8.160024661916204e-05, + "loss": 0.836, + "mean_token_accuracy": 0.7330463856458664, + "num_tokens": 588762.0, + "step": 19260 + }, + { + "epoch": 9.483446153846154, + "grad_norm": 0.3626146614551544, + "learning_rate": 8.153928109564369e-05, + "loss": 0.8072, + "mean_token_accuracy": 0.7299764085561037, + "num_tokens": 598467.0, + "step": 19270 + }, + { + "epoch": 9.48836923076923, + "grad_norm": 0.2230752855539322, + "learning_rate": 8.147823759621063e-05, + "loss": 0.6656, + "mean_token_accuracy": 0.7746995214372874, + "num_tokens": 607228.0, + "step": 19280 + }, + { + "epoch": 9.493292307692307, + "grad_norm": 0.35763120651245117, + "learning_rate": 8.141711627178335e-05, + "loss": 0.7943, + "mean_token_accuracy": 0.7384316265583039, + "num_tokens": 616620.0, + "step": 19290 + }, + { + "epoch": 9.498215384615385, + "grad_norm": 0.31609174609184265, + "learning_rate": 8.135591727347469e-05, + "loss": 0.7832, + "mean_token_accuracy": 0.731552030518651, + "num_tokens": 627052.0, + "step": 19300 + }, + { + "epoch": 9.503138461538462, + "grad_norm": 0.278390496969223, + "learning_rate": 8.129464075258956e-05, + "loss": 0.7252, + "mean_token_accuracy": 0.7618256121873855, + "num_tokens": 636036.0, + "step": 19310 + }, + { + "epoch": 9.508061538461538, + "grad_norm": 0.3673849105834961, + "learning_rate": 8.123328686062453e-05, + "loss": 0.6438, + "mean_token_accuracy": 0.7896918896585703, + "num_tokens": 645343.0, + "step": 19320 + }, + { + "epoch": 9.512984615384616, + "grad_norm": 0.33690834045410156, + "learning_rate": 8.117185574926744e-05, + "loss": 0.8169, + "mean_token_accuracy": 0.7260203436017036, + "num_tokens": 655914.0, + "step": 19330 + }, + { + "epoch": 9.517907692307693, + "grad_norm": 0.38085484504699707, + "learning_rate": 8.111034757039707e-05, + "loss": 0.7446, + "mean_token_accuracy": 0.7486122488975525, + "num_tokens": 665555.0, + "step": 19340 + }, + { + "epoch": 9.522830769230769, + "grad_norm": 0.8052934408187866, + "learning_rate": 8.10487624760827e-05, + "loss": 0.7267, + "mean_token_accuracy": 0.7574852678924799, + "num_tokens": 674482.0, + "step": 19350 + }, + { + "epoch": 9.527753846153846, + "grad_norm": 0.25485706329345703, + "learning_rate": 8.098710061858381e-05, + "loss": 0.6928, + "mean_token_accuracy": 0.7583078496158123, + "num_tokens": 682777.0, + "step": 19360 + }, + { + "epoch": 9.532676923076924, + "grad_norm": 0.2488587498664856, + "learning_rate": 8.092536215034967e-05, + "loss": 0.7838, + "mean_token_accuracy": 0.7227123014628887, + "num_tokens": 692407.0, + "step": 19370 + }, + { + "epoch": 9.5376, + "grad_norm": 1.3520376682281494, + "learning_rate": 8.086354722401892e-05, + "loss": 0.7324, + "mean_token_accuracy": 0.7700716838240623, + "num_tokens": 701713.0, + "step": 19380 + }, + { + "epoch": 9.542523076923077, + "grad_norm": 0.278576135635376, + "learning_rate": 8.080165599241924e-05, + "loss": 0.7461, + "mean_token_accuracy": 0.755218057706952, + "num_tokens": 710344.0, + "step": 19390 + }, + { + "epoch": 9.547446153846154, + "grad_norm": 0.3666495680809021, + "learning_rate": 8.0739688608567e-05, + "loss": 0.672, + "mean_token_accuracy": 0.7804633747786284, + "num_tokens": 719045.0, + "step": 19400 + } + ], + "logging_steps": 10, + "max_steps": 20000, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.126844244746281e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}