{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 9.547446153846154, "eval_steps": 500, "global_step": 19400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.32, "grad_norm": 0.48755398392677307, "learning_rate": 9e-06, "loss": 2.9601, "mean_token_accuracy": 0.4284199851565063, "num_tokens": 15763.0, "step": 10 }, { "epoch": 0.64, "grad_norm": 0.7431631088256836, "learning_rate": 1.9e-05, "loss": 3.1197, "mean_token_accuracy": 0.42047689845785496, "num_tokens": 30510.0, "step": 20 }, { "epoch": 0.96, "grad_norm": 0.9281144142150879, "learning_rate": 1.9999959867760483e-05, "loss": 3.0392, "mean_token_accuracy": 0.41553077606949956, "num_tokens": 44066.0, "step": 30 }, { "epoch": 1.256, "grad_norm": 1.2322081327438354, "learning_rate": 1.999982113944484e-05, "loss": 3.3331, "mean_token_accuracy": 0.4135566469583962, "num_tokens": 58784.0, "step": 40 }, { "epoch": 1.576, "grad_norm": 7.1544270515441895, "learning_rate": 1.9999583320967683e-05, "loss": 3.092, "mean_token_accuracy": 0.42304785093292596, "num_tokens": 74305.0, "step": 50 }, { "epoch": 1.896, "grad_norm": 0.8600693345069885, "learning_rate": 1.99992464146856e-05, "loss": 2.8575, "mean_token_accuracy": 0.4428128655999899, "num_tokens": 88582.0, "step": 60 }, { "epoch": 2.192, "grad_norm": 1.3089476823806763, "learning_rate": 1.999881042393706e-05, "loss": 3.2266, "mean_token_accuracy": 0.43124929654437144, "num_tokens": 101701.0, "step": 70 }, { "epoch": 2.512, "grad_norm": 0.6948946714401245, "learning_rate": 1.9998275353042377e-05, "loss": 2.6626, "mean_token_accuracy": 0.45563504602760074, "num_tokens": 116845.0, "step": 80 }, { "epoch": 2.832, "grad_norm": 1.0951130390167236, "learning_rate": 1.999764120730368e-05, "loss": 2.6061, "mean_token_accuracy": 0.4799388902261853, "num_tokens": 131642.0, "step": 90 }, { "epoch": 3.128, "grad_norm": 2.2256667613983154, "learning_rate": 1.9996907993004836e-05, "loss": 2.5437, "mean_token_accuracy": 0.4792116602530351, "num_tokens": 144071.0, "step": 100 }, { "epoch": 3.448, "grad_norm": 1.3083274364471436, "learning_rate": 1.9996075717411405e-05, "loss": 2.3134, "mean_token_accuracy": 0.49251377964392307, "num_tokens": 160671.0, "step": 110 }, { "epoch": 3.768, "grad_norm": 1.0546499490737915, "learning_rate": 1.9995144388770577e-05, "loss": 2.5219, "mean_token_accuracy": 0.49237193521112205, "num_tokens": 174208.0, "step": 120 }, { "epoch": 4.064, "grad_norm": 1.3395127058029175, "learning_rate": 1.9994114016311053e-05, "loss": 2.6405, "mean_token_accuracy": 0.494740814977401, "num_tokens": 189174.0, "step": 130 }, { "epoch": 4.384, "grad_norm": 0.9926703572273254, "learning_rate": 1.9992984610243006e-05, "loss": 2.2636, "mean_token_accuracy": 0.5148700190708041, "num_tokens": 205614.0, "step": 140 }, { "epoch": 4.704, "grad_norm": 2.0030810832977295, "learning_rate": 1.9991756181757936e-05, "loss": 2.2209, "mean_token_accuracy": 0.5199183862656355, "num_tokens": 220053.0, "step": 150 }, { "epoch": 5.0, "grad_norm": 3.6073784828186035, "learning_rate": 1.999042874302857e-05, "loss": 2.3257, "mean_token_accuracy": 0.5087871091389978, "num_tokens": 232090.0, "step": 160 }, { "epoch": 5.32, "grad_norm": 0.9835549592971802, "learning_rate": 1.9989002307208767e-05, "loss": 2.0461, "mean_token_accuracy": 0.5411571308970451, "num_tokens": 247494.0, "step": 170 }, { "epoch": 5.64, "grad_norm": 1.3350774049758911, "learning_rate": 1.998747688843335e-05, "loss": 2.1222, "mean_token_accuracy": 0.5401662968099117, "num_tokens": 261394.0, "step": 180 }, { "epoch": 5.96, "grad_norm": 1.4606289863586426, "learning_rate": 1.9985852501817985e-05, "loss": 2.0423, "mean_token_accuracy": 0.5442505508661271, "num_tokens": 276061.0, "step": 190 }, { "epoch": 6.256, "grad_norm": 2.1242387294769287, "learning_rate": 1.998412916345904e-05, "loss": 2.1295, "mean_token_accuracy": 0.5391830210951535, "num_tokens": 289693.0, "step": 200 }, { "epoch": 6.576, "grad_norm": 1.4135829210281372, "learning_rate": 1.99823068904334e-05, "loss": 1.9505, "mean_token_accuracy": 0.5633615963160992, "num_tokens": 304290.0, "step": 210 }, { "epoch": 6.896, "grad_norm": 1.05637526512146, "learning_rate": 1.998038570079833e-05, "loss": 2.0059, "mean_token_accuracy": 0.569461640715599, "num_tokens": 319354.0, "step": 220 }, { "epoch": 7.192, "grad_norm": 1.409026026725769, "learning_rate": 1.9978365613591263e-05, "loss": 1.8998, "mean_token_accuracy": 0.5667525610408267, "num_tokens": 333444.0, "step": 230 }, { "epoch": 7.5120000000000005, "grad_norm": 0.9068111181259155, "learning_rate": 1.9976246648829636e-05, "loss": 1.6992, "mean_token_accuracy": 0.6070717331022024, "num_tokens": 349035.0, "step": 240 }, { "epoch": 7.832, "grad_norm": 1.7179079055786133, "learning_rate": 1.997402882751068e-05, "loss": 1.898, "mean_token_accuracy": 0.5741809576749801, "num_tokens": 363648.0, "step": 250 }, { "epoch": 8.128, "grad_norm": 2.5260181427001953, "learning_rate": 1.997171217161122e-05, "loss": 1.8262, "mean_token_accuracy": 0.6014102785973936, "num_tokens": 377215.0, "step": 260 }, { "epoch": 8.448, "grad_norm": 1.0469582080841064, "learning_rate": 1.996929670408744e-05, "loss": 1.7423, "mean_token_accuracy": 0.6009339291602374, "num_tokens": 394237.0, "step": 270 }, { "epoch": 8.768, "grad_norm": 2.0109641551971436, "learning_rate": 1.996678244887468e-05, "loss": 1.844, "mean_token_accuracy": 0.5853462919592858, "num_tokens": 407239.0, "step": 280 }, { "epoch": 9.064, "grad_norm": 2.3061084747314453, "learning_rate": 1.9964169430887174e-05, "loss": 1.7935, "mean_token_accuracy": 0.5961252535517151, "num_tokens": 420770.0, "step": 290 }, { "epoch": 9.384, "grad_norm": 1.5985437631607056, "learning_rate": 1.9961457676017833e-05, "loss": 1.703, "mean_token_accuracy": 0.6001830734312534, "num_tokens": 435005.0, "step": 300 }, { "epoch": 9.704, "grad_norm": 1.0775606632232666, "learning_rate": 1.9958647211137952e-05, "loss": 1.7104, "mean_token_accuracy": 0.6070772130973637, "num_tokens": 449192.0, "step": 310 }, { "epoch": 10.0, "grad_norm": 4.900282859802246, "learning_rate": 1.9955738064096975e-05, "loss": 1.7131, "mean_token_accuracy": 0.6122549969602276, "num_tokens": 464180.0, "step": 320 }, { "epoch": 10.32, "grad_norm": 4.266648769378662, "learning_rate": 1.9952730263722205e-05, "loss": 1.6526, "mean_token_accuracy": 0.6316778633743525, "num_tokens": 480765.0, "step": 330 }, { "epoch": 10.64, "grad_norm": 1.3835431337356567, "learning_rate": 1.994962383981851e-05, "loss": 1.5473, "mean_token_accuracy": 0.6267588481307029, "num_tokens": 495929.0, "step": 340 }, { "epoch": 10.96, "grad_norm": 0.9430285096168518, "learning_rate": 1.9946418823168053e-05, "loss": 1.7158, "mean_token_accuracy": 0.6107403030619025, "num_tokens": 508555.0, "step": 350 }, { "epoch": 11.256, "grad_norm": 1.7163220643997192, "learning_rate": 1.994311524552996e-05, "loss": 1.557, "mean_token_accuracy": 0.6259392479787002, "num_tokens": 522011.0, "step": 360 }, { "epoch": 11.576, "grad_norm": 1.8388807773590088, "learning_rate": 1.993971313964002e-05, "loss": 1.6451, "mean_token_accuracy": 0.6252246461808681, "num_tokens": 535811.0, "step": 370 }, { "epoch": 11.896, "grad_norm": 2.878235101699829, "learning_rate": 1.993621253921036e-05, "loss": 1.6015, "mean_token_accuracy": 0.6341628909111023, "num_tokens": 551848.0, "step": 380 }, { "epoch": 12.192, "grad_norm": 8.73257064819336, "learning_rate": 1.9932613478929103e-05, "loss": 1.6087, "mean_token_accuracy": 0.6341016663087381, "num_tokens": 566187.0, "step": 390 }, { "epoch": 12.512, "grad_norm": 1.6711736917495728, "learning_rate": 1.9928915994460037e-05, "loss": 1.4662, "mean_token_accuracy": 0.641170359775424, "num_tokens": 579340.0, "step": 400 }, { "epoch": 12.832, "grad_norm": 1.2355554103851318, "learning_rate": 1.9925120122442253e-05, "loss": 1.4234, "mean_token_accuracy": 0.6438105596229434, "num_tokens": 595449.0, "step": 410 }, { "epoch": 13.128, "grad_norm": 1.1492340564727783, "learning_rate": 1.9921225900489776e-05, "loss": 1.5106, "mean_token_accuracy": 0.6468948456081184, "num_tokens": 610726.0, "step": 420 }, { "epoch": 13.448, "grad_norm": 2.4454128742218018, "learning_rate": 1.9917233367191205e-05, "loss": 1.4301, "mean_token_accuracy": 0.6485198132693768, "num_tokens": 624770.0, "step": 430 }, { "epoch": 13.768, "grad_norm": 1.809606909751892, "learning_rate": 1.9913142562109328e-05, "loss": 1.4744, "mean_token_accuracy": 0.656716751307249, "num_tokens": 640635.0, "step": 440 }, { "epoch": 14.064, "grad_norm": 1.8572089672088623, "learning_rate": 1.990895352578072e-05, "loss": 1.4747, "mean_token_accuracy": 0.6568980690192532, "num_tokens": 652969.0, "step": 450 }, { "epoch": 14.384, "grad_norm": 1.4431779384613037, "learning_rate": 1.9904666299715357e-05, "loss": 1.3779, "mean_token_accuracy": 0.6651028156280517, "num_tokens": 667518.0, "step": 460 }, { "epoch": 14.704, "grad_norm": 1.1659435033798218, "learning_rate": 1.9900280926396186e-05, "loss": 1.4853, "mean_token_accuracy": 0.6542218446731567, "num_tokens": 682093.0, "step": 470 }, { "epoch": 15.0, "grad_norm": 2.4632649421691895, "learning_rate": 1.989579744927872e-05, "loss": 1.5292, "mean_token_accuracy": 0.6373760857292123, "num_tokens": 696270.0, "step": 480 }, { "epoch": 15.32, "grad_norm": 1.2648308277130127, "learning_rate": 1.98912159127906e-05, "loss": 1.5282, "mean_token_accuracy": 0.6369021199643612, "num_tokens": 711645.0, "step": 490 }, { "epoch": 15.64, "grad_norm": 1.6591901779174805, "learning_rate": 1.988653636233116e-05, "loss": 1.3674, "mean_token_accuracy": 0.6757966171950102, "num_tokens": 724986.0, "step": 500 }, { "epoch": 15.96, "grad_norm": 1.3664172887802124, "learning_rate": 1.988175884427097e-05, "loss": 1.3619, "mean_token_accuracy": 0.6697750940918923, "num_tokens": 740314.0, "step": 510 }, { "epoch": 16.256, "grad_norm": 1.3944075107574463, "learning_rate": 1.9876883405951378e-05, "loss": 1.4199, "mean_token_accuracy": 0.6569653801821373, "num_tokens": 754126.0, "step": 520 }, { "epoch": 16.576, "grad_norm": 1.846074104309082, "learning_rate": 1.987191009568405e-05, "loss": 1.3542, "mean_token_accuracy": 0.6702545773237943, "num_tokens": 770447.0, "step": 530 }, { "epoch": 16.896, "grad_norm": 1.4818017482757568, "learning_rate": 1.9866838962750473e-05, "loss": 1.3385, "mean_token_accuracy": 0.673089163005352, "num_tokens": 784145.0, "step": 540 }, { "epoch": 17.192, "grad_norm": 1.6503247022628784, "learning_rate": 1.986167005740149e-05, "loss": 1.4058, "mean_token_accuracy": 0.6697660020074329, "num_tokens": 797632.0, "step": 550 }, { "epoch": 17.512, "grad_norm": 1.7759121656417847, "learning_rate": 1.985640343085678e-05, "loss": 1.485, "mean_token_accuracy": 0.6630586348474026, "num_tokens": 812748.0, "step": 560 }, { "epoch": 17.832, "grad_norm": 1.6328964233398438, "learning_rate": 1.9851039135304366e-05, "loss": 1.3464, "mean_token_accuracy": 0.673164501786232, "num_tokens": 827388.0, "step": 570 }, { "epoch": 18.128, "grad_norm": 1.394505500793457, "learning_rate": 1.9845577223900087e-05, "loss": 1.3223, "mean_token_accuracy": 0.6847520865298606, "num_tokens": 842219.0, "step": 580 }, { "epoch": 18.448, "grad_norm": 1.541831374168396, "learning_rate": 1.984001775076708e-05, "loss": 1.3222, "mean_token_accuracy": 0.6771992217749357, "num_tokens": 857904.0, "step": 590 }, { "epoch": 18.768, "grad_norm": 1.4715123176574707, "learning_rate": 1.983436077099524e-05, "loss": 1.3623, "mean_token_accuracy": 0.6771474566310645, "num_tokens": 871758.0, "step": 600 }, { "epoch": 19.064, "grad_norm": 1.244395136833191, "learning_rate": 1.9828606340640678e-05, "loss": 1.3194, "mean_token_accuracy": 0.6701785076308895, "num_tokens": 885177.0, "step": 610 }, { "epoch": 19.384, "grad_norm": 3.2594940662384033, "learning_rate": 1.9822754516725148e-05, "loss": 1.3986, "mean_token_accuracy": 0.6804742000997066, "num_tokens": 900412.0, "step": 620 }, { "epoch": 19.704, "grad_norm": 1.9484755992889404, "learning_rate": 1.9816805357235512e-05, "loss": 1.3087, "mean_token_accuracy": 0.675427176989615, "num_tokens": 914923.0, "step": 630 }, { "epoch": 20.0, "grad_norm": 4.795617580413818, "learning_rate": 1.981075892112314e-05, "loss": 1.3261, "mean_token_accuracy": 0.6869603467954172, "num_tokens": 928360.0, "step": 640 }, { "epoch": 20.32, "grad_norm": 2.1296019554138184, "learning_rate": 1.980461526830334e-05, "loss": 1.3365, "mean_token_accuracy": 0.6707998286932707, "num_tokens": 942431.0, "step": 650 }, { "epoch": 20.64, "grad_norm": 2.041980743408203, "learning_rate": 1.979837445965475e-05, "loss": 1.4148, "mean_token_accuracy": 0.6774610493332147, "num_tokens": 957665.0, "step": 660 }, { "epoch": 20.96, "grad_norm": 2.0277955532073975, "learning_rate": 1.979203655701875e-05, "loss": 1.1934, "mean_token_accuracy": 0.7059706412255764, "num_tokens": 972341.0, "step": 670 }, { "epoch": 21.256, "grad_norm": 1.4342715740203857, "learning_rate": 1.978560162319885e-05, "loss": 1.2645, "mean_token_accuracy": 0.6948015895244237, "num_tokens": 987917.0, "step": 680 }, { "epoch": 21.576, "grad_norm": 2.365342140197754, "learning_rate": 1.9779069721960046e-05, "loss": 1.356, "mean_token_accuracy": 0.6770768724381924, "num_tokens": 1001846.0, "step": 690 }, { "epoch": 21.896, "grad_norm": 1.4183971881866455, "learning_rate": 1.9772440918028217e-05, "loss": 1.3372, "mean_token_accuracy": 0.6928307216614484, "num_tokens": 1016810.0, "step": 700 }, { "epoch": 22.192, "grad_norm": 1.5157005786895752, "learning_rate": 1.9765715277089458e-05, "loss": 1.2262, "mean_token_accuracy": 0.6972452486689026, "num_tokens": 1032507.0, "step": 710 }, { "epoch": 22.512, "grad_norm": 1.4448522329330444, "learning_rate": 1.9758892865789445e-05, "loss": 1.2261, "mean_token_accuracy": 0.6949771210551262, "num_tokens": 1047386.0, "step": 720 }, { "epoch": 22.832, "grad_norm": 2.33046817779541, "learning_rate": 1.9751973751732775e-05, "loss": 1.2348, "mean_token_accuracy": 0.6982233498245478, "num_tokens": 1061351.0, "step": 730 }, { "epoch": 23.128, "grad_norm": 1.8830664157867432, "learning_rate": 1.9744958003482285e-05, "loss": 1.2979, "mean_token_accuracy": 0.6971497769291336, "num_tokens": 1073148.0, "step": 740 }, { "epoch": 23.448, "grad_norm": 1.466878056526184, "learning_rate": 1.9737845690558385e-05, "loss": 1.3683, "mean_token_accuracy": 0.680212589353323, "num_tokens": 1088218.0, "step": 750 }, { "epoch": 23.768, "grad_norm": 1.5701245069503784, "learning_rate": 1.973063688343835e-05, "loss": 1.1505, "mean_token_accuracy": 0.7072769150137901, "num_tokens": 1102836.0, "step": 760 }, { "epoch": 24.064, "grad_norm": 1.6687356233596802, "learning_rate": 1.9723331653555653e-05, "loss": 1.2474, "mean_token_accuracy": 0.6967680285105834, "num_tokens": 1116942.0, "step": 770 }, { "epoch": 24.384, "grad_norm": 1.3728556632995605, "learning_rate": 1.9715930073299227e-05, "loss": 1.2448, "mean_token_accuracy": 0.7040290288627148, "num_tokens": 1132054.0, "step": 780 }, { "epoch": 24.704, "grad_norm": 1.4181838035583496, "learning_rate": 1.970843221601276e-05, "loss": 1.1969, "mean_token_accuracy": 0.6944498892873525, "num_tokens": 1148041.0, "step": 790 }, { "epoch": 25.0, "grad_norm": 4.3919596672058105, "learning_rate": 1.9700838155993972e-05, "loss": 1.1934, "mean_token_accuracy": 0.7042354522524653, "num_tokens": 1160450.0, "step": 800 }, { "epoch": 25.32, "grad_norm": 1.5123074054718018, "learning_rate": 1.9693147968493872e-05, "loss": 1.2369, "mean_token_accuracy": 0.692409698665142, "num_tokens": 1174351.0, "step": 810 }, { "epoch": 25.64, "grad_norm": 1.278221845626831, "learning_rate": 1.9685361729716014e-05, "loss": 1.1829, "mean_token_accuracy": 0.7174848213791847, "num_tokens": 1190213.0, "step": 820 }, { "epoch": 25.96, "grad_norm": 2.224332094192505, "learning_rate": 1.967747951681575e-05, "loss": 1.2224, "mean_token_accuracy": 0.7052119519561529, "num_tokens": 1205508.0, "step": 830 }, { "epoch": 26.256, "grad_norm": 1.548086166381836, "learning_rate": 1.966950140789944e-05, "loss": 1.2254, "mean_token_accuracy": 0.7041690679820808, "num_tokens": 1219047.0, "step": 840 }, { "epoch": 26.576, "grad_norm": 2.0900254249572754, "learning_rate": 1.9661427482023718e-05, "loss": 1.1557, "mean_token_accuracy": 0.7088660508394241, "num_tokens": 1231738.0, "step": 850 }, { "epoch": 26.896, "grad_norm": 1.9919354915618896, "learning_rate": 1.965325781919467e-05, "loss": 1.1962, "mean_token_accuracy": 0.7142665989696979, "num_tokens": 1248062.0, "step": 860 }, { "epoch": 27.192, "grad_norm": 2.2158303260803223, "learning_rate": 1.9644992500367072e-05, "loss": 1.2078, "mean_token_accuracy": 0.7049629523141964, "num_tokens": 1261738.0, "step": 870 }, { "epoch": 27.512, "grad_norm": 1.830531358718872, "learning_rate": 1.9636631607443565e-05, "loss": 1.2142, "mean_token_accuracy": 0.7097026702016592, "num_tokens": 1278012.0, "step": 880 }, { "epoch": 27.832, "grad_norm": 2.0944063663482666, "learning_rate": 1.9628175223273847e-05, "loss": 1.1368, "mean_token_accuracy": 0.7265028398483991, "num_tokens": 1292725.0, "step": 890 }, { "epoch": 28.128, "grad_norm": 1.4445384740829468, "learning_rate": 1.9619623431653872e-05, "loss": 1.2329, "mean_token_accuracy": 0.6941638359346906, "num_tokens": 1305912.0, "step": 900 }, { "epoch": 28.448, "grad_norm": 2.084064245223999, "learning_rate": 1.9610976317324993e-05, "loss": 1.1324, "mean_token_accuracy": 0.7086500860750675, "num_tokens": 1320269.0, "step": 910 }, { "epoch": 28.768, "grad_norm": 1.5166538953781128, "learning_rate": 1.9602233965973145e-05, "loss": 1.215, "mean_token_accuracy": 0.7056132420897484, "num_tokens": 1336877.0, "step": 920 }, { "epoch": 29.064, "grad_norm": 1.324559211730957, "learning_rate": 1.9593396464227964e-05, "loss": 1.1762, "mean_token_accuracy": 0.7244789052653957, "num_tokens": 1349855.0, "step": 930 }, { "epoch": 29.384, "grad_norm": 1.3715434074401855, "learning_rate": 1.9584463899661975e-05, "loss": 1.1323, "mean_token_accuracy": 0.7216422040015459, "num_tokens": 1364729.0, "step": 940 }, { "epoch": 29.704, "grad_norm": 1.782844066619873, "learning_rate": 1.9575436360789687e-05, "loss": 1.2588, "mean_token_accuracy": 0.7018849883228541, "num_tokens": 1378903.0, "step": 950 }, { "epoch": 30.0, "grad_norm": 3.4414260387420654, "learning_rate": 1.9566313937066727e-05, "loss": 1.1545, "mean_token_accuracy": 0.7196269961627754, "num_tokens": 1392540.0, "step": 960 }, { "epoch": 30.32, "grad_norm": 3.570629835128784, "learning_rate": 1.9557096718888956e-05, "loss": 1.1217, "mean_token_accuracy": 0.7212486552074552, "num_tokens": 1406295.0, "step": 970 }, { "epoch": 30.64, "grad_norm": 1.5852808952331543, "learning_rate": 1.9547784797591565e-05, "loss": 1.1959, "mean_token_accuracy": 0.7164284475147724, "num_tokens": 1422592.0, "step": 980 }, { "epoch": 30.96, "grad_norm": 1.5355671644210815, "learning_rate": 1.9538378265448195e-05, "loss": 1.1813, "mean_token_accuracy": 0.710675698518753, "num_tokens": 1437502.0, "step": 990 }, { "epoch": 31.256, "grad_norm": 1.5741212368011475, "learning_rate": 1.9528877215669983e-05, "loss": 1.1143, "mean_token_accuracy": 0.7233139457734855, "num_tokens": 1452056.0, "step": 1000 }, { "epoch": 32.576, "grad_norm": 1.7357654571533203, "learning_rate": 1.8116046949409032e-05, "loss": 1.2445, "mean_token_accuracy": 0.7005614548921585, "num_tokens": 14368.0, "step": 1010 }, { "epoch": 32.896, "grad_norm": 2.1830084323883057, "learning_rate": 1.807903147537074e-05, "loss": 1.1743, "mean_token_accuracy": 0.7173698712140322, "num_tokens": 31135.0, "step": 1020 }, { "epoch": 33.224, "grad_norm": 2.0015718936920166, "learning_rate": 1.8041694488049716e-05, "loss": 1.2619, "mean_token_accuracy": 0.715624163063561, "num_tokens": 46347.0, "step": 1030 }, { "epoch": 33.544, "grad_norm": 1.7311397790908813, "learning_rate": 1.8004037473309373e-05, "loss": 1.2331, "mean_token_accuracy": 0.7106888771057129, "num_tokens": 63240.0, "step": 1040 }, { "epoch": 33.864, "grad_norm": 1.8815584182739258, "learning_rate": 1.7966061929748968e-05, "loss": 1.2194, "mean_token_accuracy": 0.7109542470425367, "num_tokens": 79655.0, "step": 1050 }, { "epoch": 34.16, "grad_norm": 1.684423565864563, "learning_rate": 1.7927769368643904e-05, "loss": 1.0667, "mean_token_accuracy": 0.7348488770626687, "num_tokens": 95318.0, "step": 1060 }, { "epoch": 34.48, "grad_norm": 1.6687734127044678, "learning_rate": 1.788916131388564e-05, "loss": 1.1796, "mean_token_accuracy": 0.7278237771242857, "num_tokens": 111213.0, "step": 1070 }, { "epoch": 34.8, "grad_norm": 1.7268950939178467, "learning_rate": 1.785023930192103e-05, "loss": 1.1723, "mean_token_accuracy": 0.7138827528804541, "num_tokens": 126943.0, "step": 1080 }, { "epoch": 35.096, "grad_norm": 1.5416665077209473, "learning_rate": 1.781100488169115e-05, "loss": 1.0286, "mean_token_accuracy": 0.7333241834028347, "num_tokens": 142073.0, "step": 1090 }, { "epoch": 35.416, "grad_norm": 1.7402383089065552, "learning_rate": 1.777145961456971e-05, "loss": 1.0884, "mean_token_accuracy": 0.7265842445194721, "num_tokens": 158263.0, "step": 1100 }, { "epoch": 35.736, "grad_norm": 1.4736402034759521, "learning_rate": 1.773160507430087e-05, "loss": 1.1012, "mean_token_accuracy": 0.727820971608162, "num_tokens": 172637.0, "step": 1110 }, { "epoch": 36.032, "grad_norm": 2.027437448501587, "learning_rate": 1.7691442846936643e-05, "loss": 1.1525, "mean_token_accuracy": 0.7281997264237017, "num_tokens": 189288.0, "step": 1120 }, { "epoch": 36.352, "grad_norm": 2.058610439300537, "learning_rate": 1.7650974530773745e-05, "loss": 1.147, "mean_token_accuracy": 0.7228171911090613, "num_tokens": 204429.0, "step": 1130 }, { "epoch": 36.672, "grad_norm": 1.47328519821167, "learning_rate": 1.7610201736290022e-05, "loss": 1.1293, "mean_token_accuracy": 0.7266111556440592, "num_tokens": 220109.0, "step": 1140 }, { "epoch": 36.992, "grad_norm": 1.4244815111160278, "learning_rate": 1.7569126086080342e-05, "loss": 1.0312, "mean_token_accuracy": 0.7415647856891155, "num_tokens": 236961.0, "step": 1150 }, { "epoch": 37.288, "grad_norm": 1.6087596416473389, "learning_rate": 1.7527749214792023e-05, "loss": 1.1148, "mean_token_accuracy": 0.722566624348228, "num_tokens": 251116.0, "step": 1160 }, { "epoch": 37.608, "grad_norm": 1.6909428834915161, "learning_rate": 1.7486072769059785e-05, "loss": 1.1283, "mean_token_accuracy": 0.7359607569873333, "num_tokens": 267570.0, "step": 1170 }, { "epoch": 37.928, "grad_norm": 1.646548867225647, "learning_rate": 1.7444098407440218e-05, "loss": 1.0572, "mean_token_accuracy": 0.7344494730234146, "num_tokens": 282974.0, "step": 1180 }, { "epoch": 38.224, "grad_norm": 1.5249629020690918, "learning_rate": 1.740182780034577e-05, "loss": 0.9779, "mean_token_accuracy": 0.7474351501142656, "num_tokens": 298664.0, "step": 1190 }, { "epoch": 38.544, "grad_norm": 1.8203458786010742, "learning_rate": 1.7359262629978286e-05, "loss": 1.044, "mean_token_accuracy": 0.7267404418438673, "num_tokens": 313932.0, "step": 1200 }, { "epoch": 38.864, "grad_norm": 1.960335612297058, "learning_rate": 1.731640459026206e-05, "loss": 1.0537, "mean_token_accuracy": 0.7449462197721004, "num_tokens": 330427.0, "step": 1210 }, { "epoch": 39.16, "grad_norm": 2.152423620223999, "learning_rate": 1.727325538677642e-05, "loss": 1.1988, "mean_token_accuracy": 0.7341888015334671, "num_tokens": 344595.0, "step": 1220 }, { "epoch": 39.48, "grad_norm": 1.9269284009933472, "learning_rate": 1.722981673668784e-05, "loss": 1.0929, "mean_token_accuracy": 0.7354621075093746, "num_tokens": 361903.0, "step": 1230 }, { "epoch": 39.8, "grad_norm": 2.627488374710083, "learning_rate": 1.7186090368681625e-05, "loss": 1.0304, "mean_token_accuracy": 0.7406851584091783, "num_tokens": 378158.0, "step": 1240 }, { "epoch": 40.096, "grad_norm": 1.340135931968689, "learning_rate": 1.714207802289311e-05, "loss": 0.9831, "mean_token_accuracy": 0.7508459779861811, "num_tokens": 393086.0, "step": 1250 }, { "epoch": 40.416, "grad_norm": 1.5764344930648804, "learning_rate": 1.7097781450838408e-05, "loss": 1.0411, "mean_token_accuracy": 0.7428241446614265, "num_tokens": 408865.0, "step": 1260 }, { "epoch": 40.736, "grad_norm": 2.27480149269104, "learning_rate": 1.7053202415344693e-05, "loss": 1.1553, "mean_token_accuracy": 0.7261891044676304, "num_tokens": 422941.0, "step": 1270 }, { "epoch": 41.032, "grad_norm": 2.0869431495666504, "learning_rate": 1.7008342690480075e-05, "loss": 1.0776, "mean_token_accuracy": 0.7442273002218556, "num_tokens": 438615.0, "step": 1280 }, { "epoch": 41.352, "grad_norm": 1.6138980388641357, "learning_rate": 1.6963204061482972e-05, "loss": 0.9933, "mean_token_accuracy": 0.7366263665258884, "num_tokens": 454742.0, "step": 1290 }, { "epoch": 41.672, "grad_norm": 2.201198101043701, "learning_rate": 1.6917788324691083e-05, "loss": 1.12, "mean_token_accuracy": 0.7349841587245465, "num_tokens": 471732.0, "step": 1300 }, { "epoch": 41.992, "grad_norm": 2.3492226600646973, "learning_rate": 1.687209728746989e-05, "loss": 1.0594, "mean_token_accuracy": 0.745047352835536, "num_tokens": 487349.0, "step": 1310 }, { "epoch": 42.288, "grad_norm": 1.7862104177474976, "learning_rate": 1.6826132768140735e-05, "loss": 0.9756, "mean_token_accuracy": 0.7570219249338717, "num_tokens": 502115.0, "step": 1320 }, { "epoch": 42.608, "grad_norm": 2.4716343879699707, "learning_rate": 1.6779896595908462e-05, "loss": 1.0208, "mean_token_accuracy": 0.7443521052598954, "num_tokens": 517825.0, "step": 1330 }, { "epoch": 42.928, "grad_norm": 2.661140203475952, "learning_rate": 1.6733390610788622e-05, "loss": 1.0313, "mean_token_accuracy": 0.7418102856725455, "num_tokens": 534561.0, "step": 1340 }, { "epoch": 43.224, "grad_norm": 1.9998219013214111, "learning_rate": 1.668661666353423e-05, "loss": 1.0699, "mean_token_accuracy": 0.7479387578126546, "num_tokens": 548327.0, "step": 1350 }, { "epoch": 43.544, "grad_norm": 2.4526405334472656, "learning_rate": 1.6639576615562143e-05, "loss": 0.9673, "mean_token_accuracy": 0.7542693041265011, "num_tokens": 565164.0, "step": 1360 }, { "epoch": 43.864, "grad_norm": 1.7199647426605225, "learning_rate": 1.6592272338878963e-05, "loss": 1.0644, "mean_token_accuracy": 0.743690374866128, "num_tokens": 580754.0, "step": 1370 }, { "epoch": 44.16, "grad_norm": 1.7065895795822144, "learning_rate": 1.6544705716006537e-05, "loss": 0.9511, "mean_token_accuracy": 0.7495483123772854, "num_tokens": 595953.0, "step": 1380 }, { "epoch": 44.48, "grad_norm": 1.5984984636306763, "learning_rate": 1.649687863990705e-05, "loss": 1.0901, "mean_token_accuracy": 0.7480962604284287, "num_tokens": 611850.0, "step": 1390 }, { "epoch": 44.8, "grad_norm": 2.719882011413574, "learning_rate": 1.644879301390769e-05, "loss": 0.9664, "mean_token_accuracy": 0.7527227349579334, "num_tokens": 627428.0, "step": 1400 }, { "epoch": 45.096, "grad_norm": 2.04146409034729, "learning_rate": 1.6400450751624897e-05, "loss": 0.9673, "mean_token_accuracy": 0.7482488421169488, "num_tokens": 641538.0, "step": 1410 }, { "epoch": 45.416, "grad_norm": 2.128373384475708, "learning_rate": 1.6351853776888214e-05, "loss": 0.9908, "mean_token_accuracy": 0.7453075967729091, "num_tokens": 658145.0, "step": 1420 }, { "epoch": 45.736, "grad_norm": 1.9842469692230225, "learning_rate": 1.630300402366373e-05, "loss": 1.0387, "mean_token_accuracy": 0.7478526467457414, "num_tokens": 675926.0, "step": 1430 }, { "epoch": 46.032, "grad_norm": 1.5507521629333496, "learning_rate": 1.6253903435977103e-05, "loss": 0.959, "mean_token_accuracy": 0.7599469971012425, "num_tokens": 689837.0, "step": 1440 }, { "epoch": 46.352, "grad_norm": 2.250763416290283, "learning_rate": 1.6204553967836216e-05, "loss": 1.0544, "mean_token_accuracy": 0.7468490976840257, "num_tokens": 705912.0, "step": 1450 }, { "epoch": 46.672, "grad_norm": 1.7809251546859741, "learning_rate": 1.6154957583153388e-05, "loss": 1.034, "mean_token_accuracy": 0.7534692898392678, "num_tokens": 722631.0, "step": 1460 }, { "epoch": 46.992, "grad_norm": 2.4856886863708496, "learning_rate": 1.6105116255667246e-05, "loss": 0.9083, "mean_token_accuracy": 0.7516257427632809, "num_tokens": 737649.0, "step": 1470 }, { "epoch": 47.288, "grad_norm": 2.1131696701049805, "learning_rate": 1.605503196886416e-05, "loss": 0.9908, "mean_token_accuracy": 0.7506888621562237, "num_tokens": 754612.0, "step": 1480 }, { "epoch": 47.608, "grad_norm": 1.3065401315689087, "learning_rate": 1.600470671589931e-05, "loss": 0.9346, "mean_token_accuracy": 0.757453129440546, "num_tokens": 771515.0, "step": 1490 }, { "epoch": 47.928, "grad_norm": 2.0020365715026855, "learning_rate": 1.5954142499517377e-05, "loss": 1.0396, "mean_token_accuracy": 0.7508561560884118, "num_tokens": 785665.0, "step": 1500 }, { "epoch": 48.224, "grad_norm": 1.8463741540908813, "learning_rate": 1.5903341331972832e-05, "loss": 0.9151, "mean_token_accuracy": 0.7590098671011023, "num_tokens": 799293.0, "step": 1510 }, { "epoch": 48.544, "grad_norm": 1.851616382598877, "learning_rate": 1.585230523494985e-05, "loss": 0.9102, "mean_token_accuracy": 0.7564024582505227, "num_tokens": 813555.0, "step": 1520 }, { "epoch": 48.864, "grad_norm": 1.4981343746185303, "learning_rate": 1.580103623948188e-05, "loss": 1.0654, "mean_token_accuracy": 0.748985405266285, "num_tokens": 831868.0, "step": 1530 }, { "epoch": 49.16, "grad_norm": 1.8819829225540161, "learning_rate": 1.574953638587079e-05, "loss": 0.993, "mean_token_accuracy": 0.7556418059645472, "num_tokens": 846798.0, "step": 1540 }, { "epoch": 49.48, "grad_norm": 2.24092960357666, "learning_rate": 1.569780772360568e-05, "loss": 0.9818, "mean_token_accuracy": 0.7535504069179296, "num_tokens": 862063.0, "step": 1550 }, { "epoch": 49.8, "grad_norm": 1.7873568534851074, "learning_rate": 1.5645852311281343e-05, "loss": 1.0086, "mean_token_accuracy": 0.7555014498531818, "num_tokens": 878215.0, "step": 1560 }, { "epoch": 50.096, "grad_norm": 2.5300111770629883, "learning_rate": 1.559367221651629e-05, "loss": 0.8826, "mean_token_accuracy": 0.7630251637987189, "num_tokens": 893320.0, "step": 1570 }, { "epoch": 50.416, "grad_norm": 1.9504714012145996, "learning_rate": 1.554126951587053e-05, "loss": 0.9572, "mean_token_accuracy": 0.7577113211154938, "num_tokens": 908230.0, "step": 1580 }, { "epoch": 50.736, "grad_norm": 1.8482609987258911, "learning_rate": 1.548864629476288e-05, "loss": 0.9715, "mean_token_accuracy": 0.7632556769996881, "num_tokens": 925533.0, "step": 1590 }, { "epoch": 51.032, "grad_norm": 1.7342660427093506, "learning_rate": 1.5435804647388003e-05, "loss": 1.0049, "mean_token_accuracy": 0.753706334410487, "num_tokens": 940557.0, "step": 1600 }, { "epoch": 51.352, "grad_norm": 1.7231630086898804, "learning_rate": 1.5382746676633053e-05, "loss": 0.9577, "mean_token_accuracy": 0.7602146591991186, "num_tokens": 955898.0, "step": 1610 }, { "epoch": 51.672, "grad_norm": 1.9401224851608276, "learning_rate": 1.5329474493993984e-05, "loss": 0.9607, "mean_token_accuracy": 0.7621455781161786, "num_tokens": 972435.0, "step": 1620 }, { "epoch": 51.992, "grad_norm": 2.089966297149658, "learning_rate": 1.5275990219491553e-05, "loss": 0.9482, "mean_token_accuracy": 0.762396826967597, "num_tokens": 988434.0, "step": 1630 }, { "epoch": 52.288, "grad_norm": 1.7538946866989136, "learning_rate": 1.522229598158691e-05, "loss": 0.9943, "mean_token_accuracy": 0.7541912862577954, "num_tokens": 1001536.0, "step": 1640 }, { "epoch": 52.608, "grad_norm": 1.8982934951782227, "learning_rate": 1.5168393917096917e-05, "loss": 0.9258, "mean_token_accuracy": 0.7704043008387089, "num_tokens": 1018633.0, "step": 1650 }, { "epoch": 52.928, "grad_norm": 2.195676803588867, "learning_rate": 1.5114286171109109e-05, "loss": 0.9363, "mean_token_accuracy": 0.7571658097207546, "num_tokens": 1035378.0, "step": 1660 }, { "epoch": 53.224, "grad_norm": 1.6514253616333008, "learning_rate": 1.5059974896896324e-05, "loss": 1.0186, "mean_token_accuracy": 0.7523992178407876, "num_tokens": 1050349.0, "step": 1670 }, { "epoch": 53.544, "grad_norm": 2.6811511516571045, "learning_rate": 1.5005462255831014e-05, "loss": 1.0254, "mean_token_accuracy": 0.7484087854623794, "num_tokens": 1064873.0, "step": 1680 }, { "epoch": 53.864, "grad_norm": 2.0554141998291016, "learning_rate": 1.4950750417299227e-05, "loss": 0.878, "mean_token_accuracy": 0.7748427361249923, "num_tokens": 1082105.0, "step": 1690 }, { "epoch": 54.16, "grad_norm": 1.614017128944397, "learning_rate": 1.489584155861428e-05, "loss": 0.9688, "mean_token_accuracy": 0.7686513742885074, "num_tokens": 1097886.0, "step": 1700 }, { "epoch": 54.48, "grad_norm": 1.7928838729858398, "learning_rate": 1.4840737864930106e-05, "loss": 0.8874, "mean_token_accuracy": 0.7716922122985125, "num_tokens": 1112624.0, "step": 1710 }, { "epoch": 54.8, "grad_norm": 1.688085675239563, "learning_rate": 1.4785441529154294e-05, "loss": 0.9361, "mean_token_accuracy": 0.767570473998785, "num_tokens": 1129549.0, "step": 1720 }, { "epoch": 55.096, "grad_norm": 1.3455687761306763, "learning_rate": 1.4729954751860827e-05, "loss": 1.0524, "mean_token_accuracy": 0.7470491971518542, "num_tokens": 1145039.0, "step": 1730 }, { "epoch": 55.416, "grad_norm": 1.7406009435653687, "learning_rate": 1.4674279741202495e-05, "loss": 0.8839, "mean_token_accuracy": 0.7727594949305058, "num_tokens": 1159810.0, "step": 1740 }, { "epoch": 55.736, "grad_norm": 2.1520540714263916, "learning_rate": 1.4618418712823028e-05, "loss": 0.9652, "mean_token_accuracy": 0.7532628539949655, "num_tokens": 1176245.0, "step": 1750 }, { "epoch": 56.032, "grad_norm": 1.581739902496338, "learning_rate": 1.4562373889768927e-05, "loss": 0.9332, "mean_token_accuracy": 0.7696672396079914, "num_tokens": 1191008.0, "step": 1760 }, { "epoch": 56.352, "grad_norm": 1.6474453210830688, "learning_rate": 1.4506147502400977e-05, "loss": 0.8376, "mean_token_accuracy": 0.772033654898405, "num_tokens": 1205755.0, "step": 1770 }, { "epoch": 56.672, "grad_norm": 1.8299458026885986, "learning_rate": 1.4449741788305514e-05, "loss": 0.9889, "mean_token_accuracy": 0.760890544205904, "num_tokens": 1221863.0, "step": 1780 }, { "epoch": 56.992, "grad_norm": 1.6759440898895264, "learning_rate": 1.4393158992205348e-05, "loss": 0.9799, "mean_token_accuracy": 0.7623420935124159, "num_tokens": 1238647.0, "step": 1790 }, { "epoch": 57.288, "grad_norm": 2.1239564418792725, "learning_rate": 1.4336401365870466e-05, "loss": 0.9944, "mean_token_accuracy": 0.7618524045557589, "num_tokens": 1253030.0, "step": 1800 }, { "epoch": 57.608, "grad_norm": 2.75298810005188, "learning_rate": 1.4279471168028382e-05, "loss": 0.9822, "mean_token_accuracy": 0.7654153741896152, "num_tokens": 1269147.0, "step": 1810 }, { "epoch": 57.928, "grad_norm": 1.8775372505187988, "learning_rate": 1.422237066427429e-05, "loss": 0.8866, "mean_token_accuracy": 0.7653848383575678, "num_tokens": 1285368.0, "step": 1820 }, { "epoch": 58.224, "grad_norm": 1.6810104846954346, "learning_rate": 1.416510212698086e-05, "loss": 0.9072, "mean_token_accuracy": 0.7690872151303936, "num_tokens": 1300660.0, "step": 1830 }, { "epoch": 58.544, "grad_norm": 1.914070725440979, "learning_rate": 1.4107667835207844e-05, "loss": 1.0272, "mean_token_accuracy": 0.7550359651446342, "num_tokens": 1317143.0, "step": 1840 }, { "epoch": 58.864, "grad_norm": 2.164189338684082, "learning_rate": 1.4050070074611355e-05, "loss": 0.9304, "mean_token_accuracy": 0.7650556772947311, "num_tokens": 1332705.0, "step": 1850 }, { "epoch": 59.16, "grad_norm": 2.7804877758026123, "learning_rate": 1.3992311137352918e-05, "loss": 0.8424, "mean_token_accuracy": 0.7625659327652003, "num_tokens": 1345993.0, "step": 1860 }, { "epoch": 59.48, "grad_norm": 1.7922106981277466, "learning_rate": 1.3934393322008241e-05, "loss": 0.8732, "mean_token_accuracy": 0.7774093203246594, "num_tokens": 1362688.0, "step": 1870 }, { "epoch": 59.8, "grad_norm": 1.39845609664917, "learning_rate": 1.387631893347575e-05, "loss": 0.8986, "mean_token_accuracy": 0.7775574192404747, "num_tokens": 1379021.0, "step": 1880 }, { "epoch": 60.096, "grad_norm": 2.3520147800445557, "learning_rate": 1.3818090282884869e-05, "loss": 0.9055, "mean_token_accuracy": 0.7666742781529555, "num_tokens": 1394388.0, "step": 1890 }, { "epoch": 60.416, "grad_norm": 1.9911949634552002, "learning_rate": 1.3759709687504022e-05, "loss": 0.9495, "mean_token_accuracy": 0.7690058574080467, "num_tokens": 1410943.0, "step": 1900 }, { "epoch": 60.736, "grad_norm": 2.0429329872131348, "learning_rate": 1.3701179470648444e-05, "loss": 0.9081, "mean_token_accuracy": 0.764681476354599, "num_tokens": 1428993.0, "step": 1910 }, { "epoch": 61.032, "grad_norm": 1.7519456148147583, "learning_rate": 1.36425019615877e-05, "loss": 0.9026, "mean_token_accuracy": 0.7673927166977444, "num_tokens": 1441530.0, "step": 1920 }, { "epoch": 61.352, "grad_norm": 2.105077028274536, "learning_rate": 1.3583679495453e-05, "loss": 0.8834, "mean_token_accuracy": 0.7748925991356372, "num_tokens": 1459071.0, "step": 1930 }, { "epoch": 61.672, "grad_norm": 1.9322600364685059, "learning_rate": 1.3524714413144282e-05, "loss": 0.91, "mean_token_accuracy": 0.7671246759593486, "num_tokens": 1474214.0, "step": 1940 }, { "epoch": 61.992, "grad_norm": 2.1808035373687744, "learning_rate": 1.346560906123702e-05, "loss": 0.915, "mean_token_accuracy": 0.7675775479525327, "num_tokens": 1489457.0, "step": 1950 }, { "epoch": 62.288, "grad_norm": 1.765626311302185, "learning_rate": 1.3406365791888865e-05, "loss": 1.0076, "mean_token_accuracy": 0.7589444365050342, "num_tokens": 1504842.0, "step": 1960 }, { "epoch": 62.608, "grad_norm": 2.268444061279297, "learning_rate": 1.3346986962746038e-05, "loss": 0.8381, "mean_token_accuracy": 0.780813368782401, "num_tokens": 1519276.0, "step": 1970 }, { "epoch": 62.928, "grad_norm": 1.4477503299713135, "learning_rate": 1.32874749368495e-05, "loss": 0.925, "mean_token_accuracy": 0.7658030860126018, "num_tokens": 1535446.0, "step": 1980 }, { "epoch": 63.224, "grad_norm": 2.732478618621826, "learning_rate": 1.3227832082540908e-05, "loss": 0.9051, "mean_token_accuracy": 0.7650254467451895, "num_tokens": 1550674.0, "step": 1990 }, { "epoch": 63.544, "grad_norm": 2.2961671352386475, "learning_rate": 1.3168060773368375e-05, "loss": 0.9873, "mean_token_accuracy": 0.7675742536783219, "num_tokens": 1564485.0, "step": 2000 }, { "epoch": 64.832, "grad_norm": 2.223515272140503, "learning_rate": 1.3108163387991993e-05, "loss": 0.8791, "mean_token_accuracy": 0.7629961850121617, "num_tokens": 16127.0, "step": 2010 }, { "epoch": 65.16, "grad_norm": 2.646225690841675, "learning_rate": 1.30481423100892e-05, "loss": 0.9661, "mean_token_accuracy": 0.768963757811523, "num_tokens": 31851.0, "step": 2020 }, { "epoch": 65.48, "grad_norm": 2.350883722305298, "learning_rate": 1.2987999928259897e-05, "loss": 0.9412, "mean_token_accuracy": 0.7736104667186737, "num_tokens": 47921.0, "step": 2030 }, { "epoch": 65.8, "grad_norm": 1.8255304098129272, "learning_rate": 1.2927738635931402e-05, "loss": 0.9436, "mean_token_accuracy": 0.7672818608582019, "num_tokens": 64830.0, "step": 2040 }, { "epoch": 66.096, "grad_norm": 1.562624454498291, "learning_rate": 1.2867360831263191e-05, "loss": 0.8713, "mean_token_accuracy": 0.7805772717740085, "num_tokens": 79811.0, "step": 2050 }, { "epoch": 66.416, "grad_norm": 2.139047145843506, "learning_rate": 1.280686891705147e-05, "loss": 0.9338, "mean_token_accuracy": 0.7658140640705824, "num_tokens": 95253.0, "step": 2060 }, { "epoch": 66.736, "grad_norm": 1.71339750289917, "learning_rate": 1.2746265300633556e-05, "loss": 0.8785, "mean_token_accuracy": 0.7795989379286766, "num_tokens": 110479.0, "step": 2070 }, { "epoch": 67.032, "grad_norm": 2.284088134765625, "learning_rate": 1.268555239379206e-05, "loss": 0.9184, "mean_token_accuracy": 0.7648406020692877, "num_tokens": 125681.0, "step": 2080 }, { "epoch": 67.352, "grad_norm": 1.8913801908493042, "learning_rate": 1.2624732612658923e-05, "loss": 0.8725, "mean_token_accuracy": 0.7707512844353914, "num_tokens": 141796.0, "step": 2090 }, { "epoch": 67.672, "grad_norm": 1.9446955919265747, "learning_rate": 1.2563808377619253e-05, "loss": 1.0064, "mean_token_accuracy": 0.7678989730775356, "num_tokens": 158724.0, "step": 2100 }, { "epoch": 67.992, "grad_norm": 2.2045912742614746, "learning_rate": 1.250278211321501e-05, "loss": 0.7989, "mean_token_accuracy": 0.7780600219964982, "num_tokens": 173629.0, "step": 2110 }, { "epoch": 68.288, "grad_norm": 1.9509689807891846, "learning_rate": 1.244165624804852e-05, "loss": 0.8634, "mean_token_accuracy": 0.7794965231740797, "num_tokens": 188518.0, "step": 2120 }, { "epoch": 68.608, "grad_norm": 2.1073553562164307, "learning_rate": 1.2380433214685813e-05, "loss": 0.8981, "mean_token_accuracy": 0.7774934440851211, "num_tokens": 205654.0, "step": 2130 }, { "epoch": 68.928, "grad_norm": 1.652787208557129, "learning_rate": 1.2319115449559835e-05, "loss": 0.8801, "mean_token_accuracy": 0.7723641652613878, "num_tokens": 220311.0, "step": 2140 }, { "epoch": 69.224, "grad_norm": 2.534707546234131, "learning_rate": 1.2257705392873476e-05, "loss": 0.8723, "mean_token_accuracy": 0.7854163485604364, "num_tokens": 236282.0, "step": 2150 }, { "epoch": 69.544, "grad_norm": 1.578347086906433, "learning_rate": 1.2196205488502463e-05, "loss": 0.8169, "mean_token_accuracy": 0.7866261303424835, "num_tokens": 252837.0, "step": 2160 }, { "epoch": 69.864, "grad_norm": 2.228119373321533, "learning_rate": 1.2134618183898105e-05, "loss": 0.9254, "mean_token_accuracy": 0.7748822212219239, "num_tokens": 267785.0, "step": 2170 }, { "epoch": 70.16, "grad_norm": 2.410616159439087, "learning_rate": 1.2072945929989888e-05, "loss": 0.8046, "mean_token_accuracy": 0.7787431329488754, "num_tokens": 281535.0, "step": 2180 }, { "epoch": 70.48, "grad_norm": 1.7590594291687012, "learning_rate": 1.201119118108794e-05, "loss": 0.8912, "mean_token_accuracy": 0.7787077182903885, "num_tokens": 298775.0, "step": 2190 }, { "epoch": 70.8, "grad_norm": 3.3293755054473877, "learning_rate": 1.1949356394785373e-05, "loss": 0.9112, "mean_token_accuracy": 0.7765318274497985, "num_tokens": 314484.0, "step": 2200 }, { "epoch": 71.096, "grad_norm": 2.363255739212036, "learning_rate": 1.1887444031860456e-05, "loss": 0.9063, "mean_token_accuracy": 0.776000738546655, "num_tokens": 327608.0, "step": 2210 }, { "epoch": 71.416, "grad_norm": 1.7942370176315308, "learning_rate": 1.1825456556178705e-05, "loss": 0.8095, "mean_token_accuracy": 0.7899976089596749, "num_tokens": 345798.0, "step": 2220 }, { "epoch": 71.736, "grad_norm": 1.9774558544158936, "learning_rate": 1.1763396434594823e-05, "loss": 0.9154, "mean_token_accuracy": 0.7691428020596505, "num_tokens": 361462.0, "step": 2230 }, { "epoch": 72.032, "grad_norm": 1.6556707620620728, "learning_rate": 1.1701266136854532e-05, "loss": 0.8829, "mean_token_accuracy": 0.7704721173724612, "num_tokens": 376304.0, "step": 2240 }, { "epoch": 72.352, "grad_norm": 2.80587100982666, "learning_rate": 1.1639068135496285e-05, "loss": 0.9485, "mean_token_accuracy": 0.7709558174014092, "num_tokens": 390379.0, "step": 2250 }, { "epoch": 72.672, "grad_norm": 2.0841872692108154, "learning_rate": 1.1576804905752873e-05, "loss": 0.9589, "mean_token_accuracy": 0.7605574566870927, "num_tokens": 408237.0, "step": 2260 }, { "epoch": 72.992, "grad_norm": 2.8403215408325195, "learning_rate": 1.1514478925452905e-05, "loss": 0.7252, "mean_token_accuracy": 0.7972878247499466, "num_tokens": 423763.0, "step": 2270 }, { "epoch": 73.288, "grad_norm": 6.484622955322266, "learning_rate": 1.1452092674922224e-05, "loss": 0.9519, "mean_token_accuracy": 0.7691420135465828, "num_tokens": 437835.0, "step": 2280 }, { "epoch": 73.608, "grad_norm": 2.27260160446167, "learning_rate": 1.1389648636885186e-05, "loss": 0.8394, "mean_token_accuracy": 0.7912575013935566, "num_tokens": 455397.0, "step": 2290 }, { "epoch": 73.928, "grad_norm": 1.805159091949463, "learning_rate": 1.132714929636586e-05, "loss": 0.8545, "mean_token_accuracy": 0.7838539175689221, "num_tokens": 471371.0, "step": 2300 }, { "epoch": 74.224, "grad_norm": 2.250121593475342, "learning_rate": 1.1264597140589127e-05, "loss": 0.8243, "mean_token_accuracy": 0.7824344949142353, "num_tokens": 486629.0, "step": 2310 }, { "epoch": 74.544, "grad_norm": 3.2095444202423096, "learning_rate": 1.120199465888171e-05, "loss": 0.8556, "mean_token_accuracy": 0.7762523703277111, "num_tokens": 501461.0, "step": 2320 }, { "epoch": 74.864, "grad_norm": 2.3047547340393066, "learning_rate": 1.1139344342573106e-05, "loss": 0.8754, "mean_token_accuracy": 0.7786926485598087, "num_tokens": 516976.0, "step": 2330 }, { "epoch": 75.16, "grad_norm": 2.0419108867645264, "learning_rate": 1.1076648684896441e-05, "loss": 0.8008, "mean_token_accuracy": 0.7848166005836951, "num_tokens": 532021.0, "step": 2340 }, { "epoch": 75.48, "grad_norm": 2.602372646331787, "learning_rate": 1.101391018088923e-05, "loss": 0.9487, "mean_token_accuracy": 0.7746396526694298, "num_tokens": 546596.0, "step": 2350 }, { "epoch": 75.8, "grad_norm": 1.9730421304702759, "learning_rate": 1.0951131327294123e-05, "loss": 0.8744, "mean_token_accuracy": 0.7984356313943863, "num_tokens": 563545.0, "step": 2360 }, { "epoch": 76.096, "grad_norm": 2.331416130065918, "learning_rate": 1.0888314622459509e-05, "loss": 0.8102, "mean_token_accuracy": 0.7831854063111383, "num_tokens": 578977.0, "step": 2370 }, { "epoch": 76.416, "grad_norm": 2.8027427196502686, "learning_rate": 1.082546256624011e-05, "loss": 0.8598, "mean_token_accuracy": 0.7751214955002069, "num_tokens": 594479.0, "step": 2380 }, { "epoch": 76.736, "grad_norm": 1.8376470804214478, "learning_rate": 1.0762577659897495e-05, "loss": 0.8722, "mean_token_accuracy": 0.7737262137234211, "num_tokens": 611581.0, "step": 2390 }, { "epoch": 77.032, "grad_norm": 2.3731982707977295, "learning_rate": 1.0699662406000533e-05, "loss": 0.8581, "mean_token_accuracy": 0.7886427938938141, "num_tokens": 626188.0, "step": 2400 }, { "epoch": 77.352, "grad_norm": 1.711204171180725, "learning_rate": 1.0636719308325803e-05, "loss": 0.9216, "mean_token_accuracy": 0.7730351705104113, "num_tokens": 643408.0, "step": 2410 }, { "epoch": 77.672, "grad_norm": 1.7660971879959106, "learning_rate": 1.0573750871757965e-05, "loss": 0.7626, "mean_token_accuracy": 0.7915604203939438, "num_tokens": 657604.0, "step": 2420 }, { "epoch": 77.992, "grad_norm": 2.0509514808654785, "learning_rate": 1.0510759602190055e-05, "loss": 0.8603, "mean_token_accuracy": 0.784786606580019, "num_tokens": 674373.0, "step": 2430 }, { "epoch": 78.288, "grad_norm": 2.348026752471924, "learning_rate": 1.0447748006423775e-05, "loss": 0.8823, "mean_token_accuracy": 0.7760254515183939, "num_tokens": 690196.0, "step": 2440 }, { "epoch": 78.608, "grad_norm": 2.094943046569824, "learning_rate": 1.0384718592069733e-05, "loss": 0.8474, "mean_token_accuracy": 0.7716075176373124, "num_tokens": 706149.0, "step": 2450 }, { "epoch": 78.928, "grad_norm": 2.465407609939575, "learning_rate": 1.0321673867447642e-05, "loss": 0.8644, "mean_token_accuracy": 0.786153320223093, "num_tokens": 721536.0, "step": 2460 }, { "epoch": 79.224, "grad_norm": 2.3234193325042725, "learning_rate": 1.0258616341486505e-05, "loss": 0.9199, "mean_token_accuracy": 0.7744305520444303, "num_tokens": 737605.0, "step": 2470 }, { "epoch": 79.544, "grad_norm": 1.9042166471481323, "learning_rate": 1.019554852362476e-05, "loss": 0.8054, "mean_token_accuracy": 0.7926479011774064, "num_tokens": 753913.0, "step": 2480 }, { "epoch": 79.864, "grad_norm": 2.5160131454467773, "learning_rate": 1.0132472923710437e-05, "loss": 0.8329, "mean_token_accuracy": 0.7762512426823378, "num_tokens": 769204.0, "step": 2490 }, { "epoch": 80.16, "grad_norm": 2.8922526836395264, "learning_rate": 1.0069392051901241e-05, "loss": 0.8492, "mean_token_accuracy": 0.7814656487993292, "num_tokens": 784216.0, "step": 2500 }, { "epoch": 80.48, "grad_norm": 2.763730049133301, "learning_rate": 1.0006308418564697e-05, "loss": 0.8454, "mean_token_accuracy": 0.7843520522117615, "num_tokens": 800421.0, "step": 2510 }, { "epoch": 80.8, "grad_norm": 2.41654372215271, "learning_rate": 9.94322453417821e-06, "loss": 0.7333, "mean_token_accuracy": 0.8014784809201956, "num_tokens": 815977.0, "step": 2520 }, { "epoch": 81.096, "grad_norm": 2.7866134643554688, "learning_rate": 9.880142909229188e-06, "loss": 0.8167, "mean_token_accuracy": 0.7932786933473639, "num_tokens": 829350.0, "step": 2530 }, { "epoch": 81.416, "grad_norm": 1.8219573497772217, "learning_rate": 9.817066054115117e-06, "loss": 0.8731, "mean_token_accuracy": 0.7871743485331535, "num_tokens": 845363.0, "step": 2540 }, { "epoch": 81.736, "grad_norm": 1.9417917728424072, "learning_rate": 9.753996479043672e-06, "loss": 0.8828, "mean_token_accuracy": 0.7732684839516878, "num_tokens": 862288.0, "step": 2550 }, { "epoch": 82.032, "grad_norm": 1.784688949584961, "learning_rate": 9.690936693932793e-06, "loss": 0.8167, "mean_token_accuracy": 0.791683446716618, "num_tokens": 876947.0, "step": 2560 }, { "epoch": 82.352, "grad_norm": 2.9179623126983643, "learning_rate": 9.627889208310831e-06, "loss": 0.8395, "mean_token_accuracy": 0.7839712589979172, "num_tokens": 891614.0, "step": 2570 }, { "epoch": 82.672, "grad_norm": 1.7839528322219849, "learning_rate": 9.564856531216666e-06, "loss": 0.8015, "mean_token_accuracy": 0.8007228754460811, "num_tokens": 909761.0, "step": 2580 }, { "epoch": 82.992, "grad_norm": 2.202512741088867, "learning_rate": 9.50184117109986e-06, "loss": 0.8684, "mean_token_accuracy": 0.7817719358950853, "num_tokens": 925239.0, "step": 2590 }, { "epoch": 83.288, "grad_norm": 2.0895087718963623, "learning_rate": 9.438845635720817e-06, "loss": 0.8603, "mean_token_accuracy": 0.7858564757012032, "num_tokens": 940941.0, "step": 2600 }, { "epoch": 84.192, "grad_norm": 1.9542677402496338, "learning_rate": 9.375872432051006e-06, "loss": 0.9471, "mean_token_accuracy": 0.780240989312893, "num_tokens": 17808.0, "step": 2610 }, { "epoch": 84.512, "grad_norm": 2.257493257522583, "learning_rate": 9.312924066173178e-06, "loss": 0.8627, "mean_token_accuracy": 0.7828688979148865, "num_tokens": 33397.0, "step": 2620 }, { "epoch": 84.832, "grad_norm": 1.9773480892181396, "learning_rate": 9.25000304318164e-06, "loss": 0.8219, "mean_token_accuracy": 0.7853579100221395, "num_tokens": 49526.0, "step": 2630 }, { "epoch": 85.128, "grad_norm": 2.3270950317382812, "learning_rate": 9.187111867082568e-06, "loss": 0.8709, "mean_token_accuracy": 0.7843060026297698, "num_tokens": 64196.0, "step": 2640 }, { "epoch": 85.448, "grad_norm": 1.751816749572754, "learning_rate": 9.124253040694334e-06, "loss": 0.8058, "mean_token_accuracy": 0.7842564310878515, "num_tokens": 82640.0, "step": 2650 }, { "epoch": 85.768, "grad_norm": 2.2109670639038086, "learning_rate": 9.061429065547933e-06, "loss": 0.7968, "mean_token_accuracy": 0.7830525517463685, "num_tokens": 96760.0, "step": 2660 }, { "epoch": 86.064, "grad_norm": 1.9558287858963013, "learning_rate": 8.998642441787417e-06, "loss": 0.7927, "mean_token_accuracy": 0.7956994892777624, "num_tokens": 110140.0, "step": 2670 }, { "epoch": 86.384, "grad_norm": 2.14022159576416, "learning_rate": 8.935895668070405e-06, "loss": 0.8324, "mean_token_accuracy": 0.7832688026130199, "num_tokens": 125468.0, "step": 2680 }, { "epoch": 86.704, "grad_norm": 2.9838459491729736, "learning_rate": 8.873191241468631e-06, "loss": 0.8433, "mean_token_accuracy": 0.7785748850554228, "num_tokens": 141001.0, "step": 2690 }, { "epoch": 87.0, "grad_norm": 3.1861932277679443, "learning_rate": 8.810531657368594e-06, "loss": 0.8058, "mean_token_accuracy": 0.794649675891206, "num_tokens": 156865.0, "step": 2700 }, { "epoch": 87.32, "grad_norm": 1.9827327728271484, "learning_rate": 8.747919409372236e-06, "loss": 0.8459, "mean_token_accuracy": 0.7892976485192775, "num_tokens": 173221.0, "step": 2710 }, { "epoch": 87.64, "grad_norm": 2.972670078277588, "learning_rate": 8.685356989197717e-06, "loss": 0.8232, "mean_token_accuracy": 0.7798152294009923, "num_tokens": 188954.0, "step": 2720 }, { "epoch": 87.96, "grad_norm": 2.67842173576355, "learning_rate": 8.62284688658023e-06, "loss": 0.8046, "mean_token_accuracy": 0.79255036637187, "num_tokens": 205013.0, "step": 2730 }, { "epoch": 88.256, "grad_norm": 1.817650556564331, "learning_rate": 8.56039158917296e-06, "loss": 0.8276, "mean_token_accuracy": 0.7901485256246619, "num_tokens": 219723.0, "step": 2740 }, { "epoch": 88.576, "grad_norm": 1.7845501899719238, "learning_rate": 8.497993582448044e-06, "loss": 0.8554, "mean_token_accuracy": 0.7930382348597049, "num_tokens": 235112.0, "step": 2750 }, { "epoch": 88.896, "grad_norm": 2.3108439445495605, "learning_rate": 8.43565534959769e-06, "loss": 0.8519, "mean_token_accuracy": 0.7890813775360584, "num_tokens": 252361.0, "step": 2760 }, { "epoch": 89.192, "grad_norm": 1.8335771560668945, "learning_rate": 8.373379371435346e-06, "loss": 0.7812, "mean_token_accuracy": 0.7960183253964862, "num_tokens": 265617.0, "step": 2770 }, { "epoch": 89.512, "grad_norm": 2.717653512954712, "learning_rate": 8.31116812629696e-06, "loss": 0.8319, "mean_token_accuracy": 0.7901519671082496, "num_tokens": 281770.0, "step": 2780 }, { "epoch": 89.832, "grad_norm": 1.9572986364364624, "learning_rate": 8.249024089942364e-06, "loss": 0.7733, "mean_token_accuracy": 0.7931222733110189, "num_tokens": 298511.0, "step": 2790 }, { "epoch": 90.128, "grad_norm": 1.8655132055282593, "learning_rate": 8.186949735456758e-06, "loss": 0.9238, "mean_token_accuracy": 0.7922740490049929, "num_tokens": 312957.0, "step": 2800 }, { "epoch": 90.448, "grad_norm": 2.0918149948120117, "learning_rate": 8.12494753315228e-06, "loss": 0.8428, "mean_token_accuracy": 0.7884520322084427, "num_tokens": 330412.0, "step": 2810 }, { "epoch": 90.768, "grad_norm": 1.7944889068603516, "learning_rate": 8.063019950469688e-06, "loss": 0.8145, "mean_token_accuracy": 0.7932636447250843, "num_tokens": 345474.0, "step": 2820 }, { "epoch": 91.064, "grad_norm": 1.7774523496627808, "learning_rate": 8.001169451880186e-06, "loss": 0.7867, "mean_token_accuracy": 0.7842674186906299, "num_tokens": 360670.0, "step": 2830 }, { "epoch": 91.384, "grad_norm": 2.441330909729004, "learning_rate": 7.939398498787332e-06, "loss": 0.835, "mean_token_accuracy": 0.7940668806433677, "num_tokens": 375578.0, "step": 2840 }, { "epoch": 91.704, "grad_norm": 1.986222505569458, "learning_rate": 7.877709549429092e-06, "loss": 0.8162, "mean_token_accuracy": 0.7950244933366776, "num_tokens": 392683.0, "step": 2850 }, { "epoch": 92.0, "grad_norm": 4.878885269165039, "learning_rate": 7.816105058780019e-06, "loss": 0.788, "mean_token_accuracy": 0.7837782482037673, "num_tokens": 407330.0, "step": 2860 }, { "epoch": 92.32, "grad_norm": 2.343815326690674, "learning_rate": 7.754587478453528e-06, "loss": 0.7753, "mean_token_accuracy": 0.7878943778574466, "num_tokens": 420579.0, "step": 2870 }, { "epoch": 92.64, "grad_norm": 2.5471577644348145, "learning_rate": 7.69315925660436e-06, "loss": 0.88, "mean_token_accuracy": 0.791867159307003, "num_tokens": 438517.0, "step": 2880 }, { "epoch": 92.96, "grad_norm": 2.2550160884857178, "learning_rate": 7.631822837831143e-06, "loss": 0.8228, "mean_token_accuracy": 0.7879139900207519, "num_tokens": 455707.0, "step": 2890 }, { "epoch": 93.256, "grad_norm": 2.0642154216766357, "learning_rate": 7.570580663079114e-06, "loss": 0.8605, "mean_token_accuracy": 0.7856367556629954, "num_tokens": 469780.0, "step": 2900 }, { "epoch": 93.576, "grad_norm": 2.1604714393615723, "learning_rate": 7.509435169542961e-06, "loss": 0.7849, "mean_token_accuracy": 0.7887919537723065, "num_tokens": 484586.0, "step": 2910 }, { "epoch": 93.896, "grad_norm": 2.2268590927124023, "learning_rate": 7.448388790569851e-06, "loss": 0.8657, "mean_token_accuracy": 0.7843763899058104, "num_tokens": 502557.0, "step": 2920 }, { "epoch": 94.192, "grad_norm": 1.8110442161560059, "learning_rate": 7.387443955562586e-06, "loss": 0.7889, "mean_token_accuracy": 0.7898652823390188, "num_tokens": 516331.0, "step": 2930 }, { "epoch": 94.512, "grad_norm": 2.456662178039551, "learning_rate": 7.326603089882925e-06, "loss": 0.7788, "mean_token_accuracy": 0.7980688564479351, "num_tokens": 532511.0, "step": 2940 }, { "epoch": 94.832, "grad_norm": 2.060681104660034, "learning_rate": 7.26586861475506e-06, "loss": 0.7585, "mean_token_accuracy": 0.7954543896019459, "num_tokens": 549222.0, "step": 2950 }, { "epoch": 95.128, "grad_norm": 2.5429089069366455, "learning_rate": 7.205242947169258e-06, "loss": 0.8637, "mean_token_accuracy": 0.7921945170776264, "num_tokens": 563980.0, "step": 2960 }, { "epoch": 95.448, "grad_norm": 2.3039979934692383, "learning_rate": 7.144728499785683e-06, "loss": 0.7492, "mean_token_accuracy": 0.801618828624487, "num_tokens": 579326.0, "step": 2970 }, { "epoch": 95.768, "grad_norm": 1.8464511632919312, "learning_rate": 7.0843276808383785e-06, "loss": 0.8439, "mean_token_accuracy": 0.7837361056357622, "num_tokens": 596726.0, "step": 2980 }, { "epoch": 96.064, "grad_norm": 2.409407377243042, "learning_rate": 7.024042894039434e-06, "loss": 0.7315, "mean_token_accuracy": 0.7905531976674054, "num_tokens": 611478.0, "step": 2990 }, { "epoch": 96.384, "grad_norm": 3.4677658081054688, "learning_rate": 6.963876538483305e-06, "loss": 0.7926, "mean_token_accuracy": 0.7856792386621236, "num_tokens": 626726.0, "step": 3000 }, { "epoch": 96.704, "grad_norm": 2.2152152061462402, "learning_rate": 6.9038310085513716e-06, "loss": 0.8723, "mean_token_accuracy": 0.781861812621355, "num_tokens": 641499.0, "step": 3010 }, { "epoch": 97.0, "grad_norm": 2.4535887241363525, "learning_rate": 6.843908693816627e-06, "loss": 0.8416, "mean_token_accuracy": 0.8028259905608924, "num_tokens": 657795.0, "step": 3020 }, { "epoch": 97.32, "grad_norm": 1.937121033668518, "learning_rate": 6.784111978948596e-06, "loss": 0.746, "mean_token_accuracy": 0.7986438237130642, "num_tokens": 673802.0, "step": 3030 }, { "epoch": 97.64, "grad_norm": 1.616132140159607, "learning_rate": 6.724443243618421e-06, "loss": 0.8305, "mean_token_accuracy": 0.7848228823393584, "num_tokens": 690896.0, "step": 3040 }, { "epoch": 97.96, "grad_norm": 2.3996787071228027, "learning_rate": 6.664904862404175e-06, "loss": 0.8508, "mean_token_accuracy": 0.7884074129164219, "num_tokens": 705680.0, "step": 3050 }, { "epoch": 98.256, "grad_norm": 3.018188714981079, "learning_rate": 6.605499204696351e-06, "loss": 0.8035, "mean_token_accuracy": 0.801042732354757, "num_tokens": 720238.0, "step": 3060 }, { "epoch": 98.576, "grad_norm": 2.550436496734619, "learning_rate": 6.546228634603578e-06, "loss": 0.7711, "mean_token_accuracy": 0.798908605799079, "num_tokens": 735457.0, "step": 3070 }, { "epoch": 98.896, "grad_norm": 3.060084819793701, "learning_rate": 6.487095510858543e-06, "loss": 0.9337, "mean_token_accuracy": 0.7785589572042226, "num_tokens": 752742.0, "step": 3080 }, { "epoch": 99.192, "grad_norm": 2.1915123462677, "learning_rate": 6.428102186724101e-06, "loss": 0.9185, "mean_token_accuracy": 0.7807568505003646, "num_tokens": 765549.0, "step": 3090 }, { "epoch": 99.512, "grad_norm": 2.3755106925964355, "learning_rate": 6.369251009899644e-06, "loss": 0.7954, "mean_token_accuracy": 0.788112024590373, "num_tokens": 782597.0, "step": 3100 }, { "epoch": 99.832, "grad_norm": 1.9347033500671387, "learning_rate": 6.310544322427674e-06, "loss": 0.8488, "mean_token_accuracy": 0.8023913279175758, "num_tokens": 799203.0, "step": 3110 }, { "epoch": 100.128, "grad_norm": 2.046133279800415, "learning_rate": 6.251984460600588e-06, "loss": 0.7156, "mean_token_accuracy": 0.7995543536302205, "num_tokens": 813931.0, "step": 3120 }, { "epoch": 100.448, "grad_norm": 2.557436943054199, "learning_rate": 6.193573754867708e-06, "loss": 0.7914, "mean_token_accuracy": 0.8036689855158329, "num_tokens": 830433.0, "step": 3130 }, { "epoch": 100.768, "grad_norm": 2.666550636291504, "learning_rate": 6.135314529742529e-06, "loss": 0.8148, "mean_token_accuracy": 0.79065520465374, "num_tokens": 846129.0, "step": 3140 }, { "epoch": 101.064, "grad_norm": 2.4647037982940674, "learning_rate": 6.077209103710232e-06, "loss": 0.8138, "mean_token_accuracy": 0.7805173554130502, "num_tokens": 860395.0, "step": 3150 }, { "epoch": 101.384, "grad_norm": 1.9933632612228394, "learning_rate": 6.019259789135404e-06, "loss": 0.7916, "mean_token_accuracy": 0.7982403136789799, "num_tokens": 878034.0, "step": 3160 }, { "epoch": 101.704, "grad_norm": 2.3307456970214844, "learning_rate": 5.961468892170016e-06, "loss": 0.7931, "mean_token_accuracy": 0.7907839316874743, "num_tokens": 892819.0, "step": 3170 }, { "epoch": 102.0, "grad_norm": 4.047188758850098, "learning_rate": 5.903838712661647e-06, "loss": 0.7685, "mean_token_accuracy": 0.7972375758596368, "num_tokens": 908260.0, "step": 3180 }, { "epoch": 102.32, "grad_norm": 1.9516690969467163, "learning_rate": 5.846371544061962e-06, "loss": 0.7943, "mean_token_accuracy": 0.7980046071112156, "num_tokens": 924521.0, "step": 3190 }, { "epoch": 102.64, "grad_norm": 2.3500168323516846, "learning_rate": 5.789069673335446e-06, "loss": 0.7704, "mean_token_accuracy": 0.8008730575442314, "num_tokens": 940805.0, "step": 3200 }, { "epoch": 102.96, "grad_norm": 1.9596396684646606, "learning_rate": 5.731935380868381e-06, "loss": 0.816, "mean_token_accuracy": 0.7914111088961363, "num_tokens": 957150.0, "step": 3210 }, { "epoch": 103.256, "grad_norm": 2.2512779235839844, "learning_rate": 5.674970940378102e-06, "loss": 0.7422, "mean_token_accuracy": 0.80112284542741, "num_tokens": 970896.0, "step": 3220 }, { "epoch": 103.576, "grad_norm": 2.6935369968414307, "learning_rate": 5.618178618822512e-06, "loss": 0.856, "mean_token_accuracy": 0.7918466597795486, "num_tokens": 986051.0, "step": 3230 }, { "epoch": 103.896, "grad_norm": 2.1991372108459473, "learning_rate": 5.561560676309874e-06, "loss": 0.7392, "mean_token_accuracy": 0.7981615476310253, "num_tokens": 1001657.0, "step": 3240 }, { "epoch": 104.192, "grad_norm": 2.4802706241607666, "learning_rate": 5.505119366008847e-06, "loss": 0.8709, "mean_token_accuracy": 0.7797639261226397, "num_tokens": 1018539.0, "step": 3250 }, { "epoch": 104.512, "grad_norm": 2.416335105895996, "learning_rate": 5.448856934058837e-06, "loss": 0.7811, "mean_token_accuracy": 0.802381145209074, "num_tokens": 1035770.0, "step": 3260 }, { "epoch": 104.832, "grad_norm": 1.9266993999481201, "learning_rate": 5.392775619480606e-06, "loss": 0.7597, "mean_token_accuracy": 0.801979061216116, "num_tokens": 1050287.0, "step": 3270 }, { "epoch": 105.128, "grad_norm": 3.1635992527008057, "learning_rate": 5.336877654087161e-06, "loss": 0.8394, "mean_token_accuracy": 0.7894677262048464, "num_tokens": 1063888.0, "step": 3280 }, { "epoch": 105.448, "grad_norm": 2.317321300506592, "learning_rate": 5.281165262394938e-06, "loss": 0.8313, "mean_token_accuracy": 0.7858642000705004, "num_tokens": 1080743.0, "step": 3290 }, { "epoch": 105.768, "grad_norm": 1.9168007373809814, "learning_rate": 5.2256406615353015e-06, "loss": 0.8397, "mean_token_accuracy": 0.7893419295549393, "num_tokens": 1097525.0, "step": 3300 }, { "epoch": 106.064, "grad_norm": 1.7733817100524902, "learning_rate": 5.170306061166254e-06, "loss": 0.6765, "mean_token_accuracy": 0.8171853680868406, "num_tokens": 1112336.0, "step": 3310 }, { "epoch": 106.384, "grad_norm": 2.349670648574829, "learning_rate": 5.115163663384563e-06, "loss": 0.7588, "mean_token_accuracy": 0.789124884083867, "num_tokens": 1126428.0, "step": 3320 }, { "epoch": 106.704, "grad_norm": 1.7135353088378906, "learning_rate": 5.060215662638084e-06, "loss": 0.7926, "mean_token_accuracy": 0.7968744553625584, "num_tokens": 1142993.0, "step": 3330 }, { "epoch": 107.0, "grad_norm": 6.969696044921875, "learning_rate": 5.005464245638447e-06, "loss": 0.8879, "mean_token_accuracy": 0.790745651399767, "num_tokens": 1158725.0, "step": 3340 }, { "epoch": 107.32, "grad_norm": 2.188507080078125, "learning_rate": 4.9509115912740445e-06, "loss": 0.7252, "mean_token_accuracy": 0.8085566960275173, "num_tokens": 1174330.0, "step": 3350 }, { "epoch": 107.64, "grad_norm": 2.8108561038970947, "learning_rate": 4.896559870523279e-06, "loss": 0.887, "mean_token_accuracy": 0.7786224085837603, "num_tokens": 1188830.0, "step": 3360 }, { "epoch": 107.96, "grad_norm": 2.438131093978882, "learning_rate": 4.842411246368226e-06, "loss": 0.795, "mean_token_accuracy": 0.8030483074486255, "num_tokens": 1207364.0, "step": 3370 }, { "epoch": 108.256, "grad_norm": 2.4335777759552, "learning_rate": 4.788467873708508e-06, "loss": 0.8032, "mean_token_accuracy": 0.7993817514664417, "num_tokens": 1223655.0, "step": 3380 }, { "epoch": 108.576, "grad_norm": 2.748537540435791, "learning_rate": 4.734731899275557e-06, "loss": 0.8288, "mean_token_accuracy": 0.7918653458356857, "num_tokens": 1238999.0, "step": 3390 }, { "epoch": 108.896, "grad_norm": 2.6951160430908203, "learning_rate": 4.681205461547187e-06, "loss": 0.7515, "mean_token_accuracy": 0.8007099393755197, "num_tokens": 1253439.0, "step": 3400 }, { "epoch": 109.992, "grad_norm": 2.926764726638794, "learning_rate": 4.62789069066248e-06, "loss": 0.8692, "mean_token_accuracy": 0.7860011033713817, "num_tokens": 16823.0, "step": 3410 }, { "epoch": 110.32, "grad_norm": 2.0920846462249756, "learning_rate": 4.574789708337018e-06, "loss": 0.9181, "mean_token_accuracy": 0.7867580187029954, "num_tokens": 31381.0, "step": 3420 }, { "epoch": 110.64, "grad_norm": 2.0056655406951904, "learning_rate": 4.521904627778463e-06, "loss": 0.7652, "mean_token_accuracy": 0.801980373263359, "num_tokens": 48922.0, "step": 3430 }, { "epoch": 110.96, "grad_norm": 2.2784852981567383, "learning_rate": 4.469237553602433e-06, "loss": 0.7703, "mean_token_accuracy": 0.7932860311120749, "num_tokens": 65112.0, "step": 3440 }, { "epoch": 111.256, "grad_norm": 2.1628239154815674, "learning_rate": 4.416790581748766e-06, "loss": 0.7087, "mean_token_accuracy": 0.8054195183354456, "num_tokens": 80534.0, "step": 3450 }, { "epoch": 111.576, "grad_norm": 2.2064616680145264, "learning_rate": 4.364565799398102e-06, "loss": 0.7648, "mean_token_accuracy": 0.7998192355036735, "num_tokens": 97437.0, "step": 3460 }, { "epoch": 111.896, "grad_norm": 2.387873888015747, "learning_rate": 4.312565284888819e-06, "loss": 0.81, "mean_token_accuracy": 0.7902990553528071, "num_tokens": 111663.0, "step": 3470 }, { "epoch": 112.192, "grad_norm": 2.14197039604187, "learning_rate": 4.2607911076343455e-06, "loss": 0.7631, "mean_token_accuracy": 0.7986521447027052, "num_tokens": 127399.0, "step": 3480 }, { "epoch": 112.512, "grad_norm": 2.459407091140747, "learning_rate": 4.2092453280407605e-06, "loss": 0.7647, "mean_token_accuracy": 0.7940516691654921, "num_tokens": 143237.0, "step": 3490 }, { "epoch": 112.832, "grad_norm": 2.606563091278076, "learning_rate": 4.157929997424853e-06, "loss": 0.7958, "mean_token_accuracy": 0.8024938710033893, "num_tokens": 159484.0, "step": 3500 }, { "epoch": 113.128, "grad_norm": 2.4822981357574463, "learning_rate": 4.106847157932445e-06, "loss": 0.7599, "mean_token_accuracy": 0.8050464319216238, "num_tokens": 172111.0, "step": 3510 }, { "epoch": 113.448, "grad_norm": 2.0524537563323975, "learning_rate": 4.0559988424571365e-06, "loss": 0.7676, "mean_token_accuracy": 0.797309584543109, "num_tokens": 189542.0, "step": 3520 }, { "epoch": 113.768, "grad_norm": 2.3543918132781982, "learning_rate": 4.005387074559421e-06, "loss": 0.8929, "mean_token_accuracy": 0.783655048161745, "num_tokens": 207117.0, "step": 3530 }, { "epoch": 114.064, "grad_norm": 2.1995062828063965, "learning_rate": 3.9550138683861184e-06, "loss": 0.7502, "mean_token_accuracy": 0.8102532826565407, "num_tokens": 220475.0, "step": 3540 }, { "epoch": 114.384, "grad_norm": 2.4602413177490234, "learning_rate": 3.904881228590253e-06, "loss": 0.7807, "mean_token_accuracy": 0.7911395899951458, "num_tokens": 235641.0, "step": 3550 }, { "epoch": 114.704, "grad_norm": 1.7863750457763672, "learning_rate": 3.854991150251271e-06, "loss": 0.8174, "mean_token_accuracy": 0.7878696542233229, "num_tokens": 252557.0, "step": 3560 }, { "epoch": 115.0, "grad_norm": 2.476003885269165, "learning_rate": 3.8053456187956315e-06, "loss": 0.8084, "mean_token_accuracy": 0.7975163347012287, "num_tokens": 267658.0, "step": 3570 }, { "epoch": 115.32, "grad_norm": 2.536450147628784, "learning_rate": 3.7559466099178e-06, "loss": 0.8019, "mean_token_accuracy": 0.7918349288403987, "num_tokens": 283710.0, "step": 3580 }, { "epoch": 115.64, "grad_norm": 1.9077178239822388, "learning_rate": 3.7067960895016277e-06, "loss": 0.8318, "mean_token_accuracy": 0.7928959406912327, "num_tokens": 299210.0, "step": 3590 }, { "epoch": 115.96, "grad_norm": 1.9973210096359253, "learning_rate": 3.6578960135421117e-06, "loss": 0.7243, "mean_token_accuracy": 0.8063468877226114, "num_tokens": 315732.0, "step": 3600 }, { "epoch": 116.256, "grad_norm": 2.727466344833374, "learning_rate": 3.6092483280675683e-06, "loss": 0.7522, "mean_token_accuracy": 0.814963810347222, "num_tokens": 329521.0, "step": 3610 }, { "epoch": 116.576, "grad_norm": 5.467685222625732, "learning_rate": 3.5608549690621562e-06, "loss": 0.8147, "mean_token_accuracy": 0.7913577631115913, "num_tokens": 346404.0, "step": 3620 }, { "epoch": 116.896, "grad_norm": 2.106499433517456, "learning_rate": 3.512717862388876e-06, "loss": 0.8025, "mean_token_accuracy": 0.7868743006139993, "num_tokens": 362461.0, "step": 3630 }, { "epoch": 117.192, "grad_norm": 1.8872570991516113, "learning_rate": 3.464838923712891e-06, "loss": 0.8173, "mean_token_accuracy": 0.7879228088501338, "num_tokens": 378441.0, "step": 3640 }, { "epoch": 117.512, "grad_norm": 2.7338979244232178, "learning_rate": 3.4172200584253077e-06, "loss": 0.7152, "mean_token_accuracy": 0.8083719074726105, "num_tokens": 392976.0, "step": 3650 }, { "epoch": 117.832, "grad_norm": 2.1551365852355957, "learning_rate": 3.369863161567363e-06, "loss": 0.8131, "mean_token_accuracy": 0.7942818276584148, "num_tokens": 409377.0, "step": 3660 }, { "epoch": 118.128, "grad_norm": 2.206017255783081, "learning_rate": 3.322770117754963e-06, "loss": 0.7659, "mean_token_accuracy": 0.8010066038853413, "num_tokens": 423629.0, "step": 3670 }, { "epoch": 118.448, "grad_norm": 2.915071964263916, "learning_rate": 3.2759428011037454e-06, "loss": 0.7366, "mean_token_accuracy": 0.7961924949660897, "num_tokens": 438934.0, "step": 3680 }, { "epoch": 118.768, "grad_norm": 2.040217876434326, "learning_rate": 3.229383075154445e-06, "loss": 0.7838, "mean_token_accuracy": 0.8073360413312912, "num_tokens": 456515.0, "step": 3690 }, { "epoch": 119.064, "grad_norm": 2.2024896144866943, "learning_rate": 3.18309279279876e-06, "loss": 0.8328, "mean_token_accuracy": 0.7914658515034495, "num_tokens": 471476.0, "step": 3700 }, { "epoch": 119.384, "grad_norm": 2.4495859146118164, "learning_rate": 3.137073796205601e-06, "loss": 0.8175, "mean_token_accuracy": 0.8003936596214771, "num_tokens": 486665.0, "step": 3710 }, { "epoch": 119.704, "grad_norm": 2.1028919219970703, "learning_rate": 3.0913279167477916e-06, "loss": 0.8224, "mean_token_accuracy": 0.8022113911807537, "num_tokens": 504053.0, "step": 3720 }, { "epoch": 120.0, "grad_norm": 4.113481521606445, "learning_rate": 3.0458569749291743e-06, "loss": 0.7272, "mean_token_accuracy": 0.8031640326654589, "num_tokens": 518123.0, "step": 3730 }, { "epoch": 120.32, "grad_norm": 2.537456750869751, "learning_rate": 3.000662780312178e-06, "loss": 0.7771, "mean_token_accuracy": 0.7974813230335712, "num_tokens": 534453.0, "step": 3740 }, { "epoch": 120.64, "grad_norm": 2.442348003387451, "learning_rate": 2.9557471314457866e-06, "loss": 0.7646, "mean_token_accuracy": 0.8017565876245498, "num_tokens": 549839.0, "step": 3750 }, { "epoch": 120.96, "grad_norm": 1.9900853633880615, "learning_rate": 2.9111118157939745e-06, "loss": 0.8154, "mean_token_accuracy": 0.8000222038477659, "num_tokens": 566121.0, "step": 3760 }, { "epoch": 121.256, "grad_norm": 2.138803005218506, "learning_rate": 2.866758609664572e-06, "loss": 0.7823, "mean_token_accuracy": 0.8003743641279839, "num_tokens": 581873.0, "step": 3770 }, { "epoch": 121.576, "grad_norm": 2.2369134426116943, "learning_rate": 2.8226892781385673e-06, "loss": 0.8084, "mean_token_accuracy": 0.7941499546170234, "num_tokens": 597192.0, "step": 3780 }, { "epoch": 121.896, "grad_norm": 2.110830783843994, "learning_rate": 2.7789055749998863e-06, "loss": 0.8422, "mean_token_accuracy": 0.7905905708670616, "num_tokens": 612868.0, "step": 3790 }, { "epoch": 122.192, "grad_norm": 1.9145809412002563, "learning_rate": 2.7354092426655565e-06, "loss": 0.737, "mean_token_accuracy": 0.8060021779021701, "num_tokens": 628187.0, "step": 3800 }, { "epoch": 122.512, "grad_norm": 2.281447649002075, "learning_rate": 2.6922020121164182e-06, "loss": 0.7426, "mean_token_accuracy": 0.8067140795290471, "num_tokens": 645037.0, "step": 3810 }, { "epoch": 122.832, "grad_norm": 2.424180030822754, "learning_rate": 2.6492856028281956e-06, "loss": 0.7792, "mean_token_accuracy": 0.8041324406862259, "num_tokens": 660729.0, "step": 3820 }, { "epoch": 123.128, "grad_norm": 2.1080105304718018, "learning_rate": 2.606661722703084e-06, "loss": 0.8608, "mean_token_accuracy": 0.7844757274598688, "num_tokens": 675782.0, "step": 3830 }, { "epoch": 123.448, "grad_norm": 1.995034098625183, "learning_rate": 2.5643320680018012e-06, "loss": 0.7897, "mean_token_accuracy": 0.8024365931749344, "num_tokens": 693109.0, "step": 3840 }, { "epoch": 123.768, "grad_norm": 2.0973665714263916, "learning_rate": 2.522298323276039e-06, "loss": 0.7599, "mean_token_accuracy": 0.7944286055862904, "num_tokens": 707242.0, "step": 3850 }, { "epoch": 124.064, "grad_norm": 2.0112831592559814, "learning_rate": 2.480562161301464e-06, "loss": 0.7615, "mean_token_accuracy": 0.8035188607267432, "num_tokens": 721297.0, "step": 3860 }, { "epoch": 124.384, "grad_norm": 2.4737842082977295, "learning_rate": 2.4391252430111388e-06, "loss": 0.8339, "mean_token_accuracy": 0.7951025106012821, "num_tokens": 738725.0, "step": 3870 }, { "epoch": 124.704, "grad_norm": 2.4330711364746094, "learning_rate": 2.3979892174294105e-06, "loss": 0.8477, "mean_token_accuracy": 0.7900912150740623, "num_tokens": 754563.0, "step": 3880 }, { "epoch": 125.0, "grad_norm": 5.609644412994385, "learning_rate": 2.3571557216062967e-06, "loss": 0.659, "mean_token_accuracy": 0.8116728329175228, "num_tokens": 768588.0, "step": 3890 }, { "epoch": 125.32, "grad_norm": 2.646641254425049, "learning_rate": 2.316626380552337e-06, "loss": 0.7614, "mean_token_accuracy": 0.8065689295530319, "num_tokens": 785761.0, "step": 3900 }, { "epoch": 125.64, "grad_norm": 2.1917035579681396, "learning_rate": 2.2764028071739162e-06, "loss": 0.7887, "mean_token_accuracy": 0.7956257075071335, "num_tokens": 800456.0, "step": 3910 }, { "epoch": 125.96, "grad_norm": 2.0703041553497314, "learning_rate": 2.236486602209097e-06, "loss": 0.7823, "mean_token_accuracy": 0.7975949931889772, "num_tokens": 817034.0, "step": 3920 }, { "epoch": 126.256, "grad_norm": 3.7488138675689697, "learning_rate": 2.1968793541638877e-06, "loss": 0.7804, "mean_token_accuracy": 0.7973396097486084, "num_tokens": 831010.0, "step": 3930 }, { "epoch": 126.576, "grad_norm": 1.8520132303237915, "learning_rate": 2.1575826392490507e-06, "loss": 0.8349, "mean_token_accuracy": 0.7970767199993134, "num_tokens": 847034.0, "step": 3940 }, { "epoch": 126.896, "grad_norm": 2.562690258026123, "learning_rate": 2.118598021317362e-06, "loss": 0.7205, "mean_token_accuracy": 0.8056481756269932, "num_tokens": 862562.0, "step": 3950 }, { "epoch": 127.192, "grad_norm": 2.3542673587799072, "learning_rate": 2.07992705180138e-06, "loss": 0.7441, "mean_token_accuracy": 0.7991788826278738, "num_tokens": 876729.0, "step": 3960 }, { "epoch": 127.512, "grad_norm": 2.78625750541687, "learning_rate": 2.0415712696517155e-06, "loss": 0.8173, "mean_token_accuracy": 0.7964733302593231, "num_tokens": 895143.0, "step": 3970 }, { "epoch": 127.832, "grad_norm": 2.283376455307007, "learning_rate": 2.00353220127576e-06, "loss": 0.7863, "mean_token_accuracy": 0.8018839418888092, "num_tokens": 911917.0, "step": 3980 }, { "epoch": 128.128, "grad_norm": 2.6434993743896484, "learning_rate": 1.965811360476967e-06, "loss": 0.784, "mean_token_accuracy": 0.7953068952302675, "num_tokens": 924458.0, "step": 3990 }, { "epoch": 128.448, "grad_norm": 2.869091033935547, "learning_rate": 1.9284102483946042e-06, "loss": 0.7912, "mean_token_accuracy": 0.7926684629172087, "num_tokens": 939111.0, "step": 4000 }, { "epoch": 128.768, "grad_norm": 1.8905816078186035, "learning_rate": 1.8913303534440019e-06, "loss": 0.7592, "mean_token_accuracy": 0.8064320608973503, "num_tokens": 956948.0, "step": 4010 }, { "epoch": 129.064, "grad_norm": 2.6615140438079834, "learning_rate": 1.8545731512573317e-06, "loss": 0.725, "mean_token_accuracy": 0.8079512683120934, "num_tokens": 973125.0, "step": 4020 }, { "epoch": 129.384, "grad_norm": 1.934066653251648, "learning_rate": 1.8181401046248748e-06, "loss": 0.8115, "mean_token_accuracy": 0.7953744523227215, "num_tokens": 988824.0, "step": 4030 }, { "epoch": 129.704, "grad_norm": 2.660801410675049, "learning_rate": 1.7820326634368124e-06, "loss": 0.7548, "mean_token_accuracy": 0.8010114066302776, "num_tokens": 1003868.0, "step": 4040 }, { "epoch": 130.0, "grad_norm": 5.5373663902282715, "learning_rate": 1.7462522646255319e-06, "loss": 0.8302, "mean_token_accuracy": 0.7884616557810757, "num_tokens": 1019053.0, "step": 4050 }, { "epoch": 130.32, "grad_norm": 2.5071728229522705, "learning_rate": 1.7108003321084299e-06, "loss": 0.7475, "mean_token_accuracy": 0.80089957639575, "num_tokens": 1035776.0, "step": 4060 }, { "epoch": 130.64, "grad_norm": 3.022608518600464, "learning_rate": 1.675678276731253e-06, "loss": 0.7242, "mean_token_accuracy": 0.7966047372668982, "num_tokens": 1049970.0, "step": 4070 }, { "epoch": 130.96, "grad_norm": 2.2591309547424316, "learning_rate": 1.6408874962119526e-06, "loss": 0.8193, "mean_token_accuracy": 0.7990051701664924, "num_tokens": 1066724.0, "step": 4080 }, { "epoch": 131.256, "grad_norm": 1.9211347103118896, "learning_rate": 1.606429375085058e-06, "loss": 0.8557, "mean_token_accuracy": 0.7852608090316927, "num_tokens": 1081511.0, "step": 4090 }, { "epoch": 131.576, "grad_norm": 2.843675136566162, "learning_rate": 1.572305284646587e-06, "loss": 0.744, "mean_token_accuracy": 0.8146931059658528, "num_tokens": 1097216.0, "step": 4100 }, { "epoch": 131.896, "grad_norm": 1.981909990310669, "learning_rate": 1.538516582899453e-06, "loss": 0.7557, "mean_token_accuracy": 0.80064931884408, "num_tokens": 1114422.0, "step": 4110 }, { "epoch": 132.192, "grad_norm": 2.5102710723876953, "learning_rate": 1.505064614499443e-06, "loss": 0.9181, "mean_token_accuracy": 0.7895874824072864, "num_tokens": 1130322.0, "step": 4120 }, { "epoch": 132.512, "grad_norm": 2.168246269226074, "learning_rate": 1.4719507107017005e-06, "loss": 0.7341, "mean_token_accuracy": 0.8004750736057759, "num_tokens": 1145434.0, "step": 4130 }, { "epoch": 132.832, "grad_norm": 2.0837020874023438, "learning_rate": 1.439176189307735e-06, "loss": 0.7871, "mean_token_accuracy": 0.8010189373046159, "num_tokens": 1161082.0, "step": 4140 }, { "epoch": 133.128, "grad_norm": 2.729126453399658, "learning_rate": 1.406742354613e-06, "loss": 0.7355, "mean_token_accuracy": 0.8093010356297364, "num_tokens": 1176663.0, "step": 4150 }, { "epoch": 133.448, "grad_norm": 2.5754497051239014, "learning_rate": 1.3746504973549613e-06, "loss": 0.7523, "mean_token_accuracy": 0.8098891712725163, "num_tokens": 1192788.0, "step": 4160 }, { "epoch": 133.768, "grad_norm": 3.8127379417419434, "learning_rate": 1.34290189466175e-06, "loss": 0.7298, "mean_token_accuracy": 0.7972260743379593, "num_tokens": 1207918.0, "step": 4170 }, { "epoch": 134.064, "grad_norm": 2.3844289779663086, "learning_rate": 1.3114978100013376e-06, "loss": 0.8014, "mean_token_accuracy": 0.7910445159351503, "num_tokens": 1222595.0, "step": 4180 }, { "epoch": 134.384, "grad_norm": 2.5769917964935303, "learning_rate": 1.2804394931312446e-06, "loss": 0.6971, "mean_token_accuracy": 0.8134579740464687, "num_tokens": 1238530.0, "step": 4190 }, { "epoch": 134.704, "grad_norm": 2.293607234954834, "learning_rate": 1.2497281800488092e-06, "loss": 0.8715, "mean_token_accuracy": 0.7885241828858852, "num_tokens": 1255860.0, "step": 4200 }, { "epoch": 135.0, "grad_norm": 9.035529136657715, "learning_rate": 1.219365092942003e-06, "loss": 0.7438, "mean_token_accuracy": 0.7974654679765573, "num_tokens": 1269518.0, "step": 4210 }, { "epoch": 135.32, "grad_norm": 2.2957992553710938, "learning_rate": 1.189351440140788e-06, "loss": 0.8218, "mean_token_accuracy": 0.7884160943329335, "num_tokens": 1285778.0, "step": 4220 }, { "epoch": 135.64, "grad_norm": 2.6251449584960938, "learning_rate": 1.159688416069038e-06, "loss": 0.7602, "mean_token_accuracy": 0.8091854326426983, "num_tokens": 1301508.0, "step": 4230 }, { "epoch": 135.96, "grad_norm": 2.2512965202331543, "learning_rate": 1.1303772011969928e-06, "loss": 0.7572, "mean_token_accuracy": 0.8062782268971205, "num_tokens": 1317471.0, "step": 4240 }, { "epoch": 136.256, "grad_norm": 2.4756977558135986, "learning_rate": 1.1014189619942905e-06, "loss": 0.7439, "mean_token_accuracy": 0.8019913362490164, "num_tokens": 1333505.0, "step": 4250 }, { "epoch": 136.576, "grad_norm": 3.272853374481201, "learning_rate": 1.0728148508835424e-06, "loss": 0.8706, "mean_token_accuracy": 0.7980553403496742, "num_tokens": 1349049.0, "step": 4260 }, { "epoch": 136.896, "grad_norm": 2.3439319133758545, "learning_rate": 1.0445660061944684e-06, "loss": 0.7531, "mean_token_accuracy": 0.7939603064209223, "num_tokens": 1365036.0, "step": 4270 }, { "epoch": 137.192, "grad_norm": 2.6307787895202637, "learning_rate": 1.01667355211861e-06, "loss": 0.7414, "mean_token_accuracy": 0.8034155070781708, "num_tokens": 1379248.0, "step": 4280 }, { "epoch": 137.512, "grad_norm": 2.3606624603271484, "learning_rate": 9.891385986645675e-07, "loss": 0.6851, "mean_token_accuracy": 0.8086177695542573, "num_tokens": 1394888.0, "step": 4290 }, { "epoch": 137.832, "grad_norm": 2.0615415573120117, "learning_rate": 9.619622416138475e-07, "loss": 0.8681, "mean_token_accuracy": 0.7964320003986358, "num_tokens": 1410073.0, "step": 4300 }, { "epoch": 138.128, "grad_norm": 2.324324131011963, "learning_rate": 9.351455624772487e-07, "loss": 0.7584, "mean_token_accuracy": 0.8120040643859554, "num_tokens": 1427006.0, "step": 4310 }, { "epoch": 138.448, "grad_norm": 2.1296613216400146, "learning_rate": 9.086896284518198e-07, "loss": 0.7939, "mean_token_accuracy": 0.802574060857296, "num_tokens": 1442836.0, "step": 4320 }, { "epoch": 138.768, "grad_norm": 2.3658032417297363, "learning_rate": 8.825954923783875e-07, "loss": 0.8059, "mean_token_accuracy": 0.7980705320835113, "num_tokens": 1457625.0, "step": 4330 }, { "epoch": 139.064, "grad_norm": 2.4179608821868896, "learning_rate": 8.568641926996646e-07, "loss": 0.8155, "mean_token_accuracy": 0.7932797682446402, "num_tokens": 1473259.0, "step": 4340 }, { "epoch": 139.384, "grad_norm": 2.3579256534576416, "learning_rate": 8.314967534189166e-07, "loss": 0.8503, "mean_token_accuracy": 0.7952963810414075, "num_tokens": 1490309.0, "step": 4350 }, { "epoch": 139.704, "grad_norm": 2.3982760906219482, "learning_rate": 8.064941840592178e-07, "loss": 0.6937, "mean_token_accuracy": 0.8161114897578955, "num_tokens": 1505580.0, "step": 4360 }, { "epoch": 140.0, "grad_norm": 3.710239887237549, "learning_rate": 7.818574796232714e-07, "loss": 0.776, "mean_token_accuracy": 0.789946156579095, "num_tokens": 1519983.0, "step": 4370 }, { "epoch": 140.32, "grad_norm": 2.348114013671875, "learning_rate": 7.575876205538113e-07, "loss": 0.8371, "mean_token_accuracy": 0.7868118450045586, "num_tokens": 1535154.0, "step": 4380 }, { "epoch": 140.64, "grad_norm": 1.7529124021530151, "learning_rate": 7.336855726945891e-07, "loss": 0.8106, "mean_token_accuracy": 0.7890769924968482, "num_tokens": 1552288.0, "step": 4390 }, { "epoch": 140.96, "grad_norm": 2.1464691162109375, "learning_rate": 7.101522872519306e-07, "loss": 0.7481, "mean_token_accuracy": 0.8147139415144921, "num_tokens": 1567833.0, "step": 4400 }, { "epoch": 8.304941176470589, "grad_norm": 3.626028537750244, "learning_rate": 1.6054562751771983e-05, "loss": 1.8624, "mean_token_accuracy": 0.42639462910592557, "num_tokens": 12548.0, "step": 4410 }, { "epoch": 8.323764705882352, "grad_norm": 2.1800546646118164, "learning_rate": 1.6037858352792722e-05, "loss": 1.5835, "mean_token_accuracy": 0.4775951974093914, "num_tokens": 25755.0, "step": 4420 }, { "epoch": 8.342588235294118, "grad_norm": 1.870821475982666, "learning_rate": 1.602112739804461e-05, "loss": 1.4872, "mean_token_accuracy": 0.48393381759524345, "num_tokens": 38667.0, "step": 4430 }, { "epoch": 8.361411764705883, "grad_norm": 1.9865084886550903, "learning_rate": 1.6004369961113897e-05, "loss": 1.4383, "mean_token_accuracy": 0.4954090975224972, "num_tokens": 51649.0, "step": 4440 }, { "epoch": 8.380235294117647, "grad_norm": 1.759982943534851, "learning_rate": 1.5987586115703306e-05, "loss": 1.4358, "mean_token_accuracy": 0.5117561783641577, "num_tokens": 66035.0, "step": 4450 }, { "epoch": 8.399058823529412, "grad_norm": 1.9075089693069458, "learning_rate": 1.5970775935631717e-05, "loss": 1.3555, "mean_token_accuracy": 0.5182104598730802, "num_tokens": 79576.0, "step": 4460 }, { "epoch": 8.417882352941177, "grad_norm": 1.3230409622192383, "learning_rate": 1.5953939494833832e-05, "loss": 1.3668, "mean_token_accuracy": 0.519798369705677, "num_tokens": 92225.0, "step": 4470 }, { "epoch": 8.43670588235294, "grad_norm": 2.239945650100708, "learning_rate": 1.5937076867359852e-05, "loss": 1.3048, "mean_token_accuracy": 0.5348641883581877, "num_tokens": 105922.0, "step": 4480 }, { "epoch": 8.455529411764706, "grad_norm": 1.7611688375473022, "learning_rate": 1.5920188127375152e-05, "loss": 1.3466, "mean_token_accuracy": 0.521543862298131, "num_tokens": 119527.0, "step": 4490 }, { "epoch": 8.47435294117647, "grad_norm": 1.486075520515442, "learning_rate": 1.5903273349159958e-05, "loss": 1.3115, "mean_token_accuracy": 0.5352868799120187, "num_tokens": 133451.0, "step": 4500 }, { "epoch": 8.493176470588235, "grad_norm": 1.6955538988113403, "learning_rate": 1.5886332607109017e-05, "loss": 1.3427, "mean_token_accuracy": 0.5247942265123129, "num_tokens": 147565.0, "step": 4510 }, { "epoch": 8.512, "grad_norm": 1.5570602416992188, "learning_rate": 1.5869365975731267e-05, "loss": 1.2547, "mean_token_accuracy": 0.5451988846063613, "num_tokens": 160377.0, "step": 4520 }, { "epoch": 8.530823529411764, "grad_norm": 1.4915376901626587, "learning_rate": 1.585237352964952e-05, "loss": 1.358, "mean_token_accuracy": 0.526292197033763, "num_tokens": 174242.0, "step": 4530 }, { "epoch": 8.54964705882353, "grad_norm": 1.603037714958191, "learning_rate": 1.583535534360012e-05, "loss": 1.2699, "mean_token_accuracy": 0.5363341204822063, "num_tokens": 187399.0, "step": 4540 }, { "epoch": 8.568470588235295, "grad_norm": 2.1536943912506104, "learning_rate": 1.581831149243262e-05, "loss": 1.2976, "mean_token_accuracy": 0.5274909067898989, "num_tokens": 200947.0, "step": 4550 }, { "epoch": 8.587294117647058, "grad_norm": 1.611542820930481, "learning_rate": 1.580124205110946e-05, "loss": 1.2672, "mean_token_accuracy": 0.5402051657438278, "num_tokens": 214010.0, "step": 4560 }, { "epoch": 8.606117647058824, "grad_norm": 1.5133346319198608, "learning_rate": 1.578414709470562e-05, "loss": 1.3097, "mean_token_accuracy": 0.5325882468372584, "num_tokens": 227829.0, "step": 4570 }, { "epoch": 8.624941176470589, "grad_norm": 1.4743294715881348, "learning_rate": 1.576702669840832e-05, "loss": 1.2504, "mean_token_accuracy": 0.5380570895969867, "num_tokens": 240838.0, "step": 4580 }, { "epoch": 8.643764705882353, "grad_norm": 1.4610170125961304, "learning_rate": 1.5749880937516647e-05, "loss": 1.2727, "mean_token_accuracy": 0.5317132595926524, "num_tokens": 253845.0, "step": 4590 }, { "epoch": 8.662588235294118, "grad_norm": 1.8520996570587158, "learning_rate": 1.573270988744126e-05, "loss": 1.2746, "mean_token_accuracy": 0.5325201127678156, "num_tokens": 266058.0, "step": 4600 }, { "epoch": 8.681411764705881, "grad_norm": 2.241778612136841, "learning_rate": 1.5715513623704052e-05, "loss": 1.2703, "mean_token_accuracy": 0.5311647448688745, "num_tokens": 279149.0, "step": 4610 }, { "epoch": 8.700235294117647, "grad_norm": 1.921618938446045, "learning_rate": 1.5698292221937787e-05, "loss": 1.2823, "mean_token_accuracy": 0.5341210236772895, "num_tokens": 293451.0, "step": 4620 }, { "epoch": 8.719058823529412, "grad_norm": 1.5892717838287354, "learning_rate": 1.5681045757885817e-05, "loss": 1.2531, "mean_token_accuracy": 0.5422347262501717, "num_tokens": 306138.0, "step": 4630 }, { "epoch": 8.737882352941176, "grad_norm": 1.6042686700820923, "learning_rate": 1.566377430740171e-05, "loss": 1.2764, "mean_token_accuracy": 0.5341788738965988, "num_tokens": 319399.0, "step": 4640 }, { "epoch": 8.756705882352941, "grad_norm": 2.2580060958862305, "learning_rate": 1.5646477946448927e-05, "loss": 1.2348, "mean_token_accuracy": 0.5448929745703935, "num_tokens": 332882.0, "step": 4650 }, { "epoch": 8.775529411764706, "grad_norm": 1.2103066444396973, "learning_rate": 1.5629156751100502e-05, "loss": 1.2542, "mean_token_accuracy": 0.545468046143651, "num_tokens": 345343.0, "step": 4660 }, { "epoch": 8.79435294117647, "grad_norm": 0.8362689018249512, "learning_rate": 1.561181079753868e-05, "loss": 1.2827, "mean_token_accuracy": 0.5429604861885309, "num_tokens": 358912.0, "step": 4670 }, { "epoch": 8.813176470588235, "grad_norm": 1.158046841621399, "learning_rate": 1.5594440162054615e-05, "loss": 1.2471, "mean_token_accuracy": 0.5337832469493151, "num_tokens": 372248.0, "step": 4680 }, { "epoch": 8.832, "grad_norm": 1.3598729372024536, "learning_rate": 1.557704492104801e-05, "loss": 1.3124, "mean_token_accuracy": 0.5272687204182148, "num_tokens": 386263.0, "step": 4690 }, { "epoch": 8.850823529411764, "grad_norm": 1.7355713844299316, "learning_rate": 1.5559625151026785e-05, "loss": 1.3023, "mean_token_accuracy": 0.524540626257658, "num_tokens": 399314.0, "step": 4700 }, { "epoch": 8.86964705882353, "grad_norm": 1.342244267463684, "learning_rate": 1.5542180928606747e-05, "loss": 1.2199, "mean_token_accuracy": 0.5468841027468443, "num_tokens": 413612.0, "step": 4710 }, { "epoch": 8.888470588235293, "grad_norm": 1.322409987449646, "learning_rate": 1.5524712330511246e-05, "loss": 1.2383, "mean_token_accuracy": 0.5588106140494347, "num_tokens": 427389.0, "step": 4720 }, { "epoch": 8.907294117647059, "grad_norm": 1.3516113758087158, "learning_rate": 1.5507219433570848e-05, "loss": 1.2482, "mean_token_accuracy": 0.5358951542526483, "num_tokens": 440751.0, "step": 4730 }, { "epoch": 8.926117647058824, "grad_norm": 1.5260019302368164, "learning_rate": 1.5489702314722986e-05, "loss": 1.2168, "mean_token_accuracy": 0.5595146797597408, "num_tokens": 453892.0, "step": 4740 }, { "epoch": 8.944941176470588, "grad_norm": 1.5382399559020996, "learning_rate": 1.547216105101162e-05, "loss": 1.2772, "mean_token_accuracy": 0.531356817483902, "num_tokens": 468069.0, "step": 4750 }, { "epoch": 8.963764705882353, "grad_norm": 1.362877368927002, "learning_rate": 1.5454595719586926e-05, "loss": 1.2325, "mean_token_accuracy": 0.5457029201090335, "num_tokens": 480208.0, "step": 4760 }, { "epoch": 8.982588235294118, "grad_norm": 1.0237706899642944, "learning_rate": 1.543700639770491e-05, "loss": 1.2282, "mean_token_accuracy": 0.542092502117157, "num_tokens": 493653.0, "step": 4770 }, { "epoch": 9.001882352941177, "grad_norm": 3.654766082763672, "learning_rate": 1.5419393162727105e-05, "loss": 1.3508, "mean_token_accuracy": 0.5442763832284183, "num_tokens": 507301.0, "step": 4780 }, { "epoch": 9.02070588235294, "grad_norm": 1.20900297164917, "learning_rate": 1.5401756092120215e-05, "loss": 1.2509, "mean_token_accuracy": 0.5424667615443468, "num_tokens": 520131.0, "step": 4790 }, { "epoch": 9.039529411764706, "grad_norm": 1.2988379001617432, "learning_rate": 1.5384095263455782e-05, "loss": 1.2669, "mean_token_accuracy": 0.5415636003017426, "num_tokens": 533609.0, "step": 4800 }, { "epoch": 9.058352941176471, "grad_norm": 2.150287628173828, "learning_rate": 1.5366410754409837e-05, "loss": 1.2693, "mean_token_accuracy": 0.5377780050039291, "num_tokens": 547255.0, "step": 4810 }, { "epoch": 9.077176470588235, "grad_norm": 1.0066241025924683, "learning_rate": 1.5348702642762563e-05, "loss": 1.2117, "mean_token_accuracy": 0.5584665209054946, "num_tokens": 560812.0, "step": 4820 }, { "epoch": 9.096, "grad_norm": 1.0327008962631226, "learning_rate": 1.5330971006397962e-05, "loss": 1.1753, "mean_token_accuracy": 0.5651697292923927, "num_tokens": 574553.0, "step": 4830 }, { "epoch": 9.114823529411765, "grad_norm": 1.200286865234375, "learning_rate": 1.5313215923303482e-05, "loss": 1.2833, "mean_token_accuracy": 0.5195233155041933, "num_tokens": 587992.0, "step": 4840 }, { "epoch": 9.133647058823529, "grad_norm": 0.9596078991889954, "learning_rate": 1.5295437471569714e-05, "loss": 1.2403, "mean_token_accuracy": 0.538974242284894, "num_tokens": 602053.0, "step": 4850 }, { "epoch": 9.152470588235294, "grad_norm": 1.0736156702041626, "learning_rate": 1.5277635729390022e-05, "loss": 1.2346, "mean_token_accuracy": 0.5428169660270215, "num_tokens": 616596.0, "step": 4860 }, { "epoch": 9.171294117647058, "grad_norm": 1.773108959197998, "learning_rate": 1.5259810775060202e-05, "loss": 1.2516, "mean_token_accuracy": 0.5292404495179653, "num_tokens": 629154.0, "step": 4870 }, { "epoch": 9.190117647058823, "grad_norm": 1.4007513523101807, "learning_rate": 1.524196268697815e-05, "loss": 1.1987, "mean_token_accuracy": 0.5568405143916607, "num_tokens": 641946.0, "step": 4880 }, { "epoch": 9.208941176470589, "grad_norm": 1.3242895603179932, "learning_rate": 1.5224091543643504e-05, "loss": 1.2781, "mean_token_accuracy": 0.5264579340815544, "num_tokens": 655771.0, "step": 4890 }, { "epoch": 9.227764705882352, "grad_norm": 1.3015270233154297, "learning_rate": 1.52061974236573e-05, "loss": 1.1972, "mean_token_accuracy": 0.5521455116569995, "num_tokens": 669074.0, "step": 4900 }, { "epoch": 9.246588235294118, "grad_norm": 1.4676063060760498, "learning_rate": 1.5188280405721643e-05, "loss": 1.2169, "mean_token_accuracy": 0.5410921085625887, "num_tokens": 682391.0, "step": 4910 }, { "epoch": 9.265411764705883, "grad_norm": 1.505129098892212, "learning_rate": 1.5170340568639335e-05, "loss": 1.2445, "mean_token_accuracy": 0.5468276925384998, "num_tokens": 695279.0, "step": 4920 }, { "epoch": 9.284235294117646, "grad_norm": 1.4586368799209595, "learning_rate": 1.5152377991313547e-05, "loss": 1.2183, "mean_token_accuracy": 0.5493371106684208, "num_tokens": 709036.0, "step": 4930 }, { "epoch": 9.303058823529412, "grad_norm": 1.3103828430175781, "learning_rate": 1.5134392752747469e-05, "loss": 1.2207, "mean_token_accuracy": 0.5371036138385534, "num_tokens": 721600.0, "step": 4940 }, { "epoch": 9.321882352941177, "grad_norm": 1.406219720840454, "learning_rate": 1.5116384932043953e-05, "loss": 1.2197, "mean_token_accuracy": 0.5394637104123831, "num_tokens": 734972.0, "step": 4950 }, { "epoch": 9.34070588235294, "grad_norm": 1.3175715208053589, "learning_rate": 1.5098354608405177e-05, "loss": 1.3009, "mean_token_accuracy": 0.5217017080634833, "num_tokens": 749524.0, "step": 4960 }, { "epoch": 9.359529411764706, "grad_norm": 1.1799266338348389, "learning_rate": 1.5080301861132291e-05, "loss": 1.233, "mean_token_accuracy": 0.5553332667797803, "num_tokens": 763976.0, "step": 4970 }, { "epoch": 9.378352941176471, "grad_norm": 1.2330571413040161, "learning_rate": 1.5062226769625068e-05, "loss": 1.2127, "mean_token_accuracy": 0.5426539558917284, "num_tokens": 777548.0, "step": 4980 }, { "epoch": 9.397176470588235, "grad_norm": 1.3530794382095337, "learning_rate": 1.5044129413381551e-05, "loss": 1.2137, "mean_token_accuracy": 0.5432845208793878, "num_tokens": 791104.0, "step": 4990 }, { "epoch": 9.416, "grad_norm": 1.174985647201538, "learning_rate": 1.5026009871997725e-05, "loss": 1.1936, "mean_token_accuracy": 0.5486832950264215, "num_tokens": 804784.0, "step": 5000 }, { "epoch": 9.434823529411764, "grad_norm": 0.9708495140075684, "learning_rate": 1.5007868225167124e-05, "loss": 1.2447, "mean_token_accuracy": 0.5287159774452448, "num_tokens": 817605.0, "step": 5010 }, { "epoch": 9.45364705882353, "grad_norm": 1.4748586416244507, "learning_rate": 1.4989704552680527e-05, "loss": 1.1782, "mean_token_accuracy": 0.5548595078289509, "num_tokens": 830334.0, "step": 5020 }, { "epoch": 9.472470588235295, "grad_norm": 1.4649749994277954, "learning_rate": 1.497151893442558e-05, "loss": 1.1558, "mean_token_accuracy": 0.5786185275763274, "num_tokens": 843520.0, "step": 5030 }, { "epoch": 9.491294117647058, "grad_norm": 1.3614012002944946, "learning_rate": 1.4953311450386447e-05, "loss": 1.2294, "mean_token_accuracy": 0.5436280608177185, "num_tokens": 856605.0, "step": 5040 }, { "epoch": 9.510117647058824, "grad_norm": 0.8162552714347839, "learning_rate": 1.493508218064347e-05, "loss": 1.1795, "mean_token_accuracy": 0.5606917165219784, "num_tokens": 869281.0, "step": 5050 }, { "epoch": 9.528941176470589, "grad_norm": 1.1542294025421143, "learning_rate": 1.4916831205372803e-05, "loss": 1.283, "mean_token_accuracy": 0.539304967597127, "num_tokens": 883498.0, "step": 5060 }, { "epoch": 9.547764705882352, "grad_norm": 1.3006714582443237, "learning_rate": 1.4898558604846067e-05, "loss": 1.2342, "mean_token_accuracy": 0.5408715981990099, "num_tokens": 897313.0, "step": 5070 }, { "epoch": 9.566588235294118, "grad_norm": 0.9996142983436584, "learning_rate": 1.488026445943e-05, "loss": 1.2156, "mean_token_accuracy": 0.5489041075110436, "num_tokens": 910640.0, "step": 5080 }, { "epoch": 9.585411764705881, "grad_norm": 2.1211931705474854, "learning_rate": 1.486194884958609e-05, "loss": 1.1633, "mean_token_accuracy": 0.5579564660787583, "num_tokens": 923363.0, "step": 5090 }, { "epoch": 9.604235294117647, "grad_norm": 1.2634146213531494, "learning_rate": 1.4843611855870235e-05, "loss": 1.2593, "mean_token_accuracy": 0.5273831244558096, "num_tokens": 936250.0, "step": 5100 }, { "epoch": 9.623058823529412, "grad_norm": 1.7456119060516357, "learning_rate": 1.4825253558932386e-05, "loss": 1.2228, "mean_token_accuracy": 0.5505132492631674, "num_tokens": 949552.0, "step": 5110 }, { "epoch": 9.641882352941176, "grad_norm": 1.605895757675171, "learning_rate": 1.480687403951619e-05, "loss": 1.1788, "mean_token_accuracy": 0.5624800592660903, "num_tokens": 963342.0, "step": 5120 }, { "epoch": 9.660705882352941, "grad_norm": 1.3311768770217896, "learning_rate": 1.4788473378458626e-05, "loss": 1.2062, "mean_token_accuracy": 0.5582063946872949, "num_tokens": 976717.0, "step": 5130 }, { "epoch": 9.679529411764706, "grad_norm": 1.4497061967849731, "learning_rate": 1.4770051656689672e-05, "loss": 1.228, "mean_token_accuracy": 0.5460193831473589, "num_tokens": 989772.0, "step": 5140 }, { "epoch": 9.69835294117647, "grad_norm": 1.1696816682815552, "learning_rate": 1.4751608955231924e-05, "loss": 1.1884, "mean_token_accuracy": 0.5445575587451458, "num_tokens": 1003123.0, "step": 5150 }, { "epoch": 9.717176470588235, "grad_norm": 0.9232364892959595, "learning_rate": 1.4733145355200255e-05, "loss": 1.152, "mean_token_accuracy": 0.5746063582599163, "num_tokens": 1016187.0, "step": 5160 }, { "epoch": 9.736, "grad_norm": 1.6106712818145752, "learning_rate": 1.4714660937801461e-05, "loss": 1.1799, "mean_token_accuracy": 0.5663762982934714, "num_tokens": 1029873.0, "step": 5170 }, { "epoch": 9.754823529411764, "grad_norm": 1.334657073020935, "learning_rate": 1.4696155784333885e-05, "loss": 1.1942, "mean_token_accuracy": 0.5546817529946566, "num_tokens": 1043425.0, "step": 5180 }, { "epoch": 9.77364705882353, "grad_norm": 0.8071675896644592, "learning_rate": 1.467762997618708e-05, "loss": 1.2465, "mean_token_accuracy": 0.535656175762415, "num_tokens": 1057319.0, "step": 5190 }, { "epoch": 9.792470588235295, "grad_norm": 1.1653850078582764, "learning_rate": 1.465908359484144e-05, "loss": 1.2336, "mean_token_accuracy": 0.5504809945821763, "num_tokens": 1070725.0, "step": 5200 }, { "epoch": 9.811294117647058, "grad_norm": 1.1270978450775146, "learning_rate": 1.4640516721867843e-05, "loss": 1.1989, "mean_token_accuracy": 0.5558116808533669, "num_tokens": 13834.0, "step": 5210 }, { "epoch": 9.830117647058824, "grad_norm": 1.6317771673202515, "learning_rate": 1.4621929438927299e-05, "loss": 1.2151, "mean_token_accuracy": 0.5420542072504759, "num_tokens": 27298.0, "step": 5220 }, { "epoch": 9.848941176470587, "grad_norm": 1.1831214427947998, "learning_rate": 1.4603321827770578e-05, "loss": 1.2075, "mean_token_accuracy": 0.5548371035605669, "num_tokens": 40750.0, "step": 5230 }, { "epoch": 9.867764705882353, "grad_norm": 1.4046541452407837, "learning_rate": 1.458469397023786e-05, "loss": 1.2385, "mean_token_accuracy": 0.5390195321291685, "num_tokens": 54449.0, "step": 5240 }, { "epoch": 9.886588235294118, "grad_norm": 1.1588149070739746, "learning_rate": 1.4566045948258376e-05, "loss": 1.2143, "mean_token_accuracy": 0.5551448825746774, "num_tokens": 67816.0, "step": 5250 }, { "epoch": 9.905411764705882, "grad_norm": 1.3225456476211548, "learning_rate": 1.4547377843850044e-05, "loss": 1.2199, "mean_token_accuracy": 0.5484016731381416, "num_tokens": 81192.0, "step": 5260 }, { "epoch": 9.924235294117647, "grad_norm": 0.9215822219848633, "learning_rate": 1.45286897391191e-05, "loss": 1.2136, "mean_token_accuracy": 0.5466381188482046, "num_tokens": 94106.0, "step": 5270 }, { "epoch": 9.943058823529412, "grad_norm": 1.9844329357147217, "learning_rate": 1.4509981716259762e-05, "loss": 1.2251, "mean_token_accuracy": 0.5436500191688538, "num_tokens": 107211.0, "step": 5280 }, { "epoch": 9.961882352941176, "grad_norm": 0.7866172194480896, "learning_rate": 1.4491253857553838e-05, "loss": 1.1928, "mean_token_accuracy": 0.5536798264831304, "num_tokens": 120603.0, "step": 5290 }, { "epoch": 9.980705882352941, "grad_norm": 1.3284730911254883, "learning_rate": 1.4472506245370382e-05, "loss": 1.2201, "mean_token_accuracy": 0.551696864143014, "num_tokens": 135253.0, "step": 5300 }, { "epoch": 9.999529411764707, "grad_norm": 0.8189272880554199, "learning_rate": 1.445373896216533e-05, "loss": 1.2535, "mean_token_accuracy": 0.5395314753055572, "num_tokens": 148070.0, "step": 5310 }, { "epoch": 10.018823529411765, "grad_norm": 0.9590490460395813, "learning_rate": 1.4434952090481135e-05, "loss": 1.3331, "mean_token_accuracy": 0.5544926153450478, "num_tokens": 162263.0, "step": 5320 }, { "epoch": 10.037647058823529, "grad_norm": 1.4627238512039185, "learning_rate": 1.4416145712946406e-05, "loss": 1.2488, "mean_token_accuracy": 0.5324025351554156, "num_tokens": 175371.0, "step": 5330 }, { "epoch": 10.056470588235294, "grad_norm": 0.6929643154144287, "learning_rate": 1.4397319912275535e-05, "loss": 1.2071, "mean_token_accuracy": 0.5509116105735302, "num_tokens": 188794.0, "step": 5340 }, { "epoch": 10.07529411764706, "grad_norm": 1.5115923881530762, "learning_rate": 1.437847477126835e-05, "loss": 1.1505, "mean_token_accuracy": 0.5615889120846986, "num_tokens": 201733.0, "step": 5350 }, { "epoch": 10.094117647058823, "grad_norm": 1.651714563369751, "learning_rate": 1.4359610372809739e-05, "loss": 1.2233, "mean_token_accuracy": 0.5453080747276544, "num_tokens": 214934.0, "step": 5360 }, { "epoch": 10.112941176470589, "grad_norm": 1.2535176277160645, "learning_rate": 1.4340726799869283e-05, "loss": 1.1925, "mean_token_accuracy": 0.5584179207682609, "num_tokens": 227831.0, "step": 5370 }, { "epoch": 10.131764705882352, "grad_norm": 1.8965996503829956, "learning_rate": 1.4321824135500904e-05, "loss": 1.2347, "mean_token_accuracy": 0.5445710398256779, "num_tokens": 242053.0, "step": 5380 }, { "epoch": 10.150588235294117, "grad_norm": 1.9367871284484863, "learning_rate": 1.430290246284249e-05, "loss": 1.2115, "mean_token_accuracy": 0.5574517220258712, "num_tokens": 256086.0, "step": 5390 }, { "epoch": 10.169411764705883, "grad_norm": 0.6884622573852539, "learning_rate": 1.4283961865115528e-05, "loss": 1.2457, "mean_token_accuracy": 0.5295402128249407, "num_tokens": 269977.0, "step": 5400 }, { "epoch": 10.188235294117646, "grad_norm": 0.7671216726303101, "learning_rate": 1.426500242562474e-05, "loss": 1.1288, "mean_token_accuracy": 0.5702939372509718, "num_tokens": 283412.0, "step": 5410 }, { "epoch": 10.207058823529412, "grad_norm": 1.1199065446853638, "learning_rate": 1.4246024227757735e-05, "loss": 1.2184, "mean_token_accuracy": 0.5337141178548336, "num_tokens": 296574.0, "step": 5420 }, { "epoch": 10.225882352941177, "grad_norm": 0.8241312503814697, "learning_rate": 1.4227027354984602e-05, "loss": 1.1945, "mean_token_accuracy": 0.5481650296598672, "num_tokens": 310305.0, "step": 5430 }, { "epoch": 10.24470588235294, "grad_norm": 1.6059694290161133, "learning_rate": 1.4208011890857577e-05, "loss": 1.1322, "mean_token_accuracy": 0.5755776699632407, "num_tokens": 323670.0, "step": 5440 }, { "epoch": 10.263529411764706, "grad_norm": 1.0941455364227295, "learning_rate": 1.4188977919010664e-05, "loss": 1.1634, "mean_token_accuracy": 0.5623828198760747, "num_tokens": 336409.0, "step": 5450 }, { "epoch": 10.282352941176471, "grad_norm": 0.760979175567627, "learning_rate": 1.4169925523159274e-05, "loss": 1.2111, "mean_token_accuracy": 0.5577297646552324, "num_tokens": 349680.0, "step": 5460 }, { "epoch": 10.301176470588235, "grad_norm": 1.41929292678833, "learning_rate": 1.4150854787099836e-05, "loss": 1.1846, "mean_token_accuracy": 0.5624632347375155, "num_tokens": 363183.0, "step": 5470 }, { "epoch": 10.32, "grad_norm": 0.7982503771781921, "learning_rate": 1.413176579470946e-05, "loss": 1.2039, "mean_token_accuracy": 0.5504359491169453, "num_tokens": 376390.0, "step": 5480 }, { "epoch": 10.338823529411764, "grad_norm": 1.3889517784118652, "learning_rate": 1.4112658629945535e-05, "loss": 1.1928, "mean_token_accuracy": 0.5593543030321598, "num_tokens": 389745.0, "step": 5490 }, { "epoch": 10.35764705882353, "grad_norm": 1.3614208698272705, "learning_rate": 1.409353337684539e-05, "loss": 1.2334, "mean_token_accuracy": 0.5366870552301407, "num_tokens": 404220.0, "step": 5500 }, { "epoch": 10.376470588235295, "grad_norm": 0.9981026649475098, "learning_rate": 1.4074390119525898e-05, "loss": 1.1739, "mean_token_accuracy": 0.5642281893640757, "num_tokens": 417700.0, "step": 5510 }, { "epoch": 10.395294117647058, "grad_norm": 1.0381234884262085, "learning_rate": 1.4055228942183128e-05, "loss": 1.1977, "mean_token_accuracy": 0.5563704077154398, "num_tokens": 430901.0, "step": 5520 }, { "epoch": 10.414117647058823, "grad_norm": 0.8158124089241028, "learning_rate": 1.4036049929091964e-05, "loss": 1.1914, "mean_token_accuracy": 0.5571797143667936, "num_tokens": 445094.0, "step": 5530 }, { "epoch": 10.432941176470589, "grad_norm": 0.7652572393417358, "learning_rate": 1.4016853164605728e-05, "loss": 1.2376, "mean_token_accuracy": 0.5498634003102779, "num_tokens": 459543.0, "step": 5540 }, { "epoch": 10.451764705882352, "grad_norm": 0.7951592206954956, "learning_rate": 1.3997638733155822e-05, "loss": 1.1997, "mean_token_accuracy": 0.5588535733520985, "num_tokens": 473275.0, "step": 5550 }, { "epoch": 10.470588235294118, "grad_norm": 1.2788842916488647, "learning_rate": 1.3978406719251352e-05, "loss": 1.204, "mean_token_accuracy": 0.5432504419237375, "num_tokens": 485574.0, "step": 5560 }, { "epoch": 10.489411764705883, "grad_norm": 1.9643447399139404, "learning_rate": 1.3959157207478753e-05, "loss": 1.1918, "mean_token_accuracy": 0.5582812011241913, "num_tokens": 498349.0, "step": 5570 }, { "epoch": 10.508235294117647, "grad_norm": 1.2677149772644043, "learning_rate": 1.3939890282501418e-05, "loss": 1.2043, "mean_token_accuracy": 0.5601174239069223, "num_tokens": 511915.0, "step": 5580 }, { "epoch": 10.527058823529412, "grad_norm": 1.0180656909942627, "learning_rate": 1.3920606029059332e-05, "loss": 1.2173, "mean_token_accuracy": 0.5526633080095052, "num_tokens": 524995.0, "step": 5590 }, { "epoch": 10.545882352941177, "grad_norm": 1.1644375324249268, "learning_rate": 1.3901304531968684e-05, "loss": 1.1837, "mean_token_accuracy": 0.5520532440394164, "num_tokens": 537557.0, "step": 5600 }, { "epoch": 10.564705882352941, "grad_norm": 1.3104006052017212, "learning_rate": 1.388198587612152e-05, "loss": 1.2209, "mean_token_accuracy": 0.5339883405715227, "num_tokens": 551827.0, "step": 5610 }, { "epoch": 10.583529411764706, "grad_norm": 1.103053331375122, "learning_rate": 1.386265014648534e-05, "loss": 1.154, "mean_token_accuracy": 0.5668028537184, "num_tokens": 565218.0, "step": 5620 }, { "epoch": 10.60235294117647, "grad_norm": 0.8747602105140686, "learning_rate": 1.3843297428102742e-05, "loss": 1.2476, "mean_token_accuracy": 0.5371836949139833, "num_tokens": 578938.0, "step": 5630 }, { "epoch": 10.621176470588235, "grad_norm": 0.8349719047546387, "learning_rate": 1.382392780609105e-05, "loss": 1.1669, "mean_token_accuracy": 0.5530536573380231, "num_tokens": 592617.0, "step": 5640 }, { "epoch": 10.64, "grad_norm": 1.4140478372573853, "learning_rate": 1.3804541365641923e-05, "loss": 1.2016, "mean_token_accuracy": 0.5596294030547142, "num_tokens": 606658.0, "step": 5650 }, { "epoch": 10.658823529411764, "grad_norm": 1.2245830297470093, "learning_rate": 1.3785138192021002e-05, "loss": 1.1768, "mean_token_accuracy": 0.5598421145230532, "num_tokens": 619930.0, "step": 5660 }, { "epoch": 10.67764705882353, "grad_norm": 1.3025885820388794, "learning_rate": 1.3765718370567514e-05, "loss": 1.1994, "mean_token_accuracy": 0.5509582087397575, "num_tokens": 633099.0, "step": 5670 }, { "epoch": 10.696470588235295, "grad_norm": 0.9705594778060913, "learning_rate": 1.3746281986693917e-05, "loss": 1.1976, "mean_token_accuracy": 0.5644174017012119, "num_tokens": 647248.0, "step": 5680 }, { "epoch": 10.715294117647058, "grad_norm": 1.8273649215698242, "learning_rate": 1.3726829125885501e-05, "loss": 1.1895, "mean_token_accuracy": 0.5520309090614319, "num_tokens": 660733.0, "step": 5690 }, { "epoch": 10.734117647058824, "grad_norm": 0.9448793530464172, "learning_rate": 1.370735987370004e-05, "loss": 1.1612, "mean_token_accuracy": 0.5580623522400856, "num_tokens": 674331.0, "step": 5700 }, { "epoch": 10.75294117647059, "grad_norm": 0.7382903099060059, "learning_rate": 1.3687874315767388e-05, "loss": 1.1906, "mean_token_accuracy": 0.5549033779650927, "num_tokens": 687329.0, "step": 5710 }, { "epoch": 10.771764705882353, "grad_norm": 0.9180198907852173, "learning_rate": 1.3668372537789122e-05, "loss": 1.1708, "mean_token_accuracy": 0.554550190269947, "num_tokens": 701188.0, "step": 5720 }, { "epoch": 10.790588235294118, "grad_norm": 1.3416258096694946, "learning_rate": 1.3648854625538161e-05, "loss": 1.2009, "mean_token_accuracy": 0.5456226222217083, "num_tokens": 715055.0, "step": 5730 }, { "epoch": 10.809411764705882, "grad_norm": 0.9519694447517395, "learning_rate": 1.3629320664858373e-05, "loss": 1.188, "mean_token_accuracy": 0.5580568216741085, "num_tokens": 728299.0, "step": 5740 }, { "epoch": 10.828235294117647, "grad_norm": 0.9768867492675781, "learning_rate": 1.3609770741664225e-05, "loss": 1.1748, "mean_token_accuracy": 0.5568192675709724, "num_tokens": 740400.0, "step": 5750 }, { "epoch": 10.847058823529412, "grad_norm": 1.2277079820632935, "learning_rate": 1.3590204941940384e-05, "loss": 1.1883, "mean_token_accuracy": 0.5541429404169321, "num_tokens": 753926.0, "step": 5760 }, { "epoch": 10.865882352941176, "grad_norm": 0.912382960319519, "learning_rate": 1.3570623351741343e-05, "loss": 1.2201, "mean_token_accuracy": 0.5405797265470028, "num_tokens": 767363.0, "step": 5770 }, { "epoch": 10.884705882352941, "grad_norm": 1.2783665657043457, "learning_rate": 1.3551026057191045e-05, "loss": 1.2285, "mean_token_accuracy": 0.5442549273371696, "num_tokens": 780491.0, "step": 5780 }, { "epoch": 10.903529411764707, "grad_norm": 0.789916455745697, "learning_rate": 1.3531413144482512e-05, "loss": 1.23, "mean_token_accuracy": 0.5449609015136957, "num_tokens": 793793.0, "step": 5790 }, { "epoch": 10.92235294117647, "grad_norm": 1.2650339603424072, "learning_rate": 1.351178469987745e-05, "loss": 1.2049, "mean_token_accuracy": 0.543266024813056, "num_tokens": 807792.0, "step": 5800 }, { "epoch": 10.941176470588236, "grad_norm": 0.9021736979484558, "learning_rate": 1.3492140809705881e-05, "loss": 1.1796, "mean_token_accuracy": 0.5581005875021219, "num_tokens": 821010.0, "step": 5810 }, { "epoch": 10.96, "grad_norm": 0.8924301266670227, "learning_rate": 1.3472481560365758e-05, "loss": 1.1796, "mean_token_accuracy": 0.5598813854157925, "num_tokens": 833101.0, "step": 5820 }, { "epoch": 10.978823529411764, "grad_norm": 1.8735415935516357, "learning_rate": 1.3452807038322585e-05, "loss": 1.1992, "mean_token_accuracy": 0.5387950103729964, "num_tokens": 845747.0, "step": 5830 }, { "epoch": 10.99764705882353, "grad_norm": 0.8486454486846924, "learning_rate": 1.3433117330109045e-05, "loss": 1.1955, "mean_token_accuracy": 0.550658929720521, "num_tokens": 859896.0, "step": 5840 }, { "epoch": 11.015058823529412, "grad_norm": 0.8041768670082092, "learning_rate": 1.3413412522324609e-05, "loss": 1.1699, "mean_token_accuracy": 0.5523232479353208, "num_tokens": 871831.0, "step": 5850 }, { "epoch": 11.033882352941177, "grad_norm": 1.445483922958374, "learning_rate": 1.3393692701635154e-05, "loss": 1.214, "mean_token_accuracy": 0.5537016060203314, "num_tokens": 885406.0, "step": 5860 }, { "epoch": 11.05270588235294, "grad_norm": 0.697123110294342, "learning_rate": 1.33739579547726e-05, "loss": 1.1622, "mean_token_accuracy": 0.5540354669094085, "num_tokens": 898421.0, "step": 5870 }, { "epoch": 11.071529411764706, "grad_norm": 1.465420126914978, "learning_rate": 1.3354208368534503e-05, "loss": 1.2069, "mean_token_accuracy": 0.551685893163085, "num_tokens": 912642.0, "step": 5880 }, { "epoch": 11.090352941176471, "grad_norm": 0.859109103679657, "learning_rate": 1.333444402978369e-05, "loss": 1.1826, "mean_token_accuracy": 0.5513388890773058, "num_tokens": 926366.0, "step": 5890 }, { "epoch": 11.109176470588235, "grad_norm": 1.5829471349716187, "learning_rate": 1.3314665025447876e-05, "loss": 1.2038, "mean_token_accuracy": 0.5505582805722952, "num_tokens": 939684.0, "step": 5900 }, { "epoch": 11.128, "grad_norm": 0.8772886991500854, "learning_rate": 1.3294871442519271e-05, "loss": 1.1629, "mean_token_accuracy": 0.5473615158349275, "num_tokens": 953085.0, "step": 5910 }, { "epoch": 11.146823529411765, "grad_norm": 0.9280370473861694, "learning_rate": 1.3275063368054208e-05, "loss": 1.1943, "mean_token_accuracy": 0.5425072379410267, "num_tokens": 966596.0, "step": 5920 }, { "epoch": 11.165647058823529, "grad_norm": 1.6158865690231323, "learning_rate": 1.3255240889172764e-05, "loss": 1.1669, "mean_token_accuracy": 0.5687259271740913, "num_tokens": 981302.0, "step": 5930 }, { "epoch": 11.184470588235294, "grad_norm": 0.9575416445732117, "learning_rate": 1.323540409305836e-05, "loss": 1.1828, "mean_token_accuracy": 0.5527924958616495, "num_tokens": 993437.0, "step": 5940 }, { "epoch": 11.203294117647058, "grad_norm": 0.8492655158042908, "learning_rate": 1.3215553066957391e-05, "loss": 1.1352, "mean_token_accuracy": 0.5696950633078813, "num_tokens": 1006394.0, "step": 5950 }, { "epoch": 11.222117647058823, "grad_norm": 1.3289772272109985, "learning_rate": 1.3195687898178837e-05, "loss": 1.198, "mean_token_accuracy": 0.5483724296092987, "num_tokens": 1019972.0, "step": 5960 }, { "epoch": 11.240941176470589, "grad_norm": 0.8151838779449463, "learning_rate": 1.3175808674093882e-05, "loss": 1.2229, "mean_token_accuracy": 0.5452193580567837, "num_tokens": 1033578.0, "step": 5970 }, { "epoch": 11.259764705882352, "grad_norm": 1.242629051208496, "learning_rate": 1.3155915482135528e-05, "loss": 1.2035, "mean_token_accuracy": 0.5487495046108961, "num_tokens": 1046758.0, "step": 5980 }, { "epoch": 11.278588235294118, "grad_norm": 1.238258719444275, "learning_rate": 1.3136008409798214e-05, "loss": 1.1993, "mean_token_accuracy": 0.5489524565637112, "num_tokens": 1061219.0, "step": 5990 }, { "epoch": 11.297411764705883, "grad_norm": 1.2245213985443115, "learning_rate": 1.3116087544637415e-05, "loss": 1.1616, "mean_token_accuracy": 0.5596596848219633, "num_tokens": 1074213.0, "step": 6000 }, { "epoch": 11.316235294117647, "grad_norm": 0.8657311797142029, "learning_rate": 1.3096152974269289e-05, "loss": 1.179, "mean_token_accuracy": 0.5485074911266565, "num_tokens": 1088151.0, "step": 6010 }, { "epoch": 11.335058823529412, "grad_norm": 0.8435884714126587, "learning_rate": 1.3076204786370256e-05, "loss": 1.1862, "mean_token_accuracy": 0.5667649589478969, "num_tokens": 1101736.0, "step": 6020 }, { "epoch": 11.353882352941177, "grad_norm": 4.664355278015137, "learning_rate": 1.3056243068676637e-05, "loss": 1.1899, "mean_token_accuracy": 0.5602201897650957, "num_tokens": 1115368.0, "step": 6030 }, { "epoch": 11.37270588235294, "grad_norm": 0.7196553945541382, "learning_rate": 1.3036267908984257e-05, "loss": 1.2337, "mean_token_accuracy": 0.5351801011711359, "num_tokens": 1128875.0, "step": 6040 }, { "epoch": 11.391529411764706, "grad_norm": 0.8050165772438049, "learning_rate": 1.3016279395148067e-05, "loss": 1.2082, "mean_token_accuracy": 0.5454613540321589, "num_tokens": 1141185.0, "step": 6050 }, { "epoch": 11.41035294117647, "grad_norm": 1.6665176153182983, "learning_rate": 1.2996277615081738e-05, "loss": 1.1941, "mean_token_accuracy": 0.5567255288362503, "num_tokens": 1154738.0, "step": 6060 }, { "epoch": 11.429176470588235, "grad_norm": 1.7246068716049194, "learning_rate": 1.297626265675731e-05, "loss": 1.19, "mean_token_accuracy": 0.5438788242638111, "num_tokens": 1168172.0, "step": 6070 }, { "epoch": 11.448, "grad_norm": 0.8024677038192749, "learning_rate": 1.2956234608204765e-05, "loss": 1.2072, "mean_token_accuracy": 0.5483981113880873, "num_tokens": 1181363.0, "step": 6080 }, { "epoch": 11.466823529411764, "grad_norm": 1.0496245622634888, "learning_rate": 1.293619355751167e-05, "loss": 1.1437, "mean_token_accuracy": 0.5686488572508097, "num_tokens": 1195397.0, "step": 6090 }, { "epoch": 11.48564705882353, "grad_norm": 0.6598522067070007, "learning_rate": 1.2916139592822776e-05, "loss": 1.2051, "mean_token_accuracy": 0.5419987261295318, "num_tokens": 1208020.0, "step": 6100 }, { "epoch": 11.504470588235295, "grad_norm": 1.8896702527999878, "learning_rate": 1.2896072802339623e-05, "loss": 1.1603, "mean_token_accuracy": 0.571081367880106, "num_tokens": 1222044.0, "step": 6110 }, { "epoch": 11.523294117647058, "grad_norm": 0.8881447911262512, "learning_rate": 1.2875993274320173e-05, "loss": 1.1703, "mean_token_accuracy": 0.5585772100836038, "num_tokens": 1236218.0, "step": 6120 }, { "epoch": 11.542117647058824, "grad_norm": 1.263449788093567, "learning_rate": 1.2855901097078412e-05, "loss": 1.1544, "mean_token_accuracy": 0.5590313211083412, "num_tokens": 1249412.0, "step": 6130 }, { "epoch": 11.560941176470589, "grad_norm": 1.120104432106018, "learning_rate": 1.2835796358983943e-05, "loss": 1.2265, "mean_token_accuracy": 0.5422938629984856, "num_tokens": 1262521.0, "step": 6140 }, { "epoch": 11.579764705882353, "grad_norm": 1.0734158754348755, "learning_rate": 1.2815679148461636e-05, "loss": 1.2199, "mean_token_accuracy": 0.5645121570676566, "num_tokens": 1276508.0, "step": 6150 }, { "epoch": 11.598588235294118, "grad_norm": 0.7284833192825317, "learning_rate": 1.2795549553991202e-05, "loss": 1.1995, "mean_token_accuracy": 0.55781021527946, "num_tokens": 1289814.0, "step": 6160 }, { "epoch": 11.617411764705881, "grad_norm": 0.9633259773254395, "learning_rate": 1.2775407664106825e-05, "loss": 1.1882, "mean_token_accuracy": 0.555243044346571, "num_tokens": 1303074.0, "step": 6170 }, { "epoch": 11.636235294117647, "grad_norm": 0.6576571464538574, "learning_rate": 1.2755253567396766e-05, "loss": 1.208, "mean_token_accuracy": 0.5453934874385595, "num_tokens": 1316357.0, "step": 6180 }, { "epoch": 11.655058823529412, "grad_norm": 2.550999879837036, "learning_rate": 1.2735087352502977e-05, "loss": 1.2179, "mean_token_accuracy": 0.5549823541194201, "num_tokens": 1329683.0, "step": 6190 }, { "epoch": 11.673882352941176, "grad_norm": 1.061109185218811, "learning_rate": 1.2714909108120698e-05, "loss": 1.1632, "mean_token_accuracy": 0.5776005409657955, "num_tokens": 1342884.0, "step": 6200 }, { "epoch": 11.694588235294118, "grad_norm": 2.3584885597229004, "learning_rate": 1.2694718922998097e-05, "loss": 1.1887, "mean_token_accuracy": 0.5592446334660053, "num_tokens": 13595.0, "step": 6210 }, { "epoch": 11.713411764705882, "grad_norm": 0.8795002698898315, "learning_rate": 1.2674516885935835e-05, "loss": 1.1999, "mean_token_accuracy": 0.5442189387977123, "num_tokens": 26488.0, "step": 6220 }, { "epoch": 11.732235294117647, "grad_norm": 0.8854806423187256, "learning_rate": 1.2654303085786723e-05, "loss": 1.1295, "mean_token_accuracy": 0.5750894896686077, "num_tokens": 40270.0, "step": 6230 }, { "epoch": 11.751058823529412, "grad_norm": 0.8246656656265259, "learning_rate": 1.2634077611455294e-05, "loss": 1.1946, "mean_token_accuracy": 0.5488846648484469, "num_tokens": 54551.0, "step": 6240 }, { "epoch": 11.769882352941176, "grad_norm": 1.1240957975387573, "learning_rate": 1.2613840551897428e-05, "loss": 1.2098, "mean_token_accuracy": 0.5464079327881336, "num_tokens": 68016.0, "step": 6250 }, { "epoch": 11.788705882352941, "grad_norm": 0.672888994216919, "learning_rate": 1.2593591996119965e-05, "loss": 1.2383, "mean_token_accuracy": 0.5364337969571352, "num_tokens": 82740.0, "step": 6260 }, { "epoch": 11.807529411764706, "grad_norm": 0.9078545570373535, "learning_rate": 1.257333203318031e-05, "loss": 1.1736, "mean_token_accuracy": 0.5599259410053492, "num_tokens": 95371.0, "step": 6270 }, { "epoch": 11.82635294117647, "grad_norm": 1.5212457180023193, "learning_rate": 1.2553060752186024e-05, "loss": 1.1655, "mean_token_accuracy": 0.5664675917476416, "num_tokens": 108447.0, "step": 6280 }, { "epoch": 11.845176470588235, "grad_norm": 0.6960899829864502, "learning_rate": 1.2532778242294467e-05, "loss": 1.184, "mean_token_accuracy": 0.559162225574255, "num_tokens": 121975.0, "step": 6290 }, { "epoch": 11.864, "grad_norm": 1.1340759992599487, "learning_rate": 1.2512484592712373e-05, "loss": 1.1874, "mean_token_accuracy": 0.5441106397658586, "num_tokens": 135823.0, "step": 6300 }, { "epoch": 11.882823529411764, "grad_norm": 0.7761991024017334, "learning_rate": 1.2492179892695473e-05, "loss": 1.1632, "mean_token_accuracy": 0.5639519464224577, "num_tokens": 147887.0, "step": 6310 }, { "epoch": 11.90164705882353, "grad_norm": 1.0354701280593872, "learning_rate": 1.24718642315481e-05, "loss": 1.1432, "mean_token_accuracy": 0.5634768087416887, "num_tokens": 161408.0, "step": 6320 }, { "epoch": 11.920470588235293, "grad_norm": 0.9502993822097778, "learning_rate": 1.2451537698622799e-05, "loss": 1.134, "mean_token_accuracy": 0.5664606466889381, "num_tokens": 173813.0, "step": 6330 }, { "epoch": 11.939294117647059, "grad_norm": 0.7904968857765198, "learning_rate": 1.2431200383319931e-05, "loss": 1.1782, "mean_token_accuracy": 0.5518010076135397, "num_tokens": 186645.0, "step": 6340 }, { "epoch": 11.958117647058824, "grad_norm": 0.6425819993019104, "learning_rate": 1.2410852375087279e-05, "loss": 1.1928, "mean_token_accuracy": 0.5603426963090896, "num_tokens": 201324.0, "step": 6350 }, { "epoch": 11.976941176470588, "grad_norm": 1.029788851737976, "learning_rate": 1.2390493763419654e-05, "loss": 1.2296, "mean_token_accuracy": 0.530813605338335, "num_tokens": 213844.0, "step": 6360 }, { "epoch": 11.995764705882353, "grad_norm": 1.0189554691314697, "learning_rate": 1.2370124637858508e-05, "loss": 1.125, "mean_token_accuracy": 0.5650646161288023, "num_tokens": 227343.0, "step": 6370 }, { "epoch": 12.015058823529412, "grad_norm": 1.3754558563232422, "learning_rate": 1.2349745087991529e-05, "loss": 1.2719, "mean_token_accuracy": 0.5659251939959642, "num_tokens": 241245.0, "step": 6380 }, { "epoch": 12.033882352941177, "grad_norm": 1.338383436203003, "learning_rate": 1.2329355203452258e-05, "loss": 1.2056, "mean_token_accuracy": 0.5403179809451103, "num_tokens": 254997.0, "step": 6390 }, { "epoch": 12.05270588235294, "grad_norm": 0.7401157021522522, "learning_rate": 1.2308955073919688e-05, "loss": 1.2027, "mean_token_accuracy": 0.5495895497500897, "num_tokens": 268506.0, "step": 6400 }, { "epoch": 12.071529411764706, "grad_norm": 1.382067084312439, "learning_rate": 1.2288544789117876e-05, "loss": 1.1167, "mean_token_accuracy": 0.5725257787853479, "num_tokens": 281845.0, "step": 6410 }, { "epoch": 12.090352941176471, "grad_norm": 1.1322115659713745, "learning_rate": 1.2268124438815531e-05, "loss": 1.1649, "mean_token_accuracy": 0.5655132979154587, "num_tokens": 294981.0, "step": 6420 }, { "epoch": 12.109176470588235, "grad_norm": 2.3306519985198975, "learning_rate": 1.2247694112825654e-05, "loss": 1.1841, "mean_token_accuracy": 0.5553363788872957, "num_tokens": 308385.0, "step": 6430 }, { "epoch": 12.128, "grad_norm": 1.036372423171997, "learning_rate": 1.2227253901005101e-05, "loss": 1.1893, "mean_token_accuracy": 0.550970122590661, "num_tokens": 321685.0, "step": 6440 }, { "epoch": 12.146823529411765, "grad_norm": 1.155049443244934, "learning_rate": 1.2206803893254215e-05, "loss": 1.1504, "mean_token_accuracy": 0.5654265254735946, "num_tokens": 334803.0, "step": 6450 }, { "epoch": 12.165647058823529, "grad_norm": 0.8562523126602173, "learning_rate": 1.2186344179516425e-05, "loss": 1.1566, "mean_token_accuracy": 0.5620875429362059, "num_tokens": 347350.0, "step": 6460 }, { "epoch": 12.184470588235294, "grad_norm": 1.5091642141342163, "learning_rate": 1.2165874849777853e-05, "loss": 1.1878, "mean_token_accuracy": 0.5486861743032933, "num_tokens": 361251.0, "step": 6470 }, { "epoch": 12.203294117647058, "grad_norm": 1.0562283992767334, "learning_rate": 1.21453959940669e-05, "loss": 1.2455, "mean_token_accuracy": 0.5419348709285259, "num_tokens": 374607.0, "step": 6480 }, { "epoch": 12.222117647058823, "grad_norm": 2.187586545944214, "learning_rate": 1.2124907702453883e-05, "loss": 1.1733, "mean_token_accuracy": 0.5536637313663959, "num_tokens": 388714.0, "step": 6490 }, { "epoch": 12.240941176470589, "grad_norm": 1.4512325525283813, "learning_rate": 1.2104410065050605e-05, "loss": 1.2252, "mean_token_accuracy": 0.5447334434837103, "num_tokens": 402108.0, "step": 6500 }, { "epoch": 12.259764705882352, "grad_norm": 1.3915634155273438, "learning_rate": 1.208390317200998e-05, "loss": 1.1606, "mean_token_accuracy": 0.5588117640465498, "num_tokens": 417457.0, "step": 6510 }, { "epoch": 12.278588235294118, "grad_norm": 1.3058298826217651, "learning_rate": 1.2063387113525635e-05, "loss": 1.1624, "mean_token_accuracy": 0.567823113501072, "num_tokens": 429732.0, "step": 6520 }, { "epoch": 12.297411764705883, "grad_norm": 1.4169563055038452, "learning_rate": 1.2042861979831496e-05, "loss": 1.1758, "mean_token_accuracy": 0.5708753641694784, "num_tokens": 444198.0, "step": 6530 }, { "epoch": 12.316235294117647, "grad_norm": 0.916401207447052, "learning_rate": 1.202232786120141e-05, "loss": 1.1952, "mean_token_accuracy": 0.5533534411340952, "num_tokens": 457447.0, "step": 6540 }, { "epoch": 12.335058823529412, "grad_norm": 1.6477797031402588, "learning_rate": 1.200178484794875e-05, "loss": 1.1731, "mean_token_accuracy": 0.5523608162999153, "num_tokens": 471188.0, "step": 6550 }, { "epoch": 12.353882352941177, "grad_norm": 0.7916552424430847, "learning_rate": 1.1981233030425996e-05, "loss": 1.1525, "mean_token_accuracy": 0.5627595514059067, "num_tokens": 483973.0, "step": 6560 }, { "epoch": 12.37270588235294, "grad_norm": 1.6025060415267944, "learning_rate": 1.1960672499024359e-05, "loss": 1.1149, "mean_token_accuracy": 0.5760308355093002, "num_tokens": 497279.0, "step": 6570 }, { "epoch": 12.391529411764706, "grad_norm": 1.0256032943725586, "learning_rate": 1.1940103344173375e-05, "loss": 1.1646, "mean_token_accuracy": 0.5561530087143183, "num_tokens": 511615.0, "step": 6580 }, { "epoch": 12.41035294117647, "grad_norm": 1.3151596784591675, "learning_rate": 1.1919525656340503e-05, "loss": 1.1609, "mean_token_accuracy": 0.5567661169916391, "num_tokens": 524047.0, "step": 6590 }, { "epoch": 12.429176470588235, "grad_norm": 0.9940578937530518, "learning_rate": 1.1898939526030732e-05, "loss": 1.199, "mean_token_accuracy": 0.5474131718277931, "num_tokens": 537761.0, "step": 6600 }, { "epoch": 12.448, "grad_norm": 0.7173454165458679, "learning_rate": 1.1878345043786195e-05, "loss": 1.1698, "mean_token_accuracy": 0.5660860728472471, "num_tokens": 551598.0, "step": 6610 }, { "epoch": 12.466823529411764, "grad_norm": 1.1414166688919067, "learning_rate": 1.1857742300185739e-05, "loss": 1.2336, "mean_token_accuracy": 0.5513837717473506, "num_tokens": 566797.0, "step": 6620 }, { "epoch": 12.48564705882353, "grad_norm": 0.6648653745651245, "learning_rate": 1.1837131385844567e-05, "loss": 1.1919, "mean_token_accuracy": 0.5441902942955494, "num_tokens": 580339.0, "step": 6630 }, { "epoch": 12.504470588235295, "grad_norm": 1.2969242334365845, "learning_rate": 1.1816512391413798e-05, "loss": 1.1584, "mean_token_accuracy": 0.5687514644116163, "num_tokens": 592369.0, "step": 6640 }, { "epoch": 12.523294117647058, "grad_norm": 0.6251775622367859, "learning_rate": 1.179588540758011e-05, "loss": 1.2068, "mean_token_accuracy": 0.5384650267660618, "num_tokens": 605961.0, "step": 6650 }, { "epoch": 12.542117647058824, "grad_norm": 1.3755369186401367, "learning_rate": 1.1775250525065297e-05, "loss": 1.1859, "mean_token_accuracy": 0.5518178451806307, "num_tokens": 618337.0, "step": 6660 }, { "epoch": 12.560941176470589, "grad_norm": 1.2308052778244019, "learning_rate": 1.1754607834625915e-05, "loss": 1.2075, "mean_token_accuracy": 0.5420106790959835, "num_tokens": 632237.0, "step": 6670 }, { "epoch": 12.579764705882353, "grad_norm": 0.7645729184150696, "learning_rate": 1.1733957427052842e-05, "loss": 1.1931, "mean_token_accuracy": 0.562155156955123, "num_tokens": 645494.0, "step": 6680 }, { "epoch": 12.598588235294118, "grad_norm": 0.6689856052398682, "learning_rate": 1.1713299393170916e-05, "loss": 1.1567, "mean_token_accuracy": 0.5580319032073021, "num_tokens": 658861.0, "step": 6690 }, { "epoch": 12.617411764705881, "grad_norm": 1.2952977418899536, "learning_rate": 1.1692633823838503e-05, "loss": 1.1983, "mean_token_accuracy": 0.5488288260996341, "num_tokens": 671873.0, "step": 6700 }, { "epoch": 12.636235294117647, "grad_norm": 0.988854169845581, "learning_rate": 1.1671960809947116e-05, "loss": 1.2001, "mean_token_accuracy": 0.5581530544906854, "num_tokens": 684288.0, "step": 6710 }, { "epoch": 12.655058823529412, "grad_norm": 0.9140803813934326, "learning_rate": 1.165128044242101e-05, "loss": 1.1754, "mean_token_accuracy": 0.5484160725027323, "num_tokens": 696428.0, "step": 6720 }, { "epoch": 12.673882352941176, "grad_norm": 1.194382905960083, "learning_rate": 1.163059281221679e-05, "loss": 1.16, "mean_token_accuracy": 0.5650255784392357, "num_tokens": 709841.0, "step": 6730 }, { "epoch": 12.692705882352941, "grad_norm": 0.7279021143913269, "learning_rate": 1.1609898010322989e-05, "loss": 1.1799, "mean_token_accuracy": 0.5441335134208203, "num_tokens": 724299.0, "step": 6740 }, { "epoch": 12.711529411764706, "grad_norm": 0.7829269766807556, "learning_rate": 1.1589196127759697e-05, "loss": 1.1982, "mean_token_accuracy": 0.5436731087043881, "num_tokens": 737467.0, "step": 6750 }, { "epoch": 12.73035294117647, "grad_norm": 0.908854603767395, "learning_rate": 1.1568487255578135e-05, "loss": 1.1589, "mean_token_accuracy": 0.5564702823758125, "num_tokens": 751035.0, "step": 6760 }, { "epoch": 12.749176470588235, "grad_norm": 0.8606781363487244, "learning_rate": 1.1547771484860282e-05, "loss": 1.1811, "mean_token_accuracy": 0.5530305828899145, "num_tokens": 764012.0, "step": 6770 }, { "epoch": 12.768, "grad_norm": 0.8715227246284485, "learning_rate": 1.1527048906718434e-05, "loss": 1.1731, "mean_token_accuracy": 0.5534448944032192, "num_tokens": 777823.0, "step": 6780 }, { "epoch": 12.786823529411764, "grad_norm": 1.383436918258667, "learning_rate": 1.1506319612294855e-05, "loss": 1.2038, "mean_token_accuracy": 0.5430160872638226, "num_tokens": 791112.0, "step": 6790 }, { "epoch": 12.80564705882353, "grad_norm": 0.6807175278663635, "learning_rate": 1.148558369276132e-05, "loss": 1.1325, "mean_token_accuracy": 0.5736443504691124, "num_tokens": 804227.0, "step": 6800 }, { "epoch": 12.824470588235293, "grad_norm": 1.107948660850525, "learning_rate": 1.1464841239318764e-05, "loss": 1.1518, "mean_token_accuracy": 0.5673416070640087, "num_tokens": 817620.0, "step": 6810 }, { "epoch": 12.843294117647059, "grad_norm": 0.7133264541625977, "learning_rate": 1.1444092343196855e-05, "loss": 1.1768, "mean_token_accuracy": 0.5533497478812933, "num_tokens": 831699.0, "step": 6820 }, { "epoch": 12.862117647058824, "grad_norm": 0.7470325231552124, "learning_rate": 1.1423337095653595e-05, "loss": 1.1794, "mean_token_accuracy": 0.556913785263896, "num_tokens": 845041.0, "step": 6830 }, { "epoch": 12.880941176470587, "grad_norm": 0.7599585056304932, "learning_rate": 1.1402575587974915e-05, "loss": 1.1831, "mean_token_accuracy": 0.5495749611407519, "num_tokens": 858034.0, "step": 6840 }, { "epoch": 12.899764705882353, "grad_norm": 0.9152631163597107, "learning_rate": 1.1381807911474291e-05, "loss": 1.1693, "mean_token_accuracy": 0.5672723963856697, "num_tokens": 871960.0, "step": 6850 }, { "epoch": 12.918588235294118, "grad_norm": 1.0719937086105347, "learning_rate": 1.1361034157492324e-05, "loss": 1.2041, "mean_token_accuracy": 0.5518028371036052, "num_tokens": 884661.0, "step": 6860 }, { "epoch": 12.937411764705882, "grad_norm": 1.084991455078125, "learning_rate": 1.1340254417396343e-05, "loss": 1.2011, "mean_token_accuracy": 0.5481019847095012, "num_tokens": 897816.0, "step": 6870 }, { "epoch": 12.956235294117647, "grad_norm": 1.3787931203842163, "learning_rate": 1.131946878258001e-05, "loss": 1.1315, "mean_token_accuracy": 0.5697043187916279, "num_tokens": 910552.0, "step": 6880 }, { "epoch": 12.975058823529412, "grad_norm": 1.2762988805770874, "learning_rate": 1.1298677344462914e-05, "loss": 1.1643, "mean_token_accuracy": 0.5592548452317715, "num_tokens": 924705.0, "step": 6890 }, { "epoch": 12.993882352941176, "grad_norm": 0.8996446132659912, "learning_rate": 1.127788019449016e-05, "loss": 1.2202, "mean_token_accuracy": 0.5417749028652906, "num_tokens": 938010.0, "step": 6900 }, { "epoch": 13.01129411764706, "grad_norm": 1.195081114768982, "learning_rate": 1.1257077424131985e-05, "loss": 1.1559, "mean_token_accuracy": 0.5514025462640298, "num_tokens": 951050.0, "step": 6910 }, { "epoch": 13.030117647058823, "grad_norm": 1.7555843591690063, "learning_rate": 1.1236269124883339e-05, "loss": 1.1524, "mean_token_accuracy": 0.5562022086232901, "num_tokens": 964411.0, "step": 6920 }, { "epoch": 13.048941176470588, "grad_norm": 1.3604152202606201, "learning_rate": 1.1215455388263496e-05, "loss": 1.1602, "mean_token_accuracy": 0.5531352117657662, "num_tokens": 976752.0, "step": 6930 }, { "epoch": 13.067764705882352, "grad_norm": 1.0296913385391235, "learning_rate": 1.1194636305815635e-05, "loss": 1.1608, "mean_token_accuracy": 0.5703556634485721, "num_tokens": 989410.0, "step": 6940 }, { "epoch": 13.086588235294117, "grad_norm": 1.0703682899475098, "learning_rate": 1.1173811969106451e-05, "loss": 1.1665, "mean_token_accuracy": 0.5600442342460156, "num_tokens": 1003038.0, "step": 6950 }, { "epoch": 13.105411764705883, "grad_norm": 0.9015535712242126, "learning_rate": 1.1152982469725755e-05, "loss": 1.1816, "mean_token_accuracy": 0.5589367963373661, "num_tokens": 1017162.0, "step": 6960 }, { "epoch": 13.124235294117646, "grad_norm": 0.7695736885070801, "learning_rate": 1.1132147899286054e-05, "loss": 1.2044, "mean_token_accuracy": 0.5554168112576008, "num_tokens": 1030524.0, "step": 6970 }, { "epoch": 13.143058823529412, "grad_norm": 0.6659622192382812, "learning_rate": 1.1111308349422165e-05, "loss": 1.1464, "mean_token_accuracy": 0.5591688379645348, "num_tokens": 1043594.0, "step": 6980 }, { "epoch": 13.161882352941177, "grad_norm": 0.7469462156295776, "learning_rate": 1.1090463911790807e-05, "loss": 1.1107, "mean_token_accuracy": 0.5700595445930958, "num_tokens": 1057281.0, "step": 6990 }, { "epoch": 13.18070588235294, "grad_norm": 0.7391088008880615, "learning_rate": 1.1069614678070193e-05, "loss": 1.1526, "mean_token_accuracy": 0.5636973019689322, "num_tokens": 1071037.0, "step": 7000 }, { "epoch": 13.199529411764706, "grad_norm": 1.2832854986190796, "learning_rate": 1.1048760739959628e-05, "loss": 1.1978, "mean_token_accuracy": 0.5526088491082192, "num_tokens": 1084747.0, "step": 7010 }, { "epoch": 13.218352941176471, "grad_norm": 0.7462761402130127, "learning_rate": 1.1027902189179107e-05, "loss": 1.1735, "mean_token_accuracy": 0.5575515639036894, "num_tokens": 1098389.0, "step": 7020 }, { "epoch": 13.237176470588235, "grad_norm": 0.6917766332626343, "learning_rate": 1.1007039117468928e-05, "loss": 1.1831, "mean_token_accuracy": 0.5607794526964426, "num_tokens": 1111600.0, "step": 7030 }, { "epoch": 13.256, "grad_norm": 0.8664299845695496, "learning_rate": 1.0986171616589247e-05, "loss": 1.1687, "mean_token_accuracy": 0.5606127306818962, "num_tokens": 1125368.0, "step": 7040 }, { "epoch": 13.274823529411766, "grad_norm": 0.7263596653938293, "learning_rate": 1.0965299778319728e-05, "loss": 1.2441, "mean_token_accuracy": 0.5351598154753446, "num_tokens": 1139128.0, "step": 7050 }, { "epoch": 13.29364705882353, "grad_norm": 1.76611328125, "learning_rate": 1.0944423694459087e-05, "loss": 1.1559, "mean_token_accuracy": 0.567909749224782, "num_tokens": 1152782.0, "step": 7060 }, { "epoch": 13.312470588235294, "grad_norm": 1.1992835998535156, "learning_rate": 1.0923543456824737e-05, "loss": 1.1802, "mean_token_accuracy": 0.5539811603724957, "num_tokens": 1166674.0, "step": 7070 }, { "epoch": 13.331294117647058, "grad_norm": 1.3675031661987305, "learning_rate": 1.0902659157252333e-05, "loss": 1.1545, "mean_token_accuracy": 0.5604531057178974, "num_tokens": 1180331.0, "step": 7080 }, { "epoch": 13.350117647058823, "grad_norm": 0.7396109700202942, "learning_rate": 1.088177088759542e-05, "loss": 1.158, "mean_token_accuracy": 0.5598292458802462, "num_tokens": 1193986.0, "step": 7090 }, { "epoch": 13.368941176470589, "grad_norm": 0.7911150455474854, "learning_rate": 1.0860878739724989e-05, "loss": 1.1662, "mean_token_accuracy": 0.5609062645584345, "num_tokens": 1207593.0, "step": 7100 }, { "epoch": 13.387764705882352, "grad_norm": 0.7450502514839172, "learning_rate": 1.0839982805529097e-05, "loss": 1.1734, "mean_token_accuracy": 0.5542735267430544, "num_tokens": 1221421.0, "step": 7110 }, { "epoch": 13.406588235294118, "grad_norm": 0.695760190486908, "learning_rate": 1.0819083176912446e-05, "loss": 1.203, "mean_token_accuracy": 0.5460193280130625, "num_tokens": 1235153.0, "step": 7120 }, { "epoch": 13.425411764705883, "grad_norm": 0.8482743501663208, "learning_rate": 1.0798179945795996e-05, "loss": 1.2084, "mean_token_accuracy": 0.5523406885564327, "num_tokens": 1248129.0, "step": 7130 }, { "epoch": 13.444235294117647, "grad_norm": 0.7389087677001953, "learning_rate": 1.0777273204116541e-05, "loss": 1.1817, "mean_token_accuracy": 0.5641430784016848, "num_tokens": 1261697.0, "step": 7140 }, { "epoch": 13.463058823529412, "grad_norm": 0.7677028179168701, "learning_rate": 1.0756363043826328e-05, "loss": 1.2031, "mean_token_accuracy": 0.5505106158554554, "num_tokens": 1274933.0, "step": 7150 }, { "epoch": 13.481882352941177, "grad_norm": 0.8022538423538208, "learning_rate": 1.0735449556892622e-05, "loss": 1.1603, "mean_token_accuracy": 0.5579750452190637, "num_tokens": 1288707.0, "step": 7160 }, { "epoch": 13.500705882352941, "grad_norm": 0.6393579840660095, "learning_rate": 1.0714532835297344e-05, "loss": 1.1945, "mean_token_accuracy": 0.556298240274191, "num_tokens": 1302937.0, "step": 7170 }, { "epoch": 13.519529411764706, "grad_norm": 0.8472998142242432, "learning_rate": 1.0693612971036616e-05, "loss": 1.2097, "mean_token_accuracy": 0.5355463117361069, "num_tokens": 1316118.0, "step": 7180 }, { "epoch": 13.53835294117647, "grad_norm": 0.5993279218673706, "learning_rate": 1.0672690056120398e-05, "loss": 1.1842, "mean_token_accuracy": 0.5584869582206011, "num_tokens": 1329672.0, "step": 7190 }, { "epoch": 13.557176470588235, "grad_norm": 1.1063508987426758, "learning_rate": 1.0651764182572063e-05, "loss": 1.1652, "mean_token_accuracy": 0.5537949342280626, "num_tokens": 1342869.0, "step": 7200 }, { "epoch": 13.576, "grad_norm": 0.9842997789382935, "learning_rate": 1.0630835442428001e-05, "loss": 1.2162, "mean_token_accuracy": 0.5468976002186536, "num_tokens": 1356452.0, "step": 7210 }, { "epoch": 13.594823529411764, "grad_norm": 0.7008135914802551, "learning_rate": 1.0609903927737196e-05, "loss": 1.174, "mean_token_accuracy": 0.5521068956702948, "num_tokens": 1370184.0, "step": 7220 }, { "epoch": 13.61364705882353, "grad_norm": 1.510910987854004, "learning_rate": 1.0588969730560852e-05, "loss": 1.2074, "mean_token_accuracy": 0.5425486758351326, "num_tokens": 1383485.0, "step": 7230 }, { "epoch": 13.632470588235295, "grad_norm": 0.8757747411727905, "learning_rate": 1.0568032942971962e-05, "loss": 1.209, "mean_token_accuracy": 0.5415149603039027, "num_tokens": 1398031.0, "step": 7240 }, { "epoch": 13.651294117647058, "grad_norm": 1.3053663969039917, "learning_rate": 1.0547093657054914e-05, "loss": 1.1542, "mean_token_accuracy": 0.5658162008970976, "num_tokens": 1410853.0, "step": 7250 }, { "epoch": 13.670117647058824, "grad_norm": 1.395501971244812, "learning_rate": 1.0526151964905085e-05, "loss": 1.1775, "mean_token_accuracy": 0.5551408220082521, "num_tokens": 1423509.0, "step": 7260 }, { "epoch": 13.688941176470589, "grad_norm": 1.4833544492721558, "learning_rate": 1.0505207958628438e-05, "loss": 1.0948, "mean_token_accuracy": 0.5793862946331501, "num_tokens": 1437175.0, "step": 7270 }, { "epoch": 13.707764705882353, "grad_norm": 0.8894456624984741, "learning_rate": 1.0484261730341101e-05, "loss": 1.1577, "mean_token_accuracy": 0.5599946200847625, "num_tokens": 1449621.0, "step": 7280 }, { "epoch": 13.726588235294118, "grad_norm": 0.9780417680740356, "learning_rate": 1.0463313372168993e-05, "loss": 1.2138, "mean_token_accuracy": 0.5368953734636307, "num_tokens": 1463044.0, "step": 7290 }, { "epoch": 13.745411764705882, "grad_norm": 1.0154612064361572, "learning_rate": 1.0442362976247384e-05, "loss": 1.2187, "mean_token_accuracy": 0.5392611972987652, "num_tokens": 1476500.0, "step": 7300 }, { "epoch": 13.764235294117647, "grad_norm": 0.8700633645057678, "learning_rate": 1.0421410634720523e-05, "loss": 1.1487, "mean_token_accuracy": 0.5718079563230276, "num_tokens": 1489072.0, "step": 7310 }, { "epoch": 13.783058823529412, "grad_norm": 1.255520224571228, "learning_rate": 1.0400456439741203e-05, "loss": 1.1511, "mean_token_accuracy": 0.5543713182210922, "num_tokens": 1502885.0, "step": 7320 }, { "epoch": 13.801882352941176, "grad_norm": 0.9495011568069458, "learning_rate": 1.0379500483470373e-05, "loss": 1.1583, "mean_token_accuracy": 0.5580469910055399, "num_tokens": 1515640.0, "step": 7330 }, { "epoch": 13.820705882352941, "grad_norm": 0.7675338387489319, "learning_rate": 1.035854285807673e-05, "loss": 1.1739, "mean_token_accuracy": 0.5525693718343974, "num_tokens": 1529604.0, "step": 7340 }, { "epoch": 13.839529411764707, "grad_norm": 1.0846037864685059, "learning_rate": 1.0337583655736312e-05, "loss": 1.2115, "mean_token_accuracy": 0.5463377732783556, "num_tokens": 1543205.0, "step": 7350 }, { "epoch": 13.85835294117647, "grad_norm": 0.711812436580658, "learning_rate": 1.0316622968632088e-05, "loss": 1.2121, "mean_token_accuracy": 0.5422242067754268, "num_tokens": 1556017.0, "step": 7360 }, { "epoch": 13.877176470588235, "grad_norm": 1.1142287254333496, "learning_rate": 1.029566088895357e-05, "loss": 1.1561, "mean_token_accuracy": 0.5748393829911947, "num_tokens": 1570116.0, "step": 7370 }, { "epoch": 13.896, "grad_norm": 0.8056155443191528, "learning_rate": 1.0274697508896372e-05, "loss": 1.1411, "mean_token_accuracy": 0.5808346830308437, "num_tokens": 1582496.0, "step": 7380 }, { "epoch": 13.914823529411764, "grad_norm": 1.1729984283447266, "learning_rate": 1.0253732920661856e-05, "loss": 1.1881, "mean_token_accuracy": 0.5547745041549206, "num_tokens": 1596198.0, "step": 7390 }, { "epoch": 13.93364705882353, "grad_norm": 1.339120626449585, "learning_rate": 1.0232767216456672e-05, "loss": 1.1534, "mean_token_accuracy": 0.5604519348591566, "num_tokens": 1609177.0, "step": 7400 }, { "epoch": 13.952470588235293, "grad_norm": 0.6790438294410706, "learning_rate": 1.0211800488492401e-05, "loss": 1.1662, "mean_token_accuracy": 0.5518874824047089, "num_tokens": 1622444.0, "step": 7410 }, { "epoch": 13.971294117647059, "grad_norm": 1.8246535062789917, "learning_rate": 1.01908328289851e-05, "loss": 1.1406, "mean_token_accuracy": 0.5649749383330345, "num_tokens": 1635759.0, "step": 7420 }, { "epoch": 13.990117647058824, "grad_norm": 1.3896183967590332, "learning_rate": 1.0169864330154951e-05, "loss": 1.1608, "mean_token_accuracy": 0.5683747977018356, "num_tokens": 1649592.0, "step": 7430 }, { "epoch": 14.007529411764706, "grad_norm": 0.9073938131332397, "learning_rate": 1.0148895084225807e-05, "loss": 1.1792, "mean_token_accuracy": 0.5546492849652832, "num_tokens": 1661557.0, "step": 7440 }, { "epoch": 14.026352941176471, "grad_norm": 0.770523190498352, "learning_rate": 1.012792518342482e-05, "loss": 1.1737, "mean_token_accuracy": 0.5552055723965168, "num_tokens": 1675410.0, "step": 7450 }, { "epoch": 14.045176470588235, "grad_norm": 1.5216394662857056, "learning_rate": 1.0106954719982014e-05, "loss": 1.1718, "mean_token_accuracy": 0.5549504559487104, "num_tokens": 1689244.0, "step": 7460 }, { "epoch": 14.064, "grad_norm": 0.9542890787124634, "learning_rate": 1.0085983786129894e-05, "loss": 1.1549, "mean_token_accuracy": 0.5636588338762522, "num_tokens": 1702377.0, "step": 7470 }, { "epoch": 14.082823529411765, "grad_norm": 0.9031827449798584, "learning_rate": 1.0065012474103027e-05, "loss": 1.195, "mean_token_accuracy": 0.5450264655053616, "num_tokens": 1714838.0, "step": 7480 }, { "epoch": 14.101647058823529, "grad_norm": 1.3046774864196777, "learning_rate": 1.0044040876137647e-05, "loss": 1.1586, "mean_token_accuracy": 0.5547593403607607, "num_tokens": 1728323.0, "step": 7490 }, { "epoch": 14.120470588235294, "grad_norm": 0.7626868486404419, "learning_rate": 1.0023069084471244e-05, "loss": 1.193, "mean_token_accuracy": 0.539740389585495, "num_tokens": 1741948.0, "step": 7500 }, { "epoch": 14.13929411764706, "grad_norm": 0.7623440027236938, "learning_rate": 1.0002097191342167e-05, "loss": 1.2001, "mean_token_accuracy": 0.5466340240091085, "num_tokens": 1754976.0, "step": 7510 }, { "epoch": 14.158117647058823, "grad_norm": 0.5901451706886292, "learning_rate": 9.981125288989197e-06, "loss": 1.1834, "mean_token_accuracy": 0.5578321043401957, "num_tokens": 1769202.0, "step": 7520 }, { "epoch": 14.176941176470589, "grad_norm": 1.7071183919906616, "learning_rate": 9.960153469651173e-06, "loss": 1.1479, "mean_token_accuracy": 0.5668721627444029, "num_tokens": 1782731.0, "step": 7530 }, { "epoch": 14.195764705882352, "grad_norm": 0.84836345911026, "learning_rate": 9.939181825566555e-06, "loss": 1.1664, "mean_token_accuracy": 0.5470958970487118, "num_tokens": 1795962.0, "step": 7540 }, { "epoch": 14.214588235294118, "grad_norm": 0.6625200510025024, "learning_rate": 9.918210448973041e-06, "loss": 1.167, "mean_token_accuracy": 0.5696456581354141, "num_tokens": 1809352.0, "step": 7550 }, { "epoch": 14.233411764705883, "grad_norm": 1.065662145614624, "learning_rate": 9.897239432107144e-06, "loss": 1.2344, "mean_token_accuracy": 0.5390154391527175, "num_tokens": 1823306.0, "step": 7560 }, { "epoch": 14.252235294117646, "grad_norm": 1.492067813873291, "learning_rate": 9.876268867203803e-06, "loss": 1.1505, "mean_token_accuracy": 0.5786493707448244, "num_tokens": 1836619.0, "step": 7570 }, { "epoch": 14.271058823529412, "grad_norm": 0.8435829281806946, "learning_rate": 9.855298846495964e-06, "loss": 1.1985, "mean_token_accuracy": 0.5462658539414406, "num_tokens": 1850730.0, "step": 7580 }, { "epoch": 14.289882352941177, "grad_norm": 0.8233511447906494, "learning_rate": 9.834329462214186e-06, "loss": 1.205, "mean_token_accuracy": 0.5534395117312669, "num_tokens": 1864080.0, "step": 7590 }, { "epoch": 14.30870588235294, "grad_norm": 0.9905659556388855, "learning_rate": 9.813360806586218e-06, "loss": 1.223, "mean_token_accuracy": 0.5520945586264133, "num_tokens": 1877561.0, "step": 7600 }, { "epoch": 14.331294117647058, "grad_norm": 0.8363655805587769, "learning_rate": 9.792392971836614e-06, "loss": 1.2066, "mean_token_accuracy": 0.5393288742750884, "num_tokens": 14227.0, "step": 7610 }, { "epoch": 14.350117647058823, "grad_norm": 1.2111597061157227, "learning_rate": 9.77142605018631e-06, "loss": 1.1475, "mean_token_accuracy": 0.5616458028554916, "num_tokens": 26809.0, "step": 7620 }, { "epoch": 14.368941176470589, "grad_norm": 1.3134557008743286, "learning_rate": 9.750460133852234e-06, "loss": 1.1937, "mean_token_accuracy": 0.5489944957196713, "num_tokens": 40285.0, "step": 7630 }, { "epoch": 14.387764705882352, "grad_norm": 1.0377771854400635, "learning_rate": 9.729495315046886e-06, "loss": 1.1611, "mean_token_accuracy": 0.5607926532626152, "num_tokens": 53708.0, "step": 7640 }, { "epoch": 14.406588235294118, "grad_norm": 0.5833343267440796, "learning_rate": 9.708531685977945e-06, "loss": 1.1491, "mean_token_accuracy": 0.5668897565454245, "num_tokens": 66990.0, "step": 7650 }, { "epoch": 14.425411764705883, "grad_norm": 0.6283827424049377, "learning_rate": 9.687569338847848e-06, "loss": 1.1669, "mean_token_accuracy": 0.5548735350370407, "num_tokens": 80285.0, "step": 7660 }, { "epoch": 14.444235294117647, "grad_norm": 1.9695147275924683, "learning_rate": 9.666608365853405e-06, "loss": 1.0941, "mean_token_accuracy": 0.586016109958291, "num_tokens": 94238.0, "step": 7670 }, { "epoch": 14.463058823529412, "grad_norm": 0.762913703918457, "learning_rate": 9.645648859185372e-06, "loss": 1.241, "mean_token_accuracy": 0.5349636357277632, "num_tokens": 109038.0, "step": 7680 }, { "epoch": 14.481882352941177, "grad_norm": 1.1660629510879517, "learning_rate": 9.624690911028062e-06, "loss": 1.2031, "mean_token_accuracy": 0.5475806038826704, "num_tokens": 122828.0, "step": 7690 }, { "epoch": 14.500705882352941, "grad_norm": 0.7873107194900513, "learning_rate": 9.603734613558933e-06, "loss": 1.1112, "mean_token_accuracy": 0.5692310906946659, "num_tokens": 136009.0, "step": 7700 }, { "epoch": 14.519529411764706, "grad_norm": 0.7429578900337219, "learning_rate": 9.582780058948182e-06, "loss": 1.2031, "mean_token_accuracy": 0.5352565940469504, "num_tokens": 148439.0, "step": 7710 }, { "epoch": 14.53835294117647, "grad_norm": 0.9231197834014893, "learning_rate": 9.56182733935834e-06, "loss": 1.1313, "mean_token_accuracy": 0.5766253888607025, "num_tokens": 162003.0, "step": 7720 }, { "epoch": 14.557176470588235, "grad_norm": 0.9091134071350098, "learning_rate": 9.540876546943863e-06, "loss": 1.1224, "mean_token_accuracy": 0.565272556990385, "num_tokens": 174711.0, "step": 7730 }, { "epoch": 14.576, "grad_norm": 0.7547168731689453, "learning_rate": 9.51992777385074e-06, "loss": 1.1618, "mean_token_accuracy": 0.5629419464617967, "num_tokens": 188030.0, "step": 7740 }, { "epoch": 14.594823529411764, "grad_norm": 0.7168717980384827, "learning_rate": 9.498981112216073e-06, "loss": 1.1665, "mean_token_accuracy": 0.562981392070651, "num_tokens": 201691.0, "step": 7750 }, { "epoch": 14.61364705882353, "grad_norm": 1.3017321825027466, "learning_rate": 9.478036654167673e-06, "loss": 1.1908, "mean_token_accuracy": 0.5513414010405541, "num_tokens": 215908.0, "step": 7760 }, { "epoch": 14.632470588235295, "grad_norm": 0.7235598564147949, "learning_rate": 9.457094491823674e-06, "loss": 1.1668, "mean_token_accuracy": 0.5548354998230934, "num_tokens": 229890.0, "step": 7770 }, { "epoch": 14.651294117647058, "grad_norm": 1.014742136001587, "learning_rate": 9.436154717292095e-06, "loss": 1.1381, "mean_token_accuracy": 0.5685541749000549, "num_tokens": 242083.0, "step": 7780 }, { "epoch": 14.670117647058824, "grad_norm": 1.7762614488601685, "learning_rate": 9.415217422670465e-06, "loss": 1.2049, "mean_token_accuracy": 0.5436010017991066, "num_tokens": 255157.0, "step": 7790 }, { "epoch": 14.688941176470589, "grad_norm": 1.2956479787826538, "learning_rate": 9.3942827000454e-06, "loss": 1.1581, "mean_token_accuracy": 0.5617427326738834, "num_tokens": 268163.0, "step": 7800 }, { "epoch": 14.707764705882353, "grad_norm": 1.3141613006591797, "learning_rate": 9.37335064149221e-06, "loss": 1.2012, "mean_token_accuracy": 0.5483662486076355, "num_tokens": 281261.0, "step": 7810 }, { "epoch": 14.726588235294118, "grad_norm": 0.9441389441490173, "learning_rate": 9.352421339074481e-06, "loss": 1.1539, "mean_token_accuracy": 0.5657226879149675, "num_tokens": 295835.0, "step": 7820 }, { "epoch": 14.745411764705882, "grad_norm": 0.6852191686630249, "learning_rate": 9.331494884843682e-06, "loss": 1.1109, "mean_token_accuracy": 0.5754560541361571, "num_tokens": 308216.0, "step": 7830 }, { "epoch": 14.764235294117647, "grad_norm": 1.1635503768920898, "learning_rate": 9.310571370838747e-06, "loss": 1.1794, "mean_token_accuracy": 0.5661456611007452, "num_tokens": 321205.0, "step": 7840 }, { "epoch": 14.783058823529412, "grad_norm": 0.6046936511993408, "learning_rate": 9.28965088908569e-06, "loss": 1.1862, "mean_token_accuracy": 0.5497476685792207, "num_tokens": 336086.0, "step": 7850 }, { "epoch": 14.801882352941176, "grad_norm": 1.6876798868179321, "learning_rate": 9.268733531597185e-06, "loss": 1.149, "mean_token_accuracy": 0.5605622876435519, "num_tokens": 348960.0, "step": 7860 }, { "epoch": 14.820705882352941, "grad_norm": 0.6555068492889404, "learning_rate": 9.24781939037215e-06, "loss": 1.1488, "mean_token_accuracy": 0.562832348421216, "num_tokens": 361287.0, "step": 7870 }, { "epoch": 14.839529411764707, "grad_norm": 1.4029412269592285, "learning_rate": 9.226908557395384e-06, "loss": 1.1503, "mean_token_accuracy": 0.5643013104796409, "num_tokens": 373901.0, "step": 7880 }, { "epoch": 14.85835294117647, "grad_norm": 0.6774556636810303, "learning_rate": 9.206001124637113e-06, "loss": 1.118, "mean_token_accuracy": 0.5655334409326315, "num_tokens": 388185.0, "step": 7890 }, { "epoch": 14.877176470588235, "grad_norm": 1.3459422588348389, "learning_rate": 9.185097184052615e-06, "loss": 1.2017, "mean_token_accuracy": 0.547975680232048, "num_tokens": 402096.0, "step": 7900 }, { "epoch": 14.896, "grad_norm": 1.6914349794387817, "learning_rate": 9.164196827581817e-06, "loss": 1.1513, "mean_token_accuracy": 0.5657749876379967, "num_tokens": 415109.0, "step": 7910 }, { "epoch": 14.914823529411764, "grad_norm": 0.5575766563415527, "learning_rate": 9.143300147148869e-06, "loss": 1.1707, "mean_token_accuracy": 0.5562089808285237, "num_tokens": 428081.0, "step": 7920 }, { "epoch": 14.93364705882353, "grad_norm": 1.4414509534835815, "learning_rate": 9.122407234661764e-06, "loss": 1.225, "mean_token_accuracy": 0.5390350338071584, "num_tokens": 441704.0, "step": 7930 }, { "epoch": 14.952470588235293, "grad_norm": 1.122767448425293, "learning_rate": 9.101518182011914e-06, "loss": 1.1717, "mean_token_accuracy": 0.5577954012900591, "num_tokens": 454263.0, "step": 7940 }, { "epoch": 14.971294117647059, "grad_norm": 0.7421872615814209, "learning_rate": 9.080633081073763e-06, "loss": 1.1535, "mean_token_accuracy": 0.5636604465544224, "num_tokens": 467572.0, "step": 7950 }, { "epoch": 14.990117647058824, "grad_norm": 0.7315053939819336, "learning_rate": 9.059752023704367e-06, "loss": 1.1306, "mean_token_accuracy": 0.5692378722131253, "num_tokens": 481113.0, "step": 7960 }, { "epoch": 15.009411764705883, "grad_norm": 0.8332544565200806, "learning_rate": 9.038875101743003e-06, "loss": 1.2939, "mean_token_accuracy": 0.5470707878106977, "num_tokens": 494860.0, "step": 7970 }, { "epoch": 15.028235294117646, "grad_norm": 0.7542789578437805, "learning_rate": 9.018002407010755e-06, "loss": 1.2465, "mean_token_accuracy": 0.5324202172458172, "num_tokens": 509027.0, "step": 7980 }, { "epoch": 15.047058823529412, "grad_norm": 0.6684849858283997, "learning_rate": 8.997134031310123e-06, "loss": 1.1783, "mean_token_accuracy": 0.5569613084197045, "num_tokens": 522094.0, "step": 7990 }, { "epoch": 15.065882352941177, "grad_norm": 0.860490620136261, "learning_rate": 8.976270066424602e-06, "loss": 1.1533, "mean_token_accuracy": 0.5639694180339575, "num_tokens": 535686.0, "step": 8000 }, { "epoch": 15.08470588235294, "grad_norm": 1.2201600074768066, "learning_rate": 8.955410604118287e-06, "loss": 1.1514, "mean_token_accuracy": 0.5624887187033891, "num_tokens": 547438.0, "step": 8010 }, { "epoch": 15.103529411764706, "grad_norm": 0.7185003757476807, "learning_rate": 8.934555736135475e-06, "loss": 1.1916, "mean_token_accuracy": 0.5531281687319278, "num_tokens": 562299.0, "step": 8020 }, { "epoch": 15.122352941176471, "grad_norm": 0.9599730372428894, "learning_rate": 8.913705554200257e-06, "loss": 1.2061, "mean_token_accuracy": 0.5452544983476401, "num_tokens": 575630.0, "step": 8030 }, { "epoch": 15.141176470588235, "grad_norm": 1.0057822465896606, "learning_rate": 8.892860150016108e-06, "loss": 1.1375, "mean_token_accuracy": 0.5605553191155195, "num_tokens": 588679.0, "step": 8040 }, { "epoch": 15.16, "grad_norm": 0.7954553961753845, "learning_rate": 8.872019615265494e-06, "loss": 1.164, "mean_token_accuracy": 0.5492696654051542, "num_tokens": 601254.0, "step": 8050 }, { "epoch": 15.178823529411765, "grad_norm": 0.6151052713394165, "learning_rate": 8.851184041609464e-06, "loss": 1.2193, "mean_token_accuracy": 0.5489672936499119, "num_tokens": 614369.0, "step": 8060 }, { "epoch": 15.197647058823529, "grad_norm": 0.8546686768531799, "learning_rate": 8.830353520687245e-06, "loss": 1.1938, "mean_token_accuracy": 0.5609996184706688, "num_tokens": 628665.0, "step": 8070 }, { "epoch": 15.216470588235294, "grad_norm": 0.6473791003227234, "learning_rate": 8.809528144115842e-06, "loss": 1.1396, "mean_token_accuracy": 0.5705398332327605, "num_tokens": 643134.0, "step": 8080 }, { "epoch": 15.235294117647058, "grad_norm": 1.5007941722869873, "learning_rate": 8.788708003489636e-06, "loss": 1.1456, "mean_token_accuracy": 0.5757338788360358, "num_tokens": 655759.0, "step": 8090 }, { "epoch": 15.254117647058823, "grad_norm": 0.6832001209259033, "learning_rate": 8.767893190379974e-06, "loss": 1.1718, "mean_token_accuracy": 0.5538515329360962, "num_tokens": 669100.0, "step": 8100 }, { "epoch": 15.272941176470589, "grad_norm": 0.7107806205749512, "learning_rate": 8.747083796334776e-06, "loss": 1.1645, "mean_token_accuracy": 0.5559101283550263, "num_tokens": 683005.0, "step": 8110 }, { "epoch": 15.291764705882352, "grad_norm": 0.6784150004386902, "learning_rate": 8.726279912878126e-06, "loss": 1.1888, "mean_token_accuracy": 0.5452313166111707, "num_tokens": 696921.0, "step": 8120 }, { "epoch": 15.310588235294118, "grad_norm": 1.1530226469039917, "learning_rate": 8.705481631509876e-06, "loss": 1.1809, "mean_token_accuracy": 0.561325515806675, "num_tokens": 710780.0, "step": 8130 }, { "epoch": 15.329411764705883, "grad_norm": 0.6158255934715271, "learning_rate": 8.684689043705231e-06, "loss": 1.1597, "mean_token_accuracy": 0.5495276678353548, "num_tokens": 724228.0, "step": 8140 }, { "epoch": 15.348235294117647, "grad_norm": 0.6823238730430603, "learning_rate": 8.663902240914357e-06, "loss": 1.1347, "mean_token_accuracy": 0.568655128031969, "num_tokens": 737696.0, "step": 8150 }, { "epoch": 15.367058823529412, "grad_norm": 0.6831355690956116, "learning_rate": 8.643121314561976e-06, "loss": 1.1547, "mean_token_accuracy": 0.5554495759308338, "num_tokens": 751387.0, "step": 8160 }, { "epoch": 15.385882352941177, "grad_norm": 0.7401413321495056, "learning_rate": 8.622346356046972e-06, "loss": 1.1248, "mean_token_accuracy": 0.5787703268229961, "num_tokens": 764116.0, "step": 8170 }, { "epoch": 15.40470588235294, "grad_norm": 1.320250391960144, "learning_rate": 8.601577456741967e-06, "loss": 1.1582, "mean_token_accuracy": 0.5558317702263593, "num_tokens": 777306.0, "step": 8180 }, { "epoch": 15.423529411764706, "grad_norm": 1.5581915378570557, "learning_rate": 8.580814707992949e-06, "loss": 1.1475, "mean_token_accuracy": 0.5548811592161655, "num_tokens": 790684.0, "step": 8190 }, { "epoch": 15.44235294117647, "grad_norm": 0.6925750970840454, "learning_rate": 8.560058201118842e-06, "loss": 1.19, "mean_token_accuracy": 0.5531508523970843, "num_tokens": 804244.0, "step": 8200 }, { "epoch": 15.461176470588235, "grad_norm": 1.275739073753357, "learning_rate": 8.539308027411123e-06, "loss": 1.1916, "mean_token_accuracy": 0.5411147933453322, "num_tokens": 818070.0, "step": 8210 }, { "epoch": 15.48, "grad_norm": 1.308816909790039, "learning_rate": 8.51856427813341e-06, "loss": 1.1389, "mean_token_accuracy": 0.5609162572771311, "num_tokens": 831998.0, "step": 8220 }, { "epoch": 15.498823529411764, "grad_norm": 1.4086875915527344, "learning_rate": 8.497827044521074e-06, "loss": 1.1806, "mean_token_accuracy": 0.5614006619900465, "num_tokens": 846346.0, "step": 8230 }, { "epoch": 15.51764705882353, "grad_norm": 1.5172885656356812, "learning_rate": 8.477096417780818e-06, "loss": 1.1423, "mean_token_accuracy": 0.5598192039877177, "num_tokens": 859606.0, "step": 8240 }, { "epoch": 15.536470588235295, "grad_norm": 0.635094404220581, "learning_rate": 8.456372489090294e-06, "loss": 1.195, "mean_token_accuracy": 0.5515242625027895, "num_tokens": 872352.0, "step": 8250 }, { "epoch": 15.555294117647058, "grad_norm": 1.049091100692749, "learning_rate": 8.43565534959769e-06, "loss": 1.1708, "mean_token_accuracy": 0.5665956649929285, "num_tokens": 886434.0, "step": 8260 }, { "epoch": 15.574117647058824, "grad_norm": 0.6422486901283264, "learning_rate": 8.414945090421337e-06, "loss": 1.1199, "mean_token_accuracy": 0.5681115534156561, "num_tokens": 899435.0, "step": 8270 }, { "epoch": 15.592941176470589, "grad_norm": 1.1809626817703247, "learning_rate": 8.394241802649307e-06, "loss": 1.1553, "mean_token_accuracy": 0.5631851524114608, "num_tokens": 912350.0, "step": 8280 }, { "epoch": 15.611764705882353, "grad_norm": 0.8651266694068909, "learning_rate": 8.373545577339002e-06, "loss": 1.1419, "mean_token_accuracy": 0.5651818908751011, "num_tokens": 925480.0, "step": 8290 }, { "epoch": 15.630588235294118, "grad_norm": 0.7373852729797363, "learning_rate": 8.352856505516765e-06, "loss": 1.1959, "mean_token_accuracy": 0.5542501173913479, "num_tokens": 938863.0, "step": 8300 }, { "epoch": 15.649411764705881, "grad_norm": 1.3229117393493652, "learning_rate": 8.33217467817748e-06, "loss": 1.1445, "mean_token_accuracy": 0.5683686885982752, "num_tokens": 952271.0, "step": 8310 }, { "epoch": 15.668235294117647, "grad_norm": 0.9950488805770874, "learning_rate": 8.311500186284166e-06, "loss": 1.1469, "mean_token_accuracy": 0.5671338357031346, "num_tokens": 966154.0, "step": 8320 }, { "epoch": 15.687058823529412, "grad_norm": 1.0574768781661987, "learning_rate": 8.290833120767585e-06, "loss": 1.1745, "mean_token_accuracy": 0.5554843176156282, "num_tokens": 978470.0, "step": 8330 }, { "epoch": 15.705882352941176, "grad_norm": 0.9360321760177612, "learning_rate": 8.270173572525824e-06, "loss": 1.1932, "mean_token_accuracy": 0.5552597276866436, "num_tokens": 992540.0, "step": 8340 }, { "epoch": 15.724705882352941, "grad_norm": 1.4928091764450073, "learning_rate": 8.249521632423918e-06, "loss": 1.1648, "mean_token_accuracy": 0.5653361968696118, "num_tokens": 1006353.0, "step": 8350 }, { "epoch": 15.743529411764706, "grad_norm": 0.7848607897758484, "learning_rate": 8.228877391293432e-06, "loss": 1.1971, "mean_token_accuracy": 0.5555432129651308, "num_tokens": 1020378.0, "step": 8360 }, { "epoch": 15.76235294117647, "grad_norm": 0.813613772392273, "learning_rate": 8.20824093993208e-06, "loss": 1.1464, "mean_token_accuracy": 0.5570184625685215, "num_tokens": 1032073.0, "step": 8370 }, { "epoch": 15.781176470588235, "grad_norm": 0.6320846080780029, "learning_rate": 8.1876123691033e-06, "loss": 1.1486, "mean_token_accuracy": 0.5596757929772138, "num_tokens": 1045292.0, "step": 8380 }, { "epoch": 15.8, "grad_norm": 1.1251544952392578, "learning_rate": 8.166991769535886e-06, "loss": 1.1581, "mean_token_accuracy": 0.5615175377577544, "num_tokens": 1058620.0, "step": 8390 }, { "epoch": 15.818823529411764, "grad_norm": 1.5427863597869873, "learning_rate": 8.146379231923558e-06, "loss": 1.2204, "mean_token_accuracy": 0.5381950225681067, "num_tokens": 1072099.0, "step": 8400 }, { "epoch": 15.83764705882353, "grad_norm": 0.9844633936882019, "learning_rate": 8.12577484692459e-06, "loss": 1.1673, "mean_token_accuracy": 0.554484510794282, "num_tokens": 1085409.0, "step": 8410 }, { "epoch": 15.856470588235293, "grad_norm": 1.1419299840927124, "learning_rate": 8.105178705161395e-06, "loss": 1.1713, "mean_token_accuracy": 0.5534321576356888, "num_tokens": 1098803.0, "step": 8420 }, { "epoch": 15.875294117647059, "grad_norm": 0.8007948994636536, "learning_rate": 8.084590897220122e-06, "loss": 1.1394, "mean_token_accuracy": 0.562013290822506, "num_tokens": 1111518.0, "step": 8430 }, { "epoch": 15.894117647058824, "grad_norm": 0.7455958724021912, "learning_rate": 8.064011513650276e-06, "loss": 1.1577, "mean_token_accuracy": 0.5672158092260361, "num_tokens": 1126619.0, "step": 8440 }, { "epoch": 15.912941176470587, "grad_norm": 1.5687180757522583, "learning_rate": 8.04344064496431e-06, "loss": 1.1668, "mean_token_accuracy": 0.5503279969096184, "num_tokens": 1139657.0, "step": 8450 }, { "epoch": 15.931764705882353, "grad_norm": 0.9860045909881592, "learning_rate": 8.022878381637219e-06, "loss": 1.1937, "mean_token_accuracy": 0.54759371727705, "num_tokens": 1153370.0, "step": 8460 }, { "epoch": 15.950588235294118, "grad_norm": 1.7666656970977783, "learning_rate": 8.002324814106161e-06, "loss": 1.2289, "mean_token_accuracy": 0.5447251949459314, "num_tokens": 1166420.0, "step": 8470 }, { "epoch": 15.969411764705882, "grad_norm": 1.6037918329238892, "learning_rate": 7.981780032770035e-06, "loss": 1.1054, "mean_token_accuracy": 0.5815329641103745, "num_tokens": 1179026.0, "step": 8480 }, { "epoch": 15.988235294117647, "grad_norm": 1.0090439319610596, "learning_rate": 7.961244127989112e-06, "loss": 1.181, "mean_token_accuracy": 0.5504204016178846, "num_tokens": 1192954.0, "step": 8490 }, { "epoch": 16.00564705882353, "grad_norm": 1.271785855293274, "learning_rate": 7.940717190084603e-06, "loss": 1.2075, "mean_token_accuracy": 0.5598280276801135, "num_tokens": 1206256.0, "step": 8500 }, { "epoch": 16.024470588235292, "grad_norm": 1.0442641973495483, "learning_rate": 7.9201993093383e-06, "loss": 1.1703, "mean_token_accuracy": 0.5635019179433585, "num_tokens": 1219916.0, "step": 8510 }, { "epoch": 16.043294117647058, "grad_norm": 0.7755882143974304, "learning_rate": 7.899690575992144e-06, "loss": 1.2, "mean_token_accuracy": 0.5463937662541867, "num_tokens": 1233382.0, "step": 8520 }, { "epoch": 16.062117647058823, "grad_norm": 0.8577190041542053, "learning_rate": 7.879191080247857e-06, "loss": 1.1861, "mean_token_accuracy": 0.5470962207764387, "num_tokens": 1248208.0, "step": 8530 }, { "epoch": 16.08094117647059, "grad_norm": 0.7220326662063599, "learning_rate": 7.85870091226652e-06, "loss": 1.2134, "mean_token_accuracy": 0.5384833466261625, "num_tokens": 1262112.0, "step": 8540 }, { "epoch": 16.099764705882354, "grad_norm": 0.8119881749153137, "learning_rate": 7.838220162168199e-06, "loss": 1.1882, "mean_token_accuracy": 0.5573807552456855, "num_tokens": 1274841.0, "step": 8550 }, { "epoch": 16.11858823529412, "grad_norm": 0.7537893056869507, "learning_rate": 7.817748920031533e-06, "loss": 1.1632, "mean_token_accuracy": 0.5611206289380789, "num_tokens": 1289035.0, "step": 8560 }, { "epoch": 16.13741176470588, "grad_norm": 1.9209939241409302, "learning_rate": 7.797287275893339e-06, "loss": 1.1986, "mean_token_accuracy": 0.5489944905042649, "num_tokens": 1302422.0, "step": 8570 }, { "epoch": 16.156235294117646, "grad_norm": 0.7403332591056824, "learning_rate": 7.776835319748226e-06, "loss": 1.1926, "mean_token_accuracy": 0.544366030395031, "num_tokens": 1316218.0, "step": 8580 }, { "epoch": 16.17505882352941, "grad_norm": 0.7108265161514282, "learning_rate": 7.756393141548196e-06, "loss": 1.188, "mean_token_accuracy": 0.5483727026730776, "num_tokens": 1329307.0, "step": 8590 }, { "epoch": 16.193882352941177, "grad_norm": 0.9194144606590271, "learning_rate": 7.735960831202233e-06, "loss": 1.1304, "mean_token_accuracy": 0.5695245500653983, "num_tokens": 1341639.0, "step": 8600 }, { "epoch": 16.212705882352942, "grad_norm": 0.9176170825958252, "learning_rate": 7.715538478575938e-06, "loss": 1.1746, "mean_token_accuracy": 0.5415426712483168, "num_tokens": 1355385.0, "step": 8610 }, { "epoch": 16.231529411764704, "grad_norm": 0.9545559883117676, "learning_rate": 7.695126173491096e-06, "loss": 1.1516, "mean_token_accuracy": 0.5630953580141067, "num_tokens": 1369295.0, "step": 8620 }, { "epoch": 16.25035294117647, "grad_norm": 0.5547646284103394, "learning_rate": 7.67472400572532e-06, "loss": 1.1848, "mean_token_accuracy": 0.5485550325363875, "num_tokens": 1382440.0, "step": 8630 }, { "epoch": 16.269176470588235, "grad_norm": 1.0427489280700684, "learning_rate": 7.65433206501162e-06, "loss": 1.1646, "mean_token_accuracy": 0.5695793781429529, "num_tokens": 1396411.0, "step": 8640 }, { "epoch": 16.288, "grad_norm": 1.417345643043518, "learning_rate": 7.633950441038041e-06, "loss": 1.1358, "mean_token_accuracy": 0.5730958338826895, "num_tokens": 1409619.0, "step": 8650 }, { "epoch": 16.306823529411766, "grad_norm": 0.6368844509124756, "learning_rate": 7.613579223447238e-06, "loss": 1.1379, "mean_token_accuracy": 0.5624699790030718, "num_tokens": 1422290.0, "step": 8660 }, { "epoch": 16.32564705882353, "grad_norm": 0.7046752572059631, "learning_rate": 7.593218501836108e-06, "loss": 1.1817, "mean_token_accuracy": 0.5441745646297932, "num_tokens": 1435116.0, "step": 8670 }, { "epoch": 16.344470588235293, "grad_norm": 0.9974550604820251, "learning_rate": 7.572868365755377e-06, "loss": 1.1776, "mean_token_accuracy": 0.5540152471512556, "num_tokens": 1449169.0, "step": 8680 }, { "epoch": 16.363294117647058, "grad_norm": 0.9207789897918701, "learning_rate": 7.552528904709224e-06, "loss": 1.1139, "mean_token_accuracy": 0.5721325032413006, "num_tokens": 1461392.0, "step": 8690 }, { "epoch": 16.382117647058823, "grad_norm": 0.7715643644332886, "learning_rate": 7.532200208154856e-06, "loss": 1.1484, "mean_token_accuracy": 0.5646240394562483, "num_tokens": 1475085.0, "step": 8700 }, { "epoch": 16.40094117647059, "grad_norm": 0.6167107224464417, "learning_rate": 7.511882365502161e-06, "loss": 1.113, "mean_token_accuracy": 0.568938347697258, "num_tokens": 1488403.0, "step": 8710 }, { "epoch": 16.419764705882354, "grad_norm": 1.0627272129058838, "learning_rate": 7.491575466113269e-06, "loss": 1.1889, "mean_token_accuracy": 0.5542673517018557, "num_tokens": 1501007.0, "step": 8720 }, { "epoch": 16.438588235294116, "grad_norm": 0.8951665759086609, "learning_rate": 7.4712795993021936e-06, "loss": 1.1568, "mean_token_accuracy": 0.5628596622496843, "num_tokens": 1513501.0, "step": 8730 }, { "epoch": 16.45741176470588, "grad_norm": 1.0367953777313232, "learning_rate": 7.450994854334414e-06, "loss": 1.1873, "mean_token_accuracy": 0.5458543870598078, "num_tokens": 1526907.0, "step": 8740 }, { "epoch": 16.476235294117647, "grad_norm": 0.9993324875831604, "learning_rate": 7.430721320426502e-06, "loss": 1.1875, "mean_token_accuracy": 0.5544692728668451, "num_tokens": 1540310.0, "step": 8750 }, { "epoch": 16.495058823529412, "grad_norm": 0.7184901237487793, "learning_rate": 7.410459086745715e-06, "loss": 1.1688, "mean_token_accuracy": 0.5600904107093811, "num_tokens": 1554564.0, "step": 8760 }, { "epoch": 16.513882352941177, "grad_norm": 1.4242663383483887, "learning_rate": 7.390208242409611e-06, "loss": 1.1422, "mean_token_accuracy": 0.5547851927578449, "num_tokens": 1568019.0, "step": 8770 }, { "epoch": 16.532705882352943, "grad_norm": 0.606593906879425, "learning_rate": 7.3699688764856556e-06, "loss": 1.1774, "mean_token_accuracy": 0.5609697885811329, "num_tokens": 1581881.0, "step": 8780 }, { "epoch": 16.551529411764704, "grad_norm": 0.9548640847206116, "learning_rate": 7.349741077990833e-06, "loss": 1.1215, "mean_token_accuracy": 0.5721657130867243, "num_tokens": 1594281.0, "step": 8790 }, { "epoch": 16.57035294117647, "grad_norm": 1.281101942062378, "learning_rate": 7.3295249358912415e-06, "loss": 1.1452, "mean_token_accuracy": 0.5627951502799988, "num_tokens": 1607907.0, "step": 8800 }, { "epoch": 16.591058823529412, "grad_norm": 1.2261985540390015, "learning_rate": 7.3093205391017275e-06, "loss": 1.1948, "mean_token_accuracy": 0.5499283254146576, "num_tokens": 13007.0, "step": 8810 }, { "epoch": 16.609882352941177, "grad_norm": 0.959309458732605, "learning_rate": 7.289127976485462e-06, "loss": 1.1569, "mean_token_accuracy": 0.5632215116173029, "num_tokens": 27121.0, "step": 8820 }, { "epoch": 16.628705882352943, "grad_norm": 0.8404517769813538, "learning_rate": 7.268947336853588e-06, "loss": 1.2085, "mean_token_accuracy": 0.5531386416405439, "num_tokens": 40179.0, "step": 8830 }, { "epoch": 16.647529411764705, "grad_norm": 1.2052935361862183, "learning_rate": 7.248778708964781e-06, "loss": 1.1325, "mean_token_accuracy": 0.5616716485470533, "num_tokens": 52387.0, "step": 8840 }, { "epoch": 16.66635294117647, "grad_norm": 1.1935890913009644, "learning_rate": 7.228622181524909e-06, "loss": 1.1662, "mean_token_accuracy": 0.5652685184031725, "num_tokens": 66527.0, "step": 8850 }, { "epoch": 16.685176470588235, "grad_norm": 1.4688079357147217, "learning_rate": 7.20847784318661e-06, "loss": 1.1733, "mean_token_accuracy": 0.5583682101219892, "num_tokens": 79512.0, "step": 8860 }, { "epoch": 16.704, "grad_norm": 0.9977661967277527, "learning_rate": 7.188345782548918e-06, "loss": 1.1196, "mean_token_accuracy": 0.5758439347147941, "num_tokens": 92443.0, "step": 8870 }, { "epoch": 16.722823529411766, "grad_norm": 1.6382378339767456, "learning_rate": 7.168226088156858e-06, "loss": 1.1558, "mean_token_accuracy": 0.563961322978139, "num_tokens": 107011.0, "step": 8880 }, { "epoch": 16.741647058823528, "grad_norm": 0.7158175110816956, "learning_rate": 7.148118848501073e-06, "loss": 1.2003, "mean_token_accuracy": 0.5421418201178312, "num_tokens": 120340.0, "step": 8890 }, { "epoch": 16.760470588235293, "grad_norm": 0.7682539224624634, "learning_rate": 7.128024152017426e-06, "loss": 1.1337, "mean_token_accuracy": 0.5682530965656042, "num_tokens": 133870.0, "step": 8900 }, { "epoch": 16.77929411764706, "grad_norm": 1.2490408420562744, "learning_rate": 7.10794208708661e-06, "loss": 1.1464, "mean_token_accuracy": 0.5654782570898533, "num_tokens": 147737.0, "step": 8910 }, { "epoch": 16.798117647058824, "grad_norm": 1.0072635412216187, "learning_rate": 7.087872742033761e-06, "loss": 1.1675, "mean_token_accuracy": 0.5675601534545421, "num_tokens": 160861.0, "step": 8920 }, { "epoch": 16.81694117647059, "grad_norm": 0.9989560842514038, "learning_rate": 7.0678162051280796e-06, "loss": 1.1504, "mean_token_accuracy": 0.5777845904231071, "num_tokens": 173818.0, "step": 8930 }, { "epoch": 16.835764705882355, "grad_norm": 0.7746507525444031, "learning_rate": 7.04777256458242e-06, "loss": 1.2331, "mean_token_accuracy": 0.5357637394219636, "num_tokens": 187606.0, "step": 8940 }, { "epoch": 16.854588235294116, "grad_norm": 0.5496880412101746, "learning_rate": 7.0277419085529275e-06, "loss": 1.1534, "mean_token_accuracy": 0.5625104811042547, "num_tokens": 200788.0, "step": 8950 }, { "epoch": 16.87341176470588, "grad_norm": 0.7524011731147766, "learning_rate": 7.007724325138626e-06, "loss": 1.1731, "mean_token_accuracy": 0.5571359943598508, "num_tokens": 214193.0, "step": 8960 }, { "epoch": 16.892235294117647, "grad_norm": 1.898985743522644, "learning_rate": 6.987719902381063e-06, "loss": 1.1823, "mean_token_accuracy": 0.546281049400568, "num_tokens": 227004.0, "step": 8970 }, { "epoch": 16.911058823529412, "grad_norm": 1.2188752889633179, "learning_rate": 6.967728728263875e-06, "loss": 1.2082, "mean_token_accuracy": 0.5488316975533962, "num_tokens": 240725.0, "step": 8980 }, { "epoch": 16.929882352941178, "grad_norm": 1.4341834783554077, "learning_rate": 6.947750890712452e-06, "loss": 1.1383, "mean_token_accuracy": 0.566087681055069, "num_tokens": 255280.0, "step": 8990 }, { "epoch": 16.94870588235294, "grad_norm": 1.4695709943771362, "learning_rate": 6.927786477593517e-06, "loss": 1.1297, "mean_token_accuracy": 0.571322912350297, "num_tokens": 268707.0, "step": 9000 }, { "epoch": 16.967529411764705, "grad_norm": 1.0631098747253418, "learning_rate": 6.907835576714752e-06, "loss": 1.1401, "mean_token_accuracy": 0.5591850385069848, "num_tokens": 282374.0, "step": 9010 }, { "epoch": 16.98635294117647, "grad_norm": 0.7683926820755005, "learning_rate": 6.887898275824405e-06, "loss": 1.1538, "mean_token_accuracy": 0.5545760612934828, "num_tokens": 295895.0, "step": 9020 }, { "epoch": 17.00564705882353, "grad_norm": 0.5843003392219543, "learning_rate": 6.8679746626109165e-06, "loss": 1.304, "mean_token_accuracy": 0.5468519330024719, "num_tokens": 309096.0, "step": 9030 }, { "epoch": 17.024470588235292, "grad_norm": 0.5110841393470764, "learning_rate": 6.848064824702518e-06, "loss": 1.1689, "mean_token_accuracy": 0.5539047036319971, "num_tokens": 322843.0, "step": 9040 }, { "epoch": 17.043294117647058, "grad_norm": 0.681012749671936, "learning_rate": 6.828168849666859e-06, "loss": 1.1473, "mean_token_accuracy": 0.5699756104499102, "num_tokens": 335834.0, "step": 9050 }, { "epoch": 17.062117647058823, "grad_norm": 0.6035940647125244, "learning_rate": 6.808286825010611e-06, "loss": 1.1957, "mean_token_accuracy": 0.5480252616107464, "num_tokens": 349415.0, "step": 9060 }, { "epoch": 17.08094117647059, "grad_norm": 0.793001651763916, "learning_rate": 6.788418838179101e-06, "loss": 1.1495, "mean_token_accuracy": 0.5695446979254484, "num_tokens": 362782.0, "step": 9070 }, { "epoch": 17.099764705882354, "grad_norm": 0.5845211148262024, "learning_rate": 6.768564976555898e-06, "loss": 1.2018, "mean_token_accuracy": 0.5484800077974796, "num_tokens": 375606.0, "step": 9080 }, { "epoch": 17.11858823529412, "grad_norm": 0.7158066630363464, "learning_rate": 6.748725327462462e-06, "loss": 1.1601, "mean_token_accuracy": 0.5678265064954757, "num_tokens": 388427.0, "step": 9090 }, { "epoch": 17.13741176470588, "grad_norm": 1.2140324115753174, "learning_rate": 6.728899978157729e-06, "loss": 1.2314, "mean_token_accuracy": 0.5344064626842737, "num_tokens": 402111.0, "step": 9100 }, { "epoch": 17.156235294117646, "grad_norm": 1.0139904022216797, "learning_rate": 6.709089015837758e-06, "loss": 1.1831, "mean_token_accuracy": 0.5626831982284785, "num_tokens": 416419.0, "step": 9110 }, { "epoch": 17.17505882352941, "grad_norm": 0.5783360600471497, "learning_rate": 6.68929252763531e-06, "loss": 1.1888, "mean_token_accuracy": 0.5566362496465445, "num_tokens": 430433.0, "step": 9120 }, { "epoch": 17.193882352941177, "grad_norm": 1.0979998111724854, "learning_rate": 6.669510600619502e-06, "loss": 1.1366, "mean_token_accuracy": 0.572005919739604, "num_tokens": 442507.0, "step": 9130 }, { "epoch": 17.212705882352942, "grad_norm": 1.239842414855957, "learning_rate": 6.649743321795401e-06, "loss": 1.1488, "mean_token_accuracy": 0.5650555603206158, "num_tokens": 455301.0, "step": 9140 }, { "epoch": 17.231529411764704, "grad_norm": 0.9120736718177795, "learning_rate": 6.629990778103652e-06, "loss": 1.1347, "mean_token_accuracy": 0.5705232992768288, "num_tokens": 468128.0, "step": 9150 }, { "epoch": 17.25035294117647, "grad_norm": 1.5878956317901611, "learning_rate": 6.6102530564200885e-06, "loss": 1.1428, "mean_token_accuracy": 0.5688801523298025, "num_tokens": 482382.0, "step": 9160 }, { "epoch": 17.269176470588235, "grad_norm": 1.3523510694503784, "learning_rate": 6.5905302435553575e-06, "loss": 1.1501, "mean_token_accuracy": 0.5716863550245762, "num_tokens": 495560.0, "step": 9170 }, { "epoch": 17.288, "grad_norm": 1.0103275775909424, "learning_rate": 6.570822426254526e-06, "loss": 1.1479, "mean_token_accuracy": 0.5623312875628471, "num_tokens": 509660.0, "step": 9180 }, { "epoch": 17.306823529411766, "grad_norm": 0.7961730360984802, "learning_rate": 6.55112969119672e-06, "loss": 1.177, "mean_token_accuracy": 0.5561908625066281, "num_tokens": 523652.0, "step": 9190 }, { "epoch": 17.32564705882353, "grad_norm": 1.049294114112854, "learning_rate": 6.531452124994716e-06, "loss": 1.1729, "mean_token_accuracy": 0.553871612995863, "num_tokens": 536343.0, "step": 9200 }, { "epoch": 17.344470588235293, "grad_norm": 0.9252281188964844, "learning_rate": 6.511789814194588e-06, "loss": 1.1302, "mean_token_accuracy": 0.567984351888299, "num_tokens": 549308.0, "step": 9210 }, { "epoch": 17.363294117647058, "grad_norm": 0.8509281277656555, "learning_rate": 6.492142845275302e-06, "loss": 1.1896, "mean_token_accuracy": 0.5457706928253174, "num_tokens": 562695.0, "step": 9220 }, { "epoch": 17.382117647058823, "grad_norm": 0.8771809935569763, "learning_rate": 6.472511304648359e-06, "loss": 1.1732, "mean_token_accuracy": 0.5531365133821964, "num_tokens": 575073.0, "step": 9230 }, { "epoch": 17.40094117647059, "grad_norm": 1.814473032951355, "learning_rate": 6.4528952786573904e-06, "loss": 1.1541, "mean_token_accuracy": 0.5633249927312136, "num_tokens": 588911.0, "step": 9240 }, { "epoch": 17.419764705882354, "grad_norm": 0.7689526081085205, "learning_rate": 6.4332948535778075e-06, "loss": 1.2086, "mean_token_accuracy": 0.5450298830866813, "num_tokens": 603178.0, "step": 9250 }, { "epoch": 17.438588235294116, "grad_norm": 0.8878546357154846, "learning_rate": 6.413710115616383e-06, "loss": 1.1557, "mean_token_accuracy": 0.5638493042439222, "num_tokens": 616690.0, "step": 9260 }, { "epoch": 17.45741176470588, "grad_norm": 0.610453188419342, "learning_rate": 6.394141150910913e-06, "loss": 1.1544, "mean_token_accuracy": 0.5664511952549219, "num_tokens": 629868.0, "step": 9270 }, { "epoch": 17.476235294117647, "grad_norm": 0.7785117626190186, "learning_rate": 6.37458804552981e-06, "loss": 1.1758, "mean_token_accuracy": 0.5591502383351326, "num_tokens": 643658.0, "step": 9280 }, { "epoch": 17.495058823529412, "grad_norm": 0.694078803062439, "learning_rate": 6.355050885471743e-06, "loss": 1.1698, "mean_token_accuracy": 0.5497753735631704, "num_tokens": 656038.0, "step": 9290 }, { "epoch": 17.513882352941177, "grad_norm": 0.9329729676246643, "learning_rate": 6.33552975666524e-06, "loss": 1.1292, "mean_token_accuracy": 0.5673054289072752, "num_tokens": 670286.0, "step": 9300 }, { "epoch": 17.532705882352943, "grad_norm": 1.1342458724975586, "learning_rate": 6.316024744968327e-06, "loss": 1.2161, "mean_token_accuracy": 0.5357775934040546, "num_tokens": 683493.0, "step": 9310 }, { "epoch": 17.551529411764704, "grad_norm": 0.8364800810813904, "learning_rate": 6.296535936168137e-06, "loss": 1.1663, "mean_token_accuracy": 0.5568131286650896, "num_tokens": 697575.0, "step": 9320 }, { "epoch": 17.57035294117647, "grad_norm": 1.625592589378357, "learning_rate": 6.277063415980549e-06, "loss": 1.1174, "mean_token_accuracy": 0.5752797372639179, "num_tokens": 710207.0, "step": 9330 }, { "epoch": 17.589176470588235, "grad_norm": 1.3862090110778809, "learning_rate": 6.257607270049791e-06, "loss": 1.142, "mean_token_accuracy": 0.5722228426486253, "num_tokens": 724438.0, "step": 9340 }, { "epoch": 17.608, "grad_norm": 1.26033616065979, "learning_rate": 6.238167583948082e-06, "loss": 1.1907, "mean_token_accuracy": 0.5385926622897387, "num_tokens": 739319.0, "step": 9350 }, { "epoch": 17.626823529411766, "grad_norm": 1.0322513580322266, "learning_rate": 6.218744443175237e-06, "loss": 1.1304, "mean_token_accuracy": 0.5683851022273302, "num_tokens": 751914.0, "step": 9360 }, { "epoch": 17.645647058823528, "grad_norm": 0.7326356172561646, "learning_rate": 6.199337933158316e-06, "loss": 1.1813, "mean_token_accuracy": 0.5477977491915226, "num_tokens": 766447.0, "step": 9370 }, { "epoch": 17.664470588235293, "grad_norm": 0.9041365385055542, "learning_rate": 6.179948139251218e-06, "loss": 1.1652, "mean_token_accuracy": 0.55347336307168, "num_tokens": 779625.0, "step": 9380 }, { "epoch": 17.68329411764706, "grad_norm": 1.576574683189392, "learning_rate": 6.160575146734338e-06, "loss": 1.1529, "mean_token_accuracy": 0.5669393539428711, "num_tokens": 793737.0, "step": 9390 }, { "epoch": 17.702117647058824, "grad_norm": 1.3531404733657837, "learning_rate": 6.1412190408141505e-06, "loss": 1.1197, "mean_token_accuracy": 0.5701812230050564, "num_tokens": 807112.0, "step": 9400 }, { "epoch": 17.72094117647059, "grad_norm": 1.6633743047714233, "learning_rate": 6.121879906622883e-06, "loss": 1.1761, "mean_token_accuracy": 0.560976068302989, "num_tokens": 820492.0, "step": 9410 }, { "epoch": 17.739764705882354, "grad_norm": 1.1381210088729858, "learning_rate": 6.102557829218105e-06, "loss": 1.1562, "mean_token_accuracy": 0.5558978658169508, "num_tokens": 834186.0, "step": 9420 }, { "epoch": 17.758588235294116, "grad_norm": 1.8115606307983398, "learning_rate": 6.083252893582374e-06, "loss": 1.1872, "mean_token_accuracy": 0.554209940135479, "num_tokens": 847318.0, "step": 9430 }, { "epoch": 17.77741176470588, "grad_norm": 1.288480520248413, "learning_rate": 6.063965184622845e-06, "loss": 1.1726, "mean_token_accuracy": 0.5530060395598412, "num_tokens": 860095.0, "step": 9440 }, { "epoch": 17.796235294117647, "grad_norm": 1.2023969888687134, "learning_rate": 6.0446947871709174e-06, "loss": 1.1904, "mean_token_accuracy": 0.5426197368651628, "num_tokens": 873256.0, "step": 9450 }, { "epoch": 17.815058823529412, "grad_norm": 1.5823273658752441, "learning_rate": 6.025441785981843e-06, "loss": 1.1334, "mean_token_accuracy": 0.5691535335034132, "num_tokens": 886435.0, "step": 9460 }, { "epoch": 17.833882352941178, "grad_norm": 0.8472403883934021, "learning_rate": 6.006206265734364e-06, "loss": 1.1382, "mean_token_accuracy": 0.5657643742859364, "num_tokens": 899127.0, "step": 9470 }, { "epoch": 17.852705882352943, "grad_norm": 0.931440532207489, "learning_rate": 5.9869883110303366e-06, "loss": 1.1718, "mean_token_accuracy": 0.5716207943856716, "num_tokens": 913094.0, "step": 9480 }, { "epoch": 17.871529411764705, "grad_norm": 0.7743551135063171, "learning_rate": 5.967788006394364e-06, "loss": 1.1778, "mean_token_accuracy": 0.5500955499708653, "num_tokens": 927506.0, "step": 9490 }, { "epoch": 17.89035294117647, "grad_norm": 3.367818593978882, "learning_rate": 5.948605436273411e-06, "loss": 1.1036, "mean_token_accuracy": 0.5776654280722141, "num_tokens": 940411.0, "step": 9500 }, { "epoch": 17.909176470588235, "grad_norm": 0.621562659740448, "learning_rate": 5.9294406850364584e-06, "loss": 1.2119, "mean_token_accuracy": 0.5432645879685879, "num_tokens": 954352.0, "step": 9510 }, { "epoch": 17.928, "grad_norm": 0.5852854251861572, "learning_rate": 5.910293836974099e-06, "loss": 1.1967, "mean_token_accuracy": 0.5400953222066164, "num_tokens": 967263.0, "step": 9520 }, { "epoch": 17.946823529411766, "grad_norm": 0.6211656332015991, "learning_rate": 5.891164976298198e-06, "loss": 1.1627, "mean_token_accuracy": 0.5562442850321532, "num_tokens": 980662.0, "step": 9530 }, { "epoch": 17.965647058823528, "grad_norm": 0.6909055709838867, "learning_rate": 5.872054187141492e-06, "loss": 1.1726, "mean_token_accuracy": 0.5591957967728376, "num_tokens": 993499.0, "step": 9540 }, { "epoch": 17.984470588235293, "grad_norm": 1.064255714416504, "learning_rate": 5.852961553557251e-06, "loss": 1.188, "mean_token_accuracy": 0.5609757989645004, "num_tokens": 1007775.0, "step": 9550 }, { "epoch": 18.001882352941177, "grad_norm": 1.33067786693573, "learning_rate": 5.833887159518882e-06, "loss": 1.1521, "mean_token_accuracy": 0.5590104452661566, "num_tokens": 1019534.0, "step": 9560 }, { "epoch": 18.020705882352942, "grad_norm": 1.1214513778686523, "learning_rate": 5.8148310889195795e-06, "loss": 1.1953, "mean_token_accuracy": 0.5516563657671213, "num_tokens": 1032963.0, "step": 9570 }, { "epoch": 18.039529411764708, "grad_norm": 1.491575837135315, "learning_rate": 5.795793425571943e-06, "loss": 1.1595, "mean_token_accuracy": 0.5553607545793057, "num_tokens": 1045960.0, "step": 9580 }, { "epoch": 18.05835294117647, "grad_norm": 0.6637427806854248, "learning_rate": 5.776774253207607e-06, "loss": 1.1874, "mean_token_accuracy": 0.5493495386093855, "num_tokens": 1060210.0, "step": 9590 }, { "epoch": 18.077176470588235, "grad_norm": 0.5911340117454529, "learning_rate": 5.757773655476895e-06, "loss": 1.1127, "mean_token_accuracy": 0.57906415425241, "num_tokens": 1072162.0, "step": 9600 }, { "epoch": 18.097882352941177, "grad_norm": 1.7098009586334229, "learning_rate": 5.738791715948421e-06, "loss": 1.1165, "mean_token_accuracy": 0.5782070815563202, "num_tokens": 13939.0, "step": 9610 }, { "epoch": 18.116705882352942, "grad_norm": 0.6690590977668762, "learning_rate": 5.7198285181087406e-06, "loss": 1.1392, "mean_token_accuracy": 0.5584286205470562, "num_tokens": 28443.0, "step": 9620 }, { "epoch": 18.135529411764704, "grad_norm": 1.1516035795211792, "learning_rate": 5.700884145361976e-06, "loss": 1.202, "mean_token_accuracy": 0.543058916553855, "num_tokens": 43005.0, "step": 9630 }, { "epoch": 18.15435294117647, "grad_norm": 0.6750898957252502, "learning_rate": 5.6819586810294635e-06, "loss": 1.0982, "mean_token_accuracy": 0.5858326137065888, "num_tokens": 55756.0, "step": 9640 }, { "epoch": 18.173176470588235, "grad_norm": 1.525680661201477, "learning_rate": 5.663052208349367e-06, "loss": 1.0754, "mean_token_accuracy": 0.5846003469079732, "num_tokens": 68605.0, "step": 9650 }, { "epoch": 18.192, "grad_norm": 1.5765116214752197, "learning_rate": 5.6441648104763215e-06, "loss": 1.1771, "mean_token_accuracy": 0.5558013815432787, "num_tokens": 82077.0, "step": 9660 }, { "epoch": 18.210823529411766, "grad_norm": 0.8816690444946289, "learning_rate": 5.625296570481069e-06, "loss": 1.1803, "mean_token_accuracy": 0.5606073562055827, "num_tokens": 94758.0, "step": 9670 }, { "epoch": 18.22964705882353, "grad_norm": 1.345479130744934, "learning_rate": 5.606447571350093e-06, "loss": 1.2028, "mean_token_accuracy": 0.5379578843712807, "num_tokens": 109010.0, "step": 9680 }, { "epoch": 18.248470588235293, "grad_norm": 1.374245524406433, "learning_rate": 5.587617895985247e-06, "loss": 1.196, "mean_token_accuracy": 0.5491275552660226, "num_tokens": 122939.0, "step": 9690 }, { "epoch": 18.267294117647058, "grad_norm": 0.7323598265647888, "learning_rate": 5.568807627203399e-06, "loss": 1.1414, "mean_token_accuracy": 0.5601202577352524, "num_tokens": 137029.0, "step": 9700 }, { "epoch": 18.286117647058823, "grad_norm": 1.2711374759674072, "learning_rate": 5.550016847736055e-06, "loss": 1.1124, "mean_token_accuracy": 0.5777692060917616, "num_tokens": 149183.0, "step": 9710 }, { "epoch": 18.30494117647059, "grad_norm": 0.8101398944854736, "learning_rate": 5.5312456402290174e-06, "loss": 1.1478, "mean_token_accuracy": 0.5615578092634678, "num_tokens": 163147.0, "step": 9720 }, { "epoch": 18.323764705882354, "grad_norm": 1.201725721359253, "learning_rate": 5.512494087241995e-06, "loss": 1.1889, "mean_token_accuracy": 0.5410687677562237, "num_tokens": 176934.0, "step": 9730 }, { "epoch": 18.342588235294116, "grad_norm": 1.2040985822677612, "learning_rate": 5.493762271248255e-06, "loss": 1.0963, "mean_token_accuracy": 0.5771806977689267, "num_tokens": 189676.0, "step": 9740 }, { "epoch": 18.36141176470588, "grad_norm": 1.194765567779541, "learning_rate": 5.475050274634255e-06, "loss": 1.2101, "mean_token_accuracy": 0.5370388999581337, "num_tokens": 202706.0, "step": 9750 }, { "epoch": 18.380235294117647, "grad_norm": 1.3350589275360107, "learning_rate": 5.456358179699289e-06, "loss": 1.17, "mean_token_accuracy": 0.5458086933940649, "num_tokens": 216179.0, "step": 9760 }, { "epoch": 18.399058823529412, "grad_norm": 1.164698839187622, "learning_rate": 5.437686068655115e-06, "loss": 1.2626, "mean_token_accuracy": 0.534201942011714, "num_tokens": 229633.0, "step": 9770 }, { "epoch": 18.417882352941177, "grad_norm": 0.6664723753929138, "learning_rate": 5.419034023625597e-06, "loss": 1.1409, "mean_token_accuracy": 0.5639401733875274, "num_tokens": 242540.0, "step": 9780 }, { "epoch": 18.436705882352943, "grad_norm": 0.5169602036476135, "learning_rate": 5.4004021266463415e-06, "loss": 1.1817, "mean_token_accuracy": 0.5450482603162528, "num_tokens": 254975.0, "step": 9790 }, { "epoch": 18.455529411764704, "grad_norm": 0.5377786755561829, "learning_rate": 5.381790459664355e-06, "loss": 1.1443, "mean_token_accuracy": 0.5668651383370161, "num_tokens": 269635.0, "step": 9800 }, { "epoch": 18.47435294117647, "grad_norm": 1.7214852571487427, "learning_rate": 5.363199104537649e-06, "loss": 1.1384, "mean_token_accuracy": 0.5759254258126021, "num_tokens": 282552.0, "step": 9810 }, { "epoch": 18.493176470588235, "grad_norm": 1.1212029457092285, "learning_rate": 5.344628143034904e-06, "loss": 1.1671, "mean_token_accuracy": 0.5748541194945573, "num_tokens": 296715.0, "step": 9820 }, { "epoch": 18.512, "grad_norm": 0.8291766047477722, "learning_rate": 5.32607765683511e-06, "loss": 1.171, "mean_token_accuracy": 0.5621029295027256, "num_tokens": 310741.0, "step": 9830 }, { "epoch": 18.530823529411766, "grad_norm": 1.666212558746338, "learning_rate": 5.307547727527207e-06, "loss": 1.1493, "mean_token_accuracy": 0.5714134465903044, "num_tokens": 324377.0, "step": 9840 }, { "epoch": 18.54964705882353, "grad_norm": 0.6212410926818848, "learning_rate": 5.28903843660971e-06, "loss": 1.182, "mean_token_accuracy": 0.5523442510515452, "num_tokens": 336585.0, "step": 9850 }, { "epoch": 18.568470588235293, "grad_norm": 0.763521134853363, "learning_rate": 5.2705498654903666e-06, "loss": 1.2182, "mean_token_accuracy": 0.5365333639085292, "num_tokens": 351773.0, "step": 9860 }, { "epoch": 18.58729411764706, "grad_norm": 1.7621654272079468, "learning_rate": 5.252082095485793e-06, "loss": 1.134, "mean_token_accuracy": 0.5687922302633523, "num_tokens": 364135.0, "step": 9870 }, { "epoch": 18.606117647058824, "grad_norm": 0.8109177350997925, "learning_rate": 5.233635207821126e-06, "loss": 1.1571, "mean_token_accuracy": 0.5557177890092134, "num_tokens": 377314.0, "step": 9880 }, { "epoch": 18.62494117647059, "grad_norm": 1.2423245906829834, "learning_rate": 5.215209283629647e-06, "loss": 1.1754, "mean_token_accuracy": 0.5624301459640264, "num_tokens": 391043.0, "step": 9890 }, { "epoch": 18.643764705882354, "grad_norm": 1.4668940305709839, "learning_rate": 5.19680440395244e-06, "loss": 1.1433, "mean_token_accuracy": 0.5692973904311657, "num_tokens": 404155.0, "step": 9900 }, { "epoch": 18.662588235294116, "grad_norm": 0.6444953083992004, "learning_rate": 5.1784206497380275e-06, "loss": 1.1656, "mean_token_accuracy": 0.5535128690302372, "num_tokens": 417083.0, "step": 9910 }, { "epoch": 18.68141176470588, "grad_norm": 0.9833362102508545, "learning_rate": 5.160058101842025e-06, "loss": 1.1301, "mean_token_accuracy": 0.5652093205600976, "num_tokens": 430739.0, "step": 9920 }, { "epoch": 18.700235294117647, "grad_norm": 1.3160921335220337, "learning_rate": 5.141716841026774e-06, "loss": 1.2158, "mean_token_accuracy": 0.5528298642486333, "num_tokens": 444108.0, "step": 9930 }, { "epoch": 18.719058823529412, "grad_norm": 0.6429352164268494, "learning_rate": 5.123396947960993e-06, "loss": 1.1509, "mean_token_accuracy": 0.5627094566822052, "num_tokens": 457365.0, "step": 9940 }, { "epoch": 18.737882352941178, "grad_norm": 0.57741379737854, "learning_rate": 5.105098503219408e-06, "loss": 1.1677, "mean_token_accuracy": 0.5417445503175259, "num_tokens": 470389.0, "step": 9950 }, { "epoch": 18.756705882352943, "grad_norm": 1.4769562482833862, "learning_rate": 5.08682158728243e-06, "loss": 1.1693, "mean_token_accuracy": 0.5543987430632115, "num_tokens": 483516.0, "step": 9960 }, { "epoch": 18.775529411764705, "grad_norm": 1.122862696647644, "learning_rate": 5.068566280535772e-06, "loss": 1.1676, "mean_token_accuracy": 0.5597089301794768, "num_tokens": 497211.0, "step": 9970 }, { "epoch": 18.79435294117647, "grad_norm": 1.3088434934616089, "learning_rate": 5.050332663270105e-06, "loss": 1.1703, "mean_token_accuracy": 0.5574114482849837, "num_tokens": 511246.0, "step": 9980 }, { "epoch": 18.813176470588235, "grad_norm": 1.146748661994934, "learning_rate": 5.032120815680703e-06, "loss": 1.1415, "mean_token_accuracy": 0.5681348893791437, "num_tokens": 524961.0, "step": 9990 }, { "epoch": 18.832, "grad_norm": 0.6369304060935974, "learning_rate": 5.013930817867103e-06, "loss": 1.1355, "mean_token_accuracy": 0.5745254665613174, "num_tokens": 537543.0, "step": 10000 }, { "epoch": 18.850823529411766, "grad_norm": 0.804693341255188, "learning_rate": 4.995762749832731e-06, "loss": 1.1858, "mean_token_accuracy": 0.5416501805186271, "num_tokens": 550396.0, "step": 10010 }, { "epoch": 18.869647058823528, "grad_norm": 1.5857802629470825, "learning_rate": 4.977616691484567e-06, "loss": 1.1571, "mean_token_accuracy": 0.5618045397102833, "num_tokens": 563953.0, "step": 10020 }, { "epoch": 18.888470588235293, "grad_norm": 1.1062195301055908, "learning_rate": 4.9594927226327795e-06, "loss": 1.2112, "mean_token_accuracy": 0.5421402599662543, "num_tokens": 577786.0, "step": 10030 }, { "epoch": 18.90729411764706, "grad_norm": 0.7114964127540588, "learning_rate": 4.941390922990398e-06, "loss": 1.1818, "mean_token_accuracy": 0.5595896728336811, "num_tokens": 592052.0, "step": 10040 }, { "epoch": 18.926117647058824, "grad_norm": 1.7100679874420166, "learning_rate": 4.923311372172935e-06, "loss": 1.1518, "mean_token_accuracy": 0.5808280512690545, "num_tokens": 605121.0, "step": 10050 }, { "epoch": 18.94494117647059, "grad_norm": 1.4883623123168945, "learning_rate": 4.905254149698049e-06, "loss": 1.1205, "mean_token_accuracy": 0.564001039788127, "num_tokens": 618937.0, "step": 10060 }, { "epoch": 18.963764705882355, "grad_norm": 0.5710100531578064, "learning_rate": 4.8872193349852e-06, "loss": 1.193, "mean_token_accuracy": 0.5475729245692491, "num_tokens": 631403.0, "step": 10070 }, { "epoch": 18.982588235294116, "grad_norm": 0.82412189245224, "learning_rate": 4.869207007355286e-06, "loss": 1.1769, "mean_token_accuracy": 0.549387214705348, "num_tokens": 644809.0, "step": 10080 }, { "epoch": 19.001882352941177, "grad_norm": 4.334903717041016, "learning_rate": 4.851217246030307e-06, "loss": 1.2232, "mean_token_accuracy": 0.5745812316493291, "num_tokens": 658742.0, "step": 10090 }, { "epoch": 19.020705882352942, "grad_norm": 0.7227234244346619, "learning_rate": 4.833250130133014e-06, "loss": 1.1446, "mean_token_accuracy": 0.5578329466283322, "num_tokens": 672376.0, "step": 10100 }, { "epoch": 19.039529411764708, "grad_norm": 1.063941240310669, "learning_rate": 4.815305738686548e-06, "loss": 1.1323, "mean_token_accuracy": 0.5669731423258781, "num_tokens": 684972.0, "step": 10110 }, { "epoch": 19.05835294117647, "grad_norm": 1.1839574575424194, "learning_rate": 4.7973841506141195e-06, "loss": 1.178, "mean_token_accuracy": 0.5547245424240828, "num_tokens": 699346.0, "step": 10120 }, { "epoch": 19.077176470588235, "grad_norm": 0.8287972807884216, "learning_rate": 4.779485444738632e-06, "loss": 1.1305, "mean_token_accuracy": 0.5627703540027141, "num_tokens": 711760.0, "step": 10130 }, { "epoch": 19.096, "grad_norm": 1.2000936269760132, "learning_rate": 4.761609699782351e-06, "loss": 1.1206, "mean_token_accuracy": 0.5821688748896122, "num_tokens": 724959.0, "step": 10140 }, { "epoch": 19.114823529411765, "grad_norm": 1.1382743120193481, "learning_rate": 4.743756994366555e-06, "loss": 1.199, "mean_token_accuracy": 0.5427775271236897, "num_tokens": 738570.0, "step": 10150 }, { "epoch": 19.13364705882353, "grad_norm": 0.6352145075798035, "learning_rate": 4.7259274070111986e-06, "loss": 1.1679, "mean_token_accuracy": 0.5518446248024702, "num_tokens": 751688.0, "step": 10160 }, { "epoch": 19.152470588235293, "grad_norm": 1.043312668800354, "learning_rate": 4.708121016134545e-06, "loss": 1.1412, "mean_token_accuracy": 0.56727832891047, "num_tokens": 765261.0, "step": 10170 }, { "epoch": 19.171294117647058, "grad_norm": 2.1281962394714355, "learning_rate": 4.69033790005284e-06, "loss": 1.1316, "mean_token_accuracy": 0.5734411317855119, "num_tokens": 777972.0, "step": 10180 }, { "epoch": 19.190117647058823, "grad_norm": 1.2191262245178223, "learning_rate": 4.672578136979961e-06, "loss": 1.2033, "mean_token_accuracy": 0.5502295974642039, "num_tokens": 792540.0, "step": 10190 }, { "epoch": 19.20894117647059, "grad_norm": 2.241875648498535, "learning_rate": 4.65484180502708e-06, "loss": 1.2114, "mean_token_accuracy": 0.5526003040373325, "num_tokens": 805628.0, "step": 10200 }, { "epoch": 19.227764705882354, "grad_norm": 1.0254322290420532, "learning_rate": 4.637128982202308e-06, "loss": 1.1448, "mean_token_accuracy": 0.566441947594285, "num_tokens": 818605.0, "step": 10210 }, { "epoch": 19.24658823529412, "grad_norm": 0.5974338054656982, "learning_rate": 4.619439746410361e-06, "loss": 1.1663, "mean_token_accuracy": 0.5573429156094789, "num_tokens": 831744.0, "step": 10220 }, { "epoch": 19.26541176470588, "grad_norm": 1.1514984369277954, "learning_rate": 4.601774175452203e-06, "loss": 1.1816, "mean_token_accuracy": 0.5479875948280096, "num_tokens": 844511.0, "step": 10230 }, { "epoch": 19.284235294117646, "grad_norm": 1.9380877017974854, "learning_rate": 4.584132347024732e-06, "loss": 1.1513, "mean_token_accuracy": 0.5600051417946815, "num_tokens": 857034.0, "step": 10240 }, { "epoch": 19.303058823529412, "grad_norm": 1.8556410074234009, "learning_rate": 4.566514338720414e-06, "loss": 1.2121, "mean_token_accuracy": 0.5544085066765547, "num_tokens": 870895.0, "step": 10250 }, { "epoch": 19.321882352941177, "grad_norm": 0.7411885857582092, "learning_rate": 4.5489202280269465e-06, "loss": 1.1471, "mean_token_accuracy": 0.5642319560050965, "num_tokens": 883794.0, "step": 10260 }, { "epoch": 19.340705882352943, "grad_norm": 0.6563217043876648, "learning_rate": 4.53135009232692e-06, "loss": 1.119, "mean_token_accuracy": 0.571989681199193, "num_tokens": 896094.0, "step": 10270 }, { "epoch": 19.359529411764704, "grad_norm": 0.717928946018219, "learning_rate": 4.513804008897487e-06, "loss": 1.1896, "mean_token_accuracy": 0.5477908588945866, "num_tokens": 909067.0, "step": 10280 }, { "epoch": 19.37835294117647, "grad_norm": 1.7051725387573242, "learning_rate": 4.496282054910006e-06, "loss": 1.2038, "mean_token_accuracy": 0.5528531819581985, "num_tokens": 922861.0, "step": 10290 }, { "epoch": 19.397176470588235, "grad_norm": 0.5672712922096252, "learning_rate": 4.478784307429707e-06, "loss": 1.1883, "mean_token_accuracy": 0.5424028813838959, "num_tokens": 935977.0, "step": 10300 }, { "epoch": 19.416, "grad_norm": 1.4658303260803223, "learning_rate": 4.461310843415354e-06, "loss": 1.1138, "mean_token_accuracy": 0.5855666678398848, "num_tokens": 950190.0, "step": 10310 }, { "epoch": 19.434823529411766, "grad_norm": 0.8565905690193176, "learning_rate": 4.4438617397189185e-06, "loss": 1.1637, "mean_token_accuracy": 0.5575710866600275, "num_tokens": 964649.0, "step": 10320 }, { "epoch": 19.45364705882353, "grad_norm": 1.2235567569732666, "learning_rate": 4.42643707308522e-06, "loss": 1.1326, "mean_token_accuracy": 0.5670348349958658, "num_tokens": 978559.0, "step": 10330 }, { "epoch": 19.472470588235293, "grad_norm": 0.79115229845047, "learning_rate": 4.4090369201516e-06, "loss": 1.1965, "mean_token_accuracy": 0.5441572275012732, "num_tokens": 992276.0, "step": 10340 }, { "epoch": 19.491294117647058, "grad_norm": 1.4369901418685913, "learning_rate": 4.391661357447585e-06, "loss": 1.1701, "mean_token_accuracy": 0.5550823096185923, "num_tokens": 1006379.0, "step": 10350 }, { "epoch": 19.510117647058824, "grad_norm": 1.175675868988037, "learning_rate": 4.374310461394548e-06, "loss": 1.1332, "mean_token_accuracy": 0.5736968379467726, "num_tokens": 1019668.0, "step": 10360 }, { "epoch": 19.52894117647059, "grad_norm": 0.5965217351913452, "learning_rate": 4.356984308305374e-06, "loss": 1.1563, "mean_token_accuracy": 0.5608095470815897, "num_tokens": 1032049.0, "step": 10370 }, { "epoch": 19.547764705882354, "grad_norm": 1.406221628189087, "learning_rate": 4.3396829743841205e-06, "loss": 1.1749, "mean_token_accuracy": 0.5496669236570597, "num_tokens": 1045211.0, "step": 10380 }, { "epoch": 19.566588235294116, "grad_norm": 1.215728759765625, "learning_rate": 4.322406535725686e-06, "loss": 1.1748, "mean_token_accuracy": 0.5576162055134773, "num_tokens": 1058179.0, "step": 10390 }, { "epoch": 19.58541176470588, "grad_norm": 1.223363995552063, "learning_rate": 4.305155068315481e-06, "loss": 1.1467, "mean_token_accuracy": 0.5632787074893713, "num_tokens": 1071797.0, "step": 10400 }, { "epoch": 19.604235294117647, "grad_norm": 1.32563054561615, "learning_rate": 4.2879286480290784e-06, "loss": 1.1665, "mean_token_accuracy": 0.543903386592865, "num_tokens": 1085172.0, "step": 10410 }, { "epoch": 19.623058823529412, "grad_norm": 1.0964701175689697, "learning_rate": 4.270727350631892e-06, "loss": 1.1368, "mean_token_accuracy": 0.5769836001098156, "num_tokens": 1098617.0, "step": 10420 }, { "epoch": 19.641882352941177, "grad_norm": 0.8849780559539795, "learning_rate": 4.253551251778835e-06, "loss": 1.238, "mean_token_accuracy": 0.5346022747457028, "num_tokens": 1111860.0, "step": 10430 }, { "epoch": 19.660705882352943, "grad_norm": 1.4776290655136108, "learning_rate": 4.236400427014005e-06, "loss": 1.2089, "mean_token_accuracy": 0.5553506713360548, "num_tokens": 1125874.0, "step": 10440 }, { "epoch": 19.679529411764705, "grad_norm": 0.6762340068817139, "learning_rate": 4.2192749517703255e-06, "loss": 1.1319, "mean_token_accuracy": 0.572966867312789, "num_tokens": 1139009.0, "step": 10450 }, { "epoch": 19.69835294117647, "grad_norm": 0.9607488512992859, "learning_rate": 4.202174901369236e-06, "loss": 1.1342, "mean_token_accuracy": 0.5701036512851715, "num_tokens": 1151825.0, "step": 10460 }, { "epoch": 19.717176470588235, "grad_norm": 0.7690389156341553, "learning_rate": 4.1851003510203416e-06, "loss": 1.1599, "mean_token_accuracy": 0.554409109801054, "num_tokens": 1165331.0, "step": 10470 }, { "epoch": 19.736, "grad_norm": 0.6242837905883789, "learning_rate": 4.168051375821108e-06, "loss": 1.1945, "mean_token_accuracy": 0.5393414959311486, "num_tokens": 1178787.0, "step": 10480 }, { "epoch": 19.754823529411766, "grad_norm": 0.8885065317153931, "learning_rate": 4.151028050756507e-06, "loss": 1.1456, "mean_token_accuracy": 0.5653862472623586, "num_tokens": 1191650.0, "step": 10490 }, { "epoch": 19.773647058823528, "grad_norm": 0.7802302837371826, "learning_rate": 4.134030450698697e-06, "loss": 1.1645, "mean_token_accuracy": 0.5542376168072224, "num_tokens": 1205371.0, "step": 10500 }, { "epoch": 19.792470588235293, "grad_norm": 0.8476331233978271, "learning_rate": 4.117058650406683e-06, "loss": 1.1996, "mean_token_accuracy": 0.5366521954536438, "num_tokens": 1218885.0, "step": 10510 }, { "epoch": 19.81129411764706, "grad_norm": 0.7439809441566467, "learning_rate": 4.1001127245260175e-06, "loss": 1.162, "mean_token_accuracy": 0.5500502925366163, "num_tokens": 1232409.0, "step": 10520 }, { "epoch": 19.830117647058824, "grad_norm": 0.8466014266014099, "learning_rate": 4.083192747588436e-06, "loss": 1.2165, "mean_token_accuracy": 0.547482916712761, "num_tokens": 1245876.0, "step": 10530 }, { "epoch": 19.84894117647059, "grad_norm": 0.9549068808555603, "learning_rate": 4.066298794011551e-06, "loss": 1.1552, "mean_token_accuracy": 0.567984651774168, "num_tokens": 1260603.0, "step": 10540 }, { "epoch": 19.867764705882355, "grad_norm": 0.8882144689559937, "learning_rate": 4.049430938098513e-06, "loss": 1.1424, "mean_token_accuracy": 0.566171682626009, "num_tokens": 1274404.0, "step": 10550 }, { "epoch": 19.886588235294116, "grad_norm": 1.1163575649261475, "learning_rate": 4.0325892540377035e-06, "loss": 1.1986, "mean_token_accuracy": 0.54889883287251, "num_tokens": 1288135.0, "step": 10560 }, { "epoch": 19.90541176470588, "grad_norm": 0.5996796488761902, "learning_rate": 4.01577381590238e-06, "loss": 1.1317, "mean_token_accuracy": 0.5590782940387726, "num_tokens": 1301565.0, "step": 10570 }, { "epoch": 19.924235294117647, "grad_norm": 0.5613903999328613, "learning_rate": 3.998984697650369e-06, "loss": 1.144, "mean_token_accuracy": 0.5581843961030245, "num_tokens": 1315363.0, "step": 10580 }, { "epoch": 19.943058823529412, "grad_norm": 1.517250895500183, "learning_rate": 3.982221973123738e-06, "loss": 1.1585, "mean_token_accuracy": 0.5547402266412973, "num_tokens": 1328940.0, "step": 10590 }, { "epoch": 19.961882352941178, "grad_norm": 1.6663577556610107, "learning_rate": 3.965485716048473e-06, "loss": 1.1706, "mean_token_accuracy": 0.5520875003188849, "num_tokens": 1342451.0, "step": 10600 }, { "epoch": 19.98070588235294, "grad_norm": 1.2554893493652344, "learning_rate": 3.948776000034144e-06, "loss": 1.1016, "mean_token_accuracy": 0.5831372920423746, "num_tokens": 1355912.0, "step": 10610 }, { "epoch": 19.999529411764705, "grad_norm": 1.3978781700134277, "learning_rate": 3.932092898573593e-06, "loss": 1.0993, "mean_token_accuracy": 0.5891566134989261, "num_tokens": 1370301.0, "step": 10620 }, { "epoch": 20.01694117647059, "grad_norm": 1.0220972299575806, "learning_rate": 3.915436485042602e-06, "loss": 1.1514, "mean_token_accuracy": 0.5695925415367693, "num_tokens": 1382512.0, "step": 10630 }, { "epoch": 20.035764705882354, "grad_norm": 0.9992367625236511, "learning_rate": 3.898806832699574e-06, "loss": 1.2, "mean_token_accuracy": 0.5552287392318249, "num_tokens": 1395877.0, "step": 10640 }, { "epoch": 20.05458823529412, "grad_norm": 0.8924506306648254, "learning_rate": 3.882204014685213e-06, "loss": 1.1325, "mean_token_accuracy": 0.562846252322197, "num_tokens": 1408762.0, "step": 10650 }, { "epoch": 20.07341176470588, "grad_norm": 1.2033953666687012, "learning_rate": 3.8656281040221975e-06, "loss": 1.1699, "mean_token_accuracy": 0.5635105472058057, "num_tokens": 1421599.0, "step": 10660 }, { "epoch": 20.092235294117646, "grad_norm": 0.999156653881073, "learning_rate": 3.849079173614863e-06, "loss": 1.1869, "mean_token_accuracy": 0.5541719019412994, "num_tokens": 1434720.0, "step": 10670 }, { "epoch": 20.11105882352941, "grad_norm": 1.1332379579544067, "learning_rate": 3.832557296248883e-06, "loss": 1.1769, "mean_token_accuracy": 0.5474078699946403, "num_tokens": 1448113.0, "step": 10680 }, { "epoch": 20.129882352941177, "grad_norm": 1.046257734298706, "learning_rate": 3.816062544590944e-06, "loss": 1.119, "mean_token_accuracy": 0.5713853165507317, "num_tokens": 1462113.0, "step": 10690 }, { "epoch": 20.148705882352942, "grad_norm": 1.346682071685791, "learning_rate": 3.7995949911884235e-06, "loss": 1.1687, "mean_token_accuracy": 0.5563855923712253, "num_tokens": 1474782.0, "step": 10700 }, { "epoch": 20.167529411764704, "grad_norm": 1.4376060962677002, "learning_rate": 3.783154708469079e-06, "loss": 1.1921, "mean_token_accuracy": 0.5529118336737155, "num_tokens": 1488150.0, "step": 10710 }, { "epoch": 20.18635294117647, "grad_norm": 1.298230767250061, "learning_rate": 3.7667417687407305e-06, "loss": 1.1748, "mean_token_accuracy": 0.553018931671977, "num_tokens": 1501896.0, "step": 10720 }, { "epoch": 20.205176470588235, "grad_norm": 1.2680264711380005, "learning_rate": 3.750356244190931e-06, "loss": 1.1694, "mean_token_accuracy": 0.5553711723536253, "num_tokens": 1515310.0, "step": 10730 }, { "epoch": 20.224, "grad_norm": 1.5268313884735107, "learning_rate": 3.7339982068866586e-06, "loss": 1.1437, "mean_token_accuracy": 0.566285153850913, "num_tokens": 1528249.0, "step": 10740 }, { "epoch": 20.242823529411766, "grad_norm": 1.163407802581787, "learning_rate": 3.717667728773995e-06, "loss": 1.1316, "mean_token_accuracy": 0.5630121100693941, "num_tokens": 1541606.0, "step": 10750 }, { "epoch": 20.26164705882353, "grad_norm": 0.8959663510322571, "learning_rate": 3.701364881677809e-06, "loss": 1.163, "mean_token_accuracy": 0.5470004346221685, "num_tokens": 1554546.0, "step": 10760 }, { "epoch": 20.280470588235293, "grad_norm": 0.8293361067771912, "learning_rate": 3.6850897373014514e-06, "loss": 1.2158, "mean_token_accuracy": 0.5398020602762699, "num_tokens": 1567389.0, "step": 10770 }, { "epoch": 20.299294117647058, "grad_norm": 0.7178218364715576, "learning_rate": 3.668842367226427e-06, "loss": 1.1527, "mean_token_accuracy": 0.5576813716441393, "num_tokens": 1580197.0, "step": 10780 }, { "epoch": 20.318117647058823, "grad_norm": 0.8142568469047546, "learning_rate": 3.652622842912068e-06, "loss": 1.167, "mean_token_accuracy": 0.5699529372155666, "num_tokens": 1593797.0, "step": 10790 }, { "epoch": 20.33694117647059, "grad_norm": 1.293581247329712, "learning_rate": 3.6364312356952603e-06, "loss": 1.1763, "mean_token_accuracy": 0.5648769486695528, "num_tokens": 1607332.0, "step": 10800 }, { "epoch": 20.35764705882353, "grad_norm": 1.6430315971374512, "learning_rate": 5.595460614152204e-06, "loss": 1.1903, "mean_token_accuracy": 0.5535745773464441, "num_tokens": 14114.0, "step": 10810 }, { "epoch": 20.376470588235293, "grad_norm": 0.9296208620071411, "learning_rate": 5.582109479305742e-06, "loss": 1.1451, "mean_token_accuracy": 0.5736374389380217, "num_tokens": 26719.0, "step": 10820 }, { "epoch": 20.395294117647058, "grad_norm": 0.7295175790786743, "learning_rate": 5.5687583444592795e-06, "loss": 1.1719, "mean_token_accuracy": 0.5615855868905782, "num_tokens": 40788.0, "step": 10830 }, { "epoch": 20.414117647058823, "grad_norm": 0.5272361636161804, "learning_rate": 5.555407209612818e-06, "loss": 1.1707, "mean_token_accuracy": 0.5535146549344063, "num_tokens": 55000.0, "step": 10840 }, { "epoch": 20.43294117647059, "grad_norm": 1.6679742336273193, "learning_rate": 5.542056074766355e-06, "loss": 1.1919, "mean_token_accuracy": 0.5483698755502701, "num_tokens": 68277.0, "step": 10850 }, { "epoch": 20.451764705882354, "grad_norm": 1.3171534538269043, "learning_rate": 5.528704939919893e-06, "loss": 1.1023, "mean_token_accuracy": 0.5720774076879025, "num_tokens": 81749.0, "step": 10860 }, { "epoch": 20.470588235294116, "grad_norm": 0.6171587705612183, "learning_rate": 5.515353805073432e-06, "loss": 1.1495, "mean_token_accuracy": 0.5558391027152538, "num_tokens": 95748.0, "step": 10870 }, { "epoch": 20.48941176470588, "grad_norm": 1.8184794187545776, "learning_rate": 5.50200267022697e-06, "loss": 1.1554, "mean_token_accuracy": 0.5604365076869726, "num_tokens": 109182.0, "step": 10880 }, { "epoch": 20.508235294117647, "grad_norm": 0.6223208904266357, "learning_rate": 5.488651535380508e-06, "loss": 1.1084, "mean_token_accuracy": 0.5673012807965279, "num_tokens": 122093.0, "step": 10890 }, { "epoch": 20.527058823529412, "grad_norm": 0.6511589288711548, "learning_rate": 5.475300400534046e-06, "loss": 1.1891, "mean_token_accuracy": 0.5500082913786173, "num_tokens": 135801.0, "step": 10900 }, { "epoch": 20.545882352941177, "grad_norm": 1.1379867792129517, "learning_rate": 5.461949265687584e-06, "loss": 1.1577, "mean_token_accuracy": 0.5615630965679884, "num_tokens": 149528.0, "step": 10910 }, { "epoch": 20.564705882352943, "grad_norm": 0.6646468043327332, "learning_rate": 5.448598130841122e-06, "loss": 1.1693, "mean_token_accuracy": 0.5596156906336546, "num_tokens": 162347.0, "step": 10920 }, { "epoch": 20.583529411764705, "grad_norm": 0.6345205903053284, "learning_rate": 5.435246995994659e-06, "loss": 1.1393, "mean_token_accuracy": 0.5636776462197304, "num_tokens": 174228.0, "step": 10930 }, { "epoch": 20.60235294117647, "grad_norm": 1.0836670398712158, "learning_rate": 5.4218958611481976e-06, "loss": 1.1253, "mean_token_accuracy": 0.574345787242055, "num_tokens": 188052.0, "step": 10940 }, { "epoch": 20.621176470588235, "grad_norm": 0.7239655256271362, "learning_rate": 5.408544726301737e-06, "loss": 1.1691, "mean_token_accuracy": 0.5582763768732548, "num_tokens": 202984.0, "step": 10950 }, { "epoch": 20.64, "grad_norm": 0.6753464937210083, "learning_rate": 5.395193591455274e-06, "loss": 1.1513, "mean_token_accuracy": 0.5622932318598032, "num_tokens": 216292.0, "step": 10960 }, { "epoch": 20.658823529411766, "grad_norm": 0.5746181607246399, "learning_rate": 5.381842456608812e-06, "loss": 1.1928, "mean_token_accuracy": 0.5508632536977529, "num_tokens": 229204.0, "step": 10970 }, { "epoch": 20.677647058823528, "grad_norm": 1.1452544927597046, "learning_rate": 5.3684913217623505e-06, "loss": 1.1549, "mean_token_accuracy": 0.5678117204457521, "num_tokens": 242540.0, "step": 10980 }, { "epoch": 20.696470588235293, "grad_norm": 0.6321762800216675, "learning_rate": 5.355140186915888e-06, "loss": 1.1235, "mean_token_accuracy": 0.575124978646636, "num_tokens": 256825.0, "step": 10990 }, { "epoch": 20.71529411764706, "grad_norm": 0.5946145057678223, "learning_rate": 5.341789052069426e-06, "loss": 1.1731, "mean_token_accuracy": 0.5547100655734539, "num_tokens": 270675.0, "step": 11000 }, { "epoch": 20.734117647058824, "grad_norm": 1.3031941652297974, "learning_rate": 5.3284379172229635e-06, "loss": 1.1891, "mean_token_accuracy": 0.5470674268901348, "num_tokens": 282696.0, "step": 11010 }, { "epoch": 20.75294117647059, "grad_norm": 0.5822432637214661, "learning_rate": 5.315086782376503e-06, "loss": 1.2153, "mean_token_accuracy": 0.5374420482665301, "num_tokens": 296789.0, "step": 11020 }, { "epoch": 20.771764705882354, "grad_norm": 1.1034314632415771, "learning_rate": 5.301735647530041e-06, "loss": 1.1166, "mean_token_accuracy": 0.5685049999505282, "num_tokens": 310298.0, "step": 11030 }, { "epoch": 20.790588235294116, "grad_norm": 1.798014521598816, "learning_rate": 5.288384512683579e-06, "loss": 1.1764, "mean_token_accuracy": 0.5581833314150572, "num_tokens": 323136.0, "step": 11040 }, { "epoch": 20.80941176470588, "grad_norm": 1.2033790349960327, "learning_rate": 5.2750333778371165e-06, "loss": 1.1509, "mean_token_accuracy": 0.5552062794566155, "num_tokens": 336592.0, "step": 11050 }, { "epoch": 20.828235294117647, "grad_norm": 0.9958351850509644, "learning_rate": 5.261682242990655e-06, "loss": 1.1421, "mean_token_accuracy": 0.5717320717871189, "num_tokens": 350031.0, "step": 11060 }, { "epoch": 20.847058823529412, "grad_norm": 0.9751930832862854, "learning_rate": 5.248331108144192e-06, "loss": 1.2123, "mean_token_accuracy": 0.5408148296177387, "num_tokens": 364779.0, "step": 11070 }, { "epoch": 20.865882352941178, "grad_norm": 0.6778987646102905, "learning_rate": 5.23497997329773e-06, "loss": 1.1929, "mean_token_accuracy": 0.547919350117445, "num_tokens": 379331.0, "step": 11080 }, { "epoch": 20.88470588235294, "grad_norm": 0.85933518409729, "learning_rate": 5.221628838451269e-06, "loss": 1.119, "mean_token_accuracy": 0.5630400247871876, "num_tokens": 392787.0, "step": 11090 }, { "epoch": 20.903529411764705, "grad_norm": 0.6913843750953674, "learning_rate": 5.208277703604807e-06, "loss": 1.1649, "mean_token_accuracy": 0.5535217590630055, "num_tokens": 406067.0, "step": 11100 }, { "epoch": 20.92235294117647, "grad_norm": 2.0229623317718506, "learning_rate": 5.194926568758345e-06, "loss": 1.12, "mean_token_accuracy": 0.5807195238769054, "num_tokens": 420283.0, "step": 11110 }, { "epoch": 20.941176470588236, "grad_norm": 1.6949794292449951, "learning_rate": 5.181575433911883e-06, "loss": 1.1508, "mean_token_accuracy": 0.5568496011197567, "num_tokens": 433084.0, "step": 11120 }, { "epoch": 20.96, "grad_norm": 0.9725853204727173, "learning_rate": 5.168224299065421e-06, "loss": 1.2174, "mean_token_accuracy": 0.5451920755207539, "num_tokens": 447263.0, "step": 11130 }, { "epoch": 20.978823529411766, "grad_norm": 1.1975111961364746, "learning_rate": 5.154873164218959e-06, "loss": 1.1483, "mean_token_accuracy": 0.5572316914796829, "num_tokens": 460142.0, "step": 11140 }, { "epoch": 20.997647058823528, "grad_norm": 1.078298807144165, "learning_rate": 5.141522029372496e-06, "loss": 1.1615, "mean_token_accuracy": 0.5542684197425842, "num_tokens": 473192.0, "step": 11150 }, { "epoch": 21.01694117647059, "grad_norm": 1.2463195323944092, "learning_rate": 5.128170894526035e-06, "loss": 1.2856, "mean_token_accuracy": 0.5541327973691429, "num_tokens": 486372.0, "step": 11160 }, { "epoch": 21.035764705882354, "grad_norm": 1.2360081672668457, "learning_rate": 5.114819759679574e-06, "loss": 1.1388, "mean_token_accuracy": 0.5694611296057701, "num_tokens": 499191.0, "step": 11170 }, { "epoch": 21.05458823529412, "grad_norm": 0.9578425288200378, "learning_rate": 5.101468624833111e-06, "loss": 1.1818, "mean_token_accuracy": 0.548756854981184, "num_tokens": 512720.0, "step": 11180 }, { "epoch": 21.07341176470588, "grad_norm": 0.6617890000343323, "learning_rate": 5.088117489986649e-06, "loss": 1.1515, "mean_token_accuracy": 0.5633066941052676, "num_tokens": 525141.0, "step": 11190 }, { "epoch": 21.092235294117646, "grad_norm": 0.5509127974510193, "learning_rate": 5.0747663551401875e-06, "loss": 1.1534, "mean_token_accuracy": 0.5600371100008488, "num_tokens": 539326.0, "step": 11200 }, { "epoch": 21.11105882352941, "grad_norm": 0.7871003150939941, "learning_rate": 5.061415220293725e-06, "loss": 1.0991, "mean_token_accuracy": 0.5748973291367292, "num_tokens": 553010.0, "step": 11210 }, { "epoch": 21.129882352941177, "grad_norm": 1.2488114833831787, "learning_rate": 5.048064085447263e-06, "loss": 1.1, "mean_token_accuracy": 0.575805452466011, "num_tokens": 566192.0, "step": 11220 }, { "epoch": 21.148705882352942, "grad_norm": 0.6213930249214172, "learning_rate": 5.034712950600802e-06, "loss": 1.1342, "mean_token_accuracy": 0.5743603181093931, "num_tokens": 579897.0, "step": 11230 }, { "epoch": 21.167529411764704, "grad_norm": 0.6450327634811401, "learning_rate": 5.0213618157543396e-06, "loss": 1.197, "mean_token_accuracy": 0.5464676439762115, "num_tokens": 594375.0, "step": 11240 }, { "epoch": 21.18635294117647, "grad_norm": 1.1492058038711548, "learning_rate": 5.008010680907878e-06, "loss": 1.1822, "mean_token_accuracy": 0.5506179232150317, "num_tokens": 607333.0, "step": 11250 }, { "epoch": 21.205176470588235, "grad_norm": 1.0695908069610596, "learning_rate": 4.994659546061415e-06, "loss": 1.1359, "mean_token_accuracy": 0.5681717403233051, "num_tokens": 620561.0, "step": 11260 }, { "epoch": 21.224, "grad_norm": 0.608024001121521, "learning_rate": 4.9813084112149534e-06, "loss": 1.1696, "mean_token_accuracy": 0.5604459267109633, "num_tokens": 634382.0, "step": 11270 }, { "epoch": 21.242823529411766, "grad_norm": 0.6441075801849365, "learning_rate": 4.967957276368492e-06, "loss": 1.144, "mean_token_accuracy": 0.5593028951436281, "num_tokens": 646437.0, "step": 11280 }, { "epoch": 21.26164705882353, "grad_norm": 1.208881139755249, "learning_rate": 4.95460614152203e-06, "loss": 1.1171, "mean_token_accuracy": 0.5644662406295538, "num_tokens": 659651.0, "step": 11290 }, { "epoch": 21.280470588235293, "grad_norm": 1.3741132020950317, "learning_rate": 4.941255006675567e-06, "loss": 1.1408, "mean_token_accuracy": 0.5656964641064406, "num_tokens": 673643.0, "step": 11300 }, { "epoch": 21.299294117647058, "grad_norm": 1.710774302482605, "learning_rate": 4.927903871829106e-06, "loss": 1.1746, "mean_token_accuracy": 0.5554309643805027, "num_tokens": 686757.0, "step": 11310 }, { "epoch": 21.318117647058823, "grad_norm": 0.5914443731307983, "learning_rate": 4.914552736982644e-06, "loss": 1.1724, "mean_token_accuracy": 0.5584981873631477, "num_tokens": 701156.0, "step": 11320 }, { "epoch": 21.33694117647059, "grad_norm": 0.6047216653823853, "learning_rate": 4.901201602136182e-06, "loss": 1.1493, "mean_token_accuracy": 0.570811814814806, "num_tokens": 714373.0, "step": 11330 }, { "epoch": 21.355764705882354, "grad_norm": 1.1371484994888306, "learning_rate": 4.887850467289719e-06, "loss": 1.1611, "mean_token_accuracy": 0.5554365783929824, "num_tokens": 727980.0, "step": 11340 }, { "epoch": 21.37458823529412, "grad_norm": 0.6046891212463379, "learning_rate": 4.8744993324432585e-06, "loss": 1.1768, "mean_token_accuracy": 0.5487030290067196, "num_tokens": 741661.0, "step": 11350 }, { "epoch": 21.39341176470588, "grad_norm": 1.0406830310821533, "learning_rate": 4.861148197596796e-06, "loss": 1.103, "mean_token_accuracy": 0.5742270287126303, "num_tokens": 754818.0, "step": 11360 }, { "epoch": 21.412235294117647, "grad_norm": 1.8457794189453125, "learning_rate": 4.847797062750334e-06, "loss": 1.1727, "mean_token_accuracy": 0.5553502965718508, "num_tokens": 767996.0, "step": 11370 }, { "epoch": 21.431058823529412, "grad_norm": 1.229186773300171, "learning_rate": 4.834445927903872e-06, "loss": 1.1148, "mean_token_accuracy": 0.5711336594074965, "num_tokens": 781314.0, "step": 11380 }, { "epoch": 21.449882352941177, "grad_norm": 2.4556403160095215, "learning_rate": 4.8210947930574106e-06, "loss": 1.2005, "mean_token_accuracy": 0.5530955422669649, "num_tokens": 794540.0, "step": 11390 }, { "epoch": 21.468705882352943, "grad_norm": 0.9994168281555176, "learning_rate": 4.807743658210948e-06, "loss": 1.1588, "mean_token_accuracy": 0.5520365055650472, "num_tokens": 807577.0, "step": 11400 }, { "epoch": 11.06690909090909, "grad_norm": 2.4615001678466797, "learning_rate": 2.7042869240445714e-06, "loss": 0.9921, "mean_token_accuracy": 0.6851526271551848, "num_tokens": 10722.0, "step": 11410 }, { "epoch": 11.07660606060606, "grad_norm": 2.1516900062561035, "learning_rate": 2.689960187285652e-06, "loss": 0.9928, "mean_token_accuracy": 0.6662691086530685, "num_tokens": 21293.0, "step": 11420 }, { "epoch": 11.08630303030303, "grad_norm": 1.547285556793213, "learning_rate": 2.675665601616777e-06, "loss": 0.9335, "mean_token_accuracy": 0.6725459590554237, "num_tokens": 32220.0, "step": 11430 }, { "epoch": 11.096, "grad_norm": 1.2127219438552856, "learning_rate": 2.6614032299085324e-06, "loss": 1.0357, "mean_token_accuracy": 0.6421669337898492, "num_tokens": 43405.0, "step": 11440 }, { "epoch": 11.10569696969697, "grad_norm": 2.609590530395508, "learning_rate": 2.647173134889831e-06, "loss": 0.955, "mean_token_accuracy": 0.6785120502114296, "num_tokens": 53503.0, "step": 11450 }, { "epoch": 11.11539393939394, "grad_norm": 2.0895426273345947, "learning_rate": 2.6329753791476143e-06, "loss": 0.8958, "mean_token_accuracy": 0.6950253710150719, "num_tokens": 62932.0, "step": 11460 }, { "epoch": 11.12509090909091, "grad_norm": 1.0643393993377686, "learning_rate": 2.6188100251265947e-06, "loss": 0.965, "mean_token_accuracy": 0.6781762517988682, "num_tokens": 73725.0, "step": 11470 }, { "epoch": 11.13478787878788, "grad_norm": 0.9910470843315125, "learning_rate": 2.604677135128972e-06, "loss": 0.9692, "mean_token_accuracy": 0.6687733806669712, "num_tokens": 84995.0, "step": 11480 }, { "epoch": 11.144484848484849, "grad_norm": 1.3115955591201782, "learning_rate": 2.590576771314166e-06, "loss": 0.9748, "mean_token_accuracy": 0.6800978854298592, "num_tokens": 95231.0, "step": 11490 }, { "epoch": 11.154181818181819, "grad_norm": 1.7872004508972168, "learning_rate": 2.5765089956985357e-06, "loss": 0.857, "mean_token_accuracy": 0.7096298310905695, "num_tokens": 104730.0, "step": 11500 }, { "epoch": 11.163878787878788, "grad_norm": 1.7525864839553833, "learning_rate": 2.56247387015511e-06, "loss": 0.9162, "mean_token_accuracy": 0.7024530675262213, "num_tokens": 114691.0, "step": 11510 }, { "epoch": 11.173575757575758, "grad_norm": 1.5869275331497192, "learning_rate": 2.5484714564133237e-06, "loss": 0.8835, "mean_token_accuracy": 0.6953748039901256, "num_tokens": 124358.0, "step": 11520 }, { "epoch": 11.183272727272728, "grad_norm": 1.2352665662765503, "learning_rate": 2.534501816058731e-06, "loss": 0.9839, "mean_token_accuracy": 0.6811668451875448, "num_tokens": 135440.0, "step": 11530 }, { "epoch": 11.192969696969698, "grad_norm": 0.9480632543563843, "learning_rate": 2.5205650105327405e-06, "loss": 0.9216, "mean_token_accuracy": 0.6818428047001361, "num_tokens": 145555.0, "step": 11540 }, { "epoch": 11.202666666666667, "grad_norm": 1.0198525190353394, "learning_rate": 2.5066611011323505e-06, "loss": 0.9813, "mean_token_accuracy": 0.6706477042287589, "num_tokens": 157378.0, "step": 11550 }, { "epoch": 11.212363636363637, "grad_norm": 0.9845815896987915, "learning_rate": 2.4927901490098762e-06, "loss": 0.9984, "mean_token_accuracy": 0.6655347641557455, "num_tokens": 168558.0, "step": 11560 }, { "epoch": 11.222060606060605, "grad_norm": 1.9387283325195312, "learning_rate": 2.4789522151726764e-06, "loss": 0.9677, "mean_token_accuracy": 0.6992917202413083, "num_tokens": 179836.0, "step": 11570 }, { "epoch": 11.231757575757575, "grad_norm": 1.2270708084106445, "learning_rate": 2.4651473604828903e-06, "loss": 0.9466, "mean_token_accuracy": 0.675117377564311, "num_tokens": 190256.0, "step": 11580 }, { "epoch": 11.241454545454545, "grad_norm": 1.0312174558639526, "learning_rate": 2.4513756456571667e-06, "loss": 0.9776, "mean_token_accuracy": 0.6721729058772326, "num_tokens": 201487.0, "step": 11590 }, { "epoch": 11.251151515151514, "grad_norm": 1.4611775875091553, "learning_rate": 2.437637131266396e-06, "loss": 0.9016, "mean_token_accuracy": 0.6916485130786896, "num_tokens": 210683.0, "step": 11600 }, { "epoch": 11.260848484848484, "grad_norm": 1.251896858215332, "learning_rate": 2.4239318777354593e-06, "loss": 0.9099, "mean_token_accuracy": 0.6964191533625126, "num_tokens": 220616.0, "step": 11610 }, { "epoch": 11.270545454545454, "grad_norm": 1.1485376358032227, "learning_rate": 2.410259945342929e-06, "loss": 0.9313, "mean_token_accuracy": 0.6866546850651503, "num_tokens": 231221.0, "step": 11620 }, { "epoch": 11.280242424242424, "grad_norm": 1.1417745351791382, "learning_rate": 2.3966213942208363e-06, "loss": 0.8653, "mean_token_accuracy": 0.7093327675014734, "num_tokens": 241191.0, "step": 11630 }, { "epoch": 11.289939393939393, "grad_norm": 1.0089123249053955, "learning_rate": 2.383016284354397e-06, "loss": 0.9424, "mean_token_accuracy": 0.6817425429821015, "num_tokens": 250917.0, "step": 11640 }, { "epoch": 11.299636363636363, "grad_norm": 1.3917380571365356, "learning_rate": 2.369444675581738e-06, "loss": 0.9331, "mean_token_accuracy": 0.6756290566176176, "num_tokens": 261760.0, "step": 11650 }, { "epoch": 11.309333333333333, "grad_norm": 1.3879166841506958, "learning_rate": 2.355906627593647e-06, "loss": 0.9807, "mean_token_accuracy": 0.6554592750966549, "num_tokens": 272414.0, "step": 11660 }, { "epoch": 11.319030303030303, "grad_norm": 1.2711937427520752, "learning_rate": 2.342402199933296e-06, "loss": 0.8541, "mean_token_accuracy": 0.7178003009408712, "num_tokens": 282396.0, "step": 11670 }, { "epoch": 11.328727272727273, "grad_norm": 0.8704134821891785, "learning_rate": 2.3289314519960016e-06, "loss": 0.9997, "mean_token_accuracy": 0.6554147530347109, "num_tokens": 292871.0, "step": 11680 }, { "epoch": 11.338424242424242, "grad_norm": 1.9305732250213623, "learning_rate": 2.315494443028937e-06, "loss": 0.9644, "mean_token_accuracy": 0.6782477792352438, "num_tokens": 303864.0, "step": 11690 }, { "epoch": 11.348121212121212, "grad_norm": 1.0496376752853394, "learning_rate": 2.30209123213089e-06, "loss": 0.9606, "mean_token_accuracy": 0.6711665719747544, "num_tokens": 314081.0, "step": 11700 }, { "epoch": 11.357818181818182, "grad_norm": 1.2640137672424316, "learning_rate": 2.288721878251996e-06, "loss": 0.9216, "mean_token_accuracy": 0.6923185490071774, "num_tokens": 324585.0, "step": 11710 }, { "epoch": 11.367515151515152, "grad_norm": 1.2028977870941162, "learning_rate": 2.275386440193479e-06, "loss": 0.8991, "mean_token_accuracy": 0.70025773383677, "num_tokens": 334337.0, "step": 11720 }, { "epoch": 11.377212121212121, "grad_norm": 1.366431713104248, "learning_rate": 2.2620849766073993e-06, "loss": 0.9379, "mean_token_accuracy": 0.6744892597198486, "num_tokens": 344863.0, "step": 11730 }, { "epoch": 11.386909090909091, "grad_norm": 1.1718578338623047, "learning_rate": 2.248817545996387e-06, "loss": 0.8827, "mean_token_accuracy": 0.7153472680598497, "num_tokens": 354780.0, "step": 11740 }, { "epoch": 11.39660606060606, "grad_norm": 1.7113317251205444, "learning_rate": 2.235584206713385e-06, "loss": 0.9333, "mean_token_accuracy": 0.6829107455909252, "num_tokens": 365921.0, "step": 11750 }, { "epoch": 11.40630303030303, "grad_norm": 0.9357189536094666, "learning_rate": 2.2223850169613993e-06, "loss": 0.9788, "mean_token_accuracy": 0.6629223726689816, "num_tokens": 376384.0, "step": 11760 }, { "epoch": 11.416, "grad_norm": 1.9502002000808716, "learning_rate": 2.209220034793237e-06, "loss": 0.9126, "mean_token_accuracy": 0.6891988046467304, "num_tokens": 385616.0, "step": 11770 }, { "epoch": 11.42569696969697, "grad_norm": 0.9912474751472473, "learning_rate": 2.1960893181112553e-06, "loss": 0.9927, "mean_token_accuracy": 0.6538973189890385, "num_tokens": 396111.0, "step": 11780 }, { "epoch": 11.43539393939394, "grad_norm": 1.6034260988235474, "learning_rate": 2.182992924667101e-06, "loss": 0.9853, "mean_token_accuracy": 0.658538245409727, "num_tokens": 407225.0, "step": 11790 }, { "epoch": 11.44509090909091, "grad_norm": 0.7665310502052307, "learning_rate": 2.1699309120614663e-06, "loss": 0.9821, "mean_token_accuracy": 0.6684748906642198, "num_tokens": 417932.0, "step": 11800 }, { "epoch": 11.45478787878788, "grad_norm": 1.4279521703720093, "learning_rate": 2.1569033377438243e-06, "loss": 1.0331, "mean_token_accuracy": 0.6504704430699348, "num_tokens": 429943.0, "step": 11810 }, { "epoch": 11.46448484848485, "grad_norm": 1.4924397468566895, "learning_rate": 2.1439102590121807e-06, "loss": 0.9594, "mean_token_accuracy": 0.6776580080389977, "num_tokens": 440423.0, "step": 11820 }, { "epoch": 11.474181818181819, "grad_norm": 1.0187861919403076, "learning_rate": 2.1309517330128217e-06, "loss": 1.0211, "mean_token_accuracy": 0.6710641365498304, "num_tokens": 451528.0, "step": 11830 }, { "epoch": 11.483878787878789, "grad_norm": 1.2168591022491455, "learning_rate": 2.1180278167400726e-06, "loss": 0.9256, "mean_token_accuracy": 0.6835467047989369, "num_tokens": 461950.0, "step": 11840 }, { "epoch": 11.493575757575758, "grad_norm": 0.7139029502868652, "learning_rate": 2.105138567036026e-06, "loss": 0.984, "mean_token_accuracy": 0.6716462299227715, "num_tokens": 472203.0, "step": 11850 }, { "epoch": 11.503272727272726, "grad_norm": 0.9237242341041565, "learning_rate": 2.09228404059031e-06, "loss": 1.0324, "mean_token_accuracy": 0.653912478685379, "num_tokens": 484302.0, "step": 11860 }, { "epoch": 11.512969696969696, "grad_norm": 1.539085865020752, "learning_rate": 2.0794642939398315e-06, "loss": 0.9019, "mean_token_accuracy": 0.6944774236530066, "num_tokens": 494362.0, "step": 11870 }, { "epoch": 11.522666666666666, "grad_norm": 0.7662031054496765, "learning_rate": 2.066679383468524e-06, "loss": 0.9687, "mean_token_accuracy": 0.6761994324624538, "num_tokens": 505137.0, "step": 11880 }, { "epoch": 11.532363636363636, "grad_norm": 1.0061405897140503, "learning_rate": 2.0539293654071167e-06, "loss": 0.9671, "mean_token_accuracy": 0.669762023538351, "num_tokens": 515456.0, "step": 11890 }, { "epoch": 11.542060606060605, "grad_norm": 1.5532357692718506, "learning_rate": 2.0412142958328586e-06, "loss": 0.8768, "mean_token_accuracy": 0.7063564002513886, "num_tokens": 525883.0, "step": 11900 }, { "epoch": 11.551757575757575, "grad_norm": 0.8483320474624634, "learning_rate": 2.028534230669296e-06, "loss": 0.9502, "mean_token_accuracy": 0.6627003367990255, "num_tokens": 537384.0, "step": 11910 }, { "epoch": 11.561454545454545, "grad_norm": 0.933795154094696, "learning_rate": 2.015889225686022e-06, "loss": 0.9788, "mean_token_accuracy": 0.6608807422220707, "num_tokens": 548906.0, "step": 11920 }, { "epoch": 11.571151515151515, "grad_norm": 2.9793217182159424, "learning_rate": 2.0032793364984225e-06, "loss": 0.9872, "mean_token_accuracy": 0.6697162009775639, "num_tokens": 559505.0, "step": 11930 }, { "epoch": 11.580848484848485, "grad_norm": 1.162315845489502, "learning_rate": 1.9907046185674374e-06, "loss": 0.8945, "mean_token_accuracy": 0.7048578035086394, "num_tokens": 569706.0, "step": 11940 }, { "epoch": 11.590545454545454, "grad_norm": 0.6796969175338745, "learning_rate": 1.978165127199313e-06, "loss": 1.0175, "mean_token_accuracy": 0.6569722048938275, "num_tokens": 580920.0, "step": 11950 }, { "epoch": 11.600242424242424, "grad_norm": 0.8585827946662903, "learning_rate": 1.9656609175453724e-06, "loss": 0.9786, "mean_token_accuracy": 0.6617213696241379, "num_tokens": 591801.0, "step": 11960 }, { "epoch": 11.609939393939394, "grad_norm": 2.407949924468994, "learning_rate": 1.9531920446017514e-06, "loss": 0.8688, "mean_token_accuracy": 0.7031644247472286, "num_tokens": 601494.0, "step": 11970 }, { "epoch": 11.619636363636364, "grad_norm": 1.2153403759002686, "learning_rate": 1.940758563209172e-06, "loss": 0.9819, "mean_token_accuracy": 0.667534577473998, "num_tokens": 612759.0, "step": 11980 }, { "epoch": 11.629333333333333, "grad_norm": 1.6503994464874268, "learning_rate": 1.928360528052695e-06, "loss": 0.9575, "mean_token_accuracy": 0.6706000864505768, "num_tokens": 623931.0, "step": 11990 }, { "epoch": 11.639030303030303, "grad_norm": 2.4140963554382324, "learning_rate": 1.9159979936614813e-06, "loss": 0.927, "mean_token_accuracy": 0.6892816323786974, "num_tokens": 634238.0, "step": 12000 }, { "epoch": 11.648727272727273, "grad_norm": 1.6788582801818848, "learning_rate": 1.9036710144085568e-06, "loss": 0.9357, "mean_token_accuracy": 0.697110791504383, "num_tokens": 644919.0, "step": 12010 }, { "epoch": 11.658424242424243, "grad_norm": 1.9367320537567139, "learning_rate": 1.891379644510566e-06, "loss": 1.038, "mean_token_accuracy": 0.6408150486648083, "num_tokens": 655529.0, "step": 12020 }, { "epoch": 11.668121212121212, "grad_norm": 1.541839361190796, "learning_rate": 1.8791239380275262e-06, "loss": 0.9451, "mean_token_accuracy": 0.6883293610066176, "num_tokens": 665483.0, "step": 12030 }, { "epoch": 11.677818181818182, "grad_norm": 0.8563993573188782, "learning_rate": 1.8669039488626162e-06, "loss": 0.928, "mean_token_accuracy": 0.6791775230318308, "num_tokens": 676255.0, "step": 12040 }, { "epoch": 11.687515151515152, "grad_norm": 1.097931981086731, "learning_rate": 1.8547197307619102e-06, "loss": 1.0107, "mean_token_accuracy": 0.645743177831173, "num_tokens": 687475.0, "step": 12050 }, { "epoch": 11.697212121212122, "grad_norm": 1.8921289443969727, "learning_rate": 1.8425713373141597e-06, "loss": 0.9458, "mean_token_accuracy": 0.689141795784235, "num_tokens": 697582.0, "step": 12060 }, { "epoch": 11.706909090909091, "grad_norm": 1.5931812524795532, "learning_rate": 1.830458821950546e-06, "loss": 0.8759, "mean_token_accuracy": 0.6943042069673538, "num_tokens": 707522.0, "step": 12070 }, { "epoch": 11.716606060606061, "grad_norm": 1.7966309785842896, "learning_rate": 1.8183822379444604e-06, "loss": 0.9538, "mean_token_accuracy": 0.6829646300524473, "num_tokens": 718288.0, "step": 12080 }, { "epoch": 11.726303030303031, "grad_norm": 1.5532159805297852, "learning_rate": 1.8063416384112532e-06, "loss": 0.9123, "mean_token_accuracy": 0.6941913302987814, "num_tokens": 727672.0, "step": 12090 }, { "epoch": 11.736, "grad_norm": 0.9339669942855835, "learning_rate": 1.7943370763080093e-06, "loss": 0.9778, "mean_token_accuracy": 0.652100894600153, "num_tokens": 738798.0, "step": 12100 }, { "epoch": 11.74569696969697, "grad_norm": 1.5431864261627197, "learning_rate": 1.7823686044333134e-06, "loss": 0.9883, "mean_token_accuracy": 0.6828689679503441, "num_tokens": 749829.0, "step": 12110 }, { "epoch": 11.75539393939394, "grad_norm": 1.6083624362945557, "learning_rate": 1.7704362754270143e-06, "loss": 0.8534, "mean_token_accuracy": 0.7035974383354187, "num_tokens": 759547.0, "step": 12120 }, { "epoch": 11.765090909090908, "grad_norm": 1.150327444076538, "learning_rate": 1.7585401417700076e-06, "loss": 0.8699, "mean_token_accuracy": 0.6958384934812785, "num_tokens": 769350.0, "step": 12130 }, { "epoch": 11.77478787878788, "grad_norm": 1.8415894508361816, "learning_rate": 1.7466802557839834e-06, "loss": 0.9496, "mean_token_accuracy": 0.6662904676049948, "num_tokens": 780015.0, "step": 12140 }, { "epoch": 11.784484848484848, "grad_norm": 2.885213851928711, "learning_rate": 1.7348566696312108e-06, "loss": 0.9546, "mean_token_accuracy": 0.6689011044800282, "num_tokens": 790176.0, "step": 12150 }, { "epoch": 11.794181818181817, "grad_norm": 1.567074179649353, "learning_rate": 1.7230694353143041e-06, "loss": 0.9247, "mean_token_accuracy": 0.69071399345994, "num_tokens": 801125.0, "step": 12160 }, { "epoch": 11.803878787878787, "grad_norm": 0.8478710651397705, "learning_rate": 1.7113186046759956e-06, "loss": 0.9342, "mean_token_accuracy": 0.6843322183936834, "num_tokens": 811193.0, "step": 12170 }, { "epoch": 11.813575757575757, "grad_norm": 1.2154415845870972, "learning_rate": 1.6996042293989046e-06, "loss": 0.8842, "mean_token_accuracy": 0.7175424035638571, "num_tokens": 822012.0, "step": 12180 }, { "epoch": 11.823272727272727, "grad_norm": 1.4030102491378784, "learning_rate": 1.6879263610053109e-06, "loss": 0.9184, "mean_token_accuracy": 0.6776260420680046, "num_tokens": 832565.0, "step": 12190 }, { "epoch": 11.832969696969696, "grad_norm": 0.6021126508712769, "learning_rate": 1.6762850508569383e-06, "loss": 0.9, "mean_token_accuracy": 0.6897207599133253, "num_tokens": 843002.0, "step": 12200 }, { "epoch": 11.842666666666666, "grad_norm": 1.1585458517074585, "learning_rate": 1.6646803501547104e-06, "loss": 0.9101, "mean_token_accuracy": 0.6929288487881422, "num_tokens": 853533.0, "step": 12210 }, { "epoch": 11.852363636363636, "grad_norm": 0.7529911398887634, "learning_rate": 1.653112309938537e-06, "loss": 0.9389, "mean_token_accuracy": 0.6742060914635658, "num_tokens": 864186.0, "step": 12220 }, { "epoch": 11.862060606060606, "grad_norm": 1.9934836626052856, "learning_rate": 1.6415809810870854e-06, "loss": 0.9119, "mean_token_accuracy": 0.6902973093092442, "num_tokens": 875834.0, "step": 12230 }, { "epoch": 11.871757575757576, "grad_norm": 1.0597401857376099, "learning_rate": 1.6300864143175665e-06, "loss": 0.9318, "mean_token_accuracy": 0.6818343084305525, "num_tokens": 886161.0, "step": 12240 }, { "epoch": 11.881454545454545, "grad_norm": 1.6065829992294312, "learning_rate": 1.6186286601854962e-06, "loss": 0.9453, "mean_token_accuracy": 0.6795123651623726, "num_tokens": 895476.0, "step": 12250 }, { "epoch": 11.891151515151515, "grad_norm": 1.7515671253204346, "learning_rate": 1.6072077690844824e-06, "loss": 0.9692, "mean_token_accuracy": 0.6692648060619831, "num_tokens": 906427.0, "step": 12260 }, { "epoch": 11.900848484848485, "grad_norm": 1.2341034412384033, "learning_rate": 1.5958237912460028e-06, "loss": 0.9293, "mean_token_accuracy": 0.6872673355042934, "num_tokens": 916474.0, "step": 12270 }, { "epoch": 11.910545454545455, "grad_norm": 1.444577693939209, "learning_rate": 1.5844767767391799e-06, "loss": 0.8847, "mean_token_accuracy": 0.6981671530753374, "num_tokens": 926428.0, "step": 12280 }, { "epoch": 11.920242424242424, "grad_norm": 1.0555857419967651, "learning_rate": 1.5731667754705716e-06, "loss": 0.9736, "mean_token_accuracy": 0.6644821926951409, "num_tokens": 936011.0, "step": 12290 }, { "epoch": 11.929939393939394, "grad_norm": 1.8039774894714355, "learning_rate": 1.5618938371839366e-06, "loss": 0.9717, "mean_token_accuracy": 0.6748083829879761, "num_tokens": 946502.0, "step": 12300 }, { "epoch": 11.939636363636364, "grad_norm": 1.572402834892273, "learning_rate": 1.550658011460019e-06, "loss": 0.9184, "mean_token_accuracy": 0.7099429033696651, "num_tokens": 956754.0, "step": 12310 }, { "epoch": 11.949333333333334, "grad_norm": 0.8282158374786377, "learning_rate": 1.5394593477163456e-06, "loss": 0.8845, "mean_token_accuracy": 0.7014227926731109, "num_tokens": 966536.0, "step": 12320 }, { "epoch": 11.959030303030303, "grad_norm": 1.0083385705947876, "learning_rate": 1.5282978952069904e-06, "loss": 0.9763, "mean_token_accuracy": 0.670442745834589, "num_tokens": 976841.0, "step": 12330 }, { "epoch": 11.968727272727273, "grad_norm": 2.277254581451416, "learning_rate": 1.5171737030223632e-06, "loss": 1.0049, "mean_token_accuracy": 0.6729221884161234, "num_tokens": 987549.0, "step": 12340 }, { "epoch": 11.978424242424243, "grad_norm": 0.8625606894493103, "learning_rate": 1.5060868200889955e-06, "loss": 0.8779, "mean_token_accuracy": 0.7104882929474116, "num_tokens": 996977.0, "step": 12350 }, { "epoch": 11.988121212121213, "grad_norm": 1.0558991432189941, "learning_rate": 1.4950372951693316e-06, "loss": 0.9398, "mean_token_accuracy": 0.678413325548172, "num_tokens": 1007009.0, "step": 12360 }, { "epoch": 11.997818181818182, "grad_norm": 1.0509843826293945, "learning_rate": 1.4840251768614987e-06, "loss": 0.8752, "mean_token_accuracy": 0.7061600238084793, "num_tokens": 1016337.0, "step": 12370 }, { "epoch": 12.007757575757577, "grad_norm": 1.150305986404419, "learning_rate": 1.473050513599107e-06, "loss": 1.0264, "mean_token_accuracy": 0.6960650755137932, "num_tokens": 1027200.0, "step": 12380 }, { "epoch": 12.017454545454546, "grad_norm": 0.8161555528640747, "learning_rate": 1.462113353651029e-06, "loss": 0.9927, "mean_token_accuracy": 0.6727097641676665, "num_tokens": 1037572.0, "step": 12390 }, { "epoch": 12.027151515151516, "grad_norm": 1.7541327476501465, "learning_rate": 1.4512137451211884e-06, "loss": 0.8799, "mean_token_accuracy": 0.6972331315279007, "num_tokens": 1046891.0, "step": 12400 }, { "epoch": 12.036848484848484, "grad_norm": 0.9530600309371948, "learning_rate": 1.4403517359483577e-06, "loss": 0.9435, "mean_token_accuracy": 0.68136284686625, "num_tokens": 10434.0, "step": 12410 }, { "epoch": 12.046545454545454, "grad_norm": 1.3567638397216797, "learning_rate": 1.42952737390593e-06, "loss": 0.9598, "mean_token_accuracy": 0.676530422642827, "num_tokens": 22038.0, "step": 12420 }, { "epoch": 12.056242424242424, "grad_norm": 1.3663750886917114, "learning_rate": 1.4187407066017245e-06, "loss": 0.9701, "mean_token_accuracy": 0.6606147531419992, "num_tokens": 32741.0, "step": 12430 }, { "epoch": 12.065939393939393, "grad_norm": 1.086794376373291, "learning_rate": 1.4079917814777667e-06, "loss": 0.8693, "mean_token_accuracy": 0.7050681680440902, "num_tokens": 41500.0, "step": 12440 }, { "epoch": 12.075636363636363, "grad_norm": 0.9989749193191528, "learning_rate": 1.3972806458100885e-06, "loss": 0.8294, "mean_token_accuracy": 0.7239202216267586, "num_tokens": 50782.0, "step": 12450 }, { "epoch": 12.085333333333333, "grad_norm": 1.2325557470321655, "learning_rate": 1.3866073467085127e-06, "loss": 0.8807, "mean_token_accuracy": 0.6962772708386182, "num_tokens": 60816.0, "step": 12460 }, { "epoch": 12.095030303030303, "grad_norm": 1.5396286249160767, "learning_rate": 1.3759719311164477e-06, "loss": 0.995, "mean_token_accuracy": 0.6646735660731793, "num_tokens": 71812.0, "step": 12470 }, { "epoch": 12.104727272727272, "grad_norm": 1.008445382118225, "learning_rate": 1.3653744458106876e-06, "loss": 0.9715, "mean_token_accuracy": 0.6767258770763874, "num_tokens": 83843.0, "step": 12480 }, { "epoch": 12.114424242424242, "grad_norm": 1.6044663190841675, "learning_rate": 1.3548149374011986e-06, "loss": 0.9437, "mean_token_accuracy": 0.701893288269639, "num_tokens": 94582.0, "step": 12490 }, { "epoch": 12.124121212121212, "grad_norm": 1.4867864847183228, "learning_rate": 1.3442934523309137e-06, "loss": 0.9598, "mean_token_accuracy": 0.678819801285863, "num_tokens": 106037.0, "step": 12500 }, { "epoch": 12.133818181818182, "grad_norm": 1.6262177228927612, "learning_rate": 1.3338100368755346e-06, "loss": 0.9829, "mean_token_accuracy": 0.6883869960904121, "num_tokens": 117393.0, "step": 12510 }, { "epoch": 12.143515151515151, "grad_norm": 2.58561635017395, "learning_rate": 1.3233647371433222e-06, "loss": 0.9191, "mean_token_accuracy": 0.6864805597811937, "num_tokens": 127326.0, "step": 12520 }, { "epoch": 12.153212121212121, "grad_norm": 1.6916279792785645, "learning_rate": 1.3129575990749e-06, "loss": 0.9546, "mean_token_accuracy": 0.6888086255639791, "num_tokens": 137539.0, "step": 12530 }, { "epoch": 12.162909090909091, "grad_norm": 1.5663442611694336, "learning_rate": 1.3025886684430467e-06, "loss": 0.9829, "mean_token_accuracy": 0.6744012456387282, "num_tokens": 148648.0, "step": 12540 }, { "epoch": 12.17260606060606, "grad_norm": 1.4812220335006714, "learning_rate": 1.2922579908524946e-06, "loss": 0.9033, "mean_token_accuracy": 0.7043029896914959, "num_tokens": 158590.0, "step": 12550 }, { "epoch": 12.18230303030303, "grad_norm": 1.7226941585540771, "learning_rate": 1.2819656117397328e-06, "loss": 0.9398, "mean_token_accuracy": 0.6753247026354074, "num_tokens": 170030.0, "step": 12560 }, { "epoch": 12.192, "grad_norm": 0.7470999956130981, "learning_rate": 1.2717115763728083e-06, "loss": 0.9315, "mean_token_accuracy": 0.6908956177532672, "num_tokens": 179668.0, "step": 12570 }, { "epoch": 12.20169696969697, "grad_norm": 1.0085124969482422, "learning_rate": 1.2614959298511231e-06, "loss": 0.9663, "mean_token_accuracy": 0.6647142685949803, "num_tokens": 190351.0, "step": 12580 }, { "epoch": 12.21139393939394, "grad_norm": 0.801249623298645, "learning_rate": 1.2513187171052288e-06, "loss": 1.0055, "mean_token_accuracy": 0.6591165266931057, "num_tokens": 200784.0, "step": 12590 }, { "epoch": 12.22109090909091, "grad_norm": 1.1452405452728271, "learning_rate": 1.2411799828966497e-06, "loss": 0.9374, "mean_token_accuracy": 0.6782014291733504, "num_tokens": 210672.0, "step": 12600 }, { "epoch": 12.23078787878788, "grad_norm": 1.4320217370986938, "learning_rate": 1.2310797718176658e-06, "loss": 0.9016, "mean_token_accuracy": 0.6874732073396445, "num_tokens": 220175.0, "step": 12610 }, { "epoch": 12.240484848484849, "grad_norm": 1.0549358129501343, "learning_rate": 1.221018128291127e-06, "loss": 0.9145, "mean_token_accuracy": 0.6930529691278935, "num_tokens": 230511.0, "step": 12620 }, { "epoch": 12.250181818181819, "grad_norm": 0.7888785004615784, "learning_rate": 1.2109950965702532e-06, "loss": 0.962, "mean_token_accuracy": 0.6716390445828437, "num_tokens": 240893.0, "step": 12630 }, { "epoch": 12.259878787878788, "grad_norm": 2.5039796829223633, "learning_rate": 1.2010107207384437e-06, "loss": 0.8335, "mean_token_accuracy": 0.7178800087422132, "num_tokens": 250554.0, "step": 12640 }, { "epoch": 12.269575757575758, "grad_norm": 1.5427664518356323, "learning_rate": 1.1910650447090798e-06, "loss": 1.0129, "mean_token_accuracy": 0.6641611870378256, "num_tokens": 261026.0, "step": 12650 }, { "epoch": 12.279272727272728, "grad_norm": 1.7952816486358643, "learning_rate": 1.1811581122253335e-06, "loss": 0.9801, "mean_token_accuracy": 0.6731622900813818, "num_tokens": 271855.0, "step": 12660 }, { "epoch": 12.288969696969698, "grad_norm": 1.4959173202514648, "learning_rate": 1.171289966859973e-06, "loss": 0.9324, "mean_token_accuracy": 0.6812974836677312, "num_tokens": 281878.0, "step": 12670 }, { "epoch": 12.298666666666668, "grad_norm": 0.7014359831809998, "learning_rate": 1.1614606520151716e-06, "loss": 0.9333, "mean_token_accuracy": 0.6856089878827334, "num_tokens": 292658.0, "step": 12680 }, { "epoch": 12.308363636363636, "grad_norm": 0.6972899436950684, "learning_rate": 1.1516702109223243e-06, "loss": 0.8949, "mean_token_accuracy": 0.6989801757037639, "num_tokens": 302011.0, "step": 12690 }, { "epoch": 12.318060606060605, "grad_norm": 1.2687288522720337, "learning_rate": 1.1419186866418452e-06, "loss": 0.9406, "mean_token_accuracy": 0.6923393607139587, "num_tokens": 312147.0, "step": 12700 }, { "epoch": 12.327757575757575, "grad_norm": 1.3525540828704834, "learning_rate": 1.1322061220629855e-06, "loss": 0.8962, "mean_token_accuracy": 0.684671938046813, "num_tokens": 323474.0, "step": 12710 }, { "epoch": 12.337454545454545, "grad_norm": 1.2294106483459473, "learning_rate": 1.122532559903644e-06, "loss": 1.0468, "mean_token_accuracy": 0.628922751918435, "num_tokens": 334923.0, "step": 12720 }, { "epoch": 12.347151515151515, "grad_norm": 1.096246600151062, "learning_rate": 1.1128980427101766e-06, "loss": 0.9059, "mean_token_accuracy": 0.683203124627471, "num_tokens": 344506.0, "step": 12730 }, { "epoch": 12.356848484848484, "grad_norm": 1.3699408769607544, "learning_rate": 1.1033026128572156e-06, "loss": 0.9996, "mean_token_accuracy": 0.6612563081085682, "num_tokens": 355007.0, "step": 12740 }, { "epoch": 12.366545454545454, "grad_norm": 1.7355482578277588, "learning_rate": 1.0937463125474724e-06, "loss": 0.9761, "mean_token_accuracy": 0.6721325762569904, "num_tokens": 365829.0, "step": 12750 }, { "epoch": 12.376242424242424, "grad_norm": 2.603883981704712, "learning_rate": 1.084229183811566e-06, "loss": 0.9383, "mean_token_accuracy": 0.6898716945201159, "num_tokens": 376436.0, "step": 12760 }, { "epoch": 12.385939393939394, "grad_norm": 1.0586647987365723, "learning_rate": 1.0747512685078264e-06, "loss": 0.9564, "mean_token_accuracy": 0.6626970659941435, "num_tokens": 387389.0, "step": 12770 }, { "epoch": 12.395636363636363, "grad_norm": 1.6182094812393188, "learning_rate": 1.0653126083221143e-06, "loss": 0.9003, "mean_token_accuracy": 0.6970617674291134, "num_tokens": 397693.0, "step": 12780 }, { "epoch": 12.405333333333333, "grad_norm": 1.9159958362579346, "learning_rate": 1.05591324476764e-06, "loss": 0.9911, "mean_token_accuracy": 0.6668812599033117, "num_tokens": 408904.0, "step": 12790 }, { "epoch": 12.415030303030303, "grad_norm": 1.2994496822357178, "learning_rate": 1.046553219184776e-06, "loss": 0.8753, "mean_token_accuracy": 0.7094687633216381, "num_tokens": 418860.0, "step": 12800 }, { "epoch": 12.424727272727273, "grad_norm": 1.3715529441833496, "learning_rate": 1.0372325727408838e-06, "loss": 0.9217, "mean_token_accuracy": 0.6802921980619431, "num_tokens": 429236.0, "step": 12810 }, { "epoch": 12.434424242424242, "grad_norm": 0.981478750705719, "learning_rate": 1.0279513464301204e-06, "loss": 0.9377, "mean_token_accuracy": 0.6767802778631449, "num_tokens": 439169.0, "step": 12820 }, { "epoch": 12.444121212121212, "grad_norm": 0.9200496077537537, "learning_rate": 1.0187095810732705e-06, "loss": 1.0123, "mean_token_accuracy": 0.6698127511888743, "num_tokens": 450423.0, "step": 12830 }, { "epoch": 12.453818181818182, "grad_norm": 1.1707184314727783, "learning_rate": 1.0095073173175552e-06, "loss": 0.9562, "mean_token_accuracy": 0.6921768002212048, "num_tokens": 461570.0, "step": 12840 }, { "epoch": 12.463515151515152, "grad_norm": 0.8096593022346497, "learning_rate": 1.0003445956364666e-06, "loss": 0.9273, "mean_token_accuracy": 0.6814159829169512, "num_tokens": 471981.0, "step": 12850 }, { "epoch": 12.473212121212121, "grad_norm": 1.0456788539886475, "learning_rate": 9.912214563295787e-07, "loss": 0.9224, "mean_token_accuracy": 0.68552374728024, "num_tokens": 482091.0, "step": 12860 }, { "epoch": 12.482909090909091, "grad_norm": 1.2879787683486938, "learning_rate": 9.821379395223684e-07, "loss": 0.9833, "mean_token_accuracy": 0.6662912800908088, "num_tokens": 493252.0, "step": 12870 }, { "epoch": 12.492606060606061, "grad_norm": 1.3891626596450806, "learning_rate": 9.730940851660554e-07, "loss": 0.9407, "mean_token_accuracy": 0.7015823908150196, "num_tokens": 504067.0, "step": 12880 }, { "epoch": 12.50230303030303, "grad_norm": 1.663533329963684, "learning_rate": 9.640899330374088e-07, "loss": 0.8911, "mean_token_accuracy": 0.6906427904963494, "num_tokens": 514270.0, "step": 12890 }, { "epoch": 12.512, "grad_norm": 1.9871175289154053, "learning_rate": 9.55125522738579e-07, "loss": 0.9259, "mean_token_accuracy": 0.6906178455799818, "num_tokens": 524617.0, "step": 12900 }, { "epoch": 12.52169696969697, "grad_norm": 0.9362130165100098, "learning_rate": 9.462008936969258e-07, "loss": 0.9653, "mean_token_accuracy": 0.6761426538228988, "num_tokens": 536008.0, "step": 12910 }, { "epoch": 12.53139393939394, "grad_norm": 1.086140513420105, "learning_rate": 9.373160851648422e-07, "loss": 0.8916, "mean_token_accuracy": 0.6984883040189743, "num_tokens": 545747.0, "step": 12920 }, { "epoch": 12.54109090909091, "grad_norm": 1.05403470993042, "learning_rate": 9.28471136219582e-07, "loss": 0.9704, "mean_token_accuracy": 0.6681053042411804, "num_tokens": 556761.0, "step": 12930 }, { "epoch": 12.55078787878788, "grad_norm": 0.9770132303237915, "learning_rate": 9.196660857630857e-07, "loss": 0.9625, "mean_token_accuracy": 0.6729031853377819, "num_tokens": 566793.0, "step": 12940 }, { "epoch": 12.56048484848485, "grad_norm": 2.408095598220825, "learning_rate": 9.109009725218165e-07, "loss": 0.9268, "mean_token_accuracy": 0.6797478631138801, "num_tokens": 577467.0, "step": 12950 }, { "epoch": 12.570181818181819, "grad_norm": 1.0821237564086914, "learning_rate": 9.021758350465804e-07, "loss": 1.0222, "mean_token_accuracy": 0.6477519739419222, "num_tokens": 588108.0, "step": 12960 }, { "epoch": 12.579878787878787, "grad_norm": 0.7974284887313843, "learning_rate": 8.93490711712367e-07, "loss": 0.9717, "mean_token_accuracy": 0.6615799587219954, "num_tokens": 598348.0, "step": 12970 }, { "epoch": 12.589575757575757, "grad_norm": 0.9920361638069153, "learning_rate": 8.848456407181715e-07, "loss": 0.9195, "mean_token_accuracy": 0.6796383894979954, "num_tokens": 607847.0, "step": 12980 }, { "epoch": 12.599272727272727, "grad_norm": 1.929929494857788, "learning_rate": 8.762406600868301e-07, "loss": 0.9424, "mean_token_accuracy": 0.6750466857105494, "num_tokens": 618641.0, "step": 12990 }, { "epoch": 12.608969696969696, "grad_norm": 0.9798093438148499, "learning_rate": 8.676758076648562e-07, "loss": 0.9802, "mean_token_accuracy": 0.6532435789704323, "num_tokens": 629445.0, "step": 13000 }, { "epoch": 12.618666666666666, "grad_norm": 1.7001301050186157, "learning_rate": 8.59151121122268e-07, "loss": 0.9055, "mean_token_accuracy": 0.6827256765216589, "num_tokens": 640368.0, "step": 13010 }, { "epoch": 12.628363636363636, "grad_norm": 1.0197906494140625, "learning_rate": 8.506666379524275e-07, "loss": 0.9016, "mean_token_accuracy": 0.6827419150620699, "num_tokens": 650484.0, "step": 13020 }, { "epoch": 12.638060606060606, "grad_norm": 2.6649887561798096, "learning_rate": 8.4222239547187e-07, "loss": 0.892, "mean_token_accuracy": 0.6925595041364432, "num_tokens": 660702.0, "step": 13030 }, { "epoch": 12.647757575757575, "grad_norm": 1.020989179611206, "learning_rate": 8.338184308201535e-07, "loss": 0.9017, "mean_token_accuracy": 0.6905462071299553, "num_tokens": 671682.0, "step": 13040 }, { "epoch": 12.657454545454545, "grad_norm": 1.4303945302963257, "learning_rate": 8.254547809596747e-07, "loss": 0.9703, "mean_token_accuracy": 0.6805687319487334, "num_tokens": 682100.0, "step": 13050 }, { "epoch": 12.667151515151515, "grad_norm": 1.8320350646972656, "learning_rate": 8.171314826755228e-07, "loss": 0.9739, "mean_token_accuracy": 0.66879703104496, "num_tokens": 692660.0, "step": 13060 }, { "epoch": 12.676848484848485, "grad_norm": 0.9438029527664185, "learning_rate": 8.088485725753114e-07, "loss": 0.9212, "mean_token_accuracy": 0.6848585486412049, "num_tokens": 702875.0, "step": 13070 }, { "epoch": 12.686545454545454, "grad_norm": 2.9450020790100098, "learning_rate": 8.006060870890165e-07, "loss": 0.876, "mean_token_accuracy": 0.6980018597096205, "num_tokens": 712292.0, "step": 13080 }, { "epoch": 12.696242424242424, "grad_norm": 1.4857258796691895, "learning_rate": 7.924040624688245e-07, "loss": 0.8641, "mean_token_accuracy": 0.7006300635635853, "num_tokens": 722244.0, "step": 13090 }, { "epoch": 12.705939393939394, "grad_norm": 1.02292799949646, "learning_rate": 7.842425347889582e-07, "loss": 0.9976, "mean_token_accuracy": 0.6541789300739765, "num_tokens": 733252.0, "step": 13100 }, { "epoch": 12.715636363636364, "grad_norm": 1.1031875610351562, "learning_rate": 7.761215399455324e-07, "loss": 0.9232, "mean_token_accuracy": 0.6899745035916567, "num_tokens": 744027.0, "step": 13110 }, { "epoch": 12.725333333333333, "grad_norm": 1.4371963739395142, "learning_rate": 7.680411136563837e-07, "loss": 0.9818, "mean_token_accuracy": 0.6558696981519461, "num_tokens": 754156.0, "step": 13120 }, { "epoch": 12.735030303030303, "grad_norm": 1.3838204145431519, "learning_rate": 7.600012914609301e-07, "loss": 0.912, "mean_token_accuracy": 0.7075278196483851, "num_tokens": 763732.0, "step": 13130 }, { "epoch": 12.744727272727273, "grad_norm": 0.7802479267120361, "learning_rate": 7.520021087199925e-07, "loss": 0.916, "mean_token_accuracy": 0.6960792735219001, "num_tokens": 773456.0, "step": 13140 }, { "epoch": 12.754424242424243, "grad_norm": 1.3201979398727417, "learning_rate": 7.440436006156559e-07, "loss": 0.9347, "mean_token_accuracy": 0.6893177561461925, "num_tokens": 784266.0, "step": 13150 }, { "epoch": 12.764121212121212, "grad_norm": 0.9860504269599915, "learning_rate": 7.361258021511142e-07, "loss": 0.9249, "mean_token_accuracy": 0.6765072204172611, "num_tokens": 794396.0, "step": 13160 }, { "epoch": 12.773818181818182, "grad_norm": 1.6493189334869385, "learning_rate": 7.282487481505041e-07, "loss": 0.9379, "mean_token_accuracy": 0.671536460146308, "num_tokens": 804843.0, "step": 13170 }, { "epoch": 12.783515151515152, "grad_norm": 0.8871903419494629, "learning_rate": 7.204124732587659e-07, "loss": 0.8677, "mean_token_accuracy": 0.7121831141412258, "num_tokens": 815821.0, "step": 13180 }, { "epoch": 12.793212121212122, "grad_norm": 1.6710381507873535, "learning_rate": 7.126170119414799e-07, "loss": 0.9455, "mean_token_accuracy": 0.6882101558148861, "num_tokens": 826418.0, "step": 13190 }, { "epoch": 12.802909090909091, "grad_norm": 1.0449455976486206, "learning_rate": 7.048623984847203e-07, "loss": 0.9237, "mean_token_accuracy": 0.6743796251714229, "num_tokens": 837180.0, "step": 13200 }, { "epoch": 12.812606060606061, "grad_norm": 1.153255581855774, "learning_rate": 6.971486669949102e-07, "loss": 0.9745, "mean_token_accuracy": 0.6699652068316937, "num_tokens": 847602.0, "step": 13210 }, { "epoch": 12.822303030303031, "grad_norm": 1.069661021232605, "learning_rate": 6.894758513986566e-07, "loss": 0.9217, "mean_token_accuracy": 0.6804017089307308, "num_tokens": 857486.0, "step": 13220 }, { "epoch": 12.832, "grad_norm": 1.011649489402771, "learning_rate": 6.818439854426151e-07, "loss": 0.9386, "mean_token_accuracy": 0.6823414113372565, "num_tokens": 868972.0, "step": 13230 }, { "epoch": 12.84169696969697, "grad_norm": 0.7872369885444641, "learning_rate": 6.74253102693333e-07, "loss": 0.9409, "mean_token_accuracy": 0.6847406111657619, "num_tokens": 879178.0, "step": 13240 }, { "epoch": 12.85139393939394, "grad_norm": 1.3302205801010132, "learning_rate": 6.667032365371095e-07, "loss": 0.9514, "mean_token_accuracy": 0.6746706318110227, "num_tokens": 890112.0, "step": 13250 }, { "epoch": 12.861090909090908, "grad_norm": 0.7299315333366394, "learning_rate": 6.591944201798394e-07, "loss": 0.8983, "mean_token_accuracy": 0.6949192993342876, "num_tokens": 900105.0, "step": 13260 }, { "epoch": 12.870787878787878, "grad_norm": 0.9053242206573486, "learning_rate": 6.517266866468741e-07, "loss": 0.9662, "mean_token_accuracy": 0.6785097420215607, "num_tokens": 909781.0, "step": 13270 }, { "epoch": 12.880484848484848, "grad_norm": 1.5465375185012817, "learning_rate": 6.443000687828737e-07, "loss": 0.9076, "mean_token_accuracy": 0.6935414470732212, "num_tokens": 920332.0, "step": 13280 }, { "epoch": 12.890181818181818, "grad_norm": 0.9741002917289734, "learning_rate": 6.369145992516635e-07, "loss": 0.9533, "mean_token_accuracy": 0.6718010984361171, "num_tokens": 930800.0, "step": 13290 }, { "epoch": 12.899878787878787, "grad_norm": 1.4398901462554932, "learning_rate": 6.295703105360884e-07, "loss": 0.9613, "mean_token_accuracy": 0.6741296485066414, "num_tokens": 942822.0, "step": 13300 }, { "epoch": 12.909575757575757, "grad_norm": 0.8408631086349487, "learning_rate": 6.222672349378711e-07, "loss": 0.8839, "mean_token_accuracy": 0.6957414381206035, "num_tokens": 953151.0, "step": 13310 }, { "epoch": 12.919272727272727, "grad_norm": 1.185342788696289, "learning_rate": 6.150054045774745e-07, "loss": 0.9431, "mean_token_accuracy": 0.6786404684185982, "num_tokens": 963817.0, "step": 13320 }, { "epoch": 12.928969696969697, "grad_norm": 1.5377130508422852, "learning_rate": 6.07784851393951e-07, "loss": 0.9263, "mean_token_accuracy": 0.6862830605357886, "num_tokens": 974618.0, "step": 13330 }, { "epoch": 12.938666666666666, "grad_norm": 2.0658161640167236, "learning_rate": 6.006056071448119e-07, "loss": 0.8625, "mean_token_accuracy": 0.7110202703624964, "num_tokens": 984540.0, "step": 13340 }, { "epoch": 12.948363636363636, "grad_norm": 1.0002696514129639, "learning_rate": 5.934677034058789e-07, "loss": 0.9983, "mean_token_accuracy": 0.6710415873676538, "num_tokens": 995538.0, "step": 13350 }, { "epoch": 12.958060606060606, "grad_norm": 0.6808292269706726, "learning_rate": 5.863711715711507e-07, "loss": 0.9357, "mean_token_accuracy": 0.6868117332458497, "num_tokens": 1005955.0, "step": 13360 }, { "epoch": 12.967757575757576, "grad_norm": 1.43692946434021, "learning_rate": 5.793160428526678e-07, "loss": 0.9581, "mean_token_accuracy": 0.6872004386037588, "num_tokens": 1017901.0, "step": 13370 }, { "epoch": 12.977454545454545, "grad_norm": 1.1382737159729004, "learning_rate": 5.723023482803658e-07, "loss": 0.8893, "mean_token_accuracy": 0.6952810846269131, "num_tokens": 1027791.0, "step": 13380 }, { "epoch": 12.987151515151515, "grad_norm": 1.5918898582458496, "learning_rate": 5.653301187019455e-07, "loss": 0.8051, "mean_token_accuracy": 0.7371663119643926, "num_tokens": 1037438.0, "step": 13390 }, { "epoch": 12.996848484848485, "grad_norm": 1.294746994972229, "learning_rate": 5.583993847827363e-07, "loss": 0.9868, "mean_token_accuracy": 0.6542905114591122, "num_tokens": 1048425.0, "step": 13400 }, { "epoch": 13.006787878787879, "grad_norm": 1.0259826183319092, "learning_rate": 5.515101770055653e-07, "loss": 1.1464, "mean_token_accuracy": 0.6516239614021487, "num_tokens": 1059946.0, "step": 13410 }, { "epoch": 13.016484848484849, "grad_norm": 1.570686936378479, "learning_rate": 5.446625256706095e-07, "loss": 0.9854, "mean_token_accuracy": 0.6571170825511217, "num_tokens": 1071512.0, "step": 13420 }, { "epoch": 13.026181818181819, "grad_norm": 1.4056403636932373, "learning_rate": 5.378564608952786e-07, "loss": 1.0107, "mean_token_accuracy": 0.6513097662478685, "num_tokens": 1082669.0, "step": 13430 }, { "epoch": 13.035878787878788, "grad_norm": 1.19424307346344, "learning_rate": 5.310920126140773e-07, "loss": 0.9449, "mean_token_accuracy": 0.6799818322062492, "num_tokens": 1093470.0, "step": 13440 }, { "epoch": 13.045575757575758, "grad_norm": 1.1541939973831177, "learning_rate": 5.243692105784682e-07, "loss": 1.0241, "mean_token_accuracy": 0.6430629625916481, "num_tokens": 1105089.0, "step": 13450 }, { "epoch": 13.055272727272728, "grad_norm": 0.825744092464447, "learning_rate": 5.176880843567455e-07, "loss": 0.9137, "mean_token_accuracy": 0.6785864185541868, "num_tokens": 1115643.0, "step": 13460 }, { "epoch": 13.064969696969698, "grad_norm": 1.2951405048370361, "learning_rate": 5.110486633339062e-07, "loss": 0.972, "mean_token_accuracy": 0.6606432240456342, "num_tokens": 1126575.0, "step": 13470 }, { "epoch": 13.074666666666667, "grad_norm": 0.8548156023025513, "learning_rate": 5.044509767115158e-07, "loss": 0.9143, "mean_token_accuracy": 0.6806200005114078, "num_tokens": 1137317.0, "step": 13480 }, { "epoch": 13.084363636363637, "grad_norm": 1.2886772155761719, "learning_rate": 4.978950535075878e-07, "loss": 0.8903, "mean_token_accuracy": 0.7035336244851351, "num_tokens": 1148065.0, "step": 13490 }, { "epoch": 13.094060606060607, "grad_norm": 1.966200351715088, "learning_rate": 4.913809225564492e-07, "loss": 0.9073, "mean_token_accuracy": 0.701976515352726, "num_tokens": 1158621.0, "step": 13500 }, { "epoch": 13.103757575757577, "grad_norm": 0.8389899134635925, "learning_rate": 4.849086125086156e-07, "loss": 0.9414, "mean_token_accuracy": 0.6894888635724783, "num_tokens": 1168890.0, "step": 13510 }, { "epoch": 13.113454545454545, "grad_norm": 0.9758931994438171, "learning_rate": 4.784781518306624e-07, "loss": 0.9384, "mean_token_accuracy": 0.6737278677523136, "num_tokens": 1178584.0, "step": 13520 }, { "epoch": 13.123151515151514, "grad_norm": 1.0330685377120972, "learning_rate": 4.720895688051108e-07, "loss": 0.9524, "mean_token_accuracy": 0.6780954591929913, "num_tokens": 1189454.0, "step": 13530 }, { "epoch": 13.132848484848484, "grad_norm": 1.9264168739318848, "learning_rate": 4.657428915302875e-07, "loss": 0.8363, "mean_token_accuracy": 0.720489464327693, "num_tokens": 1199557.0, "step": 13540 }, { "epoch": 13.142545454545454, "grad_norm": 0.7110128402709961, "learning_rate": 4.594381479202137e-07, "loss": 0.9138, "mean_token_accuracy": 0.6820375476032495, "num_tokens": 1210005.0, "step": 13550 }, { "epoch": 13.152242424242424, "grad_norm": 1.9913625717163086, "learning_rate": 4.531753657044735e-07, "loss": 0.9352, "mean_token_accuracy": 0.6837764341384173, "num_tokens": 1220507.0, "step": 13560 }, { "epoch": 13.161939393939393, "grad_norm": 1.4394137859344482, "learning_rate": 4.469545724280988e-07, "loss": 0.9389, "mean_token_accuracy": 0.6835528288036585, "num_tokens": 1231088.0, "step": 13570 }, { "epoch": 13.171636363636363, "grad_norm": 1.118189811706543, "learning_rate": 4.407757954514458e-07, "loss": 0.9182, "mean_token_accuracy": 0.69982905164361, "num_tokens": 1241297.0, "step": 13580 }, { "epoch": 13.181333333333333, "grad_norm": 0.6542367935180664, "learning_rate": 4.3463906195007066e-07, "loss": 0.8837, "mean_token_accuracy": 0.700026823580265, "num_tokens": 1250943.0, "step": 13590 }, { "epoch": 13.191030303030303, "grad_norm": 0.5948226451873779, "learning_rate": 4.285443989146176e-07, "loss": 1.0363, "mean_token_accuracy": 0.6513338401913643, "num_tokens": 1262487.0, "step": 13600 }, { "epoch": 13.200727272727272, "grad_norm": 1.0918562412261963, "learning_rate": 4.5e-05, "loss": 0.9926, "mean_token_accuracy": 0.6507623802870512, "num_tokens": 11060.0, "step": 13610 }, { "epoch": 13.210424242424242, "grad_norm": 1.453194499015808, "learning_rate": 9.5e-05, "loss": 0.8771, "mean_token_accuracy": 0.6994029752910137, "num_tokens": 21506.0, "step": 13620 }, { "epoch": 13.220121212121212, "grad_norm": 2.374359130859375, "learning_rate": 9.995495495495496e-05, "loss": 0.9635, "mean_token_accuracy": 0.6764267832040787, "num_tokens": 32669.0, "step": 13630 }, { "epoch": 13.229818181818182, "grad_norm": 1.6310795545578003, "learning_rate": 9.990490490490491e-05, "loss": 0.8954, "mean_token_accuracy": 0.7067163821309805, "num_tokens": 42938.0, "step": 13640 }, { "epoch": 13.239515151515151, "grad_norm": 0.8453378677368164, "learning_rate": 9.985485485485487e-05, "loss": 0.9392, "mean_token_accuracy": 0.68135135024786, "num_tokens": 53420.0, "step": 13650 }, { "epoch": 13.249212121212121, "grad_norm": 2.5701301097869873, "learning_rate": 9.980480480480481e-05, "loss": 0.8713, "mean_token_accuracy": 0.7125500839203596, "num_tokens": 62912.0, "step": 13660 }, { "epoch": 13.258909090909091, "grad_norm": 1.7641572952270508, "learning_rate": 9.975475475475477e-05, "loss": 0.9947, "mean_token_accuracy": 0.6635019164532423, "num_tokens": 73599.0, "step": 13670 }, { "epoch": 13.26860606060606, "grad_norm": 2.168328046798706, "learning_rate": 9.970470470470471e-05, "loss": 0.9706, "mean_token_accuracy": 0.688429095223546, "num_tokens": 83750.0, "step": 13680 }, { "epoch": 13.27830303030303, "grad_norm": 1.4071749448776245, "learning_rate": 9.965465465465466e-05, "loss": 0.9676, "mean_token_accuracy": 0.6768725138157606, "num_tokens": 93823.0, "step": 13690 }, { "epoch": 13.288, "grad_norm": 1.1499977111816406, "learning_rate": 9.960460460460461e-05, "loss": 0.9889, "mean_token_accuracy": 0.6712037593126297, "num_tokens": 103911.0, "step": 13700 }, { "epoch": 13.29769696969697, "grad_norm": 1.2642593383789062, "learning_rate": 9.955455455455456e-05, "loss": 0.9786, "mean_token_accuracy": 0.6803277429193259, "num_tokens": 114296.0, "step": 13710 }, { "epoch": 13.30739393939394, "grad_norm": 0.9675585627555847, "learning_rate": 9.950450450450451e-05, "loss": 0.8952, "mean_token_accuracy": 0.6976213902235031, "num_tokens": 123697.0, "step": 13720 }, { "epoch": 13.31709090909091, "grad_norm": 1.5083271265029907, "learning_rate": 9.945445445445446e-05, "loss": 0.9518, "mean_token_accuracy": 0.6812848944216967, "num_tokens": 133976.0, "step": 13730 }, { "epoch": 13.32678787878788, "grad_norm": 1.0912386178970337, "learning_rate": 9.94044044044044e-05, "loss": 0.9224, "mean_token_accuracy": 0.6897901255637408, "num_tokens": 143868.0, "step": 13740 }, { "epoch": 13.336484848484849, "grad_norm": 1.7375333309173584, "learning_rate": 9.935435435435436e-05, "loss": 0.9107, "mean_token_accuracy": 0.706351314485073, "num_tokens": 154106.0, "step": 13750 }, { "epoch": 13.346181818181819, "grad_norm": 1.1665840148925781, "learning_rate": 9.930430430430431e-05, "loss": 0.941, "mean_token_accuracy": 0.6972976390272378, "num_tokens": 164042.0, "step": 13760 }, { "epoch": 13.355878787878789, "grad_norm": 1.7706063985824585, "learning_rate": 9.925425425425427e-05, "loss": 0.8943, "mean_token_accuracy": 0.7054846830666065, "num_tokens": 174506.0, "step": 13770 }, { "epoch": 13.365575757575758, "grad_norm": 0.5767163038253784, "learning_rate": 9.920420420420421e-05, "loss": 0.9656, "mean_token_accuracy": 0.6823426600545645, "num_tokens": 185338.0, "step": 13780 }, { "epoch": 13.375272727272728, "grad_norm": 1.4523296356201172, "learning_rate": 9.915415415415416e-05, "loss": 0.9469, "mean_token_accuracy": 0.6681080140173435, "num_tokens": 195763.0, "step": 13790 }, { "epoch": 13.384969696969698, "grad_norm": 0.7047093510627747, "learning_rate": 9.910410410410411e-05, "loss": 0.9216, "mean_token_accuracy": 0.6868221748620271, "num_tokens": 205612.0, "step": 13800 }, { "epoch": 13.394666666666666, "grad_norm": 0.7028587460517883, "learning_rate": 9.905405405405406e-05, "loss": 0.9799, "mean_token_accuracy": 0.6751956883817911, "num_tokens": 215645.0, "step": 13810 }, { "epoch": 13.404363636363636, "grad_norm": 0.9091927409172058, "learning_rate": 9.900400400400401e-05, "loss": 0.9413, "mean_token_accuracy": 0.6909396957606078, "num_tokens": 225530.0, "step": 13820 }, { "epoch": 13.414060606060605, "grad_norm": 1.0086578130722046, "learning_rate": 9.895395395395396e-05, "loss": 0.964, "mean_token_accuracy": 0.6786475393921136, "num_tokens": 236313.0, "step": 13830 }, { "epoch": 13.423757575757575, "grad_norm": 1.5697195529937744, "learning_rate": 9.89039039039039e-05, "loss": 0.9573, "mean_token_accuracy": 0.6727604184299707, "num_tokens": 246884.0, "step": 13840 }, { "epoch": 13.433454545454545, "grad_norm": 0.8102120161056519, "learning_rate": 9.885385385385386e-05, "loss": 0.9226, "mean_token_accuracy": 0.6831782024353743, "num_tokens": 256990.0, "step": 13850 }, { "epoch": 13.443151515151515, "grad_norm": 0.9028761982917786, "learning_rate": 9.880380380380381e-05, "loss": 0.8753, "mean_token_accuracy": 0.7016171887516975, "num_tokens": 266786.0, "step": 13860 }, { "epoch": 13.452848484848484, "grad_norm": 1.2319331169128418, "learning_rate": 9.875375375375377e-05, "loss": 0.9452, "mean_token_accuracy": 0.6777403865009546, "num_tokens": 278066.0, "step": 13870 }, { "epoch": 13.462545454545454, "grad_norm": 1.340330719947815, "learning_rate": 9.870370370370371e-05, "loss": 0.8887, "mean_token_accuracy": 0.6937030091881752, "num_tokens": 287644.0, "step": 13880 }, { "epoch": 13.472242424242424, "grad_norm": 2.107584238052368, "learning_rate": 9.865365365365366e-05, "loss": 0.9805, "mean_token_accuracy": 0.6581023618578911, "num_tokens": 299167.0, "step": 13890 }, { "epoch": 13.481939393939394, "grad_norm": 1.3416616916656494, "learning_rate": 9.860360360360361e-05, "loss": 0.9588, "mean_token_accuracy": 0.6862040366977453, "num_tokens": 310325.0, "step": 13900 }, { "epoch": 13.491636363636363, "grad_norm": 0.7638229727745056, "learning_rate": 9.855355355355356e-05, "loss": 0.9199, "mean_token_accuracy": 0.6865271601825953, "num_tokens": 320799.0, "step": 13910 }, { "epoch": 13.501333333333333, "grad_norm": 1.8613024950027466, "learning_rate": 9.850350350350351e-05, "loss": 0.9029, "mean_token_accuracy": 0.7134368922561407, "num_tokens": 331742.0, "step": 13920 }, { "epoch": 13.511030303030303, "grad_norm": 0.8470885753631592, "learning_rate": 9.845345345345346e-05, "loss": 0.9985, "mean_token_accuracy": 0.6461464431136846, "num_tokens": 342008.0, "step": 13930 }, { "epoch": 13.520727272727273, "grad_norm": 1.4289556741714478, "learning_rate": 9.84034034034034e-05, "loss": 1.0229, "mean_token_accuracy": 0.6607601415365935, "num_tokens": 352783.0, "step": 13940 }, { "epoch": 13.530424242424242, "grad_norm": 1.1315350532531738, "learning_rate": 9.835335335335336e-05, "loss": 0.8668, "mean_token_accuracy": 0.7064531348645687, "num_tokens": 362703.0, "step": 13950 }, { "epoch": 13.540121212121212, "grad_norm": 0.8690136671066284, "learning_rate": 9.83033033033033e-05, "loss": 0.8763, "mean_token_accuracy": 0.7114055767655373, "num_tokens": 372732.0, "step": 13960 }, { "epoch": 13.549818181818182, "grad_norm": 0.9560481905937195, "learning_rate": 9.825325325325326e-05, "loss": 0.9223, "mean_token_accuracy": 0.6863605052232742, "num_tokens": 382785.0, "step": 13970 }, { "epoch": 13.559515151515152, "grad_norm": 1.053054928779602, "learning_rate": 9.820320320320321e-05, "loss": 0.9598, "mean_token_accuracy": 0.6758723571896553, "num_tokens": 393871.0, "step": 13980 }, { "epoch": 13.569212121212122, "grad_norm": 0.4731355905532837, "learning_rate": 9.815315315315316e-05, "loss": 0.9456, "mean_token_accuracy": 0.6892194643616676, "num_tokens": 404378.0, "step": 13990 }, { "epoch": 13.578909090909091, "grad_norm": 1.9100712537765503, "learning_rate": 9.810310310310311e-05, "loss": 1.011, "mean_token_accuracy": 0.6585861250758172, "num_tokens": 415762.0, "step": 14000 }, { "epoch": 13.588606060606061, "grad_norm": 0.987190842628479, "learning_rate": 9.805305305305306e-05, "loss": 0.8616, "mean_token_accuracy": 0.7012909840792417, "num_tokens": 425425.0, "step": 14010 }, { "epoch": 13.59830303030303, "grad_norm": 0.8835279941558838, "learning_rate": 9.8003003003003e-05, "loss": 0.9801, "mean_token_accuracy": 0.6575286597013473, "num_tokens": 435861.0, "step": 14020 }, { "epoch": 13.608, "grad_norm": 0.9478653073310852, "learning_rate": 9.795295295295296e-05, "loss": 0.9283, "mean_token_accuracy": 0.6895153563469648, "num_tokens": 446411.0, "step": 14030 }, { "epoch": 13.61769696969697, "grad_norm": 0.8801679015159607, "learning_rate": 9.79029029029029e-05, "loss": 0.9621, "mean_token_accuracy": 0.68089236356318, "num_tokens": 457521.0, "step": 14040 }, { "epoch": 13.62739393939394, "grad_norm": 0.7246169447898865, "learning_rate": 9.785285285285286e-05, "loss": 0.915, "mean_token_accuracy": 0.6914402432739735, "num_tokens": 467230.0, "step": 14050 }, { "epoch": 13.63709090909091, "grad_norm": 1.023116946220398, "learning_rate": 9.78028028028028e-05, "loss": 1.0017, "mean_token_accuracy": 0.6633546780794859, "num_tokens": 478815.0, "step": 14060 }, { "epoch": 13.64678787878788, "grad_norm": 1.2296099662780762, "learning_rate": 9.775275275275276e-05, "loss": 0.9853, "mean_token_accuracy": 0.6748053282499313, "num_tokens": 488901.0, "step": 14070 }, { "epoch": 13.656484848484848, "grad_norm": 0.9308061003684998, "learning_rate": 9.770270270270272e-05, "loss": 0.872, "mean_token_accuracy": 0.7030160129070282, "num_tokens": 499156.0, "step": 14080 }, { "epoch": 13.666181818181819, "grad_norm": 1.4838083982467651, "learning_rate": 9.765265265265266e-05, "loss": 0.918, "mean_token_accuracy": 0.6922544561326504, "num_tokens": 508618.0, "step": 14090 }, { "epoch": 13.675878787878787, "grad_norm": 0.6036433577537537, "learning_rate": 9.760260260260262e-05, "loss": 0.9253, "mean_token_accuracy": 0.6886366963386535, "num_tokens": 519918.0, "step": 14100 }, { "epoch": 13.685575757575757, "grad_norm": 0.848430871963501, "learning_rate": 9.755255255255256e-05, "loss": 0.9634, "mean_token_accuracy": 0.6708800371736288, "num_tokens": 529716.0, "step": 14110 }, { "epoch": 13.695272727272727, "grad_norm": 0.7561900019645691, "learning_rate": 9.75025025025025e-05, "loss": 0.8676, "mean_token_accuracy": 0.6998249750584364, "num_tokens": 539041.0, "step": 14120 }, { "epoch": 13.704969696969696, "grad_norm": 0.8211101293563843, "learning_rate": 9.745245245245246e-05, "loss": 0.9797, "mean_token_accuracy": 0.6581344068050384, "num_tokens": 549883.0, "step": 14130 }, { "epoch": 13.714666666666666, "grad_norm": 1.2751184701919556, "learning_rate": 9.74024024024024e-05, "loss": 0.9024, "mean_token_accuracy": 0.6988375499844551, "num_tokens": 560364.0, "step": 14140 }, { "epoch": 13.724363636363636, "grad_norm": 0.7292294502258301, "learning_rate": 9.735235235235236e-05, "loss": 0.8688, "mean_token_accuracy": 0.6985570065677166, "num_tokens": 570146.0, "step": 14150 }, { "epoch": 13.734060606060606, "grad_norm": 1.0787569284439087, "learning_rate": 9.73023023023023e-05, "loss": 0.9108, "mean_token_accuracy": 0.6816088363528252, "num_tokens": 580097.0, "step": 14160 }, { "epoch": 13.743757575757575, "grad_norm": 0.5591951012611389, "learning_rate": 9.725225225225225e-05, "loss": 1.0228, "mean_token_accuracy": 0.6424524009227752, "num_tokens": 591054.0, "step": 14170 }, { "epoch": 13.753454545454545, "grad_norm": 0.950010359287262, "learning_rate": 9.72022022022022e-05, "loss": 0.8811, "mean_token_accuracy": 0.7032374102622271, "num_tokens": 600799.0, "step": 14180 }, { "epoch": 13.763151515151515, "grad_norm": 0.4867992103099823, "learning_rate": 9.715215215215216e-05, "loss": 0.9002, "mean_token_accuracy": 0.6991371564567089, "num_tokens": 611008.0, "step": 14190 }, { "epoch": 13.772848484848485, "grad_norm": 0.5358482003211975, "learning_rate": 9.710210210210212e-05, "loss": 0.9305, "mean_token_accuracy": 0.6951459109783172, "num_tokens": 621483.0, "step": 14200 }, { "epoch": 13.782545454545454, "grad_norm": 0.8481453657150269, "learning_rate": 9.705205205205206e-05, "loss": 0.9505, "mean_token_accuracy": 0.6874804452061654, "num_tokens": 633074.0, "step": 14210 }, { "epoch": 13.792242424242424, "grad_norm": 0.664574146270752, "learning_rate": 9.7002002002002e-05, "loss": 0.971, "mean_token_accuracy": 0.6761138528585434, "num_tokens": 644108.0, "step": 14220 }, { "epoch": 13.801939393939394, "grad_norm": 0.6939647793769836, "learning_rate": 9.695195195195196e-05, "loss": 0.9171, "mean_token_accuracy": 0.6944379203021527, "num_tokens": 654249.0, "step": 14230 }, { "epoch": 13.811636363636364, "grad_norm": 0.6086325645446777, "learning_rate": 9.69019019019019e-05, "loss": 0.9794, "mean_token_accuracy": 0.6574176583439112, "num_tokens": 664735.0, "step": 14240 }, { "epoch": 13.821333333333333, "grad_norm": 2.137354612350464, "learning_rate": 9.685185185185186e-05, "loss": 0.9202, "mean_token_accuracy": 0.6800346210598945, "num_tokens": 675580.0, "step": 14250 }, { "epoch": 13.831030303030303, "grad_norm": 1.0914839506149292, "learning_rate": 9.68018018018018e-05, "loss": 0.9407, "mean_token_accuracy": 0.6815901666879653, "num_tokens": 685042.0, "step": 14260 }, { "epoch": 13.840727272727273, "grad_norm": 0.9622077345848083, "learning_rate": 9.675175175175175e-05, "loss": 0.9412, "mean_token_accuracy": 0.6896888021379709, "num_tokens": 695452.0, "step": 14270 }, { "epoch": 13.850424242424243, "grad_norm": 0.5911729335784912, "learning_rate": 9.67017017017017e-05, "loss": 0.9476, "mean_token_accuracy": 0.6852936699986458, "num_tokens": 706283.0, "step": 14280 }, { "epoch": 13.860121212121213, "grad_norm": 1.0763121843338013, "learning_rate": 9.665165165165166e-05, "loss": 0.8593, "mean_token_accuracy": 0.70830412581563, "num_tokens": 715851.0, "step": 14290 }, { "epoch": 13.869818181818182, "grad_norm": 0.7274637818336487, "learning_rate": 9.660160160160162e-05, "loss": 1.0047, "mean_token_accuracy": 0.6792124062776566, "num_tokens": 727035.0, "step": 14300 }, { "epoch": 13.879515151515152, "grad_norm": 0.6750665903091431, "learning_rate": 9.655155155155156e-05, "loss": 0.9299, "mean_token_accuracy": 0.6856705665588378, "num_tokens": 737554.0, "step": 14310 }, { "epoch": 13.889212121212122, "grad_norm": 0.6934303641319275, "learning_rate": 9.65015015015015e-05, "loss": 0.903, "mean_token_accuracy": 0.6835582558065653, "num_tokens": 747852.0, "step": 14320 }, { "epoch": 13.898909090909092, "grad_norm": 0.5132259726524353, "learning_rate": 9.645145145145146e-05, "loss": 0.9779, "mean_token_accuracy": 0.6827262349426746, "num_tokens": 757970.0, "step": 14330 }, { "epoch": 13.908606060606061, "grad_norm": 0.46078333258628845, "learning_rate": 9.64014014014014e-05, "loss": 0.9377, "mean_token_accuracy": 0.684358200058341, "num_tokens": 768757.0, "step": 14340 }, { "epoch": 13.918303030303031, "grad_norm": 0.555814266204834, "learning_rate": 9.635135135135136e-05, "loss": 0.9762, "mean_token_accuracy": 0.6606349345296622, "num_tokens": 780206.0, "step": 14350 }, { "epoch": 13.928, "grad_norm": 0.8341594338417053, "learning_rate": 9.63013013013013e-05, "loss": 0.9645, "mean_token_accuracy": 0.6715805854648351, "num_tokens": 790961.0, "step": 14360 }, { "epoch": 13.937696969696969, "grad_norm": 0.6067021489143372, "learning_rate": 9.625125125125125e-05, "loss": 0.9145, "mean_token_accuracy": 0.6793199084699154, "num_tokens": 800987.0, "step": 14370 }, { "epoch": 13.947393939393939, "grad_norm": 0.7952314019203186, "learning_rate": 9.62012012012012e-05, "loss": 0.9162, "mean_token_accuracy": 0.69475242421031, "num_tokens": 811828.0, "step": 14380 }, { "epoch": 13.957090909090908, "grad_norm": 0.8746843934059143, "learning_rate": 9.615115115115115e-05, "loss": 0.8681, "mean_token_accuracy": 0.7030756626278162, "num_tokens": 822958.0, "step": 14390 }, { "epoch": 13.966787878787878, "grad_norm": 0.4334689974784851, "learning_rate": 9.61011011011011e-05, "loss": 0.9797, "mean_token_accuracy": 0.6570104032754898, "num_tokens": 834206.0, "step": 14400 }, { "epoch": 13.976484848484848, "grad_norm": 0.5802099108695984, "learning_rate": 9.605105105105106e-05, "loss": 0.9076, "mean_token_accuracy": 0.6986728705465793, "num_tokens": 845031.0, "step": 14410 }, { "epoch": 13.986181818181818, "grad_norm": 0.41924917697906494, "learning_rate": 9.6001001001001e-05, "loss": 0.9134, "mean_token_accuracy": 0.6965976521372795, "num_tokens": 854691.0, "step": 14420 }, { "epoch": 13.995878787878787, "grad_norm": 0.4162426292896271, "learning_rate": 9.595095095095096e-05, "loss": 1.0075, "mean_token_accuracy": 0.6658117517828941, "num_tokens": 865141.0, "step": 14430 }, { "epoch": 14.005818181818182, "grad_norm": 0.6385387182235718, "learning_rate": 9.59009009009009e-05, "loss": 1.0001, "mean_token_accuracy": 0.6936045238157598, "num_tokens": 876494.0, "step": 14440 }, { "epoch": 14.015515151515151, "grad_norm": 0.6041902303695679, "learning_rate": 9.585085085085086e-05, "loss": 0.9238, "mean_token_accuracy": 0.6878648042678833, "num_tokens": 886603.0, "step": 14450 }, { "epoch": 14.025212121212121, "grad_norm": 0.9639670252799988, "learning_rate": 9.58008008008008e-05, "loss": 1.0331, "mean_token_accuracy": 0.6559940252453089, "num_tokens": 898033.0, "step": 14460 }, { "epoch": 14.03490909090909, "grad_norm": 0.5883612036705017, "learning_rate": 9.575075075075075e-05, "loss": 0.9764, "mean_token_accuracy": 0.6728239644318819, "num_tokens": 909064.0, "step": 14470 }, { "epoch": 14.04460606060606, "grad_norm": 0.8372961282730103, "learning_rate": 9.57007007007007e-05, "loss": 0.9061, "mean_token_accuracy": 0.6874277569353581, "num_tokens": 919744.0, "step": 14480 }, { "epoch": 14.05430303030303, "grad_norm": 1.7760860919952393, "learning_rate": 9.565065065065065e-05, "loss": 0.9198, "mean_token_accuracy": 0.6859086826443672, "num_tokens": 930349.0, "step": 14490 }, { "epoch": 14.064, "grad_norm": 0.5744428634643555, "learning_rate": 9.56006006006006e-05, "loss": 0.9518, "mean_token_accuracy": 0.6734505753964186, "num_tokens": 940791.0, "step": 14500 }, { "epoch": 14.07369696969697, "grad_norm": 0.9980311989784241, "learning_rate": 9.555055055055056e-05, "loss": 0.903, "mean_token_accuracy": 0.6935466017574072, "num_tokens": 951362.0, "step": 14510 }, { "epoch": 14.08339393939394, "grad_norm": 0.6623931527137756, "learning_rate": 9.55005005005005e-05, "loss": 0.8918, "mean_token_accuracy": 0.6993258882313966, "num_tokens": 962286.0, "step": 14520 }, { "epoch": 14.09309090909091, "grad_norm": 0.4992653429508209, "learning_rate": 9.545045045045046e-05, "loss": 0.9309, "mean_token_accuracy": 0.6649952068924904, "num_tokens": 972644.0, "step": 14530 }, { "epoch": 14.102787878787879, "grad_norm": 0.4818670153617859, "learning_rate": 9.54004004004004e-05, "loss": 0.9225, "mean_token_accuracy": 0.6798055626451969, "num_tokens": 983056.0, "step": 14540 }, { "epoch": 14.112484848484849, "grad_norm": 0.9694674015045166, "learning_rate": 9.535035035035036e-05, "loss": 0.8962, "mean_token_accuracy": 0.6953587524592877, "num_tokens": 992490.0, "step": 14550 }, { "epoch": 14.122181818181819, "grad_norm": 0.8076632618904114, "learning_rate": 9.53003003003003e-05, "loss": 0.9096, "mean_token_accuracy": 0.6695175170898438, "num_tokens": 1003185.0, "step": 14560 }, { "epoch": 14.131878787878788, "grad_norm": 0.39989814162254333, "learning_rate": 9.525025025025025e-05, "loss": 0.8959, "mean_token_accuracy": 0.6875695057213307, "num_tokens": 1013510.0, "step": 14570 }, { "epoch": 14.141575757575758, "grad_norm": 0.5998600721359253, "learning_rate": 9.52002002002002e-05, "loss": 0.9311, "mean_token_accuracy": 0.6833312470465899, "num_tokens": 1023703.0, "step": 14580 }, { "epoch": 14.151272727272728, "grad_norm": 1.0913785696029663, "learning_rate": 9.515015015015015e-05, "loss": 1.0003, "mean_token_accuracy": 0.6780395913869143, "num_tokens": 1035344.0, "step": 14590 }, { "epoch": 14.160969696969698, "grad_norm": 0.891591489315033, "learning_rate": 9.51001001001001e-05, "loss": 0.9419, "mean_token_accuracy": 0.6847637005150318, "num_tokens": 1045341.0, "step": 14600 }, { "epoch": 14.170666666666667, "grad_norm": 0.8624415397644043, "learning_rate": 9.505005005005005e-05, "loss": 0.8532, "mean_token_accuracy": 0.7248432952910662, "num_tokens": 1055979.0, "step": 14610 }, { "epoch": 14.180363636363637, "grad_norm": 0.9150317311286926, "learning_rate": 9.5e-05, "loss": 0.9745, "mean_token_accuracy": 0.6698940627276897, "num_tokens": 1067319.0, "step": 14620 }, { "epoch": 14.190060606060607, "grad_norm": 0.41908109188079834, "learning_rate": 9.494994994994996e-05, "loss": 0.9568, "mean_token_accuracy": 0.678890322521329, "num_tokens": 1078431.0, "step": 14630 }, { "epoch": 14.199757575757575, "grad_norm": 0.878993809223175, "learning_rate": 9.48998998998999e-05, "loss": 0.9256, "mean_token_accuracy": 0.6974173996597528, "num_tokens": 1088662.0, "step": 14640 }, { "epoch": 14.209454545454545, "grad_norm": 0.3703934848308563, "learning_rate": 9.484984984984986e-05, "loss": 0.9434, "mean_token_accuracy": 0.6732165481895208, "num_tokens": 1099362.0, "step": 14650 }, { "epoch": 14.219151515151514, "grad_norm": 0.4467850625514984, "learning_rate": 9.47997997997998e-05, "loss": 0.9459, "mean_token_accuracy": 0.6609635852277279, "num_tokens": 1110047.0, "step": 14660 }, { "epoch": 14.228848484848484, "grad_norm": 1.2241610288619995, "learning_rate": 9.474974974974975e-05, "loss": 0.9469, "mean_token_accuracy": 0.6950714159756899, "num_tokens": 1120542.0, "step": 14670 }, { "epoch": 14.238545454545454, "grad_norm": 0.6757529973983765, "learning_rate": 9.46996996996997e-05, "loss": 0.9628, "mean_token_accuracy": 0.6720464017242194, "num_tokens": 1131343.0, "step": 14680 }, { "epoch": 14.248242424242424, "grad_norm": 0.9918266534805298, "learning_rate": 9.464964964964965e-05, "loss": 0.9084, "mean_token_accuracy": 0.6795904841274023, "num_tokens": 1142007.0, "step": 14690 }, { "epoch": 14.257939393939393, "grad_norm": 0.9975070953369141, "learning_rate": 9.45995995995996e-05, "loss": 0.806, "mean_token_accuracy": 0.734311144053936, "num_tokens": 1151794.0, "step": 14700 }, { "epoch": 14.267636363636363, "grad_norm": 0.6164572238922119, "learning_rate": 9.454954954954955e-05, "loss": 0.8116, "mean_token_accuracy": 0.7311961345374585, "num_tokens": 1161276.0, "step": 14710 }, { "epoch": 14.277333333333333, "grad_norm": 0.8973527550697327, "learning_rate": 9.44994994994995e-05, "loss": 0.8702, "mean_token_accuracy": 0.7167054928839207, "num_tokens": 1171332.0, "step": 14720 }, { "epoch": 14.287030303030303, "grad_norm": 0.6523808240890503, "learning_rate": 9.444944944944946e-05, "loss": 0.9112, "mean_token_accuracy": 0.6947918102145195, "num_tokens": 1181393.0, "step": 14730 }, { "epoch": 14.296727272727273, "grad_norm": 0.41433241963386536, "learning_rate": 9.43993993993994e-05, "loss": 0.9264, "mean_token_accuracy": 0.6836030226200819, "num_tokens": 1191806.0, "step": 14740 }, { "epoch": 14.306424242424242, "grad_norm": 0.7625298500061035, "learning_rate": 9.434934934934936e-05, "loss": 0.8197, "mean_token_accuracy": 0.7187630910426378, "num_tokens": 1200839.0, "step": 14750 }, { "epoch": 14.316121212121212, "grad_norm": 0.5743375420570374, "learning_rate": 9.42992992992993e-05, "loss": 0.9071, "mean_token_accuracy": 0.6910372313112021, "num_tokens": 1211129.0, "step": 14760 }, { "epoch": 14.325818181818182, "grad_norm": 1.0408577919006348, "learning_rate": 9.424924924924925e-05, "loss": 0.9313, "mean_token_accuracy": 0.687668776512146, "num_tokens": 1221591.0, "step": 14770 }, { "epoch": 14.335515151515152, "grad_norm": 0.8543786406517029, "learning_rate": 9.41991991991992e-05, "loss": 0.9029, "mean_token_accuracy": 0.7062688145786524, "num_tokens": 1231723.0, "step": 14780 }, { "epoch": 14.345212121212121, "grad_norm": 0.5075017809867859, "learning_rate": 9.414914914914915e-05, "loss": 0.8447, "mean_token_accuracy": 0.7158392701297999, "num_tokens": 1241710.0, "step": 14790 }, { "epoch": 14.354909090909091, "grad_norm": 1.1220818758010864, "learning_rate": 9.40990990990991e-05, "loss": 0.9342, "mean_token_accuracy": 0.6817990552634001, "num_tokens": 1251618.0, "step": 14800 }, { "epoch": 7.288492307692308, "grad_norm": 0.6032423377037048, "learning_rate": 9.845357679969794e-05, "loss": 0.6822, "mean_token_accuracy": 0.7867337457835675, "num_tokens": 9312.0, "step": 14810 }, { "epoch": 7.293415384615384, "grad_norm": 2.23929762840271, "learning_rate": 9.842759302218645e-05, "loss": 0.8016, "mean_token_accuracy": 0.750348436832428, "num_tokens": 18778.0, "step": 14820 }, { "epoch": 7.298338461538462, "grad_norm": 0.8688719868659973, "learning_rate": 9.840139624995212e-05, "loss": 0.6881, "mean_token_accuracy": 0.7714763689786196, "num_tokens": 27387.0, "step": 14830 }, { "epoch": 7.3032615384615385, "grad_norm": 0.6268885135650635, "learning_rate": 9.837498659821384e-05, "loss": 0.7321, "mean_token_accuracy": 0.7611544221639633, "num_tokens": 36938.0, "step": 14840 }, { "epoch": 7.308184615384615, "grad_norm": 0.822592556476593, "learning_rate": 9.834836418312681e-05, "loss": 0.744, "mean_token_accuracy": 0.7452987994998693, "num_tokens": 45571.0, "step": 14850 }, { "epoch": 7.3131076923076925, "grad_norm": 0.4749494194984436, "learning_rate": 9.8321529121782e-05, "loss": 0.7403, "mean_token_accuracy": 0.750314911454916, "num_tokens": 54765.0, "step": 14860 }, { "epoch": 7.318030769230769, "grad_norm": 1.3120962381362915, "learning_rate": 9.829448153220566e-05, "loss": 0.761, "mean_token_accuracy": 0.7358665529638528, "num_tokens": 63751.0, "step": 14870 }, { "epoch": 7.322953846153846, "grad_norm": 0.7016109228134155, "learning_rate": 9.826722153335877e-05, "loss": 0.7017, "mean_token_accuracy": 0.7645948387682437, "num_tokens": 71817.0, "step": 14880 }, { "epoch": 7.327876923076923, "grad_norm": 0.5037406086921692, "learning_rate": 9.82397492451365e-05, "loss": 0.7157, "mean_token_accuracy": 0.7650218937546015, "num_tokens": 80510.0, "step": 14890 }, { "epoch": 7.3328, "grad_norm": 0.6709319353103638, "learning_rate": 9.821206478836775e-05, "loss": 0.7248, "mean_token_accuracy": 0.7560942731797695, "num_tokens": 89412.0, "step": 14900 }, { "epoch": 7.337723076923077, "grad_norm": 1.4935665130615234, "learning_rate": 9.81841682848146e-05, "loss": 0.7503, "mean_token_accuracy": 0.7548914663493633, "num_tokens": 99256.0, "step": 14910 }, { "epoch": 7.342646153846154, "grad_norm": 0.44451966881752014, "learning_rate": 9.815605985717171e-05, "loss": 0.7185, "mean_token_accuracy": 0.7600229732692242, "num_tokens": 107641.0, "step": 14920 }, { "epoch": 7.34756923076923, "grad_norm": 0.5159631371498108, "learning_rate": 9.812773962906586e-05, "loss": 0.7593, "mean_token_accuracy": 0.7515256915241479, "num_tokens": 116291.0, "step": 14930 }, { "epoch": 7.352492307692308, "grad_norm": 1.3890159130096436, "learning_rate": 9.809920772505532e-05, "loss": 0.8097, "mean_token_accuracy": 0.7170861914753914, "num_tokens": 126012.0, "step": 14940 }, { "epoch": 7.3574153846153845, "grad_norm": 1.5582915544509888, "learning_rate": 9.807046427062944e-05, "loss": 0.7585, "mean_token_accuracy": 0.7490797568112612, "num_tokens": 135364.0, "step": 14950 }, { "epoch": 7.362338461538462, "grad_norm": 0.3708029091358185, "learning_rate": 9.804150939220796e-05, "loss": 0.7772, "mean_token_accuracy": 0.7339643765240907, "num_tokens": 143997.0, "step": 14960 }, { "epoch": 7.3672615384615385, "grad_norm": 0.7632699012756348, "learning_rate": 9.80123432171405e-05, "loss": 0.7651, "mean_token_accuracy": 0.7485666394233703, "num_tokens": 153574.0, "step": 14970 }, { "epoch": 7.372184615384615, "grad_norm": 1.4701080322265625, "learning_rate": 9.798296587370603e-05, "loss": 0.7292, "mean_token_accuracy": 0.7644454840570688, "num_tokens": 162637.0, "step": 14980 }, { "epoch": 7.377107692307693, "grad_norm": 0.7957881093025208, "learning_rate": 9.795337749111229e-05, "loss": 0.8468, "mean_token_accuracy": 0.7276211023330689, "num_tokens": 173011.0, "step": 14990 }, { "epoch": 7.382030769230769, "grad_norm": 0.41769880056381226, "learning_rate": 9.792357819949518e-05, "loss": 0.7238, "mean_token_accuracy": 0.7681386031210422, "num_tokens": 181909.0, "step": 15000 }, { "epoch": 7.386953846153846, "grad_norm": 0.7403699159622192, "learning_rate": 9.881224657674156e-05, "loss": 0.7858, "mean_token_accuracy": 0.7396802183240652, "num_tokens": 8895.0, "step": 15010 }, { "epoch": 7.391876923076923, "grad_norm": 0.6027011275291443, "learning_rate": 9.879515199721796e-05, "loss": 0.7274, "mean_token_accuracy": 0.7529193755239248, "num_tokens": 17998.0, "step": 15020 }, { "epoch": 7.3968, "grad_norm": 0.4352588951587677, "learning_rate": 9.87779367793514e-05, "loss": 0.7908, "mean_token_accuracy": 0.7491613268852234, "num_tokens": 27846.0, "step": 15030 }, { "epoch": 7.401723076923077, "grad_norm": 0.39712125062942505, "learning_rate": 9.87606009657038e-05, "loss": 0.7353, "mean_token_accuracy": 0.7678960163146258, "num_tokens": 36104.0, "step": 15040 }, { "epoch": 7.406646153846154, "grad_norm": 0.5739689469337463, "learning_rate": 9.874314459913522e-05, "loss": 0.6803, "mean_token_accuracy": 0.772222863510251, "num_tokens": 44607.0, "step": 15050 }, { "epoch": 7.4115692307692305, "grad_norm": 0.5296592116355896, "learning_rate": 9.872556772280379e-05, "loss": 0.6219, "mean_token_accuracy": 0.7882269717752933, "num_tokens": 52426.0, "step": 15060 }, { "epoch": 7.416492307692308, "grad_norm": 0.7450407147407532, "learning_rate": 9.870787038016557e-05, "loss": 0.7046, "mean_token_accuracy": 0.7562790676951409, "num_tokens": 60835.0, "step": 15070 }, { "epoch": 7.4214153846153845, "grad_norm": 0.8603422045707703, "learning_rate": 9.869005261497446e-05, "loss": 0.7464, "mean_token_accuracy": 0.7453157220035791, "num_tokens": 70309.0, "step": 15080 }, { "epoch": 7.426338461538462, "grad_norm": 0.4799814820289612, "learning_rate": 9.867211447128208e-05, "loss": 0.8564, "mean_token_accuracy": 0.7118423756211996, "num_tokens": 80831.0, "step": 15090 }, { "epoch": 7.431261538461539, "grad_norm": 0.38809236884117126, "learning_rate": 9.865405599343768e-05, "loss": 0.778, "mean_token_accuracy": 0.729843546077609, "num_tokens": 89878.0, "step": 15100 }, { "epoch": 7.436184615384615, "grad_norm": 0.561546802520752, "learning_rate": 9.863587722608799e-05, "loss": 0.766, "mean_token_accuracy": 0.736732891574502, "num_tokens": 98413.0, "step": 15110 }, { "epoch": 7.441107692307693, "grad_norm": 0.4035409986972809, "learning_rate": 9.861757821417718e-05, "loss": 0.6529, "mean_token_accuracy": 0.7860307555645705, "num_tokens": 106310.0, "step": 15120 }, { "epoch": 7.446030769230769, "grad_norm": 1.2474324703216553, "learning_rate": 9.859915900294666e-05, "loss": 0.6801, "mean_token_accuracy": 0.7747167505323886, "num_tokens": 114567.0, "step": 15130 }, { "epoch": 7.450953846153846, "grad_norm": 1.240290880203247, "learning_rate": 9.858061963793503e-05, "loss": 0.6493, "mean_token_accuracy": 0.7812603395432234, "num_tokens": 123149.0, "step": 15140 }, { "epoch": 7.455876923076923, "grad_norm": 0.9319782853126526, "learning_rate": 9.856196016497798e-05, "loss": 0.8078, "mean_token_accuracy": 0.7315979212522506, "num_tokens": 132265.0, "step": 15150 }, { "epoch": 7.4608, "grad_norm": 0.35292956233024597, "learning_rate": 9.85431806302081e-05, "loss": 0.7718, "mean_token_accuracy": 0.7364303342998028, "num_tokens": 141098.0, "step": 15160 }, { "epoch": 7.4657230769230765, "grad_norm": 0.5348508358001709, "learning_rate": 9.852428108005487e-05, "loss": 0.7324, "mean_token_accuracy": 0.7685822080820799, "num_tokens": 150742.0, "step": 15170 }, { "epoch": 7.470646153846154, "grad_norm": 0.9570394158363342, "learning_rate": 9.850526156124442e-05, "loss": 0.6739, "mean_token_accuracy": 0.7785952746868133, "num_tokens": 159095.0, "step": 15180 }, { "epoch": 7.4755692307692305, "grad_norm": 1.611872673034668, "learning_rate": 9.848612212079955e-05, "loss": 0.7185, "mean_token_accuracy": 0.7705470208078623, "num_tokens": 167922.0, "step": 15190 }, { "epoch": 7.480492307692308, "grad_norm": 0.5744756460189819, "learning_rate": 9.846686280603948e-05, "loss": 0.8469, "mean_token_accuracy": 0.724059621617198, "num_tokens": 177884.0, "step": 15200 }, { "epoch": 7.485415384615385, "grad_norm": 0.42839816212654114, "learning_rate": 9.844748366457988e-05, "loss": 0.7499, "mean_token_accuracy": 0.7528812907636165, "num_tokens": 187133.0, "step": 15210 }, { "epoch": 7.490338461538461, "grad_norm": 2.1280364990234375, "learning_rate": 9.84279847443326e-05, "loss": 0.7742, "mean_token_accuracy": 0.7489132527261972, "num_tokens": 196413.0, "step": 15220 }, { "epoch": 7.495261538461539, "grad_norm": 0.35753366351127625, "learning_rate": 9.840836609350567e-05, "loss": 0.835, "mean_token_accuracy": 0.7175555892288685, "num_tokens": 206238.0, "step": 15230 }, { "epoch": 7.500184615384615, "grad_norm": 0.925093412399292, "learning_rate": 9.838862776060312e-05, "loss": 0.7501, "mean_token_accuracy": 0.7446019750088453, "num_tokens": 215620.0, "step": 15240 }, { "epoch": 7.505107692307693, "grad_norm": 0.6911622881889343, "learning_rate": 9.836876979442489e-05, "loss": 0.7261, "mean_token_accuracy": 0.7689918410032988, "num_tokens": 224928.0, "step": 15250 }, { "epoch": 7.510030769230769, "grad_norm": 0.7005440592765808, "learning_rate": 9.834879224406663e-05, "loss": 0.7894, "mean_token_accuracy": 0.741933236643672, "num_tokens": 235020.0, "step": 15260 }, { "epoch": 7.514953846153846, "grad_norm": 0.5132576823234558, "learning_rate": 9.832869515891975e-05, "loss": 0.7629, "mean_token_accuracy": 0.7501115497201681, "num_tokens": 244901.0, "step": 15270 }, { "epoch": 7.519876923076923, "grad_norm": 0.4901637136936188, "learning_rate": 9.83084785886711e-05, "loss": 0.7618, "mean_token_accuracy": 0.7478879150003195, "num_tokens": 254306.0, "step": 15280 }, { "epoch": 7.5248, "grad_norm": 0.6824623346328735, "learning_rate": 9.828814258330298e-05, "loss": 0.7023, "mean_token_accuracy": 0.7611722193658352, "num_tokens": 263006.0, "step": 15290 }, { "epoch": 7.5297230769230765, "grad_norm": 0.4069543480873108, "learning_rate": 9.826768719309298e-05, "loss": 0.7126, "mean_token_accuracy": 0.7572260867804289, "num_tokens": 271687.0, "step": 15300 }, { "epoch": 7.534646153846154, "grad_norm": 0.7551083564758301, "learning_rate": 9.824711246861382e-05, "loss": 0.8352, "mean_token_accuracy": 0.718966668099165, "num_tokens": 281372.0, "step": 15310 }, { "epoch": 7.539569230769231, "grad_norm": 0.8479435443878174, "learning_rate": 9.822641846073329e-05, "loss": 0.8138, "mean_token_accuracy": 0.752262394875288, "num_tokens": 290553.0, "step": 15320 }, { "epoch": 7.544492307692308, "grad_norm": 0.38278627395629883, "learning_rate": 9.820560522061403e-05, "loss": 0.7287, "mean_token_accuracy": 0.7666766557842493, "num_tokens": 299428.0, "step": 15330 }, { "epoch": 7.549415384615385, "grad_norm": 0.7417807579040527, "learning_rate": 9.818467279971355e-05, "loss": 0.6453, "mean_token_accuracy": 0.7891027696430684, "num_tokens": 308217.0, "step": 15340 }, { "epoch": 7.554338461538461, "grad_norm": 0.41675281524658203, "learning_rate": 9.816362124978396e-05, "loss": 0.703, "mean_token_accuracy": 0.7679217629134655, "num_tokens": 316520.0, "step": 15350 }, { "epoch": 7.559261538461539, "grad_norm": 0.8314495086669922, "learning_rate": 9.814245062287189e-05, "loss": 0.6985, "mean_token_accuracy": 0.7756699241697789, "num_tokens": 325247.0, "step": 15360 }, { "epoch": 7.564184615384615, "grad_norm": 0.5109190344810486, "learning_rate": 9.812116097131839e-05, "loss": 0.6479, "mean_token_accuracy": 0.7915467619895935, "num_tokens": 333857.0, "step": 15370 }, { "epoch": 7.569107692307693, "grad_norm": 0.8507750630378723, "learning_rate": 9.80997523477588e-05, "loss": 0.6877, "mean_token_accuracy": 0.7707390915602446, "num_tokens": 343201.0, "step": 15380 }, { "epoch": 7.574030769230769, "grad_norm": 0.4465511739253998, "learning_rate": 9.807822480512256e-05, "loss": 0.7341, "mean_token_accuracy": 0.7457791332155466, "num_tokens": 352232.0, "step": 15390 }, { "epoch": 7.578953846153846, "grad_norm": 0.7074446082115173, "learning_rate": 9.805657839663313e-05, "loss": 0.5786, "mean_token_accuracy": 0.7954732224345207, "num_tokens": 360362.0, "step": 15400 }, { "epoch": 7.583876923076923, "grad_norm": 0.4567805826663971, "learning_rate": 9.803481317580788e-05, "loss": 0.7394, "mean_token_accuracy": 0.7533329404890537, "num_tokens": 369312.0, "step": 15410 }, { "epoch": 7.5888, "grad_norm": 0.4720822274684906, "learning_rate": 9.801292919645786e-05, "loss": 0.7422, "mean_token_accuracy": 0.7545758258551359, "num_tokens": 378787.0, "step": 15420 }, { "epoch": 7.593723076923077, "grad_norm": 0.6811593770980835, "learning_rate": 9.799092651268778e-05, "loss": 0.7089, "mean_token_accuracy": 0.755854606255889, "num_tokens": 387085.0, "step": 15430 }, { "epoch": 7.598646153846154, "grad_norm": 0.520389974117279, "learning_rate": 9.796880517889583e-05, "loss": 0.7357, "mean_token_accuracy": 0.7585709065198898, "num_tokens": 395607.0, "step": 15440 }, { "epoch": 7.603569230769231, "grad_norm": 0.4988136291503906, "learning_rate": 9.794656524977353e-05, "loss": 0.7718, "mean_token_accuracy": 0.7427222758531571, "num_tokens": 404335.0, "step": 15450 }, { "epoch": 7.608492307692307, "grad_norm": 0.4840397834777832, "learning_rate": 9.792420678030559e-05, "loss": 0.7027, "mean_token_accuracy": 0.7715373657643795, "num_tokens": 412789.0, "step": 15460 }, { "epoch": 7.613415384615385, "grad_norm": 0.48264145851135254, "learning_rate": 9.790172982576982e-05, "loss": 0.7478, "mean_token_accuracy": 0.7376698384061455, "num_tokens": 421957.0, "step": 15470 }, { "epoch": 7.618338461538461, "grad_norm": 0.5378937125205994, "learning_rate": 9.787913444173696e-05, "loss": 0.7276, "mean_token_accuracy": 0.7619619213044644, "num_tokens": 431082.0, "step": 15480 }, { "epoch": 7.623261538461539, "grad_norm": 0.775534451007843, "learning_rate": 9.785642068407055e-05, "loss": 0.6669, "mean_token_accuracy": 0.7788964670151473, "num_tokens": 439416.0, "step": 15490 }, { "epoch": 7.628184615384615, "grad_norm": 0.705254077911377, "learning_rate": 9.783358860892679e-05, "loss": 0.7338, "mean_token_accuracy": 0.7540049366652966, "num_tokens": 447426.0, "step": 15500 }, { "epoch": 7.633107692307692, "grad_norm": 0.5129554271697998, "learning_rate": 9.781063827275437e-05, "loss": 0.7533, "mean_token_accuracy": 0.747262655198574, "num_tokens": 456215.0, "step": 15510 }, { "epoch": 7.638030769230769, "grad_norm": 0.546363890171051, "learning_rate": 9.778756973229441e-05, "loss": 0.7179, "mean_token_accuracy": 0.767508839443326, "num_tokens": 465873.0, "step": 15520 }, { "epoch": 7.642953846153846, "grad_norm": 0.5648319125175476, "learning_rate": 9.776438304458025e-05, "loss": 0.6624, "mean_token_accuracy": 0.7714706733822823, "num_tokens": 474390.0, "step": 15530 }, { "epoch": 7.6478769230769235, "grad_norm": 0.5035734176635742, "learning_rate": 9.774107826693731e-05, "loss": 0.6713, "mean_token_accuracy": 0.7714920256286859, "num_tokens": 482861.0, "step": 15540 }, { "epoch": 7.6528, "grad_norm": 0.5865426659584045, "learning_rate": 9.771765545698303e-05, "loss": 0.6718, "mean_token_accuracy": 0.7740787465125323, "num_tokens": 492284.0, "step": 15550 }, { "epoch": 7.657723076923077, "grad_norm": 0.39083245396614075, "learning_rate": 9.769411467262658e-05, "loss": 0.6844, "mean_token_accuracy": 0.7694638129323721, "num_tokens": 501852.0, "step": 15560 }, { "epoch": 7.662646153846154, "grad_norm": 0.5364481806755066, "learning_rate": 9.767045597206888e-05, "loss": 0.8126, "mean_token_accuracy": 0.7328575398772955, "num_tokens": 511967.0, "step": 15570 }, { "epoch": 7.667569230769231, "grad_norm": 0.8996931910514832, "learning_rate": 9.764667941380234e-05, "loss": 0.731, "mean_token_accuracy": 0.7515979178249836, "num_tokens": 520643.0, "step": 15580 }, { "epoch": 7.672492307692307, "grad_norm": 0.5694324374198914, "learning_rate": 9.762278505661074e-05, "loss": 0.7069, "mean_token_accuracy": 0.7669325869530439, "num_tokens": 529583.0, "step": 15590 }, { "epoch": 7.677415384615385, "grad_norm": 0.7352235317230225, "learning_rate": 9.759877295956916e-05, "loss": 0.7426, "mean_token_accuracy": 0.7516727082431316, "num_tokens": 538607.0, "step": 15600 }, { "epoch": 7.682338461538461, "grad_norm": 0.32605522871017456, "learning_rate": 9.757464318204373e-05, "loss": 0.7449, "mean_token_accuracy": 0.7565869923681021, "num_tokens": 8860.0, "step": 15610 }, { "epoch": 7.687261538461539, "grad_norm": 1.012762188911438, "learning_rate": 9.755039578369149e-05, "loss": 0.771, "mean_token_accuracy": 0.7340531777590513, "num_tokens": 18651.0, "step": 15620 }, { "epoch": 7.692184615384615, "grad_norm": 0.8538568615913391, "learning_rate": 9.752603082446036e-05, "loss": 0.7248, "mean_token_accuracy": 0.7604099120944738, "num_tokens": 27363.0, "step": 15630 }, { "epoch": 7.697107692307692, "grad_norm": 0.6249682903289795, "learning_rate": 9.750154836458887e-05, "loss": 0.6874, "mean_token_accuracy": 0.7692125029861927, "num_tokens": 35912.0, "step": 15640 }, { "epoch": 7.7020307692307695, "grad_norm": 0.8196687698364258, "learning_rate": 9.747694846460605e-05, "loss": 0.64, "mean_token_accuracy": 0.7777061153203249, "num_tokens": 44561.0, "step": 15650 }, { "epoch": 7.706953846153846, "grad_norm": 0.4318842887878418, "learning_rate": 9.745223118533127e-05, "loss": 0.6814, "mean_token_accuracy": 0.7579683996737003, "num_tokens": 53007.0, "step": 15660 }, { "epoch": 7.7118769230769235, "grad_norm": 0.3699759542942047, "learning_rate": 9.742739658787414e-05, "loss": 0.6928, "mean_token_accuracy": 0.7668475016951561, "num_tokens": 62417.0, "step": 15670 }, { "epoch": 7.7168, "grad_norm": 0.4260357618331909, "learning_rate": 9.740244473363426e-05, "loss": 0.7704, "mean_token_accuracy": 0.7583841320127249, "num_tokens": 71808.0, "step": 15680 }, { "epoch": 7.721723076923077, "grad_norm": 0.8471167087554932, "learning_rate": 9.737737568430123e-05, "loss": 0.6393, "mean_token_accuracy": 0.7948539689183235, "num_tokens": 80627.0, "step": 15690 }, { "epoch": 7.726646153846154, "grad_norm": 0.3456837236881256, "learning_rate": 9.735218950185428e-05, "loss": 0.7253, "mean_token_accuracy": 0.7614364203065633, "num_tokens": 89544.0, "step": 15700 }, { "epoch": 7.731569230769231, "grad_norm": 0.5443410277366638, "learning_rate": 9.732688624856231e-05, "loss": 0.6766, "mean_token_accuracy": 0.774466859921813, "num_tokens": 98452.0, "step": 15710 }, { "epoch": 7.736492307692307, "grad_norm": 0.36821064352989197, "learning_rate": 9.730146598698363e-05, "loss": 0.7503, "mean_token_accuracy": 0.7423054609447718, "num_tokens": 108409.0, "step": 15720 }, { "epoch": 7.741415384615385, "grad_norm": 0.8412238955497742, "learning_rate": 9.727592877996585e-05, "loss": 0.6721, "mean_token_accuracy": 0.7684708528220654, "num_tokens": 116974.0, "step": 15730 }, { "epoch": 7.746338461538461, "grad_norm": 0.709597647190094, "learning_rate": 9.725027469064568e-05, "loss": 0.6988, "mean_token_accuracy": 0.769633786380291, "num_tokens": 125747.0, "step": 15740 }, { "epoch": 7.751261538461538, "grad_norm": 0.3418969213962555, "learning_rate": 9.722450378244884e-05, "loss": 0.739, "mean_token_accuracy": 0.7599714059382677, "num_tokens": 134645.0, "step": 15750 }, { "epoch": 7.7561846153846155, "grad_norm": 0.6444127559661865, "learning_rate": 9.719861611908984e-05, "loss": 0.7256, "mean_token_accuracy": 0.7679268248379231, "num_tokens": 144249.0, "step": 15760 }, { "epoch": 7.761107692307692, "grad_norm": 0.303688108921051, "learning_rate": 9.717261176457187e-05, "loss": 0.8164, "mean_token_accuracy": 0.7411212358623743, "num_tokens": 153958.0, "step": 15770 }, { "epoch": 7.7660307692307695, "grad_norm": 0.3446861505508423, "learning_rate": 9.71464907831866e-05, "loss": 0.7577, "mean_token_accuracy": 0.7447476647794247, "num_tokens": 163223.0, "step": 15780 }, { "epoch": 7.770953846153846, "grad_norm": 1.021315097808838, "learning_rate": 9.712025323951405e-05, "loss": 0.8067, "mean_token_accuracy": 0.7387041725218296, "num_tokens": 172836.0, "step": 15790 }, { "epoch": 7.775876923076924, "grad_norm": 0.47042426466941833, "learning_rate": 9.709389919842244e-05, "loss": 0.644, "mean_token_accuracy": 0.7863428425043821, "num_tokens": 180923.0, "step": 15800 }, { "epoch": 7.7808, "grad_norm": 0.4498484432697296, "learning_rate": 9.706742872506796e-05, "loss": 0.7652, "mean_token_accuracy": 0.7459516085684299, "num_tokens": 189045.0, "step": 15810 }, { "epoch": 7.785723076923077, "grad_norm": 0.3966211974620819, "learning_rate": 9.704084188489473e-05, "loss": 0.7547, "mean_token_accuracy": 0.7628035910427571, "num_tokens": 197908.0, "step": 15820 }, { "epoch": 7.790646153846154, "grad_norm": 0.5024072527885437, "learning_rate": 9.701413874363449e-05, "loss": 0.6979, "mean_token_accuracy": 0.7664535760879516, "num_tokens": 207676.0, "step": 15830 }, { "epoch": 7.795569230769231, "grad_norm": 0.9534441232681274, "learning_rate": 9.698731936730662e-05, "loss": 0.6927, "mean_token_accuracy": 0.7581623613834381, "num_tokens": 216190.0, "step": 15840 }, { "epoch": 7.800492307692307, "grad_norm": 0.3889976143836975, "learning_rate": 9.696038382221775e-05, "loss": 0.7342, "mean_token_accuracy": 0.758062494546175, "num_tokens": 224885.0, "step": 15850 }, { "epoch": 7.805415384615385, "grad_norm": 0.3749610185623169, "learning_rate": 9.693333217496183e-05, "loss": 0.7733, "mean_token_accuracy": 0.7525778859853745, "num_tokens": 234675.0, "step": 15860 }, { "epoch": 7.8103384615384615, "grad_norm": 1.377591848373413, "learning_rate": 9.690616449241976e-05, "loss": 0.7902, "mean_token_accuracy": 0.7485566444694995, "num_tokens": 243966.0, "step": 15870 }, { "epoch": 7.815261538461538, "grad_norm": 0.4818339943885803, "learning_rate": 9.68788808417594e-05, "loss": 0.7403, "mean_token_accuracy": 0.7505327112972736, "num_tokens": 253324.0, "step": 15880 }, { "epoch": 7.8201846153846155, "grad_norm": 0.6426275372505188, "learning_rate": 9.685148129043528e-05, "loss": 0.7431, "mean_token_accuracy": 0.7493322882801294, "num_tokens": 261869.0, "step": 15890 }, { "epoch": 7.825107692307692, "grad_norm": 0.6179720163345337, "learning_rate": 9.682396590618848e-05, "loss": 0.8594, "mean_token_accuracy": 0.726296653598547, "num_tokens": 271518.0, "step": 15900 }, { "epoch": 7.83003076923077, "grad_norm": 0.7233896851539612, "learning_rate": 9.679633475704645e-05, "loss": 0.7503, "mean_token_accuracy": 0.7537887316197157, "num_tokens": 279856.0, "step": 15910 }, { "epoch": 7.834953846153846, "grad_norm": 0.4409298002719879, "learning_rate": 9.676858791132289e-05, "loss": 0.6689, "mean_token_accuracy": 0.7718529254198074, "num_tokens": 288033.0, "step": 15920 }, { "epoch": 7.839876923076923, "grad_norm": 1.0393766164779663, "learning_rate": 9.674072543761747e-05, "loss": 0.7102, "mean_token_accuracy": 0.7685550011694431, "num_tokens": 296825.0, "step": 15930 }, { "epoch": 7.8448, "grad_norm": 0.4495505094528198, "learning_rate": 9.671274740481584e-05, "loss": 0.8089, "mean_token_accuracy": 0.7236046094447375, "num_tokens": 305764.0, "step": 15940 }, { "epoch": 7.849723076923077, "grad_norm": 0.378525048494339, "learning_rate": 9.668465388208923e-05, "loss": 0.7541, "mean_token_accuracy": 0.7619457546621561, "num_tokens": 315045.0, "step": 15950 }, { "epoch": 7.854646153846154, "grad_norm": 0.3276619613170624, "learning_rate": 9.66564449388945e-05, "loss": 0.7818, "mean_token_accuracy": 0.7495603717863559, "num_tokens": 324401.0, "step": 15960 }, { "epoch": 7.859569230769231, "grad_norm": 0.6946415901184082, "learning_rate": 9.66281206449738e-05, "loss": 0.6496, "mean_token_accuracy": 0.786292115598917, "num_tokens": 332409.0, "step": 15970 }, { "epoch": 7.8644923076923074, "grad_norm": 0.6143710613250732, "learning_rate": 9.659968107035449e-05, "loss": 0.7024, "mean_token_accuracy": 0.7785773172974586, "num_tokens": 341622.0, "step": 15980 }, { "epoch": 7.869415384615385, "grad_norm": 1.626356601715088, "learning_rate": 9.657112628534898e-05, "loss": 0.7933, "mean_token_accuracy": 0.7386716432869435, "num_tokens": 350779.0, "step": 15990 }, { "epoch": 7.8743384615384615, "grad_norm": 0.8346491456031799, "learning_rate": 9.654245636055447e-05, "loss": 0.6961, "mean_token_accuracy": 0.7623608373105526, "num_tokens": 360097.0, "step": 16000 }, { "epoch": 7.879261538461538, "grad_norm": 0.5154821872711182, "learning_rate": 9.651367136685283e-05, "loss": 0.7421, "mean_token_accuracy": 0.7435356438159942, "num_tokens": 368657.0, "step": 16010 }, { "epoch": 7.884184615384616, "grad_norm": 0.40335017442703247, "learning_rate": 9.648477137541045e-05, "loss": 0.7217, "mean_token_accuracy": 0.7617222603410483, "num_tokens": 376972.0, "step": 16020 }, { "epoch": 7.889107692307692, "grad_norm": 0.3109897971153259, "learning_rate": 9.645575645767802e-05, "loss": 0.7785, "mean_token_accuracy": 0.7252940777689219, "num_tokens": 386691.0, "step": 16030 }, { "epoch": 7.894030769230769, "grad_norm": 0.7454068064689636, "learning_rate": 9.642662668539034e-05, "loss": 0.7545, "mean_token_accuracy": 0.7471344050019979, "num_tokens": 395505.0, "step": 16040 }, { "epoch": 7.898953846153846, "grad_norm": 0.37064892053604126, "learning_rate": 9.63973821305662e-05, "loss": 0.6351, "mean_token_accuracy": 0.7928381565958261, "num_tokens": 404112.0, "step": 16050 }, { "epoch": 7.903876923076923, "grad_norm": 0.36622127890586853, "learning_rate": 9.636802286550816e-05, "loss": 0.7544, "mean_token_accuracy": 0.7577709004282951, "num_tokens": 413357.0, "step": 16060 }, { "epoch": 7.9088, "grad_norm": 0.6665292978286743, "learning_rate": 9.633854896280243e-05, "loss": 0.7774, "mean_token_accuracy": 0.743139598891139, "num_tokens": 423588.0, "step": 16070 }, { "epoch": 7.913723076923077, "grad_norm": 0.8473738431930542, "learning_rate": 9.630896049531855e-05, "loss": 0.7409, "mean_token_accuracy": 0.7385849550366401, "num_tokens": 432217.0, "step": 16080 }, { "epoch": 7.918646153846154, "grad_norm": 1.1277004480361938, "learning_rate": 9.627925753620939e-05, "loss": 0.6382, "mean_token_accuracy": 0.7997437328100204, "num_tokens": 440454.0, "step": 16090 }, { "epoch": 7.923569230769231, "grad_norm": 0.5827834010124207, "learning_rate": 9.62494401589108e-05, "loss": 0.7146, "mean_token_accuracy": 0.7843764916062355, "num_tokens": 449378.0, "step": 16100 }, { "epoch": 7.9284923076923075, "grad_norm": 0.45561665296554565, "learning_rate": 9.621950843714163e-05, "loss": 0.7489, "mean_token_accuracy": 0.755017938092351, "num_tokens": 458985.0, "step": 16110 }, { "epoch": 7.933415384615385, "grad_norm": 0.45355021953582764, "learning_rate": 9.618946244490328e-05, "loss": 0.7944, "mean_token_accuracy": 0.7366019859910011, "num_tokens": 467822.0, "step": 16120 }, { "epoch": 7.938338461538462, "grad_norm": 0.45162439346313477, "learning_rate": 9.61593022564798e-05, "loss": 0.7063, "mean_token_accuracy": 0.7567742951214314, "num_tokens": 476100.0, "step": 16130 }, { "epoch": 7.943261538461538, "grad_norm": 0.9754898548126221, "learning_rate": 9.612902794643748e-05, "loss": 0.6584, "mean_token_accuracy": 0.780465978384018, "num_tokens": 484368.0, "step": 16140 }, { "epoch": 7.948184615384616, "grad_norm": 0.3318362832069397, "learning_rate": 9.609863958962482e-05, "loss": 0.6997, "mean_token_accuracy": 0.7755364947021007, "num_tokens": 493961.0, "step": 16150 }, { "epoch": 7.953107692307692, "grad_norm": 0.435249388217926, "learning_rate": 9.606813726117223e-05, "loss": 0.5637, "mean_token_accuracy": 0.7991742443293333, "num_tokens": 501913.0, "step": 16160 }, { "epoch": 7.958030769230769, "grad_norm": 0.43728408217430115, "learning_rate": 9.603752103649194e-05, "loss": 0.7412, "mean_token_accuracy": 0.7628684055060149, "num_tokens": 510392.0, "step": 16170 }, { "epoch": 7.962953846153846, "grad_norm": 0.46618780493736267, "learning_rate": 9.600679099127774e-05, "loss": 0.7086, "mean_token_accuracy": 0.7639894340187311, "num_tokens": 519632.0, "step": 16180 }, { "epoch": 7.967876923076923, "grad_norm": 0.35183632373809814, "learning_rate": 9.597594720150485e-05, "loss": 0.6746, "mean_token_accuracy": 0.7732372462749482, "num_tokens": 528385.0, "step": 16190 }, { "epoch": 7.9728, "grad_norm": 0.42351534962654114, "learning_rate": 9.59449897434297e-05, "loss": 0.74, "mean_token_accuracy": 0.7574392698705197, "num_tokens": 537139.0, "step": 16200 }, { "epoch": 7.977723076923077, "grad_norm": 0.451408326625824, "learning_rate": 8.704204204204205e-05, "loss": 0.739, "mean_token_accuracy": 0.7462680261582136, "num_tokens": 9224.0, "step": 16210 }, { "epoch": 7.9826461538461535, "grad_norm": 0.519805908203125, "learning_rate": 8.699199199199199e-05, "loss": 0.727, "mean_token_accuracy": 0.7574180524796248, "num_tokens": 18701.0, "step": 16220 }, { "epoch": 7.987569230769231, "grad_norm": 0.36464399099349976, "learning_rate": 8.694194194194195e-05, "loss": 0.6993, "mean_token_accuracy": 0.7700857035815716, "num_tokens": 27224.0, "step": 16230 }, { "epoch": 7.992492307692308, "grad_norm": 0.2717822790145874, "learning_rate": 8.68918918918919e-05, "loss": 0.7204, "mean_token_accuracy": 0.7610609702765941, "num_tokens": 35640.0, "step": 16240 }, { "epoch": 7.997415384615385, "grad_norm": 0.3014907240867615, "learning_rate": 8.684184184184185e-05, "loss": 0.8007, "mean_token_accuracy": 0.7250883210450411, "num_tokens": 45152.0, "step": 16250 }, { "epoch": 8.002461538461539, "grad_norm": 0.4179680049419403, "learning_rate": 8.67917917917918e-05, "loss": 0.7608, "mean_token_accuracy": 0.7735646199889299, "num_tokens": 54246.0, "step": 16260 }, { "epoch": 8.007384615384616, "grad_norm": 0.506325900554657, "learning_rate": 8.674174174174175e-05, "loss": 0.8325, "mean_token_accuracy": 0.7233378864824772, "num_tokens": 63913.0, "step": 16270 }, { "epoch": 8.012307692307692, "grad_norm": 0.6368007063865662, "learning_rate": 8.66916916916917e-05, "loss": 0.7237, "mean_token_accuracy": 0.7582856122404337, "num_tokens": 72805.0, "step": 16280 }, { "epoch": 8.01723076923077, "grad_norm": 0.45158663392066956, "learning_rate": 8.664164164164165e-05, "loss": 0.744, "mean_token_accuracy": 0.7568928249180317, "num_tokens": 81952.0, "step": 16290 }, { "epoch": 8.022153846153847, "grad_norm": 0.8606657981872559, "learning_rate": 8.659159159159159e-05, "loss": 0.6465, "mean_token_accuracy": 0.78743049018085, "num_tokens": 90446.0, "step": 16300 }, { "epoch": 8.027076923076923, "grad_norm": 0.8622094392776489, "learning_rate": 8.654154154154155e-05, "loss": 0.646, "mean_token_accuracy": 0.7958425115793943, "num_tokens": 99305.0, "step": 16310 }, { "epoch": 8.032, "grad_norm": 0.37887170910835266, "learning_rate": 8.649149149149149e-05, "loss": 0.7942, "mean_token_accuracy": 0.7280906450003386, "num_tokens": 108733.0, "step": 16320 }, { "epoch": 8.036923076923078, "grad_norm": 0.4614126980304718, "learning_rate": 8.644144144144145e-05, "loss": 0.7874, "mean_token_accuracy": 0.7451971229165792, "num_tokens": 118370.0, "step": 16330 }, { "epoch": 8.041846153846153, "grad_norm": 0.5304930210113525, "learning_rate": 8.639139139139139e-05, "loss": 0.7829, "mean_token_accuracy": 0.7486832808703184, "num_tokens": 128043.0, "step": 16340 }, { "epoch": 8.04676923076923, "grad_norm": 0.7120644450187683, "learning_rate": 8.634134134134135e-05, "loss": 0.8206, "mean_token_accuracy": 0.7201329939067364, "num_tokens": 138112.0, "step": 16350 }, { "epoch": 8.051692307692308, "grad_norm": 0.40515926480293274, "learning_rate": 8.62912912912913e-05, "loss": 0.648, "mean_token_accuracy": 0.7795034911483526, "num_tokens": 146211.0, "step": 16360 }, { "epoch": 8.056615384615384, "grad_norm": 0.5807082653045654, "learning_rate": 8.624124124124125e-05, "loss": 0.7214, "mean_token_accuracy": 0.7683356497436762, "num_tokens": 155212.0, "step": 16370 }, { "epoch": 8.061538461538461, "grad_norm": 0.8227428793907166, "learning_rate": 8.61911911911912e-05, "loss": 0.7255, "mean_token_accuracy": 0.7549647618085146, "num_tokens": 164218.0, "step": 16380 }, { "epoch": 8.066461538461539, "grad_norm": 0.3668994903564453, "learning_rate": 8.614114114114115e-05, "loss": 0.6535, "mean_token_accuracy": 0.7847654249519109, "num_tokens": 173078.0, "step": 16390 }, { "epoch": 8.071384615384616, "grad_norm": 0.28024813532829285, "learning_rate": 8.609109109109109e-05, "loss": 0.6654, "mean_token_accuracy": 0.7698544282466173, "num_tokens": 181153.0, "step": 16400 }, { "epoch": 8.076307692307692, "grad_norm": 0.328283429145813, "learning_rate": 8.604104104104105e-05, "loss": 0.6977, "mean_token_accuracy": 0.7609763164073229, "num_tokens": 189338.0, "step": 16410 }, { "epoch": 8.08123076923077, "grad_norm": 0.7588334083557129, "learning_rate": 8.599099099099099e-05, "loss": 0.7558, "mean_token_accuracy": 0.742162485793233, "num_tokens": 198089.0, "step": 16420 }, { "epoch": 8.086153846153847, "grad_norm": 0.3376314342021942, "learning_rate": 8.594094094094095e-05, "loss": 0.6729, "mean_token_accuracy": 0.768299813196063, "num_tokens": 206389.0, "step": 16430 }, { "epoch": 8.091076923076923, "grad_norm": 0.5634762048721313, "learning_rate": 8.589089089089089e-05, "loss": 0.7448, "mean_token_accuracy": 0.7404033329337836, "num_tokens": 215926.0, "step": 16440 }, { "epoch": 8.096, "grad_norm": 0.3723192811012268, "learning_rate": 8.584084084084085e-05, "loss": 0.7675, "mean_token_accuracy": 0.7391767490655183, "num_tokens": 225284.0, "step": 16450 }, { "epoch": 8.100923076923078, "grad_norm": 0.3543316721916199, "learning_rate": 8.57907907907908e-05, "loss": 0.6199, "mean_token_accuracy": 0.7883176296949387, "num_tokens": 233644.0, "step": 16460 }, { "epoch": 8.105846153846153, "grad_norm": 1.3809056282043457, "learning_rate": 8.574074074074075e-05, "loss": 0.6823, "mean_token_accuracy": 0.7546458698809146, "num_tokens": 242102.0, "step": 16470 }, { "epoch": 8.11076923076923, "grad_norm": 0.4195917248725891, "learning_rate": 8.56906906906907e-05, "loss": 0.7027, "mean_token_accuracy": 0.7585832923650742, "num_tokens": 250978.0, "step": 16480 }, { "epoch": 8.115692307692308, "grad_norm": 0.5387942790985107, "learning_rate": 8.564064064064065e-05, "loss": 0.785, "mean_token_accuracy": 0.7469440281391144, "num_tokens": 259569.0, "step": 16490 }, { "epoch": 8.120615384615384, "grad_norm": 0.3012475371360779, "learning_rate": 8.559059059059059e-05, "loss": 0.8153, "mean_token_accuracy": 0.7393761333078146, "num_tokens": 269415.0, "step": 16500 }, { "epoch": 8.125538461538461, "grad_norm": 0.8275740742683411, "learning_rate": 8.554054054054055e-05, "loss": 0.7829, "mean_token_accuracy": 0.742312715575099, "num_tokens": 278571.0, "step": 16510 }, { "epoch": 8.130461538461539, "grad_norm": 1.3069651126861572, "learning_rate": 8.549049049049049e-05, "loss": 0.6394, "mean_token_accuracy": 0.7816533345729113, "num_tokens": 286512.0, "step": 16520 }, { "epoch": 8.135384615384615, "grad_norm": 0.45634856820106506, "learning_rate": 8.544044044044043e-05, "loss": 0.7042, "mean_token_accuracy": 0.765632963180542, "num_tokens": 295790.0, "step": 16530 }, { "epoch": 8.140307692307692, "grad_norm": 0.37332087755203247, "learning_rate": 8.539039039039039e-05, "loss": 0.6812, "mean_token_accuracy": 0.7715237192809582, "num_tokens": 304612.0, "step": 16540 }, { "epoch": 8.14523076923077, "grad_norm": 0.3229140043258667, "learning_rate": 8.534034034034033e-05, "loss": 0.7512, "mean_token_accuracy": 0.7543146207928657, "num_tokens": 312878.0, "step": 16550 }, { "epoch": 8.150153846153847, "grad_norm": 0.46332916617393494, "learning_rate": 8.529029029029029e-05, "loss": 0.7706, "mean_token_accuracy": 0.7599172580987215, "num_tokens": 322455.0, "step": 16560 }, { "epoch": 8.155076923076923, "grad_norm": 0.3571588695049286, "learning_rate": 8.524024024024025e-05, "loss": 0.6311, "mean_token_accuracy": 0.7764216579496861, "num_tokens": 330287.0, "step": 16570 }, { "epoch": 8.16, "grad_norm": 0.33986207842826843, "learning_rate": 8.519019019019019e-05, "loss": 0.7048, "mean_token_accuracy": 0.7667416296899319, "num_tokens": 339451.0, "step": 16580 }, { "epoch": 8.164923076923078, "grad_norm": 0.4668309688568115, "learning_rate": 8.514014014014015e-05, "loss": 0.764, "mean_token_accuracy": 0.757263046503067, "num_tokens": 348412.0, "step": 16590 }, { "epoch": 8.169846153846153, "grad_norm": 0.6498896479606628, "learning_rate": 8.509009009009009e-05, "loss": 0.7584, "mean_token_accuracy": 0.7656078919768333, "num_tokens": 357039.0, "step": 16600 }, { "epoch": 8.17476923076923, "grad_norm": 0.7268086075782776, "learning_rate": 8.504004004004005e-05, "loss": 0.6747, "mean_token_accuracy": 0.774444717913866, "num_tokens": 365355.0, "step": 16610 }, { "epoch": 8.179692307692308, "grad_norm": 0.29098740220069885, "learning_rate": 8.498998998998999e-05, "loss": 0.7868, "mean_token_accuracy": 0.736617112159729, "num_tokens": 374565.0, "step": 16620 }, { "epoch": 8.184615384615384, "grad_norm": 0.453988254070282, "learning_rate": 8.493993993993994e-05, "loss": 0.7035, "mean_token_accuracy": 0.7616805218160152, "num_tokens": 383369.0, "step": 16630 }, { "epoch": 8.189538461538461, "grad_norm": 0.5355010032653809, "learning_rate": 8.488988988988989e-05, "loss": 0.8054, "mean_token_accuracy": 0.7372288048267365, "num_tokens": 393060.0, "step": 16640 }, { "epoch": 8.194461538461539, "grad_norm": 0.25265973806381226, "learning_rate": 8.483983983983984e-05, "loss": 0.6868, "mean_token_accuracy": 0.7586379230022431, "num_tokens": 401408.0, "step": 16650 }, { "epoch": 8.199384615384615, "grad_norm": 0.3654129207134247, "learning_rate": 8.478978978978979e-05, "loss": 0.6787, "mean_token_accuracy": 0.7611869160085917, "num_tokens": 409918.0, "step": 16660 }, { "epoch": 8.204307692307692, "grad_norm": 0.4879061281681061, "learning_rate": 8.473973973973975e-05, "loss": 0.6378, "mean_token_accuracy": 0.7808506272733211, "num_tokens": 418552.0, "step": 16670 }, { "epoch": 8.20923076923077, "grad_norm": 0.32871031761169434, "learning_rate": 8.468968968968969e-05, "loss": 0.7656, "mean_token_accuracy": 0.7485352344810963, "num_tokens": 427532.0, "step": 16680 }, { "epoch": 8.214153846153847, "grad_norm": 0.4512389600276947, "learning_rate": 8.463963963963965e-05, "loss": 0.6777, "mean_token_accuracy": 0.7757456459105014, "num_tokens": 435987.0, "step": 16690 }, { "epoch": 8.219076923076923, "grad_norm": 0.698094367980957, "learning_rate": 8.458958958958959e-05, "loss": 0.5981, "mean_token_accuracy": 0.7841473259031773, "num_tokens": 444271.0, "step": 16700 }, { "epoch": 8.224, "grad_norm": 0.5681586265563965, "learning_rate": 8.453953953953955e-05, "loss": 0.7459, "mean_token_accuracy": 0.7463509045541287, "num_tokens": 453153.0, "step": 16710 }, { "epoch": 8.228923076923078, "grad_norm": 0.3863551914691925, "learning_rate": 8.448948948948949e-05, "loss": 0.7184, "mean_token_accuracy": 0.7525065660476684, "num_tokens": 462182.0, "step": 16720 }, { "epoch": 8.233846153846153, "grad_norm": 1.4121780395507812, "learning_rate": 8.443943943943944e-05, "loss": 0.7869, "mean_token_accuracy": 0.7282758131623268, "num_tokens": 471829.0, "step": 16730 }, { "epoch": 8.23876923076923, "grad_norm": 0.4115709662437439, "learning_rate": 8.438938938938939e-05, "loss": 0.6067, "mean_token_accuracy": 0.7922206796705723, "num_tokens": 479732.0, "step": 16740 }, { "epoch": 8.243692307692308, "grad_norm": 0.35427096486091614, "learning_rate": 8.433933933933934e-05, "loss": 0.731, "mean_token_accuracy": 0.7598383821547031, "num_tokens": 488481.0, "step": 16750 }, { "epoch": 8.248615384615384, "grad_norm": 0.4847518801689148, "learning_rate": 8.428928928928929e-05, "loss": 0.8492, "mean_token_accuracy": 0.7316956970840692, "num_tokens": 498612.0, "step": 16760 }, { "epoch": 8.253538461538461, "grad_norm": 0.35778024792671204, "learning_rate": 8.423923923923924e-05, "loss": 0.8278, "mean_token_accuracy": 0.7407127279788256, "num_tokens": 508317.0, "step": 16770 }, { "epoch": 8.258461538461539, "grad_norm": 0.4900796413421631, "learning_rate": 8.418918918918919e-05, "loss": 0.7169, "mean_token_accuracy": 0.7744144190102815, "num_tokens": 517266.0, "step": 16780 }, { "epoch": 8.263384615384615, "grad_norm": 0.7427136898040771, "learning_rate": 8.413913913913915e-05, "loss": 0.721, "mean_token_accuracy": 0.7743976633995772, "num_tokens": 526031.0, "step": 16790 }, { "epoch": 8.268307692307692, "grad_norm": 0.3626040518283844, "learning_rate": 8.40890890890891e-05, "loss": 0.7644, "mean_token_accuracy": 0.7453425768762827, "num_tokens": 535101.0, "step": 16800 }, { "epoch": 8.27323076923077, "grad_norm": 0.3192290961742401, "learning_rate": 8.403903903903905e-05, "loss": 0.7191, "mean_token_accuracy": 0.7587394848465919, "num_tokens": 543850.0, "step": 16810 }, { "epoch": 8.278153846153845, "grad_norm": 0.29766783118247986, "learning_rate": 8.3988988988989e-05, "loss": 0.7147, "mean_token_accuracy": 0.7568968750536442, "num_tokens": 552551.0, "step": 16820 }, { "epoch": 8.283076923076923, "grad_norm": 0.42623892426490784, "learning_rate": 8.393893893893894e-05, "loss": 0.742, "mean_token_accuracy": 0.752581474930048, "num_tokens": 561647.0, "step": 16830 }, { "epoch": 8.288, "grad_norm": 0.5091580152511597, "learning_rate": 8.38888888888889e-05, "loss": 0.74, "mean_token_accuracy": 0.7456575892865658, "num_tokens": 570203.0, "step": 16840 }, { "epoch": 8.292923076923078, "grad_norm": 0.8799173831939697, "learning_rate": 8.383883883883884e-05, "loss": 0.7269, "mean_token_accuracy": 0.7564969882369041, "num_tokens": 578387.0, "step": 16850 }, { "epoch": 8.297846153846153, "grad_norm": 0.4507330060005188, "learning_rate": 8.37887887887888e-05, "loss": 0.6896, "mean_token_accuracy": 0.7675476286560297, "num_tokens": 586819.0, "step": 16860 }, { "epoch": 8.302769230769231, "grad_norm": 0.5583937168121338, "learning_rate": 8.373873873873874e-05, "loss": 0.7218, "mean_token_accuracy": 0.7538196977227927, "num_tokens": 595719.0, "step": 16870 }, { "epoch": 8.307692307692308, "grad_norm": 0.30985692143440247, "learning_rate": 8.36886886886887e-05, "loss": 0.7591, "mean_token_accuracy": 0.7350195806473494, "num_tokens": 605157.0, "step": 16880 }, { "epoch": 8.312615384615384, "grad_norm": 0.29996439814567566, "learning_rate": 8.363863863863865e-05, "loss": 0.8303, "mean_token_accuracy": 0.72854442037642, "num_tokens": 615318.0, "step": 16890 }, { "epoch": 8.317538461538462, "grad_norm": 0.29050928354263306, "learning_rate": 8.35885885885886e-05, "loss": 0.6958, "mean_token_accuracy": 0.7601312138140202, "num_tokens": 623747.0, "step": 16900 }, { "epoch": 8.322461538461539, "grad_norm": 0.7444137334823608, "learning_rate": 8.353853853853855e-05, "loss": 0.7091, "mean_token_accuracy": 0.7593867909163237, "num_tokens": 632061.0, "step": 16910 }, { "epoch": 8.327384615384615, "grad_norm": 0.27352163195610046, "learning_rate": 8.34884884884885e-05, "loss": 0.5748, "mean_token_accuracy": 0.8001094650477171, "num_tokens": 639693.0, "step": 16920 }, { "epoch": 8.332307692307692, "grad_norm": 0.31675222516059875, "learning_rate": 8.343843843843844e-05, "loss": 0.7142, "mean_token_accuracy": 0.7502024855464697, "num_tokens": 648616.0, "step": 16930 }, { "epoch": 8.33723076923077, "grad_norm": 0.24953658878803253, "learning_rate": 8.33883883883884e-05, "loss": 0.7179, "mean_token_accuracy": 0.7595278985798359, "num_tokens": 658331.0, "step": 16940 }, { "epoch": 8.342153846153845, "grad_norm": 0.29154184460639954, "learning_rate": 8.333833833833834e-05, "loss": 0.7491, "mean_token_accuracy": 0.7431470949202776, "num_tokens": 666914.0, "step": 16950 }, { "epoch": 8.347076923076923, "grad_norm": 0.46732550859451294, "learning_rate": 8.32882882882883e-05, "loss": 0.6553, "mean_token_accuracy": 0.7763445932418108, "num_tokens": 675189.0, "step": 16960 }, { "epoch": 8.352, "grad_norm": 0.6667472720146179, "learning_rate": 8.323823823823824e-05, "loss": 0.7915, "mean_token_accuracy": 0.7494681358337403, "num_tokens": 684818.0, "step": 16970 }, { "epoch": 8.356923076923078, "grad_norm": 0.7695476412773132, "learning_rate": 8.318818818818818e-05, "loss": 0.6424, "mean_token_accuracy": 0.7838353902101517, "num_tokens": 693101.0, "step": 16980 }, { "epoch": 8.361846153846153, "grad_norm": 0.48481637239456177, "learning_rate": 8.313813813813814e-05, "loss": 0.7594, "mean_token_accuracy": 0.7688646581023931, "num_tokens": 702267.0, "step": 16990 }, { "epoch": 8.366769230769231, "grad_norm": 0.295489102602005, "learning_rate": 8.30880880880881e-05, "loss": 0.7447, "mean_token_accuracy": 0.741809818893671, "num_tokens": 710894.0, "step": 17000 }, { "epoch": 8.3712, "grad_norm": 0.34440332651138306, "learning_rate": 9.306748584382252e-05, "loss": 0.6617, "mean_token_accuracy": 0.7728524345904588, "num_tokens": 8282.0, "step": 17010 }, { "epoch": 8.376123076923077, "grad_norm": 0.46622994542121887, "learning_rate": 9.302749347659147e-05, "loss": 0.7989, "mean_token_accuracy": 0.7468677569180727, "num_tokens": 18349.0, "step": 17020 }, { "epoch": 8.381046153846153, "grad_norm": 0.679972231388092, "learning_rate": 9.298739473064651e-05, "loss": 0.774, "mean_token_accuracy": 0.748094291985035, "num_tokens": 28331.0, "step": 17030 }, { "epoch": 8.38596923076923, "grad_norm": 0.3294457495212555, "learning_rate": 9.294718970512545e-05, "loss": 0.7299, "mean_token_accuracy": 0.755809823796153, "num_tokens": 37071.0, "step": 17040 }, { "epoch": 8.390892307692308, "grad_norm": 0.3379822373390198, "learning_rate": 9.290687849942893e-05, "loss": 0.7451, "mean_token_accuracy": 0.7431266129016876, "num_tokens": 45861.0, "step": 17050 }, { "epoch": 8.395815384615384, "grad_norm": 0.3134821355342865, "learning_rate": 9.286646121322004e-05, "loss": 0.734, "mean_token_accuracy": 0.7659897316247225, "num_tokens": 55778.0, "step": 17060 }, { "epoch": 8.400738461538461, "grad_norm": 1.9757238626480103, "learning_rate": 9.282593794642423e-05, "loss": 0.741, "mean_token_accuracy": 0.760085154697299, "num_tokens": 64336.0, "step": 17070 }, { "epoch": 8.405661538461539, "grad_norm": 0.27846866846084595, "learning_rate": 9.278530879922882e-05, "loss": 0.6565, "mean_token_accuracy": 0.7792624596506357, "num_tokens": 72489.0, "step": 17080 }, { "epoch": 8.410584615384616, "grad_norm": 0.30453068017959595, "learning_rate": 9.274457387208305e-05, "loss": 0.6348, "mean_token_accuracy": 0.7794241864234209, "num_tokens": 80730.0, "step": 17090 }, { "epoch": 8.415507692307692, "grad_norm": 0.4652600884437561, "learning_rate": 9.270373326569762e-05, "loss": 0.6505, "mean_token_accuracy": 0.7694964144378901, "num_tokens": 88615.0, "step": 17100 }, { "epoch": 8.42043076923077, "grad_norm": 0.28283053636550903, "learning_rate": 9.266278708104448e-05, "loss": 0.7432, "mean_token_accuracy": 0.7504150871187448, "num_tokens": 98314.0, "step": 17110 }, { "epoch": 8.425353846153847, "grad_norm": 0.7553939819335938, "learning_rate": 9.262173541935663e-05, "loss": 0.818, "mean_token_accuracy": 0.722613125666976, "num_tokens": 108341.0, "step": 17120 }, { "epoch": 8.430276923076923, "grad_norm": 0.4199792444705963, "learning_rate": 9.25805783821279e-05, "loss": 0.7948, "mean_token_accuracy": 0.728472213447094, "num_tokens": 118063.0, "step": 17130 }, { "epoch": 8.4352, "grad_norm": 0.3130192756652832, "learning_rate": 9.253931607111256e-05, "loss": 0.773, "mean_token_accuracy": 0.7356539122760296, "num_tokens": 126793.0, "step": 17140 }, { "epoch": 8.440123076923078, "grad_norm": 0.3275775909423828, "learning_rate": 9.249794858832522e-05, "loss": 0.6469, "mean_token_accuracy": 0.7861156791448594, "num_tokens": 134697.0, "step": 17150 }, { "epoch": 8.445046153846153, "grad_norm": 0.3588021695613861, "learning_rate": 9.245647603604042e-05, "loss": 0.6775, "mean_token_accuracy": 0.7681175690144301, "num_tokens": 142697.0, "step": 17160 }, { "epoch": 8.44996923076923, "grad_norm": 1.7960641384124756, "learning_rate": 9.241489851679256e-05, "loss": 0.6297, "mean_token_accuracy": 0.787098852545023, "num_tokens": 150829.0, "step": 17170 }, { "epoch": 8.454892307692308, "grad_norm": 0.361743688583374, "learning_rate": 9.237321613337552e-05, "loss": 0.7567, "mean_token_accuracy": 0.7540980920195579, "num_tokens": 160250.0, "step": 17180 }, { "epoch": 8.459815384615384, "grad_norm": 0.35826051235198975, "learning_rate": 9.233142898884245e-05, "loss": 0.7855, "mean_token_accuracy": 0.7355491202324629, "num_tokens": 169094.0, "step": 17190 }, { "epoch": 8.464738461538461, "grad_norm": 0.46428659558296204, "learning_rate": 9.228953718650548e-05, "loss": 0.6995, "mean_token_accuracy": 0.7721015859395266, "num_tokens": 178014.0, "step": 17200 }, { "epoch": 8.469661538461539, "grad_norm": 0.39374762773513794, "learning_rate": 9.224754082993552e-05, "loss": 0.6739, "mean_token_accuracy": 0.7837085586041213, "num_tokens": 187225.0, "step": 17210 }, { "epoch": 8.474584615384614, "grad_norm": 0.5131353139877319, "learning_rate": 9.220544002296194e-05, "loss": 0.6725, "mean_token_accuracy": 0.7798098236322403, "num_tokens": 195918.0, "step": 17220 }, { "epoch": 8.479507692307692, "grad_norm": 0.2942337095737457, "learning_rate": 9.216323486967238e-05, "loss": 0.8587, "mean_token_accuracy": 0.7184354912489652, "num_tokens": 205933.0, "step": 17230 }, { "epoch": 8.48443076923077, "grad_norm": 0.6050145030021667, "learning_rate": 9.212092547441246e-05, "loss": 0.7515, "mean_token_accuracy": 0.7520445462316274, "num_tokens": 215316.0, "step": 17240 }, { "epoch": 8.489353846153847, "grad_norm": 0.34459981322288513, "learning_rate": 9.207851194178548e-05, "loss": 0.7056, "mean_token_accuracy": 0.7616019807755947, "num_tokens": 224106.0, "step": 17250 }, { "epoch": 8.494276923076923, "grad_norm": 0.3424946963787079, "learning_rate": 9.203599437665226e-05, "loss": 0.8357, "mean_token_accuracy": 0.7299704484641552, "num_tokens": 234099.0, "step": 17260 }, { "epoch": 8.4992, "grad_norm": 0.3472613990306854, "learning_rate": 9.19933728841308e-05, "loss": 0.7371, "mean_token_accuracy": 0.739522896334529, "num_tokens": 243567.0, "step": 17270 }, { "epoch": 8.504123076923078, "grad_norm": 0.4420841634273529, "learning_rate": 9.1950647569596e-05, "loss": 0.7009, "mean_token_accuracy": 0.775427482649684, "num_tokens": 252398.0, "step": 17280 }, { "epoch": 8.509046153846153, "grad_norm": 0.42884570360183716, "learning_rate": 9.19078185386795e-05, "loss": 0.7443, "mean_token_accuracy": 0.7564741510897874, "num_tokens": 262701.0, "step": 17290 }, { "epoch": 8.51396923076923, "grad_norm": 0.31011125445365906, "learning_rate": 9.186488589726937e-05, "loss": 0.7856, "mean_token_accuracy": 0.7426570508629083, "num_tokens": 272557.0, "step": 17300 }, { "epoch": 8.518892307692308, "grad_norm": 0.3713008463382721, "learning_rate": 9.18218497515098e-05, "loss": 0.7374, "mean_token_accuracy": 0.7558617364615202, "num_tokens": 282227.0, "step": 17310 }, { "epoch": 8.523815384615384, "grad_norm": 0.3813200891017914, "learning_rate": 9.17787102078009e-05, "loss": 0.7099, "mean_token_accuracy": 0.761194471269846, "num_tokens": 291044.0, "step": 17320 }, { "epoch": 8.528738461538461, "grad_norm": 0.4389830529689789, "learning_rate": 9.17354673727984e-05, "loss": 0.6955, "mean_token_accuracy": 0.7634035963565111, "num_tokens": 299331.0, "step": 17330 }, { "epoch": 8.533661538461539, "grad_norm": 0.26211830973625183, "learning_rate": 9.169212135341343e-05, "loss": 0.8433, "mean_token_accuracy": 0.7042404491454363, "num_tokens": 309303.0, "step": 17340 }, { "epoch": 8.538584615384615, "grad_norm": 0.3813954293727875, "learning_rate": 9.164867225681219e-05, "loss": 0.7424, "mean_token_accuracy": 0.7665748696774244, "num_tokens": 318409.0, "step": 17350 }, { "epoch": 8.543507692307692, "grad_norm": 0.2562119960784912, "learning_rate": 9.160512019041577e-05, "loss": 0.7056, "mean_token_accuracy": 0.7714785143733025, "num_tokens": 326915.0, "step": 17360 }, { "epoch": 8.54843076923077, "grad_norm": 0.7329946160316467, "learning_rate": 9.156146526189975e-05, "loss": 0.6707, "mean_token_accuracy": 0.7758157294243574, "num_tokens": 335886.0, "step": 17370 }, { "epoch": 8.553353846153847, "grad_norm": 0.7640717029571533, "learning_rate": 9.151770757919414e-05, "loss": 0.6965, "mean_token_accuracy": 0.7744528673589229, "num_tokens": 344820.0, "step": 17380 }, { "epoch": 8.558276923076923, "grad_norm": 0.6143106818199158, "learning_rate": 9.147384725048292e-05, "loss": 0.6567, "mean_token_accuracy": 0.7768244970589876, "num_tokens": 353154.0, "step": 17390 }, { "epoch": 8.5632, "grad_norm": 0.7920124530792236, "learning_rate": 9.142988438420383e-05, "loss": 0.6259, "mean_token_accuracy": 0.8012331046164036, "num_tokens": 361761.0, "step": 17400 }, { "epoch": 8.568123076923078, "grad_norm": 0.6481062769889832, "learning_rate": 9.138581908904818e-05, "loss": 0.7013, "mean_token_accuracy": 0.7641973450779915, "num_tokens": 371035.0, "step": 17410 }, { "epoch": 8.573046153846153, "grad_norm": 1.1524738073349, "learning_rate": 9.134165147396045e-05, "loss": 0.7164, "mean_token_accuracy": 0.7598775941878557, "num_tokens": 380223.0, "step": 17420 }, { "epoch": 8.57796923076923, "grad_norm": 0.4807913601398468, "learning_rate": 9.129738164813814e-05, "loss": 0.5843, "mean_token_accuracy": 0.7955730833113194, "num_tokens": 388359.0, "step": 17430 }, { "epoch": 8.582892307692308, "grad_norm": 0.35724836587905884, "learning_rate": 9.125300972103146e-05, "loss": 0.6942, "mean_token_accuracy": 0.7595485664904118, "num_tokens": 396973.0, "step": 17440 }, { "epoch": 8.587815384615384, "grad_norm": 0.42613887786865234, "learning_rate": 9.120853580234299e-05, "loss": 0.7747, "mean_token_accuracy": 0.7444280967116356, "num_tokens": 406644.0, "step": 17450 }, { "epoch": 8.592738461538461, "grad_norm": 0.44058769941329956, "learning_rate": 9.116396000202752e-05, "loss": 0.6742, "mean_token_accuracy": 0.771963307633996, "num_tokens": 415159.0, "step": 17460 }, { "epoch": 8.597661538461539, "grad_norm": 0.8332029581069946, "learning_rate": 9.111928243029171e-05, "loss": 0.7305, "mean_token_accuracy": 0.7594617635011673, "num_tokens": 423770.0, "step": 17470 }, { "epoch": 8.602584615384615, "grad_norm": 0.4663828909397125, "learning_rate": 9.107450319759382e-05, "loss": 0.7572, "mean_token_accuracy": 0.7509522173553705, "num_tokens": 432396.0, "step": 17480 }, { "epoch": 8.607507692307692, "grad_norm": 0.34183964133262634, "learning_rate": 9.102962241464348e-05, "loss": 0.7106, "mean_token_accuracy": 0.7614024080336094, "num_tokens": 441108.0, "step": 17490 }, { "epoch": 8.61243076923077, "grad_norm": 0.42849647998809814, "learning_rate": 9.098464019240138e-05, "loss": 0.6806, "mean_token_accuracy": 0.7648319080471992, "num_tokens": 449634.0, "step": 17500 }, { "epoch": 8.617353846153847, "grad_norm": 0.3230085074901581, "learning_rate": 9.093955664207895e-05, "loss": 0.7568, "mean_token_accuracy": 0.7501963946968317, "num_tokens": 459276.0, "step": 17510 }, { "epoch": 8.622276923076923, "grad_norm": 0.46625813841819763, "learning_rate": 9.089437187513821e-05, "loss": 0.6697, "mean_token_accuracy": 0.7733147449791431, "num_tokens": 467354.0, "step": 17520 }, { "epoch": 8.6272, "grad_norm": 0.34277865290641785, "learning_rate": 9.08490860032914e-05, "loss": 0.6949, "mean_token_accuracy": 0.7638208650052547, "num_tokens": 475322.0, "step": 17530 }, { "epoch": 8.632123076923078, "grad_norm": 0.7064818739891052, "learning_rate": 9.080369913850072e-05, "loss": 0.7679, "mean_token_accuracy": 0.7412798043340445, "num_tokens": 484199.0, "step": 17540 }, { "epoch": 8.637046153846153, "grad_norm": 0.3127409815788269, "learning_rate": 9.075821139297805e-05, "loss": 0.7316, "mean_token_accuracy": 0.7602858003228903, "num_tokens": 493997.0, "step": 17550 }, { "epoch": 8.64196923076923, "grad_norm": 0.7649181485176086, "learning_rate": 9.071262287918467e-05, "loss": 0.6458, "mean_token_accuracy": 0.7748285502195358, "num_tokens": 502572.0, "step": 17560 }, { "epoch": 8.646892307692308, "grad_norm": 0.3664408326148987, "learning_rate": 9.066693370983105e-05, "loss": 0.639, "mean_token_accuracy": 0.7795850615948439, "num_tokens": 511053.0, "step": 17570 }, { "epoch": 8.651815384615384, "grad_norm": 0.8689625263214111, "learning_rate": 9.062114399787647e-05, "loss": 0.6433, "mean_token_accuracy": 0.7788681592792273, "num_tokens": 519715.0, "step": 17580 }, { "epoch": 8.656738461538461, "grad_norm": 0.5213949084281921, "learning_rate": 9.057525385652878e-05, "loss": 0.6952, "mean_token_accuracy": 0.7548884745687247, "num_tokens": 529725.0, "step": 17590 }, { "epoch": 8.661661538461539, "grad_norm": 0.3532446622848511, "learning_rate": 9.052926339924413e-05, "loss": 0.7587, "mean_token_accuracy": 0.746376433596015, "num_tokens": 539374.0, "step": 17600 }, { "epoch": 8.666584615384615, "grad_norm": 0.36963027715682983, "learning_rate": 9.048317273972675e-05, "loss": 0.7293, "mean_token_accuracy": 0.758986271545291, "num_tokens": 548541.0, "step": 17610 }, { "epoch": 8.671507692307692, "grad_norm": 0.3961709141731262, "learning_rate": 9.043698199192849e-05, "loss": 0.7, "mean_token_accuracy": 0.767149792611599, "num_tokens": 557495.0, "step": 17620 }, { "epoch": 8.67643076923077, "grad_norm": 0.24584902822971344, "learning_rate": 9.039069127004875e-05, "loss": 0.7539, "mean_token_accuracy": 0.7440818291157484, "num_tokens": 566503.0, "step": 17630 }, { "epoch": 8.681353846153847, "grad_norm": 0.2721676230430603, "learning_rate": 9.034430068853405e-05, "loss": 0.7589, "mean_token_accuracy": 0.7534482311457396, "num_tokens": 575670.0, "step": 17640 }, { "epoch": 8.686276923076923, "grad_norm": 0.38591504096984863, "learning_rate": 9.029781036207781e-05, "loss": 0.6774, "mean_token_accuracy": 0.7606659393757582, "num_tokens": 585086.0, "step": 17650 }, { "epoch": 8.6912, "grad_norm": 0.49064919352531433, "learning_rate": 9.025122040562007e-05, "loss": 0.7503, "mean_token_accuracy": 0.7477210737764836, "num_tokens": 593883.0, "step": 17660 }, { "epoch": 8.696123076923078, "grad_norm": 0.36603498458862305, "learning_rate": 9.020453093434714e-05, "loss": 0.6822, "mean_token_accuracy": 0.767727042734623, "num_tokens": 602034.0, "step": 17670 }, { "epoch": 8.701046153846153, "grad_norm": 0.5092780590057373, "learning_rate": 9.015774206369143e-05, "loss": 0.6706, "mean_token_accuracy": 0.7680165067315101, "num_tokens": 611455.0, "step": 17680 }, { "epoch": 8.705969230769231, "grad_norm": 0.33634933829307556, "learning_rate": 9.011085390933105e-05, "loss": 0.6266, "mean_token_accuracy": 0.7781741376966238, "num_tokens": 619532.0, "step": 17690 }, { "epoch": 8.710892307692308, "grad_norm": 0.35559213161468506, "learning_rate": 9.00638665871896e-05, "loss": 0.6879, "mean_token_accuracy": 0.7744422752410174, "num_tokens": 628842.0, "step": 17700 }, { "epoch": 8.715815384615384, "grad_norm": 0.2505728602409363, "learning_rate": 9.001678021343586e-05, "loss": 0.7787, "mean_token_accuracy": 0.7485674019902945, "num_tokens": 638729.0, "step": 17710 }, { "epoch": 8.720738461538462, "grad_norm": 0.6747182011604309, "learning_rate": 8.996959490448346e-05, "loss": 0.6124, "mean_token_accuracy": 0.7967943239957094, "num_tokens": 647220.0, "step": 17720 }, { "epoch": 8.725661538461539, "grad_norm": 0.24621552228927612, "learning_rate": 8.992231077699067e-05, "loss": 0.6561, "mean_token_accuracy": 0.7866641227155924, "num_tokens": 655677.0, "step": 17730 }, { "epoch": 8.730584615384615, "grad_norm": 0.26347342133522034, "learning_rate": 8.987492794786006e-05, "loss": 0.7491, "mean_token_accuracy": 0.7501132309436798, "num_tokens": 665125.0, "step": 17740 }, { "epoch": 8.735507692307692, "grad_norm": 0.32913926243782043, "learning_rate": 8.982744653423825e-05, "loss": 0.7115, "mean_token_accuracy": 0.7552514169365168, "num_tokens": 674914.0, "step": 17750 }, { "epoch": 8.74043076923077, "grad_norm": 0.37196052074432373, "learning_rate": 8.977986665351552e-05, "loss": 0.6786, "mean_token_accuracy": 0.7645082645118236, "num_tokens": 683568.0, "step": 17760 }, { "epoch": 8.745353846153845, "grad_norm": 0.7010545134544373, "learning_rate": 8.97321884233257e-05, "loss": 0.7045, "mean_token_accuracy": 0.7673761691898108, "num_tokens": 692549.0, "step": 17770 }, { "epoch": 8.750276923076923, "grad_norm": 0.39983436465263367, "learning_rate": 8.96844119615457e-05, "loss": 0.6805, "mean_token_accuracy": 0.7654839035123586, "num_tokens": 700969.0, "step": 17780 }, { "epoch": 8.7552, "grad_norm": 0.3482830822467804, "learning_rate": 8.96365373862953e-05, "loss": 0.7268, "mean_token_accuracy": 0.7699950773268938, "num_tokens": 710766.0, "step": 17790 }, { "epoch": 8.760123076923076, "grad_norm": 0.7171288132667542, "learning_rate": 8.958856481593687e-05, "loss": 0.7709, "mean_token_accuracy": 0.7531832829117775, "num_tokens": 720097.0, "step": 17800 }, { "epoch": 8.765046153846153, "grad_norm": 0.3223002552986145, "learning_rate": 8.954049436907506e-05, "loss": 0.7723, "mean_token_accuracy": 0.7442610811442136, "num_tokens": 9539.0, "step": 17810 }, { "epoch": 8.769969230769231, "grad_norm": 0.2629016935825348, "learning_rate": 8.949232616455647e-05, "loss": 0.7714, "mean_token_accuracy": 0.7383145179599524, "num_tokens": 19114.0, "step": 17820 }, { "epoch": 8.774892307692308, "grad_norm": 0.6586378216743469, "learning_rate": 8.944406032146944e-05, "loss": 0.642, "mean_token_accuracy": 0.7885478623211384, "num_tokens": 27435.0, "step": 17830 }, { "epoch": 8.779815384615384, "grad_norm": 0.2907133400440216, "learning_rate": 8.939569695914367e-05, "loss": 0.7391, "mean_token_accuracy": 0.748593881353736, "num_tokens": 35485.0, "step": 17840 }, { "epoch": 8.784738461538462, "grad_norm": 0.28475967049598694, "learning_rate": 8.934723619714996e-05, "loss": 0.7719, "mean_token_accuracy": 0.7494703732430935, "num_tokens": 44542.0, "step": 17850 }, { "epoch": 8.789661538461539, "grad_norm": 0.47861045598983765, "learning_rate": 8.929867815529993e-05, "loss": 0.6828, "mean_token_accuracy": 0.7696808248758316, "num_tokens": 53560.0, "step": 17860 }, { "epoch": 8.794584615384615, "grad_norm": 0.25639232993125916, "learning_rate": 8.925002295364571e-05, "loss": 0.7176, "mean_token_accuracy": 0.7549582026898861, "num_tokens": 62836.0, "step": 17870 }, { "epoch": 8.799507692307692, "grad_norm": 0.27395716309547424, "learning_rate": 8.920127071247963e-05, "loss": 0.7167, "mean_token_accuracy": 0.7506252504885197, "num_tokens": 71377.0, "step": 17880 }, { "epoch": 8.80443076923077, "grad_norm": 0.26782044768333435, "learning_rate": 8.915242155233396e-05, "loss": 0.7433, "mean_token_accuracy": 0.7559556499123573, "num_tokens": 80539.0, "step": 17890 }, { "epoch": 8.809353846153845, "grad_norm": 0.31977975368499756, "learning_rate": 8.910347559398056e-05, "loss": 0.7916, "mean_token_accuracy": 0.7560835804790258, "num_tokens": 90708.0, "step": 17900 }, { "epoch": 8.814276923076923, "grad_norm": 0.492887407541275, "learning_rate": 8.905443295843061e-05, "loss": 0.6752, "mean_token_accuracy": 0.7661668874323369, "num_tokens": 99271.0, "step": 17910 }, { "epoch": 8.8192, "grad_norm": 0.32388588786125183, "learning_rate": 8.900529376693434e-05, "loss": 0.7657, "mean_token_accuracy": 0.7424514323472977, "num_tokens": 107993.0, "step": 17920 }, { "epoch": 8.824123076923076, "grad_norm": 0.4508485496044159, "learning_rate": 8.895605814098064e-05, "loss": 0.8702, "mean_token_accuracy": 0.7194077134132385, "num_tokens": 118063.0, "step": 17930 }, { "epoch": 8.829046153846154, "grad_norm": 0.3053992986679077, "learning_rate": 8.89067262022969e-05, "loss": 0.6951, "mean_token_accuracy": 0.7752657104283571, "num_tokens": 126208.0, "step": 17940 }, { "epoch": 8.833969230769231, "grad_norm": 0.3835429251194, "learning_rate": 8.885729807284856e-05, "loss": 0.7114, "mean_token_accuracy": 0.7549926679581404, "num_tokens": 134733.0, "step": 17950 }, { "epoch": 8.838892307692308, "grad_norm": 0.21046239137649536, "learning_rate": 8.880777387483888e-05, "loss": 0.7411, "mean_token_accuracy": 0.7572793487459422, "num_tokens": 143481.0, "step": 17960 }, { "epoch": 8.843815384615384, "grad_norm": 0.2608044445514679, "learning_rate": 8.875815373070868e-05, "loss": 0.7923, "mean_token_accuracy": 0.7279406886547803, "num_tokens": 152666.0, "step": 17970 }, { "epoch": 8.848738461538462, "grad_norm": 0.35140207409858704, "learning_rate": 8.870843776313598e-05, "loss": 0.7014, "mean_token_accuracy": 0.7720128271728754, "num_tokens": 161509.0, "step": 17980 }, { "epoch": 8.85366153846154, "grad_norm": 0.2577463984489441, "learning_rate": 8.865862609503566e-05, "loss": 0.7647, "mean_token_accuracy": 0.7505464531481266, "num_tokens": 170704.0, "step": 17990 }, { "epoch": 8.858584615384615, "grad_norm": 0.4267882704734802, "learning_rate": 8.860871884955925e-05, "loss": 0.694, "mean_token_accuracy": 0.7794535614550113, "num_tokens": 179267.0, "step": 18000 }, { "epoch": 8.863507692307692, "grad_norm": 0.3592469394207001, "learning_rate": 8.855871615009459e-05, "loss": 0.6028, "mean_token_accuracy": 0.8042227383702993, "num_tokens": 187517.0, "step": 18010 }, { "epoch": 8.86843076923077, "grad_norm": 0.30718037486076355, "learning_rate": 8.850861812026548e-05, "loss": 0.8283, "mean_token_accuracy": 0.7271805927157402, "num_tokens": 197279.0, "step": 18020 }, { "epoch": 8.873353846153845, "grad_norm": 0.429610937833786, "learning_rate": 8.845842488393141e-05, "loss": 0.7049, "mean_token_accuracy": 0.7623873326927424, "num_tokens": 206305.0, "step": 18030 }, { "epoch": 8.878276923076923, "grad_norm": 0.3396170139312744, "learning_rate": 8.840813656518728e-05, "loss": 0.7685, "mean_token_accuracy": 0.7294337477535009, "num_tokens": 215493.0, "step": 18040 }, { "epoch": 8.8832, "grad_norm": 0.2978787124156952, "learning_rate": 8.835775328836306e-05, "loss": 0.658, "mean_token_accuracy": 0.7802751030772924, "num_tokens": 223108.0, "step": 18050 }, { "epoch": 8.888123076923076, "grad_norm": 0.3851058781147003, "learning_rate": 8.830727517802347e-05, "loss": 0.7847, "mean_token_accuracy": 0.7197605889290571, "num_tokens": 233031.0, "step": 18060 }, { "epoch": 8.893046153846154, "grad_norm": 0.8238245248794556, "learning_rate": 8.82567023589677e-05, "loss": 0.7422, "mean_token_accuracy": 0.7518959946930408, "num_tokens": 241782.0, "step": 18070 }, { "epoch": 8.897969230769231, "grad_norm": 0.8435314297676086, "learning_rate": 8.820603495622912e-05, "loss": 0.6705, "mean_token_accuracy": 0.7846408020704985, "num_tokens": 250884.0, "step": 18080 }, { "epoch": 8.902892307692309, "grad_norm": 0.6095793843269348, "learning_rate": 8.81552730950749e-05, "loss": 0.6823, "mean_token_accuracy": 0.775208180397749, "num_tokens": 259293.0, "step": 18090 }, { "epoch": 8.907815384615384, "grad_norm": 0.27715322375297546, "learning_rate": 8.810441690100575e-05, "loss": 0.8093, "mean_token_accuracy": 0.7308334667235613, "num_tokens": 269988.0, "step": 18100 }, { "epoch": 8.912738461538462, "grad_norm": 0.39033225178718567, "learning_rate": 8.805346649975565e-05, "loss": 0.7432, "mean_token_accuracy": 0.7343447051942349, "num_tokens": 279011.0, "step": 18110 }, { "epoch": 8.91766153846154, "grad_norm": 0.3180443048477173, "learning_rate": 8.800242201729141e-05, "loss": 0.6186, "mean_token_accuracy": 0.7984683159738779, "num_tokens": 287209.0, "step": 18120 }, { "epoch": 8.922584615384615, "grad_norm": 0.46220842003822327, "learning_rate": 8.795128357981253e-05, "loss": 0.7108, "mean_token_accuracy": 0.7846879895776511, "num_tokens": 296052.0, "step": 18130 }, { "epoch": 8.927507692307692, "grad_norm": 0.5963321328163147, "learning_rate": 8.790005131375074e-05, "loss": 0.6761, "mean_token_accuracy": 0.7786654643714428, "num_tokens": 304656.0, "step": 18140 }, { "epoch": 8.93243076923077, "grad_norm": 1.1265039443969727, "learning_rate": 8.784872534576978e-05, "loss": 0.8073, "mean_token_accuracy": 0.7344447121024131, "num_tokens": 314434.0, "step": 18150 }, { "epoch": 8.937353846153846, "grad_norm": 0.4689125120639801, "learning_rate": 8.779730580276501e-05, "loss": 0.7303, "mean_token_accuracy": 0.7458708386868238, "num_tokens": 322607.0, "step": 18160 }, { "epoch": 8.942276923076923, "grad_norm": 0.2908729612827301, "learning_rate": 8.774579281186319e-05, "loss": 0.6007, "mean_token_accuracy": 0.7946537002921105, "num_tokens": 330677.0, "step": 18170 }, { "epoch": 8.9472, "grad_norm": 0.3566810190677643, "learning_rate": 8.76941865004221e-05, "loss": 0.7285, "mean_token_accuracy": 0.7698224943131209, "num_tokens": 340641.0, "step": 18180 }, { "epoch": 8.952123076923076, "grad_norm": 0.35960811376571655, "learning_rate": 8.76424869960302e-05, "loss": 0.5989, "mean_token_accuracy": 0.7901786677539349, "num_tokens": 348726.0, "step": 18190 }, { "epoch": 8.957046153846154, "grad_norm": 0.38176608085632324, "learning_rate": 8.75906944265064e-05, "loss": 0.7193, "mean_token_accuracy": 0.7635401219129563, "num_tokens": 357104.0, "step": 18200 }, { "epoch": 8.961969230769231, "grad_norm": 0.2969922423362732, "learning_rate": 8.753880891989972e-05, "loss": 0.6534, "mean_token_accuracy": 0.7754518780857325, "num_tokens": 8926.0, "step": 18210 }, { "epoch": 8.966892307692309, "grad_norm": 0.4824042022228241, "learning_rate": 8.748683060448886e-05, "loss": 0.6409, "mean_token_accuracy": 0.7802204493433237, "num_tokens": 17538.0, "step": 18220 }, { "epoch": 8.971815384615384, "grad_norm": 0.28852856159210205, "learning_rate": 8.743475960878209e-05, "loss": 0.7454, "mean_token_accuracy": 0.7477544978260994, "num_tokens": 26410.0, "step": 18230 }, { "epoch": 8.976738461538462, "grad_norm": 0.32177209854125977, "learning_rate": 8.738259606151672e-05, "loss": 0.7582, "mean_token_accuracy": 0.7423262905329466, "num_tokens": 35455.0, "step": 18240 }, { "epoch": 8.98166153846154, "grad_norm": 0.2885516285896301, "learning_rate": 8.733034009165894e-05, "loss": 0.7031, "mean_token_accuracy": 0.7598079223185777, "num_tokens": 45252.0, "step": 18250 }, { "epoch": 8.986584615384615, "grad_norm": 0.2479109764099121, "learning_rate": 8.727799182840344e-05, "loss": 0.7091, "mean_token_accuracy": 0.7633897583931685, "num_tokens": 53904.0, "step": 18260 }, { "epoch": 8.991507692307692, "grad_norm": 0.4614357650279999, "learning_rate": 8.722555140117303e-05, "loss": 0.6807, "mean_token_accuracy": 0.7741472873836756, "num_tokens": 62135.0, "step": 18270 }, { "epoch": 8.99643076923077, "grad_norm": 0.24129466712474823, "learning_rate": 8.717301893961844e-05, "loss": 0.7596, "mean_token_accuracy": 0.7397197656333446, "num_tokens": 70828.0, "step": 18280 }, { "epoch": 9.001476923076924, "grad_norm": 0.3193853497505188, "learning_rate": 8.712039457361795e-05, "loss": 0.8814, "mean_token_accuracy": 0.7301918130095412, "num_tokens": 80962.0, "step": 18290 }, { "epoch": 9.0064, "grad_norm": 0.5235174894332886, "learning_rate": 8.7067678433277e-05, "loss": 0.6238, "mean_token_accuracy": 0.7859724014997482, "num_tokens": 88659.0, "step": 18300 }, { "epoch": 9.011323076923077, "grad_norm": 0.6154448986053467, "learning_rate": 8.701487064892797e-05, "loss": 0.6343, "mean_token_accuracy": 0.7893827341496944, "num_tokens": 97611.0, "step": 18310 }, { "epoch": 9.016246153846154, "grad_norm": 0.4159059226512909, "learning_rate": 8.69619713511298e-05, "loss": 0.7506, "mean_token_accuracy": 0.73648741543293, "num_tokens": 107105.0, "step": 18320 }, { "epoch": 9.02116923076923, "grad_norm": 0.3411385118961334, "learning_rate": 8.690898067066771e-05, "loss": 0.7644, "mean_token_accuracy": 0.7501700416207313, "num_tokens": 116559.0, "step": 18330 }, { "epoch": 9.026092307692307, "grad_norm": 0.36136379837989807, "learning_rate": 8.68558987385528e-05, "loss": 0.8201, "mean_token_accuracy": 0.716056851670146, "num_tokens": 125666.0, "step": 18340 }, { "epoch": 9.031015384615385, "grad_norm": 0.5617722272872925, "learning_rate": 8.680272568602181e-05, "loss": 0.7773, "mean_token_accuracy": 0.7351651962846517, "num_tokens": 135346.0, "step": 18350 }, { "epoch": 9.035938461538462, "grad_norm": 0.9331738948822021, "learning_rate": 8.674946164453677e-05, "loss": 0.6327, "mean_token_accuracy": 0.7781557217240334, "num_tokens": 144168.0, "step": 18360 }, { "epoch": 9.040861538461538, "grad_norm": 0.7562224268913269, "learning_rate": 8.669610674578463e-05, "loss": 0.699, "mean_token_accuracy": 0.7659288670867681, "num_tokens": 153162.0, "step": 18370 }, { "epoch": 9.045784615384616, "grad_norm": 0.2653519809246063, "learning_rate": 8.664266112167702e-05, "loss": 0.7789, "mean_token_accuracy": 0.7399741619825363, "num_tokens": 162494.0, "step": 18380 }, { "epoch": 9.050707692307693, "grad_norm": 0.4635138213634491, "learning_rate": 8.658912490434981e-05, "loss": 0.7132, "mean_token_accuracy": 0.7637255847454071, "num_tokens": 171751.0, "step": 18390 }, { "epoch": 9.055630769230769, "grad_norm": 1.0278699398040771, "learning_rate": 8.653549822616289e-05, "loss": 0.6387, "mean_token_accuracy": 0.7829495001584291, "num_tokens": 180110.0, "step": 18400 }, { "epoch": 9.060553846153846, "grad_norm": 0.44372498989105225, "learning_rate": 8.648178121969978e-05, "loss": 0.6717, "mean_token_accuracy": 0.7774417765438557, "num_tokens": 188293.0, "step": 18410 }, { "epoch": 9.065476923076924, "grad_norm": 0.5009580254554749, "learning_rate": 8.642797401776739e-05, "loss": 0.7577, "mean_token_accuracy": 0.7463389489799738, "num_tokens": 197442.0, "step": 18420 }, { "epoch": 9.0704, "grad_norm": 0.3624105751514435, "learning_rate": 8.63740767533955e-05, "loss": 0.7365, "mean_token_accuracy": 0.7567340433597565, "num_tokens": 206170.0, "step": 18430 }, { "epoch": 9.075323076923077, "grad_norm": 0.40718990564346313, "learning_rate": 8.632008955983667e-05, "loss": 0.7613, "mean_token_accuracy": 0.7605375040322542, "num_tokens": 215198.0, "step": 18440 }, { "epoch": 9.080246153846154, "grad_norm": 0.8007605075836182, "learning_rate": 8.626601257056573e-05, "loss": 0.6795, "mean_token_accuracy": 0.7678759694099426, "num_tokens": 223760.0, "step": 18450 }, { "epoch": 9.08516923076923, "grad_norm": 0.6228090524673462, "learning_rate": 8.621184591927953e-05, "loss": 0.7174, "mean_token_accuracy": 0.7554727476090193, "num_tokens": 232476.0, "step": 18460 }, { "epoch": 9.090092307692307, "grad_norm": 0.36368465423583984, "learning_rate": 8.61575897398966e-05, "loss": 0.7264, "mean_token_accuracy": 0.7487952623516321, "num_tokens": 242567.0, "step": 18470 }, { "epoch": 9.095015384615385, "grad_norm": 0.6309615969657898, "learning_rate": 8.610324416655684e-05, "loss": 0.6797, "mean_token_accuracy": 0.7755587588995695, "num_tokens": 251964.0, "step": 18480 }, { "epoch": 9.09993846153846, "grad_norm": 0.5346474647521973, "learning_rate": 8.604880933362113e-05, "loss": 0.6778, "mean_token_accuracy": 0.7865086987614631, "num_tokens": 260748.0, "step": 18490 }, { "epoch": 9.104861538461538, "grad_norm": 0.7138431668281555, "learning_rate": 8.599428537567101e-05, "loss": 0.681, "mean_token_accuracy": 0.7724569093436002, "num_tokens": 269283.0, "step": 18500 }, { "epoch": 9.109784615384616, "grad_norm": 0.2846992611885071, "learning_rate": 8.593967242750843e-05, "loss": 0.7066, "mean_token_accuracy": 0.7593025963753461, "num_tokens": 278494.0, "step": 18510 }, { "epoch": 9.114707692307693, "grad_norm": 0.2537672817707062, "learning_rate": 8.588497062415528e-05, "loss": 0.7057, "mean_token_accuracy": 0.7501687645912171, "num_tokens": 288579.0, "step": 18520 }, { "epoch": 9.119630769230769, "grad_norm": 0.3954591453075409, "learning_rate": 8.583018010085321e-05, "loss": 0.7496, "mean_token_accuracy": 0.7489098712801934, "num_tokens": 298019.0, "step": 18530 }, { "epoch": 9.124553846153846, "grad_norm": 1.1767851114273071, "learning_rate": 8.577530099306317e-05, "loss": 0.6797, "mean_token_accuracy": 0.7796317916363478, "num_tokens": 307575.0, "step": 18540 }, { "epoch": 9.129476923076924, "grad_norm": 0.7717283964157104, "learning_rate": 8.57203334364651e-05, "loss": 0.7018, "mean_token_accuracy": 0.763766011595726, "num_tokens": 316091.0, "step": 18550 }, { "epoch": 9.1344, "grad_norm": 0.366485595703125, "learning_rate": 8.566527756695766e-05, "loss": 0.6554, "mean_token_accuracy": 0.7748038172721863, "num_tokens": 324292.0, "step": 18560 }, { "epoch": 9.139323076923077, "grad_norm": 0.44988542795181274, "learning_rate": 8.561013352065783e-05, "loss": 0.7434, "mean_token_accuracy": 0.7497165717184544, "num_tokens": 332960.0, "step": 18570 }, { "epoch": 9.144246153846154, "grad_norm": 0.27599868178367615, "learning_rate": 8.555490143390062e-05, "loss": 0.6943, "mean_token_accuracy": 0.7611289013177156, "num_tokens": 341446.0, "step": 18580 }, { "epoch": 9.14916923076923, "grad_norm": 0.29391446709632874, "learning_rate": 8.549958144323862e-05, "loss": 0.6971, "mean_token_accuracy": 0.7690398130565882, "num_tokens": 349712.0, "step": 18590 }, { "epoch": 9.154092307692308, "grad_norm": 0.30475255846977234, "learning_rate": 8.544417368544189e-05, "loss": 0.7287, "mean_token_accuracy": 0.7482383538037538, "num_tokens": 359021.0, "step": 18600 }, { "epoch": 9.158523076923077, "grad_norm": 0.5902156829833984, "learning_rate": 8.538867829749734e-05, "loss": 0.7084, "mean_token_accuracy": 0.7611913044005633, "num_tokens": 9586.0, "step": 18610 }, { "epoch": 9.163446153846154, "grad_norm": 0.38383767008781433, "learning_rate": 8.533309541660863e-05, "loss": 0.7506, "mean_token_accuracy": 0.7360954392701388, "num_tokens": 18015.0, "step": 18620 }, { "epoch": 9.168369230769231, "grad_norm": 0.27133414149284363, "learning_rate": 8.527742518019567e-05, "loss": 0.6913, "mean_token_accuracy": 0.7697105508297681, "num_tokens": 27033.0, "step": 18630 }, { "epoch": 9.173292307692307, "grad_norm": 0.3312305510044098, "learning_rate": 8.52216677258944e-05, "loss": 0.7035, "mean_token_accuracy": 0.7611101619899273, "num_tokens": 36045.0, "step": 18640 }, { "epoch": 9.178215384615385, "grad_norm": 0.7461434602737427, "learning_rate": 8.516582319155633e-05, "loss": 0.7247, "mean_token_accuracy": 0.7545670151710511, "num_tokens": 44409.0, "step": 18650 }, { "epoch": 9.183138461538462, "grad_norm": 0.4726799726486206, "learning_rate": 8.51098917152483e-05, "loss": 0.7093, "mean_token_accuracy": 0.7689370591193437, "num_tokens": 53497.0, "step": 18660 }, { "epoch": 9.188061538461538, "grad_norm": 0.26350924372673035, "learning_rate": 8.505387343525209e-05, "loss": 0.7406, "mean_token_accuracy": 0.766264171525836, "num_tokens": 61939.0, "step": 18670 }, { "epoch": 9.192984615384615, "grad_norm": 0.20984847843647003, "learning_rate": 8.49977684900641e-05, "loss": 0.6724, "mean_token_accuracy": 0.7731727968901396, "num_tokens": 70391.0, "step": 18680 }, { "epoch": 9.197907692307693, "grad_norm": 0.2626084089279175, "learning_rate": 8.4941577018395e-05, "loss": 0.6348, "mean_token_accuracy": 0.7817917808890342, "num_tokens": 78312.0, "step": 18690 }, { "epoch": 9.202830769230768, "grad_norm": 0.29489243030548096, "learning_rate": 8.488529915916936e-05, "loss": 0.746, "mean_token_accuracy": 0.7493366193026304, "num_tokens": 87869.0, "step": 18700 }, { "epoch": 9.207753846153846, "grad_norm": 0.4739731550216675, "learning_rate": 8.482893505152533e-05, "loss": 0.7485, "mean_token_accuracy": 0.751647999510169, "num_tokens": 96885.0, "step": 18710 }, { "epoch": 9.212676923076923, "grad_norm": 0.8138965368270874, "learning_rate": 8.47724848348143e-05, "loss": 0.6989, "mean_token_accuracy": 0.7752762287855148, "num_tokens": 106453.0, "step": 18720 }, { "epoch": 9.2176, "grad_norm": 0.3375241160392761, "learning_rate": 8.471594864860058e-05, "loss": 0.7631, "mean_token_accuracy": 0.7568930108100176, "num_tokens": 116549.0, "step": 18730 }, { "epoch": 9.222523076923077, "grad_norm": 0.2639356851577759, "learning_rate": 8.4659326632661e-05, "loss": 0.6026, "mean_token_accuracy": 0.7832478541880846, "num_tokens": 124510.0, "step": 18740 }, { "epoch": 9.227446153846154, "grad_norm": 0.915088951587677, "learning_rate": 8.460261892698457e-05, "loss": 0.6784, "mean_token_accuracy": 0.772139797359705, "num_tokens": 132722.0, "step": 18750 }, { "epoch": 9.232369230769232, "grad_norm": 0.45034798979759216, "learning_rate": 8.454582567177223e-05, "loss": 0.5967, "mean_token_accuracy": 0.8007356438785791, "num_tokens": 141020.0, "step": 18760 }, { "epoch": 9.237292307692307, "grad_norm": 0.23716862499713898, "learning_rate": 8.44889470074363e-05, "loss": 0.6551, "mean_token_accuracy": 0.7765100870281458, "num_tokens": 149320.0, "step": 18770 }, { "epoch": 9.242215384615385, "grad_norm": 0.2853967845439911, "learning_rate": 8.443198307460041e-05, "loss": 0.7346, "mean_token_accuracy": 0.7537438083440066, "num_tokens": 158038.0, "step": 18780 }, { "epoch": 9.247138461538462, "grad_norm": 0.40558820962905884, "learning_rate": 8.437493401409888e-05, "loss": 0.6459, "mean_token_accuracy": 0.7761574640870095, "num_tokens": 166488.0, "step": 18790 }, { "epoch": 9.252061538461538, "grad_norm": 0.3173389434814453, "learning_rate": 8.431779996697656e-05, "loss": 0.6979, "mean_token_accuracy": 0.7614830315113068, "num_tokens": 175398.0, "step": 18800 }, { "epoch": 9.256984615384615, "grad_norm": 0.31110990047454834, "learning_rate": 8.426058107448841e-05, "loss": 0.7819, "mean_token_accuracy": 0.7374692268669605, "num_tokens": 184697.0, "step": 18810 }, { "epoch": 9.261907692307693, "grad_norm": 0.23878473043441772, "learning_rate": 8.420327747809913e-05, "loss": 0.732, "mean_token_accuracy": 0.7535562068223953, "num_tokens": 193847.0, "step": 18820 }, { "epoch": 9.266830769230769, "grad_norm": 0.28896352648735046, "learning_rate": 8.414588931948287e-05, "loss": 0.6439, "mean_token_accuracy": 0.7868727888911963, "num_tokens": 202991.0, "step": 18830 }, { "epoch": 9.271753846153846, "grad_norm": 0.43781089782714844, "learning_rate": 8.408841674052284e-05, "loss": 0.8251, "mean_token_accuracy": 0.7127380024641752, "num_tokens": 212459.0, "step": 18840 }, { "epoch": 9.276676923076923, "grad_norm": 0.3533414900302887, "learning_rate": 8.403085988331092e-05, "loss": 0.7372, "mean_token_accuracy": 0.7454831000417471, "num_tokens": 221274.0, "step": 18850 }, { "epoch": 9.2816, "grad_norm": 0.35645604133605957, "learning_rate": 8.397321889014743e-05, "loss": 0.7142, "mean_token_accuracy": 0.7632349513471126, "num_tokens": 229612.0, "step": 18860 }, { "epoch": 9.286523076923077, "grad_norm": 0.4839099645614624, "learning_rate": 8.391549390354061e-05, "loss": 0.685, "mean_token_accuracy": 0.7794561486691236, "num_tokens": 239028.0, "step": 18870 }, { "epoch": 9.291446153846154, "grad_norm": 0.8532965183258057, "learning_rate": 8.385768506620649e-05, "loss": 0.7402, "mean_token_accuracy": 0.7652015954256057, "num_tokens": 248482.0, "step": 18880 }, { "epoch": 9.296369230769232, "grad_norm": 0.2782347798347473, "learning_rate": 8.379979252106829e-05, "loss": 0.6769, "mean_token_accuracy": 0.7735106501728296, "num_tokens": 256626.0, "step": 18890 }, { "epoch": 9.301292307692307, "grad_norm": 0.2272525280714035, "learning_rate": 8.374181641125622e-05, "loss": 0.7279, "mean_token_accuracy": 0.7602387875318527, "num_tokens": 265897.0, "step": 18900 }, { "epoch": 9.306215384615385, "grad_norm": 0.3278202414512634, "learning_rate": 8.368375688010712e-05, "loss": 0.7268, "mean_token_accuracy": 0.7507894467562437, "num_tokens": 275027.0, "step": 18910 }, { "epoch": 9.311138461538462, "grad_norm": 0.26526904106140137, "learning_rate": 8.362561407116405e-05, "loss": 0.6761, "mean_token_accuracy": 0.7650868054479361, "num_tokens": 284258.0, "step": 18920 }, { "epoch": 9.316061538461538, "grad_norm": 0.4439482092857361, "learning_rate": 8.356738812817596e-05, "loss": 0.7357, "mean_token_accuracy": 0.7472224164754152, "num_tokens": 293076.0, "step": 18930 }, { "epoch": 9.320984615384615, "grad_norm": 0.27552875876426697, "learning_rate": 8.350907919509734e-05, "loss": 0.6793, "mean_token_accuracy": 0.7668122231960297, "num_tokens": 301359.0, "step": 18940 }, { "epoch": 9.325907692307693, "grad_norm": 0.268410325050354, "learning_rate": 8.345068741608786e-05, "loss": 0.7784, "mean_token_accuracy": 0.7451492633670569, "num_tokens": 310342.0, "step": 18950 }, { "epoch": 9.330830769230769, "grad_norm": 0.3308853209018707, "learning_rate": 8.339221293551203e-05, "loss": 0.6681, "mean_token_accuracy": 0.7704876314848661, "num_tokens": 318452.0, "step": 18960 }, { "epoch": 9.335753846153846, "grad_norm": 0.25952640175819397, "learning_rate": 8.33336558979388e-05, "loss": 0.7042, "mean_token_accuracy": 0.7535649377852678, "num_tokens": 328195.0, "step": 18970 }, { "epoch": 9.340676923076924, "grad_norm": 0.2396654337644577, "learning_rate": 8.327501644814122e-05, "loss": 0.6969, "mean_token_accuracy": 0.7682115890085697, "num_tokens": 337053.0, "step": 18980 }, { "epoch": 9.3456, "grad_norm": 0.32446226477622986, "learning_rate": 8.321629473109615e-05, "loss": 0.6289, "mean_token_accuracy": 0.7851255543529987, "num_tokens": 345015.0, "step": 18990 }, { "epoch": 9.350523076923077, "grad_norm": 0.2200402021408081, "learning_rate": 8.315749089198378e-05, "loss": 0.849, "mean_token_accuracy": 0.714352885633707, "num_tokens": 355063.0, "step": 19000 }, { "epoch": 9.355446153846154, "grad_norm": 0.40607473254203796, "learning_rate": 8.309860507618737e-05, "loss": 0.7187, "mean_token_accuracy": 0.7435123972594738, "num_tokens": 364147.0, "step": 19010 }, { "epoch": 9.36036923076923, "grad_norm": 0.2307461053133011, "learning_rate": 8.303963742929284e-05, "loss": 0.7594, "mean_token_accuracy": 0.7431487880647183, "num_tokens": 373084.0, "step": 19020 }, { "epoch": 9.365292307692307, "grad_norm": 0.36096903681755066, "learning_rate": 8.298058809708842e-05, "loss": 0.8165, "mean_token_accuracy": 0.7338841069489718, "num_tokens": 383053.0, "step": 19030 }, { "epoch": 9.370215384615385, "grad_norm": 0.4592624604701996, "learning_rate": 8.292145722556431e-05, "loss": 0.6305, "mean_token_accuracy": 0.7788219083100557, "num_tokens": 391397.0, "step": 19040 }, { "epoch": 9.375138461538462, "grad_norm": 0.24138374626636505, "learning_rate": 8.286224496091228e-05, "loss": 0.7563, "mean_token_accuracy": 0.7563010204583407, "num_tokens": 401057.0, "step": 19050 }, { "epoch": 9.380061538461538, "grad_norm": 1.3317973613739014, "learning_rate": 8.280295144952536e-05, "loss": 0.874, "mean_token_accuracy": 0.7216722797602415, "num_tokens": 411806.0, "step": 19060 }, { "epoch": 9.384984615384615, "grad_norm": 0.24880658090114594, "learning_rate": 8.274357683799744e-05, "loss": 0.6631, "mean_token_accuracy": 0.7717092610895634, "num_tokens": 420346.0, "step": 19070 }, { "epoch": 9.389907692307693, "grad_norm": 0.28920841217041016, "learning_rate": 8.268412127312293e-05, "loss": 0.7502, "mean_token_accuracy": 0.7458052407950163, "num_tokens": 429329.0, "step": 19080 }, { "epoch": 9.394830769230769, "grad_norm": 0.8361232280731201, "learning_rate": 8.262458490189633e-05, "loss": 0.7201, "mean_token_accuracy": 0.7693693403154611, "num_tokens": 438637.0, "step": 19090 }, { "epoch": 9.399753846153846, "grad_norm": 1.081518292427063, "learning_rate": 8.256496787151197e-05, "loss": 0.73, "mean_token_accuracy": 0.758531778678298, "num_tokens": 447629.0, "step": 19100 }, { "epoch": 9.404676923076924, "grad_norm": 0.3594396412372589, "learning_rate": 8.250527032936359e-05, "loss": 0.6957, "mean_token_accuracy": 0.7701297465711832, "num_tokens": 455960.0, "step": 19110 }, { "epoch": 9.4096, "grad_norm": 0.2965713143348694, "learning_rate": 8.244549242304399e-05, "loss": 0.6313, "mean_token_accuracy": 0.7778208505362272, "num_tokens": 464243.0, "step": 19120 }, { "epoch": 9.414523076923077, "grad_norm": 0.44778725504875183, "learning_rate": 8.238563430034463e-05, "loss": 0.6234, "mean_token_accuracy": 0.7768757071346044, "num_tokens": 472177.0, "step": 19130 }, { "epoch": 9.419446153846154, "grad_norm": 0.4347788393497467, "learning_rate": 8.232569610925533e-05, "loss": 0.7254, "mean_token_accuracy": 0.7469818860292434, "num_tokens": 481557.0, "step": 19140 }, { "epoch": 9.42436923076923, "grad_norm": 0.2500903010368347, "learning_rate": 8.226567799796383e-05, "loss": 0.782, "mean_token_accuracy": 0.7323465205729007, "num_tokens": 491491.0, "step": 19150 }, { "epoch": 9.429292307692307, "grad_norm": 0.2921452820301056, "learning_rate": 8.220558011485546e-05, "loss": 0.7998, "mean_token_accuracy": 0.7310435988008976, "num_tokens": 501074.0, "step": 19160 }, { "epoch": 9.434215384615385, "grad_norm": 0.2585694491863251, "learning_rate": 8.21454026085128e-05, "loss": 0.7696, "mean_token_accuracy": 0.7234194416552782, "num_tokens": 509884.0, "step": 19170 }, { "epoch": 9.439138461538462, "grad_norm": 0.24239717423915863, "learning_rate": 8.208514562771532e-05, "loss": 0.671, "mean_token_accuracy": 0.7750304438173771, "num_tokens": 518146.0, "step": 19180 }, { "epoch": 9.444061538461538, "grad_norm": 0.21721267700195312, "learning_rate": 8.202480932143887e-05, "loss": 0.6366, "mean_token_accuracy": 0.7861224085092544, "num_tokens": 526055.0, "step": 19190 }, { "epoch": 9.448984615384616, "grad_norm": 0.8764368891716003, "learning_rate": 8.19643938388555e-05, "loss": 0.6603, "mean_token_accuracy": 0.7765530787408352, "num_tokens": 534409.0, "step": 19200 }, { "epoch": 9.453907692307693, "grad_norm": 0.28634384274482727, "learning_rate": 8.190389932933301e-05, "loss": 0.6941, "mean_token_accuracy": 0.7651392992585897, "num_tokens": 543311.0, "step": 19210 }, { "epoch": 9.458830769230769, "grad_norm": 0.30995234847068787, "learning_rate": 8.184332594243455e-05, "loss": 0.7696, "mean_token_accuracy": 0.7483407512307167, "num_tokens": 552293.0, "step": 19220 }, { "epoch": 9.463753846153846, "grad_norm": 0.26820695400238037, "learning_rate": 8.17826738279183e-05, "loss": 0.7136, "mean_token_accuracy": 0.7572588924318552, "num_tokens": 560875.0, "step": 19230 }, { "epoch": 9.468676923076924, "grad_norm": 0.3167503774166107, "learning_rate": 8.172194313573711e-05, "loss": 0.6687, "mean_token_accuracy": 0.78301134519279, "num_tokens": 570337.0, "step": 19240 }, { "epoch": 9.4736, "grad_norm": 0.38579657673835754, "learning_rate": 8.166113401603802e-05, "loss": 0.6541, "mean_token_accuracy": 0.7858757961541414, "num_tokens": 578355.0, "step": 19250 }, { "epoch": 9.478523076923077, "grad_norm": 0.2878792881965637, "learning_rate": 8.160024661916204e-05, "loss": 0.836, "mean_token_accuracy": 0.7330463856458664, "num_tokens": 588762.0, "step": 19260 }, { "epoch": 9.483446153846154, "grad_norm": 0.3626146614551544, "learning_rate": 8.153928109564369e-05, "loss": 0.8072, "mean_token_accuracy": 0.7299764085561037, "num_tokens": 598467.0, "step": 19270 }, { "epoch": 9.48836923076923, "grad_norm": 0.2230752855539322, "learning_rate": 8.147823759621063e-05, "loss": 0.6656, "mean_token_accuracy": 0.7746995214372874, "num_tokens": 607228.0, "step": 19280 }, { "epoch": 9.493292307692307, "grad_norm": 0.35763120651245117, "learning_rate": 8.141711627178335e-05, "loss": 0.7943, "mean_token_accuracy": 0.7384316265583039, "num_tokens": 616620.0, "step": 19290 }, { "epoch": 9.498215384615385, "grad_norm": 0.31609174609184265, "learning_rate": 8.135591727347469e-05, "loss": 0.7832, "mean_token_accuracy": 0.731552030518651, "num_tokens": 627052.0, "step": 19300 }, { "epoch": 9.503138461538462, "grad_norm": 0.278390496969223, "learning_rate": 8.129464075258956e-05, "loss": 0.7252, "mean_token_accuracy": 0.7618256121873855, "num_tokens": 636036.0, "step": 19310 }, { "epoch": 9.508061538461538, "grad_norm": 0.3673849105834961, "learning_rate": 8.123328686062453e-05, "loss": 0.6438, "mean_token_accuracy": 0.7896918896585703, "num_tokens": 645343.0, "step": 19320 }, { "epoch": 9.512984615384616, "grad_norm": 0.33690834045410156, "learning_rate": 8.117185574926744e-05, "loss": 0.8169, "mean_token_accuracy": 0.7260203436017036, "num_tokens": 655914.0, "step": 19330 }, { "epoch": 9.517907692307693, "grad_norm": 0.38085484504699707, "learning_rate": 8.111034757039707e-05, "loss": 0.7446, "mean_token_accuracy": 0.7486122488975525, "num_tokens": 665555.0, "step": 19340 }, { "epoch": 9.522830769230769, "grad_norm": 0.8052934408187866, "learning_rate": 8.10487624760827e-05, "loss": 0.7267, "mean_token_accuracy": 0.7574852678924799, "num_tokens": 674482.0, "step": 19350 }, { "epoch": 9.527753846153846, "grad_norm": 0.25485706329345703, "learning_rate": 8.098710061858381e-05, "loss": 0.6928, "mean_token_accuracy": 0.7583078496158123, "num_tokens": 682777.0, "step": 19360 }, { "epoch": 9.532676923076924, "grad_norm": 0.2488587498664856, "learning_rate": 8.092536215034967e-05, "loss": 0.7838, "mean_token_accuracy": 0.7227123014628887, "num_tokens": 692407.0, "step": 19370 }, { "epoch": 9.5376, "grad_norm": 1.3520376682281494, "learning_rate": 8.086354722401892e-05, "loss": 0.7324, "mean_token_accuracy": 0.7700716838240623, "num_tokens": 701713.0, "step": 19380 }, { "epoch": 9.542523076923077, "grad_norm": 0.278576135635376, "learning_rate": 8.080165599241924e-05, "loss": 0.7461, "mean_token_accuracy": 0.755218057706952, "num_tokens": 710344.0, "step": 19390 }, { "epoch": 9.547446153846154, "grad_norm": 0.3666495680809021, "learning_rate": 8.0739688608567e-05, "loss": 0.672, "mean_token_accuracy": 0.7804633747786284, "num_tokens": 719045.0, "step": 19400 } ], "logging_steps": 10, "max_steps": 20000, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.126844244746281e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }