diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" new file mode 100644--- /dev/null +++ "b/last-checkpoint/trainer_state.json" @@ -0,0 +1,27034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2997152704930316, + "eval_steps": 500, + "global_step": 3000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 9.990509016434387e-05, + "grad_norm": 43.9115182777984, + "learning_rate": 0.0, + "loss": 3.9277, + "mean_token_accuracy": 0.465808629989624, + "num_tokens": 81392.0, + "step": 1 + }, + { + "epoch": 0.00019981018032868775, + "grad_norm": 37.74709273976156, + "learning_rate": 3.322259136212625e-08, + "loss": 3.8838, + "mean_token_accuracy": 0.4678333252668381, + "num_tokens": 162900.0, + "step": 2 + }, + { + "epoch": 0.0002997152704930316, + "grad_norm": 38.30973515949859, + "learning_rate": 6.64451827242525e-08, + "loss": 3.8863, + "mean_token_accuracy": 0.47671647369861603, + "num_tokens": 244469.0, + "step": 3 + }, + { + "epoch": 0.0003996203606573755, + "grad_norm": 42.86286839418935, + "learning_rate": 9.966777408637874e-08, + "loss": 3.9051, + "mean_token_accuracy": 0.46781882643699646, + "num_tokens": 325984.0, + "step": 4 + }, + { + "epoch": 0.0004995254508217194, + "grad_norm": 42.82387797713526, + "learning_rate": 1.32890365448505e-07, + "loss": 3.8761, + "mean_token_accuracy": 0.47103650867938995, + "num_tokens": 407551.0, + "step": 5 + }, + { + "epoch": 0.0005994305409860632, + "grad_norm": 39.141341404000684, + "learning_rate": 1.6611295681063126e-07, + "loss": 3.8935, + "mean_token_accuracy": 0.4719069302082062, + "num_tokens": 489037.0, + "step": 6 + }, + { + "epoch": 0.0006993356311504072, + "grad_norm": 40.3153565516347, + "learning_rate": 1.9933554817275749e-07, + "loss": 3.8537, + "mean_token_accuracy": 0.4766451567411423, + "num_tokens": 570643.0, + "step": 7 + }, + { + "epoch": 0.000799240721314751, + "grad_norm": 39.292570945003696, + "learning_rate": 2.3255813953488374e-07, + "loss": 3.8826, + "mean_token_accuracy": 0.4751485139131546, + "num_tokens": 652191.0, + "step": 8 + }, + { + "epoch": 0.0008991458114790948, + "grad_norm": 40.82490066943282, + "learning_rate": 2.6578073089701e-07, + "loss": 3.8503, + "mean_token_accuracy": 0.4755466729402542, + "num_tokens": 733747.0, + "step": 9 + }, + { + "epoch": 0.0009990509016434388, + "grad_norm": 41.140536945650574, + "learning_rate": 2.9900332225913623e-07, + "loss": 3.8527, + "mean_token_accuracy": 0.4712029695510864, + "num_tokens": 815299.0, + "step": 10 + }, + { + "epoch": 0.0010989559918077826, + "grad_norm": 44.26888688807489, + "learning_rate": 3.322259136212625e-07, + "loss": 3.852, + "mean_token_accuracy": 0.47195324301719666, + "num_tokens": 896835.0, + "step": 11 + }, + { + "epoch": 0.0011988610819721264, + "grad_norm": 41.608815276303424, + "learning_rate": 3.654485049833888e-07, + "loss": 3.8464, + "mean_token_accuracy": 0.4719657748937607, + "num_tokens": 978375.0, + "step": 12 + }, + { + "epoch": 0.0012987661721364703, + "grad_norm": 38.39621112846363, + "learning_rate": 3.9867109634551497e-07, + "loss": 3.8194, + "mean_token_accuracy": 0.47000326216220856, + "num_tokens": 1059788.0, + "step": 13 + }, + { + "epoch": 0.0013986712623008143, + "grad_norm": 37.89670690018434, + "learning_rate": 4.318936877076412e-07, + "loss": 3.7153, + "mean_token_accuracy": 0.48281629383563995, + "num_tokens": 1141376.0, + "step": 14 + }, + { + "epoch": 0.0014985763524651581, + "grad_norm": 36.75297703709166, + "learning_rate": 4.651162790697675e-07, + "loss": 3.7288, + "mean_token_accuracy": 0.4805563688278198, + "num_tokens": 1222945.0, + "step": 15 + }, + { + "epoch": 0.001598481442629502, + "grad_norm": 39.02327978742656, + "learning_rate": 4.983388704318938e-07, + "loss": 3.7095, + "mean_token_accuracy": 0.4870526194572449, + "num_tokens": 1304498.0, + "step": 16 + }, + { + "epoch": 0.0016983865327938458, + "grad_norm": 34.36225999991617, + "learning_rate": 5.3156146179402e-07, + "loss": 3.5074, + "mean_token_accuracy": 0.5072232633829117, + "num_tokens": 1386034.0, + "step": 17 + }, + { + "epoch": 0.0017982916229581896, + "grad_norm": 32.235350405745756, + "learning_rate": 5.647840531561462e-07, + "loss": 3.4477, + "mean_token_accuracy": 0.5128206312656403, + "num_tokens": 1467646.0, + "step": 18 + }, + { + "epoch": 0.0018981967131225337, + "grad_norm": 32.084682697022984, + "learning_rate": 5.980066445182725e-07, + "loss": 3.4355, + "mean_token_accuracy": 0.5076811015605927, + "num_tokens": 1549267.0, + "step": 19 + }, + { + "epoch": 0.0019981018032868775, + "grad_norm": 31.38192627224552, + "learning_rate": 6.312292358803987e-07, + "loss": 3.4316, + "mean_token_accuracy": 0.5104040205478668, + "num_tokens": 1630740.0, + "step": 20 + }, + { + "epoch": 0.0020980068934512213, + "grad_norm": 31.921542972233137, + "learning_rate": 6.64451827242525e-07, + "loss": 3.3834, + "mean_token_accuracy": 0.5278559625148773, + "num_tokens": 1712290.0, + "step": 21 + }, + { + "epoch": 0.002197911983615565, + "grad_norm": 33.17070548391948, + "learning_rate": 6.976744186046513e-07, + "loss": 3.389, + "mean_token_accuracy": 0.5249148607254028, + "num_tokens": 1793765.0, + "step": 22 + }, + { + "epoch": 0.002297817073779909, + "grad_norm": 26.60392173316673, + "learning_rate": 7.308970099667776e-07, + "loss": 2.9522, + "mean_token_accuracy": 0.5449841320514679, + "num_tokens": 1875292.0, + "step": 23 + }, + { + "epoch": 0.002397722163944253, + "grad_norm": 23.828909279049984, + "learning_rate": 7.641196013289037e-07, + "loss": 2.76, + "mean_token_accuracy": 0.5776057541370392, + "num_tokens": 1956912.0, + "step": 24 + }, + { + "epoch": 0.0024976272541085967, + "grad_norm": 26.422204139080858, + "learning_rate": 7.973421926910299e-07, + "loss": 2.74, + "mean_token_accuracy": 0.5723378658294678, + "num_tokens": 2038450.0, + "step": 25 + }, + { + "epoch": 0.0025975323442729405, + "grad_norm": 32.91258033566478, + "learning_rate": 8.305647840531563e-07, + "loss": 2.7143, + "mean_token_accuracy": 0.5759356021881104, + "num_tokens": 2119973.0, + "step": 26 + }, + { + "epoch": 0.0026974374344372848, + "grad_norm": 26.63225702040197, + "learning_rate": 8.637873754152824e-07, + "loss": 2.67, + "mean_token_accuracy": 0.5795794129371643, + "num_tokens": 2201527.0, + "step": 27 + }, + { + "epoch": 0.0027973425246016286, + "grad_norm": 24.029918059497426, + "learning_rate": 8.970099667774087e-07, + "loss": 2.5758, + "mean_token_accuracy": 0.5822240114212036, + "num_tokens": 2283033.0, + "step": 28 + }, + { + "epoch": 0.0028972476147659724, + "grad_norm": 23.280529392386658, + "learning_rate": 9.30232558139535e-07, + "loss": 2.5187, + "mean_token_accuracy": 0.5885159075260162, + "num_tokens": 2364513.0, + "step": 29 + }, + { + "epoch": 0.0029971527049303163, + "grad_norm": 26.772126368836208, + "learning_rate": 9.634551495016612e-07, + "loss": 2.5066, + "mean_token_accuracy": 0.5899362564086914, + "num_tokens": 2445940.0, + "step": 30 + }, + { + "epoch": 0.00309705779509466, + "grad_norm": 19.66662168014691, + "learning_rate": 9.966777408637875e-07, + "loss": 2.0835, + "mean_token_accuracy": 0.6304643750190735, + "num_tokens": 2527512.0, + "step": 31 + }, + { + "epoch": 0.003196962885259004, + "grad_norm": 18.90208434969219, + "learning_rate": 1.0299003322259137e-06, + "loss": 1.8643, + "mean_token_accuracy": 0.6693356037139893, + "num_tokens": 2609012.0, + "step": 32 + }, + { + "epoch": 0.0032968679754233478, + "grad_norm": 14.209745891685431, + "learning_rate": 1.06312292358804e-06, + "loss": 1.7555, + "mean_token_accuracy": 0.6931554973125458, + "num_tokens": 2690476.0, + "step": 33 + }, + { + "epoch": 0.0033967730655876916, + "grad_norm": 11.97781038165495, + "learning_rate": 1.0963455149501661e-06, + "loss": 1.6951, + "mean_token_accuracy": 0.6984198689460754, + "num_tokens": 2771950.0, + "step": 34 + }, + { + "epoch": 0.0034966781557520354, + "grad_norm": 16.38627364321793, + "learning_rate": 1.1295681063122925e-06, + "loss": 1.6556, + "mean_token_accuracy": 0.7129922211170197, + "num_tokens": 2853436.0, + "step": 35 + }, + { + "epoch": 0.0035965832459163793, + "grad_norm": 15.834713161953818, + "learning_rate": 1.1627906976744188e-06, + "loss": 1.6239, + "mean_token_accuracy": 0.725387454032898, + "num_tokens": 2934942.0, + "step": 36 + }, + { + "epoch": 0.0036964883360807235, + "grad_norm": 13.259418464254, + "learning_rate": 1.196013289036545e-06, + "loss": 1.5793, + "mean_token_accuracy": 0.7409835755825043, + "num_tokens": 3016537.0, + "step": 37 + }, + { + "epoch": 0.0037963934262450674, + "grad_norm": 10.959099917718067, + "learning_rate": 1.2292358803986712e-06, + "loss": 1.5448, + "mean_token_accuracy": 0.759746640920639, + "num_tokens": 3098058.0, + "step": 38 + }, + { + "epoch": 0.003896298516409411, + "grad_norm": 11.036655360932409, + "learning_rate": 1.2624584717607974e-06, + "loss": 1.5072, + "mean_token_accuracy": 0.761745810508728, + "num_tokens": 3179638.0, + "step": 39 + }, + { + "epoch": 0.003996203606573755, + "grad_norm": 9.29780962477271, + "learning_rate": 1.2956810631229235e-06, + "loss": 1.4532, + "mean_token_accuracy": 0.7701937854290009, + "num_tokens": 3261183.0, + "step": 40 + }, + { + "epoch": 0.004096108696738099, + "grad_norm": 8.728700758357963, + "learning_rate": 1.32890365448505e-06, + "loss": 1.4206, + "mean_token_accuracy": 0.7790756821632385, + "num_tokens": 3342671.0, + "step": 41 + }, + { + "epoch": 0.004196013786902443, + "grad_norm": 8.41506174042118, + "learning_rate": 1.3621262458471762e-06, + "loss": 1.3555, + "mean_token_accuracy": 0.8088809847831726, + "num_tokens": 3424144.0, + "step": 42 + }, + { + "epoch": 0.0042959188770667865, + "grad_norm": 7.057953950116716, + "learning_rate": 1.3953488372093025e-06, + "loss": 1.3019, + "mean_token_accuracy": 0.8170831203460693, + "num_tokens": 3505633.0, + "step": 43 + }, + { + "epoch": 0.00439582396723113, + "grad_norm": 8.923031414440144, + "learning_rate": 1.4285714285714286e-06, + "loss": 1.2821, + "mean_token_accuracy": 0.8145962655544281, + "num_tokens": 3587214.0, + "step": 44 + }, + { + "epoch": 0.004495729057395474, + "grad_norm": 7.812470531603665, + "learning_rate": 1.4617940199335552e-06, + "loss": 1.2162, + "mean_token_accuracy": 0.826030433177948, + "num_tokens": 3668730.0, + "step": 45 + }, + { + "epoch": 0.004595634147559818, + "grad_norm": 9.552296102764695, + "learning_rate": 1.4950166112956813e-06, + "loss": 1.2416, + "mean_token_accuracy": 0.8193812072277069, + "num_tokens": 3750216.0, + "step": 46 + }, + { + "epoch": 0.004695539237724162, + "grad_norm": 7.320748570604718, + "learning_rate": 1.5282392026578074e-06, + "loss": 1.2173, + "mean_token_accuracy": 0.8202420771121979, + "num_tokens": 3831640.0, + "step": 47 + }, + { + "epoch": 0.004795444327888506, + "grad_norm": 6.662393421002141, + "learning_rate": 1.5614617940199335e-06, + "loss": 1.1872, + "mean_token_accuracy": 0.825910896062851, + "num_tokens": 3913140.0, + "step": 48 + }, + { + "epoch": 0.0048953494180528495, + "grad_norm": 8.006168930262433, + "learning_rate": 1.5946843853820599e-06, + "loss": 1.1749, + "mean_token_accuracy": 0.8289189636707306, + "num_tokens": 3994706.0, + "step": 49 + }, + { + "epoch": 0.004995254508217193, + "grad_norm": 11.065532889785691, + "learning_rate": 1.6279069767441862e-06, + "loss": 1.165, + "mean_token_accuracy": 0.8307726383209229, + "num_tokens": 4076246.0, + "step": 50 + }, + { + "epoch": 0.005095159598381537, + "grad_norm": 7.849416070532809, + "learning_rate": 1.6611295681063126e-06, + "loss": 1.1455, + "mean_token_accuracy": 0.8311631679534912, + "num_tokens": 4157755.0, + "step": 51 + }, + { + "epoch": 0.005195064688545881, + "grad_norm": 8.505510980529811, + "learning_rate": 1.6943521594684387e-06, + "loss": 1.1504, + "mean_token_accuracy": 0.8321413099765778, + "num_tokens": 4239173.0, + "step": 52 + }, + { + "epoch": 0.005294969778710226, + "grad_norm": 9.14164254991678, + "learning_rate": 1.7275747508305648e-06, + "loss": 1.1355, + "mean_token_accuracy": 0.828074723482132, + "num_tokens": 4320636.0, + "step": 53 + }, + { + "epoch": 0.0053948748688745695, + "grad_norm": 6.831209775842347, + "learning_rate": 1.7607973421926911e-06, + "loss": 1.135, + "mean_token_accuracy": 0.8350955545902252, + "num_tokens": 4402031.0, + "step": 54 + }, + { + "epoch": 0.005494779959038913, + "grad_norm": 6.176658057109726, + "learning_rate": 1.7940199335548175e-06, + "loss": 1.085, + "mean_token_accuracy": 0.8409371376037598, + "num_tokens": 4483673.0, + "step": 55 + }, + { + "epoch": 0.005594685049203257, + "grad_norm": 7.877855450340788, + "learning_rate": 1.8272425249169438e-06, + "loss": 1.1048, + "mean_token_accuracy": 0.8359730243682861, + "num_tokens": 4565076.0, + "step": 56 + }, + { + "epoch": 0.005694590139367601, + "grad_norm": 6.535351473977476, + "learning_rate": 1.86046511627907e-06, + "loss": 1.1003, + "mean_token_accuracy": 0.8392694890499115, + "num_tokens": 4646568.0, + "step": 57 + }, + { + "epoch": 0.005794495229531945, + "grad_norm": 8.543943966443743, + "learning_rate": 1.893687707641196e-06, + "loss": 1.0571, + "mean_token_accuracy": 0.8445670306682587, + "num_tokens": 4728045.0, + "step": 58 + }, + { + "epoch": 0.005894400319696289, + "grad_norm": 10.787014907406606, + "learning_rate": 1.9269102990033224e-06, + "loss": 1.0462, + "mean_token_accuracy": 0.8457443714141846, + "num_tokens": 4809551.0, + "step": 59 + }, + { + "epoch": 0.0059943054098606325, + "grad_norm": 11.592754295920857, + "learning_rate": 1.9601328903654487e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.8463897109031677, + "num_tokens": 4891131.0, + "step": 60 + }, + { + "epoch": 0.006094210500024976, + "grad_norm": 7.521673028863906, + "learning_rate": 1.993355481727575e-06, + "loss": 1.0213, + "mean_token_accuracy": 0.8493745625019073, + "num_tokens": 4972652.0, + "step": 61 + }, + { + "epoch": 0.00619411559018932, + "grad_norm": 11.058124317095917, + "learning_rate": 2.026578073089701e-06, + "loss": 1.0071, + "mean_token_accuracy": 0.850467175245285, + "num_tokens": 5054113.0, + "step": 62 + }, + { + "epoch": 0.006294020680353664, + "grad_norm": 8.693224631039815, + "learning_rate": 2.0598006644518273e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.8521609902381897, + "num_tokens": 5135687.0, + "step": 63 + }, + { + "epoch": 0.006393925770518008, + "grad_norm": 10.723748902098743, + "learning_rate": 2.0930232558139536e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.8534142673015594, + "num_tokens": 5217209.0, + "step": 64 + }, + { + "epoch": 0.006493830860682352, + "grad_norm": 8.961714414853386, + "learning_rate": 2.12624584717608e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.8512806594371796, + "num_tokens": 5298764.0, + "step": 65 + }, + { + "epoch": 0.0065937359508466955, + "grad_norm": 9.67980887950298, + "learning_rate": 2.1594684385382063e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.8502772748470306, + "num_tokens": 5380318.0, + "step": 66 + }, + { + "epoch": 0.006693641041011039, + "grad_norm": 7.734538167939381, + "learning_rate": 2.1926910299003322e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.8527667820453644, + "num_tokens": 5461921.0, + "step": 67 + }, + { + "epoch": 0.006793546131175383, + "grad_norm": 7.319487105504349, + "learning_rate": 2.2259136212624586e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.8567566871643066, + "num_tokens": 5543457.0, + "step": 68 + }, + { + "epoch": 0.006893451221339727, + "grad_norm": 13.01219819651779, + "learning_rate": 2.259136212624585e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.8570312857627869, + "num_tokens": 5624923.0, + "step": 69 + }, + { + "epoch": 0.006993356311504071, + "grad_norm": 7.857188550956097, + "learning_rate": 2.2923588039867112e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.8571699857711792, + "num_tokens": 5706407.0, + "step": 70 + }, + { + "epoch": 0.007093261401668415, + "grad_norm": 30.32385097190717, + "learning_rate": 2.3255813953488376e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.859236866235733, + "num_tokens": 5787861.0, + "step": 71 + }, + { + "epoch": 0.0071931664918327585, + "grad_norm": 8.372536369618517, + "learning_rate": 2.3588039867109635e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.8609906136989594, + "num_tokens": 5869373.0, + "step": 72 + }, + { + "epoch": 0.007293071581997102, + "grad_norm": 9.299480486444535, + "learning_rate": 2.39202657807309e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.861969381570816, + "num_tokens": 5950898.0, + "step": 73 + }, + { + "epoch": 0.007392976672161447, + "grad_norm": 8.208781970209937, + "learning_rate": 2.425249169435216e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.8616667091846466, + "num_tokens": 6032378.0, + "step": 74 + }, + { + "epoch": 0.007492881762325791, + "grad_norm": 9.900277793808131, + "learning_rate": 2.4584717607973425e-06, + "loss": 0.8663, + "mean_token_accuracy": 0.8635565042495728, + "num_tokens": 6113891.0, + "step": 75 + }, + { + "epoch": 0.007592786852490135, + "grad_norm": 9.63578104581745, + "learning_rate": 2.4916943521594684e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.8593670129776001, + "num_tokens": 6195322.0, + "step": 76 + }, + { + "epoch": 0.0076926919426544785, + "grad_norm": 9.844911095169229, + "learning_rate": 2.5249169435215947e-06, + "loss": 0.83, + "mean_token_accuracy": 0.8663856685161591, + "num_tokens": 6276933.0, + "step": 77 + }, + { + "epoch": 0.007792597032818822, + "grad_norm": 10.336577957774587, + "learning_rate": 2.558139534883721e-06, + "loss": 0.8234, + "mean_token_accuracy": 0.8646323978900909, + "num_tokens": 6358445.0, + "step": 78 + }, + { + "epoch": 0.007892502122983166, + "grad_norm": 12.075336792817218, + "learning_rate": 2.591362126245847e-06, + "loss": 0.8161, + "mean_token_accuracy": 0.8765458166599274, + "num_tokens": 6439953.0, + "step": 79 + }, + { + "epoch": 0.00799240721314751, + "grad_norm": 12.337915989566497, + "learning_rate": 2.6245847176079738e-06, + "loss": 0.8033, + "mean_token_accuracy": 0.8767749667167664, + "num_tokens": 6521450.0, + "step": 80 + }, + { + "epoch": 0.008092312303311854, + "grad_norm": 11.765390071418135, + "learning_rate": 2.6578073089701e-06, + "loss": 0.7779, + "mean_token_accuracy": 0.878826767206192, + "num_tokens": 6603008.0, + "step": 81 + }, + { + "epoch": 0.008192217393476198, + "grad_norm": 11.317496011223984, + "learning_rate": 2.691029900332226e-06, + "loss": 0.7632, + "mean_token_accuracy": 0.8773795068264008, + "num_tokens": 6684631.0, + "step": 82 + }, + { + "epoch": 0.008292122483640542, + "grad_norm": 16.16569204622234, + "learning_rate": 2.7242524916943523e-06, + "loss": 0.7552, + "mean_token_accuracy": 0.8783608973026276, + "num_tokens": 6766164.0, + "step": 83 + }, + { + "epoch": 0.008392027573804885, + "grad_norm": 8.551877296899367, + "learning_rate": 2.7574750830564782e-06, + "loss": 0.7608, + "mean_token_accuracy": 0.8798766434192657, + "num_tokens": 6847609.0, + "step": 84 + }, + { + "epoch": 0.00849193266396923, + "grad_norm": 6.5040137846827735, + "learning_rate": 2.790697674418605e-06, + "loss": 0.7318, + "mean_token_accuracy": 0.8791915774345398, + "num_tokens": 6929171.0, + "step": 85 + }, + { + "epoch": 0.008591837754133573, + "grad_norm": 8.354578794725885, + "learning_rate": 2.8239202657807313e-06, + "loss": 0.734, + "mean_token_accuracy": 0.8795005083084106, + "num_tokens": 7010692.0, + "step": 86 + }, + { + "epoch": 0.008691742844297917, + "grad_norm": 10.326374251139793, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.7261, + "mean_token_accuracy": 0.8780491352081299, + "num_tokens": 7092185.0, + "step": 87 + }, + { + "epoch": 0.00879164793446226, + "grad_norm": 6.503911093583005, + "learning_rate": 2.8903654485049836e-06, + "loss": 0.7207, + "mean_token_accuracy": 0.8827347457408905, + "num_tokens": 7173734.0, + "step": 88 + }, + { + "epoch": 0.008891553024626605, + "grad_norm": 7.335614399375306, + "learning_rate": 2.9235880398671104e-06, + "loss": 0.72, + "mean_token_accuracy": 0.8803918063640594, + "num_tokens": 7255178.0, + "step": 89 + }, + { + "epoch": 0.008991458114790948, + "grad_norm": 7.548381923263952, + "learning_rate": 2.9568106312292363e-06, + "loss": 0.7077, + "mean_token_accuracy": 0.8818941712379456, + "num_tokens": 7336695.0, + "step": 90 + }, + { + "epoch": 0.009091363204955292, + "grad_norm": 6.573389320247024, + "learning_rate": 2.9900332225913626e-06, + "loss": 0.6974, + "mean_token_accuracy": 0.880117267370224, + "num_tokens": 7418282.0, + "step": 91 + }, + { + "epoch": 0.009191268295119636, + "grad_norm": 9.167536479460843, + "learning_rate": 3.0232558139534885e-06, + "loss": 0.6912, + "mean_token_accuracy": 0.8811139464378357, + "num_tokens": 7499715.0, + "step": 92 + }, + { + "epoch": 0.00929117338528398, + "grad_norm": 10.067944027976768, + "learning_rate": 3.056478405315615e-06, + "loss": 0.6617, + "mean_token_accuracy": 0.8841680586338043, + "num_tokens": 7581318.0, + "step": 93 + }, + { + "epoch": 0.009391078475448324, + "grad_norm": 8.568259390670722, + "learning_rate": 3.089700996677741e-06, + "loss": 0.6599, + "mean_token_accuracy": 0.8807838559150696, + "num_tokens": 7662827.0, + "step": 94 + }, + { + "epoch": 0.009490983565612668, + "grad_norm": 33.56661885934262, + "learning_rate": 3.122923588039867e-06, + "loss": 0.6398, + "mean_token_accuracy": 0.8989840149879456, + "num_tokens": 7744342.0, + "step": 95 + }, + { + "epoch": 0.009590888655777011, + "grad_norm": 9.332729424765182, + "learning_rate": 3.156146179401994e-06, + "loss": 0.6239, + "mean_token_accuracy": 0.8966100215911865, + "num_tokens": 7825834.0, + "step": 96 + }, + { + "epoch": 0.009690793745941355, + "grad_norm": 8.796571125238339, + "learning_rate": 3.1893687707641198e-06, + "loss": 0.6054, + "mean_token_accuracy": 0.8971054255962372, + "num_tokens": 7907349.0, + "step": 97 + }, + { + "epoch": 0.009790698836105699, + "grad_norm": 5.657766289674898, + "learning_rate": 3.222591362126246e-06, + "loss": 0.5878, + "mean_token_accuracy": 0.8972251415252686, + "num_tokens": 7988775.0, + "step": 98 + }, + { + "epoch": 0.009890603926270043, + "grad_norm": 21.34009314322161, + "learning_rate": 3.2558139534883724e-06, + "loss": 0.5634, + "mean_token_accuracy": 0.9005087912082672, + "num_tokens": 8070321.0, + "step": 99 + }, + { + "epoch": 0.009990509016434387, + "grad_norm": 4.378816047270294, + "learning_rate": 3.2890365448504984e-06, + "loss": 0.5642, + "mean_token_accuracy": 0.8971885144710541, + "num_tokens": 8151925.0, + "step": 100 + }, + { + "epoch": 0.01009041410659873, + "grad_norm": 3.393227988936752, + "learning_rate": 3.322259136212625e-06, + "loss": 0.5723, + "mean_token_accuracy": 0.8964665234088898, + "num_tokens": 8233367.0, + "step": 101 + }, + { + "epoch": 0.010190319196763074, + "grad_norm": 4.462554813543325, + "learning_rate": 3.355481727574751e-06, + "loss": 0.5655, + "mean_token_accuracy": 0.8976671993732452, + "num_tokens": 8314894.0, + "step": 102 + }, + { + "epoch": 0.010290224286927418, + "grad_norm": 4.868300077490772, + "learning_rate": 3.3887043189368774e-06, + "loss": 0.5602, + "mean_token_accuracy": 0.8965710699558258, + "num_tokens": 8396466.0, + "step": 103 + }, + { + "epoch": 0.010390129377091762, + "grad_norm": 3.3691004753172598, + "learning_rate": 3.4219269102990037e-06, + "loss": 0.5688, + "mean_token_accuracy": 0.8979867398738861, + "num_tokens": 8477964.0, + "step": 104 + }, + { + "epoch": 0.010490034467256106, + "grad_norm": 3.35195427225628, + "learning_rate": 3.4551495016611296e-06, + "loss": 0.559, + "mean_token_accuracy": 0.9003684520721436, + "num_tokens": 8559473.0, + "step": 105 + }, + { + "epoch": 0.010589939557420451, + "grad_norm": 4.1556317811376555, + "learning_rate": 3.4883720930232564e-06, + "loss": 0.5645, + "mean_token_accuracy": 0.8967815041542053, + "num_tokens": 8641029.0, + "step": 106 + }, + { + "epoch": 0.010689844647584795, + "grad_norm": 6.608950464668345, + "learning_rate": 3.5215946843853823e-06, + "loss": 0.5583, + "mean_token_accuracy": 0.896422266960144, + "num_tokens": 8722628.0, + "step": 107 + }, + { + "epoch": 0.010789749737749139, + "grad_norm": 6.106723817290499, + "learning_rate": 3.5548172757475086e-06, + "loss": 0.5627, + "mean_token_accuracy": 0.8964198231697083, + "num_tokens": 8804195.0, + "step": 108 + }, + { + "epoch": 0.010889654827913483, + "grad_norm": 5.5585778267124235, + "learning_rate": 3.588039867109635e-06, + "loss": 0.5538, + "mean_token_accuracy": 0.8983577489852905, + "num_tokens": 8885852.0, + "step": 109 + }, + { + "epoch": 0.010989559918077827, + "grad_norm": 3.6779925623395875, + "learning_rate": 3.621262458471761e-06, + "loss": 0.5666, + "mean_token_accuracy": 0.8986986577510834, + "num_tokens": 8967366.0, + "step": 110 + }, + { + "epoch": 0.01108946500824217, + "grad_norm": 7.05706336904852, + "learning_rate": 3.6544850498338876e-06, + "loss": 0.5639, + "mean_token_accuracy": 0.8947836458683014, + "num_tokens": 9048876.0, + "step": 111 + }, + { + "epoch": 0.011189370098406514, + "grad_norm": 4.918749615770082, + "learning_rate": 3.6877076411960135e-06, + "loss": 0.5638, + "mean_token_accuracy": 0.8982287049293518, + "num_tokens": 9130392.0, + "step": 112 + }, + { + "epoch": 0.011289275188570858, + "grad_norm": 5.349275235857383, + "learning_rate": 3.72093023255814e-06, + "loss": 0.5568, + "mean_token_accuracy": 0.8981278240680695, + "num_tokens": 9211875.0, + "step": 113 + }, + { + "epoch": 0.011389180278735202, + "grad_norm": 3.523958187539459, + "learning_rate": 3.754152823920266e-06, + "loss": 0.5576, + "mean_token_accuracy": 0.8982071876525879, + "num_tokens": 9293432.0, + "step": 114 + }, + { + "epoch": 0.011489085368899546, + "grad_norm": 11.404819438790442, + "learning_rate": 3.787375415282392e-06, + "loss": 0.552, + "mean_token_accuracy": 0.9010749459266663, + "num_tokens": 9375021.0, + "step": 115 + }, + { + "epoch": 0.01158899045906389, + "grad_norm": 3.6608482400816498, + "learning_rate": 3.8205980066445185e-06, + "loss": 0.5516, + "mean_token_accuracy": 0.897555023431778, + "num_tokens": 9456617.0, + "step": 116 + }, + { + "epoch": 0.011688895549228234, + "grad_norm": 5.208329881883907, + "learning_rate": 3.853820598006645e-06, + "loss": 0.558, + "mean_token_accuracy": 0.8982062637805939, + "num_tokens": 9538182.0, + "step": 117 + }, + { + "epoch": 0.011788800639392577, + "grad_norm": 4.5963943388378485, + "learning_rate": 3.887043189368771e-06, + "loss": 0.5609, + "mean_token_accuracy": 0.8966941237449646, + "num_tokens": 9619643.0, + "step": 118 + }, + { + "epoch": 0.011888705729556921, + "grad_norm": 4.6109525748586, + "learning_rate": 3.9202657807308975e-06, + "loss": 0.5592, + "mean_token_accuracy": 0.8962758481502533, + "num_tokens": 9701121.0, + "step": 119 + }, + { + "epoch": 0.011988610819721265, + "grad_norm": 3.173057166692904, + "learning_rate": 3.953488372093024e-06, + "loss": 0.5598, + "mean_token_accuracy": 0.8979039788246155, + "num_tokens": 9782595.0, + "step": 120 + }, + { + "epoch": 0.012088515909885609, + "grad_norm": 3.516576138734564, + "learning_rate": 3.98671096345515e-06, + "loss": 0.5463, + "mean_token_accuracy": 0.898377388715744, + "num_tokens": 9864109.0, + "step": 121 + }, + { + "epoch": 0.012188421000049953, + "grad_norm": 4.653983074523628, + "learning_rate": 4.0199335548172765e-06, + "loss": 0.555, + "mean_token_accuracy": 0.8981523811817169, + "num_tokens": 9945547.0, + "step": 122 + }, + { + "epoch": 0.012288326090214297, + "grad_norm": 3.881113890730638, + "learning_rate": 4.053156146179402e-06, + "loss": 0.553, + "mean_token_accuracy": 0.8960205316543579, + "num_tokens": 10027089.0, + "step": 123 + }, + { + "epoch": 0.01238823118037864, + "grad_norm": 3.5995006156121514, + "learning_rate": 4.086378737541528e-06, + "loss": 0.5466, + "mean_token_accuracy": 0.9003234207630157, + "num_tokens": 10108560.0, + "step": 124 + }, + { + "epoch": 0.012488136270542984, + "grad_norm": 3.771555850049265, + "learning_rate": 4.119601328903655e-06, + "loss": 0.5444, + "mean_token_accuracy": 0.9017258584499359, + "num_tokens": 10190122.0, + "step": 125 + }, + { + "epoch": 0.012588041360707328, + "grad_norm": 6.973085843577829, + "learning_rate": 4.152823920265781e-06, + "loss": 0.5385, + "mean_token_accuracy": 0.9007254540920258, + "num_tokens": 10271725.0, + "step": 126 + }, + { + "epoch": 0.012687946450871672, + "grad_norm": 3.4967977506422416, + "learning_rate": 4.186046511627907e-06, + "loss": 0.5547, + "mean_token_accuracy": 0.8997891843318939, + "num_tokens": 10353254.0, + "step": 127 + }, + { + "epoch": 0.012787851541036016, + "grad_norm": 3.7351740195102754, + "learning_rate": 4.219269102990034e-06, + "loss": 0.5469, + "mean_token_accuracy": 0.8979425728321075, + "num_tokens": 10434828.0, + "step": 128 + }, + { + "epoch": 0.01288775663120036, + "grad_norm": 3.1390777768127833, + "learning_rate": 4.25249169435216e-06, + "loss": 0.5534, + "mean_token_accuracy": 0.8987930119037628, + "num_tokens": 10516379.0, + "step": 129 + }, + { + "epoch": 0.012987661721364703, + "grad_norm": 3.259592515896356, + "learning_rate": 4.2857142857142855e-06, + "loss": 0.5505, + "mean_token_accuracy": 0.8964418172836304, + "num_tokens": 10597859.0, + "step": 130 + }, + { + "epoch": 0.013087566811529047, + "grad_norm": 3.3956116119265225, + "learning_rate": 4.318936877076413e-06, + "loss": 0.554, + "mean_token_accuracy": 0.8988418579101562, + "num_tokens": 10679376.0, + "step": 131 + }, + { + "epoch": 0.013187471901693391, + "grad_norm": 3.6559188784061005, + "learning_rate": 4.352159468438539e-06, + "loss": 0.5379, + "mean_token_accuracy": 0.9027482569217682, + "num_tokens": 10761007.0, + "step": 132 + }, + { + "epoch": 0.013287376991857735, + "grad_norm": 4.138066089832867, + "learning_rate": 4.3853820598006645e-06, + "loss": 0.5497, + "mean_token_accuracy": 0.8987249135971069, + "num_tokens": 10842447.0, + "step": 133 + }, + { + "epoch": 0.013387282082022079, + "grad_norm": 5.768603419534648, + "learning_rate": 4.418604651162791e-06, + "loss": 0.5488, + "mean_token_accuracy": 0.8992488086223602, + "num_tokens": 10924036.0, + "step": 134 + }, + { + "epoch": 0.013487187172186423, + "grad_norm": 3.144709189071282, + "learning_rate": 4.451827242524917e-06, + "loss": 0.5431, + "mean_token_accuracy": 0.8997672200202942, + "num_tokens": 11005533.0, + "step": 135 + }, + { + "epoch": 0.013587092262350766, + "grad_norm": 2.425202036863678, + "learning_rate": 4.4850498338870435e-06, + "loss": 0.5478, + "mean_token_accuracy": 0.8992173969745636, + "num_tokens": 11086980.0, + "step": 136 + }, + { + "epoch": 0.01368699735251511, + "grad_norm": 2.712402694371354, + "learning_rate": 4.51827242524917e-06, + "loss": 0.5502, + "mean_token_accuracy": 0.8981004059314728, + "num_tokens": 11168500.0, + "step": 137 + }, + { + "epoch": 0.013786902442679454, + "grad_norm": 4.34240031529007, + "learning_rate": 4.551495016611296e-06, + "loss": 0.5475, + "mean_token_accuracy": 0.8977246582508087, + "num_tokens": 11249979.0, + "step": 138 + }, + { + "epoch": 0.013886807532843798, + "grad_norm": 3.7830298739692165, + "learning_rate": 4.5847176079734225e-06, + "loss": 0.5585, + "mean_token_accuracy": 0.8975000381469727, + "num_tokens": 11331375.0, + "step": 139 + }, + { + "epoch": 0.013986712623008142, + "grad_norm": 4.523687834529611, + "learning_rate": 4.617940199335549e-06, + "loss": 0.5393, + "mean_token_accuracy": 0.901622474193573, + "num_tokens": 11412989.0, + "step": 140 + }, + { + "epoch": 0.014086617713172486, + "grad_norm": 2.539574735533949, + "learning_rate": 4.651162790697675e-06, + "loss": 0.5472, + "mean_token_accuracy": 0.8971543610095978, + "num_tokens": 11494524.0, + "step": 141 + }, + { + "epoch": 0.01418652280333683, + "grad_norm": 2.714650196201869, + "learning_rate": 4.6843853820598015e-06, + "loss": 0.5433, + "mean_token_accuracy": 0.8979431986808777, + "num_tokens": 11576049.0, + "step": 142 + }, + { + "epoch": 0.014286427893501173, + "grad_norm": 4.855756702618394, + "learning_rate": 4.717607973421927e-06, + "loss": 0.544, + "mean_token_accuracy": 0.8996695578098297, + "num_tokens": 11657638.0, + "step": 143 + }, + { + "epoch": 0.014386332983665517, + "grad_norm": 3.223643736165849, + "learning_rate": 4.750830564784053e-06, + "loss": 0.5364, + "mean_token_accuracy": 0.9057171642780304, + "num_tokens": 11739171.0, + "step": 144 + }, + { + "epoch": 0.01448623807382986, + "grad_norm": 3.820676853720696, + "learning_rate": 4.78405315614618e-06, + "loss": 0.5314, + "mean_token_accuracy": 0.9011287689208984, + "num_tokens": 11820837.0, + "step": 145 + }, + { + "epoch": 0.014586143163994205, + "grad_norm": 4.626415225125115, + "learning_rate": 4.817275747508306e-06, + "loss": 0.5468, + "mean_token_accuracy": 0.8975524604320526, + "num_tokens": 11902335.0, + "step": 146 + }, + { + "epoch": 0.014686048254158549, + "grad_norm": 5.475080945867047, + "learning_rate": 4.850498338870432e-06, + "loss": 0.5503, + "mean_token_accuracy": 0.8982314765453339, + "num_tokens": 11983769.0, + "step": 147 + }, + { + "epoch": 0.014785953344322894, + "grad_norm": 4.520383168483069, + "learning_rate": 4.883720930232559e-06, + "loss": 0.5438, + "mean_token_accuracy": 0.9002703130245209, + "num_tokens": 12065328.0, + "step": 148 + }, + { + "epoch": 0.014885858434487238, + "grad_norm": 22.87549578745822, + "learning_rate": 4.916943521594685e-06, + "loss": 0.5415, + "mean_token_accuracy": 0.8992882966995239, + "num_tokens": 12146839.0, + "step": 149 + }, + { + "epoch": 0.014985763524651582, + "grad_norm": 6.0231376748303775, + "learning_rate": 4.950166112956811e-06, + "loss": 0.544, + "mean_token_accuracy": 0.9009184837341309, + "num_tokens": 12228285.0, + "step": 150 + }, + { + "epoch": 0.015085668614815926, + "grad_norm": 3.3951389850263145, + "learning_rate": 4.983388704318937e-06, + "loss": 0.5446, + "mean_token_accuracy": 0.9003256559371948, + "num_tokens": 12309835.0, + "step": 151 + }, + { + "epoch": 0.01518557370498027, + "grad_norm": 11.33620288675901, + "learning_rate": 5.016611295681063e-06, + "loss": 0.5394, + "mean_token_accuracy": 0.8984023332595825, + "num_tokens": 12391390.0, + "step": 152 + }, + { + "epoch": 0.015285478795144613, + "grad_norm": 3.9392827705922744, + "learning_rate": 5.0498338870431895e-06, + "loss": 0.5489, + "mean_token_accuracy": 0.8974924385547638, + "num_tokens": 12472855.0, + "step": 153 + }, + { + "epoch": 0.015385383885308957, + "grad_norm": 19.4500159154736, + "learning_rate": 5.083056478405316e-06, + "loss": 0.5461, + "mean_token_accuracy": 0.8971934020519257, + "num_tokens": 12554366.0, + "step": 154 + }, + { + "epoch": 0.015485288975473301, + "grad_norm": 7.0353070058006715, + "learning_rate": 5.116279069767442e-06, + "loss": 0.5466, + "mean_token_accuracy": 0.8955339193344116, + "num_tokens": 12635874.0, + "step": 155 + }, + { + "epoch": 0.015585194065637645, + "grad_norm": 3.406095416681761, + "learning_rate": 5.149501661129569e-06, + "loss": 0.5412, + "mean_token_accuracy": 0.89928138256073, + "num_tokens": 12717393.0, + "step": 156 + }, + { + "epoch": 0.015685099155801987, + "grad_norm": 3.16392170744307, + "learning_rate": 5.182724252491694e-06, + "loss": 0.5396, + "mean_token_accuracy": 0.9006933867931366, + "num_tokens": 12798924.0, + "step": 157 + }, + { + "epoch": 0.015785004245966332, + "grad_norm": 3.260812454334016, + "learning_rate": 5.215946843853821e-06, + "loss": 0.5344, + "mean_token_accuracy": 0.9028765857219696, + "num_tokens": 12880475.0, + "step": 158 + }, + { + "epoch": 0.015884909336130675, + "grad_norm": 3.677113314246011, + "learning_rate": 5.2491694352159475e-06, + "loss": 0.538, + "mean_token_accuracy": 0.8985165357589722, + "num_tokens": 12962073.0, + "step": 159 + }, + { + "epoch": 0.01598481442629502, + "grad_norm": 3.136412639102267, + "learning_rate": 5.282392026578074e-06, + "loss": 0.5328, + "mean_token_accuracy": 0.8984121978282928, + "num_tokens": 13043684.0, + "step": 160 + }, + { + "epoch": 0.016084719516459362, + "grad_norm": 3.295732779599208, + "learning_rate": 5.3156146179402e-06, + "loss": 0.5382, + "mean_token_accuracy": 0.8993483483791351, + "num_tokens": 13125260.0, + "step": 161 + }, + { + "epoch": 0.016184624606623708, + "grad_norm": 3.822890631442976, + "learning_rate": 5.348837209302326e-06, + "loss": 0.5371, + "mean_token_accuracy": 0.9003464877605438, + "num_tokens": 13206751.0, + "step": 162 + }, + { + "epoch": 0.01628452969678805, + "grad_norm": 4.151139256148533, + "learning_rate": 5.382059800664452e-06, + "loss": 0.5389, + "mean_token_accuracy": 0.899523138999939, + "num_tokens": 13288261.0, + "step": 163 + }, + { + "epoch": 0.016384434786952395, + "grad_norm": 2.2459720585819123, + "learning_rate": 5.415282392026578e-06, + "loss": 0.5419, + "mean_token_accuracy": 0.901286780834198, + "num_tokens": 13369736.0, + "step": 164 + }, + { + "epoch": 0.016484339877116738, + "grad_norm": 11.688603883285378, + "learning_rate": 5.448504983388705e-06, + "loss": 0.5339, + "mean_token_accuracy": 0.9031156599521637, + "num_tokens": 13451307.0, + "step": 165 + }, + { + "epoch": 0.016584244967281083, + "grad_norm": 3.199106678843647, + "learning_rate": 5.481727574750831e-06, + "loss": 0.5407, + "mean_token_accuracy": 0.901060938835144, + "num_tokens": 13532844.0, + "step": 166 + }, + { + "epoch": 0.016684150057445425, + "grad_norm": 4.826484409338612, + "learning_rate": 5.5149501661129565e-06, + "loss": 0.5374, + "mean_token_accuracy": 0.901436448097229, + "num_tokens": 13614332.0, + "step": 167 + }, + { + "epoch": 0.01678405514760977, + "grad_norm": 2.837097574123296, + "learning_rate": 5.548172757475083e-06, + "loss": 0.5439, + "mean_token_accuracy": 0.8989057540893555, + "num_tokens": 13695855.0, + "step": 168 + }, + { + "epoch": 0.016883960237774116, + "grad_norm": 2.1963045167497026, + "learning_rate": 5.58139534883721e-06, + "loss": 0.5434, + "mean_token_accuracy": 0.9005324840545654, + "num_tokens": 13777304.0, + "step": 169 + }, + { + "epoch": 0.01698386532793846, + "grad_norm": 2.328052270376103, + "learning_rate": 5.614617940199336e-06, + "loss": 0.5314, + "mean_token_accuracy": 0.9003839790821075, + "num_tokens": 13858868.0, + "step": 170 + }, + { + "epoch": 0.017083770418102804, + "grad_norm": 3.2024619592964867, + "learning_rate": 5.647840531561463e-06, + "loss": 0.5337, + "mean_token_accuracy": 0.9009014368057251, + "num_tokens": 13940375.0, + "step": 171 + }, + { + "epoch": 0.017183675508267146, + "grad_norm": 12.982152524321297, + "learning_rate": 5.681063122923588e-06, + "loss": 0.5364, + "mean_token_accuracy": 0.9034719169139862, + "num_tokens": 14021900.0, + "step": 172 + }, + { + "epoch": 0.01728358059843149, + "grad_norm": 3.135615062649806, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.5447, + "mean_token_accuracy": 0.901440292596817, + "num_tokens": 14103345.0, + "step": 173 + }, + { + "epoch": 0.017383485688595834, + "grad_norm": 3.480801738838714, + "learning_rate": 5.747508305647841e-06, + "loss": 0.5381, + "mean_token_accuracy": 0.9014447331428528, + "num_tokens": 14184855.0, + "step": 174 + }, + { + "epoch": 0.01748339077876018, + "grad_norm": 2.0937583458500018, + "learning_rate": 5.780730897009967e-06, + "loss": 0.5426, + "mean_token_accuracy": 0.8993730843067169, + "num_tokens": 14266344.0, + "step": 175 + }, + { + "epoch": 0.01758329586892452, + "grad_norm": 3.248870036364807, + "learning_rate": 5.8139534883720935e-06, + "loss": 0.5329, + "mean_token_accuracy": 0.9008569717407227, + "num_tokens": 14347878.0, + "step": 176 + }, + { + "epoch": 0.017683200959088867, + "grad_norm": 2.0502817179309316, + "learning_rate": 5.847176079734221e-06, + "loss": 0.5383, + "mean_token_accuracy": 0.9005368053913116, + "num_tokens": 14429426.0, + "step": 177 + }, + { + "epoch": 0.01778310604925321, + "grad_norm": 7.212756076292208, + "learning_rate": 5.880398671096345e-06, + "loss": 0.5398, + "mean_token_accuracy": 0.899193286895752, + "num_tokens": 14510909.0, + "step": 178 + }, + { + "epoch": 0.017883011139417555, + "grad_norm": 1.8369911456326324, + "learning_rate": 5.9136212624584725e-06, + "loss": 0.5298, + "mean_token_accuracy": 0.90140500664711, + "num_tokens": 14592463.0, + "step": 179 + }, + { + "epoch": 0.017982916229581897, + "grad_norm": 7.240974532468446, + "learning_rate": 5.946843853820599e-06, + "loss": 0.5385, + "mean_token_accuracy": 0.9004058539867401, + "num_tokens": 14673981.0, + "step": 180 + }, + { + "epoch": 0.018082821319746242, + "grad_norm": 2.5547581852577017, + "learning_rate": 5.980066445182725e-06, + "loss": 0.529, + "mean_token_accuracy": 0.9030691385269165, + "num_tokens": 14755516.0, + "step": 181 + }, + { + "epoch": 0.018182726409910584, + "grad_norm": 5.707541566389296, + "learning_rate": 6.0132890365448515e-06, + "loss": 0.5333, + "mean_token_accuracy": 0.9034998416900635, + "num_tokens": 14837065.0, + "step": 182 + }, + { + "epoch": 0.01828263150007493, + "grad_norm": 2.6403728621977747, + "learning_rate": 6.046511627906977e-06, + "loss": 0.5404, + "mean_token_accuracy": 0.9010443091392517, + "num_tokens": 14918633.0, + "step": 183 + }, + { + "epoch": 0.018382536590239272, + "grad_norm": 4.996562853454378, + "learning_rate": 6.079734219269103e-06, + "loss": 0.5508, + "mean_token_accuracy": 0.8984755575656891, + "num_tokens": 15000073.0, + "step": 184 + }, + { + "epoch": 0.018482441680403618, + "grad_norm": 2.1904204137605183, + "learning_rate": 6.11295681063123e-06, + "loss": 0.5428, + "mean_token_accuracy": 0.9017013013362885, + "num_tokens": 15081569.0, + "step": 185 + }, + { + "epoch": 0.01858234677056796, + "grad_norm": 2.12922920350944, + "learning_rate": 6.146179401993356e-06, + "loss": 0.5385, + "mean_token_accuracy": 0.902349978685379, + "num_tokens": 15163018.0, + "step": 186 + }, + { + "epoch": 0.018682251860732305, + "grad_norm": 2.355758037199344, + "learning_rate": 6.179401993355482e-06, + "loss": 0.5274, + "mean_token_accuracy": 0.9030015170574188, + "num_tokens": 15244621.0, + "step": 187 + }, + { + "epoch": 0.018782156950896647, + "grad_norm": 1.671131045958137, + "learning_rate": 6.212624584717608e-06, + "loss": 0.5431, + "mean_token_accuracy": 0.8999567925930023, + "num_tokens": 15326053.0, + "step": 188 + }, + { + "epoch": 0.018882062041060993, + "grad_norm": 8.748680744727348, + "learning_rate": 6.245847176079734e-06, + "loss": 0.5333, + "mean_token_accuracy": 0.9005983769893646, + "num_tokens": 15407539.0, + "step": 189 + }, + { + "epoch": 0.018981967131225335, + "grad_norm": 2.297067553514883, + "learning_rate": 6.279069767441861e-06, + "loss": 0.5362, + "mean_token_accuracy": 0.9028298854827881, + "num_tokens": 15489094.0, + "step": 190 + }, + { + "epoch": 0.01908187222138968, + "grad_norm": 1.6338730775997197, + "learning_rate": 6.312292358803988e-06, + "loss": 0.5447, + "mean_token_accuracy": 0.9015539586544037, + "num_tokens": 15570550.0, + "step": 191 + }, + { + "epoch": 0.019181777311554023, + "grad_norm": 2.3001618403900013, + "learning_rate": 6.345514950166114e-06, + "loss": 0.5256, + "mean_token_accuracy": 0.9022310078144073, + "num_tokens": 15652183.0, + "step": 192 + }, + { + "epoch": 0.01928168240171837, + "grad_norm": 1.7250231553381692, + "learning_rate": 6.3787375415282395e-06, + "loss": 0.5341, + "mean_token_accuracy": 0.901745080947876, + "num_tokens": 15733726.0, + "step": 193 + }, + { + "epoch": 0.01938158749188271, + "grad_norm": 1.3023742876184785, + "learning_rate": 6.411960132890366e-06, + "loss": 0.5335, + "mean_token_accuracy": 0.8980643153190613, + "num_tokens": 15815352.0, + "step": 194 + }, + { + "epoch": 0.019481492582047056, + "grad_norm": 1.4566801199871733, + "learning_rate": 6.445182724252492e-06, + "loss": 0.5358, + "mean_token_accuracy": 0.897815614938736, + "num_tokens": 15896928.0, + "step": 195 + }, + { + "epoch": 0.019581397672211398, + "grad_norm": 2.8552309476732463, + "learning_rate": 6.4784053156146185e-06, + "loss": 0.5395, + "mean_token_accuracy": 0.8997523188591003, + "num_tokens": 15978450.0, + "step": 196 + }, + { + "epoch": 0.019681302762375744, + "grad_norm": 15.097471512736327, + "learning_rate": 6.511627906976745e-06, + "loss": 0.5321, + "mean_token_accuracy": 0.9037798047065735, + "num_tokens": 16059953.0, + "step": 197 + }, + { + "epoch": 0.019781207852540086, + "grad_norm": 2.745879877946782, + "learning_rate": 6.54485049833887e-06, + "loss": 0.5495, + "mean_token_accuracy": 0.9019938409328461, + "num_tokens": 16141432.0, + "step": 198 + }, + { + "epoch": 0.01988111294270443, + "grad_norm": 2.21470946859096, + "learning_rate": 6.578073089700997e-06, + "loss": 0.5462, + "mean_token_accuracy": 0.9016544818878174, + "num_tokens": 16222957.0, + "step": 199 + }, + { + "epoch": 0.019981018032868773, + "grad_norm": 1.913673672284361, + "learning_rate": 6.611295681063124e-06, + "loss": 0.537, + "mean_token_accuracy": 0.9017518758773804, + "num_tokens": 16304441.0, + "step": 200 + }, + { + "epoch": 0.02008092312303312, + "grad_norm": 3.2365885613611334, + "learning_rate": 6.64451827242525e-06, + "loss": 0.544, + "mean_token_accuracy": 0.8980345726013184, + "num_tokens": 16385934.0, + "step": 201 + }, + { + "epoch": 0.02018082821319746, + "grad_norm": 2.191081293135329, + "learning_rate": 6.6777408637873766e-06, + "loss": 0.5424, + "mean_token_accuracy": 0.9010584056377411, + "num_tokens": 16467454.0, + "step": 202 + }, + { + "epoch": 0.020280733303361807, + "grad_norm": 1.5507339018783588, + "learning_rate": 6.710963455149502e-06, + "loss": 0.5533, + "mean_token_accuracy": 0.8983252644538879, + "num_tokens": 16548901.0, + "step": 203 + }, + { + "epoch": 0.02038063839352615, + "grad_norm": 1.9669368679985433, + "learning_rate": 6.744186046511628e-06, + "loss": 0.542, + "mean_token_accuracy": 0.9013760685920715, + "num_tokens": 16630333.0, + "step": 204 + }, + { + "epoch": 0.020480543483690494, + "grad_norm": 1.6509846487545277, + "learning_rate": 6.777408637873755e-06, + "loss": 0.545, + "mean_token_accuracy": 0.9001222252845764, + "num_tokens": 16711818.0, + "step": 205 + }, + { + "epoch": 0.020580448573854836, + "grad_norm": 3.4453576181990426, + "learning_rate": 6.810631229235881e-06, + "loss": 0.5319, + "mean_token_accuracy": 0.9004342257976532, + "num_tokens": 16793387.0, + "step": 206 + }, + { + "epoch": 0.020680353664019182, + "grad_norm": 1.479354514924584, + "learning_rate": 6.843853820598007e-06, + "loss": 0.5408, + "mean_token_accuracy": 0.9027372598648071, + "num_tokens": 16874866.0, + "step": 207 + }, + { + "epoch": 0.020780258754183524, + "grad_norm": 1.919636365846181, + "learning_rate": 6.877076411960133e-06, + "loss": 0.5373, + "mean_token_accuracy": 0.9045898020267487, + "num_tokens": 16956326.0, + "step": 208 + }, + { + "epoch": 0.02088016384434787, + "grad_norm": 1.9018411175588137, + "learning_rate": 6.910299003322259e-06, + "loss": 0.5376, + "mean_token_accuracy": 0.9012606143951416, + "num_tokens": 17037851.0, + "step": 209 + }, + { + "epoch": 0.02098006893451221, + "grad_norm": 2.208793121495549, + "learning_rate": 6.9435215946843855e-06, + "loss": 0.5257, + "mean_token_accuracy": 0.90370312333107, + "num_tokens": 17119509.0, + "step": 210 + }, + { + "epoch": 0.021079974024676557, + "grad_norm": 1.5488059027795094, + "learning_rate": 6.976744186046513e-06, + "loss": 0.537, + "mean_token_accuracy": 0.901504635810852, + "num_tokens": 17200963.0, + "step": 211 + }, + { + "epoch": 0.021179879114840903, + "grad_norm": 1.3675729559193792, + "learning_rate": 7.009966777408639e-06, + "loss": 0.5256, + "mean_token_accuracy": 0.9024268686771393, + "num_tokens": 17282518.0, + "step": 212 + }, + { + "epoch": 0.021279784205005245, + "grad_norm": 1.8702862794605095, + "learning_rate": 7.0431893687707646e-06, + "loss": 0.5298, + "mean_token_accuracy": 0.8999868631362915, + "num_tokens": 17364065.0, + "step": 213 + }, + { + "epoch": 0.02137968929516959, + "grad_norm": 2.034680511810886, + "learning_rate": 7.076411960132891e-06, + "loss": 0.5386, + "mean_token_accuracy": 0.9022560715675354, + "num_tokens": 17445563.0, + "step": 214 + }, + { + "epoch": 0.021479594385333933, + "grad_norm": 2.547091306204526, + "learning_rate": 7.109634551495017e-06, + "loss": 0.5364, + "mean_token_accuracy": 0.8985510170459747, + "num_tokens": 17527085.0, + "step": 215 + }, + { + "epoch": 0.021579499475498278, + "grad_norm": 1.465242370943423, + "learning_rate": 7.1428571428571436e-06, + "loss": 0.5343, + "mean_token_accuracy": 0.9009482562541962, + "num_tokens": 17608683.0, + "step": 216 + }, + { + "epoch": 0.02167940456566262, + "grad_norm": 4.322196065016918, + "learning_rate": 7.17607973421927e-06, + "loss": 0.528, + "mean_token_accuracy": 0.9022441208362579, + "num_tokens": 17690224.0, + "step": 217 + }, + { + "epoch": 0.021779309655826966, + "grad_norm": 1.7948935744109467, + "learning_rate": 7.209302325581395e-06, + "loss": 0.5299, + "mean_token_accuracy": 0.9022092819213867, + "num_tokens": 17771782.0, + "step": 218 + }, + { + "epoch": 0.021879214745991308, + "grad_norm": 1.8225382082979624, + "learning_rate": 7.242524916943522e-06, + "loss": 0.5378, + "mean_token_accuracy": 0.898448258638382, + "num_tokens": 17853240.0, + "step": 219 + }, + { + "epoch": 0.021979119836155653, + "grad_norm": 1.2623626401322714, + "learning_rate": 7.275747508305648e-06, + "loss": 0.5405, + "mean_token_accuracy": 0.9009917080402374, + "num_tokens": 17934679.0, + "step": 220 + }, + { + "epoch": 0.022079024926319996, + "grad_norm": 1.6709893423705025, + "learning_rate": 7.308970099667775e-06, + "loss": 0.5326, + "mean_token_accuracy": 0.9005982875823975, + "num_tokens": 18016232.0, + "step": 221 + }, + { + "epoch": 0.02217893001648434, + "grad_norm": 1.3231758318236884, + "learning_rate": 7.342192691029902e-06, + "loss": 0.5323, + "mean_token_accuracy": 0.9016952514648438, + "num_tokens": 18097748.0, + "step": 222 + }, + { + "epoch": 0.022278835106648683, + "grad_norm": 2.7468399294500636, + "learning_rate": 7.375415282392027e-06, + "loss": 0.5316, + "mean_token_accuracy": 0.8995503783226013, + "num_tokens": 18179242.0, + "step": 223 + }, + { + "epoch": 0.02237874019681303, + "grad_norm": 1.3105597937422517, + "learning_rate": 7.408637873754153e-06, + "loss": 0.5339, + "mean_token_accuracy": 0.9001533389091492, + "num_tokens": 18260752.0, + "step": 224 + }, + { + "epoch": 0.02247864528697737, + "grad_norm": 1.056513010517615, + "learning_rate": 7.44186046511628e-06, + "loss": 0.5324, + "mean_token_accuracy": 0.9025140404701233, + "num_tokens": 18342253.0, + "step": 225 + }, + { + "epoch": 0.022578550377141716, + "grad_norm": 1.0768004922559309, + "learning_rate": 7.475083056478406e-06, + "loss": 0.5257, + "mean_token_accuracy": 0.9038011729717255, + "num_tokens": 18423838.0, + "step": 226 + }, + { + "epoch": 0.02267845546730606, + "grad_norm": 1.1158499249486484, + "learning_rate": 7.508305647840532e-06, + "loss": 0.5339, + "mean_token_accuracy": 0.9003252685070038, + "num_tokens": 18505340.0, + "step": 227 + }, + { + "epoch": 0.022778360557470404, + "grad_norm": 2.1433002499309617, + "learning_rate": 7.541528239202659e-06, + "loss": 0.5316, + "mean_token_accuracy": 0.9027635157108307, + "num_tokens": 18586890.0, + "step": 228 + }, + { + "epoch": 0.022878265647634746, + "grad_norm": 1.328252740239554, + "learning_rate": 7.574750830564784e-06, + "loss": 0.5311, + "mean_token_accuracy": 0.9016855359077454, + "num_tokens": 18668468.0, + "step": 229 + }, + { + "epoch": 0.022978170737799092, + "grad_norm": 1.1234614448396438, + "learning_rate": 7.6079734219269106e-06, + "loss": 0.5277, + "mean_token_accuracy": 0.9041211903095245, + "num_tokens": 18750035.0, + "step": 230 + }, + { + "epoch": 0.023078075827963434, + "grad_norm": 0.8515711024473754, + "learning_rate": 7.641196013289037e-06, + "loss": 0.5327, + "mean_token_accuracy": 0.8988519012928009, + "num_tokens": 18831542.0, + "step": 231 + }, + { + "epoch": 0.02317798091812778, + "grad_norm": 0.8846276481237035, + "learning_rate": 7.674418604651164e-06, + "loss": 0.528, + "mean_token_accuracy": 0.903311550617218, + "num_tokens": 18913068.0, + "step": 232 + }, + { + "epoch": 0.02327788600829212, + "grad_norm": 1.0081853032397556, + "learning_rate": 7.70764119601329e-06, + "loss": 0.5475, + "mean_token_accuracy": 0.8968833684921265, + "num_tokens": 18994504.0, + "step": 233 + }, + { + "epoch": 0.023377791098456467, + "grad_norm": 1.2964948430803034, + "learning_rate": 7.740863787375415e-06, + "loss": 0.5308, + "mean_token_accuracy": 0.9015989005565643, + "num_tokens": 19076044.0, + "step": 234 + }, + { + "epoch": 0.02347769618862081, + "grad_norm": 1.348074524120464, + "learning_rate": 7.774086378737542e-06, + "loss": 0.5298, + "mean_token_accuracy": 0.9027081727981567, + "num_tokens": 19157550.0, + "step": 235 + }, + { + "epoch": 0.023577601278785155, + "grad_norm": 1.5659498069386675, + "learning_rate": 7.807308970099668e-06, + "loss": 0.531, + "mean_token_accuracy": 0.9027597308158875, + "num_tokens": 19239124.0, + "step": 236 + }, + { + "epoch": 0.023677506368949497, + "grad_norm": 1.1381592581879947, + "learning_rate": 7.840531561461795e-06, + "loss": 0.5317, + "mean_token_accuracy": 0.8999877572059631, + "num_tokens": 19320672.0, + "step": 237 + }, + { + "epoch": 0.023777411459113842, + "grad_norm": 1.376969829877238, + "learning_rate": 7.873754152823922e-06, + "loss": 0.538, + "mean_token_accuracy": 0.900924414396286, + "num_tokens": 19402229.0, + "step": 238 + }, + { + "epoch": 0.023877316549278185, + "grad_norm": 1.156726767389507, + "learning_rate": 7.906976744186048e-06, + "loss": 0.538, + "mean_token_accuracy": 0.9018464386463165, + "num_tokens": 19483719.0, + "step": 239 + }, + { + "epoch": 0.02397722163944253, + "grad_norm": 1.09193075957782, + "learning_rate": 7.940199335548173e-06, + "loss": 0.5349, + "mean_token_accuracy": 0.9016562700271606, + "num_tokens": 19565285.0, + "step": 240 + }, + { + "epoch": 0.024077126729606872, + "grad_norm": 1.2151536308937287, + "learning_rate": 7.9734219269103e-06, + "loss": 0.5292, + "mean_token_accuracy": 0.9004620909690857, + "num_tokens": 19646886.0, + "step": 241 + }, + { + "epoch": 0.024177031819771218, + "grad_norm": 1.34756348771898, + "learning_rate": 8.006644518272426e-06, + "loss": 0.538, + "mean_token_accuracy": 0.9005023539066315, + "num_tokens": 19728419.0, + "step": 242 + }, + { + "epoch": 0.02427693690993556, + "grad_norm": 1.4172457413731834, + "learning_rate": 8.039867109634553e-06, + "loss": 0.5334, + "mean_token_accuracy": 0.9023783504962921, + "num_tokens": 19809919.0, + "step": 243 + }, + { + "epoch": 0.024376842000099905, + "grad_norm": 1.3823011187983965, + "learning_rate": 8.073089700996678e-06, + "loss": 0.5311, + "mean_token_accuracy": 0.9026186168193817, + "num_tokens": 19891541.0, + "step": 244 + }, + { + "epoch": 0.024476747090264248, + "grad_norm": 1.251551236192572, + "learning_rate": 8.106312292358804e-06, + "loss": 0.5269, + "mean_token_accuracy": 0.8993507027626038, + "num_tokens": 19973154.0, + "step": 245 + }, + { + "epoch": 0.024576652180428593, + "grad_norm": 1.301104450483109, + "learning_rate": 8.139534883720931e-06, + "loss": 0.5331, + "mean_token_accuracy": 0.9039275348186493, + "num_tokens": 20054613.0, + "step": 246 + }, + { + "epoch": 0.024676557270592935, + "grad_norm": 1.7980862480043889, + "learning_rate": 8.172757475083057e-06, + "loss": 0.5328, + "mean_token_accuracy": 0.903622567653656, + "num_tokens": 20136183.0, + "step": 247 + }, + { + "epoch": 0.02477646236075728, + "grad_norm": 1.252821639867097, + "learning_rate": 8.205980066445184e-06, + "loss": 0.534, + "mean_token_accuracy": 0.901963621377945, + "num_tokens": 20217702.0, + "step": 248 + }, + { + "epoch": 0.024876367450921623, + "grad_norm": 1.3951157076464729, + "learning_rate": 8.23920265780731e-06, + "loss": 0.5367, + "mean_token_accuracy": 0.9024863541126251, + "num_tokens": 20299181.0, + "step": 249 + }, + { + "epoch": 0.02497627254108597, + "grad_norm": 3.709097296145147, + "learning_rate": 8.272425249169436e-06, + "loss": 0.5332, + "mean_token_accuracy": 0.9012572467327118, + "num_tokens": 20380694.0, + "step": 250 + }, + { + "epoch": 0.02507617763125031, + "grad_norm": 1.0640343151695217, + "learning_rate": 8.305647840531562e-06, + "loss": 0.5303, + "mean_token_accuracy": 0.9010838270187378, + "num_tokens": 20462183.0, + "step": 251 + }, + { + "epoch": 0.025176082721414656, + "grad_norm": 1.9501530251913948, + "learning_rate": 8.338870431893689e-06, + "loss": 0.5315, + "mean_token_accuracy": 0.9035438597202301, + "num_tokens": 20543716.0, + "step": 252 + }, + { + "epoch": 0.025275987811579, + "grad_norm": 1.733248502717577, + "learning_rate": 8.372093023255815e-06, + "loss": 0.5289, + "mean_token_accuracy": 0.9033922553062439, + "num_tokens": 20625338.0, + "step": 253 + }, + { + "epoch": 0.025375892901743344, + "grad_norm": 1.3093152433739506, + "learning_rate": 8.40531561461794e-06, + "loss": 0.5311, + "mean_token_accuracy": 0.9009537994861603, + "num_tokens": 20706868.0, + "step": 254 + }, + { + "epoch": 0.02547579799190769, + "grad_norm": 1.3268065763840096, + "learning_rate": 8.438538205980067e-06, + "loss": 0.5288, + "mean_token_accuracy": 0.9030052721500397, + "num_tokens": 20788439.0, + "step": 255 + }, + { + "epoch": 0.02557570308207203, + "grad_norm": 1.1268422607596134, + "learning_rate": 8.471760797342193e-06, + "loss": 0.5298, + "mean_token_accuracy": 0.9037782549858093, + "num_tokens": 20869954.0, + "step": 256 + }, + { + "epoch": 0.025675608172236377, + "grad_norm": 1.4412792370973089, + "learning_rate": 8.50498338870432e-06, + "loss": 0.5358, + "mean_token_accuracy": 0.9016894996166229, + "num_tokens": 20951412.0, + "step": 257 + }, + { + "epoch": 0.02577551326240072, + "grad_norm": 1.5431539269533805, + "learning_rate": 8.538205980066447e-06, + "loss": 0.5291, + "mean_token_accuracy": 0.9039218127727509, + "num_tokens": 21032871.0, + "step": 258 + }, + { + "epoch": 0.025875418352565065, + "grad_norm": 1.9630331566208583, + "learning_rate": 8.571428571428571e-06, + "loss": 0.5277, + "mean_token_accuracy": 0.9008043110370636, + "num_tokens": 21114449.0, + "step": 259 + }, + { + "epoch": 0.025975323442729407, + "grad_norm": 1.539643407044089, + "learning_rate": 8.604651162790698e-06, + "loss": 0.5196, + "mean_token_accuracy": 0.9039390087127686, + "num_tokens": 21196085.0, + "step": 260 + }, + { + "epoch": 0.026075228532893752, + "grad_norm": 1.3892633259145146, + "learning_rate": 8.637873754152825e-06, + "loss": 0.5263, + "mean_token_accuracy": 0.9020589888095856, + "num_tokens": 21277606.0, + "step": 261 + }, + { + "epoch": 0.026175133623058094, + "grad_norm": 1.7811385386116931, + "learning_rate": 8.67109634551495e-06, + "loss": 0.5316, + "mean_token_accuracy": 0.901127427816391, + "num_tokens": 21359059.0, + "step": 262 + }, + { + "epoch": 0.02627503871322244, + "grad_norm": 1.1100521972455297, + "learning_rate": 8.704318936877078e-06, + "loss": 0.5326, + "mean_token_accuracy": 0.9048356711864471, + "num_tokens": 21440537.0, + "step": 263 + }, + { + "epoch": 0.026374943803386782, + "grad_norm": 2.1479026540748754, + "learning_rate": 8.737541528239203e-06, + "loss": 0.5262, + "mean_token_accuracy": 0.9024417996406555, + "num_tokens": 21522053.0, + "step": 264 + }, + { + "epoch": 0.026474848893551128, + "grad_norm": 1.4440800864173935, + "learning_rate": 8.770764119601329e-06, + "loss": 0.5205, + "mean_token_accuracy": 0.9006265997886658, + "num_tokens": 21603578.0, + "step": 265 + }, + { + "epoch": 0.02657475398371547, + "grad_norm": 5.758438950737839, + "learning_rate": 8.803986710963456e-06, + "loss": 0.5196, + "mean_token_accuracy": 0.905374675989151, + "num_tokens": 21685079.0, + "step": 266 + }, + { + "epoch": 0.026674659073879815, + "grad_norm": 1.7357736725507724, + "learning_rate": 8.837209302325582e-06, + "loss": 0.516, + "mean_token_accuracy": 0.9028143882751465, + "num_tokens": 21766646.0, + "step": 267 + }, + { + "epoch": 0.026774564164044157, + "grad_norm": 1.8616843333538438, + "learning_rate": 8.870431893687709e-06, + "loss": 0.5221, + "mean_token_accuracy": 0.9036270976066589, + "num_tokens": 21848168.0, + "step": 268 + }, + { + "epoch": 0.026874469254208503, + "grad_norm": 3.045448222812817, + "learning_rate": 8.903654485049834e-06, + "loss": 0.5262, + "mean_token_accuracy": 0.9040688574314117, + "num_tokens": 21929797.0, + "step": 269 + }, + { + "epoch": 0.026974374344372845, + "grad_norm": 1.8986725627785082, + "learning_rate": 8.93687707641196e-06, + "loss": 0.5245, + "mean_token_accuracy": 0.9024825692176819, + "num_tokens": 22011295.0, + "step": 270 + }, + { + "epoch": 0.02707427943453719, + "grad_norm": 2.464266979901143, + "learning_rate": 8.970099667774087e-06, + "loss": 0.5273, + "mean_token_accuracy": 0.8986654579639435, + "num_tokens": 22092827.0, + "step": 271 + }, + { + "epoch": 0.027174184524701533, + "grad_norm": 1.1430547664332822, + "learning_rate": 9.003322259136214e-06, + "loss": 0.5236, + "mean_token_accuracy": 0.9037951827049255, + "num_tokens": 22174372.0, + "step": 272 + }, + { + "epoch": 0.02727408961486588, + "grad_norm": 1.0744414814253624, + "learning_rate": 9.03654485049834e-06, + "loss": 0.525, + "mean_token_accuracy": 0.9037059843540192, + "num_tokens": 22255895.0, + "step": 273 + }, + { + "epoch": 0.02737399470503022, + "grad_norm": 1.6477013347730691, + "learning_rate": 9.069767441860465e-06, + "loss": 0.5308, + "mean_token_accuracy": 0.9001986980438232, + "num_tokens": 22337345.0, + "step": 274 + }, + { + "epoch": 0.027473899795194566, + "grad_norm": 0.8470219833325346, + "learning_rate": 9.102990033222592e-06, + "loss": 0.522, + "mean_token_accuracy": 0.9035255610942841, + "num_tokens": 22418846.0, + "step": 275 + }, + { + "epoch": 0.027573804885358908, + "grad_norm": 0.9705994653434082, + "learning_rate": 9.136212624584718e-06, + "loss": 0.52, + "mean_token_accuracy": 0.9045466184616089, + "num_tokens": 22500399.0, + "step": 276 + }, + { + "epoch": 0.027673709975523254, + "grad_norm": 1.5529574136510575, + "learning_rate": 9.169435215946845e-06, + "loss": 0.5216, + "mean_token_accuracy": 0.9021793007850647, + "num_tokens": 22581949.0, + "step": 277 + }, + { + "epoch": 0.027773615065687596, + "grad_norm": 2.9789498028430517, + "learning_rate": 9.20265780730897e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.902214914560318, + "num_tokens": 22663529.0, + "step": 278 + }, + { + "epoch": 0.02787352015585194, + "grad_norm": 1.0649238812546535, + "learning_rate": 9.235880398671098e-06, + "loss": 0.5228, + "mean_token_accuracy": 0.9038357138633728, + "num_tokens": 22744995.0, + "step": 279 + }, + { + "epoch": 0.027973425246016283, + "grad_norm": 1.2458137917485046, + "learning_rate": 9.269102990033223e-06, + "loss": 0.5246, + "mean_token_accuracy": 0.9049131274223328, + "num_tokens": 22826449.0, + "step": 280 + }, + { + "epoch": 0.02807333033618063, + "grad_norm": 1.1705465955916088, + "learning_rate": 9.30232558139535e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.9044190049171448, + "num_tokens": 22908002.0, + "step": 281 + }, + { + "epoch": 0.02817323542634497, + "grad_norm": 1.3159148304798869, + "learning_rate": 9.335548172757476e-06, + "loss": 0.5246, + "mean_token_accuracy": 0.9054710268974304, + "num_tokens": 22989463.0, + "step": 282 + }, + { + "epoch": 0.028273140516509317, + "grad_norm": 1.1883230666373579, + "learning_rate": 9.368770764119603e-06, + "loss": 0.5232, + "mean_token_accuracy": 0.9026030004024506, + "num_tokens": 23071043.0, + "step": 283 + }, + { + "epoch": 0.02837304560667366, + "grad_norm": 1.038496226997613, + "learning_rate": 9.401993355481728e-06, + "loss": 0.5289, + "mean_token_accuracy": 0.902669370174408, + "num_tokens": 23152484.0, + "step": 284 + }, + { + "epoch": 0.028472950696838004, + "grad_norm": 1.1806831210481643, + "learning_rate": 9.435215946843854e-06, + "loss": 0.5296, + "mean_token_accuracy": 0.8999159336090088, + "num_tokens": 23233967.0, + "step": 285 + }, + { + "epoch": 0.028572855787002346, + "grad_norm": 1.1164246466365275, + "learning_rate": 9.468438538205981e-06, + "loss": 0.5274, + "mean_token_accuracy": 0.9034466445446014, + "num_tokens": 23315428.0, + "step": 286 + }, + { + "epoch": 0.028672760877166692, + "grad_norm": 1.1597454501995246, + "learning_rate": 9.501661129568107e-06, + "loss": 0.5191, + "mean_token_accuracy": 0.9035528004169464, + "num_tokens": 23396924.0, + "step": 287 + }, + { + "epoch": 0.028772665967331034, + "grad_norm": 0.8910151120197398, + "learning_rate": 9.534883720930234e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.9056601822376251, + "num_tokens": 23478494.0, + "step": 288 + }, + { + "epoch": 0.02887257105749538, + "grad_norm": 1.9684023087308051, + "learning_rate": 9.56810631229236e-06, + "loss": 0.5222, + "mean_token_accuracy": 0.9022349417209625, + "num_tokens": 23560137.0, + "step": 289 + }, + { + "epoch": 0.02897247614765972, + "grad_norm": 1.7518310958160352, + "learning_rate": 9.601328903654485e-06, + "loss": 0.5272, + "mean_token_accuracy": 0.8991504311561584, + "num_tokens": 23641666.0, + "step": 290 + }, + { + "epoch": 0.029072381237824067, + "grad_norm": 0.9121620633504288, + "learning_rate": 9.634551495016612e-06, + "loss": 0.52, + "mean_token_accuracy": 0.9061159491539001, + "num_tokens": 23723144.0, + "step": 291 + }, + { + "epoch": 0.02917228632798841, + "grad_norm": 1.0293433356107327, + "learning_rate": 9.66777408637874e-06, + "loss": 0.5283, + "mean_token_accuracy": 0.9002678096294403, + "num_tokens": 23804641.0, + "step": 292 + }, + { + "epoch": 0.029272191418152755, + "grad_norm": 0.8180122394324839, + "learning_rate": 9.700996677740865e-06, + "loss": 0.521, + "mean_token_accuracy": 0.9058926999568939, + "num_tokens": 23886167.0, + "step": 293 + }, + { + "epoch": 0.029372096508317097, + "grad_norm": 0.8034408628532523, + "learning_rate": 9.734219269102992e-06, + "loss": 0.5202, + "mean_token_accuracy": 0.9038265347480774, + "num_tokens": 23967671.0, + "step": 294 + }, + { + "epoch": 0.029472001598481443, + "grad_norm": 1.1253559085561864, + "learning_rate": 9.767441860465117e-06, + "loss": 0.5231, + "mean_token_accuracy": 0.9051342010498047, + "num_tokens": 24049162.0, + "step": 295 + }, + { + "epoch": 0.029571906688645788, + "grad_norm": 0.8625795816677507, + "learning_rate": 9.800664451827243e-06, + "loss": 0.5267, + "mean_token_accuracy": 0.905270904302597, + "num_tokens": 24130627.0, + "step": 296 + }, + { + "epoch": 0.02967181177881013, + "grad_norm": 0.9129253642457857, + "learning_rate": 9.83388704318937e-06, + "loss": 0.529, + "mean_token_accuracy": 0.902266651391983, + "num_tokens": 24212067.0, + "step": 297 + }, + { + "epoch": 0.029771716868974476, + "grad_norm": 0.9337342264008087, + "learning_rate": 9.867109634551495e-06, + "loss": 0.5299, + "mean_token_accuracy": 0.9036350250244141, + "num_tokens": 24293563.0, + "step": 298 + }, + { + "epoch": 0.029871621959138818, + "grad_norm": 1.006396959632961, + "learning_rate": 9.900332225913623e-06, + "loss": 0.5222, + "mean_token_accuracy": 0.904728502035141, + "num_tokens": 24375036.0, + "step": 299 + }, + { + "epoch": 0.029971527049303164, + "grad_norm": 0.8470821789089912, + "learning_rate": 9.933554817275748e-06, + "loss": 0.5287, + "mean_token_accuracy": 0.9022732377052307, + "num_tokens": 24456496.0, + "step": 300 + }, + { + "epoch": 0.030071432139467506, + "grad_norm": 0.920336915170577, + "learning_rate": 9.966777408637874e-06, + "loss": 0.5308, + "mean_token_accuracy": 0.9025474786758423, + "num_tokens": 24537957.0, + "step": 301 + }, + { + "epoch": 0.03017133722963185, + "grad_norm": 0.9844914850732313, + "learning_rate": 1e-05, + "loss": 0.515, + "mean_token_accuracy": 0.9057714939117432, + "num_tokens": 24619491.0, + "step": 302 + }, + { + "epoch": 0.030271242319796193, + "grad_norm": 0.8281638390070746, + "learning_rate": 9.999999738247555e-06, + "loss": 0.5255, + "mean_token_accuracy": 0.902985155582428, + "num_tokens": 24700977.0, + "step": 303 + }, + { + "epoch": 0.03037114740996054, + "grad_norm": 0.8078740831688068, + "learning_rate": 9.999998952990247e-06, + "loss": 0.5204, + "mean_token_accuracy": 0.9021625518798828, + "num_tokens": 24782563.0, + "step": 304 + }, + { + "epoch": 0.03047105250012488, + "grad_norm": 0.755742125010334, + "learning_rate": 9.999997644228155e-06, + "loss": 0.5291, + "mean_token_accuracy": 0.9021710753440857, + "num_tokens": 24864087.0, + "step": 305 + }, + { + "epoch": 0.030570957590289227, + "grad_norm": 0.7953616515410041, + "learning_rate": 9.999995811961418e-06, + "loss": 0.5354, + "mean_token_accuracy": 0.9001808762550354, + "num_tokens": 24945556.0, + "step": 306 + }, + { + "epoch": 0.03067086268045357, + "grad_norm": 0.7061368088471669, + "learning_rate": 9.99999345619023e-06, + "loss": 0.5198, + "mean_token_accuracy": 0.9026142060756683, + "num_tokens": 25027155.0, + "step": 307 + }, + { + "epoch": 0.030770767770617914, + "grad_norm": 0.6785455845492809, + "learning_rate": 9.999990576914835e-06, + "loss": 0.5227, + "mean_token_accuracy": 0.90269935131073, + "num_tokens": 25108685.0, + "step": 308 + }, + { + "epoch": 0.030870672860782256, + "grad_norm": 0.8189811625112559, + "learning_rate": 9.999987174135537e-06, + "loss": 0.518, + "mean_token_accuracy": 0.9008549153804779, + "num_tokens": 25190238.0, + "step": 309 + }, + { + "epoch": 0.030970577950946602, + "grad_norm": 0.7434476359803444, + "learning_rate": 9.999983247852688e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.9047308564186096, + "num_tokens": 25271762.0, + "step": 310 + }, + { + "epoch": 0.031070483041110944, + "grad_norm": 0.7700194641152103, + "learning_rate": 9.999978798066705e-06, + "loss": 0.5253, + "mean_token_accuracy": 0.9028623104095459, + "num_tokens": 25353228.0, + "step": 311 + }, + { + "epoch": 0.03117038813127529, + "grad_norm": 0.7785704764969362, + "learning_rate": 9.999973824778048e-06, + "loss": 0.5326, + "mean_token_accuracy": 0.8999902307987213, + "num_tokens": 25434652.0, + "step": 312 + }, + { + "epoch": 0.031270293221439635, + "grad_norm": 0.790576292675863, + "learning_rate": 9.999968327987242e-06, + "loss": 0.5251, + "mean_token_accuracy": 0.9025032818317413, + "num_tokens": 25516224.0, + "step": 313 + }, + { + "epoch": 0.031370198311603974, + "grad_norm": 0.6884744019392192, + "learning_rate": 9.999962307694859e-06, + "loss": 0.5217, + "mean_token_accuracy": 0.9023030698299408, + "num_tokens": 25597742.0, + "step": 314 + }, + { + "epoch": 0.03147010340176832, + "grad_norm": 0.8562929387928123, + "learning_rate": 9.999955763901532e-06, + "loss": 0.5168, + "mean_token_accuracy": 0.9031381607055664, + "num_tokens": 25679262.0, + "step": 315 + }, + { + "epoch": 0.031570008491932665, + "grad_norm": 0.7420952187714107, + "learning_rate": 9.999948696607946e-06, + "loss": 0.5215, + "mean_token_accuracy": 0.9058058559894562, + "num_tokens": 25760728.0, + "step": 316 + }, + { + "epoch": 0.03166991358209701, + "grad_norm": 0.6679712487564492, + "learning_rate": 9.99994110581484e-06, + "loss": 0.5177, + "mean_token_accuracy": 0.904850423336029, + "num_tokens": 25842251.0, + "step": 317 + }, + { + "epoch": 0.03176981867226135, + "grad_norm": 1.0025581783866797, + "learning_rate": 9.999932991523009e-06, + "loss": 0.5302, + "mean_token_accuracy": 0.9003219902515411, + "num_tokens": 25923751.0, + "step": 318 + }, + { + "epoch": 0.031869723762425695, + "grad_norm": 0.6383259704038337, + "learning_rate": 9.999924353733303e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.9053891003131866, + "num_tokens": 26005306.0, + "step": 319 + }, + { + "epoch": 0.03196962885259004, + "grad_norm": 0.8974481804097868, + "learning_rate": 9.999915192446626e-06, + "loss": 0.5207, + "mean_token_accuracy": 0.8998847901821136, + "num_tokens": 26086886.0, + "step": 320 + }, + { + "epoch": 0.032069533942754386, + "grad_norm": 1.0732729491225788, + "learning_rate": 9.999905507663936e-06, + "loss": 0.5221, + "mean_token_accuracy": 0.9024744927883148, + "num_tokens": 26168403.0, + "step": 321 + }, + { + "epoch": 0.032169439032918724, + "grad_norm": 0.7518564636936014, + "learning_rate": 9.999895299386248e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.9062250256538391, + "num_tokens": 26250071.0, + "step": 322 + }, + { + "epoch": 0.03226934412308307, + "grad_norm": 0.8540643435706601, + "learning_rate": 9.999884567614634e-06, + "loss": 0.5201, + "mean_token_accuracy": 0.9035094380378723, + "num_tokens": 26331641.0, + "step": 323 + }, + { + "epoch": 0.032369249213247415, + "grad_norm": 0.7387127975290545, + "learning_rate": 9.99987331235021e-06, + "loss": 0.5223, + "mean_token_accuracy": 0.9027649462223053, + "num_tokens": 26413171.0, + "step": 324 + }, + { + "epoch": 0.03246915430341176, + "grad_norm": 0.8020338927000256, + "learning_rate": 9.999861533594162e-06, + "loss": 0.5266, + "mean_token_accuracy": 0.9026748538017273, + "num_tokens": 26494580.0, + "step": 325 + }, + { + "epoch": 0.0325690593935761, + "grad_norm": 0.8067876384785493, + "learning_rate": 9.99984923134772e-06, + "loss": 0.5238, + "mean_token_accuracy": 0.9016327857971191, + "num_tokens": 26576064.0, + "step": 326 + }, + { + "epoch": 0.032668964483740445, + "grad_norm": 0.8533130602342937, + "learning_rate": 9.999836405612173e-06, + "loss": 0.5204, + "mean_token_accuracy": 0.9029033780097961, + "num_tokens": 26657595.0, + "step": 327 + }, + { + "epoch": 0.03276886957390479, + "grad_norm": 0.6600166903187084, + "learning_rate": 9.999823056388862e-06, + "loss": 0.5235, + "mean_token_accuracy": 0.9027400016784668, + "num_tokens": 26739071.0, + "step": 328 + }, + { + "epoch": 0.032868774664069136, + "grad_norm": 0.8590773383774366, + "learning_rate": 9.999809183679186e-06, + "loss": 0.5279, + "mean_token_accuracy": 0.9010383188724518, + "num_tokens": 26820570.0, + "step": 329 + }, + { + "epoch": 0.032968679754233475, + "grad_norm": 0.8003274229682402, + "learning_rate": 9.999794787484599e-06, + "loss": 0.5222, + "mean_token_accuracy": 0.9013707339763641, + "num_tokens": 26902084.0, + "step": 330 + }, + { + "epoch": 0.03306858484439782, + "grad_norm": 0.7723973714561722, + "learning_rate": 9.999779867806604e-06, + "loss": 0.52, + "mean_token_accuracy": 0.9038533866405487, + "num_tokens": 26983664.0, + "step": 331 + }, + { + "epoch": 0.033168489934562166, + "grad_norm": 0.722899370542741, + "learning_rate": 9.999764424646768e-06, + "loss": 0.5265, + "mean_token_accuracy": 0.9008237719535828, + "num_tokens": 27065157.0, + "step": 332 + }, + { + "epoch": 0.03326839502472651, + "grad_norm": 1.1354125893632672, + "learning_rate": 9.999748458006705e-06, + "loss": 0.5304, + "mean_token_accuracy": 0.9008576571941376, + "num_tokens": 27146608.0, + "step": 333 + }, + { + "epoch": 0.03336830011489085, + "grad_norm": 1.0433797060368375, + "learning_rate": 9.999731967888088e-06, + "loss": 0.5293, + "mean_token_accuracy": 0.9046099185943604, + "num_tokens": 27228072.0, + "step": 334 + }, + { + "epoch": 0.033468205205055196, + "grad_norm": 0.8168176236076289, + "learning_rate": 9.999714954292641e-06, + "loss": 0.5166, + "mean_token_accuracy": 0.9028506875038147, + "num_tokens": 27309667.0, + "step": 335 + }, + { + "epoch": 0.03356811029521954, + "grad_norm": 0.7902919869187935, + "learning_rate": 9.99969741722215e-06, + "loss": 0.5308, + "mean_token_accuracy": 0.9002375602722168, + "num_tokens": 27391172.0, + "step": 336 + }, + { + "epoch": 0.03366801538538389, + "grad_norm": 0.775754070354072, + "learning_rate": 9.999679356678447e-06, + "loss": 0.5214, + "mean_token_accuracy": 0.9022803902626038, + "num_tokens": 27472707.0, + "step": 337 + }, + { + "epoch": 0.03376792047554823, + "grad_norm": 0.8206183079634377, + "learning_rate": 9.999660772663425e-06, + "loss": 0.5201, + "mean_token_accuracy": 0.9047212302684784, + "num_tokens": 27554207.0, + "step": 338 + }, + { + "epoch": 0.03386782556571257, + "grad_norm": 0.7929044730026946, + "learning_rate": 9.99964166517903e-06, + "loss": 0.525, + "mean_token_accuracy": 0.9010961949825287, + "num_tokens": 27635649.0, + "step": 339 + }, + { + "epoch": 0.03396773065587692, + "grad_norm": 0.8018196252943454, + "learning_rate": 9.99962203422726e-06, + "loss": 0.5143, + "mean_token_accuracy": 0.9027950763702393, + "num_tokens": 27717265.0, + "step": 340 + }, + { + "epoch": 0.03406763574604126, + "grad_norm": 1.1022574938605094, + "learning_rate": 9.999601879810172e-06, + "loss": 0.5189, + "mean_token_accuracy": 0.9053047001361847, + "num_tokens": 27798806.0, + "step": 341 + }, + { + "epoch": 0.03416754083620561, + "grad_norm": 0.8155483118065079, + "learning_rate": 9.999581201929878e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.9019581079483032, + "num_tokens": 27880342.0, + "step": 342 + }, + { + "epoch": 0.03426744592636995, + "grad_norm": 1.2833138222723504, + "learning_rate": 9.99956000058854e-06, + "loss": 0.5221, + "mean_token_accuracy": 0.9018970727920532, + "num_tokens": 27961837.0, + "step": 343 + }, + { + "epoch": 0.03436735101653429, + "grad_norm": 0.8072219726729847, + "learning_rate": 9.99953827578838e-06, + "loss": 0.528, + "mean_token_accuracy": 0.9019953608512878, + "num_tokens": 28043256.0, + "step": 344 + }, + { + "epoch": 0.03446725610669864, + "grad_norm": 0.997938695962293, + "learning_rate": 9.999516027531671e-06, + "loss": 0.5294, + "mean_token_accuracy": 0.8992826044559479, + "num_tokens": 28124758.0, + "step": 345 + }, + { + "epoch": 0.03456716119686298, + "grad_norm": 0.8570616762761587, + "learning_rate": 9.999493255820744e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.9048739075660706, + "num_tokens": 28206316.0, + "step": 346 + }, + { + "epoch": 0.03466706628702732, + "grad_norm": 1.9444791736000544, + "learning_rate": 9.999469960657982e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.9062663018703461, + "num_tokens": 28287924.0, + "step": 347 + }, + { + "epoch": 0.03476697137719167, + "grad_norm": 1.0171678686473251, + "learning_rate": 9.999446142045823e-06, + "loss": 0.522, + "mean_token_accuracy": 0.9045514166355133, + "num_tokens": 28369447.0, + "step": 348 + }, + { + "epoch": 0.03486687646735601, + "grad_norm": 1.4075916876193364, + "learning_rate": 9.999421799986764e-06, + "loss": 0.5253, + "mean_token_accuracy": 0.9024241268634796, + "num_tokens": 28450950.0, + "step": 349 + }, + { + "epoch": 0.03496678155752036, + "grad_norm": 0.8368633874417354, + "learning_rate": 9.999396934483351e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.9039103090763092, + "num_tokens": 28532583.0, + "step": 350 + }, + { + "epoch": 0.0350666866476847, + "grad_norm": 0.8194956250275142, + "learning_rate": 9.99937154553819e-06, + "loss": 0.5175, + "mean_token_accuracy": 0.9020216763019562, + "num_tokens": 28614107.0, + "step": 351 + }, + { + "epoch": 0.03516659173784904, + "grad_norm": 0.9244582764182662, + "learning_rate": 9.999345633153935e-06, + "loss": 0.5176, + "mean_token_accuracy": 0.9053756594657898, + "num_tokens": 28695610.0, + "step": 352 + }, + { + "epoch": 0.03526649682801339, + "grad_norm": 1.0594303915557435, + "learning_rate": 9.999319197333304e-06, + "loss": 0.5195, + "mean_token_accuracy": 0.90304896235466, + "num_tokens": 28777080.0, + "step": 353 + }, + { + "epoch": 0.035366401918177734, + "grad_norm": 1.138630642176391, + "learning_rate": 9.999292238079061e-06, + "loss": 0.5197, + "mean_token_accuracy": 0.9044035077095032, + "num_tokens": 28858610.0, + "step": 354 + }, + { + "epoch": 0.03546630700834207, + "grad_norm": 0.8411478497737577, + "learning_rate": 9.99926475539403e-06, + "loss": 0.5135, + "mean_token_accuracy": 0.9052041172981262, + "num_tokens": 28940185.0, + "step": 355 + }, + { + "epoch": 0.03556621209850642, + "grad_norm": 0.7832706091411372, + "learning_rate": 9.999236749281089e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.9031407535076141, + "num_tokens": 29021694.0, + "step": 356 + }, + { + "epoch": 0.035666117188670764, + "grad_norm": 0.7330119625703923, + "learning_rate": 9.99920821974317e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.901752382516861, + "num_tokens": 29103198.0, + "step": 357 + }, + { + "epoch": 0.03576602227883511, + "grad_norm": 0.9179849236628156, + "learning_rate": 9.999179166783259e-06, + "loss": 0.5179, + "mean_token_accuracy": 0.9026069343090057, + "num_tokens": 29184738.0, + "step": 358 + }, + { + "epoch": 0.03586592736899945, + "grad_norm": 0.8313632403341981, + "learning_rate": 9.9991495904044e-06, + "loss": 0.5179, + "mean_token_accuracy": 0.90512815117836, + "num_tokens": 29266285.0, + "step": 359 + }, + { + "epoch": 0.03596583245916379, + "grad_norm": 0.7438905932755572, + "learning_rate": 9.999119490609688e-06, + "loss": 0.5156, + "mean_token_accuracy": 0.9036875069141388, + "num_tokens": 29347787.0, + "step": 360 + }, + { + "epoch": 0.03606573754932814, + "grad_norm": 1.0881814225060076, + "learning_rate": 9.999088867402276e-06, + "loss": 0.5246, + "mean_token_accuracy": 0.9029636383056641, + "num_tokens": 29429221.0, + "step": 361 + }, + { + "epoch": 0.036165642639492485, + "grad_norm": 0.9144898844942023, + "learning_rate": 9.999057720785368e-06, + "loss": 0.515, + "mean_token_accuracy": 0.904909074306488, + "num_tokens": 29510742.0, + "step": 362 + }, + { + "epoch": 0.03626554772965682, + "grad_norm": 0.8800422343109038, + "learning_rate": 9.999026050762227e-06, + "loss": 0.5241, + "mean_token_accuracy": 0.9003473520278931, + "num_tokens": 29592205.0, + "step": 363 + }, + { + "epoch": 0.03636545281982117, + "grad_norm": 0.9273086889202251, + "learning_rate": 9.998993857336167e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.906184583902359, + "num_tokens": 29673757.0, + "step": 364 + }, + { + "epoch": 0.036465357909985514, + "grad_norm": 1.1455524002090578, + "learning_rate": 9.99896114051056e-06, + "loss": 0.515, + "mean_token_accuracy": 0.9047577381134033, + "num_tokens": 29755292.0, + "step": 365 + }, + { + "epoch": 0.03656526300014986, + "grad_norm": 0.9623743134717522, + "learning_rate": 9.998927900288833e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.9044807851314545, + "num_tokens": 29836788.0, + "step": 366 + }, + { + "epoch": 0.0366651680903142, + "grad_norm": 1.3509360024072072, + "learning_rate": 9.998894136674464e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.9044117629528046, + "num_tokens": 29918329.0, + "step": 367 + }, + { + "epoch": 0.036765073180478544, + "grad_norm": 0.8038509916247165, + "learning_rate": 9.998859849670987e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.903699517250061, + "num_tokens": 29999892.0, + "step": 368 + }, + { + "epoch": 0.03686497827064289, + "grad_norm": 0.984663465270005, + "learning_rate": 9.998825039281997e-06, + "loss": 0.5167, + "mean_token_accuracy": 0.9027835428714752, + "num_tokens": 30081361.0, + "step": 369 + }, + { + "epoch": 0.036964883360807235, + "grad_norm": 1.4769470260039324, + "learning_rate": 9.998789705511131e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.9019716084003448, + "num_tokens": 30162929.0, + "step": 370 + }, + { + "epoch": 0.037064788450971574, + "grad_norm": 1.003804942187557, + "learning_rate": 9.998753848362096e-06, + "loss": 0.5138, + "mean_token_accuracy": 0.9040416777133942, + "num_tokens": 30244388.0, + "step": 371 + }, + { + "epoch": 0.03716469354113592, + "grad_norm": 1.3122962463777368, + "learning_rate": 9.998717467838643e-06, + "loss": 0.5211, + "mean_token_accuracy": 0.9018543660640717, + "num_tokens": 30325868.0, + "step": 372 + }, + { + "epoch": 0.037264598631300265, + "grad_norm": 0.9199198367871689, + "learning_rate": 9.99868056394458e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.9064778089523315, + "num_tokens": 30407375.0, + "step": 373 + }, + { + "epoch": 0.03736450372146461, + "grad_norm": 1.0899525098979264, + "learning_rate": 9.998643136683772e-06, + "loss": 0.5312, + "mean_token_accuracy": 0.9008612930774689, + "num_tokens": 30488787.0, + "step": 374 + }, + { + "epoch": 0.03746440881162895, + "grad_norm": 0.8355603918686971, + "learning_rate": 9.998605186060138e-06, + "loss": 0.517, + "mean_token_accuracy": 0.9041275084018707, + "num_tokens": 30570337.0, + "step": 375 + }, + { + "epoch": 0.037564313901793295, + "grad_norm": 0.8500137911712791, + "learning_rate": 9.99856671207765e-06, + "loss": 0.512, + "mean_token_accuracy": 0.903048574924469, + "num_tokens": 30651859.0, + "step": 376 + }, + { + "epoch": 0.03766421899195764, + "grad_norm": 1.0738079497706137, + "learning_rate": 9.99852771474034e-06, + "loss": 0.5229, + "mean_token_accuracy": 0.9035312533378601, + "num_tokens": 30733331.0, + "step": 377 + }, + { + "epoch": 0.037764124082121986, + "grad_norm": 0.8153082561598483, + "learning_rate": 9.998488194052287e-06, + "loss": 0.5188, + "mean_token_accuracy": 0.9046271741390228, + "num_tokens": 30814783.0, + "step": 378 + }, + { + "epoch": 0.037864029172286325, + "grad_norm": 0.9094617410927809, + "learning_rate": 9.99844815001763e-06, + "loss": 0.5179, + "mean_token_accuracy": 0.9033401906490326, + "num_tokens": 30896329.0, + "step": 379 + }, + { + "epoch": 0.03796393426245067, + "grad_norm": 0.9545689662047789, + "learning_rate": 9.99840758264056e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.904246598482132, + "num_tokens": 30977825.0, + "step": 380 + }, + { + "epoch": 0.038063839352615016, + "grad_norm": 1.854414929908087, + "learning_rate": 9.99836649192533e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.9042709767818451, + "num_tokens": 31059410.0, + "step": 381 + }, + { + "epoch": 0.03816374444277936, + "grad_norm": 0.778525874230105, + "learning_rate": 9.998324877876237e-06, + "loss": 0.5191, + "mean_token_accuracy": 0.9050759077072144, + "num_tokens": 31140879.0, + "step": 382 + }, + { + "epoch": 0.03826364953294371, + "grad_norm": 0.8421473285803887, + "learning_rate": 9.99828274049764e-06, + "loss": 0.5212, + "mean_token_accuracy": 0.9049160480499268, + "num_tokens": 31222305.0, + "step": 383 + }, + { + "epoch": 0.038363554623108045, + "grad_norm": 1.5757339933701047, + "learning_rate": 9.99824007979395e-06, + "loss": 0.5166, + "mean_token_accuracy": 0.9052997827529907, + "num_tokens": 31303793.0, + "step": 384 + }, + { + "epoch": 0.03846345971327239, + "grad_norm": 0.8955124512134167, + "learning_rate": 9.998196895769637e-06, + "loss": 0.5145, + "mean_token_accuracy": 0.9042021334171295, + "num_tokens": 31385336.0, + "step": 385 + }, + { + "epoch": 0.03856336480343674, + "grad_norm": 0.8653248768317131, + "learning_rate": 9.998153188429216e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.9064939022064209, + "num_tokens": 31466953.0, + "step": 386 + }, + { + "epoch": 0.03866326989360108, + "grad_norm": 0.912222355710456, + "learning_rate": 9.998108957777269e-06, + "loss": 0.512, + "mean_token_accuracy": 0.9033716022968292, + "num_tokens": 31548429.0, + "step": 387 + }, + { + "epoch": 0.03876317498376542, + "grad_norm": 2.325950926617422, + "learning_rate": 9.998064203818423e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.9039231240749359, + "num_tokens": 31630095.0, + "step": 388 + }, + { + "epoch": 0.038863080073929766, + "grad_norm": 0.8039211267962989, + "learning_rate": 9.998018926557366e-06, + "loss": 0.518, + "mean_token_accuracy": 0.9017524719238281, + "num_tokens": 31711587.0, + "step": 389 + }, + { + "epoch": 0.03896298516409411, + "grad_norm": 0.8509571065808872, + "learning_rate": 9.997973125998837e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.9045479595661163, + "num_tokens": 31793116.0, + "step": 390 + }, + { + "epoch": 0.03906289025425846, + "grad_norm": 1.056371689338982, + "learning_rate": 9.997926802147635e-06, + "loss": 0.5151, + "mean_token_accuracy": 0.9061518013477325, + "num_tokens": 31874590.0, + "step": 391 + }, + { + "epoch": 0.039162795344422796, + "grad_norm": 1.0811653702348734, + "learning_rate": 9.997879955008607e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.9050158858299255, + "num_tokens": 31956146.0, + "step": 392 + }, + { + "epoch": 0.03926270043458714, + "grad_norm": 0.8646258432815312, + "learning_rate": 9.997832584586657e-06, + "loss": 0.5193, + "mean_token_accuracy": 0.9025839865207672, + "num_tokens": 32037634.0, + "step": 393 + }, + { + "epoch": 0.03936260552475149, + "grad_norm": 1.077391328903099, + "learning_rate": 9.997784690886747e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.9074783027172089, + "num_tokens": 32119184.0, + "step": 394 + }, + { + "epoch": 0.03946251061491583, + "grad_norm": 2.045357326347662, + "learning_rate": 9.99773627391389e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.9066481590270996, + "num_tokens": 32200770.0, + "step": 395 + }, + { + "epoch": 0.03956241570508017, + "grad_norm": 0.8619209885596495, + "learning_rate": 9.997687333673158e-06, + "loss": 0.518, + "mean_token_accuracy": 0.9034686386585236, + "num_tokens": 32282287.0, + "step": 396 + }, + { + "epoch": 0.03966232079524452, + "grad_norm": 0.8597861845558574, + "learning_rate": 9.997637870169673e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.9055591821670532, + "num_tokens": 32363823.0, + "step": 397 + }, + { + "epoch": 0.03976222588540886, + "grad_norm": 0.9340805619953239, + "learning_rate": 9.997587883408611e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.9050257802009583, + "num_tokens": 32445340.0, + "step": 398 + }, + { + "epoch": 0.03986213097557321, + "grad_norm": 0.8239429203328559, + "learning_rate": 9.997537373395212e-06, + "loss": 0.5114, + "mean_token_accuracy": 0.9067528247833252, + "num_tokens": 32526907.0, + "step": 399 + }, + { + "epoch": 0.03996203606573755, + "grad_norm": 0.7195317646610584, + "learning_rate": 9.997486340134759e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.9056168794631958, + "num_tokens": 32608470.0, + "step": 400 + }, + { + "epoch": 0.04006194115590189, + "grad_norm": 1.0420156446075945, + "learning_rate": 9.997434783632599e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.9060550630092621, + "num_tokens": 32690069.0, + "step": 401 + }, + { + "epoch": 0.04016184624606624, + "grad_norm": 1.3015362973082416, + "learning_rate": 9.997382703894128e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.9066563248634338, + "num_tokens": 32771676.0, + "step": 402 + }, + { + "epoch": 0.04026175133623058, + "grad_norm": 2.1108435437032362, + "learning_rate": 9.9973301009248e-06, + "loss": 0.5196, + "mean_token_accuracy": 0.9037097990512848, + "num_tokens": 32853148.0, + "step": 403 + }, + { + "epoch": 0.04036165642639492, + "grad_norm": 0.9471002694964457, + "learning_rate": 9.997276974730121e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.9060743451118469, + "num_tokens": 32934675.0, + "step": 404 + }, + { + "epoch": 0.04046156151655927, + "grad_norm": 1.8703497839527934, + "learning_rate": 9.997223325315652e-06, + "loss": 0.512, + "mean_token_accuracy": 0.9040623605251312, + "num_tokens": 33016144.0, + "step": 405 + }, + { + "epoch": 0.04056146660672361, + "grad_norm": 0.7241677112023546, + "learning_rate": 9.997169152687016e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.9062772393226624, + "num_tokens": 33097634.0, + "step": 406 + }, + { + "epoch": 0.04066137169688796, + "grad_norm": 0.9179108675656568, + "learning_rate": 9.99711445684988e-06, + "loss": 0.507, + "mean_token_accuracy": 0.9053508639335632, + "num_tokens": 33179219.0, + "step": 407 + }, + { + "epoch": 0.0407612767870523, + "grad_norm": 0.7360958933476652, + "learning_rate": 9.997059237809973e-06, + "loss": 0.5156, + "mean_token_accuracy": 0.9014551937580109, + "num_tokens": 33260767.0, + "step": 408 + }, + { + "epoch": 0.04086118187721664, + "grad_norm": 0.9929429799567825, + "learning_rate": 9.997003495573073e-06, + "loss": 0.5179, + "mean_token_accuracy": 0.9026874303817749, + "num_tokens": 33342315.0, + "step": 409 + }, + { + "epoch": 0.04096108696738099, + "grad_norm": 1.1365842549482446, + "learning_rate": 9.99694723014502e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.9033368825912476, + "num_tokens": 33423853.0, + "step": 410 + }, + { + "epoch": 0.041060992057545334, + "grad_norm": 1.324942094890923, + "learning_rate": 9.996890441531702e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.9042468667030334, + "num_tokens": 33505349.0, + "step": 411 + }, + { + "epoch": 0.04116089714770967, + "grad_norm": 0.7695414647802867, + "learning_rate": 9.996833129739068e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.9043255746364594, + "num_tokens": 33586911.0, + "step": 412 + }, + { + "epoch": 0.04126080223787402, + "grad_norm": 0.9553711411649004, + "learning_rate": 9.996775294773118e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.9050535261631012, + "num_tokens": 33668493.0, + "step": 413 + }, + { + "epoch": 0.041360707328038364, + "grad_norm": 1.0959375168117282, + "learning_rate": 9.996716936639905e-06, + "loss": 0.5187, + "mean_token_accuracy": 0.9034704864025116, + "num_tokens": 33749967.0, + "step": 414 + }, + { + "epoch": 0.04146061241820271, + "grad_norm": 1.2160205557154706, + "learning_rate": 9.996658055345542e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.904901772737503, + "num_tokens": 33831506.0, + "step": 415 + }, + { + "epoch": 0.04156051750836705, + "grad_norm": 0.7367064120333245, + "learning_rate": 9.996598650896191e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.901189923286438, + "num_tokens": 33913023.0, + "step": 416 + }, + { + "epoch": 0.041660422598531394, + "grad_norm": 0.9192474769727713, + "learning_rate": 9.996538723298075e-06, + "loss": 0.5196, + "mean_token_accuracy": 0.9051949083805084, + "num_tokens": 33994490.0, + "step": 417 + }, + { + "epoch": 0.04176032768869574, + "grad_norm": 0.792835571554044, + "learning_rate": 9.996478272557465e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.9065754115581512, + "num_tokens": 34076027.0, + "step": 418 + }, + { + "epoch": 0.041860232778860085, + "grad_norm": 0.9510095881096441, + "learning_rate": 9.996417298680695e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.9017641842365265, + "num_tokens": 34157541.0, + "step": 419 + }, + { + "epoch": 0.04196013786902442, + "grad_norm": 0.7962309538716148, + "learning_rate": 9.996355801674145e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.9044212698936462, + "num_tokens": 34239059.0, + "step": 420 + }, + { + "epoch": 0.04206004295918877, + "grad_norm": 0.6907377147841642, + "learning_rate": 9.996293781544255e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.9069874882698059, + "num_tokens": 34320540.0, + "step": 421 + }, + { + "epoch": 0.042159948049353115, + "grad_norm": 0.627226643505353, + "learning_rate": 9.996231238297516e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.9073544442653656, + "num_tokens": 34402092.0, + "step": 422 + }, + { + "epoch": 0.04225985313951746, + "grad_norm": 1.0230940905503392, + "learning_rate": 9.996168171940482e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.9079008102416992, + "num_tokens": 34483623.0, + "step": 423 + }, + { + "epoch": 0.042359758229681806, + "grad_norm": 1.0509299259273626, + "learning_rate": 9.996104582479752e-06, + "loss": 0.507, + "mean_token_accuracy": 0.9051198959350586, + "num_tokens": 34565137.0, + "step": 424 + }, + { + "epoch": 0.042459663319846144, + "grad_norm": 1.089552328638866, + "learning_rate": 9.996040469921983e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.9033429324626923, + "num_tokens": 34646692.0, + "step": 425 + }, + { + "epoch": 0.04255956841001049, + "grad_norm": 2.1819592936463295, + "learning_rate": 9.99597583427389e-06, + "loss": 0.503, + "mean_token_accuracy": 0.9071929454803467, + "num_tokens": 34728295.0, + "step": 426 + }, + { + "epoch": 0.042659473500174835, + "grad_norm": 0.7866235945040047, + "learning_rate": 9.995910675542243e-06, + "loss": 0.5186, + "mean_token_accuracy": 0.9011795818805695, + "num_tokens": 34809765.0, + "step": 427 + }, + { + "epoch": 0.04275937859033918, + "grad_norm": 0.7982660623184764, + "learning_rate": 9.995844993733857e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.9048214256763458, + "num_tokens": 34891228.0, + "step": 428 + }, + { + "epoch": 0.04285928368050352, + "grad_norm": 0.7299863786132476, + "learning_rate": 9.995778788855614e-06, + "loss": 0.5166, + "mean_token_accuracy": 0.9064467251300812, + "num_tokens": 34972681.0, + "step": 429 + }, + { + "epoch": 0.042959188770667865, + "grad_norm": 0.647099705082551, + "learning_rate": 9.995712060914445e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.9072273671627045, + "num_tokens": 35054230.0, + "step": 430 + }, + { + "epoch": 0.04305909386083221, + "grad_norm": 0.6882186853193614, + "learning_rate": 9.995644809917337e-06, + "loss": 0.516, + "mean_token_accuracy": 0.9031301438808441, + "num_tokens": 35135710.0, + "step": 431 + }, + { + "epoch": 0.043158998950996556, + "grad_norm": 0.6448945109441085, + "learning_rate": 9.99557703587133e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.9050428569316864, + "num_tokens": 35217175.0, + "step": 432 + }, + { + "epoch": 0.043258904041160895, + "grad_norm": 0.7645523459592615, + "learning_rate": 9.99550873878352e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.906006932258606, + "num_tokens": 35298707.0, + "step": 433 + }, + { + "epoch": 0.04335880913132524, + "grad_norm": 0.7498116545753141, + "learning_rate": 9.995439918661058e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.9059284925460815, + "num_tokens": 35380286.0, + "step": 434 + }, + { + "epoch": 0.043458714221489586, + "grad_norm": 0.7822589426266533, + "learning_rate": 9.995370575511151e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.904312014579773, + "num_tokens": 35461747.0, + "step": 435 + }, + { + "epoch": 0.04355861931165393, + "grad_norm": 0.8336132357061709, + "learning_rate": 9.995300709341058e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.9027913808822632, + "num_tokens": 35543239.0, + "step": 436 + }, + { + "epoch": 0.04365852440181827, + "grad_norm": 0.7898930871370808, + "learning_rate": 9.995230320158092e-06, + "loss": 0.5188, + "mean_token_accuracy": 0.9016829133033752, + "num_tokens": 35624654.0, + "step": 437 + }, + { + "epoch": 0.043758429491982616, + "grad_norm": 0.7486989765088834, + "learning_rate": 9.995159407969626e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.9025857746601105, + "num_tokens": 35706169.0, + "step": 438 + }, + { + "epoch": 0.04385833458214696, + "grad_norm": 0.8203684718189802, + "learning_rate": 9.995087972783084e-06, + "loss": 0.5166, + "mean_token_accuracy": 0.9033876955509186, + "num_tokens": 35787629.0, + "step": 439 + }, + { + "epoch": 0.04395823967231131, + "grad_norm": 0.7655161803513648, + "learning_rate": 9.995016014605945e-06, + "loss": 0.5149, + "mean_token_accuracy": 0.9049490690231323, + "num_tokens": 35869129.0, + "step": 440 + }, + { + "epoch": 0.044058144762475646, + "grad_norm": 0.771400553588589, + "learning_rate": 9.994943533445742e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.904741495847702, + "num_tokens": 35950613.0, + "step": 441 + }, + { + "epoch": 0.04415804985263999, + "grad_norm": 0.7433268125882516, + "learning_rate": 9.994870529310065e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.9023516774177551, + "num_tokens": 36032151.0, + "step": 442 + }, + { + "epoch": 0.04425795494280434, + "grad_norm": 0.8589299950051543, + "learning_rate": 9.994797002206558e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.9044131934642792, + "num_tokens": 36113710.0, + "step": 443 + }, + { + "epoch": 0.04435786003296868, + "grad_norm": 0.7767256089787942, + "learning_rate": 9.994722952142919e-06, + "loss": 0.5198, + "mean_token_accuracy": 0.9028339982032776, + "num_tokens": 36195142.0, + "step": 444 + }, + { + "epoch": 0.04445776512313302, + "grad_norm": 0.781797615057403, + "learning_rate": 9.9946483791269e-06, + "loss": 0.5147, + "mean_token_accuracy": 0.9025560617446899, + "num_tokens": 36276657.0, + "step": 445 + }, + { + "epoch": 0.044557670213297366, + "grad_norm": 0.7726514181908638, + "learning_rate": 9.99457328316631e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.902883380651474, + "num_tokens": 36358169.0, + "step": 446 + }, + { + "epoch": 0.04465757530346171, + "grad_norm": 0.8003176950613174, + "learning_rate": 9.99449766426901e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.9099579453468323, + "num_tokens": 36439690.0, + "step": 447 + }, + { + "epoch": 0.04475748039362606, + "grad_norm": 0.7191553450057311, + "learning_rate": 9.99442152244292e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.9047444462776184, + "num_tokens": 36521270.0, + "step": 448 + }, + { + "epoch": 0.044857385483790396, + "grad_norm": 0.7848635659386414, + "learning_rate": 9.99434485769601e-06, + "loss": 0.514, + "mean_token_accuracy": 0.905131995677948, + "num_tokens": 36602765.0, + "step": 449 + }, + { + "epoch": 0.04495729057395474, + "grad_norm": 0.7170240689251834, + "learning_rate": 9.994267670036309e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.9053889513015747, + "num_tokens": 36684311.0, + "step": 450 + }, + { + "epoch": 0.04505719566411909, + "grad_norm": 0.7964548890755938, + "learning_rate": 9.994189959471895e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.9033717215061188, + "num_tokens": 36765833.0, + "step": 451 + }, + { + "epoch": 0.04515710075428343, + "grad_norm": 1.2046840227051288, + "learning_rate": 9.994111726010909e-06, + "loss": 0.505, + "mean_token_accuracy": 0.9061475694179535, + "num_tokens": 36847379.0, + "step": 452 + }, + { + "epoch": 0.04525700584444777, + "grad_norm": 0.8178181811003103, + "learning_rate": 9.99403296966154e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.9052334427833557, + "num_tokens": 36928788.0, + "step": 453 + }, + { + "epoch": 0.04535691093461212, + "grad_norm": 0.7894560817976128, + "learning_rate": 9.993953690432032e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.9061615467071533, + "num_tokens": 37010389.0, + "step": 454 + }, + { + "epoch": 0.04545681602477646, + "grad_norm": 0.6580032513049264, + "learning_rate": 9.993873888330688e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.9052574634552002, + "num_tokens": 37091966.0, + "step": 455 + }, + { + "epoch": 0.04555672111494081, + "grad_norm": 0.8114508994197283, + "learning_rate": 9.993793563365864e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.9046814739704132, + "num_tokens": 37173544.0, + "step": 456 + }, + { + "epoch": 0.04565662620510515, + "grad_norm": 0.8579401385919604, + "learning_rate": 9.993712715545966e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.9045297503471375, + "num_tokens": 37255123.0, + "step": 457 + }, + { + "epoch": 0.04575653129526949, + "grad_norm": 0.6639185441619564, + "learning_rate": 9.993631344879465e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.9039274752140045, + "num_tokens": 37336688.0, + "step": 458 + }, + { + "epoch": 0.04585643638543384, + "grad_norm": 0.786976753005559, + "learning_rate": 9.993549451374873e-06, + "loss": 0.5149, + "mean_token_accuracy": 0.9068180322647095, + "num_tokens": 37418170.0, + "step": 459 + }, + { + "epoch": 0.045956341475598184, + "grad_norm": 0.938568482354313, + "learning_rate": 9.993467035040772e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.9067065715789795, + "num_tokens": 37499817.0, + "step": 460 + }, + { + "epoch": 0.04605624656576252, + "grad_norm": 0.9464423404004395, + "learning_rate": 9.993384095885786e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.9015226066112518, + "num_tokens": 37581309.0, + "step": 461 + }, + { + "epoch": 0.04615615165592687, + "grad_norm": 0.7375638646687701, + "learning_rate": 9.993300633918602e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.9055782556533813, + "num_tokens": 37662874.0, + "step": 462 + }, + { + "epoch": 0.04625605674609121, + "grad_norm": 0.6770484595233914, + "learning_rate": 9.993216649147955e-06, + "loss": 0.51, + "mean_token_accuracy": 0.9057578444480896, + "num_tokens": 37744397.0, + "step": 463 + }, + { + "epoch": 0.04635596183625556, + "grad_norm": 0.9664731466219623, + "learning_rate": 9.99313214158264e-06, + "loss": 0.519, + "mean_token_accuracy": 0.9022270143032074, + "num_tokens": 37825846.0, + "step": 464 + }, + { + "epoch": 0.046455866926419905, + "grad_norm": 0.7226217678936787, + "learning_rate": 9.993047111231507e-06, + "loss": 0.5163, + "mean_token_accuracy": 0.9049926400184631, + "num_tokens": 37907329.0, + "step": 465 + }, + { + "epoch": 0.04655577201658424, + "grad_norm": 0.6533062092375692, + "learning_rate": 9.992961558103455e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.9030300378799438, + "num_tokens": 37988832.0, + "step": 466 + }, + { + "epoch": 0.04665567710674859, + "grad_norm": 0.9071851484835749, + "learning_rate": 9.992875482207445e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.9028553068637848, + "num_tokens": 38070362.0, + "step": 467 + }, + { + "epoch": 0.046755582196912934, + "grad_norm": 0.752007492238216, + "learning_rate": 9.992788883552487e-06, + "loss": 0.5177, + "mean_token_accuracy": 0.9012612402439117, + "num_tokens": 38151854.0, + "step": 468 + }, + { + "epoch": 0.04685548728707728, + "grad_norm": 1.4059264430798233, + "learning_rate": 9.99270176214765e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.9054809212684631, + "num_tokens": 38233415.0, + "step": 469 + }, + { + "epoch": 0.04695539237724162, + "grad_norm": 0.7212687365639884, + "learning_rate": 9.992614118002054e-06, + "loss": 0.5133, + "mean_token_accuracy": 0.9027402102947235, + "num_tokens": 38314932.0, + "step": 470 + }, + { + "epoch": 0.047055297467405964, + "grad_norm": 1.0987863297645792, + "learning_rate": 9.992525951124873e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.9045037031173706, + "num_tokens": 38396494.0, + "step": 471 + }, + { + "epoch": 0.04715520255757031, + "grad_norm": 0.9394084669655902, + "learning_rate": 9.992437261525343e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.9000013172626495, + "num_tokens": 38478012.0, + "step": 472 + }, + { + "epoch": 0.047255107647734655, + "grad_norm": 0.7286986250999907, + "learning_rate": 9.99234804921275e-06, + "loss": 0.51, + "mean_token_accuracy": 0.90263831615448, + "num_tokens": 38559528.0, + "step": 473 + }, + { + "epoch": 0.047355012737898994, + "grad_norm": 0.9223841156823638, + "learning_rate": 9.99225831419643e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.9061861932277679, + "num_tokens": 38641005.0, + "step": 474 + }, + { + "epoch": 0.04745491782806334, + "grad_norm": 0.7559147291283137, + "learning_rate": 9.992168056485781e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.9071686267852783, + "num_tokens": 38722592.0, + "step": 475 + }, + { + "epoch": 0.047554822918227685, + "grad_norm": 0.8158049716548873, + "learning_rate": 9.992077276090254e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.9064978957176208, + "num_tokens": 38804058.0, + "step": 476 + }, + { + "epoch": 0.04765472800839203, + "grad_norm": 0.9274002734676675, + "learning_rate": 9.991985973019351e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.9066363275051117, + "num_tokens": 38885562.0, + "step": 477 + }, + { + "epoch": 0.04775463309855637, + "grad_norm": 0.907284123467927, + "learning_rate": 9.991894147282635e-06, + "loss": 0.511, + "mean_token_accuracy": 0.9043181538581848, + "num_tokens": 38967070.0, + "step": 478 + }, + { + "epoch": 0.047854538188720715, + "grad_norm": 0.8163454966463022, + "learning_rate": 9.991801798889718e-06, + "loss": 0.5147, + "mean_token_accuracy": 0.903928816318512, + "num_tokens": 39048558.0, + "step": 479 + }, + { + "epoch": 0.04795444327888506, + "grad_norm": 0.6982978257559952, + "learning_rate": 9.99170892785027e-06, + "loss": 0.5118, + "mean_token_accuracy": 0.9043801724910736, + "num_tokens": 39130064.0, + "step": 480 + }, + { + "epoch": 0.048054348369049406, + "grad_norm": 0.7495626653966637, + "learning_rate": 9.991615534174014e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.9025121927261353, + "num_tokens": 39211503.0, + "step": 481 + }, + { + "epoch": 0.048154253459213744, + "grad_norm": 0.7198619234706257, + "learning_rate": 9.991521617870726e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.9052240550518036, + "num_tokens": 39293006.0, + "step": 482 + }, + { + "epoch": 0.04825415854937809, + "grad_norm": 0.6975193466695921, + "learning_rate": 9.991427178950243e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.905908614397049, + "num_tokens": 39374466.0, + "step": 483 + }, + { + "epoch": 0.048354063639542436, + "grad_norm": 0.7430858906417656, + "learning_rate": 9.991332217422454e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.904463142156601, + "num_tokens": 39455946.0, + "step": 484 + }, + { + "epoch": 0.04845396872970678, + "grad_norm": 0.7792055349757625, + "learning_rate": 9.991236733297295e-06, + "loss": 0.5146, + "mean_token_accuracy": 0.9034795463085175, + "num_tokens": 39537453.0, + "step": 485 + }, + { + "epoch": 0.04855387381987112, + "grad_norm": 0.6532865376312356, + "learning_rate": 9.99114072658477e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.9067137241363525, + "num_tokens": 39618939.0, + "step": 486 + }, + { + "epoch": 0.048653778910035465, + "grad_norm": 1.1711664192074567, + "learning_rate": 9.991044197294927e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.9048110842704773, + "num_tokens": 39700461.0, + "step": 487 + }, + { + "epoch": 0.04875368400019981, + "grad_norm": 0.7041838881143645, + "learning_rate": 9.990947145437878e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.9060819149017334, + "num_tokens": 39782012.0, + "step": 488 + }, + { + "epoch": 0.048853589090364156, + "grad_norm": 0.9970062215294867, + "learning_rate": 9.990849571023775e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.9043762683868408, + "num_tokens": 39863537.0, + "step": 489 + }, + { + "epoch": 0.048953494180528495, + "grad_norm": 0.9172731806791087, + "learning_rate": 9.990751474062843e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.906123548746109, + "num_tokens": 39945031.0, + "step": 490 + }, + { + "epoch": 0.04905339927069284, + "grad_norm": 0.6735565883671681, + "learning_rate": 9.990652854565348e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.9028599560260773, + "num_tokens": 40026595.0, + "step": 491 + }, + { + "epoch": 0.049153304360857186, + "grad_norm": 0.8702566906804713, + "learning_rate": 9.990553712541617e-06, + "loss": 0.5082, + "mean_token_accuracy": 0.9061378240585327, + "num_tokens": 40108098.0, + "step": 492 + }, + { + "epoch": 0.04925320945102153, + "grad_norm": 0.8040268538035247, + "learning_rate": 9.990454048002033e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.9054019451141357, + "num_tokens": 40189627.0, + "step": 493 + }, + { + "epoch": 0.04935311454118587, + "grad_norm": 0.7782740951388082, + "learning_rate": 9.990353860957025e-06, + "loss": 0.5143, + "mean_token_accuracy": 0.903734415769577, + "num_tokens": 40271116.0, + "step": 494 + }, + { + "epoch": 0.049453019631350216, + "grad_norm": 0.7717176790233482, + "learning_rate": 9.990253151417087e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.9037680625915527, + "num_tokens": 40352637.0, + "step": 495 + }, + { + "epoch": 0.04955292472151456, + "grad_norm": 0.7403657002647537, + "learning_rate": 9.990151919392762e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.9044210910797119, + "num_tokens": 40434117.0, + "step": 496 + }, + { + "epoch": 0.04965282981167891, + "grad_norm": 0.8786453787565169, + "learning_rate": 9.99005016489465e-06, + "loss": 0.5168, + "mean_token_accuracy": 0.9017777144908905, + "num_tokens": 40515561.0, + "step": 497 + }, + { + "epoch": 0.049752734901843246, + "grad_norm": 0.7544709627207139, + "learning_rate": 9.989947887933404e-06, + "loss": 0.505, + "mean_token_accuracy": 0.9066123366355896, + "num_tokens": 40597156.0, + "step": 498 + }, + { + "epoch": 0.04985263999200759, + "grad_norm": 0.7040609485830128, + "learning_rate": 9.989845088519732e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.9057628512382507, + "num_tokens": 40678672.0, + "step": 499 + }, + { + "epoch": 0.04995254508217194, + "grad_norm": 0.7654071758867231, + "learning_rate": 9.989741766664399e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.9055852591991425, + "num_tokens": 40760280.0, + "step": 500 + }, + { + "epoch": 0.05005245017233628, + "grad_norm": 0.6569184955866153, + "learning_rate": 9.989637922378222e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.9029248356819153, + "num_tokens": 40841722.0, + "step": 501 + }, + { + "epoch": 0.05015235526250062, + "grad_norm": 0.7588322101381351, + "learning_rate": 9.989533555672074e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.904725968837738, + "num_tokens": 40923236.0, + "step": 502 + }, + { + "epoch": 0.05025226035266497, + "grad_norm": 0.6984968127366319, + "learning_rate": 9.98942866655688e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.9041641652584076, + "num_tokens": 41004777.0, + "step": 503 + }, + { + "epoch": 0.05035216544282931, + "grad_norm": 0.6500414407743329, + "learning_rate": 9.989323255043623e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.9060725271701813, + "num_tokens": 41086261.0, + "step": 504 + }, + { + "epoch": 0.05045207053299366, + "grad_norm": 0.7021593176008047, + "learning_rate": 9.989217321143342e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.9021233916282654, + "num_tokens": 41167733.0, + "step": 505 + }, + { + "epoch": 0.050551975623158, + "grad_norm": 0.787335895378134, + "learning_rate": 9.989110864867126e-06, + "loss": 0.51, + "mean_token_accuracy": 0.9043927192687988, + "num_tokens": 41249334.0, + "step": 506 + }, + { + "epoch": 0.05065188071332234, + "grad_norm": 0.8823838434072949, + "learning_rate": 9.989003886226123e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.9035869538784027, + "num_tokens": 41330808.0, + "step": 507 + }, + { + "epoch": 0.05075178580348669, + "grad_norm": 0.7861434749178208, + "learning_rate": 9.988896385231532e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.9043598771095276, + "num_tokens": 41412407.0, + "step": 508 + }, + { + "epoch": 0.05085169089365103, + "grad_norm": 0.6312698761838067, + "learning_rate": 9.988788361894609e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.9087964594364166, + "num_tokens": 41493881.0, + "step": 509 + }, + { + "epoch": 0.05095159598381538, + "grad_norm": 0.6977987760930257, + "learning_rate": 9.988679816226665e-06, + "loss": 0.5133, + "mean_token_accuracy": 0.9041287302970886, + "num_tokens": 41575369.0, + "step": 510 + }, + { + "epoch": 0.05105150107397972, + "grad_norm": 0.687491133315546, + "learning_rate": 9.988570748239062e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.9035336971282959, + "num_tokens": 41656861.0, + "step": 511 + }, + { + "epoch": 0.05115140616414406, + "grad_norm": 0.6738904683838008, + "learning_rate": 9.988461157943223e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.9076258838176727, + "num_tokens": 41738455.0, + "step": 512 + }, + { + "epoch": 0.05125131125430841, + "grad_norm": 0.6629055445138887, + "learning_rate": 9.988351045350622e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.9028708338737488, + "num_tokens": 41820003.0, + "step": 513 + }, + { + "epoch": 0.051351216344472754, + "grad_norm": 0.6479575541051432, + "learning_rate": 9.988240410472784e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.9028123021125793, + "num_tokens": 41901454.0, + "step": 514 + }, + { + "epoch": 0.05145112143463709, + "grad_norm": 0.9267123869928661, + "learning_rate": 9.988129253321298e-06, + "loss": 0.516, + "mean_token_accuracy": 0.9022219777107239, + "num_tokens": 41982920.0, + "step": 515 + }, + { + "epoch": 0.05155102652480144, + "grad_norm": 0.7927440813242667, + "learning_rate": 9.988017573907798e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.9042612612247467, + "num_tokens": 42064550.0, + "step": 516 + }, + { + "epoch": 0.051650931614965784, + "grad_norm": 1.0215511635030385, + "learning_rate": 9.987905372243979e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.9075649380683899, + "num_tokens": 42146149.0, + "step": 517 + }, + { + "epoch": 0.05175083670513013, + "grad_norm": 0.7579284860191212, + "learning_rate": 9.987792648341587e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.9054402410984039, + "num_tokens": 42227702.0, + "step": 518 + }, + { + "epoch": 0.05185074179529447, + "grad_norm": 0.8245859624736199, + "learning_rate": 9.987679402212426e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.90433070063591, + "num_tokens": 42309224.0, + "step": 519 + }, + { + "epoch": 0.051950646885458814, + "grad_norm": 0.9288009735022102, + "learning_rate": 9.987565633868355e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.906031459569931, + "num_tokens": 42390697.0, + "step": 520 + }, + { + "epoch": 0.05205055197562316, + "grad_norm": 0.779111901076011, + "learning_rate": 9.98745134332128e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.9028956890106201, + "num_tokens": 42472177.0, + "step": 521 + }, + { + "epoch": 0.052150457065787505, + "grad_norm": 0.8500582905252421, + "learning_rate": 9.987336530583171e-06, + "loss": 0.5147, + "mean_token_accuracy": 0.9034871757030487, + "num_tokens": 42553610.0, + "step": 522 + }, + { + "epoch": 0.05225036215595184, + "grad_norm": 0.6113984230875845, + "learning_rate": 9.987221195666048e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.9061127007007599, + "num_tokens": 42635147.0, + "step": 523 + }, + { + "epoch": 0.05235026724611619, + "grad_norm": 0.9704648512206122, + "learning_rate": 9.987105338581988e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.9069747626781464, + "num_tokens": 42716674.0, + "step": 524 + }, + { + "epoch": 0.052450172336280534, + "grad_norm": 0.7104334775087691, + "learning_rate": 9.986988959343121e-06, + "loss": 0.52, + "mean_token_accuracy": 0.9014198184013367, + "num_tokens": 42798098.0, + "step": 525 + }, + { + "epoch": 0.05255007742644488, + "grad_norm": 0.7054553020607303, + "learning_rate": 9.98687205796163e-06, + "loss": 0.5163, + "mean_token_accuracy": 0.9040900766849518, + "num_tokens": 42879559.0, + "step": 526 + }, + { + "epoch": 0.05264998251660922, + "grad_norm": 0.6905366724405756, + "learning_rate": 9.986754634449756e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.9062585234642029, + "num_tokens": 42961146.0, + "step": 527 + }, + { + "epoch": 0.052749887606773564, + "grad_norm": 0.8205791446023527, + "learning_rate": 9.986636688819795e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.9058220386505127, + "num_tokens": 43042758.0, + "step": 528 + }, + { + "epoch": 0.05284979269693791, + "grad_norm": 0.738235725248253, + "learning_rate": 9.986518221084094e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.9074845612049103, + "num_tokens": 43124319.0, + "step": 529 + }, + { + "epoch": 0.052949697787102255, + "grad_norm": 0.9392016511832011, + "learning_rate": 9.986399231255057e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.9038276970386505, + "num_tokens": 43205814.0, + "step": 530 + }, + { + "epoch": 0.053049602877266594, + "grad_norm": 0.7855635925318327, + "learning_rate": 9.986279719345142e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.9033735394477844, + "num_tokens": 43287280.0, + "step": 531 + }, + { + "epoch": 0.05314950796743094, + "grad_norm": 1.2647030989394985, + "learning_rate": 9.986159685366862e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.9073372781276703, + "num_tokens": 43368862.0, + "step": 532 + }, + { + "epoch": 0.053249413057595285, + "grad_norm": 0.8449938459039354, + "learning_rate": 9.986039129332787e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.9037904739379883, + "num_tokens": 43450375.0, + "step": 533 + }, + { + "epoch": 0.05334931814775963, + "grad_norm": 0.8746136108517751, + "learning_rate": 9.985918051255537e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.9043111205101013, + "num_tokens": 43531921.0, + "step": 534 + }, + { + "epoch": 0.05344922323792397, + "grad_norm": 0.6746270157334029, + "learning_rate": 9.985796451147789e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.9064461290836334, + "num_tokens": 43613459.0, + "step": 535 + }, + { + "epoch": 0.053549128328088315, + "grad_norm": 0.7518525113551161, + "learning_rate": 9.985674329022275e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.908028781414032, + "num_tokens": 43695001.0, + "step": 536 + }, + { + "epoch": 0.05364903341825266, + "grad_norm": 0.822470066040826, + "learning_rate": 9.985551684891784e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.9053697884082794, + "num_tokens": 43776460.0, + "step": 537 + }, + { + "epoch": 0.053748938508417006, + "grad_norm": 0.674787033180487, + "learning_rate": 9.985428518769151e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.9052402973175049, + "num_tokens": 43858035.0, + "step": 538 + }, + { + "epoch": 0.053848843598581345, + "grad_norm": 1.160885808045797, + "learning_rate": 9.985304830667278e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.906282365322113, + "num_tokens": 43939581.0, + "step": 539 + }, + { + "epoch": 0.05394874868874569, + "grad_norm": 1.00424838389105, + "learning_rate": 9.98518062059911e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.9038306474685669, + "num_tokens": 44021055.0, + "step": 540 + }, + { + "epoch": 0.054048653778910036, + "grad_norm": 0.7190265033625625, + "learning_rate": 9.985055888577656e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.9065024852752686, + "num_tokens": 44102590.0, + "step": 541 + }, + { + "epoch": 0.05414855886907438, + "grad_norm": 0.6903510059145946, + "learning_rate": 9.984930634615973e-06, + "loss": 0.5146, + "mean_token_accuracy": 0.9030565917491913, + "num_tokens": 44184026.0, + "step": 542 + }, + { + "epoch": 0.05424846395923872, + "grad_norm": 1.1250881042314116, + "learning_rate": 9.984804858727175e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.9066107273101807, + "num_tokens": 44265568.0, + "step": 543 + }, + { + "epoch": 0.054348369049403066, + "grad_norm": 0.7772377251970849, + "learning_rate": 9.984678560924433e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.9063241481781006, + "num_tokens": 44347137.0, + "step": 544 + }, + { + "epoch": 0.05444827413956741, + "grad_norm": 0.9331832829769334, + "learning_rate": 9.98455174122097e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.9075644910335541, + "num_tokens": 44428631.0, + "step": 545 + }, + { + "epoch": 0.05454817922973176, + "grad_norm": 0.8057495061359532, + "learning_rate": 9.984424399630064e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.9038012027740479, + "num_tokens": 44510141.0, + "step": 546 + }, + { + "epoch": 0.054648084319896095, + "grad_norm": 0.8119913879154617, + "learning_rate": 9.984296536165046e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.9057769775390625, + "num_tokens": 44591625.0, + "step": 547 + }, + { + "epoch": 0.05474798941006044, + "grad_norm": 1.0548551216180864, + "learning_rate": 9.984168150839305e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.9074825942516327, + "num_tokens": 44673188.0, + "step": 548 + }, + { + "epoch": 0.054847894500224786, + "grad_norm": 0.8265675694793646, + "learning_rate": 9.984039243666284e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.8993583917617798, + "num_tokens": 44754713.0, + "step": 549 + }, + { + "epoch": 0.05494779959038913, + "grad_norm": 0.683546139282791, + "learning_rate": 9.983909814659476e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.908699095249176, + "num_tokens": 44836223.0, + "step": 550 + }, + { + "epoch": 0.05504770468055348, + "grad_norm": 0.7885430158083524, + "learning_rate": 9.983779863832436e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.9088265895843506, + "num_tokens": 44917788.0, + "step": 551 + }, + { + "epoch": 0.055147609770717816, + "grad_norm": 1.1866429005287884, + "learning_rate": 9.983649391198771e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.903723806142807, + "num_tokens": 44999283.0, + "step": 552 + }, + { + "epoch": 0.05524751486088216, + "grad_norm": 0.8554684114970736, + "learning_rate": 9.983518396772138e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.9061470329761505, + "num_tokens": 45080837.0, + "step": 553 + }, + { + "epoch": 0.05534741995104651, + "grad_norm": 0.8380919098671191, + "learning_rate": 9.983386880566253e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.9045455455780029, + "num_tokens": 45162348.0, + "step": 554 + }, + { + "epoch": 0.05544732504121085, + "grad_norm": 0.731923275950148, + "learning_rate": 9.983254842594887e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.9036534130573273, + "num_tokens": 45243899.0, + "step": 555 + }, + { + "epoch": 0.05554723013137519, + "grad_norm": 0.8936689012232678, + "learning_rate": 9.983122282871865e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.9040939211845398, + "num_tokens": 45325352.0, + "step": 556 + }, + { + "epoch": 0.05564713522153954, + "grad_norm": 0.7697734497895199, + "learning_rate": 9.982989201411064e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.9028273224830627, + "num_tokens": 45406828.0, + "step": 557 + }, + { + "epoch": 0.05574704031170388, + "grad_norm": 1.2987463443252372, + "learning_rate": 9.98285559822642e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.9063084721565247, + "num_tokens": 45488407.0, + "step": 558 + }, + { + "epoch": 0.05584694540186823, + "grad_norm": 0.7389610226671909, + "learning_rate": 9.98272147333192e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.9042415916919708, + "num_tokens": 45569903.0, + "step": 559 + }, + { + "epoch": 0.05594685049203257, + "grad_norm": 0.7692261728229135, + "learning_rate": 9.982586826741609e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.9031031429767609, + "num_tokens": 45651439.0, + "step": 560 + }, + { + "epoch": 0.05604675558219691, + "grad_norm": 0.7424216771074469, + "learning_rate": 9.98245165846958e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.9033664166927338, + "num_tokens": 45732955.0, + "step": 561 + }, + { + "epoch": 0.05614666067236126, + "grad_norm": 0.7399541020109823, + "learning_rate": 9.98231596852999e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.9027880132198334, + "num_tokens": 45814395.0, + "step": 562 + }, + { + "epoch": 0.056246565762525604, + "grad_norm": 0.8941324582876353, + "learning_rate": 9.982179756937044e-06, + "loss": 0.5175, + "mean_token_accuracy": 0.9029419422149658, + "num_tokens": 45895830.0, + "step": 563 + }, + { + "epoch": 0.05634647085268994, + "grad_norm": 0.853762422865222, + "learning_rate": 9.982043023705004e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.9041364192962646, + "num_tokens": 45977325.0, + "step": 564 + }, + { + "epoch": 0.05644637594285429, + "grad_norm": 0.8365225568637779, + "learning_rate": 9.981905768848186e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.9052121043205261, + "num_tokens": 46058873.0, + "step": 565 + }, + { + "epoch": 0.05654628103301863, + "grad_norm": 0.8418873519309005, + "learning_rate": 9.98176799238096e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.9065936505794525, + "num_tokens": 46140395.0, + "step": 566 + }, + { + "epoch": 0.05664618612318298, + "grad_norm": 0.8312973700076015, + "learning_rate": 9.98162969431775e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.9050578474998474, + "num_tokens": 46221873.0, + "step": 567 + }, + { + "epoch": 0.05674609121334732, + "grad_norm": 0.7667820149443337, + "learning_rate": 9.98149087467304e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.90684375166893, + "num_tokens": 46303380.0, + "step": 568 + }, + { + "epoch": 0.05684599630351166, + "grad_norm": 0.9002001347758731, + "learning_rate": 9.98135153346136e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.9061290919780731, + "num_tokens": 46384956.0, + "step": 569 + }, + { + "epoch": 0.05694590139367601, + "grad_norm": 1.0173485077090887, + "learning_rate": 9.981211670697303e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.9050259292125702, + "num_tokens": 46466432.0, + "step": 570 + }, + { + "epoch": 0.057045806483840354, + "grad_norm": 1.0375443395965953, + "learning_rate": 9.981071286395513e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.9078463613986969, + "num_tokens": 46548037.0, + "step": 571 + }, + { + "epoch": 0.05714571157400469, + "grad_norm": 0.845162076747433, + "learning_rate": 9.980930380570683e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.9062509536743164, + "num_tokens": 46629570.0, + "step": 572 + }, + { + "epoch": 0.05724561666416904, + "grad_norm": 0.6728785423737713, + "learning_rate": 9.980788953237572e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.9053025841712952, + "num_tokens": 46711066.0, + "step": 573 + }, + { + "epoch": 0.057345521754333384, + "grad_norm": 0.8939080520528113, + "learning_rate": 9.980647004410986e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.9060613214969635, + "num_tokens": 46792625.0, + "step": 574 + }, + { + "epoch": 0.05744542684449773, + "grad_norm": 0.741546855951992, + "learning_rate": 9.980504534105784e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.9096270799636841, + "num_tokens": 46874250.0, + "step": 575 + }, + { + "epoch": 0.05754533193466207, + "grad_norm": 0.7281847094168469, + "learning_rate": 9.980361542336887e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.9083313643932343, + "num_tokens": 46955709.0, + "step": 576 + }, + { + "epoch": 0.057645237024826414, + "grad_norm": 0.9655355832923979, + "learning_rate": 9.980218029119264e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.9065059125423431, + "num_tokens": 47037244.0, + "step": 577 + }, + { + "epoch": 0.05774514211499076, + "grad_norm": 0.8917884452836365, + "learning_rate": 9.98007399446794e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.9047136306762695, + "num_tokens": 47118746.0, + "step": 578 + }, + { + "epoch": 0.057845047205155105, + "grad_norm": 1.0523715955232624, + "learning_rate": 9.979929438397997e-06, + "loss": 0.5126, + "mean_token_accuracy": 0.903364360332489, + "num_tokens": 47200262.0, + "step": 579 + }, + { + "epoch": 0.05794495229531944, + "grad_norm": 0.8901552969360769, + "learning_rate": 9.979784360924571e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.9029790163040161, + "num_tokens": 47281770.0, + "step": 580 + }, + { + "epoch": 0.05804485738548379, + "grad_norm": 0.972028895998567, + "learning_rate": 9.979638762062851e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.9023978114128113, + "num_tokens": 47363322.0, + "step": 581 + }, + { + "epoch": 0.058144762475648135, + "grad_norm": 0.8765968163943181, + "learning_rate": 9.979492641828082e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.9046320021152496, + "num_tokens": 47444837.0, + "step": 582 + }, + { + "epoch": 0.05824466756581248, + "grad_norm": 1.3888967938915608, + "learning_rate": 9.979346000235562e-06, + "loss": 0.514, + "mean_token_accuracy": 0.9023203551769257, + "num_tokens": 47526279.0, + "step": 583 + }, + { + "epoch": 0.05834457265597682, + "grad_norm": 0.6940373526547073, + "learning_rate": 9.979198837300644e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.9056486189365387, + "num_tokens": 47607815.0, + "step": 584 + }, + { + "epoch": 0.058444477746141164, + "grad_norm": 1.8203290371930627, + "learning_rate": 9.979051153038737e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.9033860862255096, + "num_tokens": 47689283.0, + "step": 585 + }, + { + "epoch": 0.05854438283630551, + "grad_norm": 0.7192954611917483, + "learning_rate": 9.978902947465304e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.905979573726654, + "num_tokens": 47770826.0, + "step": 586 + }, + { + "epoch": 0.058644287926469855, + "grad_norm": 0.9499532368813511, + "learning_rate": 9.978754220595861e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.9059444963932037, + "num_tokens": 47852428.0, + "step": 587 + }, + { + "epoch": 0.058744193016634194, + "grad_norm": 0.9254183575580884, + "learning_rate": 9.978604972445983e-06, + "loss": 0.498, + "mean_token_accuracy": 0.9061014950275421, + "num_tokens": 47934014.0, + "step": 588 + }, + { + "epoch": 0.05884409810679854, + "grad_norm": 0.6971679479947909, + "learning_rate": 9.978455203031292e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.9062390923500061, + "num_tokens": 48015641.0, + "step": 589 + }, + { + "epoch": 0.058944003196962885, + "grad_norm": 1.181660698201177, + "learning_rate": 9.97830491236747e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.9039275944232941, + "num_tokens": 48097150.0, + "step": 590 + }, + { + "epoch": 0.05904390828712723, + "grad_norm": 1.7061084444766177, + "learning_rate": 9.978154100470255e-06, + "loss": 0.507, + "mean_token_accuracy": 0.9059683978557587, + "num_tokens": 48178693.0, + "step": 591 + }, + { + "epoch": 0.059143813377291576, + "grad_norm": 0.8832330483295004, + "learning_rate": 9.978002767355437e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.9029944837093353, + "num_tokens": 48260166.0, + "step": 592 + }, + { + "epoch": 0.059243718467455915, + "grad_norm": 0.7749832572335033, + "learning_rate": 9.977850913038858e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.9058129191398621, + "num_tokens": 48341618.0, + "step": 593 + }, + { + "epoch": 0.05934362355762026, + "grad_norm": 0.6950478514856601, + "learning_rate": 9.97769853753642e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.9042889773845673, + "num_tokens": 48423125.0, + "step": 594 + }, + { + "epoch": 0.059443528647784606, + "grad_norm": 1.0173846555759287, + "learning_rate": 9.977545640864073e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.904619574546814, + "num_tokens": 48504651.0, + "step": 595 + }, + { + "epoch": 0.05954343373794895, + "grad_norm": 0.7280512142734465, + "learning_rate": 9.97739222303783e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.9048762619495392, + "num_tokens": 48586145.0, + "step": 596 + }, + { + "epoch": 0.05964333882811329, + "grad_norm": 0.8620738477750474, + "learning_rate": 9.977238284073753e-06, + "loss": 0.505, + "mean_token_accuracy": 0.9056398272514343, + "num_tokens": 48667630.0, + "step": 597 + }, + { + "epoch": 0.059743243918277636, + "grad_norm": 1.0962290232216392, + "learning_rate": 9.977083823987957e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.90410116314888, + "num_tokens": 48749233.0, + "step": 598 + }, + { + "epoch": 0.05984314900844198, + "grad_norm": 1.415453696498204, + "learning_rate": 9.976928842796616e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.902673214673996, + "num_tokens": 48830752.0, + "step": 599 + }, + { + "epoch": 0.05994305409860633, + "grad_norm": 1.5114223902999413, + "learning_rate": 9.976773340515958e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.9031254649162292, + "num_tokens": 48912240.0, + "step": 600 + }, + { + "epoch": 0.060042959188770666, + "grad_norm": 3.420200259905924, + "learning_rate": 9.976617317162261e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.9087734222412109, + "num_tokens": 48993846.0, + "step": 601 + }, + { + "epoch": 0.06014286427893501, + "grad_norm": 1.2261219274146058, + "learning_rate": 9.976460772751863e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.9081215560436249, + "num_tokens": 49075385.0, + "step": 602 + }, + { + "epoch": 0.06024276936909936, + "grad_norm": 1.0806990222536805, + "learning_rate": 9.976303707301155e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.9059670269489288, + "num_tokens": 49156937.0, + "step": 603 + }, + { + "epoch": 0.0603426744592637, + "grad_norm": 0.9273929118653402, + "learning_rate": 9.97614612082658e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.902859628200531, + "num_tokens": 49238486.0, + "step": 604 + }, + { + "epoch": 0.06044257954942804, + "grad_norm": 2.2843422766122763, + "learning_rate": 9.975988013344638e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.9029541909694672, + "num_tokens": 49320035.0, + "step": 605 + }, + { + "epoch": 0.06054248463959239, + "grad_norm": 1.4603016225602834, + "learning_rate": 9.975829384871884e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.9061117768287659, + "num_tokens": 49401612.0, + "step": 606 + }, + { + "epoch": 0.06064238972975673, + "grad_norm": 0.8886861729545686, + "learning_rate": 9.975670235424927e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.9088165462017059, + "num_tokens": 49483222.0, + "step": 607 + }, + { + "epoch": 0.06074229481992108, + "grad_norm": 1.2550575651274172, + "learning_rate": 9.975510565020426e-06, + "loss": 0.511, + "mean_token_accuracy": 0.9057103097438812, + "num_tokens": 49564721.0, + "step": 608 + }, + { + "epoch": 0.060842199910085416, + "grad_norm": 1.0667005408526669, + "learning_rate": 9.975350373675101e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.9070724546909332, + "num_tokens": 49646270.0, + "step": 609 + }, + { + "epoch": 0.06094210500024976, + "grad_norm": 1.2823465399924951, + "learning_rate": 9.975189661405728e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.9051653146743774, + "num_tokens": 49727703.0, + "step": 610 + }, + { + "epoch": 0.06104201009041411, + "grad_norm": 0.9509295221573388, + "learning_rate": 9.975028428229128e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.9058796167373657, + "num_tokens": 49809206.0, + "step": 611 + }, + { + "epoch": 0.06114191518057845, + "grad_norm": 0.895886513254812, + "learning_rate": 9.974866674162186e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.9053772687911987, + "num_tokens": 49890719.0, + "step": 612 + }, + { + "epoch": 0.06124182027074279, + "grad_norm": 3.1084373861671435, + "learning_rate": 9.974704399221836e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.9050609767436981, + "num_tokens": 49972216.0, + "step": 613 + }, + { + "epoch": 0.06134172536090714, + "grad_norm": 0.903648970202901, + "learning_rate": 9.97454160342507e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.9069939851760864, + "num_tokens": 50053715.0, + "step": 614 + }, + { + "epoch": 0.06144163045107148, + "grad_norm": 1.5537978913564854, + "learning_rate": 9.97437828678893e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.9060637950897217, + "num_tokens": 50135234.0, + "step": 615 + }, + { + "epoch": 0.06154153554123583, + "grad_norm": 0.8918277421178396, + "learning_rate": 9.97421444933052e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.9055984616279602, + "num_tokens": 50216790.0, + "step": 616 + }, + { + "epoch": 0.06164144063140017, + "grad_norm": 1.4749041330269321, + "learning_rate": 9.974050091066989e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.9061239957809448, + "num_tokens": 50298262.0, + "step": 617 + }, + { + "epoch": 0.06174134572156451, + "grad_norm": 1.2338288123801198, + "learning_rate": 9.973885212015545e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.9072837829589844, + "num_tokens": 50379774.0, + "step": 618 + }, + { + "epoch": 0.06184125081172886, + "grad_norm": 1.1080155337578388, + "learning_rate": 9.973719812193458e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.9056772887706757, + "num_tokens": 50461282.0, + "step": 619 + }, + { + "epoch": 0.061941155901893204, + "grad_norm": 0.8776821289998435, + "learning_rate": 9.97355389161804e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.9061905741691589, + "num_tokens": 50542819.0, + "step": 620 + }, + { + "epoch": 0.06204106099205754, + "grad_norm": 5.305095462465025, + "learning_rate": 9.973387450306663e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.9077383577823639, + "num_tokens": 50624367.0, + "step": 621 + }, + { + "epoch": 0.06214096608222189, + "grad_norm": 0.6754496087824473, + "learning_rate": 9.973220488276756e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.9059399962425232, + "num_tokens": 50705878.0, + "step": 622 + }, + { + "epoch": 0.06224087117238623, + "grad_norm": 0.7255640920645089, + "learning_rate": 9.973053005545798e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.9058490097522736, + "num_tokens": 50787430.0, + "step": 623 + }, + { + "epoch": 0.06234077626255058, + "grad_norm": 0.7073580442734392, + "learning_rate": 9.972885002131328e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.9006865918636322, + "num_tokens": 50868990.0, + "step": 624 + }, + { + "epoch": 0.06244068135271492, + "grad_norm": 0.7114000251807027, + "learning_rate": 9.97271647805093e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.9038931131362915, + "num_tokens": 50950424.0, + "step": 625 + }, + { + "epoch": 0.06254058644287927, + "grad_norm": 1.1715350391732842, + "learning_rate": 9.972547433322254e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.9067972302436829, + "num_tokens": 51032028.0, + "step": 626 + }, + { + "epoch": 0.0626404915330436, + "grad_norm": 0.778445163983663, + "learning_rate": 9.972377867962998e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.9050440192222595, + "num_tokens": 51113507.0, + "step": 627 + }, + { + "epoch": 0.06274039662320795, + "grad_norm": 0.7901241285522589, + "learning_rate": 9.972207781990912e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.9060649573802948, + "num_tokens": 51195055.0, + "step": 628 + }, + { + "epoch": 0.06284030171337229, + "grad_norm": 0.7310025670866789, + "learning_rate": 9.97203717542381e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.9079866409301758, + "num_tokens": 51276562.0, + "step": 629 + }, + { + "epoch": 0.06294020680353664, + "grad_norm": 0.9153014267067656, + "learning_rate": 9.97186604827955e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.905374139547348, + "num_tokens": 51358088.0, + "step": 630 + }, + { + "epoch": 0.06304011189370098, + "grad_norm": 0.9678847724586181, + "learning_rate": 9.971694400576053e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.9024992287158966, + "num_tokens": 51439579.0, + "step": 631 + }, + { + "epoch": 0.06314001698386533, + "grad_norm": 0.6669782415907224, + "learning_rate": 9.971522232331288e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.9038515388965607, + "num_tokens": 51521137.0, + "step": 632 + }, + { + "epoch": 0.06323992207402968, + "grad_norm": 0.7760827208361497, + "learning_rate": 9.97134954356328e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.9039772152900696, + "num_tokens": 51602644.0, + "step": 633 + }, + { + "epoch": 0.06333982716419402, + "grad_norm": 0.7739504140219801, + "learning_rate": 9.971176334290114e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.9047373235225677, + "num_tokens": 51684127.0, + "step": 634 + }, + { + "epoch": 0.06343973225435837, + "grad_norm": 0.7586808171548658, + "learning_rate": 9.971002604529922e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.9054714739322662, + "num_tokens": 51765650.0, + "step": 635 + }, + { + "epoch": 0.0635396373445227, + "grad_norm": 0.9105244572757327, + "learning_rate": 9.970828354300895e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.908369392156601, + "num_tokens": 51847254.0, + "step": 636 + }, + { + "epoch": 0.06363954243468704, + "grad_norm": 2.06456944514252, + "learning_rate": 9.970653583621275e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.905531257390976, + "num_tokens": 51928809.0, + "step": 637 + }, + { + "epoch": 0.06373944752485139, + "grad_norm": 0.7755133779864166, + "learning_rate": 9.970478292509364e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.9053638577461243, + "num_tokens": 52010365.0, + "step": 638 + }, + { + "epoch": 0.06383935261501573, + "grad_norm": 0.7835926839584673, + "learning_rate": 9.970302480983511e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.9041573703289032, + "num_tokens": 52091865.0, + "step": 639 + }, + { + "epoch": 0.06393925770518008, + "grad_norm": 0.7584567705169869, + "learning_rate": 9.97012614906213e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.9094904661178589, + "num_tokens": 52173377.0, + "step": 640 + }, + { + "epoch": 0.06403916279534443, + "grad_norm": 0.6549390932072843, + "learning_rate": 9.969949296763675e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.9066552817821503, + "num_tokens": 52254881.0, + "step": 641 + }, + { + "epoch": 0.06413906788550877, + "grad_norm": 0.8283008857036878, + "learning_rate": 9.969771924106669e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.9032323360443115, + "num_tokens": 52336346.0, + "step": 642 + }, + { + "epoch": 0.06423897297567312, + "grad_norm": 0.6827399189743257, + "learning_rate": 9.969594031109681e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.90592822432518, + "num_tokens": 52417841.0, + "step": 643 + }, + { + "epoch": 0.06433887806583745, + "grad_norm": 0.9241876998847236, + "learning_rate": 9.969415617791336e-06, + "loss": 0.506, + "mean_token_accuracy": 0.9052382111549377, + "num_tokens": 52499310.0, + "step": 644 + }, + { + "epoch": 0.0644387831560018, + "grad_norm": 0.7221507460502408, + "learning_rate": 9.969236684170314e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.907571017742157, + "num_tokens": 52580835.0, + "step": 645 + }, + { + "epoch": 0.06453868824616614, + "grad_norm": 0.8286861278875907, + "learning_rate": 9.969057230265351e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.9072697162628174, + "num_tokens": 52662323.0, + "step": 646 + }, + { + "epoch": 0.06463859333633049, + "grad_norm": 0.814935280706582, + "learning_rate": 9.968877256095234e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.9050754606723785, + "num_tokens": 52743820.0, + "step": 647 + }, + { + "epoch": 0.06473849842649483, + "grad_norm": 0.6647334867736012, + "learning_rate": 9.968696761678808e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.904872477054596, + "num_tokens": 52825346.0, + "step": 648 + }, + { + "epoch": 0.06483840351665918, + "grad_norm": 0.8134649453475723, + "learning_rate": 9.96851574703497e-06, + "loss": 0.506, + "mean_token_accuracy": 0.9058890342712402, + "num_tokens": 52906859.0, + "step": 649 + }, + { + "epoch": 0.06493830860682352, + "grad_norm": 0.7997956475900642, + "learning_rate": 9.968334212182674e-06, + "loss": 0.507, + "mean_token_accuracy": 0.9048772156238556, + "num_tokens": 52988438.0, + "step": 650 + }, + { + "epoch": 0.06503821369698787, + "grad_norm": 0.6360220353070123, + "learning_rate": 9.968152157140925e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.9067444503307343, + "num_tokens": 53069903.0, + "step": 651 + }, + { + "epoch": 0.0651381187871522, + "grad_norm": 0.6416537532117735, + "learning_rate": 9.967969581928784e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.9068565964698792, + "num_tokens": 53151497.0, + "step": 652 + }, + { + "epoch": 0.06523802387731654, + "grad_norm": 0.6992091807913746, + "learning_rate": 9.967786486565369e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.906117171049118, + "num_tokens": 53232943.0, + "step": 653 + }, + { + "epoch": 0.06533792896748089, + "grad_norm": 0.8753836429958378, + "learning_rate": 9.96760287106985e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.9041494429111481, + "num_tokens": 53314444.0, + "step": 654 + }, + { + "epoch": 0.06543783405764524, + "grad_norm": 0.7413645168616769, + "learning_rate": 9.967418735461449e-06, + "loss": 0.509, + "mean_token_accuracy": 0.9049419462680817, + "num_tokens": 53395902.0, + "step": 655 + }, + { + "epoch": 0.06553773914780958, + "grad_norm": 1.172561899560518, + "learning_rate": 9.967234079759448e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.90558722615242, + "num_tokens": 53477486.0, + "step": 656 + }, + { + "epoch": 0.06563764423797393, + "grad_norm": 0.6419788830352912, + "learning_rate": 9.967048903983178e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.9076560735702515, + "num_tokens": 53559028.0, + "step": 657 + }, + { + "epoch": 0.06573754932813827, + "grad_norm": 0.567537721069472, + "learning_rate": 9.966863208152031e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.9080810546875, + "num_tokens": 53640619.0, + "step": 658 + }, + { + "epoch": 0.06583745441830262, + "grad_norm": 0.8965437996605717, + "learning_rate": 9.966676992285447e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.9051335155963898, + "num_tokens": 53722155.0, + "step": 659 + }, + { + "epoch": 0.06593735950846695, + "grad_norm": 0.7060562008984801, + "learning_rate": 9.966490256402924e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.9053296148777008, + "num_tokens": 53803718.0, + "step": 660 + }, + { + "epoch": 0.0660372645986313, + "grad_norm": 1.7302480616594973, + "learning_rate": 9.966303000524011e-06, + "loss": 0.5168, + "mean_token_accuracy": 0.9026765823364258, + "num_tokens": 53885143.0, + "step": 661 + }, + { + "epoch": 0.06613716968879564, + "grad_norm": 0.8643439448121779, + "learning_rate": 9.966115224668315e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.9090560376644135, + "num_tokens": 53966720.0, + "step": 662 + }, + { + "epoch": 0.06623707477895999, + "grad_norm": 0.7231279570488567, + "learning_rate": 9.965926928855498e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.9064497351646423, + "num_tokens": 54048271.0, + "step": 663 + }, + { + "epoch": 0.06633697986912433, + "grad_norm": 0.7449668038386942, + "learning_rate": 9.965738113105274e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.9079667031764984, + "num_tokens": 54129774.0, + "step": 664 + }, + { + "epoch": 0.06643688495928868, + "grad_norm": 0.7608891589298395, + "learning_rate": 9.965548777437411e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.9029059708118439, + "num_tokens": 54211265.0, + "step": 665 + }, + { + "epoch": 0.06653679004945302, + "grad_norm": 0.8136597503546666, + "learning_rate": 9.965358921871735e-06, + "loss": 0.506, + "mean_token_accuracy": 0.9046337604522705, + "num_tokens": 54292825.0, + "step": 666 + }, + { + "epoch": 0.06663669513961737, + "grad_norm": 0.7945421335751723, + "learning_rate": 9.965168546428122e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.9057070314884186, + "num_tokens": 54374367.0, + "step": 667 + }, + { + "epoch": 0.0667366002297817, + "grad_norm": 0.7100593209442803, + "learning_rate": 9.964977651126504e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.9083203077316284, + "num_tokens": 54455912.0, + "step": 668 + }, + { + "epoch": 0.06683650531994605, + "grad_norm": 0.5776965436585055, + "learning_rate": 9.96478623598687e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.9084008634090424, + "num_tokens": 54537431.0, + "step": 669 + }, + { + "epoch": 0.06693641041011039, + "grad_norm": 0.7916917543770445, + "learning_rate": 9.964594301029258e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.9021255373954773, + "num_tokens": 54618903.0, + "step": 670 + }, + { + "epoch": 0.06703631550027474, + "grad_norm": 0.7764215939460177, + "learning_rate": 9.964401846273769e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.9053577780723572, + "num_tokens": 54700416.0, + "step": 671 + }, + { + "epoch": 0.06713622059043908, + "grad_norm": 1.065419801971068, + "learning_rate": 9.964208871740548e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.90370112657547, + "num_tokens": 54781941.0, + "step": 672 + }, + { + "epoch": 0.06723612568060343, + "grad_norm": 0.8124660260619799, + "learning_rate": 9.964015377449803e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.9038026928901672, + "num_tokens": 54863423.0, + "step": 673 + }, + { + "epoch": 0.06733603077076777, + "grad_norm": 0.7456057473394684, + "learning_rate": 9.963821363421793e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.9076844155788422, + "num_tokens": 54944939.0, + "step": 674 + }, + { + "epoch": 0.06743593586093212, + "grad_norm": 0.8415764132889451, + "learning_rate": 9.963626829676829e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.9051363170146942, + "num_tokens": 55026456.0, + "step": 675 + }, + { + "epoch": 0.06753584095109647, + "grad_norm": 0.7320227337613924, + "learning_rate": 9.963431776235279e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.9063446223735809, + "num_tokens": 55107964.0, + "step": 676 + }, + { + "epoch": 0.0676357460412608, + "grad_norm": 0.6657482815940721, + "learning_rate": 9.963236203117569e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.9048793613910675, + "num_tokens": 55189437.0, + "step": 677 + }, + { + "epoch": 0.06773565113142514, + "grad_norm": 0.654951446420089, + "learning_rate": 9.963040110344173e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.90861976146698, + "num_tokens": 55270996.0, + "step": 678 + }, + { + "epoch": 0.06783555622158949, + "grad_norm": 0.7291010629995776, + "learning_rate": 9.962843497935621e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.9055865406990051, + "num_tokens": 55352476.0, + "step": 679 + }, + { + "epoch": 0.06793546131175383, + "grad_norm": 0.7470391225536006, + "learning_rate": 9.9626463659125e-06, + "loss": 0.5135, + "mean_token_accuracy": 0.9022253751754761, + "num_tokens": 55433904.0, + "step": 680 + }, + { + "epoch": 0.06803536640191818, + "grad_norm": 0.8712073714079572, + "learning_rate": 9.962448714295452e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9052118957042694, + "num_tokens": 55515498.0, + "step": 681 + }, + { + "epoch": 0.06813527149208252, + "grad_norm": 1.044338189560258, + "learning_rate": 9.962250543105167e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.9053382277488708, + "num_tokens": 55597017.0, + "step": 682 + }, + { + "epoch": 0.06823517658224687, + "grad_norm": 0.6975172775885033, + "learning_rate": 9.962051852362396e-06, + "loss": 0.5126, + "mean_token_accuracy": 0.9033494293689728, + "num_tokens": 55678485.0, + "step": 683 + }, + { + "epoch": 0.06833508167241122, + "grad_norm": 1.2744098374634443, + "learning_rate": 9.961852642087943e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.9065682291984558, + "num_tokens": 55760057.0, + "step": 684 + }, + { + "epoch": 0.06843498676257555, + "grad_norm": 0.7718018579114859, + "learning_rate": 9.961652912302664e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.9081624746322632, + "num_tokens": 55841685.0, + "step": 685 + }, + { + "epoch": 0.0685348918527399, + "grad_norm": 0.8777895976579465, + "learning_rate": 9.96145266302747e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.9048883020877838, + "num_tokens": 55923160.0, + "step": 686 + }, + { + "epoch": 0.06863479694290424, + "grad_norm": 0.6214451853827969, + "learning_rate": 9.96125189428333e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.9051629900932312, + "num_tokens": 56004666.0, + "step": 687 + }, + { + "epoch": 0.06873470203306858, + "grad_norm": 0.6456175400622598, + "learning_rate": 9.961050606091263e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.9067731499671936, + "num_tokens": 56086181.0, + "step": 688 + }, + { + "epoch": 0.06883460712323293, + "grad_norm": 0.8649500679216092, + "learning_rate": 9.960848798472344e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.9080584645271301, + "num_tokens": 56167685.0, + "step": 689 + }, + { + "epoch": 0.06893451221339728, + "grad_norm": 0.7315000872164755, + "learning_rate": 9.960646471447703e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.9043247699737549, + "num_tokens": 56249165.0, + "step": 690 + }, + { + "epoch": 0.06903441730356162, + "grad_norm": 0.7566987055261456, + "learning_rate": 9.960443625038525e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.9063539505004883, + "num_tokens": 56330657.0, + "step": 691 + }, + { + "epoch": 0.06913432239372597, + "grad_norm": 0.7752228657457709, + "learning_rate": 9.960240259266046e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.9045520424842834, + "num_tokens": 56412240.0, + "step": 692 + }, + { + "epoch": 0.0692342274838903, + "grad_norm": 0.7642961342394292, + "learning_rate": 9.960036374151557e-06, + "loss": 0.5135, + "mean_token_accuracy": 0.902645468711853, + "num_tokens": 56493658.0, + "step": 693 + }, + { + "epoch": 0.06933413257405464, + "grad_norm": 0.9237353873267342, + "learning_rate": 9.959831969716412e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.9053504168987274, + "num_tokens": 56575189.0, + "step": 694 + }, + { + "epoch": 0.06943403766421899, + "grad_norm": 0.7573335834712678, + "learning_rate": 9.959627045982006e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.9036669731140137, + "num_tokens": 56656680.0, + "step": 695 + }, + { + "epoch": 0.06953394275438333, + "grad_norm": 0.7293799123219997, + "learning_rate": 9.959421602969796e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.9067370593547821, + "num_tokens": 56738263.0, + "step": 696 + }, + { + "epoch": 0.06963384784454768, + "grad_norm": 0.7310594169398332, + "learning_rate": 9.959215640701292e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.9065931737422943, + "num_tokens": 56819625.0, + "step": 697 + }, + { + "epoch": 0.06973375293471203, + "grad_norm": 0.7848338312009396, + "learning_rate": 9.95900915919806e-06, + "loss": 0.508, + "mean_token_accuracy": 0.9053963124752045, + "num_tokens": 56901144.0, + "step": 698 + }, + { + "epoch": 0.06983365802487637, + "grad_norm": 0.6640899834551368, + "learning_rate": 9.958802158481718e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.9036759436130524, + "num_tokens": 56982561.0, + "step": 699 + }, + { + "epoch": 0.06993356311504072, + "grad_norm": 0.7306636808465495, + "learning_rate": 9.95859463857394e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.9037730991840363, + "num_tokens": 57064102.0, + "step": 700 + }, + { + "epoch": 0.07003346820520505, + "grad_norm": 0.6881550819475964, + "learning_rate": 9.95838659949645e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.9043785035610199, + "num_tokens": 57145659.0, + "step": 701 + }, + { + "epoch": 0.0701333732953694, + "grad_norm": 0.7108292533112764, + "learning_rate": 9.958178041271035e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.9076347053050995, + "num_tokens": 57227264.0, + "step": 702 + }, + { + "epoch": 0.07023327838553374, + "grad_norm": 0.6334838913963667, + "learning_rate": 9.957968963919527e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.9060518443584442, + "num_tokens": 57308823.0, + "step": 703 + }, + { + "epoch": 0.07033318347569809, + "grad_norm": 0.700276088433864, + "learning_rate": 9.95775936746382e-06, + "loss": 0.5, + "mean_token_accuracy": 0.9064425528049469, + "num_tokens": 57390392.0, + "step": 704 + }, + { + "epoch": 0.07043308856586243, + "grad_norm": 0.6728623337247593, + "learning_rate": 9.957549251925855e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.9073387086391449, + "num_tokens": 57471877.0, + "step": 705 + }, + { + "epoch": 0.07053299365602678, + "grad_norm": 0.8009164763838911, + "learning_rate": 9.957338617327637e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.9064753353595734, + "num_tokens": 57553512.0, + "step": 706 + }, + { + "epoch": 0.07063289874619112, + "grad_norm": 0.6712710434541014, + "learning_rate": 9.957127463691215e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.9025898277759552, + "num_tokens": 57635011.0, + "step": 707 + }, + { + "epoch": 0.07073280383635547, + "grad_norm": 0.8131762829455881, + "learning_rate": 9.956915791038696e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.9057250320911407, + "num_tokens": 57716481.0, + "step": 708 + }, + { + "epoch": 0.0708327089265198, + "grad_norm": 0.615602210131487, + "learning_rate": 9.956703599392246e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.9026518166065216, + "num_tokens": 57797931.0, + "step": 709 + }, + { + "epoch": 0.07093261401668415, + "grad_norm": 0.6102576237893733, + "learning_rate": 9.95649088877408e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.9035656452178955, + "num_tokens": 57879519.0, + "step": 710 + }, + { + "epoch": 0.07103251910684849, + "grad_norm": 0.6807525488696207, + "learning_rate": 9.95627765920647e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.9030593931674957, + "num_tokens": 57961033.0, + "step": 711 + }, + { + "epoch": 0.07113242419701284, + "grad_norm": 0.6648608524809059, + "learning_rate": 9.956063910711739e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.9097401201725006, + "num_tokens": 58042600.0, + "step": 712 + }, + { + "epoch": 0.07123232928717718, + "grad_norm": 0.7107017945615975, + "learning_rate": 9.955849643312272e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.9077363014221191, + "num_tokens": 58124061.0, + "step": 713 + }, + { + "epoch": 0.07133223437734153, + "grad_norm": 0.7654714562949148, + "learning_rate": 9.955634857030495e-06, + "loss": 0.505, + "mean_token_accuracy": 0.9026031196117401, + "num_tokens": 58205590.0, + "step": 714 + }, + { + "epoch": 0.07143213946750587, + "grad_norm": 0.6268211642145498, + "learning_rate": 9.955419551888903e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.9039095342159271, + "num_tokens": 58287049.0, + "step": 715 + }, + { + "epoch": 0.07153204455767022, + "grad_norm": 0.7652644327433263, + "learning_rate": 9.955203727910037e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.903508722782135, + "num_tokens": 58368547.0, + "step": 716 + }, + { + "epoch": 0.07163194964783456, + "grad_norm": 0.6987291330086564, + "learning_rate": 9.95498738511649e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.9065972864627838, + "num_tokens": 58450148.0, + "step": 717 + }, + { + "epoch": 0.0717318547379989, + "grad_norm": 0.7825064690514055, + "learning_rate": 9.954770523530918e-06, + "loss": 0.505, + "mean_token_accuracy": 0.9027858674526215, + "num_tokens": 58531671.0, + "step": 718 + }, + { + "epoch": 0.07183175982816324, + "grad_norm": 0.6061684960383107, + "learning_rate": 9.954553143176026e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.9037348330020905, + "num_tokens": 58613157.0, + "step": 719 + }, + { + "epoch": 0.07193166491832759, + "grad_norm": 0.752446882773055, + "learning_rate": 9.954335244074575e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.9022804498672485, + "num_tokens": 58694670.0, + "step": 720 + }, + { + "epoch": 0.07203157000849193, + "grad_norm": 0.5729198463721145, + "learning_rate": 9.954116826249373e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.9063713848590851, + "num_tokens": 58776182.0, + "step": 721 + }, + { + "epoch": 0.07213147509865628, + "grad_norm": 0.5997776623718648, + "learning_rate": 9.953897889723296e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9081947505474091, + "num_tokens": 58857815.0, + "step": 722 + }, + { + "epoch": 0.07223138018882062, + "grad_norm": 0.7381802963951241, + "learning_rate": 9.953678434519265e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.9071066975593567, + "num_tokens": 58939355.0, + "step": 723 + }, + { + "epoch": 0.07233128527898497, + "grad_norm": 0.6084349958474987, + "learning_rate": 9.953458460660253e-06, + "loss": 0.506, + "mean_token_accuracy": 0.9052829742431641, + "num_tokens": 59020854.0, + "step": 724 + }, + { + "epoch": 0.07243119036914931, + "grad_norm": 0.7083893548048441, + "learning_rate": 9.953237968169295e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.9045974612236023, + "num_tokens": 59102431.0, + "step": 725 + }, + { + "epoch": 0.07253109545931365, + "grad_norm": 0.7341775633716636, + "learning_rate": 9.953016957069476e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.905889093875885, + "num_tokens": 59183978.0, + "step": 726 + }, + { + "epoch": 0.07263100054947799, + "grad_norm": 0.6695480616122257, + "learning_rate": 9.952795427383938e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.9037035405635834, + "num_tokens": 59265460.0, + "step": 727 + }, + { + "epoch": 0.07273090563964234, + "grad_norm": 0.6863454474010166, + "learning_rate": 9.952573379135872e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.9048437476158142, + "num_tokens": 59346984.0, + "step": 728 + }, + { + "epoch": 0.07283081072980668, + "grad_norm": 0.7659243920554635, + "learning_rate": 9.95235081234853e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.9059776961803436, + "num_tokens": 59428546.0, + "step": 729 + }, + { + "epoch": 0.07293071581997103, + "grad_norm": 0.7443910114190153, + "learning_rate": 9.95212772704521e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.9052598774433136, + "num_tokens": 59510068.0, + "step": 730 + }, + { + "epoch": 0.07303062091013537, + "grad_norm": 0.5282766755622278, + "learning_rate": 9.951904123249277e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.9059212803840637, + "num_tokens": 59591576.0, + "step": 731 + }, + { + "epoch": 0.07313052600029972, + "grad_norm": 0.7646790286229166, + "learning_rate": 9.951680000984136e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.9025040566921234, + "num_tokens": 59673047.0, + "step": 732 + }, + { + "epoch": 0.07323043109046407, + "grad_norm": 0.6687498341331847, + "learning_rate": 9.951455360273255e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.9026210904121399, + "num_tokens": 59754558.0, + "step": 733 + }, + { + "epoch": 0.0733303361806284, + "grad_norm": 0.5858951978614867, + "learning_rate": 9.951230201140155e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.9069741070270538, + "num_tokens": 59836046.0, + "step": 734 + }, + { + "epoch": 0.07343024127079274, + "grad_norm": 0.5404242623470276, + "learning_rate": 9.951004523608408e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.9123719334602356, + "num_tokens": 59917629.0, + "step": 735 + }, + { + "epoch": 0.07353014636095709, + "grad_norm": 0.7131846712879345, + "learning_rate": 9.950778327701643e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.9030588865280151, + "num_tokens": 59999164.0, + "step": 736 + }, + { + "epoch": 0.07363005145112143, + "grad_norm": 0.7730619742323765, + "learning_rate": 9.950551613443546e-06, + "loss": 0.509, + "mean_token_accuracy": 0.9040481746196747, + "num_tokens": 60080615.0, + "step": 737 + }, + { + "epoch": 0.07372995654128578, + "grad_norm": 0.6730890786578108, + "learning_rate": 9.950324380857852e-06, + "loss": 0.5178, + "mean_token_accuracy": 0.9060824513435364, + "num_tokens": 60161995.0, + "step": 738 + }, + { + "epoch": 0.07382986163145012, + "grad_norm": 0.6276112960791543, + "learning_rate": 9.950096629968353e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.9090699255466461, + "num_tokens": 60243548.0, + "step": 739 + }, + { + "epoch": 0.07392976672161447, + "grad_norm": 0.678837395702441, + "learning_rate": 9.949868360798893e-06, + "loss": 0.5082, + "mean_token_accuracy": 0.9064590036869049, + "num_tokens": 60325046.0, + "step": 740 + }, + { + "epoch": 0.07402967181177882, + "grad_norm": 0.9934496163719619, + "learning_rate": 9.949639573373374e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.9053492546081543, + "num_tokens": 60406610.0, + "step": 741 + }, + { + "epoch": 0.07412957690194315, + "grad_norm": 0.6180209283324254, + "learning_rate": 9.94941026771575e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.9040932953357697, + "num_tokens": 60488062.0, + "step": 742 + }, + { + "epoch": 0.0742294819921075, + "grad_norm": 0.5830565309065346, + "learning_rate": 9.949180443850028e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.9071592390537262, + "num_tokens": 60569563.0, + "step": 743 + }, + { + "epoch": 0.07432938708227184, + "grad_norm": 0.6711414070522163, + "learning_rate": 9.948950101800274e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.9069705009460449, + "num_tokens": 60651117.0, + "step": 744 + }, + { + "epoch": 0.07442929217243618, + "grad_norm": 0.6848201980550751, + "learning_rate": 9.948719241590602e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.9053126573562622, + "num_tokens": 60732585.0, + "step": 745 + }, + { + "epoch": 0.07452919726260053, + "grad_norm": 0.5690492224641002, + "learning_rate": 9.948487863245184e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.9049800634384155, + "num_tokens": 60814107.0, + "step": 746 + }, + { + "epoch": 0.07462910235276488, + "grad_norm": 0.6109921266509396, + "learning_rate": 9.948255966788247e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.9083106219768524, + "num_tokens": 60895659.0, + "step": 747 + }, + { + "epoch": 0.07472900744292922, + "grad_norm": 0.6984454015670548, + "learning_rate": 9.948023552244068e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.903488427400589, + "num_tokens": 60977163.0, + "step": 748 + }, + { + "epoch": 0.07482891253309357, + "grad_norm": 0.6604123020647178, + "learning_rate": 9.947790619636984e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.9052490592002869, + "num_tokens": 61058699.0, + "step": 749 + }, + { + "epoch": 0.0749288176232579, + "grad_norm": 0.6337319144175415, + "learning_rate": 9.947557168991383e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.9069618582725525, + "num_tokens": 61140215.0, + "step": 750 + }, + { + "epoch": 0.07502872271342224, + "grad_norm": 0.6412581973172398, + "learning_rate": 9.947323200331705e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.906371146440506, + "num_tokens": 61221789.0, + "step": 751 + }, + { + "epoch": 0.07512862780358659, + "grad_norm": 0.6441057584520795, + "learning_rate": 9.947088713682447e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.9051487445831299, + "num_tokens": 61303318.0, + "step": 752 + }, + { + "epoch": 0.07522853289375094, + "grad_norm": 0.6602138470859599, + "learning_rate": 9.946853709068163e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.905057281255722, + "num_tokens": 61384878.0, + "step": 753 + }, + { + "epoch": 0.07532843798391528, + "grad_norm": 0.7445941700381462, + "learning_rate": 9.946618186513455e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.9066101014614105, + "num_tokens": 61466398.0, + "step": 754 + }, + { + "epoch": 0.07542834307407963, + "grad_norm": 0.6778703315651436, + "learning_rate": 9.946382146042986e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.9076637923717499, + "num_tokens": 61547922.0, + "step": 755 + }, + { + "epoch": 0.07552824816424397, + "grad_norm": 0.9682229931950462, + "learning_rate": 9.946145587681467e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.9067719280719757, + "num_tokens": 61629454.0, + "step": 756 + }, + { + "epoch": 0.07562815325440832, + "grad_norm": 0.8832363745304769, + "learning_rate": 9.945908511453663e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.9028443992137909, + "num_tokens": 61710976.0, + "step": 757 + }, + { + "epoch": 0.07572805834457265, + "grad_norm": 0.6841105350920197, + "learning_rate": 9.945670917384404e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.9041623771190643, + "num_tokens": 61792413.0, + "step": 758 + }, + { + "epoch": 0.075827963434737, + "grad_norm": 0.7252140853466698, + "learning_rate": 9.94543280549856e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.9064247906208038, + "num_tokens": 61873854.0, + "step": 759 + }, + { + "epoch": 0.07592786852490134, + "grad_norm": 0.6546534887727421, + "learning_rate": 9.945194175821063e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.9084631502628326, + "num_tokens": 61955401.0, + "step": 760 + }, + { + "epoch": 0.07602777361506569, + "grad_norm": 0.6968103995505477, + "learning_rate": 9.944955028376899e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.9052109122276306, + "num_tokens": 62036859.0, + "step": 761 + }, + { + "epoch": 0.07612767870523003, + "grad_norm": 0.7581035749446556, + "learning_rate": 9.944715363191105e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.9064094424247742, + "num_tokens": 62118360.0, + "step": 762 + }, + { + "epoch": 0.07622758379539438, + "grad_norm": 0.6358064950926263, + "learning_rate": 9.944475180288777e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.9056049287319183, + "num_tokens": 62199904.0, + "step": 763 + }, + { + "epoch": 0.07632748888555872, + "grad_norm": 0.6206111253396255, + "learning_rate": 9.944234479695058e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.9069787263870239, + "num_tokens": 62281528.0, + "step": 764 + }, + { + "epoch": 0.07642739397572307, + "grad_norm": 0.8279155067104316, + "learning_rate": 9.943993261435155e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.906871110200882, + "num_tokens": 62363040.0, + "step": 765 + }, + { + "epoch": 0.07652729906588741, + "grad_norm": 0.6920072829690552, + "learning_rate": 9.94375152553432e-06, + "loss": 0.501, + "mean_token_accuracy": 0.9070489406585693, + "num_tokens": 62444659.0, + "step": 766 + }, + { + "epoch": 0.07662720415605175, + "grad_norm": 0.7611138050942767, + "learning_rate": 9.943509272017863e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.9053826332092285, + "num_tokens": 62526152.0, + "step": 767 + }, + { + "epoch": 0.07672710924621609, + "grad_norm": 0.691415661788861, + "learning_rate": 9.943266500911152e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.904572069644928, + "num_tokens": 62607588.0, + "step": 768 + }, + { + "epoch": 0.07682701433638044, + "grad_norm": 0.6095228607102859, + "learning_rate": 9.943023212239601e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.9079705774784088, + "num_tokens": 62689074.0, + "step": 769 + }, + { + "epoch": 0.07692691942654478, + "grad_norm": 0.6161335207517609, + "learning_rate": 9.942779406028684e-06, + "loss": 0.5, + "mean_token_accuracy": 0.9059022068977356, + "num_tokens": 62770605.0, + "step": 770 + }, + { + "epoch": 0.07702682451670913, + "grad_norm": 0.598695971512052, + "learning_rate": 9.942535082303927e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.9111657738685608, + "num_tokens": 62852106.0, + "step": 771 + }, + { + "epoch": 0.07712672960687347, + "grad_norm": 0.6747070069686063, + "learning_rate": 9.942290241090916e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.904291957616806, + "num_tokens": 62933542.0, + "step": 772 + }, + { + "epoch": 0.07722663469703782, + "grad_norm": 0.6889731752530108, + "learning_rate": 9.942044882415276e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.9022129476070404, + "num_tokens": 63015032.0, + "step": 773 + }, + { + "epoch": 0.07732653978720216, + "grad_norm": 0.6008792841951772, + "learning_rate": 9.941799006302705e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.9065475761890411, + "num_tokens": 63096621.0, + "step": 774 + }, + { + "epoch": 0.0774264448773665, + "grad_norm": 0.7967947957810457, + "learning_rate": 9.941552612778945e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.9040557444095612, + "num_tokens": 63178216.0, + "step": 775 + }, + { + "epoch": 0.07752634996753084, + "grad_norm": 0.614231686254657, + "learning_rate": 9.941305701869792e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.9103879630565643, + "num_tokens": 63259831.0, + "step": 776 + }, + { + "epoch": 0.07762625505769519, + "grad_norm": 0.8277724470318694, + "learning_rate": 9.941058273601097e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.9070296883583069, + "num_tokens": 63341400.0, + "step": 777 + }, + { + "epoch": 0.07772616014785953, + "grad_norm": 0.7293801535497128, + "learning_rate": 9.940810327998768e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.9040962159633636, + "num_tokens": 63422912.0, + "step": 778 + }, + { + "epoch": 0.07782606523802388, + "grad_norm": 0.8165844604489336, + "learning_rate": 9.940561865088763e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.9033924043178558, + "num_tokens": 63504370.0, + "step": 779 + }, + { + "epoch": 0.07792597032818822, + "grad_norm": 0.7686856386217769, + "learning_rate": 9.940312884897099e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.9046637713909149, + "num_tokens": 63585828.0, + "step": 780 + }, + { + "epoch": 0.07802587541835257, + "grad_norm": 0.6021370511768317, + "learning_rate": 9.940063387449843e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.9063200056552887, + "num_tokens": 63667358.0, + "step": 781 + }, + { + "epoch": 0.07812578050851691, + "grad_norm": 0.6157709492534535, + "learning_rate": 9.939813372773117e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.905162513256073, + "num_tokens": 63748762.0, + "step": 782 + }, + { + "epoch": 0.07822568559868125, + "grad_norm": 0.9959632290075329, + "learning_rate": 9.9395628408931e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.9103983342647552, + "num_tokens": 63830432.0, + "step": 783 + }, + { + "epoch": 0.07832559068884559, + "grad_norm": 0.675000994446126, + "learning_rate": 9.93931179183602e-06, + "loss": 0.496, + "mean_token_accuracy": 0.9050751328468323, + "num_tokens": 63912017.0, + "step": 784 + }, + { + "epoch": 0.07842549577900994, + "grad_norm": 0.6925442221512581, + "learning_rate": 9.939060225628162e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.9040015339851379, + "num_tokens": 63993532.0, + "step": 785 + }, + { + "epoch": 0.07852540086917428, + "grad_norm": 0.716880815246361, + "learning_rate": 9.938808142295871e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.9073455631732941, + "num_tokens": 64074996.0, + "step": 786 + }, + { + "epoch": 0.07862530595933863, + "grad_norm": 0.9790622616142596, + "learning_rate": 9.938555541865533e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.9057769775390625, + "num_tokens": 64156547.0, + "step": 787 + }, + { + "epoch": 0.07872521104950297, + "grad_norm": 0.8058947963388166, + "learning_rate": 9.9383024243636e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.9034056961536407, + "num_tokens": 64238101.0, + "step": 788 + }, + { + "epoch": 0.07882511613966732, + "grad_norm": 0.6443566667078391, + "learning_rate": 9.938048789816573e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.9062735736370087, + "num_tokens": 64319722.0, + "step": 789 + }, + { + "epoch": 0.07892502122983167, + "grad_norm": 0.682913302213481, + "learning_rate": 9.937794638251003e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.9072281122207642, + "num_tokens": 64401230.0, + "step": 790 + }, + { + "epoch": 0.079024926319996, + "grad_norm": 0.6872093107360472, + "learning_rate": 9.937539969693509e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.9038302302360535, + "num_tokens": 64482756.0, + "step": 791 + }, + { + "epoch": 0.07912483141016034, + "grad_norm": 2.3126574744616426, + "learning_rate": 9.937284784170746e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.9041697084903717, + "num_tokens": 64564256.0, + "step": 792 + }, + { + "epoch": 0.07922473650032469, + "grad_norm": 0.880505223262832, + "learning_rate": 9.937029081709439e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.9054367542266846, + "num_tokens": 64645732.0, + "step": 793 + }, + { + "epoch": 0.07932464159048903, + "grad_norm": 0.5379747998426105, + "learning_rate": 9.936772862336357e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.9105437695980072, + "num_tokens": 64727269.0, + "step": 794 + }, + { + "epoch": 0.07942454668065338, + "grad_norm": 0.6492650164453119, + "learning_rate": 9.936516126078326e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.9030349850654602, + "num_tokens": 64808790.0, + "step": 795 + }, + { + "epoch": 0.07952445177081773, + "grad_norm": 0.6567486358101589, + "learning_rate": 9.936258872962229e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.9058230221271515, + "num_tokens": 64890307.0, + "step": 796 + }, + { + "epoch": 0.07962435686098207, + "grad_norm": 0.8212215687915212, + "learning_rate": 9.936001103014996e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.9043882787227631, + "num_tokens": 64971751.0, + "step": 797 + }, + { + "epoch": 0.07972426195114642, + "grad_norm": 0.6356325983453912, + "learning_rate": 9.935742816263622e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.9048158526420593, + "num_tokens": 65053187.0, + "step": 798 + }, + { + "epoch": 0.07982416704131075, + "grad_norm": 0.644276330999676, + "learning_rate": 9.935484012735147e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.9073496758937836, + "num_tokens": 65134746.0, + "step": 799 + }, + { + "epoch": 0.0799240721314751, + "grad_norm": 0.641414773504825, + "learning_rate": 9.935224692456665e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.9054401218891144, + "num_tokens": 65216235.0, + "step": 800 + }, + { + "epoch": 0.08002397722163944, + "grad_norm": 0.8308029077887407, + "learning_rate": 9.934964855455332e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.9055140018463135, + "num_tokens": 65297741.0, + "step": 801 + }, + { + "epoch": 0.08012388231180378, + "grad_norm": 0.6822128901668721, + "learning_rate": 9.93470450175835e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.9043290019035339, + "num_tokens": 65379220.0, + "step": 802 + }, + { + "epoch": 0.08022378740196813, + "grad_norm": 0.54086679131861, + "learning_rate": 9.934443631392979e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.9085542857646942, + "num_tokens": 65460783.0, + "step": 803 + }, + { + "epoch": 0.08032369249213248, + "grad_norm": 0.6670356457585321, + "learning_rate": 9.934182244386532e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.9047558903694153, + "num_tokens": 65542246.0, + "step": 804 + }, + { + "epoch": 0.08042359758229682, + "grad_norm": 0.7119918035032863, + "learning_rate": 9.933920340766379e-06, + "loss": 0.503, + "mean_token_accuracy": 0.9037385582923889, + "num_tokens": 65623773.0, + "step": 805 + }, + { + "epoch": 0.08052350267246117, + "grad_norm": 0.63069977905073, + "learning_rate": 9.933657920559939e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9101425409317017, + "num_tokens": 65705279.0, + "step": 806 + }, + { + "epoch": 0.08062340776262551, + "grad_norm": 0.7466060248052243, + "learning_rate": 9.933394983794688e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.9009341895580292, + "num_tokens": 65786750.0, + "step": 807 + }, + { + "epoch": 0.08072331285278984, + "grad_norm": 0.7096563977239936, + "learning_rate": 9.933131530498157e-06, + "loss": 0.501, + "mean_token_accuracy": 0.9066100120544434, + "num_tokens": 65868399.0, + "step": 808 + }, + { + "epoch": 0.08082321794295419, + "grad_norm": 0.7082982626128608, + "learning_rate": 9.93286756069793e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.9061686098575592, + "num_tokens": 65949906.0, + "step": 809 + }, + { + "epoch": 0.08092312303311854, + "grad_norm": 0.5400746653940849, + "learning_rate": 9.93260307442164e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.9068763256072998, + "num_tokens": 66031404.0, + "step": 810 + }, + { + "epoch": 0.08102302812328288, + "grad_norm": 0.7034478941732654, + "learning_rate": 9.932338071696986e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.9041900634765625, + "num_tokens": 66112916.0, + "step": 811 + }, + { + "epoch": 0.08112293321344723, + "grad_norm": 0.8411416285619316, + "learning_rate": 9.93207255255171e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.9048469662666321, + "num_tokens": 66194492.0, + "step": 812 + }, + { + "epoch": 0.08122283830361157, + "grad_norm": 0.731943694091716, + "learning_rate": 9.931806517013612e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.9045557677745819, + "num_tokens": 66276032.0, + "step": 813 + }, + { + "epoch": 0.08132274339377592, + "grad_norm": 0.5748341417485371, + "learning_rate": 9.931539965110548e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.9051277935504913, + "num_tokens": 66357566.0, + "step": 814 + }, + { + "epoch": 0.08142264848394026, + "grad_norm": 0.5452265593851023, + "learning_rate": 9.931272896870427e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.9074427485466003, + "num_tokens": 66439068.0, + "step": 815 + }, + { + "epoch": 0.0815225535741046, + "grad_norm": 0.7554233859722423, + "learning_rate": 9.931005312321208e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.9058498740196228, + "num_tokens": 66520556.0, + "step": 816 + }, + { + "epoch": 0.08162245866426894, + "grad_norm": 0.7406653260319461, + "learning_rate": 9.930737211490909e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.9032904207706451, + "num_tokens": 66602065.0, + "step": 817 + }, + { + "epoch": 0.08172236375443329, + "grad_norm": 0.636793280515133, + "learning_rate": 9.9304685944076e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.911294549703598, + "num_tokens": 66683637.0, + "step": 818 + }, + { + "epoch": 0.08182226884459763, + "grad_norm": 0.9088159370196666, + "learning_rate": 9.930199461099406e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.9046751260757446, + "num_tokens": 66765144.0, + "step": 819 + }, + { + "epoch": 0.08192217393476198, + "grad_norm": 0.5681269425109077, + "learning_rate": 9.929929811594507e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.9074211418628693, + "num_tokens": 66846637.0, + "step": 820 + }, + { + "epoch": 0.08202207902492632, + "grad_norm": 0.7806591583767248, + "learning_rate": 9.929659645921132e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.9069965481758118, + "num_tokens": 66928157.0, + "step": 821 + }, + { + "epoch": 0.08212198411509067, + "grad_norm": 0.8325806666478643, + "learning_rate": 9.929388964107572e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.9049156010150909, + "num_tokens": 67009685.0, + "step": 822 + }, + { + "epoch": 0.08222188920525501, + "grad_norm": 1.427351163381059, + "learning_rate": 9.929117766182164e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.9040774405002594, + "num_tokens": 67091157.0, + "step": 823 + }, + { + "epoch": 0.08232179429541935, + "grad_norm": 0.7331589726735706, + "learning_rate": 9.928846052173302e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.9059133529663086, + "num_tokens": 67172653.0, + "step": 824 + }, + { + "epoch": 0.08242169938558369, + "grad_norm": 0.6835111028407242, + "learning_rate": 9.92857382210944e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.9036811292171478, + "num_tokens": 67254135.0, + "step": 825 + }, + { + "epoch": 0.08252160447574804, + "grad_norm": 0.6477735857332294, + "learning_rate": 9.928301076019076e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.9053612053394318, + "num_tokens": 67335583.0, + "step": 826 + }, + { + "epoch": 0.08262150956591238, + "grad_norm": 0.6163026977105106, + "learning_rate": 9.928027813930769e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.9052813649177551, + "num_tokens": 67417151.0, + "step": 827 + }, + { + "epoch": 0.08272141465607673, + "grad_norm": 0.6431597319187369, + "learning_rate": 9.927754035873127e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.9082411229610443, + "num_tokens": 67498748.0, + "step": 828 + }, + { + "epoch": 0.08282131974624107, + "grad_norm": 0.9739814643314164, + "learning_rate": 9.927479741874819e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.9061024785041809, + "num_tokens": 67580224.0, + "step": 829 + }, + { + "epoch": 0.08292122483640542, + "grad_norm": 0.6605583825329912, + "learning_rate": 9.927204931964561e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.9064055979251862, + "num_tokens": 67661781.0, + "step": 830 + }, + { + "epoch": 0.08302112992656976, + "grad_norm": 0.8017530701802583, + "learning_rate": 9.926929606171127e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.903700441122055, + "num_tokens": 67743283.0, + "step": 831 + }, + { + "epoch": 0.0831210350167341, + "grad_norm": 0.7117308283214521, + "learning_rate": 9.926653764523343e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.9043205976486206, + "num_tokens": 67824772.0, + "step": 832 + }, + { + "epoch": 0.08322094010689844, + "grad_norm": 0.7586441926615849, + "learning_rate": 9.92637740705009e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.9039982259273529, + "num_tokens": 67906225.0, + "step": 833 + }, + { + "epoch": 0.08332084519706279, + "grad_norm": 0.665090098080741, + "learning_rate": 9.926100533780304e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.9041878879070282, + "num_tokens": 67987645.0, + "step": 834 + }, + { + "epoch": 0.08342075028722713, + "grad_norm": 0.6051989557700704, + "learning_rate": 9.925823144742972e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.9060593247413635, + "num_tokens": 68069129.0, + "step": 835 + }, + { + "epoch": 0.08352065537739148, + "grad_norm": 0.7176105748340248, + "learning_rate": 9.925545239967141e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.902629554271698, + "num_tokens": 68150670.0, + "step": 836 + }, + { + "epoch": 0.08362056046755582, + "grad_norm": 0.606660804578409, + "learning_rate": 9.925266819481903e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.9083800613880157, + "num_tokens": 68232235.0, + "step": 837 + }, + { + "epoch": 0.08372046555772017, + "grad_norm": 0.6114678347138667, + "learning_rate": 9.92498788331641e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.9063471257686615, + "num_tokens": 68313732.0, + "step": 838 + }, + { + "epoch": 0.08382037064788452, + "grad_norm": 0.7039143224565465, + "learning_rate": 9.924708431499868e-06, + "loss": 0.501, + "mean_token_accuracy": 0.9062743484973907, + "num_tokens": 68395290.0, + "step": 839 + }, + { + "epoch": 0.08392027573804885, + "grad_norm": 0.605363162802858, + "learning_rate": 9.924428464061536e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.9063765108585358, + "num_tokens": 68476811.0, + "step": 840 + }, + { + "epoch": 0.08402018082821319, + "grad_norm": 0.5798430755863316, + "learning_rate": 9.924147981030728e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.9046160280704498, + "num_tokens": 68558253.0, + "step": 841 + }, + { + "epoch": 0.08412008591837754, + "grad_norm": 0.8851782324774149, + "learning_rate": 9.923866982436807e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.9070667028427124, + "num_tokens": 68639767.0, + "step": 842 + }, + { + "epoch": 0.08421999100854188, + "grad_norm": 1.0055150436224778, + "learning_rate": 9.923585468309197e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.9057371020317078, + "num_tokens": 68721351.0, + "step": 843 + }, + { + "epoch": 0.08431989609870623, + "grad_norm": 0.6153499916727992, + "learning_rate": 9.923303438677373e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.9010919332504272, + "num_tokens": 68802790.0, + "step": 844 + }, + { + "epoch": 0.08441980118887057, + "grad_norm": 0.6022507584970759, + "learning_rate": 9.923020893570861e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.9046611785888672, + "num_tokens": 68884297.0, + "step": 845 + }, + { + "epoch": 0.08451970627903492, + "grad_norm": 0.6955711374305276, + "learning_rate": 9.922737833019247e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.9058013558387756, + "num_tokens": 68965820.0, + "step": 846 + }, + { + "epoch": 0.08461961136919927, + "grad_norm": 0.6277004187784428, + "learning_rate": 9.922454257052166e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.9063794314861298, + "num_tokens": 69047320.0, + "step": 847 + }, + { + "epoch": 0.08471951645936361, + "grad_norm": 0.8111969151806421, + "learning_rate": 9.922170165699307e-06, + "loss": 0.506, + "mean_token_accuracy": 0.9068858623504639, + "num_tokens": 69128798.0, + "step": 848 + }, + { + "epoch": 0.08481942154952794, + "grad_norm": 0.5824904761598848, + "learning_rate": 9.921885558990418e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.9063677489757538, + "num_tokens": 69210264.0, + "step": 849 + }, + { + "epoch": 0.08491932663969229, + "grad_norm": 0.6141344275495983, + "learning_rate": 9.921600436955297e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.9054142832756042, + "num_tokens": 69291705.0, + "step": 850 + }, + { + "epoch": 0.08501923172985663, + "grad_norm": 0.580489084884524, + "learning_rate": 9.921314799623796e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.9058152437210083, + "num_tokens": 69373323.0, + "step": 851 + }, + { + "epoch": 0.08511913682002098, + "grad_norm": 0.6712010030472265, + "learning_rate": 9.921028647025819e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.9058849513530731, + "num_tokens": 69454853.0, + "step": 852 + }, + { + "epoch": 0.08521904191018533, + "grad_norm": 0.7123258105948694, + "learning_rate": 9.92074197919133e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.9047631621360779, + "num_tokens": 69536339.0, + "step": 853 + }, + { + "epoch": 0.08531894700034967, + "grad_norm": 0.9850728203739333, + "learning_rate": 9.920454796150342e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.907745361328125, + "num_tokens": 69617880.0, + "step": 854 + }, + { + "epoch": 0.08541885209051402, + "grad_norm": 0.6366326891050326, + "learning_rate": 9.920167097932923e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.9074483811855316, + "num_tokens": 69699446.0, + "step": 855 + }, + { + "epoch": 0.08551875718067836, + "grad_norm": 0.5725816246844948, + "learning_rate": 9.919878884569197e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.9056214392185211, + "num_tokens": 69780919.0, + "step": 856 + }, + { + "epoch": 0.0856186622708427, + "grad_norm": 0.578176043360592, + "learning_rate": 9.919590156089338e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.9060282409191132, + "num_tokens": 69862413.0, + "step": 857 + }, + { + "epoch": 0.08571856736100704, + "grad_norm": 0.6959773212315223, + "learning_rate": 9.919300912523576e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.9046455025672913, + "num_tokens": 69943868.0, + "step": 858 + }, + { + "epoch": 0.08581847245117138, + "grad_norm": 0.5801701722394603, + "learning_rate": 9.919011153902196e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.9042951762676239, + "num_tokens": 70025370.0, + "step": 859 + }, + { + "epoch": 0.08591837754133573, + "grad_norm": 0.6731790495634628, + "learning_rate": 9.91872088025554e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.9045723080635071, + "num_tokens": 70106776.0, + "step": 860 + }, + { + "epoch": 0.08601828263150008, + "grad_norm": 0.6265186232653207, + "learning_rate": 9.918430091613993e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.9057246148586273, + "num_tokens": 70188276.0, + "step": 861 + }, + { + "epoch": 0.08611818772166442, + "grad_norm": 0.6546172177738853, + "learning_rate": 9.918138788008003e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.9074796438217163, + "num_tokens": 70269802.0, + "step": 862 + }, + { + "epoch": 0.08621809281182877, + "grad_norm": 0.6476317056354126, + "learning_rate": 9.917846969468073e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.9047059118747711, + "num_tokens": 70351324.0, + "step": 863 + }, + { + "epoch": 0.08631799790199311, + "grad_norm": 0.7089770983793926, + "learning_rate": 9.917554636024754e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.9031659364700317, + "num_tokens": 70432777.0, + "step": 864 + }, + { + "epoch": 0.08641790299215744, + "grad_norm": 0.7234565638608202, + "learning_rate": 9.917261787708653e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.9062176942825317, + "num_tokens": 70514273.0, + "step": 865 + }, + { + "epoch": 0.08651780808232179, + "grad_norm": 0.6991950381140566, + "learning_rate": 9.916968424550432e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.9058383703231812, + "num_tokens": 70595880.0, + "step": 866 + }, + { + "epoch": 0.08661771317248614, + "grad_norm": 0.71568041699492, + "learning_rate": 9.91667454658081e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.9063928127288818, + "num_tokens": 70677445.0, + "step": 867 + }, + { + "epoch": 0.08671761826265048, + "grad_norm": 0.7134073402419407, + "learning_rate": 9.916380153830549e-06, + "loss": 0.508, + "mean_token_accuracy": 0.9058991074562073, + "num_tokens": 70758939.0, + "step": 868 + }, + { + "epoch": 0.08681752335281483, + "grad_norm": 0.7560671850370076, + "learning_rate": 9.91608524633048e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.9045644402503967, + "num_tokens": 70840525.0, + "step": 869 + }, + { + "epoch": 0.08691742844297917, + "grad_norm": 0.6311942240948966, + "learning_rate": 9.915789824111474e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.9034683406352997, + "num_tokens": 70921947.0, + "step": 870 + }, + { + "epoch": 0.08701733353314352, + "grad_norm": 0.6694386362366539, + "learning_rate": 9.915493887204467e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.9070733487606049, + "num_tokens": 71003463.0, + "step": 871 + }, + { + "epoch": 0.08711723862330786, + "grad_norm": 0.604351472796462, + "learning_rate": 9.91519743564044e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.9090879559516907, + "num_tokens": 71085086.0, + "step": 872 + }, + { + "epoch": 0.0872171437134722, + "grad_norm": 0.6147162612891428, + "learning_rate": 9.914900469450434e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.9049391150474548, + "num_tokens": 71166669.0, + "step": 873 + }, + { + "epoch": 0.08731704880363654, + "grad_norm": 0.6325038728187306, + "learning_rate": 9.91460298866554e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.906445175409317, + "num_tokens": 71248187.0, + "step": 874 + }, + { + "epoch": 0.08741695389380089, + "grad_norm": 0.5945358634014883, + "learning_rate": 9.914304993316906e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.9062185883522034, + "num_tokens": 71329758.0, + "step": 875 + }, + { + "epoch": 0.08751685898396523, + "grad_norm": 0.649043245721002, + "learning_rate": 9.914006483435732e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.9049612879753113, + "num_tokens": 71411210.0, + "step": 876 + }, + { + "epoch": 0.08761676407412958, + "grad_norm": 0.6091935419477329, + "learning_rate": 9.91370745905327e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.9076765477657318, + "num_tokens": 71492712.0, + "step": 877 + }, + { + "epoch": 0.08771666916429392, + "grad_norm": 0.831135670868985, + "learning_rate": 9.913407920200832e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.9042916297912598, + "num_tokens": 71574242.0, + "step": 878 + }, + { + "epoch": 0.08781657425445827, + "grad_norm": 0.6878000511949489, + "learning_rate": 9.913107866909779e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.9074662029743195, + "num_tokens": 71655810.0, + "step": 879 + }, + { + "epoch": 0.08791647934462261, + "grad_norm": 0.8157409868901474, + "learning_rate": 9.912807299211524e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.9036703705787659, + "num_tokens": 71737281.0, + "step": 880 + }, + { + "epoch": 0.08801638443478695, + "grad_norm": 0.7176106132659108, + "learning_rate": 9.912506217137542e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.9070846140384674, + "num_tokens": 71818819.0, + "step": 881 + }, + { + "epoch": 0.08811628952495129, + "grad_norm": 0.6310275037299464, + "learning_rate": 9.91220462071935e-06, + "loss": 0.494, + "mean_token_accuracy": 0.9073473811149597, + "num_tokens": 71900392.0, + "step": 882 + }, + { + "epoch": 0.08821619461511564, + "grad_norm": 0.591035951055039, + "learning_rate": 9.91190250998853e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.9046774804592133, + "num_tokens": 71981977.0, + "step": 883 + }, + { + "epoch": 0.08831609970527998, + "grad_norm": 0.6081161841524366, + "learning_rate": 9.911599884976712e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.9070749282836914, + "num_tokens": 72063534.0, + "step": 884 + }, + { + "epoch": 0.08841600479544433, + "grad_norm": 0.734458295677911, + "learning_rate": 9.911296745715583e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.9083825945854187, + "num_tokens": 72145141.0, + "step": 885 + }, + { + "epoch": 0.08851590988560867, + "grad_norm": 1.063711336505933, + "learning_rate": 9.910993092236878e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.90724116563797, + "num_tokens": 72226618.0, + "step": 886 + }, + { + "epoch": 0.08861581497577302, + "grad_norm": 0.6163082660902157, + "learning_rate": 9.910688924572392e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.9049643278121948, + "num_tokens": 72308139.0, + "step": 887 + }, + { + "epoch": 0.08871572006593736, + "grad_norm": 0.6495256136782027, + "learning_rate": 9.910384242753973e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.9051685035228729, + "num_tokens": 72389665.0, + "step": 888 + }, + { + "epoch": 0.08881562515610171, + "grad_norm": 0.7724204605845219, + "learning_rate": 9.910079046813522e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.9059417247772217, + "num_tokens": 72471065.0, + "step": 889 + }, + { + "epoch": 0.08891553024626604, + "grad_norm": 0.7151453448248323, + "learning_rate": 9.909773336782987e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.9044484198093414, + "num_tokens": 72552473.0, + "step": 890 + }, + { + "epoch": 0.08901543533643039, + "grad_norm": 0.7097604124806123, + "learning_rate": 9.909467112694385e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.9039668142795563, + "num_tokens": 72633947.0, + "step": 891 + }, + { + "epoch": 0.08911534042659473, + "grad_norm": 0.9651171921766214, + "learning_rate": 9.90916037457977e-06, + "loss": 0.513, + "mean_token_accuracy": 0.903611570596695, + "num_tokens": 72715327.0, + "step": 892 + }, + { + "epoch": 0.08921524551675908, + "grad_norm": 0.6935233089131888, + "learning_rate": 9.908853122471263e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.9045716226100922, + "num_tokens": 72796764.0, + "step": 893 + }, + { + "epoch": 0.08931515060692342, + "grad_norm": 0.6242336395096585, + "learning_rate": 9.908545356401032e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.90720334649086, + "num_tokens": 72878240.0, + "step": 894 + }, + { + "epoch": 0.08941505569708777, + "grad_norm": 0.7003488114258999, + "learning_rate": 9.908237076401302e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.9037571549415588, + "num_tokens": 72959696.0, + "step": 895 + }, + { + "epoch": 0.08951496078725212, + "grad_norm": 0.7281806062494915, + "learning_rate": 9.907928282504347e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.9070348143577576, + "num_tokens": 73041231.0, + "step": 896 + }, + { + "epoch": 0.08961486587741646, + "grad_norm": 0.638928481197161, + "learning_rate": 9.907618974742499e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.9064671099185944, + "num_tokens": 73122739.0, + "step": 897 + }, + { + "epoch": 0.08971477096758079, + "grad_norm": 0.6236285598134305, + "learning_rate": 9.907309153148143e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.9059054851531982, + "num_tokens": 73204244.0, + "step": 898 + }, + { + "epoch": 0.08981467605774514, + "grad_norm": 0.6832645470476156, + "learning_rate": 9.90699881775372e-06, + "loss": 0.503, + "mean_token_accuracy": 0.9053355157375336, + "num_tokens": 73285724.0, + "step": 899 + }, + { + "epoch": 0.08991458114790948, + "grad_norm": 0.7246916744174311, + "learning_rate": 9.90668796859172e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.9082057476043701, + "num_tokens": 73367203.0, + "step": 900 + }, + { + "epoch": 0.09001448623807383, + "grad_norm": 0.957815950211231, + "learning_rate": 9.90637660569469e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.9034390449523926, + "num_tokens": 73448729.0, + "step": 901 + }, + { + "epoch": 0.09011439132823817, + "grad_norm": 0.7850500127582889, + "learning_rate": 9.906064729095229e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.9077897071838379, + "num_tokens": 73530239.0, + "step": 902 + }, + { + "epoch": 0.09021429641840252, + "grad_norm": 0.7170481771810725, + "learning_rate": 9.90575233882599e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.9051633775234222, + "num_tokens": 73611820.0, + "step": 903 + }, + { + "epoch": 0.09031420150856687, + "grad_norm": 0.762346464666874, + "learning_rate": 9.905439434919685e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.904507964849472, + "num_tokens": 73693307.0, + "step": 904 + }, + { + "epoch": 0.09041410659873121, + "grad_norm": 0.6008795353266719, + "learning_rate": 9.905126017409072e-06, + "loss": 0.499, + "mean_token_accuracy": 0.9063602983951569, + "num_tokens": 73774912.0, + "step": 905 + }, + { + "epoch": 0.09051401168889554, + "grad_norm": 0.8850249378299285, + "learning_rate": 9.904812086326965e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.907290905714035, + "num_tokens": 73856414.0, + "step": 906 + }, + { + "epoch": 0.09061391677905989, + "grad_norm": 0.6935644616330894, + "learning_rate": 9.904497641706237e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.9051337242126465, + "num_tokens": 73937950.0, + "step": 907 + }, + { + "epoch": 0.09071382186922423, + "grad_norm": 0.6299477835001188, + "learning_rate": 9.904182683579807e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.9082408249378204, + "num_tokens": 74019446.0, + "step": 908 + }, + { + "epoch": 0.09081372695938858, + "grad_norm": 0.7815186956259104, + "learning_rate": 9.90386721198065e-06, + "loss": 0.492, + "mean_token_accuracy": 0.9072331190109253, + "num_tokens": 74101114.0, + "step": 909 + }, + { + "epoch": 0.09091363204955293, + "grad_norm": 0.5837773633365532, + "learning_rate": 9.903551226941801e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.9056320190429688, + "num_tokens": 74182650.0, + "step": 910 + }, + { + "epoch": 0.09101353713971727, + "grad_norm": 0.6587565408079529, + "learning_rate": 9.903234728496341e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.9074794948101044, + "num_tokens": 74264133.0, + "step": 911 + }, + { + "epoch": 0.09111344222988162, + "grad_norm": 0.7289393054977376, + "learning_rate": 9.902917716677409e-06, + "loss": 0.501, + "mean_token_accuracy": 0.9075478613376617, + "num_tokens": 74345660.0, + "step": 912 + }, + { + "epoch": 0.09121334732004596, + "grad_norm": 0.7196742551597635, + "learning_rate": 9.902600191518196e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.9052766263484955, + "num_tokens": 74427080.0, + "step": 913 + }, + { + "epoch": 0.0913132524102103, + "grad_norm": 0.8871289315919542, + "learning_rate": 9.902282153051946e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.9056299030780792, + "num_tokens": 74508594.0, + "step": 914 + }, + { + "epoch": 0.09141315750037464, + "grad_norm": 0.6340570219404514, + "learning_rate": 9.901963601311959e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.9059731066226959, + "num_tokens": 74590007.0, + "step": 915 + }, + { + "epoch": 0.09151306259053898, + "grad_norm": 0.7357827880375015, + "learning_rate": 9.901644536331588e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.9054728746414185, + "num_tokens": 74671498.0, + "step": 916 + }, + { + "epoch": 0.09161296768070333, + "grad_norm": 0.7880011006411674, + "learning_rate": 9.90132495814424e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.9061497449874878, + "num_tokens": 74753065.0, + "step": 917 + }, + { + "epoch": 0.09171287277086768, + "grad_norm": 0.5425432154348766, + "learning_rate": 9.901004866783372e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.906493216753006, + "num_tokens": 74834596.0, + "step": 918 + }, + { + "epoch": 0.09181277786103202, + "grad_norm": 1.000385336769969, + "learning_rate": 9.900684262282501e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.9038678109645844, + "num_tokens": 74916034.0, + "step": 919 + }, + { + "epoch": 0.09191268295119637, + "grad_norm": 0.8433514927542649, + "learning_rate": 9.900363144675194e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.9071792960166931, + "num_tokens": 74997548.0, + "step": 920 + }, + { + "epoch": 0.09201258804136071, + "grad_norm": 0.5421785155196499, + "learning_rate": 9.900041513995072e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.904581606388092, + "num_tokens": 75078985.0, + "step": 921 + }, + { + "epoch": 0.09211249313152504, + "grad_norm": 0.6679995596603433, + "learning_rate": 9.89971937027581e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.9057612419128418, + "num_tokens": 75160478.0, + "step": 922 + }, + { + "epoch": 0.09221239822168939, + "grad_norm": 0.5247397588324163, + "learning_rate": 9.899396713551137e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.9089457094669342, + "num_tokens": 75242010.0, + "step": 923 + }, + { + "epoch": 0.09231230331185374, + "grad_norm": 0.5416690433248982, + "learning_rate": 9.899073543854833e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.9082903265953064, + "num_tokens": 75323542.0, + "step": 924 + }, + { + "epoch": 0.09241220840201808, + "grad_norm": 0.9160869785082052, + "learning_rate": 9.89874986122074e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.907716304063797, + "num_tokens": 75405133.0, + "step": 925 + }, + { + "epoch": 0.09251211349218243, + "grad_norm": 0.6576461500236815, + "learning_rate": 9.898425665682743e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.9064836800098419, + "num_tokens": 75486707.0, + "step": 926 + }, + { + "epoch": 0.09261201858234677, + "grad_norm": 0.6851842560095099, + "learning_rate": 9.898100957274786e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.9063084125518799, + "num_tokens": 75568232.0, + "step": 927 + }, + { + "epoch": 0.09271192367251112, + "grad_norm": 0.6698029444335718, + "learning_rate": 9.897775736030867e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.9056845903396606, + "num_tokens": 75649716.0, + "step": 928 + }, + { + "epoch": 0.09281182876267546, + "grad_norm": 0.6613868491170545, + "learning_rate": 9.897450001985038e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.9084149599075317, + "num_tokens": 75731204.0, + "step": 929 + }, + { + "epoch": 0.09291173385283981, + "grad_norm": 0.6055830047387738, + "learning_rate": 9.897123755171403e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.9074743688106537, + "num_tokens": 75812779.0, + "step": 930 + }, + { + "epoch": 0.09301163894300414, + "grad_norm": 0.9356051270401554, + "learning_rate": 9.896796995624121e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.9062438905239105, + "num_tokens": 75894420.0, + "step": 931 + }, + { + "epoch": 0.09311154403316849, + "grad_norm": 0.8212135679895612, + "learning_rate": 9.896469723377402e-06, + "loss": 0.512, + "mean_token_accuracy": 0.9010060429573059, + "num_tokens": 75975841.0, + "step": 932 + }, + { + "epoch": 0.09321144912333283, + "grad_norm": 0.6848469247837468, + "learning_rate": 9.896141938465513e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.9047156572341919, + "num_tokens": 76057374.0, + "step": 933 + }, + { + "epoch": 0.09331135421349718, + "grad_norm": 0.7701354357864915, + "learning_rate": 9.895813640922773e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.9039326012134552, + "num_tokens": 76138874.0, + "step": 934 + }, + { + "epoch": 0.09341125930366152, + "grad_norm": 0.6952524530738413, + "learning_rate": 9.895484830783557e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.9066027402877808, + "num_tokens": 76220436.0, + "step": 935 + }, + { + "epoch": 0.09351116439382587, + "grad_norm": 0.99073003503062, + "learning_rate": 9.89515550808229e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.9080641865730286, + "num_tokens": 76301949.0, + "step": 936 + }, + { + "epoch": 0.09361106948399021, + "grad_norm": 0.6835022634376321, + "learning_rate": 9.894825672853451e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.9033417999744415, + "num_tokens": 76383402.0, + "step": 937 + }, + { + "epoch": 0.09371097457415456, + "grad_norm": 0.5818447441235276, + "learning_rate": 9.894495325131577e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.9059834480285645, + "num_tokens": 76464914.0, + "step": 938 + }, + { + "epoch": 0.09381087966431889, + "grad_norm": 0.690766203284401, + "learning_rate": 9.894164464951254e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.9036503732204437, + "num_tokens": 76546509.0, + "step": 939 + }, + { + "epoch": 0.09391078475448324, + "grad_norm": 0.6442400684680549, + "learning_rate": 9.893833092347125e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.9060201048851013, + "num_tokens": 76628055.0, + "step": 940 + }, + { + "epoch": 0.09401068984464758, + "grad_norm": 0.6188050218256352, + "learning_rate": 9.893501207353883e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.9068036675453186, + "num_tokens": 76709540.0, + "step": 941 + }, + { + "epoch": 0.09411059493481193, + "grad_norm": 0.7270315263475635, + "learning_rate": 9.893168810006277e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.9051905572414398, + "num_tokens": 76791133.0, + "step": 942 + }, + { + "epoch": 0.09421050002497627, + "grad_norm": 0.7234031713826986, + "learning_rate": 9.892835900339111e-06, + "loss": 0.505, + "mean_token_accuracy": 0.9059901535511017, + "num_tokens": 76872581.0, + "step": 943 + }, + { + "epoch": 0.09431040511514062, + "grad_norm": 0.5371683390272445, + "learning_rate": 9.892502478387239e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.9088720977306366, + "num_tokens": 76954086.0, + "step": 944 + }, + { + "epoch": 0.09441031020530496, + "grad_norm": 0.6356043917527051, + "learning_rate": 9.89216854418557e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.9097650051116943, + "num_tokens": 77035670.0, + "step": 945 + }, + { + "epoch": 0.09451021529546931, + "grad_norm": 0.5971046344738203, + "learning_rate": 9.891834097769071e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.9055157601833344, + "num_tokens": 77117112.0, + "step": 946 + }, + { + "epoch": 0.09461012038563364, + "grad_norm": 0.6412418813163155, + "learning_rate": 9.891499139172755e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.9073620438575745, + "num_tokens": 77198686.0, + "step": 947 + }, + { + "epoch": 0.09471002547579799, + "grad_norm": 0.6865324125718156, + "learning_rate": 9.891163668431696e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.903347373008728, + "num_tokens": 77280255.0, + "step": 948 + }, + { + "epoch": 0.09480993056596233, + "grad_norm": 0.6451423276665099, + "learning_rate": 9.890827685581014e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.9059634506702423, + "num_tokens": 77361858.0, + "step": 949 + }, + { + "epoch": 0.09490983565612668, + "grad_norm": 1.047913975694403, + "learning_rate": 9.890491190655892e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.9056960046291351, + "num_tokens": 77443450.0, + "step": 950 + }, + { + "epoch": 0.09500974074629102, + "grad_norm": 0.6345404709730971, + "learning_rate": 9.890154183691554e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.9061975479125977, + "num_tokens": 77525022.0, + "step": 951 + }, + { + "epoch": 0.09510964583645537, + "grad_norm": 0.7940461114354487, + "learning_rate": 9.88981666472329e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.9074730277061462, + "num_tokens": 77606463.0, + "step": 952 + }, + { + "epoch": 0.09520955092661972, + "grad_norm": 0.6718795068822522, + "learning_rate": 9.88947863378644e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.9067387282848358, + "num_tokens": 77687984.0, + "step": 953 + }, + { + "epoch": 0.09530945601678406, + "grad_norm": 0.6224530200013649, + "learning_rate": 9.889140090916394e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.9064422845840454, + "num_tokens": 77769456.0, + "step": 954 + }, + { + "epoch": 0.09540936110694839, + "grad_norm": 0.612481699782118, + "learning_rate": 9.888801036148597e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.9080550968647003, + "num_tokens": 77851004.0, + "step": 955 + }, + { + "epoch": 0.09550926619711274, + "grad_norm": 0.9058196112343737, + "learning_rate": 9.888461469518547e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.9042698442935944, + "num_tokens": 77932478.0, + "step": 956 + }, + { + "epoch": 0.09560917128727708, + "grad_norm": 0.5764323500840652, + "learning_rate": 9.8881213910618e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.9035286605358124, + "num_tokens": 78013989.0, + "step": 957 + }, + { + "epoch": 0.09570907637744143, + "grad_norm": 0.6824826399591191, + "learning_rate": 9.887780800813963e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.9058088660240173, + "num_tokens": 78095497.0, + "step": 958 + }, + { + "epoch": 0.09580898146760577, + "grad_norm": 0.6219407160133975, + "learning_rate": 9.887439698810694e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.9046356081962585, + "num_tokens": 78176977.0, + "step": 959 + }, + { + "epoch": 0.09590888655777012, + "grad_norm": 0.6710910470818688, + "learning_rate": 9.887098085087707e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.9043632447719574, + "num_tokens": 78258348.0, + "step": 960 + }, + { + "epoch": 0.09600879164793447, + "grad_norm": 0.8243282629560617, + "learning_rate": 9.886755959680769e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.9044191837310791, + "num_tokens": 78339815.0, + "step": 961 + }, + { + "epoch": 0.09610869673809881, + "grad_norm": 0.68917416089181, + "learning_rate": 9.886413322625703e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.9055221676826477, + "num_tokens": 78421276.0, + "step": 962 + }, + { + "epoch": 0.09620860182826314, + "grad_norm": 0.6925058762102648, + "learning_rate": 9.886070173958382e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.9042388200759888, + "num_tokens": 78502759.0, + "step": 963 + }, + { + "epoch": 0.09630850691842749, + "grad_norm": 0.5830954656396667, + "learning_rate": 9.885726513714732e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.9087537825107574, + "num_tokens": 78584239.0, + "step": 964 + }, + { + "epoch": 0.09640841200859183, + "grad_norm": 0.6208006117098726, + "learning_rate": 9.885382341930739e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9068186283111572, + "num_tokens": 78665746.0, + "step": 965 + }, + { + "epoch": 0.09650831709875618, + "grad_norm": 0.7715171394428264, + "learning_rate": 9.885037658642436e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.9057926833629608, + "num_tokens": 78747315.0, + "step": 966 + }, + { + "epoch": 0.09660822218892053, + "grad_norm": 0.5612218336062302, + "learning_rate": 9.88469246388591e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.9058555960655212, + "num_tokens": 78828890.0, + "step": 967 + }, + { + "epoch": 0.09670812727908487, + "grad_norm": 0.6274522558047623, + "learning_rate": 9.884346757697304e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.907541036605835, + "num_tokens": 78910356.0, + "step": 968 + }, + { + "epoch": 0.09680803236924922, + "grad_norm": 0.6646088773317416, + "learning_rate": 9.884000540112814e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.9061321020126343, + "num_tokens": 78991846.0, + "step": 969 + }, + { + "epoch": 0.09690793745941356, + "grad_norm": 1.0498144081549723, + "learning_rate": 9.883653811168693e-06, + "loss": 0.499, + "mean_token_accuracy": 0.9074725806713104, + "num_tokens": 79073406.0, + "step": 970 + }, + { + "epoch": 0.09700784254957791, + "grad_norm": 0.6301362508388633, + "learning_rate": 9.883306570901237e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.9076526761054993, + "num_tokens": 79154951.0, + "step": 971 + }, + { + "epoch": 0.09710774763974224, + "grad_norm": 0.8241587245525471, + "learning_rate": 9.882958819346807e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.9064285755157471, + "num_tokens": 79236488.0, + "step": 972 + }, + { + "epoch": 0.09720765272990659, + "grad_norm": 0.7308597846618243, + "learning_rate": 9.882610556541812e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.906716525554657, + "num_tokens": 79317920.0, + "step": 973 + }, + { + "epoch": 0.09730755782007093, + "grad_norm": 0.6253696780352617, + "learning_rate": 9.882261782522715e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.9067068696022034, + "num_tokens": 79399418.0, + "step": 974 + }, + { + "epoch": 0.09740746291023528, + "grad_norm": 0.6111308975060609, + "learning_rate": 9.881912497326034e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.9064388275146484, + "num_tokens": 79481001.0, + "step": 975 + }, + { + "epoch": 0.09750736800039962, + "grad_norm": 0.8122258040839349, + "learning_rate": 9.88156270098834e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.9095718860626221, + "num_tokens": 79562499.0, + "step": 976 + }, + { + "epoch": 0.09760727309056397, + "grad_norm": 0.6767559248735864, + "learning_rate": 9.881212393546253e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.9068856239318848, + "num_tokens": 79644053.0, + "step": 977 + }, + { + "epoch": 0.09770717818072831, + "grad_norm": 0.6154220945580349, + "learning_rate": 9.880861575036455e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.9062811136245728, + "num_tokens": 79725550.0, + "step": 978 + }, + { + "epoch": 0.09780708327089266, + "grad_norm": 0.5814138215169917, + "learning_rate": 9.880510245495675e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9041571021080017, + "num_tokens": 79807114.0, + "step": 979 + }, + { + "epoch": 0.09790698836105699, + "grad_norm": 0.7395149835232829, + "learning_rate": 9.880158404960698e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.9045887589454651, + "num_tokens": 79888551.0, + "step": 980 + }, + { + "epoch": 0.09800689345122134, + "grad_norm": 0.5699515066757588, + "learning_rate": 9.879806053468361e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.9039920270442963, + "num_tokens": 79970012.0, + "step": 981 + }, + { + "epoch": 0.09810679854138568, + "grad_norm": 0.6330040190163174, + "learning_rate": 9.87945319105556e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.9066498875617981, + "num_tokens": 80051544.0, + "step": 982 + }, + { + "epoch": 0.09820670363155003, + "grad_norm": 0.7507873207304008, + "learning_rate": 9.879099817759232e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.9037018418312073, + "num_tokens": 80133018.0, + "step": 983 + }, + { + "epoch": 0.09830660872171437, + "grad_norm": 0.7281808497063542, + "learning_rate": 9.878745933616383e-06, + "loss": 0.507, + "mean_token_accuracy": 0.904046505689621, + "num_tokens": 80214402.0, + "step": 984 + }, + { + "epoch": 0.09840651381187872, + "grad_norm": 1.1586727066321918, + "learning_rate": 9.878391538664061e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.9069094657897949, + "num_tokens": 80295967.0, + "step": 985 + }, + { + "epoch": 0.09850641890204306, + "grad_norm": 0.5852614015919839, + "learning_rate": 9.878036632939374e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.9075638949871063, + "num_tokens": 80377384.0, + "step": 986 + }, + { + "epoch": 0.09860632399220741, + "grad_norm": 0.7884134378973523, + "learning_rate": 9.877681216479478e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.9069824516773224, + "num_tokens": 80458894.0, + "step": 987 + }, + { + "epoch": 0.09870622908237174, + "grad_norm": 0.5445944790182421, + "learning_rate": 9.877325289321587e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.9120776653289795, + "num_tokens": 80540646.0, + "step": 988 + }, + { + "epoch": 0.09880613417253609, + "grad_norm": 0.7070611957525497, + "learning_rate": 9.876968851502968e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.9082713723182678, + "num_tokens": 80622166.0, + "step": 989 + }, + { + "epoch": 0.09890603926270043, + "grad_norm": 0.5475120195215634, + "learning_rate": 9.876611903060939e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.9053463935852051, + "num_tokens": 80703752.0, + "step": 990 + }, + { + "epoch": 0.09900594435286478, + "grad_norm": 0.6611110643624813, + "learning_rate": 9.876254444032873e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.9078154265880585, + "num_tokens": 80785235.0, + "step": 991 + }, + { + "epoch": 0.09910584944302912, + "grad_norm": 0.9974925599904882, + "learning_rate": 9.875896474456197e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.9051282107830048, + "num_tokens": 80866872.0, + "step": 992 + }, + { + "epoch": 0.09920575453319347, + "grad_norm": 1.3810728820738163, + "learning_rate": 9.875537994368389e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.9080813229084015, + "num_tokens": 80948460.0, + "step": 993 + }, + { + "epoch": 0.09930565962335781, + "grad_norm": 0.8013758706833427, + "learning_rate": 9.875179003806985e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.9060831665992737, + "num_tokens": 81030001.0, + "step": 994 + }, + { + "epoch": 0.09940556471352216, + "grad_norm": 0.7371976785381749, + "learning_rate": 9.87481950280957e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.9056420028209686, + "num_tokens": 81111579.0, + "step": 995 + }, + { + "epoch": 0.09950546980368649, + "grad_norm": 0.5974332586171462, + "learning_rate": 9.874459491413784e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.9074396193027496, + "num_tokens": 81193110.0, + "step": 996 + }, + { + "epoch": 0.09960537489385084, + "grad_norm": 0.8002550264963368, + "learning_rate": 9.874098969657321e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.905116468667984, + "num_tokens": 81274583.0, + "step": 997 + }, + { + "epoch": 0.09970527998401518, + "grad_norm": 0.6496923386542186, + "learning_rate": 9.873737937577928e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.903766006231308, + "num_tokens": 81356029.0, + "step": 998 + }, + { + "epoch": 0.09980518507417953, + "grad_norm": 0.599974274508544, + "learning_rate": 9.873376395213405e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.9055802524089813, + "num_tokens": 81437413.0, + "step": 999 + }, + { + "epoch": 0.09990509016434387, + "grad_norm": 0.691407633100139, + "learning_rate": 9.873014342601605e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.9055542051792145, + "num_tokens": 81518898.0, + "step": 1000 + }, + { + "epoch": 0.10000499525450822, + "grad_norm": 0.568529285923241, + "learning_rate": 9.872651779780438e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.9086836576461792, + "num_tokens": 81600494.0, + "step": 1001 + }, + { + "epoch": 0.10010490034467256, + "grad_norm": 0.5338687327532838, + "learning_rate": 9.872288706787862e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.9039679765701294, + "num_tokens": 81681977.0, + "step": 1002 + }, + { + "epoch": 0.10020480543483691, + "grad_norm": 0.724697940254083, + "learning_rate": 9.871925123661892e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.9049167335033417, + "num_tokens": 81763472.0, + "step": 1003 + }, + { + "epoch": 0.10030471052500124, + "grad_norm": 0.5905036666427542, + "learning_rate": 9.871561030440594e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.9076482951641083, + "num_tokens": 81845002.0, + "step": 1004 + }, + { + "epoch": 0.10040461561516559, + "grad_norm": 0.6389488654421148, + "learning_rate": 9.871196427162094e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.9053499698638916, + "num_tokens": 81926418.0, + "step": 1005 + }, + { + "epoch": 0.10050452070532993, + "grad_norm": 1.308578667858042, + "learning_rate": 9.87083131386456e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.9046832919120789, + "num_tokens": 82007981.0, + "step": 1006 + }, + { + "epoch": 0.10060442579549428, + "grad_norm": 0.745730827417396, + "learning_rate": 9.870465690586223e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.908117949962616, + "num_tokens": 82089476.0, + "step": 1007 + }, + { + "epoch": 0.10070433088565862, + "grad_norm": 0.7478691398300212, + "learning_rate": 9.870099557365367e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.9036674201488495, + "num_tokens": 82171040.0, + "step": 1008 + }, + { + "epoch": 0.10080423597582297, + "grad_norm": 0.827054256609352, + "learning_rate": 9.86973291424032e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.9009964466094971, + "num_tokens": 82252412.0, + "step": 1009 + }, + { + "epoch": 0.10090414106598732, + "grad_norm": 0.7673148628163268, + "learning_rate": 9.869365761249474e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.903214693069458, + "num_tokens": 82333919.0, + "step": 1010 + }, + { + "epoch": 0.10100404615615166, + "grad_norm": 0.642724706259942, + "learning_rate": 9.868998098431269e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.9036308526992798, + "num_tokens": 82415470.0, + "step": 1011 + }, + { + "epoch": 0.101103951246316, + "grad_norm": 0.5900916102170062, + "learning_rate": 9.8686299258242e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.9085716009140015, + "num_tokens": 82497025.0, + "step": 1012 + }, + { + "epoch": 0.10120385633648034, + "grad_norm": 0.8180787682956736, + "learning_rate": 9.868261243466815e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.9069214165210724, + "num_tokens": 82578602.0, + "step": 1013 + }, + { + "epoch": 0.10130376142664468, + "grad_norm": 0.5838091981251274, + "learning_rate": 9.867892051397714e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.9065000116825104, + "num_tokens": 82660134.0, + "step": 1014 + }, + { + "epoch": 0.10140366651680903, + "grad_norm": 0.6013070357095354, + "learning_rate": 9.867522349655555e-06, + "loss": 0.499, + "mean_token_accuracy": 0.9063485562801361, + "num_tokens": 82741697.0, + "step": 1015 + }, + { + "epoch": 0.10150357160697338, + "grad_norm": 0.793559465424094, + "learning_rate": 9.867152138279043e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.9051179885864258, + "num_tokens": 82823198.0, + "step": 1016 + }, + { + "epoch": 0.10160347669713772, + "grad_norm": 0.6331060516872544, + "learning_rate": 9.866781417306943e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.9050199389457703, + "num_tokens": 82904715.0, + "step": 1017 + }, + { + "epoch": 0.10170338178730207, + "grad_norm": 0.8068829636943445, + "learning_rate": 9.866410186778066e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.905917227268219, + "num_tokens": 82986232.0, + "step": 1018 + }, + { + "epoch": 0.10180328687746641, + "grad_norm": 0.9252464337758295, + "learning_rate": 9.866038446731282e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.9039911925792694, + "num_tokens": 83067738.0, + "step": 1019 + }, + { + "epoch": 0.10190319196763076, + "grad_norm": 0.6691126200560913, + "learning_rate": 9.865666197205514e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.9060443639755249, + "num_tokens": 83149276.0, + "step": 1020 + }, + { + "epoch": 0.10200309705779509, + "grad_norm": 0.5677343783393776, + "learning_rate": 9.865293438239734e-06, + "loss": 0.499, + "mean_token_accuracy": 0.9083148539066315, + "num_tokens": 83230811.0, + "step": 1021 + }, + { + "epoch": 0.10210300214795943, + "grad_norm": 0.7027310328113652, + "learning_rate": 9.864920169872972e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.9063697755336761, + "num_tokens": 83312206.0, + "step": 1022 + }, + { + "epoch": 0.10220290723812378, + "grad_norm": 0.7321369485652562, + "learning_rate": 9.864546392144309e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.9057006537914276, + "num_tokens": 83393745.0, + "step": 1023 + }, + { + "epoch": 0.10230281232828813, + "grad_norm": 0.6146619888138132, + "learning_rate": 9.86417210509288e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.9062700271606445, + "num_tokens": 83475323.0, + "step": 1024 + }, + { + "epoch": 0.10240271741845247, + "grad_norm": 0.5446173009390394, + "learning_rate": 9.863797308757872e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.9073824882507324, + "num_tokens": 83556844.0, + "step": 1025 + }, + { + "epoch": 0.10250262250861682, + "grad_norm": 0.622481073664229, + "learning_rate": 9.863422003178528e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.9046519100666046, + "num_tokens": 83638289.0, + "step": 1026 + }, + { + "epoch": 0.10260252759878116, + "grad_norm": 0.62056281484532, + "learning_rate": 9.863046188394145e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.9060015082359314, + "num_tokens": 83719781.0, + "step": 1027 + }, + { + "epoch": 0.10270243268894551, + "grad_norm": 0.5146928175945269, + "learning_rate": 9.862669864444068e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.9066275656223297, + "num_tokens": 83801247.0, + "step": 1028 + }, + { + "epoch": 0.10280233777910984, + "grad_norm": 0.5679504543549767, + "learning_rate": 9.862293031367698e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.9038594663143158, + "num_tokens": 83882795.0, + "step": 1029 + }, + { + "epoch": 0.10290224286927419, + "grad_norm": 0.5594440172239143, + "learning_rate": 9.86191568920449e-06, + "loss": 0.497, + "mean_token_accuracy": 0.9067273437976837, + "num_tokens": 83964346.0, + "step": 1030 + }, + { + "epoch": 0.10300214795943853, + "grad_norm": 0.644228029005635, + "learning_rate": 9.861537837993957e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.9042503833770752, + "num_tokens": 84045791.0, + "step": 1031 + }, + { + "epoch": 0.10310205304960288, + "grad_norm": 0.7489471879634819, + "learning_rate": 9.861159477775653e-06, + "loss": 0.502, + "mean_token_accuracy": 0.9066546261310577, + "num_tokens": 84127263.0, + "step": 1032 + }, + { + "epoch": 0.10320195813976722, + "grad_norm": 1.0370537606026602, + "learning_rate": 9.860780608589197e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.9109793901443481, + "num_tokens": 84208763.0, + "step": 1033 + }, + { + "epoch": 0.10330186322993157, + "grad_norm": 0.6489323919817822, + "learning_rate": 9.860401230474257e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.9062540233135223, + "num_tokens": 84290258.0, + "step": 1034 + }, + { + "epoch": 0.10340176832009591, + "grad_norm": 0.6698706202567232, + "learning_rate": 9.860021343470554e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.9099403321743011, + "num_tokens": 84371790.0, + "step": 1035 + }, + { + "epoch": 0.10350167341026026, + "grad_norm": 0.9123852246063081, + "learning_rate": 9.859640947617861e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.9062626659870148, + "num_tokens": 84453247.0, + "step": 1036 + }, + { + "epoch": 0.10360157850042459, + "grad_norm": 1.379226304843551, + "learning_rate": 9.859260042956008e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.9062198400497437, + "num_tokens": 84534757.0, + "step": 1037 + }, + { + "epoch": 0.10370148359058894, + "grad_norm": 0.6512581970720652, + "learning_rate": 9.858878629524876e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.9063684940338135, + "num_tokens": 84616340.0, + "step": 1038 + }, + { + "epoch": 0.10380138868075328, + "grad_norm": 0.6938645821420333, + "learning_rate": 9.858496707364395e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.9057823121547699, + "num_tokens": 84697854.0, + "step": 1039 + }, + { + "epoch": 0.10390129377091763, + "grad_norm": 0.8898365785116069, + "learning_rate": 9.858114276514557e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.907761424779892, + "num_tokens": 84779459.0, + "step": 1040 + }, + { + "epoch": 0.10400119886108197, + "grad_norm": 0.8241586552387244, + "learning_rate": 9.857731337015403e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.9058278203010559, + "num_tokens": 84861011.0, + "step": 1041 + }, + { + "epoch": 0.10410110395124632, + "grad_norm": 0.7331154221504823, + "learning_rate": 9.857347888907025e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.9101307988166809, + "num_tokens": 84942599.0, + "step": 1042 + }, + { + "epoch": 0.10420100904141066, + "grad_norm": 1.1221146265766584, + "learning_rate": 9.85696393222957e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.9081557393074036, + "num_tokens": 85024250.0, + "step": 1043 + }, + { + "epoch": 0.10430091413157501, + "grad_norm": 0.6974051200332275, + "learning_rate": 9.856579467023243e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.9053070247173309, + "num_tokens": 85105759.0, + "step": 1044 + }, + { + "epoch": 0.10440081922173934, + "grad_norm": 0.6208022355163283, + "learning_rate": 9.856194493328293e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.9053096175193787, + "num_tokens": 85187234.0, + "step": 1045 + }, + { + "epoch": 0.10450072431190369, + "grad_norm": 0.6281737506179377, + "learning_rate": 9.855809011185029e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.9078777134418488, + "num_tokens": 85268756.0, + "step": 1046 + }, + { + "epoch": 0.10460062940206803, + "grad_norm": 1.1513684990385873, + "learning_rate": 9.855423020633812e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.9040239155292511, + "num_tokens": 85350299.0, + "step": 1047 + }, + { + "epoch": 0.10470053449223238, + "grad_norm": 0.6392861994905573, + "learning_rate": 9.855036521715055e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.9055265784263611, + "num_tokens": 85431814.0, + "step": 1048 + }, + { + "epoch": 0.10480043958239672, + "grad_norm": 0.5257413908722676, + "learning_rate": 9.854649514469224e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.9061209261417389, + "num_tokens": 85513357.0, + "step": 1049 + }, + { + "epoch": 0.10490034467256107, + "grad_norm": 0.7896854114263975, + "learning_rate": 9.85426199893684e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.9049706161022186, + "num_tokens": 85594867.0, + "step": 1050 + }, + { + "epoch": 0.10500024976272541, + "grad_norm": 0.7092288053727523, + "learning_rate": 9.853873975158476e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.9049805700778961, + "num_tokens": 85676402.0, + "step": 1051 + }, + { + "epoch": 0.10510015485288976, + "grad_norm": 0.6103174034774419, + "learning_rate": 9.85348544317476e-06, + "loss": 0.502, + "mean_token_accuracy": 0.9079710841178894, + "num_tokens": 85757872.0, + "step": 1052 + }, + { + "epoch": 0.10520005994305409, + "grad_norm": 0.7141642383834341, + "learning_rate": 9.853096403026367e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.9062789082527161, + "num_tokens": 85839331.0, + "step": 1053 + }, + { + "epoch": 0.10529996503321844, + "grad_norm": 0.8844553449440863, + "learning_rate": 9.852706854754037e-06, + "loss": 0.493, + "mean_token_accuracy": 0.9055916666984558, + "num_tokens": 85920946.0, + "step": 1054 + }, + { + "epoch": 0.10539987012338278, + "grad_norm": 0.7104932050720584, + "learning_rate": 9.85231679839855e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.9062235057353973, + "num_tokens": 86002393.0, + "step": 1055 + }, + { + "epoch": 0.10549977521354713, + "grad_norm": 0.7891053316275405, + "learning_rate": 9.851926234000747e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.9085209965705872, + "num_tokens": 86083920.0, + "step": 1056 + }, + { + "epoch": 0.10559968030371147, + "grad_norm": 0.6595435052911148, + "learning_rate": 9.851535161601521e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.9070814847946167, + "num_tokens": 86165392.0, + "step": 1057 + }, + { + "epoch": 0.10569958539387582, + "grad_norm": 0.8091382114478501, + "learning_rate": 9.851143581241817e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.9068827331066132, + "num_tokens": 86246935.0, + "step": 1058 + }, + { + "epoch": 0.10579949048404017, + "grad_norm": 0.7031039567382337, + "learning_rate": 9.850751492962636e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.9065691232681274, + "num_tokens": 86328460.0, + "step": 1059 + }, + { + "epoch": 0.10589939557420451, + "grad_norm": 0.7869577007242062, + "learning_rate": 9.850358896805028e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.9053292870521545, + "num_tokens": 86409977.0, + "step": 1060 + }, + { + "epoch": 0.10599930066436886, + "grad_norm": 0.8713176897669231, + "learning_rate": 9.849965792810099e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.9079760313034058, + "num_tokens": 86491407.0, + "step": 1061 + }, + { + "epoch": 0.10609920575453319, + "grad_norm": 0.7426495618931361, + "learning_rate": 9.849572181019008e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.9048053622245789, + "num_tokens": 86572920.0, + "step": 1062 + }, + { + "epoch": 0.10619911084469753, + "grad_norm": 0.6433698686259047, + "learning_rate": 9.849178061472962e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.9055919647216797, + "num_tokens": 86654518.0, + "step": 1063 + }, + { + "epoch": 0.10629901593486188, + "grad_norm": 0.6085578245496708, + "learning_rate": 9.848783434213232e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.9089394807815552, + "num_tokens": 86736062.0, + "step": 1064 + }, + { + "epoch": 0.10639892102502622, + "grad_norm": 0.568375799859436, + "learning_rate": 9.848388299281132e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.9085326492786407, + "num_tokens": 86817657.0, + "step": 1065 + }, + { + "epoch": 0.10649882611519057, + "grad_norm": 0.6851634811632128, + "learning_rate": 9.847992656718035e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.9070842862129211, + "num_tokens": 86899131.0, + "step": 1066 + }, + { + "epoch": 0.10659873120535492, + "grad_norm": 0.6967178518142898, + "learning_rate": 9.847596506565365e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.9046452939510345, + "num_tokens": 86980646.0, + "step": 1067 + }, + { + "epoch": 0.10669863629551926, + "grad_norm": 0.7392193970626255, + "learning_rate": 9.847199848864597e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.904921293258667, + "num_tokens": 87062195.0, + "step": 1068 + }, + { + "epoch": 0.1067985413856836, + "grad_norm": 0.7959442711828154, + "learning_rate": 9.846802683657264e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.9080069959163666, + "num_tokens": 87143769.0, + "step": 1069 + }, + { + "epoch": 0.10689844647584794, + "grad_norm": 0.7850738922084289, + "learning_rate": 9.846405010984948e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.9077697992324829, + "num_tokens": 87225288.0, + "step": 1070 + }, + { + "epoch": 0.10699835156601228, + "grad_norm": 0.6846983158047765, + "learning_rate": 9.846006830889285e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.9050372540950775, + "num_tokens": 87306722.0, + "step": 1071 + }, + { + "epoch": 0.10709825665617663, + "grad_norm": 0.6925873128245623, + "learning_rate": 9.84560814341197e-06, + "loss": 0.505, + "mean_token_accuracy": 0.9045503437519073, + "num_tokens": 87388198.0, + "step": 1072 + }, + { + "epoch": 0.10719816174634098, + "grad_norm": 0.6341549040166629, + "learning_rate": 9.845208948594739e-06, + "loss": 0.496, + "mean_token_accuracy": 0.9065175652503967, + "num_tokens": 87469798.0, + "step": 1073 + }, + { + "epoch": 0.10729806683650532, + "grad_norm": 0.6441160984453093, + "learning_rate": 9.844809246479392e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.9043907523155212, + "num_tokens": 87551242.0, + "step": 1074 + }, + { + "epoch": 0.10739797192666967, + "grad_norm": 0.7829927453496007, + "learning_rate": 9.844409037107778e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9076542854309082, + "num_tokens": 87632761.0, + "step": 1075 + }, + { + "epoch": 0.10749787701683401, + "grad_norm": 0.5955189532054148, + "learning_rate": 9.844008320521798e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.9048559367656708, + "num_tokens": 87714255.0, + "step": 1076 + }, + { + "epoch": 0.10759778210699836, + "grad_norm": 0.5845286767636746, + "learning_rate": 9.843607096763408e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.9064908921718597, + "num_tokens": 87795797.0, + "step": 1077 + }, + { + "epoch": 0.10769768719716269, + "grad_norm": 0.6901868568875777, + "learning_rate": 9.843205365874619e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.9076903760433197, + "num_tokens": 87877334.0, + "step": 1078 + }, + { + "epoch": 0.10779759228732703, + "grad_norm": 0.5530344861077413, + "learning_rate": 9.842803127897489e-06, + "loss": 0.497, + "mean_token_accuracy": 0.9080210626125336, + "num_tokens": 87958884.0, + "step": 1079 + }, + { + "epoch": 0.10789749737749138, + "grad_norm": 0.7881353753158031, + "learning_rate": 9.842400382874133e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.9061259925365448, + "num_tokens": 88040396.0, + "step": 1080 + }, + { + "epoch": 0.10799740246765573, + "grad_norm": 0.622522421426118, + "learning_rate": 9.84199713084672e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.9059605300426483, + "num_tokens": 88121957.0, + "step": 1081 + }, + { + "epoch": 0.10809730755782007, + "grad_norm": 0.719669123568516, + "learning_rate": 9.841593371857472e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.9067133367061615, + "num_tokens": 88203424.0, + "step": 1082 + }, + { + "epoch": 0.10819721264798442, + "grad_norm": 0.7922611921347819, + "learning_rate": 9.841189105948661e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.9068337082862854, + "num_tokens": 88284932.0, + "step": 1083 + }, + { + "epoch": 0.10829711773814876, + "grad_norm": 0.6558623937461383, + "learning_rate": 9.840784333162614e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.9053437411785126, + "num_tokens": 88366443.0, + "step": 1084 + }, + { + "epoch": 0.10839702282831311, + "grad_norm": 0.7023128703384481, + "learning_rate": 9.840379053541714e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.9066589772701263, + "num_tokens": 88447934.0, + "step": 1085 + }, + { + "epoch": 0.10849692791847744, + "grad_norm": 0.837701109220317, + "learning_rate": 9.83997326712839e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.9063440561294556, + "num_tokens": 88529452.0, + "step": 1086 + }, + { + "epoch": 0.10859683300864179, + "grad_norm": 0.6771820508898767, + "learning_rate": 9.83956697396513e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.9062702357769012, + "num_tokens": 88610931.0, + "step": 1087 + }, + { + "epoch": 0.10869673809880613, + "grad_norm": 0.574142001354196, + "learning_rate": 9.839160174094476e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.9063471853733063, + "num_tokens": 88692355.0, + "step": 1088 + }, + { + "epoch": 0.10879664318897048, + "grad_norm": 0.9547879324771706, + "learning_rate": 9.838752867559015e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.9097233712673187, + "num_tokens": 88773855.0, + "step": 1089 + }, + { + "epoch": 0.10889654827913482, + "grad_norm": 0.7237970711051996, + "learning_rate": 9.838345054401398e-06, + "loss": 0.499, + "mean_token_accuracy": 0.9070104658603668, + "num_tokens": 88855377.0, + "step": 1090 + }, + { + "epoch": 0.10899645336929917, + "grad_norm": 0.866745875970499, + "learning_rate": 9.837936734664318e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.9027663469314575, + "num_tokens": 88936784.0, + "step": 1091 + }, + { + "epoch": 0.10909635845946351, + "grad_norm": 0.5996564425554918, + "learning_rate": 9.83752790839053e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.9072145819664001, + "num_tokens": 89018377.0, + "step": 1092 + }, + { + "epoch": 0.10919626354962786, + "grad_norm": 0.6361869571490653, + "learning_rate": 9.837118575622839e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.9059595465660095, + "num_tokens": 89099938.0, + "step": 1093 + }, + { + "epoch": 0.10929616863979219, + "grad_norm": 0.6624092802738656, + "learning_rate": 9.836708736404099e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.9071537852287292, + "num_tokens": 89181493.0, + "step": 1094 + }, + { + "epoch": 0.10939607372995654, + "grad_norm": 0.644559999408759, + "learning_rate": 9.836298390777226e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.9063251912593842, + "num_tokens": 89263069.0, + "step": 1095 + }, + { + "epoch": 0.10949597882012088, + "grad_norm": 0.6210982319056623, + "learning_rate": 9.835887538785179e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.9087423086166382, + "num_tokens": 89344762.0, + "step": 1096 + }, + { + "epoch": 0.10959588391028523, + "grad_norm": 0.6993749100932821, + "learning_rate": 9.835476180470975e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.9052612781524658, + "num_tokens": 89426255.0, + "step": 1097 + }, + { + "epoch": 0.10969578900044957, + "grad_norm": 0.5725559550180673, + "learning_rate": 9.835064315877685e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.9055538773536682, + "num_tokens": 89507702.0, + "step": 1098 + }, + { + "epoch": 0.10979569409061392, + "grad_norm": 0.6652426479722771, + "learning_rate": 9.83465194504843e-06, + "loss": 0.494, + "mean_token_accuracy": 0.9081504046916962, + "num_tokens": 89589275.0, + "step": 1099 + }, + { + "epoch": 0.10989559918077826, + "grad_norm": 0.6420298743718924, + "learning_rate": 9.834239068026388e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.9068346917629242, + "num_tokens": 89670759.0, + "step": 1100 + }, + { + "epoch": 0.10999550427094261, + "grad_norm": 0.5318071745241069, + "learning_rate": 9.833825684854787e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.9106713533401489, + "num_tokens": 89752249.0, + "step": 1101 + }, + { + "epoch": 0.11009540936110696, + "grad_norm": 0.5676812032775571, + "learning_rate": 9.833411795576908e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.9068218171596527, + "num_tokens": 89833713.0, + "step": 1102 + }, + { + "epoch": 0.11019531445127129, + "grad_norm": 0.6496192959964966, + "learning_rate": 9.832997400236085e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.9077355265617371, + "num_tokens": 89915210.0, + "step": 1103 + }, + { + "epoch": 0.11029521954143563, + "grad_norm": 0.6426459693895721, + "learning_rate": 9.832582498875706e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.9057691693305969, + "num_tokens": 89996639.0, + "step": 1104 + }, + { + "epoch": 0.11039512463159998, + "grad_norm": 0.7918633843397024, + "learning_rate": 9.832167091539215e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.9072847664356232, + "num_tokens": 90078181.0, + "step": 1105 + }, + { + "epoch": 0.11049502972176432, + "grad_norm": 0.6224375593855032, + "learning_rate": 9.831751178270099e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.9085509479045868, + "num_tokens": 90159700.0, + "step": 1106 + }, + { + "epoch": 0.11059493481192867, + "grad_norm": 0.6513379041093277, + "learning_rate": 9.83133475911191e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.9054588079452515, + "num_tokens": 90241147.0, + "step": 1107 + }, + { + "epoch": 0.11069483990209301, + "grad_norm": 0.7258050209655837, + "learning_rate": 9.830917834108245e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.9072901010513306, + "num_tokens": 90322626.0, + "step": 1108 + }, + { + "epoch": 0.11079474499225736, + "grad_norm": 0.5785104040384333, + "learning_rate": 9.830500403302756e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.9048466086387634, + "num_tokens": 90404088.0, + "step": 1109 + }, + { + "epoch": 0.1108946500824217, + "grad_norm": 0.6501480441192384, + "learning_rate": 9.830082466739149e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.9056349396705627, + "num_tokens": 90485549.0, + "step": 1110 + }, + { + "epoch": 0.11099455517258604, + "grad_norm": 0.6563509922017631, + "learning_rate": 9.829664024461183e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.9058310389518738, + "num_tokens": 90567110.0, + "step": 1111 + }, + { + "epoch": 0.11109446026275038, + "grad_norm": 0.6941765475182315, + "learning_rate": 9.82924507651267e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.905496209859848, + "num_tokens": 90648645.0, + "step": 1112 + }, + { + "epoch": 0.11119436535291473, + "grad_norm": 0.6577606979236054, + "learning_rate": 9.828825622937474e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.9051339030265808, + "num_tokens": 90730095.0, + "step": 1113 + }, + { + "epoch": 0.11129427044307907, + "grad_norm": 0.6758286163823568, + "learning_rate": 9.82840566377951e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.9016374051570892, + "num_tokens": 90811577.0, + "step": 1114 + }, + { + "epoch": 0.11139417553324342, + "grad_norm": 0.7161202671351613, + "learning_rate": 9.82798519908275e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.9070679545402527, + "num_tokens": 90893189.0, + "step": 1115 + }, + { + "epoch": 0.11149408062340777, + "grad_norm": 0.6904891983894825, + "learning_rate": 9.827564228891218e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.9041869640350342, + "num_tokens": 90974743.0, + "step": 1116 + }, + { + "epoch": 0.11159398571357211, + "grad_norm": 0.6344553322195443, + "learning_rate": 9.827142753248986e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.9059279263019562, + "num_tokens": 91056292.0, + "step": 1117 + }, + { + "epoch": 0.11169389080373646, + "grad_norm": 0.6787161049610938, + "learning_rate": 9.826720772200187e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.9056950509548187, + "num_tokens": 91137831.0, + "step": 1118 + }, + { + "epoch": 0.11179379589390079, + "grad_norm": 0.5580833807310268, + "learning_rate": 9.826298285789002e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.9080054759979248, + "num_tokens": 91219362.0, + "step": 1119 + }, + { + "epoch": 0.11189370098406513, + "grad_norm": 0.6174337668320347, + "learning_rate": 9.825875294059663e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.9083640277385712, + "num_tokens": 91300867.0, + "step": 1120 + }, + { + "epoch": 0.11199360607422948, + "grad_norm": 0.7070194288548273, + "learning_rate": 9.825451797056462e-06, + "loss": 0.506, + "mean_token_accuracy": 0.9065976440906525, + "num_tokens": 91382267.0, + "step": 1121 + }, + { + "epoch": 0.11209351116439382, + "grad_norm": 0.6264014854807173, + "learning_rate": 9.825027794823738e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.9047979712486267, + "num_tokens": 91463742.0, + "step": 1122 + }, + { + "epoch": 0.11219341625455817, + "grad_norm": 0.6141706003477737, + "learning_rate": 9.824603287405881e-06, + "loss": 0.497, + "mean_token_accuracy": 0.9074931740760803, + "num_tokens": 91545282.0, + "step": 1123 + }, + { + "epoch": 0.11229332134472252, + "grad_norm": 0.5810307480499105, + "learning_rate": 9.824178274847343e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.9071164727210999, + "num_tokens": 91626847.0, + "step": 1124 + }, + { + "epoch": 0.11239322643488686, + "grad_norm": 0.5926761003343562, + "learning_rate": 9.823752757192619e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.9066060483455658, + "num_tokens": 91708391.0, + "step": 1125 + }, + { + "epoch": 0.11249313152505121, + "grad_norm": 0.6964764051188679, + "learning_rate": 9.823326734486262e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.9057667851448059, + "num_tokens": 91789862.0, + "step": 1126 + }, + { + "epoch": 0.11259303661521554, + "grad_norm": 0.791535000740797, + "learning_rate": 9.822900206772879e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.9066653847694397, + "num_tokens": 91871481.0, + "step": 1127 + }, + { + "epoch": 0.11269294170537988, + "grad_norm": 0.6755877217713926, + "learning_rate": 9.822473174097125e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.9065615832805634, + "num_tokens": 91953003.0, + "step": 1128 + }, + { + "epoch": 0.11279284679554423, + "grad_norm": 0.5965901869309838, + "learning_rate": 9.822045636503713e-06, + "loss": 0.503, + "mean_token_accuracy": 0.9043365120887756, + "num_tokens": 92034496.0, + "step": 1129 + }, + { + "epoch": 0.11289275188570858, + "grad_norm": 0.9863642176494875, + "learning_rate": 9.821617594037405e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.905674934387207, + "num_tokens": 92116013.0, + "step": 1130 + }, + { + "epoch": 0.11299265697587292, + "grad_norm": 0.7495436520185531, + "learning_rate": 9.821189046743019e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.9024029672145844, + "num_tokens": 92197505.0, + "step": 1131 + }, + { + "epoch": 0.11309256206603727, + "grad_norm": 0.6177695432769658, + "learning_rate": 9.820759994665422e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.9044261574745178, + "num_tokens": 92279089.0, + "step": 1132 + }, + { + "epoch": 0.11319246715620161, + "grad_norm": 0.6427524823556994, + "learning_rate": 9.820330437849538e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.9076410531997681, + "num_tokens": 92360645.0, + "step": 1133 + }, + { + "epoch": 0.11329237224636596, + "grad_norm": 0.690386509013419, + "learning_rate": 9.819900376340342e-06, + "loss": 0.493, + "mean_token_accuracy": 0.9086959064006805, + "num_tokens": 92442238.0, + "step": 1134 + }, + { + "epoch": 0.11339227733653029, + "grad_norm": 0.8798823875350896, + "learning_rate": 9.819469810182862e-06, + "loss": 0.508, + "mean_token_accuracy": 0.9020530581474304, + "num_tokens": 92523663.0, + "step": 1135 + }, + { + "epoch": 0.11349218242669463, + "grad_norm": 0.608229166872612, + "learning_rate": 9.819038739422178e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.9040222465991974, + "num_tokens": 92605192.0, + "step": 1136 + }, + { + "epoch": 0.11359208751685898, + "grad_norm": 0.9965022781980108, + "learning_rate": 9.818607164103425e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.9054820537567139, + "num_tokens": 92686693.0, + "step": 1137 + }, + { + "epoch": 0.11369199260702333, + "grad_norm": 0.7091805877837826, + "learning_rate": 9.818175084271786e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.9036585092544556, + "num_tokens": 92768172.0, + "step": 1138 + }, + { + "epoch": 0.11379189769718767, + "grad_norm": 0.7768241445178974, + "learning_rate": 9.817742499972502e-06, + "loss": 0.502, + "mean_token_accuracy": 0.902696818113327, + "num_tokens": 92849700.0, + "step": 1139 + }, + { + "epoch": 0.11389180278735202, + "grad_norm": 0.6560173080867936, + "learning_rate": 9.817309411250867e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.906270295381546, + "num_tokens": 92931213.0, + "step": 1140 + }, + { + "epoch": 0.11399170787751636, + "grad_norm": 0.6733691903902526, + "learning_rate": 9.816875818152225e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.9040194153785706, + "num_tokens": 93012793.0, + "step": 1141 + }, + { + "epoch": 0.11409161296768071, + "grad_norm": 0.6800302351081714, + "learning_rate": 9.81644172072197e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9044562578201294, + "num_tokens": 93094266.0, + "step": 1142 + }, + { + "epoch": 0.11419151805784505, + "grad_norm": 0.748618761833425, + "learning_rate": 9.816007119005557e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.9065502285957336, + "num_tokens": 93175685.0, + "step": 1143 + }, + { + "epoch": 0.11429142314800939, + "grad_norm": 0.6527531666940761, + "learning_rate": 9.815572013048486e-06, + "loss": 0.5, + "mean_token_accuracy": 0.9053217470645905, + "num_tokens": 93257195.0, + "step": 1144 + }, + { + "epoch": 0.11439132823817373, + "grad_norm": 0.8653960864403345, + "learning_rate": 9.815136402896316e-06, + "loss": 0.493, + "mean_token_accuracy": 0.9045678973197937, + "num_tokens": 93338769.0, + "step": 1145 + }, + { + "epoch": 0.11449123332833808, + "grad_norm": 0.6304006020658596, + "learning_rate": 9.814700288594655e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.9060353338718414, + "num_tokens": 93420286.0, + "step": 1146 + }, + { + "epoch": 0.11459113841850242, + "grad_norm": 0.7514537225307714, + "learning_rate": 9.814263670189162e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.9047472178936005, + "num_tokens": 93501852.0, + "step": 1147 + }, + { + "epoch": 0.11469104350866677, + "grad_norm": 0.7040833981897275, + "learning_rate": 9.813826547725553e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.9056365489959717, + "num_tokens": 93583389.0, + "step": 1148 + }, + { + "epoch": 0.11479094859883111, + "grad_norm": 0.6214038267817337, + "learning_rate": 9.813388921249595e-06, + "loss": 0.497, + "mean_token_accuracy": 0.9087361097335815, + "num_tokens": 93664968.0, + "step": 1149 + }, + { + "epoch": 0.11489085368899546, + "grad_norm": 0.5593962724723487, + "learning_rate": 9.81295079080711e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.9069922566413879, + "num_tokens": 93746692.0, + "step": 1150 + }, + { + "epoch": 0.1149907587791598, + "grad_norm": 0.6872180955398747, + "learning_rate": 9.812512156443967e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.9092081487178802, + "num_tokens": 93828180.0, + "step": 1151 + }, + { + "epoch": 0.11509066386932414, + "grad_norm": 0.5644195155018205, + "learning_rate": 9.812073018206094e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.9057325720787048, + "num_tokens": 93909739.0, + "step": 1152 + }, + { + "epoch": 0.11519056895948848, + "grad_norm": 0.6373441725292636, + "learning_rate": 9.81163337613947e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.9059747159481049, + "num_tokens": 93991300.0, + "step": 1153 + }, + { + "epoch": 0.11529047404965283, + "grad_norm": 0.647586799284575, + "learning_rate": 9.811193230290124e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.9083291590213776, + "num_tokens": 94072850.0, + "step": 1154 + }, + { + "epoch": 0.11539037913981717, + "grad_norm": 0.837834811641438, + "learning_rate": 9.81075258070414e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.9038175046443939, + "num_tokens": 94154281.0, + "step": 1155 + }, + { + "epoch": 0.11549028422998152, + "grad_norm": 0.9398812745315581, + "learning_rate": 9.810311427427653e-06, + "loss": 0.5, + "mean_token_accuracy": 0.9063656628131866, + "num_tokens": 94235814.0, + "step": 1156 + }, + { + "epoch": 0.11559018932014586, + "grad_norm": 0.7367446814010978, + "learning_rate": 9.809869770506855e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.9071980714797974, + "num_tokens": 94317480.0, + "step": 1157 + }, + { + "epoch": 0.11569009441031021, + "grad_norm": 0.798856339017431, + "learning_rate": 9.809427609987987e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.9053193032741547, + "num_tokens": 94399004.0, + "step": 1158 + }, + { + "epoch": 0.11578999950047456, + "grad_norm": 0.810586904609619, + "learning_rate": 9.808984945917344e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.9031384885311127, + "num_tokens": 94480556.0, + "step": 1159 + }, + { + "epoch": 0.11588990459063889, + "grad_norm": 0.7995437754856551, + "learning_rate": 9.808541778341272e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.9062557518482208, + "num_tokens": 94562100.0, + "step": 1160 + }, + { + "epoch": 0.11598980968080323, + "grad_norm": 0.6076155732894528, + "learning_rate": 9.808098107306172e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.9042904078960419, + "num_tokens": 94643680.0, + "step": 1161 + }, + { + "epoch": 0.11608971477096758, + "grad_norm": 0.5331771632141076, + "learning_rate": 9.807653932858497e-06, + "loss": 0.495, + "mean_token_accuracy": 0.9087609648704529, + "num_tokens": 94725195.0, + "step": 1162 + }, + { + "epoch": 0.11618961986113192, + "grad_norm": 0.5949589755602661, + "learning_rate": 9.807209255044752e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.907858818769455, + "num_tokens": 94806756.0, + "step": 1163 + }, + { + "epoch": 0.11628952495129627, + "grad_norm": 1.1828751830532325, + "learning_rate": 9.806764073911496e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.9067663252353668, + "num_tokens": 94888201.0, + "step": 1164 + }, + { + "epoch": 0.11638943004146061, + "grad_norm": 0.6220681905643412, + "learning_rate": 9.806318389505338e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.9056829214096069, + "num_tokens": 94969689.0, + "step": 1165 + }, + { + "epoch": 0.11648933513162496, + "grad_norm": 0.6077393647631656, + "learning_rate": 9.805872201872943e-06, + "loss": 0.5, + "mean_token_accuracy": 0.9058269560337067, + "num_tokens": 95051209.0, + "step": 1166 + }, + { + "epoch": 0.1165892402217893, + "grad_norm": 0.7323934921156801, + "learning_rate": 9.805425511061028e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.9048987030982971, + "num_tokens": 95132623.0, + "step": 1167 + }, + { + "epoch": 0.11668914531195364, + "grad_norm": 0.5762049784920732, + "learning_rate": 9.804978317116362e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.9077184796333313, + "num_tokens": 95214134.0, + "step": 1168 + }, + { + "epoch": 0.11678905040211798, + "grad_norm": 1.0101259694990703, + "learning_rate": 9.804530620085764e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.9050877094268799, + "num_tokens": 95295594.0, + "step": 1169 + }, + { + "epoch": 0.11688895549228233, + "grad_norm": 0.5756551697947431, + "learning_rate": 9.80408242001611e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.907512903213501, + "num_tokens": 95377048.0, + "step": 1170 + }, + { + "epoch": 0.11698886058244667, + "grad_norm": 0.8444855005310042, + "learning_rate": 9.803633716954329e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.9040633141994476, + "num_tokens": 95458539.0, + "step": 1171 + }, + { + "epoch": 0.11708876567261102, + "grad_norm": 0.6752197289954861, + "learning_rate": 9.803184510947397e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.9057181477546692, + "num_tokens": 95539974.0, + "step": 1172 + }, + { + "epoch": 0.11718867076277537, + "grad_norm": 0.6881264390201963, + "learning_rate": 9.802734802042347e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.9032695591449738, + "num_tokens": 95621462.0, + "step": 1173 + }, + { + "epoch": 0.11728857585293971, + "grad_norm": 0.6820934360830769, + "learning_rate": 9.802284590286267e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.9066320061683655, + "num_tokens": 95702909.0, + "step": 1174 + }, + { + "epoch": 0.11738848094310406, + "grad_norm": 0.6492806971904176, + "learning_rate": 9.80183387572629e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.9083454310894012, + "num_tokens": 95784433.0, + "step": 1175 + }, + { + "epoch": 0.11748838603326839, + "grad_norm": 0.5926254687975374, + "learning_rate": 9.801382658409611e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.9052251875400543, + "num_tokens": 95865987.0, + "step": 1176 + }, + { + "epoch": 0.11758829112343273, + "grad_norm": 0.7967161361668432, + "learning_rate": 9.80093093838347e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.9077782034873962, + "num_tokens": 95947497.0, + "step": 1177 + }, + { + "epoch": 0.11768819621359708, + "grad_norm": 0.6800614715597301, + "learning_rate": 9.800478715695165e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.9045092165470123, + "num_tokens": 96028990.0, + "step": 1178 + }, + { + "epoch": 0.11778810130376142, + "grad_norm": 0.5598136414048932, + "learning_rate": 9.80002599039204e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.906121164560318, + "num_tokens": 96110502.0, + "step": 1179 + }, + { + "epoch": 0.11788800639392577, + "grad_norm": 0.9174558820880859, + "learning_rate": 9.799572762521499e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.9050878882408142, + "num_tokens": 96192015.0, + "step": 1180 + }, + { + "epoch": 0.11798791148409012, + "grad_norm": 0.7839032184304501, + "learning_rate": 9.799119032130995e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.9062335789203644, + "num_tokens": 96273450.0, + "step": 1181 + }, + { + "epoch": 0.11808781657425446, + "grad_norm": 0.7834083643161154, + "learning_rate": 9.798664799268032e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.9053440690040588, + "num_tokens": 96354952.0, + "step": 1182 + }, + { + "epoch": 0.11818772166441881, + "grad_norm": 0.5410980098440484, + "learning_rate": 9.798210063980172e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.9081898331642151, + "num_tokens": 96436565.0, + "step": 1183 + }, + { + "epoch": 0.11828762675458315, + "grad_norm": 0.9653913174847869, + "learning_rate": 9.797754826315025e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.9062054455280304, + "num_tokens": 96518071.0, + "step": 1184 + }, + { + "epoch": 0.11838753184474748, + "grad_norm": 0.6184148363958606, + "learning_rate": 9.797299086320253e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.9107569754123688, + "num_tokens": 96599711.0, + "step": 1185 + }, + { + "epoch": 0.11848743693491183, + "grad_norm": 0.6434238414632667, + "learning_rate": 9.796842844043574e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.9035720825195312, + "num_tokens": 96681173.0, + "step": 1186 + }, + { + "epoch": 0.11858734202507618, + "grad_norm": 0.5411891455146053, + "learning_rate": 9.796386099532756e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.9088379442691803, + "num_tokens": 96762808.0, + "step": 1187 + }, + { + "epoch": 0.11868724711524052, + "grad_norm": 1.030514434283469, + "learning_rate": 9.795928852835621e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.9042205214500427, + "num_tokens": 96844300.0, + "step": 1188 + }, + { + "epoch": 0.11878715220540487, + "grad_norm": 0.8540463019653248, + "learning_rate": 9.795471104000046e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.9048726260662079, + "num_tokens": 96925765.0, + "step": 1189 + }, + { + "epoch": 0.11888705729556921, + "grad_norm": 0.7783356087431899, + "learning_rate": 9.795012853073954e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.9088634252548218, + "num_tokens": 97007324.0, + "step": 1190 + }, + { + "epoch": 0.11898696238573356, + "grad_norm": 0.7212638910081224, + "learning_rate": 9.794554100105325e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.9044183194637299, + "num_tokens": 97088853.0, + "step": 1191 + }, + { + "epoch": 0.1190868674758979, + "grad_norm": 0.7510760873062999, + "learning_rate": 9.794094845142192e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.9044451415538788, + "num_tokens": 97170438.0, + "step": 1192 + }, + { + "epoch": 0.11918677256606224, + "grad_norm": 0.686724773831849, + "learning_rate": 9.793635088232638e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.9077087640762329, + "num_tokens": 97251979.0, + "step": 1193 + }, + { + "epoch": 0.11928667765622658, + "grad_norm": 1.1462998668615825, + "learning_rate": 9.793174829424801e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.9073220789432526, + "num_tokens": 97333537.0, + "step": 1194 + }, + { + "epoch": 0.11938658274639093, + "grad_norm": 0.7034222328695364, + "learning_rate": 9.792714068766872e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.9087065160274506, + "num_tokens": 97415114.0, + "step": 1195 + }, + { + "epoch": 0.11948648783655527, + "grad_norm": 0.9074778195826759, + "learning_rate": 9.79225280630709e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.9088970720767975, + "num_tokens": 97496688.0, + "step": 1196 + }, + { + "epoch": 0.11958639292671962, + "grad_norm": 0.7181326196358103, + "learning_rate": 9.791791042093752e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.9057691097259521, + "num_tokens": 97578257.0, + "step": 1197 + }, + { + "epoch": 0.11968629801688396, + "grad_norm": 0.7353746459731425, + "learning_rate": 9.791328776175204e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.9072485864162445, + "num_tokens": 97659801.0, + "step": 1198 + }, + { + "epoch": 0.11978620310704831, + "grad_norm": 1.2325727668408784, + "learning_rate": 9.790866008599846e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.906784862279892, + "num_tokens": 97741401.0, + "step": 1199 + }, + { + "epoch": 0.11988610819721265, + "grad_norm": 1.0638243211003462, + "learning_rate": 9.790402739416131e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.9074948728084564, + "num_tokens": 97822909.0, + "step": 1200 + }, + { + "epoch": 0.11998601328737699, + "grad_norm": 0.8675627992855778, + "learning_rate": 9.789938968672562e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.9069330990314484, + "num_tokens": 97904405.0, + "step": 1201 + }, + { + "epoch": 0.12008591837754133, + "grad_norm": 0.6624589403861842, + "learning_rate": 9.789474696417698e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.9083499610424042, + "num_tokens": 97985886.0, + "step": 1202 + }, + { + "epoch": 0.12018582346770568, + "grad_norm": 0.7311042327333594, + "learning_rate": 9.789009922700147e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.9082706868648529, + "num_tokens": 98067503.0, + "step": 1203 + }, + { + "epoch": 0.12028572855787002, + "grad_norm": 0.7505730113791632, + "learning_rate": 9.788544647568574e-06, + "loss": 0.504, + "mean_token_accuracy": 0.90485680103302, + "num_tokens": 98148994.0, + "step": 1204 + }, + { + "epoch": 0.12038563364803437, + "grad_norm": 0.6369325428939236, + "learning_rate": 9.788078871071688e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.9052804708480835, + "num_tokens": 98230425.0, + "step": 1205 + }, + { + "epoch": 0.12048553873819871, + "grad_norm": 0.6539268180323499, + "learning_rate": 9.787612593258265e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.9060919880867004, + "num_tokens": 98311986.0, + "step": 1206 + }, + { + "epoch": 0.12058544382836306, + "grad_norm": 0.7160960525303196, + "learning_rate": 9.787145814177118e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.9040926992893219, + "num_tokens": 98393512.0, + "step": 1207 + }, + { + "epoch": 0.1206853489185274, + "grad_norm": 0.8105501343141798, + "learning_rate": 9.78667853387712e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.9092881083488464, + "num_tokens": 98475107.0, + "step": 1208 + }, + { + "epoch": 0.12078525400869174, + "grad_norm": 0.682931397315781, + "learning_rate": 9.786210752407199e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.9089848399162292, + "num_tokens": 98556643.0, + "step": 1209 + }, + { + "epoch": 0.12088515909885608, + "grad_norm": 0.6239995487643722, + "learning_rate": 9.78574246981633e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.9045402109622955, + "num_tokens": 98638140.0, + "step": 1210 + }, + { + "epoch": 0.12098506418902043, + "grad_norm": 0.7231453492378768, + "learning_rate": 9.785273686153542e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.9069879353046417, + "num_tokens": 98719733.0, + "step": 1211 + }, + { + "epoch": 0.12108496927918477, + "grad_norm": 0.5676708249923522, + "learning_rate": 9.784804401467917e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.9059860706329346, + "num_tokens": 98801222.0, + "step": 1212 + }, + { + "epoch": 0.12118487436934912, + "grad_norm": 0.6221557454375414, + "learning_rate": 9.784334615808592e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.9045269191265106, + "num_tokens": 98882738.0, + "step": 1213 + }, + { + "epoch": 0.12128477945951346, + "grad_norm": 1.008139480531632, + "learning_rate": 9.783864329224752e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.9065324068069458, + "num_tokens": 98964260.0, + "step": 1214 + }, + { + "epoch": 0.12138468454967781, + "grad_norm": 0.6467289502123987, + "learning_rate": 9.783393541765639e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.9031903743743896, + "num_tokens": 99045774.0, + "step": 1215 + }, + { + "epoch": 0.12148458963984216, + "grad_norm": 0.6390982136180832, + "learning_rate": 9.782922253480538e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.9077388346195221, + "num_tokens": 99127388.0, + "step": 1216 + }, + { + "epoch": 0.12158449473000649, + "grad_norm": 0.6310121441175185, + "learning_rate": 9.782450464418802e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.9071532487869263, + "num_tokens": 99208935.0, + "step": 1217 + }, + { + "epoch": 0.12168439982017083, + "grad_norm": 0.6644951336682486, + "learning_rate": 9.781978174629822e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.9078508019447327, + "num_tokens": 99290447.0, + "step": 1218 + }, + { + "epoch": 0.12178430491033518, + "grad_norm": 0.6127243183798436, + "learning_rate": 9.78150538416305e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.9078642427921295, + "num_tokens": 99371986.0, + "step": 1219 + }, + { + "epoch": 0.12188421000049952, + "grad_norm": 0.8340130151636059, + "learning_rate": 9.781032093067987e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.9061567187309265, + "num_tokens": 99453490.0, + "step": 1220 + }, + { + "epoch": 0.12198411509066387, + "grad_norm": 0.7000730438906227, + "learning_rate": 9.780558301394187e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.9051883518695831, + "num_tokens": 99534979.0, + "step": 1221 + }, + { + "epoch": 0.12208402018082821, + "grad_norm": 0.6544419333163549, + "learning_rate": 9.780084009191255e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.9055204391479492, + "num_tokens": 99616527.0, + "step": 1222 + }, + { + "epoch": 0.12218392527099256, + "grad_norm": 0.7591321563642536, + "learning_rate": 9.779609216508852e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.9055610597133636, + "num_tokens": 99698076.0, + "step": 1223 + }, + { + "epoch": 0.1222838303611569, + "grad_norm": 0.7485740805259224, + "learning_rate": 9.779133923396689e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.9076454639434814, + "num_tokens": 99779608.0, + "step": 1224 + }, + { + "epoch": 0.12238373545132125, + "grad_norm": 0.7026663549328794, + "learning_rate": 9.778658129904529e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.9094335734844208, + "num_tokens": 99861071.0, + "step": 1225 + }, + { + "epoch": 0.12248364054148558, + "grad_norm": 0.6647382833611647, + "learning_rate": 9.778181836082185e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.9073659181594849, + "num_tokens": 99942655.0, + "step": 1226 + }, + { + "epoch": 0.12258354563164993, + "grad_norm": 0.7088019714265772, + "learning_rate": 9.777705041979532e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.9085866808891296, + "num_tokens": 100024203.0, + "step": 1227 + }, + { + "epoch": 0.12268345072181427, + "grad_norm": 0.6727317379820761, + "learning_rate": 9.777227747646488e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.9065841138362885, + "num_tokens": 100105672.0, + "step": 1228 + }, + { + "epoch": 0.12278335581197862, + "grad_norm": 0.8298845736222682, + "learning_rate": 9.776749953133022e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.9073039591312408, + "num_tokens": 100187164.0, + "step": 1229 + }, + { + "epoch": 0.12288326090214297, + "grad_norm": 0.7505041247966501, + "learning_rate": 9.776271658489165e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.9055005609989166, + "num_tokens": 100268708.0, + "step": 1230 + }, + { + "epoch": 0.12298316599230731, + "grad_norm": 0.5638916569772912, + "learning_rate": 9.775792863764992e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.9079030454158783, + "num_tokens": 100350243.0, + "step": 1231 + }, + { + "epoch": 0.12308307108247166, + "grad_norm": 0.6675845146807606, + "learning_rate": 9.775313569010635e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.9066998660564423, + "num_tokens": 100431899.0, + "step": 1232 + }, + { + "epoch": 0.123182976172636, + "grad_norm": 0.7211214246930827, + "learning_rate": 9.774833774276278e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.9031629860401154, + "num_tokens": 100513360.0, + "step": 1233 + }, + { + "epoch": 0.12328288126280033, + "grad_norm": 0.550867883196782, + "learning_rate": 9.774353479612151e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.9060030579566956, + "num_tokens": 100594863.0, + "step": 1234 + }, + { + "epoch": 0.12338278635296468, + "grad_norm": 0.5627677121778478, + "learning_rate": 9.773872685068543e-06, + "loss": 0.497, + "mean_token_accuracy": 0.9061300158500671, + "num_tokens": 100676470.0, + "step": 1235 + }, + { + "epoch": 0.12348269144312903, + "grad_norm": 0.7023421383423956, + "learning_rate": 9.7733913906958e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.9065177142620087, + "num_tokens": 100757920.0, + "step": 1236 + }, + { + "epoch": 0.12358259653329337, + "grad_norm": 0.6540092931126398, + "learning_rate": 9.772909596544304e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.9090960621833801, + "num_tokens": 100839542.0, + "step": 1237 + }, + { + "epoch": 0.12368250162345772, + "grad_norm": 1.0918594269712547, + "learning_rate": 9.772427302664507e-06, + "loss": 0.502, + "mean_token_accuracy": 0.9036728143692017, + "num_tokens": 100921065.0, + "step": 1238 + }, + { + "epoch": 0.12378240671362206, + "grad_norm": 0.7156902619006278, + "learning_rate": 9.7719445091069e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.9085580408573151, + "num_tokens": 101002646.0, + "step": 1239 + }, + { + "epoch": 0.12388231180378641, + "grad_norm": 0.9056280229130068, + "learning_rate": 9.771461215922037e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.9063440263271332, + "num_tokens": 101084122.0, + "step": 1240 + }, + { + "epoch": 0.12398221689395075, + "grad_norm": 0.6034228207219833, + "learning_rate": 9.770977423160517e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.9075508415699005, + "num_tokens": 101165704.0, + "step": 1241 + }, + { + "epoch": 0.12408212198411508, + "grad_norm": 0.744774009263216, + "learning_rate": 9.770493130872992e-06, + "loss": 0.5, + "mean_token_accuracy": 0.9063694179058075, + "num_tokens": 101247248.0, + "step": 1242 + }, + { + "epoch": 0.12418202707427943, + "grad_norm": 0.7840185744917512, + "learning_rate": 9.77000833911017e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.90620556473732, + "num_tokens": 101328830.0, + "step": 1243 + }, + { + "epoch": 0.12428193216444378, + "grad_norm": 0.6559596454538561, + "learning_rate": 9.76952304792281e-06, + "loss": 0.493, + "mean_token_accuracy": 0.9077709913253784, + "num_tokens": 101410426.0, + "step": 1244 + }, + { + "epoch": 0.12438183725460812, + "grad_norm": 0.6797366974652568, + "learning_rate": 9.76903725736172e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.9092218577861786, + "num_tokens": 101492047.0, + "step": 1245 + }, + { + "epoch": 0.12448174234477247, + "grad_norm": 0.7789065663055861, + "learning_rate": 9.768550967477763e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9059571325778961, + "num_tokens": 101573597.0, + "step": 1246 + }, + { + "epoch": 0.12458164743493681, + "grad_norm": 0.6548992372546247, + "learning_rate": 9.768064178321857e-06, + "loss": 0.496, + "mean_token_accuracy": 0.9067213833332062, + "num_tokens": 101655158.0, + "step": 1247 + }, + { + "epoch": 0.12468155252510116, + "grad_norm": 2.0061218559258256, + "learning_rate": 9.767576889944965e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.9043682217597961, + "num_tokens": 101736642.0, + "step": 1248 + }, + { + "epoch": 0.1247814576152655, + "grad_norm": 0.772928902853369, + "learning_rate": 9.767089102398111e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.9087446928024292, + "num_tokens": 101818201.0, + "step": 1249 + }, + { + "epoch": 0.12488136270542984, + "grad_norm": 0.630397561944058, + "learning_rate": 9.766600815732363e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.9086881875991821, + "num_tokens": 101899789.0, + "step": 1250 + }, + { + "epoch": 0.12498126779559418, + "grad_norm": 0.7070274901945883, + "learning_rate": 9.766112029998847e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.9081604182720184, + "num_tokens": 101981269.0, + "step": 1251 + }, + { + "epoch": 0.12508117288575854, + "grad_norm": 0.7304105880034202, + "learning_rate": 9.765622745248739e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.9088583588600159, + "num_tokens": 102062815.0, + "step": 1252 + }, + { + "epoch": 0.12518107797592287, + "grad_norm": 0.6779018927460356, + "learning_rate": 9.765132961533269e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.9051710069179535, + "num_tokens": 102144314.0, + "step": 1253 + }, + { + "epoch": 0.1252809830660872, + "grad_norm": 0.8704192405359217, + "learning_rate": 9.764642678903714e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.903333991765976, + "num_tokens": 102225757.0, + "step": 1254 + }, + { + "epoch": 0.12538088815625156, + "grad_norm": 0.9045153381130167, + "learning_rate": 9.76415189741141e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.9042267799377441, + "num_tokens": 102307241.0, + "step": 1255 + }, + { + "epoch": 0.1254807932464159, + "grad_norm": 0.9411677462882491, + "learning_rate": 9.763660617107744e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.9043418765068054, + "num_tokens": 102388652.0, + "step": 1256 + }, + { + "epoch": 0.12558069833658025, + "grad_norm": 0.7142275284026695, + "learning_rate": 9.76316883804415e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.9031099379062653, + "num_tokens": 102470169.0, + "step": 1257 + }, + { + "epoch": 0.12568060342674459, + "grad_norm": 0.6584602650841004, + "learning_rate": 9.762676560272118e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.9067963063716888, + "num_tokens": 102551655.0, + "step": 1258 + }, + { + "epoch": 0.12578050851690895, + "grad_norm": 0.8619493092507455, + "learning_rate": 9.762183783843191e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.9067312180995941, + "num_tokens": 102633112.0, + "step": 1259 + }, + { + "epoch": 0.12588041360707328, + "grad_norm": 0.7567304251955674, + "learning_rate": 9.761690508808966e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.9065475761890411, + "num_tokens": 102714680.0, + "step": 1260 + }, + { + "epoch": 0.12598031869723764, + "grad_norm": 0.627214885406523, + "learning_rate": 9.761196735221083e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.904145747423172, + "num_tokens": 102796072.0, + "step": 1261 + }, + { + "epoch": 0.12608022378740197, + "grad_norm": 0.6156731100614481, + "learning_rate": 9.760702463131247e-06, + "loss": 0.498, + "mean_token_accuracy": 0.9075727760791779, + "num_tokens": 102877661.0, + "step": 1262 + }, + { + "epoch": 0.1261801288775663, + "grad_norm": 0.5843804377124372, + "learning_rate": 9.760207692591207e-06, + "loss": 0.497, + "mean_token_accuracy": 0.9072892367839813, + "num_tokens": 102959132.0, + "step": 1263 + }, + { + "epoch": 0.12628003396773066, + "grad_norm": 0.6102595741538936, + "learning_rate": 9.759712423652761e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.9048740565776825, + "num_tokens": 103040699.0, + "step": 1264 + }, + { + "epoch": 0.126379939057895, + "grad_norm": 0.8105852940254433, + "learning_rate": 9.75921665636777e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.9077780246734619, + "num_tokens": 103122132.0, + "step": 1265 + }, + { + "epoch": 0.12647984414805935, + "grad_norm": 0.6893319425144957, + "learning_rate": 9.758720390788139e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.9063253402709961, + "num_tokens": 103203613.0, + "step": 1266 + }, + { + "epoch": 0.12657974923822368, + "grad_norm": 0.7974326407711484, + "learning_rate": 9.758223626965828e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.9079168438911438, + "num_tokens": 103285039.0, + "step": 1267 + }, + { + "epoch": 0.12667965432838804, + "grad_norm": 1.0678316763819962, + "learning_rate": 9.757726364952849e-06, + "loss": 0.5, + "mean_token_accuracy": 0.9061545133590698, + "num_tokens": 103366553.0, + "step": 1268 + }, + { + "epoch": 0.12677955941855237, + "grad_norm": 0.6095406143673531, + "learning_rate": 9.757228604801266e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.9062853157520294, + "num_tokens": 103448067.0, + "step": 1269 + }, + { + "epoch": 0.12687946450871673, + "grad_norm": 1.5349917049796267, + "learning_rate": 9.756730346563193e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.9024520814418793, + "num_tokens": 103529646.0, + "step": 1270 + }, + { + "epoch": 0.12697936959888106, + "grad_norm": 0.9403931735967025, + "learning_rate": 9.7562315902908e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.9065779745578766, + "num_tokens": 103611120.0, + "step": 1271 + }, + { + "epoch": 0.1270792746890454, + "grad_norm": 0.8311866784058537, + "learning_rate": 9.755732336036306e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.9080473184585571, + "num_tokens": 103692716.0, + "step": 1272 + }, + { + "epoch": 0.12717917977920976, + "grad_norm": 1.0063862359551896, + "learning_rate": 9.755232583851986e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.9039958417415619, + "num_tokens": 103774210.0, + "step": 1273 + }, + { + "epoch": 0.1272790848693741, + "grad_norm": 0.6719805829117597, + "learning_rate": 9.754732333790161e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.904809296131134, + "num_tokens": 103855742.0, + "step": 1274 + }, + { + "epoch": 0.12737898995953845, + "grad_norm": 0.7484692390849161, + "learning_rate": 9.754231585903208e-06, + "loss": 0.502, + "mean_token_accuracy": 0.9049063920974731, + "num_tokens": 103937237.0, + "step": 1275 + }, + { + "epoch": 0.12747889504970278, + "grad_norm": 0.7623103912086947, + "learning_rate": 9.75373034024356e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.9080223143100739, + "num_tokens": 104018664.0, + "step": 1276 + }, + { + "epoch": 0.12757880013986714, + "grad_norm": 0.6722740836665069, + "learning_rate": 9.753228596863694e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.9078520238399506, + "num_tokens": 104100180.0, + "step": 1277 + }, + { + "epoch": 0.12767870523003147, + "grad_norm": 1.1181722534145666, + "learning_rate": 9.752726355816144e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.9074711501598358, + "num_tokens": 104181687.0, + "step": 1278 + }, + { + "epoch": 0.1277786103201958, + "grad_norm": 0.9574269335906229, + "learning_rate": 9.752223617153495e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.9052344560623169, + "num_tokens": 104263128.0, + "step": 1279 + }, + { + "epoch": 0.12787851541036016, + "grad_norm": 1.0747383501043748, + "learning_rate": 9.751720380928384e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.9015457630157471, + "num_tokens": 104344625.0, + "step": 1280 + }, + { + "epoch": 0.1279784205005245, + "grad_norm": 0.7869158984454091, + "learning_rate": 9.751216647193502e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.9065250158309937, + "num_tokens": 104426100.0, + "step": 1281 + }, + { + "epoch": 0.12807832559068885, + "grad_norm": 1.1305720751744193, + "learning_rate": 9.750712416001588e-06, + "loss": 0.501, + "mean_token_accuracy": 0.906618744134903, + "num_tokens": 104507597.0, + "step": 1282 + }, + { + "epoch": 0.12817823068085318, + "grad_norm": 0.9181133211476501, + "learning_rate": 9.750207687405437e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.9038874804973602, + "num_tokens": 104589118.0, + "step": 1283 + }, + { + "epoch": 0.12827813577101754, + "grad_norm": 1.1160352401281912, + "learning_rate": 9.749702461457895e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.9079866409301758, + "num_tokens": 104670710.0, + "step": 1284 + }, + { + "epoch": 0.12837804086118187, + "grad_norm": 0.620174099379847, + "learning_rate": 9.749196738211859e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.9061897993087769, + "num_tokens": 104752195.0, + "step": 1285 + }, + { + "epoch": 0.12847794595134623, + "grad_norm": 1.131886523573295, + "learning_rate": 9.748690517720278e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.909242182970047, + "num_tokens": 104833755.0, + "step": 1286 + }, + { + "epoch": 0.12857785104151057, + "grad_norm": 0.827220551187744, + "learning_rate": 9.748183800036154e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.9063671231269836, + "num_tokens": 104915277.0, + "step": 1287 + }, + { + "epoch": 0.1286777561316749, + "grad_norm": 1.533515071996982, + "learning_rate": 9.747676585212542e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.9057876467704773, + "num_tokens": 104996820.0, + "step": 1288 + }, + { + "epoch": 0.12877766122183926, + "grad_norm": 0.909539427976147, + "learning_rate": 9.747168873302545e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.9042758047580719, + "num_tokens": 105078309.0, + "step": 1289 + }, + { + "epoch": 0.1288775663120036, + "grad_norm": 0.6740005293545123, + "learning_rate": 9.746660664359326e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.9075241684913635, + "num_tokens": 105159845.0, + "step": 1290 + }, + { + "epoch": 0.12897747140216795, + "grad_norm": 1.9522527361757747, + "learning_rate": 9.74615195843609e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.9068134725093842, + "num_tokens": 105241370.0, + "step": 1291 + }, + { + "epoch": 0.12907737649233228, + "grad_norm": 0.9208195169494607, + "learning_rate": 9.745642755586102e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.9067543148994446, + "num_tokens": 105322900.0, + "step": 1292 + }, + { + "epoch": 0.12917728158249664, + "grad_norm": 0.878373392632664, + "learning_rate": 9.745133055862676e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.9061780273914337, + "num_tokens": 105404473.0, + "step": 1293 + }, + { + "epoch": 0.12927718667266097, + "grad_norm": 1.0841036512097701, + "learning_rate": 9.744622859319175e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.9052906930446625, + "num_tokens": 105486021.0, + "step": 1294 + }, + { + "epoch": 0.1293770917628253, + "grad_norm": 0.6108564095456593, + "learning_rate": 9.744112166009022e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.9063214361667633, + "num_tokens": 105567582.0, + "step": 1295 + }, + { + "epoch": 0.12947699685298966, + "grad_norm": 1.60725007015252, + "learning_rate": 9.743600975985681e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.9073498249053955, + "num_tokens": 105649045.0, + "step": 1296 + }, + { + "epoch": 0.129576901943154, + "grad_norm": 0.7005353737395502, + "learning_rate": 9.74308928930268e-06, + "loss": 0.5, + "mean_token_accuracy": 0.9086948335170746, + "num_tokens": 105730522.0, + "step": 1297 + }, + { + "epoch": 0.12967680703331835, + "grad_norm": 0.7191903991696673, + "learning_rate": 9.74257710601359e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.9035830795764923, + "num_tokens": 105812017.0, + "step": 1298 + }, + { + "epoch": 0.12977671212348268, + "grad_norm": 1.0313462027308264, + "learning_rate": 9.742064426172035e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.9035681784152985, + "num_tokens": 105893580.0, + "step": 1299 + }, + { + "epoch": 0.12987661721364704, + "grad_norm": 0.6689132912776992, + "learning_rate": 9.7415512498317e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.9082794189453125, + "num_tokens": 105975191.0, + "step": 1300 + }, + { + "epoch": 0.12997652230381138, + "grad_norm": 0.914657101302499, + "learning_rate": 9.741037577046308e-06, + "loss": 0.497, + "mean_token_accuracy": 0.9071328043937683, + "num_tokens": 106056789.0, + "step": 1301 + }, + { + "epoch": 0.13007642739397574, + "grad_norm": 0.7042903356259657, + "learning_rate": 9.740523407869643e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.906407505273819, + "num_tokens": 106138294.0, + "step": 1302 + }, + { + "epoch": 0.13017633248414007, + "grad_norm": 0.821895290791496, + "learning_rate": 9.740008742355542e-06, + "loss": 0.499, + "mean_token_accuracy": 0.905183881521225, + "num_tokens": 106219814.0, + "step": 1303 + }, + { + "epoch": 0.1302762375743044, + "grad_norm": 0.8796338851071908, + "learning_rate": 9.739493580557888e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.9043141603469849, + "num_tokens": 106301326.0, + "step": 1304 + }, + { + "epoch": 0.13037614266446876, + "grad_norm": 0.681581445662117, + "learning_rate": 9.738977922530618e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.9044926762580872, + "num_tokens": 106382787.0, + "step": 1305 + }, + { + "epoch": 0.1304760477546331, + "grad_norm": 0.7316693076155384, + "learning_rate": 9.738461768327725e-06, + "loss": 0.503, + "mean_token_accuracy": 0.9074257612228394, + "num_tokens": 106464303.0, + "step": 1306 + }, + { + "epoch": 0.13057595284479745, + "grad_norm": 0.8474006672978414, + "learning_rate": 9.73794511800325e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9057268500328064, + "num_tokens": 106545825.0, + "step": 1307 + }, + { + "epoch": 0.13067585793496178, + "grad_norm": 0.9399842074210455, + "learning_rate": 9.737427971611287e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.9079825282096863, + "num_tokens": 106627417.0, + "step": 1308 + }, + { + "epoch": 0.13077576302512614, + "grad_norm": 0.7354966646253079, + "learning_rate": 9.73691032920598e-06, + "loss": 0.504, + "mean_token_accuracy": 0.9067693650722504, + "num_tokens": 106708907.0, + "step": 1309 + }, + { + "epoch": 0.13087566811529047, + "grad_norm": 0.5540272743346917, + "learning_rate": 9.736392190841526e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.9079764187335968, + "num_tokens": 106790457.0, + "step": 1310 + }, + { + "epoch": 0.13097557320545483, + "grad_norm": 0.8040472344814403, + "learning_rate": 9.735873556572177e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.9048062860965729, + "num_tokens": 106871988.0, + "step": 1311 + }, + { + "epoch": 0.13107547829561916, + "grad_norm": 0.7415677972395768, + "learning_rate": 9.735354426452235e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.9079431295394897, + "num_tokens": 106953536.0, + "step": 1312 + }, + { + "epoch": 0.1311753833857835, + "grad_norm": 0.8595488278592823, + "learning_rate": 9.734834800536053e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.907467395067215, + "num_tokens": 107035114.0, + "step": 1313 + }, + { + "epoch": 0.13127528847594785, + "grad_norm": 0.7613280672983486, + "learning_rate": 9.734314678878033e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.904662698507309, + "num_tokens": 107116601.0, + "step": 1314 + }, + { + "epoch": 0.1313751935661122, + "grad_norm": 1.200506677435648, + "learning_rate": 9.733794061532636e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.9099856615066528, + "num_tokens": 107198113.0, + "step": 1315 + }, + { + "epoch": 0.13147509865627655, + "grad_norm": 1.0419717447582975, + "learning_rate": 9.73327294855437e-06, + "loss": 0.502, + "mean_token_accuracy": 0.9027813374996185, + "num_tokens": 107279659.0, + "step": 1316 + }, + { + "epoch": 0.13157500374644088, + "grad_norm": 0.8233965513494116, + "learning_rate": 9.732751339997795e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.9043533205986023, + "num_tokens": 107361183.0, + "step": 1317 + }, + { + "epoch": 0.13167490883660524, + "grad_norm": 1.7697830444827458, + "learning_rate": 9.732229235917526e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.9063107371330261, + "num_tokens": 107442645.0, + "step": 1318 + }, + { + "epoch": 0.13177481392676957, + "grad_norm": 1.1351643425908375, + "learning_rate": 9.731706636368228e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.9053715169429779, + "num_tokens": 107524254.0, + "step": 1319 + }, + { + "epoch": 0.1318747190169339, + "grad_norm": 1.1276757418827574, + "learning_rate": 9.731183541404615e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.9056498110294342, + "num_tokens": 107605720.0, + "step": 1320 + }, + { + "epoch": 0.13197462410709826, + "grad_norm": 0.8295637615235804, + "learning_rate": 9.730659951081456e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.9042490124702454, + "num_tokens": 107687183.0, + "step": 1321 + }, + { + "epoch": 0.1320745291972626, + "grad_norm": 0.794808126176436, + "learning_rate": 9.730135865453572e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.9069148600101471, + "num_tokens": 107768888.0, + "step": 1322 + }, + { + "epoch": 0.13217443428742695, + "grad_norm": 0.6804367272513717, + "learning_rate": 9.729611284575837e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.9057699739933014, + "num_tokens": 107850465.0, + "step": 1323 + }, + { + "epoch": 0.13227433937759128, + "grad_norm": 1.0782784713802078, + "learning_rate": 9.729086208503174e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.9084301292896271, + "num_tokens": 107932051.0, + "step": 1324 + }, + { + "epoch": 0.13237424446775564, + "grad_norm": 0.7203210885338988, + "learning_rate": 9.728560637290558e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.905397355556488, + "num_tokens": 108013524.0, + "step": 1325 + }, + { + "epoch": 0.13247414955791997, + "grad_norm": 1.0650796090466195, + "learning_rate": 9.72803457099302e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.9079259037971497, + "num_tokens": 108095112.0, + "step": 1326 + }, + { + "epoch": 0.13257405464808433, + "grad_norm": 1.028903504236438, + "learning_rate": 9.727508009665633e-06, + "loss": 0.505, + "mean_token_accuracy": 0.905540406703949, + "num_tokens": 108176611.0, + "step": 1327 + }, + { + "epoch": 0.13267395973824866, + "grad_norm": 0.7829976917610578, + "learning_rate": 9.726980953363536e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.9055318832397461, + "num_tokens": 108258116.0, + "step": 1328 + }, + { + "epoch": 0.132773864828413, + "grad_norm": 0.7863262459375544, + "learning_rate": 9.726453402141906e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.9054579436779022, + "num_tokens": 108339638.0, + "step": 1329 + }, + { + "epoch": 0.13287376991857736, + "grad_norm": 1.4452695718031594, + "learning_rate": 9.725925356055984e-06, + "loss": 0.501, + "mean_token_accuracy": 0.9070732593536377, + "num_tokens": 108421153.0, + "step": 1330 + }, + { + "epoch": 0.1329736750087417, + "grad_norm": 0.8482693314033962, + "learning_rate": 9.725396815161053e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.9083622694015503, + "num_tokens": 108502768.0, + "step": 1331 + }, + { + "epoch": 0.13307358009890605, + "grad_norm": 0.7487071011155461, + "learning_rate": 9.724867779512453e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.9077179729938507, + "num_tokens": 108584263.0, + "step": 1332 + }, + { + "epoch": 0.13317348518907038, + "grad_norm": 0.8393743367411269, + "learning_rate": 9.724338249165575e-06, + "loss": 0.488, + "mean_token_accuracy": 0.9089497327804565, + "num_tokens": 108665918.0, + "step": 1333 + }, + { + "epoch": 0.13327339027923474, + "grad_norm": 0.9884599250292125, + "learning_rate": 9.723808224175859e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.907095342874527, + "num_tokens": 108747374.0, + "step": 1334 + }, + { + "epoch": 0.13337329536939907, + "grad_norm": 0.8759602810916157, + "learning_rate": 9.723277704598803e-06, + "loss": 0.506, + "mean_token_accuracy": 0.9040351212024689, + "num_tokens": 108828832.0, + "step": 1335 + }, + { + "epoch": 0.1334732004595634, + "grad_norm": 1.8506695168171254, + "learning_rate": 9.722746690489949e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.9048590958118439, + "num_tokens": 108910276.0, + "step": 1336 + }, + { + "epoch": 0.13357310554972776, + "grad_norm": 1.2483763479307546, + "learning_rate": 9.722215181904897e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.9057656526565552, + "num_tokens": 108991779.0, + "step": 1337 + }, + { + "epoch": 0.1336730106398921, + "grad_norm": 0.9690986254616614, + "learning_rate": 9.721683178899297e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.9053566157817841, + "num_tokens": 109073195.0, + "step": 1338 + }, + { + "epoch": 0.13377291573005645, + "grad_norm": 1.0335502467800004, + "learning_rate": 9.721150681528848e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.9055180847644806, + "num_tokens": 109154805.0, + "step": 1339 + }, + { + "epoch": 0.13387282082022078, + "grad_norm": 0.6542143622644576, + "learning_rate": 9.720617689849304e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.9082152545452118, + "num_tokens": 109236353.0, + "step": 1340 + }, + { + "epoch": 0.13397272591038514, + "grad_norm": 0.8291927097981335, + "learning_rate": 9.720084203916472e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.9069214761257172, + "num_tokens": 109317866.0, + "step": 1341 + }, + { + "epoch": 0.13407263100054947, + "grad_norm": 0.8432719090944719, + "learning_rate": 9.719550223786204e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.9080021977424622, + "num_tokens": 109399441.0, + "step": 1342 + }, + { + "epoch": 0.13417253609071383, + "grad_norm": 1.5956093600742354, + "learning_rate": 9.71901574951441e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.9071390926837921, + "num_tokens": 109481005.0, + "step": 1343 + }, + { + "epoch": 0.13427244118087817, + "grad_norm": 0.8022163413562742, + "learning_rate": 9.718480781157054e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.9070532321929932, + "num_tokens": 109562512.0, + "step": 1344 + }, + { + "epoch": 0.1343723462710425, + "grad_norm": 0.8287912752539495, + "learning_rate": 9.717945318770142e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.9077359437942505, + "num_tokens": 109644032.0, + "step": 1345 + }, + { + "epoch": 0.13447225136120686, + "grad_norm": 0.8178863181240876, + "learning_rate": 9.71740936240974e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.9051669836044312, + "num_tokens": 109725649.0, + "step": 1346 + }, + { + "epoch": 0.1345721564513712, + "grad_norm": 0.7342701196135089, + "learning_rate": 9.716872912131964e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.9035272896289825, + "num_tokens": 109807171.0, + "step": 1347 + }, + { + "epoch": 0.13467206154153555, + "grad_norm": 0.6577944302212836, + "learning_rate": 9.716335967992979e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.9059602916240692, + "num_tokens": 109888713.0, + "step": 1348 + }, + { + "epoch": 0.13477196663169988, + "grad_norm": 0.8084826421462306, + "learning_rate": 9.715798530049006e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.9062201678752899, + "num_tokens": 109970310.0, + "step": 1349 + }, + { + "epoch": 0.13487187172186424, + "grad_norm": 0.8765776797511882, + "learning_rate": 9.71526059835631e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.9033644199371338, + "num_tokens": 110051818.0, + "step": 1350 + }, + { + "epoch": 0.13497177681202857, + "grad_norm": 0.8425476868276589, + "learning_rate": 9.71472217297122e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.9061749279499054, + "num_tokens": 110133282.0, + "step": 1351 + }, + { + "epoch": 0.13507168190219293, + "grad_norm": 0.7552945767849986, + "learning_rate": 9.714183253950104e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.904658168554306, + "num_tokens": 110214705.0, + "step": 1352 + }, + { + "epoch": 0.13517158699235726, + "grad_norm": 0.8301036110313101, + "learning_rate": 9.713643841349392e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.9075658321380615, + "num_tokens": 110296273.0, + "step": 1353 + }, + { + "epoch": 0.1352714920825216, + "grad_norm": 0.710566428215675, + "learning_rate": 9.713103935225559e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.9060092270374298, + "num_tokens": 110377809.0, + "step": 1354 + }, + { + "epoch": 0.13537139717268595, + "grad_norm": 0.6738931474230214, + "learning_rate": 9.712563535635131e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.9079491496086121, + "num_tokens": 110459311.0, + "step": 1355 + }, + { + "epoch": 0.13547130226285028, + "grad_norm": 0.6108458016254277, + "learning_rate": 9.712022642634691e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.9067346453666687, + "num_tokens": 110540876.0, + "step": 1356 + }, + { + "epoch": 0.13557120735301464, + "grad_norm": 1.3041997585017997, + "learning_rate": 9.711481256280872e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.9065117537975311, + "num_tokens": 110622421.0, + "step": 1357 + }, + { + "epoch": 0.13567111244317898, + "grad_norm": 0.6612957101978467, + "learning_rate": 9.710939376630356e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.9058282971382141, + "num_tokens": 110703908.0, + "step": 1358 + }, + { + "epoch": 0.13577101753334334, + "grad_norm": 0.7983920908701093, + "learning_rate": 9.710397003739879e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.90518918633461, + "num_tokens": 110785405.0, + "step": 1359 + }, + { + "epoch": 0.13587092262350767, + "grad_norm": 1.5159870523574162, + "learning_rate": 9.709854137666228e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.906999945640564, + "num_tokens": 110866936.0, + "step": 1360 + }, + { + "epoch": 0.135970827713672, + "grad_norm": 0.67949107816947, + "learning_rate": 9.709310778466241e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.9040016829967499, + "num_tokens": 110948453.0, + "step": 1361 + }, + { + "epoch": 0.13607073280383636, + "grad_norm": 0.7379662053616239, + "learning_rate": 9.708766926196809e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.904349148273468, + "num_tokens": 111029978.0, + "step": 1362 + }, + { + "epoch": 0.1361706378940007, + "grad_norm": 0.6509912311727327, + "learning_rate": 9.708222580914872e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.9042805433273315, + "num_tokens": 111111581.0, + "step": 1363 + }, + { + "epoch": 0.13627054298416505, + "grad_norm": 0.758071568408513, + "learning_rate": 9.707677742677427e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.9102345108985901, + "num_tokens": 111193174.0, + "step": 1364 + }, + { + "epoch": 0.13637044807432938, + "grad_norm": 0.6420280369959583, + "learning_rate": 9.707132411541516e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.9083296060562134, + "num_tokens": 111274632.0, + "step": 1365 + }, + { + "epoch": 0.13647035316449374, + "grad_norm": 0.711867503021035, + "learning_rate": 9.706586587564236e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.9066992998123169, + "num_tokens": 111356127.0, + "step": 1366 + }, + { + "epoch": 0.13657025825465807, + "grad_norm": 0.6302016742072122, + "learning_rate": 9.706040270802736e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.9046351909637451, + "num_tokens": 111437604.0, + "step": 1367 + }, + { + "epoch": 0.13667016334482243, + "grad_norm": 0.7592065854172666, + "learning_rate": 9.705493461314217e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.9057660698890686, + "num_tokens": 111519085.0, + "step": 1368 + }, + { + "epoch": 0.13677006843498676, + "grad_norm": 0.621065524597578, + "learning_rate": 9.70494615915593e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.9090993702411652, + "num_tokens": 111600645.0, + "step": 1369 + }, + { + "epoch": 0.1368699735251511, + "grad_norm": 1.0537001134798079, + "learning_rate": 9.704398364385177e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.9073980748653412, + "num_tokens": 111682179.0, + "step": 1370 + }, + { + "epoch": 0.13696987861531545, + "grad_norm": 0.642776147660109, + "learning_rate": 9.703850077059314e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.9063782095909119, + "num_tokens": 111763692.0, + "step": 1371 + }, + { + "epoch": 0.1370697837054798, + "grad_norm": 1.702636327855337, + "learning_rate": 9.703301297235745e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.905548632144928, + "num_tokens": 111845160.0, + "step": 1372 + }, + { + "epoch": 0.13716968879564415, + "grad_norm": 0.6731228185064765, + "learning_rate": 9.702752024971929e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.9075610339641571, + "num_tokens": 111926673.0, + "step": 1373 + }, + { + "epoch": 0.13726959388580848, + "grad_norm": 0.6404401116674161, + "learning_rate": 9.702202260325377e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.9077689349651337, + "num_tokens": 112008138.0, + "step": 1374 + }, + { + "epoch": 0.13736949897597284, + "grad_norm": 0.6777404252653779, + "learning_rate": 9.701652003353648e-06, + "loss": 0.496, + "mean_token_accuracy": 0.906796008348465, + "num_tokens": 112089699.0, + "step": 1375 + }, + { + "epoch": 0.13746940406613717, + "grad_norm": 0.8032944071662049, + "learning_rate": 9.701101254114354e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.9060111343860626, + "num_tokens": 112171214.0, + "step": 1376 + }, + { + "epoch": 0.1375693091563015, + "grad_norm": 0.7640646220898059, + "learning_rate": 9.70055001266516e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.904045045375824, + "num_tokens": 112252681.0, + "step": 1377 + }, + { + "epoch": 0.13766921424646586, + "grad_norm": 0.6335650163376447, + "learning_rate": 9.699998279063783e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.9062358140945435, + "num_tokens": 112334232.0, + "step": 1378 + }, + { + "epoch": 0.1377691193366302, + "grad_norm": 0.5917771887767307, + "learning_rate": 9.699446053367985e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.9075534045696259, + "num_tokens": 112415785.0, + "step": 1379 + }, + { + "epoch": 0.13786902442679455, + "grad_norm": 0.5853377595577962, + "learning_rate": 9.698893335635591e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.9074271619319916, + "num_tokens": 112497341.0, + "step": 1380 + }, + { + "epoch": 0.13796892951695888, + "grad_norm": 0.9026118929095598, + "learning_rate": 9.698340125924468e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.9065994620323181, + "num_tokens": 112578925.0, + "step": 1381 + }, + { + "epoch": 0.13806883460712324, + "grad_norm": 0.6062631259709061, + "learning_rate": 9.697786424292536e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.9067765772342682, + "num_tokens": 112660544.0, + "step": 1382 + }, + { + "epoch": 0.13816873969728757, + "grad_norm": 0.7635075087209918, + "learning_rate": 9.69723223079777e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.9061564803123474, + "num_tokens": 112742081.0, + "step": 1383 + }, + { + "epoch": 0.13826864478745193, + "grad_norm": 0.7910447512589411, + "learning_rate": 9.696677545498195e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.9064647257328033, + "num_tokens": 112823580.0, + "step": 1384 + }, + { + "epoch": 0.13836854987761626, + "grad_norm": 0.95969913261548, + "learning_rate": 9.696122368451887e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.9062830805778503, + "num_tokens": 112905086.0, + "step": 1385 + }, + { + "epoch": 0.1384684549677806, + "grad_norm": 1.1549114745170173, + "learning_rate": 9.695566699716971e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.906199038028717, + "num_tokens": 112986570.0, + "step": 1386 + }, + { + "epoch": 0.13856836005794496, + "grad_norm": 0.747864890263958, + "learning_rate": 9.695010539351631e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.9068077206611633, + "num_tokens": 113068172.0, + "step": 1387 + }, + { + "epoch": 0.1386682651481093, + "grad_norm": 0.6779021152336913, + "learning_rate": 9.694453887414093e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.9061891734600067, + "num_tokens": 113149709.0, + "step": 1388 + }, + { + "epoch": 0.13876817023827365, + "grad_norm": 0.7500138190996744, + "learning_rate": 9.69389674396264e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.9087834060192108, + "num_tokens": 113231248.0, + "step": 1389 + }, + { + "epoch": 0.13886807532843798, + "grad_norm": 0.950456756493201, + "learning_rate": 9.693339109055608e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.9065612554550171, + "num_tokens": 113312742.0, + "step": 1390 + }, + { + "epoch": 0.13896798041860234, + "grad_norm": 0.5943988311562609, + "learning_rate": 9.69278098275138e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.9063310921192169, + "num_tokens": 113394313.0, + "step": 1391 + }, + { + "epoch": 0.13906788550876667, + "grad_norm": 0.910081518980201, + "learning_rate": 9.69222236510839e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.9049428999423981, + "num_tokens": 113475801.0, + "step": 1392 + }, + { + "epoch": 0.13916779059893103, + "grad_norm": 0.7159362929751215, + "learning_rate": 9.691663256185131e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.9073740839958191, + "num_tokens": 113557387.0, + "step": 1393 + }, + { + "epoch": 0.13926769568909536, + "grad_norm": 0.6552631715718139, + "learning_rate": 9.691103656040137e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.9067184031009674, + "num_tokens": 113638842.0, + "step": 1394 + }, + { + "epoch": 0.1393676007792597, + "grad_norm": 0.8139482054969949, + "learning_rate": 9.690543564732001e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.9068774282932281, + "num_tokens": 113720385.0, + "step": 1395 + }, + { + "epoch": 0.13946750586942405, + "grad_norm": 0.8197992184211187, + "learning_rate": 9.689982982319369e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.9061779081821442, + "num_tokens": 113802008.0, + "step": 1396 + }, + { + "epoch": 0.13956741095958838, + "grad_norm": 0.7737817484119613, + "learning_rate": 9.689421908860928e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.9039307236671448, + "num_tokens": 113883492.0, + "step": 1397 + }, + { + "epoch": 0.13966731604975274, + "grad_norm": 0.7955013040837035, + "learning_rate": 9.688860344415425e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.9054079949855804, + "num_tokens": 113965037.0, + "step": 1398 + }, + { + "epoch": 0.13976722113991707, + "grad_norm": 0.8180365126569777, + "learning_rate": 9.688298289041658e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.9061659276485443, + "num_tokens": 114046552.0, + "step": 1399 + }, + { + "epoch": 0.13986712623008143, + "grad_norm": 1.1570242670838595, + "learning_rate": 9.687735742798475e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.9070099294185638, + "num_tokens": 114128072.0, + "step": 1400 + }, + { + "epoch": 0.13996703132024577, + "grad_norm": 0.6011678899140528, + "learning_rate": 9.687172705744773e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.9060656428337097, + "num_tokens": 114209545.0, + "step": 1401 + }, + { + "epoch": 0.1400669364104101, + "grad_norm": 1.1563030192615975, + "learning_rate": 9.686609177939504e-06, + "loss": 0.499, + "mean_token_accuracy": 0.9076432287693024, + "num_tokens": 114291107.0, + "step": 1402 + }, + { + "epoch": 0.14016684150057446, + "grad_norm": 0.7770852612849536, + "learning_rate": 9.686045159441669e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.9046984016895294, + "num_tokens": 114372630.0, + "step": 1403 + }, + { + "epoch": 0.1402667465907388, + "grad_norm": 0.6994042229524008, + "learning_rate": 9.685480650310319e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.9053921103477478, + "num_tokens": 114454149.0, + "step": 1404 + }, + { + "epoch": 0.14036665168090315, + "grad_norm": 1.0026760553193386, + "learning_rate": 9.684915650604566e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.9022590219974518, + "num_tokens": 114535639.0, + "step": 1405 + }, + { + "epoch": 0.14046655677106748, + "grad_norm": 0.5953936124374801, + "learning_rate": 9.684350160383557e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.906946063041687, + "num_tokens": 114617093.0, + "step": 1406 + }, + { + "epoch": 0.14056646186123184, + "grad_norm": 0.6444881631902504, + "learning_rate": 9.683784179706507e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.9071539342403412, + "num_tokens": 114698571.0, + "step": 1407 + }, + { + "epoch": 0.14066636695139617, + "grad_norm": 0.5799050247190324, + "learning_rate": 9.68321770863267e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.9080273807048798, + "num_tokens": 114780004.0, + "step": 1408 + }, + { + "epoch": 0.14076627204156053, + "grad_norm": 0.570793782643909, + "learning_rate": 9.682650747221357e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.9080200493335724, + "num_tokens": 114861544.0, + "step": 1409 + }, + { + "epoch": 0.14086617713172486, + "grad_norm": 0.6854310579897974, + "learning_rate": 9.682083295531932e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.905602365732193, + "num_tokens": 114943053.0, + "step": 1410 + }, + { + "epoch": 0.1409660822218892, + "grad_norm": 7.497511727003059, + "learning_rate": 9.681515353623806e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.9077430069446564, + "num_tokens": 115024576.0, + "step": 1411 + }, + { + "epoch": 0.14106598731205355, + "grad_norm": 0.9282045560271548, + "learning_rate": 9.68094692155644e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.9065719842910767, + "num_tokens": 115106180.0, + "step": 1412 + }, + { + "epoch": 0.14116589240221789, + "grad_norm": 0.6904572080855176, + "learning_rate": 9.680377999389355e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.9045579731464386, + "num_tokens": 115187702.0, + "step": 1413 + }, + { + "epoch": 0.14126579749238224, + "grad_norm": 1.1077152010851141, + "learning_rate": 9.679808587182113e-06, + "loss": 0.493, + "mean_token_accuracy": 0.9046159386634827, + "num_tokens": 115269292.0, + "step": 1414 + }, + { + "epoch": 0.14136570258254658, + "grad_norm": 0.7170230725118522, + "learning_rate": 9.679238684994334e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.9052697122097015, + "num_tokens": 115350767.0, + "step": 1415 + }, + { + "epoch": 0.14146560767271094, + "grad_norm": 0.7213744801958948, + "learning_rate": 9.678668292885687e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.9044034481048584, + "num_tokens": 115432243.0, + "step": 1416 + }, + { + "epoch": 0.14156551276287527, + "grad_norm": 0.6594419633398595, + "learning_rate": 9.678097410915894e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.9040981233119965, + "num_tokens": 115513694.0, + "step": 1417 + }, + { + "epoch": 0.1416654178530396, + "grad_norm": 1.2529751248915444, + "learning_rate": 9.677526039144724e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.9061657190322876, + "num_tokens": 115595292.0, + "step": 1418 + }, + { + "epoch": 0.14176532294320396, + "grad_norm": 0.8308598073710551, + "learning_rate": 9.676954177632006e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.9074156582355499, + "num_tokens": 115676804.0, + "step": 1419 + }, + { + "epoch": 0.1418652280333683, + "grad_norm": 0.918590955620998, + "learning_rate": 9.676381826437606e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.9055695831775665, + "num_tokens": 115758281.0, + "step": 1420 + }, + { + "epoch": 0.14196513312353265, + "grad_norm": 0.7184341277295947, + "learning_rate": 9.675808985621456e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.9062365591526031, + "num_tokens": 115839876.0, + "step": 1421 + }, + { + "epoch": 0.14206503821369698, + "grad_norm": 0.7683285278484344, + "learning_rate": 9.675235655243532e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.9059454798698425, + "num_tokens": 115921384.0, + "step": 1422 + }, + { + "epoch": 0.14216494330386134, + "grad_norm": 0.6737673712140323, + "learning_rate": 9.67466183536386e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.9088371694087982, + "num_tokens": 116002908.0, + "step": 1423 + }, + { + "epoch": 0.14226484839402567, + "grad_norm": 0.8684909113173525, + "learning_rate": 9.67408752604252e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.9080084264278412, + "num_tokens": 116084443.0, + "step": 1424 + }, + { + "epoch": 0.14236475348419003, + "grad_norm": 0.6917253288377697, + "learning_rate": 9.673512727339644e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.9067161679267883, + "num_tokens": 116165941.0, + "step": 1425 + }, + { + "epoch": 0.14246465857435436, + "grad_norm": 0.7046414710717708, + "learning_rate": 9.672937439315415e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.9055874943733215, + "num_tokens": 116247417.0, + "step": 1426 + }, + { + "epoch": 0.1425645636645187, + "grad_norm": 0.8101108444073111, + "learning_rate": 9.672361662030063e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.9047666788101196, + "num_tokens": 116328880.0, + "step": 1427 + }, + { + "epoch": 0.14266446875468305, + "grad_norm": 3.8370471345331043, + "learning_rate": 9.671785395543876e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.9058895111083984, + "num_tokens": 116410382.0, + "step": 1428 + }, + { + "epoch": 0.1427643738448474, + "grad_norm": 0.7077118391030408, + "learning_rate": 9.671208639917186e-06, + "loss": 0.496, + "mean_token_accuracy": 0.9081244468688965, + "num_tokens": 116491913.0, + "step": 1429 + }, + { + "epoch": 0.14286427893501175, + "grad_norm": 2.424584655651338, + "learning_rate": 9.670631395210384e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.9056679904460907, + "num_tokens": 116573460.0, + "step": 1430 + }, + { + "epoch": 0.14296418402517608, + "grad_norm": 1.2055490518489325, + "learning_rate": 9.670053661483904e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.9069258272647858, + "num_tokens": 116655101.0, + "step": 1431 + }, + { + "epoch": 0.14306408911534044, + "grad_norm": 1.4899405011655382, + "learning_rate": 9.669475438798238e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.9050225913524628, + "num_tokens": 116736585.0, + "step": 1432 + }, + { + "epoch": 0.14316399420550477, + "grad_norm": 0.9029543101353715, + "learning_rate": 9.668896727213925e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.9097737073898315, + "num_tokens": 116818291.0, + "step": 1433 + }, + { + "epoch": 0.14326389929566913, + "grad_norm": 1.3507839525739402, + "learning_rate": 9.668317526791559e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.9038884341716766, + "num_tokens": 116899777.0, + "step": 1434 + }, + { + "epoch": 0.14336380438583346, + "grad_norm": 1.0137522377347203, + "learning_rate": 9.66773783759178e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.9040345549583435, + "num_tokens": 116981296.0, + "step": 1435 + }, + { + "epoch": 0.1434637094759978, + "grad_norm": 0.9157687390107863, + "learning_rate": 9.667157659675284e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.9078940749168396, + "num_tokens": 117062774.0, + "step": 1436 + }, + { + "epoch": 0.14356361456616215, + "grad_norm": 1.1790404280215745, + "learning_rate": 9.666576993102814e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.9026411473751068, + "num_tokens": 117144306.0, + "step": 1437 + }, + { + "epoch": 0.14366351965632648, + "grad_norm": 1.4851717917209277, + "learning_rate": 9.665995837935168e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.9041498601436615, + "num_tokens": 117225793.0, + "step": 1438 + }, + { + "epoch": 0.14376342474649084, + "grad_norm": 1.119841077731183, + "learning_rate": 9.665414194233194e-06, + "loss": 0.5, + "mean_token_accuracy": 0.9064831733703613, + "num_tokens": 117307312.0, + "step": 1439 + }, + { + "epoch": 0.14386332983665517, + "grad_norm": 1.1992398039870797, + "learning_rate": 9.66483206205779e-06, + "loss": 0.505, + "mean_token_accuracy": 0.9035328328609467, + "num_tokens": 117388782.0, + "step": 1440 + }, + { + "epoch": 0.14396323492681953, + "grad_norm": 1.1947391670440648, + "learning_rate": 9.664249441469905e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.9092274606227875, + "num_tokens": 117470315.0, + "step": 1441 + }, + { + "epoch": 0.14406314001698386, + "grad_norm": 0.8777706460413772, + "learning_rate": 9.663666332530541e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.9057049453258514, + "num_tokens": 117551865.0, + "step": 1442 + }, + { + "epoch": 0.1441630451071482, + "grad_norm": 0.7167314695309905, + "learning_rate": 9.66308273530075e-06, + "loss": 0.504, + "mean_token_accuracy": 0.9070923030376434, + "num_tokens": 117633312.0, + "step": 1443 + }, + { + "epoch": 0.14426295019731256, + "grad_norm": 0.7501252833451411, + "learning_rate": 9.662498649841635e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.9036353826522827, + "num_tokens": 117714820.0, + "step": 1444 + }, + { + "epoch": 0.1443628552874769, + "grad_norm": 0.5520916473926454, + "learning_rate": 9.661914076214349e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.9077936708927155, + "num_tokens": 117796352.0, + "step": 1445 + }, + { + "epoch": 0.14446276037764125, + "grad_norm": 0.5820326767376177, + "learning_rate": 9.6613290144801e-06, + "loss": 0.492, + "mean_token_accuracy": 0.9082101285457611, + "num_tokens": 117877941.0, + "step": 1446 + }, + { + "epoch": 0.14456266546780558, + "grad_norm": 0.5441049217808696, + "learning_rate": 9.660743464700144e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.907596081495285, + "num_tokens": 117959491.0, + "step": 1447 + }, + { + "epoch": 0.14466257055796994, + "grad_norm": 0.7635577311722338, + "learning_rate": 9.660157426935785e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.9044625163078308, + "num_tokens": 118041056.0, + "step": 1448 + }, + { + "epoch": 0.14476247564813427, + "grad_norm": 0.8385546538043389, + "learning_rate": 9.659570901248388e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.9064452052116394, + "num_tokens": 118122609.0, + "step": 1449 + }, + { + "epoch": 0.14486238073829863, + "grad_norm": 0.7865910949163974, + "learning_rate": 9.658983887699359e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.9086226224899292, + "num_tokens": 118204124.0, + "step": 1450 + }, + { + "epoch": 0.14496228582846296, + "grad_norm": 1.4492594719055423, + "learning_rate": 9.658396386350157e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.9067566692829132, + "num_tokens": 118285679.0, + "step": 1451 + }, + { + "epoch": 0.1450621909186273, + "grad_norm": 0.5935482106408213, + "learning_rate": 9.657808397262297e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.9065345227718353, + "num_tokens": 118367206.0, + "step": 1452 + }, + { + "epoch": 0.14516209600879165, + "grad_norm": 0.6222936351909838, + "learning_rate": 9.657219920497343e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.9055749773979187, + "num_tokens": 118448706.0, + "step": 1453 + }, + { + "epoch": 0.14526200109895598, + "grad_norm": 1.1436834859931464, + "learning_rate": 9.656630956116905e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.9073433876037598, + "num_tokens": 118530313.0, + "step": 1454 + }, + { + "epoch": 0.14536190618912034, + "grad_norm": 0.6812412378610117, + "learning_rate": 9.656041504182651e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.909051388502121, + "num_tokens": 118611883.0, + "step": 1455 + }, + { + "epoch": 0.14546181127928468, + "grad_norm": 0.8426021219484445, + "learning_rate": 9.655451564756299e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.9041750729084015, + "num_tokens": 118693381.0, + "step": 1456 + }, + { + "epoch": 0.14556171636944903, + "grad_norm": 0.7945426278998614, + "learning_rate": 9.654861137899613e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.9058820605278015, + "num_tokens": 118774896.0, + "step": 1457 + }, + { + "epoch": 0.14566162145961337, + "grad_norm": 0.8381806917168204, + "learning_rate": 9.654270223674411e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.9058012962341309, + "num_tokens": 118856412.0, + "step": 1458 + }, + { + "epoch": 0.1457615265497777, + "grad_norm": 0.7844936011894352, + "learning_rate": 9.653678822142564e-06, + "loss": 0.502, + "mean_token_accuracy": 0.9062561094760895, + "num_tokens": 118937906.0, + "step": 1459 + }, + { + "epoch": 0.14586143163994206, + "grad_norm": 0.9299925285861665, + "learning_rate": 9.653086933365994e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.9064039587974548, + "num_tokens": 119019366.0, + "step": 1460 + }, + { + "epoch": 0.1459613367301064, + "grad_norm": 0.7192460846082844, + "learning_rate": 9.652494557406666e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.9061973392963409, + "num_tokens": 119100875.0, + "step": 1461 + }, + { + "epoch": 0.14606124182027075, + "grad_norm": 0.9004518196294927, + "learning_rate": 9.651901694326611e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.906263530254364, + "num_tokens": 119182420.0, + "step": 1462 + }, + { + "epoch": 0.14616114691043508, + "grad_norm": 0.7599400050363765, + "learning_rate": 9.651308344187895e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.9038980901241302, + "num_tokens": 119263889.0, + "step": 1463 + }, + { + "epoch": 0.14626105200059944, + "grad_norm": 0.855260127098478, + "learning_rate": 9.650714507052646e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.9052384197711945, + "num_tokens": 119345347.0, + "step": 1464 + }, + { + "epoch": 0.14636095709076377, + "grad_norm": 0.9116233545751065, + "learning_rate": 9.650120182983038e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.9067163169384003, + "num_tokens": 119426896.0, + "step": 1465 + }, + { + "epoch": 0.14646086218092813, + "grad_norm": 0.9698258152374503, + "learning_rate": 9.6495253720413e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.9049496650695801, + "num_tokens": 119508472.0, + "step": 1466 + }, + { + "epoch": 0.14656076727109246, + "grad_norm": 1.8046455467688998, + "learning_rate": 9.648930074289704e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.9076929092407227, + "num_tokens": 119590016.0, + "step": 1467 + }, + { + "epoch": 0.1466606723612568, + "grad_norm": 1.6497752085172013, + "learning_rate": 9.648334289790585e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.9081787168979645, + "num_tokens": 119671582.0, + "step": 1468 + }, + { + "epoch": 0.14676057745142115, + "grad_norm": 0.8706327521001371, + "learning_rate": 9.647738018606315e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.9062606692314148, + "num_tokens": 119753041.0, + "step": 1469 + }, + { + "epoch": 0.14686048254158549, + "grad_norm": 1.4018048673726635, + "learning_rate": 9.64714126079933e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.9063165783882141, + "num_tokens": 119834568.0, + "step": 1470 + }, + { + "epoch": 0.14696038763174984, + "grad_norm": 0.9805198584264414, + "learning_rate": 9.646544016432109e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.9049387574195862, + "num_tokens": 119916026.0, + "step": 1471 + }, + { + "epoch": 0.14706029272191418, + "grad_norm": 1.9376010442592995, + "learning_rate": 9.645946285567183e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.9038022458553314, + "num_tokens": 119997494.0, + "step": 1472 + }, + { + "epoch": 0.14716019781207854, + "grad_norm": 1.4026626465646523, + "learning_rate": 9.645348068267136e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.9086815118789673, + "num_tokens": 120079027.0, + "step": 1473 + }, + { + "epoch": 0.14726010290224287, + "grad_norm": 0.7029341711347747, + "learning_rate": 9.644749364594604e-06, + "loss": 0.499, + "mean_token_accuracy": 0.9037235975265503, + "num_tokens": 120160553.0, + "step": 1474 + }, + { + "epoch": 0.1473600079924072, + "grad_norm": 0.9097561027318273, + "learning_rate": 9.644150174612267e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.9038383960723877, + "num_tokens": 120242035.0, + "step": 1475 + }, + { + "epoch": 0.14745991308257156, + "grad_norm": 1.2234647761768795, + "learning_rate": 9.643550498382865e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.9076532125473022, + "num_tokens": 120323567.0, + "step": 1476 + }, + { + "epoch": 0.1475598181727359, + "grad_norm": 0.7666294142960106, + "learning_rate": 9.642950335969183e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.9050077795982361, + "num_tokens": 120405083.0, + "step": 1477 + }, + { + "epoch": 0.14765972326290025, + "grad_norm": 0.6970248377475016, + "learning_rate": 9.642349687434059e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.908193439245224, + "num_tokens": 120486617.0, + "step": 1478 + }, + { + "epoch": 0.14775962835306458, + "grad_norm": 0.5700654770754054, + "learning_rate": 9.64174855284038e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.9069699347019196, + "num_tokens": 120568157.0, + "step": 1479 + }, + { + "epoch": 0.14785953344322894, + "grad_norm": 0.6299021509602045, + "learning_rate": 9.641146932251088e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.9071713387966156, + "num_tokens": 120649798.0, + "step": 1480 + }, + { + "epoch": 0.14795943853339327, + "grad_norm": 0.5848743745325499, + "learning_rate": 9.640544825729173e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.9067297875881195, + "num_tokens": 120731510.0, + "step": 1481 + }, + { + "epoch": 0.14805934362355763, + "grad_norm": 0.734432995289237, + "learning_rate": 9.639942233337674e-06, + "loss": 0.498, + "mean_token_accuracy": 0.9077152013778687, + "num_tokens": 120813066.0, + "step": 1482 + }, + { + "epoch": 0.14815924871372196, + "grad_norm": 1.4669835123376795, + "learning_rate": 9.639339155139684e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.9033429324626923, + "num_tokens": 120894500.0, + "step": 1483 + }, + { + "epoch": 0.1482591538038863, + "grad_norm": 0.7541090462132073, + "learning_rate": 9.638735591198347e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.9096357226371765, + "num_tokens": 120976084.0, + "step": 1484 + }, + { + "epoch": 0.14835905889405065, + "grad_norm": 0.6846225391456027, + "learning_rate": 9.638131541576854e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.9072548449039459, + "num_tokens": 121057582.0, + "step": 1485 + }, + { + "epoch": 0.148458963984215, + "grad_norm": 0.8519433589108023, + "learning_rate": 9.637527006338454e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.9042260944843292, + "num_tokens": 121139044.0, + "step": 1486 + }, + { + "epoch": 0.14855886907437935, + "grad_norm": 0.7213043658776503, + "learning_rate": 9.636921985546438e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.9042539596557617, + "num_tokens": 121220543.0, + "step": 1487 + }, + { + "epoch": 0.14865877416454368, + "grad_norm": 0.6135481758861071, + "learning_rate": 9.636316479264154e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.9057795107364655, + "num_tokens": 121302036.0, + "step": 1488 + }, + { + "epoch": 0.14875867925470804, + "grad_norm": 0.6196700879510937, + "learning_rate": 9.635710487555e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.9099321961402893, + "num_tokens": 121383679.0, + "step": 1489 + }, + { + "epoch": 0.14885858434487237, + "grad_norm": 0.5356137542193243, + "learning_rate": 9.635104010482422e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.9068591594696045, + "num_tokens": 121465100.0, + "step": 1490 + }, + { + "epoch": 0.14895848943503673, + "grad_norm": 0.6652801077692835, + "learning_rate": 9.63449704810992e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.9040499329566956, + "num_tokens": 121546619.0, + "step": 1491 + }, + { + "epoch": 0.14905839452520106, + "grad_norm": 0.5889613841851982, + "learning_rate": 9.633889600501043e-06, + "loss": 0.492, + "mean_token_accuracy": 0.9102950692176819, + "num_tokens": 121628222.0, + "step": 1492 + }, + { + "epoch": 0.1491582996153654, + "grad_norm": 0.6563310282028311, + "learning_rate": 9.633281667719394e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.9040439128875732, + "num_tokens": 121709691.0, + "step": 1493 + }, + { + "epoch": 0.14925820470552975, + "grad_norm": 0.6321909769779915, + "learning_rate": 9.632673249828618e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.9071270525455475, + "num_tokens": 121791289.0, + "step": 1494 + }, + { + "epoch": 0.14935810979569408, + "grad_norm": 0.629503413381927, + "learning_rate": 9.632064346892425e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.905927985906601, + "num_tokens": 121872794.0, + "step": 1495 + }, + { + "epoch": 0.14945801488585844, + "grad_norm": 0.6798930948260445, + "learning_rate": 9.631454958974562e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.9015999436378479, + "num_tokens": 121954304.0, + "step": 1496 + }, + { + "epoch": 0.14955791997602277, + "grad_norm": 0.5867205581709716, + "learning_rate": 9.630845086138833e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.905054360628128, + "num_tokens": 122035846.0, + "step": 1497 + }, + { + "epoch": 0.14965782506618713, + "grad_norm": 0.6646490203433791, + "learning_rate": 9.630234728449095e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.9086591899394989, + "num_tokens": 122117364.0, + "step": 1498 + }, + { + "epoch": 0.14975773015635147, + "grad_norm": 0.6067455718932669, + "learning_rate": 9.62962388596925e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.9028102457523346, + "num_tokens": 122198798.0, + "step": 1499 + }, + { + "epoch": 0.1498576352465158, + "grad_norm": 0.6336276308668395, + "learning_rate": 9.629012558763256e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.9038086235523224, + "num_tokens": 122280238.0, + "step": 1500 + }, + { + "epoch": 0.14995754033668016, + "grad_norm": 0.7203018236579867, + "learning_rate": 9.628400746895119e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.9069197773933411, + "num_tokens": 122361795.0, + "step": 1501 + }, + { + "epoch": 0.1500574454268445, + "grad_norm": 0.6092063807294991, + "learning_rate": 9.627788450428896e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.9083555042743683, + "num_tokens": 122443270.0, + "step": 1502 + }, + { + "epoch": 0.15015735051700885, + "grad_norm": 1.0474848970554966, + "learning_rate": 9.627175669428695e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.9077299535274506, + "num_tokens": 122524764.0, + "step": 1503 + }, + { + "epoch": 0.15025725560717318, + "grad_norm": 0.6671048263710464, + "learning_rate": 9.626562403958674e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.9047800302505493, + "num_tokens": 122606240.0, + "step": 1504 + }, + { + "epoch": 0.15035716069733754, + "grad_norm": 0.5054473909623532, + "learning_rate": 9.625948654083043e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.9090155065059662, + "num_tokens": 122687760.0, + "step": 1505 + }, + { + "epoch": 0.15045706578750187, + "grad_norm": 0.6458801283362514, + "learning_rate": 9.625334419866064e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.9041256606578827, + "num_tokens": 122769343.0, + "step": 1506 + }, + { + "epoch": 0.15055697087766623, + "grad_norm": 0.6223431308233839, + "learning_rate": 9.624719701372045e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.904462069272995, + "num_tokens": 122850846.0, + "step": 1507 + }, + { + "epoch": 0.15065687596783056, + "grad_norm": 1.0517627018563225, + "learning_rate": 9.624104498665353e-06, + "loss": 0.509, + "mean_token_accuracy": 0.9033231139183044, + "num_tokens": 122932267.0, + "step": 1508 + }, + { + "epoch": 0.1507567810579949, + "grad_norm": 0.6664708416031822, + "learning_rate": 9.623488811810392e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.9087750017642975, + "num_tokens": 123013853.0, + "step": 1509 + }, + { + "epoch": 0.15085668614815925, + "grad_norm": 0.6268149767274139, + "learning_rate": 9.622872640871632e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.906977117061615, + "num_tokens": 123095329.0, + "step": 1510 + }, + { + "epoch": 0.15095659123832358, + "grad_norm": 0.5924386866366695, + "learning_rate": 9.622255985913584e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.9064759910106659, + "num_tokens": 123176848.0, + "step": 1511 + }, + { + "epoch": 0.15105649632848794, + "grad_norm": 0.7368634746671824, + "learning_rate": 9.621638847000811e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.907557338476181, + "num_tokens": 123258437.0, + "step": 1512 + }, + { + "epoch": 0.15115640141865228, + "grad_norm": 0.7150613269603676, + "learning_rate": 9.621021224197931e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.9048032164573669, + "num_tokens": 123339923.0, + "step": 1513 + }, + { + "epoch": 0.15125630650881663, + "grad_norm": 0.6366338008665764, + "learning_rate": 9.620403117569608e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.9059279263019562, + "num_tokens": 123421495.0, + "step": 1514 + }, + { + "epoch": 0.15135621159898097, + "grad_norm": 0.8399447441233572, + "learning_rate": 9.619784527180559e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.9075647592544556, + "num_tokens": 123503025.0, + "step": 1515 + }, + { + "epoch": 0.1514561166891453, + "grad_norm": 0.6943902827386883, + "learning_rate": 9.619165453095549e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.9067317843437195, + "num_tokens": 123584545.0, + "step": 1516 + }, + { + "epoch": 0.15155602177930966, + "grad_norm": 0.5803094515496484, + "learning_rate": 9.618545895379398e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.9079853594303131, + "num_tokens": 123666105.0, + "step": 1517 + }, + { + "epoch": 0.151655926869474, + "grad_norm": 0.5761079548093777, + "learning_rate": 9.617925854096975e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.9069270193576813, + "num_tokens": 123747781.0, + "step": 1518 + }, + { + "epoch": 0.15175583195963835, + "grad_norm": 1.0403471522776908, + "learning_rate": 9.617305329313198e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.9050042927265167, + "num_tokens": 123829264.0, + "step": 1519 + }, + { + "epoch": 0.15185573704980268, + "grad_norm": 0.6986276112766814, + "learning_rate": 9.616684321093035e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.9045916795730591, + "num_tokens": 123910759.0, + "step": 1520 + }, + { + "epoch": 0.15195564213996704, + "grad_norm": 1.731550115905918, + "learning_rate": 9.616062829501507e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.9088729619979858, + "num_tokens": 123992375.0, + "step": 1521 + }, + { + "epoch": 0.15205554723013137, + "grad_norm": 0.61493127534751, + "learning_rate": 9.615440854603686e-06, + "loss": 0.498, + "mean_token_accuracy": 0.9075317680835724, + "num_tokens": 124073876.0, + "step": 1522 + }, + { + "epoch": 0.15215545232029573, + "grad_norm": 0.5314462010511823, + "learning_rate": 9.614818396464692e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.9083892405033112, + "num_tokens": 124155408.0, + "step": 1523 + }, + { + "epoch": 0.15225535741046006, + "grad_norm": 0.7069548554250159, + "learning_rate": 9.614195455149698e-06, + "loss": 0.498, + "mean_token_accuracy": 0.9083396196365356, + "num_tokens": 124236945.0, + "step": 1524 + }, + { + "epoch": 0.1523552625006244, + "grad_norm": 0.6358590982713213, + "learning_rate": 9.613572030723924e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.9072528779506683, + "num_tokens": 124318549.0, + "step": 1525 + }, + { + "epoch": 0.15245516759078875, + "grad_norm": 0.8824511049777348, + "learning_rate": 9.612948123252647e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.9061603248119354, + "num_tokens": 124400118.0, + "step": 1526 + }, + { + "epoch": 0.15255507268095309, + "grad_norm": 0.6064089981607693, + "learning_rate": 9.612323732801187e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.9051263630390167, + "num_tokens": 124481589.0, + "step": 1527 + }, + { + "epoch": 0.15265497777111744, + "grad_norm": 0.5848530575271396, + "learning_rate": 9.611698859434923e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.9077710807323456, + "num_tokens": 124563185.0, + "step": 1528 + }, + { + "epoch": 0.15275488286128178, + "grad_norm": 0.6724761458648132, + "learning_rate": 9.611073503219275e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.9078067541122437, + "num_tokens": 124644728.0, + "step": 1529 + }, + { + "epoch": 0.15285478795144614, + "grad_norm": 0.6374082202987912, + "learning_rate": 9.610447664219722e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.909514456987381, + "num_tokens": 124726255.0, + "step": 1530 + }, + { + "epoch": 0.15295469304161047, + "grad_norm": 0.6111145460638863, + "learning_rate": 9.609821342501787e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9062907099723816, + "num_tokens": 124807727.0, + "step": 1531 + }, + { + "epoch": 0.15305459813177483, + "grad_norm": 0.6841791882008549, + "learning_rate": 9.609194538131048e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.9058751165866852, + "num_tokens": 124889241.0, + "step": 1532 + }, + { + "epoch": 0.15315450322193916, + "grad_norm": 0.6667408356991087, + "learning_rate": 9.608567251173132e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.9057484567165375, + "num_tokens": 124970762.0, + "step": 1533 + }, + { + "epoch": 0.1532544083121035, + "grad_norm": 0.5829732148364969, + "learning_rate": 9.607939481693717e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.9074875712394714, + "num_tokens": 125052245.0, + "step": 1534 + }, + { + "epoch": 0.15335431340226785, + "grad_norm": 0.7379558752647551, + "learning_rate": 9.607311229758531e-06, + "loss": 0.5, + "mean_token_accuracy": 0.9033805429935455, + "num_tokens": 125133743.0, + "step": 1535 + }, + { + "epoch": 0.15345421849243218, + "grad_norm": 0.7755618897378732, + "learning_rate": 9.606682495433352e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.9041560888290405, + "num_tokens": 125215229.0, + "step": 1536 + }, + { + "epoch": 0.15355412358259654, + "grad_norm": 0.8583941184416136, + "learning_rate": 9.606053278784009e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.906825840473175, + "num_tokens": 125296715.0, + "step": 1537 + }, + { + "epoch": 0.15365402867276087, + "grad_norm": 0.7446509610930209, + "learning_rate": 9.605423579876381e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.9076176583766937, + "num_tokens": 125378247.0, + "step": 1538 + }, + { + "epoch": 0.15375393376292523, + "grad_norm": 0.6619106499894811, + "learning_rate": 9.6047933987764e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.9072850346565247, + "num_tokens": 125459791.0, + "step": 1539 + }, + { + "epoch": 0.15385383885308956, + "grad_norm": 0.7231292204948578, + "learning_rate": 9.604162735550045e-06, + "loss": 0.505, + "mean_token_accuracy": 0.907208651304245, + "num_tokens": 125541251.0, + "step": 1540 + }, + { + "epoch": 0.1539537439432539, + "grad_norm": 0.6613445080724203, + "learning_rate": 9.603531590263348e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.9068465530872345, + "num_tokens": 125622781.0, + "step": 1541 + }, + { + "epoch": 0.15405364903341826, + "grad_norm": 0.628783595205484, + "learning_rate": 9.60289996298239e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.905491292476654, + "num_tokens": 125704420.0, + "step": 1542 + }, + { + "epoch": 0.1541535541235826, + "grad_norm": 0.7983464155294795, + "learning_rate": 9.602267853773301e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.908201277256012, + "num_tokens": 125785987.0, + "step": 1543 + }, + { + "epoch": 0.15425345921374695, + "grad_norm": 0.8787888854913658, + "learning_rate": 9.60163526270227e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.9064143896102905, + "num_tokens": 125867535.0, + "step": 1544 + }, + { + "epoch": 0.15435336430391128, + "grad_norm": 0.6522347458554583, + "learning_rate": 9.601002189835522e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.9066905975341797, + "num_tokens": 125949119.0, + "step": 1545 + }, + { + "epoch": 0.15445326939407564, + "grad_norm": 0.6786552883167468, + "learning_rate": 9.600368635239343e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.9059307277202606, + "num_tokens": 126030626.0, + "step": 1546 + }, + { + "epoch": 0.15455317448423997, + "grad_norm": 0.6995200472367675, + "learning_rate": 9.59973459898007e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.9074406027793884, + "num_tokens": 126112141.0, + "step": 1547 + }, + { + "epoch": 0.15465307957440433, + "grad_norm": 1.594831831101268, + "learning_rate": 9.599100081124083e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.904147207736969, + "num_tokens": 126193710.0, + "step": 1548 + }, + { + "epoch": 0.15475298466456866, + "grad_norm": 0.6462644848092839, + "learning_rate": 9.59846508173782e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.905205488204956, + "num_tokens": 126275180.0, + "step": 1549 + }, + { + "epoch": 0.154852889754733, + "grad_norm": 0.6630112151290521, + "learning_rate": 9.597829600887766e-06, + "loss": 0.498, + "mean_token_accuracy": 0.906772792339325, + "num_tokens": 126356706.0, + "step": 1550 + }, + { + "epoch": 0.15495279484489735, + "grad_norm": 0.9442556791503252, + "learning_rate": 9.597193638640451e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.9048232138156891, + "num_tokens": 126438238.0, + "step": 1551 + }, + { + "epoch": 0.15505269993506168, + "grad_norm": 0.8602348072990518, + "learning_rate": 9.596557195062468e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.9055223166942596, + "num_tokens": 126519704.0, + "step": 1552 + }, + { + "epoch": 0.15515260502522604, + "grad_norm": 0.6644355294128284, + "learning_rate": 9.59592027022045e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.9070948660373688, + "num_tokens": 126601177.0, + "step": 1553 + }, + { + "epoch": 0.15525251011539037, + "grad_norm": 0.6532780462458078, + "learning_rate": 9.595282864181082e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.909816563129425, + "num_tokens": 126682818.0, + "step": 1554 + }, + { + "epoch": 0.15535241520555473, + "grad_norm": 0.9412660520348209, + "learning_rate": 9.594644977011103e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.9074317216873169, + "num_tokens": 126764357.0, + "step": 1555 + }, + { + "epoch": 0.15545232029571907, + "grad_norm": 0.5531666486292481, + "learning_rate": 9.5940066087773e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.907920777797699, + "num_tokens": 126845867.0, + "step": 1556 + }, + { + "epoch": 0.1555522253858834, + "grad_norm": 0.6714242772608808, + "learning_rate": 9.59336775954651e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.9048089683055878, + "num_tokens": 126927355.0, + "step": 1557 + }, + { + "epoch": 0.15565213047604776, + "grad_norm": 0.718962169920817, + "learning_rate": 9.592728429385625e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.9047447443008423, + "num_tokens": 127008797.0, + "step": 1558 + }, + { + "epoch": 0.1557520355662121, + "grad_norm": 0.7520122754492157, + "learning_rate": 9.59208861836158e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.9046245515346527, + "num_tokens": 127090258.0, + "step": 1559 + }, + { + "epoch": 0.15585194065637645, + "grad_norm": 0.7612073442484847, + "learning_rate": 9.591448326541365e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.9078850150108337, + "num_tokens": 127171764.0, + "step": 1560 + }, + { + "epoch": 0.15595184574654078, + "grad_norm": 0.7876417521204977, + "learning_rate": 9.590807553992017e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.9060553908348083, + "num_tokens": 127253282.0, + "step": 1561 + }, + { + "epoch": 0.15605175083670514, + "grad_norm": 1.0354195034154445, + "learning_rate": 9.590166300780628e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.9041432738304138, + "num_tokens": 127334737.0, + "step": 1562 + }, + { + "epoch": 0.15615165592686947, + "grad_norm": 0.5767981518206836, + "learning_rate": 9.589524566974335e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.9072834253311157, + "num_tokens": 127416283.0, + "step": 1563 + }, + { + "epoch": 0.15625156101703383, + "grad_norm": 0.6284464338012332, + "learning_rate": 9.588882352640332e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.9063122570514679, + "num_tokens": 127497809.0, + "step": 1564 + }, + { + "epoch": 0.15635146610719816, + "grad_norm": 0.5659400346099259, + "learning_rate": 9.588239657845857e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.9069015681743622, + "num_tokens": 127579289.0, + "step": 1565 + }, + { + "epoch": 0.1564513711973625, + "grad_norm": 0.6187951042156103, + "learning_rate": 9.587596482658201e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.9105254113674164, + "num_tokens": 127660929.0, + "step": 1566 + }, + { + "epoch": 0.15655127628752685, + "grad_norm": 0.6023118160414906, + "learning_rate": 9.586952827144707e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.9052762687206268, + "num_tokens": 127742423.0, + "step": 1567 + }, + { + "epoch": 0.15665118137769118, + "grad_norm": 0.6330922011599964, + "learning_rate": 9.586308691372763e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.9067651033401489, + "num_tokens": 127824033.0, + "step": 1568 + }, + { + "epoch": 0.15675108646785554, + "grad_norm": 0.6870961546603646, + "learning_rate": 9.585664075409815e-06, + "loss": 0.499, + "mean_token_accuracy": 0.9068551361560822, + "num_tokens": 127905498.0, + "step": 1569 + }, + { + "epoch": 0.15685099155801988, + "grad_norm": 0.7813506629769236, + "learning_rate": 9.58501897932335e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.9053449034690857, + "num_tokens": 127987060.0, + "step": 1570 + }, + { + "epoch": 0.15695089664818423, + "grad_norm": 0.9036451360585457, + "learning_rate": 9.584373403180914e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.9049816429615021, + "num_tokens": 128068491.0, + "step": 1571 + }, + { + "epoch": 0.15705080173834857, + "grad_norm": 0.7084302363728922, + "learning_rate": 9.583727347050098e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.9053266048431396, + "num_tokens": 128149992.0, + "step": 1572 + }, + { + "epoch": 0.15715070682851293, + "grad_norm": 0.7518729121668709, + "learning_rate": 9.583080810998545e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.905009925365448, + "num_tokens": 128231522.0, + "step": 1573 + }, + { + "epoch": 0.15725061191867726, + "grad_norm": 0.8440240669668898, + "learning_rate": 9.582433795093944e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.9097601771354675, + "num_tokens": 128313048.0, + "step": 1574 + }, + { + "epoch": 0.1573505170088416, + "grad_norm": 0.7577737144837743, + "learning_rate": 9.581786299404046e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.9057100713253021, + "num_tokens": 128394546.0, + "step": 1575 + }, + { + "epoch": 0.15745042209900595, + "grad_norm": 0.6306266617698185, + "learning_rate": 9.581138323996639e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.9045184850692749, + "num_tokens": 128476103.0, + "step": 1576 + }, + { + "epoch": 0.15755032718917028, + "grad_norm": 0.7405113862444107, + "learning_rate": 9.580489868939568e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.9075751006603241, + "num_tokens": 128557531.0, + "step": 1577 + }, + { + "epoch": 0.15765023227933464, + "grad_norm": 0.6337255724617219, + "learning_rate": 9.579840934300728e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.905784398317337, + "num_tokens": 128639039.0, + "step": 1578 + }, + { + "epoch": 0.15775013736949897, + "grad_norm": 0.755762462296636, + "learning_rate": 9.57919152014806e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.9072547256946564, + "num_tokens": 128720547.0, + "step": 1579 + }, + { + "epoch": 0.15785004245966333, + "grad_norm": 0.5882107518287606, + "learning_rate": 9.578541626549562e-06, + "loss": 0.498, + "mean_token_accuracy": 0.906115859746933, + "num_tokens": 128802093.0, + "step": 1580 + }, + { + "epoch": 0.15794994754982766, + "grad_norm": 0.6513151050656617, + "learning_rate": 9.577891253573274e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.9098080694675446, + "num_tokens": 128883611.0, + "step": 1581 + }, + { + "epoch": 0.158049852639992, + "grad_norm": 0.738802111346824, + "learning_rate": 9.577240401287297e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.9057254493236542, + "num_tokens": 128965144.0, + "step": 1582 + }, + { + "epoch": 0.15814975773015635, + "grad_norm": 0.6941186491989867, + "learning_rate": 9.576589069759769e-06, + "loss": 0.49, + "mean_token_accuracy": 0.9086723327636719, + "num_tokens": 129046720.0, + "step": 1583 + }, + { + "epoch": 0.15824966282032069, + "grad_norm": 0.7345821592760714, + "learning_rate": 9.575937259058891e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.9038580060005188, + "num_tokens": 129128113.0, + "step": 1584 + }, + { + "epoch": 0.15834956791048505, + "grad_norm": 0.8072864266373515, + "learning_rate": 9.575284969252904e-06, + "loss": 0.509, + "mean_token_accuracy": 0.9027951955795288, + "num_tokens": 129209544.0, + "step": 1585 + }, + { + "epoch": 0.15844947300064938, + "grad_norm": 1.1449413721435395, + "learning_rate": 9.574632200410105e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.9086822271347046, + "num_tokens": 129291019.0, + "step": 1586 + }, + { + "epoch": 0.15854937809081374, + "grad_norm": 1.0263051334821807, + "learning_rate": 9.573978952598841e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.9047433435916901, + "num_tokens": 129372524.0, + "step": 1587 + }, + { + "epoch": 0.15864928318097807, + "grad_norm": 0.7295796308686183, + "learning_rate": 9.573325225887506e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.9060393869876862, + "num_tokens": 129454029.0, + "step": 1588 + }, + { + "epoch": 0.15874918827114243, + "grad_norm": 0.7679844191769754, + "learning_rate": 9.572671020344545e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.9076330363750458, + "num_tokens": 129535604.0, + "step": 1589 + }, + { + "epoch": 0.15884909336130676, + "grad_norm": 1.1154402738032259, + "learning_rate": 9.572016336038454e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.9063139259815216, + "num_tokens": 129617087.0, + "step": 1590 + }, + { + "epoch": 0.1589489984514711, + "grad_norm": 0.6385854946492654, + "learning_rate": 9.571361173037782e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.9076849222183228, + "num_tokens": 129698585.0, + "step": 1591 + }, + { + "epoch": 0.15904890354163545, + "grad_norm": 0.7764202592890441, + "learning_rate": 9.570705531411122e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.9056875109672546, + "num_tokens": 129780168.0, + "step": 1592 + }, + { + "epoch": 0.15914880863179978, + "grad_norm": 0.6758563766267455, + "learning_rate": 9.570049411227122e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.9049542248249054, + "num_tokens": 129861710.0, + "step": 1593 + }, + { + "epoch": 0.15924871372196414, + "grad_norm": 0.6836796665301318, + "learning_rate": 9.56939281255448e-06, + "loss": 0.51, + "mean_token_accuracy": 0.9022322297096252, + "num_tokens": 129943188.0, + "step": 1594 + }, + { + "epoch": 0.15934861881212847, + "grad_norm": 0.6767902152108399, + "learning_rate": 9.568735735461938e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.906958281993866, + "num_tokens": 130024692.0, + "step": 1595 + }, + { + "epoch": 0.15944852390229283, + "grad_norm": 0.7230250366900316, + "learning_rate": 9.568078180018295e-06, + "loss": 0.5082, + "mean_token_accuracy": 0.9037167429924011, + "num_tokens": 130106050.0, + "step": 1596 + }, + { + "epoch": 0.15954842899245716, + "grad_norm": 0.6380734033445482, + "learning_rate": 9.5674201462924e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.9074632227420807, + "num_tokens": 130187521.0, + "step": 1597 + }, + { + "epoch": 0.1596483340826215, + "grad_norm": 0.5954781458112767, + "learning_rate": 9.566761634353145e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.9064542353153229, + "num_tokens": 130268979.0, + "step": 1598 + }, + { + "epoch": 0.15974823917278586, + "grad_norm": 0.6658993069193025, + "learning_rate": 9.56610264426948e-06, + "loss": 0.498, + "mean_token_accuracy": 0.902932733297348, + "num_tokens": 130350546.0, + "step": 1599 + }, + { + "epoch": 0.1598481442629502, + "grad_norm": 0.7226409899356576, + "learning_rate": 9.565443176110402e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9048381447792053, + "num_tokens": 130432058.0, + "step": 1600 + }, + { + "epoch": 0.15994804935311455, + "grad_norm": 0.5676899272548976, + "learning_rate": 9.564783229944958e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.9059796929359436, + "num_tokens": 130513548.0, + "step": 1601 + }, + { + "epoch": 0.16004795444327888, + "grad_norm": 0.6412908517589796, + "learning_rate": 9.564122805842244e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.9077767729759216, + "num_tokens": 130595041.0, + "step": 1602 + }, + { + "epoch": 0.16014785953344324, + "grad_norm": 0.6309704929955983, + "learning_rate": 9.563461903871407e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.9032706916332245, + "num_tokens": 130676612.0, + "step": 1603 + }, + { + "epoch": 0.16024776462360757, + "grad_norm": 0.7985676404668649, + "learning_rate": 9.562800524101644e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.9034842848777771, + "num_tokens": 130758046.0, + "step": 1604 + }, + { + "epoch": 0.16034766971377193, + "grad_norm": 0.7551259379765461, + "learning_rate": 9.562138666602204e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.9038709402084351, + "num_tokens": 130839534.0, + "step": 1605 + }, + { + "epoch": 0.16044757480393626, + "grad_norm": 0.8503982958772955, + "learning_rate": 9.56147633144238e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.9054540395736694, + "num_tokens": 130921057.0, + "step": 1606 + }, + { + "epoch": 0.1605474798941006, + "grad_norm": 0.812317829016965, + "learning_rate": 9.560813518691524e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.9080218076705933, + "num_tokens": 131002577.0, + "step": 1607 + }, + { + "epoch": 0.16064738498426495, + "grad_norm": 0.7631094012579531, + "learning_rate": 9.560150228419031e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.9061503410339355, + "num_tokens": 131084079.0, + "step": 1608 + }, + { + "epoch": 0.16074729007442928, + "grad_norm": 0.6853633301005008, + "learning_rate": 9.559486460694348e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.9058977961540222, + "num_tokens": 131165677.0, + "step": 1609 + }, + { + "epoch": 0.16084719516459364, + "grad_norm": 0.6012420255943545, + "learning_rate": 9.55882221558697e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.9076880812644958, + "num_tokens": 131247136.0, + "step": 1610 + }, + { + "epoch": 0.16094710025475797, + "grad_norm": 0.9023899501340413, + "learning_rate": 9.55815749316645e-06, + "loss": 0.5, + "mean_token_accuracy": 0.9048987925052643, + "num_tokens": 131328696.0, + "step": 1611 + }, + { + "epoch": 0.16104700534492233, + "grad_norm": 0.6792266037816015, + "learning_rate": 9.557492293502379e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.9051739871501923, + "num_tokens": 131410213.0, + "step": 1612 + }, + { + "epoch": 0.16114691043508667, + "grad_norm": 0.5872311924264344, + "learning_rate": 9.556826616664408e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.9058142900466919, + "num_tokens": 131491769.0, + "step": 1613 + }, + { + "epoch": 0.16124681552525102, + "grad_norm": 0.7791519944509667, + "learning_rate": 9.556160462722231e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.9083250164985657, + "num_tokens": 131573317.0, + "step": 1614 + }, + { + "epoch": 0.16134672061541536, + "grad_norm": 0.9026644390113479, + "learning_rate": 9.555493831745598e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.903508335351944, + "num_tokens": 131654835.0, + "step": 1615 + }, + { + "epoch": 0.1614466257055797, + "grad_norm": 0.7278555550015554, + "learning_rate": 9.554826723804304e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.9050689935684204, + "num_tokens": 131736335.0, + "step": 1616 + }, + { + "epoch": 0.16154653079574405, + "grad_norm": 0.5501501331096716, + "learning_rate": 9.554159138968195e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.9045358598232269, + "num_tokens": 131817948.0, + "step": 1617 + }, + { + "epoch": 0.16164643588590838, + "grad_norm": 1.1960246639240517, + "learning_rate": 9.55349107730717e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.9057718813419342, + "num_tokens": 131899473.0, + "step": 1618 + }, + { + "epoch": 0.16174634097607274, + "grad_norm": 0.5466737655312993, + "learning_rate": 9.552822538891175e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.9067637026309967, + "num_tokens": 131981083.0, + "step": 1619 + }, + { + "epoch": 0.16184624606623707, + "grad_norm": 0.6019699900892417, + "learning_rate": 9.552153523790207e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.905436247587204, + "num_tokens": 132062582.0, + "step": 1620 + }, + { + "epoch": 0.16194615115640143, + "grad_norm": 1.6541568052438291, + "learning_rate": 9.551484032074312e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.9073681831359863, + "num_tokens": 132144122.0, + "step": 1621 + }, + { + "epoch": 0.16204605624656576, + "grad_norm": 0.7265668023907683, + "learning_rate": 9.550814063813585e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.9028912484645844, + "num_tokens": 132225548.0, + "step": 1622 + }, + { + "epoch": 0.1621459613367301, + "grad_norm": 0.7386745028358875, + "learning_rate": 9.550143619078175e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.9043355286121368, + "num_tokens": 132307114.0, + "step": 1623 + }, + { + "epoch": 0.16224586642689445, + "grad_norm": 0.6738171503107019, + "learning_rate": 9.549472697938275e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.90873584151268, + "num_tokens": 132388680.0, + "step": 1624 + }, + { + "epoch": 0.16234577151705878, + "grad_norm": 0.6970008173752992, + "learning_rate": 9.548801300464135e-06, + "loss": 0.502, + "mean_token_accuracy": 0.9083088338375092, + "num_tokens": 132470113.0, + "step": 1625 + }, + { + "epoch": 0.16244567660722314, + "grad_norm": 0.940105445940998, + "learning_rate": 9.548129426726048e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.907597541809082, + "num_tokens": 132551545.0, + "step": 1626 + }, + { + "epoch": 0.16254558169738748, + "grad_norm": 0.6120828413855806, + "learning_rate": 9.54745707679436e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.9070130288600922, + "num_tokens": 132633132.0, + "step": 1627 + }, + { + "epoch": 0.16264548678755184, + "grad_norm": 0.679628626058227, + "learning_rate": 9.546784250739468e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.9046020805835724, + "num_tokens": 132714593.0, + "step": 1628 + }, + { + "epoch": 0.16274539187771617, + "grad_norm": 0.6675373270465383, + "learning_rate": 9.546110948631817e-06, + "loss": 0.496, + "mean_token_accuracy": 0.9077273011207581, + "num_tokens": 132796113.0, + "step": 1629 + }, + { + "epoch": 0.16284529696788053, + "grad_norm": 0.9078599475117153, + "learning_rate": 9.545437170541903e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.9073353111743927, + "num_tokens": 132877683.0, + "step": 1630 + }, + { + "epoch": 0.16294520205804486, + "grad_norm": 0.9057382977768482, + "learning_rate": 9.544762916540271e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.9062457084655762, + "num_tokens": 132959196.0, + "step": 1631 + }, + { + "epoch": 0.1630451071482092, + "grad_norm": 1.699589003240341, + "learning_rate": 9.544088186697515e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.9046290516853333, + "num_tokens": 133040617.0, + "step": 1632 + }, + { + "epoch": 0.16314501223837355, + "grad_norm": 0.828232471310966, + "learning_rate": 9.543412981084282e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.907277524471283, + "num_tokens": 133122105.0, + "step": 1633 + }, + { + "epoch": 0.16324491732853788, + "grad_norm": 0.7489656784681135, + "learning_rate": 9.542737299771262e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.9086394906044006, + "num_tokens": 133203699.0, + "step": 1634 + }, + { + "epoch": 0.16334482241870224, + "grad_norm": 0.8769593331750596, + "learning_rate": 9.542061142829206e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.9071888625621796, + "num_tokens": 133285193.0, + "step": 1635 + }, + { + "epoch": 0.16344472750886657, + "grad_norm": 0.5635503053606654, + "learning_rate": 9.541384510328905e-06, + "loss": 0.496, + "mean_token_accuracy": 0.9073491394519806, + "num_tokens": 133366732.0, + "step": 1636 + }, + { + "epoch": 0.16354463259903093, + "grad_norm": 0.6566170138985866, + "learning_rate": 9.540707402341203e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.9041151702404022, + "num_tokens": 133448185.0, + "step": 1637 + }, + { + "epoch": 0.16364453768919526, + "grad_norm": 0.9538825883019075, + "learning_rate": 9.540029818936993e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.9072504043579102, + "num_tokens": 133529718.0, + "step": 1638 + }, + { + "epoch": 0.1637444427793596, + "grad_norm": 0.7944706118004107, + "learning_rate": 9.539351760187218e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.9031723737716675, + "num_tokens": 133611284.0, + "step": 1639 + }, + { + "epoch": 0.16384434786952395, + "grad_norm": 0.6027733986097652, + "learning_rate": 9.538673226162878e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.9074364304542542, + "num_tokens": 133692777.0, + "step": 1640 + }, + { + "epoch": 0.16394425295968829, + "grad_norm": 0.8891707156797168, + "learning_rate": 9.537994216935007e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.9070604741573334, + "num_tokens": 133774333.0, + "step": 1641 + }, + { + "epoch": 0.16404415804985265, + "grad_norm": 0.6055472845450762, + "learning_rate": 9.537314732574702e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.9078985452651978, + "num_tokens": 133855862.0, + "step": 1642 + }, + { + "epoch": 0.16414406314001698, + "grad_norm": 1.754208007485433, + "learning_rate": 9.536634773153108e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.906579852104187, + "num_tokens": 133937416.0, + "step": 1643 + }, + { + "epoch": 0.16424396823018134, + "grad_norm": 0.7913711529114611, + "learning_rate": 9.535954338741416e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.9063867330551147, + "num_tokens": 134018938.0, + "step": 1644 + }, + { + "epoch": 0.16434387332034567, + "grad_norm": 0.7003219976732866, + "learning_rate": 9.535273429410865e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.9073255360126495, + "num_tokens": 134100521.0, + "step": 1645 + }, + { + "epoch": 0.16444377841051003, + "grad_norm": 1.5601779649073595, + "learning_rate": 9.534592045232752e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.9043480455875397, + "num_tokens": 134182003.0, + "step": 1646 + }, + { + "epoch": 0.16454368350067436, + "grad_norm": 1.540745533051271, + "learning_rate": 9.533910186278413e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.9076672494411469, + "num_tokens": 134263479.0, + "step": 1647 + }, + { + "epoch": 0.1646435885908387, + "grad_norm": 0.7228230688321757, + "learning_rate": 9.533227852619244e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.9041652679443359, + "num_tokens": 134345062.0, + "step": 1648 + }, + { + "epoch": 0.16474349368100305, + "grad_norm": 0.6181778237670192, + "learning_rate": 9.532545044326685e-06, + "loss": 0.499, + "mean_token_accuracy": 0.9055406153202057, + "num_tokens": 134426566.0, + "step": 1649 + }, + { + "epoch": 0.16484339877116738, + "grad_norm": 0.6550974760651711, + "learning_rate": 9.531861761472222e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.9068406224250793, + "num_tokens": 134508096.0, + "step": 1650 + }, + { + "epoch": 0.16494330386133174, + "grad_norm": 0.5996778981993468, + "learning_rate": 9.531178004127404e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.9054346084594727, + "num_tokens": 134589531.0, + "step": 1651 + }, + { + "epoch": 0.16504320895149607, + "grad_norm": 0.7168486075741693, + "learning_rate": 9.530493772363814e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.9074415266513824, + "num_tokens": 134671001.0, + "step": 1652 + }, + { + "epoch": 0.16514311404166043, + "grad_norm": 0.8563070338744141, + "learning_rate": 9.529809066253095e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.9061516225337982, + "num_tokens": 134752482.0, + "step": 1653 + }, + { + "epoch": 0.16524301913182476, + "grad_norm": 0.7284840693629664, + "learning_rate": 9.529123885866934e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.9067768454551697, + "num_tokens": 134834104.0, + "step": 1654 + }, + { + "epoch": 0.16534292422198912, + "grad_norm": 1.1251823211385672, + "learning_rate": 9.528438231277073e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.9097774624824524, + "num_tokens": 134915640.0, + "step": 1655 + }, + { + "epoch": 0.16544282931215346, + "grad_norm": 0.637800535220352, + "learning_rate": 9.5277521025553e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.9086481034755707, + "num_tokens": 134997179.0, + "step": 1656 + }, + { + "epoch": 0.1655427344023178, + "grad_norm": 0.8642965647844658, + "learning_rate": 9.527065499773449e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.9085519015789032, + "num_tokens": 135078678.0, + "step": 1657 + }, + { + "epoch": 0.16564263949248215, + "grad_norm": 1.1644641514665772, + "learning_rate": 9.526378423003415e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.9082458019256592, + "num_tokens": 135160173.0, + "step": 1658 + }, + { + "epoch": 0.16574254458264648, + "grad_norm": 0.6242902697613328, + "learning_rate": 9.52569087231713e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.904539555311203, + "num_tokens": 135241662.0, + "step": 1659 + }, + { + "epoch": 0.16584244967281084, + "grad_norm": 0.5920246151133854, + "learning_rate": 9.525002847786585e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.9061886370182037, + "num_tokens": 135323280.0, + "step": 1660 + }, + { + "epoch": 0.16594235476297517, + "grad_norm": 0.8557197282545407, + "learning_rate": 9.524314349483815e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.9056388139724731, + "num_tokens": 135404788.0, + "step": 1661 + }, + { + "epoch": 0.16604225985313953, + "grad_norm": 0.6548226063463448, + "learning_rate": 9.523625377480907e-06, + "loss": 0.497, + "mean_token_accuracy": 0.9086312055587769, + "num_tokens": 135486328.0, + "step": 1662 + }, + { + "epoch": 0.16614216494330386, + "grad_norm": 1.1382738404324804, + "learning_rate": 9.522935931849996e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.9074519872665405, + "num_tokens": 135567877.0, + "step": 1663 + }, + { + "epoch": 0.1662420700334682, + "grad_norm": 0.7413727792259, + "learning_rate": 9.522246012663267e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.9075084924697876, + "num_tokens": 135649484.0, + "step": 1664 + }, + { + "epoch": 0.16634197512363255, + "grad_norm": 0.5240069932080877, + "learning_rate": 9.52155561999296e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.9067566692829132, + "num_tokens": 135731028.0, + "step": 1665 + }, + { + "epoch": 0.16644188021379688, + "grad_norm": 0.9187474536199646, + "learning_rate": 9.520864753911353e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.9083258509635925, + "num_tokens": 135812541.0, + "step": 1666 + }, + { + "epoch": 0.16654178530396124, + "grad_norm": 0.6117552921704543, + "learning_rate": 9.520173414490787e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.9087722599506378, + "num_tokens": 135894062.0, + "step": 1667 + }, + { + "epoch": 0.16664169039412557, + "grad_norm": 0.7791754585824254, + "learning_rate": 9.51948160180364e-06, + "loss": 0.493, + "mean_token_accuracy": 0.9056939482688904, + "num_tokens": 135975678.0, + "step": 1668 + }, + { + "epoch": 0.16674159548428993, + "grad_norm": 0.6297319486261922, + "learning_rate": 9.51878931592235e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.9049843847751617, + "num_tokens": 136057170.0, + "step": 1669 + }, + { + "epoch": 0.16684150057445427, + "grad_norm": 0.6266945199194811, + "learning_rate": 9.518096556919396e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.9056330919265747, + "num_tokens": 136138677.0, + "step": 1670 + }, + { + "epoch": 0.16694140566461863, + "grad_norm": 0.7354282727426261, + "learning_rate": 9.517403324867313e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.9061991572380066, + "num_tokens": 136220174.0, + "step": 1671 + }, + { + "epoch": 0.16704131075478296, + "grad_norm": 0.6573390823394153, + "learning_rate": 9.516709619838685e-06, + "loss": 0.491, + "mean_token_accuracy": 0.9071581363677979, + "num_tokens": 136301717.0, + "step": 1672 + }, + { + "epoch": 0.1671412158449473, + "grad_norm": 0.7809915464189496, + "learning_rate": 9.51601544190614e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.9021559655666351, + "num_tokens": 136383177.0, + "step": 1673 + }, + { + "epoch": 0.16724112093511165, + "grad_norm": 0.7025869697601534, + "learning_rate": 9.51532079114236e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.9092536568641663, + "num_tokens": 136464738.0, + "step": 1674 + }, + { + "epoch": 0.16734102602527598, + "grad_norm": 1.0116372854127105, + "learning_rate": 9.514625667620077e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.907145231962204, + "num_tokens": 136546315.0, + "step": 1675 + }, + { + "epoch": 0.16744093111544034, + "grad_norm": 0.553500814814777, + "learning_rate": 9.51393007141207e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.9088323712348938, + "num_tokens": 136627793.0, + "step": 1676 + }, + { + "epoch": 0.16754083620560467, + "grad_norm": 0.8955463470218081, + "learning_rate": 9.513234002591167e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.9065704941749573, + "num_tokens": 136709296.0, + "step": 1677 + }, + { + "epoch": 0.16764074129576903, + "grad_norm": 0.6379613609070485, + "learning_rate": 9.512537461230252e-06, + "loss": 0.496, + "mean_token_accuracy": 0.90772745013237, + "num_tokens": 136790847.0, + "step": 1678 + }, + { + "epoch": 0.16774064638593336, + "grad_norm": 0.7256286618699423, + "learning_rate": 9.511840447402247e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.9065755605697632, + "num_tokens": 136872387.0, + "step": 1679 + }, + { + "epoch": 0.1678405514760977, + "grad_norm": 0.8012595083269332, + "learning_rate": 9.511142961180135e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.9071458578109741, + "num_tokens": 136953984.0, + "step": 1680 + }, + { + "epoch": 0.16794045656626205, + "grad_norm": 1.050996669273086, + "learning_rate": 9.510445002636943e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.9074898958206177, + "num_tokens": 137035481.0, + "step": 1681 + }, + { + "epoch": 0.16804036165642638, + "grad_norm": 0.6290824800417346, + "learning_rate": 9.509746571845747e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.9081988036632538, + "num_tokens": 137117050.0, + "step": 1682 + }, + { + "epoch": 0.16814026674659074, + "grad_norm": 0.6005348504206087, + "learning_rate": 9.509047668879672e-06, + "loss": 0.498, + "mean_token_accuracy": 0.9058395028114319, + "num_tokens": 137198529.0, + "step": 1683 + }, + { + "epoch": 0.16824017183675508, + "grad_norm": 0.9608430516223956, + "learning_rate": 9.508348293811895e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.9058478772640228, + "num_tokens": 137280096.0, + "step": 1684 + }, + { + "epoch": 0.16834007692691944, + "grad_norm": 0.8049320708825034, + "learning_rate": 9.507648446715642e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.9043868482112885, + "num_tokens": 137361642.0, + "step": 1685 + }, + { + "epoch": 0.16843998201708377, + "grad_norm": 0.8028378236620521, + "learning_rate": 9.506948127664186e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.9049438834190369, + "num_tokens": 137443203.0, + "step": 1686 + }, + { + "epoch": 0.16853988710724813, + "grad_norm": 0.8892831854997035, + "learning_rate": 9.506247336730854e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.9087750017642975, + "num_tokens": 137524779.0, + "step": 1687 + }, + { + "epoch": 0.16863979219741246, + "grad_norm": 0.861785333795962, + "learning_rate": 9.505546073989016e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.9046136438846588, + "num_tokens": 137606304.0, + "step": 1688 + }, + { + "epoch": 0.1687396972875768, + "grad_norm": 0.5790637391203955, + "learning_rate": 9.504844339512096e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.9052187502384186, + "num_tokens": 137687787.0, + "step": 1689 + }, + { + "epoch": 0.16883960237774115, + "grad_norm": 0.8532409561021851, + "learning_rate": 9.504142133373568e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.9058282673358917, + "num_tokens": 137769274.0, + "step": 1690 + }, + { + "epoch": 0.16893950746790548, + "grad_norm": 0.6936093727348955, + "learning_rate": 9.503439455646952e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.9062003791332245, + "num_tokens": 137850827.0, + "step": 1691 + }, + { + "epoch": 0.16903941255806984, + "grad_norm": 0.7416768734964501, + "learning_rate": 9.50273630640582e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.9050959348678589, + "num_tokens": 137932221.0, + "step": 1692 + }, + { + "epoch": 0.16913931764823417, + "grad_norm": 0.8418683023607899, + "learning_rate": 9.502032685723792e-06, + "loss": 0.503, + "mean_token_accuracy": 0.9043879508972168, + "num_tokens": 138013712.0, + "step": 1693 + }, + { + "epoch": 0.16923922273839853, + "grad_norm": 0.5562610703128849, + "learning_rate": 9.501328593674537e-06, + "loss": 0.494, + "mean_token_accuracy": 0.907463550567627, + "num_tokens": 138095269.0, + "step": 1694 + }, + { + "epoch": 0.16933912782856286, + "grad_norm": 0.7833547007177706, + "learning_rate": 9.500624030331775e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.9060264229774475, + "num_tokens": 138176847.0, + "step": 1695 + }, + { + "epoch": 0.16943903291872722, + "grad_norm": 1.1768519609284787, + "learning_rate": 9.499918995769274e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.9030897915363312, + "num_tokens": 138258275.0, + "step": 1696 + }, + { + "epoch": 0.16953893800889155, + "grad_norm": 0.7832105978906926, + "learning_rate": 9.499213490060853e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.9068560302257538, + "num_tokens": 138339753.0, + "step": 1697 + }, + { + "epoch": 0.16963884309905589, + "grad_norm": 0.7454623607375576, + "learning_rate": 9.498507513280378e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.9081404507160187, + "num_tokens": 138421250.0, + "step": 1698 + }, + { + "epoch": 0.16973874818922025, + "grad_norm": 0.6711310239109205, + "learning_rate": 9.497801065501766e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.905512273311615, + "num_tokens": 138502745.0, + "step": 1699 + }, + { + "epoch": 0.16983865327938458, + "grad_norm": 0.6655111523245894, + "learning_rate": 9.497094146798981e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.9051060974597931, + "num_tokens": 138584225.0, + "step": 1700 + }, + { + "epoch": 0.16993855836954894, + "grad_norm": 0.6594233465532223, + "learning_rate": 9.496386757246041e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.9055745303630829, + "num_tokens": 138665744.0, + "step": 1701 + }, + { + "epoch": 0.17003846345971327, + "grad_norm": 0.6269889253317498, + "learning_rate": 9.495678896917009e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.9061280190944672, + "num_tokens": 138747363.0, + "step": 1702 + }, + { + "epoch": 0.17013836854987763, + "grad_norm": 0.8042208465226721, + "learning_rate": 9.494970565885998e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.9076671898365021, + "num_tokens": 138828930.0, + "step": 1703 + }, + { + "epoch": 0.17023827364004196, + "grad_norm": 0.5705739178591659, + "learning_rate": 9.494261764227172e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.9049564003944397, + "num_tokens": 138910471.0, + "step": 1704 + }, + { + "epoch": 0.1703381787302063, + "grad_norm": 0.708242198686869, + "learning_rate": 9.493552492014743e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.9056335687637329, + "num_tokens": 138992050.0, + "step": 1705 + }, + { + "epoch": 0.17043808382037065, + "grad_norm": 0.73510376786334, + "learning_rate": 9.492842749322972e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.9038724005222321, + "num_tokens": 139073506.0, + "step": 1706 + }, + { + "epoch": 0.17053798891053498, + "grad_norm": 0.5559136187291354, + "learning_rate": 9.492132536226168e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.9096407890319824, + "num_tokens": 139155055.0, + "step": 1707 + }, + { + "epoch": 0.17063789400069934, + "grad_norm": 0.7635143735401003, + "learning_rate": 9.491421852798695e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.9083443880081177, + "num_tokens": 139236615.0, + "step": 1708 + }, + { + "epoch": 0.17073779909086367, + "grad_norm": 0.6536579059780462, + "learning_rate": 9.49071069911496e-06, + "loss": 0.497, + "mean_token_accuracy": 0.9081928133964539, + "num_tokens": 139318161.0, + "step": 1709 + }, + { + "epoch": 0.17083770418102803, + "grad_norm": 0.6515100218252939, + "learning_rate": 9.489999075249422e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.9054655730724335, + "num_tokens": 139399629.0, + "step": 1710 + }, + { + "epoch": 0.17093760927119236, + "grad_norm": 0.6529428127066371, + "learning_rate": 9.48928698127659e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.9063585102558136, + "num_tokens": 139481055.0, + "step": 1711 + }, + { + "epoch": 0.17103751436135672, + "grad_norm": 0.8010367847538394, + "learning_rate": 9.488574417271017e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.9059750139713287, + "num_tokens": 139562532.0, + "step": 1712 + }, + { + "epoch": 0.17113741945152106, + "grad_norm": 0.6226068245720425, + "learning_rate": 9.487861383307312e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.9063255488872528, + "num_tokens": 139643998.0, + "step": 1713 + }, + { + "epoch": 0.1712373245416854, + "grad_norm": 1.015792711863159, + "learning_rate": 9.48714787946013e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.9077224731445312, + "num_tokens": 139725459.0, + "step": 1714 + }, + { + "epoch": 0.17133722963184975, + "grad_norm": 0.6036185262892764, + "learning_rate": 9.486433905804176e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.9047078788280487, + "num_tokens": 139806993.0, + "step": 1715 + }, + { + "epoch": 0.17143713472201408, + "grad_norm": 0.6482574933537849, + "learning_rate": 9.485719462414202e-06, + "loss": 0.503, + "mean_token_accuracy": 0.9046189486980438, + "num_tokens": 139888448.0, + "step": 1716 + }, + { + "epoch": 0.17153703981217844, + "grad_norm": 0.6771205167100642, + "learning_rate": 9.485004549365013e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.9051766991615295, + "num_tokens": 139969873.0, + "step": 1717 + }, + { + "epoch": 0.17163694490234277, + "grad_norm": 0.7012527758990623, + "learning_rate": 9.484289166731461e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.9064660668373108, + "num_tokens": 140051393.0, + "step": 1718 + }, + { + "epoch": 0.17173684999250713, + "grad_norm": 0.7303206230532286, + "learning_rate": 9.483573314588446e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.9061448872089386, + "num_tokens": 140132885.0, + "step": 1719 + }, + { + "epoch": 0.17183675508267146, + "grad_norm": 0.555719748623127, + "learning_rate": 9.482856993010919e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.9058286845684052, + "num_tokens": 140214319.0, + "step": 1720 + }, + { + "epoch": 0.1719366601728358, + "grad_norm": 0.6555476353278641, + "learning_rate": 9.48214020207388e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.9043747782707214, + "num_tokens": 140295812.0, + "step": 1721 + }, + { + "epoch": 0.17203656526300015, + "grad_norm": 1.0137714985082051, + "learning_rate": 9.481422941852376e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.9049481451511383, + "num_tokens": 140377365.0, + "step": 1722 + }, + { + "epoch": 0.17213647035316448, + "grad_norm": 0.9172397573543262, + "learning_rate": 9.480705212421505e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.908511221408844, + "num_tokens": 140458882.0, + "step": 1723 + }, + { + "epoch": 0.17223637544332884, + "grad_norm": 0.6474124871971157, + "learning_rate": 9.479987013856417e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.9079648554325104, + "num_tokens": 140540485.0, + "step": 1724 + }, + { + "epoch": 0.17233628053349317, + "grad_norm": 0.5987377607762829, + "learning_rate": 9.479268346232307e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.9058594703674316, + "num_tokens": 140621826.0, + "step": 1725 + }, + { + "epoch": 0.17243618562365753, + "grad_norm": 0.7917976435556887, + "learning_rate": 9.478549209624417e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.9050836861133575, + "num_tokens": 140703367.0, + "step": 1726 + }, + { + "epoch": 0.17253609071382187, + "grad_norm": 0.6972354808748236, + "learning_rate": 9.477829604108044e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.9068226516246796, + "num_tokens": 140784882.0, + "step": 1727 + }, + { + "epoch": 0.17263599580398623, + "grad_norm": 0.6913452721961985, + "learning_rate": 9.477109529758533e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.9045950770378113, + "num_tokens": 140866460.0, + "step": 1728 + }, + { + "epoch": 0.17273590089415056, + "grad_norm": 0.5425808998311331, + "learning_rate": 9.476388986651272e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.9067234992980957, + "num_tokens": 140947989.0, + "step": 1729 + }, + { + "epoch": 0.1728358059843149, + "grad_norm": 0.7315751536791887, + "learning_rate": 9.475667974861706e-06, + "loss": 0.499, + "mean_token_accuracy": 0.9037464559078217, + "num_tokens": 141029495.0, + "step": 1730 + }, + { + "epoch": 0.17293571107447925, + "grad_norm": 0.5750703284699766, + "learning_rate": 9.474946494465324e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.905285507440567, + "num_tokens": 141111032.0, + "step": 1731 + }, + { + "epoch": 0.17303561616464358, + "grad_norm": 0.6637188131015636, + "learning_rate": 9.474224545537669e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.9088197350502014, + "num_tokens": 141192598.0, + "step": 1732 + }, + { + "epoch": 0.17313552125480794, + "grad_norm": 0.8356693929250603, + "learning_rate": 9.473502128154324e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.9035533964633942, + "num_tokens": 141274111.0, + "step": 1733 + }, + { + "epoch": 0.17323542634497227, + "grad_norm": 0.7186350398970983, + "learning_rate": 9.472779242390932e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.9062967002391815, + "num_tokens": 141355605.0, + "step": 1734 + }, + { + "epoch": 0.17333533143513663, + "grad_norm": 0.5852810223978896, + "learning_rate": 9.472055888323177e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9057603180408478, + "num_tokens": 141437080.0, + "step": 1735 + }, + { + "epoch": 0.17343523652530096, + "grad_norm": 0.6669050778549915, + "learning_rate": 9.471332066026795e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.9049766063690186, + "num_tokens": 141518527.0, + "step": 1736 + }, + { + "epoch": 0.17353514161546532, + "grad_norm": 0.5956044162630671, + "learning_rate": 9.470607775577574e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.9078300595283508, + "num_tokens": 141600055.0, + "step": 1737 + }, + { + "epoch": 0.17363504670562965, + "grad_norm": 2.2469464286824645, + "learning_rate": 9.469883017051345e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.9055946469306946, + "num_tokens": 141681662.0, + "step": 1738 + }, + { + "epoch": 0.17373495179579398, + "grad_norm": 0.5712899781569449, + "learning_rate": 9.46915779052399e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.9082921743392944, + "num_tokens": 141763171.0, + "step": 1739 + }, + { + "epoch": 0.17383485688595834, + "grad_norm": 0.6436783619807589, + "learning_rate": 9.468432096071442e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.9065471887588501, + "num_tokens": 141844704.0, + "step": 1740 + }, + { + "epoch": 0.17393476197612268, + "grad_norm": 0.6524694852044933, + "learning_rate": 9.467705933769685e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.9055320918560028, + "num_tokens": 141926241.0, + "step": 1741 + }, + { + "epoch": 0.17403466706628704, + "grad_norm": 0.540899162557973, + "learning_rate": 9.466979303694743e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.9064542949199677, + "num_tokens": 142007736.0, + "step": 1742 + }, + { + "epoch": 0.17413457215645137, + "grad_norm": 0.5120416307150708, + "learning_rate": 9.4662522059227e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.9098650217056274, + "num_tokens": 142089317.0, + "step": 1743 + }, + { + "epoch": 0.17423447724661573, + "grad_norm": 0.6144251297005497, + "learning_rate": 9.465524640529681e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.906824380159378, + "num_tokens": 142170835.0, + "step": 1744 + }, + { + "epoch": 0.17433438233678006, + "grad_norm": 0.7401996928332254, + "learning_rate": 9.464796607591865e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.908199816942215, + "num_tokens": 142252358.0, + "step": 1745 + }, + { + "epoch": 0.1744342874269444, + "grad_norm": 0.767896053464006, + "learning_rate": 9.464068107185476e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.9084627032279968, + "num_tokens": 142333882.0, + "step": 1746 + }, + { + "epoch": 0.17453419251710875, + "grad_norm": 0.5860201564109586, + "learning_rate": 9.463339139386788e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.9045936167240143, + "num_tokens": 142415416.0, + "step": 1747 + }, + { + "epoch": 0.17463409760727308, + "grad_norm": 0.636985609443802, + "learning_rate": 9.462609704272127e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.906531423330307, + "num_tokens": 142496950.0, + "step": 1748 + }, + { + "epoch": 0.17473400269743744, + "grad_norm": 0.5421034325252269, + "learning_rate": 9.461879801917864e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.9064869582653046, + "num_tokens": 142578447.0, + "step": 1749 + }, + { + "epoch": 0.17483390778760177, + "grad_norm": 0.6359981574509358, + "learning_rate": 9.46114943240042e-06, + "loss": 0.497, + "mean_token_accuracy": 0.9070600271224976, + "num_tokens": 142660024.0, + "step": 1750 + }, + { + "epoch": 0.17493381287776613, + "grad_norm": 0.6809527434855983, + "learning_rate": 9.460418595796268e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.9047248065471649, + "num_tokens": 142741538.0, + "step": 1751 + }, + { + "epoch": 0.17503371796793046, + "grad_norm": 0.5377894426298876, + "learning_rate": 9.459687292181924e-06, + "loss": 0.494, + "mean_token_accuracy": 0.9076709151268005, + "num_tokens": 142823063.0, + "step": 1752 + }, + { + "epoch": 0.17513362305809482, + "grad_norm": 0.6467148973405662, + "learning_rate": 9.45895552163396e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9036359190940857, + "num_tokens": 142904610.0, + "step": 1753 + }, + { + "epoch": 0.17523352814825915, + "grad_norm": 0.5792339687676471, + "learning_rate": 9.45822328422899e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.905040979385376, + "num_tokens": 142986065.0, + "step": 1754 + }, + { + "epoch": 0.1753334332384235, + "grad_norm": 0.7149340443759724, + "learning_rate": 9.45749058004368e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.9059952795505524, + "num_tokens": 143067609.0, + "step": 1755 + }, + { + "epoch": 0.17543333832858785, + "grad_norm": 0.5368158934016145, + "learning_rate": 9.456757409154747e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.9072895348072052, + "num_tokens": 143149136.0, + "step": 1756 + }, + { + "epoch": 0.17553324341875218, + "grad_norm": 0.5582015475224166, + "learning_rate": 9.456023771638953e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.9065609574317932, + "num_tokens": 143230704.0, + "step": 1757 + }, + { + "epoch": 0.17563314850891654, + "grad_norm": 0.510505721150699, + "learning_rate": 9.45528966757311e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.9070689678192139, + "num_tokens": 143312178.0, + "step": 1758 + }, + { + "epoch": 0.17573305359908087, + "grad_norm": 0.577989659584799, + "learning_rate": 9.454555097034081e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.9078423380851746, + "num_tokens": 143393750.0, + "step": 1759 + }, + { + "epoch": 0.17583295868924523, + "grad_norm": 0.5979707595565923, + "learning_rate": 9.453820060098777e-06, + "loss": 0.495, + "mean_token_accuracy": 0.9070585370063782, + "num_tokens": 143475319.0, + "step": 1760 + }, + { + "epoch": 0.17593286377940956, + "grad_norm": 0.5701983037020547, + "learning_rate": 9.453084556844154e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.908530592918396, + "num_tokens": 143556968.0, + "step": 1761 + }, + { + "epoch": 0.1760327688695739, + "grad_norm": 1.5409166640520564, + "learning_rate": 9.452348587347224e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.907105565071106, + "num_tokens": 143638499.0, + "step": 1762 + }, + { + "epoch": 0.17613267395973825, + "grad_norm": 0.6003299845841689, + "learning_rate": 9.45161215168504e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.9056979119777679, + "num_tokens": 143719953.0, + "step": 1763 + }, + { + "epoch": 0.17623257904990258, + "grad_norm": 0.5969536773219375, + "learning_rate": 9.450875249934708e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.9070693254470825, + "num_tokens": 143801574.0, + "step": 1764 + }, + { + "epoch": 0.17633248414006694, + "grad_norm": 0.8191169692269189, + "learning_rate": 9.450137882173385e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.9017173051834106, + "num_tokens": 143883146.0, + "step": 1765 + }, + { + "epoch": 0.17643238923023127, + "grad_norm": 0.7450991237367286, + "learning_rate": 9.44940004847827e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9068742394447327, + "num_tokens": 143964698.0, + "step": 1766 + }, + { + "epoch": 0.17653229432039563, + "grad_norm": 0.5752589493171808, + "learning_rate": 9.44866174892662e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.9097482860088348, + "num_tokens": 144046314.0, + "step": 1767 + }, + { + "epoch": 0.17663219941055996, + "grad_norm": 0.5562330181717334, + "learning_rate": 9.44792298359573e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.9043392241001129, + "num_tokens": 144127764.0, + "step": 1768 + }, + { + "epoch": 0.17673210450072432, + "grad_norm": 2.3308382163755628, + "learning_rate": 9.447183752562954e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.9057184755802155, + "num_tokens": 144209285.0, + "step": 1769 + }, + { + "epoch": 0.17683200959088866, + "grad_norm": 0.990668404080946, + "learning_rate": 9.446444055905691e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.9069260954856873, + "num_tokens": 144290886.0, + "step": 1770 + }, + { + "epoch": 0.176931914681053, + "grad_norm": 0.534287221701527, + "learning_rate": 9.445703893701383e-06, + "loss": 0.495, + "mean_token_accuracy": 0.9066377878189087, + "num_tokens": 144372421.0, + "step": 1771 + }, + { + "epoch": 0.17703181977121735, + "grad_norm": 0.625986923488721, + "learning_rate": 9.444963266027528e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.9038734436035156, + "num_tokens": 144453864.0, + "step": 1772 + }, + { + "epoch": 0.17713172486138168, + "grad_norm": 0.6475885149564689, + "learning_rate": 9.444222172961672e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.9078574478626251, + "num_tokens": 144535394.0, + "step": 1773 + }, + { + "epoch": 0.17723162995154604, + "grad_norm": 0.605396038341504, + "learning_rate": 9.443480614581406e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.9099165201187134, + "num_tokens": 144616910.0, + "step": 1774 + }, + { + "epoch": 0.17733153504171037, + "grad_norm": 0.5748442298882398, + "learning_rate": 9.442738590964373e-06, + "loss": 0.494, + "mean_token_accuracy": 0.9071542322635651, + "num_tokens": 144698453.0, + "step": 1775 + }, + { + "epoch": 0.17743144013187473, + "grad_norm": 0.5923754638683226, + "learning_rate": 9.441996102188265e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.9050085544586182, + "num_tokens": 144779926.0, + "step": 1776 + }, + { + "epoch": 0.17753134522203906, + "grad_norm": 0.766114266286888, + "learning_rate": 9.441253148330818e-06, + "loss": 0.5, + "mean_token_accuracy": 0.9044447243213654, + "num_tokens": 144861480.0, + "step": 1777 + }, + { + "epoch": 0.17763125031220342, + "grad_norm": 0.5795794190525863, + "learning_rate": 9.440509729469823e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.9085617363452911, + "num_tokens": 144943004.0, + "step": 1778 + }, + { + "epoch": 0.17773115540236775, + "grad_norm": 0.561447456336714, + "learning_rate": 9.439765845683114e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.9054165482521057, + "num_tokens": 145024520.0, + "step": 1779 + }, + { + "epoch": 0.17783106049253208, + "grad_norm": 0.7684221277602412, + "learning_rate": 9.439021497048577e-06, + "loss": 0.495, + "mean_token_accuracy": 0.9064006507396698, + "num_tokens": 145106078.0, + "step": 1780 + }, + { + "epoch": 0.17793096558269644, + "grad_norm": 0.6061408794655894, + "learning_rate": 9.43827668364415e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.9072589576244354, + "num_tokens": 145187559.0, + "step": 1781 + }, + { + "epoch": 0.17803087067286077, + "grad_norm": 0.8978542918082645, + "learning_rate": 9.43753140554781e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.903773307800293, + "num_tokens": 145269016.0, + "step": 1782 + }, + { + "epoch": 0.17813077576302513, + "grad_norm": 0.618752298846214, + "learning_rate": 9.436785662837591e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.9052964150905609, + "num_tokens": 145350619.0, + "step": 1783 + }, + { + "epoch": 0.17823068085318947, + "grad_norm": 1.532461736075548, + "learning_rate": 9.436039455591574e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.9038354456424713, + "num_tokens": 145432134.0, + "step": 1784 + }, + { + "epoch": 0.17833058594335383, + "grad_norm": 0.6746200999822892, + "learning_rate": 9.435292783887885e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.9036101400852203, + "num_tokens": 145513662.0, + "step": 1785 + }, + { + "epoch": 0.17843049103351816, + "grad_norm": 0.5293427265080547, + "learning_rate": 9.434545647804703e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.9064742922782898, + "num_tokens": 145595224.0, + "step": 1786 + }, + { + "epoch": 0.1785303961236825, + "grad_norm": 0.5522405675225005, + "learning_rate": 9.433798047420256e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.9077882766723633, + "num_tokens": 145676747.0, + "step": 1787 + }, + { + "epoch": 0.17863030121384685, + "grad_norm": 0.703285968982105, + "learning_rate": 9.433049982812813e-06, + "loss": 0.495, + "mean_token_accuracy": 0.9076388478279114, + "num_tokens": 145758256.0, + "step": 1788 + }, + { + "epoch": 0.17873020630401118, + "grad_norm": 0.7797719157448378, + "learning_rate": 9.432301454060702e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.9056485593318939, + "num_tokens": 145839751.0, + "step": 1789 + }, + { + "epoch": 0.17883011139417554, + "grad_norm": 0.6514327884152066, + "learning_rate": 9.431552461242291e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.9093227386474609, + "num_tokens": 145921225.0, + "step": 1790 + }, + { + "epoch": 0.17893001648433987, + "grad_norm": 0.7497722028959398, + "learning_rate": 9.430803004436004e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.9068645238876343, + "num_tokens": 146002754.0, + "step": 1791 + }, + { + "epoch": 0.17902992157450423, + "grad_norm": 0.6964913742143148, + "learning_rate": 9.430053083720307e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.9075047671794891, + "num_tokens": 146084262.0, + "step": 1792 + }, + { + "epoch": 0.17912982666466856, + "grad_norm": 0.7566091349566237, + "learning_rate": 9.429302699173719e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.9066683948040009, + "num_tokens": 146165744.0, + "step": 1793 + }, + { + "epoch": 0.17922973175483292, + "grad_norm": 2.442938760173653, + "learning_rate": 9.428551850874805e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.9039825797080994, + "num_tokens": 146247218.0, + "step": 1794 + }, + { + "epoch": 0.17932963684499725, + "grad_norm": 0.6658313686454721, + "learning_rate": 9.42780053890218e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.9035297632217407, + "num_tokens": 146328696.0, + "step": 1795 + }, + { + "epoch": 0.17942954193516159, + "grad_norm": 0.8870666004485671, + "learning_rate": 9.427048763334507e-06, + "loss": 0.496, + "mean_token_accuracy": 0.9047392308712006, + "num_tokens": 146410242.0, + "step": 1796 + }, + { + "epoch": 0.17952944702532594, + "grad_norm": 0.7666291200320158, + "learning_rate": 9.426296524250498e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.9080179631710052, + "num_tokens": 146491729.0, + "step": 1797 + }, + { + "epoch": 0.17962935211549028, + "grad_norm": 0.7253441883679714, + "learning_rate": 9.425543821728913e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.9072741866111755, + "num_tokens": 146573272.0, + "step": 1798 + }, + { + "epoch": 0.17972925720565464, + "grad_norm": 0.6299981481560498, + "learning_rate": 9.42479065584856e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.9065729677677155, + "num_tokens": 146654792.0, + "step": 1799 + }, + { + "epoch": 0.17982916229581897, + "grad_norm": 0.5806397571144298, + "learning_rate": 9.424037026688298e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.9087846875190735, + "num_tokens": 146736407.0, + "step": 1800 + }, + { + "epoch": 0.17992906738598333, + "grad_norm": 0.620708164711908, + "learning_rate": 9.42328293432703e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.9083366096019745, + "num_tokens": 146817894.0, + "step": 1801 + }, + { + "epoch": 0.18002897247614766, + "grad_norm": 0.5990170357876309, + "learning_rate": 9.422528378843714e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.9088283777236938, + "num_tokens": 146899397.0, + "step": 1802 + }, + { + "epoch": 0.180128877566312, + "grad_norm": 0.6661483690919874, + "learning_rate": 9.421773360317348e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9054577648639679, + "num_tokens": 146980905.0, + "step": 1803 + }, + { + "epoch": 0.18022878265647635, + "grad_norm": 0.777275979270076, + "learning_rate": 9.421017878826986e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.9073084592819214, + "num_tokens": 147062397.0, + "step": 1804 + }, + { + "epoch": 0.18032868774664068, + "grad_norm": 0.5184159198890159, + "learning_rate": 9.420261934451728e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.9074543416500092, + "num_tokens": 147143981.0, + "step": 1805 + }, + { + "epoch": 0.18042859283680504, + "grad_norm": 0.7530074746959624, + "learning_rate": 9.41950552727072e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.9093757569789886, + "num_tokens": 147225583.0, + "step": 1806 + }, + { + "epoch": 0.18052849792696937, + "grad_norm": 0.735064382755914, + "learning_rate": 9.418748657363161e-06, + "loss": 0.495, + "mean_token_accuracy": 0.907323569059372, + "num_tokens": 147307088.0, + "step": 1807 + }, + { + "epoch": 0.18062840301713373, + "grad_norm": 0.6655781962967461, + "learning_rate": 9.417991324808296e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.9059262871742249, + "num_tokens": 147388669.0, + "step": 1808 + }, + { + "epoch": 0.18072830810729806, + "grad_norm": 0.5710897050968741, + "learning_rate": 9.417233529685417e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.906640350818634, + "num_tokens": 147470169.0, + "step": 1809 + }, + { + "epoch": 0.18082821319746242, + "grad_norm": 0.7243668716221684, + "learning_rate": 9.416475272073864e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.9064674973487854, + "num_tokens": 147551594.0, + "step": 1810 + }, + { + "epoch": 0.18092811828762675, + "grad_norm": 0.6660671849012297, + "learning_rate": 9.415716552053031e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.905811220407486, + "num_tokens": 147633050.0, + "step": 1811 + }, + { + "epoch": 0.1810280233777911, + "grad_norm": 0.5527553387029088, + "learning_rate": 9.414957369702356e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.9086842834949493, + "num_tokens": 147714593.0, + "step": 1812 + }, + { + "epoch": 0.18112792846795545, + "grad_norm": 0.695816183485607, + "learning_rate": 9.414197725101327e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.9055016338825226, + "num_tokens": 147796066.0, + "step": 1813 + }, + { + "epoch": 0.18122783355811978, + "grad_norm": 0.6079377492501112, + "learning_rate": 9.413437618329476e-06, + "loss": 0.494, + "mean_token_accuracy": 0.9077562987804413, + "num_tokens": 147877574.0, + "step": 1814 + }, + { + "epoch": 0.18132773864828414, + "grad_norm": 0.876588769275818, + "learning_rate": 9.412677049466388e-06, + "loss": 0.496, + "mean_token_accuracy": 0.9070605337619781, + "num_tokens": 147959099.0, + "step": 1815 + }, + { + "epoch": 0.18142764373844847, + "grad_norm": 0.6112215243636038, + "learning_rate": 9.411916018591696e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.9050488471984863, + "num_tokens": 148040629.0, + "step": 1816 + }, + { + "epoch": 0.18152754882861283, + "grad_norm": 0.7133636959693705, + "learning_rate": 9.411154525785082e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.9059079885482788, + "num_tokens": 148122112.0, + "step": 1817 + }, + { + "epoch": 0.18162745391877716, + "grad_norm": 0.685000879286188, + "learning_rate": 9.410392571126275e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.9075573980808258, + "num_tokens": 148203595.0, + "step": 1818 + }, + { + "epoch": 0.18172735900894152, + "grad_norm": 0.7428254642323121, + "learning_rate": 9.40963015469505e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.9084612727165222, + "num_tokens": 148285142.0, + "step": 1819 + }, + { + "epoch": 0.18182726409910585, + "grad_norm": 0.644867875018014, + "learning_rate": 9.408867276571235e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.9078057706356049, + "num_tokens": 148366763.0, + "step": 1820 + }, + { + "epoch": 0.18192716918927018, + "grad_norm": 0.6895690087845745, + "learning_rate": 9.408103936834703e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.9061707258224487, + "num_tokens": 148448248.0, + "step": 1821 + }, + { + "epoch": 0.18202707427943454, + "grad_norm": 0.632026673622121, + "learning_rate": 9.407340135565375e-06, + "loss": 0.499, + "mean_token_accuracy": 0.9051485955715179, + "num_tokens": 148529710.0, + "step": 1822 + }, + { + "epoch": 0.18212697936959887, + "grad_norm": 0.563999970385958, + "learning_rate": 9.406575872843224e-06, + "loss": 0.492, + "mean_token_accuracy": 0.9071738421916962, + "num_tokens": 148611268.0, + "step": 1823 + }, + { + "epoch": 0.18222688445976323, + "grad_norm": 0.6691387174729496, + "learning_rate": 9.40581114874827e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.9070113897323608, + "num_tokens": 148692844.0, + "step": 1824 + }, + { + "epoch": 0.18232678954992756, + "grad_norm": 0.6665656655613511, + "learning_rate": 9.405045963360577e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.9051952958106995, + "num_tokens": 148774289.0, + "step": 1825 + }, + { + "epoch": 0.18242669464009192, + "grad_norm": 0.6016884107187881, + "learning_rate": 9.404280316760264e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.9077471792697906, + "num_tokens": 148855905.0, + "step": 1826 + }, + { + "epoch": 0.18252659973025626, + "grad_norm": 0.6380261561423163, + "learning_rate": 9.403514209027491e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.9059885740280151, + "num_tokens": 148937418.0, + "step": 1827 + }, + { + "epoch": 0.1826265048204206, + "grad_norm": 0.8899641736147691, + "learning_rate": 9.402747640242475e-06, + "loss": 0.498, + "mean_token_accuracy": 0.9048977196216583, + "num_tokens": 149018946.0, + "step": 1828 + }, + { + "epoch": 0.18272640991058495, + "grad_norm": 0.7691915893514212, + "learning_rate": 9.401980610485472e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.9051443338394165, + "num_tokens": 149100525.0, + "step": 1829 + }, + { + "epoch": 0.18282631500074928, + "grad_norm": 0.7055843976075137, + "learning_rate": 9.401213119836795e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.9047654867172241, + "num_tokens": 149182092.0, + "step": 1830 + }, + { + "epoch": 0.18292622009091364, + "grad_norm": 0.5568524504932538, + "learning_rate": 9.400445168376798e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.9020039141178131, + "num_tokens": 149263513.0, + "step": 1831 + }, + { + "epoch": 0.18302612518107797, + "grad_norm": 0.5697155417555548, + "learning_rate": 9.399676756185887e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.906879723072052, + "num_tokens": 149344992.0, + "step": 1832 + }, + { + "epoch": 0.18312603027124233, + "grad_norm": 1.004769190651812, + "learning_rate": 9.398907883344514e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.9046063721179962, + "num_tokens": 149426431.0, + "step": 1833 + }, + { + "epoch": 0.18322593536140666, + "grad_norm": 0.5574849802774166, + "learning_rate": 9.398138549933184e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.9053680300712585, + "num_tokens": 149507904.0, + "step": 1834 + }, + { + "epoch": 0.18332584045157102, + "grad_norm": 0.6602448219458568, + "learning_rate": 9.397368756032445e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.9068464636802673, + "num_tokens": 149589304.0, + "step": 1835 + }, + { + "epoch": 0.18342574554173535, + "grad_norm": 1.0630472977693872, + "learning_rate": 9.396598501722897e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.9051843881607056, + "num_tokens": 149670821.0, + "step": 1836 + }, + { + "epoch": 0.18352565063189968, + "grad_norm": 0.46512392067890584, + "learning_rate": 9.395827787085183e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.9085666537284851, + "num_tokens": 149752441.0, + "step": 1837 + }, + { + "epoch": 0.18362555572206404, + "grad_norm": 0.5747548083770989, + "learning_rate": 9.3950566122e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.9077953696250916, + "num_tokens": 149833961.0, + "step": 1838 + }, + { + "epoch": 0.18372546081222837, + "grad_norm": 0.5779760857845674, + "learning_rate": 9.394284977148091e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.9045862853527069, + "num_tokens": 149915462.0, + "step": 1839 + }, + { + "epoch": 0.18382536590239273, + "grad_norm": 4.728409160703537, + "learning_rate": 9.393512882010246e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.9072144627571106, + "num_tokens": 149997034.0, + "step": 1840 + }, + { + "epoch": 0.18392527099255707, + "grad_norm": 0.7262939181225458, + "learning_rate": 9.392740326867304e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.90680792927742, + "num_tokens": 150078637.0, + "step": 1841 + }, + { + "epoch": 0.18402517608272143, + "grad_norm": 0.5268643399203243, + "learning_rate": 9.391967311800154e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.9043060839176178, + "num_tokens": 150160117.0, + "step": 1842 + }, + { + "epoch": 0.18412508117288576, + "grad_norm": 0.6144579117740924, + "learning_rate": 9.391193836889728e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.9075203239917755, + "num_tokens": 150241572.0, + "step": 1843 + }, + { + "epoch": 0.1842249862630501, + "grad_norm": 0.9192394014967535, + "learning_rate": 9.390419902217011e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.9049548208713531, + "num_tokens": 150323125.0, + "step": 1844 + }, + { + "epoch": 0.18432489135321445, + "grad_norm": 0.5182581378075679, + "learning_rate": 9.389645507863036e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.903961181640625, + "num_tokens": 150404560.0, + "step": 1845 + }, + { + "epoch": 0.18442479644337878, + "grad_norm": 0.8931080457504863, + "learning_rate": 9.388870653908883e-06, + "loss": 0.497, + "mean_token_accuracy": 0.9060675799846649, + "num_tokens": 150486123.0, + "step": 1846 + }, + { + "epoch": 0.18452470153354314, + "grad_norm": 0.6184544253927139, + "learning_rate": 9.38809534043568e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.9037537276744843, + "num_tokens": 150567631.0, + "step": 1847 + }, + { + "epoch": 0.18462460662370747, + "grad_norm": 0.9902575658096878, + "learning_rate": 9.387319567524602e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.9049323797225952, + "num_tokens": 150649079.0, + "step": 1848 + }, + { + "epoch": 0.18472451171387183, + "grad_norm": 0.7680669070424645, + "learning_rate": 9.38654333525687e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.9038632810115814, + "num_tokens": 150730567.0, + "step": 1849 + }, + { + "epoch": 0.18482441680403616, + "grad_norm": 0.6070974367600969, + "learning_rate": 9.385766643713764e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.9054605662822723, + "num_tokens": 150812110.0, + "step": 1850 + }, + { + "epoch": 0.18492432189420052, + "grad_norm": 0.5988437782988797, + "learning_rate": 9.384989492976598e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.9080241322517395, + "num_tokens": 150893762.0, + "step": 1851 + }, + { + "epoch": 0.18502422698436485, + "grad_norm": 0.9161056285924847, + "learning_rate": 9.384211883126741e-06, + "loss": 0.512, + "mean_token_accuracy": 0.9031859636306763, + "num_tokens": 150975194.0, + "step": 1852 + }, + { + "epoch": 0.18512413207452919, + "grad_norm": 0.550870239135149, + "learning_rate": 9.383433814245612e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.9038309156894684, + "num_tokens": 151056732.0, + "step": 1853 + }, + { + "epoch": 0.18522403716469354, + "grad_norm": 25.643130763085097, + "learning_rate": 9.382655286414677e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.906653642654419, + "num_tokens": 151138298.0, + "step": 1854 + }, + { + "epoch": 0.18532394225485788, + "grad_norm": 0.6937640180718597, + "learning_rate": 9.381876299715444e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.9076639115810394, + "num_tokens": 151219788.0, + "step": 1855 + }, + { + "epoch": 0.18542384734502224, + "grad_norm": 1.101486245704459, + "learning_rate": 9.381096854229476e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.9072220325469971, + "num_tokens": 151301348.0, + "step": 1856 + }, + { + "epoch": 0.18552375243518657, + "grad_norm": 0.5925721549909008, + "learning_rate": 9.380316950038382e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.9063311815261841, + "num_tokens": 151382804.0, + "step": 1857 + }, + { + "epoch": 0.18562365752535093, + "grad_norm": 0.5584928141583437, + "learning_rate": 9.379536587223818e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.906413197517395, + "num_tokens": 151464297.0, + "step": 1858 + }, + { + "epoch": 0.18572356261551526, + "grad_norm": 0.5268005623653027, + "learning_rate": 9.378755765867488e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.9064567387104034, + "num_tokens": 151545847.0, + "step": 1859 + }, + { + "epoch": 0.18582346770567962, + "grad_norm": 0.7274770571227946, + "learning_rate": 9.377974486051149e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.9067384302616119, + "num_tokens": 151627441.0, + "step": 1860 + }, + { + "epoch": 0.18592337279584395, + "grad_norm": 0.6667251806933947, + "learning_rate": 9.377192747856596e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.9069032669067383, + "num_tokens": 151708975.0, + "step": 1861 + }, + { + "epoch": 0.18602327788600828, + "grad_norm": 0.6008488954110265, + "learning_rate": 9.37641055136568e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.905929684638977, + "num_tokens": 151790473.0, + "step": 1862 + }, + { + "epoch": 0.18612318297617264, + "grad_norm": 0.5359141804104034, + "learning_rate": 9.375627896660299e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.9094624817371368, + "num_tokens": 151872074.0, + "step": 1863 + }, + { + "epoch": 0.18622308806633697, + "grad_norm": 0.7347144078377553, + "learning_rate": 9.374844783822396e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.9077342748641968, + "num_tokens": 151953659.0, + "step": 1864 + }, + { + "epoch": 0.18632299315650133, + "grad_norm": 0.6923608481848772, + "learning_rate": 9.374061212933965e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.9048996567726135, + "num_tokens": 152035259.0, + "step": 1865 + }, + { + "epoch": 0.18642289824666566, + "grad_norm": 0.5659484212201332, + "learning_rate": 9.373277184077047e-06, + "loss": 0.494, + "mean_token_accuracy": 0.9066562056541443, + "num_tokens": 152116771.0, + "step": 1866 + }, + { + "epoch": 0.18652280333683002, + "grad_norm": 1.0026526475538176, + "learning_rate": 9.372492697333728e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.9077391624450684, + "num_tokens": 152198226.0, + "step": 1867 + }, + { + "epoch": 0.18662270842699435, + "grad_norm": 0.6402740786348927, + "learning_rate": 9.371707752786147e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.9054982960224152, + "num_tokens": 152279739.0, + "step": 1868 + }, + { + "epoch": 0.1867226135171587, + "grad_norm": 0.6641328903492671, + "learning_rate": 9.370922350516486e-06, + "loss": 0.5, + "mean_token_accuracy": 0.907525509595871, + "num_tokens": 152361229.0, + "step": 1869 + }, + { + "epoch": 0.18682251860732305, + "grad_norm": 0.6683133542682779, + "learning_rate": 9.370136490606982e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.9087464511394501, + "num_tokens": 152442716.0, + "step": 1870 + }, + { + "epoch": 0.18692242369748738, + "grad_norm": 0.7420315419663195, + "learning_rate": 9.369350173139911e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.9086171984672546, + "num_tokens": 152524189.0, + "step": 1871 + }, + { + "epoch": 0.18702232878765174, + "grad_norm": 1.008260836341292, + "learning_rate": 9.368563398197603e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.9068146646022797, + "num_tokens": 152605759.0, + "step": 1872 + }, + { + "epoch": 0.18712223387781607, + "grad_norm": 0.6098671532294075, + "learning_rate": 9.367776165862434e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.9058589339256287, + "num_tokens": 152687100.0, + "step": 1873 + }, + { + "epoch": 0.18722213896798043, + "grad_norm": 0.7905671018110804, + "learning_rate": 9.366988476216826e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.9082761108875275, + "num_tokens": 152768575.0, + "step": 1874 + }, + { + "epoch": 0.18732204405814476, + "grad_norm": 0.6371846921076927, + "learning_rate": 9.366200329343254e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.9068490862846375, + "num_tokens": 152850136.0, + "step": 1875 + }, + { + "epoch": 0.18742194914830912, + "grad_norm": 0.6748885434800194, + "learning_rate": 9.365411725324237e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.9077376127243042, + "num_tokens": 152931675.0, + "step": 1876 + }, + { + "epoch": 0.18752185423847345, + "grad_norm": 0.7956515186418317, + "learning_rate": 9.36462266424234e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.9054464399814606, + "num_tokens": 153013206.0, + "step": 1877 + }, + { + "epoch": 0.18762175932863778, + "grad_norm": 0.8727415245434239, + "learning_rate": 9.36383314618018e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.9057677090167999, + "num_tokens": 153094655.0, + "step": 1878 + }, + { + "epoch": 0.18772166441880214, + "grad_norm": 0.5781313970173502, + "learning_rate": 9.363043171220423e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.9076172709465027, + "num_tokens": 153176171.0, + "step": 1879 + }, + { + "epoch": 0.18782156950896647, + "grad_norm": 0.6100602057404353, + "learning_rate": 9.362252739445776e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.9062903225421906, + "num_tokens": 153257616.0, + "step": 1880 + }, + { + "epoch": 0.18792147459913083, + "grad_norm": 0.7512452308785703, + "learning_rate": 9.361461850938999e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.9045843482017517, + "num_tokens": 153339137.0, + "step": 1881 + }, + { + "epoch": 0.18802137968929516, + "grad_norm": 0.5431705308190949, + "learning_rate": 9.360670505782903e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.9104193449020386, + "num_tokens": 153420677.0, + "step": 1882 + }, + { + "epoch": 0.18812128477945952, + "grad_norm": 0.6699526404557308, + "learning_rate": 9.359878704060336e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.9052842259407043, + "num_tokens": 153502256.0, + "step": 1883 + }, + { + "epoch": 0.18822118986962386, + "grad_norm": 0.6537991639613319, + "learning_rate": 9.359086445854206e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.905498206615448, + "num_tokens": 153583822.0, + "step": 1884 + }, + { + "epoch": 0.1883210949597882, + "grad_norm": 0.6705097291846488, + "learning_rate": 9.358293731247459e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.90718874335289, + "num_tokens": 153665358.0, + "step": 1885 + }, + { + "epoch": 0.18842100004995255, + "grad_norm": 1.0818577453345157, + "learning_rate": 9.357500560323096e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.9073361158370972, + "num_tokens": 153746905.0, + "step": 1886 + }, + { + "epoch": 0.18852090514011688, + "grad_norm": 0.7517001979448388, + "learning_rate": 9.356706933164161e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.905774861574173, + "num_tokens": 153828345.0, + "step": 1887 + }, + { + "epoch": 0.18862081023028124, + "grad_norm": 0.6763994266447125, + "learning_rate": 9.355912849853747e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.906467467546463, + "num_tokens": 153909896.0, + "step": 1888 + }, + { + "epoch": 0.18872071532044557, + "grad_norm": 0.7707848313557039, + "learning_rate": 9.355118310475e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.906059741973877, + "num_tokens": 153991419.0, + "step": 1889 + }, + { + "epoch": 0.18882062041060993, + "grad_norm": 0.6260080822538819, + "learning_rate": 9.354323315111102e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.906522810459137, + "num_tokens": 154072945.0, + "step": 1890 + }, + { + "epoch": 0.18892052550077426, + "grad_norm": 0.6019454952043735, + "learning_rate": 9.353527863845296e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.9096284508705139, + "num_tokens": 154154425.0, + "step": 1891 + }, + { + "epoch": 0.18902043059093862, + "grad_norm": 0.5197592198675104, + "learning_rate": 9.35273195676086e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.9086027443408966, + "num_tokens": 154235943.0, + "step": 1892 + }, + { + "epoch": 0.18912033568110295, + "grad_norm": 0.58243755391287, + "learning_rate": 9.351935593941134e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.9063893258571625, + "num_tokens": 154317531.0, + "step": 1893 + }, + { + "epoch": 0.18922024077126728, + "grad_norm": 0.6376198313381373, + "learning_rate": 9.351138775469493e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.9064420461654663, + "num_tokens": 154399004.0, + "step": 1894 + }, + { + "epoch": 0.18932014586143164, + "grad_norm": 0.8677756715128482, + "learning_rate": 9.350341501429366e-06, + "loss": 0.495, + "mean_token_accuracy": 0.9094602167606354, + "num_tokens": 154480515.0, + "step": 1895 + }, + { + "epoch": 0.18942005095159598, + "grad_norm": 0.8378475261788272, + "learning_rate": 9.349543771904225e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.9059796333312988, + "num_tokens": 154562067.0, + "step": 1896 + }, + { + "epoch": 0.18951995604176033, + "grad_norm": 0.6572056912801416, + "learning_rate": 9.348745586977599e-06, + "loss": 0.497, + "mean_token_accuracy": 0.9084439873695374, + "num_tokens": 154643546.0, + "step": 1897 + }, + { + "epoch": 0.18961986113192467, + "grad_norm": 1.0295798289421034, + "learning_rate": 9.347946946733055e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.9059096574783325, + "num_tokens": 154725092.0, + "step": 1898 + }, + { + "epoch": 0.18971976622208903, + "grad_norm": 0.6399693859117991, + "learning_rate": 9.347147851254213e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.9074902534484863, + "num_tokens": 154806675.0, + "step": 1899 + }, + { + "epoch": 0.18981967131225336, + "grad_norm": 0.799715286111481, + "learning_rate": 9.34634830062474e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.9083117842674255, + "num_tokens": 154888229.0, + "step": 1900 + }, + { + "epoch": 0.18991957640241772, + "grad_norm": 0.7642275511764056, + "learning_rate": 9.345548294928344e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.9075333774089813, + "num_tokens": 154969739.0, + "step": 1901 + }, + { + "epoch": 0.19001948149258205, + "grad_norm": 0.8688804103641944, + "learning_rate": 9.344747834248793e-06, + "loss": 0.487, + "mean_token_accuracy": 0.9084867835044861, + "num_tokens": 155051394.0, + "step": 1902 + }, + { + "epoch": 0.19011938658274638, + "grad_norm": 0.6449683193917508, + "learning_rate": 9.343946918669893e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.9055069386959076, + "num_tokens": 155132943.0, + "step": 1903 + }, + { + "epoch": 0.19021929167291074, + "grad_norm": 0.6064982412390445, + "learning_rate": 9.343145548275503e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.9072734415531158, + "num_tokens": 155214387.0, + "step": 1904 + }, + { + "epoch": 0.19031919676307507, + "grad_norm": 0.7427587558374888, + "learning_rate": 9.342343723149523e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.9062856733798981, + "num_tokens": 155295881.0, + "step": 1905 + }, + { + "epoch": 0.19041910185323943, + "grad_norm": 0.671026184197625, + "learning_rate": 9.341541443375907e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.9058356583118439, + "num_tokens": 155377305.0, + "step": 1906 + }, + { + "epoch": 0.19051900694340376, + "grad_norm": 0.9030676967088915, + "learning_rate": 9.340738709038657e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.9035574197769165, + "num_tokens": 155458825.0, + "step": 1907 + }, + { + "epoch": 0.19061891203356812, + "grad_norm": 0.8858020038305547, + "learning_rate": 9.339935520221816e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.9048610627651215, + "num_tokens": 155540284.0, + "step": 1908 + }, + { + "epoch": 0.19071881712373245, + "grad_norm": 0.722211498009958, + "learning_rate": 9.339131877009482e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.907678097486496, + "num_tokens": 155621797.0, + "step": 1909 + }, + { + "epoch": 0.19081872221389679, + "grad_norm": 0.7547431425184179, + "learning_rate": 9.338327779485794e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9061680734157562, + "num_tokens": 155703290.0, + "step": 1910 + }, + { + "epoch": 0.19091862730406114, + "grad_norm": 0.6083448303119237, + "learning_rate": 9.337523227734945e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.9069973826408386, + "num_tokens": 155784854.0, + "step": 1911 + }, + { + "epoch": 0.19101853239422548, + "grad_norm": 0.6051695954922574, + "learning_rate": 9.33671822184117e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.9050439298152924, + "num_tokens": 155866333.0, + "step": 1912 + }, + { + "epoch": 0.19111843748438984, + "grad_norm": 0.6216028791509561, + "learning_rate": 9.335912761888754e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.9069182872772217, + "num_tokens": 155947793.0, + "step": 1913 + }, + { + "epoch": 0.19121834257455417, + "grad_norm": 0.6567695782261141, + "learning_rate": 9.335106847962032e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.9072941839694977, + "num_tokens": 156029230.0, + "step": 1914 + }, + { + "epoch": 0.19131824766471853, + "grad_norm": 0.6621806150453671, + "learning_rate": 9.334300480145381e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.9049394726753235, + "num_tokens": 156110727.0, + "step": 1915 + }, + { + "epoch": 0.19141815275488286, + "grad_norm": 0.7416694038673707, + "learning_rate": 9.333493658523231e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.9072648286819458, + "num_tokens": 156192163.0, + "step": 1916 + }, + { + "epoch": 0.19151805784504722, + "grad_norm": 0.6425308759809761, + "learning_rate": 9.332686383180055e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.9057368338108063, + "num_tokens": 156273719.0, + "step": 1917 + }, + { + "epoch": 0.19161796293521155, + "grad_norm": 0.6054931266641985, + "learning_rate": 9.331878654200377e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.9078653156757355, + "num_tokens": 156355191.0, + "step": 1918 + }, + { + "epoch": 0.19171786802537588, + "grad_norm": 0.7612258989264458, + "learning_rate": 9.331070471668764e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.9055874347686768, + "num_tokens": 156436680.0, + "step": 1919 + }, + { + "epoch": 0.19181777311554024, + "grad_norm": 0.7385976875012344, + "learning_rate": 9.330261835669839e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.9070037007331848, + "num_tokens": 156518203.0, + "step": 1920 + }, + { + "epoch": 0.19191767820570457, + "grad_norm": 0.7537980246711535, + "learning_rate": 9.329452746288261e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.9073029458522797, + "num_tokens": 156599714.0, + "step": 1921 + }, + { + "epoch": 0.19201758329586893, + "grad_norm": 0.5531805309648192, + "learning_rate": 9.328643203608747e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.9082764685153961, + "num_tokens": 156681396.0, + "step": 1922 + }, + { + "epoch": 0.19211748838603326, + "grad_norm": 0.5690778779224109, + "learning_rate": 9.327833207716053e-06, + "loss": 0.497, + "mean_token_accuracy": 0.9077522158622742, + "num_tokens": 156762901.0, + "step": 1923 + }, + { + "epoch": 0.19221739347619762, + "grad_norm": 0.8595541612592623, + "learning_rate": 9.327022758694991e-06, + "loss": 0.493, + "mean_token_accuracy": 0.9093089699745178, + "num_tokens": 156844455.0, + "step": 1924 + }, + { + "epoch": 0.19231729856636195, + "grad_norm": 0.6005051524573332, + "learning_rate": 9.32621185663041e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.9073048830032349, + "num_tokens": 156925980.0, + "step": 1925 + }, + { + "epoch": 0.1924172036565263, + "grad_norm": 0.64658521068199, + "learning_rate": 9.325400501607218e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.9044094979763031, + "num_tokens": 157007456.0, + "step": 1926 + }, + { + "epoch": 0.19251710874669065, + "grad_norm": 0.49597435762918446, + "learning_rate": 9.32458869371036e-06, + "loss": 0.501, + "mean_token_accuracy": 0.9062802791595459, + "num_tokens": 157088873.0, + "step": 1927 + }, + { + "epoch": 0.19261701383685498, + "grad_norm": 0.7830562539317701, + "learning_rate": 9.323776433024838e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.9035815596580505, + "num_tokens": 157170358.0, + "step": 1928 + }, + { + "epoch": 0.19271691892701934, + "grad_norm": 0.7919350906293392, + "learning_rate": 9.322963719635693e-06, + "loss": 0.502, + "mean_token_accuracy": 0.906330019235611, + "num_tokens": 157251802.0, + "step": 1929 + }, + { + "epoch": 0.19281682401718367, + "grad_norm": 0.5304193121217384, + "learning_rate": 9.322150553628017e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.9112669825553894, + "num_tokens": 157333486.0, + "step": 1930 + }, + { + "epoch": 0.19291672910734803, + "grad_norm": 0.6451530951205147, + "learning_rate": 9.32133693508695e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.9035444259643555, + "num_tokens": 157414978.0, + "step": 1931 + }, + { + "epoch": 0.19301663419751236, + "grad_norm": 0.6560654060806259, + "learning_rate": 9.320522864097678e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.9064770340919495, + "num_tokens": 157496551.0, + "step": 1932 + }, + { + "epoch": 0.19311653928767672, + "grad_norm": 0.5216949056585269, + "learning_rate": 9.319708340745437e-06, + "loss": 0.492, + "mean_token_accuracy": 0.9087038636207581, + "num_tokens": 157578065.0, + "step": 1933 + }, + { + "epoch": 0.19321644437784105, + "grad_norm": 0.5943241446992187, + "learning_rate": 9.318893365115506e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.9075430333614349, + "num_tokens": 157659533.0, + "step": 1934 + }, + { + "epoch": 0.19331634946800538, + "grad_norm": 0.9938295627761043, + "learning_rate": 9.318077937293215e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.9053584933280945, + "num_tokens": 157740941.0, + "step": 1935 + }, + { + "epoch": 0.19341625455816974, + "grad_norm": 0.541865308931754, + "learning_rate": 9.31726205736394e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.9061626493930817, + "num_tokens": 157822507.0, + "step": 1936 + }, + { + "epoch": 0.19351615964833407, + "grad_norm": 0.5548132054313373, + "learning_rate": 9.316445725413103e-06, + "loss": 0.499, + "mean_token_accuracy": 0.9059953391551971, + "num_tokens": 157903976.0, + "step": 1937 + }, + { + "epoch": 0.19361606473849843, + "grad_norm": 1.562461192398568, + "learning_rate": 9.315628941526179e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.9080550670623779, + "num_tokens": 157985476.0, + "step": 1938 + }, + { + "epoch": 0.19371596982866277, + "grad_norm": 0.5594822045337782, + "learning_rate": 9.31481170578868e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.9097268879413605, + "num_tokens": 158067037.0, + "step": 1939 + }, + { + "epoch": 0.19381587491882712, + "grad_norm": 0.6139986958439341, + "learning_rate": 9.313994018286175e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.9054493308067322, + "num_tokens": 158148618.0, + "step": 1940 + }, + { + "epoch": 0.19391578000899146, + "grad_norm": 0.6890693456249126, + "learning_rate": 9.313175879104277e-06, + "loss": 0.5, + "mean_token_accuracy": 0.9056724905967712, + "num_tokens": 158230160.0, + "step": 1941 + }, + { + "epoch": 0.19401568509915582, + "grad_norm": 0.6401621353644406, + "learning_rate": 9.312357288328645e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.9085469543933868, + "num_tokens": 158311747.0, + "step": 1942 + }, + { + "epoch": 0.19411559018932015, + "grad_norm": 0.5512405364896792, + "learning_rate": 9.311538246044987e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.9053478538990021, + "num_tokens": 158393270.0, + "step": 1943 + }, + { + "epoch": 0.19421549527948448, + "grad_norm": 0.5379707945164692, + "learning_rate": 9.310718752339054e-06, + "loss": 0.502, + "mean_token_accuracy": 0.905362069606781, + "num_tokens": 158474740.0, + "step": 1944 + }, + { + "epoch": 0.19431540036964884, + "grad_norm": 0.563120634739421, + "learning_rate": 9.309898807296653e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.9046469628810883, + "num_tokens": 158556237.0, + "step": 1945 + }, + { + "epoch": 0.19441530545981317, + "grad_norm": 0.72764021159478, + "learning_rate": 9.309078411003632e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.9024545252323151, + "num_tokens": 158637683.0, + "step": 1946 + }, + { + "epoch": 0.19451521054997753, + "grad_norm": 0.8410742371086041, + "learning_rate": 9.308257563545885e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.9062701761722565, + "num_tokens": 158719291.0, + "step": 1947 + }, + { + "epoch": 0.19461511564014186, + "grad_norm": 0.5921428030020474, + "learning_rate": 9.307436265009354e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.905998021364212, + "num_tokens": 158800708.0, + "step": 1948 + }, + { + "epoch": 0.19471502073030622, + "grad_norm": 0.8359832789089906, + "learning_rate": 9.306614515480035e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.9081862568855286, + "num_tokens": 158882199.0, + "step": 1949 + }, + { + "epoch": 0.19481492582047055, + "grad_norm": 0.5407407892447984, + "learning_rate": 9.305792315043962e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.9066211879253387, + "num_tokens": 158963687.0, + "step": 1950 + }, + { + "epoch": 0.19491483091063488, + "grad_norm": 0.6217288778986854, + "learning_rate": 9.304969663787222e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.9053827524185181, + "num_tokens": 159045182.0, + "step": 1951 + }, + { + "epoch": 0.19501473600079924, + "grad_norm": 0.6120940593853107, + "learning_rate": 9.304146561795946e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.9076284170150757, + "num_tokens": 159126614.0, + "step": 1952 + }, + { + "epoch": 0.19511464109096358, + "grad_norm": 0.7156208943484721, + "learning_rate": 9.303323009156315e-06, + "loss": 0.503, + "mean_token_accuracy": 0.9042367935180664, + "num_tokens": 159208077.0, + "step": 1953 + }, + { + "epoch": 0.19521454618112793, + "grad_norm": 0.6644382086134142, + "learning_rate": 9.302499005954557e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.9037472009658813, + "num_tokens": 159289584.0, + "step": 1954 + }, + { + "epoch": 0.19531445127129227, + "grad_norm": 0.5603786840532636, + "learning_rate": 9.301674552276942e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.9054603576660156, + "num_tokens": 159371075.0, + "step": 1955 + }, + { + "epoch": 0.19541435636145663, + "grad_norm": 0.6768054906087246, + "learning_rate": 9.300849648209794e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9063329994678497, + "num_tokens": 159452596.0, + "step": 1956 + }, + { + "epoch": 0.19551426145162096, + "grad_norm": 0.6354189238703191, + "learning_rate": 9.30002429383948e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.9066615104675293, + "num_tokens": 159534130.0, + "step": 1957 + }, + { + "epoch": 0.19561416654178532, + "grad_norm": 0.7613534211900563, + "learning_rate": 9.299198489252417e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.9060999751091003, + "num_tokens": 159615724.0, + "step": 1958 + }, + { + "epoch": 0.19571407163194965, + "grad_norm": 0.8778035999108625, + "learning_rate": 9.298372234535067e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.9040997326374054, + "num_tokens": 159697329.0, + "step": 1959 + }, + { + "epoch": 0.19581397672211398, + "grad_norm": 0.5675701545633387, + "learning_rate": 9.297545529773936e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.9058918654918671, + "num_tokens": 159778791.0, + "step": 1960 + }, + { + "epoch": 0.19591388181227834, + "grad_norm": 0.7900920062219308, + "learning_rate": 9.296718375055587e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.9068384170532227, + "num_tokens": 159860235.0, + "step": 1961 + }, + { + "epoch": 0.19601378690244267, + "grad_norm": 0.8931508344241051, + "learning_rate": 9.29589077046662e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.9052170217037201, + "num_tokens": 159941631.0, + "step": 1962 + }, + { + "epoch": 0.19611369199260703, + "grad_norm": 1.6964807782207454, + "learning_rate": 9.295062716093688e-06, + "loss": 0.496, + "mean_token_accuracy": 0.906160295009613, + "num_tokens": 160023220.0, + "step": 1963 + }, + { + "epoch": 0.19621359708277136, + "grad_norm": 0.6092881117735219, + "learning_rate": 9.294234212023485e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.9068296849727631, + "num_tokens": 160104756.0, + "step": 1964 + }, + { + "epoch": 0.19631350217293572, + "grad_norm": 0.5898022920184391, + "learning_rate": 9.293405258342762e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.9063940048217773, + "num_tokens": 160186352.0, + "step": 1965 + }, + { + "epoch": 0.19641340726310005, + "grad_norm": 0.6835254432661911, + "learning_rate": 9.292575855138307e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.9054160118103027, + "num_tokens": 160267848.0, + "step": 1966 + }, + { + "epoch": 0.19651331235326439, + "grad_norm": 0.5677841596279544, + "learning_rate": 9.291746002496962e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.9079137146472931, + "num_tokens": 160349416.0, + "step": 1967 + }, + { + "epoch": 0.19661321744342874, + "grad_norm": 0.7724762342764993, + "learning_rate": 9.290915700505611e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.9063995182514191, + "num_tokens": 160430992.0, + "step": 1968 + }, + { + "epoch": 0.19671312253359308, + "grad_norm": 0.57718505099088, + "learning_rate": 9.29008494925119e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.9074338376522064, + "num_tokens": 160512489.0, + "step": 1969 + }, + { + "epoch": 0.19681302762375744, + "grad_norm": 0.7441201585209979, + "learning_rate": 9.289253748820675e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.9077600836753845, + "num_tokens": 160594009.0, + "step": 1970 + }, + { + "epoch": 0.19691293271392177, + "grad_norm": 1.006376164581019, + "learning_rate": 9.2884220993011e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.907161146402359, + "num_tokens": 160675536.0, + "step": 1971 + }, + { + "epoch": 0.19701283780408613, + "grad_norm": 0.8029325716052774, + "learning_rate": 9.287590000779535e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.9052156209945679, + "num_tokens": 160757003.0, + "step": 1972 + }, + { + "epoch": 0.19711274289425046, + "grad_norm": 0.7773961135324524, + "learning_rate": 9.2867574533431e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.9054014682769775, + "num_tokens": 160838455.0, + "step": 1973 + }, + { + "epoch": 0.19721264798441482, + "grad_norm": 0.5769052828298673, + "learning_rate": 9.28592445707897e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.9060314297676086, + "num_tokens": 160919907.0, + "step": 1974 + }, + { + "epoch": 0.19731255307457915, + "grad_norm": 0.573484032131562, + "learning_rate": 9.285091012074354e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.9065425097942352, + "num_tokens": 161001370.0, + "step": 1975 + }, + { + "epoch": 0.19741245816474348, + "grad_norm": 0.6536709784813907, + "learning_rate": 9.284257118416518e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.9096881151199341, + "num_tokens": 161082888.0, + "step": 1976 + }, + { + "epoch": 0.19751236325490784, + "grad_norm": 0.5140191938630038, + "learning_rate": 9.283422776192772e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.9086134433746338, + "num_tokens": 161164423.0, + "step": 1977 + }, + { + "epoch": 0.19761226834507217, + "grad_norm": 0.5298917476910361, + "learning_rate": 9.282587985490468e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.9096068143844604, + "num_tokens": 161245957.0, + "step": 1978 + }, + { + "epoch": 0.19771217343523653, + "grad_norm": 0.6898923214967286, + "learning_rate": 9.281752746397015e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.905793160200119, + "num_tokens": 161327465.0, + "step": 1979 + }, + { + "epoch": 0.19781207852540086, + "grad_norm": 0.5857609672043803, + "learning_rate": 9.28091705899986e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.9083531498908997, + "num_tokens": 161409067.0, + "step": 1980 + }, + { + "epoch": 0.19791198361556522, + "grad_norm": 0.5146117602291324, + "learning_rate": 9.280080923386501e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.9082435369491577, + "num_tokens": 161490608.0, + "step": 1981 + }, + { + "epoch": 0.19801188870572956, + "grad_norm": 0.5719710411577479, + "learning_rate": 9.279244339644484e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.9068638980388641, + "num_tokens": 161572172.0, + "step": 1982 + }, + { + "epoch": 0.19811179379589391, + "grad_norm": 0.5358608290959265, + "learning_rate": 9.278407307861397e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.9055962562561035, + "num_tokens": 161653727.0, + "step": 1983 + }, + { + "epoch": 0.19821169888605825, + "grad_norm": 0.5423380725174843, + "learning_rate": 9.277569828124879e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.9037507772445679, + "num_tokens": 161735184.0, + "step": 1984 + }, + { + "epoch": 0.19831160397622258, + "grad_norm": 0.5696373826330784, + "learning_rate": 9.276731900522616e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.9075423777103424, + "num_tokens": 161816774.0, + "step": 1985 + }, + { + "epoch": 0.19841150906638694, + "grad_norm": 0.6260682810768935, + "learning_rate": 9.27589352514234e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.9076349437236786, + "num_tokens": 161898329.0, + "step": 1986 + }, + { + "epoch": 0.19851141415655127, + "grad_norm": 0.6450498707467204, + "learning_rate": 9.275054702071828e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.9066205620765686, + "num_tokens": 161979861.0, + "step": 1987 + }, + { + "epoch": 0.19861131924671563, + "grad_norm": 0.5210257690052856, + "learning_rate": 9.274215431398906e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.9079143404960632, + "num_tokens": 162061446.0, + "step": 1988 + }, + { + "epoch": 0.19871122433687996, + "grad_norm": 0.6914849822082898, + "learning_rate": 9.273375713211447e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.9088840782642365, + "num_tokens": 162143026.0, + "step": 1989 + }, + { + "epoch": 0.19881112942704432, + "grad_norm": 0.7283007851979745, + "learning_rate": 9.272535547597372e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.9074620008468628, + "num_tokens": 162224476.0, + "step": 1990 + }, + { + "epoch": 0.19891103451720865, + "grad_norm": 0.687733525348715, + "learning_rate": 9.271694934644646e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.903670608997345, + "num_tokens": 162306039.0, + "step": 1991 + }, + { + "epoch": 0.19901093960737298, + "grad_norm": 0.7555124825449018, + "learning_rate": 9.270853874441281e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.9061534404754639, + "num_tokens": 162387583.0, + "step": 1992 + }, + { + "epoch": 0.19911084469753734, + "grad_norm": 0.6110122183286637, + "learning_rate": 9.270012367075337e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.905514270067215, + "num_tokens": 162469128.0, + "step": 1993 + }, + { + "epoch": 0.19921074978770167, + "grad_norm": 0.5172074457746625, + "learning_rate": 9.26917041263492e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.9111604690551758, + "num_tokens": 162550685.0, + "step": 1994 + }, + { + "epoch": 0.19931065487786603, + "grad_norm": 0.6623754754532165, + "learning_rate": 9.268328011208186e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.9072572886943817, + "num_tokens": 162632215.0, + "step": 1995 + }, + { + "epoch": 0.19941055996803037, + "grad_norm": 0.6684049342880383, + "learning_rate": 9.267485162883334e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.9090973734855652, + "num_tokens": 162713792.0, + "step": 1996 + }, + { + "epoch": 0.19951046505819472, + "grad_norm": 0.5990511794881087, + "learning_rate": 9.26664186774861e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.9063998460769653, + "num_tokens": 162795306.0, + "step": 1997 + }, + { + "epoch": 0.19961037014835906, + "grad_norm": 0.638286692069447, + "learning_rate": 9.26579812589231e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.9065134525299072, + "num_tokens": 162876790.0, + "step": 1998 + }, + { + "epoch": 0.19971027523852342, + "grad_norm": 1.220439938846545, + "learning_rate": 9.26495393740277e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.9078517854213715, + "num_tokens": 162958404.0, + "step": 1999 + }, + { + "epoch": 0.19981018032868775, + "grad_norm": 0.6374490906983105, + "learning_rate": 9.264109302368383e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.9072090685367584, + "num_tokens": 163039879.0, + "step": 2000 + }, + { + "epoch": 0.19991008541885208, + "grad_norm": 0.6166774056868426, + "learning_rate": 9.26326422087758e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9074035584926605, + "num_tokens": 81511.0, + "step": 2001 + }, + { + "epoch": 0.20000999050901644, + "grad_norm": 0.5129917575385542, + "learning_rate": 9.262418693018843e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.9043433368206024, + "num_tokens": 162951.0, + "step": 2002 + }, + { + "epoch": 0.20010989559918077, + "grad_norm": 0.5466319463914092, + "learning_rate": 9.261572718880697e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.9089897274971008, + "num_tokens": 244507.0, + "step": 2003 + }, + { + "epoch": 0.20020980068934513, + "grad_norm": 1.0663592101243475, + "learning_rate": 9.260726298551721e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.9089770913124084, + "num_tokens": 326052.0, + "step": 2004 + }, + { + "epoch": 0.20030970577950946, + "grad_norm": 0.6497838167855501, + "learning_rate": 9.259879432120533e-06, + "loss": 0.495, + "mean_token_accuracy": 0.9059172570705414, + "num_tokens": 407589.0, + "step": 2005 + }, + { + "epoch": 0.20040961086967382, + "grad_norm": 0.5354018103297211, + "learning_rate": 9.259032119675801e-06, + "loss": 0.495, + "mean_token_accuracy": 0.9058073461055756, + "num_tokens": 489170.0, + "step": 2006 + }, + { + "epoch": 0.20050951595983815, + "grad_norm": 0.6478300687143702, + "learning_rate": 9.258184361306239e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.9081608951091766, + "num_tokens": 570771.0, + "step": 2007 + }, + { + "epoch": 0.20060942105000248, + "grad_norm": 0.7833140086238812, + "learning_rate": 9.25733615710061e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.9065282642841339, + "num_tokens": 652319.0, + "step": 2008 + }, + { + "epoch": 0.20070932614016684, + "grad_norm": 0.5831021083902312, + "learning_rate": 9.25648750714772e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.9052320420742035, + "num_tokens": 733907.0, + "step": 2009 + }, + { + "epoch": 0.20080923123033118, + "grad_norm": 0.6752848618474252, + "learning_rate": 9.255638411536425e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.9070261418819427, + "num_tokens": 815442.0, + "step": 2010 + }, + { + "epoch": 0.20090913632049553, + "grad_norm": 1.3697126424563013, + "learning_rate": 9.254788870355624e-06, + "loss": 0.5, + "mean_token_accuracy": 0.9046869575977325, + "num_tokens": 896909.0, + "step": 2011 + }, + { + "epoch": 0.20100904141065987, + "grad_norm": 0.5786732655557234, + "learning_rate": 9.253938883694266e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.9075137674808502, + "num_tokens": 978492.0, + "step": 2012 + }, + { + "epoch": 0.20110894650082423, + "grad_norm": 0.7658242557984121, + "learning_rate": 9.253088451641347e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.9062025249004364, + "num_tokens": 1059968.0, + "step": 2013 + }, + { + "epoch": 0.20120885159098856, + "grad_norm": 0.7969890958391599, + "learning_rate": 9.252237574285904e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.9052704274654388, + "num_tokens": 1141411.0, + "step": 2014 + }, + { + "epoch": 0.20130875668115292, + "grad_norm": 0.557196609012171, + "learning_rate": 9.251386251717029e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.9083635210990906, + "num_tokens": 1223047.0, + "step": 2015 + }, + { + "epoch": 0.20140866177131725, + "grad_norm": 0.8882943992568852, + "learning_rate": 9.250534484023854e-06, + "loss": 0.501, + "mean_token_accuracy": 0.9036664664745331, + "num_tokens": 1304536.0, + "step": 2016 + }, + { + "epoch": 0.20150856686148158, + "grad_norm": 0.6422004466209942, + "learning_rate": 9.24968227129556e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.9082929790019989, + "num_tokens": 1386106.0, + "step": 2017 + }, + { + "epoch": 0.20160847195164594, + "grad_norm": 0.5646071983888762, + "learning_rate": 9.248829613621377e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.9091111719608307, + "num_tokens": 1467699.0, + "step": 2018 + }, + { + "epoch": 0.20170837704181027, + "grad_norm": 0.6502353360242658, + "learning_rate": 9.247976511090576e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.9079803228378296, + "num_tokens": 1549302.0, + "step": 2019 + }, + { + "epoch": 0.20180828213197463, + "grad_norm": 0.6444359516992659, + "learning_rate": 9.247122963792478e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.9039395451545715, + "num_tokens": 1630744.0, + "step": 2020 + }, + { + "epoch": 0.20190818722213896, + "grad_norm": 1.670215177569267, + "learning_rate": 9.246268971816453e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.9066731035709381, + "num_tokens": 1712302.0, + "step": 2021 + }, + { + "epoch": 0.20200809231230332, + "grad_norm": 0.6442031751954191, + "learning_rate": 9.24541453525191e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.9098008871078491, + "num_tokens": 1793842.0, + "step": 2022 + }, + { + "epoch": 0.20210799740246765, + "grad_norm": 0.6376042232306465, + "learning_rate": 9.244559654188313e-06, + "loss": 0.5, + "mean_token_accuracy": 0.9058891534805298, + "num_tokens": 1875345.0, + "step": 2023 + }, + { + "epoch": 0.202207902492632, + "grad_norm": 0.6991014114579189, + "learning_rate": 9.243704328715169e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.9090305864810944, + "num_tokens": 1956949.0, + "step": 2024 + }, + { + "epoch": 0.20230780758279635, + "grad_norm": 0.4784377149122733, + "learning_rate": 9.24284855892203e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.9060642123222351, + "num_tokens": 2038495.0, + "step": 2025 + }, + { + "epoch": 0.20240771267296068, + "grad_norm": 0.6608545058930064, + "learning_rate": 9.241992344898494e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.9080206751823425, + "num_tokens": 2119972.0, + "step": 2026 + }, + { + "epoch": 0.20250761776312504, + "grad_norm": 0.9471854064831704, + "learning_rate": 9.241135686734213e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.9070889055728912, + "num_tokens": 2201445.0, + "step": 2027 + }, + { + "epoch": 0.20260752285328937, + "grad_norm": 0.8501842223112762, + "learning_rate": 9.240278584518873e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.9056735634803772, + "num_tokens": 2282910.0, + "step": 2028 + }, + { + "epoch": 0.20270742794345373, + "grad_norm": 0.7514561662442526, + "learning_rate": 9.239421038342221e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.9092903733253479, + "num_tokens": 2364404.0, + "step": 2029 + }, + { + "epoch": 0.20280733303361806, + "grad_norm": 0.5885647640490372, + "learning_rate": 9.238563048294038e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.9064749479293823, + "num_tokens": 2445968.0, + "step": 2030 + }, + { + "epoch": 0.20290723812378242, + "grad_norm": 0.6235081514412424, + "learning_rate": 9.237704614464157e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.9058801233768463, + "num_tokens": 2527415.0, + "step": 2031 + }, + { + "epoch": 0.20300714321394675, + "grad_norm": 0.6115147459721415, + "learning_rate": 9.236845736942456e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.9054447710514069, + "num_tokens": 2608913.0, + "step": 2032 + }, + { + "epoch": 0.20310704830411108, + "grad_norm": 0.5463535902742129, + "learning_rate": 9.235986415818865e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.9070843756198883, + "num_tokens": 2690507.0, + "step": 2033 + }, + { + "epoch": 0.20320695339427544, + "grad_norm": 0.5938798034439571, + "learning_rate": 9.23512665118335e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.9063262641429901, + "num_tokens": 2772023.0, + "step": 2034 + }, + { + "epoch": 0.20330685848443977, + "grad_norm": 0.5840075630631996, + "learning_rate": 9.234266443125933e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.9085297286510468, + "num_tokens": 2853518.0, + "step": 2035 + }, + { + "epoch": 0.20340676357460413, + "grad_norm": 0.8923720575878133, + "learning_rate": 9.233405791736675e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.9047411382198334, + "num_tokens": 2934959.0, + "step": 2036 + }, + { + "epoch": 0.20350666866476846, + "grad_norm": 0.6836996764879998, + "learning_rate": 9.23254469710569e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.9053345620632172, + "num_tokens": 3016522.0, + "step": 2037 + }, + { + "epoch": 0.20360657375493282, + "grad_norm": 0.6399024846022637, + "learning_rate": 9.231683159323137e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.9084806144237518, + "num_tokens": 3098015.0, + "step": 2038 + }, + { + "epoch": 0.20370647884509716, + "grad_norm": 0.6223853388473298, + "learning_rate": 9.230821178479214e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.9051591753959656, + "num_tokens": 3179530.0, + "step": 2039 + }, + { + "epoch": 0.20380638393526151, + "grad_norm": 0.5588476932725719, + "learning_rate": 9.229958754664177e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.9056532084941864, + "num_tokens": 3261003.0, + "step": 2040 + }, + { + "epoch": 0.20390628902542585, + "grad_norm": 0.6416900166124271, + "learning_rate": 9.22909588796832e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.9062455892562866, + "num_tokens": 3342491.0, + "step": 2041 + }, + { + "epoch": 0.20400619411559018, + "grad_norm": 0.604416111360529, + "learning_rate": 9.228232578481987e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.9034179747104645, + "num_tokens": 3424026.0, + "step": 2042 + }, + { + "epoch": 0.20410609920575454, + "grad_norm": 0.7152696045834229, + "learning_rate": 9.227368826295566e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.9060747921466827, + "num_tokens": 3505547.0, + "step": 2043 + }, + { + "epoch": 0.20420600429591887, + "grad_norm": 0.688517622369701, + "learning_rate": 9.226504631499495e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.9033501148223877, + "num_tokens": 3587085.0, + "step": 2044 + }, + { + "epoch": 0.20430590938608323, + "grad_norm": 0.7634149272598911, + "learning_rate": 9.225639994184253e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.9051791131496429, + "num_tokens": 3668570.0, + "step": 2045 + }, + { + "epoch": 0.20440581447624756, + "grad_norm": 0.9222368504261117, + "learning_rate": 9.22477491444037e-06, + "loss": 0.501, + "mean_token_accuracy": 0.9056186974048615, + "num_tokens": 3750061.0, + "step": 2046 + }, + { + "epoch": 0.20450571956641192, + "grad_norm": 0.8200074198160854, + "learning_rate": 9.22390939235842e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.9067641794681549, + "num_tokens": 3831573.0, + "step": 2047 + }, + { + "epoch": 0.20460562465657625, + "grad_norm": 0.5649122578165265, + "learning_rate": 9.223043428029025e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.9091183841228485, + "num_tokens": 3913053.0, + "step": 2048 + }, + { + "epoch": 0.20470552974674058, + "grad_norm": 0.48473268985250934, + "learning_rate": 9.222177021542853e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.9086907505989075, + "num_tokens": 3994564.0, + "step": 2049 + }, + { + "epoch": 0.20480543483690494, + "grad_norm": 0.6111048143099318, + "learning_rate": 9.221310172990616e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.907932460308075, + "num_tokens": 4076073.0, + "step": 2050 + }, + { + "epoch": 0.20490533992706927, + "grad_norm": 0.5740409460413027, + "learning_rate": 9.220442882463074e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.9054351150989532, + "num_tokens": 4157538.0, + "step": 2051 + }, + { + "epoch": 0.20500524501723363, + "grad_norm": 0.6908227116402946, + "learning_rate": 9.219575150051035e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.9071555733680725, + "num_tokens": 4239006.0, + "step": 2052 + }, + { + "epoch": 0.20510515010739797, + "grad_norm": 1.0901045502570532, + "learning_rate": 9.218706975845349e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.906016081571579, + "num_tokens": 4320489.0, + "step": 2053 + }, + { + "epoch": 0.20520505519756232, + "grad_norm": 0.6422472952546543, + "learning_rate": 9.217838359936914e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.9100294709205627, + "num_tokens": 4401969.0, + "step": 2054 + }, + { + "epoch": 0.20530496028772666, + "grad_norm": 0.8832401260830682, + "learning_rate": 9.216969302416678e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.9057633578777313, + "num_tokens": 4483524.0, + "step": 2055 + }, + { + "epoch": 0.20540486537789102, + "grad_norm": 0.7509790852328738, + "learning_rate": 9.216099803375631e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.903488427400589, + "num_tokens": 4565029.0, + "step": 2056 + }, + { + "epoch": 0.20550477046805535, + "grad_norm": 0.597849541123373, + "learning_rate": 9.21522986290481e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.9076788127422333, + "num_tokens": 4646519.0, + "step": 2057 + }, + { + "epoch": 0.20560467555821968, + "grad_norm": 0.6521225828526976, + "learning_rate": 9.214359481095299e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.9077682495117188, + "num_tokens": 4727953.0, + "step": 2058 + }, + { + "epoch": 0.20570458064838404, + "grad_norm": 0.6165487789328833, + "learning_rate": 9.213488658038228e-06, + "loss": 0.5, + "mean_token_accuracy": 0.9064449071884155, + "num_tokens": 4809447.0, + "step": 2059 + }, + { + "epoch": 0.20580448573854837, + "grad_norm": 0.6253916458674311, + "learning_rate": 9.212617393824772e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9041734635829926, + "num_tokens": 4890997.0, + "step": 2060 + }, + { + "epoch": 0.20590439082871273, + "grad_norm": 0.6865599285934395, + "learning_rate": 9.211745688546154e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.9066371023654938, + "num_tokens": 4972570.0, + "step": 2061 + }, + { + "epoch": 0.20600429591887706, + "grad_norm": 0.8016141614934242, + "learning_rate": 9.210873542293642e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.9053756594657898, + "num_tokens": 5054075.0, + "step": 2062 + }, + { + "epoch": 0.20610420100904142, + "grad_norm": 0.8389609292937762, + "learning_rate": 9.210000955158551e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.9063977301120758, + "num_tokens": 5135556.0, + "step": 2063 + }, + { + "epoch": 0.20620410609920575, + "grad_norm": 0.9423891483275171, + "learning_rate": 9.209127927232243e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.9041829407215118, + "num_tokens": 5217025.0, + "step": 2064 + }, + { + "epoch": 0.20630401118937008, + "grad_norm": 0.5380776140085824, + "learning_rate": 9.20825445860612e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.9049034118652344, + "num_tokens": 5298520.0, + "step": 2065 + }, + { + "epoch": 0.20640391627953444, + "grad_norm": 0.635272993650383, + "learning_rate": 9.207380549371642e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.9055631756782532, + "num_tokens": 5380036.0, + "step": 2066 + }, + { + "epoch": 0.20650382136969878, + "grad_norm": 0.7466297268925539, + "learning_rate": 9.206506199620303e-06, + "loss": 0.498, + "mean_token_accuracy": 0.9053043127059937, + "num_tokens": 5461598.0, + "step": 2067 + }, + { + "epoch": 0.20660372645986314, + "grad_norm": 0.5950601788550118, + "learning_rate": 9.20563140944365e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.9044750034809113, + "num_tokens": 5543101.0, + "step": 2068 + }, + { + "epoch": 0.20670363155002747, + "grad_norm": 0.7712754186364138, + "learning_rate": 9.204756178933274e-06, + "loss": 0.505, + "mean_token_accuracy": 0.9030625522136688, + "num_tokens": 5624552.0, + "step": 2069 + }, + { + "epoch": 0.20680353664019183, + "grad_norm": 0.5150323074651622, + "learning_rate": 9.203880508180814e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.9066973924636841, + "num_tokens": 5706083.0, + "step": 2070 + }, + { + "epoch": 0.20690344173035616, + "grad_norm": 0.5455217946154447, + "learning_rate": 9.203004397277952e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.9061721563339233, + "num_tokens": 5787576.0, + "step": 2071 + }, + { + "epoch": 0.20700334682052052, + "grad_norm": 0.7246196124750403, + "learning_rate": 9.202127846316418e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.9053781032562256, + "num_tokens": 5869037.0, + "step": 2072 + }, + { + "epoch": 0.20710325191068485, + "grad_norm": 0.805370670639791, + "learning_rate": 9.201250855387986e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.9066319167613983, + "num_tokens": 5950546.0, + "step": 2073 + }, + { + "epoch": 0.20720315700084918, + "grad_norm": 0.6793029900644347, + "learning_rate": 9.200373424584481e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.9070681929588318, + "num_tokens": 6032007.0, + "step": 2074 + }, + { + "epoch": 0.20730306209101354, + "grad_norm": 0.5561939855678909, + "learning_rate": 9.19949555399777e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.9058715105056763, + "num_tokens": 6113487.0, + "step": 2075 + }, + { + "epoch": 0.20740296718117787, + "grad_norm": 0.5932259125983776, + "learning_rate": 9.198617243719765e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.907027393579483, + "num_tokens": 6194944.0, + "step": 2076 + }, + { + "epoch": 0.20750287227134223, + "grad_norm": 0.5797971542459457, + "learning_rate": 9.197738493842428e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.9053291380405426, + "num_tokens": 6276474.0, + "step": 2077 + }, + { + "epoch": 0.20760277736150656, + "grad_norm": 0.8287073908664634, + "learning_rate": 9.196859304457764e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.9090169370174408, + "num_tokens": 6357895.0, + "step": 2078 + }, + { + "epoch": 0.20770268245167092, + "grad_norm": 0.6947549296797018, + "learning_rate": 9.195979675657827e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.903498113155365, + "num_tokens": 6439371.0, + "step": 2079 + }, + { + "epoch": 0.20780258754183525, + "grad_norm": 0.929527818142501, + "learning_rate": 9.19509960753471e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.9054223597049713, + "num_tokens": 6520959.0, + "step": 2080 + }, + { + "epoch": 0.2079024926319996, + "grad_norm": 0.9047669535535836, + "learning_rate": 9.194219100180563e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.9074526727199554, + "num_tokens": 6602462.0, + "step": 2081 + }, + { + "epoch": 0.20800239772216395, + "grad_norm": 0.61630275974716, + "learning_rate": 9.193338153687572e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.9096778929233551, + "num_tokens": 6684090.0, + "step": 2082 + }, + { + "epoch": 0.20810230281232828, + "grad_norm": 0.5831922188200926, + "learning_rate": 9.192456768147974e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.9062212109565735, + "num_tokens": 6765706.0, + "step": 2083 + }, + { + "epoch": 0.20820220790249264, + "grad_norm": 0.7282007603330932, + "learning_rate": 9.19157494365405e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.9075345396995544, + "num_tokens": 6847120.0, + "step": 2084 + }, + { + "epoch": 0.20830211299265697, + "grad_norm": 0.571954515166325, + "learning_rate": 9.190692680298132e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.9058620631694794, + "num_tokens": 6928663.0, + "step": 2085 + }, + { + "epoch": 0.20840201808282133, + "grad_norm": 0.5414526601731968, + "learning_rate": 9.18980997817259e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.9065189957618713, + "num_tokens": 7010111.0, + "step": 2086 + }, + { + "epoch": 0.20850192317298566, + "grad_norm": 0.5703434015030452, + "learning_rate": 9.188926837369843e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.9071373641490936, + "num_tokens": 7091632.0, + "step": 2087 + }, + { + "epoch": 0.20860182826315002, + "grad_norm": 0.560489403477423, + "learning_rate": 9.18804325798236e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.9070153832435608, + "num_tokens": 7173100.0, + "step": 2088 + }, + { + "epoch": 0.20870173335331435, + "grad_norm": 1.0071509873113982, + "learning_rate": 9.18715924010265e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.9038789570331573, + "num_tokens": 7254503.0, + "step": 2089 + }, + { + "epoch": 0.20880163844347868, + "grad_norm": 0.572846976008172, + "learning_rate": 9.186274783823272e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.907531201839447, + "num_tokens": 7336025.0, + "step": 2090 + }, + { + "epoch": 0.20890154353364304, + "grad_norm": 0.7304588312283046, + "learning_rate": 9.185389889236828e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.9096726179122925, + "num_tokens": 7417587.0, + "step": 2091 + }, + { + "epoch": 0.20900144862380737, + "grad_norm": 0.6325276775293596, + "learning_rate": 9.18450455643597e-06, + "loss": 0.493, + "mean_token_accuracy": 0.9078246653079987, + "num_tokens": 7499137.0, + "step": 2092 + }, + { + "epoch": 0.20910135371397173, + "grad_norm": 0.5241272205997507, + "learning_rate": 9.18361878551339e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.9099539220333099, + "num_tokens": 7580702.0, + "step": 2093 + }, + { + "epoch": 0.20920125880413606, + "grad_norm": 0.8583234676391736, + "learning_rate": 9.18273257656183e-06, + "loss": 0.5, + "mean_token_accuracy": 0.9068440198898315, + "num_tokens": 7662177.0, + "step": 2094 + }, + { + "epoch": 0.20930116389430042, + "grad_norm": 0.594548980431463, + "learning_rate": 9.181845929674078e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.9083328545093536, + "num_tokens": 7743734.0, + "step": 2095 + }, + { + "epoch": 0.20940106898446476, + "grad_norm": 0.5709506941680774, + "learning_rate": 9.180958844942967e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.9061050415039062, + "num_tokens": 7825223.0, + "step": 2096 + }, + { + "epoch": 0.20950097407462911, + "grad_norm": 0.5117783854729753, + "learning_rate": 9.180071322461375e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.9075518250465393, + "num_tokens": 7906728.0, + "step": 2097 + }, + { + "epoch": 0.20960087916479345, + "grad_norm": 0.5217798312609916, + "learning_rate": 9.179183362322226e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.9080142080783844, + "num_tokens": 7988227.0, + "step": 2098 + }, + { + "epoch": 0.20970078425495778, + "grad_norm": 0.494281038596622, + "learning_rate": 9.17829496461849e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.90632164478302, + "num_tokens": 8069797.0, + "step": 2099 + }, + { + "epoch": 0.20980068934512214, + "grad_norm": 0.6544888368343877, + "learning_rate": 9.177406129443185e-06, + "loss": 0.502, + "mean_token_accuracy": 0.9048925936222076, + "num_tokens": 8151282.0, + "step": 2100 + }, + { + "epoch": 0.20990059443528647, + "grad_norm": 0.5238177491172878, + "learning_rate": 9.17651685688937e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.9075801372528076, + "num_tokens": 8232774.0, + "step": 2101 + }, + { + "epoch": 0.21000049952545083, + "grad_norm": 0.5991354880551926, + "learning_rate": 9.175627147050157e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.9050672054290771, + "num_tokens": 8314361.0, + "step": 2102 + }, + { + "epoch": 0.21010040461561516, + "grad_norm": 0.6522417662793413, + "learning_rate": 9.174737000018696e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.901087760925293, + "num_tokens": 8395882.0, + "step": 2103 + }, + { + "epoch": 0.21020030970577952, + "grad_norm": 0.5935028013613733, + "learning_rate": 9.173846415888186e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.9068248271942139, + "num_tokens": 8477418.0, + "step": 2104 + }, + { + "epoch": 0.21030021479594385, + "grad_norm": 0.5383028509604072, + "learning_rate": 9.172955394751872e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.9067977070808411, + "num_tokens": 8558847.0, + "step": 2105 + }, + { + "epoch": 0.21040011988610818, + "grad_norm": 0.6875450255724571, + "learning_rate": 9.172063936703049e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.9036507606506348, + "num_tokens": 8640410.0, + "step": 2106 + }, + { + "epoch": 0.21050002497627254, + "grad_norm": 0.5528941621255428, + "learning_rate": 9.171172041835048e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.9075516164302826, + "num_tokens": 8722010.0, + "step": 2107 + }, + { + "epoch": 0.21059993006643687, + "grad_norm": 0.7064309533479575, + "learning_rate": 9.170279710241257e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.9055228531360626, + "num_tokens": 8803610.0, + "step": 2108 + }, + { + "epoch": 0.21069983515660123, + "grad_norm": 0.5340855572597134, + "learning_rate": 9.169386942015098e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.9099243879318237, + "num_tokens": 8885217.0, + "step": 2109 + }, + { + "epoch": 0.21079974024676557, + "grad_norm": 0.43864070440333686, + "learning_rate": 9.168493737250048e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.908568412065506, + "num_tokens": 8966783.0, + "step": 2110 + }, + { + "epoch": 0.21089964533692993, + "grad_norm": 0.5259618065473503, + "learning_rate": 9.167600096039626e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.9069910645484924, + "num_tokens": 9048325.0, + "step": 2111 + }, + { + "epoch": 0.21099955042709426, + "grad_norm": 0.6334179791676638, + "learning_rate": 9.166706018477398e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.9070945680141449, + "num_tokens": 9129823.0, + "step": 2112 + }, + { + "epoch": 0.21109945551725862, + "grad_norm": 0.5593971028575504, + "learning_rate": 9.165811504656973e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.9062912464141846, + "num_tokens": 9211349.0, + "step": 2113 + }, + { + "epoch": 0.21119936060742295, + "grad_norm": 0.6176786921411636, + "learning_rate": 9.164916554672007e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.9063183665275574, + "num_tokens": 9292907.0, + "step": 2114 + }, + { + "epoch": 0.21129926569758728, + "grad_norm": 0.8313182669430462, + "learning_rate": 9.164021168616206e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.9095896780490875, + "num_tokens": 9374474.0, + "step": 2115 + }, + { + "epoch": 0.21139917078775164, + "grad_norm": 0.5628004727227178, + "learning_rate": 9.163125346583314e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.9052594006061554, + "num_tokens": 9455997.0, + "step": 2116 + }, + { + "epoch": 0.21149907587791597, + "grad_norm": 0.6543627238925356, + "learning_rate": 9.162229088667125e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.9066183567047119, + "num_tokens": 9537465.0, + "step": 2117 + }, + { + "epoch": 0.21159898096808033, + "grad_norm": 0.577987332498478, + "learning_rate": 9.16133239496148e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.9081699848175049, + "num_tokens": 9619006.0, + "step": 2118 + }, + { + "epoch": 0.21169888605824466, + "grad_norm": 0.5531139692090118, + "learning_rate": 9.16043526556026e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.90733802318573, + "num_tokens": 9700533.0, + "step": 2119 + }, + { + "epoch": 0.21179879114840902, + "grad_norm": 0.6974594655266524, + "learning_rate": 9.1595377005574e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.9053946733474731, + "num_tokens": 9781973.0, + "step": 2120 + }, + { + "epoch": 0.21189869623857335, + "grad_norm": 0.6679060987662915, + "learning_rate": 9.158639700046873e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.9034801721572876, + "num_tokens": 9863477.0, + "step": 2121 + }, + { + "epoch": 0.2119986013287377, + "grad_norm": 0.577058107241188, + "learning_rate": 9.157741264122702e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.9070216119289398, + "num_tokens": 9944924.0, + "step": 2122 + }, + { + "epoch": 0.21209850641890204, + "grad_norm": 0.5020466032718768, + "learning_rate": 9.156842392878953e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.9065636992454529, + "num_tokens": 10026471.0, + "step": 2123 + }, + { + "epoch": 0.21219841150906638, + "grad_norm": 0.5327640544330017, + "learning_rate": 9.155943086409738e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.9087148606777191, + "num_tokens": 10108050.0, + "step": 2124 + }, + { + "epoch": 0.21229831659923074, + "grad_norm": 0.6196602818462071, + "learning_rate": 9.155043344809217e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.9066621661186218, + "num_tokens": 10189544.0, + "step": 2125 + }, + { + "epoch": 0.21239822168939507, + "grad_norm": 0.5650723072302045, + "learning_rate": 9.154143168171594e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.9087011814117432, + "num_tokens": 10271155.0, + "step": 2126 + }, + { + "epoch": 0.21249812677955943, + "grad_norm": 0.6849806225143106, + "learning_rate": 9.153242556591115e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.9039191901683807, + "num_tokens": 10352615.0, + "step": 2127 + }, + { + "epoch": 0.21259803186972376, + "grad_norm": 0.6689980363491163, + "learning_rate": 9.15234151016208e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.9062283933162689, + "num_tokens": 10434207.0, + "step": 2128 + }, + { + "epoch": 0.21269793695988812, + "grad_norm": 0.558834841567734, + "learning_rate": 9.151440028978826e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.9085615575313568, + "num_tokens": 10515771.0, + "step": 2129 + }, + { + "epoch": 0.21279784205005245, + "grad_norm": 0.5672014504575282, + "learning_rate": 9.15053811313574e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.9077386260032654, + "num_tokens": 10597287.0, + "step": 2130 + }, + { + "epoch": 0.21289774714021678, + "grad_norm": 0.557583028785806, + "learning_rate": 9.149635762727252e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.9088267087936401, + "num_tokens": 10678821.0, + "step": 2131 + }, + { + "epoch": 0.21299765223038114, + "grad_norm": 0.6079060539889315, + "learning_rate": 9.148732977847842e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.9092229902744293, + "num_tokens": 10760355.0, + "step": 2132 + }, + { + "epoch": 0.21309755732054547, + "grad_norm": 0.5930753900090324, + "learning_rate": 9.14782975859203e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9058975279331207, + "num_tokens": 10841846.0, + "step": 2133 + }, + { + "epoch": 0.21319746241070983, + "grad_norm": 0.7856622177918904, + "learning_rate": 9.146926105054384e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.905502200126648, + "num_tokens": 10923425.0, + "step": 2134 + }, + { + "epoch": 0.21329736750087416, + "grad_norm": 0.5164208536654903, + "learning_rate": 9.146022017329519e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.9093694686889648, + "num_tokens": 11004928.0, + "step": 2135 + }, + { + "epoch": 0.21339727259103852, + "grad_norm": 0.6169860764533812, + "learning_rate": 9.145117495512092e-06, + "loss": 0.495, + "mean_token_accuracy": 0.9063563644886017, + "num_tokens": 11086440.0, + "step": 2136 + }, + { + "epoch": 0.21349717768120285, + "grad_norm": 0.5634805870060872, + "learning_rate": 9.144212539696811e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.90761798620224, + "num_tokens": 11168000.0, + "step": 2137 + }, + { + "epoch": 0.2135970827713672, + "grad_norm": 0.5780807987536287, + "learning_rate": 9.14330714997842e-06, + "loss": 0.503, + "mean_token_accuracy": 0.905341625213623, + "num_tokens": 11249457.0, + "step": 2138 + }, + { + "epoch": 0.21369698786153155, + "grad_norm": 0.5787166427660985, + "learning_rate": 9.142401326451719e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.907018780708313, + "num_tokens": 11330935.0, + "step": 2139 + }, + { + "epoch": 0.21379689295169588, + "grad_norm": 0.6026829335436322, + "learning_rate": 9.141495069211545e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.9096519649028778, + "num_tokens": 11412509.0, + "step": 2140 + }, + { + "epoch": 0.21389679804186024, + "grad_norm": 0.5928526569856553, + "learning_rate": 9.140588378352788e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.9072940647602081, + "num_tokens": 11494129.0, + "step": 2141 + }, + { + "epoch": 0.21399670313202457, + "grad_norm": 0.6116081525940342, + "learning_rate": 9.139681253970376e-06, + "loss": 0.503, + "mean_token_accuracy": 0.9039322435855865, + "num_tokens": 11575641.0, + "step": 2142 + }, + { + "epoch": 0.21409660822218893, + "grad_norm": 0.6098990972965905, + "learning_rate": 9.138773696159287e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.9107858538627625, + "num_tokens": 11657200.0, + "step": 2143 + }, + { + "epoch": 0.21419651331235326, + "grad_norm": 0.5126487061917474, + "learning_rate": 9.137865705014543e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.9085273444652557, + "num_tokens": 11738720.0, + "step": 2144 + }, + { + "epoch": 0.21429641840251762, + "grad_norm": 0.8464163718970974, + "learning_rate": 9.136957280631212e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.9057577252388, + "num_tokens": 11820371.0, + "step": 2145 + }, + { + "epoch": 0.21439632349268195, + "grad_norm": 0.6014485668296933, + "learning_rate": 9.136048423104407e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.9078609049320221, + "num_tokens": 11901955.0, + "step": 2146 + }, + { + "epoch": 0.21449622858284628, + "grad_norm": 0.623321806326042, + "learning_rate": 9.135139132529287e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.9075108170509338, + "num_tokens": 11983455.0, + "step": 2147 + }, + { + "epoch": 0.21459613367301064, + "grad_norm": 0.5728700823829654, + "learning_rate": 9.134229409001054e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.9071289598941803, + "num_tokens": 12065050.0, + "step": 2148 + }, + { + "epoch": 0.21469603876317497, + "grad_norm": 0.5420526398040895, + "learning_rate": 9.133319252614955e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.9063701629638672, + "num_tokens": 12146581.0, + "step": 2149 + }, + { + "epoch": 0.21479594385333933, + "grad_norm": 0.5750674055443037, + "learning_rate": 9.13240866346629e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.9059119820594788, + "num_tokens": 12228063.0, + "step": 2150 + }, + { + "epoch": 0.21489584894350366, + "grad_norm": 0.6344783965551672, + "learning_rate": 9.131497641650396e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.9058117270469666, + "num_tokens": 12309687.0, + "step": 2151 + }, + { + "epoch": 0.21499575403366802, + "grad_norm": 0.6646074261208041, + "learning_rate": 9.130586187262656e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.9026245176792145, + "num_tokens": 12391176.0, + "step": 2152 + }, + { + "epoch": 0.21509565912383236, + "grad_norm": 0.6366102153920511, + "learning_rate": 9.1296743003985e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.9060650765895844, + "num_tokens": 12472673.0, + "step": 2153 + }, + { + "epoch": 0.21519556421399672, + "grad_norm": 0.5617174987165862, + "learning_rate": 9.128761981153406e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.9042928516864777, + "num_tokens": 12554183.0, + "step": 2154 + }, + { + "epoch": 0.21529546930416105, + "grad_norm": 0.6434153904279063, + "learning_rate": 9.127849229622893e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.9066078662872314, + "num_tokens": 12635802.0, + "step": 2155 + }, + { + "epoch": 0.21539537439432538, + "grad_norm": 0.659806967393216, + "learning_rate": 9.12693604590253e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.906351774930954, + "num_tokens": 12717417.0, + "step": 2156 + }, + { + "epoch": 0.21549527948448974, + "grad_norm": 0.5562274646584575, + "learning_rate": 9.126022430087924e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.9089041948318481, + "num_tokens": 12798999.0, + "step": 2157 + }, + { + "epoch": 0.21559518457465407, + "grad_norm": 0.6451702076818079, + "learning_rate": 9.125108382274733e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.9069920480251312, + "num_tokens": 12880532.0, + "step": 2158 + }, + { + "epoch": 0.21569508966481843, + "grad_norm": 0.7890151775450929, + "learning_rate": 9.12419390255866e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.9076779186725616, + "num_tokens": 12962034.0, + "step": 2159 + }, + { + "epoch": 0.21579499475498276, + "grad_norm": 0.5856697907261611, + "learning_rate": 9.123278991035451e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.9072773456573486, + "num_tokens": 13043630.0, + "step": 2160 + }, + { + "epoch": 0.21589489984514712, + "grad_norm": 0.622014967768131, + "learning_rate": 9.122363647800898e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.9048896133899689, + "num_tokens": 13125044.0, + "step": 2161 + }, + { + "epoch": 0.21599480493531145, + "grad_norm": 0.7475003501394717, + "learning_rate": 9.121447872950836e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.9087051749229431, + "num_tokens": 13206633.0, + "step": 2162 + }, + { + "epoch": 0.2160947100254758, + "grad_norm": 0.6091402805172421, + "learning_rate": 9.120531666581153e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.9057252109050751, + "num_tokens": 13288146.0, + "step": 2163 + }, + { + "epoch": 0.21619461511564014, + "grad_norm": 0.8190457091971148, + "learning_rate": 9.119615028787771e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.9058749377727509, + "num_tokens": 13369667.0, + "step": 2164 + }, + { + "epoch": 0.21629452020580447, + "grad_norm": 0.9676513694219531, + "learning_rate": 9.118697959666669e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.9060985147953033, + "num_tokens": 13451198.0, + "step": 2165 + }, + { + "epoch": 0.21639442529596883, + "grad_norm": 0.845361134010979, + "learning_rate": 9.11778045931386e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.9060240387916565, + "num_tokens": 13532767.0, + "step": 2166 + }, + { + "epoch": 0.21649433038613317, + "grad_norm": 0.6537204761799966, + "learning_rate": 9.116862527825407e-06, + "loss": 0.492, + "mean_token_accuracy": 0.90647292137146, + "num_tokens": 13614329.0, + "step": 2167 + }, + { + "epoch": 0.21659423547629753, + "grad_norm": 0.6112506433300774, + "learning_rate": 9.115944165297421e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.9029442071914673, + "num_tokens": 13695874.0, + "step": 2168 + }, + { + "epoch": 0.21669414056646186, + "grad_norm": 0.5910138098278558, + "learning_rate": 9.115025371826054e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.9050811529159546, + "num_tokens": 13777333.0, + "step": 2169 + }, + { + "epoch": 0.21679404565662622, + "grad_norm": 0.6076867605266073, + "learning_rate": 9.114106147507506e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.9065181314945221, + "num_tokens": 13858855.0, + "step": 2170 + }, + { + "epoch": 0.21689395074679055, + "grad_norm": 2.555307357897485, + "learning_rate": 9.11318649243802e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.9035485684871674, + "num_tokens": 13940398.0, + "step": 2171 + }, + { + "epoch": 0.21699385583695488, + "grad_norm": 0.591466085197757, + "learning_rate": 9.112266406713884e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.9067935943603516, + "num_tokens": 14021869.0, + "step": 2172 + }, + { + "epoch": 0.21709376092711924, + "grad_norm": 1.0618694709104617, + "learning_rate": 9.111345890431431e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.9036468863487244, + "num_tokens": 14103338.0, + "step": 2173 + }, + { + "epoch": 0.21719366601728357, + "grad_norm": 0.7228060493732148, + "learning_rate": 9.110424943687042e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.9063203632831573, + "num_tokens": 14184812.0, + "step": 2174 + }, + { + "epoch": 0.21729357110744793, + "grad_norm": 0.5979228556122829, + "learning_rate": 9.109503566577143e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.9044276177883148, + "num_tokens": 14266251.0, + "step": 2175 + }, + { + "epoch": 0.21739347619761226, + "grad_norm": 0.5864190474072135, + "learning_rate": 9.108581759198198e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.9056098163127899, + "num_tokens": 14347777.0, + "step": 2176 + }, + { + "epoch": 0.21749338128777662, + "grad_norm": 0.7376984243227857, + "learning_rate": 9.107659521646723e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.9086695909500122, + "num_tokens": 14429332.0, + "step": 2177 + }, + { + "epoch": 0.21759328637794095, + "grad_norm": 0.5706698171302725, + "learning_rate": 9.106736854019279e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.9065994620323181, + "num_tokens": 14510820.0, + "step": 2178 + }, + { + "epoch": 0.2176931914681053, + "grad_norm": 0.693804996526983, + "learning_rate": 9.105813756412469e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.9053118824958801, + "num_tokens": 14592298.0, + "step": 2179 + }, + { + "epoch": 0.21779309655826964, + "grad_norm": 0.6000087018873931, + "learning_rate": 9.104890228922942e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.9079056680202484, + "num_tokens": 14673831.0, + "step": 2180 + }, + { + "epoch": 0.21789300164843398, + "grad_norm": 0.6265562829873711, + "learning_rate": 9.103966271647393e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.9063959121704102, + "num_tokens": 14755355.0, + "step": 2181 + }, + { + "epoch": 0.21799290673859834, + "grad_norm": 0.881646037816162, + "learning_rate": 9.10304188468256e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.9095840752124786, + "num_tokens": 14836788.0, + "step": 2182 + }, + { + "epoch": 0.21809281182876267, + "grad_norm": 0.622078861943309, + "learning_rate": 9.102117068125227e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.9074369072914124, + "num_tokens": 14918260.0, + "step": 2183 + }, + { + "epoch": 0.21819271691892703, + "grad_norm": 0.5739294153599099, + "learning_rate": 9.101191822072227e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.9045599400997162, + "num_tokens": 14999706.0, + "step": 2184 + }, + { + "epoch": 0.21829262200909136, + "grad_norm": 0.8152974738936851, + "learning_rate": 9.100266146620428e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.9053591191768646, + "num_tokens": 15081229.0, + "step": 2185 + }, + { + "epoch": 0.21839252709925572, + "grad_norm": 0.5984844332912354, + "learning_rate": 9.099340041866754e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.9069071114063263, + "num_tokens": 15162719.0, + "step": 2186 + }, + { + "epoch": 0.21849243218942005, + "grad_norm": 0.6909647201841614, + "learning_rate": 9.098413507908167e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.9065874814987183, + "num_tokens": 15244259.0, + "step": 2187 + }, + { + "epoch": 0.21859233727958438, + "grad_norm": 0.7278176740272301, + "learning_rate": 9.097486544841675e-06, + "loss": 0.5, + "mean_token_accuracy": 0.9074449241161346, + "num_tokens": 15325740.0, + "step": 2188 + }, + { + "epoch": 0.21869224236974874, + "grad_norm": 0.6010008946447362, + "learning_rate": 9.096559152764337e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.9060453474521637, + "num_tokens": 15407182.0, + "step": 2189 + }, + { + "epoch": 0.21879214745991307, + "grad_norm": 0.8737137479624794, + "learning_rate": 9.095631331773245e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.9078371524810791, + "num_tokens": 15488711.0, + "step": 2190 + }, + { + "epoch": 0.21889205255007743, + "grad_norm": 0.6195596368272647, + "learning_rate": 9.094703081965548e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.9062112271785736, + "num_tokens": 15570187.0, + "step": 2191 + }, + { + "epoch": 0.21899195764024176, + "grad_norm": 0.719018209624136, + "learning_rate": 9.093774403438432e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.9086824357509613, + "num_tokens": 15651831.0, + "step": 2192 + }, + { + "epoch": 0.21909186273040612, + "grad_norm": 0.6252038236765664, + "learning_rate": 9.09284529628913e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.9061281681060791, + "num_tokens": 15733281.0, + "step": 2193 + }, + { + "epoch": 0.21919176782057045, + "grad_norm": 0.6790111973596932, + "learning_rate": 9.091915760614924e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.9048574864864349, + "num_tokens": 15814774.0, + "step": 2194 + }, + { + "epoch": 0.21929167291073481, + "grad_norm": 0.5892527249944317, + "learning_rate": 9.090985796513133e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.9086838066577911, + "num_tokens": 15896298.0, + "step": 2195 + }, + { + "epoch": 0.21939157800089915, + "grad_norm": 0.5001195137349369, + "learning_rate": 9.090055404081128e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.9026140570640564, + "num_tokens": 15977848.0, + "step": 2196 + }, + { + "epoch": 0.21949148309106348, + "grad_norm": 0.6792849664850917, + "learning_rate": 9.08912458341632e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.9062270820140839, + "num_tokens": 16059347.0, + "step": 2197 + }, + { + "epoch": 0.21959138818122784, + "grad_norm": 0.6193835231826147, + "learning_rate": 9.08819333461617e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.9087932705879211, + "num_tokens": 16140845.0, + "step": 2198 + }, + { + "epoch": 0.21969129327139217, + "grad_norm": 0.5851458370822916, + "learning_rate": 9.087261657778177e-06, + "loss": 0.497, + "mean_token_accuracy": 0.9061657190322876, + "num_tokens": 16222337.0, + "step": 2199 + }, + { + "epoch": 0.21979119836155653, + "grad_norm": 0.5974135302378931, + "learning_rate": 9.08632955299989e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.9065149426460266, + "num_tokens": 16303860.0, + "step": 2200 + }, + { + "epoch": 0.21989110345172086, + "grad_norm": 0.7053421341731639, + "learning_rate": 9.085397020378902e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.9051867425441742, + "num_tokens": 16385378.0, + "step": 2201 + }, + { + "epoch": 0.21999100854188522, + "grad_norm": 0.962629235838935, + "learning_rate": 9.084464060012849e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.9058866500854492, + "num_tokens": 16466903.0, + "step": 2202 + }, + { + "epoch": 0.22009091363204955, + "grad_norm": 0.5653808249303599, + "learning_rate": 9.083530671999414e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.9076083898544312, + "num_tokens": 16548344.0, + "step": 2203 + }, + { + "epoch": 0.2201908187222139, + "grad_norm": 0.5272502585504896, + "learning_rate": 9.082596856436323e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.9081661105155945, + "num_tokens": 16629822.0, + "step": 2204 + }, + { + "epoch": 0.22029072381237824, + "grad_norm": 0.7810684915469412, + "learning_rate": 9.081662613421348e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.9100877642631531, + "num_tokens": 16711341.0, + "step": 2205 + }, + { + "epoch": 0.22039062890254257, + "grad_norm": 0.5738780394421014, + "learning_rate": 9.080727943052304e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.9082639217376709, + "num_tokens": 16792924.0, + "step": 2206 + }, + { + "epoch": 0.22049053399270693, + "grad_norm": 0.812335872162402, + "learning_rate": 9.079792845427051e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.9060531854629517, + "num_tokens": 16874449.0, + "step": 2207 + }, + { + "epoch": 0.22059043908287126, + "grad_norm": 0.7036226394431596, + "learning_rate": 9.078857320643497e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.9066930711269379, + "num_tokens": 16955996.0, + "step": 2208 + }, + { + "epoch": 0.22069034417303562, + "grad_norm": 1.015958689129906, + "learning_rate": 9.077921368799591e-06, + "loss": 0.496, + "mean_token_accuracy": 0.9064944982528687, + "num_tokens": 17037538.0, + "step": 2209 + }, + { + "epoch": 0.22079024926319996, + "grad_norm": 0.6122077902052798, + "learning_rate": 9.07698498999333e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.9082396328449249, + "num_tokens": 17119066.0, + "step": 2210 + }, + { + "epoch": 0.22089015435336432, + "grad_norm": 0.6287738384391701, + "learning_rate": 9.07604818432275e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.9060086011886597, + "num_tokens": 17200600.0, + "step": 2211 + }, + { + "epoch": 0.22099005944352865, + "grad_norm": 0.6320992083697037, + "learning_rate": 9.075110951885937e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.9092421233654022, + "num_tokens": 17282130.0, + "step": 2212 + }, + { + "epoch": 0.22108996453369298, + "grad_norm": 0.7384757392277104, + "learning_rate": 9.074173292781024e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9070439338684082, + "num_tokens": 17363657.0, + "step": 2213 + }, + { + "epoch": 0.22118986962385734, + "grad_norm": 0.6184288178697619, + "learning_rate": 9.07323520710618e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.9052446782588959, + "num_tokens": 17445095.0, + "step": 2214 + }, + { + "epoch": 0.22128977471402167, + "grad_norm": 0.5301760462052385, + "learning_rate": 9.072296694959625e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.9047082662582397, + "num_tokens": 17526616.0, + "step": 2215 + }, + { + "epoch": 0.22138967980418603, + "grad_norm": 0.7509641436585796, + "learning_rate": 9.071357756439623e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.9058833420276642, + "num_tokens": 17608183.0, + "step": 2216 + }, + { + "epoch": 0.22148958489435036, + "grad_norm": 0.7173001483519014, + "learning_rate": 9.07041839164448e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.9062342643737793, + "num_tokens": 17689683.0, + "step": 2217 + }, + { + "epoch": 0.22158948998451472, + "grad_norm": 0.5145000937896744, + "learning_rate": 9.06947860067255e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.9092433452606201, + "num_tokens": 17771231.0, + "step": 2218 + }, + { + "epoch": 0.22168939507467905, + "grad_norm": 0.6993964082067219, + "learning_rate": 9.068538383622232e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.9041145145893097, + "num_tokens": 17852654.0, + "step": 2219 + }, + { + "epoch": 0.2217893001648434, + "grad_norm": 0.6440849763424056, + "learning_rate": 9.067597740591963e-06, + "loss": 0.496, + "mean_token_accuracy": 0.9059131145477295, + "num_tokens": 17934200.0, + "step": 2220 + }, + { + "epoch": 0.22188920525500774, + "grad_norm": 0.7094416603457926, + "learning_rate": 9.066656671680231e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.9063366949558258, + "num_tokens": 18015749.0, + "step": 2221 + }, + { + "epoch": 0.22198911034517207, + "grad_norm": 0.9534631679061525, + "learning_rate": 9.065715176985567e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.9071661829948425, + "num_tokens": 18097305.0, + "step": 2222 + }, + { + "epoch": 0.22208901543533643, + "grad_norm": 0.5978157169283164, + "learning_rate": 9.064773256606549e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.9029004573822021, + "num_tokens": 18178755.0, + "step": 2223 + }, + { + "epoch": 0.22218892052550077, + "grad_norm": 0.601587129507342, + "learning_rate": 9.063830910641793e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.9048340022563934, + "num_tokens": 18260296.0, + "step": 2224 + }, + { + "epoch": 0.22228882561566513, + "grad_norm": 0.554888268182812, + "learning_rate": 9.062888139189964e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.9077679216861725, + "num_tokens": 18341827.0, + "step": 2225 + }, + { + "epoch": 0.22238873070582946, + "grad_norm": 0.6814638338485364, + "learning_rate": 9.061944942349774e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.9108027219772339, + "num_tokens": 18423430.0, + "step": 2226 + }, + { + "epoch": 0.22248863579599382, + "grad_norm": 0.5789516855174135, + "learning_rate": 9.061001320219975e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.908812403678894, + "num_tokens": 18504906.0, + "step": 2227 + }, + { + "epoch": 0.22258854088615815, + "grad_norm": 0.8315406977754745, + "learning_rate": 9.060057272899366e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.9063022434711456, + "num_tokens": 18586411.0, + "step": 2228 + }, + { + "epoch": 0.22268844597632248, + "grad_norm": 0.7116952367039031, + "learning_rate": 9.059112800486787e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.9053259789943695, + "num_tokens": 18667961.0, + "step": 2229 + }, + { + "epoch": 0.22278835106648684, + "grad_norm": 0.6225917435562206, + "learning_rate": 9.058167903081126e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.9090316891670227, + "num_tokens": 18749510.0, + "step": 2230 + }, + { + "epoch": 0.22288825615665117, + "grad_norm": 0.5319625104561505, + "learning_rate": 9.057222580781317e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.9034853279590607, + "num_tokens": 18831026.0, + "step": 2231 + }, + { + "epoch": 0.22298816124681553, + "grad_norm": 0.6871667782829599, + "learning_rate": 9.056276833686337e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.9072116017341614, + "num_tokens": 18912596.0, + "step": 2232 + }, + { + "epoch": 0.22308806633697986, + "grad_norm": 0.608359547162145, + "learning_rate": 9.0553306618952e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.9071299731731415, + "num_tokens": 18994084.0, + "step": 2233 + }, + { + "epoch": 0.22318797142714422, + "grad_norm": 0.53163568178448, + "learning_rate": 9.054384065506979e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.9072765707969666, + "num_tokens": 19075650.0, + "step": 2234 + }, + { + "epoch": 0.22328787651730855, + "grad_norm": 0.6658773207170045, + "learning_rate": 9.053437044620779e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.9049160778522491, + "num_tokens": 19157168.0, + "step": 2235 + }, + { + "epoch": 0.2233877816074729, + "grad_norm": 0.5356660211571339, + "learning_rate": 9.052489599335756e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.9088845252990723, + "num_tokens": 19238796.0, + "step": 2236 + }, + { + "epoch": 0.22348768669763724, + "grad_norm": 0.5661843654349936, + "learning_rate": 9.051541729751107e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.9038540720939636, + "num_tokens": 19320260.0, + "step": 2237 + }, + { + "epoch": 0.22358759178780158, + "grad_norm": 0.5341708609683077, + "learning_rate": 9.050593435966076e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.9041861593723297, + "num_tokens": 19401793.0, + "step": 2238 + }, + { + "epoch": 0.22368749687796594, + "grad_norm": 0.6976585091029546, + "learning_rate": 9.04964471807995e-06, + "loss": 0.502, + "mean_token_accuracy": 0.9068195521831512, + "num_tokens": 19483264.0, + "step": 2239 + }, + { + "epoch": 0.22378740196813027, + "grad_norm": 0.6060954997890676, + "learning_rate": 9.048695576192058e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.9044314622879028, + "num_tokens": 19564838.0, + "step": 2240 + }, + { + "epoch": 0.22388730705829463, + "grad_norm": 0.5817426425643342, + "learning_rate": 9.04774601040178e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.9073969423770905, + "num_tokens": 19646448.0, + "step": 2241 + }, + { + "epoch": 0.22398721214845896, + "grad_norm": 0.5377532318387337, + "learning_rate": 9.046796020808535e-06, + "loss": 0.495, + "mean_token_accuracy": 0.9083714485168457, + "num_tokens": 19727941.0, + "step": 2242 + }, + { + "epoch": 0.22408711723862332, + "grad_norm": 0.8830375762299383, + "learning_rate": 9.045845607511788e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.9065452814102173, + "num_tokens": 19809478.0, + "step": 2243 + }, + { + "epoch": 0.22418702232878765, + "grad_norm": 0.5803737092428453, + "learning_rate": 9.044894770611049e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.9068155884742737, + "num_tokens": 19891038.0, + "step": 2244 + }, + { + "epoch": 0.224286927418952, + "grad_norm": 0.61231592939164, + "learning_rate": 9.04394351020587e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.9040319621562958, + "num_tokens": 19972601.0, + "step": 2245 + }, + { + "epoch": 0.22438683250911634, + "grad_norm": 0.6002676178567073, + "learning_rate": 9.042991826395848e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.9067756235599518, + "num_tokens": 20054051.0, + "step": 2246 + }, + { + "epoch": 0.22448673759928067, + "grad_norm": 0.6090279032262144, + "learning_rate": 9.042039719280629e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.9042828381061554, + "num_tokens": 20135496.0, + "step": 2247 + }, + { + "epoch": 0.22458664268944503, + "grad_norm": 0.5952753096333692, + "learning_rate": 9.041087188959896e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.9068930149078369, + "num_tokens": 20216987.0, + "step": 2248 + }, + { + "epoch": 0.22468654777960936, + "grad_norm": 0.5384304595373209, + "learning_rate": 9.04013423553338e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.9098114371299744, + "num_tokens": 20298497.0, + "step": 2249 + }, + { + "epoch": 0.22478645286977372, + "grad_norm": 0.5234593808554375, + "learning_rate": 9.03918085910086e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.9058435261249542, + "num_tokens": 20379976.0, + "step": 2250 + }, + { + "epoch": 0.22488635795993805, + "grad_norm": 0.5438263661765389, + "learning_rate": 9.03822705976215e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.9064952731132507, + "num_tokens": 20461508.0, + "step": 2251 + }, + { + "epoch": 0.22498626305010241, + "grad_norm": 0.6492946346670525, + "learning_rate": 9.03727283761712e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.9061855971813202, + "num_tokens": 20542984.0, + "step": 2252 + }, + { + "epoch": 0.22508616814026675, + "grad_norm": 0.6331444449951891, + "learning_rate": 9.036318192765672e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.9074423611164093, + "num_tokens": 20624513.0, + "step": 2253 + }, + { + "epoch": 0.22518607323043108, + "grad_norm": 0.625364494730804, + "learning_rate": 9.03536312530776e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.9076792895793915, + "num_tokens": 20705982.0, + "step": 2254 + }, + { + "epoch": 0.22528597832059544, + "grad_norm": 0.5851015359383942, + "learning_rate": 9.034407635343381e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.9081449508666992, + "num_tokens": 20787588.0, + "step": 2255 + }, + { + "epoch": 0.22538588341075977, + "grad_norm": 0.5773939064327027, + "learning_rate": 9.033451722972578e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9074722528457642, + "num_tokens": 20869049.0, + "step": 2256 + }, + { + "epoch": 0.22548578850092413, + "grad_norm": 0.5451216769667226, + "learning_rate": 9.032495388295433e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.9084422886371613, + "num_tokens": 20950495.0, + "step": 2257 + }, + { + "epoch": 0.22558569359108846, + "grad_norm": 0.6034045195868002, + "learning_rate": 9.031538631412076e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.9086568057537079, + "num_tokens": 21031990.0, + "step": 2258 + }, + { + "epoch": 0.22568559868125282, + "grad_norm": 0.5987676984104412, + "learning_rate": 9.03058145242268e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.9047747552394867, + "num_tokens": 21113551.0, + "step": 2259 + }, + { + "epoch": 0.22578550377141715, + "grad_norm": 0.6365036772324182, + "learning_rate": 9.029623851427463e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.9039886593818665, + "num_tokens": 21195028.0, + "step": 2260 + }, + { + "epoch": 0.2258854088615815, + "grad_norm": 1.0829142556255078, + "learning_rate": 9.028665828526688e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.9096079766750336, + "num_tokens": 21276573.0, + "step": 2261 + }, + { + "epoch": 0.22598531395174584, + "grad_norm": 0.6049898686521954, + "learning_rate": 9.027707383820658e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.9086971580982208, + "num_tokens": 21358138.0, + "step": 2262 + }, + { + "epoch": 0.22608521904191017, + "grad_norm": 0.6599817057309428, + "learning_rate": 9.026748517409728e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.9053716659545898, + "num_tokens": 21439533.0, + "step": 2263 + }, + { + "epoch": 0.22618512413207453, + "grad_norm": 0.5882990862017443, + "learning_rate": 9.025789229394287e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.9041168987751007, + "num_tokens": 21520935.0, + "step": 2264 + }, + { + "epoch": 0.22628502922223886, + "grad_norm": 0.5153689402679603, + "learning_rate": 9.024829519874774e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.9056523144245148, + "num_tokens": 21602449.0, + "step": 2265 + }, + { + "epoch": 0.22638493431240322, + "grad_norm": 0.6126087938644003, + "learning_rate": 9.023869388951673e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.9083337187767029, + "num_tokens": 21683864.0, + "step": 2266 + }, + { + "epoch": 0.22648483940256756, + "grad_norm": 0.6105865637346358, + "learning_rate": 9.022908836725513e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.9080066680908203, + "num_tokens": 21765372.0, + "step": 2267 + }, + { + "epoch": 0.22658474449273192, + "grad_norm": 0.5043659295106456, + "learning_rate": 9.021947863296862e-06, + "loss": 0.5, + "mean_token_accuracy": 0.9065526723861694, + "num_tokens": 21846814.0, + "step": 2268 + }, + { + "epoch": 0.22668464958289625, + "grad_norm": 0.5360525882910786, + "learning_rate": 9.020986468766335e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.9088895618915558, + "num_tokens": 21928344.0, + "step": 2269 + }, + { + "epoch": 0.22678455467306058, + "grad_norm": 0.5612239944825973, + "learning_rate": 9.02002465323459e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.9081442952156067, + "num_tokens": 22009897.0, + "step": 2270 + }, + { + "epoch": 0.22688445976322494, + "grad_norm": 0.4797602863240463, + "learning_rate": 9.01906241680233e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.9059971868991852, + "num_tokens": 22091504.0, + "step": 2271 + }, + { + "epoch": 0.22698436485338927, + "grad_norm": 0.9200892203751951, + "learning_rate": 9.018099759570307e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.9091375768184662, + "num_tokens": 22173041.0, + "step": 2272 + }, + { + "epoch": 0.22708426994355363, + "grad_norm": 1.0030868530090038, + "learning_rate": 9.017136681639307e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.9110137820243835, + "num_tokens": 22254709.0, + "step": 2273 + }, + { + "epoch": 0.22718417503371796, + "grad_norm": 0.509948726660033, + "learning_rate": 9.016173183110166e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.9074483513832092, + "num_tokens": 22336169.0, + "step": 2274 + }, + { + "epoch": 0.22728408012388232, + "grad_norm": 0.46637225261741994, + "learning_rate": 9.015209264083766e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.9098201990127563, + "num_tokens": 22417649.0, + "step": 2275 + }, + { + "epoch": 0.22738398521404665, + "grad_norm": 0.5277898340182904, + "learning_rate": 9.014244924661026e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.9039736390113831, + "num_tokens": 22499341.0, + "step": 2276 + }, + { + "epoch": 0.227483890304211, + "grad_norm": 0.5621064771900197, + "learning_rate": 9.013280164942915e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.9040614664554596, + "num_tokens": 22580811.0, + "step": 2277 + }, + { + "epoch": 0.22758379539437534, + "grad_norm": 0.6707585198587914, + "learning_rate": 9.012314985030445e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.9057380855083466, + "num_tokens": 22662346.0, + "step": 2278 + }, + { + "epoch": 0.22768370048453968, + "grad_norm": 0.6121628632092836, + "learning_rate": 9.011349385024673e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.90648153424263, + "num_tokens": 22743904.0, + "step": 2279 + }, + { + "epoch": 0.22778360557470403, + "grad_norm": 0.65657592249753, + "learning_rate": 9.010383365026695e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.9072244167327881, + "num_tokens": 22825444.0, + "step": 2280 + }, + { + "epoch": 0.22788351066486837, + "grad_norm": 0.6507762528130775, + "learning_rate": 9.009416925137655e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.907643735408783, + "num_tokens": 22906985.0, + "step": 2281 + }, + { + "epoch": 0.22798341575503273, + "grad_norm": 0.5655385396861444, + "learning_rate": 9.008450065458742e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.9089814722537994, + "num_tokens": 22988403.0, + "step": 2282 + }, + { + "epoch": 0.22808332084519706, + "grad_norm": 0.7499449949712983, + "learning_rate": 9.007482786091185e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.9071504175662994, + "num_tokens": 23070009.0, + "step": 2283 + }, + { + "epoch": 0.22818322593536142, + "grad_norm": 0.5766473027755656, + "learning_rate": 9.00651508713626e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.9061679840087891, + "num_tokens": 23151513.0, + "step": 2284 + }, + { + "epoch": 0.22828313102552575, + "grad_norm": 0.6360326419170599, + "learning_rate": 9.005546968695288e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.907418429851532, + "num_tokens": 23233025.0, + "step": 2285 + }, + { + "epoch": 0.2283830361156901, + "grad_norm": 0.6183760853508764, + "learning_rate": 9.004578430869628e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.9041421115398407, + "num_tokens": 23314545.0, + "step": 2286 + }, + { + "epoch": 0.22848294120585444, + "grad_norm": 0.7434801863045044, + "learning_rate": 9.003609473760689e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.9069006443023682, + "num_tokens": 23396003.0, + "step": 2287 + }, + { + "epoch": 0.22858284629601877, + "grad_norm": 0.4905218526796657, + "learning_rate": 9.00264009746992e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9063677489757538, + "num_tokens": 23477493.0, + "step": 2288 + }, + { + "epoch": 0.22868275138618313, + "grad_norm": 0.5556717190476552, + "learning_rate": 9.001670302098819e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.9076153039932251, + "num_tokens": 23559022.0, + "step": 2289 + }, + { + "epoch": 0.22878265647634746, + "grad_norm": 0.5118250126034943, + "learning_rate": 9.000700087748922e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.9062804877758026, + "num_tokens": 23640524.0, + "step": 2290 + }, + { + "epoch": 0.22888256156651182, + "grad_norm": 0.5565838677541989, + "learning_rate": 8.999729454521812e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.9059608280658722, + "num_tokens": 23722021.0, + "step": 2291 + }, + { + "epoch": 0.22898246665667615, + "grad_norm": 0.5569762145894059, + "learning_rate": 8.998758402519116e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.9074193835258484, + "num_tokens": 23803577.0, + "step": 2292 + }, + { + "epoch": 0.2290823717468405, + "grad_norm": 0.6243841091010685, + "learning_rate": 8.997786931842504e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.9079151153564453, + "num_tokens": 23885047.0, + "step": 2293 + }, + { + "epoch": 0.22918227683700484, + "grad_norm": 0.5232775092161218, + "learning_rate": 8.99681504259369e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.9045235216617584, + "num_tokens": 23966560.0, + "step": 2294 + }, + { + "epoch": 0.22928218192716918, + "grad_norm": 0.6013871910433014, + "learning_rate": 8.995842734874429e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.9074326753616333, + "num_tokens": 24048115.0, + "step": 2295 + }, + { + "epoch": 0.22938208701733354, + "grad_norm": 0.596736361569697, + "learning_rate": 8.994870008786527e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.9059423208236694, + "num_tokens": 24129529.0, + "step": 2296 + }, + { + "epoch": 0.22948199210749787, + "grad_norm": 0.5389510132757952, + "learning_rate": 8.993896864431825e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.9084348380565643, + "num_tokens": 24211083.0, + "step": 2297 + }, + { + "epoch": 0.22958189719766223, + "grad_norm": 0.5637763484615813, + "learning_rate": 8.992923301912215e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.9075959622859955, + "num_tokens": 24292483.0, + "step": 2298 + }, + { + "epoch": 0.22968180228782656, + "grad_norm": 0.5089140032965942, + "learning_rate": 8.99194932132963e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.9067973792552948, + "num_tokens": 24373987.0, + "step": 2299 + }, + { + "epoch": 0.22978170737799092, + "grad_norm": 0.5289413193570186, + "learning_rate": 8.990974922786046e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9060303866863251, + "num_tokens": 24455495.0, + "step": 2300 + }, + { + "epoch": 0.22988161246815525, + "grad_norm": 0.5117647243790036, + "learning_rate": 8.990000106383483e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.9075453281402588, + "num_tokens": 24537009.0, + "step": 2301 + }, + { + "epoch": 0.2299815175583196, + "grad_norm": 0.8395630500026472, + "learning_rate": 8.989024872224004e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.9088029265403748, + "num_tokens": 24618553.0, + "step": 2302 + }, + { + "epoch": 0.23008142264848394, + "grad_norm": 0.6408131577890503, + "learning_rate": 8.98804922040972e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.9062442481517792, + "num_tokens": 24700063.0, + "step": 2303 + }, + { + "epoch": 0.23018132773864827, + "grad_norm": 0.6266607929546716, + "learning_rate": 8.987073151042782e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.9049553871154785, + "num_tokens": 24781596.0, + "step": 2304 + }, + { + "epoch": 0.23028123282881263, + "grad_norm": 0.5656408240041566, + "learning_rate": 8.986096664225383e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.9081454873085022, + "num_tokens": 24863095.0, + "step": 2305 + }, + { + "epoch": 0.23038113791897696, + "grad_norm": 1.36001678712615, + "learning_rate": 8.985119760059767e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.9079159498214722, + "num_tokens": 24944553.0, + "step": 2306 + }, + { + "epoch": 0.23048104300914132, + "grad_norm": 0.5928394442958752, + "learning_rate": 8.984142438648212e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.908227950334549, + "num_tokens": 25026139.0, + "step": 2307 + }, + { + "epoch": 0.23058094809930565, + "grad_norm": 0.4729445716113895, + "learning_rate": 8.983164700093045e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.9099930822849274, + "num_tokens": 25107719.0, + "step": 2308 + }, + { + "epoch": 0.23068085318947001, + "grad_norm": 0.5051710723770615, + "learning_rate": 8.982186544496638e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.9074678719043732, + "num_tokens": 25189247.0, + "step": 2309 + }, + { + "epoch": 0.23078075827963435, + "grad_norm": 0.6624393738546629, + "learning_rate": 8.981207971961403e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.9060401916503906, + "num_tokens": 25270773.0, + "step": 2310 + }, + { + "epoch": 0.23088066336979868, + "grad_norm": 0.63126016729401, + "learning_rate": 8.980228982589802e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.906111866235733, + "num_tokens": 25352275.0, + "step": 2311 + }, + { + "epoch": 0.23098056845996304, + "grad_norm": 0.5887110762396423, + "learning_rate": 8.97924957648433e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.9052241146564484, + "num_tokens": 25433743.0, + "step": 2312 + }, + { + "epoch": 0.23108047355012737, + "grad_norm": 1.005835160780738, + "learning_rate": 8.978269753747535e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.9078349471092224, + "num_tokens": 25515233.0, + "step": 2313 + }, + { + "epoch": 0.23118037864029173, + "grad_norm": 0.5437278167216478, + "learning_rate": 8.977289514482008e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.907535195350647, + "num_tokens": 25596766.0, + "step": 2314 + }, + { + "epoch": 0.23128028373045606, + "grad_norm": 0.49973718112985127, + "learning_rate": 8.976308858790375e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.906550258398056, + "num_tokens": 25678269.0, + "step": 2315 + }, + { + "epoch": 0.23138018882062042, + "grad_norm": 0.5576075039381005, + "learning_rate": 8.975327786775316e-06, + "loss": 0.501, + "mean_token_accuracy": 0.9065506756305695, + "num_tokens": 25759742.0, + "step": 2316 + }, + { + "epoch": 0.23148009391078475, + "grad_norm": 0.7471620254603334, + "learning_rate": 8.974346298539546e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.9074268043041229, + "num_tokens": 25841243.0, + "step": 2317 + }, + { + "epoch": 0.2315799990009491, + "grad_norm": 0.48427900234714555, + "learning_rate": 8.973364394185835e-06, + "loss": 0.495, + "mean_token_accuracy": 0.9078148007392883, + "num_tokens": 25922759.0, + "step": 2318 + }, + { + "epoch": 0.23167990409111344, + "grad_norm": 0.7712849112564492, + "learning_rate": 8.972382073816981e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.9066036641597748, + "num_tokens": 26004300.0, + "step": 2319 + }, + { + "epoch": 0.23177980918127777, + "grad_norm": 0.5457329420853797, + "learning_rate": 8.97139933753584e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.9039193093776703, + "num_tokens": 26085834.0, + "step": 2320 + }, + { + "epoch": 0.23187971427144213, + "grad_norm": 0.5636800406488067, + "learning_rate": 8.970416185445304e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.9034328162670135, + "num_tokens": 26167359.0, + "step": 2321 + }, + { + "epoch": 0.23197961936160647, + "grad_norm": 1.228597608107834, + "learning_rate": 8.969432617648309e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.9070598185062408, + "num_tokens": 26248919.0, + "step": 2322 + }, + { + "epoch": 0.23207952445177082, + "grad_norm": 0.5676209881729418, + "learning_rate": 8.968448634247835e-06, + "loss": 0.491, + "mean_token_accuracy": 0.9083161056041718, + "num_tokens": 26330454.0, + "step": 2323 + }, + { + "epoch": 0.23217942954193516, + "grad_norm": 0.5393899466591083, + "learning_rate": 8.967464235346908e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.9055046439170837, + "num_tokens": 26411960.0, + "step": 2324 + }, + { + "epoch": 0.23227933463209952, + "grad_norm": 0.5081235188049322, + "learning_rate": 8.966479421048593e-06, + "loss": 0.499, + "mean_token_accuracy": 0.9079857170581818, + "num_tokens": 26493433.0, + "step": 2325 + }, + { + "epoch": 0.23237923972226385, + "grad_norm": 0.5047824897677224, + "learning_rate": 8.965494191456003e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.9050396978855133, + "num_tokens": 26574917.0, + "step": 2326 + }, + { + "epoch": 0.2324791448124282, + "grad_norm": 0.5224778780232622, + "learning_rate": 8.964508546672293e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.9072381556034088, + "num_tokens": 26656417.0, + "step": 2327 + }, + { + "epoch": 0.23257904990259254, + "grad_norm": 0.5367153110134353, + "learning_rate": 8.963522486800658e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.9054966270923615, + "num_tokens": 26737925.0, + "step": 2328 + }, + { + "epoch": 0.23267895499275687, + "grad_norm": 1.0321948823285632, + "learning_rate": 8.962536011944343e-06, + "loss": 0.498, + "mean_token_accuracy": 0.9065845608711243, + "num_tokens": 26819432.0, + "step": 2329 + }, + { + "epoch": 0.23277886008292123, + "grad_norm": 0.6380045127487284, + "learning_rate": 8.96154912220663e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.9074903130531311, + "num_tokens": 26901025.0, + "step": 2330 + }, + { + "epoch": 0.23287876517308556, + "grad_norm": 0.7493740497860212, + "learning_rate": 8.960561817690849e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.9098260700702667, + "num_tokens": 26982511.0, + "step": 2331 + }, + { + "epoch": 0.23297867026324992, + "grad_norm": 0.5049321275418298, + "learning_rate": 8.959574098500369e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.904350221157074, + "num_tokens": 27064029.0, + "step": 2332 + }, + { + "epoch": 0.23307857535341425, + "grad_norm": 0.6048304300634437, + "learning_rate": 8.95858596473861e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.9092293679714203, + "num_tokens": 27145574.0, + "step": 2333 + }, + { + "epoch": 0.2331784804435786, + "grad_norm": 0.5872239192256713, + "learning_rate": 8.95759741650903e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.9062074422836304, + "num_tokens": 27226996.0, + "step": 2334 + }, + { + "epoch": 0.23327838553374294, + "grad_norm": 0.6216327385628683, + "learning_rate": 8.956608453915126e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.9089111089706421, + "num_tokens": 27308585.0, + "step": 2335 + }, + { + "epoch": 0.23337829062390728, + "grad_norm": 0.5541182162885873, + "learning_rate": 8.955619077060447e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.9077202081680298, + "num_tokens": 27390088.0, + "step": 2336 + }, + { + "epoch": 0.23347819571407163, + "grad_norm": 0.5610192779778838, + "learning_rate": 8.95462928604858e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.9067127704620361, + "num_tokens": 27471668.0, + "step": 2337 + }, + { + "epoch": 0.23357810080423597, + "grad_norm": 0.6477818616480141, + "learning_rate": 8.953639080983158e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.9067211449146271, + "num_tokens": 27553189.0, + "step": 2338 + }, + { + "epoch": 0.23367800589440033, + "grad_norm": 0.6056802564901473, + "learning_rate": 8.952648461967859e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.9074672758579254, + "num_tokens": 27634638.0, + "step": 2339 + }, + { + "epoch": 0.23377791098456466, + "grad_norm": 0.6490376271233919, + "learning_rate": 8.951657429106398e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.9067629277706146, + "num_tokens": 27716226.0, + "step": 2340 + }, + { + "epoch": 0.23387781607472902, + "grad_norm": 0.5505543199716115, + "learning_rate": 8.950665982502538e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.9053279757499695, + "num_tokens": 27797756.0, + "step": 2341 + }, + { + "epoch": 0.23397772116489335, + "grad_norm": 0.5422526841648874, + "learning_rate": 8.949674122260086e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.9047440588474274, + "num_tokens": 27879301.0, + "step": 2342 + }, + { + "epoch": 0.2340776262550577, + "grad_norm": 0.5196946097521362, + "learning_rate": 8.948681848482889e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.9062704145908356, + "num_tokens": 27960818.0, + "step": 2343 + }, + { + "epoch": 0.23417753134522204, + "grad_norm": 0.4508135648759486, + "learning_rate": 8.94768916127484e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.9080347120761871, + "num_tokens": 28042382.0, + "step": 2344 + }, + { + "epoch": 0.23427743643538637, + "grad_norm": 0.6098459222716771, + "learning_rate": 8.946696060739873e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.9055255949497223, + "num_tokens": 28123782.0, + "step": 2345 + }, + { + "epoch": 0.23437734152555073, + "grad_norm": 0.5751040558609999, + "learning_rate": 8.94570254698197e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.9050701558589935, + "num_tokens": 28205293.0, + "step": 2346 + }, + { + "epoch": 0.23447724661571506, + "grad_norm": 0.568123424852547, + "learning_rate": 8.944708620105148e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.9103677272796631, + "num_tokens": 28286776.0, + "step": 2347 + }, + { + "epoch": 0.23457715170587942, + "grad_norm": 0.7867157843646392, + "learning_rate": 8.943714280213477e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9068806767463684, + "num_tokens": 28368256.0, + "step": 2348 + }, + { + "epoch": 0.23467705679604375, + "grad_norm": 0.6477762136322618, + "learning_rate": 8.942719527411058e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.9062750041484833, + "num_tokens": 28449727.0, + "step": 2349 + }, + { + "epoch": 0.2347769618862081, + "grad_norm": 0.9892979890882271, + "learning_rate": 8.941724361802053e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.9093576073646545, + "num_tokens": 28531306.0, + "step": 2350 + }, + { + "epoch": 0.23487686697637244, + "grad_norm": 0.4445536525662568, + "learning_rate": 8.940728783490649e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9048675000667572, + "num_tokens": 28612735.0, + "step": 2351 + }, + { + "epoch": 0.23497677206653678, + "grad_norm": 0.42632016782068777, + "learning_rate": 8.939732792581086e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.9070354700088501, + "num_tokens": 28694238.0, + "step": 2352 + }, + { + "epoch": 0.23507667715670114, + "grad_norm": 0.6136647442652233, + "learning_rate": 8.938736389177646e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.9076805114746094, + "num_tokens": 28775762.0, + "step": 2353 + }, + { + "epoch": 0.23517658224686547, + "grad_norm": 0.5648559353161378, + "learning_rate": 8.937739573384653e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.9087283909320831, + "num_tokens": 28857368.0, + "step": 2354 + }, + { + "epoch": 0.23527648733702983, + "grad_norm": 0.5728458463170825, + "learning_rate": 8.936742345306474e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.9068379998207092, + "num_tokens": 28938907.0, + "step": 2355 + }, + { + "epoch": 0.23537639242719416, + "grad_norm": 0.5778567380335605, + "learning_rate": 8.935744705047522e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.9082658588886261, + "num_tokens": 29020448.0, + "step": 2356 + }, + { + "epoch": 0.23547629751735852, + "grad_norm": 0.7981967859903596, + "learning_rate": 8.934746652712248e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.9055094122886658, + "num_tokens": 29102004.0, + "step": 2357 + }, + { + "epoch": 0.23557620260752285, + "grad_norm": 0.5679847497625102, + "learning_rate": 8.93374818840515e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.9094032645225525, + "num_tokens": 29183554.0, + "step": 2358 + }, + { + "epoch": 0.2356761076976872, + "grad_norm": 0.5741020778968264, + "learning_rate": 8.932749312230768e-06, + "loss": 0.502, + "mean_token_accuracy": 0.9060057401657104, + "num_tokens": 29265014.0, + "step": 2359 + }, + { + "epoch": 0.23577601278785154, + "grad_norm": 0.6688799669830621, + "learning_rate": 8.931750024293689e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.9058683216571808, + "num_tokens": 29346536.0, + "step": 2360 + }, + { + "epoch": 0.23587591787801587, + "grad_norm": 0.5355060656904314, + "learning_rate": 8.930750324698534e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.9045026302337646, + "num_tokens": 29428030.0, + "step": 2361 + }, + { + "epoch": 0.23597582296818023, + "grad_norm": 0.5409733460907565, + "learning_rate": 8.929750213549974e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.9096210300922394, + "num_tokens": 29509621.0, + "step": 2362 + }, + { + "epoch": 0.23607572805834456, + "grad_norm": 0.6137226898734895, + "learning_rate": 8.928749690952723e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.9058999419212341, + "num_tokens": 29591030.0, + "step": 2363 + }, + { + "epoch": 0.23617563314850892, + "grad_norm": 0.5343374438463476, + "learning_rate": 8.927748757011536e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.9072268009185791, + "num_tokens": 29672538.0, + "step": 2364 + }, + { + "epoch": 0.23627553823867325, + "grad_norm": 0.469591463224299, + "learning_rate": 8.926747411831213e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.907026469707489, + "num_tokens": 29754002.0, + "step": 2365 + }, + { + "epoch": 0.23637544332883761, + "grad_norm": 0.5261313877903359, + "learning_rate": 8.925745655516594e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.9095192849636078, + "num_tokens": 29835620.0, + "step": 2366 + }, + { + "epoch": 0.23647534841900195, + "grad_norm": 0.5597916875748208, + "learning_rate": 8.924743488172562e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.9067883193492889, + "num_tokens": 29917092.0, + "step": 2367 + }, + { + "epoch": 0.2365752535091663, + "grad_norm": 0.5102739985764726, + "learning_rate": 8.92374090990405e-06, + "loss": 0.491, + "mean_token_accuracy": 0.9072624146938324, + "num_tokens": 29998690.0, + "step": 2368 + }, + { + "epoch": 0.23667515859933064, + "grad_norm": 0.5523632995757176, + "learning_rate": 8.922737920816027e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.9023948609828949, + "num_tokens": 30080201.0, + "step": 2369 + }, + { + "epoch": 0.23677506368949497, + "grad_norm": 0.44676127640335367, + "learning_rate": 8.921734521013506e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.9067845046520233, + "num_tokens": 30161758.0, + "step": 2370 + }, + { + "epoch": 0.23687496877965933, + "grad_norm": 0.9355427897531445, + "learning_rate": 8.920730710601541e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.9076586961746216, + "num_tokens": 30243196.0, + "step": 2371 + }, + { + "epoch": 0.23697487386982366, + "grad_norm": 0.6070465195816289, + "learning_rate": 8.919726489685239e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.9063449800014496, + "num_tokens": 30324721.0, + "step": 2372 + }, + { + "epoch": 0.23707477895998802, + "grad_norm": 0.9796517526987057, + "learning_rate": 8.918721858369738e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.9069557785987854, + "num_tokens": 30406259.0, + "step": 2373 + }, + { + "epoch": 0.23717468405015235, + "grad_norm": 0.6395576367007437, + "learning_rate": 8.917716816760225e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.9050322473049164, + "num_tokens": 30487735.0, + "step": 2374 + }, + { + "epoch": 0.2372745891403167, + "grad_norm": 0.623679516992316, + "learning_rate": 8.916711364961927e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.9081648588180542, + "num_tokens": 30569289.0, + "step": 2375 + }, + { + "epoch": 0.23737449423048104, + "grad_norm": 0.48186745945462217, + "learning_rate": 8.91570550308012e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.9070273339748383, + "num_tokens": 30650844.0, + "step": 2376 + }, + { + "epoch": 0.23747439932064537, + "grad_norm": 0.5703835894865994, + "learning_rate": 8.914699231220115e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.9036587476730347, + "num_tokens": 30732313.0, + "step": 2377 + }, + { + "epoch": 0.23757430441080973, + "grad_norm": 0.5512769080713716, + "learning_rate": 8.913692549487272e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.905402809381485, + "num_tokens": 30813754.0, + "step": 2378 + }, + { + "epoch": 0.23767420950097407, + "grad_norm": 0.7052377954262521, + "learning_rate": 8.912685457986989e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.9075441956520081, + "num_tokens": 30895242.0, + "step": 2379 + }, + { + "epoch": 0.23777411459113842, + "grad_norm": 0.5969225671809407, + "learning_rate": 8.911677956824711e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.90738245844841, + "num_tokens": 30976829.0, + "step": 2380 + }, + { + "epoch": 0.23787401968130276, + "grad_norm": 0.5866177544133553, + "learning_rate": 8.910670046105927e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.9052684009075165, + "num_tokens": 31058365.0, + "step": 2381 + }, + { + "epoch": 0.23797392477146712, + "grad_norm": 0.5343874925474501, + "learning_rate": 8.90966172593616e-06, + "loss": 0.492, + "mean_token_accuracy": 0.9075329899787903, + "num_tokens": 31139919.0, + "step": 2382 + }, + { + "epoch": 0.23807382986163145, + "grad_norm": 0.5567397225247348, + "learning_rate": 8.908652996420987e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.907110184431076, + "num_tokens": 31221373.0, + "step": 2383 + }, + { + "epoch": 0.2381737349517958, + "grad_norm": 0.48569213643746817, + "learning_rate": 8.907643857666021e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.9056746661663055, + "num_tokens": 31302784.0, + "step": 2384 + }, + { + "epoch": 0.23827364004196014, + "grad_norm": 0.5614063895443735, + "learning_rate": 8.906634309776922e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.9048335552215576, + "num_tokens": 31384262.0, + "step": 2385 + }, + { + "epoch": 0.23837354513212447, + "grad_norm": 0.7636779048589647, + "learning_rate": 8.90562435285939e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.9089865982532501, + "num_tokens": 31465819.0, + "step": 2386 + }, + { + "epoch": 0.23847345022228883, + "grad_norm": 0.4864003218174585, + "learning_rate": 8.904613987019167e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.9064212143421173, + "num_tokens": 31547326.0, + "step": 2387 + }, + { + "epoch": 0.23857335531245316, + "grad_norm": 0.6832574672828878, + "learning_rate": 8.903603212362038e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.9048991501331329, + "num_tokens": 31628870.0, + "step": 2388 + }, + { + "epoch": 0.23867326040261752, + "grad_norm": 0.4504743668342732, + "learning_rate": 8.902592028993834e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.9074142575263977, + "num_tokens": 31710424.0, + "step": 2389 + }, + { + "epoch": 0.23877316549278185, + "grad_norm": 0.6353881536574889, + "learning_rate": 8.901580437020427e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.906566709280014, + "num_tokens": 31791954.0, + "step": 2390 + }, + { + "epoch": 0.2388730705829462, + "grad_norm": 0.5508790936534469, + "learning_rate": 8.900568436547733e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.9059697389602661, + "num_tokens": 31873505.0, + "step": 2391 + }, + { + "epoch": 0.23897297567311054, + "grad_norm": 0.5859610730614223, + "learning_rate": 8.899556027681708e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.9065770208835602, + "num_tokens": 31955000.0, + "step": 2392 + }, + { + "epoch": 0.23907288076327488, + "grad_norm": 0.6370013976812458, + "learning_rate": 8.89854321052835e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.9064317047595978, + "num_tokens": 32036496.0, + "step": 2393 + }, + { + "epoch": 0.23917278585343923, + "grad_norm": 0.5195776452500833, + "learning_rate": 8.897529985193708e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.9090003371238708, + "num_tokens": 32118064.0, + "step": 2394 + }, + { + "epoch": 0.23927269094360357, + "grad_norm": 0.4873596054016294, + "learning_rate": 8.89651635178386e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.9083812534809113, + "num_tokens": 32199584.0, + "step": 2395 + }, + { + "epoch": 0.23937259603376793, + "grad_norm": 0.6606335960486294, + "learning_rate": 8.89550231040494e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.9067767560482025, + "num_tokens": 32281160.0, + "step": 2396 + }, + { + "epoch": 0.23947250112393226, + "grad_norm": 0.5785625972360412, + "learning_rate": 8.894487861163117e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.9093352556228638, + "num_tokens": 32362669.0, + "step": 2397 + }, + { + "epoch": 0.23957240621409662, + "grad_norm": 0.5411048196708322, + "learning_rate": 8.893473004164605e-06, + "loss": 0.493, + "mean_token_accuracy": 0.9089359641075134, + "num_tokens": 32444221.0, + "step": 2398 + }, + { + "epoch": 0.23967231130426095, + "grad_norm": 0.5465753474608198, + "learning_rate": 8.89245773951566e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.910112589597702, + "num_tokens": 32525788.0, + "step": 2399 + }, + { + "epoch": 0.2397722163944253, + "grad_norm": 0.49461674101825365, + "learning_rate": 8.891442067322583e-06, + "loss": 0.491, + "mean_token_accuracy": 0.9080841839313507, + "num_tokens": 32607377.0, + "step": 2400 + }, + { + "epoch": 0.23987212148458964, + "grad_norm": 1.1095968109092256, + "learning_rate": 8.890425987691713e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.9055125415325165, + "num_tokens": 32688850.0, + "step": 2401 + }, + { + "epoch": 0.23997202657475397, + "grad_norm": 0.6106116841747852, + "learning_rate": 8.889409500729439e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.9060113430023193, + "num_tokens": 32770374.0, + "step": 2402 + }, + { + "epoch": 0.24007193166491833, + "grad_norm": 0.6607472737874692, + "learning_rate": 8.888392606542185e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.9050147235393524, + "num_tokens": 32851836.0, + "step": 2403 + }, + { + "epoch": 0.24017183675508266, + "grad_norm": 0.5540865119356931, + "learning_rate": 8.887375305236419e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.9055530428886414, + "num_tokens": 32933291.0, + "step": 2404 + }, + { + "epoch": 0.24027174184524702, + "grad_norm": 0.5375382818531741, + "learning_rate": 8.886357596918657e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.9041416347026825, + "num_tokens": 33014841.0, + "step": 2405 + }, + { + "epoch": 0.24037164693541135, + "grad_norm": 0.61918655545098, + "learning_rate": 8.885339481695453e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.9051182270050049, + "num_tokens": 33096365.0, + "step": 2406 + }, + { + "epoch": 0.2404715520255757, + "grad_norm": 0.634123897470635, + "learning_rate": 8.884320959673405e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.9096823632717133, + "num_tokens": 33177961.0, + "step": 2407 + }, + { + "epoch": 0.24057145711574004, + "grad_norm": 0.5901082177203776, + "learning_rate": 8.883302030959151e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.9076346158981323, + "num_tokens": 33259448.0, + "step": 2408 + }, + { + "epoch": 0.2406713622059044, + "grad_norm": 0.6742267123379497, + "learning_rate": 8.882282695659376e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.9068316519260406, + "num_tokens": 33340996.0, + "step": 2409 + }, + { + "epoch": 0.24077126729606874, + "grad_norm": 0.6851821339068354, + "learning_rate": 8.881262953880807e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.9063352346420288, + "num_tokens": 33422484.0, + "step": 2410 + }, + { + "epoch": 0.24087117238623307, + "grad_norm": 0.9414274226329595, + "learning_rate": 8.880242805730208e-06, + "loss": 0.497, + "mean_token_accuracy": 0.9064065217971802, + "num_tokens": 33504039.0, + "step": 2411 + }, + { + "epoch": 0.24097107747639743, + "grad_norm": 0.6001905246392994, + "learning_rate": 8.879222251314392e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.9085182249546051, + "num_tokens": 33585655.0, + "step": 2412 + }, + { + "epoch": 0.24107098256656176, + "grad_norm": 0.8425890929815598, + "learning_rate": 8.87820129074021e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.9077910780906677, + "num_tokens": 33667265.0, + "step": 2413 + }, + { + "epoch": 0.24117088765672612, + "grad_norm": 0.49625962492657305, + "learning_rate": 8.877179924114561e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.907923549413681, + "num_tokens": 33748799.0, + "step": 2414 + }, + { + "epoch": 0.24127079274689045, + "grad_norm": 0.5234818750190463, + "learning_rate": 8.87615815154438e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.9060616791248322, + "num_tokens": 33830350.0, + "step": 2415 + }, + { + "epoch": 0.2413706978370548, + "grad_norm": 0.5971112799140345, + "learning_rate": 8.87513597313665e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.9083060026168823, + "num_tokens": 33911916.0, + "step": 2416 + }, + { + "epoch": 0.24147060292721914, + "grad_norm": 0.5533772173049969, + "learning_rate": 8.874113388998391e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.9067012667655945, + "num_tokens": 33993389.0, + "step": 2417 + }, + { + "epoch": 0.24157050801738347, + "grad_norm": 0.6811483500151975, + "learning_rate": 8.873090399236671e-06, + "loss": 0.493, + "mean_token_accuracy": 0.9091097414493561, + "num_tokens": 34074979.0, + "step": 2418 + }, + { + "epoch": 0.24167041310754783, + "grad_norm": 0.5922809903621982, + "learning_rate": 8.872067003958597e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.9051704704761505, + "num_tokens": 34156538.0, + "step": 2419 + }, + { + "epoch": 0.24177031819771216, + "grad_norm": 0.6828421308487757, + "learning_rate": 8.871043203271322e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.9056315124034882, + "num_tokens": 34238126.0, + "step": 2420 + }, + { + "epoch": 0.24187022328787652, + "grad_norm": 0.6522174051075428, + "learning_rate": 8.870018997282034e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.9084075391292572, + "num_tokens": 34319593.0, + "step": 2421 + }, + { + "epoch": 0.24197012837804086, + "grad_norm": 0.5310288994000325, + "learning_rate": 8.868994386097973e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9078332483768463, + "num_tokens": 34401055.0, + "step": 2422 + }, + { + "epoch": 0.24207003346820521, + "grad_norm": 0.5605078267526787, + "learning_rate": 8.867969369826413e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.9084110856056213, + "num_tokens": 34482630.0, + "step": 2423 + }, + { + "epoch": 0.24216993855836955, + "grad_norm": 0.5322529745090855, + "learning_rate": 8.866943948574678e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.9088212251663208, + "num_tokens": 34564195.0, + "step": 2424 + }, + { + "epoch": 0.2422698436485339, + "grad_norm": 0.5724429091354508, + "learning_rate": 8.865918122450127e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.9051265120506287, + "num_tokens": 34645689.0, + "step": 2425 + }, + { + "epoch": 0.24236974873869824, + "grad_norm": 0.5274457531612841, + "learning_rate": 8.864891891560167e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.9066482782363892, + "num_tokens": 34727274.0, + "step": 2426 + }, + { + "epoch": 0.24246965382886257, + "grad_norm": 0.659961322717274, + "learning_rate": 8.863865256012247e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.9049679636955261, + "num_tokens": 34808791.0, + "step": 2427 + }, + { + "epoch": 0.24256955891902693, + "grad_norm": 0.5426910972295338, + "learning_rate": 8.862838215913853e-06, + "loss": 0.493, + "mean_token_accuracy": 0.9082356691360474, + "num_tokens": 34890330.0, + "step": 2428 + }, + { + "epoch": 0.24266946400919126, + "grad_norm": 0.8176542647268936, + "learning_rate": 8.861810771372522e-06, + "loss": 0.496, + "mean_token_accuracy": 0.9064154624938965, + "num_tokens": 34971870.0, + "step": 2429 + }, + { + "epoch": 0.24276936909935562, + "grad_norm": 0.5209055642180408, + "learning_rate": 8.860782922495821e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.9083589315414429, + "num_tokens": 35053439.0, + "step": 2430 + }, + { + "epoch": 0.24286927418951995, + "grad_norm": 0.4829064865406212, + "learning_rate": 8.859754669391373e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.9073998034000397, + "num_tokens": 35135045.0, + "step": 2431 + }, + { + "epoch": 0.2429691792796843, + "grad_norm": 0.5622855723173741, + "learning_rate": 8.858726012166837e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.9071608185768127, + "num_tokens": 35216536.0, + "step": 2432 + }, + { + "epoch": 0.24306908436984864, + "grad_norm": 0.5615071578815461, + "learning_rate": 8.857696950929913e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.9077089726924896, + "num_tokens": 35298062.0, + "step": 2433 + }, + { + "epoch": 0.24316898946001297, + "grad_norm": 0.6322190174758353, + "learning_rate": 8.856667485788341e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.9066672027111053, + "num_tokens": 35379607.0, + "step": 2434 + }, + { + "epoch": 0.24326889455017733, + "grad_norm": 0.6249533885310374, + "learning_rate": 8.855637616849912e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.9076732695102692, + "num_tokens": 35460978.0, + "step": 2435 + }, + { + "epoch": 0.24336879964034167, + "grad_norm": 0.5304346902953602, + "learning_rate": 8.854607344222454e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.9057470858097076, + "num_tokens": 35542585.0, + "step": 2436 + }, + { + "epoch": 0.24346870473050602, + "grad_norm": 0.5828202873094412, + "learning_rate": 8.853576668013835e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.9070902466773987, + "num_tokens": 35624060.0, + "step": 2437 + }, + { + "epoch": 0.24356860982067036, + "grad_norm": 0.545449711866159, + "learning_rate": 8.85254558833197e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.9066494405269623, + "num_tokens": 35705584.0, + "step": 2438 + }, + { + "epoch": 0.24366851491083472, + "grad_norm": 0.5339061570064121, + "learning_rate": 8.851514105284813e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.9060686826705933, + "num_tokens": 35787085.0, + "step": 2439 + }, + { + "epoch": 0.24376842000099905, + "grad_norm": 0.4424957613034466, + "learning_rate": 8.85048221898036e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.9044299125671387, + "num_tokens": 35868575.0, + "step": 2440 + }, + { + "epoch": 0.2438683250911634, + "grad_norm": 0.6075179315621535, + "learning_rate": 8.849449929526654e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.9074209034442902, + "num_tokens": 35950080.0, + "step": 2441 + }, + { + "epoch": 0.24396823018132774, + "grad_norm": 0.6705290119724053, + "learning_rate": 8.848417237031775e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.9074883759021759, + "num_tokens": 36031622.0, + "step": 2442 + }, + { + "epoch": 0.24406813527149207, + "grad_norm": 0.6529852481761518, + "learning_rate": 8.847384141603845e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.9076188802719116, + "num_tokens": 36113196.0, + "step": 2443 + }, + { + "epoch": 0.24416804036165643, + "grad_norm": 0.5489831863939265, + "learning_rate": 8.846350643351031e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.9053425490856171, + "num_tokens": 36194771.0, + "step": 2444 + }, + { + "epoch": 0.24426794545182076, + "grad_norm": 0.6141888026277975, + "learning_rate": 8.845316742381542e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.908309817314148, + "num_tokens": 36276336.0, + "step": 2445 + }, + { + "epoch": 0.24436785054198512, + "grad_norm": 0.6502645415980697, + "learning_rate": 8.844282438803631e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.9055611491203308, + "num_tokens": 36357811.0, + "step": 2446 + }, + { + "epoch": 0.24446775563214945, + "grad_norm": 0.56654368062296, + "learning_rate": 8.843247732725586e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.9086582958698273, + "num_tokens": 36439373.0, + "step": 2447 + }, + { + "epoch": 0.2445676607223138, + "grad_norm": 0.47096487302612267, + "learning_rate": 8.842212624255745e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.9065675735473633, + "num_tokens": 36520953.0, + "step": 2448 + }, + { + "epoch": 0.24466756581247814, + "grad_norm": 0.9132497863918482, + "learning_rate": 8.84117711350248e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.907599002122879, + "num_tokens": 36602491.0, + "step": 2449 + }, + { + "epoch": 0.2447674709026425, + "grad_norm": 0.5361526539194422, + "learning_rate": 8.840141200574218e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.9055185317993164, + "num_tokens": 36684025.0, + "step": 2450 + }, + { + "epoch": 0.24486737599280683, + "grad_norm": 0.4858161800242309, + "learning_rate": 8.839104885579413e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.9063980877399445, + "num_tokens": 36765571.0, + "step": 2451 + }, + { + "epoch": 0.24496728108297117, + "grad_norm": 0.5182577641249589, + "learning_rate": 8.838068168626572e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.906732439994812, + "num_tokens": 36847099.0, + "step": 2452 + }, + { + "epoch": 0.24506718617313553, + "grad_norm": 0.6818143513911882, + "learning_rate": 8.83703104982424e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.907599687576294, + "num_tokens": 36928563.0, + "step": 2453 + }, + { + "epoch": 0.24516709126329986, + "grad_norm": 0.4820091762226588, + "learning_rate": 8.835993529281001e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.9090482294559479, + "num_tokens": 37010126.0, + "step": 2454 + }, + { + "epoch": 0.24526699635346422, + "grad_norm": 0.5289808257732154, + "learning_rate": 8.83495560710549e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.906415730714798, + "num_tokens": 37091621.0, + "step": 2455 + }, + { + "epoch": 0.24536690144362855, + "grad_norm": 0.6483019969143027, + "learning_rate": 8.833917283406372e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.9034914672374725, + "num_tokens": 37173178.0, + "step": 2456 + }, + { + "epoch": 0.2454668065337929, + "grad_norm": 0.565216645585984, + "learning_rate": 8.832878558292366e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.906541109085083, + "num_tokens": 37254770.0, + "step": 2457 + }, + { + "epoch": 0.24556671162395724, + "grad_norm": 1.0981347279621259, + "learning_rate": 8.831839431872227e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.9082519710063934, + "num_tokens": 37336352.0, + "step": 2458 + }, + { + "epoch": 0.24566661671412157, + "grad_norm": 0.922404088069902, + "learning_rate": 8.830799904254748e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.9089984595775604, + "num_tokens": 37417883.0, + "step": 2459 + }, + { + "epoch": 0.24576652180428593, + "grad_norm": 0.6795073870808997, + "learning_rate": 8.829759975548773e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.9071678221225739, + "num_tokens": 37499396.0, + "step": 2460 + }, + { + "epoch": 0.24586642689445026, + "grad_norm": 0.912035607745851, + "learning_rate": 8.828719645863182e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.9071448445320129, + "num_tokens": 37580938.0, + "step": 2461 + }, + { + "epoch": 0.24596633198461462, + "grad_norm": 0.5365197976084535, + "learning_rate": 8.827678915306898e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.9061541557312012, + "num_tokens": 37662459.0, + "step": 2462 + }, + { + "epoch": 0.24606623707477895, + "grad_norm": 0.5867344376434964, + "learning_rate": 8.826637783988887e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.9061946868896484, + "num_tokens": 37744030.0, + "step": 2463 + }, + { + "epoch": 0.2461661421649433, + "grad_norm": 0.48784282998492995, + "learning_rate": 8.825596252018158e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.9082507491111755, + "num_tokens": 37825547.0, + "step": 2464 + }, + { + "epoch": 0.24626604725510765, + "grad_norm": 0.5640925666685852, + "learning_rate": 8.824554319503758e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.9080816209316254, + "num_tokens": 37907180.0, + "step": 2465 + }, + { + "epoch": 0.246365952345272, + "grad_norm": 0.5188242326531985, + "learning_rate": 8.823511986554779e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.9073135852813721, + "num_tokens": 37988619.0, + "step": 2466 + }, + { + "epoch": 0.24646585743543634, + "grad_norm": 0.6997304199421568, + "learning_rate": 8.822469253280355e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.9059910178184509, + "num_tokens": 38070193.0, + "step": 2467 + }, + { + "epoch": 0.24656576252560067, + "grad_norm": 0.7636052316832942, + "learning_rate": 8.821426119789662e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.9062127470970154, + "num_tokens": 38151682.0, + "step": 2468 + }, + { + "epoch": 0.24666566761576503, + "grad_norm": 0.5853887610570622, + "learning_rate": 8.820382586191916e-06, + "loss": 0.499, + "mean_token_accuracy": 0.9050447940826416, + "num_tokens": 38233107.0, + "step": 2469 + }, + { + "epoch": 0.24676557270592936, + "grad_norm": 0.4633854959154816, + "learning_rate": 8.819338652596374e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.9051062762737274, + "num_tokens": 38314683.0, + "step": 2470 + }, + { + "epoch": 0.24686547779609372, + "grad_norm": 0.47568479537632197, + "learning_rate": 8.81829431911234e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.9065563976764679, + "num_tokens": 38396283.0, + "step": 2471 + }, + { + "epoch": 0.24696538288625805, + "grad_norm": 0.5287391285895073, + "learning_rate": 8.817249585849155e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.9040795266628265, + "num_tokens": 38477773.0, + "step": 2472 + }, + { + "epoch": 0.2470652879764224, + "grad_norm": 0.6619988258578485, + "learning_rate": 8.816204452916204e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.907061368227005, + "num_tokens": 38559237.0, + "step": 2473 + }, + { + "epoch": 0.24716519306658674, + "grad_norm": 0.6600547229476805, + "learning_rate": 8.815158920422914e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.906906008720398, + "num_tokens": 38640696.0, + "step": 2474 + }, + { + "epoch": 0.24726509815675107, + "grad_norm": 0.4962669969216116, + "learning_rate": 8.814112988478754e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.9083839058876038, + "num_tokens": 38722213.0, + "step": 2475 + }, + { + "epoch": 0.24736500324691543, + "grad_norm": 0.611692197273234, + "learning_rate": 8.813066657193232e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.9079124629497528, + "num_tokens": 38803770.0, + "step": 2476 + }, + { + "epoch": 0.24746490833707976, + "grad_norm": 0.4708753743164907, + "learning_rate": 8.812019926675901e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.9080576598644257, + "num_tokens": 38885239.0, + "step": 2477 + }, + { + "epoch": 0.24756481342724412, + "grad_norm": 0.6338084116496066, + "learning_rate": 8.810972797036354e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.9089372754096985, + "num_tokens": 38966749.0, + "step": 2478 + }, + { + "epoch": 0.24766471851740846, + "grad_norm": 0.5158911515832785, + "learning_rate": 8.809925268384228e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.9072403907775879, + "num_tokens": 39048274.0, + "step": 2479 + }, + { + "epoch": 0.24776462360757281, + "grad_norm": 0.5936738156110031, + "learning_rate": 8.808877340829199e-06, + "loss": 0.496, + "mean_token_accuracy": 0.9057858288288116, + "num_tokens": 39129832.0, + "step": 2480 + }, + { + "epoch": 0.24786452869773715, + "grad_norm": 0.464935956591829, + "learning_rate": 8.807829014480987e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.907469779253006, + "num_tokens": 39211372.0, + "step": 2481 + }, + { + "epoch": 0.2479644337879015, + "grad_norm": 0.6036979163103934, + "learning_rate": 8.806780289449351e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.9070883095264435, + "num_tokens": 39292859.0, + "step": 2482 + }, + { + "epoch": 0.24806433887806584, + "grad_norm": 0.5079864247036785, + "learning_rate": 8.805731165844093e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.9093071818351746, + "num_tokens": 39374268.0, + "step": 2483 + }, + { + "epoch": 0.24816424396823017, + "grad_norm": 0.5992473823885619, + "learning_rate": 8.80468164377506e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.905305802822113, + "num_tokens": 39455723.0, + "step": 2484 + }, + { + "epoch": 0.24826414905839453, + "grad_norm": 0.5678569359358892, + "learning_rate": 8.80363172335214e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.9050326943397522, + "num_tokens": 39537178.0, + "step": 2485 + }, + { + "epoch": 0.24836405414855886, + "grad_norm": 0.4443657428142288, + "learning_rate": 8.802581404685255e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.907266765832901, + "num_tokens": 39618785.0, + "step": 2486 + }, + { + "epoch": 0.24846395923872322, + "grad_norm": 0.5615391257981166, + "learning_rate": 8.801530687884378e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.908473551273346, + "num_tokens": 39700342.0, + "step": 2487 + }, + { + "epoch": 0.24856386432888755, + "grad_norm": 0.6466871993961826, + "learning_rate": 8.80047957305952e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.9032273292541504, + "num_tokens": 39781766.0, + "step": 2488 + }, + { + "epoch": 0.2486637694190519, + "grad_norm": 0.5308798361413866, + "learning_rate": 8.799428060320732e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.9079494774341583, + "num_tokens": 39863279.0, + "step": 2489 + }, + { + "epoch": 0.24876367450921624, + "grad_norm": 0.5079840175894973, + "learning_rate": 8.798376149778108e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.9105769395828247, + "num_tokens": 39944880.0, + "step": 2490 + }, + { + "epoch": 0.2488635795993806, + "grad_norm": 0.703988664872014, + "learning_rate": 8.797323841541788e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.9039188921451569, + "num_tokens": 40026421.0, + "step": 2491 + }, + { + "epoch": 0.24896348468954493, + "grad_norm": 0.6387791954403947, + "learning_rate": 8.796271135721944e-06, + "loss": 0.49, + "mean_token_accuracy": 0.9061245918273926, + "num_tokens": 40108007.0, + "step": 2492 + }, + { + "epoch": 0.24906338977970927, + "grad_norm": 0.49910061571393366, + "learning_rate": 8.795218032428801e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.9111195504665375, + "num_tokens": 40189638.0, + "step": 2493 + }, + { + "epoch": 0.24916329486987362, + "grad_norm": 0.5387664537858675, + "learning_rate": 8.794164531772615e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.9082159399986267, + "num_tokens": 40271144.0, + "step": 2494 + }, + { + "epoch": 0.24926319996003796, + "grad_norm": 0.5475836968519611, + "learning_rate": 8.793110633863694e-06, + "loss": 0.494, + "mean_token_accuracy": 0.9053180515766144, + "num_tokens": 40352696.0, + "step": 2495 + }, + { + "epoch": 0.24936310505020232, + "grad_norm": 0.5021367456042776, + "learning_rate": 8.792056338812376e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.9060978889465332, + "num_tokens": 40434218.0, + "step": 2496 + }, + { + "epoch": 0.24946301014036665, + "grad_norm": 0.5897247195961351, + "learning_rate": 8.791001646729051e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.9053314030170441, + "num_tokens": 40515738.0, + "step": 2497 + }, + { + "epoch": 0.249562915230531, + "grad_norm": 0.5295749564473845, + "learning_rate": 8.789946557724144e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.9080762267112732, + "num_tokens": 40597294.0, + "step": 2498 + }, + { + "epoch": 0.24966282032069534, + "grad_norm": 0.6632726840689235, + "learning_rate": 8.788891071908127e-06, + "loss": 0.498, + "mean_token_accuracy": 0.9077660739421844, + "num_tokens": 40678752.0, + "step": 2499 + }, + { + "epoch": 0.24976272541085967, + "grad_norm": 0.5601869943806538, + "learning_rate": 8.787835189391507e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.9059200584888458, + "num_tokens": 40760302.0, + "step": 2500 + }, + { + "epoch": 0.24986263050102403, + "grad_norm": 0.7359646074198151, + "learning_rate": 8.786778910284836e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.9066750109195709, + "num_tokens": 40841831.0, + "step": 2501 + }, + { + "epoch": 0.24996253559118836, + "grad_norm": 0.5523406833234545, + "learning_rate": 8.785722234698711e-06, + "loss": 0.497, + "mean_token_accuracy": 0.906950980424881, + "num_tokens": 40923338.0, + "step": 2502 + }, + { + "epoch": 0.2500624406813527, + "grad_norm": 0.5204565280864388, + "learning_rate": 8.784665162743764e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.9072935879230499, + "num_tokens": 41004926.0, + "step": 2503 + }, + { + "epoch": 0.2501623457715171, + "grad_norm": 0.5078778436835263, + "learning_rate": 8.783607694530671e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.9069189429283142, + "num_tokens": 41086353.0, + "step": 2504 + }, + { + "epoch": 0.2502622508616814, + "grad_norm": 0.5006122158361702, + "learning_rate": 8.782549830170153e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.9050115644931793, + "num_tokens": 41167767.0, + "step": 2505 + }, + { + "epoch": 0.25036215595184574, + "grad_norm": 0.4925909442981076, + "learning_rate": 8.781491569772966e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.9073788821697235, + "num_tokens": 41249222.0, + "step": 2506 + }, + { + "epoch": 0.2504620610420101, + "grad_norm": 0.49457596839218215, + "learning_rate": 8.780432913449913e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.9083542823791504, + "num_tokens": 41330726.0, + "step": 2507 + }, + { + "epoch": 0.2505619661321744, + "grad_norm": 0.5333428189534691, + "learning_rate": 8.779373861311834e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.9080210030078888, + "num_tokens": 41412246.0, + "step": 2508 + }, + { + "epoch": 0.2506618712223388, + "grad_norm": 0.5236537094623921, + "learning_rate": 8.778314413469618e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.9063555598258972, + "num_tokens": 41493789.0, + "step": 2509 + }, + { + "epoch": 0.2507617763125031, + "grad_norm": 0.4384126526274936, + "learning_rate": 8.777254570034184e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.909997433423996, + "num_tokens": 41575302.0, + "step": 2510 + }, + { + "epoch": 0.25086168140266746, + "grad_norm": 0.6143695906785938, + "learning_rate": 8.776194331116503e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.9100017845630646, + "num_tokens": 41656917.0, + "step": 2511 + }, + { + "epoch": 0.2509615864928318, + "grad_norm": 0.47248513502396744, + "learning_rate": 8.77513369682758e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.9066757261753082, + "num_tokens": 41738430.0, + "step": 2512 + }, + { + "epoch": 0.2510614915829962, + "grad_norm": 0.7537184962195355, + "learning_rate": 8.774072667278468e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.9052119851112366, + "num_tokens": 41819897.0, + "step": 2513 + }, + { + "epoch": 0.2511613966731605, + "grad_norm": 0.5044953285862697, + "learning_rate": 8.773011242580254e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.9062597453594208, + "num_tokens": 41901443.0, + "step": 2514 + }, + { + "epoch": 0.25126130176332484, + "grad_norm": 0.5842594625702023, + "learning_rate": 8.771949422844072e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.9045739471912384, + "num_tokens": 41983007.0, + "step": 2515 + }, + { + "epoch": 0.25136120685348917, + "grad_norm": 0.5137779527382187, + "learning_rate": 8.770887208181095e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.9065514206886292, + "num_tokens": 42064669.0, + "step": 2516 + }, + { + "epoch": 0.2514611119436535, + "grad_norm": 0.5744312764145626, + "learning_rate": 8.769824598702541e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.9086608290672302, + "num_tokens": 42146231.0, + "step": 2517 + }, + { + "epoch": 0.2515610170338179, + "grad_norm": 0.4493118705427031, + "learning_rate": 8.76876159451966e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.9088900983333588, + "num_tokens": 42227823.0, + "step": 2518 + }, + { + "epoch": 0.2516609221239822, + "grad_norm": 0.5033382168989415, + "learning_rate": 8.767698195743756e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.9106223881244659, + "num_tokens": 42309366.0, + "step": 2519 + }, + { + "epoch": 0.25176082721414655, + "grad_norm": 0.558644730818406, + "learning_rate": 8.766634402486167e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.9093900620937347, + "num_tokens": 42390913.0, + "step": 2520 + }, + { + "epoch": 0.2518607323043109, + "grad_norm": 0.5875976329315568, + "learning_rate": 8.765570214858268e-06, + "loss": 0.501, + "mean_token_accuracy": 0.9067074060440063, + "num_tokens": 42472439.0, + "step": 2521 + }, + { + "epoch": 0.2519606373944753, + "grad_norm": 0.6111121931889527, + "learning_rate": 8.764505632971485e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.9039816856384277, + "num_tokens": 42553901.0, + "step": 2522 + }, + { + "epoch": 0.2520605424846396, + "grad_norm": 1.0128374803567133, + "learning_rate": 8.763440656937279e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.9058386981487274, + "num_tokens": 42635389.0, + "step": 2523 + }, + { + "epoch": 0.25216044757480394, + "grad_norm": 0.5147934834361888, + "learning_rate": 8.762375286867155e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.9076777398586273, + "num_tokens": 42716836.0, + "step": 2524 + }, + { + "epoch": 0.25226035266496827, + "grad_norm": 0.5753644211968062, + "learning_rate": 8.761309522872657e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.9048576056957245, + "num_tokens": 42798247.0, + "step": 2525 + }, + { + "epoch": 0.2523602577551326, + "grad_norm": 0.4677051671194478, + "learning_rate": 8.760243365065373e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.906485378742218, + "num_tokens": 42879748.0, + "step": 2526 + }, + { + "epoch": 0.252460162845297, + "grad_norm": 0.4784311704336725, + "learning_rate": 8.759176813556931e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.9082373678684235, + "num_tokens": 42961264.0, + "step": 2527 + }, + { + "epoch": 0.2525600679354613, + "grad_norm": 0.5423965539841507, + "learning_rate": 8.758109868459e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.9072005450725555, + "num_tokens": 43042842.0, + "step": 2528 + }, + { + "epoch": 0.25265997302562565, + "grad_norm": 0.5117509432122705, + "learning_rate": 8.757042529883287e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.9049361348152161, + "num_tokens": 43124360.0, + "step": 2529 + }, + { + "epoch": 0.25275987811579, + "grad_norm": 0.5227036849562966, + "learning_rate": 8.755974797941548e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.9076490104198456, + "num_tokens": 43205923.0, + "step": 2530 + }, + { + "epoch": 0.25285978320595437, + "grad_norm": 0.48782283144406724, + "learning_rate": 8.754906672745572e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.9075886011123657, + "num_tokens": 43287505.0, + "step": 2531 + }, + { + "epoch": 0.2529596882961187, + "grad_norm": 0.5072919607924646, + "learning_rate": 8.753838154407194e-06, + "loss": 0.497, + "mean_token_accuracy": 0.9060483872890472, + "num_tokens": 43369010.0, + "step": 2532 + }, + { + "epoch": 0.25305959338628303, + "grad_norm": 0.5673104441112468, + "learning_rate": 8.75276924303829e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.9048642516136169, + "num_tokens": 43450482.0, + "step": 2533 + }, + { + "epoch": 0.25315949847644736, + "grad_norm": 0.5384957198110484, + "learning_rate": 8.751699938750774e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.9075730443000793, + "num_tokens": 43531919.0, + "step": 2534 + }, + { + "epoch": 0.2532594035666117, + "grad_norm": 0.5816438217499836, + "learning_rate": 8.750630241656605e-06, + "loss": 0.493, + "mean_token_accuracy": 0.9090917408466339, + "num_tokens": 43613487.0, + "step": 2535 + }, + { + "epoch": 0.2533593086567761, + "grad_norm": 0.4608132320916049, + "learning_rate": 8.749560151867782e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.9095878899097443, + "num_tokens": 43694963.0, + "step": 2536 + }, + { + "epoch": 0.2534592137469404, + "grad_norm": 0.6855977971010606, + "learning_rate": 8.74848966949634e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.90638667345047, + "num_tokens": 43776465.0, + "step": 2537 + }, + { + "epoch": 0.25355911883710475, + "grad_norm": 0.49121225913534206, + "learning_rate": 8.747418794654366e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.9069873094558716, + "num_tokens": 43857994.0, + "step": 2538 + }, + { + "epoch": 0.2536590239272691, + "grad_norm": 0.45074020217073946, + "learning_rate": 8.746347527453977e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.9082867801189423, + "num_tokens": 43939537.0, + "step": 2539 + }, + { + "epoch": 0.25375892901743347, + "grad_norm": 0.635388522707795, + "learning_rate": 8.745275868007335e-06, + "loss": 0.492, + "mean_token_accuracy": 0.9082752466201782, + "num_tokens": 44021089.0, + "step": 2540 + }, + { + "epoch": 0.2538588341075978, + "grad_norm": 0.5557918688117726, + "learning_rate": 8.744203816426648e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.9086343348026276, + "num_tokens": 44102693.0, + "step": 2541 + }, + { + "epoch": 0.25395873919776213, + "grad_norm": 0.45691397933026484, + "learning_rate": 8.743131372824158e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.908644437789917, + "num_tokens": 44184193.0, + "step": 2542 + }, + { + "epoch": 0.25405864428792646, + "grad_norm": 0.800949582035928, + "learning_rate": 8.742058537312152e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.9075685739517212, + "num_tokens": 44265722.0, + "step": 2543 + }, + { + "epoch": 0.2541585493780908, + "grad_norm": 0.6268922608646367, + "learning_rate": 8.740985310002956e-06, + "loss": 0.499, + "mean_token_accuracy": 0.9065697491168976, + "num_tokens": 44347227.0, + "step": 2544 + }, + { + "epoch": 0.2542584544682552, + "grad_norm": 0.5262802397327915, + "learning_rate": 8.73991169100894e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.9061581790447235, + "num_tokens": 44428637.0, + "step": 2545 + }, + { + "epoch": 0.2543583595584195, + "grad_norm": 0.48530409365088306, + "learning_rate": 8.738837680442508e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.9062652885913849, + "num_tokens": 44510165.0, + "step": 2546 + }, + { + "epoch": 0.25445826464858384, + "grad_norm": 0.5448292581714586, + "learning_rate": 8.737763278416116e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.9078700244426727, + "num_tokens": 44591675.0, + "step": 2547 + }, + { + "epoch": 0.2545581697387482, + "grad_norm": 0.9917931016379153, + "learning_rate": 8.736688485042253e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.9082159101963043, + "num_tokens": 44673255.0, + "step": 2548 + }, + { + "epoch": 0.2546580748289125, + "grad_norm": 0.46551074430515205, + "learning_rate": 8.735613300433447e-06, + "loss": 0.502, + "mean_token_accuracy": 0.9033536911010742, + "num_tokens": 44754711.0, + "step": 2549 + }, + { + "epoch": 0.2547579799190769, + "grad_norm": 0.6828924424952797, + "learning_rate": 8.734537724702276e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.9094156324863434, + "num_tokens": 44836237.0, + "step": 2550 + }, + { + "epoch": 0.2548578850092412, + "grad_norm": 0.5787240482678632, + "learning_rate": 8.733461757961353e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.9072245359420776, + "num_tokens": 44917841.0, + "step": 2551 + }, + { + "epoch": 0.25495779009940556, + "grad_norm": 0.5041010453183046, + "learning_rate": 8.732385400323331e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.9062651693820953, + "num_tokens": 44999406.0, + "step": 2552 + }, + { + "epoch": 0.2550576951895699, + "grad_norm": 0.467260677064934, + "learning_rate": 8.731308651900904e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.9066202938556671, + "num_tokens": 45080915.0, + "step": 2553 + }, + { + "epoch": 0.2551576002797343, + "grad_norm": 0.5240239867928473, + "learning_rate": 8.730231512806816e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.9091105163097382, + "num_tokens": 45162520.0, + "step": 2554 + }, + { + "epoch": 0.2552575053698986, + "grad_norm": 0.471236724848674, + "learning_rate": 8.729153983153835e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.9082077741622925, + "num_tokens": 45244078.0, + "step": 2555 + }, + { + "epoch": 0.25535741046006294, + "grad_norm": 0.49918334513399754, + "learning_rate": 8.728076063054786e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.9053484797477722, + "num_tokens": 45325545.0, + "step": 2556 + }, + { + "epoch": 0.25545731555022727, + "grad_norm": 0.5237870459487199, + "learning_rate": 8.726997752622524e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.9055052697658539, + "num_tokens": 45406966.0, + "step": 2557 + }, + { + "epoch": 0.2555572206403916, + "grad_norm": 0.5663779319957946, + "learning_rate": 8.725919051969955e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.9057174026966095, + "num_tokens": 45488526.0, + "step": 2558 + }, + { + "epoch": 0.255657125730556, + "grad_norm": 0.6111498209736564, + "learning_rate": 8.724839961210014e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.908622533082962, + "num_tokens": 45570030.0, + "step": 2559 + }, + { + "epoch": 0.2557570308207203, + "grad_norm": 0.6854702314485289, + "learning_rate": 8.723760480455685e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.9088923037052155, + "num_tokens": 45651558.0, + "step": 2560 + }, + { + "epoch": 0.25585693591088465, + "grad_norm": 0.4616482928085908, + "learning_rate": 8.72268060981999e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.906623125076294, + "num_tokens": 45733050.0, + "step": 2561 + }, + { + "epoch": 0.255956841001049, + "grad_norm": 0.6564033666806255, + "learning_rate": 8.721600349415996e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.9046763479709625, + "num_tokens": 45814539.0, + "step": 2562 + }, + { + "epoch": 0.25605674609121337, + "grad_norm": 0.5412802135849638, + "learning_rate": 8.720519699356804e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.906674712896347, + "num_tokens": 45895999.0, + "step": 2563 + }, + { + "epoch": 0.2561566511813777, + "grad_norm": 0.543275055765557, + "learning_rate": 8.719438659755559e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.9065746665000916, + "num_tokens": 45977550.0, + "step": 2564 + }, + { + "epoch": 0.25625655627154204, + "grad_norm": 0.5265439632270376, + "learning_rate": 8.71835723072545e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.9063131213188171, + "num_tokens": 46059109.0, + "step": 2565 + }, + { + "epoch": 0.25635646136170637, + "grad_norm": 0.5903306030715596, + "learning_rate": 8.7172754123797e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.9074303507804871, + "num_tokens": 46140657.0, + "step": 2566 + }, + { + "epoch": 0.2564563664518707, + "grad_norm": 0.8609116703503115, + "learning_rate": 8.716193204831578e-06, + "loss": 0.497, + "mean_token_accuracy": 0.9065667390823364, + "num_tokens": 46222156.0, + "step": 2567 + }, + { + "epoch": 0.2565562715420351, + "grad_norm": 0.4751146844953772, + "learning_rate": 8.71511060819439e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.9079813659191132, + "num_tokens": 46303628.0, + "step": 2568 + }, + { + "epoch": 0.2566561766321994, + "grad_norm": 0.5415714921554113, + "learning_rate": 8.71402762258149e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.9065270125865936, + "num_tokens": 46385225.0, + "step": 2569 + }, + { + "epoch": 0.25675608172236375, + "grad_norm": 0.5158729084520752, + "learning_rate": 8.712944248106263e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.9071489870548248, + "num_tokens": 46466714.0, + "step": 2570 + }, + { + "epoch": 0.2568559868125281, + "grad_norm": 0.47722120247208993, + "learning_rate": 8.711860484882141e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.9106384515762329, + "num_tokens": 46548282.0, + "step": 2571 + }, + { + "epoch": 0.25695589190269247, + "grad_norm": 0.5689214542434616, + "learning_rate": 8.710776333022596e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.9079303443431854, + "num_tokens": 46629837.0, + "step": 2572 + }, + { + "epoch": 0.2570557969928568, + "grad_norm": 0.4754942482346963, + "learning_rate": 8.70969179264114e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.9097930788993835, + "num_tokens": 46711287.0, + "step": 2573 + }, + { + "epoch": 0.25715570208302113, + "grad_norm": 0.43624320577576714, + "learning_rate": 8.708606863851321e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.9095625877380371, + "num_tokens": 46792851.0, + "step": 2574 + }, + { + "epoch": 0.25725560717318546, + "grad_norm": 0.472285719456952, + "learning_rate": 8.707521546766739e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.9071863889694214, + "num_tokens": 46874471.0, + "step": 2575 + }, + { + "epoch": 0.2573555122633498, + "grad_norm": 0.5565527797148967, + "learning_rate": 8.706435841501022e-06, + "loss": 0.496, + "mean_token_accuracy": 0.9069501757621765, + "num_tokens": 46955989.0, + "step": 2576 + }, + { + "epoch": 0.2574554173535142, + "grad_norm": 0.7142428786549917, + "learning_rate": 8.705349748167846e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.9070266783237457, + "num_tokens": 47037467.0, + "step": 2577 + }, + { + "epoch": 0.2575553224436785, + "grad_norm": 0.501356462688615, + "learning_rate": 8.70426326688093e-06, + "loss": 0.501, + "mean_token_accuracy": 0.906737208366394, + "num_tokens": 47118903.0, + "step": 2578 + }, + { + "epoch": 0.25765522753384285, + "grad_norm": 0.5100358524747989, + "learning_rate": 8.703176397754023e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.9097509980201721, + "num_tokens": 47200460.0, + "step": 2579 + }, + { + "epoch": 0.2577551326240072, + "grad_norm": 0.5830539576641952, + "learning_rate": 8.702089140900926e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.9041403830051422, + "num_tokens": 47281895.0, + "step": 2580 + }, + { + "epoch": 0.25785503771417156, + "grad_norm": 0.6110122199966819, + "learning_rate": 8.701001496435473e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.9064813852310181, + "num_tokens": 47363382.0, + "step": 2581 + }, + { + "epoch": 0.2579549428043359, + "grad_norm": 0.44712617570810315, + "learning_rate": 8.699913464471543e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.9064100980758667, + "num_tokens": 47444897.0, + "step": 2582 + }, + { + "epoch": 0.25805484789450023, + "grad_norm": 0.5036979827816959, + "learning_rate": 8.698825045123057e-06, + "loss": 0.497, + "mean_token_accuracy": 0.9061826765537262, + "num_tokens": 47526381.0, + "step": 2583 + }, + { + "epoch": 0.25815475298466456, + "grad_norm": 0.5235390325166553, + "learning_rate": 8.697736238503966e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.9072688221931458, + "num_tokens": 47607948.0, + "step": 2584 + }, + { + "epoch": 0.2582546580748289, + "grad_norm": 0.5382842651440842, + "learning_rate": 8.696647044728275e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.907456785440445, + "num_tokens": 47689603.0, + "step": 2585 + }, + { + "epoch": 0.2583545631649933, + "grad_norm": 0.5375090121908136, + "learning_rate": 8.695557463910024e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.9084791243076324, + "num_tokens": 47771182.0, + "step": 2586 + }, + { + "epoch": 0.2584544682551576, + "grad_norm": 0.5117099426738282, + "learning_rate": 8.69446749616329e-06, + "loss": 0.492, + "mean_token_accuracy": 0.9070687890052795, + "num_tokens": 47852793.0, + "step": 2587 + }, + { + "epoch": 0.25855437334532194, + "grad_norm": 0.44798705079351914, + "learning_rate": 8.693377141602195e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.9067186117172241, + "num_tokens": 47934363.0, + "step": 2588 + }, + { + "epoch": 0.2586542784354863, + "grad_norm": 0.5155536805329028, + "learning_rate": 8.6922864003409e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.905703991651535, + "num_tokens": 48015944.0, + "step": 2589 + }, + { + "epoch": 0.2587541835256506, + "grad_norm": 0.5344517809170195, + "learning_rate": 8.691195272493606e-06, + "loss": 0.496, + "mean_token_accuracy": 0.9046441316604614, + "num_tokens": 48097440.0, + "step": 2590 + }, + { + "epoch": 0.258854088615815, + "grad_norm": 0.700381815712571, + "learning_rate": 8.690103758174558e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.9073996245861053, + "num_tokens": 48178919.0, + "step": 2591 + }, + { + "epoch": 0.2589539937059793, + "grad_norm": 0.4939992694236278, + "learning_rate": 8.689011857498037e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.9069221615791321, + "num_tokens": 48260435.0, + "step": 2592 + }, + { + "epoch": 0.25905389879614366, + "grad_norm": 0.5072052624521745, + "learning_rate": 8.687919570578363e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.905516654253006, + "num_tokens": 48341961.0, + "step": 2593 + }, + { + "epoch": 0.259153803886308, + "grad_norm": 0.699665382274811, + "learning_rate": 8.686826897529905e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.9068031013011932, + "num_tokens": 48423506.0, + "step": 2594 + }, + { + "epoch": 0.2592537089764724, + "grad_norm": 0.4481057980911027, + "learning_rate": 8.685733838467063e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.907442718744278, + "num_tokens": 48505042.0, + "step": 2595 + }, + { + "epoch": 0.2593536140666367, + "grad_norm": 0.4816467110991326, + "learning_rate": 8.684640393504283e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.9078925251960754, + "num_tokens": 48586616.0, + "step": 2596 + }, + { + "epoch": 0.25945351915680104, + "grad_norm": 0.48321954235964515, + "learning_rate": 8.683546562756049e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.9089277982711792, + "num_tokens": 48668058.0, + "step": 2597 + }, + { + "epoch": 0.25955342424696537, + "grad_norm": 0.6104074521049668, + "learning_rate": 8.682452346336885e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.9046993255615234, + "num_tokens": 48749534.0, + "step": 2598 + }, + { + "epoch": 0.2596533293371297, + "grad_norm": 0.5161889036712094, + "learning_rate": 8.68135774436136e-06, + "loss": 0.494, + "mean_token_accuracy": 0.9068989455699921, + "num_tokens": 48831080.0, + "step": 2599 + }, + { + "epoch": 0.2597532344272941, + "grad_norm": 0.5168503362806262, + "learning_rate": 8.680262756944078e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.9047478437423706, + "num_tokens": 48912618.0, + "step": 2600 + }, + { + "epoch": 0.2598531395174584, + "grad_norm": 0.5511987944371236, + "learning_rate": 8.679167384199686e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.9081105291843414, + "num_tokens": 48994134.0, + "step": 2601 + }, + { + "epoch": 0.25995304460762275, + "grad_norm": 0.5710017440570323, + "learning_rate": 8.678071626242867e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.9072459936141968, + "num_tokens": 49075611.0, + "step": 2602 + }, + { + "epoch": 0.2600529496977871, + "grad_norm": 0.5748605065814917, + "learning_rate": 8.676975483188352e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.9071006774902344, + "num_tokens": 49157150.0, + "step": 2603 + }, + { + "epoch": 0.26015285478795147, + "grad_norm": 0.7059401057274459, + "learning_rate": 8.675878955150907e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.9041476547718048, + "num_tokens": 49238698.0, + "step": 2604 + }, + { + "epoch": 0.2602527598781158, + "grad_norm": 0.5230994661015992, + "learning_rate": 8.674782042245339e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.9021316170692444, + "num_tokens": 49320214.0, + "step": 2605 + }, + { + "epoch": 0.26035266496828013, + "grad_norm": 0.48701108221855727, + "learning_rate": 8.673684744586497e-06, + "loss": 0.491, + "mean_token_accuracy": 0.9081217050552368, + "num_tokens": 49401764.0, + "step": 2606 + }, + { + "epoch": 0.26045257005844447, + "grad_norm": 0.6070728195207831, + "learning_rate": 8.672587062289266e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.9080936312675476, + "num_tokens": 49483354.0, + "step": 2607 + }, + { + "epoch": 0.2605524751486088, + "grad_norm": 0.5067017202266572, + "learning_rate": 8.67148899546858e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.9073226451873779, + "num_tokens": 49564785.0, + "step": 2608 + }, + { + "epoch": 0.2606523802387732, + "grad_norm": 0.49490923730020797, + "learning_rate": 8.670390544239404e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.9074533879756927, + "num_tokens": 49646249.0, + "step": 2609 + }, + { + "epoch": 0.2607522853289375, + "grad_norm": 0.5489190212818236, + "learning_rate": 8.669291708716747e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.9063307642936707, + "num_tokens": 49727694.0, + "step": 2610 + }, + { + "epoch": 0.26085219041910185, + "grad_norm": 0.4764081833325515, + "learning_rate": 8.668192489015659e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.9092448055744171, + "num_tokens": 49809294.0, + "step": 2611 + }, + { + "epoch": 0.2609520955092662, + "grad_norm": 0.4916255914935336, + "learning_rate": 8.667092885251226e-06, + "loss": 0.491, + "mean_token_accuracy": 0.9090932905673981, + "num_tokens": 49890872.0, + "step": 2612 + }, + { + "epoch": 0.26105200059943057, + "grad_norm": 1.240893865296845, + "learning_rate": 8.665992897538584e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.9078909754753113, + "num_tokens": 49972415.0, + "step": 2613 + }, + { + "epoch": 0.2611519056895949, + "grad_norm": 0.4761058161268332, + "learning_rate": 8.664892525992899e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.907303124666214, + "num_tokens": 50053949.0, + "step": 2614 + }, + { + "epoch": 0.26125181077975923, + "grad_norm": 0.485678628550856, + "learning_rate": 8.663791770729382e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.9068752527236938, + "num_tokens": 50135431.0, + "step": 2615 + }, + { + "epoch": 0.26135171586992356, + "grad_norm": 0.48934462786738997, + "learning_rate": 8.66269063186328e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.908380776643753, + "num_tokens": 50216992.0, + "step": 2616 + }, + { + "epoch": 0.2614516209600879, + "grad_norm": 0.9939374744176542, + "learning_rate": 8.661589109509886e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.9093403816223145, + "num_tokens": 50298571.0, + "step": 2617 + }, + { + "epoch": 0.2615515260502523, + "grad_norm": 0.5479834606185326, + "learning_rate": 8.660487203784534e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.907498836517334, + "num_tokens": 50380047.0, + "step": 2618 + }, + { + "epoch": 0.2616514311404166, + "grad_norm": 0.633208937907249, + "learning_rate": 8.659384914802588e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.9047145843505859, + "num_tokens": 50461553.0, + "step": 2619 + }, + { + "epoch": 0.26175133623058094, + "grad_norm": 0.7350684777964499, + "learning_rate": 8.658282242679461e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.9060259461402893, + "num_tokens": 50543109.0, + "step": 2620 + }, + { + "epoch": 0.2618512413207453, + "grad_norm": 0.5777881350983024, + "learning_rate": 8.657179187530606e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.9071254432201385, + "num_tokens": 50624627.0, + "step": 2621 + }, + { + "epoch": 0.26195114641090966, + "grad_norm": 0.4882185629417592, + "learning_rate": 8.656075749471513e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.9062686264514923, + "num_tokens": 50706213.0, + "step": 2622 + }, + { + "epoch": 0.262051051501074, + "grad_norm": 0.5548925767936889, + "learning_rate": 8.65497192861771e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.9059557616710663, + "num_tokens": 50787680.0, + "step": 2623 + }, + { + "epoch": 0.2621509565912383, + "grad_norm": 0.5250275932789502, + "learning_rate": 8.653867725084772e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.9046245515346527, + "num_tokens": 50869226.0, + "step": 2624 + }, + { + "epoch": 0.26225086168140266, + "grad_norm": 0.5729897838164257, + "learning_rate": 8.65276313898831e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.9076798558235168, + "num_tokens": 50950674.0, + "step": 2625 + }, + { + "epoch": 0.262350766771567, + "grad_norm": 0.5306834295104745, + "learning_rate": 8.651658170443972e-06, + "loss": 0.49, + "mean_token_accuracy": 0.909984678030014, + "num_tokens": 51032251.0, + "step": 2626 + }, + { + "epoch": 0.2624506718617314, + "grad_norm": 0.4957309472052598, + "learning_rate": 8.650552819567452e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.9082143902778625, + "num_tokens": 51113767.0, + "step": 2627 + }, + { + "epoch": 0.2625505769518957, + "grad_norm": 0.537930103862677, + "learning_rate": 8.64944708647448e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.9075297117233276, + "num_tokens": 51195323.0, + "step": 2628 + }, + { + "epoch": 0.26265048204206004, + "grad_norm": 0.603471388637479, + "learning_rate": 8.64834097128083e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.9077850580215454, + "num_tokens": 51276876.0, + "step": 2629 + }, + { + "epoch": 0.2627503871322244, + "grad_norm": 0.5502780428812243, + "learning_rate": 8.64723447410231e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.9069420993328094, + "num_tokens": 51358371.0, + "step": 2630 + }, + { + "epoch": 0.2628502922223887, + "grad_norm": 0.609553536974028, + "learning_rate": 8.646127595054771e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.9033487141132355, + "num_tokens": 51439937.0, + "step": 2631 + }, + { + "epoch": 0.2629501973125531, + "grad_norm": 0.5000870654621913, + "learning_rate": 8.645020334254108e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.904567152261734, + "num_tokens": 51521458.0, + "step": 2632 + }, + { + "epoch": 0.2630501024027174, + "grad_norm": 0.6641626426585057, + "learning_rate": 8.643912691816252e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.9057998955249786, + "num_tokens": 51602996.0, + "step": 2633 + }, + { + "epoch": 0.26315000749288175, + "grad_norm": 0.7046045882556748, + "learning_rate": 8.64280466785717e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.9072173535823822, + "num_tokens": 51684590.0, + "step": 2634 + }, + { + "epoch": 0.2632499125830461, + "grad_norm": 0.6650033827208545, + "learning_rate": 8.641696262492876e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.9074929654598236, + "num_tokens": 51766128.0, + "step": 2635 + }, + { + "epoch": 0.2633498176732105, + "grad_norm": 0.5411353515911794, + "learning_rate": 8.640587475839421e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.9078960716724396, + "num_tokens": 51847728.0, + "step": 2636 + }, + { + "epoch": 0.2634497227633748, + "grad_norm": 0.5392486775669685, + "learning_rate": 8.639478308012897e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.9072566032409668, + "num_tokens": 51929343.0, + "step": 2637 + }, + { + "epoch": 0.26354962785353914, + "grad_norm": 0.4668666638529919, + "learning_rate": 8.638368759129433e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.9083664119243622, + "num_tokens": 52010870.0, + "step": 2638 + }, + { + "epoch": 0.26364953294370347, + "grad_norm": 0.8825121852917164, + "learning_rate": 8.6372588293052e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.9093241095542908, + "num_tokens": 52092423.0, + "step": 2639 + }, + { + "epoch": 0.2637494380338678, + "grad_norm": 0.642772048428794, + "learning_rate": 8.636148518656408e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.9095261096954346, + "num_tokens": 52173961.0, + "step": 2640 + }, + { + "epoch": 0.2638493431240322, + "grad_norm": 0.5497236457990824, + "learning_rate": 8.635037827299313e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.9081369638442993, + "num_tokens": 52255543.0, + "step": 2641 + }, + { + "epoch": 0.2639492482141965, + "grad_norm": 0.4978546732920611, + "learning_rate": 8.6339267553502e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.9065230488777161, + "num_tokens": 52337016.0, + "step": 2642 + }, + { + "epoch": 0.26404915330436085, + "grad_norm": 0.6300591809870121, + "learning_rate": 8.6328153029254e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.9059900343418121, + "num_tokens": 52418457.0, + "step": 2643 + }, + { + "epoch": 0.2641490583945252, + "grad_norm": 0.46593276678133455, + "learning_rate": 8.631703470141284e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.9078496694564819, + "num_tokens": 52500031.0, + "step": 2644 + }, + { + "epoch": 0.26424896348468957, + "grad_norm": 0.493369533356242, + "learning_rate": 8.630591257114261e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.9089411497116089, + "num_tokens": 52581585.0, + "step": 2645 + }, + { + "epoch": 0.2643488685748539, + "grad_norm": 0.45822249205660887, + "learning_rate": 8.629478663960785e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.9074035286903381, + "num_tokens": 52663084.0, + "step": 2646 + }, + { + "epoch": 0.26444877366501823, + "grad_norm": 0.46602066660826863, + "learning_rate": 8.628365690797338e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.9055316150188446, + "num_tokens": 52744548.0, + "step": 2647 + }, + { + "epoch": 0.26454867875518256, + "grad_norm": 0.6855169737608667, + "learning_rate": 8.627252337740457e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.9054786264896393, + "num_tokens": 52826090.0, + "step": 2648 + }, + { + "epoch": 0.2646485838453469, + "grad_norm": 0.5510274419006935, + "learning_rate": 8.626138604906707e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.9086069762706757, + "num_tokens": 52907640.0, + "step": 2649 + }, + { + "epoch": 0.2647484889355113, + "grad_norm": 0.8828572674181452, + "learning_rate": 8.625024492412697e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.9082818031311035, + "num_tokens": 52989212.0, + "step": 2650 + }, + { + "epoch": 0.2648483940256756, + "grad_norm": 0.765982823618393, + "learning_rate": 8.623910000375078e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.9060305953025818, + "num_tokens": 53070644.0, + "step": 2651 + }, + { + "epoch": 0.26494829911583995, + "grad_norm": 0.553921829488713, + "learning_rate": 8.622795128910537e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.9061559736728668, + "num_tokens": 53152203.0, + "step": 2652 + }, + { + "epoch": 0.2650482042060043, + "grad_norm": 0.5768414161404017, + "learning_rate": 8.6216798781358e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.9098583161830902, + "num_tokens": 53233781.0, + "step": 2653 + }, + { + "epoch": 0.26514810929616867, + "grad_norm": 0.5208507811123724, + "learning_rate": 8.620564248167638e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.9082159399986267, + "num_tokens": 53315318.0, + "step": 2654 + }, + { + "epoch": 0.265248014386333, + "grad_norm": 0.5906879889767296, + "learning_rate": 8.619448239122858e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.9060763120651245, + "num_tokens": 53396784.0, + "step": 2655 + }, + { + "epoch": 0.26534791947649733, + "grad_norm": 0.9126943561734242, + "learning_rate": 8.618331851118306e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.9091294705867767, + "num_tokens": 53478366.0, + "step": 2656 + }, + { + "epoch": 0.26544782456666166, + "grad_norm": 0.5492650263783379, + "learning_rate": 8.61721508427087e-06, + "loss": 0.496, + "mean_token_accuracy": 0.9044681191444397, + "num_tokens": 53559871.0, + "step": 2657 + }, + { + "epoch": 0.265547729656826, + "grad_norm": 0.805806325840713, + "learning_rate": 8.616097938697476e-06, + "loss": 0.491, + "mean_token_accuracy": 0.9095108211040497, + "num_tokens": 53641476.0, + "step": 2658 + }, + { + "epoch": 0.2656476347469904, + "grad_norm": 0.6128875911342883, + "learning_rate": 8.614980414515088e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.9040507078170776, + "num_tokens": 53723038.0, + "step": 2659 + }, + { + "epoch": 0.2657475398371547, + "grad_norm": 0.6233921826662612, + "learning_rate": 8.613862511840719e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.90585657954216, + "num_tokens": 53804595.0, + "step": 2660 + }, + { + "epoch": 0.26584744492731904, + "grad_norm": 0.6615529289584376, + "learning_rate": 8.612744230791406e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.9028652310371399, + "num_tokens": 53886022.0, + "step": 2661 + }, + { + "epoch": 0.2659473500174834, + "grad_norm": 0.6878305448584351, + "learning_rate": 8.611625571484238e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.9105866849422455, + "num_tokens": 53967687.0, + "step": 2662 + }, + { + "epoch": 0.26604725510764776, + "grad_norm": 0.5314400642019453, + "learning_rate": 8.61050653403634e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.9054829776287079, + "num_tokens": 54049125.0, + "step": 2663 + }, + { + "epoch": 0.2661471601978121, + "grad_norm": 0.5719093512905304, + "learning_rate": 8.609387118564879e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.9057432413101196, + "num_tokens": 54130583.0, + "step": 2664 + }, + { + "epoch": 0.2662470652879764, + "grad_norm": 0.5508485442447837, + "learning_rate": 8.608267325187051e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.9062998294830322, + "num_tokens": 54212078.0, + "step": 2665 + }, + { + "epoch": 0.26634697037814076, + "grad_norm": 0.5542311328744683, + "learning_rate": 8.607147154020108e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.9091960489749908, + "num_tokens": 54293621.0, + "step": 2666 + }, + { + "epoch": 0.2664468754683051, + "grad_norm": 0.7959164187912612, + "learning_rate": 8.606026605181329e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.9066026508808136, + "num_tokens": 54375149.0, + "step": 2667 + }, + { + "epoch": 0.2665467805584695, + "grad_norm": 0.7837443531922264, + "learning_rate": 8.604905678788033e-06, + "loss": 0.497, + "mean_token_accuracy": 0.9096424281597137, + "num_tokens": 54456646.0, + "step": 2668 + }, + { + "epoch": 0.2666466856486338, + "grad_norm": 0.5060034304532895, + "learning_rate": 8.60378437495759e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.910284012556076, + "num_tokens": 54538151.0, + "step": 2669 + }, + { + "epoch": 0.26674659073879814, + "grad_norm": 0.5262877157501118, + "learning_rate": 8.602662693807395e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.9076546728610992, + "num_tokens": 54619706.0, + "step": 2670 + }, + { + "epoch": 0.26684649582896247, + "grad_norm": 0.569414615350509, + "learning_rate": 8.601540635454895e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.904565840959549, + "num_tokens": 54701164.0, + "step": 2671 + }, + { + "epoch": 0.2669464009191268, + "grad_norm": 0.5688523687363254, + "learning_rate": 8.600418200017565e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.9073011875152588, + "num_tokens": 54782751.0, + "step": 2672 + }, + { + "epoch": 0.2670463060092912, + "grad_norm": 0.5703351143048888, + "learning_rate": 8.599295387612928e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.908118337392807, + "num_tokens": 54864258.0, + "step": 2673 + }, + { + "epoch": 0.2671462110994555, + "grad_norm": 0.5002522792124028, + "learning_rate": 8.598172198358542e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.9102630913257599, + "num_tokens": 54945818.0, + "step": 2674 + }, + { + "epoch": 0.26724611618961985, + "grad_norm": 0.46833377887868033, + "learning_rate": 8.597048632372007e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.908100813627243, + "num_tokens": 55027260.0, + "step": 2675 + }, + { + "epoch": 0.2673460212797842, + "grad_norm": 0.6220914012562091, + "learning_rate": 8.59592468977096e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.906747579574585, + "num_tokens": 55108741.0, + "step": 2676 + }, + { + "epoch": 0.2674459263699486, + "grad_norm": 0.5970959201574254, + "learning_rate": 8.594800370673083e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.9069706201553345, + "num_tokens": 55190163.0, + "step": 2677 + }, + { + "epoch": 0.2675458314601129, + "grad_norm": 0.5778735184895324, + "learning_rate": 8.593675675196088e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.9090588092803955, + "num_tokens": 55271682.0, + "step": 2678 + }, + { + "epoch": 0.26764573655027724, + "grad_norm": 0.8354946529971723, + "learning_rate": 8.592550603457738e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.9033146798610687, + "num_tokens": 55353143.0, + "step": 2679 + }, + { + "epoch": 0.26774564164044157, + "grad_norm": 0.7530680258494719, + "learning_rate": 8.591425155575823e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.9077757000923157, + "num_tokens": 55434673.0, + "step": 2680 + }, + { + "epoch": 0.2678455467306059, + "grad_norm": 0.5613805294928098, + "learning_rate": 8.590299331668182e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.90559983253479, + "num_tokens": 55516189.0, + "step": 2681 + }, + { + "epoch": 0.2679454518207703, + "grad_norm": 0.5688772840780385, + "learning_rate": 8.589173131852686e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.9095102250576019, + "num_tokens": 55597762.0, + "step": 2682 + }, + { + "epoch": 0.2680453569109346, + "grad_norm": 0.66112124582226, + "learning_rate": 8.588046556247255e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.9040654599666595, + "num_tokens": 55679273.0, + "step": 2683 + }, + { + "epoch": 0.26814526200109895, + "grad_norm": 0.5634565917598521, + "learning_rate": 8.58691960496984e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.9095921218395233, + "num_tokens": 55760819.0, + "step": 2684 + }, + { + "epoch": 0.2682451670912633, + "grad_norm": 0.46552591673339283, + "learning_rate": 8.58579227813843e-06, + "loss": 0.49, + "mean_token_accuracy": 0.9089767932891846, + "num_tokens": 55842388.0, + "step": 2685 + }, + { + "epoch": 0.26834507218142767, + "grad_norm": 0.5977538377817956, + "learning_rate": 8.584664575871064e-06, + "loss": 0.502, + "mean_token_accuracy": 0.9067184031009674, + "num_tokens": 55923820.0, + "step": 2686 + }, + { + "epoch": 0.268444977271592, + "grad_norm": 0.7884575675365016, + "learning_rate": 8.583536498285807e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.9055715799331665, + "num_tokens": 56005381.0, + "step": 2687 + }, + { + "epoch": 0.26854488236175633, + "grad_norm": 0.6629111738498874, + "learning_rate": 8.582408045500777e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.9081268310546875, + "num_tokens": 56086853.0, + "step": 2688 + }, + { + "epoch": 0.26864478745192066, + "grad_norm": 0.5730010337741808, + "learning_rate": 8.581279217634117e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.9063770473003387, + "num_tokens": 56168341.0, + "step": 2689 + }, + { + "epoch": 0.268744692542085, + "grad_norm": 0.5636556251684984, + "learning_rate": 8.580150014804021e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.9087375402450562, + "num_tokens": 56249778.0, + "step": 2690 + }, + { + "epoch": 0.2688445976322494, + "grad_norm": 0.5969858764923313, + "learning_rate": 8.579020437128715e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.9061354994773865, + "num_tokens": 56331229.0, + "step": 2691 + }, + { + "epoch": 0.2689445027224137, + "grad_norm": 0.7533765814614786, + "learning_rate": 8.57789048472647e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.9061833024024963, + "num_tokens": 56412778.0, + "step": 2692 + }, + { + "epoch": 0.26904440781257805, + "grad_norm": 0.48511809525568517, + "learning_rate": 8.57676015771559e-06, + "loss": 0.497, + "mean_token_accuracy": 0.9045979678630829, + "num_tokens": 56494295.0, + "step": 2693 + }, + { + "epoch": 0.2691443129027424, + "grad_norm": 0.6375812851180022, + "learning_rate": 8.575629456214421e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.908383846282959, + "num_tokens": 56575856.0, + "step": 2694 + }, + { + "epoch": 0.26924421799290676, + "grad_norm": 1.4686506494302145, + "learning_rate": 8.574498380341351e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.9070777297019958, + "num_tokens": 56657328.0, + "step": 2695 + }, + { + "epoch": 0.2693441230830711, + "grad_norm": 0.6432713331896333, + "learning_rate": 8.573366930214807e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.9046258330345154, + "num_tokens": 56738854.0, + "step": 2696 + }, + { + "epoch": 0.26944402817323543, + "grad_norm": 0.5432156335978767, + "learning_rate": 8.572235105953245e-06, + "loss": 0.501, + "mean_token_accuracy": 0.9082265198230743, + "num_tokens": 56820282.0, + "step": 2697 + }, + { + "epoch": 0.26954393326339976, + "grad_norm": 0.4976918084685885, + "learning_rate": 8.571102907675176e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.9086821973323822, + "num_tokens": 56901857.0, + "step": 2698 + }, + { + "epoch": 0.2696438383535641, + "grad_norm": 0.4827502940811183, + "learning_rate": 8.569970335499138e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.9056317210197449, + "num_tokens": 56983359.0, + "step": 2699 + }, + { + "epoch": 0.2697437434437285, + "grad_norm": 0.44384629284098603, + "learning_rate": 8.568837389543713e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.9043264389038086, + "num_tokens": 57064949.0, + "step": 2700 + }, + { + "epoch": 0.2698436485338928, + "grad_norm": 0.6813112215006082, + "learning_rate": 8.567704069927524e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.9048945009708405, + "num_tokens": 57146507.0, + "step": 2701 + }, + { + "epoch": 0.26994355362405714, + "grad_norm": 0.49534622752798885, + "learning_rate": 8.566570376769228e-06, + "loss": 0.491, + "mean_token_accuracy": 0.9067221879959106, + "num_tokens": 57228103.0, + "step": 2702 + }, + { + "epoch": 0.2700434587142215, + "grad_norm": 0.5708226294998704, + "learning_rate": 8.565436310187524e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.9035312533378601, + "num_tokens": 57309653.0, + "step": 2703 + }, + { + "epoch": 0.27014336380438586, + "grad_norm": 0.5440155628331984, + "learning_rate": 8.56430187030115e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.9083207845687866, + "num_tokens": 57391197.0, + "step": 2704 + }, + { + "epoch": 0.2702432688945502, + "grad_norm": 0.6152289570540951, + "learning_rate": 8.563167057228884e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.9092913269996643, + "num_tokens": 57472762.0, + "step": 2705 + }, + { + "epoch": 0.2703431739847145, + "grad_norm": 0.5349102757214194, + "learning_rate": 8.562031871089542e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.9098716080188751, + "num_tokens": 57554308.0, + "step": 2706 + }, + { + "epoch": 0.27044307907487886, + "grad_norm": 0.49982962064890635, + "learning_rate": 8.560896312001978e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.9056676626205444, + "num_tokens": 57635794.0, + "step": 2707 + }, + { + "epoch": 0.2705429841650432, + "grad_norm": 0.4383598875913535, + "learning_rate": 8.559760380085087e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.908206433057785, + "num_tokens": 57717350.0, + "step": 2708 + }, + { + "epoch": 0.2706428892552076, + "grad_norm": 0.6606542921864277, + "learning_rate": 8.558624075457802e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.9069189131259918, + "num_tokens": 57798843.0, + "step": 2709 + }, + { + "epoch": 0.2707427943453719, + "grad_norm": 0.5817374235907088, + "learning_rate": 8.557487398239095e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.9078608453273773, + "num_tokens": 57880370.0, + "step": 2710 + }, + { + "epoch": 0.27084269943553624, + "grad_norm": 0.5038728510866339, + "learning_rate": 8.556350348547978e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.9050479233264923, + "num_tokens": 57961866.0, + "step": 2711 + }, + { + "epoch": 0.27094260452570057, + "grad_norm": 0.5422067851535539, + "learning_rate": 8.555212926503498e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.9060656726360321, + "num_tokens": 58043338.0, + "step": 2712 + }, + { + "epoch": 0.2710425096158649, + "grad_norm": 0.7654434227674154, + "learning_rate": 8.554075132224748e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.9066909551620483, + "num_tokens": 58124820.0, + "step": 2713 + }, + { + "epoch": 0.2711424147060293, + "grad_norm": 1.5014196540277398, + "learning_rate": 8.552936965830856e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.9069346785545349, + "num_tokens": 58206323.0, + "step": 2714 + }, + { + "epoch": 0.2712423197961936, + "grad_norm": 0.550014899675612, + "learning_rate": 8.551798427440985e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.9070166647434235, + "num_tokens": 58287877.0, + "step": 2715 + }, + { + "epoch": 0.27134222488635795, + "grad_norm": 0.5421747483160522, + "learning_rate": 8.550659517174347e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.9064392447471619, + "num_tokens": 58369373.0, + "step": 2716 + }, + { + "epoch": 0.2714421299765223, + "grad_norm": 0.5675649725638963, + "learning_rate": 8.549520235150183e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.906512051820755, + "num_tokens": 58450893.0, + "step": 2717 + }, + { + "epoch": 0.27154203506668667, + "grad_norm": 0.5867968313044523, + "learning_rate": 8.548380581487779e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.9041019380092621, + "num_tokens": 58532456.0, + "step": 2718 + }, + { + "epoch": 0.271641940156851, + "grad_norm": 0.41962142333200386, + "learning_rate": 8.547240556306457e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.9080504775047302, + "num_tokens": 58613944.0, + "step": 2719 + }, + { + "epoch": 0.27174184524701533, + "grad_norm": 0.6585483424872182, + "learning_rate": 8.546100159725578e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.9082474410533905, + "num_tokens": 58695471.0, + "step": 2720 + }, + { + "epoch": 0.27184175033717967, + "grad_norm": 0.5669234905731929, + "learning_rate": 8.544959391864543e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.9059399962425232, + "num_tokens": 58776896.0, + "step": 2721 + }, + { + "epoch": 0.271941655427344, + "grad_norm": 0.522999172188177, + "learning_rate": 8.543818252842793e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.9070431888103485, + "num_tokens": 58858484.0, + "step": 2722 + }, + { + "epoch": 0.2720415605175084, + "grad_norm": 0.4821010682900626, + "learning_rate": 8.542676742779803e-06, + "loss": 0.495, + "mean_token_accuracy": 0.9065238535404205, + "num_tokens": 58939966.0, + "step": 2723 + }, + { + "epoch": 0.2721414656076727, + "grad_norm": 0.516697653680878, + "learning_rate": 8.541534861795094e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.9090812504291534, + "num_tokens": 59021543.0, + "step": 2724 + }, + { + "epoch": 0.27224137069783705, + "grad_norm": 0.45964776680963665, + "learning_rate": 8.540392610008221e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.9099843204021454, + "num_tokens": 59103020.0, + "step": 2725 + }, + { + "epoch": 0.2723412757880014, + "grad_norm": 0.4608441565394446, + "learning_rate": 8.539249987538778e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.908655047416687, + "num_tokens": 59184484.0, + "step": 2726 + }, + { + "epoch": 0.27244118087816577, + "grad_norm": 0.49217488486483635, + "learning_rate": 8.538106994506398e-06, + "loss": 0.501, + "mean_token_accuracy": 0.9046022891998291, + "num_tokens": 59265930.0, + "step": 2727 + }, + { + "epoch": 0.2725410859683301, + "grad_norm": 0.5084454922761623, + "learning_rate": 8.536963631030755e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.9059595763683319, + "num_tokens": 59347552.0, + "step": 2728 + }, + { + "epoch": 0.27264099105849443, + "grad_norm": 0.45288282945077796, + "learning_rate": 8.53581989723156e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.9080439507961273, + "num_tokens": 59429106.0, + "step": 2729 + }, + { + "epoch": 0.27274089614865876, + "grad_norm": 0.5869809720830413, + "learning_rate": 8.534675793228562e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.9086078703403473, + "num_tokens": 59510569.0, + "step": 2730 + }, + { + "epoch": 0.2728408012388231, + "grad_norm": 0.5567259262791231, + "learning_rate": 8.533531319141552e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.9074907898902893, + "num_tokens": 59592088.0, + "step": 2731 + }, + { + "epoch": 0.2729407063289875, + "grad_norm": 0.7039271777516951, + "learning_rate": 8.532386475090355e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.9035096764564514, + "num_tokens": 59673521.0, + "step": 2732 + }, + { + "epoch": 0.2730406114191518, + "grad_norm": 0.5026238124389144, + "learning_rate": 8.53124126119484e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.907875120639801, + "num_tokens": 59755018.0, + "step": 2733 + }, + { + "epoch": 0.27314051650931614, + "grad_norm": 0.5309598518292675, + "learning_rate": 8.53009567757491e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.9058107733726501, + "num_tokens": 59836506.0, + "step": 2734 + }, + { + "epoch": 0.2732404215994805, + "grad_norm": 0.6371983632593293, + "learning_rate": 8.528949724350507e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.9092801511287689, + "num_tokens": 59917999.0, + "step": 2735 + }, + { + "epoch": 0.27334032668964486, + "grad_norm": 0.6069803655128282, + "learning_rate": 8.527803401641619e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.9068398773670197, + "num_tokens": 59999582.0, + "step": 2736 + }, + { + "epoch": 0.2734402317798092, + "grad_norm": 0.5075446545702097, + "learning_rate": 8.526656709568262e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.9089755415916443, + "num_tokens": 60081087.0, + "step": 2737 + }, + { + "epoch": 0.2735401368699735, + "grad_norm": 0.7737191020751366, + "learning_rate": 8.5255096482505e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.9071592688560486, + "num_tokens": 60162547.0, + "step": 2738 + }, + { + "epoch": 0.27364004196013786, + "grad_norm": 0.5102552387846662, + "learning_rate": 8.524362217808426e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.9062506258487701, + "num_tokens": 60244091.0, + "step": 2739 + }, + { + "epoch": 0.2737399470503022, + "grad_norm": 0.5247406972009976, + "learning_rate": 8.52321441836218e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.9081142842769623, + "num_tokens": 60325652.0, + "step": 2740 + }, + { + "epoch": 0.2738398521404666, + "grad_norm": 0.4138756231489913, + "learning_rate": 8.52206625003194e-06, + "loss": 0.49, + "mean_token_accuracy": 0.909859836101532, + "num_tokens": 60407208.0, + "step": 2741 + }, + { + "epoch": 0.2739397572306309, + "grad_norm": 0.5294634846910202, + "learning_rate": 8.520917712937918e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.9066361486911774, + "num_tokens": 60488709.0, + "step": 2742 + }, + { + "epoch": 0.27403966232079524, + "grad_norm": 0.5008213699401766, + "learning_rate": 8.519768807200364e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.9083407521247864, + "num_tokens": 60570236.0, + "step": 2743 + }, + { + "epoch": 0.2741395674109596, + "grad_norm": 0.5497305719790245, + "learning_rate": 8.518619532939577e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.9093514978885651, + "num_tokens": 60651792.0, + "step": 2744 + }, + { + "epoch": 0.27423947250112396, + "grad_norm": 0.7368656632891656, + "learning_rate": 8.51746989027588e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.9073921740055084, + "num_tokens": 60733259.0, + "step": 2745 + }, + { + "epoch": 0.2743393775912883, + "grad_norm": 0.5911748939305159, + "learning_rate": 8.516319879329645e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.9066541790962219, + "num_tokens": 60814837.0, + "step": 2746 + }, + { + "epoch": 0.2744392826814526, + "grad_norm": 0.4834797207880487, + "learning_rate": 8.515169500221278e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.9065719544887543, + "num_tokens": 60896385.0, + "step": 2747 + }, + { + "epoch": 0.27453918777161695, + "grad_norm": 0.8139779447179935, + "learning_rate": 8.514018753071228e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.9040746986865997, + "num_tokens": 60977941.0, + "step": 2748 + }, + { + "epoch": 0.2746390928617813, + "grad_norm": 0.5629139881787936, + "learning_rate": 8.512867637999975e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.9087503552436829, + "num_tokens": 61059466.0, + "step": 2749 + }, + { + "epoch": 0.2747389979519457, + "grad_norm": 0.5054324258663719, + "learning_rate": 8.511716155128046e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.9085652530193329, + "num_tokens": 61141000.0, + "step": 2750 + }, + { + "epoch": 0.27483890304211, + "grad_norm": 0.5101180969608383, + "learning_rate": 8.510564304575996e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.909426748752594, + "num_tokens": 61222518.0, + "step": 2751 + }, + { + "epoch": 0.27493880813227434, + "grad_norm": 0.5402855943451454, + "learning_rate": 8.509412086464432e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.9085409343242645, + "num_tokens": 61304081.0, + "step": 2752 + }, + { + "epoch": 0.27503871322243867, + "grad_norm": 0.553940149202115, + "learning_rate": 8.50825950091399e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.9086026549339294, + "num_tokens": 61385662.0, + "step": 2753 + }, + { + "epoch": 0.275138618312603, + "grad_norm": 0.6876144208938323, + "learning_rate": 8.507106548045345e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.9087805151939392, + "num_tokens": 61467203.0, + "step": 2754 + }, + { + "epoch": 0.2752385234027674, + "grad_norm": 0.4949449324299722, + "learning_rate": 8.505953227979215e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.9085306525230408, + "num_tokens": 61548722.0, + "step": 2755 + }, + { + "epoch": 0.2753384284929317, + "grad_norm": 0.4127536335894145, + "learning_rate": 8.50479954083635e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.9095527529716492, + "num_tokens": 61630143.0, + "step": 2756 + }, + { + "epoch": 0.27543833358309605, + "grad_norm": 0.787959273707698, + "learning_rate": 8.503645486737547e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.9069308638572693, + "num_tokens": 61711699.0, + "step": 2757 + }, + { + "epoch": 0.2755382386732604, + "grad_norm": 0.5631818506958924, + "learning_rate": 8.502491065803632e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.9085923135280609, + "num_tokens": 61793203.0, + "step": 2758 + }, + { + "epoch": 0.27563814376342477, + "grad_norm": 0.6877738629648922, + "learning_rate": 8.501336278155479e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.9067102074623108, + "num_tokens": 61874711.0, + "step": 2759 + }, + { + "epoch": 0.2757380488535891, + "grad_norm": 0.5871876524604874, + "learning_rate": 8.50018112391399e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.9065857827663422, + "num_tokens": 61956186.0, + "step": 2760 + }, + { + "epoch": 0.27583795394375343, + "grad_norm": 1.0963690108921265, + "learning_rate": 8.499025603200113e-06, + "loss": 0.488, + "mean_token_accuracy": 0.906271904706955, + "num_tokens": 62037828.0, + "step": 2761 + }, + { + "epoch": 0.27593785903391777, + "grad_norm": 0.5536338739007626, + "learning_rate": 8.497869716134834e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.9057484567165375, + "num_tokens": 62119307.0, + "step": 2762 + }, + { + "epoch": 0.2760377641240821, + "grad_norm": 0.47811332190716166, + "learning_rate": 8.496713462839175e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.9068445265293121, + "num_tokens": 62200827.0, + "step": 2763 + }, + { + "epoch": 0.2761376692142465, + "grad_norm": 0.4715390538089461, + "learning_rate": 8.495556843434192e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.906073808670044, + "num_tokens": 62282368.0, + "step": 2764 + }, + { + "epoch": 0.2762375743044108, + "grad_norm": 0.5857772775542386, + "learning_rate": 8.494399858040991e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.9119404852390289, + "num_tokens": 62363948.0, + "step": 2765 + }, + { + "epoch": 0.27633747939457515, + "grad_norm": 0.4598078361155584, + "learning_rate": 8.493242506780705e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.9059707820415497, + "num_tokens": 62445566.0, + "step": 2766 + }, + { + "epoch": 0.2764373844847395, + "grad_norm": 0.4972862219780801, + "learning_rate": 8.492084789774514e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.90558061003685, + "num_tokens": 62527067.0, + "step": 2767 + }, + { + "epoch": 0.27653728957490387, + "grad_norm": 0.6566898826873644, + "learning_rate": 8.490926707143624e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.9074147045612335, + "num_tokens": 62608578.0, + "step": 2768 + }, + { + "epoch": 0.2766371946650682, + "grad_norm": 0.4635662910466804, + "learning_rate": 8.489768259009298e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.9062077701091766, + "num_tokens": 62690074.0, + "step": 2769 + }, + { + "epoch": 0.27673709975523253, + "grad_norm": 0.4676958880685925, + "learning_rate": 8.48860944549282e-06, + "loss": 0.494, + "mean_token_accuracy": 0.906769722700119, + "num_tokens": 62771585.0, + "step": 2770 + }, + { + "epoch": 0.27683700484539686, + "grad_norm": 0.5434414756991466, + "learning_rate": 8.487450266715518e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.9097396433353424, + "num_tokens": 62853252.0, + "step": 2771 + }, + { + "epoch": 0.2769369099355612, + "grad_norm": 0.5866163652114286, + "learning_rate": 8.486290722798765e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.9045124351978302, + "num_tokens": 62934718.0, + "step": 2772 + }, + { + "epoch": 0.2770368150257256, + "grad_norm": 0.7046058065652787, + "learning_rate": 8.48513081386396e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.9046638309955597, + "num_tokens": 63016225.0, + "step": 2773 + }, + { + "epoch": 0.2771367201158899, + "grad_norm": 0.5379013518337603, + "learning_rate": 8.48397054003255e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.9090155065059662, + "num_tokens": 63097797.0, + "step": 2774 + }, + { + "epoch": 0.27723662520605424, + "grad_norm": 0.5401088222595386, + "learning_rate": 8.482809901426017e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.9054484069347382, + "num_tokens": 63179361.0, + "step": 2775 + }, + { + "epoch": 0.2773365302962186, + "grad_norm": 0.5548488200185989, + "learning_rate": 8.48164889816588e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.9079752862453461, + "num_tokens": 63260844.0, + "step": 2776 + }, + { + "epoch": 0.27743643538638296, + "grad_norm": 0.5653475237749693, + "learning_rate": 8.480487530373697e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.906932532787323, + "num_tokens": 63342404.0, + "step": 2777 + }, + { + "epoch": 0.2775363404765473, + "grad_norm": 0.5185590196179669, + "learning_rate": 8.479325798171066e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.9053781628608704, + "num_tokens": 63423885.0, + "step": 2778 + }, + { + "epoch": 0.2776362455667116, + "grad_norm": 0.4902979285857389, + "learning_rate": 8.47816370167962e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.9063495099544525, + "num_tokens": 63505308.0, + "step": 2779 + }, + { + "epoch": 0.27773615065687596, + "grad_norm": 0.6791807285240644, + "learning_rate": 8.477001241021033e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.9082508385181427, + "num_tokens": 63586741.0, + "step": 2780 + }, + { + "epoch": 0.2778360557470403, + "grad_norm": 0.6823557653492442, + "learning_rate": 8.475838416317012e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.9074939787387848, + "num_tokens": 63668297.0, + "step": 2781 + }, + { + "epoch": 0.2779359608372047, + "grad_norm": 0.6351264881527581, + "learning_rate": 8.474675227689309e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.9088583886623383, + "num_tokens": 63749779.0, + "step": 2782 + }, + { + "epoch": 0.278035865927369, + "grad_norm": 0.5279700580547, + "learning_rate": 8.473511675259712e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.9068367183208466, + "num_tokens": 63831362.0, + "step": 2783 + }, + { + "epoch": 0.27813577101753334, + "grad_norm": 0.5365571659765692, + "learning_rate": 8.472347759150044e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.907367616891861, + "num_tokens": 63912967.0, + "step": 2784 + }, + { + "epoch": 0.27823567610769767, + "grad_norm": 0.5136240867918112, + "learning_rate": 8.47118347948217e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.9061948955059052, + "num_tokens": 63994432.0, + "step": 2785 + }, + { + "epoch": 0.27833558119786206, + "grad_norm": 0.4492881657093678, + "learning_rate": 8.470018836377987e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.9094681441783905, + "num_tokens": 64075922.0, + "step": 2786 + }, + { + "epoch": 0.2784354862880264, + "grad_norm": 0.4651580321021055, + "learning_rate": 8.46885382995944e-06, + "loss": 0.493, + "mean_token_accuracy": 0.9076656103134155, + "num_tokens": 64157426.0, + "step": 2787 + }, + { + "epoch": 0.2785353913781907, + "grad_norm": 0.9037887305361323, + "learning_rate": 8.467688460348503e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.9044316411018372, + "num_tokens": 64238958.0, + "step": 2788 + }, + { + "epoch": 0.27863529646835505, + "grad_norm": 0.5207048669136259, + "learning_rate": 8.466522727667192e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.9029217064380646, + "num_tokens": 64320512.0, + "step": 2789 + }, + { + "epoch": 0.2787352015585194, + "grad_norm": 0.5402187204214194, + "learning_rate": 8.465356632037559e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.9066905379295349, + "num_tokens": 64402047.0, + "step": 2790 + }, + { + "epoch": 0.2788351066486838, + "grad_norm": 0.5068209249775218, + "learning_rate": 8.464190173581698e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.9083680510520935, + "num_tokens": 64483586.0, + "step": 2791 + }, + { + "epoch": 0.2789350117388481, + "grad_norm": 0.6319712464591133, + "learning_rate": 8.463023352421737e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.902373880147934, + "num_tokens": 64565117.0, + "step": 2792 + }, + { + "epoch": 0.27903491682901244, + "grad_norm": 0.8187183401187617, + "learning_rate": 8.461856168679843e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.9068412780761719, + "num_tokens": 64646588.0, + "step": 2793 + }, + { + "epoch": 0.27913482191917677, + "grad_norm": 0.7657447697766412, + "learning_rate": 8.460688622478222e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.9085340201854706, + "num_tokens": 64728086.0, + "step": 2794 + }, + { + "epoch": 0.2792347270093411, + "grad_norm": 0.6050546987352453, + "learning_rate": 8.459520713939115e-06, + "loss": 0.495, + "mean_token_accuracy": 0.9047890901565552, + "num_tokens": 64809597.0, + "step": 2795 + }, + { + "epoch": 0.2793346320995055, + "grad_norm": 0.5684220761339107, + "learning_rate": 8.458352443184806e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.907113254070282, + "num_tokens": 64891022.0, + "step": 2796 + }, + { + "epoch": 0.2794345371896698, + "grad_norm": 0.6760957651312848, + "learning_rate": 8.457183810337614e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.9065247774124146, + "num_tokens": 64972512.0, + "step": 2797 + }, + { + "epoch": 0.27953444227983415, + "grad_norm": 0.5547567652770357, + "learning_rate": 8.456014815519894e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.9078332781791687, + "num_tokens": 65054003.0, + "step": 2798 + }, + { + "epoch": 0.2796343473699985, + "grad_norm": 0.604417980937652, + "learning_rate": 8.454845458854043e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.906196653842926, + "num_tokens": 65135512.0, + "step": 2799 + }, + { + "epoch": 0.27973425246016287, + "grad_norm": 0.5761837518593983, + "learning_rate": 8.453675740462492e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.9070056080818176, + "num_tokens": 65216978.0, + "step": 2800 + }, + { + "epoch": 0.2798341575503272, + "grad_norm": 1.4300261898244708, + "learning_rate": 8.452505660467713e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.9089474976062775, + "num_tokens": 65298445.0, + "step": 2801 + }, + { + "epoch": 0.27993406264049153, + "grad_norm": 0.555650799509899, + "learning_rate": 8.451335218992214e-06, + "loss": 0.492, + "mean_token_accuracy": 0.9071252346038818, + "num_tokens": 65379965.0, + "step": 2802 + }, + { + "epoch": 0.28003396773065586, + "grad_norm": 0.5164476392617827, + "learning_rate": 8.45016441615854e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.9087886810302734, + "num_tokens": 65461503.0, + "step": 2803 + }, + { + "epoch": 0.2801338728208202, + "grad_norm": 0.7466465279393146, + "learning_rate": 8.448993252089278e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.9075936675071716, + "num_tokens": 65543028.0, + "step": 2804 + }, + { + "epoch": 0.2802337779109846, + "grad_norm": 0.567326824349374, + "learning_rate": 8.447821726907048e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.9070024192333221, + "num_tokens": 65624530.0, + "step": 2805 + }, + { + "epoch": 0.2803336830011489, + "grad_norm": 0.5689857927803275, + "learning_rate": 8.446649840734509e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.906728595495224, + "num_tokens": 65706144.0, + "step": 2806 + }, + { + "epoch": 0.28043358809131325, + "grad_norm": 0.5381997808731059, + "learning_rate": 8.44547759369436e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.9036549627780914, + "num_tokens": 65787655.0, + "step": 2807 + }, + { + "epoch": 0.2805334931814776, + "grad_norm": 0.5471713422918949, + "learning_rate": 8.44430498590934e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.9041629135608673, + "num_tokens": 65869214.0, + "step": 2808 + }, + { + "epoch": 0.28063339827164197, + "grad_norm": 0.7585218282219941, + "learning_rate": 8.443132017502216e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.9073747992515564, + "num_tokens": 65950744.0, + "step": 2809 + }, + { + "epoch": 0.2807333033618063, + "grad_norm": 0.5636509290582884, + "learning_rate": 8.441958688595802e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.904684454202652, + "num_tokens": 66032360.0, + "step": 2810 + }, + { + "epoch": 0.28083320845197063, + "grad_norm": 0.6210072067166297, + "learning_rate": 8.440784999312946e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.9067424237728119, + "num_tokens": 66113904.0, + "step": 2811 + }, + { + "epoch": 0.28093311354213496, + "grad_norm": 0.5537185473080287, + "learning_rate": 8.439610949776533e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.9036710262298584, + "num_tokens": 66195333.0, + "step": 2812 + }, + { + "epoch": 0.2810330186322993, + "grad_norm": 0.639886231023778, + "learning_rate": 8.438436540109492e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.9072079360485077, + "num_tokens": 66276913.0, + "step": 2813 + }, + { + "epoch": 0.2811329237224637, + "grad_norm": 0.5543989452302831, + "learning_rate": 8.43726177043478e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.9056911766529083, + "num_tokens": 66358429.0, + "step": 2814 + }, + { + "epoch": 0.281232828812628, + "grad_norm": 0.834788603851038, + "learning_rate": 8.436086640875398e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.9077649414539337, + "num_tokens": 66439917.0, + "step": 2815 + }, + { + "epoch": 0.28133273390279234, + "grad_norm": 0.45701922606450995, + "learning_rate": 8.434911151554384e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.9073756635189056, + "num_tokens": 66521452.0, + "step": 2816 + }, + { + "epoch": 0.2814326389929567, + "grad_norm": 0.5242629615020948, + "learning_rate": 8.43373530259481e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.9019470810890198, + "num_tokens": 66602948.0, + "step": 2817 + }, + { + "epoch": 0.28153254408312106, + "grad_norm": 0.536839065161351, + "learning_rate": 8.432559094119793e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.9071721732616425, + "num_tokens": 66684482.0, + "step": 2818 + }, + { + "epoch": 0.2816324491732854, + "grad_norm": 0.5091345146664225, + "learning_rate": 8.43138252625248e-06, + "loss": 0.496, + "mean_token_accuracy": 0.906541258096695, + "num_tokens": 66766038.0, + "step": 2819 + }, + { + "epoch": 0.2817323542634497, + "grad_norm": 0.47341216927579394, + "learning_rate": 8.43020559911606e-06, + "loss": 0.493, + "mean_token_accuracy": 0.9085954427719116, + "num_tokens": 66847563.0, + "step": 2820 + }, + { + "epoch": 0.28183225935361406, + "grad_norm": 0.4896806345416116, + "learning_rate": 8.429028312833759e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.9085395932197571, + "num_tokens": 66929103.0, + "step": 2821 + }, + { + "epoch": 0.2819321644437784, + "grad_norm": 0.5839713137053986, + "learning_rate": 8.427850667528838e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.9080392122268677, + "num_tokens": 67010624.0, + "step": 2822 + }, + { + "epoch": 0.2820320695339428, + "grad_norm": 0.5278321062579029, + "learning_rate": 8.4266726633246e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.9051914513111115, + "num_tokens": 67092089.0, + "step": 2823 + }, + { + "epoch": 0.2821319746241071, + "grad_norm": 0.5004555444181508, + "learning_rate": 8.425494300344381e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.9104185402393341, + "num_tokens": 67173640.0, + "step": 2824 + }, + { + "epoch": 0.28223187971427144, + "grad_norm": 0.46033203195590067, + "learning_rate": 8.424315578711559e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.9071679711341858, + "num_tokens": 67255186.0, + "step": 2825 + }, + { + "epoch": 0.28233178480443577, + "grad_norm": 0.4969251392734958, + "learning_rate": 8.423136498549545e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.9083760976791382, + "num_tokens": 67336694.0, + "step": 2826 + }, + { + "epoch": 0.28243168989460016, + "grad_norm": 0.5247562296016731, + "learning_rate": 8.421957059981792e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.9061400592327118, + "num_tokens": 67418259.0, + "step": 2827 + }, + { + "epoch": 0.2825315949847645, + "grad_norm": 0.510910781703477, + "learning_rate": 8.420777263131786e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.9087296724319458, + "num_tokens": 67499817.0, + "step": 2828 + }, + { + "epoch": 0.2826315000749288, + "grad_norm": 0.7718714145743476, + "learning_rate": 8.419597108123054e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9054272472858429, + "num_tokens": 67581270.0, + "step": 2829 + }, + { + "epoch": 0.28273140516509315, + "grad_norm": 0.5751261507832688, + "learning_rate": 8.41841659507916e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.9092183709144592, + "num_tokens": 67662848.0, + "step": 2830 + }, + { + "epoch": 0.2828313102552575, + "grad_norm": 0.6506917022642201, + "learning_rate": 8.417235724123705e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.9046803116798401, + "num_tokens": 67744355.0, + "step": 2831 + }, + { + "epoch": 0.28293121534542187, + "grad_norm": 0.57992824964899, + "learning_rate": 8.416054495380326e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.9072366058826447, + "num_tokens": 67825896.0, + "step": 2832 + }, + { + "epoch": 0.2830311204355862, + "grad_norm": 0.5551178591898792, + "learning_rate": 8.4148729089727e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.9056220352649689, + "num_tokens": 67907496.0, + "step": 2833 + }, + { + "epoch": 0.28313102552575053, + "grad_norm": 0.5612062926003102, + "learning_rate": 8.41369096502454e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.9035148918628693, + "num_tokens": 67988932.0, + "step": 2834 + }, + { + "epoch": 0.28323093061591487, + "grad_norm": 0.6263840109083681, + "learning_rate": 8.412508663659596e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.9058782160282135, + "num_tokens": 68070358.0, + "step": 2835 + }, + { + "epoch": 0.2833308357060792, + "grad_norm": 0.6343604417622949, + "learning_rate": 8.411326005001658e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.90296271443367, + "num_tokens": 68151823.0, + "step": 2836 + }, + { + "epoch": 0.2834307407962436, + "grad_norm": 0.652552494687745, + "learning_rate": 8.410142989174548e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.9098960757255554, + "num_tokens": 68233415.0, + "step": 2837 + }, + { + "epoch": 0.2835306458864079, + "grad_norm": 0.5410972928105567, + "learning_rate": 8.408959616302133e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.9070931971073151, + "num_tokens": 68315014.0, + "step": 2838 + }, + { + "epoch": 0.28363055097657225, + "grad_norm": 0.6162902876994057, + "learning_rate": 8.40777588650831e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.9076626300811768, + "num_tokens": 68396525.0, + "step": 2839 + }, + { + "epoch": 0.2837304560667366, + "grad_norm": 0.5605867903848466, + "learning_rate": 8.40659179991702e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.9055051803588867, + "num_tokens": 68477975.0, + "step": 2840 + }, + { + "epoch": 0.28383036115690097, + "grad_norm": 0.5445188037588735, + "learning_rate": 8.405407356652233e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.9056974351406097, + "num_tokens": 68559443.0, + "step": 2841 + }, + { + "epoch": 0.2839302662470653, + "grad_norm": 0.9855196874391571, + "learning_rate": 8.404222556837967e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.9083694219589233, + "num_tokens": 68641002.0, + "step": 2842 + }, + { + "epoch": 0.28403017133722963, + "grad_norm": 0.6227185830358613, + "learning_rate": 8.403037400598268e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.9065175652503967, + "num_tokens": 68722598.0, + "step": 2843 + }, + { + "epoch": 0.28413007642739396, + "grad_norm": 0.9877411602104752, + "learning_rate": 8.401851888057224e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.9024023115634918, + "num_tokens": 68804040.0, + "step": 2844 + }, + { + "epoch": 0.2842299815175583, + "grad_norm": 0.5152325206073547, + "learning_rate": 8.40066601933896e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.9068717062473297, + "num_tokens": 68885560.0, + "step": 2845 + }, + { + "epoch": 0.2843298866077227, + "grad_norm": 0.4921431380664481, + "learning_rate": 8.399479794567637e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.909284383058548, + "num_tokens": 68967145.0, + "step": 2846 + }, + { + "epoch": 0.284429791697887, + "grad_norm": 0.4611889051274111, + "learning_rate": 8.398293213867454e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.9080677926540375, + "num_tokens": 69048710.0, + "step": 2847 + }, + { + "epoch": 0.28452969678805135, + "grad_norm": 0.5872550525816858, + "learning_rate": 8.397106277362647e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.9073266685009003, + "num_tokens": 69130143.0, + "step": 2848 + }, + { + "epoch": 0.2846296018782157, + "grad_norm": 0.6051238616839356, + "learning_rate": 8.39591898517749e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.9046970903873444, + "num_tokens": 69211687.0, + "step": 2849 + }, + { + "epoch": 0.28472950696838006, + "grad_norm": 0.6560979819364912, + "learning_rate": 8.394731337436292e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.9053251445293427, + "num_tokens": 69293124.0, + "step": 2850 + }, + { + "epoch": 0.2848294120585444, + "grad_norm": 0.717937964545476, + "learning_rate": 8.393543334263404e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.907353401184082, + "num_tokens": 69374738.0, + "step": 2851 + }, + { + "epoch": 0.2849293171487087, + "grad_norm": 0.6390566795573683, + "learning_rate": 8.392354975783207e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.9070852696895599, + "num_tokens": 69456201.0, + "step": 2852 + }, + { + "epoch": 0.28502922223887306, + "grad_norm": 0.5797470178779295, + "learning_rate": 8.391166262120128e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.9087801277637482, + "num_tokens": 69537710.0, + "step": 2853 + }, + { + "epoch": 0.2851291273290374, + "grad_norm": 0.5126197799300284, + "learning_rate": 8.389977193398621e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.9077597260475159, + "num_tokens": 69619229.0, + "step": 2854 + }, + { + "epoch": 0.2852290324192018, + "grad_norm": 0.5758640026891017, + "learning_rate": 8.388787769743188e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.9084499776363373, + "num_tokens": 69700796.0, + "step": 2855 + }, + { + "epoch": 0.2853289375093661, + "grad_norm": 0.6212794084638358, + "learning_rate": 8.387597991278358e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.9055034816265106, + "num_tokens": 69782258.0, + "step": 2856 + }, + { + "epoch": 0.28542884259953044, + "grad_norm": 0.6659903339153689, + "learning_rate": 8.386407858128707e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.9073262214660645, + "num_tokens": 69863829.0, + "step": 2857 + }, + { + "epoch": 0.2855287476896948, + "grad_norm": 0.48164443560469095, + "learning_rate": 8.385217370418838e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.9060624241828918, + "num_tokens": 69945357.0, + "step": 2858 + }, + { + "epoch": 0.28562865277985916, + "grad_norm": 0.6760337930608482, + "learning_rate": 8.384026528273401e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.9064925014972687, + "num_tokens": 70026827.0, + "step": 2859 + }, + { + "epoch": 0.2857285578700235, + "grad_norm": 0.5302214248305098, + "learning_rate": 8.382835331817076e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.907738596200943, + "num_tokens": 70108387.0, + "step": 2860 + }, + { + "epoch": 0.2858284629601878, + "grad_norm": 0.5502863683298197, + "learning_rate": 8.381643781174581e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.9074424803256989, + "num_tokens": 70189979.0, + "step": 2861 + }, + { + "epoch": 0.28592836805035216, + "grad_norm": 0.49411158019688994, + "learning_rate": 8.380451876470674e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.9086919128894806, + "num_tokens": 70271590.0, + "step": 2862 + }, + { + "epoch": 0.2860282731405165, + "grad_norm": 0.681862436637343, + "learning_rate": 8.37925961783015e-06, + "loss": 0.491, + "mean_token_accuracy": 0.9084150791168213, + "num_tokens": 70353130.0, + "step": 2863 + }, + { + "epoch": 0.2861281782306809, + "grad_norm": 0.6216412876663978, + "learning_rate": 8.378067005377838e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.9071725606918335, + "num_tokens": 70434611.0, + "step": 2864 + }, + { + "epoch": 0.2862280833208452, + "grad_norm": 0.4997398159035856, + "learning_rate": 8.376874039238605e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.9032599031925201, + "num_tokens": 70516089.0, + "step": 2865 + }, + { + "epoch": 0.28632798841100954, + "grad_norm": 0.4661831507232705, + "learning_rate": 8.375680719537359e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.9075405597686768, + "num_tokens": 70597580.0, + "step": 2866 + }, + { + "epoch": 0.28642789350117387, + "grad_norm": 0.510970557281472, + "learning_rate": 8.374487046399035e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.9063113927841187, + "num_tokens": 70679138.0, + "step": 2867 + }, + { + "epoch": 0.28652779859133826, + "grad_norm": 0.6204506942749357, + "learning_rate": 8.37329301994862e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.9079482853412628, + "num_tokens": 70760727.0, + "step": 2868 + }, + { + "epoch": 0.2866277036815026, + "grad_norm": 0.6518081579860513, + "learning_rate": 8.372098640311125e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.9074148237705231, + "num_tokens": 70842329.0, + "step": 2869 + }, + { + "epoch": 0.2867276087716669, + "grad_norm": 0.5346002332884203, + "learning_rate": 8.370903907611603e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.9045387506484985, + "num_tokens": 70923798.0, + "step": 2870 + }, + { + "epoch": 0.28682751386183125, + "grad_norm": 0.7391829565829263, + "learning_rate": 8.369708821975144e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.9106015563011169, + "num_tokens": 71005305.0, + "step": 2871 + }, + { + "epoch": 0.2869274189519956, + "grad_norm": 0.6144429608779843, + "learning_rate": 8.368513383526874e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.9105319976806641, + "num_tokens": 71086930.0, + "step": 2872 + }, + { + "epoch": 0.28702732404215997, + "grad_norm": 0.645012777855929, + "learning_rate": 8.36731759239196e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.9053283035755157, + "num_tokens": 71168386.0, + "step": 2873 + }, + { + "epoch": 0.2871272291323243, + "grad_norm": 0.7869293984181615, + "learning_rate": 8.366121448695599e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.9073951244354248, + "num_tokens": 71249940.0, + "step": 2874 + }, + { + "epoch": 0.28722713422248863, + "grad_norm": 0.6169968330315558, + "learning_rate": 8.36492495256303e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.9073703289031982, + "num_tokens": 71331449.0, + "step": 2875 + }, + { + "epoch": 0.28732703931265297, + "grad_norm": 0.6313653767989842, + "learning_rate": 8.363728104119527e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.9074035882949829, + "num_tokens": 71412916.0, + "step": 2876 + }, + { + "epoch": 0.2874269444028173, + "grad_norm": 0.4830935218872949, + "learning_rate": 8.362530903490399e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.9074584245681763, + "num_tokens": 71494410.0, + "step": 2877 + }, + { + "epoch": 0.2875268494929817, + "grad_norm": 0.6115066301386091, + "learning_rate": 8.361333350800998e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.9059581160545349, + "num_tokens": 71575940.0, + "step": 2878 + }, + { + "epoch": 0.287626754583146, + "grad_norm": 0.5302482785821098, + "learning_rate": 8.360135446176704e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.908960610628128, + "num_tokens": 71657528.0, + "step": 2879 + }, + { + "epoch": 0.28772665967331035, + "grad_norm": 0.47559851048265733, + "learning_rate": 8.358937189742944e-06, + "loss": 0.491, + "mean_token_accuracy": 0.9096330404281616, + "num_tokens": 71739052.0, + "step": 2880 + }, + { + "epoch": 0.2878265647634747, + "grad_norm": 0.8228950624786515, + "learning_rate": 8.357738581625174e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.90751713514328, + "num_tokens": 71820637.0, + "step": 2881 + }, + { + "epoch": 0.28792646985363907, + "grad_norm": 0.4756745309457338, + "learning_rate": 8.356539621948892e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.9046826660633087, + "num_tokens": 71902105.0, + "step": 2882 + }, + { + "epoch": 0.2880263749438034, + "grad_norm": 0.5092303047294754, + "learning_rate": 8.355340310839625e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.9060244858264923, + "num_tokens": 71983586.0, + "step": 2883 + }, + { + "epoch": 0.28812628003396773, + "grad_norm": 0.6463974865327895, + "learning_rate": 8.354140648422947e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.9072924554347992, + "num_tokens": 72065066.0, + "step": 2884 + }, + { + "epoch": 0.28822618512413206, + "grad_norm": 0.6166571879971441, + "learning_rate": 8.35294063482446e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.9064668118953705, + "num_tokens": 72146639.0, + "step": 2885 + }, + { + "epoch": 0.2883260902142964, + "grad_norm": 0.6272329842404974, + "learning_rate": 8.35174027016981e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.9105411171913147, + "num_tokens": 72228222.0, + "step": 2886 + }, + { + "epoch": 0.2884259953044608, + "grad_norm": 0.47933708806108216, + "learning_rate": 8.350539554584674e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.9070029854774475, + "num_tokens": 72309744.0, + "step": 2887 + }, + { + "epoch": 0.2885259003946251, + "grad_norm": 0.9742208049388378, + "learning_rate": 8.34933848819477e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.9073057174682617, + "num_tokens": 72391266.0, + "step": 2888 + }, + { + "epoch": 0.28862580548478944, + "grad_norm": 0.44762431798010055, + "learning_rate": 8.348137071125848e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.9066236913204193, + "num_tokens": 72472732.0, + "step": 2889 + }, + { + "epoch": 0.2887257105749538, + "grad_norm": 0.5336548618083632, + "learning_rate": 8.346935303503701e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.9045854806900024, + "num_tokens": 72554181.0, + "step": 2890 + }, + { + "epoch": 0.28882561566511816, + "grad_norm": 0.46268883248426135, + "learning_rate": 8.345733185454153e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.9086142182350159, + "num_tokens": 72635709.0, + "step": 2891 + }, + { + "epoch": 0.2889255207552825, + "grad_norm": 0.5090820736378497, + "learning_rate": 8.344530717103067e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.9082784652709961, + "num_tokens": 72717150.0, + "step": 2892 + }, + { + "epoch": 0.2890254258454468, + "grad_norm": 0.6902384311706086, + "learning_rate": 8.343327898576346e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.9060169756412506, + "num_tokens": 72798673.0, + "step": 2893 + }, + { + "epoch": 0.28912533093561116, + "grad_norm": 0.5974871148785873, + "learning_rate": 8.342124729999921e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.9098801016807556, + "num_tokens": 72880152.0, + "step": 2894 + }, + { + "epoch": 0.2892252360257755, + "grad_norm": 1.0861072233833873, + "learning_rate": 8.340921211499766e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.904050886631012, + "num_tokens": 72961643.0, + "step": 2895 + }, + { + "epoch": 0.2893251411159399, + "grad_norm": 0.6501710143747602, + "learning_rate": 8.339717343201894e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.9081389605998993, + "num_tokens": 73043281.0, + "step": 2896 + }, + { + "epoch": 0.2894250462061042, + "grad_norm": 0.6002437413710917, + "learning_rate": 8.338513125232348e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.9099956750869751, + "num_tokens": 73124852.0, + "step": 2897 + }, + { + "epoch": 0.28952495129626854, + "grad_norm": 0.6717763121032756, + "learning_rate": 8.337308557717214e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.9082579910755157, + "num_tokens": 73206359.0, + "step": 2898 + }, + { + "epoch": 0.28962485638643287, + "grad_norm": 0.7057472790884025, + "learning_rate": 8.336103640782606e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.9040729403495789, + "num_tokens": 73287832.0, + "step": 2899 + }, + { + "epoch": 0.28972476147659726, + "grad_norm": 0.5073427154338086, + "learning_rate": 8.334898374554684e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.906712144613266, + "num_tokens": 73369362.0, + "step": 2900 + }, + { + "epoch": 0.2898246665667616, + "grad_norm": 0.729872763799816, + "learning_rate": 8.333692759159643e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.9055640697479248, + "num_tokens": 73450861.0, + "step": 2901 + }, + { + "epoch": 0.2899245716569259, + "grad_norm": 0.5153388006471175, + "learning_rate": 8.332486794723706e-06, + "loss": 0.496, + "mean_token_accuracy": 0.9087422490119934, + "num_tokens": 73532354.0, + "step": 2902 + }, + { + "epoch": 0.29002447674709025, + "grad_norm": 0.6376643905825478, + "learning_rate": 8.331280481373143e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.9081791937351227, + "num_tokens": 73613885.0, + "step": 2903 + }, + { + "epoch": 0.2901243818372546, + "grad_norm": 0.5112435979642818, + "learning_rate": 8.330073819234254e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.9053768515586853, + "num_tokens": 73695377.0, + "step": 2904 + }, + { + "epoch": 0.290224286927419, + "grad_norm": 0.6911390321721221, + "learning_rate": 8.328866808433378e-06, + "loss": 0.494, + "mean_token_accuracy": 0.9102827608585358, + "num_tokens": 73776905.0, + "step": 2905 + }, + { + "epoch": 0.2903241920175833, + "grad_norm": 0.539287682660571, + "learning_rate": 8.327659449096892e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.9089295268058777, + "num_tokens": 73858423.0, + "step": 2906 + }, + { + "epoch": 0.29042409710774764, + "grad_norm": 0.590228406854777, + "learning_rate": 8.326451741351204e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.9083669185638428, + "num_tokens": 73940003.0, + "step": 2907 + }, + { + "epoch": 0.29052400219791197, + "grad_norm": 0.7756112819298797, + "learning_rate": 8.325243685322767e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.9081235527992249, + "num_tokens": 74021534.0, + "step": 2908 + }, + { + "epoch": 0.29062390728807636, + "grad_norm": 0.6090650914193749, + "learning_rate": 8.324035281138063e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.9080474972724915, + "num_tokens": 74103101.0, + "step": 2909 + }, + { + "epoch": 0.2907238123782407, + "grad_norm": 0.525108567545182, + "learning_rate": 8.322826528923614e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.9067792892456055, + "num_tokens": 74184559.0, + "step": 2910 + }, + { + "epoch": 0.290823717468405, + "grad_norm": 0.9160644797310716, + "learning_rate": 8.321617428805974e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.9068024456501007, + "num_tokens": 74266050.0, + "step": 2911 + }, + { + "epoch": 0.29092362255856935, + "grad_norm": 0.4348463402255769, + "learning_rate": 8.320407980911743e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.908017098903656, + "num_tokens": 74347568.0, + "step": 2912 + }, + { + "epoch": 0.2910235276487337, + "grad_norm": 0.6325073589820536, + "learning_rate": 8.319198185367547e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.9077715277671814, + "num_tokens": 74429056.0, + "step": 2913 + }, + { + "epoch": 0.29112343273889807, + "grad_norm": 0.7088677985386254, + "learning_rate": 8.317988042300054e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.9076941311359406, + "num_tokens": 74510623.0, + "step": 2914 + }, + { + "epoch": 0.2912233378290624, + "grad_norm": 0.4951014656618609, + "learning_rate": 8.316777551835967e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.9081453680992126, + "num_tokens": 74592122.0, + "step": 2915 + }, + { + "epoch": 0.29132324291922673, + "grad_norm": 0.8305279475446484, + "learning_rate": 8.315566714102025e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.9040420949459076, + "num_tokens": 74673581.0, + "step": 2916 + }, + { + "epoch": 0.29142314800939106, + "grad_norm": 0.5366088464757107, + "learning_rate": 8.314355529225004e-06, + "loss": 0.5, + "mean_token_accuracy": 0.9059423208236694, + "num_tokens": 74755014.0, + "step": 2917 + }, + { + "epoch": 0.2915230530995554, + "grad_norm": 0.547473513298671, + "learning_rate": 8.313143997331718e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.9072721600532532, + "num_tokens": 74836612.0, + "step": 2918 + }, + { + "epoch": 0.2916229581897198, + "grad_norm": 0.5624026066847804, + "learning_rate": 8.311932118549015e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.9034996926784515, + "num_tokens": 74918026.0, + "step": 2919 + }, + { + "epoch": 0.2917228632798841, + "grad_norm": 0.5505364741793691, + "learning_rate": 8.310719893003779e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.9063855707645416, + "num_tokens": 74999483.0, + "step": 2920 + }, + { + "epoch": 0.29182276837004845, + "grad_norm": 0.5250971160466996, + "learning_rate": 8.30950732082293e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.9051230549812317, + "num_tokens": 75080913.0, + "step": 2921 + }, + { + "epoch": 0.2919226734602128, + "grad_norm": 0.45417834536631346, + "learning_rate": 8.30829440213343e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.9071054756641388, + "num_tokens": 75162389.0, + "step": 2922 + }, + { + "epoch": 0.29202257855037717, + "grad_norm": 0.4227851406418758, + "learning_rate": 8.307081137062267e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.9064074754714966, + "num_tokens": 75243893.0, + "step": 2923 + }, + { + "epoch": 0.2921224836405415, + "grad_norm": 0.4399715194314648, + "learning_rate": 8.305867525736475e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.9083541631698608, + "num_tokens": 75325396.0, + "step": 2924 + }, + { + "epoch": 0.29222238873070583, + "grad_norm": 0.47425292285998116, + "learning_rate": 8.30465356828312e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.909529060125351, + "num_tokens": 75407000.0, + "step": 2925 + }, + { + "epoch": 0.29232229382087016, + "grad_norm": 0.5078243872094828, + "learning_rate": 8.303439264829302e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.906792938709259, + "num_tokens": 75488568.0, + "step": 2926 + }, + { + "epoch": 0.2924221989110345, + "grad_norm": 0.5041412475754922, + "learning_rate": 8.302224615502162e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.9068872332572937, + "num_tokens": 75570197.0, + "step": 2927 + }, + { + "epoch": 0.2925221040011989, + "grad_norm": 0.5546476984003843, + "learning_rate": 8.301009620428874e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.9059956967830658, + "num_tokens": 75651785.0, + "step": 2928 + }, + { + "epoch": 0.2926220090913632, + "grad_norm": 0.45146105659991437, + "learning_rate": 8.29979427973665e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.907560259103775, + "num_tokens": 75733308.0, + "step": 2929 + }, + { + "epoch": 0.29272191418152754, + "grad_norm": 0.4994060160231168, + "learning_rate": 8.298578593552737e-06, + "loss": 0.494, + "mean_token_accuracy": 0.9066601097583771, + "num_tokens": 75814831.0, + "step": 2930 + }, + { + "epoch": 0.2928218192716919, + "grad_norm": 0.5556130357073222, + "learning_rate": 8.297362562004418e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.9073605835437775, + "num_tokens": 75896414.0, + "step": 2931 + }, + { + "epoch": 0.29292172436185626, + "grad_norm": 0.5466800948687687, + "learning_rate": 8.296146185219013e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.903481125831604, + "num_tokens": 75977867.0, + "step": 2932 + }, + { + "epoch": 0.2930216294520206, + "grad_norm": 0.48583977391080874, + "learning_rate": 8.294929463323878e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.9045122861862183, + "num_tokens": 76059345.0, + "step": 2933 + }, + { + "epoch": 0.2931215345421849, + "grad_norm": 0.4748169709337687, + "learning_rate": 8.293712396446405e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.9049191474914551, + "num_tokens": 76140760.0, + "step": 2934 + }, + { + "epoch": 0.29322143963234926, + "grad_norm": 0.4682768904746809, + "learning_rate": 8.292494984714024e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.907799482345581, + "num_tokens": 76222293.0, + "step": 2935 + }, + { + "epoch": 0.2933213447225136, + "grad_norm": 0.5852673593571429, + "learning_rate": 8.291277228254193e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.9068619906902313, + "num_tokens": 76303792.0, + "step": 2936 + }, + { + "epoch": 0.293421249812678, + "grad_norm": 0.6250148850010211, + "learning_rate": 8.29005912719442e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.9056819677352905, + "num_tokens": 76385267.0, + "step": 2937 + }, + { + "epoch": 0.2935211549028423, + "grad_norm": 0.5108383201675473, + "learning_rate": 8.288840681662236e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.9051901400089264, + "num_tokens": 76466704.0, + "step": 2938 + }, + { + "epoch": 0.29362105999300664, + "grad_norm": 0.4611996061555322, + "learning_rate": 8.287621891785217e-06, + "loss": 0.495, + "mean_token_accuracy": 0.9063418507575989, + "num_tokens": 76548225.0, + "step": 2939 + }, + { + "epoch": 0.29372096508317097, + "grad_norm": 0.5443528502776952, + "learning_rate": 8.286402757690968e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.9081435203552246, + "num_tokens": 76629767.0, + "step": 2940 + }, + { + "epoch": 0.29382087017333536, + "grad_norm": 0.48178099961461274, + "learning_rate": 8.285183279507135e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.9042298793792725, + "num_tokens": 76711270.0, + "step": 2941 + }, + { + "epoch": 0.2939207752634997, + "grad_norm": 0.4720851129331268, + "learning_rate": 8.2839634573614e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.9053292572498322, + "num_tokens": 76792782.0, + "step": 2942 + }, + { + "epoch": 0.294020680353664, + "grad_norm": 0.5091580819147918, + "learning_rate": 8.28274329138148e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.9053705334663391, + "num_tokens": 76874251.0, + "step": 2943 + }, + { + "epoch": 0.29412058544382835, + "grad_norm": 0.46420727459337613, + "learning_rate": 8.281522781695127e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.9093121588230133, + "num_tokens": 76955751.0, + "step": 2944 + }, + { + "epoch": 0.2942204905339927, + "grad_norm": 0.5093579785028763, + "learning_rate": 8.280301928430125e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.9096320569515228, + "num_tokens": 77037252.0, + "step": 2945 + }, + { + "epoch": 0.29432039562415707, + "grad_norm": 0.4477898351451511, + "learning_rate": 8.279080731714304e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.9076181352138519, + "num_tokens": 77118779.0, + "step": 2946 + }, + { + "epoch": 0.2944203007143214, + "grad_norm": 0.49933917815905066, + "learning_rate": 8.277859191675521e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.9080526530742645, + "num_tokens": 77200361.0, + "step": 2947 + }, + { + "epoch": 0.29452020580448574, + "grad_norm": 0.5395844896809233, + "learning_rate": 8.276637308441675e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.9062846601009369, + "num_tokens": 77281873.0, + "step": 2948 + }, + { + "epoch": 0.29462011089465007, + "grad_norm": 0.4671878986708899, + "learning_rate": 8.275415082140698e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.9065667688846588, + "num_tokens": 77363367.0, + "step": 2949 + }, + { + "epoch": 0.2947200159848144, + "grad_norm": 0.44282283270212114, + "learning_rate": 8.274192512900558e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.9075462222099304, + "num_tokens": 77444889.0, + "step": 2950 + }, + { + "epoch": 0.2948199210749788, + "grad_norm": 0.5145512423142364, + "learning_rate": 8.272969600849257e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.9085985422134399, + "num_tokens": 77526404.0, + "step": 2951 + }, + { + "epoch": 0.2949198261651431, + "grad_norm": 0.5138570581793921, + "learning_rate": 8.27174634611484e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.9074752330780029, + "num_tokens": 77607908.0, + "step": 2952 + }, + { + "epoch": 0.29501973125530745, + "grad_norm": 0.5249346205022701, + "learning_rate": 8.270522748825377e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.9057457745075226, + "num_tokens": 77689388.0, + "step": 2953 + }, + { + "epoch": 0.2951196363454718, + "grad_norm": 0.5467399765236233, + "learning_rate": 8.269298809108983e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.9052582681179047, + "num_tokens": 77770864.0, + "step": 2954 + }, + { + "epoch": 0.29521954143563617, + "grad_norm": 0.9434976729797858, + "learning_rate": 8.268074527093807e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.9074993431568146, + "num_tokens": 77852450.0, + "step": 2955 + }, + { + "epoch": 0.2953194465258005, + "grad_norm": 0.644844939253276, + "learning_rate": 8.266849902908032e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.9072857797145844, + "num_tokens": 77933953.0, + "step": 2956 + }, + { + "epoch": 0.29541935161596483, + "grad_norm": 0.5640298642019582, + "learning_rate": 8.265624936679875e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.9057630300521851, + "num_tokens": 78015501.0, + "step": 2957 + }, + { + "epoch": 0.29551925670612916, + "grad_norm": 0.47003791161414266, + "learning_rate": 8.264399628537592e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.9077657759189606, + "num_tokens": 78097076.0, + "step": 2958 + }, + { + "epoch": 0.2956191617962935, + "grad_norm": 1.0954608320402286, + "learning_rate": 8.263173978609475e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.908143013715744, + "num_tokens": 78178594.0, + "step": 2959 + }, + { + "epoch": 0.2957190668864579, + "grad_norm": 0.48299484376049906, + "learning_rate": 8.261947987023851e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.907172292470932, + "num_tokens": 78260177.0, + "step": 2960 + }, + { + "epoch": 0.2958189719766222, + "grad_norm": 0.531070520377587, + "learning_rate": 8.26072165390908e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.9069130718708038, + "num_tokens": 78341677.0, + "step": 2961 + }, + { + "epoch": 0.29591887706678655, + "grad_norm": 0.47121331634796354, + "learning_rate": 8.259494979393563e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.9085469543933868, + "num_tokens": 78423209.0, + "step": 2962 + }, + { + "epoch": 0.2960187821569509, + "grad_norm": 0.5700185457825114, + "learning_rate": 8.258267963605735e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.9070436060428619, + "num_tokens": 78504669.0, + "step": 2963 + }, + { + "epoch": 0.29611868724711526, + "grad_norm": 0.5155063531961833, + "learning_rate": 8.25704060667406e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.9105023145675659, + "num_tokens": 78586215.0, + "step": 2964 + }, + { + "epoch": 0.2962185923372796, + "grad_norm": 0.6330268808686103, + "learning_rate": 8.25581290872705e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.9067116677761078, + "num_tokens": 78667731.0, + "step": 2965 + }, + { + "epoch": 0.2963184974274439, + "grad_norm": 0.5489706160811416, + "learning_rate": 8.254584869893243e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.9079139828681946, + "num_tokens": 78749253.0, + "step": 2966 + }, + { + "epoch": 0.29641840251760826, + "grad_norm": 0.5118619209288437, + "learning_rate": 8.253356490301216e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.9071004688739777, + "num_tokens": 78830770.0, + "step": 2967 + }, + { + "epoch": 0.2965183076077726, + "grad_norm": 0.46912688419497495, + "learning_rate": 8.252127770079581e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.9077455997467041, + "num_tokens": 78912288.0, + "step": 2968 + }, + { + "epoch": 0.296618212697937, + "grad_norm": 0.654593543264848, + "learning_rate": 8.25089870935699e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.9072445929050446, + "num_tokens": 78993817.0, + "step": 2969 + }, + { + "epoch": 0.2967181177881013, + "grad_norm": 0.46672515674150716, + "learning_rate": 8.24966930826212e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.9079664051532745, + "num_tokens": 79075333.0, + "step": 2970 + }, + { + "epoch": 0.29681802287826564, + "grad_norm": 0.47760235339270485, + "learning_rate": 8.248439566923697e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.9078577160835266, + "num_tokens": 79156816.0, + "step": 2971 + }, + { + "epoch": 0.29691792796843, + "grad_norm": 0.5638578793775658, + "learning_rate": 8.247209485470473e-06, + "loss": 0.492, + "mean_token_accuracy": 0.9083449244499207, + "num_tokens": 79238352.0, + "step": 2972 + }, + { + "epoch": 0.29701783305859436, + "grad_norm": 0.593431580320485, + "learning_rate": 8.245979064031238e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.9058738648891449, + "num_tokens": 79319812.0, + "step": 2973 + }, + { + "epoch": 0.2971177381487587, + "grad_norm": 0.5219099615919374, + "learning_rate": 8.244748302734822e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.9082688689231873, + "num_tokens": 79401331.0, + "step": 2974 + }, + { + "epoch": 0.297217643238923, + "grad_norm": 0.5459636837361228, + "learning_rate": 8.243517201710083e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.9079062044620514, + "num_tokens": 79482841.0, + "step": 2975 + }, + { + "epoch": 0.29731754832908736, + "grad_norm": 0.46369246959495214, + "learning_rate": 8.24228576108592e-06, + "loss": 0.497, + "mean_token_accuracy": 0.9080342650413513, + "num_tokens": 79564330.0, + "step": 2976 + }, + { + "epoch": 0.2974174534192517, + "grad_norm": 0.5904785650037487, + "learning_rate": 8.241053980991267e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.906483381986618, + "num_tokens": 79645849.0, + "step": 2977 + }, + { + "epoch": 0.2975173585094161, + "grad_norm": 0.4435898396622726, + "learning_rate": 8.239821861555091e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.9081998765468597, + "num_tokens": 79727416.0, + "step": 2978 + }, + { + "epoch": 0.2976172635995804, + "grad_norm": 0.4565832145691444, + "learning_rate": 8.238589402906396e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.9060254096984863, + "num_tokens": 79808882.0, + "step": 2979 + }, + { + "epoch": 0.29771716868974474, + "grad_norm": 0.5172722783916137, + "learning_rate": 8.237356605174225e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.9063146412372589, + "num_tokens": 79890325.0, + "step": 2980 + }, + { + "epoch": 0.29781707377990907, + "grad_norm": 0.5135669365731377, + "learning_rate": 8.236123468487649e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.9058278799057007, + "num_tokens": 79971836.0, + "step": 2981 + }, + { + "epoch": 0.29791697887007346, + "grad_norm": 0.511966499224255, + "learning_rate": 8.234889992975778e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.9085327386856079, + "num_tokens": 80053366.0, + "step": 2982 + }, + { + "epoch": 0.2980168839602378, + "grad_norm": 0.571910509150545, + "learning_rate": 8.233656178767762e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.906772643327713, + "num_tokens": 80134877.0, + "step": 2983 + }, + { + "epoch": 0.2981167890504021, + "grad_norm": 0.4648521690355255, + "learning_rate": 8.232422025992781e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.9077282547950745, + "num_tokens": 80216298.0, + "step": 2984 + }, + { + "epoch": 0.29821669414056645, + "grad_norm": 0.6112034873602576, + "learning_rate": 8.231187534780051e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.9072686731815338, + "num_tokens": 80297864.0, + "step": 2985 + }, + { + "epoch": 0.2983165992307308, + "grad_norm": 0.5325954759786052, + "learning_rate": 8.229952705258827e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.9077335000038147, + "num_tokens": 80379283.0, + "step": 2986 + }, + { + "epoch": 0.29841650432089517, + "grad_norm": 0.9453362475227042, + "learning_rate": 8.228717537558392e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.908373087644577, + "num_tokens": 80460812.0, + "step": 2987 + }, + { + "epoch": 0.2985164094110595, + "grad_norm": 0.5674188344099479, + "learning_rate": 8.227482031808076e-06, + "loss": 0.483, + "mean_token_accuracy": 0.9103112518787384, + "num_tokens": 80542518.0, + "step": 2988 + }, + { + "epoch": 0.29861631450122383, + "grad_norm": 0.42630233278422763, + "learning_rate": 8.226246188137232e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.9073606431484222, + "num_tokens": 80623972.0, + "step": 2989 + }, + { + "epoch": 0.29871621959138817, + "grad_norm": 0.6503737793139875, + "learning_rate": 8.225010006675256e-06, + "loss": 0.498, + "mean_token_accuracy": 0.9050593674182892, + "num_tokens": 80705459.0, + "step": 2990 + }, + { + "epoch": 0.2988161246815525, + "grad_norm": 0.5072620566719561, + "learning_rate": 8.223773487551579e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.9118438065052032, + "num_tokens": 80786977.0, + "step": 2991 + }, + { + "epoch": 0.2989160297717169, + "grad_norm": 0.4492748565978717, + "learning_rate": 8.222536630895662e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.9059075713157654, + "num_tokens": 80868565.0, + "step": 2992 + }, + { + "epoch": 0.2990159348618812, + "grad_norm": 0.463045969009454, + "learning_rate": 8.221299436837008e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.9080836474895477, + "num_tokens": 80950068.0, + "step": 2993 + }, + { + "epoch": 0.29911583995204555, + "grad_norm": 0.5097748265820502, + "learning_rate": 8.220061905505152e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.9057662785053253, + "num_tokens": 81031634.0, + "step": 2994 + }, + { + "epoch": 0.2992157450422099, + "grad_norm": 0.5405243824471525, + "learning_rate": 8.218824037029664e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.9039269089698792, + "num_tokens": 81113208.0, + "step": 2995 + }, + { + "epoch": 0.29931565013237427, + "grad_norm": 0.5271769030589446, + "learning_rate": 8.217585831540152e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.9089834690093994, + "num_tokens": 81194686.0, + "step": 2996 + }, + { + "epoch": 0.2994155552225386, + "grad_norm": 0.4629196674421218, + "learning_rate": 8.216347289166253e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.9078019559383392, + "num_tokens": 81276203.0, + "step": 2997 + }, + { + "epoch": 0.29951546031270293, + "grad_norm": 0.45824940566408107, + "learning_rate": 8.215108410037649e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.9067547917366028, + "num_tokens": 81357681.0, + "step": 2998 + }, + { + "epoch": 0.29961536540286726, + "grad_norm": 0.49284471202872604, + "learning_rate": 8.213869194284048e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.9067279398441315, + "num_tokens": 81439137.0, + "step": 2999 + }, + { + "epoch": 0.2997152704930316, + "grad_norm": 0.4582186997202509, + "learning_rate": 8.2126296420352e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.9074684083461761, + "num_tokens": 81520679.0, + "step": 3000 + } + ], + "logging_steps": 1, + "max_steps": 10010, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1335627594235904.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}