{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.3587304668543143, "eval_steps": 200, "global_step": 3500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.1698873918503523, "epoch": 0.0019411821799475881, "grad_norm": 28.75, "learning_rate": 1.7241379310344828e-07, "loss": 0.9568568229675293, "mean_token_accuracy": 0.8273445844650269, "num_tokens": 68060.0, "step": 5 }, { "entropy": 0.1770024599507451, "epoch": 0.0038823643598951763, "grad_norm": 35.0, "learning_rate": 3.8793103448275865e-07, "loss": 0.9151897430419922, "mean_token_accuracy": 0.8321746543049813, "num_tokens": 138267.0, "step": 10 }, { "entropy": 0.19476149305701257, "epoch": 0.005823546539842764, "grad_norm": 30.75, "learning_rate": 6.034482758620691e-07, "loss": 0.8955170631408691, "mean_token_accuracy": 0.8212856188416481, "num_tokens": 217527.0, "step": 15 }, { "entropy": 0.19077662620693445, "epoch": 0.007764728719790353, "grad_norm": 29.875, "learning_rate": 8.189655172413794e-07, "loss": 0.9730923652648926, "mean_token_accuracy": 0.822833463549614, "num_tokens": 280569.0, "step": 20 }, { "entropy": 0.1851712815463543, "epoch": 0.00970591089973794, "grad_norm": 31.0, "learning_rate": 1.0344827586206898e-06, "loss": 0.8731012344360352, "mean_token_accuracy": 0.828074723482132, "num_tokens": 361410.0, "step": 25 }, { "entropy": 0.21108674593269824, "epoch": 0.011647093079685528, "grad_norm": 30.875, "learning_rate": 1.25e-06, "loss": 0.9689438819885254, "mean_token_accuracy": 0.8158086076378822, "num_tokens": 433416.0, "step": 30 }, { "entropy": 0.1849596280604601, "epoch": 0.013588275259633116, "grad_norm": 23.375, "learning_rate": 1.4655172413793104e-06, "loss": 0.8792544364929199, "mean_token_accuracy": 0.8349022850394249, "num_tokens": 509913.0, "step": 35 }, { "entropy": 0.22154813222587108, "epoch": 0.015529457439580705, "grad_norm": 24.125, "learning_rate": 1.681034482758621e-06, "loss": 0.8865782737731933, "mean_token_accuracy": 0.8225297197699547, "num_tokens": 574517.0, "step": 40 }, { "entropy": 0.21101174745708703, "epoch": 0.017470639619528293, "grad_norm": 21.125, "learning_rate": 1.896551724137931e-06, "loss": 0.8394794464111328, "mean_token_accuracy": 0.8262531071901321, "num_tokens": 646659.0, "step": 45 }, { "entropy": 0.22667213380336762, "epoch": 0.01941182179947588, "grad_norm": 18.875, "learning_rate": 2.1120689655172416e-06, "loss": 0.8253890037536621, "mean_token_accuracy": 0.8253167048096657, "num_tokens": 726398.0, "step": 50 }, { "entropy": 0.23717499971389772, "epoch": 0.02135300397942347, "grad_norm": 17.25, "learning_rate": 2.327586206896552e-06, "loss": 0.7434019565582275, "mean_token_accuracy": 0.8295832589268685, "num_tokens": 809917.0, "step": 55 }, { "entropy": 0.25738408528268336, "epoch": 0.023294186159371056, "grad_norm": 13.8125, "learning_rate": 2.543103448275862e-06, "loss": 0.7521392345428467, "mean_token_accuracy": 0.8236480697989463, "num_tokens": 886659.0, "step": 60 }, { "entropy": 0.22672694064676763, "epoch": 0.025235368339318644, "grad_norm": 12.5625, "learning_rate": 2.7586206896551725e-06, "loss": 0.7187893390655518, "mean_token_accuracy": 0.8356023579835892, "num_tokens": 953610.0, "step": 65 }, { "entropy": 0.26906434297561643, "epoch": 0.02717655051926623, "grad_norm": 11.9375, "learning_rate": 2.9741379310344832e-06, "loss": 0.7249302387237548, "mean_token_accuracy": 0.8295665010809898, "num_tokens": 1024103.0, "step": 70 }, { "entropy": 0.2811603628098965, "epoch": 0.029117732699213823, "grad_norm": 10.5, "learning_rate": 3.1896551724137935e-06, "loss": 0.7414528846740722, "mean_token_accuracy": 0.8267218798398972, "num_tokens": 1104806.0, "step": 75 }, { "entropy": 0.2909968294203281, "epoch": 0.03105891487916141, "grad_norm": 10.3125, "learning_rate": 3.4051724137931034e-06, "loss": 0.6241181850433349, "mean_token_accuracy": 0.8339074313640594, "num_tokens": 1185581.0, "step": 80 }, { "entropy": 0.3096594549715519, "epoch": 0.033000097059108995, "grad_norm": 7.6875, "learning_rate": 3.620689655172414e-06, "loss": 0.5764092445373535, "mean_token_accuracy": 0.834169502556324, "num_tokens": 1277766.0, "step": 85 }, { "entropy": 0.3442493978887796, "epoch": 0.034941279239056586, "grad_norm": 6.3125, "learning_rate": 3.8362068965517245e-06, "loss": 0.6330607414245606, "mean_token_accuracy": 0.8328578889369964, "num_tokens": 1343195.0, "step": 90 }, { "entropy": 0.3338872347027063, "epoch": 0.03688246141900418, "grad_norm": 3.9375, "learning_rate": 4.051724137931034e-06, "loss": 0.5495956897735595, "mean_token_accuracy": 0.8446441546082497, "num_tokens": 1422307.0, "step": 95 }, { "entropy": 0.3676390130072832, "epoch": 0.03882364359895176, "grad_norm": 3.78125, "learning_rate": 4.267241379310345e-06, "loss": 0.5221522331237793, "mean_token_accuracy": 0.8460515171289444, "num_tokens": 1502407.0, "step": 100 }, { "entropy": 0.3870431698858738, "epoch": 0.04076482577889935, "grad_norm": 2.875, "learning_rate": 4.482758620689656e-06, "loss": 0.5109545230865479, "mean_token_accuracy": 0.8474476292729378, "num_tokens": 1588898.0, "step": 105 }, { "entropy": 0.4107159897685051, "epoch": 0.04270600795884694, "grad_norm": 2.109375, "learning_rate": 4.698275862068966e-06, "loss": 0.5011598587036132, "mean_token_accuracy": 0.8443124324083329, "num_tokens": 1680514.0, "step": 110 }, { "entropy": 0.39204273708164694, "epoch": 0.04464719013879453, "grad_norm": 2.125, "learning_rate": 4.9137931034482765e-06, "loss": 0.4943844795227051, "mean_token_accuracy": 0.8546174108982086, "num_tokens": 1751043.0, "step": 115 }, { "entropy": 0.44253255538642405, "epoch": 0.04658837231874211, "grad_norm": 1.96875, "learning_rate": 5.129310344827587e-06, "loss": 0.5116967678070068, "mean_token_accuracy": 0.8467695400118828, "num_tokens": 1828152.0, "step": 120 }, { "entropy": 0.4137393295764923, "epoch": 0.0485295544986897, "grad_norm": 2.421875, "learning_rate": 5.344827586206896e-06, "loss": 0.4703562259674072, "mean_token_accuracy": 0.8589524060487748, "num_tokens": 1903581.0, "step": 125 }, { "entropy": 0.43903482854366305, "epoch": 0.05047073667863729, "grad_norm": 2.375, "learning_rate": 5.560344827586207e-06, "loss": 0.4887136936187744, "mean_token_accuracy": 0.8527244672179222, "num_tokens": 1976772.0, "step": 130 }, { "entropy": 0.4222589176148176, "epoch": 0.05241191885858488, "grad_norm": 2.265625, "learning_rate": 5.775862068965518e-06, "loss": 0.45414047241210936, "mean_token_accuracy": 0.8595154702663421, "num_tokens": 2045499.0, "step": 135 }, { "entropy": 0.4219055060297251, "epoch": 0.05435310103853246, "grad_norm": 1.9765625, "learning_rate": 5.9913793103448284e-06, "loss": 0.49632959365844725, "mean_token_accuracy": 0.8565252378582955, "num_tokens": 2120623.0, "step": 140 }, { "entropy": 0.42592958733439445, "epoch": 0.056294283218480054, "grad_norm": 1.828125, "learning_rate": 6.206896551724138e-06, "loss": 0.4315296173095703, "mean_token_accuracy": 0.8575032651424408, "num_tokens": 2198334.0, "step": 145 }, { "entropy": 0.40059518739581107, "epoch": 0.058235465398427645, "grad_norm": 2.015625, "learning_rate": 6.422413793103449e-06, "loss": 0.4771871566772461, "mean_token_accuracy": 0.8683709859848022, "num_tokens": 2265434.0, "step": 150 }, { "entropy": 0.41876095086336135, "epoch": 0.06017664757837523, "grad_norm": 1.34375, "learning_rate": 6.63793103448276e-06, "loss": 0.444645357131958, "mean_token_accuracy": 0.8565793663263321, "num_tokens": 2358742.0, "step": 155 }, { "entropy": 0.4004307441413403, "epoch": 0.06211782975832282, "grad_norm": 1.8828125, "learning_rate": 6.853448275862069e-06, "loss": 0.4320836544036865, "mean_token_accuracy": 0.8631758615374565, "num_tokens": 2437938.0, "step": 160 }, { "entropy": 0.41059495508670807, "epoch": 0.0640590119382704, "grad_norm": 1.8359375, "learning_rate": 7.0689655172413796e-06, "loss": 0.450638484954834, "mean_token_accuracy": 0.8649105593562126, "num_tokens": 2506611.0, "step": 165 }, { "entropy": 0.4375029347836971, "epoch": 0.06600019411821799, "grad_norm": 1.734375, "learning_rate": 7.28448275862069e-06, "loss": 0.48232545852661135, "mean_token_accuracy": 0.8586594820022583, "num_tokens": 2585776.0, "step": 170 }, { "entropy": 0.39430369548499583, "epoch": 0.06794137629816559, "grad_norm": 1.859375, "learning_rate": 7.500000000000001e-06, "loss": 0.42594180107116697, "mean_token_accuracy": 0.8716334730386734, "num_tokens": 2649428.0, "step": 175 }, { "entropy": 0.4384324595332146, "epoch": 0.06988255847811317, "grad_norm": 1.796875, "learning_rate": 7.715517241379312e-06, "loss": 0.45015506744384765, "mean_token_accuracy": 0.8587341636419297, "num_tokens": 2741939.0, "step": 180 }, { "entropy": 0.4197473503649235, "epoch": 0.07182374065806076, "grad_norm": 1.78125, "learning_rate": 7.93103448275862e-06, "loss": 0.430635404586792, "mean_token_accuracy": 0.8667460069060325, "num_tokens": 2814243.0, "step": 185 }, { "entropy": 0.40914484262466433, "epoch": 0.07376492283800835, "grad_norm": 2.0625, "learning_rate": 8.146551724137932e-06, "loss": 0.43552379608154296, "mean_token_accuracy": 0.865776352584362, "num_tokens": 2885496.0, "step": 190 }, { "entropy": 0.4126306913793087, "epoch": 0.07570610501795594, "grad_norm": 1.953125, "learning_rate": 8.362068965517242e-06, "loss": 0.44997596740722656, "mean_token_accuracy": 0.8635187759995461, "num_tokens": 2952067.0, "step": 195 }, { "entropy": 0.39164264835417273, "epoch": 0.07764728719790352, "grad_norm": 1.8046875, "learning_rate": 8.577586206896551e-06, "loss": 0.4397777557373047, "mean_token_accuracy": 0.8692673921585083, "num_tokens": 3032945.0, "step": 200 }, { "epoch": 0.07764728719790352, "eval_entropy": 0.40086069169184285, "eval_loss": 0.42599618434906006, "eval_mean_token_accuracy": 0.8688706356933251, "eval_num_tokens": 3032945.0, "eval_runtime": 60.1253, "eval_samples_per_second": 35.742, "eval_steps_per_second": 35.742, "step": 200 }, { "entropy": 0.43272491060197354, "epoch": 0.0795884693778511, "grad_norm": 1.8828125, "learning_rate": 8.793103448275862e-06, "loss": 0.44966917037963866, "mean_token_accuracy": 0.8635564997792244, "num_tokens": 3099465.0, "step": 205 }, { "entropy": 0.4191489264369011, "epoch": 0.0815296515577987, "grad_norm": 1.734375, "learning_rate": 9.008620689655173e-06, "loss": 0.45826034545898436, "mean_token_accuracy": 0.8656236171722412, "num_tokens": 3165471.0, "step": 210 }, { "entropy": 0.3851976301521063, "epoch": 0.08347083373774629, "grad_norm": 1.9921875, "learning_rate": 9.224137931034484e-06, "loss": 0.41442441940307617, "mean_token_accuracy": 0.8754843935370445, "num_tokens": 3250159.0, "step": 215 }, { "entropy": 0.4017783209681511, "epoch": 0.08541201591769387, "grad_norm": 2.03125, "learning_rate": 9.439655172413794e-06, "loss": 0.4402902603149414, "mean_token_accuracy": 0.8709232717752456, "num_tokens": 3336018.0, "step": 220 }, { "entropy": 0.38373975045979025, "epoch": 0.08735319809764146, "grad_norm": 1.8515625, "learning_rate": 9.655172413793105e-06, "loss": 0.3947699546813965, "mean_token_accuracy": 0.8761502489447593, "num_tokens": 3402332.0, "step": 225 }, { "entropy": 0.3711710192263126, "epoch": 0.08929438027758906, "grad_norm": 1.4453125, "learning_rate": 9.870689655172414e-06, "loss": 0.38801021575927735, "mean_token_accuracy": 0.8783514246344566, "num_tokens": 3490759.0, "step": 230 }, { "entropy": 0.37838716208934786, "epoch": 0.09123556245753664, "grad_norm": 1.6875, "learning_rate": 9.999998243530697e-06, "loss": 0.4078525543212891, "mean_token_accuracy": 0.8711143419146538, "num_tokens": 3581453.0, "step": 235 }, { "entropy": 0.37663319632411, "epoch": 0.09317674463748422, "grad_norm": 2.5, "learning_rate": 9.999978483265213e-06, "loss": 0.4262491226196289, "mean_token_accuracy": 0.8772962838411331, "num_tokens": 3646168.0, "step": 240 }, { "entropy": 0.4050050787627697, "epoch": 0.09511792681743182, "grad_norm": 1.53125, "learning_rate": 9.999936767234675e-06, "loss": 0.4260216236114502, "mean_token_accuracy": 0.8673088252544403, "num_tokens": 3731064.0, "step": 245 }, { "entropy": 0.384697999432683, "epoch": 0.0970591089973794, "grad_norm": 1.9921875, "learning_rate": 9.999873095622266e-06, "loss": 0.3961009979248047, "mean_token_accuracy": 0.8749020054936409, "num_tokens": 3798798.0, "step": 250 }, { "entropy": 0.4126061208546162, "epoch": 0.09900029117732699, "grad_norm": 1.4375, "learning_rate": 9.999787468707579e-06, "loss": 0.43981060981750486, "mean_token_accuracy": 0.8663996770977974, "num_tokens": 3886350.0, "step": 255 }, { "entropy": 0.40845385044813154, "epoch": 0.10094147335727457, "grad_norm": 1.5078125, "learning_rate": 9.999679886866614e-06, "loss": 0.41130051612854, "mean_token_accuracy": 0.8713549628853798, "num_tokens": 3959484.0, "step": 260 }, { "entropy": 0.40083046182990073, "epoch": 0.10288265553722217, "grad_norm": 1.8671875, "learning_rate": 9.999550350571785e-06, "loss": 0.4313651561737061, "mean_token_accuracy": 0.8690039083361626, "num_tokens": 4039205.0, "step": 265 }, { "entropy": 0.35567491836845877, "epoch": 0.10482383771716976, "grad_norm": 1.625, "learning_rate": 9.999398860391906e-06, "loss": 0.39510302543640136, "mean_token_accuracy": 0.8858051553368569, "num_tokens": 4114526.0, "step": 270 }, { "entropy": 0.3589722190052271, "epoch": 0.10676501989711734, "grad_norm": 1.671875, "learning_rate": 9.9992254169922e-06, "loss": 0.377154541015625, "mean_token_accuracy": 0.8847770616412163, "num_tokens": 4180520.0, "step": 275 }, { "entropy": 0.37694212086498735, "epoch": 0.10870620207706493, "grad_norm": 2.078125, "learning_rate": 9.99903002113428e-06, "loss": 0.3913008213043213, "mean_token_accuracy": 0.8760873630642891, "num_tokens": 4256660.0, "step": 280 }, { "entropy": 0.43231718949973585, "epoch": 0.11064738425701252, "grad_norm": 1.59375, "learning_rate": 9.99881267367617e-06, "loss": 0.4287400722503662, "mean_token_accuracy": 0.8621754452586174, "num_tokens": 4326362.0, "step": 285 }, { "entropy": 0.4439574245363474, "epoch": 0.11258856643696011, "grad_norm": 1.765625, "learning_rate": 9.998573375572277e-06, "loss": 0.44347705841064455, "mean_token_accuracy": 0.8591988816857338, "num_tokens": 4401568.0, "step": 290 }, { "entropy": 0.4327508192509413, "epoch": 0.11452974861690769, "grad_norm": 1.53125, "learning_rate": 9.998312127873398e-06, "loss": 0.41773738861083987, "mean_token_accuracy": 0.8604527100920677, "num_tokens": 4482468.0, "step": 295 }, { "entropy": 0.39415039904415605, "epoch": 0.11647093079685529, "grad_norm": 1.734375, "learning_rate": 9.99802893172672e-06, "loss": 0.37184581756591795, "mean_token_accuracy": 0.875768692791462, "num_tokens": 4566311.0, "step": 300 }, { "entropy": 0.3908670715987682, "epoch": 0.11841211297680287, "grad_norm": 1.421875, "learning_rate": 9.997723788375803e-06, "loss": 0.4179991722106934, "mean_token_accuracy": 0.8736939936876297, "num_tokens": 4639335.0, "step": 305 }, { "entropy": 0.37057909071445466, "epoch": 0.12035329515675046, "grad_norm": 1.5703125, "learning_rate": 9.997396699160586e-06, "loss": 0.3718397855758667, "mean_token_accuracy": 0.8778384670615196, "num_tokens": 4729786.0, "step": 310 }, { "entropy": 0.3775249246507883, "epoch": 0.12229447733669804, "grad_norm": 1.25, "learning_rate": 9.997047665517373e-06, "loss": 0.36892924308776853, "mean_token_accuracy": 0.8788423538208008, "num_tokens": 4815579.0, "step": 315 }, { "entropy": 0.42527642734348775, "epoch": 0.12423565951664564, "grad_norm": 1.859375, "learning_rate": 9.996676688978832e-06, "loss": 0.4455845832824707, "mean_token_accuracy": 0.8667929217219352, "num_tokens": 4890387.0, "step": 320 }, { "entropy": 0.3964430205523968, "epoch": 0.1261768416965932, "grad_norm": 1.5859375, "learning_rate": 9.996283771173982e-06, "loss": 0.4093163967132568, "mean_token_accuracy": 0.871640557050705, "num_tokens": 4963050.0, "step": 325 }, { "entropy": 0.41448891162872314, "epoch": 0.1281180238765408, "grad_norm": 1.453125, "learning_rate": 9.995868913828198e-06, "loss": 0.4085641860961914, "mean_token_accuracy": 0.8690432503819465, "num_tokens": 5041868.0, "step": 330 }, { "entropy": 0.4093653842806816, "epoch": 0.1300592060564884, "grad_norm": 1.65625, "learning_rate": 9.995432118763182e-06, "loss": 0.4269090175628662, "mean_token_accuracy": 0.8645370990037918, "num_tokens": 5131350.0, "step": 335 }, { "entropy": 0.40297048091888427, "epoch": 0.13200038823643598, "grad_norm": 1.671875, "learning_rate": 9.994973387896983e-06, "loss": 0.4135453224182129, "mean_token_accuracy": 0.8721030279994011, "num_tokens": 5206454.0, "step": 340 }, { "entropy": 0.3713659271597862, "epoch": 0.13394157041638358, "grad_norm": 1.234375, "learning_rate": 9.994492723243965e-06, "loss": 0.38209033012390137, "mean_token_accuracy": 0.8801575794816017, "num_tokens": 5290337.0, "step": 345 }, { "entropy": 0.43902772702276704, "epoch": 0.13588275259633117, "grad_norm": 1.8046875, "learning_rate": 9.993990126914808e-06, "loss": 0.45044879913330077, "mean_token_accuracy": 0.8585825085639953, "num_tokens": 5355252.0, "step": 350 }, { "entropy": 0.403434070199728, "epoch": 0.13782393477627874, "grad_norm": 1.5859375, "learning_rate": 9.9934656011165e-06, "loss": 0.4331518650054932, "mean_token_accuracy": 0.8729702636599541, "num_tokens": 5421254.0, "step": 355 }, { "entropy": 0.3791210547089577, "epoch": 0.13976511695622634, "grad_norm": 1.5859375, "learning_rate": 9.992919148152323e-06, "loss": 0.4140181064605713, "mean_token_accuracy": 0.8779570132493972, "num_tokens": 5506667.0, "step": 360 }, { "entropy": 0.3965904530137777, "epoch": 0.14170629913617394, "grad_norm": 1.484375, "learning_rate": 9.992350770421849e-06, "loss": 0.40141940116882324, "mean_token_accuracy": 0.873149348795414, "num_tokens": 5582156.0, "step": 365 }, { "entropy": 0.3853264570236206, "epoch": 0.1436474813161215, "grad_norm": 2.015625, "learning_rate": 9.991760470420917e-06, "loss": 0.386338210105896, "mean_token_accuracy": 0.8758631706237793, "num_tokens": 5651606.0, "step": 370 }, { "entropy": 0.3922650724649429, "epoch": 0.1455886634960691, "grad_norm": 1.484375, "learning_rate": 9.99114825074164e-06, "loss": 0.41069760322570803, "mean_token_accuracy": 0.8746298983693123, "num_tokens": 5746987.0, "step": 375 }, { "entropy": 0.43557493686676024, "epoch": 0.1475298456760167, "grad_norm": 2.390625, "learning_rate": 9.990514114072379e-06, "loss": 0.44701457023620605, "mean_token_accuracy": 0.8603363439440728, "num_tokens": 5820511.0, "step": 380 }, { "entropy": 0.3883111171424389, "epoch": 0.14947102785596428, "grad_norm": 1.4921875, "learning_rate": 9.989858063197735e-06, "loss": 0.40341935157775877, "mean_token_accuracy": 0.8721037909388543, "num_tokens": 5901615.0, "step": 385 }, { "entropy": 0.39608169682323935, "epoch": 0.15141221003591188, "grad_norm": 2.21875, "learning_rate": 9.989180100998543e-06, "loss": 0.4208333492279053, "mean_token_accuracy": 0.8722649529576302, "num_tokens": 5974272.0, "step": 390 }, { "entropy": 0.37846892662346365, "epoch": 0.15335339221585945, "grad_norm": 1.40625, "learning_rate": 9.988480230451849e-06, "loss": 0.38179306983947753, "mean_token_accuracy": 0.8771921068429946, "num_tokens": 6060716.0, "step": 395 }, { "entropy": 0.37789811603724954, "epoch": 0.15529457439580704, "grad_norm": 2.03125, "learning_rate": 9.987758454630909e-06, "loss": 0.39535834789276125, "mean_token_accuracy": 0.8785205245018005, "num_tokens": 6126916.0, "step": 400 }, { "epoch": 0.15529457439580704, "eval_entropy": 0.3923966215794283, "eval_loss": 0.39556533098220825, "eval_mean_token_accuracy": 0.876580595429302, "eval_num_tokens": 6126916.0, "eval_runtime": 60.1557, "eval_samples_per_second": 35.724, "eval_steps_per_second": 35.724, "step": 400 }, { "entropy": 0.43358618319034575, "epoch": 0.15723575657575464, "grad_norm": 1.6953125, "learning_rate": 9.98701477670516e-06, "loss": 0.4627527236938477, "mean_token_accuracy": 0.8672218635678292, "num_tokens": 6191091.0, "step": 405 }, { "entropy": 0.41946529373526575, "epoch": 0.1591769387557022, "grad_norm": 1.84375, "learning_rate": 9.986249199940221e-06, "loss": 0.4011059284210205, "mean_token_accuracy": 0.8665074944496155, "num_tokens": 6271540.0, "step": 410 }, { "entropy": 0.40461285747587683, "epoch": 0.1611181209356498, "grad_norm": 1.8828125, "learning_rate": 9.985461727697873e-06, "loss": 0.4005119800567627, "mean_token_accuracy": 0.8737778559327125, "num_tokens": 6335828.0, "step": 415 }, { "entropy": 0.42256330624222754, "epoch": 0.1630593031155974, "grad_norm": 1.9140625, "learning_rate": 9.98465236343604e-06, "loss": 0.447072172164917, "mean_token_accuracy": 0.8665044084191322, "num_tokens": 6398004.0, "step": 420 }, { "entropy": 0.3777090422809124, "epoch": 0.16500048529554498, "grad_norm": 1.6953125, "learning_rate": 9.98382111070878e-06, "loss": 0.40898308753967283, "mean_token_accuracy": 0.8773329868912697, "num_tokens": 6475011.0, "step": 425 }, { "entropy": 0.3920007921755314, "epoch": 0.16694166747549258, "grad_norm": 1.9375, "learning_rate": 9.982967973166269e-06, "loss": 0.36671743392944334, "mean_token_accuracy": 0.8770380824804306, "num_tokens": 6537403.0, "step": 430 }, { "entropy": 0.38834148123860357, "epoch": 0.16888284965544018, "grad_norm": 1.5625, "learning_rate": 9.982092954554776e-06, "loss": 0.40844144821166994, "mean_token_accuracy": 0.8771779343485833, "num_tokens": 6605063.0, "step": 435 }, { "entropy": 0.3952221803367138, "epoch": 0.17082403183538775, "grad_norm": 1.5859375, "learning_rate": 9.981196058716662e-06, "loss": 0.42590937614440916, "mean_token_accuracy": 0.8744154885411263, "num_tokens": 6679167.0, "step": 440 }, { "entropy": 0.38398993872106074, "epoch": 0.17276521401533534, "grad_norm": 1.5703125, "learning_rate": 9.98027728959035e-06, "loss": 0.39303438663482665, "mean_token_accuracy": 0.8744676560163498, "num_tokens": 6761337.0, "step": 445 }, { "entropy": 0.3808287113904953, "epoch": 0.17470639619528291, "grad_norm": 1.4296875, "learning_rate": 9.979336651210314e-06, "loss": 0.3940417289733887, "mean_token_accuracy": 0.8789507359266281, "num_tokens": 6834935.0, "step": 450 }, { "entropy": 0.4267194837331772, "epoch": 0.1766475783752305, "grad_norm": 1.765625, "learning_rate": 9.978374147707055e-06, "loss": 0.4340329647064209, "mean_token_accuracy": 0.8673587426543236, "num_tokens": 6904193.0, "step": 455 }, { "entropy": 0.41968510262668135, "epoch": 0.1785887605551781, "grad_norm": 1.6640625, "learning_rate": 9.977389783307095e-06, "loss": 0.4462919235229492, "mean_token_accuracy": 0.8695808529853821, "num_tokens": 6971958.0, "step": 460 }, { "entropy": 0.38464379906654356, "epoch": 0.18052994273512568, "grad_norm": 1.4140625, "learning_rate": 9.976383562332946e-06, "loss": 0.40283498764038084, "mean_token_accuracy": 0.880063496530056, "num_tokens": 7053470.0, "step": 465 }, { "entropy": 0.364135454967618, "epoch": 0.18247112491507328, "grad_norm": 1.640625, "learning_rate": 9.975355489203097e-06, "loss": 0.4077275276184082, "mean_token_accuracy": 0.8819017142057419, "num_tokens": 7126534.0, "step": 470 }, { "entropy": 0.3730104427784681, "epoch": 0.18441230709502088, "grad_norm": 1.7421875, "learning_rate": 9.974305568431994e-06, "loss": 0.3929471969604492, "mean_token_accuracy": 0.8784888133406639, "num_tokens": 7201674.0, "step": 475 }, { "entropy": 0.3986961957067251, "epoch": 0.18635348927496845, "grad_norm": 1.5703125, "learning_rate": 9.973233804630022e-06, "loss": 0.4142603874206543, "mean_token_accuracy": 0.8733824387192726, "num_tokens": 7271021.0, "step": 480 }, { "entropy": 0.4345793057233095, "epoch": 0.18829467145491605, "grad_norm": 1.7890625, "learning_rate": 9.972140202503477e-06, "loss": 0.45402941703796384, "mean_token_accuracy": 0.8599157705903053, "num_tokens": 7347794.0, "step": 485 }, { "entropy": 0.39443473145365715, "epoch": 0.19023585363486364, "grad_norm": 1.71875, "learning_rate": 9.971024766854554e-06, "loss": 0.4239619731903076, "mean_token_accuracy": 0.8729955241084099, "num_tokens": 7430791.0, "step": 490 }, { "entropy": 0.39781664684414864, "epoch": 0.19217703581481121, "grad_norm": 1.640625, "learning_rate": 9.969887502581324e-06, "loss": 0.41582446098327636, "mean_token_accuracy": 0.8728051796555519, "num_tokens": 7509803.0, "step": 495 }, { "entropy": 0.45657297112047673, "epoch": 0.1941182179947588, "grad_norm": 1.265625, "learning_rate": 9.96872841467771e-06, "loss": 0.446823263168335, "mean_token_accuracy": 0.8597154960036277, "num_tokens": 7601591.0, "step": 500 }, { "entropy": 0.3968469314277172, "epoch": 0.19605940017470638, "grad_norm": 1.703125, "learning_rate": 9.967547508233466e-06, "loss": 0.4176668643951416, "mean_token_accuracy": 0.872602291405201, "num_tokens": 7666352.0, "step": 505 }, { "entropy": 0.41542044915258886, "epoch": 0.19800058235465398, "grad_norm": 1.515625, "learning_rate": 9.966344788434154e-06, "loss": 0.43819799423217776, "mean_token_accuracy": 0.865588866174221, "num_tokens": 7742267.0, "step": 510 }, { "entropy": 0.39156174324452875, "epoch": 0.19994176453460158, "grad_norm": 1.3125, "learning_rate": 9.965120260561126e-06, "loss": 0.39728028774261476, "mean_token_accuracy": 0.8775914892554283, "num_tokens": 7819899.0, "step": 515 }, { "entropy": 0.3775805365294218, "epoch": 0.20188294671454915, "grad_norm": 1.9375, "learning_rate": 9.963873929991492e-06, "loss": 0.4102130889892578, "mean_token_accuracy": 0.878247183561325, "num_tokens": 7887873.0, "step": 520 }, { "entropy": 0.39370285868644717, "epoch": 0.20382412889449675, "grad_norm": 1.4375, "learning_rate": 9.962605802198105e-06, "loss": 0.4014415264129639, "mean_token_accuracy": 0.8733704462647438, "num_tokens": 7961482.0, "step": 525 }, { "entropy": 0.40123398676514627, "epoch": 0.20576531107444435, "grad_norm": 1.9453125, "learning_rate": 9.961315882749531e-06, "loss": 0.42458133697509765, "mean_token_accuracy": 0.875715845823288, "num_tokens": 8022441.0, "step": 530 }, { "entropy": 0.3925070337951183, "epoch": 0.20770649325439192, "grad_norm": 1.5234375, "learning_rate": 9.960004177310029e-06, "loss": 0.38911452293396, "mean_token_accuracy": 0.8763631775975227, "num_tokens": 8098552.0, "step": 535 }, { "entropy": 0.39059548266232014, "epoch": 0.20964767543433951, "grad_norm": 1.21875, "learning_rate": 9.958670691639523e-06, "loss": 0.41231446266174315, "mean_token_accuracy": 0.8729776293039322, "num_tokens": 8193252.0, "step": 540 }, { "entropy": 0.3952123038470745, "epoch": 0.2115888576142871, "grad_norm": 2.09375, "learning_rate": 9.957315431593578e-06, "loss": 0.41542778015136717, "mean_token_accuracy": 0.8778386160731315, "num_tokens": 8259794.0, "step": 545 }, { "entropy": 0.39136257991194723, "epoch": 0.21353003979423468, "grad_norm": 1.734375, "learning_rate": 9.955938403123372e-06, "loss": 0.4131179332733154, "mean_token_accuracy": 0.8729422584176063, "num_tokens": 8325306.0, "step": 550 }, { "entropy": 0.36876106821000576, "epoch": 0.21547122197418228, "grad_norm": 2.15625, "learning_rate": 9.954539612275676e-06, "loss": 0.3939671516418457, "mean_token_accuracy": 0.8807895123958588, "num_tokens": 8403551.0, "step": 555 }, { "entropy": 0.41602297611534594, "epoch": 0.21741240415412985, "grad_norm": 1.609375, "learning_rate": 9.95311906519282e-06, "loss": 0.43773713111877444, "mean_token_accuracy": 0.8707596242427826, "num_tokens": 8474927.0, "step": 560 }, { "entropy": 0.4392675504088402, "epoch": 0.21935358633407745, "grad_norm": 1.3984375, "learning_rate": 9.951676768112673e-06, "loss": 0.43816194534301756, "mean_token_accuracy": 0.8632524207234382, "num_tokens": 8551768.0, "step": 565 }, { "entropy": 0.4103314906358719, "epoch": 0.22129476851402505, "grad_norm": 1.6953125, "learning_rate": 9.950212727368606e-06, "loss": 0.43707122802734377, "mean_token_accuracy": 0.8690274521708489, "num_tokens": 8623280.0, "step": 570 }, { "entropy": 0.35621660873293876, "epoch": 0.22323595069397262, "grad_norm": 2.390625, "learning_rate": 9.948726949389474e-06, "loss": 0.39322140216827395, "mean_token_accuracy": 0.8832426086068154, "num_tokens": 8701428.0, "step": 575 }, { "entropy": 0.40779992677271365, "epoch": 0.22517713287392022, "grad_norm": 1.6796875, "learning_rate": 9.947219440699584e-06, "loss": 0.42441439628601074, "mean_token_accuracy": 0.8702899888157845, "num_tokens": 8769812.0, "step": 580 }, { "entropy": 0.42166984751820563, "epoch": 0.22711831505386781, "grad_norm": 1.7890625, "learning_rate": 9.945690207918667e-06, "loss": 0.41438679695129393, "mean_token_accuracy": 0.8685426101088524, "num_tokens": 8850435.0, "step": 585 }, { "entropy": 0.3902070872485638, "epoch": 0.22905949723381538, "grad_norm": 1.6796875, "learning_rate": 9.944139257761845e-06, "loss": 0.3842545747756958, "mean_token_accuracy": 0.8773440137505532, "num_tokens": 8903430.0, "step": 590 }, { "entropy": 0.43535452634096145, "epoch": 0.23100067941376298, "grad_norm": 1.5703125, "learning_rate": 9.942566597039608e-06, "loss": 0.42905964851379397, "mean_token_accuracy": 0.862843619287014, "num_tokens": 8976102.0, "step": 595 }, { "entropy": 0.42619109377264974, "epoch": 0.23294186159371058, "grad_norm": 1.9140625, "learning_rate": 9.940972232657782e-06, "loss": 0.4514484882354736, "mean_token_accuracy": 0.8609629839658737, "num_tokens": 9073966.0, "step": 600 }, { "epoch": 0.23294186159371058, "eval_entropy": 0.3833293942704263, "eval_loss": 0.38786980509757996, "eval_mean_token_accuracy": 0.8783254230255413, "eval_num_tokens": 9073966.0, "eval_runtime": 60.1116, "eval_samples_per_second": 35.75, "eval_steps_per_second": 35.75, "step": 600 }, { "entropy": 0.3839044734835625, "epoch": 0.23488304377365815, "grad_norm": 2.203125, "learning_rate": 9.93935617161749e-06, "loss": 0.4186872959136963, "mean_token_accuracy": 0.8785676345229149, "num_tokens": 9145748.0, "step": 605 }, { "entropy": 0.39257055819034575, "epoch": 0.23682422595360575, "grad_norm": 1.609375, "learning_rate": 9.937718421015137e-06, "loss": 0.418427848815918, "mean_token_accuracy": 0.8697977751493454, "num_tokens": 9232179.0, "step": 610 }, { "entropy": 0.377314992621541, "epoch": 0.23876540813355335, "grad_norm": 1.875, "learning_rate": 9.936058988042367e-06, "loss": 0.4139708042144775, "mean_token_accuracy": 0.8783795028924942, "num_tokens": 9308281.0, "step": 615 }, { "entropy": 0.38419472984969616, "epoch": 0.24070659031350092, "grad_norm": 1.4453125, "learning_rate": 9.934377879986035e-06, "loss": 0.40369882583618166, "mean_token_accuracy": 0.8746202811598778, "num_tokens": 9393206.0, "step": 620 }, { "entropy": 0.3744822334498167, "epoch": 0.24264777249344852, "grad_norm": 1.734375, "learning_rate": 9.932675104228177e-06, "loss": 0.40184435844421384, "mean_token_accuracy": 0.8763082399964333, "num_tokens": 9485199.0, "step": 625 }, { "entropy": 0.42811642177402975, "epoch": 0.2445889546733961, "grad_norm": 1.6015625, "learning_rate": 9.930950668245971e-06, "loss": 0.44397845268249514, "mean_token_accuracy": 0.8665265038609504, "num_tokens": 9565581.0, "step": 630 }, { "entropy": 0.39702326618134975, "epoch": 0.24653013685334368, "grad_norm": 2.0625, "learning_rate": 9.929204579611716e-06, "loss": 0.38111956119537355, "mean_token_accuracy": 0.8766680151224137, "num_tokens": 9636638.0, "step": 635 }, { "entropy": 0.4137250851839781, "epoch": 0.24847131903329128, "grad_norm": 1.59375, "learning_rate": 9.927436845992782e-06, "loss": 0.4052375316619873, "mean_token_accuracy": 0.8680448547005654, "num_tokens": 9714886.0, "step": 640 }, { "entropy": 0.394980476424098, "epoch": 0.2504125012132389, "grad_norm": 1.21875, "learning_rate": 9.925647475151596e-06, "loss": 0.40479207038879395, "mean_token_accuracy": 0.8760687246918678, "num_tokens": 9793866.0, "step": 645 }, { "entropy": 0.3940431509166956, "epoch": 0.2523536833931864, "grad_norm": 1.5, "learning_rate": 9.923836474945592e-06, "loss": 0.3884091854095459, "mean_token_accuracy": 0.8729974597692489, "num_tokens": 9880497.0, "step": 650 }, { "entropy": 0.42443324290215967, "epoch": 0.254294865573134, "grad_norm": 1.7109375, "learning_rate": 9.92200385332718e-06, "loss": 0.4349085330963135, "mean_token_accuracy": 0.8659611865878105, "num_tokens": 9947758.0, "step": 655 }, { "entropy": 0.43983132056891916, "epoch": 0.2562360477530816, "grad_norm": 1.6875, "learning_rate": 9.92014961834372e-06, "loss": 0.46071972846984866, "mean_token_accuracy": 0.8601326540112495, "num_tokens": 10016198.0, "step": 660 }, { "entropy": 0.3981770180165768, "epoch": 0.2581772299330292, "grad_norm": 1.4765625, "learning_rate": 9.918273778137477e-06, "loss": 0.41265163421630857, "mean_token_accuracy": 0.8710227489471436, "num_tokens": 10111907.0, "step": 665 }, { "entropy": 0.41508678197860716, "epoch": 0.2601184121129768, "grad_norm": 1.6875, "learning_rate": 9.916376340945584e-06, "loss": 0.4117740631103516, "mean_token_accuracy": 0.8687305614352226, "num_tokens": 10191989.0, "step": 670 }, { "entropy": 0.4320628222078085, "epoch": 0.2620595942929244, "grad_norm": 1.2734375, "learning_rate": 9.91445731510002e-06, "loss": 0.4121531963348389, "mean_token_accuracy": 0.8719193398952484, "num_tokens": 10273505.0, "step": 675 }, { "entropy": 0.3792607393115759, "epoch": 0.26400077647287196, "grad_norm": 2.296875, "learning_rate": 9.91251670902755e-06, "loss": 0.3988708257675171, "mean_token_accuracy": 0.8824092477560044, "num_tokens": 10342792.0, "step": 680 }, { "entropy": 0.37850567921996114, "epoch": 0.26594195865281955, "grad_norm": 1.46875, "learning_rate": 9.910554531249714e-06, "loss": 0.3946362018585205, "mean_token_accuracy": 0.8791269809007645, "num_tokens": 10409365.0, "step": 685 }, { "entropy": 0.36963232718408107, "epoch": 0.26788314083276715, "grad_norm": 1.375, "learning_rate": 9.90857079038277e-06, "loss": 0.37774851322174074, "mean_token_accuracy": 0.8818046569824218, "num_tokens": 10497911.0, "step": 690 }, { "entropy": 0.3968418601900339, "epoch": 0.26982432301271475, "grad_norm": 1.765625, "learning_rate": 9.906565495137665e-06, "loss": 0.3911430835723877, "mean_token_accuracy": 0.875646622478962, "num_tokens": 10569784.0, "step": 695 }, { "entropy": 0.384658931568265, "epoch": 0.27176550519266235, "grad_norm": 1.4609375, "learning_rate": 9.904538654319998e-06, "loss": 0.4171136379241943, "mean_token_accuracy": 0.8759941428899765, "num_tokens": 10638722.0, "step": 700 }, { "entropy": 0.3830408491194248, "epoch": 0.27370668737260995, "grad_norm": 1.734375, "learning_rate": 9.90249027682997e-06, "loss": 0.43827214241027834, "mean_token_accuracy": 0.8736877083778382, "num_tokens": 10717045.0, "step": 705 }, { "entropy": 0.42484647817909715, "epoch": 0.2756478695525575, "grad_norm": 1.5234375, "learning_rate": 9.900420371662364e-06, "loss": 0.42806663513183596, "mean_token_accuracy": 0.8680253028869629, "num_tokens": 10786078.0, "step": 710 }, { "entropy": 0.36103312484920025, "epoch": 0.2775890517325051, "grad_norm": 1.453125, "learning_rate": 9.898328947906489e-06, "loss": 0.3689872741699219, "mean_token_accuracy": 0.8823491036891937, "num_tokens": 10863918.0, "step": 715 }, { "entropy": 0.4158777046948671, "epoch": 0.2795302339124527, "grad_norm": 1.6328125, "learning_rate": 9.896216014746141e-06, "loss": 0.40418004989624023, "mean_token_accuracy": 0.8711874485015869, "num_tokens": 10952093.0, "step": 720 }, { "entropy": 0.388820331171155, "epoch": 0.2814714160924003, "grad_norm": 1.609375, "learning_rate": 9.894081581459579e-06, "loss": 0.40212116241455076, "mean_token_accuracy": 0.8809416055679321, "num_tokens": 11026123.0, "step": 725 }, { "entropy": 0.4209954336285591, "epoch": 0.2834125982723479, "grad_norm": 1.6484375, "learning_rate": 9.891925657419463e-06, "loss": 0.4366124153137207, "mean_token_accuracy": 0.870456813275814, "num_tokens": 11090901.0, "step": 730 }, { "entropy": 0.40942220725119116, "epoch": 0.2853537804522954, "grad_norm": 1.3515625, "learning_rate": 9.889748252092827e-06, "loss": 0.40485477447509766, "mean_token_accuracy": 0.8676602795720101, "num_tokens": 11178026.0, "step": 735 }, { "entropy": 0.3782723072916269, "epoch": 0.287294962632243, "grad_norm": 1.671875, "learning_rate": 9.887549375041031e-06, "loss": 0.40353665351867674, "mean_token_accuracy": 0.8802446350455284, "num_tokens": 11231711.0, "step": 740 }, { "entropy": 0.39903284460306165, "epoch": 0.2892361448121906, "grad_norm": 1.734375, "learning_rate": 9.885329035919724e-06, "loss": 0.4090695858001709, "mean_token_accuracy": 0.868949045240879, "num_tokens": 11327844.0, "step": 745 }, { "entropy": 0.4031669870018959, "epoch": 0.2911773269921382, "grad_norm": 2.125, "learning_rate": 9.883087244478796e-06, "loss": 0.45818114280700684, "mean_token_accuracy": 0.8704651057720184, "num_tokens": 11401714.0, "step": 750 }, { "entropy": 0.43706582076847555, "epoch": 0.2931185091720858, "grad_norm": 1.71875, "learning_rate": 9.880824010562338e-06, "loss": 0.42615551948547364, "mean_token_accuracy": 0.860453313589096, "num_tokens": 11489610.0, "step": 755 }, { "entropy": 0.41881459429860113, "epoch": 0.2950596913520334, "grad_norm": 1.2578125, "learning_rate": 9.878539344108599e-06, "loss": 0.42671880722045896, "mean_token_accuracy": 0.8694243490695953, "num_tokens": 11574592.0, "step": 760 }, { "entropy": 0.42061977460980415, "epoch": 0.29700087353198096, "grad_norm": 1.59375, "learning_rate": 9.876233255149945e-06, "loss": 0.4513099193572998, "mean_token_accuracy": 0.8744151741266251, "num_tokens": 11641646.0, "step": 765 }, { "entropy": 0.39190227575600145, "epoch": 0.29894205571192856, "grad_norm": 1.84375, "learning_rate": 9.873905753812807e-06, "loss": 0.39388408660888674, "mean_token_accuracy": 0.8745140552520752, "num_tokens": 11714807.0, "step": 770 }, { "entropy": 0.4235379956662655, "epoch": 0.30088323789187615, "grad_norm": 1.9609375, "learning_rate": 9.871556850317641e-06, "loss": 0.45146808624267576, "mean_token_accuracy": 0.8673876538872719, "num_tokens": 11781200.0, "step": 775 }, { "entropy": 0.4271043732762337, "epoch": 0.30282442007182375, "grad_norm": 2.078125, "learning_rate": 9.86918655497889e-06, "loss": 0.41930394172668456, "mean_token_accuracy": 0.8670791104435921, "num_tokens": 11859281.0, "step": 780 }, { "entropy": 0.3889836758375168, "epoch": 0.30476560225177135, "grad_norm": 1.9921875, "learning_rate": 9.866794878204926e-06, "loss": 0.42397122383117675, "mean_token_accuracy": 0.875930380821228, "num_tokens": 11935604.0, "step": 785 }, { "entropy": 0.407536294311285, "epoch": 0.3067067844317189, "grad_norm": 1.7265625, "learning_rate": 9.864381830498013e-06, "loss": 0.4073331356048584, "mean_token_accuracy": 0.8763293012976646, "num_tokens": 12000505.0, "step": 790 }, { "entropy": 0.3989451553672552, "epoch": 0.3086479666116665, "grad_norm": 1.5703125, "learning_rate": 9.861947422454262e-06, "loss": 0.40294957160949707, "mean_token_accuracy": 0.8775883078575134, "num_tokens": 12073943.0, "step": 795 }, { "entropy": 0.3732746794819832, "epoch": 0.3105891487916141, "grad_norm": 1.7265625, "learning_rate": 9.85949166476357e-06, "loss": 0.36971125602722166, "mean_token_accuracy": 0.8844305410981178, "num_tokens": 12139003.0, "step": 800 }, { "epoch": 0.3105891487916141, "eval_entropy": 0.381350220199683, "eval_loss": 0.38283953070640564, "eval_mean_token_accuracy": 0.8796889461046488, "eval_num_tokens": 12139003.0, "eval_runtime": 60.3322, "eval_samples_per_second": 35.619, "eval_steps_per_second": 35.619, "step": 800 }, { "entropy": 0.41282732523977755, "epoch": 0.3125303309715617, "grad_norm": 1.9375, "learning_rate": 9.857014568209597e-06, "loss": 0.46287264823913576, "mean_token_accuracy": 0.8671122461557388, "num_tokens": 12216310.0, "step": 805 }, { "entropy": 0.3740253135561943, "epoch": 0.3144715131515093, "grad_norm": 2.046875, "learning_rate": 9.854516143669699e-06, "loss": 0.3972623348236084, "mean_token_accuracy": 0.8769651532173157, "num_tokens": 12284513.0, "step": 810 }, { "entropy": 0.3666670624166727, "epoch": 0.3164126953314569, "grad_norm": 1.390625, "learning_rate": 9.851996402114886e-06, "loss": 0.3955537796020508, "mean_token_accuracy": 0.8804457679390907, "num_tokens": 12376220.0, "step": 815 }, { "entropy": 0.40160666182637217, "epoch": 0.3183538775114044, "grad_norm": 1.5703125, "learning_rate": 9.849455354609777e-06, "loss": 0.41783933639526366, "mean_token_accuracy": 0.8718173667788506, "num_tokens": 12465139.0, "step": 820 }, { "entropy": 0.395163032412529, "epoch": 0.320295059691352, "grad_norm": 1.46875, "learning_rate": 9.846893012312549e-06, "loss": 0.4353921413421631, "mean_token_accuracy": 0.8741151168942451, "num_tokens": 12543594.0, "step": 825 }, { "entropy": 0.38790931962430475, "epoch": 0.3222362418712996, "grad_norm": 1.4453125, "learning_rate": 9.844309386474886e-06, "loss": 0.4091060638427734, "mean_token_accuracy": 0.8755813196301461, "num_tokens": 12633984.0, "step": 830 }, { "entropy": 0.35505941733717916, "epoch": 0.3241774240512472, "grad_norm": 2.15625, "learning_rate": 9.841704488441934e-06, "loss": 0.34843366146087645, "mean_token_accuracy": 0.8839788928627967, "num_tokens": 12696787.0, "step": 835 }, { "entropy": 0.4340564154088497, "epoch": 0.3261186062311948, "grad_norm": 1.8125, "learning_rate": 9.83907832965225e-06, "loss": 0.4248070240020752, "mean_token_accuracy": 0.8684971421957016, "num_tokens": 12758495.0, "step": 840 }, { "entropy": 0.4072086077183485, "epoch": 0.32805978841114236, "grad_norm": 1.609375, "learning_rate": 9.836430921637746e-06, "loss": 0.4239677906036377, "mean_token_accuracy": 0.871665708720684, "num_tokens": 12824603.0, "step": 845 }, { "entropy": 0.4058460295200348, "epoch": 0.33000097059108996, "grad_norm": 1.6328125, "learning_rate": 9.833762276023646e-06, "loss": 0.437244176864624, "mean_token_accuracy": 0.8710885986685752, "num_tokens": 12915482.0, "step": 850 }, { "entropy": 0.4212725304067135, "epoch": 0.33194215277103756, "grad_norm": 1.703125, "learning_rate": 9.831072404528433e-06, "loss": 0.4174651622772217, "mean_token_accuracy": 0.8688189521431923, "num_tokens": 12993367.0, "step": 855 }, { "entropy": 0.3895297344774008, "epoch": 0.33388333495098516, "grad_norm": 1.671875, "learning_rate": 9.828361318963794e-06, "loss": 0.39536495208740235, "mean_token_accuracy": 0.8768842920660973, "num_tokens": 13065810.0, "step": 860 }, { "entropy": 0.4025235269218683, "epoch": 0.33582451713093275, "grad_norm": 1.9609375, "learning_rate": 9.825629031234574e-06, "loss": 0.37229766845703127, "mean_token_accuracy": 0.8749150916934013, "num_tokens": 13133471.0, "step": 865 }, { "entropy": 0.3760003004223108, "epoch": 0.33776569931088035, "grad_norm": 1.453125, "learning_rate": 9.822875553338715e-06, "loss": 0.3896082639694214, "mean_token_accuracy": 0.8785295352339745, "num_tokens": 13211617.0, "step": 870 }, { "entropy": 0.3616090904921293, "epoch": 0.3397068814908279, "grad_norm": 1.546875, "learning_rate": 9.820100897367214e-06, "loss": 0.3726183891296387, "mean_token_accuracy": 0.8849639266729354, "num_tokens": 13286555.0, "step": 875 }, { "entropy": 0.352078976854682, "epoch": 0.3416480636707755, "grad_norm": 1.8125, "learning_rate": 9.81730507550406e-06, "loss": 0.3919616937637329, "mean_token_accuracy": 0.8840990662574768, "num_tokens": 13355200.0, "step": 880 }, { "entropy": 0.38022752702236173, "epoch": 0.3435892458507231, "grad_norm": 1.5625, "learning_rate": 9.81448810002619e-06, "loss": 0.40322446823120117, "mean_token_accuracy": 0.8776090621948243, "num_tokens": 13427836.0, "step": 885 }, { "entropy": 0.4132396575063467, "epoch": 0.3455304280306707, "grad_norm": 1.4296875, "learning_rate": 9.811649983303425e-06, "loss": 0.4324185371398926, "mean_token_accuracy": 0.8720195293426514, "num_tokens": 13496990.0, "step": 890 }, { "entropy": 0.36122960932552817, "epoch": 0.3474716102106183, "grad_norm": 1.765625, "learning_rate": 9.808790737798426e-06, "loss": 0.39123167991638186, "mean_token_accuracy": 0.8842917993664742, "num_tokens": 13572848.0, "step": 895 }, { "entropy": 0.3831039108335972, "epoch": 0.34941279239056583, "grad_norm": 1.8984375, "learning_rate": 9.805910376066631e-06, "loss": 0.37870833873748777, "mean_token_accuracy": 0.874284490942955, "num_tokens": 13652434.0, "step": 900 }, { "entropy": 0.40437927283346653, "epoch": 0.3513539745705134, "grad_norm": 1.875, "learning_rate": 9.803008910756203e-06, "loss": 0.4461234092712402, "mean_token_accuracy": 0.8702120751142501, "num_tokens": 13730423.0, "step": 905 }, { "entropy": 0.4021365400403738, "epoch": 0.353295156750461, "grad_norm": 1.84375, "learning_rate": 9.800086354607975e-06, "loss": 0.4367063999176025, "mean_token_accuracy": 0.8725798889994621, "num_tokens": 13794147.0, "step": 910 }, { "entropy": 0.36978473588824273, "epoch": 0.3552363389304086, "grad_norm": 1.6015625, "learning_rate": 9.797142720455391e-06, "loss": 0.3837603569030762, "mean_token_accuracy": 0.882180480659008, "num_tokens": 13874189.0, "step": 915 }, { "entropy": 0.40101403892040255, "epoch": 0.3571775211103562, "grad_norm": 1.6953125, "learning_rate": 9.794178021224459e-06, "loss": 0.4223616123199463, "mean_token_accuracy": 0.8726651340723037, "num_tokens": 13945289.0, "step": 920 }, { "entropy": 0.35331339165568354, "epoch": 0.3591187032903038, "grad_norm": 1.6953125, "learning_rate": 9.79119226993368e-06, "loss": 0.4017838478088379, "mean_token_accuracy": 0.8861800834536553, "num_tokens": 14028418.0, "step": 925 }, { "entropy": 0.3538735806941986, "epoch": 0.36105988547025136, "grad_norm": 1.671875, "learning_rate": 9.788185479694004e-06, "loss": 0.382387375831604, "mean_token_accuracy": 0.8858973324298859, "num_tokens": 14098572.0, "step": 930 }, { "entropy": 0.3838920842856169, "epoch": 0.36300106765019896, "grad_norm": 1.515625, "learning_rate": 9.785157663708761e-06, "loss": 0.37942454814910886, "mean_token_accuracy": 0.8802381858229638, "num_tokens": 14180314.0, "step": 935 }, { "entropy": 0.41736325100064275, "epoch": 0.36494224983014656, "grad_norm": 2.40625, "learning_rate": 9.782108835273612e-06, "loss": 0.42386960983276367, "mean_token_accuracy": 0.8687801375985146, "num_tokens": 14255569.0, "step": 940 }, { "entropy": 0.3824776504188776, "epoch": 0.36688343201009416, "grad_norm": 1.7109375, "learning_rate": 9.779039007776487e-06, "loss": 0.39833407402038573, "mean_token_accuracy": 0.8781590893864631, "num_tokens": 14323554.0, "step": 945 }, { "entropy": 0.34414222836494446, "epoch": 0.36882461419004176, "grad_norm": 1.609375, "learning_rate": 9.775948194697528e-06, "loss": 0.3766109704971313, "mean_token_accuracy": 0.8860784068703651, "num_tokens": 14404112.0, "step": 950 }, { "entropy": 0.3819138675928116, "epoch": 0.3707657963699893, "grad_norm": 1.7265625, "learning_rate": 9.772836409609025e-06, "loss": 0.4195257663726807, "mean_token_accuracy": 0.8752670779824256, "num_tokens": 14493874.0, "step": 955 }, { "entropy": 0.3910291727632284, "epoch": 0.3727069785499369, "grad_norm": 1.5390625, "learning_rate": 9.76970366617536e-06, "loss": 0.42772369384765624, "mean_token_accuracy": 0.8762600779533386, "num_tokens": 14563770.0, "step": 960 }, { "entropy": 0.39120335690677166, "epoch": 0.3746481607298845, "grad_norm": 1.3828125, "learning_rate": 9.76654997815295e-06, "loss": 0.4206050395965576, "mean_token_accuracy": 0.8764881610870361, "num_tokens": 14649946.0, "step": 965 }, { "entropy": 0.37538095470517874, "epoch": 0.3765893429098321, "grad_norm": 1.2421875, "learning_rate": 9.763375359390181e-06, "loss": 0.40073623657226565, "mean_token_accuracy": 0.87882329672575, "num_tokens": 14732764.0, "step": 970 }, { "entropy": 0.4004466250538826, "epoch": 0.3785305250897797, "grad_norm": 1.34375, "learning_rate": 9.760179823827347e-06, "loss": 0.41023030281066897, "mean_token_accuracy": 0.8705371245741844, "num_tokens": 14814893.0, "step": 975 }, { "entropy": 0.4013238321989775, "epoch": 0.3804717072697273, "grad_norm": 1.7265625, "learning_rate": 9.756963385496599e-06, "loss": 0.44197940826416016, "mean_token_accuracy": 0.8728833734989166, "num_tokens": 14893153.0, "step": 980 }, { "entropy": 0.38930566757917406, "epoch": 0.38241288944967483, "grad_norm": 1.4921875, "learning_rate": 9.753726058521868e-06, "loss": 0.40584554672241213, "mean_token_accuracy": 0.8758206337690353, "num_tokens": 14968880.0, "step": 985 }, { "entropy": 0.4486214961856604, "epoch": 0.38435407162962243, "grad_norm": 1.71875, "learning_rate": 9.750467857118811e-06, "loss": 0.4342947959899902, "mean_token_accuracy": 0.8598737180233001, "num_tokens": 15055751.0, "step": 990 }, { "entropy": 0.3913938831537962, "epoch": 0.38629525380957, "grad_norm": 1.796875, "learning_rate": 9.747188795594755e-06, "loss": 0.43964052200317383, "mean_token_accuracy": 0.8771824568510056, "num_tokens": 15129042.0, "step": 995 }, { "entropy": 0.3813592415302992, "epoch": 0.3882364359895176, "grad_norm": 1.578125, "learning_rate": 9.743888888348618e-06, "loss": 0.3965798854827881, "mean_token_accuracy": 0.8770193979144096, "num_tokens": 15213724.0, "step": 1000 }, { "epoch": 0.3882364359895176, "eval_entropy": 0.3787456456798906, "eval_loss": 0.3797132670879364, "eval_mean_token_accuracy": 0.8805272205589316, "eval_num_tokens": 15213724.0, "eval_runtime": 60.2207, "eval_samples_per_second": 35.685, "eval_steps_per_second": 35.685, "step": 1000 }, { "entropy": 0.36478537507355213, "epoch": 0.3901776181694652, "grad_norm": 1.6015625, "learning_rate": 9.740568149870864e-06, "loss": 0.38557422161102295, "mean_token_accuracy": 0.8830441504716873, "num_tokens": 15293346.0, "step": 1005 }, { "entropy": 0.3718357976526022, "epoch": 0.39211880034941277, "grad_norm": 1.3203125, "learning_rate": 9.737226594743425e-06, "loss": 0.41036291122436525, "mean_token_accuracy": 0.8785412311553955, "num_tokens": 15381773.0, "step": 1010 }, { "entropy": 0.3791601274162531, "epoch": 0.39405998252936036, "grad_norm": 1.4140625, "learning_rate": 9.733864237639645e-06, "loss": 0.39489567279815674, "mean_token_accuracy": 0.8787289649248123, "num_tokens": 15463663.0, "step": 1015 }, { "entropy": 0.40409386418759824, "epoch": 0.39600116470930796, "grad_norm": 1.5, "learning_rate": 9.730481093324209e-06, "loss": 0.39559972286224365, "mean_token_accuracy": 0.873577019572258, "num_tokens": 15545078.0, "step": 1020 }, { "entropy": 0.38083929792046545, "epoch": 0.39794234688925556, "grad_norm": 1.3359375, "learning_rate": 9.72707717665309e-06, "loss": 0.3976888179779053, "mean_token_accuracy": 0.876065094769001, "num_tokens": 15634558.0, "step": 1025 }, { "entropy": 0.39706564620137214, "epoch": 0.39988352906920316, "grad_norm": 1.96875, "learning_rate": 9.723652502573465e-06, "loss": 0.39628422260284424, "mean_token_accuracy": 0.8746212273836136, "num_tokens": 15706364.0, "step": 1030 }, { "entropy": 0.4297271855175495, "epoch": 0.40182471124915076, "grad_norm": 1.8203125, "learning_rate": 9.720207086123674e-06, "loss": 0.44258790016174315, "mean_token_accuracy": 0.8682567358016968, "num_tokens": 15786703.0, "step": 1035 }, { "entropy": 0.37680495008826254, "epoch": 0.4037658934290983, "grad_norm": 1.34375, "learning_rate": 9.716740942433127e-06, "loss": 0.3932778358459473, "mean_token_accuracy": 0.8771912530064583, "num_tokens": 15883251.0, "step": 1040 }, { "entropy": 0.4019945695996284, "epoch": 0.4057070756090459, "grad_norm": 1.359375, "learning_rate": 9.713254086722259e-06, "loss": 0.39323198795318604, "mean_token_accuracy": 0.8740618228912354, "num_tokens": 15962211.0, "step": 1045 }, { "entropy": 0.3781829860061407, "epoch": 0.4076482577889935, "grad_norm": 1.4921875, "learning_rate": 9.709746534302453e-06, "loss": 0.4020181655883789, "mean_token_accuracy": 0.8759587749838829, "num_tokens": 16040844.0, "step": 1050 }, { "entropy": 0.35617672353982927, "epoch": 0.4095894399689411, "grad_norm": 1.453125, "learning_rate": 9.706218300575975e-06, "loss": 0.3751538276672363, "mean_token_accuracy": 0.8814701676368714, "num_tokens": 16117566.0, "step": 1055 }, { "entropy": 0.39271861389279367, "epoch": 0.4115306221488887, "grad_norm": 1.7265625, "learning_rate": 9.702669401035904e-06, "loss": 0.3784984827041626, "mean_token_accuracy": 0.877305480837822, "num_tokens": 16197812.0, "step": 1060 }, { "entropy": 0.3672247972339392, "epoch": 0.41347180432883623, "grad_norm": 2.15625, "learning_rate": 9.699099851266071e-06, "loss": 0.3595015525817871, "mean_token_accuracy": 0.8823614567518234, "num_tokens": 16275198.0, "step": 1065 }, { "entropy": 0.369810114428401, "epoch": 0.41541298650878383, "grad_norm": 1.359375, "learning_rate": 9.695509666940978e-06, "loss": 0.405411958694458, "mean_token_accuracy": 0.8822488501667977, "num_tokens": 16350965.0, "step": 1070 }, { "entropy": 0.3897046368569136, "epoch": 0.41735416868873143, "grad_norm": 1.4609375, "learning_rate": 9.691898863825749e-06, "loss": 0.38735527992248536, "mean_token_accuracy": 0.8744328498840332, "num_tokens": 16436304.0, "step": 1075 }, { "entropy": 0.3776374412700534, "epoch": 0.41929535086867903, "grad_norm": 1.7890625, "learning_rate": 9.688267457776032e-06, "loss": 0.39870805740356446, "mean_token_accuracy": 0.8785615637898445, "num_tokens": 16501266.0, "step": 1080 }, { "entropy": 0.38694494068622587, "epoch": 0.4212365330486266, "grad_norm": 1.59375, "learning_rate": 9.684615464737964e-06, "loss": 0.39393253326416017, "mean_token_accuracy": 0.8791399031877518, "num_tokens": 16560531.0, "step": 1085 }, { "entropy": 0.38168427646160125, "epoch": 0.4231777152285742, "grad_norm": 1.6796875, "learning_rate": 9.680942900748067e-06, "loss": 0.4125086784362793, "mean_token_accuracy": 0.878045716881752, "num_tokens": 16627433.0, "step": 1090 }, { "entropy": 0.3892548579722643, "epoch": 0.42511889740852177, "grad_norm": 1.359375, "learning_rate": 9.677249781933205e-06, "loss": 0.40183658599853517, "mean_token_accuracy": 0.8756108567118644, "num_tokens": 16731006.0, "step": 1095 }, { "entropy": 0.3736507400870323, "epoch": 0.42706007958846937, "grad_norm": 2.28125, "learning_rate": 9.673536124510496e-06, "loss": 0.40765180587768557, "mean_token_accuracy": 0.8797045171260833, "num_tokens": 16801170.0, "step": 1100 }, { "entropy": 0.3525055509060621, "epoch": 0.42900126176841696, "grad_norm": 1.3984375, "learning_rate": 9.669801944787249e-06, "loss": 0.3724426031112671, "mean_token_accuracy": 0.8849082082509995, "num_tokens": 16881282.0, "step": 1105 }, { "entropy": 0.39476602226495744, "epoch": 0.43094244394836456, "grad_norm": 1.3125, "learning_rate": 9.66604725916089e-06, "loss": 0.3921769380569458, "mean_token_accuracy": 0.8737779542803764, "num_tokens": 16965112.0, "step": 1110 }, { "entropy": 0.38296737633645533, "epoch": 0.43288362612831216, "grad_norm": 1.65625, "learning_rate": 9.662272084118887e-06, "loss": 0.39906389713287355, "mean_token_accuracy": 0.8742195263504982, "num_tokens": 17033565.0, "step": 1115 }, { "entropy": 0.4264007180929184, "epoch": 0.4348248083082597, "grad_norm": 1.7109375, "learning_rate": 9.658476436238683e-06, "loss": 0.4375418186187744, "mean_token_accuracy": 0.8661627262830734, "num_tokens": 17110727.0, "step": 1120 }, { "entropy": 0.3653211809694767, "epoch": 0.4367659904882073, "grad_norm": 1.390625, "learning_rate": 9.654660332187621e-06, "loss": 0.3593518972396851, "mean_token_accuracy": 0.882878914475441, "num_tokens": 17189104.0, "step": 1125 }, { "entropy": 0.33642441034317017, "epoch": 0.4387071726681549, "grad_norm": 1.546875, "learning_rate": 9.65082378872287e-06, "loss": 0.36505885124206544, "mean_token_accuracy": 0.8923633351922036, "num_tokens": 17256636.0, "step": 1130 }, { "entropy": 0.3748783551156521, "epoch": 0.4406483548481025, "grad_norm": 1.6484375, "learning_rate": 9.646966822691351e-06, "loss": 0.4033698558807373, "mean_token_accuracy": 0.8803606644272804, "num_tokens": 17333913.0, "step": 1135 }, { "entropy": 0.3871772147715092, "epoch": 0.4425895370280501, "grad_norm": 1.484375, "learning_rate": 9.643089451029666e-06, "loss": 0.39040231704711914, "mean_token_accuracy": 0.8764987051486969, "num_tokens": 17402674.0, "step": 1140 }, { "entropy": 0.4355839218944311, "epoch": 0.4445307192079977, "grad_norm": 1.9609375, "learning_rate": 9.639191690764018e-06, "loss": 0.40796470642089844, "mean_token_accuracy": 0.8702881962060929, "num_tokens": 17476753.0, "step": 1145 }, { "entropy": 0.37759856693446636, "epoch": 0.44647190138794524, "grad_norm": 1.3515625, "learning_rate": 9.635273559010148e-06, "loss": 0.38673570156097414, "mean_token_accuracy": 0.8773683786392212, "num_tokens": 17554472.0, "step": 1150 }, { "entropy": 0.3581284359097481, "epoch": 0.44841308356789283, "grad_norm": 1.390625, "learning_rate": 9.63133507297324e-06, "loss": 0.37009692192077637, "mean_token_accuracy": 0.8856742799282074, "num_tokens": 17637236.0, "step": 1155 }, { "entropy": 0.47716011516749857, "epoch": 0.45035426574784043, "grad_norm": 1.84375, "learning_rate": 9.627376249947866e-06, "loss": 0.491148042678833, "mean_token_accuracy": 0.8575920403003693, "num_tokens": 17714880.0, "step": 1160 }, { "entropy": 0.38734299391508104, "epoch": 0.45229544792778803, "grad_norm": 1.7734375, "learning_rate": 9.623397107317897e-06, "loss": 0.4355041980743408, "mean_token_accuracy": 0.8793953686952591, "num_tokens": 17786198.0, "step": 1165 }, { "entropy": 0.3389087375253439, "epoch": 0.45423663010773563, "grad_norm": 1.5625, "learning_rate": 9.619397662556434e-06, "loss": 0.35656213760375977, "mean_token_accuracy": 0.8894602239131928, "num_tokens": 17851917.0, "step": 1170 }, { "entropy": 0.3687282849103212, "epoch": 0.4561778122876832, "grad_norm": 1.5859375, "learning_rate": 9.615377933225727e-06, "loss": 0.4001771450042725, "mean_token_accuracy": 0.8769694566726685, "num_tokens": 17922400.0, "step": 1175 }, { "entropy": 0.37209414653480055, "epoch": 0.45811899446763077, "grad_norm": 1.3984375, "learning_rate": 9.611337936977096e-06, "loss": 0.39912428855896, "mean_token_accuracy": 0.8808334246277809, "num_tokens": 17996715.0, "step": 1180 }, { "entropy": 0.38213921934366224, "epoch": 0.46006017664757837, "grad_norm": 1.390625, "learning_rate": 9.607277691550862e-06, "loss": 0.41675233840942383, "mean_token_accuracy": 0.8742595329880715, "num_tokens": 18084477.0, "step": 1185 }, { "entropy": 0.39352951049804685, "epoch": 0.46200135882752597, "grad_norm": 2.3125, "learning_rate": 9.60319721477626e-06, "loss": 0.4071157932281494, "mean_token_accuracy": 0.8733987167477608, "num_tokens": 18152478.0, "step": 1190 }, { "entropy": 0.392200979962945, "epoch": 0.46394254100747356, "grad_norm": 1.5390625, "learning_rate": 9.59909652457136e-06, "loss": 0.4249094486236572, "mean_token_accuracy": 0.8737818524241447, "num_tokens": 18219295.0, "step": 1195 }, { "entropy": 0.35255367681384087, "epoch": 0.46588372318742116, "grad_norm": 1.609375, "learning_rate": 9.594975638943006e-06, "loss": 0.3529276132583618, "mean_token_accuracy": 0.8894811898469925, "num_tokens": 18289986.0, "step": 1200 }, { "epoch": 0.46588372318742116, "eval_entropy": 0.3770919001001211, "eval_loss": 0.37714096903800964, "eval_mean_token_accuracy": 0.8812256991003435, "eval_num_tokens": 18289986.0, "eval_runtime": 60.2886, "eval_samples_per_second": 35.645, "eval_steps_per_second": 35.645, "step": 1200 }, { "entropy": 0.38377687335014343, "epoch": 0.4678249053673687, "grad_norm": 1.9140625, "learning_rate": 9.59083457598671e-06, "loss": 0.4212928771972656, "mean_token_accuracy": 0.8758307337760926, "num_tokens": 18359707.0, "step": 1205 }, { "entropy": 0.3596473693847656, "epoch": 0.4697660875473163, "grad_norm": 1.4375, "learning_rate": 9.586673353886591e-06, "loss": 0.3813552141189575, "mean_token_accuracy": 0.8841730430722237, "num_tokens": 18435661.0, "step": 1210 }, { "entropy": 0.4140443943440914, "epoch": 0.4717072697272639, "grad_norm": 1.46875, "learning_rate": 9.582491990915292e-06, "loss": 0.4197361469268799, "mean_token_accuracy": 0.8726279020309449, "num_tokens": 18509573.0, "step": 1215 }, { "entropy": 0.3907853942364454, "epoch": 0.4736484519072115, "grad_norm": 1.625, "learning_rate": 9.578290505433896e-06, "loss": 0.4191273212432861, "mean_token_accuracy": 0.8725411191582679, "num_tokens": 18608110.0, "step": 1220 }, { "entropy": 0.3810266986489296, "epoch": 0.4755896340871591, "grad_norm": 1.4296875, "learning_rate": 9.57406891589185e-06, "loss": 0.38829352855682375, "mean_token_accuracy": 0.8763051420450211, "num_tokens": 18689794.0, "step": 1225 }, { "entropy": 0.420042909681797, "epoch": 0.4775308162671067, "grad_norm": 1.703125, "learning_rate": 9.569827240826876e-06, "loss": 0.40959844589233396, "mean_token_accuracy": 0.8703769713640213, "num_tokens": 18758060.0, "step": 1230 }, { "entropy": 0.36773351952433586, "epoch": 0.47947199844705424, "grad_norm": 2.328125, "learning_rate": 9.565565498864902e-06, "loss": 0.3752429962158203, "mean_token_accuracy": 0.8811664238572121, "num_tokens": 18824217.0, "step": 1235 }, { "entropy": 0.40542583018541334, "epoch": 0.48141318062700184, "grad_norm": 1.3515625, "learning_rate": 9.561283708719968e-06, "loss": 0.4128578662872314, "mean_token_accuracy": 0.8699263706803322, "num_tokens": 18910883.0, "step": 1240 }, { "entropy": 0.37737762071192266, "epoch": 0.48335436280694943, "grad_norm": 1.5234375, "learning_rate": 9.55698188919415e-06, "loss": 0.39167325496673583, "mean_token_accuracy": 0.879218578338623, "num_tokens": 18993242.0, "step": 1245 }, { "entropy": 0.38517537601292134, "epoch": 0.48529554498689703, "grad_norm": 1.921875, "learning_rate": 9.552660059177477e-06, "loss": 0.38378689289093015, "mean_token_accuracy": 0.879303851723671, "num_tokens": 19061334.0, "step": 1250 }, { "entropy": 0.37100368924438953, "epoch": 0.48723672716684463, "grad_norm": 1.828125, "learning_rate": 9.548318237647849e-06, "loss": 0.4200906753540039, "mean_token_accuracy": 0.8786957338452339, "num_tokens": 19128848.0, "step": 1255 }, { "entropy": 0.38898602277040484, "epoch": 0.4891779093467922, "grad_norm": 1.46875, "learning_rate": 9.543956443670947e-06, "loss": 0.4141817569732666, "mean_token_accuracy": 0.8764576897025108, "num_tokens": 19202756.0, "step": 1260 }, { "entropy": 0.3760422389954329, "epoch": 0.49111909152673977, "grad_norm": 2.109375, "learning_rate": 9.539574696400165e-06, "loss": 0.3719266653060913, "mean_token_accuracy": 0.8817565947771072, "num_tokens": 19268958.0, "step": 1265 }, { "entropy": 0.36780366376042367, "epoch": 0.49306027370668737, "grad_norm": 1.8203125, "learning_rate": 9.535173015076501e-06, "loss": 0.39360432624816893, "mean_token_accuracy": 0.8779341161251069, "num_tokens": 19352111.0, "step": 1270 }, { "entropy": 0.4413019739091396, "epoch": 0.49500145588663497, "grad_norm": 1.7109375, "learning_rate": 9.5307514190285e-06, "loss": 0.4407999515533447, "mean_token_accuracy": 0.8592960745096206, "num_tokens": 19441509.0, "step": 1275 }, { "entropy": 0.39783000349998476, "epoch": 0.49694263806658256, "grad_norm": 1.234375, "learning_rate": 9.526309927672148e-06, "loss": 0.42558717727661133, "mean_token_accuracy": 0.8755456551909446, "num_tokens": 19525598.0, "step": 1280 }, { "entropy": 0.36931919269263747, "epoch": 0.49888382024653016, "grad_norm": 1.3984375, "learning_rate": 9.521848560510796e-06, "loss": 0.38771824836730956, "mean_token_accuracy": 0.8801409855484963, "num_tokens": 19612547.0, "step": 1285 }, { "entropy": 0.41849659457802774, "epoch": 0.5008250024264778, "grad_norm": 1.6953125, "learning_rate": 9.517367337135076e-06, "loss": 0.43532710075378417, "mean_token_accuracy": 0.8676797851920128, "num_tokens": 19689731.0, "step": 1290 }, { "entropy": 0.41437376402318477, "epoch": 0.5027661846064253, "grad_norm": 1.6796875, "learning_rate": 9.51286627722281e-06, "loss": 0.4385324478149414, "mean_token_accuracy": 0.8706857517361641, "num_tokens": 19756661.0, "step": 1295 }, { "entropy": 0.4086436625570059, "epoch": 0.5047073667863728, "grad_norm": 1.6171875, "learning_rate": 9.508345400538926e-06, "loss": 0.4336398124694824, "mean_token_accuracy": 0.8683807790279389, "num_tokens": 19821704.0, "step": 1300 }, { "entropy": 0.3883740194141865, "epoch": 0.5066485489663205, "grad_norm": 1.3828125, "learning_rate": 9.503804726935369e-06, "loss": 0.39049382209777833, "mean_token_accuracy": 0.8757125899195671, "num_tokens": 19902414.0, "step": 1305 }, { "entropy": 0.39095442183315754, "epoch": 0.508589731146268, "grad_norm": 1.6640625, "learning_rate": 9.499244276351019e-06, "loss": 0.38634843826293946, "mean_token_accuracy": 0.8751242905855179, "num_tokens": 19984638.0, "step": 1310 }, { "entropy": 0.3666827451437712, "epoch": 0.5105309133262157, "grad_norm": 1.6171875, "learning_rate": 9.494664068811597e-06, "loss": 0.40018815994262696, "mean_token_accuracy": 0.880105035007, "num_tokens": 20064261.0, "step": 1315 }, { "entropy": 0.34377430453896524, "epoch": 0.5124720955061632, "grad_norm": 1.8046875, "learning_rate": 9.490064124429584e-06, "loss": 0.36567790508270265, "mean_token_accuracy": 0.8907153263688088, "num_tokens": 20126962.0, "step": 1320 }, { "entropy": 0.3555528115481138, "epoch": 0.5144132776861109, "grad_norm": 1.3046875, "learning_rate": 9.485444463404125e-06, "loss": 0.3725638151168823, "mean_token_accuracy": 0.8841280445456505, "num_tokens": 20215463.0, "step": 1325 }, { "entropy": 0.3422409202903509, "epoch": 0.5163544598660584, "grad_norm": 1.53125, "learning_rate": 9.480805106020947e-06, "loss": 0.3722813129425049, "mean_token_accuracy": 0.8891440883278847, "num_tokens": 20298577.0, "step": 1330 }, { "entropy": 0.3823659881949425, "epoch": 0.518295642046006, "grad_norm": 1.5234375, "learning_rate": 9.476146072652262e-06, "loss": 0.39306447505950926, "mean_token_accuracy": 0.876395545899868, "num_tokens": 20374990.0, "step": 1335 }, { "entropy": 0.388584029302001, "epoch": 0.5202368242259536, "grad_norm": 1.5, "learning_rate": 9.471467383756692e-06, "loss": 0.41069755554199217, "mean_token_accuracy": 0.8789796933531762, "num_tokens": 20446548.0, "step": 1340 }, { "entropy": 0.36739424169063567, "epoch": 0.5221780064059012, "grad_norm": 1.8671875, "learning_rate": 9.46676905987916e-06, "loss": 0.3884859085083008, "mean_token_accuracy": 0.8806875750422478, "num_tokens": 20517149.0, "step": 1345 }, { "entropy": 0.35578424148261545, "epoch": 0.5241191885858488, "grad_norm": 1.5, "learning_rate": 9.462051121650816e-06, "loss": 0.3846778869628906, "mean_token_accuracy": 0.8805378764867783, "num_tokens": 20596629.0, "step": 1350 }, { "entropy": 0.375444458052516, "epoch": 0.5260603707657964, "grad_norm": 1.8984375, "learning_rate": 9.457313589788937e-06, "loss": 0.40492801666259765, "mean_token_accuracy": 0.8799885243177414, "num_tokens": 20660631.0, "step": 1355 }, { "entropy": 0.38559874445199965, "epoch": 0.5280015529457439, "grad_norm": 1.984375, "learning_rate": 9.452556485096839e-06, "loss": 0.4140150547027588, "mean_token_accuracy": 0.8767204716801643, "num_tokens": 20723882.0, "step": 1360 }, { "entropy": 0.40360996387898923, "epoch": 0.5299427351256916, "grad_norm": 1.5234375, "learning_rate": 9.447779828463788e-06, "loss": 0.38798012733459475, "mean_token_accuracy": 0.8741889104247094, "num_tokens": 20801012.0, "step": 1365 }, { "entropy": 0.3853685542941093, "epoch": 0.5318839173056391, "grad_norm": 1.4140625, "learning_rate": 9.442983640864904e-06, "loss": 0.39840006828308105, "mean_token_accuracy": 0.8812712132930756, "num_tokens": 20870494.0, "step": 1370 }, { "entropy": 0.41988850980997083, "epoch": 0.5338250994855868, "grad_norm": 1.859375, "learning_rate": 9.43816794336107e-06, "loss": 0.42660999298095703, "mean_token_accuracy": 0.8691768750548363, "num_tokens": 20932159.0, "step": 1375 }, { "entropy": 0.35770875252783296, "epoch": 0.5357662816655343, "grad_norm": 1.3203125, "learning_rate": 9.433332757098844e-06, "loss": 0.35865347385406493, "mean_token_accuracy": 0.8853553980588913, "num_tokens": 21012568.0, "step": 1380 }, { "entropy": 0.3706452056765556, "epoch": 0.5377074638454818, "grad_norm": 1.734375, "learning_rate": 9.428478103310358e-06, "loss": 0.40013108253479, "mean_token_accuracy": 0.8823366552591324, "num_tokens": 21083094.0, "step": 1385 }, { "entropy": 0.39769635573029516, "epoch": 0.5396486460254295, "grad_norm": 1.390625, "learning_rate": 9.423604003313232e-06, "loss": 0.4011887550354004, "mean_token_accuracy": 0.8764868810772896, "num_tokens": 21161361.0, "step": 1390 }, { "entropy": 0.39253461360931396, "epoch": 0.541589828205377, "grad_norm": 1.8828125, "learning_rate": 9.418710478510478e-06, "loss": 0.41046462059020994, "mean_token_accuracy": 0.878234452009201, "num_tokens": 21225113.0, "step": 1395 }, { "entropy": 0.3744351703673601, "epoch": 0.5435310103853247, "grad_norm": 1.5625, "learning_rate": 9.413797550390403e-06, "loss": 0.37674736976623535, "mean_token_accuracy": 0.8828163802623749, "num_tokens": 21295691.0, "step": 1400 }, { "epoch": 0.5435310103853247, "eval_entropy": 0.3686942668842293, "eval_loss": 0.3755117952823639, "eval_mean_token_accuracy": 0.8816876367060402, "eval_num_tokens": 21295691.0, "eval_runtime": 60.3519, "eval_samples_per_second": 35.608, "eval_steps_per_second": 35.608, "step": 1400 }, { "entropy": 0.34990762211382387, "epoch": 0.5454721925652722, "grad_norm": 1.2109375, "learning_rate": 9.40886524052652e-06, "loss": 0.3539942979812622, "mean_token_accuracy": 0.8850773021578788, "num_tokens": 21376676.0, "step": 1405 }, { "entropy": 0.41086711175739765, "epoch": 0.5474133747452199, "grad_norm": 1.6328125, "learning_rate": 9.403913570577448e-06, "loss": 0.43075881004333494, "mean_token_accuracy": 0.871852807700634, "num_tokens": 21451444.0, "step": 1410 }, { "entropy": 0.37052917703986166, "epoch": 0.5493545569251674, "grad_norm": 1.453125, "learning_rate": 9.398942562286822e-06, "loss": 0.38300988674163816, "mean_token_accuracy": 0.8779854521155357, "num_tokens": 21536871.0, "step": 1415 }, { "entropy": 0.3861477542668581, "epoch": 0.551295739105115, "grad_norm": 1.7265625, "learning_rate": 9.393952237483195e-06, "loss": 0.40117707252502444, "mean_token_accuracy": 0.8765692830085754, "num_tokens": 21605987.0, "step": 1420 }, { "entropy": 0.4081678859889507, "epoch": 0.5532369212850626, "grad_norm": 1.7890625, "learning_rate": 9.38894261807994e-06, "loss": 0.42310566902160646, "mean_token_accuracy": 0.8693249508738518, "num_tokens": 21692774.0, "step": 1425 }, { "entropy": 0.3717794116586447, "epoch": 0.5551781034650102, "grad_norm": 1.4453125, "learning_rate": 9.383913726075157e-06, "loss": 0.38362655639648435, "mean_token_accuracy": 0.8794390082359314, "num_tokens": 21774027.0, "step": 1430 }, { "entropy": 0.3897536873817444, "epoch": 0.5571192856449578, "grad_norm": 1.59375, "learning_rate": 9.378865583551575e-06, "loss": 0.40027127265930174, "mean_token_accuracy": 0.874237485229969, "num_tokens": 21855865.0, "step": 1435 }, { "entropy": 0.4386448211967945, "epoch": 0.5590604678249054, "grad_norm": 1.8984375, "learning_rate": 9.373798212676459e-06, "loss": 0.44517908096313474, "mean_token_accuracy": 0.8637019321322441, "num_tokens": 21937592.0, "step": 1440 }, { "entropy": 0.40399301163852214, "epoch": 0.5610016500048529, "grad_norm": 2.234375, "learning_rate": 9.368711635701499e-06, "loss": 0.42911725044250487, "mean_token_accuracy": 0.8717393398284912, "num_tokens": 22016228.0, "step": 1445 }, { "entropy": 0.33674396723508837, "epoch": 0.5629428321848006, "grad_norm": 1.625, "learning_rate": 9.363605874962735e-06, "loss": 0.3449155569076538, "mean_token_accuracy": 0.8916645109653473, "num_tokens": 22091155.0, "step": 1450 }, { "entropy": 0.34950118474662306, "epoch": 0.5648840143647481, "grad_norm": 1.6875, "learning_rate": 9.358480952880438e-06, "loss": 0.37925631999969484, "mean_token_accuracy": 0.8876421838998795, "num_tokens": 22168063.0, "step": 1455 }, { "entropy": 0.3646994840353727, "epoch": 0.5668251965446958, "grad_norm": 1.359375, "learning_rate": 9.35333689195902e-06, "loss": 0.3887592554092407, "mean_token_accuracy": 0.8781901568174362, "num_tokens": 22247190.0, "step": 1460 }, { "entropy": 0.42925515584647655, "epoch": 0.5687663787246433, "grad_norm": 1.7265625, "learning_rate": 9.34817371478694e-06, "loss": 0.4361258983612061, "mean_token_accuracy": 0.8652834072709084, "num_tokens": 22327454.0, "step": 1465 }, { "entropy": 0.40622838474810125, "epoch": 0.5707075609045908, "grad_norm": 1.53125, "learning_rate": 9.342991444036593e-06, "loss": 0.4456647872924805, "mean_token_accuracy": 0.8694571733474732, "num_tokens": 22412083.0, "step": 1470 }, { "entropy": 0.42173517793416976, "epoch": 0.5726487430845385, "grad_norm": 1.421875, "learning_rate": 9.337790102464224e-06, "loss": 0.454360818862915, "mean_token_accuracy": 0.8657700821757317, "num_tokens": 22490065.0, "step": 1475 }, { "entropy": 0.4561560284346342, "epoch": 0.574589925264486, "grad_norm": 1.40625, "learning_rate": 9.332569712909816e-06, "loss": 0.4739046573638916, "mean_token_accuracy": 0.8589961290359497, "num_tokens": 22578402.0, "step": 1480 }, { "entropy": 0.38373861461877823, "epoch": 0.5765311074444337, "grad_norm": 1.6796875, "learning_rate": 9.327330298296998e-06, "loss": 0.3775209665298462, "mean_token_accuracy": 0.8786651358008385, "num_tokens": 22657716.0, "step": 1485 }, { "entropy": 0.34858159013092516, "epoch": 0.5784722896243812, "grad_norm": 1.5, "learning_rate": 9.32207188163294e-06, "loss": 0.36159141063690187, "mean_token_accuracy": 0.8849190220236778, "num_tokens": 22727213.0, "step": 1490 }, { "entropy": 0.36950380988419057, "epoch": 0.5804134718043288, "grad_norm": 1.84375, "learning_rate": 9.316794486008254e-06, "loss": 0.41820201873779295, "mean_token_accuracy": 0.8807887002825737, "num_tokens": 22796084.0, "step": 1495 }, { "entropy": 0.3770993869751692, "epoch": 0.5823546539842764, "grad_norm": 1.5234375, "learning_rate": 9.31149813459689e-06, "loss": 0.3539431571960449, "mean_token_accuracy": 0.8795977741479873, "num_tokens": 22870251.0, "step": 1500 }, { "entropy": 0.3788142062723637, "epoch": 0.584295836164224, "grad_norm": 1.828125, "learning_rate": 9.306182850656037e-06, "loss": 0.3946338415145874, "mean_token_accuracy": 0.8801519960165024, "num_tokens": 22951081.0, "step": 1505 }, { "entropy": 0.3841311365365982, "epoch": 0.5862370183441716, "grad_norm": 1.46875, "learning_rate": 9.300848657526024e-06, "loss": 0.38277838230133054, "mean_token_accuracy": 0.8772434189915657, "num_tokens": 23034217.0, "step": 1510 }, { "entropy": 0.3577498983591795, "epoch": 0.5881782005241192, "grad_norm": 2.0, "learning_rate": 9.29549557863021e-06, "loss": 0.37149059772491455, "mean_token_accuracy": 0.8867600724101067, "num_tokens": 23103243.0, "step": 1515 }, { "entropy": 0.36544432379305364, "epoch": 0.5901193827040668, "grad_norm": 1.65625, "learning_rate": 9.29012363747488e-06, "loss": 0.3911574840545654, "mean_token_accuracy": 0.8809396475553513, "num_tokens": 23180618.0, "step": 1520 }, { "entropy": 0.37773900777101516, "epoch": 0.5920605648840144, "grad_norm": 1.6015625, "learning_rate": 9.284732857649154e-06, "loss": 0.40440049171447756, "mean_token_accuracy": 0.8771207213401795, "num_tokens": 23274254.0, "step": 1525 }, { "entropy": 0.41926471069455146, "epoch": 0.5940017470639619, "grad_norm": 1.78125, "learning_rate": 9.279323262824871e-06, "loss": 0.43068270683288573, "mean_token_accuracy": 0.8644863858819007, "num_tokens": 23354103.0, "step": 1530 }, { "entropy": 0.3747733347117901, "epoch": 0.5959429292439096, "grad_norm": 1.5703125, "learning_rate": 9.273894876756497e-06, "loss": 0.3952503204345703, "mean_token_accuracy": 0.8833319827914238, "num_tokens": 23420187.0, "step": 1535 }, { "entropy": 0.3898849368095398, "epoch": 0.5978841114238571, "grad_norm": 1.4375, "learning_rate": 9.268447723281003e-06, "loss": 0.4146092891693115, "mean_token_accuracy": 0.8798678085207939, "num_tokens": 23491179.0, "step": 1540 }, { "entropy": 0.34879231434315444, "epoch": 0.5998252936038048, "grad_norm": 1.4375, "learning_rate": 9.262981826317778e-06, "loss": 0.37036240100860596, "mean_token_accuracy": 0.8908288046717644, "num_tokens": 23561490.0, "step": 1545 }, { "entropy": 0.4013595413416624, "epoch": 0.6017664757837523, "grad_norm": 1.90625, "learning_rate": 9.257497209868516e-06, "loss": 0.42535991668701173, "mean_token_accuracy": 0.8715372681617737, "num_tokens": 23643197.0, "step": 1550 }, { "entropy": 0.3767758123576641, "epoch": 0.6037076579636999, "grad_norm": 1.875, "learning_rate": 9.251993898017109e-06, "loss": 0.3970643997192383, "mean_token_accuracy": 0.8821586266160011, "num_tokens": 23714513.0, "step": 1555 }, { "entropy": 0.40486165285110476, "epoch": 0.6056488401436475, "grad_norm": 1.515625, "learning_rate": 9.246471914929547e-06, "loss": 0.41384401321411135, "mean_token_accuracy": 0.8695383608341217, "num_tokens": 23801743.0, "step": 1560 }, { "entropy": 0.3538258448243141, "epoch": 0.607590022323595, "grad_norm": 1.6328125, "learning_rate": 9.240931284853807e-06, "loss": 0.3868009090423584, "mean_token_accuracy": 0.8842133894562721, "num_tokens": 23893948.0, "step": 1565 }, { "entropy": 0.3908126030117273, "epoch": 0.6095312045035427, "grad_norm": 1.8125, "learning_rate": 9.235372032119747e-06, "loss": 0.40709576606750486, "mean_token_accuracy": 0.8742722377181054, "num_tokens": 23959291.0, "step": 1570 }, { "entropy": 0.39328810535371306, "epoch": 0.6114723866834902, "grad_norm": 1.625, "learning_rate": 9.229794181139002e-06, "loss": 0.40347847938537595, "mean_token_accuracy": 0.874020305275917, "num_tokens": 24028375.0, "step": 1575 }, { "entropy": 0.38035779893398286, "epoch": 0.6134135688634378, "grad_norm": 1.4765625, "learning_rate": 9.224197756404875e-06, "loss": 0.39300010204315183, "mean_token_accuracy": 0.8796775847673416, "num_tokens": 24106755.0, "step": 1580 }, { "entropy": 0.35121305733919145, "epoch": 0.6153547510433854, "grad_norm": 2.296875, "learning_rate": 9.218582782492228e-06, "loss": 0.39762823581695556, "mean_token_accuracy": 0.8853226408362389, "num_tokens": 24174787.0, "step": 1585 }, { "entropy": 0.3741837713867426, "epoch": 0.617295933223333, "grad_norm": 1.7578125, "learning_rate": 9.212949284057378e-06, "loss": 0.39895901679992674, "mean_token_accuracy": 0.8801515579223633, "num_tokens": 24253990.0, "step": 1590 }, { "entropy": 0.4153418317437172, "epoch": 0.6192371154032806, "grad_norm": 1.7890625, "learning_rate": 9.207297285837984e-06, "loss": 0.4323587894439697, "mean_token_accuracy": 0.8745242461562157, "num_tokens": 24326608.0, "step": 1595 }, { "entropy": 0.42888959534466264, "epoch": 0.6211782975832282, "grad_norm": 1.3125, "learning_rate": 9.201626812652942e-06, "loss": 0.4193469524383545, "mean_token_accuracy": 0.8636892691254616, "num_tokens": 24408096.0, "step": 1600 }, { "epoch": 0.6211782975832282, "eval_entropy": 0.3693768273520714, "eval_loss": 0.3738563656806946, "eval_mean_token_accuracy": 0.8822050338265174, "eval_num_tokens": 24408096.0, "eval_runtime": 60.0738, "eval_samples_per_second": 35.773, "eval_steps_per_second": 35.773, "step": 1600 }, { "entropy": 0.4069465111941099, "epoch": 0.6231194797631757, "grad_norm": 1.3828125, "learning_rate": 9.195937889402276e-06, "loss": 0.3946805238723755, "mean_token_accuracy": 0.8711962580680848, "num_tokens": 24477273.0, "step": 1605 }, { "entropy": 0.3497770603746176, "epoch": 0.6250606619431234, "grad_norm": 1.53125, "learning_rate": 9.190230541067023e-06, "loss": 0.3609620094299316, "mean_token_accuracy": 0.8848605647683143, "num_tokens": 24556472.0, "step": 1610 }, { "entropy": 0.4083269018679857, "epoch": 0.6270018441230709, "grad_norm": 1.484375, "learning_rate": 9.184504792709134e-06, "loss": 0.4195822238922119, "mean_token_accuracy": 0.873286210000515, "num_tokens": 24633310.0, "step": 1615 }, { "entropy": 0.3741916142404079, "epoch": 0.6289430263030186, "grad_norm": 1.71875, "learning_rate": 9.178760669471351e-06, "loss": 0.3778867244720459, "mean_token_accuracy": 0.8781406879425049, "num_tokens": 24702572.0, "step": 1620 }, { "entropy": 0.34799036718904974, "epoch": 0.6308842084829661, "grad_norm": 1.3828125, "learning_rate": 9.17299819657711e-06, "loss": 0.36290202140808103, "mean_token_accuracy": 0.8880066946148872, "num_tokens": 24786717.0, "step": 1625 }, { "entropy": 0.35105147287249566, "epoch": 0.6328253906629138, "grad_norm": 1.9296875, "learning_rate": 9.167217399330418e-06, "loss": 0.367209792137146, "mean_token_accuracy": 0.886526557803154, "num_tokens": 24861736.0, "step": 1630 }, { "entropy": 0.3578195352107286, "epoch": 0.6347665728428613, "grad_norm": 1.375, "learning_rate": 9.161418303115749e-06, "loss": 0.3651568412780762, "mean_token_accuracy": 0.8801181107759476, "num_tokens": 24932067.0, "step": 1635 }, { "entropy": 0.4024165827780962, "epoch": 0.6367077550228089, "grad_norm": 1.546875, "learning_rate": 9.155600933397932e-06, "loss": 0.4195927619934082, "mean_token_accuracy": 0.8746752873063087, "num_tokens": 25003342.0, "step": 1640 }, { "entropy": 0.41863835491240026, "epoch": 0.6386489372027565, "grad_norm": 1.234375, "learning_rate": 9.149765315722039e-06, "loss": 0.4207592964172363, "mean_token_accuracy": 0.8699864789843559, "num_tokens": 25089543.0, "step": 1645 }, { "entropy": 0.3774807959794998, "epoch": 0.640590119382704, "grad_norm": 1.484375, "learning_rate": 9.14391147571327e-06, "loss": 0.38125219345092776, "mean_token_accuracy": 0.8813193202018738, "num_tokens": 25164250.0, "step": 1650 }, { "entropy": 0.36004649810492995, "epoch": 0.6425313015626517, "grad_norm": 1.4609375, "learning_rate": 9.13803943907684e-06, "loss": 0.38634524345397947, "mean_token_accuracy": 0.8815308138728142, "num_tokens": 25235584.0, "step": 1655 }, { "entropy": 0.40895739644765855, "epoch": 0.6444724837425992, "grad_norm": 1.390625, "learning_rate": 9.132149231597874e-06, "loss": 0.42175993919372556, "mean_token_accuracy": 0.8735464856028556, "num_tokens": 25326121.0, "step": 1660 }, { "entropy": 0.42022631838917734, "epoch": 0.6464136659225468, "grad_norm": 1.3671875, "learning_rate": 9.126240879141286e-06, "loss": 0.4283411502838135, "mean_token_accuracy": 0.8683241337537766, "num_tokens": 25416532.0, "step": 1665 }, { "entropy": 0.3419806692749262, "epoch": 0.6483548481024944, "grad_norm": 1.3515625, "learning_rate": 9.120314407651665e-06, "loss": 0.3869215726852417, "mean_token_accuracy": 0.8876996964216233, "num_tokens": 25500339.0, "step": 1670 }, { "entropy": 0.37519195675849915, "epoch": 0.650296030282442, "grad_norm": 1.65625, "learning_rate": 9.114369843153168e-06, "loss": 0.38437614440917967, "mean_token_accuracy": 0.880204701423645, "num_tokens": 25571598.0, "step": 1675 }, { "entropy": 0.34165109843015673, "epoch": 0.6522372124623896, "grad_norm": 1.7734375, "learning_rate": 9.108407211749397e-06, "loss": 0.3734029531478882, "mean_token_accuracy": 0.8863589748740196, "num_tokens": 25647870.0, "step": 1680 }, { "entropy": 0.3643860913813114, "epoch": 0.6541783946423372, "grad_norm": 1.4765625, "learning_rate": 9.102426539623295e-06, "loss": 0.3877432107925415, "mean_token_accuracy": 0.8784330353140831, "num_tokens": 25729611.0, "step": 1685 }, { "entropy": 0.4150772735476494, "epoch": 0.6561195768222847, "grad_norm": 1.78125, "learning_rate": 9.09642785303702e-06, "loss": 0.4420276641845703, "mean_token_accuracy": 0.8656805634498597, "num_tokens": 25808840.0, "step": 1690 }, { "entropy": 0.3560687083750963, "epoch": 0.6580607590022324, "grad_norm": 1.7421875, "learning_rate": 9.090411178331835e-06, "loss": 0.37286901473999023, "mean_token_accuracy": 0.8856526881456375, "num_tokens": 25887901.0, "step": 1695 }, { "entropy": 0.4048729032278061, "epoch": 0.6600019411821799, "grad_norm": 1.2734375, "learning_rate": 9.084376541927995e-06, "loss": 0.4281449317932129, "mean_token_accuracy": 0.8717523291707039, "num_tokens": 25980223.0, "step": 1700 }, { "entropy": 0.4058201160281897, "epoch": 0.6619431233621276, "grad_norm": 1.28125, "learning_rate": 9.078323970324626e-06, "loss": 0.42533535957336427, "mean_token_accuracy": 0.8724991276860237, "num_tokens": 26057550.0, "step": 1705 }, { "entropy": 0.40643964521586895, "epoch": 0.6638843055420751, "grad_norm": 1.640625, "learning_rate": 9.072253490099607e-06, "loss": 0.4063755512237549, "mean_token_accuracy": 0.8733890399336814, "num_tokens": 26131468.0, "step": 1710 }, { "entropy": 0.37885321527719495, "epoch": 0.6658254877220227, "grad_norm": 1.53125, "learning_rate": 9.066165127909463e-06, "loss": 0.39308197498321534, "mean_token_accuracy": 0.881773728132248, "num_tokens": 26209570.0, "step": 1715 }, { "entropy": 0.39982022494077685, "epoch": 0.6677666699019703, "grad_norm": 1.359375, "learning_rate": 9.060058910489237e-06, "loss": 0.4166593551635742, "mean_token_accuracy": 0.875648008286953, "num_tokens": 26291210.0, "step": 1720 }, { "entropy": 0.38358601108193396, "epoch": 0.6697078520819179, "grad_norm": 1.6328125, "learning_rate": 9.053934864652382e-06, "loss": 0.39159939289093015, "mean_token_accuracy": 0.8792028650641441, "num_tokens": 26363096.0, "step": 1725 }, { "entropy": 0.39065288491547107, "epoch": 0.6716490342618655, "grad_norm": 1.6875, "learning_rate": 9.047793017290635e-06, "loss": 0.41971278190612793, "mean_token_accuracy": 0.8771908909082413, "num_tokens": 26449438.0, "step": 1730 }, { "entropy": 0.36197944805026055, "epoch": 0.673590216441813, "grad_norm": 1.78125, "learning_rate": 9.041633395373902e-06, "loss": 0.3651232957839966, "mean_token_accuracy": 0.8863715797662735, "num_tokens": 26506251.0, "step": 1735 }, { "entropy": 0.41872271075844764, "epoch": 0.6755313986217607, "grad_norm": 1.5234375, "learning_rate": 9.035456025950145e-06, "loss": 0.4293703556060791, "mean_token_accuracy": 0.8711474344134331, "num_tokens": 26577535.0, "step": 1740 }, { "entropy": 0.3581832841038704, "epoch": 0.6774725808017082, "grad_norm": 1.609375, "learning_rate": 9.029260936145252e-06, "loss": 0.3745636224746704, "mean_token_accuracy": 0.8827520579099655, "num_tokens": 26652699.0, "step": 1745 }, { "entropy": 0.43668837919831277, "epoch": 0.6794137629816558, "grad_norm": 1.65625, "learning_rate": 9.02304815316293e-06, "loss": 0.45046534538269045, "mean_token_accuracy": 0.865936142206192, "num_tokens": 26735591.0, "step": 1750 }, { "entropy": 0.3559565614908934, "epoch": 0.6813549451616034, "grad_norm": 1.421875, "learning_rate": 9.016817704284575e-06, "loss": 0.36423630714416505, "mean_token_accuracy": 0.8824115738272666, "num_tokens": 26812459.0, "step": 1755 }, { "entropy": 0.3483701661229134, "epoch": 0.683296127341551, "grad_norm": 1.8203125, "learning_rate": 9.010569616869159e-06, "loss": 0.37481648921966554, "mean_token_accuracy": 0.8892971143126488, "num_tokens": 26882592.0, "step": 1760 }, { "entropy": 0.3975631568580866, "epoch": 0.6852373095214986, "grad_norm": 1.484375, "learning_rate": 9.004303918353107e-06, "loss": 0.39717047214508056, "mean_token_accuracy": 0.8726603716611863, "num_tokens": 26954080.0, "step": 1765 }, { "entropy": 0.38138355370610955, "epoch": 0.6871784917014462, "grad_norm": 2.234375, "learning_rate": 8.998020636250181e-06, "loss": 0.39662230014801025, "mean_token_accuracy": 0.8773909747600556, "num_tokens": 27025611.0, "step": 1770 }, { "entropy": 0.35980530045926573, "epoch": 0.6891196738813937, "grad_norm": 1.90625, "learning_rate": 8.991719798151354e-06, "loss": 0.38723225593566896, "mean_token_accuracy": 0.8855857968330383, "num_tokens": 27106998.0, "step": 1775 }, { "entropy": 0.39718156717717645, "epoch": 0.6910608560613414, "grad_norm": 1.3203125, "learning_rate": 8.985401431724685e-06, "loss": 0.42195706367492675, "mean_token_accuracy": 0.870930427312851, "num_tokens": 27191593.0, "step": 1780 }, { "entropy": 0.39791759476065636, "epoch": 0.6930020382412889, "grad_norm": 1.4453125, "learning_rate": 8.979065564715209e-06, "loss": 0.3908670902252197, "mean_token_accuracy": 0.877900630235672, "num_tokens": 27259061.0, "step": 1785 }, { "entropy": 0.37184464260935784, "epoch": 0.6949432204212366, "grad_norm": 1.328125, "learning_rate": 8.972712224944808e-06, "loss": 0.3723410367965698, "mean_token_accuracy": 0.8796270757913589, "num_tokens": 27345514.0, "step": 1790 }, { "entropy": 0.39280957020819185, "epoch": 0.6968844026011841, "grad_norm": 1.3671875, "learning_rate": 8.966341440312088e-06, "loss": 0.37746195793151854, "mean_token_accuracy": 0.8742925137281418, "num_tokens": 27434611.0, "step": 1795 }, { "entropy": 0.38702532537281514, "epoch": 0.6988255847811317, "grad_norm": 1.34375, "learning_rate": 8.959953238792261e-06, "loss": 0.4323995113372803, "mean_token_accuracy": 0.876301246881485, "num_tokens": 27522141.0, "step": 1800 }, { "epoch": 0.6988255847811317, "eval_entropy": 0.36904463945099675, "eval_loss": 0.37259599566459656, "eval_mean_token_accuracy": 0.8823383979156108, "eval_num_tokens": 27522141.0, "eval_runtime": 60.1232, "eval_samples_per_second": 35.743, "eval_steps_per_second": 35.743, "step": 1800 }, { "entropy": 0.39694005586206915, "epoch": 0.7007667669610793, "grad_norm": 1.6640625, "learning_rate": 8.953547648437016e-06, "loss": 0.422884464263916, "mean_token_accuracy": 0.8706113517284393, "num_tokens": 27606238.0, "step": 1805 }, { "entropy": 0.35907841585576533, "epoch": 0.7027079491410269, "grad_norm": 1.6171875, "learning_rate": 8.947124697374403e-06, "loss": 0.37867820262908936, "mean_token_accuracy": 0.8819711148738861, "num_tokens": 27698297.0, "step": 1810 }, { "entropy": 0.39380453154444695, "epoch": 0.7046491313209745, "grad_norm": 1.1328125, "learning_rate": 8.940684413808704e-06, "loss": 0.41552581787109377, "mean_token_accuracy": 0.8773353233933449, "num_tokens": 27783292.0, "step": 1815 }, { "entropy": 0.3948865693062544, "epoch": 0.706590313500922, "grad_norm": 2.0, "learning_rate": 8.93422682602031e-06, "loss": 0.45133333206176757, "mean_token_accuracy": 0.8750575929880142, "num_tokens": 27857682.0, "step": 1820 }, { "entropy": 0.39443247877061366, "epoch": 0.7085314956808697, "grad_norm": 1.8671875, "learning_rate": 8.927751962365603e-06, "loss": 0.39142508506774903, "mean_token_accuracy": 0.8749705284833909, "num_tokens": 27933338.0, "step": 1825 }, { "entropy": 0.38654340282082555, "epoch": 0.7104726778608172, "grad_norm": 1.7890625, "learning_rate": 8.921259851276816e-06, "loss": 0.38780851364135743, "mean_token_accuracy": 0.8745802566409111, "num_tokens": 28004374.0, "step": 1830 }, { "entropy": 0.3340354781597853, "epoch": 0.7124138600407648, "grad_norm": 1.8125, "learning_rate": 8.91475052126193e-06, "loss": 0.34950056076049807, "mean_token_accuracy": 0.8917890131473541, "num_tokens": 28076071.0, "step": 1835 }, { "entropy": 0.36462055034935476, "epoch": 0.7143550422207124, "grad_norm": 1.1796875, "learning_rate": 8.90822400090453e-06, "loss": 0.36106727123260496, "mean_token_accuracy": 0.879101251065731, "num_tokens": 28167857.0, "step": 1840 }, { "entropy": 0.3714527040719986, "epoch": 0.71629622440066, "grad_norm": 1.921875, "learning_rate": 8.90168031886369e-06, "loss": 0.3883594274520874, "mean_token_accuracy": 0.881637692451477, "num_tokens": 28228771.0, "step": 1845 }, { "entropy": 0.39277232214808466, "epoch": 0.7182374065806076, "grad_norm": 1.734375, "learning_rate": 8.895119503873841e-06, "loss": 0.4170830726623535, "mean_token_accuracy": 0.8729140803217887, "num_tokens": 28299510.0, "step": 1850 }, { "entropy": 0.3991117935627699, "epoch": 0.7201785887605552, "grad_norm": 2.59375, "learning_rate": 8.888541584744652e-06, "loss": 0.3907686710357666, "mean_token_accuracy": 0.8788457185029983, "num_tokens": 28356716.0, "step": 1855 }, { "entropy": 0.33190413266420365, "epoch": 0.7221197709405027, "grad_norm": 1.3125, "learning_rate": 8.881946590360893e-06, "loss": 0.3549908399581909, "mean_token_accuracy": 0.8904741749167442, "num_tokens": 28425961.0, "step": 1860 }, { "entropy": 0.3761907495558262, "epoch": 0.7240609531204504, "grad_norm": 1.7265625, "learning_rate": 8.875334549682322e-06, "loss": 0.40765061378479006, "mean_token_accuracy": 0.8756383866071701, "num_tokens": 28492753.0, "step": 1865 }, { "entropy": 0.3859711352735758, "epoch": 0.7260021353003979, "grad_norm": 1.640625, "learning_rate": 8.868705491743543e-06, "loss": 0.40584306716918944, "mean_token_accuracy": 0.8751093596220016, "num_tokens": 28574648.0, "step": 1870 }, { "entropy": 0.3750033970922232, "epoch": 0.7279433174803456, "grad_norm": 1.375, "learning_rate": 8.862059445653892e-06, "loss": 0.42207088470458987, "mean_token_accuracy": 0.8791605412960053, "num_tokens": 28673368.0, "step": 1875 }, { "entropy": 0.33736986815929415, "epoch": 0.7298844996602931, "grad_norm": 1.984375, "learning_rate": 8.855396440597299e-06, "loss": 0.33533928394317625, "mean_token_accuracy": 0.8882556319236755, "num_tokens": 28745333.0, "step": 1880 }, { "entropy": 0.38949261195957663, "epoch": 0.7318256818402407, "grad_norm": 1.46875, "learning_rate": 8.848716505832163e-06, "loss": 0.39729306697845457, "mean_token_accuracy": 0.8767626166343689, "num_tokens": 28823783.0, "step": 1885 }, { "entropy": 0.373487963527441, "epoch": 0.7337668640201883, "grad_norm": 1.578125, "learning_rate": 8.842019670691226e-06, "loss": 0.3975057601928711, "mean_token_accuracy": 0.8789292603731156, "num_tokens": 28899576.0, "step": 1890 }, { "entropy": 0.3453738629817963, "epoch": 0.7357080462001359, "grad_norm": 2.0625, "learning_rate": 8.835305964581442e-06, "loss": 0.38850131034851076, "mean_token_accuracy": 0.8864782005548477, "num_tokens": 28979338.0, "step": 1895 }, { "entropy": 0.3513793833553791, "epoch": 0.7376492283800835, "grad_norm": 1.734375, "learning_rate": 8.828575416983853e-06, "loss": 0.3649607181549072, "mean_token_accuracy": 0.8849209144711494, "num_tokens": 29038858.0, "step": 1900 }, { "entropy": 0.3707513175904751, "epoch": 0.739590410560031, "grad_norm": 1.609375, "learning_rate": 8.821828057453448e-06, "loss": 0.3917756795883179, "mean_token_accuracy": 0.8805187106132507, "num_tokens": 29121454.0, "step": 1905 }, { "entropy": 0.3511063469573855, "epoch": 0.7415315927399786, "grad_norm": 1.9296875, "learning_rate": 8.81506391561904e-06, "loss": 0.3545810699462891, "mean_token_accuracy": 0.8827590346336365, "num_tokens": 29192959.0, "step": 1910 }, { "entropy": 0.3934715397655964, "epoch": 0.7434727749199262, "grad_norm": 1.4609375, "learning_rate": 8.80828302118314e-06, "loss": 0.44544425010681155, "mean_token_accuracy": 0.873169532418251, "num_tokens": 29275025.0, "step": 1915 }, { "entropy": 0.37588623352348804, "epoch": 0.7454139570998738, "grad_norm": 1.8046875, "learning_rate": 8.801485403921823e-06, "loss": 0.4109992027282715, "mean_token_accuracy": 0.8753042757511139, "num_tokens": 29359266.0, "step": 1920 }, { "entropy": 0.3526043064892292, "epoch": 0.7473551392798214, "grad_norm": 1.625, "learning_rate": 8.794671093684595e-06, "loss": 0.3500061988830566, "mean_token_accuracy": 0.8878669127821922, "num_tokens": 29415745.0, "step": 1925 }, { "entropy": 0.39229949191212654, "epoch": 0.749296321459769, "grad_norm": 1.453125, "learning_rate": 8.787840120394261e-06, "loss": 0.4506565570831299, "mean_token_accuracy": 0.873441505432129, "num_tokens": 29492482.0, "step": 1930 }, { "entropy": 0.41463610120117667, "epoch": 0.7512375036397166, "grad_norm": 1.5703125, "learning_rate": 8.7809925140468e-06, "loss": 0.4298503875732422, "mean_token_accuracy": 0.8726509675383568, "num_tokens": 29572784.0, "step": 1935 }, { "entropy": 0.44622854702174664, "epoch": 0.7531786858196642, "grad_norm": 1.578125, "learning_rate": 8.774128304711232e-06, "loss": 0.47462167739868166, "mean_token_accuracy": 0.858974027633667, "num_tokens": 29664399.0, "step": 1940 }, { "entropy": 0.36902854703366755, "epoch": 0.7551198679996117, "grad_norm": 1.625, "learning_rate": 8.767247522529473e-06, "loss": 0.38140344619750977, "mean_token_accuracy": 0.8812808141112327, "num_tokens": 29743761.0, "step": 1945 }, { "entropy": 0.3770481664687395, "epoch": 0.7570610501795594, "grad_norm": 1.5625, "learning_rate": 8.760350197716228e-06, "loss": 0.37451202869415284, "mean_token_accuracy": 0.8845255061984062, "num_tokens": 29805351.0, "step": 1950 }, { "entropy": 0.42605091743171214, "epoch": 0.7590022323595069, "grad_norm": 1.71875, "learning_rate": 8.75343636055883e-06, "loss": 0.434804630279541, "mean_token_accuracy": 0.8679893091320992, "num_tokens": 29876830.0, "step": 1955 }, { "entropy": 0.4243669960647821, "epoch": 0.7609434145394546, "grad_norm": 1.6953125, "learning_rate": 8.746506041417133e-06, "loss": 0.41442170143127444, "mean_token_accuracy": 0.8689531117677689, "num_tokens": 29952810.0, "step": 1960 }, { "entropy": 0.40345251336693766, "epoch": 0.7628845967194021, "grad_norm": 1.828125, "learning_rate": 8.739559270723353e-06, "loss": 0.3906730651855469, "mean_token_accuracy": 0.8731215000152588, "num_tokens": 30017592.0, "step": 1965 }, { "entropy": 0.40580461621284486, "epoch": 0.7648257788993497, "grad_norm": 1.4375, "learning_rate": 8.732596078981957e-06, "loss": 0.40709662437438965, "mean_token_accuracy": 0.8757615357637405, "num_tokens": 30091851.0, "step": 1970 }, { "entropy": 0.3563157990574837, "epoch": 0.7667669610792973, "grad_norm": 1.53125, "learning_rate": 8.72561649676952e-06, "loss": 0.36572675704956054, "mean_token_accuracy": 0.8835931360721588, "num_tokens": 30167070.0, "step": 1975 }, { "entropy": 0.4045840006321669, "epoch": 0.7687081432592449, "grad_norm": 2.1875, "learning_rate": 8.718620554734582e-06, "loss": 0.4593046188354492, "mean_token_accuracy": 0.8688464492559433, "num_tokens": 30232066.0, "step": 1980 }, { "entropy": 0.38409191854298114, "epoch": 0.7706493254391925, "grad_norm": 1.65625, "learning_rate": 8.71160828359753e-06, "loss": 0.40677800178527834, "mean_token_accuracy": 0.8754825726151466, "num_tokens": 30303663.0, "step": 1985 }, { "entropy": 0.36517377346754076, "epoch": 0.77259050761914, "grad_norm": 1.5, "learning_rate": 8.704579714150451e-06, "loss": 0.38115544319152833, "mean_token_accuracy": 0.8828602716326713, "num_tokens": 30371090.0, "step": 1990 }, { "entropy": 0.3954622160643339, "epoch": 0.7745316897990876, "grad_norm": 1.234375, "learning_rate": 8.697534877257003e-06, "loss": 0.4024034023284912, "mean_token_accuracy": 0.8711701706051826, "num_tokens": 30462563.0, "step": 1995 }, { "entropy": 0.3614397499710321, "epoch": 0.7764728719790353, "grad_norm": 1.9375, "learning_rate": 8.690473803852277e-06, "loss": 0.38828601837158205, "mean_token_accuracy": 0.8840885296463966, "num_tokens": 30537774.0, "step": 2000 }, { "epoch": 0.7764728719790353, "eval_entropy": 0.37249069839105764, "eval_loss": 0.3714829683303833, "eval_mean_token_accuracy": 0.8825830673411481, "eval_num_tokens": 30537774.0, "eval_runtime": 60.1567, "eval_samples_per_second": 35.723, "eval_steps_per_second": 35.723, "step": 2000 }, { "entropy": 0.4046864528208971, "epoch": 0.7784140541589828, "grad_norm": 1.59375, "learning_rate": 8.683396524942655e-06, "loss": 0.4361577033996582, "mean_token_accuracy": 0.8703169271349906, "num_tokens": 30629222.0, "step": 2005 }, { "entropy": 0.3440706986933947, "epoch": 0.7803552363389304, "grad_norm": 2.046875, "learning_rate": 8.676303071605692e-06, "loss": 0.3639081954956055, "mean_token_accuracy": 0.8900219470262527, "num_tokens": 30691162.0, "step": 2010 }, { "entropy": 0.3819488488137722, "epoch": 0.782296418518878, "grad_norm": 1.9453125, "learning_rate": 8.669193474989957e-06, "loss": 0.3750166654586792, "mean_token_accuracy": 0.8811448708176612, "num_tokens": 30763162.0, "step": 2015 }, { "entropy": 0.41373511366546156, "epoch": 0.7842376006988255, "grad_norm": 1.5625, "learning_rate": 8.66206776631491e-06, "loss": 0.4189298152923584, "mean_token_accuracy": 0.8678776487708092, "num_tokens": 30851103.0, "step": 2020 }, { "entropy": 0.38263436295092107, "epoch": 0.7861787828787732, "grad_norm": 1.2265625, "learning_rate": 8.654925976870766e-06, "loss": 0.4248814582824707, "mean_token_accuracy": 0.8735449090600014, "num_tokens": 30940204.0, "step": 2025 }, { "entropy": 0.4140047915279865, "epoch": 0.7881199650587207, "grad_norm": 1.5625, "learning_rate": 8.647768138018348e-06, "loss": 0.41850671768188474, "mean_token_accuracy": 0.8704892829060554, "num_tokens": 31020160.0, "step": 2030 }, { "entropy": 0.35883037857711314, "epoch": 0.7900611472386684, "grad_norm": 1.90625, "learning_rate": 8.640594281188958e-06, "loss": 0.3723835229873657, "mean_token_accuracy": 0.8855434998869895, "num_tokens": 31099746.0, "step": 2035 }, { "entropy": 0.36093359626829624, "epoch": 0.7920023294186159, "grad_norm": 1.390625, "learning_rate": 8.633404437884235e-06, "loss": 0.3619117498397827, "mean_token_accuracy": 0.8832022443413734, "num_tokens": 31175731.0, "step": 2040 }, { "entropy": 0.38728207871317866, "epoch": 0.7939435115985636, "grad_norm": 1.6953125, "learning_rate": 8.626198639676014e-06, "loss": 0.38774235248565675, "mean_token_accuracy": 0.8787285834550858, "num_tokens": 31258521.0, "step": 2045 }, { "entropy": 0.36480732820928097, "epoch": 0.7958846937785111, "grad_norm": 1.71875, "learning_rate": 8.618976918206196e-06, "loss": 0.3832773447036743, "mean_token_accuracy": 0.8871650651097298, "num_tokens": 31332899.0, "step": 2050 }, { "entropy": 0.37207553833723067, "epoch": 0.7978258759584587, "grad_norm": 1.609375, "learning_rate": 8.611739305186602e-06, "loss": 0.4212314605712891, "mean_token_accuracy": 0.877054350078106, "num_tokens": 31409631.0, "step": 2055 }, { "entropy": 0.4014189802110195, "epoch": 0.7997670581384063, "grad_norm": 1.5390625, "learning_rate": 8.604485832398833e-06, "loss": 0.4188095569610596, "mean_token_accuracy": 0.8725872606039047, "num_tokens": 31486529.0, "step": 2060 }, { "entropy": 0.3518418502062559, "epoch": 0.8017082403183539, "grad_norm": 1.4609375, "learning_rate": 8.597216531694136e-06, "loss": 0.3803000211715698, "mean_token_accuracy": 0.8850871086120605, "num_tokens": 31557811.0, "step": 2065 }, { "entropy": 0.38242518045008184, "epoch": 0.8036494224983015, "grad_norm": 1.890625, "learning_rate": 8.589931434993262e-06, "loss": 0.4062291145324707, "mean_token_accuracy": 0.8756525501608848, "num_tokens": 31627170.0, "step": 2070 }, { "entropy": 0.3889755714684725, "epoch": 0.8055906046782491, "grad_norm": 1.6875, "learning_rate": 8.58263057428632e-06, "loss": 0.3767040014266968, "mean_token_accuracy": 0.8771411761641502, "num_tokens": 31696937.0, "step": 2075 }, { "entropy": 0.3841622915118933, "epoch": 0.8075317868581966, "grad_norm": 2.03125, "learning_rate": 8.575313981632645e-06, "loss": 0.4042715549468994, "mean_token_accuracy": 0.877042506635189, "num_tokens": 31769548.0, "step": 2080 }, { "entropy": 0.3571667678654194, "epoch": 0.8094729690381443, "grad_norm": 1.4453125, "learning_rate": 8.567981689160654e-06, "loss": 0.3828322172164917, "mean_token_accuracy": 0.8810154914855957, "num_tokens": 31843626.0, "step": 2085 }, { "entropy": 0.4046927910298109, "epoch": 0.8114141512180918, "grad_norm": 1.8046875, "learning_rate": 8.560633729067705e-06, "loss": 0.4062997341156006, "mean_token_accuracy": 0.8745594829320907, "num_tokens": 31926157.0, "step": 2090 }, { "entropy": 0.3526596352458, "epoch": 0.8133553333980394, "grad_norm": 1.953125, "learning_rate": 8.55327013361995e-06, "loss": 0.3879395484924316, "mean_token_accuracy": 0.8853856906294822, "num_tokens": 31985096.0, "step": 2095 }, { "entropy": 0.37672988660633566, "epoch": 0.815296515577987, "grad_norm": 1.5859375, "learning_rate": 8.545890935152204e-06, "loss": 0.3643826961517334, "mean_token_accuracy": 0.8801594600081444, "num_tokens": 32071262.0, "step": 2100 }, { "entropy": 0.3610169466584921, "epoch": 0.8172376977579345, "grad_norm": 1.5546875, "learning_rate": 8.538496166067798e-06, "loss": 0.37534480094909667, "mean_token_accuracy": 0.8826367557048798, "num_tokens": 32154499.0, "step": 2105 }, { "entropy": 0.3441557249054313, "epoch": 0.8191788799378822, "grad_norm": 1.765625, "learning_rate": 8.531085858838434e-06, "loss": 0.34778728485107424, "mean_token_accuracy": 0.8875968590378761, "num_tokens": 32218854.0, "step": 2110 }, { "entropy": 0.361727100238204, "epoch": 0.8211200621178297, "grad_norm": 1.5078125, "learning_rate": 8.523660046004043e-06, "loss": 0.36833460330963136, "mean_token_accuracy": 0.8847725585103035, "num_tokens": 32290653.0, "step": 2115 }, { "entropy": 0.37507129870355127, "epoch": 0.8230612442977774, "grad_norm": 1.8828125, "learning_rate": 8.516218760172647e-06, "loss": 0.4152214050292969, "mean_token_accuracy": 0.8757175728678703, "num_tokens": 32369260.0, "step": 2120 }, { "entropy": 0.38906187675893306, "epoch": 0.8250024264777249, "grad_norm": 1.390625, "learning_rate": 8.508762034020211e-06, "loss": 0.40627117156982423, "mean_token_accuracy": 0.8743843853473663, "num_tokens": 32461339.0, "step": 2125 }, { "entropy": 0.3820859346538782, "epoch": 0.8269436086576725, "grad_norm": 1.8359375, "learning_rate": 8.501289900290499e-06, "loss": 0.3897759437561035, "mean_token_accuracy": 0.8774882882833481, "num_tokens": 32541252.0, "step": 2130 }, { "entropy": 0.43145383819937705, "epoch": 0.8288847908376201, "grad_norm": 1.78125, "learning_rate": 8.49380239179494e-06, "loss": 0.4624598503112793, "mean_token_accuracy": 0.8640148594975472, "num_tokens": 32626380.0, "step": 2135 }, { "entropy": 0.38095561824738977, "epoch": 0.8308259730175677, "grad_norm": 1.546875, "learning_rate": 8.486299541412466e-06, "loss": 0.4128393650054932, "mean_token_accuracy": 0.8786048114299774, "num_tokens": 32702475.0, "step": 2140 }, { "entropy": 0.38069754019379615, "epoch": 0.8327671551975153, "grad_norm": 1.6953125, "learning_rate": 8.478781382089387e-06, "loss": 0.41826744079589845, "mean_token_accuracy": 0.8762264057993889, "num_tokens": 32798281.0, "step": 2145 }, { "entropy": 0.4370048839598894, "epoch": 0.8347083373774629, "grad_norm": 1.8046875, "learning_rate": 8.471247946839229e-06, "loss": 0.4501640796661377, "mean_token_accuracy": 0.8669643774628639, "num_tokens": 32865902.0, "step": 2150 }, { "entropy": 0.35586942471563815, "epoch": 0.8366495195574105, "grad_norm": 1.4609375, "learning_rate": 8.463699268742604e-06, "loss": 0.3725292444229126, "mean_token_accuracy": 0.8853255197405815, "num_tokens": 32936532.0, "step": 2155 }, { "entropy": 0.3503182210028172, "epoch": 0.8385907017373581, "grad_norm": 1.9921875, "learning_rate": 8.456135380947055e-06, "loss": 0.3832036733627319, "mean_token_accuracy": 0.8870538592338562, "num_tokens": 33000960.0, "step": 2160 }, { "entropy": 0.41169595904648304, "epoch": 0.8405318839173056, "grad_norm": 1.3671875, "learning_rate": 8.448556316666912e-06, "loss": 0.4174903392791748, "mean_token_accuracy": 0.8725608646869659, "num_tokens": 33086991.0, "step": 2165 }, { "entropy": 0.3502325866371393, "epoch": 0.8424730660972533, "grad_norm": 1.6484375, "learning_rate": 8.44096210918315e-06, "loss": 0.356764554977417, "mean_token_accuracy": 0.8890736445784568, "num_tokens": 33149859.0, "step": 2170 }, { "entropy": 0.4082874767482281, "epoch": 0.8444142482772008, "grad_norm": 1.5859375, "learning_rate": 8.43335279184324e-06, "loss": 0.41769680976867674, "mean_token_accuracy": 0.8735889151692391, "num_tokens": 33235890.0, "step": 2175 }, { "entropy": 0.3691970378160477, "epoch": 0.8463554304571484, "grad_norm": 1.5, "learning_rate": 8.425728398061002e-06, "loss": 0.4044227600097656, "mean_token_accuracy": 0.8845529943704605, "num_tokens": 33293736.0, "step": 2180 }, { "entropy": 0.3638597309589386, "epoch": 0.848296612637096, "grad_norm": 1.7890625, "learning_rate": 8.418088961316459e-06, "loss": 0.3457561254501343, "mean_token_accuracy": 0.8865492403507232, "num_tokens": 33355455.0, "step": 2185 }, { "entropy": 0.39855882450938224, "epoch": 0.8502377948170435, "grad_norm": 1.4921875, "learning_rate": 8.410434515155694e-06, "loss": 0.40858187675476076, "mean_token_accuracy": 0.8782258868217468, "num_tokens": 33428022.0, "step": 2190 }, { "entropy": 0.345845440402627, "epoch": 0.8521789769969912, "grad_norm": 1.6328125, "learning_rate": 8.402765093190693e-06, "loss": 0.35137181282043456, "mean_token_accuracy": 0.8922701835632324, "num_tokens": 33495081.0, "step": 2195 }, { "entropy": 0.3340891394764185, "epoch": 0.8541201591769387, "grad_norm": 1.3671875, "learning_rate": 8.395080729099206e-06, "loss": 0.3650421380996704, "mean_token_accuracy": 0.8865584105253219, "num_tokens": 33588202.0, "step": 2200 }, { "epoch": 0.8541201591769387, "eval_entropy": 0.36483841773214426, "eval_loss": 0.3705015778541565, "eval_mean_token_accuracy": 0.8827846525241297, "eval_num_tokens": 33588202.0, "eval_runtime": 60.1598, "eval_samples_per_second": 35.722, "eval_steps_per_second": 35.722, "step": 2200 }, { "entropy": 0.3953329209238291, "epoch": 0.8560613413568864, "grad_norm": 1.65625, "learning_rate": 8.3873814566246e-06, "loss": 0.4315669536590576, "mean_token_accuracy": 0.8746445998549461, "num_tokens": 33668354.0, "step": 2205 }, { "entropy": 0.37631992548704146, "epoch": 0.8580025235368339, "grad_norm": 1.6015625, "learning_rate": 8.379667309575699e-06, "loss": 0.41765918731689455, "mean_token_accuracy": 0.8796603456139565, "num_tokens": 33733737.0, "step": 2210 }, { "entropy": 0.3429785013198853, "epoch": 0.8599437057167815, "grad_norm": 1.609375, "learning_rate": 8.371938321826654e-06, "loss": 0.35924372673034666, "mean_token_accuracy": 0.8863778650760651, "num_tokens": 33813616.0, "step": 2215 }, { "entropy": 0.36748342849314214, "epoch": 0.8618848878967291, "grad_norm": 1.1953125, "learning_rate": 8.364194527316776e-06, "loss": 0.38543248176574707, "mean_token_accuracy": 0.8795094177126884, "num_tokens": 33893625.0, "step": 2220 }, { "entropy": 0.3699775494635105, "epoch": 0.8638260700766767, "grad_norm": 1.3203125, "learning_rate": 8.356435960050398e-06, "loss": 0.3805511474609375, "mean_token_accuracy": 0.8810431718826294, "num_tokens": 33969465.0, "step": 2225 }, { "entropy": 0.39027220420539377, "epoch": 0.8657672522566243, "grad_norm": 1.765625, "learning_rate": 8.348662654096724e-06, "loss": 0.3937405586242676, "mean_token_accuracy": 0.875699220597744, "num_tokens": 34037321.0, "step": 2230 }, { "entropy": 0.380586925894022, "epoch": 0.8677084344365719, "grad_norm": 1.96875, "learning_rate": 8.340874643589676e-06, "loss": 0.39784080982208253, "mean_token_accuracy": 0.8761513873934745, "num_tokens": 34115202.0, "step": 2235 }, { "entropy": 0.38007759377360345, "epoch": 0.8696496166165194, "grad_norm": 1.3359375, "learning_rate": 8.333071962727745e-06, "loss": 0.3872611284255981, "mean_token_accuracy": 0.8754698395729065, "num_tokens": 34202914.0, "step": 2240 }, { "entropy": 0.36750674396753313, "epoch": 0.8715907987964671, "grad_norm": 1.4765625, "learning_rate": 8.325254645773849e-06, "loss": 0.36534600257873534, "mean_token_accuracy": 0.8795874208211899, "num_tokens": 34276806.0, "step": 2245 }, { "entropy": 0.3650043081492186, "epoch": 0.8735319809764146, "grad_norm": 1.59375, "learning_rate": 8.317422727055165e-06, "loss": 0.3911173105239868, "mean_token_accuracy": 0.8810791179537774, "num_tokens": 34349979.0, "step": 2250 }, { "entropy": 0.4252156797796488, "epoch": 0.8754731631563623, "grad_norm": 1.421875, "learning_rate": 8.309576240962998e-06, "loss": 0.3878526449203491, "mean_token_accuracy": 0.870864699780941, "num_tokens": 34431756.0, "step": 2255 }, { "entropy": 0.3478477492928505, "epoch": 0.8774143453363098, "grad_norm": 1.703125, "learning_rate": 8.301715221952615e-06, "loss": 0.3578909635543823, "mean_token_accuracy": 0.8878472730517387, "num_tokens": 34507200.0, "step": 2260 }, { "entropy": 0.3685021881014109, "epoch": 0.8793555275162575, "grad_norm": 1.59375, "learning_rate": 8.293839704543103e-06, "loss": 0.39955284595489504, "mean_token_accuracy": 0.880965618789196, "num_tokens": 34586189.0, "step": 2265 }, { "entropy": 0.35574909709393976, "epoch": 0.881296709696205, "grad_norm": 1.859375, "learning_rate": 8.285949723317214e-06, "loss": 0.38354690074920655, "mean_token_accuracy": 0.8842941373586655, "num_tokens": 34664914.0, "step": 2270 }, { "entropy": 0.3691468223929405, "epoch": 0.8832378918761525, "grad_norm": 1.3984375, "learning_rate": 8.27804531292121e-06, "loss": 0.3860490322113037, "mean_token_accuracy": 0.8787114471197128, "num_tokens": 34748974.0, "step": 2275 }, { "entropy": 0.3915288481861353, "epoch": 0.8851790740561002, "grad_norm": 2.015625, "learning_rate": 8.270126508064717e-06, "loss": 0.4229584217071533, "mean_token_accuracy": 0.8710038289427757, "num_tokens": 34823550.0, "step": 2280 }, { "entropy": 0.4291458610445261, "epoch": 0.8871202562360477, "grad_norm": 1.3359375, "learning_rate": 8.262193343520567e-06, "loss": 0.43143463134765625, "mean_token_accuracy": 0.8686526745557785, "num_tokens": 34917555.0, "step": 2285 }, { "entropy": 0.37965994626283645, "epoch": 0.8890614384159954, "grad_norm": 1.5078125, "learning_rate": 8.254245854124652e-06, "loss": 0.3806295394897461, "mean_token_accuracy": 0.881553427875042, "num_tokens": 34990801.0, "step": 2290 }, { "entropy": 0.4057528983801603, "epoch": 0.8910026205959429, "grad_norm": 1.875, "learning_rate": 8.246284074775763e-06, "loss": 0.41382293701171874, "mean_token_accuracy": 0.8720936447381973, "num_tokens": 35074294.0, "step": 2295 }, { "entropy": 0.36701224111020564, "epoch": 0.8929438027758905, "grad_norm": 1.53125, "learning_rate": 8.23830804043544e-06, "loss": 0.39014787673950196, "mean_token_accuracy": 0.8813544929027557, "num_tokens": 35142936.0, "step": 2300 }, { "entropy": 0.36863389015197756, "epoch": 0.8948849849558381, "grad_norm": 1.921875, "learning_rate": 8.230317786127822e-06, "loss": 0.4085258483886719, "mean_token_accuracy": 0.8811587393283844, "num_tokens": 35228369.0, "step": 2305 }, { "entropy": 0.3572548136115074, "epoch": 0.8968261671357857, "grad_norm": 1.6640625, "learning_rate": 8.22231334693949e-06, "loss": 0.40120840072631836, "mean_token_accuracy": 0.8830894485116005, "num_tokens": 35306656.0, "step": 2310 }, { "entropy": 0.3663245867937803, "epoch": 0.8987673493157333, "grad_norm": 1.65625, "learning_rate": 8.21429475801931e-06, "loss": 0.3976627826690674, "mean_token_accuracy": 0.8808062687516213, "num_tokens": 35383007.0, "step": 2315 }, { "entropy": 0.40247388668358325, "epoch": 0.9007085314956809, "grad_norm": 1.609375, "learning_rate": 8.20626205457829e-06, "loss": 0.4049511909484863, "mean_token_accuracy": 0.8711845085024834, "num_tokens": 35453462.0, "step": 2320 }, { "entropy": 0.3913619853556156, "epoch": 0.9026497136756284, "grad_norm": 1.8203125, "learning_rate": 8.198215271889405e-06, "loss": 0.3979458808898926, "mean_token_accuracy": 0.879303203523159, "num_tokens": 35524043.0, "step": 2325 }, { "entropy": 0.37322167456150057, "epoch": 0.9045908958555761, "grad_norm": 1.5703125, "learning_rate": 8.190154445287466e-06, "loss": 0.41640191078186034, "mean_token_accuracy": 0.8771696910262108, "num_tokens": 35609328.0, "step": 2330 }, { "entropy": 0.39078370332717893, "epoch": 0.9065320780355236, "grad_norm": 1.421875, "learning_rate": 8.182079610168945e-06, "loss": 0.37838523387908934, "mean_token_accuracy": 0.8762111157178879, "num_tokens": 35680622.0, "step": 2335 }, { "entropy": 0.39326913058757784, "epoch": 0.9084732602154713, "grad_norm": 1.8203125, "learning_rate": 8.173990801991834e-06, "loss": 0.38826932907104494, "mean_token_accuracy": 0.8793436914682389, "num_tokens": 35744201.0, "step": 2340 }, { "entropy": 0.38700769394636153, "epoch": 0.9104144423954188, "grad_norm": 1.3125, "learning_rate": 8.165888056275478e-06, "loss": 0.4147165298461914, "mean_token_accuracy": 0.8736557975411415, "num_tokens": 35822206.0, "step": 2345 }, { "entropy": 0.3916376482695341, "epoch": 0.9123556245753665, "grad_norm": 1.4375, "learning_rate": 8.157771408600427e-06, "loss": 0.40491595268249514, "mean_token_accuracy": 0.8760656327009201, "num_tokens": 35898583.0, "step": 2350 }, { "entropy": 0.37616121433675287, "epoch": 0.914296806755314, "grad_norm": 1.859375, "learning_rate": 8.149640894608277e-06, "loss": 0.39853510856628416, "mean_token_accuracy": 0.8797665163874626, "num_tokens": 35962197.0, "step": 2355 }, { "entropy": 0.3872214786708355, "epoch": 0.9162379889352615, "grad_norm": 1.6953125, "learning_rate": 8.141496550001512e-06, "loss": 0.4320131778717041, "mean_token_accuracy": 0.8749197080731392, "num_tokens": 36048634.0, "step": 2360 }, { "entropy": 0.3654011983424425, "epoch": 0.9181791711152092, "grad_norm": 1.8125, "learning_rate": 8.13333841054335e-06, "loss": 0.4127936363220215, "mean_token_accuracy": 0.8794704377651215, "num_tokens": 36114078.0, "step": 2365 }, { "entropy": 0.4052185159176588, "epoch": 0.9201203532951567, "grad_norm": 1.671875, "learning_rate": 8.125166512057583e-06, "loss": 0.4502895355224609, "mean_token_accuracy": 0.8730918914079666, "num_tokens": 36185468.0, "step": 2370 }, { "entropy": 0.3805313348770142, "epoch": 0.9220615354751044, "grad_norm": 1.71875, "learning_rate": 8.116980890428421e-06, "loss": 0.4319314956665039, "mean_token_accuracy": 0.8795101106166839, "num_tokens": 36262273.0, "step": 2375 }, { "entropy": 0.379262937605381, "epoch": 0.9240027176550519, "grad_norm": 1.828125, "learning_rate": 8.108781581600337e-06, "loss": 0.3972128391265869, "mean_token_accuracy": 0.8769020855426788, "num_tokens": 36339772.0, "step": 2380 }, { "entropy": 0.3275274306535721, "epoch": 0.9259438998349995, "grad_norm": 1.8828125, "learning_rate": 8.100568621577907e-06, "loss": 0.349655294418335, "mean_token_accuracy": 0.8939395412802696, "num_tokens": 36405190.0, "step": 2385 }, { "entropy": 0.37873089760541917, "epoch": 0.9278850820149471, "grad_norm": 1.65625, "learning_rate": 8.092342046425647e-06, "loss": 0.41008243560791013, "mean_token_accuracy": 0.8806984931230545, "num_tokens": 36471551.0, "step": 2390 }, { "entropy": 0.42233099043369293, "epoch": 0.9298262641948947, "grad_norm": 1.5, "learning_rate": 8.084101892267866e-06, "loss": 0.43898987770080566, "mean_token_accuracy": 0.8671094790101052, "num_tokens": 36545141.0, "step": 2395 }, { "entropy": 0.377986478433013, "epoch": 0.9317674463748423, "grad_norm": 1.6015625, "learning_rate": 8.075848195288495e-06, "loss": 0.4050844669342041, "mean_token_accuracy": 0.8788811087608337, "num_tokens": 36621766.0, "step": 2400 }, { "epoch": 0.9317674463748423, "eval_entropy": 0.3660563061428991, "eval_loss": 0.3696966767311096, "eval_mean_token_accuracy": 0.8829624101609727, "eval_num_tokens": 36621766.0, "eval_runtime": 60.0474, "eval_samples_per_second": 35.788, "eval_steps_per_second": 35.788, "step": 2400 }, { "entropy": 0.41329049319028854, "epoch": 0.9337086285547899, "grad_norm": 1.3203125, "learning_rate": 8.06758099173094e-06, "loss": 0.3864266872406006, "mean_token_accuracy": 0.8733444228768349, "num_tokens": 36692402.0, "step": 2405 }, { "entropy": 0.35147353522479535, "epoch": 0.9356498107347374, "grad_norm": 1.84375, "learning_rate": 8.059300317897907e-06, "loss": 0.3865788698196411, "mean_token_accuracy": 0.8899516001343727, "num_tokens": 36762687.0, "step": 2410 }, { "entropy": 0.35971076525747775, "epoch": 0.9375909929146851, "grad_norm": 2.015625, "learning_rate": 8.051006210151264e-06, "loss": 0.38848717212677003, "mean_token_accuracy": 0.8857970297336578, "num_tokens": 36829409.0, "step": 2415 }, { "entropy": 0.36762615144252775, "epoch": 0.9395321750946326, "grad_norm": 1.7578125, "learning_rate": 8.04269870491186e-06, "loss": 0.38152570724487306, "mean_token_accuracy": 0.8803352236747741, "num_tokens": 36920865.0, "step": 2420 }, { "entropy": 0.37836971804499625, "epoch": 0.9414733572745803, "grad_norm": 1.6875, "learning_rate": 8.03437783865938e-06, "loss": 0.3982245683670044, "mean_token_accuracy": 0.8782578885555268, "num_tokens": 36993918.0, "step": 2425 }, { "entropy": 0.40916073732078073, "epoch": 0.9434145394545278, "grad_norm": 1.3203125, "learning_rate": 8.02604364793218e-06, "loss": 0.40181870460510255, "mean_token_accuracy": 0.8741835564374923, "num_tokens": 37067466.0, "step": 2430 }, { "entropy": 0.38971280567348004, "epoch": 0.9453557216344753, "grad_norm": 1.4375, "learning_rate": 8.017696169327121e-06, "loss": 0.3853023052215576, "mean_token_accuracy": 0.8737384587526321, "num_tokens": 37158825.0, "step": 2435 }, { "entropy": 0.36188525408506395, "epoch": 0.947296903814423, "grad_norm": 1.8046875, "learning_rate": 8.009335439499418e-06, "loss": 0.39717903137207033, "mean_token_accuracy": 0.8839860737323761, "num_tokens": 37231768.0, "step": 2440 }, { "entropy": 0.3343341175466776, "epoch": 0.9492380859943705, "grad_norm": 1.5078125, "learning_rate": 8.000961495162474e-06, "loss": 0.34873759746551514, "mean_token_accuracy": 0.8942202180624008, "num_tokens": 37295787.0, "step": 2445 }, { "entropy": 0.37651418149471283, "epoch": 0.9511792681743182, "grad_norm": 2.0625, "learning_rate": 7.992574373087717e-06, "loss": 0.3985455989837646, "mean_token_accuracy": 0.8804239287972451, "num_tokens": 37365031.0, "step": 2450 }, { "entropy": 0.3872853074222803, "epoch": 0.9531204503542657, "grad_norm": 1.859375, "learning_rate": 7.984174110104442e-06, "loss": 0.3960126876831055, "mean_token_accuracy": 0.8773596182465553, "num_tokens": 37440723.0, "step": 2455 }, { "entropy": 0.36501435153186323, "epoch": 0.9550616325342134, "grad_norm": 1.546875, "learning_rate": 7.975760743099648e-06, "loss": 0.3613110065460205, "mean_token_accuracy": 0.8814436718821526, "num_tokens": 37517552.0, "step": 2460 }, { "entropy": 0.4213182792067528, "epoch": 0.9570028147141609, "grad_norm": 2.125, "learning_rate": 7.967334309017876e-06, "loss": 0.42275075912475585, "mean_token_accuracy": 0.8684807687997818, "num_tokens": 37576304.0, "step": 2465 }, { "entropy": 0.36460405923426153, "epoch": 0.9589439968941085, "grad_norm": 1.4453125, "learning_rate": 7.958894844861044e-06, "loss": 0.4192854881286621, "mean_token_accuracy": 0.882645896077156, "num_tokens": 37649463.0, "step": 2470 }, { "entropy": 0.3699316095560789, "epoch": 0.9608851790740561, "grad_norm": 1.53125, "learning_rate": 7.950442387688295e-06, "loss": 0.39672675132751467, "mean_token_accuracy": 0.8789965286850929, "num_tokens": 37727011.0, "step": 2475 }, { "entropy": 0.3613630454987288, "epoch": 0.9628263612540037, "grad_norm": 1.65625, "learning_rate": 7.941976974615817e-06, "loss": 0.35828289985656736, "mean_token_accuracy": 0.8861474558711052, "num_tokens": 37799274.0, "step": 2480 }, { "entropy": 0.4028384655714035, "epoch": 0.9647675434339513, "grad_norm": 1.453125, "learning_rate": 7.933498642816698e-06, "loss": 0.39244048595428466, "mean_token_accuracy": 0.8733719438314438, "num_tokens": 37872790.0, "step": 2485 }, { "entropy": 0.37262568436563015, "epoch": 0.9667087256138989, "grad_norm": 1.328125, "learning_rate": 7.925007429520745e-06, "loss": 0.3869138240814209, "mean_token_accuracy": 0.8780170202255249, "num_tokens": 37949478.0, "step": 2490 }, { "entropy": 0.3532130911946297, "epoch": 0.9686499077938464, "grad_norm": 1.65625, "learning_rate": 7.916503372014339e-06, "loss": 0.3645073175430298, "mean_token_accuracy": 0.8856014132499694, "num_tokens": 38010035.0, "step": 2495 }, { "entropy": 0.4101907879114151, "epoch": 0.9705910899737941, "grad_norm": 1.4453125, "learning_rate": 7.90798650764026e-06, "loss": 0.43153948783874513, "mean_token_accuracy": 0.868617196381092, "num_tokens": 38091317.0, "step": 2500 }, { "entropy": 0.37458378039300444, "epoch": 0.9725322721537416, "grad_norm": 1.578125, "learning_rate": 7.899456873797519e-06, "loss": 0.4130906105041504, "mean_token_accuracy": 0.8811309933662415, "num_tokens": 38156010.0, "step": 2505 }, { "entropy": 0.33361660987138747, "epoch": 0.9744734543336893, "grad_norm": 1.5078125, "learning_rate": 7.890914507941209e-06, "loss": 0.3599473714828491, "mean_token_accuracy": 0.891946268081665, "num_tokens": 38227058.0, "step": 2510 }, { "entropy": 0.36410921774804594, "epoch": 0.9764146365136368, "grad_norm": 1.71875, "learning_rate": 7.882359447582323e-06, "loss": 0.36246566772460936, "mean_token_accuracy": 0.8795957028865814, "num_tokens": 38308578.0, "step": 2515 }, { "entropy": 0.40007474571466445, "epoch": 0.9783558186935843, "grad_norm": 1.7734375, "learning_rate": 7.873791730287607e-06, "loss": 0.416595458984375, "mean_token_accuracy": 0.8698130205273629, "num_tokens": 38393316.0, "step": 2520 }, { "entropy": 0.37237811721861364, "epoch": 0.980297000873532, "grad_norm": 1.65625, "learning_rate": 7.865211393679374e-06, "loss": 0.3867233991622925, "mean_token_accuracy": 0.8821268856525422, "num_tokens": 38469288.0, "step": 2525 }, { "entropy": 0.3845432631671429, "epoch": 0.9822381830534795, "grad_norm": 1.3515625, "learning_rate": 7.856618475435361e-06, "loss": 0.3905576944351196, "mean_token_accuracy": 0.8774156749248505, "num_tokens": 38543267.0, "step": 2530 }, { "entropy": 0.40176920518279075, "epoch": 0.9841793652334272, "grad_norm": 1.546875, "learning_rate": 7.848013013288548e-06, "loss": 0.41007471084594727, "mean_token_accuracy": 0.8690975129604339, "num_tokens": 38626344.0, "step": 2535 }, { "entropy": 0.38942315727472304, "epoch": 0.9861205474133747, "grad_norm": 1.6796875, "learning_rate": 7.839395045027e-06, "loss": 0.40326895713806155, "mean_token_accuracy": 0.8756881758570672, "num_tokens": 38701064.0, "step": 2540 }, { "entropy": 0.3606115547940135, "epoch": 0.9880617295933223, "grad_norm": 1.59375, "learning_rate": 7.830764608493697e-06, "loss": 0.36026384830474856, "mean_token_accuracy": 0.8839934259653092, "num_tokens": 38779059.0, "step": 2545 }, { "entropy": 0.39264477528631686, "epoch": 0.9900029117732699, "grad_norm": 1.3203125, "learning_rate": 7.822121741586368e-06, "loss": 0.4251199245452881, "mean_token_accuracy": 0.8718041434884072, "num_tokens": 38880681.0, "step": 2550 }, { "entropy": 0.40420532748103144, "epoch": 0.9919440939532175, "grad_norm": 1.7109375, "learning_rate": 7.813466482257327e-06, "loss": 0.4312422752380371, "mean_token_accuracy": 0.8763286352157593, "num_tokens": 38941161.0, "step": 2555 }, { "entropy": 0.3369973488152027, "epoch": 0.9938852761331651, "grad_norm": 1.40625, "learning_rate": 7.804798868513306e-06, "loss": 0.35411407947540285, "mean_token_accuracy": 0.8899437338113785, "num_tokens": 39018011.0, "step": 2560 }, { "entropy": 0.39636878967285155, "epoch": 0.9958264583131127, "grad_norm": 1.59375, "learning_rate": 7.796118938415289e-06, "loss": 0.407199764251709, "mean_token_accuracy": 0.8719804942607879, "num_tokens": 39101097.0, "step": 2565 }, { "entropy": 0.3830322280526161, "epoch": 0.9977676404930603, "grad_norm": 1.8046875, "learning_rate": 7.78742673007834e-06, "loss": 0.38955183029174806, "mean_token_accuracy": 0.8754953861236572, "num_tokens": 39180075.0, "step": 2570 }, { "entropy": 0.37996360957622527, "epoch": 0.9997088226730079, "grad_norm": 1.8046875, "learning_rate": 7.77872228167144e-06, "loss": 0.4175414085388184, "mean_token_accuracy": 0.8776975840330123, "num_tokens": 39256747.0, "step": 2575 }, { "entropy": 0.3622729911615974, "epoch": 1.001552945743958, "grad_norm": 1.5, "learning_rate": 7.770005631417316e-06, "loss": 0.3494336366653442, "mean_token_accuracy": 0.8844640803964514, "num_tokens": 39331281.0, "step": 2580 }, { "entropy": 0.3717500135302544, "epoch": 1.0034941279239056, "grad_norm": 1.7890625, "learning_rate": 7.761276817592283e-06, "loss": 0.38556852340698244, "mean_token_accuracy": 0.8811951488256454, "num_tokens": 39403180.0, "step": 2585 }, { "entropy": 0.3536002866923809, "epoch": 1.0054353101038533, "grad_norm": 1.5078125, "learning_rate": 7.752535878526057e-06, "loss": 0.3865217208862305, "mean_token_accuracy": 0.8846583724021911, "num_tokens": 39482653.0, "step": 2590 }, { "entropy": 0.36924818605184556, "epoch": 1.0073764922838009, "grad_norm": 1.359375, "learning_rate": 7.743782852601609e-06, "loss": 0.3744253873825073, "mean_token_accuracy": 0.8835659891366958, "num_tokens": 39571122.0, "step": 2595 }, { "entropy": 0.3826246250420809, "epoch": 1.0093176744637484, "grad_norm": 1.3125, "learning_rate": 7.735017778254976e-06, "loss": 0.3962560176849365, "mean_token_accuracy": 0.8752669557929039, "num_tokens": 39643625.0, "step": 2600 }, { "epoch": 1.0093176744637484, "eval_entropy": 0.3590102044439915, "eval_loss": 0.3693583905696869, "eval_mean_token_accuracy": 0.8832231578354283, "eval_num_tokens": 39643625.0, "eval_runtime": 60.1285, "eval_samples_per_second": 35.74, "eval_steps_per_second": 35.74, "step": 2600 }, { "entropy": 0.3363046307116747, "epoch": 1.011258856643696, "grad_norm": 1.5859375, "learning_rate": 7.726240693975112e-06, "loss": 0.3622615814208984, "mean_token_accuracy": 0.890743799507618, "num_tokens": 39727345.0, "step": 2605 }, { "entropy": 0.39671580009162427, "epoch": 1.0132000388236435, "grad_norm": 1.640625, "learning_rate": 7.7174516383037e-06, "loss": 0.42294821739196775, "mean_token_accuracy": 0.8709942042827606, "num_tokens": 39809382.0, "step": 2610 }, { "entropy": 0.38348409347236156, "epoch": 1.0151412210035913, "grad_norm": 1.8046875, "learning_rate": 7.70865064983499e-06, "loss": 0.40362987518310545, "mean_token_accuracy": 0.8796380490064621, "num_tokens": 39870908.0, "step": 2615 }, { "entropy": 0.35954158157110216, "epoch": 1.0170824031835388, "grad_norm": 1.5859375, "learning_rate": 7.699837767215642e-06, "loss": 0.391841459274292, "mean_token_accuracy": 0.8825426653027535, "num_tokens": 39946284.0, "step": 2620 }, { "entropy": 0.3796717070043087, "epoch": 1.0190235853634864, "grad_norm": 1.8203125, "learning_rate": 7.691013029144535e-06, "loss": 0.4171717643737793, "mean_token_accuracy": 0.8788355842232705, "num_tokens": 40017489.0, "step": 2625 }, { "entropy": 0.35411719866096975, "epoch": 1.020964767543434, "grad_norm": 1.265625, "learning_rate": 7.682176474372613e-06, "loss": 0.36679236888885497, "mean_token_accuracy": 0.8839921057224274, "num_tokens": 40091956.0, "step": 2630 }, { "entropy": 0.4047669190913439, "epoch": 1.0229059497233814, "grad_norm": 1.78125, "learning_rate": 7.673328141702708e-06, "loss": 0.42531418800354004, "mean_token_accuracy": 0.8716289401054382, "num_tokens": 40174273.0, "step": 2635 }, { "entropy": 0.3872047744691372, "epoch": 1.0248471319033292, "grad_norm": 1.453125, "learning_rate": 7.664468069989363e-06, "loss": 0.39284777641296387, "mean_token_accuracy": 0.8739194989204406, "num_tokens": 40259401.0, "step": 2640 }, { "entropy": 0.39128339402377604, "epoch": 1.0267883140832768, "grad_norm": 1.8515625, "learning_rate": 7.655596298138683e-06, "loss": 0.3992388963699341, "mean_token_accuracy": 0.8772461920976639, "num_tokens": 40326313.0, "step": 2645 }, { "entropy": 0.36866898983716967, "epoch": 1.0287294962632243, "grad_norm": 1.65625, "learning_rate": 7.646712865108143e-06, "loss": 0.376071572303772, "mean_token_accuracy": 0.885202445089817, "num_tokens": 40402194.0, "step": 2650 }, { "entropy": 0.3736342485994101, "epoch": 1.0306706784431718, "grad_norm": 1.59375, "learning_rate": 7.637817809906422e-06, "loss": 0.38311469554901123, "mean_token_accuracy": 0.8761677891016006, "num_tokens": 40482708.0, "step": 2655 }, { "entropy": 0.36633356250822546, "epoch": 1.0326118606231194, "grad_norm": 1.8671875, "learning_rate": 7.628911171593236e-06, "loss": 0.39987525939941404, "mean_token_accuracy": 0.8770500838756561, "num_tokens": 40550320.0, "step": 2660 }, { "entropy": 0.3814588252454996, "epoch": 1.0345530428030671, "grad_norm": 1.8203125, "learning_rate": 7.6199929892791666e-06, "loss": 0.42825708389282224, "mean_token_accuracy": 0.8766391202807426, "num_tokens": 40621121.0, "step": 2665 }, { "entropy": 0.3263087157160044, "epoch": 1.0364942249830147, "grad_norm": 1.8046875, "learning_rate": 7.611063302125485e-06, "loss": 0.3370352745056152, "mean_token_accuracy": 0.892676542699337, "num_tokens": 40675017.0, "step": 2670 }, { "entropy": 0.35688314363360407, "epoch": 1.0384354071629622, "grad_norm": 1.59375, "learning_rate": 7.602122149343982e-06, "loss": 0.37260828018188474, "mean_token_accuracy": 0.8857112199068069, "num_tokens": 40735182.0, "step": 2675 }, { "entropy": 0.3657310428097844, "epoch": 1.0403765893429098, "grad_norm": 1.4296875, "learning_rate": 7.593169570196798e-06, "loss": 0.38663344383239745, "mean_token_accuracy": 0.8788129478693009, "num_tokens": 40812276.0, "step": 2680 }, { "entropy": 0.39327623806893824, "epoch": 1.0423177715228573, "grad_norm": 2.078125, "learning_rate": 7.5842056039962465e-06, "loss": 0.40496459007263186, "mean_token_accuracy": 0.8742495253682137, "num_tokens": 40883935.0, "step": 2685 }, { "entropy": 0.37283147126436234, "epoch": 1.044258953702805, "grad_norm": 1.5859375, "learning_rate": 7.575230290104643e-06, "loss": 0.38010687828063966, "mean_token_accuracy": 0.8804807871580124, "num_tokens": 40965277.0, "step": 2690 }, { "entropy": 0.33802379108965397, "epoch": 1.0462001358827526, "grad_norm": 1.5078125, "learning_rate": 7.566243667934132e-06, "loss": 0.34528648853302, "mean_token_accuracy": 0.8925616145133972, "num_tokens": 41036874.0, "step": 2695 }, { "entropy": 0.34530715458095074, "epoch": 1.0481413180627002, "grad_norm": 1.4375, "learning_rate": 7.557245776946522e-06, "loss": 0.3618237257003784, "mean_token_accuracy": 0.8869366824626923, "num_tokens": 41123295.0, "step": 2700 }, { "entropy": 0.3728719219565392, "epoch": 1.0500825002426477, "grad_norm": 1.53125, "learning_rate": 7.548236656653095e-06, "loss": 0.3764779567718506, "mean_token_accuracy": 0.8791951701045037, "num_tokens": 41206755.0, "step": 2705 }, { "entropy": 0.36930376179516317, "epoch": 1.0520236824225955, "grad_norm": 1.3671875, "learning_rate": 7.539216346614448e-06, "loss": 0.3768768310546875, "mean_token_accuracy": 0.8802413672208786, "num_tokens": 41295129.0, "step": 2710 }, { "entropy": 0.3499265480786562, "epoch": 1.053964864602543, "grad_norm": 2.046875, "learning_rate": 7.530184886440312e-06, "loss": 0.3675286293029785, "mean_token_accuracy": 0.889797542989254, "num_tokens": 41374363.0, "step": 2715 }, { "entropy": 0.3654515855014324, "epoch": 1.0559060467824906, "grad_norm": 1.53125, "learning_rate": 7.521142315789382e-06, "loss": 0.3843737840652466, "mean_token_accuracy": 0.8779830664396286, "num_tokens": 41452026.0, "step": 2720 }, { "entropy": 0.36771729625761507, "epoch": 1.057847228962438, "grad_norm": 2.03125, "learning_rate": 7.512088674369143e-06, "loss": 0.3673874378204346, "mean_token_accuracy": 0.8848532065749168, "num_tokens": 41516536.0, "step": 2725 }, { "entropy": 0.4460260573774576, "epoch": 1.0597884111423856, "grad_norm": 1.546875, "learning_rate": 7.503024001935686e-06, "loss": 0.45882291793823243, "mean_token_accuracy": 0.8644292831420899, "num_tokens": 41595307.0, "step": 2730 }, { "entropy": 0.34162113182246684, "epoch": 1.0617295933223332, "grad_norm": 1.375, "learning_rate": 7.493948338293549e-06, "loss": 0.35989553928375245, "mean_token_accuracy": 0.8857067421078682, "num_tokens": 41675093.0, "step": 2735 }, { "entropy": 0.34600385688245294, "epoch": 1.063670775502281, "grad_norm": 1.453125, "learning_rate": 7.4848617232955275e-06, "loss": 0.36208953857421877, "mean_token_accuracy": 0.8861552521586418, "num_tokens": 41751969.0, "step": 2740 }, { "entropy": 0.34471417032182217, "epoch": 1.0656119576822285, "grad_norm": 1.453125, "learning_rate": 7.475764196842516e-06, "loss": 0.3590202331542969, "mean_token_accuracy": 0.8894360795617103, "num_tokens": 41826333.0, "step": 2745 }, { "entropy": 0.35084208101034164, "epoch": 1.067553139862176, "grad_norm": 2.15625, "learning_rate": 7.466655798883313e-06, "loss": 0.3687446117401123, "mean_token_accuracy": 0.8872736170887947, "num_tokens": 41908430.0, "step": 2750 }, { "entropy": 0.3609664674848318, "epoch": 1.0694943220421236, "grad_norm": 2.171875, "learning_rate": 7.457536569414459e-06, "loss": 0.3871330738067627, "mean_token_accuracy": 0.8852574542164803, "num_tokens": 41985011.0, "step": 2755 }, { "entropy": 0.34639161452651024, "epoch": 1.0714355042220713, "grad_norm": 1.4375, "learning_rate": 7.448406548480063e-06, "loss": 0.3695810794830322, "mean_token_accuracy": 0.8897538051009178, "num_tokens": 42048911.0, "step": 2760 }, { "entropy": 0.32954322583973406, "epoch": 1.0733766864020189, "grad_norm": 1.5, "learning_rate": 7.439265776171611e-06, "loss": 0.3176077365875244, "mean_token_accuracy": 0.8935502767562866, "num_tokens": 42120003.0, "step": 2765 }, { "entropy": 0.37422714903950693, "epoch": 1.0753178685819664, "grad_norm": 1.59375, "learning_rate": 7.430114292627808e-06, "loss": 0.350958251953125, "mean_token_accuracy": 0.8825857222080231, "num_tokens": 42189503.0, "step": 2770 }, { "entropy": 0.35138509757816794, "epoch": 1.077259050761914, "grad_norm": 1.9921875, "learning_rate": 7.420952138034392e-06, "loss": 0.3909478187561035, "mean_token_accuracy": 0.8854555234313011, "num_tokens": 42251724.0, "step": 2775 }, { "entropy": 0.3574231918901205, "epoch": 1.0792002329418615, "grad_norm": 1.3125, "learning_rate": 7.411779352623958e-06, "loss": 0.36853466033935545, "mean_token_accuracy": 0.8846323460340499, "num_tokens": 42328165.0, "step": 2780 }, { "entropy": 0.3312282390892506, "epoch": 1.0811414151218093, "grad_norm": 1.265625, "learning_rate": 7.402595976675785e-06, "loss": 0.34425904750823977, "mean_token_accuracy": 0.8917144045233727, "num_tokens": 42416101.0, "step": 2785 }, { "entropy": 0.38315938860177995, "epoch": 1.0830825973017568, "grad_norm": 1.4375, "learning_rate": 7.393402050515652e-06, "loss": 0.41192307472229006, "mean_token_accuracy": 0.8757176354527474, "num_tokens": 42490192.0, "step": 2790 }, { "entropy": 0.3734312802553177, "epoch": 1.0850237794817044, "grad_norm": 1.6484375, "learning_rate": 7.384197614515672e-06, "loss": 0.3864989519119263, "mean_token_accuracy": 0.8778089836239815, "num_tokens": 42579078.0, "step": 2795 }, { "entropy": 0.3602697692811489, "epoch": 1.086964961661652, "grad_norm": 1.515625, "learning_rate": 7.3749827090941074e-06, "loss": 0.4144554615020752, "mean_token_accuracy": 0.8825942382216454, "num_tokens": 42656738.0, "step": 2800 }, { "epoch": 1.086964961661652, "eval_entropy": 0.3603161519775561, "eval_loss": 0.3686440587043762, "eval_mean_token_accuracy": 0.8834110738205987, "eval_num_tokens": 42656738.0, "eval_runtime": 60.0795, "eval_samples_per_second": 35.769, "eval_steps_per_second": 35.769, "step": 2800 }, { "entropy": 0.37382765375077726, "epoch": 1.0889061438415994, "grad_norm": 1.8046875, "learning_rate": 7.365757374715188e-06, "loss": 0.4022432804107666, "mean_token_accuracy": 0.878273893892765, "num_tokens": 42727874.0, "step": 2805 }, { "entropy": 0.3878149565309286, "epoch": 1.0908473260215472, "grad_norm": 1.453125, "learning_rate": 7.356521651888946e-06, "loss": 0.4143357276916504, "mean_token_accuracy": 0.8759766072034836, "num_tokens": 42805952.0, "step": 2810 }, { "entropy": 0.3720662288367748, "epoch": 1.0927885082014948, "grad_norm": 1.765625, "learning_rate": 7.347275581171027e-06, "loss": 0.3936682939529419, "mean_token_accuracy": 0.8800241187214851, "num_tokens": 42879670.0, "step": 2815 }, { "entropy": 0.3727020751684904, "epoch": 1.0947296903814423, "grad_norm": 1.53125, "learning_rate": 7.338019203162516e-06, "loss": 0.40426788330078123, "mean_token_accuracy": 0.8778004497289658, "num_tokens": 42966404.0, "step": 2820 }, { "entropy": 0.37748123742640016, "epoch": 1.0966708725613898, "grad_norm": 1.6875, "learning_rate": 7.3287525585097615e-06, "loss": 0.3956634044647217, "mean_token_accuracy": 0.877564987540245, "num_tokens": 43043314.0, "step": 2825 }, { "entropy": 0.3898195032030344, "epoch": 1.0986120547413374, "grad_norm": 2.390625, "learning_rate": 7.319475687904193e-06, "loss": 0.39679808616638185, "mean_token_accuracy": 0.8783272713422775, "num_tokens": 43108948.0, "step": 2830 }, { "entropy": 0.3532901670783758, "epoch": 1.1005532369212851, "grad_norm": 1.5859375, "learning_rate": 7.310188632082145e-06, "loss": 0.3547484874725342, "mean_token_accuracy": 0.8868882149457932, "num_tokens": 43182120.0, "step": 2835 }, { "entropy": 0.38809507302939894, "epoch": 1.1024944191012327, "grad_norm": 2.046875, "learning_rate": 7.300891431824673e-06, "loss": 0.4074056148529053, "mean_token_accuracy": 0.8722813636064529, "num_tokens": 43263374.0, "step": 2840 }, { "entropy": 0.35386649817228316, "epoch": 1.1044356012811802, "grad_norm": 1.265625, "learning_rate": 7.291584127957384e-06, "loss": 0.3566242456436157, "mean_token_accuracy": 0.8883859798312187, "num_tokens": 43334896.0, "step": 2845 }, { "entropy": 0.37594650611281394, "epoch": 1.1063767834611278, "grad_norm": 1.6875, "learning_rate": 7.282266761350249e-06, "loss": 0.3671935319900513, "mean_token_accuracy": 0.8861946225166321, "num_tokens": 43395764.0, "step": 2850 }, { "entropy": 0.37138679772615435, "epoch": 1.1083179656410753, "grad_norm": 1.3359375, "learning_rate": 7.272939372917427e-06, "loss": 0.3758493185043335, "mean_token_accuracy": 0.8805965319275856, "num_tokens": 43483273.0, "step": 2855 }, { "entropy": 0.3592002343386412, "epoch": 1.110259147821023, "grad_norm": 1.609375, "learning_rate": 7.263602003617083e-06, "loss": 0.36438665390014646, "mean_token_accuracy": 0.8856978788971901, "num_tokens": 43554609.0, "step": 2860 }, { "entropy": 0.3399000741541386, "epoch": 1.1122003300009706, "grad_norm": 1.5625, "learning_rate": 7.2542546944512106e-06, "loss": 0.3749422550201416, "mean_token_accuracy": 0.8887553334236145, "num_tokens": 43626500.0, "step": 2865 }, { "entropy": 0.40647769123315813, "epoch": 1.1141415121809182, "grad_norm": 1.9453125, "learning_rate": 7.244897486465451e-06, "loss": 0.43062515258789064, "mean_token_accuracy": 0.8718539297580719, "num_tokens": 43696284.0, "step": 2870 }, { "entropy": 0.3513967592269182, "epoch": 1.1160826943608657, "grad_norm": 1.3984375, "learning_rate": 7.2355304207489154e-06, "loss": 0.35802536010742186, "mean_token_accuracy": 0.88879015147686, "num_tokens": 43768064.0, "step": 2875 }, { "entropy": 0.3477201282978058, "epoch": 1.1180238765408133, "grad_norm": 1.828125, "learning_rate": 7.226153538433996e-06, "loss": 0.37060644626617434, "mean_token_accuracy": 0.8868695870041847, "num_tokens": 43841997.0, "step": 2880 }, { "entropy": 0.3926592905074358, "epoch": 1.119965058720761, "grad_norm": 1.84375, "learning_rate": 7.216766880696199e-06, "loss": 0.4085033893585205, "mean_token_accuracy": 0.8761053428053855, "num_tokens": 43920068.0, "step": 2885 }, { "entropy": 0.34527620263397696, "epoch": 1.1219062409007086, "grad_norm": 1.640625, "learning_rate": 7.207370488753949e-06, "loss": 0.35770795345306394, "mean_token_accuracy": 0.8901533395051956, "num_tokens": 43989892.0, "step": 2890 }, { "entropy": 0.3740640126168728, "epoch": 1.123847423080656, "grad_norm": 1.375, "learning_rate": 7.197964403868421e-06, "loss": 0.39128780364990234, "mean_token_accuracy": 0.8786321595311165, "num_tokens": 44067873.0, "step": 2895 }, { "entropy": 0.3401097748428583, "epoch": 1.1257886052606036, "grad_norm": 1.6875, "learning_rate": 7.188548667343347e-06, "loss": 0.357515287399292, "mean_token_accuracy": 0.8893807768821717, "num_tokens": 44142522.0, "step": 2900 }, { "entropy": 0.39471787922084334, "epoch": 1.1277297874405514, "grad_norm": 1.6328125, "learning_rate": 7.179123320524848e-06, "loss": 0.3968302488327026, "mean_token_accuracy": 0.8748754128813744, "num_tokens": 44228212.0, "step": 2905 }, { "entropy": 0.4300340283662081, "epoch": 1.129670969620499, "grad_norm": 1.5546875, "learning_rate": 7.169688404801241e-06, "loss": 0.4560871124267578, "mean_token_accuracy": 0.8694811254739762, "num_tokens": 44294114.0, "step": 2910 }, { "entropy": 0.3595341399312019, "epoch": 1.1316121518004465, "grad_norm": 2.078125, "learning_rate": 7.160243961602863e-06, "loss": 0.3778635025024414, "mean_token_accuracy": 0.8835319861769676, "num_tokens": 44355840.0, "step": 2915 }, { "entropy": 0.42457632496953013, "epoch": 1.133553333980394, "grad_norm": 2.375, "learning_rate": 7.150790032401887e-06, "loss": 0.4247872829437256, "mean_token_accuracy": 0.8706662476062774, "num_tokens": 44426127.0, "step": 2920 }, { "entropy": 0.4208326905965805, "epoch": 1.1354945161603416, "grad_norm": 1.5703125, "learning_rate": 7.1413266587121434e-06, "loss": 0.42088823318481444, "mean_token_accuracy": 0.871395905315876, "num_tokens": 44497833.0, "step": 2925 }, { "entropy": 0.356390430778265, "epoch": 1.1374356983402891, "grad_norm": 1.4765625, "learning_rate": 7.13185388208893e-06, "loss": 0.37389678955078126, "mean_token_accuracy": 0.8866265177726745, "num_tokens": 44582547.0, "step": 2930 }, { "entropy": 0.3736116912215948, "epoch": 1.139376880520237, "grad_norm": 1.828125, "learning_rate": 7.122371744128839e-06, "loss": 0.3963154792785645, "mean_token_accuracy": 0.8842655003070832, "num_tokens": 44655812.0, "step": 2935 }, { "entropy": 0.43113922215998174, "epoch": 1.1413180627001844, "grad_norm": 1.6328125, "learning_rate": 7.112880286469568e-06, "loss": 0.42786569595336915, "mean_token_accuracy": 0.8678357198834419, "num_tokens": 44730175.0, "step": 2940 }, { "entropy": 0.34794704206287863, "epoch": 1.143259244880132, "grad_norm": 1.4453125, "learning_rate": 7.103379550789741e-06, "loss": 0.35416512489318847, "mean_token_accuracy": 0.8896363779902459, "num_tokens": 44794755.0, "step": 2945 }, { "entropy": 0.40246716812253, "epoch": 1.1452004270600795, "grad_norm": 1.5234375, "learning_rate": 7.093869578808719e-06, "loss": 0.41913704872131347, "mean_token_accuracy": 0.868728120625019, "num_tokens": 44866536.0, "step": 2950 }, { "entropy": 0.3885481279343367, "epoch": 1.1471416092400273, "grad_norm": 1.8671875, "learning_rate": 7.084350412286424e-06, "loss": 0.40956454277038573, "mean_token_accuracy": 0.8750770896673202, "num_tokens": 44942021.0, "step": 2955 }, { "entropy": 0.3983582962304354, "epoch": 1.1490827914199748, "grad_norm": 1.578125, "learning_rate": 7.074822093023154e-06, "loss": 0.4057170391082764, "mean_token_accuracy": 0.8758019611239434, "num_tokens": 45016428.0, "step": 2960 }, { "entropy": 0.3897486738860607, "epoch": 1.1510239735999224, "grad_norm": 1.921875, "learning_rate": 7.065284662859395e-06, "loss": 0.4297188282012939, "mean_token_accuracy": 0.8763071224093437, "num_tokens": 45082753.0, "step": 2965 }, { "entropy": 0.3621146373450756, "epoch": 1.15296515577987, "grad_norm": 1.6796875, "learning_rate": 7.055738163675645e-06, "loss": 0.35830867290496826, "mean_token_accuracy": 0.8859908595681191, "num_tokens": 45147776.0, "step": 2970 }, { "entropy": 0.40969758927822114, "epoch": 1.1549063379598175, "grad_norm": 1.5234375, "learning_rate": 7.046182637392221e-06, "loss": 0.3900305271148682, "mean_token_accuracy": 0.8708938717842102, "num_tokens": 45223891.0, "step": 2975 }, { "entropy": 0.37424799539148806, "epoch": 1.156847520139765, "grad_norm": 1.828125, "learning_rate": 7.036618125969081e-06, "loss": 0.3869047164916992, "mean_token_accuracy": 0.8798339098691941, "num_tokens": 45294331.0, "step": 2980 }, { "entropy": 0.38800337798893453, "epoch": 1.1587887023197128, "grad_norm": 1.90625, "learning_rate": 7.027044671405643e-06, "loss": 0.3859901428222656, "mean_token_accuracy": 0.8810177177190781, "num_tokens": 45348349.0, "step": 2985 }, { "entropy": 0.3485205162316561, "epoch": 1.1607298844996603, "grad_norm": 1.4296875, "learning_rate": 7.017462315740586e-06, "loss": 0.3649015188217163, "mean_token_accuracy": 0.8860207587480545, "num_tokens": 45425964.0, "step": 2990 }, { "entropy": 0.37424386143684385, "epoch": 1.1626710666796078, "grad_norm": 1.6484375, "learning_rate": 7.007871101051686e-06, "loss": 0.3772335767745972, "mean_token_accuracy": 0.8799161404371262, "num_tokens": 45500293.0, "step": 2995 }, { "entropy": 0.38352062441408635, "epoch": 1.1646122488595554, "grad_norm": 1.9296875, "learning_rate": 6.998271069455612e-06, "loss": 0.40156922340393064, "mean_token_accuracy": 0.8821331828832626, "num_tokens": 45571727.0, "step": 3000 }, { "epoch": 1.1646122488595554, "eval_entropy": 0.3600764439036925, "eval_loss": 0.36817678809165955, "eval_mean_token_accuracy": 0.883426463426463, "eval_num_tokens": 45571727.0, "eval_runtime": 60.1453, "eval_samples_per_second": 35.73, "eval_steps_per_second": 35.73, "step": 3000 }, { "entropy": 0.4300002858042717, "epoch": 1.1665534310395032, "grad_norm": 1.4296875, "learning_rate": 6.988662263107755e-06, "loss": 0.4532319068908691, "mean_token_accuracy": 0.8672842562198639, "num_tokens": 45648648.0, "step": 3005 }, { "entropy": 0.3812822367995977, "epoch": 1.1684946132194507, "grad_norm": 1.640625, "learning_rate": 6.979044724202034e-06, "loss": 0.39993724822998045, "mean_token_accuracy": 0.8782149285078049, "num_tokens": 45743015.0, "step": 3010 }, { "entropy": 0.41226282604038716, "epoch": 1.1704357953993982, "grad_norm": 1.5234375, "learning_rate": 6.969418494970717e-06, "loss": 0.4353823661804199, "mean_token_accuracy": 0.8674470081925392, "num_tokens": 45826008.0, "step": 3015 }, { "entropy": 0.37322292029857634, "epoch": 1.1723769775793458, "grad_norm": 1.7265625, "learning_rate": 6.9597836176842315e-06, "loss": 0.4075223445892334, "mean_token_accuracy": 0.8766345664858818, "num_tokens": 45907989.0, "step": 3020 }, { "entropy": 0.3779715023934841, "epoch": 1.1743181597592933, "grad_norm": 1.609375, "learning_rate": 6.9501401346509786e-06, "loss": 0.4066688060760498, "mean_token_accuracy": 0.8800197467207909, "num_tokens": 45976593.0, "step": 3025 }, { "entropy": 0.364545364305377, "epoch": 1.176259341939241, "grad_norm": 1.453125, "learning_rate": 6.940488088217152e-06, "loss": 0.37837910652160645, "mean_token_accuracy": 0.8811485067009925, "num_tokens": 46067425.0, "step": 3030 }, { "entropy": 0.34584682770073416, "epoch": 1.1782005241191886, "grad_norm": 1.609375, "learning_rate": 6.930827520766544e-06, "loss": 0.3524082899093628, "mean_token_accuracy": 0.8913029715418815, "num_tokens": 46141435.0, "step": 3035 }, { "entropy": 0.38420435078442094, "epoch": 1.1801417062991362, "grad_norm": 1.7890625, "learning_rate": 6.921158474720368e-06, "loss": 0.3806861400604248, "mean_token_accuracy": 0.8749532103538513, "num_tokens": 46222095.0, "step": 3040 }, { "entropy": 0.376201831176877, "epoch": 1.1820828884790837, "grad_norm": 1.5625, "learning_rate": 6.911480992537072e-06, "loss": 0.4178003311157227, "mean_token_accuracy": 0.8752377212047577, "num_tokens": 46312000.0, "step": 3045 }, { "entropy": 0.38689825385808946, "epoch": 1.1840240706590313, "grad_norm": 2.09375, "learning_rate": 6.901795116712136e-06, "loss": 0.40619282722473143, "mean_token_accuracy": 0.8773537456989289, "num_tokens": 46381015.0, "step": 3050 }, { "entropy": 0.39098729118704795, "epoch": 1.185965252838979, "grad_norm": 1.546875, "learning_rate": 6.892100889777913e-06, "loss": 0.42108306884765623, "mean_token_accuracy": 0.8786390334367752, "num_tokens": 46464894.0, "step": 3055 }, { "entropy": 0.3601615995168686, "epoch": 1.1879064350189266, "grad_norm": 1.5546875, "learning_rate": 6.882398354303416e-06, "loss": 0.3870659351348877, "mean_token_accuracy": 0.8846402570605278, "num_tokens": 46545575.0, "step": 3060 }, { "entropy": 0.3909476988017559, "epoch": 1.189847617198874, "grad_norm": 2.125, "learning_rate": 6.872687552894145e-06, "loss": 0.3942322969436646, "mean_token_accuracy": 0.8762227043509483, "num_tokens": 46620397.0, "step": 3065 }, { "entropy": 0.36160071194171906, "epoch": 1.1917887993788217, "grad_norm": 1.484375, "learning_rate": 6.8629685281919025e-06, "loss": 0.35771043300628663, "mean_token_accuracy": 0.8830681905150414, "num_tokens": 46695823.0, "step": 3070 }, { "entropy": 0.40609239749610426, "epoch": 1.1937299815587692, "grad_norm": 1.4453125, "learning_rate": 6.853241322874593e-06, "loss": 0.40566306114196776, "mean_token_accuracy": 0.8745755672454834, "num_tokens": 46763659.0, "step": 3075 }, { "entropy": 0.39826103691011666, "epoch": 1.195671163738717, "grad_norm": 1.8046875, "learning_rate": 6.843505979656049e-06, "loss": 0.42182149887084963, "mean_token_accuracy": 0.878571617603302, "num_tokens": 46827063.0, "step": 3080 }, { "entropy": 0.3527070388197899, "epoch": 1.1976123459186645, "grad_norm": 1.7421875, "learning_rate": 6.8337625412858364e-06, "loss": 0.3918677806854248, "mean_token_accuracy": 0.8872169196605683, "num_tokens": 46902371.0, "step": 3085 }, { "entropy": 0.360038623213768, "epoch": 1.199553528098612, "grad_norm": 1.4140625, "learning_rate": 6.824011050549067e-06, "loss": 0.36493072509765623, "mean_token_accuracy": 0.8841731250286102, "num_tokens": 46982779.0, "step": 3090 }, { "entropy": 0.372321966663003, "epoch": 1.2014947102785596, "grad_norm": 1.1796875, "learning_rate": 6.814251550266216e-06, "loss": 0.39631216526031493, "mean_token_accuracy": 0.8795213535428047, "num_tokens": 47072025.0, "step": 3095 }, { "entropy": 0.44918127730488777, "epoch": 1.2034358924585074, "grad_norm": 1.5, "learning_rate": 6.8044840832929216e-06, "loss": 0.4901744365692139, "mean_token_accuracy": 0.8591332510113716, "num_tokens": 47134711.0, "step": 3100 }, { "entropy": 0.36903586611151695, "epoch": 1.205377074638455, "grad_norm": 1.734375, "learning_rate": 6.794708692519815e-06, "loss": 0.36009137630462645, "mean_token_accuracy": 0.8829508319497108, "num_tokens": 47211803.0, "step": 3105 }, { "entropy": 0.4135138522833586, "epoch": 1.2073182568184024, "grad_norm": 1.5703125, "learning_rate": 6.784925420872315e-06, "loss": 0.4357631683349609, "mean_token_accuracy": 0.8690931290388108, "num_tokens": 47289477.0, "step": 3110 }, { "entropy": 0.3864825196564198, "epoch": 1.20925943899835, "grad_norm": 1.5078125, "learning_rate": 6.775134311310449e-06, "loss": 0.3875833034515381, "mean_token_accuracy": 0.8817471221089364, "num_tokens": 47361495.0, "step": 3115 }, { "entropy": 0.3780834227800369, "epoch": 1.2112006211782975, "grad_norm": 2.578125, "learning_rate": 6.765335406828664e-06, "loss": 0.4267258167266846, "mean_token_accuracy": 0.8809913843870163, "num_tokens": 47423556.0, "step": 3120 }, { "entropy": 0.36974840685725213, "epoch": 1.213141803358245, "grad_norm": 1.75, "learning_rate": 6.755528750455634e-06, "loss": 0.36568589210510255, "mean_token_accuracy": 0.8837969750165939, "num_tokens": 47502054.0, "step": 3125 }, { "entropy": 0.40455227382481096, "epoch": 1.2150829855381928, "grad_norm": 1.9765625, "learning_rate": 6.745714385254072e-06, "loss": 0.4230593204498291, "mean_token_accuracy": 0.8713468372821808, "num_tokens": 47576658.0, "step": 3130 }, { "entropy": 0.3805558536201715, "epoch": 1.2170241677181404, "grad_norm": 1.75, "learning_rate": 6.735892354320544e-06, "loss": 0.38716301918029783, "mean_token_accuracy": 0.8806325614452362, "num_tokens": 47646232.0, "step": 3135 }, { "entropy": 0.36968096643686293, "epoch": 1.218965349898088, "grad_norm": 1.3125, "learning_rate": 6.726062700785273e-06, "loss": 0.39945073127746583, "mean_token_accuracy": 0.8774180024862289, "num_tokens": 47741132.0, "step": 3140 }, { "entropy": 0.35996747594326733, "epoch": 1.2209065320780355, "grad_norm": 1.8046875, "learning_rate": 6.716225467811961e-06, "loss": 0.37158637046813964, "mean_token_accuracy": 0.8840801179409027, "num_tokens": 47812661.0, "step": 3145 }, { "entropy": 0.37774690724909304, "epoch": 1.2228477142579832, "grad_norm": 1.796875, "learning_rate": 6.706380698597588e-06, "loss": 0.3942166805267334, "mean_token_accuracy": 0.8794585153460502, "num_tokens": 47883300.0, "step": 3150 }, { "entropy": 0.4131374925374985, "epoch": 1.2247888964379308, "grad_norm": 1.6875, "learning_rate": 6.696528436372229e-06, "loss": 0.4139698505401611, "mean_token_accuracy": 0.869027565419674, "num_tokens": 47979410.0, "step": 3155 }, { "entropy": 0.3713895071297884, "epoch": 1.2267300786178783, "grad_norm": 2.390625, "learning_rate": 6.68666872439886e-06, "loss": 0.3731879472732544, "mean_token_accuracy": 0.880556121468544, "num_tokens": 48058170.0, "step": 3160 }, { "entropy": 0.3848850384354591, "epoch": 1.2286712607978258, "grad_norm": 1.8125, "learning_rate": 6.67680160597317e-06, "loss": 0.40471110343933103, "mean_token_accuracy": 0.8750801667571068, "num_tokens": 48127200.0, "step": 3165 }, { "entropy": 0.3463645543903112, "epoch": 1.2306124429777734, "grad_norm": 1.6171875, "learning_rate": 6.666927124423374e-06, "loss": 0.3593963623046875, "mean_token_accuracy": 0.8887131616473198, "num_tokens": 48190947.0, "step": 3170 }, { "entropy": 0.3704676777124405, "epoch": 1.232553625157721, "grad_norm": 1.7265625, "learning_rate": 6.657045323110017e-06, "loss": 0.3847299337387085, "mean_token_accuracy": 0.8824770480394364, "num_tokens": 48268615.0, "step": 3175 }, { "entropy": 0.3637780986726284, "epoch": 1.2344948073376687, "grad_norm": 1.59375, "learning_rate": 6.647156245425789e-06, "loss": 0.3874013423919678, "mean_token_accuracy": 0.8841297894716262, "num_tokens": 48336348.0, "step": 3180 }, { "entropy": 0.35034383423626425, "epoch": 1.2364359895176162, "grad_norm": 1.4765625, "learning_rate": 6.637259934795328e-06, "loss": 0.34986927509307864, "mean_token_accuracy": 0.8901937618851662, "num_tokens": 48406226.0, "step": 3185 }, { "entropy": 0.403315170109272, "epoch": 1.2383771716975638, "grad_norm": 1.2421875, "learning_rate": 6.627356434675035e-06, "loss": 0.4066962718963623, "mean_token_accuracy": 0.8722446888685227, "num_tokens": 48490073.0, "step": 3190 }, { "entropy": 0.36379750072956085, "epoch": 1.2403183538775113, "grad_norm": 1.3984375, "learning_rate": 6.6174457885528855e-06, "loss": 0.3708995819091797, "mean_token_accuracy": 0.8828730553388595, "num_tokens": 48561708.0, "step": 3195 }, { "entropy": 0.35466758720576763, "epoch": 1.242259536057459, "grad_norm": 1.859375, "learning_rate": 6.607528039948226e-06, "loss": 0.36141531467437743, "mean_token_accuracy": 0.8850849062204361, "num_tokens": 48629826.0, "step": 3200 }, { "epoch": 1.242259536057459, "eval_entropy": 0.3607052234181308, "eval_loss": 0.3680853247642517, "eval_mean_token_accuracy": 0.8834105204598413, "eval_num_tokens": 48629826.0, "eval_runtime": 60.0565, "eval_samples_per_second": 35.783, "eval_steps_per_second": 35.783, "step": 3200 }, { "entropy": 0.33488245457410815, "epoch": 1.2442007182374066, "grad_norm": 1.609375, "learning_rate": 6.597603232411597e-06, "loss": 0.40059671401977537, "mean_token_accuracy": 0.8868094369769096, "num_tokens": 48705277.0, "step": 3205 }, { "entropy": 0.36259912960231305, "epoch": 1.2461419004173542, "grad_norm": 1.7265625, "learning_rate": 6.587671409524534e-06, "loss": 0.36700074672698973, "mean_token_accuracy": 0.8851820915937424, "num_tokens": 48773921.0, "step": 3210 }, { "entropy": 0.3843076877295971, "epoch": 1.2480830825973017, "grad_norm": 1.59375, "learning_rate": 6.577732614899379e-06, "loss": 0.4054192066192627, "mean_token_accuracy": 0.8755981966853141, "num_tokens": 48859799.0, "step": 3215 }, { "entropy": 0.38492829352617264, "epoch": 1.2500242647772493, "grad_norm": 1.6328125, "learning_rate": 6.56778689217909e-06, "loss": 0.39413578510284425, "mean_token_accuracy": 0.8772288784384727, "num_tokens": 48930817.0, "step": 3220 }, { "entropy": 0.3711765740066767, "epoch": 1.2519654469571968, "grad_norm": 1.3515625, "learning_rate": 6.5578342850370415e-06, "loss": 0.37443616390228274, "mean_token_accuracy": 0.8799751400947571, "num_tokens": 49002171.0, "step": 3225 }, { "entropy": 0.3800849601626396, "epoch": 1.2539066291371446, "grad_norm": 1.5234375, "learning_rate": 6.547874837176847e-06, "loss": 0.3951963186264038, "mean_token_accuracy": 0.88048807233572, "num_tokens": 49073741.0, "step": 3230 }, { "entropy": 0.3790770899504423, "epoch": 1.255847811317092, "grad_norm": 1.4765625, "learning_rate": 6.537908592332147e-06, "loss": 0.40506410598754883, "mean_token_accuracy": 0.8772617995738983, "num_tokens": 49148184.0, "step": 3235 }, { "entropy": 0.3464452028274536, "epoch": 1.2577889934970397, "grad_norm": 1.484375, "learning_rate": 6.5279355942664435e-06, "loss": 0.3766259908676147, "mean_token_accuracy": 0.8875595390796661, "num_tokens": 49218339.0, "step": 3240 }, { "entropy": 0.3986640240997076, "epoch": 1.2597301756769874, "grad_norm": 1.9921875, "learning_rate": 6.51795588677288e-06, "loss": 0.3935344696044922, "mean_token_accuracy": 0.87731524258852, "num_tokens": 49279139.0, "step": 3245 }, { "entropy": 0.3633753590285778, "epoch": 1.261671357856935, "grad_norm": 1.6953125, "learning_rate": 6.5079695136740706e-06, "loss": 0.3786989688873291, "mean_token_accuracy": 0.8843644946813584, "num_tokens": 49352390.0, "step": 3250 }, { "entropy": 0.38015848845243455, "epoch": 1.2636125400368825, "grad_norm": 1.3203125, "learning_rate": 6.497976518821896e-06, "loss": 0.4066456317901611, "mean_token_accuracy": 0.878155305981636, "num_tokens": 49446216.0, "step": 3255 }, { "entropy": 0.39295368976891043, "epoch": 1.26555372221683, "grad_norm": 1.7578125, "learning_rate": 6.487976946097314e-06, "loss": 0.3828210115432739, "mean_token_accuracy": 0.8781291946768761, "num_tokens": 49523614.0, "step": 3260 }, { "entropy": 0.3769407343119383, "epoch": 1.2674949043967776, "grad_norm": 1.40625, "learning_rate": 6.477970839410166e-06, "loss": 0.40603952407836913, "mean_token_accuracy": 0.8798100754618645, "num_tokens": 49591786.0, "step": 3265 }, { "entropy": 0.3537591304630041, "epoch": 1.2694360865767251, "grad_norm": 1.25, "learning_rate": 6.46795824269899e-06, "loss": 0.3576634407043457, "mean_token_accuracy": 0.8874867498874665, "num_tokens": 49663637.0, "step": 3270 }, { "entropy": 0.35550354048609734, "epoch": 1.271377268756673, "grad_norm": 1.8359375, "learning_rate": 6.457939199930815e-06, "loss": 0.39648468494415284, "mean_token_accuracy": 0.8848752856254578, "num_tokens": 49731508.0, "step": 3275 }, { "entropy": 0.3586549339815974, "epoch": 1.2733184509366204, "grad_norm": 1.9296875, "learning_rate": 6.4479137551009855e-06, "loss": 0.3832548141479492, "mean_token_accuracy": 0.8830386832356453, "num_tokens": 49813544.0, "step": 3280 }, { "entropy": 0.3636878037825227, "epoch": 1.275259633116568, "grad_norm": 1.640625, "learning_rate": 6.437881952232947e-06, "loss": 0.3801161766052246, "mean_token_accuracy": 0.8825449839234352, "num_tokens": 49885620.0, "step": 3285 }, { "entropy": 0.3816155593842268, "epoch": 1.2772008152965155, "grad_norm": 1.8359375, "learning_rate": 6.427843835378074e-06, "loss": 0.3867227554321289, "mean_token_accuracy": 0.87795270383358, "num_tokens": 49964261.0, "step": 3290 }, { "entropy": 0.33171985633671286, "epoch": 1.2791419974764633, "grad_norm": 1.7734375, "learning_rate": 6.417799448615465e-06, "loss": 0.3791377544403076, "mean_token_accuracy": 0.8870480135083199, "num_tokens": 50051116.0, "step": 3295 }, { "entropy": 0.33659284114837645, "epoch": 1.2810831796564108, "grad_norm": 1.59375, "learning_rate": 6.407748836051746e-06, "loss": 0.35617640018463137, "mean_token_accuracy": 0.8889725834131241, "num_tokens": 50125047.0, "step": 3300 }, { "entropy": 0.3994648285210133, "epoch": 1.2830243618363584, "grad_norm": 2.0, "learning_rate": 6.397692041820885e-06, "loss": 0.37363758087158205, "mean_token_accuracy": 0.8738816857337952, "num_tokens": 50185527.0, "step": 3305 }, { "entropy": 0.3728124268352985, "epoch": 1.284965544016306, "grad_norm": 1.3984375, "learning_rate": 6.387629110083995e-06, "loss": 0.37665843963623047, "mean_token_accuracy": 0.8807444587349892, "num_tokens": 50267257.0, "step": 3310 }, { "entropy": 0.39132245220243933, "epoch": 1.2869067261962535, "grad_norm": 1.6015625, "learning_rate": 6.377560085029139e-06, "loss": 0.3918001651763916, "mean_token_accuracy": 0.8778573974967003, "num_tokens": 50339377.0, "step": 3315 }, { "entropy": 0.3549855757504702, "epoch": 1.288847908376201, "grad_norm": 1.375, "learning_rate": 6.367485010871136e-06, "loss": 0.3473607301712036, "mean_token_accuracy": 0.8883946269750596, "num_tokens": 50412920.0, "step": 3320 }, { "entropy": 0.40118363983929156, "epoch": 1.2907890905561488, "grad_norm": 1.7734375, "learning_rate": 6.35740393185137e-06, "loss": 0.4183527946472168, "mean_token_accuracy": 0.8735328048467637, "num_tokens": 50493105.0, "step": 3325 }, { "entropy": 0.3577863838523626, "epoch": 1.2927302727360963, "grad_norm": 1.21875, "learning_rate": 6.347316892237592e-06, "loss": 0.36974031925201417, "mean_token_accuracy": 0.8823448717594147, "num_tokens": 50597292.0, "step": 3330 }, { "entropy": 0.4042118158191442, "epoch": 1.2946714549160439, "grad_norm": 1.4609375, "learning_rate": 6.3372239363237255e-06, "loss": 0.3996162414550781, "mean_token_accuracy": 0.8742659211158752, "num_tokens": 50669226.0, "step": 3335 }, { "entropy": 0.38877438604831693, "epoch": 1.2966126370959914, "grad_norm": 1.4140625, "learning_rate": 6.327125108429677e-06, "loss": 0.3838073492050171, "mean_token_accuracy": 0.8778223499655724, "num_tokens": 50740937.0, "step": 3340 }, { "entropy": 0.3652618743479252, "epoch": 1.2985538192759392, "grad_norm": 1.609375, "learning_rate": 6.317020452901134e-06, "loss": 0.40174212455749514, "mean_token_accuracy": 0.8852205485105514, "num_tokens": 50800354.0, "step": 3345 }, { "entropy": 0.36313771940767764, "epoch": 1.3004950014558867, "grad_norm": 1.75, "learning_rate": 6.3069100141093755e-06, "loss": 0.40732836723327637, "mean_token_accuracy": 0.8836523965001106, "num_tokens": 50874364.0, "step": 3350 }, { "entropy": 0.3877572625875473, "epoch": 1.3024361836358342, "grad_norm": 1.4609375, "learning_rate": 6.2967938364510794e-06, "loss": 0.3883176326751709, "mean_token_accuracy": 0.8740043297410012, "num_tokens": 50953290.0, "step": 3355 }, { "entropy": 0.3541896607726812, "epoch": 1.3043773658157818, "grad_norm": 1.71875, "learning_rate": 6.2866719643481185e-06, "loss": 0.40380287170410156, "mean_token_accuracy": 0.8862088546156883, "num_tokens": 51013828.0, "step": 3360 }, { "entropy": 0.3846803639084101, "epoch": 1.3063185479957293, "grad_norm": 1.5546875, "learning_rate": 6.2765444422473735e-06, "loss": 0.4024141788482666, "mean_token_accuracy": 0.8768733203411102, "num_tokens": 51088099.0, "step": 3365 }, { "entropy": 0.382393941283226, "epoch": 1.3082597301756769, "grad_norm": 1.5390625, "learning_rate": 6.2664113146205355e-06, "loss": 0.4033693313598633, "mean_token_accuracy": 0.8758635804057121, "num_tokens": 51174151.0, "step": 3370 }, { "entropy": 0.35187431797385216, "epoch": 1.3102009123556246, "grad_norm": 1.53125, "learning_rate": 6.256272625963908e-06, "loss": 0.3925636291503906, "mean_token_accuracy": 0.8831515818834305, "num_tokens": 51253871.0, "step": 3375 }, { "entropy": 0.3575460772961378, "epoch": 1.3121420945355722, "grad_norm": 1.7578125, "learning_rate": 6.24612842079822e-06, "loss": 0.3699699878692627, "mean_token_accuracy": 0.8861239358782769, "num_tokens": 51320927.0, "step": 3380 }, { "entropy": 0.3574929475784302, "epoch": 1.3140832767155197, "grad_norm": 1.40625, "learning_rate": 6.235978743668415e-06, "loss": 0.3928325653076172, "mean_token_accuracy": 0.8840924382209778, "num_tokens": 51393313.0, "step": 3385 }, { "entropy": 0.4037714671343565, "epoch": 1.3160244588954673, "grad_norm": 1.75, "learning_rate": 6.2258236391434735e-06, "loss": 0.43996176719665525, "mean_token_accuracy": 0.8732839792966842, "num_tokens": 51469149.0, "step": 3390 }, { "entropy": 0.3767301281914115, "epoch": 1.317965641075415, "grad_norm": 1.671875, "learning_rate": 6.215663151816204e-06, "loss": 0.41208739280700685, "mean_token_accuracy": 0.8773611128330231, "num_tokens": 51549599.0, "step": 3395 }, { "entropy": 0.41424218341708186, "epoch": 1.3199068232553626, "grad_norm": 1.390625, "learning_rate": 6.205497326303054e-06, "loss": 0.4277363300323486, "mean_token_accuracy": 0.8679974019527436, "num_tokens": 51642096.0, "step": 3400 }, { "epoch": 1.3199068232553626, "eval_entropy": 0.3615535780503174, "eval_loss": 0.36774712800979614, "eval_mean_token_accuracy": 0.8836295662535352, "eval_num_tokens": 51642096.0, "eval_runtime": 60.122, "eval_samples_per_second": 35.744, "eval_steps_per_second": 35.744, "step": 3400 }, { "entropy": 0.3396712843328714, "epoch": 1.3218480054353101, "grad_norm": 2.0, "learning_rate": 6.1953262072439104e-06, "loss": 0.36101136207580564, "mean_token_accuracy": 0.8887990996241569, "num_tokens": 51720729.0, "step": 3405 }, { "entropy": 0.37931633833795786, "epoch": 1.3237891876152577, "grad_norm": 1.5625, "learning_rate": 6.185149839301904e-06, "loss": 0.4054765224456787, "mean_token_accuracy": 0.8779459938406944, "num_tokens": 51794420.0, "step": 3410 }, { "entropy": 0.3843179401010275, "epoch": 1.3257303697952052, "grad_norm": 1.5, "learning_rate": 6.1749682671632185e-06, "loss": 0.40850515365600587, "mean_token_accuracy": 0.8769529685378075, "num_tokens": 51877828.0, "step": 3415 }, { "entropy": 0.3709686040878296, "epoch": 1.3276715519751527, "grad_norm": 1.4609375, "learning_rate": 6.1647815355368845e-06, "loss": 0.38035385608673095, "mean_token_accuracy": 0.8827380672097206, "num_tokens": 51949655.0, "step": 3420 }, { "entropy": 0.3752392638474703, "epoch": 1.3296127341551005, "grad_norm": 1.6328125, "learning_rate": 6.154589689154594e-06, "loss": 0.36831343173980713, "mean_token_accuracy": 0.8784654662013054, "num_tokens": 52034397.0, "step": 3425 }, { "entropy": 0.4212960582226515, "epoch": 1.331553916335048, "grad_norm": 1.921875, "learning_rate": 6.144392772770498e-06, "loss": 0.4582382678985596, "mean_token_accuracy": 0.8687367781996727, "num_tokens": 52108021.0, "step": 3430 }, { "entropy": 0.3721700422465801, "epoch": 1.3334950985149956, "grad_norm": 1.703125, "learning_rate": 6.134190831161004e-06, "loss": 0.39261841773986816, "mean_token_accuracy": 0.8782809346914291, "num_tokens": 52189452.0, "step": 3435 }, { "entropy": 0.41867862418293955, "epoch": 1.3354362806949434, "grad_norm": 1.5703125, "learning_rate": 6.123983909124597e-06, "loss": 0.4275325298309326, "mean_token_accuracy": 0.8698067650198936, "num_tokens": 52269123.0, "step": 3440 }, { "entropy": 0.32477850653231144, "epoch": 1.337377462874891, "grad_norm": 1.65625, "learning_rate": 6.113772051481622e-06, "loss": 0.3294957399368286, "mean_token_accuracy": 0.8947040095925332, "num_tokens": 52329549.0, "step": 3445 }, { "entropy": 0.3995737452059984, "epoch": 1.3393186450548384, "grad_norm": 1.3125, "learning_rate": 6.103555303074105e-06, "loss": 0.42353267669677735, "mean_token_accuracy": 0.8714441776275634, "num_tokens": 52432623.0, "step": 3450 }, { "entropy": 0.3620085157454014, "epoch": 1.341259827234786, "grad_norm": 1.5234375, "learning_rate": 6.093333708765541e-06, "loss": 0.37543137073516847, "mean_token_accuracy": 0.8843591079115868, "num_tokens": 52505394.0, "step": 3455 }, { "entropy": 0.40583874434232714, "epoch": 1.3432010094147335, "grad_norm": 1.9921875, "learning_rate": 6.08310731344071e-06, "loss": 0.40748982429504393, "mean_token_accuracy": 0.8733288407325744, "num_tokens": 52574341.0, "step": 3460 }, { "entropy": 0.37172621488571167, "epoch": 1.345142191594681, "grad_norm": 1.390625, "learning_rate": 6.072876162005474e-06, "loss": 0.3841069221496582, "mean_token_accuracy": 0.8812505900859833, "num_tokens": 52650739.0, "step": 3465 }, { "entropy": 0.36620298847556115, "epoch": 1.3470833737746286, "grad_norm": 1.7265625, "learning_rate": 6.062640299386573e-06, "loss": 0.37988154888153075, "mean_token_accuracy": 0.8840025961399078, "num_tokens": 52730667.0, "step": 3470 }, { "entropy": 0.37304456941783426, "epoch": 1.3490245559545764, "grad_norm": 1.6953125, "learning_rate": 6.052399770531441e-06, "loss": 0.3846965551376343, "mean_token_accuracy": 0.8781289547681809, "num_tokens": 52816922.0, "step": 3475 }, { "entropy": 0.37507508173584936, "epoch": 1.350965738134524, "grad_norm": 1.390625, "learning_rate": 6.042154620408003e-06, "loss": 0.39843082427978516, "mean_token_accuracy": 0.8789984509348869, "num_tokens": 52893991.0, "step": 3480 }, { "entropy": 0.33831214234232904, "epoch": 1.3529069203144715, "grad_norm": 1.890625, "learning_rate": 6.0319048940044715e-06, "loss": 0.35968937873840334, "mean_token_accuracy": 0.8883291095495224, "num_tokens": 52959658.0, "step": 3485 }, { "entropy": 0.39217614494264125, "epoch": 1.3548481024944192, "grad_norm": 1.5234375, "learning_rate": 6.021650636329159e-06, "loss": 0.395078182220459, "mean_token_accuracy": 0.8747363820672035, "num_tokens": 53030794.0, "step": 3490 }, { "entropy": 0.36045850329101087, "epoch": 1.3567892846743668, "grad_norm": 1.2890625, "learning_rate": 6.011391892410272e-06, "loss": 0.39100329875946044, "mean_token_accuracy": 0.8819140180945396, "num_tokens": 53118526.0, "step": 3495 }, { "entropy": 0.393280316144228, "epoch": 1.3587304668543143, "grad_norm": 1.7109375, "learning_rate": 6.0011287072957205e-06, "loss": 0.39736104011535645, "mean_token_accuracy": 0.8757255643606185, "num_tokens": 53188265.0, "step": 3500 } ], "logging_steps": 5, "max_steps": 7728, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.926527421264753e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }