{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 11.0, "eval_steps": 500, "global_step": 47190, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.13037056401371955, "epoch": 0.0011656370206317753, "grad_norm": 5.8125, "learning_rate": 1.230769230769231e-06, "loss": 1.6988, "mean_token_accuracy": 0.7972599864006042, "num_tokens": 20665.0, "step": 5 }, { "entropy": 0.2026868939399719, "epoch": 0.0023312740412635507, "grad_norm": 18.75, "learning_rate": 2.7692307692307697e-06, "loss": 2.2638, "mean_token_accuracy": 0.7449834465980529, "num_tokens": 35523.0, "step": 10 }, { "entropy": 0.1481923870742321, "epoch": 0.003496911061895326, "grad_norm": 11.6875, "learning_rate": 4.307692307692308e-06, "loss": 1.774, "mean_token_accuracy": 0.8239133059978485, "num_tokens": 48005.0, "step": 15 }, { "entropy": 0.14728636629879474, "epoch": 0.004662548082527101, "grad_norm": 29.75, "learning_rate": 5.846153846153847e-06, "loss": 1.534, "mean_token_accuracy": 0.8404697060585022, "num_tokens": 66174.0, "step": 20 }, { "entropy": 0.1347280215471983, "epoch": 0.005828185103158876, "grad_norm": 13.8125, "learning_rate": 7.384615384615386e-06, "loss": 1.4182, "mean_token_accuracy": 0.8635528087615967, "num_tokens": 83447.0, "step": 25 }, { "entropy": 0.1397606860846281, "epoch": 0.006993822123790652, "grad_norm": 12.3125, "learning_rate": 8.923076923076925e-06, "loss": 1.264, "mean_token_accuracy": 0.864665013551712, "num_tokens": 97990.0, "step": 30 }, { "entropy": 0.11616570875048637, "epoch": 0.008159459144422426, "grad_norm": 10.0625, "learning_rate": 1.0461538461538463e-05, "loss": 0.7635, "mean_token_accuracy": 0.8946437478065491, "num_tokens": 127034.0, "step": 35 }, { "entropy": 0.2478984471410513, "epoch": 0.009325096165054203, "grad_norm": 20.75, "learning_rate": 1.2e-05, "loss": 1.3525, "mean_token_accuracy": 0.8422249376773834, "num_tokens": 149561.0, "step": 40 }, { "entropy": 0.23592497650533914, "epoch": 0.010490733185685977, "grad_norm": 2.171875, "learning_rate": 1.353846153846154e-05, "loss": 0.8752, "mean_token_accuracy": 0.8744755148887634, "num_tokens": 180020.0, "step": 45 }, { "entropy": 0.27611723467707633, "epoch": 0.011656370206317752, "grad_norm": 4.28125, "learning_rate": 1.5076923076923078e-05, "loss": 1.0946, "mean_token_accuracy": 0.8549042701721191, "num_tokens": 193617.0, "step": 50 }, { "entropy": 0.24654684234410523, "epoch": 0.012822007226949528, "grad_norm": 5.71875, "learning_rate": 1.6615384615384618e-05, "loss": 0.6624, "mean_token_accuracy": 0.8910752832889557, "num_tokens": 216657.0, "step": 55 }, { "entropy": 0.4486447751522064, "epoch": 0.013987644247581303, "grad_norm": 5.5, "learning_rate": 1.8153846153846155e-05, "loss": 1.2489, "mean_token_accuracy": 0.8384927690029145, "num_tokens": 225069.0, "step": 60 }, { "entropy": 0.33645918890833854, "epoch": 0.015153281268213078, "grad_norm": 2.75, "learning_rate": 1.9692307692307696e-05, "loss": 0.8006, "mean_token_accuracy": 0.8728708684444427, "num_tokens": 239949.0, "step": 65 }, { "entropy": 0.32945496812462804, "epoch": 0.016318918288844853, "grad_norm": 4.28125, "learning_rate": 1.9999999976141594e-05, "loss": 0.7551, "mean_token_accuracy": 0.8722834944725036, "num_tokens": 263424.0, "step": 70 }, { "entropy": 0.22601164653897285, "epoch": 0.01748455530947663, "grad_norm": 0.7734375, "learning_rate": 1.9999999879216817e-05, "loss": 0.3863, "mean_token_accuracy": 0.9277985453605652, "num_tokens": 282642.0, "step": 75 }, { "entropy": 0.3768581360578537, "epoch": 0.018650192330108405, "grad_norm": 2.09375, "learning_rate": 1.9999999707734522e-05, "loss": 0.9283, "mean_token_accuracy": 0.8531839072704315, "num_tokens": 293090.0, "step": 80 }, { "entropy": 0.3327511861920357, "epoch": 0.01981582935074018, "grad_norm": 9.125, "learning_rate": 1.9999999461694714e-05, "loss": 0.9839, "mean_token_accuracy": 0.855659818649292, "num_tokens": 309153.0, "step": 85 }, { "entropy": 0.27368163913488386, "epoch": 0.020981466371371955, "grad_norm": 0.8828125, "learning_rate": 1.9999999141097392e-05, "loss": 0.6371, "mean_token_accuracy": 0.9080254793167114, "num_tokens": 325081.0, "step": 90 }, { "entropy": 0.3098975282162428, "epoch": 0.02214710339200373, "grad_norm": 0.890625, "learning_rate": 1.999999874594256e-05, "loss": 0.5994, "mean_token_accuracy": 0.8943244755268097, "num_tokens": 345901.0, "step": 95 }, { "entropy": 0.3766414072364569, "epoch": 0.023312740412635504, "grad_norm": 2.375, "learning_rate": 1.9999998276230227e-05, "loss": 0.986, "mean_token_accuracy": 0.8288512468338013, "num_tokens": 383154.0, "step": 100 }, { "entropy": 0.32121687904000285, "epoch": 0.024478377433267282, "grad_norm": 1.5078125, "learning_rate": 1.9999997731960398e-05, "loss": 0.4956, "mean_token_accuracy": 0.9077661991119385, "num_tokens": 400284.0, "step": 105 }, { "entropy": 0.31962902620434763, "epoch": 0.025644014453899057, "grad_norm": 3.046875, "learning_rate": 1.9999997113133085e-05, "loss": 0.7156, "mean_token_accuracy": 0.8738201320171356, "num_tokens": 415485.0, "step": 110 }, { "entropy": 0.33004358410835266, "epoch": 0.02680965147453083, "grad_norm": 1.15625, "learning_rate": 1.9999996419748292e-05, "loss": 0.5986, "mean_token_accuracy": 0.9044980466365814, "num_tokens": 447579.0, "step": 115 }, { "entropy": 0.3824078649282455, "epoch": 0.027975288495162606, "grad_norm": 2.984375, "learning_rate": 1.9999995651806034e-05, "loss": 0.9191, "mean_token_accuracy": 0.8670506238937378, "num_tokens": 459128.0, "step": 120 }, { "entropy": 0.4574072495102882, "epoch": 0.02914092551579438, "grad_norm": 5.34375, "learning_rate": 1.9999994809306322e-05, "loss": 1.0716, "mean_token_accuracy": 0.8488256156444549, "num_tokens": 470027.0, "step": 125 }, { "entropy": 0.2947243496775627, "epoch": 0.030306562536426156, "grad_norm": 0.63671875, "learning_rate": 1.9999993892249164e-05, "loss": 0.6576, "mean_token_accuracy": 0.9072929441928863, "num_tokens": 499546.0, "step": 130 }, { "entropy": 0.3057884469628334, "epoch": 0.031472199557057934, "grad_norm": 3.796875, "learning_rate": 1.9999992900634578e-05, "loss": 0.5276, "mean_token_accuracy": 0.8684212446212769, "num_tokens": 536170.0, "step": 135 }, { "entropy": 0.3543548423796892, "epoch": 0.032637836577689705, "grad_norm": 1.1953125, "learning_rate": 1.999999183446258e-05, "loss": 0.6128, "mean_token_accuracy": 0.8815208315849304, "num_tokens": 573314.0, "step": 140 }, { "entropy": 0.38243546336889267, "epoch": 0.03380347359832148, "grad_norm": 0.7578125, "learning_rate": 1.9999990693733178e-05, "loss": 0.8458, "mean_token_accuracy": 0.8730092644691467, "num_tokens": 590469.0, "step": 145 }, { "entropy": 0.3092967767268419, "epoch": 0.03496911061895326, "grad_norm": 6.84375, "learning_rate": 1.9999989478446396e-05, "loss": 0.6054, "mean_token_accuracy": 0.8881650030612945, "num_tokens": 616301.0, "step": 150 }, { "entropy": 0.3355667755007744, "epoch": 0.03613474763958503, "grad_norm": 2.203125, "learning_rate": 1.9999988188602252e-05, "loss": 0.6318, "mean_token_accuracy": 0.8999499917030335, "num_tokens": 634344.0, "step": 155 }, { "entropy": 0.3264051340520382, "epoch": 0.03730038466021681, "grad_norm": 1.9453125, "learning_rate": 1.9999986824200764e-05, "loss": 0.6383, "mean_token_accuracy": 0.894915622472763, "num_tokens": 653730.0, "step": 160 }, { "entropy": 0.2963225370272994, "epoch": 0.03846602168084858, "grad_norm": 0.462890625, "learning_rate": 1.999998538524195e-05, "loss": 0.434, "mean_token_accuracy": 0.8856126844882966, "num_tokens": 682057.0, "step": 165 }, { "entropy": 0.3581529125571251, "epoch": 0.03963165870148036, "grad_norm": 2.046875, "learning_rate": 1.9999983871725833e-05, "loss": 0.5984, "mean_token_accuracy": 0.9039258837699891, "num_tokens": 697306.0, "step": 170 }, { "entropy": 0.3010772816836834, "epoch": 0.04079729572211213, "grad_norm": 1.1640625, "learning_rate": 1.999998228365244e-05, "loss": 0.6009, "mean_token_accuracy": 0.898896187543869, "num_tokens": 713213.0, "step": 175 }, { "entropy": 0.2990272644907236, "epoch": 0.04196293274274391, "grad_norm": 2.46875, "learning_rate": 1.999998062102179e-05, "loss": 0.5517, "mean_token_accuracy": 0.917355763912201, "num_tokens": 741039.0, "step": 180 }, { "entropy": 0.24269667863845826, "epoch": 0.04312856976337569, "grad_norm": 1.6953125, "learning_rate": 1.9999978883833904e-05, "loss": 0.3234, "mean_token_accuracy": 0.9207581281661987, "num_tokens": 767100.0, "step": 185 }, { "entropy": 0.2515395663678646, "epoch": 0.04429420678400746, "grad_norm": 0.416015625, "learning_rate": 1.9999977072088815e-05, "loss": 0.3565, "mean_token_accuracy": 0.9244686782360076, "num_tokens": 799222.0, "step": 190 }, { "entropy": 0.3098013773560524, "epoch": 0.04545984380463924, "grad_norm": 3.265625, "learning_rate": 1.999997518578655e-05, "loss": 0.842, "mean_token_accuracy": 0.8814567267894745, "num_tokens": 810222.0, "step": 195 }, { "entropy": 0.3232828423380852, "epoch": 0.04662548082527101, "grad_norm": 2.203125, "learning_rate": 1.999997322492713e-05, "loss": 0.743, "mean_token_accuracy": 0.8766939163208007, "num_tokens": 826444.0, "step": 200 }, { "entropy": 0.32065615206956866, "epoch": 0.047791117845902786, "grad_norm": 0.75390625, "learning_rate": 1.9999971189510594e-05, "loss": 0.5851, "mean_token_accuracy": 0.9104382634162903, "num_tokens": 855814.0, "step": 205 }, { "entropy": 0.40788267850875853, "epoch": 0.048956754866534564, "grad_norm": 1.7265625, "learning_rate": 1.9999969079536963e-05, "loss": 0.7793, "mean_token_accuracy": 0.8514861643314362, "num_tokens": 894196.0, "step": 210 }, { "entropy": 0.445697608217597, "epoch": 0.050122391887166336, "grad_norm": 2.390625, "learning_rate": 1.9999966895006273e-05, "loss": 0.6643, "mean_token_accuracy": 0.8878465294837952, "num_tokens": 912147.0, "step": 215 }, { "entropy": 0.29761432111263275, "epoch": 0.051288028907798114, "grad_norm": 4.96875, "learning_rate": 1.9999964635918557e-05, "loss": 0.4497, "mean_token_accuracy": 0.9224862337112427, "num_tokens": 942808.0, "step": 220 }, { "entropy": 0.3196300931274891, "epoch": 0.052453665928429885, "grad_norm": 1.28125, "learning_rate": 1.999996230227385e-05, "loss": 0.5977, "mean_token_accuracy": 0.8641709625720978, "num_tokens": 966122.0, "step": 225 }, { "entropy": 0.3358492150902748, "epoch": 0.05361930294906166, "grad_norm": 2.359375, "learning_rate": 1.9999959894072183e-05, "loss": 0.6385, "mean_token_accuracy": 0.8833851218223572, "num_tokens": 976774.0, "step": 230 }, { "entropy": 0.38430210798978803, "epoch": 0.054784939969693434, "grad_norm": 3.328125, "learning_rate": 1.9999957411313592e-05, "loss": 0.8489, "mean_token_accuracy": 0.8765129625797272, "num_tokens": 989123.0, "step": 235 }, { "entropy": 0.3497027777135372, "epoch": 0.05595057699032521, "grad_norm": 0.67578125, "learning_rate": 1.999995485399812e-05, "loss": 0.7954, "mean_token_accuracy": 0.8668942451477051, "num_tokens": 1017050.0, "step": 240 }, { "entropy": 0.3112874172627926, "epoch": 0.05711621401095699, "grad_norm": 1.140625, "learning_rate": 1.9999952222125795e-05, "loss": 0.5732, "mean_token_accuracy": 0.8893404483795166, "num_tokens": 1043083.0, "step": 245 }, { "entropy": 0.32975875288248063, "epoch": 0.05828185103158876, "grad_norm": 17.25, "learning_rate": 1.9999949515696662e-05, "loss": 0.8527, "mean_token_accuracy": 0.8816092252731323, "num_tokens": 1066737.0, "step": 250 }, { "entropy": 0.45641955733299255, "epoch": 0.05944748805222054, "grad_norm": 2.0625, "learning_rate": 1.9999946734710768e-05, "loss": 0.8614, "mean_token_accuracy": 0.8740241050720214, "num_tokens": 1095622.0, "step": 255 }, { "entropy": 0.19169552214443683, "epoch": 0.06061312507285231, "grad_norm": 0.58203125, "learning_rate": 1.9999943879168143e-05, "loss": 0.2542, "mean_token_accuracy": 0.9506396472454071, "num_tokens": 1130270.0, "step": 260 }, { "entropy": 0.29430868923664094, "epoch": 0.06177876209348409, "grad_norm": 1.5234375, "learning_rate": 1.9999940949068837e-05, "loss": 0.4795, "mean_token_accuracy": 0.8999245047569275, "num_tokens": 1152894.0, "step": 265 }, { "entropy": 0.33416991904377935, "epoch": 0.06294439911411587, "grad_norm": 2.015625, "learning_rate": 1.999993794441289e-05, "loss": 0.7509, "mean_token_accuracy": 0.8660208523273468, "num_tokens": 1165983.0, "step": 270 }, { "entropy": 0.2795898959040642, "epoch": 0.06411003613474764, "grad_norm": 3.203125, "learning_rate": 1.9999934865200345e-05, "loss": 0.7822, "mean_token_accuracy": 0.8882498800754547, "num_tokens": 1177442.0, "step": 275 }, { "entropy": 0.3017091706395149, "epoch": 0.06527567315537941, "grad_norm": 1.59375, "learning_rate": 1.9999931711431256e-05, "loss": 0.655, "mean_token_accuracy": 0.8937211573123932, "num_tokens": 1192885.0, "step": 280 }, { "entropy": 0.3526634112000465, "epoch": 0.0664413101760112, "grad_norm": 2.46875, "learning_rate": 1.9999928483105663e-05, "loss": 0.6486, "mean_token_accuracy": 0.8815766870975494, "num_tokens": 1213874.0, "step": 285 }, { "entropy": 0.40433951616287234, "epoch": 0.06760694719664297, "grad_norm": 5.875, "learning_rate": 1.9999925180223613e-05, "loss": 0.8081, "mean_token_accuracy": 0.876986026763916, "num_tokens": 1230434.0, "step": 290 }, { "entropy": 0.2815289504826069, "epoch": 0.06877258421727474, "grad_norm": 0.53125, "learning_rate": 1.999992180278516e-05, "loss": 0.5367, "mean_token_accuracy": 0.9034562647342682, "num_tokens": 1245116.0, "step": 295 }, { "entropy": 0.36914983466267587, "epoch": 0.06993822123790652, "grad_norm": 5.90625, "learning_rate": 1.9999918350790354e-05, "loss": 0.6114, "mean_token_accuracy": 0.8818127453327179, "num_tokens": 1259884.0, "step": 300 }, { "entropy": 0.27012285105884076, "epoch": 0.0711038582585383, "grad_norm": 0.279296875, "learning_rate": 1.9999914824239243e-05, "loss": 0.5234, "mean_token_accuracy": 0.9039028882980347, "num_tokens": 1280626.0, "step": 305 }, { "entropy": 0.27597851008176805, "epoch": 0.07226949527917007, "grad_norm": 1.8125, "learning_rate": 1.9999911223131885e-05, "loss": 0.7649, "mean_token_accuracy": 0.8721263349056244, "num_tokens": 1293685.0, "step": 310 }, { "entropy": 0.38573506474494934, "epoch": 0.07343513229980184, "grad_norm": 1.140625, "learning_rate": 1.9999907547468328e-05, "loss": 0.6097, "mean_token_accuracy": 0.8858954429626464, "num_tokens": 1322562.0, "step": 315 }, { "entropy": 0.3330282442271709, "epoch": 0.07460076932043362, "grad_norm": 2.359375, "learning_rate": 1.999990379724863e-05, "loss": 0.6147, "mean_token_accuracy": 0.8772442162036895, "num_tokens": 1345673.0, "step": 320 }, { "entropy": 0.3170699439942837, "epoch": 0.07576640634106539, "grad_norm": 3.296875, "learning_rate": 1.9999899972472843e-05, "loss": 0.458, "mean_token_accuracy": 0.8802279114723206, "num_tokens": 1383625.0, "step": 325 }, { "entropy": 0.46571023017168045, "epoch": 0.07693204336169716, "grad_norm": 1.71875, "learning_rate": 1.999989607314103e-05, "loss": 0.8476, "mean_token_accuracy": 0.8469921767711639, "num_tokens": 1409144.0, "step": 330 }, { "entropy": 0.41157747209072115, "epoch": 0.07809768038232895, "grad_norm": 4.375, "learning_rate": 1.9999892099253247e-05, "loss": 0.8684, "mean_token_accuracy": 0.8459766566753387, "num_tokens": 1431361.0, "step": 335 }, { "entropy": 0.39902357161045077, "epoch": 0.07926331740296072, "grad_norm": 3.5, "learning_rate": 1.999988805080955e-05, "loss": 0.9777, "mean_token_accuracy": 0.8353796422481536, "num_tokens": 1442281.0, "step": 340 }, { "entropy": 0.42176967263221743, "epoch": 0.08042895442359249, "grad_norm": 2.703125, "learning_rate": 1.9999883927810005e-05, "loss": 1.0149, "mean_token_accuracy": 0.8583439648151397, "num_tokens": 1450914.0, "step": 345 }, { "entropy": 0.2953450310975313, "epoch": 0.08159459144422426, "grad_norm": 1.203125, "learning_rate": 1.999987973025467e-05, "loss": 0.3853, "mean_token_accuracy": 0.9233019709587097, "num_tokens": 1474403.0, "step": 350 }, { "entropy": 0.33179600164294243, "epoch": 0.08276022846485605, "grad_norm": 2.625, "learning_rate": 1.9999875458143604e-05, "loss": 0.4703, "mean_token_accuracy": 0.8944461524486542, "num_tokens": 1509158.0, "step": 355 }, { "entropy": 0.3036386102437973, "epoch": 0.08392586548548782, "grad_norm": 1.125, "learning_rate": 1.999987111147688e-05, "loss": 0.6394, "mean_token_accuracy": 0.8986095011234283, "num_tokens": 1526440.0, "step": 360 }, { "entropy": 0.31046125665307045, "epoch": 0.08509150250611959, "grad_norm": 3.421875, "learning_rate": 1.9999866690254554e-05, "loss": 0.6663, "mean_token_accuracy": 0.8590294003486634, "num_tokens": 1549145.0, "step": 365 }, { "entropy": 0.3097700498998165, "epoch": 0.08625713952675138, "grad_norm": 2.09375, "learning_rate": 1.99998621944767e-05, "loss": 0.4658, "mean_token_accuracy": 0.9011349081993103, "num_tokens": 1583727.0, "step": 370 }, { "entropy": 0.3039284236729145, "epoch": 0.08742277654738315, "grad_norm": 3.03125, "learning_rate": 1.9999857624143373e-05, "loss": 0.6759, "mean_token_accuracy": 0.8913724482059479, "num_tokens": 1597894.0, "step": 375 }, { "entropy": 0.24622596129775048, "epoch": 0.08858841356801492, "grad_norm": 1.5703125, "learning_rate": 1.9999852979254655e-05, "loss": 0.4396, "mean_token_accuracy": 0.9359685719013214, "num_tokens": 1623602.0, "step": 380 }, { "entropy": 0.39024817757308483, "epoch": 0.08975405058864669, "grad_norm": 2.734375, "learning_rate": 1.9999848259810605e-05, "loss": 0.8393, "mean_token_accuracy": 0.8547381460666656, "num_tokens": 1636767.0, "step": 385 }, { "entropy": 0.3580179035663605, "epoch": 0.09091968760927847, "grad_norm": 0.875, "learning_rate": 1.9999843465811297e-05, "loss": 0.6393, "mean_token_accuracy": 0.8810992777347565, "num_tokens": 1655707.0, "step": 390 }, { "entropy": 0.37314921617507935, "epoch": 0.09208532462991025, "grad_norm": 2.15625, "learning_rate": 1.9999838597256807e-05, "loss": 0.4304, "mean_token_accuracy": 0.8614090740680694, "num_tokens": 1696414.0, "step": 395 }, { "entropy": 0.29948038049042225, "epoch": 0.09325096165054202, "grad_norm": 2.953125, "learning_rate": 1.99998336541472e-05, "loss": 0.5126, "mean_token_accuracy": 0.9010177552700043, "num_tokens": 1715229.0, "step": 400 }, { "entropy": 0.3523738864809275, "epoch": 0.0944165986711738, "grad_norm": 2.84375, "learning_rate": 1.9999828636482553e-05, "loss": 0.7424, "mean_token_accuracy": 0.8713351786136627, "num_tokens": 1732900.0, "step": 405 }, { "entropy": 0.305879208445549, "epoch": 0.09558223569180557, "grad_norm": 0.72265625, "learning_rate": 1.9999823544262942e-05, "loss": 0.5051, "mean_token_accuracy": 0.9006154477596283, "num_tokens": 1747602.0, "step": 410 }, { "entropy": 0.3407420488074422, "epoch": 0.09674787271243734, "grad_norm": 1.0546875, "learning_rate": 1.9999818377488443e-05, "loss": 0.2896, "mean_token_accuracy": 0.9086630046367645, "num_tokens": 1772100.0, "step": 415 }, { "entropy": 0.2643546022474766, "epoch": 0.09791350973306913, "grad_norm": 0.53125, "learning_rate": 1.999981313615913e-05, "loss": 0.4521, "mean_token_accuracy": 0.9116932094097138, "num_tokens": 1791608.0, "step": 420 }, { "entropy": 0.343147162348032, "epoch": 0.0990791467537009, "grad_norm": 2.15625, "learning_rate": 1.9999807820275082e-05, "loss": 0.5072, "mean_token_accuracy": 0.8775932788848877, "num_tokens": 1819571.0, "step": 425 }, { "entropy": 0.287718054279685, "epoch": 0.10024478377433267, "grad_norm": 1.4453125, "learning_rate": 1.9999802429836383e-05, "loss": 0.4765, "mean_token_accuracy": 0.9125708997249603, "num_tokens": 1849569.0, "step": 430 }, { "entropy": 0.40832418352365496, "epoch": 0.10141042079496444, "grad_norm": 2.1875, "learning_rate": 1.9999796964843104e-05, "loss": 0.9028, "mean_token_accuracy": 0.8589731276035308, "num_tokens": 1858952.0, "step": 435 }, { "entropy": 0.30324168093502524, "epoch": 0.10257605781559623, "grad_norm": 1.75, "learning_rate": 1.9999791425295338e-05, "loss": 0.5968, "mean_token_accuracy": 0.8990670144557953, "num_tokens": 1879066.0, "step": 440 }, { "entropy": 0.2882155541330576, "epoch": 0.103741694836228, "grad_norm": 0.271484375, "learning_rate": 1.9999785811193154e-05, "loss": 0.4322, "mean_token_accuracy": 0.9170081973075866, "num_tokens": 1903364.0, "step": 445 }, { "entropy": 0.2898620326071978, "epoch": 0.10490733185685977, "grad_norm": 0.9375, "learning_rate": 1.999978012253665e-05, "loss": 0.447, "mean_token_accuracy": 0.922281152009964, "num_tokens": 1922997.0, "step": 450 }, { "entropy": 0.3374782703816891, "epoch": 0.10607296887749156, "grad_norm": 2.6875, "learning_rate": 1.9999774359325905e-05, "loss": 0.7052, "mean_token_accuracy": 0.897776848077774, "num_tokens": 1950028.0, "step": 455 }, { "entropy": 0.354191205278039, "epoch": 0.10723860589812333, "grad_norm": 2.15625, "learning_rate": 1.9999768521561002e-05, "loss": 0.5536, "mean_token_accuracy": 0.8853957891464234, "num_tokens": 1976726.0, "step": 460 }, { "entropy": 0.43799355179071425, "epoch": 0.1084042429187551, "grad_norm": 2.09375, "learning_rate": 1.9999762609242028e-05, "loss": 0.8109, "mean_token_accuracy": 0.8636049032211304, "num_tokens": 1987708.0, "step": 465 }, { "entropy": 0.3960087105631828, "epoch": 0.10956987993938687, "grad_norm": 1.65625, "learning_rate": 1.9999756622369077e-05, "loss": 0.8937, "mean_token_accuracy": 0.8690516471862793, "num_tokens": 1996457.0, "step": 470 }, { "entropy": 0.3699250042438507, "epoch": 0.11073551696001865, "grad_norm": 6.4375, "learning_rate": 1.9999750560942234e-05, "loss": 0.8203, "mean_token_accuracy": 0.8731843948364257, "num_tokens": 2011481.0, "step": 475 }, { "entropy": 0.17613436691462994, "epoch": 0.11190115398065043, "grad_norm": 0.42578125, "learning_rate": 1.9999744424961588e-05, "loss": 0.2405, "mean_token_accuracy": 0.9508815348148346, "num_tokens": 2043874.0, "step": 480 }, { "entropy": 0.26431639343500135, "epoch": 0.1130667910012822, "grad_norm": 3.375, "learning_rate": 1.9999738214427236e-05, "loss": 0.5847, "mean_token_accuracy": 0.9056618392467499, "num_tokens": 2064771.0, "step": 485 }, { "entropy": 0.2693479511886835, "epoch": 0.11423242802191398, "grad_norm": 0.384765625, "learning_rate": 1.9999731929339263e-05, "loss": 0.4551, "mean_token_accuracy": 0.9169823467731476, "num_tokens": 2090213.0, "step": 490 }, { "entropy": 0.4294183999300003, "epoch": 0.11539806504254575, "grad_norm": 3.046875, "learning_rate": 1.999972556969777e-05, "loss": 0.7813, "mean_token_accuracy": 0.8543537199497223, "num_tokens": 2113168.0, "step": 495 }, { "entropy": 0.26536157727241516, "epoch": 0.11656370206317752, "grad_norm": 2.546875, "learning_rate": 1.999971913550285e-05, "loss": 0.3274, "mean_token_accuracy": 0.9273768246173859, "num_tokens": 2134370.0, "step": 500 }, { "entropy": 0.2530547440052032, "epoch": 0.1177293390838093, "grad_norm": 0.51171875, "learning_rate": 1.9999712626754593e-05, "loss": 0.3245, "mean_token_accuracy": 0.929281085729599, "num_tokens": 2172563.0, "step": 505 }, { "entropy": 0.26137659288942816, "epoch": 0.11889497610444108, "grad_norm": 0.419921875, "learning_rate": 1.9999706043453103e-05, "loss": 0.6226, "mean_token_accuracy": 0.8935856699943543, "num_tokens": 2193099.0, "step": 510 }, { "entropy": 0.25196021553128956, "epoch": 0.12006061312507285, "grad_norm": 0.419921875, "learning_rate": 1.9999699385598476e-05, "loss": 0.338, "mean_token_accuracy": 0.9162642061710358, "num_tokens": 2230359.0, "step": 515 }, { "entropy": 0.3314253244549036, "epoch": 0.12122625014570462, "grad_norm": 1.390625, "learning_rate": 1.999969265319081e-05, "loss": 0.4916, "mean_token_accuracy": 0.8800876617431641, "num_tokens": 2256112.0, "step": 520 }, { "entropy": 0.2865926086902618, "epoch": 0.12239188716633641, "grad_norm": 0.640625, "learning_rate": 1.999968584623021e-05, "loss": 0.5819, "mean_token_accuracy": 0.9126100778579712, "num_tokens": 2269650.0, "step": 525 }, { "entropy": 0.2737023938447237, "epoch": 0.12355752418696818, "grad_norm": 1.609375, "learning_rate": 1.999967896471677e-05, "loss": 0.462, "mean_token_accuracy": 0.910278183221817, "num_tokens": 2294847.0, "step": 530 }, { "entropy": 0.250915889441967, "epoch": 0.12472316120759995, "grad_norm": 2.140625, "learning_rate": 1.9999672008650603e-05, "loss": 0.5603, "mean_token_accuracy": 0.8969495117664337, "num_tokens": 2310263.0, "step": 535 }, { "entropy": 0.31160888969898226, "epoch": 0.12588879822823174, "grad_norm": 0.890625, "learning_rate": 1.99996649780318e-05, "loss": 0.3346, "mean_token_accuracy": 0.9062823116779327, "num_tokens": 2328242.0, "step": 540 }, { "entropy": 0.42439975365996363, "epoch": 0.1270544352488635, "grad_norm": 4.1875, "learning_rate": 1.9999657872860476e-05, "loss": 0.8226, "mean_token_accuracy": 0.8711081981658936, "num_tokens": 2349683.0, "step": 545 }, { "entropy": 0.2684651080518961, "epoch": 0.12822007226949528, "grad_norm": 1.6875, "learning_rate": 1.999965069313673e-05, "loss": 0.3773, "mean_token_accuracy": 0.8972374260425567, "num_tokens": 2372089.0, "step": 550 }, { "entropy": 0.18653137236833572, "epoch": 0.12938570929012705, "grad_norm": 2.390625, "learning_rate": 1.9999643438860674e-05, "loss": 0.5196, "mean_token_accuracy": 0.9188428163528443, "num_tokens": 2404211.0, "step": 555 }, { "entropy": 0.2632068574428558, "epoch": 0.13055134631075882, "grad_norm": 0.8984375, "learning_rate": 1.9999636110032415e-05, "loss": 0.4882, "mean_token_accuracy": 0.9154849767684936, "num_tokens": 2419959.0, "step": 560 }, { "entropy": 0.25248654522001746, "epoch": 0.1317169833313906, "grad_norm": 3.03125, "learning_rate": 1.999962870665206e-05, "loss": 0.4536, "mean_token_accuracy": 0.9148861825466156, "num_tokens": 2442409.0, "step": 565 }, { "entropy": 0.2907357782125473, "epoch": 0.1328826203520224, "grad_norm": 0.69921875, "learning_rate": 1.9999621228719724e-05, "loss": 0.6041, "mean_token_accuracy": 0.8970059275627136, "num_tokens": 2458082.0, "step": 570 }, { "entropy": 0.275528746843338, "epoch": 0.13404825737265416, "grad_norm": 2.375, "learning_rate": 1.9999613676235512e-05, "loss": 0.6932, "mean_token_accuracy": 0.8919620513916016, "num_tokens": 2470546.0, "step": 575 }, { "entropy": 0.3516815423965454, "epoch": 0.13521389439328593, "grad_norm": 3.03125, "learning_rate": 1.9999606049199543e-05, "loss": 0.656, "mean_token_accuracy": 0.8738880634307862, "num_tokens": 2495063.0, "step": 580 }, { "entropy": 0.29452326260507106, "epoch": 0.1363795314139177, "grad_norm": 1.5078125, "learning_rate": 1.999959834761193e-05, "loss": 0.6642, "mean_token_accuracy": 0.8788491487503052, "num_tokens": 2511349.0, "step": 585 }, { "entropy": 0.2358495132997632, "epoch": 0.13754516843454948, "grad_norm": 0.21875, "learning_rate": 1.999959057147278e-05, "loss": 0.2328, "mean_token_accuracy": 0.9249357283115387, "num_tokens": 2549642.0, "step": 590 }, { "entropy": 0.2459204986691475, "epoch": 0.13871080545518125, "grad_norm": 1.765625, "learning_rate": 1.9999582720782217e-05, "loss": 0.5451, "mean_token_accuracy": 0.9135986566543579, "num_tokens": 2570037.0, "step": 595 }, { "entropy": 0.27237192876636984, "epoch": 0.13987644247581305, "grad_norm": 0.1796875, "learning_rate": 1.9999574795540357e-05, "loss": 0.4804, "mean_token_accuracy": 0.9119515061378479, "num_tokens": 2590600.0, "step": 600 }, { "entropy": 0.3028899788856506, "epoch": 0.14104207949644482, "grad_norm": 0.65625, "learning_rate": 1.9999566795747316e-05, "loss": 0.3404, "mean_token_accuracy": 0.9131358981132507, "num_tokens": 2620638.0, "step": 605 }, { "entropy": 0.2697331115603447, "epoch": 0.1422077165170766, "grad_norm": 1.421875, "learning_rate": 1.9999558721403215e-05, "loss": 0.4712, "mean_token_accuracy": 0.9166036427021027, "num_tokens": 2640375.0, "step": 610 }, { "entropy": 0.37615430131554606, "epoch": 0.14337335353770836, "grad_norm": 1.578125, "learning_rate": 1.9999550572508174e-05, "loss": 0.653, "mean_token_accuracy": 0.8842727184295655, "num_tokens": 2651304.0, "step": 615 }, { "entropy": 0.2938199445605278, "epoch": 0.14453899055834013, "grad_norm": 0.427734375, "learning_rate": 1.9999542349062314e-05, "loss": 0.679, "mean_token_accuracy": 0.8845574855804443, "num_tokens": 2672520.0, "step": 620 }, { "entropy": 0.391637334227562, "epoch": 0.1457046275789719, "grad_norm": 0.98828125, "learning_rate": 1.9999534051065757e-05, "loss": 0.7966, "mean_token_accuracy": 0.845041885972023, "num_tokens": 2694477.0, "step": 625 }, { "entropy": 0.3956002026796341, "epoch": 0.14687026459960367, "grad_norm": 2.953125, "learning_rate": 1.9999525678518628e-05, "loss": 0.9271, "mean_token_accuracy": 0.8404446899890899, "num_tokens": 2708270.0, "step": 630 }, { "entropy": 0.3675729542970657, "epoch": 0.14803590162023547, "grad_norm": 0.87890625, "learning_rate": 1.9999517231421053e-05, "loss": 0.5063, "mean_token_accuracy": 0.8811340808868409, "num_tokens": 2722600.0, "step": 635 }, { "entropy": 0.25438366681337354, "epoch": 0.14920153864086724, "grad_norm": 1.21875, "learning_rate": 1.9999508709773155e-05, "loss": 0.571, "mean_token_accuracy": 0.9114357829093933, "num_tokens": 2741534.0, "step": 640 }, { "entropy": 0.29461836684495213, "epoch": 0.15036717566149901, "grad_norm": 0.2041015625, "learning_rate": 1.999950011357506e-05, "loss": 0.508, "mean_token_accuracy": 0.9003230154514312, "num_tokens": 2766169.0, "step": 645 }, { "entropy": 0.36445762515068053, "epoch": 0.15153281268213079, "grad_norm": 1.6328125, "learning_rate": 1.9999491442826903e-05, "loss": 0.7562, "mean_token_accuracy": 0.8814541339874268, "num_tokens": 2775925.0, "step": 650 }, { "entropy": 0.2811070531606674, "epoch": 0.15269844970276256, "grad_norm": 1.78125, "learning_rate": 1.9999482697528808e-05, "loss": 0.4886, "mean_token_accuracy": 0.9111315429210662, "num_tokens": 2796279.0, "step": 655 }, { "entropy": 0.25250407978892325, "epoch": 0.15386408672339433, "grad_norm": 0.470703125, "learning_rate": 1.9999473877680903e-05, "loss": 0.371, "mean_token_accuracy": 0.9198955953121185, "num_tokens": 2829382.0, "step": 660 }, { "entropy": 0.35067772716283796, "epoch": 0.1550297237440261, "grad_norm": 1.328125, "learning_rate": 1.9999464983283325e-05, "loss": 0.5332, "mean_token_accuracy": 0.8845549583435058, "num_tokens": 2843150.0, "step": 665 }, { "entropy": 0.24629681333899497, "epoch": 0.1561953607646579, "grad_norm": 2.1875, "learning_rate": 1.9999456014336206e-05, "loss": 0.4008, "mean_token_accuracy": 0.9258623898029328, "num_tokens": 2868976.0, "step": 670 }, { "entropy": 0.3138530794531107, "epoch": 0.15736099778528967, "grad_norm": 2.8125, "learning_rate": 1.9999446970839677e-05, "loss": 0.6616, "mean_token_accuracy": 0.8881543815135956, "num_tokens": 2884671.0, "step": 675 }, { "entropy": 0.38320747911930086, "epoch": 0.15852663480592144, "grad_norm": 1.3515625, "learning_rate": 1.9999437852793874e-05, "loss": 0.7154, "mean_token_accuracy": 0.8834590017795563, "num_tokens": 2895630.0, "step": 680 }, { "entropy": 0.33303392827510836, "epoch": 0.1596922718265532, "grad_norm": 2.453125, "learning_rate": 1.9999428660198933e-05, "loss": 0.8552, "mean_token_accuracy": 0.865819638967514, "num_tokens": 2905209.0, "step": 685 }, { "entropy": 0.31202927231788635, "epoch": 0.16085790884718498, "grad_norm": 2.046875, "learning_rate": 1.999941939305499e-05, "loss": 0.5953, "mean_token_accuracy": 0.8999407291412354, "num_tokens": 2917687.0, "step": 690 }, { "entropy": 0.3931444585323334, "epoch": 0.16202354586781675, "grad_norm": 3.15625, "learning_rate": 1.9999410051362185e-05, "loss": 0.6859, "mean_token_accuracy": 0.8783853471279144, "num_tokens": 2935177.0, "step": 695 }, { "entropy": 0.4097077568992972, "epoch": 0.16318918288844853, "grad_norm": 0.5703125, "learning_rate": 1.9999400635120656e-05, "loss": 0.7382, "mean_token_accuracy": 0.8664321959018707, "num_tokens": 2962275.0, "step": 700 }, { "entropy": 0.3893901389092207, "epoch": 0.16435481990908032, "grad_norm": 3.0625, "learning_rate": 1.9999391144330547e-05, "loss": 0.6817, "mean_token_accuracy": 0.8475090980529785, "num_tokens": 2985130.0, "step": 705 }, { "entropy": 0.3332042768597603, "epoch": 0.1655204569297121, "grad_norm": 0.88671875, "learning_rate": 1.9999381578991995e-05, "loss": 0.5577, "mean_token_accuracy": 0.8721989512443542, "num_tokens": 3012620.0, "step": 710 }, { "entropy": 0.3115752834826708, "epoch": 0.16668609395034387, "grad_norm": 2.34375, "learning_rate": 1.999937193910514e-05, "loss": 0.7448, "mean_token_accuracy": 0.8875145256519318, "num_tokens": 3026831.0, "step": 715 }, { "entropy": 0.4049858648329973, "epoch": 0.16785173097097564, "grad_norm": 3.734375, "learning_rate": 1.9999362224670136e-05, "loss": 0.771, "mean_token_accuracy": 0.8793026149272919, "num_tokens": 3043491.0, "step": 720 }, { "entropy": 0.3188063256442547, "epoch": 0.1690173679916074, "grad_norm": 0.58203125, "learning_rate": 1.999935243568712e-05, "loss": 0.5652, "mean_token_accuracy": 0.8942893028259278, "num_tokens": 3068104.0, "step": 725 }, { "entropy": 0.4390459656715393, "epoch": 0.17018300501223918, "grad_norm": 2.9375, "learning_rate": 1.9999342572156236e-05, "loss": 0.993, "mean_token_accuracy": 0.8510157585144043, "num_tokens": 3076457.0, "step": 730 }, { "entropy": 0.2523434393107891, "epoch": 0.17134864203287095, "grad_norm": 0.279296875, "learning_rate": 1.999933263407764e-05, "loss": 0.4319, "mean_token_accuracy": 0.91023069024086, "num_tokens": 3102744.0, "step": 735 }, { "entropy": 0.2924702726304531, "epoch": 0.17251427905350275, "grad_norm": 0.66015625, "learning_rate": 1.9999322621451472e-05, "loss": 0.6122, "mean_token_accuracy": 0.8960310935974121, "num_tokens": 3116362.0, "step": 740 }, { "entropy": 0.34593453593552115, "epoch": 0.17367991607413452, "grad_norm": 4.75, "learning_rate": 1.9999312534277886e-05, "loss": 0.5829, "mean_token_accuracy": 0.8776205480098724, "num_tokens": 3142420.0, "step": 745 }, { "entropy": 0.2968858815729618, "epoch": 0.1748455530947663, "grad_norm": 2.015625, "learning_rate": 1.999930237255703e-05, "loss": 0.3671, "mean_token_accuracy": 0.9223788380622864, "num_tokens": 3163109.0, "step": 750 }, { "entropy": 0.28595022670924664, "epoch": 0.17601119011539806, "grad_norm": 0.953125, "learning_rate": 1.9999292136289056e-05, "loss": 0.4699, "mean_token_accuracy": 0.8934225618839264, "num_tokens": 3193584.0, "step": 755 }, { "entropy": 0.354030817002058, "epoch": 0.17717682713602984, "grad_norm": 2.0, "learning_rate": 1.9999281825474117e-05, "loss": 0.5145, "mean_token_accuracy": 0.8838940739631653, "num_tokens": 3214263.0, "step": 760 }, { "entropy": 0.43382971435785295, "epoch": 0.1783424641566616, "grad_norm": 2.28125, "learning_rate": 1.9999271440112367e-05, "loss": 0.8561, "mean_token_accuracy": 0.8413860857486725, "num_tokens": 3241698.0, "step": 765 }, { "entropy": 0.30166892930865286, "epoch": 0.17950810117729338, "grad_norm": 1.53125, "learning_rate": 1.999926098020396e-05, "loss": 0.4246, "mean_token_accuracy": 0.9079195559024811, "num_tokens": 3257616.0, "step": 770 }, { "entropy": 0.405256550014019, "epoch": 0.18067373819792518, "grad_norm": 2.359375, "learning_rate": 1.9999250445749052e-05, "loss": 0.5895, "mean_token_accuracy": 0.8911011576652527, "num_tokens": 3285955.0, "step": 775 }, { "entropy": 0.34804592877626417, "epoch": 0.18183937521855695, "grad_norm": 3.375, "learning_rate": 1.9999239836747802e-05, "loss": 0.6435, "mean_token_accuracy": 0.8680084943771362, "num_tokens": 3302017.0, "step": 780 }, { "entropy": 0.2602914243936539, "epoch": 0.18300501223918872, "grad_norm": 1.6640625, "learning_rate": 1.9999229153200365e-05, "loss": 0.3589, "mean_token_accuracy": 0.930289226770401, "num_tokens": 3321016.0, "step": 785 }, { "entropy": 0.31426837891340254, "epoch": 0.1841706492598205, "grad_norm": 1.9609375, "learning_rate": 1.9999218395106906e-05, "loss": 0.4683, "mean_token_accuracy": 0.8997460782527924, "num_tokens": 3348461.0, "step": 790 }, { "entropy": 0.22893422991037368, "epoch": 0.18533628628045226, "grad_norm": 1.3359375, "learning_rate": 1.999920756246758e-05, "loss": 0.2776, "mean_token_accuracy": 0.9370344758033753, "num_tokens": 3390799.0, "step": 795 }, { "entropy": 0.2902509465813637, "epoch": 0.18650192330108403, "grad_norm": 1.6796875, "learning_rate": 1.9999196655282546e-05, "loss": 0.4329, "mean_token_accuracy": 0.9119967639446258, "num_tokens": 3409234.0, "step": 800 }, { "entropy": 0.27368216067552564, "epoch": 0.1876675603217158, "grad_norm": 0.404296875, "learning_rate": 1.9999185673551972e-05, "loss": 0.4364, "mean_token_accuracy": 0.9043579339981079, "num_tokens": 3433336.0, "step": 805 }, { "entropy": 0.4705745026469231, "epoch": 0.1888331973423476, "grad_norm": 1.96875, "learning_rate": 1.999917461727602e-05, "loss": 0.8298, "mean_token_accuracy": 0.8345303326845169, "num_tokens": 3448349.0, "step": 810 }, { "entropy": 0.34988665878772734, "epoch": 0.18999883436297937, "grad_norm": 2.859375, "learning_rate": 1.999916348645486e-05, "loss": 0.7454, "mean_token_accuracy": 0.892153388261795, "num_tokens": 3466389.0, "step": 815 }, { "entropy": 0.44057891592383386, "epoch": 0.19116447138361115, "grad_norm": 3.40625, "learning_rate": 1.999915228108865e-05, "loss": 0.7607, "mean_token_accuracy": 0.8726061344146728, "num_tokens": 3478097.0, "step": 820 }, { "entropy": 0.3719429075717926, "epoch": 0.19233010840424292, "grad_norm": 3.578125, "learning_rate": 1.999914100117756e-05, "loss": 0.8634, "mean_token_accuracy": 0.8735311985015869, "num_tokens": 3492951.0, "step": 825 }, { "entropy": 0.34916748106479645, "epoch": 0.1934957454248747, "grad_norm": 1.8046875, "learning_rate": 1.9999129646721757e-05, "loss": 0.8812, "mean_token_accuracy": 0.8656373739242553, "num_tokens": 3503828.0, "step": 830 }, { "entropy": 0.4179299771785736, "epoch": 0.19466138244550646, "grad_norm": 4.09375, "learning_rate": 1.9999118217721415e-05, "loss": 0.8812, "mean_token_accuracy": 0.8484634220600128, "num_tokens": 3516704.0, "step": 835 }, { "entropy": 0.4467567354440689, "epoch": 0.19582701946613826, "grad_norm": 1.53125, "learning_rate": 1.99991067141767e-05, "loss": 0.9907, "mean_token_accuracy": 0.8312698066234588, "num_tokens": 3525728.0, "step": 840 }, { "entropy": 0.46012266874313357, "epoch": 0.19699265648677003, "grad_norm": 7.8125, "learning_rate": 1.9999095136087785e-05, "loss": 0.7815, "mean_token_accuracy": 0.8484793066978454, "num_tokens": 3549324.0, "step": 845 }, { "entropy": 0.226668768748641, "epoch": 0.1981582935074018, "grad_norm": 1.390625, "learning_rate": 1.9999083483454842e-05, "loss": 0.4126, "mean_token_accuracy": 0.9244813799858094, "num_tokens": 3568736.0, "step": 850 }, { "entropy": 0.31615454629063605, "epoch": 0.19932393052803357, "grad_norm": 0.87890625, "learning_rate": 1.9999071756278046e-05, "loss": 0.6654, "mean_token_accuracy": 0.9037168562412262, "num_tokens": 3583029.0, "step": 855 }, { "entropy": 0.3806022718548775, "epoch": 0.20048956754866534, "grad_norm": 1.6796875, "learning_rate": 1.999905995455757e-05, "loss": 0.8238, "mean_token_accuracy": 0.8618193626403808, "num_tokens": 3592622.0, "step": 860 }, { "entropy": 0.38668873235583306, "epoch": 0.20165520456929711, "grad_norm": 2.265625, "learning_rate": 1.9999048078293594e-05, "loss": 0.7331, "mean_token_accuracy": 0.8647488534450531, "num_tokens": 3612682.0, "step": 865 }, { "entropy": 0.3393211871385574, "epoch": 0.20282084158992889, "grad_norm": 1.6875, "learning_rate": 1.999903612748629e-05, "loss": 0.7762, "mean_token_accuracy": 0.8786431312561035, "num_tokens": 3622844.0, "step": 870 }, { "entropy": 0.3805833376944065, "epoch": 0.20398647861056068, "grad_norm": 1.2578125, "learning_rate": 1.9999024102135838e-05, "loss": 0.5378, "mean_token_accuracy": 0.8691211700439453, "num_tokens": 3650860.0, "step": 875 }, { "entropy": 0.21699289083480836, "epoch": 0.20515211563119246, "grad_norm": 0.494140625, "learning_rate": 1.9999012002242417e-05, "loss": 0.4605, "mean_token_accuracy": 0.9262413024902344, "num_tokens": 3669625.0, "step": 880 }, { "entropy": 0.28322131410241125, "epoch": 0.20631775265182423, "grad_norm": 0.56640625, "learning_rate": 1.999899982780621e-05, "loss": 0.718, "mean_token_accuracy": 0.8780093967914582, "num_tokens": 3688004.0, "step": 885 }, { "entropy": 0.2387631695717573, "epoch": 0.207483389672456, "grad_norm": 0.412109375, "learning_rate": 1.9998987578827392e-05, "loss": 0.4813, "mean_token_accuracy": 0.9062504410743714, "num_tokens": 3713401.0, "step": 890 }, { "entropy": 0.33120530694723127, "epoch": 0.20864902669308777, "grad_norm": 2.28125, "learning_rate": 1.9998975255306157e-05, "loss": 0.6927, "mean_token_accuracy": 0.8789681971073151, "num_tokens": 3725464.0, "step": 895 }, { "entropy": 0.2995867744088173, "epoch": 0.20981466371371954, "grad_norm": 0.4921875, "learning_rate": 1.9998962857242678e-05, "loss": 0.6636, "mean_token_accuracy": 0.898937052488327, "num_tokens": 3739091.0, "step": 900 }, { "entropy": 0.29027860462665556, "epoch": 0.2109803007343513, "grad_norm": 1.890625, "learning_rate": 1.9998950384637146e-05, "loss": 0.4248, "mean_token_accuracy": 0.9191015899181366, "num_tokens": 3756692.0, "step": 905 }, { "entropy": 0.3700431428849697, "epoch": 0.2121459377549831, "grad_norm": 5.875, "learning_rate": 1.9998937837489746e-05, "loss": 0.7384, "mean_token_accuracy": 0.8679684460163116, "num_tokens": 3779933.0, "step": 910 }, { "entropy": 0.2644999146461487, "epoch": 0.21331157477561488, "grad_norm": 0.44921875, "learning_rate": 1.999892521580066e-05, "loss": 0.1845, "mean_token_accuracy": 0.9258367955684662, "num_tokens": 3814612.0, "step": 915 }, { "entropy": 0.30614554286003115, "epoch": 0.21447721179624665, "grad_norm": 1.71875, "learning_rate": 1.9998912519570083e-05, "loss": 0.6013, "mean_token_accuracy": 0.8913537681102752, "num_tokens": 3828380.0, "step": 920 }, { "entropy": 0.36284519638866186, "epoch": 0.21564284881687842, "grad_norm": 1.40625, "learning_rate": 1.99988997487982e-05, "loss": 0.6176, "mean_token_accuracy": 0.88633993268013, "num_tokens": 3853807.0, "step": 925 }, { "entropy": 0.4497147111222148, "epoch": 0.2168084858375102, "grad_norm": 2.890625, "learning_rate": 1.9998886903485204e-05, "loss": 0.773, "mean_token_accuracy": 0.8610835254192353, "num_tokens": 3883069.0, "step": 930 }, { "entropy": 0.33425854444503783, "epoch": 0.21797412285814197, "grad_norm": 1.4609375, "learning_rate": 1.9998873983631283e-05, "loss": 0.7184, "mean_token_accuracy": 0.8913797974586487, "num_tokens": 3892539.0, "step": 935 }, { "entropy": 0.37412596940994264, "epoch": 0.21913975987877374, "grad_norm": 0.9296875, "learning_rate": 1.9998860989236636e-05, "loss": 0.6036, "mean_token_accuracy": 0.8902391850948334, "num_tokens": 3905637.0, "step": 940 }, { "entropy": 0.28550264425575733, "epoch": 0.22030539689940554, "grad_norm": 0.53515625, "learning_rate": 1.999884792030145e-05, "loss": 0.6622, "mean_token_accuracy": 0.8766939222812653, "num_tokens": 3926668.0, "step": 945 }, { "entropy": 0.4102290324866772, "epoch": 0.2214710339200373, "grad_norm": 3.328125, "learning_rate": 1.9998834776825926e-05, "loss": 0.7183, "mean_token_accuracy": 0.8621988534927368, "num_tokens": 3943647.0, "step": 950 }, { "entropy": 0.29528709389269353, "epoch": 0.22263667094066908, "grad_norm": 3.265625, "learning_rate": 1.9998821558810254e-05, "loss": 0.6308, "mean_token_accuracy": 0.8999729871749877, "num_tokens": 3971668.0, "step": 955 }, { "entropy": 0.26888910457491877, "epoch": 0.22380230796130085, "grad_norm": 1.6484375, "learning_rate": 1.9998808266254633e-05, "loss": 0.3323, "mean_token_accuracy": 0.903475534915924, "num_tokens": 3994987.0, "step": 960 }, { "entropy": 0.3866387724876404, "epoch": 0.22496794498193262, "grad_norm": 3.015625, "learning_rate": 1.9998794899159266e-05, "loss": 0.8279, "mean_token_accuracy": 0.8675558984279632, "num_tokens": 4012144.0, "step": 965 }, { "entropy": 0.31260843873023986, "epoch": 0.2261335820025644, "grad_norm": 1.3515625, "learning_rate": 1.9998781457524345e-05, "loss": 0.4928, "mean_token_accuracy": 0.9094948828220367, "num_tokens": 4030145.0, "step": 970 }, { "entropy": 0.26560680121183394, "epoch": 0.22729921902319616, "grad_norm": 2.796875, "learning_rate": 1.9998767941350078e-05, "loss": 0.5832, "mean_token_accuracy": 0.9050080478191376, "num_tokens": 4051878.0, "step": 975 }, { "entropy": 0.2385837372392416, "epoch": 0.22846485604382796, "grad_norm": 0.3203125, "learning_rate": 1.999875435063666e-05, "loss": 0.2669, "mean_token_accuracy": 0.943767887353897, "num_tokens": 4082619.0, "step": 980 }, { "entropy": 0.2860884163528681, "epoch": 0.22963049306445973, "grad_norm": 1.578125, "learning_rate": 1.9998740685384293e-05, "loss": 0.4171, "mean_token_accuracy": 0.9251432895660401, "num_tokens": 4102940.0, "step": 985 }, { "entropy": 0.2541156569495797, "epoch": 0.2307961300850915, "grad_norm": 0.5078125, "learning_rate": 1.9998726945593186e-05, "loss": 0.6771, "mean_token_accuracy": 0.8742362856864929, "num_tokens": 4132989.0, "step": 990 }, { "entropy": 0.3671549305319786, "epoch": 0.23196176710572328, "grad_norm": 3.5, "learning_rate": 1.9998713131263545e-05, "loss": 0.8071, "mean_token_accuracy": 0.8727976560592652, "num_tokens": 4143129.0, "step": 995 }, { "entropy": 0.331473582983017, "epoch": 0.23312740412635505, "grad_norm": 1.0546875, "learning_rate": 1.999869924239557e-05, "loss": 0.4618, "mean_token_accuracy": 0.8977300941944122, "num_tokens": 4161622.0, "step": 1000 }, { "entropy": 0.27616468332707883, "epoch": 0.23429304114698682, "grad_norm": 3.375, "learning_rate": 1.9998685278989472e-05, "loss": 0.485, "mean_token_accuracy": 0.9088415265083313, "num_tokens": 4180399.0, "step": 1005 }, { "entropy": 0.33261996433138846, "epoch": 0.2354586781676186, "grad_norm": 3.78125, "learning_rate": 1.9998671241045454e-05, "loss": 0.7578, "mean_token_accuracy": 0.888950502872467, "num_tokens": 4192069.0, "step": 1010 }, { "entropy": 0.24764457009732724, "epoch": 0.2366243151882504, "grad_norm": 0.734375, "learning_rate": 1.9998657128563736e-05, "loss": 0.1616, "mean_token_accuracy": 0.9366363644599914, "num_tokens": 4228810.0, "step": 1015 }, { "entropy": 0.2843809101730585, "epoch": 0.23778995220888216, "grad_norm": 0.431640625, "learning_rate": 1.9998642941544518e-05, "loss": 0.5265, "mean_token_accuracy": 0.8996910214424133, "num_tokens": 4254374.0, "step": 1020 }, { "entropy": 0.304288499802351, "epoch": 0.23895558922951393, "grad_norm": 0.9375, "learning_rate": 1.9998628679988013e-05, "loss": 0.6136, "mean_token_accuracy": 0.8884929776191711, "num_tokens": 4269516.0, "step": 1025 }, { "entropy": 0.3449162319302559, "epoch": 0.2401212262501457, "grad_norm": 1.9609375, "learning_rate": 1.9998614343894438e-05, "loss": 0.8248, "mean_token_accuracy": 0.8706211626529694, "num_tokens": 4279381.0, "step": 1030 }, { "entropy": 0.2974201008677483, "epoch": 0.24128686327077747, "grad_norm": 3.109375, "learning_rate": 1.9998599933264007e-05, "loss": 0.5335, "mean_token_accuracy": 0.9134840488433837, "num_tokens": 4299652.0, "step": 1035 }, { "entropy": 0.2299369264394045, "epoch": 0.24245250029140925, "grad_norm": 3.375, "learning_rate": 1.999858544809693e-05, "loss": 0.4326, "mean_token_accuracy": 0.9048404097557068, "num_tokens": 4326102.0, "step": 1040 }, { "entropy": 0.5418604515492916, "epoch": 0.24361813731204104, "grad_norm": 2.09375, "learning_rate": 1.9998570888393427e-05, "loss": 0.8301, "mean_token_accuracy": 0.820774444937706, "num_tokens": 4349589.0, "step": 1045 }, { "entropy": 0.34275285750627515, "epoch": 0.24478377433267282, "grad_norm": 1.4140625, "learning_rate": 1.9998556254153715e-05, "loss": 0.6571, "mean_token_accuracy": 0.8875455260276794, "num_tokens": 4363787.0, "step": 1050 }, { "entropy": 0.27274183109402655, "epoch": 0.2459494113533046, "grad_norm": 0.45703125, "learning_rate": 1.9998541545378007e-05, "loss": 0.4852, "mean_token_accuracy": 0.9113678991794586, "num_tokens": 4387489.0, "step": 1055 }, { "entropy": 0.35988821983337405, "epoch": 0.24711504837393636, "grad_norm": 2.859375, "learning_rate": 1.999852676206653e-05, "loss": 0.6556, "mean_token_accuracy": 0.8755202710628509, "num_tokens": 4403253.0, "step": 1060 }, { "entropy": 0.2266766732558608, "epoch": 0.24828068539456813, "grad_norm": 1.4609375, "learning_rate": 1.99985119042195e-05, "loss": 0.3077, "mean_token_accuracy": 0.9274795413017273, "num_tokens": 4434817.0, "step": 1065 }, { "entropy": 0.35136873573064803, "epoch": 0.2494463224151999, "grad_norm": 5.5, "learning_rate": 1.9998496971837137e-05, "loss": 1.0309, "mean_token_accuracy": 0.8499768733978271, "num_tokens": 4444556.0, "step": 1070 }, { "entropy": 0.2657496578991413, "epoch": 0.2506119594358317, "grad_norm": 1.3203125, "learning_rate": 1.999848196491967e-05, "loss": 0.4618, "mean_token_accuracy": 0.9091532528400421, "num_tokens": 4470876.0, "step": 1075 }, { "entropy": 0.2768456295132637, "epoch": 0.25177759645646347, "grad_norm": 3.0625, "learning_rate": 1.9998466883467316e-05, "loss": 0.7123, "mean_token_accuracy": 0.8955871105194092, "num_tokens": 4482427.0, "step": 1080 }, { "entropy": 0.20596542172133922, "epoch": 0.25294323347709524, "grad_norm": 2.578125, "learning_rate": 1.9998451727480302e-05, "loss": 0.3453, "mean_token_accuracy": 0.9377319753170014, "num_tokens": 4504480.0, "step": 1085 }, { "entropy": 0.3682212561368942, "epoch": 0.254108870497727, "grad_norm": 3.21875, "learning_rate": 1.999843649695886e-05, "loss": 0.7303, "mean_token_accuracy": 0.8806298255920411, "num_tokens": 4513098.0, "step": 1090 }, { "entropy": 0.20294536799192428, "epoch": 0.2552745075183588, "grad_norm": 0.373046875, "learning_rate": 1.9998421191903204e-05, "loss": 0.4142, "mean_token_accuracy": 0.9349018275737763, "num_tokens": 4538371.0, "step": 1095 }, { "entropy": 0.2562996305525303, "epoch": 0.25644014453899056, "grad_norm": 1.5625, "learning_rate": 1.9998405812313573e-05, "loss": 0.5281, "mean_token_accuracy": 0.9155136287212372, "num_tokens": 4552309.0, "step": 1100 }, { "entropy": 0.38680734038352965, "epoch": 0.2576057815596223, "grad_norm": 4.125, "learning_rate": 1.9998390358190197e-05, "loss": 0.6167, "mean_token_accuracy": 0.8845182538032532, "num_tokens": 4579283.0, "step": 1105 }, { "entropy": 0.39438874665647744, "epoch": 0.2587714185802541, "grad_norm": 0.6484375, "learning_rate": 1.9998374829533298e-05, "loss": 0.6791, "mean_token_accuracy": 0.8696678400039672, "num_tokens": 4597972.0, "step": 1110 }, { "entropy": 0.29664220958948134, "epoch": 0.25993705560088587, "grad_norm": 1.21875, "learning_rate": 1.9998359226343113e-05, "loss": 0.4791, "mean_token_accuracy": 0.9082686901092529, "num_tokens": 4615795.0, "step": 1115 }, { "entropy": 0.2910390578210354, "epoch": 0.26110269262151764, "grad_norm": 0.73828125, "learning_rate": 1.9998343548619878e-05, "loss": 0.5356, "mean_token_accuracy": 0.8995784521102905, "num_tokens": 4630562.0, "step": 1120 }, { "entropy": 0.3535892143845558, "epoch": 0.2622683296421494, "grad_norm": 2.421875, "learning_rate": 1.9998327796363818e-05, "loss": 0.7542, "mean_token_accuracy": 0.8523446142673492, "num_tokens": 4644011.0, "step": 1125 }, { "entropy": 0.31830633580684664, "epoch": 0.2634339666627812, "grad_norm": 1.578125, "learning_rate": 1.9998311969575174e-05, "loss": 0.5886, "mean_token_accuracy": 0.8993684887886048, "num_tokens": 4656443.0, "step": 1130 }, { "entropy": 0.19877025708556176, "epoch": 0.264599603683413, "grad_norm": 1.4296875, "learning_rate": 1.9998296068254183e-05, "loss": 0.3187, "mean_token_accuracy": 0.9322415173053742, "num_tokens": 4700790.0, "step": 1135 }, { "entropy": 0.3790184512734413, "epoch": 0.2657652407040448, "grad_norm": 2.75, "learning_rate": 1.9998280092401076e-05, "loss": 0.7684, "mean_token_accuracy": 0.8741475522518158, "num_tokens": 4711374.0, "step": 1140 }, { "entropy": 0.4036370933055878, "epoch": 0.26693087772467655, "grad_norm": 4.28125, "learning_rate": 1.9998264042016096e-05, "loss": 0.8014, "mean_token_accuracy": 0.8721184432506561, "num_tokens": 4720625.0, "step": 1145 }, { "entropy": 0.31758620887994765, "epoch": 0.2680965147453083, "grad_norm": 1.2265625, "learning_rate": 1.9998247917099482e-05, "loss": 0.8177, "mean_token_accuracy": 0.8595401406288147, "num_tokens": 4731984.0, "step": 1150 }, { "entropy": 0.3104529224336147, "epoch": 0.2692621517659401, "grad_norm": 2.265625, "learning_rate": 1.9998231717651476e-05, "loss": 0.5926, "mean_token_accuracy": 0.8918087661266327, "num_tokens": 4754962.0, "step": 1155 }, { "entropy": 0.4066608652472496, "epoch": 0.27042778878657187, "grad_norm": 1.8046875, "learning_rate": 1.9998215443672316e-05, "loss": 0.8178, "mean_token_accuracy": 0.8719375371932984, "num_tokens": 4765334.0, "step": 1160 }, { "entropy": 0.34720696657896044, "epoch": 0.27159342580720364, "grad_norm": 3.046875, "learning_rate": 1.9998199095162242e-05, "loss": 0.7829, "mean_token_accuracy": 0.869851005077362, "num_tokens": 4778433.0, "step": 1165 }, { "entropy": 0.3552531830966473, "epoch": 0.2727590628278354, "grad_norm": 0.75390625, "learning_rate": 1.9998182672121506e-05, "loss": 0.5921, "mean_token_accuracy": 0.8870375871658325, "num_tokens": 4812441.0, "step": 1170 }, { "entropy": 0.28509389981627464, "epoch": 0.2739246998484672, "grad_norm": 1.4296875, "learning_rate": 1.9998166174550348e-05, "loss": 0.4108, "mean_token_accuracy": 0.9073728084564209, "num_tokens": 4836739.0, "step": 1175 }, { "entropy": 0.4455971851944923, "epoch": 0.27509033686909895, "grad_norm": 2.90625, "learning_rate": 1.9998149602449014e-05, "loss": 0.9755, "mean_token_accuracy": 0.8507632434368133, "num_tokens": 4844095.0, "step": 1180 }, { "entropy": 0.15860873758792876, "epoch": 0.2762559738897307, "grad_norm": 0.55859375, "learning_rate": 1.9998132955817753e-05, "loss": 0.2743, "mean_token_accuracy": 0.9363688051700592, "num_tokens": 4874147.0, "step": 1185 }, { "entropy": 0.3147427745163441, "epoch": 0.2774216109103625, "grad_norm": 0.365234375, "learning_rate": 1.999811623465681e-05, "loss": 0.5888, "mean_token_accuracy": 0.9021156907081604, "num_tokens": 4891157.0, "step": 1190 }, { "entropy": 0.20595242828130722, "epoch": 0.27858724793099426, "grad_norm": 1.28125, "learning_rate": 1.9998099438966437e-05, "loss": 0.2696, "mean_token_accuracy": 0.9339096426963807, "num_tokens": 4933168.0, "step": 1195 }, { "entropy": 0.33057937026023865, "epoch": 0.2797528849516261, "grad_norm": 2.15625, "learning_rate": 1.9998082568746888e-05, "loss": 0.6004, "mean_token_accuracy": 0.8792477488517761, "num_tokens": 4955692.0, "step": 1200 }, { "entropy": 0.3509062934666872, "epoch": 0.28091852197225786, "grad_norm": 1.4453125, "learning_rate": 1.9998065623998403e-05, "loss": 0.5698, "mean_token_accuracy": 0.8836005508899689, "num_tokens": 4971106.0, "step": 1205 }, { "entropy": 0.3051776558160782, "epoch": 0.28208415899288963, "grad_norm": 0.412109375, "learning_rate": 1.9998048604721248e-05, "loss": 0.4699, "mean_token_accuracy": 0.9227140247821808, "num_tokens": 5008881.0, "step": 1210 }, { "entropy": 0.38542407751083374, "epoch": 0.2832497960135214, "grad_norm": 2.984375, "learning_rate": 1.9998031510915666e-05, "loss": 0.6204, "mean_token_accuracy": 0.8672150433063507, "num_tokens": 5028439.0, "step": 1215 }, { "entropy": 0.3344720020890236, "epoch": 0.2844154330341532, "grad_norm": 3.21875, "learning_rate": 1.9998014342581922e-05, "loss": 0.6178, "mean_token_accuracy": 0.8967957139015198, "num_tokens": 5042219.0, "step": 1220 }, { "entropy": 0.26223552152514457, "epoch": 0.28558107005478495, "grad_norm": 0.87890625, "learning_rate": 1.9997997099720263e-05, "loss": 0.1843, "mean_token_accuracy": 0.9286192059516907, "num_tokens": 5070919.0, "step": 1225 }, { "entropy": 0.3363313525915146, "epoch": 0.2867467070754167, "grad_norm": 8.875, "learning_rate": 1.9997979782330953e-05, "loss": 0.8308, "mean_token_accuracy": 0.8689966082572937, "num_tokens": 5081678.0, "step": 1230 }, { "entropy": 0.3352774230763316, "epoch": 0.2879123440960485, "grad_norm": 2.90625, "learning_rate": 1.9997962390414243e-05, "loss": 0.7084, "mean_token_accuracy": 0.8848056674003602, "num_tokens": 5103156.0, "step": 1235 }, { "entropy": 0.2327698018401861, "epoch": 0.28907798111668026, "grad_norm": 1.25, "learning_rate": 1.9997944923970397e-05, "loss": 0.3922, "mean_token_accuracy": 0.92334885597229, "num_tokens": 5123337.0, "step": 1240 }, { "entropy": 0.27879791483283045, "epoch": 0.29024361813731203, "grad_norm": 2.328125, "learning_rate": 1.9997927382999677e-05, "loss": 0.445, "mean_token_accuracy": 0.9180491745471955, "num_tokens": 5148142.0, "step": 1245 }, { "entropy": 0.4868896633386612, "epoch": 0.2914092551579438, "grad_norm": 2.65625, "learning_rate": 1.9997909767502342e-05, "loss": 0.6209, "mean_token_accuracy": 0.8842796742916107, "num_tokens": 5164243.0, "step": 1250 }, { "entropy": 0.2981738731265068, "epoch": 0.2925748921785756, "grad_norm": 6.03125, "learning_rate": 1.9997892077478654e-05, "loss": 0.5485, "mean_token_accuracy": 0.9020951867103577, "num_tokens": 5186669.0, "step": 1255 }, { "entropy": 0.3072267949581146, "epoch": 0.29374052919920735, "grad_norm": 0.8828125, "learning_rate": 1.9997874312928878e-05, "loss": 0.4802, "mean_token_accuracy": 0.8995555102825165, "num_tokens": 5209969.0, "step": 1260 }, { "entropy": 0.2941398710012436, "epoch": 0.2949061662198391, "grad_norm": 0.349609375, "learning_rate": 1.999785647385328e-05, "loss": 0.3772, "mean_token_accuracy": 0.9175294041633606, "num_tokens": 5236167.0, "step": 1265 }, { "entropy": 0.420187583938241, "epoch": 0.29607180324047094, "grad_norm": 2.765625, "learning_rate": 1.9997838560252122e-05, "loss": 0.6751, "mean_token_accuracy": 0.8618550479412079, "num_tokens": 5253019.0, "step": 1270 }, { "entropy": 0.30597032606601715, "epoch": 0.2972374402611027, "grad_norm": 4.0625, "learning_rate": 1.999782057212568e-05, "loss": 0.7702, "mean_token_accuracy": 0.8824066698551178, "num_tokens": 5272393.0, "step": 1275 }, { "entropy": 0.32190938405692576, "epoch": 0.2984030772817345, "grad_norm": 0.94140625, "learning_rate": 1.999780250947421e-05, "loss": 0.5561, "mean_token_accuracy": 0.8893412470817565, "num_tokens": 5287540.0, "step": 1280 }, { "entropy": 0.25939189046621325, "epoch": 0.29956871430236626, "grad_norm": 5.09375, "learning_rate": 1.9997784372297987e-05, "loss": 0.5246, "mean_token_accuracy": 0.9045629918575286, "num_tokens": 5306321.0, "step": 1285 }, { "entropy": 0.3444667488336563, "epoch": 0.30073435132299803, "grad_norm": 3.28125, "learning_rate": 1.9997766160597285e-05, "loss": 0.7877, "mean_token_accuracy": 0.866803640127182, "num_tokens": 5323357.0, "step": 1290 }, { "entropy": 0.2251311082392931, "epoch": 0.3018999883436298, "grad_norm": 1.953125, "learning_rate": 1.9997747874372374e-05, "loss": 0.4677, "mean_token_accuracy": 0.9120672941207886, "num_tokens": 5342328.0, "step": 1295 }, { "entropy": 0.4307561069726944, "epoch": 0.30306562536426157, "grad_norm": 1.671875, "learning_rate": 1.9997729513623523e-05, "loss": 0.7848, "mean_token_accuracy": 0.8459433108568192, "num_tokens": 5357527.0, "step": 1300 }, { "entropy": 0.22801509127020836, "epoch": 0.30423126238489334, "grad_norm": 2.25, "learning_rate": 1.999771107835101e-05, "loss": 0.5102, "mean_token_accuracy": 0.904511559009552, "num_tokens": 5372786.0, "step": 1305 }, { "entropy": 0.38460721224546435, "epoch": 0.3053968994055251, "grad_norm": 4.46875, "learning_rate": 1.9997692568555102e-05, "loss": 0.7097, "mean_token_accuracy": 0.8811452388763428, "num_tokens": 5383589.0, "step": 1310 }, { "entropy": 0.34651473462581633, "epoch": 0.3065625364261569, "grad_norm": 2.0, "learning_rate": 1.9997673984236085e-05, "loss": 0.7524, "mean_token_accuracy": 0.8840455830097198, "num_tokens": 5394471.0, "step": 1315 }, { "entropy": 0.25617843568325044, "epoch": 0.30772817344678866, "grad_norm": 2.53125, "learning_rate": 1.9997655325394232e-05, "loss": 0.6068, "mean_token_accuracy": 0.910620141029358, "num_tokens": 5408393.0, "step": 1320 }, { "entropy": 0.2709501812234521, "epoch": 0.3088938104674204, "grad_norm": 2.46875, "learning_rate": 1.999763659202982e-05, "loss": 0.4298, "mean_token_accuracy": 0.8932905435562134, "num_tokens": 5437024.0, "step": 1325 }, { "entropy": 0.28042895793914796, "epoch": 0.3100594474880522, "grad_norm": 0.5234375, "learning_rate": 1.9997617784143132e-05, "loss": 0.5373, "mean_token_accuracy": 0.883009546995163, "num_tokens": 5466512.0, "step": 1330 }, { "entropy": 0.29205573797225953, "epoch": 0.31122508450868397, "grad_norm": 1.7265625, "learning_rate": 1.9997598901734444e-05, "loss": 0.5968, "mean_token_accuracy": 0.8985353469848633, "num_tokens": 5480158.0, "step": 1335 }, { "entropy": 0.28621186055243014, "epoch": 0.3123907215293158, "grad_norm": 1.65625, "learning_rate": 1.9997579944804038e-05, "loss": 0.4702, "mean_token_accuracy": 0.9099491775035858, "num_tokens": 5496242.0, "step": 1340 }, { "entropy": 0.388295142352581, "epoch": 0.31355635854994757, "grad_norm": 2.546875, "learning_rate": 1.9997560913352202e-05, "loss": 0.7143, "mean_token_accuracy": 0.8692742168903351, "num_tokens": 5521237.0, "step": 1345 }, { "entropy": 0.26313092596828935, "epoch": 0.31472199557057934, "grad_norm": 0.35546875, "learning_rate": 1.9997541807379213e-05, "loss": 0.5295, "mean_token_accuracy": 0.9129705667495728, "num_tokens": 5543355.0, "step": 1350 }, { "entropy": 0.2963793471455574, "epoch": 0.3158876325912111, "grad_norm": 4.75, "learning_rate": 1.999752262688536e-05, "loss": 0.5275, "mean_token_accuracy": 0.8948480784893036, "num_tokens": 5567454.0, "step": 1355 }, { "entropy": 0.23987912703305483, "epoch": 0.3170532696118429, "grad_norm": 2.921875, "learning_rate": 1.999750337187093e-05, "loss": 0.5735, "mean_token_accuracy": 0.8970786690711975, "num_tokens": 5591399.0, "step": 1360 }, { "entropy": 0.44506589621305465, "epoch": 0.31821890663247465, "grad_norm": 3.625, "learning_rate": 1.9997484042336207e-05, "loss": 0.7905, "mean_token_accuracy": 0.8698137998580933, "num_tokens": 5606132.0, "step": 1365 }, { "entropy": 0.2002415034919977, "epoch": 0.3193845436531064, "grad_norm": 0.265625, "learning_rate": 1.9997464638281475e-05, "loss": 0.1768, "mean_token_accuracy": 0.9350118517875672, "num_tokens": 5645491.0, "step": 1370 }, { "entropy": 0.233334668725729, "epoch": 0.3205501806737382, "grad_norm": 0.8828125, "learning_rate": 1.9997445159707035e-05, "loss": 0.2001, "mean_token_accuracy": 0.9283820569515229, "num_tokens": 5677361.0, "step": 1375 }, { "entropy": 0.28531249314546586, "epoch": 0.32171581769436997, "grad_norm": 0.859375, "learning_rate": 1.999742560661317e-05, "loss": 0.8321, "mean_token_accuracy": 0.8798725843429566, "num_tokens": 5691533.0, "step": 1380 }, { "entropy": 0.33057908453047274, "epoch": 0.32288145471500174, "grad_norm": 2.234375, "learning_rate": 1.9997405979000172e-05, "loss": 0.6412, "mean_token_accuracy": 0.8915965855121613, "num_tokens": 5706375.0, "step": 1385 }, { "entropy": 0.33576981276273726, "epoch": 0.3240470917356335, "grad_norm": 3.125, "learning_rate": 1.9997386276868332e-05, "loss": 0.6126, "mean_token_accuracy": 0.8828125953674316, "num_tokens": 5723036.0, "step": 1390 }, { "entropy": 0.4078744914382696, "epoch": 0.3252127287562653, "grad_norm": 3.046875, "learning_rate": 1.999736650021795e-05, "loss": 0.573, "mean_token_accuracy": 0.8879797518253326, "num_tokens": 5745820.0, "step": 1395 }, { "entropy": 0.40988370552659037, "epoch": 0.32637836577689705, "grad_norm": 2.421875, "learning_rate": 1.9997346649049314e-05, "loss": 0.6882, "mean_token_accuracy": 0.8822323262691498, "num_tokens": 5773501.0, "step": 1400 }, { "entropy": 0.2149012926965952, "epoch": 0.3275440027975289, "grad_norm": 0.62890625, "learning_rate": 1.9997326723362725e-05, "loss": 0.2689, "mean_token_accuracy": 0.9297385811805725, "num_tokens": 5800329.0, "step": 1405 }, { "entropy": 0.38212478160858154, "epoch": 0.32870963981816065, "grad_norm": 4.28125, "learning_rate": 1.9997306723158477e-05, "loss": 0.7987, "mean_token_accuracy": 0.8482033610343933, "num_tokens": 5809536.0, "step": 1410 }, { "entropy": 0.33134956248104575, "epoch": 0.3298752768387924, "grad_norm": 2.84375, "learning_rate": 1.999728664843687e-05, "loss": 0.5945, "mean_token_accuracy": 0.8864220082759857, "num_tokens": 5826701.0, "step": 1415 }, { "entropy": 0.34533427506685255, "epoch": 0.3310409138594242, "grad_norm": 3.6875, "learning_rate": 1.9997266499198203e-05, "loss": 0.7109, "mean_token_accuracy": 0.8641054213047028, "num_tokens": 5840942.0, "step": 1420 }, { "entropy": 0.3226647362112999, "epoch": 0.33220655088005596, "grad_norm": 2.0625, "learning_rate": 1.9997246275442776e-05, "loss": 0.7447, "mean_token_accuracy": 0.8617903709411621, "num_tokens": 5854515.0, "step": 1425 }, { "entropy": 0.300795790925622, "epoch": 0.33337218790068773, "grad_norm": 1.21875, "learning_rate": 1.999722597717089e-05, "loss": 0.7193, "mean_token_accuracy": 0.8860871911048889, "num_tokens": 5870124.0, "step": 1430 }, { "entropy": 0.27445814162492754, "epoch": 0.3345378249213195, "grad_norm": 0.94921875, "learning_rate": 1.999720560438285e-05, "loss": 0.4695, "mean_token_accuracy": 0.9113145053386689, "num_tokens": 5894072.0, "step": 1435 }, { "entropy": 0.2666664507240057, "epoch": 0.3357034619419513, "grad_norm": 1.109375, "learning_rate": 1.9997185157078958e-05, "loss": 0.5023, "mean_token_accuracy": 0.9125133752822876, "num_tokens": 5908980.0, "step": 1440 }, { "entropy": 0.2709391973912716, "epoch": 0.33686909896258305, "grad_norm": 0.90234375, "learning_rate": 1.9997164635259515e-05, "loss": 0.6362, "mean_token_accuracy": 0.9022528111934662, "num_tokens": 5930796.0, "step": 1445 }, { "entropy": 0.31120129972696303, "epoch": 0.3380347359832148, "grad_norm": 1.890625, "learning_rate": 1.9997144038924836e-05, "loss": 0.6915, "mean_token_accuracy": 0.8787729322910309, "num_tokens": 5942512.0, "step": 1450 }, { "entropy": 0.41584598571062087, "epoch": 0.3392003730038466, "grad_norm": 4.09375, "learning_rate": 1.999712336807522e-05, "loss": 0.8791, "mean_token_accuracy": 0.8504727482795715, "num_tokens": 5954320.0, "step": 1455 }, { "entropy": 0.3380214340984821, "epoch": 0.34036601002447836, "grad_norm": 2.421875, "learning_rate": 1.9997102622710983e-05, "loss": 0.5363, "mean_token_accuracy": 0.8888375997543335, "num_tokens": 5981457.0, "step": 1460 }, { "entropy": 0.2891572520136833, "epoch": 0.34153164704511013, "grad_norm": 0.2890625, "learning_rate": 1.9997081802832427e-05, "loss": 0.6988, "mean_token_accuracy": 0.8861670017242431, "num_tokens": 6004171.0, "step": 1465 }, { "entropy": 0.3689094468951225, "epoch": 0.3426972840657419, "grad_norm": 1.6953125, "learning_rate": 1.9997060908439864e-05, "loss": 0.8413, "mean_token_accuracy": 0.8352766156196594, "num_tokens": 6013575.0, "step": 1470 }, { "entropy": 0.262055104598403, "epoch": 0.34386292108637373, "grad_norm": 2.703125, "learning_rate": 1.999703993953361e-05, "loss": 0.4639, "mean_token_accuracy": 0.9029669284820556, "num_tokens": 6033804.0, "step": 1475 }, { "entropy": 0.2753362640738487, "epoch": 0.3450285581070055, "grad_norm": 2.28125, "learning_rate": 1.999701889611397e-05, "loss": 0.4095, "mean_token_accuracy": 0.9083203911781311, "num_tokens": 6055032.0, "step": 1480 }, { "entropy": 0.32363075017929077, "epoch": 0.34619419512763727, "grad_norm": 5.0, "learning_rate": 1.9996997778181268e-05, "loss": 0.8892, "mean_token_accuracy": 0.8496161878108979, "num_tokens": 6065287.0, "step": 1485 }, { "entropy": 0.27738819643855095, "epoch": 0.34735983214826904, "grad_norm": 2.171875, "learning_rate": 1.999697658573581e-05, "loss": 0.5262, "mean_token_accuracy": 0.904751843214035, "num_tokens": 6080176.0, "step": 1490 }, { "entropy": 0.29178103134036065, "epoch": 0.3485254691689008, "grad_norm": 1.40625, "learning_rate": 1.9996955318777914e-05, "loss": 0.6819, "mean_token_accuracy": 0.8912020802497864, "num_tokens": 6104331.0, "step": 1495 }, { "entropy": 0.30065750852227213, "epoch": 0.3496911061895326, "grad_norm": 4.78125, "learning_rate": 1.99969339773079e-05, "loss": 0.4326, "mean_token_accuracy": 0.8998681366443634, "num_tokens": 6120457.0, "step": 1500 }, { "entropy": 0.38931316807866095, "epoch": 0.35085674321016436, "grad_norm": 0.240234375, "learning_rate": 1.9996912561326082e-05, "loss": 0.7776, "mean_token_accuracy": 0.8692282378673554, "num_tokens": 6142385.0, "step": 1505 }, { "entropy": 0.2787280652672052, "epoch": 0.35202238023079613, "grad_norm": 5.71875, "learning_rate": 1.9996891070832785e-05, "loss": 0.5146, "mean_token_accuracy": 0.8925648927688599, "num_tokens": 6167164.0, "step": 1510 }, { "entropy": 0.38545300289988516, "epoch": 0.3531880172514279, "grad_norm": 0.38671875, "learning_rate": 1.9996869505828327e-05, "loss": 0.6809, "mean_token_accuracy": 0.873318862915039, "num_tokens": 6181075.0, "step": 1515 }, { "entropy": 0.4982053153216839, "epoch": 0.35435365427205967, "grad_norm": 2.453125, "learning_rate": 1.9996847866313026e-05, "loss": 0.8632, "mean_token_accuracy": 0.8616338968276978, "num_tokens": 6192075.0, "step": 1520 }, { "entropy": 0.29975649788975717, "epoch": 0.35551929129269144, "grad_norm": 1.203125, "learning_rate": 1.999682615228721e-05, "loss": 0.4278, "mean_token_accuracy": 0.8942910373210907, "num_tokens": 6211561.0, "step": 1525 }, { "entropy": 0.35748309791088106, "epoch": 0.3566849283133232, "grad_norm": 1.546875, "learning_rate": 1.99968043637512e-05, "loss": 0.8788, "mean_token_accuracy": 0.8682412981987, "num_tokens": 6222311.0, "step": 1530 }, { "entropy": 0.30189107209444044, "epoch": 0.357850565333955, "grad_norm": 2.65625, "learning_rate": 1.999678250070532e-05, "loss": 0.6564, "mean_token_accuracy": 0.8856256902217865, "num_tokens": 6232548.0, "step": 1535 }, { "entropy": 0.30567995868623254, "epoch": 0.35901620235458676, "grad_norm": 0.57421875, "learning_rate": 1.9996760563149898e-05, "loss": 0.6377, "mean_token_accuracy": 0.8897257745265961, "num_tokens": 6256342.0, "step": 1540 }, { "entropy": 0.3873858168721199, "epoch": 0.3601818393752186, "grad_norm": 4.5625, "learning_rate": 1.999673855108526e-05, "loss": 0.7463, "mean_token_accuracy": 0.8492149412631989, "num_tokens": 6269451.0, "step": 1545 }, { "entropy": 0.333609714359045, "epoch": 0.36134747639585035, "grad_norm": 5.0625, "learning_rate": 1.9996716464511735e-05, "loss": 0.7832, "mean_token_accuracy": 0.8789271175861358, "num_tokens": 6279773.0, "step": 1550 }, { "entropy": 0.27410609275102615, "epoch": 0.3625131134164821, "grad_norm": 2.703125, "learning_rate": 1.9996694303429653e-05, "loss": 0.6224, "mean_token_accuracy": 0.9015877664089202, "num_tokens": 6298073.0, "step": 1555 }, { "entropy": 0.2983283132314682, "epoch": 0.3636787504371139, "grad_norm": 0.8359375, "learning_rate": 1.9996672067839344e-05, "loss": 0.541, "mean_token_accuracy": 0.8865351200103759, "num_tokens": 6328075.0, "step": 1560 }, { "entropy": 0.3246372312307358, "epoch": 0.36484438745774567, "grad_norm": 0.828125, "learning_rate": 1.9996649757741138e-05, "loss": 0.5547, "mean_token_accuracy": 0.8967100918293, "num_tokens": 6356726.0, "step": 1565 }, { "entropy": 0.26282679475843906, "epoch": 0.36601002447837744, "grad_norm": 1.9609375, "learning_rate": 1.999662737313537e-05, "loss": 0.3613, "mean_token_accuracy": 0.917979234457016, "num_tokens": 6380621.0, "step": 1570 }, { "entropy": 0.3043300576508045, "epoch": 0.3671756614990092, "grad_norm": 1.2734375, "learning_rate": 1.999660491402237e-05, "loss": 0.3796, "mean_token_accuracy": 0.9121894776821137, "num_tokens": 6405319.0, "step": 1575 }, { "entropy": 0.2572882641106844, "epoch": 0.368341298519641, "grad_norm": 3.28125, "learning_rate": 1.9996582380402475e-05, "loss": 0.6378, "mean_token_accuracy": 0.8814368367195129, "num_tokens": 6424737.0, "step": 1580 }, { "entropy": 0.21875108852982522, "epoch": 0.36950693554027275, "grad_norm": 0.87890625, "learning_rate": 1.9996559772276024e-05, "loss": 0.3042, "mean_token_accuracy": 0.9384470582008362, "num_tokens": 6459703.0, "step": 1585 }, { "entropy": 0.3576366286724806, "epoch": 0.3706725725609045, "grad_norm": 1.0234375, "learning_rate": 1.9996537089643352e-05, "loss": 0.5118, "mean_token_accuracy": 0.8810594260692597, "num_tokens": 6486737.0, "step": 1590 }, { "entropy": 0.22341410033404827, "epoch": 0.3718382095815363, "grad_norm": 0.9375, "learning_rate": 1.9996514332504795e-05, "loss": 0.3055, "mean_token_accuracy": 0.9157830238342285, "num_tokens": 6522014.0, "step": 1595 }, { "entropy": 0.3736105814576149, "epoch": 0.37300384660216807, "grad_norm": 1.09375, "learning_rate": 1.999649150086069e-05, "loss": 0.854, "mean_token_accuracy": 0.8709351003170014, "num_tokens": 6530820.0, "step": 1600 }, { "entropy": 0.3235956601798534, "epoch": 0.37416948362279984, "grad_norm": 2.828125, "learning_rate": 1.999646859471139e-05, "loss": 0.5949, "mean_token_accuracy": 0.8829461574554444, "num_tokens": 6545363.0, "step": 1605 }, { "entropy": 0.2963440466672182, "epoch": 0.3753351206434316, "grad_norm": 1.8984375, "learning_rate": 1.9996445614057227e-05, "loss": 0.5785, "mean_token_accuracy": 0.9058303475379944, "num_tokens": 6561658.0, "step": 1610 }, { "entropy": 0.34780505336821077, "epoch": 0.37650075766406343, "grad_norm": 0.3125, "learning_rate": 1.999642255889854e-05, "loss": 0.7008, "mean_token_accuracy": 0.8779121518135071, "num_tokens": 6586862.0, "step": 1615 }, { "entropy": 0.31498599648475645, "epoch": 0.3776663946846952, "grad_norm": 6.125, "learning_rate": 1.999639942923568e-05, "loss": 0.7251, "mean_token_accuracy": 0.8791738033294678, "num_tokens": 6600831.0, "step": 1620 }, { "entropy": 0.4721190705895424, "epoch": 0.378832031705327, "grad_norm": 2.015625, "learning_rate": 1.999637622506899e-05, "loss": 0.8876, "mean_token_accuracy": 0.8171874463558197, "num_tokens": 6627349.0, "step": 1625 }, { "entropy": 0.24465088918805122, "epoch": 0.37999766872595875, "grad_norm": 1.3671875, "learning_rate": 1.9996352946398812e-05, "loss": 0.5261, "mean_token_accuracy": 0.9152758538722991, "num_tokens": 6640587.0, "step": 1630 }, { "entropy": 0.19936469215899705, "epoch": 0.3811633057465905, "grad_norm": 1.421875, "learning_rate": 1.99963295932255e-05, "loss": 0.1646, "mean_token_accuracy": 0.9334083795547485, "num_tokens": 6683779.0, "step": 1635 }, { "entropy": 0.2622247666120529, "epoch": 0.3823289427672223, "grad_norm": 0.55078125, "learning_rate": 1.9996306165549398e-05, "loss": 0.5497, "mean_token_accuracy": 0.9146570563316345, "num_tokens": 6701209.0, "step": 1640 }, { "entropy": 0.2674731440842152, "epoch": 0.38349457978785406, "grad_norm": 1.4140625, "learning_rate": 1.9996282663370855e-05, "loss": 0.5693, "mean_token_accuracy": 0.9051762282848358, "num_tokens": 6720783.0, "step": 1645 }, { "entropy": 0.26045389622449877, "epoch": 0.38466021680848583, "grad_norm": 1.390625, "learning_rate": 1.9996259086690225e-05, "loss": 0.551, "mean_token_accuracy": 0.907971054315567, "num_tokens": 6732408.0, "step": 1650 }, { "entropy": 0.19415582679212093, "epoch": 0.3858258538291176, "grad_norm": 2.09375, "learning_rate": 1.9996235435507857e-05, "loss": 0.2055, "mean_token_accuracy": 0.9295479655265808, "num_tokens": 6762843.0, "step": 1655 }, { "entropy": 0.2839368298649788, "epoch": 0.3869914908497494, "grad_norm": 2.734375, "learning_rate": 1.9996211709824103e-05, "loss": 0.602, "mean_token_accuracy": 0.8842163443565368, "num_tokens": 6774026.0, "step": 1660 }, { "entropy": 0.277092095091939, "epoch": 0.38815712787038115, "grad_norm": 2.578125, "learning_rate": 1.999618790963932e-05, "loss": 0.4622, "mean_token_accuracy": 0.8996878266334534, "num_tokens": 6791014.0, "step": 1665 }, { "entropy": 0.30382088720798495, "epoch": 0.3893227648910129, "grad_norm": 1.6640625, "learning_rate": 1.9996164034953856e-05, "loss": 0.6885, "mean_token_accuracy": 0.8841532289981842, "num_tokens": 6803386.0, "step": 1670 }, { "entropy": 0.26629671454429626, "epoch": 0.3904884019116447, "grad_norm": 1.2890625, "learning_rate": 1.9996140085768075e-05, "loss": 0.2775, "mean_token_accuracy": 0.918737781047821, "num_tokens": 6834927.0, "step": 1675 }, { "entropy": 0.2534227319061756, "epoch": 0.3916540389322765, "grad_norm": 1.3203125, "learning_rate": 1.9996116062082328e-05, "loss": 0.4968, "mean_token_accuracy": 0.9031592726707458, "num_tokens": 6853376.0, "step": 1680 }, { "entropy": 0.466543810069561, "epoch": 0.3928196759529083, "grad_norm": 3.078125, "learning_rate": 1.9996091963896977e-05, "loss": 1.0411, "mean_token_accuracy": 0.8225615203380585, "num_tokens": 6863019.0, "step": 1685 }, { "entropy": 0.41575510799884796, "epoch": 0.39398531297354006, "grad_norm": 6.5, "learning_rate": 1.999606779121238e-05, "loss": 0.7638, "mean_token_accuracy": 0.8457536458969116, "num_tokens": 6877920.0, "step": 1690 }, { "entropy": 0.26502939909696577, "epoch": 0.39515094999417183, "grad_norm": 5.5625, "learning_rate": 1.99960435440289e-05, "loss": 0.5983, "mean_token_accuracy": 0.8899563789367676, "num_tokens": 6892767.0, "step": 1695 }, { "entropy": 0.3662456482648849, "epoch": 0.3963165870148036, "grad_norm": 2.75, "learning_rate": 1.9996019222346896e-05, "loss": 0.7134, "mean_token_accuracy": 0.8851879596710205, "num_tokens": 6909505.0, "step": 1700 }, { "entropy": 0.24746759831905366, "epoch": 0.39748222403543537, "grad_norm": 0.298828125, "learning_rate": 1.9995994826166728e-05, "loss": 0.3404, "mean_token_accuracy": 0.9107642292976379, "num_tokens": 6929397.0, "step": 1705 }, { "entropy": 0.3495868884027004, "epoch": 0.39864786105606714, "grad_norm": 2.375, "learning_rate": 1.9995970355488765e-05, "loss": 0.7069, "mean_token_accuracy": 0.865958285331726, "num_tokens": 6945542.0, "step": 1710 }, { "entropy": 0.2592170963063836, "epoch": 0.3998134980766989, "grad_norm": 0.328125, "learning_rate": 1.999594581031337e-05, "loss": 0.5571, "mean_token_accuracy": 0.8999188899993896, "num_tokens": 6971445.0, "step": 1715 }, { "entropy": 0.29373019393533467, "epoch": 0.4009791350973307, "grad_norm": 0.158203125, "learning_rate": 1.999592119064091e-05, "loss": 0.4141, "mean_token_accuracy": 0.9174761772155762, "num_tokens": 6999820.0, "step": 1720 }, { "entropy": 0.2551829546689987, "epoch": 0.40214477211796246, "grad_norm": 2.515625, "learning_rate": 1.999589649647175e-05, "loss": 0.6868, "mean_token_accuracy": 0.8982496917247772, "num_tokens": 7015487.0, "step": 1725 }, { "entropy": 0.3725998356938362, "epoch": 0.40331040913859423, "grad_norm": 1.0703125, "learning_rate": 1.9995871727806257e-05, "loss": 0.6217, "mean_token_accuracy": 0.8710966169834137, "num_tokens": 7047747.0, "step": 1730 }, { "entropy": 0.3806193895637989, "epoch": 0.404476046159226, "grad_norm": 1.359375, "learning_rate": 1.99958468846448e-05, "loss": 0.717, "mean_token_accuracy": 0.8549498856067658, "num_tokens": 7075138.0, "step": 1735 }, { "entropy": 0.5255543380975723, "epoch": 0.40564168317985777, "grad_norm": 3.828125, "learning_rate": 1.9995821966987754e-05, "loss": 1.1207, "mean_token_accuracy": 0.8527864456176758, "num_tokens": 7093547.0, "step": 1740 }, { "entropy": 0.28944441452622416, "epoch": 0.40680732020048954, "grad_norm": 0.796875, "learning_rate": 1.9995796974835492e-05, "loss": 0.4932, "mean_token_accuracy": 0.9065792560577393, "num_tokens": 7113040.0, "step": 1745 }, { "entropy": 0.341873537003994, "epoch": 0.40797295722112137, "grad_norm": 2.546875, "learning_rate": 1.9995771908188377e-05, "loss": 0.8112, "mean_token_accuracy": 0.8654444336891174, "num_tokens": 7138138.0, "step": 1750 }, { "entropy": 0.23857189230620862, "epoch": 0.40913859424175314, "grad_norm": 1.609375, "learning_rate": 1.9995746767046794e-05, "loss": 0.4241, "mean_token_accuracy": 0.9137138366699219, "num_tokens": 7162249.0, "step": 1755 }, { "entropy": 0.2880829580128193, "epoch": 0.4103042312623849, "grad_norm": 2.078125, "learning_rate": 1.999572155141111e-05, "loss": 0.6309, "mean_token_accuracy": 0.8943787276744842, "num_tokens": 7177792.0, "step": 1760 }, { "entropy": 0.3655803993344307, "epoch": 0.4114698682830167, "grad_norm": 0.3515625, "learning_rate": 1.9995696261281703e-05, "loss": 0.6261, "mean_token_accuracy": 0.8850533306598664, "num_tokens": 7192528.0, "step": 1765 }, { "entropy": 0.2609210178256035, "epoch": 0.41263550530364845, "grad_norm": 1.015625, "learning_rate": 1.999567089665895e-05, "loss": 0.3491, "mean_token_accuracy": 0.9026422142982483, "num_tokens": 7216693.0, "step": 1770 }, { "entropy": 0.3143754366785288, "epoch": 0.4138011423242802, "grad_norm": 3.453125, "learning_rate": 1.9995645457543232e-05, "loss": 0.5448, "mean_token_accuracy": 0.8901515245437622, "num_tokens": 7236689.0, "step": 1775 }, { "entropy": 0.33804006576538087, "epoch": 0.414966779344912, "grad_norm": 3.859375, "learning_rate": 1.999561994393493e-05, "loss": 0.8207, "mean_token_accuracy": 0.8533077538013458, "num_tokens": 7246576.0, "step": 1780 }, { "entropy": 0.19590835236012935, "epoch": 0.41613241636554377, "grad_norm": 1.5, "learning_rate": 1.9995594355834417e-05, "loss": 0.3561, "mean_token_accuracy": 0.9268556416034699, "num_tokens": 7274075.0, "step": 1785 }, { "entropy": 0.41329563707113265, "epoch": 0.41729805338617554, "grad_norm": 6.3125, "learning_rate": 1.999556869324208e-05, "loss": 0.9415, "mean_token_accuracy": 0.8555647194385528, "num_tokens": 7291078.0, "step": 1790 }, { "entropy": 0.23999712951481342, "epoch": 0.4184636904068073, "grad_norm": 2.140625, "learning_rate": 1.9995542956158296e-05, "loss": 0.4092, "mean_token_accuracy": 0.9112877190113068, "num_tokens": 7332326.0, "step": 1795 }, { "entropy": 0.2772911600768566, "epoch": 0.4196293274274391, "grad_norm": 0.482421875, "learning_rate": 1.9995517144583454e-05, "loss": 0.579, "mean_token_accuracy": 0.9155928194522858, "num_tokens": 7348012.0, "step": 1800 }, { "entropy": 0.3538199070841074, "epoch": 0.42079496444807085, "grad_norm": 1.09375, "learning_rate": 1.999549125851794e-05, "loss": 0.8149, "mean_token_accuracy": 0.8583323359489441, "num_tokens": 7361301.0, "step": 1805 }, { "entropy": 0.4551861360669136, "epoch": 0.4219606014687026, "grad_norm": 4.5, "learning_rate": 1.9995465297962138e-05, "loss": 1.06, "mean_token_accuracy": 0.8336413264274597, "num_tokens": 7368604.0, "step": 1810 }, { "entropy": 0.28974622786045073, "epoch": 0.4231262384893344, "grad_norm": 0.5625, "learning_rate": 1.9995439262916433e-05, "loss": 0.5063, "mean_token_accuracy": 0.9072389006614685, "num_tokens": 7392166.0, "step": 1815 }, { "entropy": 0.35121317878365516, "epoch": 0.4242918755099662, "grad_norm": 1.34375, "learning_rate": 1.9995413153381215e-05, "loss": 0.6501, "mean_token_accuracy": 0.8880411267280579, "num_tokens": 7409022.0, "step": 1820 }, { "entropy": 0.3598515219986439, "epoch": 0.425457512530598, "grad_norm": 0.7265625, "learning_rate": 1.9995386969356875e-05, "loss": 0.6913, "mean_token_accuracy": 0.8816120862960816, "num_tokens": 7431789.0, "step": 1825 }, { "entropy": 0.5655100494623184, "epoch": 0.42662314955122976, "grad_norm": 2.953125, "learning_rate": 1.9995360710843797e-05, "loss": 1.1957, "mean_token_accuracy": 0.8307225584983826, "num_tokens": 7454374.0, "step": 1830 }, { "entropy": 0.2977691352367401, "epoch": 0.42778878657186153, "grad_norm": 0.5703125, "learning_rate": 1.9995334377842385e-05, "loss": 0.6536, "mean_token_accuracy": 0.8897909998893738, "num_tokens": 7473775.0, "step": 1835 }, { "entropy": 0.255497844517231, "epoch": 0.4289544235924933, "grad_norm": 2.765625, "learning_rate": 1.999530797035302e-05, "loss": 0.4676, "mean_token_accuracy": 0.9138294994831085, "num_tokens": 7487979.0, "step": 1840 }, { "entropy": 0.368525767326355, "epoch": 0.4301200606131251, "grad_norm": 4.90625, "learning_rate": 1.9995281488376097e-05, "loss": 0.652, "mean_token_accuracy": 0.8939944744110108, "num_tokens": 7500991.0, "step": 1845 }, { "entropy": 0.3272201903164387, "epoch": 0.43128569763375685, "grad_norm": 2.359375, "learning_rate": 1.9995254931912017e-05, "loss": 0.5455, "mean_token_accuracy": 0.885904461145401, "num_tokens": 7514349.0, "step": 1850 }, { "entropy": 0.2522231835871935, "epoch": 0.4324513346543886, "grad_norm": 0.40625, "learning_rate": 1.9995228300961175e-05, "loss": 0.2802, "mean_token_accuracy": 0.917109364271164, "num_tokens": 7550584.0, "step": 1855 }, { "entropy": 0.38599798902869226, "epoch": 0.4336169716750204, "grad_norm": 4.09375, "learning_rate": 1.999520159552396e-05, "loss": 0.5278, "mean_token_accuracy": 0.8643686592578887, "num_tokens": 7582607.0, "step": 1860 }, { "entropy": 0.3535305127501488, "epoch": 0.43478260869565216, "grad_norm": 2.109375, "learning_rate": 1.999517481560078e-05, "loss": 1.0017, "mean_token_accuracy": 0.8477705299854279, "num_tokens": 7591089.0, "step": 1865 }, { "entropy": 0.2962531797587872, "epoch": 0.43594824571628393, "grad_norm": 0.75390625, "learning_rate": 1.999514796119203e-05, "loss": 0.4272, "mean_token_accuracy": 0.9027734518051147, "num_tokens": 7619933.0, "step": 1870 }, { "entropy": 0.23546839468181133, "epoch": 0.4371138827369157, "grad_norm": 0.42578125, "learning_rate": 1.9995121032298107e-05, "loss": 0.3185, "mean_token_accuracy": 0.9401553213596344, "num_tokens": 7639102.0, "step": 1875 }, { "entropy": 0.2960228305310011, "epoch": 0.4382795197575475, "grad_norm": 0.306640625, "learning_rate": 1.999509402891942e-05, "loss": 0.3647, "mean_token_accuracy": 0.9110294997692108, "num_tokens": 7662303.0, "step": 1880 }, { "entropy": 0.28797239661216734, "epoch": 0.4394451567781793, "grad_norm": 1.28125, "learning_rate": 1.999506695105637e-05, "loss": 0.3928, "mean_token_accuracy": 0.9123729705810547, "num_tokens": 7690576.0, "step": 1885 }, { "entropy": 0.4960401579737663, "epoch": 0.4406107937988111, "grad_norm": 2.453125, "learning_rate": 1.9995039798709356e-05, "loss": 0.8772, "mean_token_accuracy": 0.8687143921852112, "num_tokens": 7697941.0, "step": 1890 }, { "entropy": 0.3485689952969551, "epoch": 0.44177643081944284, "grad_norm": 1.828125, "learning_rate": 1.9995012571878784e-05, "loss": 0.7626, "mean_token_accuracy": 0.8469592273235321, "num_tokens": 7717516.0, "step": 1895 }, { "entropy": 0.3791528955101967, "epoch": 0.4429420678400746, "grad_norm": 3.0625, "learning_rate": 1.9994985270565068e-05, "loss": 0.7173, "mean_token_accuracy": 0.8701053142547608, "num_tokens": 7736106.0, "step": 1900 }, { "entropy": 0.25195520669221877, "epoch": 0.4441077048607064, "grad_norm": 1.78125, "learning_rate": 1.99949578947686e-05, "loss": 0.5638, "mean_token_accuracy": 0.9047086894512176, "num_tokens": 7748913.0, "step": 1905 }, { "entropy": 0.3371236763894558, "epoch": 0.44527334188133816, "grad_norm": 0.359375, "learning_rate": 1.99949304444898e-05, "loss": 0.5597, "mean_token_accuracy": 0.8984895050525665, "num_tokens": 7766235.0, "step": 1910 }, { "entropy": 0.305242264457047, "epoch": 0.44643897890196993, "grad_norm": 4.40625, "learning_rate": 1.999490291972908e-05, "loss": 0.4301, "mean_token_accuracy": 0.9092363059520722, "num_tokens": 7794204.0, "step": 1915 }, { "entropy": 0.2959679692983627, "epoch": 0.4476046159226017, "grad_norm": 2.546875, "learning_rate": 1.9994875320486837e-05, "loss": 0.6583, "mean_token_accuracy": 0.9009743809700013, "num_tokens": 7809446.0, "step": 1920 }, { "entropy": 0.24514828026294708, "epoch": 0.44877025294323347, "grad_norm": 2.484375, "learning_rate": 1.9994847646763495e-05, "loss": 0.6453, "mean_token_accuracy": 0.8898572206497193, "num_tokens": 7821972.0, "step": 1925 }, { "entropy": 0.40760628655552866, "epoch": 0.44993588996386524, "grad_norm": 1.8828125, "learning_rate": 1.9994819898559458e-05, "loss": 0.809, "mean_token_accuracy": 0.8585144937038421, "num_tokens": 7841586.0, "step": 1930 }, { "entropy": 0.44150213301181795, "epoch": 0.451101526984497, "grad_norm": 3.265625, "learning_rate": 1.9994792075875147e-05, "loss": 0.9715, "mean_token_accuracy": 0.853536581993103, "num_tokens": 7854407.0, "step": 1935 }, { "entropy": 0.49898901283741, "epoch": 0.4522671640051288, "grad_norm": 2.328125, "learning_rate": 1.9994764178710974e-05, "loss": 0.9198, "mean_token_accuracy": 0.8207449436187744, "num_tokens": 7883591.0, "step": 1940 }, { "entropy": 0.30882971081882715, "epoch": 0.45343280102576056, "grad_norm": 2.71875, "learning_rate": 1.999473620706735e-05, "loss": 0.645, "mean_token_accuracy": 0.8867849349975586, "num_tokens": 7903835.0, "step": 1945 }, { "entropy": 0.3548956707119942, "epoch": 0.45459843804639233, "grad_norm": 1.7421875, "learning_rate": 1.9994708160944702e-05, "loss": 0.7981, "mean_token_accuracy": 0.85789794921875, "num_tokens": 7914330.0, "step": 1950 }, { "entropy": 0.3712115705013275, "epoch": 0.45576407506702415, "grad_norm": 2.34375, "learning_rate": 1.999468004034344e-05, "loss": 0.6738, "mean_token_accuracy": 0.8534205198287964, "num_tokens": 7935519.0, "step": 1955 }, { "entropy": 0.22481756322085858, "epoch": 0.4569297120876559, "grad_norm": 0.58984375, "learning_rate": 1.9994651845263986e-05, "loss": 0.2575, "mean_token_accuracy": 0.946908849477768, "num_tokens": 7966658.0, "step": 1960 }, { "entropy": 0.26531825475394727, "epoch": 0.4580953491082877, "grad_norm": 0.3515625, "learning_rate": 1.9994623575706762e-05, "loss": 0.3248, "mean_token_accuracy": 0.9264973163604736, "num_tokens": 7987994.0, "step": 1965 }, { "entropy": 0.27761743124574423, "epoch": 0.45926098612891947, "grad_norm": 1.625, "learning_rate": 1.9994595231672188e-05, "loss": 0.5992, "mean_token_accuracy": 0.8765576720237732, "num_tokens": 8009731.0, "step": 1970 }, { "entropy": 0.30080183520913123, "epoch": 0.46042662314955124, "grad_norm": 0.546875, "learning_rate": 1.9994566813160686e-05, "loss": 0.6036, "mean_token_accuracy": 0.878866708278656, "num_tokens": 8025321.0, "step": 1975 }, { "entropy": 0.36402922198176385, "epoch": 0.461592260170183, "grad_norm": 5.21875, "learning_rate": 1.999453832017268e-05, "loss": 0.6552, "mean_token_accuracy": 0.8646969377994538, "num_tokens": 8053893.0, "step": 1980 }, { "entropy": 0.28284838795661926, "epoch": 0.4627578971908148, "grad_norm": 1.4296875, "learning_rate": 1.9994509752708596e-05, "loss": 0.4469, "mean_token_accuracy": 0.9201807320117951, "num_tokens": 8076731.0, "step": 1985 }, { "entropy": 0.37472383230924605, "epoch": 0.46392353421144655, "grad_norm": 2.21875, "learning_rate": 1.999448111076886e-05, "loss": 0.7793, "mean_token_accuracy": 0.8679640293121338, "num_tokens": 8085669.0, "step": 1990 }, { "entropy": 0.21981629952788354, "epoch": 0.4650891712320783, "grad_norm": 6.46875, "learning_rate": 1.99944523943539e-05, "loss": 0.4973, "mean_token_accuracy": 0.9069263756275177, "num_tokens": 8101729.0, "step": 1995 }, { "entropy": 0.48663657903671265, "epoch": 0.4662548082527101, "grad_norm": 0.9375, "learning_rate": 1.999442360346414e-05, "loss": 0.3556, "mean_token_accuracy": 0.9074055433273316, "num_tokens": 8128873.0, "step": 2000 }, { "entropy": 0.36330757662653923, "epoch": 0.46742044527334187, "grad_norm": 3.46875, "learning_rate": 1.9994394738100014e-05, "loss": 0.689, "mean_token_accuracy": 0.8651768624782562, "num_tokens": 8156836.0, "step": 2005 }, { "entropy": 0.29512323513627053, "epoch": 0.46858608229397364, "grad_norm": 1.1796875, "learning_rate": 1.999436579826195e-05, "loss": 0.5125, "mean_token_accuracy": 0.8990586638450623, "num_tokens": 8176283.0, "step": 2010 }, { "entropy": 0.3674930900335312, "epoch": 0.4697517193146054, "grad_norm": 1.390625, "learning_rate": 1.999433678395038e-05, "loss": 0.7055, "mean_token_accuracy": 0.8752959787845611, "num_tokens": 8211887.0, "step": 2015 }, { "entropy": 0.3832183495163918, "epoch": 0.4709173563352372, "grad_norm": 4.53125, "learning_rate": 1.9994307695165732e-05, "loss": 0.7355, "mean_token_accuracy": 0.8708587050437927, "num_tokens": 8231249.0, "step": 2020 }, { "entropy": 0.32334342747926714, "epoch": 0.472082993355869, "grad_norm": 1.1953125, "learning_rate": 1.999427853190845e-05, "loss": 0.4626, "mean_token_accuracy": 0.8914056956768036, "num_tokens": 8254247.0, "step": 2025 }, { "entropy": 0.39032787531614305, "epoch": 0.4732486303765008, "grad_norm": 2.609375, "learning_rate": 1.9994249294178964e-05, "loss": 0.6039, "mean_token_accuracy": 0.8988476514816284, "num_tokens": 8271475.0, "step": 2030 }, { "entropy": 0.2363220054656267, "epoch": 0.47441426739713255, "grad_norm": 3.453125, "learning_rate": 1.9994219981977704e-05, "loss": 0.3266, "mean_token_accuracy": 0.9166711091995239, "num_tokens": 8298232.0, "step": 2035 }, { "entropy": 0.2717366095632315, "epoch": 0.4755799044177643, "grad_norm": 1.96875, "learning_rate": 1.9994190595305115e-05, "loss": 0.4732, "mean_token_accuracy": 0.8963268280029297, "num_tokens": 8335594.0, "step": 2040 }, { "entropy": 0.26531027555465697, "epoch": 0.4767455414383961, "grad_norm": 0.384765625, "learning_rate": 1.9994161134161632e-05, "loss": 0.5162, "mean_token_accuracy": 0.9053733050823212, "num_tokens": 8353409.0, "step": 2045 }, { "entropy": 0.2092266406863928, "epoch": 0.47791117845902786, "grad_norm": 2.21875, "learning_rate": 1.9994131598547698e-05, "loss": 0.3521, "mean_token_accuracy": 0.9272272467613221, "num_tokens": 8391098.0, "step": 2050 }, { "entropy": 0.26025751419365406, "epoch": 0.47907681547965963, "grad_norm": 0.546875, "learning_rate": 1.9994101988463748e-05, "loss": 0.588, "mean_token_accuracy": 0.8997723400592804, "num_tokens": 8410541.0, "step": 2055 }, { "entropy": 0.23223126903176308, "epoch": 0.4802424525002914, "grad_norm": 0.59375, "learning_rate": 1.9994072303910226e-05, "loss": 0.2832, "mean_token_accuracy": 0.9168312430381775, "num_tokens": 8443139.0, "step": 2060 }, { "entropy": 0.29509652592241764, "epoch": 0.4814080895209232, "grad_norm": 3.078125, "learning_rate": 1.9994042544887574e-05, "loss": 0.3704, "mean_token_accuracy": 0.9128056645393372, "num_tokens": 8473193.0, "step": 2065 }, { "entropy": 0.33343763947486876, "epoch": 0.48257372654155495, "grad_norm": 2.75, "learning_rate": 1.9994012711396235e-05, "loss": 0.5006, "mean_token_accuracy": 0.8973432779312134, "num_tokens": 8507250.0, "step": 2070 }, { "entropy": 0.25346662774682044, "epoch": 0.4837393635621867, "grad_norm": 5.0, "learning_rate": 1.999398280343666e-05, "loss": 0.3977, "mean_token_accuracy": 0.9170755028724671, "num_tokens": 8527345.0, "step": 2075 }, { "entropy": 0.27076412811875344, "epoch": 0.4849050005828185, "grad_norm": 1.296875, "learning_rate": 1.9993952821009284e-05, "loss": 0.3173, "mean_token_accuracy": 0.9087885797023774, "num_tokens": 8553539.0, "step": 2080 }, { "entropy": 0.2886177830398083, "epoch": 0.48607063760345026, "grad_norm": 1.0546875, "learning_rate": 1.9993922764114563e-05, "loss": 0.487, "mean_token_accuracy": 0.8957848846912384, "num_tokens": 8571399.0, "step": 2085 }, { "entropy": 0.27993311583995817, "epoch": 0.4872362746240821, "grad_norm": 0.42578125, "learning_rate": 1.9993892632752944e-05, "loss": 0.6064, "mean_token_accuracy": 0.9026101946830749, "num_tokens": 8586558.0, "step": 2090 }, { "entropy": 0.3958744205534458, "epoch": 0.48840191164471386, "grad_norm": 3.046875, "learning_rate": 1.999386242692487e-05, "loss": 0.8093, "mean_token_accuracy": 0.8745874762535095, "num_tokens": 8595053.0, "step": 2095 }, { "entropy": 0.24650852084159852, "epoch": 0.48956754866534563, "grad_norm": 1.5703125, "learning_rate": 1.9993832146630798e-05, "loss": 0.5389, "mean_token_accuracy": 0.911456036567688, "num_tokens": 8615637.0, "step": 2100 }, { "entropy": 0.42095171473920345, "epoch": 0.4907331856859774, "grad_norm": 0.6015625, "learning_rate": 1.9993801791871178e-05, "loss": 0.5099, "mean_token_accuracy": 0.8906022787094117, "num_tokens": 8634768.0, "step": 2105 }, { "entropy": 0.21017308831214904, "epoch": 0.4918988227066092, "grad_norm": 2.265625, "learning_rate": 1.9993771362646462e-05, "loss": 0.3232, "mean_token_accuracy": 0.9302698016166687, "num_tokens": 8660305.0, "step": 2110 }, { "entropy": 0.3113011471927166, "epoch": 0.49306445972724094, "grad_norm": 2.125, "learning_rate": 1.999374085895711e-05, "loss": 0.5919, "mean_token_accuracy": 0.8718327701091766, "num_tokens": 8688826.0, "step": 2115 }, { "entropy": 0.49419403113424776, "epoch": 0.4942300967478727, "grad_norm": 2.28125, "learning_rate": 1.999371028080356e-05, "loss": 0.8276, "mean_token_accuracy": 0.8368937015533447, "num_tokens": 8716357.0, "step": 2120 }, { "entropy": 0.3417753532528877, "epoch": 0.4953957337685045, "grad_norm": 1.609375, "learning_rate": 1.9993679628186285e-05, "loss": 0.7675, "mean_token_accuracy": 0.8862788140773773, "num_tokens": 8725375.0, "step": 2125 }, { "entropy": 0.30906201936304567, "epoch": 0.49656137078913626, "grad_norm": 2.578125, "learning_rate": 1.9993648901105734e-05, "loss": 0.4319, "mean_token_accuracy": 0.8808507740497589, "num_tokens": 8769781.0, "step": 2130 }, { "entropy": 0.38649284690618513, "epoch": 0.49772700780976803, "grad_norm": 1.4921875, "learning_rate": 1.9993618099562367e-05, "loss": 0.6999, "mean_token_accuracy": 0.8772247135639191, "num_tokens": 8782281.0, "step": 2135 }, { "entropy": 0.31261972039937974, "epoch": 0.4988926448303998, "grad_norm": 3.8125, "learning_rate": 1.9993587223556646e-05, "loss": 0.6166, "mean_token_accuracy": 0.8909159898757935, "num_tokens": 8793835.0, "step": 2140 }, { "entropy": 0.33045312389731407, "epoch": 0.5000582818510316, "grad_norm": 0.6953125, "learning_rate": 1.9993556273089027e-05, "loss": 0.4532, "mean_token_accuracy": 0.9167261719703674, "num_tokens": 8825829.0, "step": 2145 }, { "entropy": 0.31448785960674286, "epoch": 0.5012239188716634, "grad_norm": 0.7265625, "learning_rate": 1.999352524815997e-05, "loss": 0.4656, "mean_token_accuracy": 0.8962031364440918, "num_tokens": 8846276.0, "step": 2150 }, { "entropy": 0.3556225474923849, "epoch": 0.5023895558922952, "grad_norm": 0.796875, "learning_rate": 1.9993494148769944e-05, "loss": 0.3641, "mean_token_accuracy": 0.8676750302314759, "num_tokens": 8882856.0, "step": 2155 }, { "entropy": 0.38040348403155805, "epoch": 0.5035551929129269, "grad_norm": 3.96875, "learning_rate": 1.9993462974919412e-05, "loss": 0.5104, "mean_token_accuracy": 0.8853009760379791, "num_tokens": 8900856.0, "step": 2160 }, { "entropy": 0.24543123692274094, "epoch": 0.5047208299335587, "grad_norm": 0.53125, "learning_rate": 1.9993431726608832e-05, "loss": 0.3641, "mean_token_accuracy": 0.9188506543636322, "num_tokens": 8939209.0, "step": 2165 }, { "entropy": 0.38197765350341795, "epoch": 0.5058864669541905, "grad_norm": 1.1328125, "learning_rate": 1.9993400403838676e-05, "loss": 0.6432, "mean_token_accuracy": 0.8996619880199432, "num_tokens": 8965669.0, "step": 2170 }, { "entropy": 0.21522373408079148, "epoch": 0.5070521039748223, "grad_norm": 1.1171875, "learning_rate": 1.999336900660941e-05, "loss": 0.285, "mean_token_accuracy": 0.9250701904296875, "num_tokens": 8995150.0, "step": 2175 }, { "entropy": 0.24849132001399993, "epoch": 0.508217740995454, "grad_norm": 0.94921875, "learning_rate": 1.99933375349215e-05, "loss": 0.2888, "mean_token_accuracy": 0.8985641777515412, "num_tokens": 9025083.0, "step": 2180 }, { "entropy": 0.3158140107989311, "epoch": 0.5093833780160858, "grad_norm": 1.7109375, "learning_rate": 1.999330598877542e-05, "loss": 0.7789, "mean_token_accuracy": 0.8875580310821534, "num_tokens": 9044814.0, "step": 2185 }, { "entropy": 0.29365613460540774, "epoch": 0.5105490150367176, "grad_norm": 2.1875, "learning_rate": 1.9993274368171635e-05, "loss": 0.489, "mean_token_accuracy": 0.9092134416103363, "num_tokens": 9059605.0, "step": 2190 }, { "entropy": 0.3623622298240662, "epoch": 0.5117146520573493, "grad_norm": 3.015625, "learning_rate": 1.999324267311062e-05, "loss": 0.7613, "mean_token_accuracy": 0.8859258830547333, "num_tokens": 9068729.0, "step": 2195 }, { "entropy": 0.2239295145496726, "epoch": 0.5128802890779811, "grad_norm": 1.875, "learning_rate": 1.9993210903592845e-05, "loss": 0.3056, "mean_token_accuracy": 0.9294314622879029, "num_tokens": 9104084.0, "step": 2200 }, { "entropy": 0.3030864965170622, "epoch": 0.5140459260986129, "grad_norm": 2.734375, "learning_rate": 1.9993179059618786e-05, "loss": 0.4594, "mean_token_accuracy": 0.8959613561630249, "num_tokens": 9130649.0, "step": 2205 }, { "entropy": 0.36943227648735044, "epoch": 0.5152115631192447, "grad_norm": 1.203125, "learning_rate": 1.999314714118892e-05, "loss": 0.7231, "mean_token_accuracy": 0.8850698232650757, "num_tokens": 9144581.0, "step": 2210 }, { "entropy": 0.32705827206373217, "epoch": 0.5163772001398764, "grad_norm": 2.0, "learning_rate": 1.9993115148303713e-05, "loss": 0.5517, "mean_token_accuracy": 0.879249781370163, "num_tokens": 9159448.0, "step": 2215 }, { "entropy": 0.3374114118516445, "epoch": 0.5175428371605082, "grad_norm": 1.2578125, "learning_rate": 1.9993083080963655e-05, "loss": 0.5958, "mean_token_accuracy": 0.8918022990226746, "num_tokens": 9182436.0, "step": 2220 }, { "entropy": 0.2363934338092804, "epoch": 0.51870847418114, "grad_norm": 1.703125, "learning_rate": 1.9993050939169217e-05, "loss": 0.4725, "mean_token_accuracy": 0.9231416463851929, "num_tokens": 9200503.0, "step": 2225 }, { "entropy": 0.21474622301757335, "epoch": 0.5198741112017717, "grad_norm": 1.828125, "learning_rate": 1.999301872292088e-05, "loss": 0.3416, "mean_token_accuracy": 0.9220646381378174, "num_tokens": 9229185.0, "step": 2230 }, { "entropy": 0.33478106260299684, "epoch": 0.5210397482224035, "grad_norm": 4.34375, "learning_rate": 1.9992986432219122e-05, "loss": 0.7656, "mean_token_accuracy": 0.8595859110355377, "num_tokens": 9239938.0, "step": 2235 }, { "entropy": 0.2523132786154747, "epoch": 0.5222053852430353, "grad_norm": 1.7109375, "learning_rate": 1.999295406706443e-05, "loss": 0.3842, "mean_token_accuracy": 0.8974947988986969, "num_tokens": 9262129.0, "step": 2240 }, { "entropy": 0.3062707144767046, "epoch": 0.523371022263667, "grad_norm": 2.859375, "learning_rate": 1.9992921627457278e-05, "loss": 0.6809, "mean_token_accuracy": 0.8839954435825348, "num_tokens": 9278003.0, "step": 2245 }, { "entropy": 0.2711375970393419, "epoch": 0.5245366592842988, "grad_norm": 2.90625, "learning_rate": 1.9992889113398158e-05, "loss": 0.5016, "mean_token_accuracy": 0.9179113209247589, "num_tokens": 9292655.0, "step": 2250 }, { "entropy": 0.27142874896526337, "epoch": 0.5257022963049306, "grad_norm": 1.1328125, "learning_rate": 1.9992856524887553e-05, "loss": 0.6119, "mean_token_accuracy": 0.892951512336731, "num_tokens": 9303397.0, "step": 2255 }, { "entropy": 0.24010205119848252, "epoch": 0.5268679333255624, "grad_norm": 1.8671875, "learning_rate": 1.9992823861925944e-05, "loss": 0.5484, "mean_token_accuracy": 0.9073122262954711, "num_tokens": 9315706.0, "step": 2260 }, { "entropy": 0.2968090422451496, "epoch": 0.5280335703461942, "grad_norm": 0.55859375, "learning_rate": 1.999279112451382e-05, "loss": 0.5365, "mean_token_accuracy": 0.8851703405380249, "num_tokens": 9328933.0, "step": 2265 }, { "entropy": 0.25726330243051054, "epoch": 0.529199207366826, "grad_norm": 0.345703125, "learning_rate": 1.9992758312651673e-05, "loss": 0.5031, "mean_token_accuracy": 0.9195921361446381, "num_tokens": 9353825.0, "step": 2270 }, { "entropy": 0.2275122195482254, "epoch": 0.5303648443874578, "grad_norm": 0.4375, "learning_rate": 1.9992725426339995e-05, "loss": 0.2281, "mean_token_accuracy": 0.9220161437988281, "num_tokens": 9392817.0, "step": 2275 }, { "entropy": 0.33882615994662046, "epoch": 0.5315304814080896, "grad_norm": 1.46875, "learning_rate": 1.9992692465579266e-05, "loss": 0.5083, "mean_token_accuracy": 0.8918252646923065, "num_tokens": 9418700.0, "step": 2280 }, { "entropy": 0.2928564824163914, "epoch": 0.5326961184287213, "grad_norm": 1.2890625, "learning_rate": 1.9992659430369984e-05, "loss": 0.4381, "mean_token_accuracy": 0.9111608743667603, "num_tokens": 9439423.0, "step": 2285 }, { "entropy": 0.2320990853011608, "epoch": 0.5338617554493531, "grad_norm": 1.5, "learning_rate": 1.999262632071264e-05, "loss": 0.4826, "mean_token_accuracy": 0.9189637899398804, "num_tokens": 9461479.0, "step": 2290 }, { "entropy": 0.26314267963171006, "epoch": 0.5350273924699849, "grad_norm": 1.0, "learning_rate": 1.999259313660773e-05, "loss": 0.4041, "mean_token_accuracy": 0.9190383732318879, "num_tokens": 9491437.0, "step": 2295 }, { "entropy": 0.24241818580776453, "epoch": 0.5361930294906166, "grad_norm": 1.8125, "learning_rate": 1.9992559878055743e-05, "loss": 0.382, "mean_token_accuracy": 0.9189897537231445, "num_tokens": 9514680.0, "step": 2300 }, { "entropy": 0.2492776945233345, "epoch": 0.5373586665112484, "grad_norm": 3.28125, "learning_rate": 1.9992526545057184e-05, "loss": 0.6233, "mean_token_accuracy": 0.8866626501083374, "num_tokens": 9535986.0, "step": 2305 }, { "entropy": 0.26340335980057716, "epoch": 0.5385243035318802, "grad_norm": 1.0859375, "learning_rate": 1.9992493137612543e-05, "loss": 0.6176, "mean_token_accuracy": 0.8974877238273621, "num_tokens": 9548719.0, "step": 2310 }, { "entropy": 0.44216432273387907, "epoch": 0.539689940552512, "grad_norm": 0.27734375, "learning_rate": 1.999245965572232e-05, "loss": 0.7069, "mean_token_accuracy": 0.8899427652359009, "num_tokens": 9585119.0, "step": 2315 }, { "entropy": 0.28867107182741164, "epoch": 0.5408555775731437, "grad_norm": 5.625, "learning_rate": 1.9992426099387014e-05, "loss": 0.5598, "mean_token_accuracy": 0.8989390969276428, "num_tokens": 9597896.0, "step": 2320 }, { "entropy": 0.27894147783517836, "epoch": 0.5420212145937755, "grad_norm": 0.400390625, "learning_rate": 1.9992392468607127e-05, "loss": 0.5465, "mean_token_accuracy": 0.8931815326213837, "num_tokens": 9619759.0, "step": 2325 }, { "entropy": 0.2875699769705534, "epoch": 0.5431868516144073, "grad_norm": 1.3671875, "learning_rate": 1.999235876338316e-05, "loss": 0.4764, "mean_token_accuracy": 0.8938539862632752, "num_tokens": 9639899.0, "step": 2330 }, { "entropy": 0.33880536295473573, "epoch": 0.544352488635039, "grad_norm": 0.27734375, "learning_rate": 1.9992324983715612e-05, "loss": 0.5468, "mean_token_accuracy": 0.8840242743492126, "num_tokens": 9683774.0, "step": 2335 }, { "entropy": 0.19824768621474503, "epoch": 0.5455181256556708, "grad_norm": 0.4765625, "learning_rate": 1.999229112960499e-05, "loss": 0.3, "mean_token_accuracy": 0.9403502106666565, "num_tokens": 9715282.0, "step": 2340 }, { "entropy": 0.4543208494782448, "epoch": 0.5466837626763026, "grad_norm": 0.734375, "learning_rate": 1.9992257201051802e-05, "loss": 0.6336, "mean_token_accuracy": 0.8709604918956757, "num_tokens": 9729682.0, "step": 2345 }, { "entropy": 0.3436585277318954, "epoch": 0.5478493996969344, "grad_norm": 2.234375, "learning_rate": 1.9992223198056545e-05, "loss": 0.6982, "mean_token_accuracy": 0.8596957445144653, "num_tokens": 9755689.0, "step": 2350 }, { "entropy": 0.36073665916919706, "epoch": 0.5490150367175661, "grad_norm": 2.984375, "learning_rate": 1.9992189120619736e-05, "loss": 0.9042, "mean_token_accuracy": 0.8605527520179749, "num_tokens": 9763825.0, "step": 2355 }, { "entropy": 0.3805040195584297, "epoch": 0.5501806737381979, "grad_norm": 2.390625, "learning_rate": 1.9992154968741877e-05, "loss": 0.7636, "mean_token_accuracy": 0.8586589694023132, "num_tokens": 9776348.0, "step": 2360 }, { "entropy": 0.20938777849078177, "epoch": 0.5513463107588297, "grad_norm": 1.609375, "learning_rate": 1.9992120742423476e-05, "loss": 0.4021, "mean_token_accuracy": 0.9294235825538635, "num_tokens": 9794244.0, "step": 2365 }, { "entropy": 0.3471638299524784, "epoch": 0.5525119477794614, "grad_norm": 2.65625, "learning_rate": 1.9992086441665052e-05, "loss": 0.5431, "mean_token_accuracy": 0.8725887358188629, "num_tokens": 9808138.0, "step": 2370 }, { "entropy": 0.33833655789494516, "epoch": 0.5536775848000932, "grad_norm": 2.734375, "learning_rate": 1.9992052066467106e-05, "loss": 0.6093, "mean_token_accuracy": 0.8938002645969391, "num_tokens": 9824435.0, "step": 2375 }, { "entropy": 0.29400101453065874, "epoch": 0.554843221820725, "grad_norm": 4.125, "learning_rate": 1.9992017616830156e-05, "loss": 0.6356, "mean_token_accuracy": 0.8934900283813476, "num_tokens": 9838139.0, "step": 2380 }, { "entropy": 0.30893067717552186, "epoch": 0.5560088588413568, "grad_norm": 1.640625, "learning_rate": 1.9991983092754717e-05, "loss": 0.631, "mean_token_accuracy": 0.8898307919502259, "num_tokens": 9855668.0, "step": 2385 }, { "entropy": 0.28324234634637835, "epoch": 0.5571744958619885, "grad_norm": 1.3515625, "learning_rate": 1.99919484942413e-05, "loss": 0.4305, "mean_token_accuracy": 0.9174308836460113, "num_tokens": 9870495.0, "step": 2390 }, { "entropy": 0.29785450994968415, "epoch": 0.5583401328826203, "grad_norm": 3.5, "learning_rate": 1.9991913821290423e-05, "loss": 0.7319, "mean_token_accuracy": 0.8782787978649139, "num_tokens": 9881271.0, "step": 2395 }, { "entropy": 0.29459096789360045, "epoch": 0.5595057699032522, "grad_norm": 3.03125, "learning_rate": 1.99918790739026e-05, "loss": 0.4413, "mean_token_accuracy": 0.9159037947654725, "num_tokens": 9897443.0, "step": 2400 }, { "entropy": 0.2606068912893534, "epoch": 0.560671406923884, "grad_norm": 1.8046875, "learning_rate": 1.9991844252078355e-05, "loss": 0.5554, "mean_token_accuracy": 0.9029782056808472, "num_tokens": 9915846.0, "step": 2405 }, { "entropy": 0.3030439902096987, "epoch": 0.5618370439445157, "grad_norm": 1.9296875, "learning_rate": 1.9991809355818207e-05, "loss": 0.4412, "mean_token_accuracy": 0.89673712849617, "num_tokens": 9941287.0, "step": 2410 }, { "entropy": 0.2712526571005583, "epoch": 0.5630026809651475, "grad_norm": 1.7109375, "learning_rate": 1.9991774385122665e-05, "loss": 0.3325, "mean_token_accuracy": 0.9228278815746307, "num_tokens": 9960373.0, "step": 2415 }, { "entropy": 0.32818218022584916, "epoch": 0.5641683179857793, "grad_norm": 3.28125, "learning_rate": 1.9991739339992266e-05, "loss": 0.6853, "mean_token_accuracy": 0.869037389755249, "num_tokens": 9979671.0, "step": 2420 }, { "entropy": 0.3927590001374483, "epoch": 0.565333955006411, "grad_norm": 0.51953125, "learning_rate": 1.9991704220427522e-05, "loss": 0.5787, "mean_token_accuracy": 0.8790532112121582, "num_tokens": 10000556.0, "step": 2425 }, { "entropy": 0.37120668292045594, "epoch": 0.5664995920270428, "grad_norm": 1.9609375, "learning_rate": 1.9991669026428965e-05, "loss": 0.8423, "mean_token_accuracy": 0.879127699136734, "num_tokens": 10008605.0, "step": 2430 }, { "entropy": 0.3100695013999939, "epoch": 0.5676652290476746, "grad_norm": 2.125, "learning_rate": 1.999163375799711e-05, "loss": 0.347, "mean_token_accuracy": 0.8831665992736817, "num_tokens": 10033245.0, "step": 2435 }, { "entropy": 0.39593042582273485, "epoch": 0.5688308660683064, "grad_norm": 1.9296875, "learning_rate": 1.999159841513249e-05, "loss": 0.6707, "mean_token_accuracy": 0.8801019787788391, "num_tokens": 10044394.0, "step": 2440 }, { "entropy": 0.36338729187846186, "epoch": 0.5699965030889381, "grad_norm": 2.125, "learning_rate": 1.9991562997835626e-05, "loss": 0.5301, "mean_token_accuracy": 0.8973929762840271, "num_tokens": 10055701.0, "step": 2445 }, { "entropy": 0.3218478888273239, "epoch": 0.5711621401095699, "grad_norm": 2.65625, "learning_rate": 1.9991527506107052e-05, "loss": 0.6329, "mean_token_accuracy": 0.871570247411728, "num_tokens": 10077584.0, "step": 2450 }, { "entropy": 0.2377581749111414, "epoch": 0.5723277771302017, "grad_norm": 4.375, "learning_rate": 1.99914919399473e-05, "loss": 0.5299, "mean_token_accuracy": 0.9120466470718384, "num_tokens": 10097728.0, "step": 2455 }, { "entropy": 0.3393100008368492, "epoch": 0.5734934141508334, "grad_norm": 0.94921875, "learning_rate": 1.999145629935689e-05, "loss": 0.4899, "mean_token_accuracy": 0.9051253616809845, "num_tokens": 10109022.0, "step": 2460 }, { "entropy": 0.1963107619434595, "epoch": 0.5746590511714652, "grad_norm": 1.25, "learning_rate": 1.999142058433636e-05, "loss": 0.4243, "mean_token_accuracy": 0.9210612416267395, "num_tokens": 10132061.0, "step": 2465 }, { "entropy": 0.32366092354059217, "epoch": 0.575824688192097, "grad_norm": 3.96875, "learning_rate": 1.999138479488624e-05, "loss": 0.68, "mean_token_accuracy": 0.8752106666564942, "num_tokens": 10150067.0, "step": 2470 }, { "entropy": 0.2483457863330841, "epoch": 0.5769903252127287, "grad_norm": 2.234375, "learning_rate": 1.999134893100707e-05, "loss": 0.6091, "mean_token_accuracy": 0.9089722335338593, "num_tokens": 10161991.0, "step": 2475 }, { "entropy": 0.32887863293290137, "epoch": 0.5781559622333605, "grad_norm": 2.796875, "learning_rate": 1.9991312992699377e-05, "loss": 0.4883, "mean_token_accuracy": 0.8847136437892914, "num_tokens": 10176161.0, "step": 2480 }, { "entropy": 0.3010684911161661, "epoch": 0.5793215992539923, "grad_norm": 1.1328125, "learning_rate": 1.99912769799637e-05, "loss": 0.4217, "mean_token_accuracy": 0.893145751953125, "num_tokens": 10206309.0, "step": 2485 }, { "entropy": 0.27251421064138415, "epoch": 0.5804872362746241, "grad_norm": 0.458984375, "learning_rate": 1.9991240892800576e-05, "loss": 0.4683, "mean_token_accuracy": 0.9054334104061127, "num_tokens": 10231252.0, "step": 2490 }, { "entropy": 0.3114484429359436, "epoch": 0.5816528732952558, "grad_norm": 1.6796875, "learning_rate": 1.9991204731210543e-05, "loss": 0.5862, "mean_token_accuracy": 0.8936571300029754, "num_tokens": 10249983.0, "step": 2495 }, { "entropy": 0.24855418205261232, "epoch": 0.5828185103158876, "grad_norm": 2.40625, "learning_rate": 1.9991168495194138e-05, "loss": 0.5488, "mean_token_accuracy": 0.9062022984027862, "num_tokens": 10261309.0, "step": 2500 }, { "entropy": 0.22968003787100316, "epoch": 0.5839841473365194, "grad_norm": 1.453125, "learning_rate": 1.999113218475191e-05, "loss": 0.4208, "mean_token_accuracy": 0.9215640425682068, "num_tokens": 10279292.0, "step": 2505 }, { "entropy": 0.3279329985380173, "epoch": 0.5851497843571511, "grad_norm": 3.375, "learning_rate": 1.9991095799884392e-05, "loss": 0.5601, "mean_token_accuracy": 0.8970583975315094, "num_tokens": 10297929.0, "step": 2510 }, { "entropy": 0.2907501269131899, "epoch": 0.5863154213777829, "grad_norm": 2.171875, "learning_rate": 1.9991059340592125e-05, "loss": 0.3888, "mean_token_accuracy": 0.9076000213623047, "num_tokens": 10329976.0, "step": 2515 }, { "entropy": 0.3549231544137001, "epoch": 0.5874810583984147, "grad_norm": 2.96875, "learning_rate": 1.9991022806875656e-05, "loss": 0.6519, "mean_token_accuracy": 0.8874382436275482, "num_tokens": 10347544.0, "step": 2520 }, { "entropy": 0.3080939695239067, "epoch": 0.5886466954190465, "grad_norm": 3.6875, "learning_rate": 1.9990986198735533e-05, "loss": 0.5544, "mean_token_accuracy": 0.8929486870765686, "num_tokens": 10366002.0, "step": 2525 }, { "entropy": 0.3876295104622841, "epoch": 0.5898123324396782, "grad_norm": 1.9921875, "learning_rate": 1.99909495161723e-05, "loss": 0.7927, "mean_token_accuracy": 0.8583209037780761, "num_tokens": 10378268.0, "step": 2530 }, { "entropy": 0.41035100668668745, "epoch": 0.59097796946031, "grad_norm": 2.1875, "learning_rate": 1.9990912759186498e-05, "loss": 0.8022, "mean_token_accuracy": 0.8597595751285553, "num_tokens": 10390596.0, "step": 2535 }, { "entropy": 0.24308755919337272, "epoch": 0.5921436064809419, "grad_norm": 1.703125, "learning_rate": 1.9990875927778684e-05, "loss": 0.3967, "mean_token_accuracy": 0.9243945956230164, "num_tokens": 10413960.0, "step": 2540 }, { "entropy": 0.21663215793669224, "epoch": 0.5933092435015737, "grad_norm": 0.1796875, "learning_rate": 1.99908390219494e-05, "loss": 0.2111, "mean_token_accuracy": 0.9359542548656463, "num_tokens": 10437663.0, "step": 2545 }, { "entropy": 0.408858510479331, "epoch": 0.5944748805222054, "grad_norm": 0.67578125, "learning_rate": 1.9990802041699206e-05, "loss": 0.6026, "mean_token_accuracy": 0.8689159452915192, "num_tokens": 10455699.0, "step": 2550 }, { "entropy": 0.29723052904009817, "epoch": 0.5956405175428372, "grad_norm": 3.390625, "learning_rate": 1.9990764987028642e-05, "loss": 0.5624, "mean_token_accuracy": 0.8806465566158295, "num_tokens": 10474849.0, "step": 2555 }, { "entropy": 0.312545171380043, "epoch": 0.596806154563469, "grad_norm": 6.09375, "learning_rate": 1.9990727857938265e-05, "loss": 0.6364, "mean_token_accuracy": 0.8904350399971008, "num_tokens": 10484376.0, "step": 2560 }, { "entropy": 0.2090074449777603, "epoch": 0.5979717915841007, "grad_norm": 2.15625, "learning_rate": 1.9990690654428627e-05, "loss": 0.3618, "mean_token_accuracy": 0.9283179759979248, "num_tokens": 10510343.0, "step": 2565 }, { "entropy": 0.24909560680389403, "epoch": 0.5991374286047325, "grad_norm": 1.2578125, "learning_rate": 1.999065337650029e-05, "loss": 0.5294, "mean_token_accuracy": 0.9161015510559082, "num_tokens": 10522462.0, "step": 2570 }, { "entropy": 0.436211097240448, "epoch": 0.6003030656253643, "grad_norm": 0.357421875, "learning_rate": 1.9990616024153804e-05, "loss": 0.7157, "mean_token_accuracy": 0.8679668664932251, "num_tokens": 10555004.0, "step": 2575 }, { "entropy": 0.3197683900594711, "epoch": 0.6014687026459961, "grad_norm": 1.796875, "learning_rate": 1.9990578597389726e-05, "loss": 0.6512, "mean_token_accuracy": 0.8892592430114746, "num_tokens": 10564631.0, "step": 2580 }, { "entropy": 0.3422405393794179, "epoch": 0.6026343396666278, "grad_norm": 1.5234375, "learning_rate": 1.9990541096208614e-05, "loss": 0.5619, "mean_token_accuracy": 0.8856872081756592, "num_tokens": 10601595.0, "step": 2585 }, { "entropy": 0.3777252405881882, "epoch": 0.6037999766872596, "grad_norm": 0.5078125, "learning_rate": 1.999050352061103e-05, "loss": 0.4936, "mean_token_accuracy": 0.8734250783920288, "num_tokens": 10621546.0, "step": 2590 }, { "entropy": 0.32311970740556717, "epoch": 0.6049656137078914, "grad_norm": 2.9375, "learning_rate": 1.9990465870597528e-05, "loss": 0.7566, "mean_token_accuracy": 0.8670526087284088, "num_tokens": 10632893.0, "step": 2595 }, { "entropy": 0.3077698152512312, "epoch": 0.6061312507285231, "grad_norm": 2.328125, "learning_rate": 1.999042814616868e-05, "loss": 0.4922, "mean_token_accuracy": 0.913305151462555, "num_tokens": 10660453.0, "step": 2600 }, { "entropy": 0.40748190470039847, "epoch": 0.6072968877491549, "grad_norm": 1.953125, "learning_rate": 1.9990390347325037e-05, "loss": 0.5576, "mean_token_accuracy": 0.8577117413282395, "num_tokens": 10701815.0, "step": 2605 }, { "entropy": 0.27749653831124305, "epoch": 0.6084625247697867, "grad_norm": 0.71875, "learning_rate": 1.9990352474067173e-05, "loss": 0.4882, "mean_token_accuracy": 0.9072460174560547, "num_tokens": 10720236.0, "step": 2610 }, { "entropy": 0.3385494023561478, "epoch": 0.6096281617904185, "grad_norm": 2.578125, "learning_rate": 1.999031452639564e-05, "loss": 0.6127, "mean_token_accuracy": 0.8825340390205383, "num_tokens": 10741518.0, "step": 2615 }, { "entropy": 0.2643695339560509, "epoch": 0.6107937988110502, "grad_norm": 3.3125, "learning_rate": 1.9990276504311018e-05, "loss": 0.4195, "mean_token_accuracy": 0.9126249015331268, "num_tokens": 10762998.0, "step": 2620 }, { "entropy": 0.3385010756552219, "epoch": 0.611959435831682, "grad_norm": 3.71875, "learning_rate": 1.9990238407813866e-05, "loss": 0.6874, "mean_token_accuracy": 0.8858413577079773, "num_tokens": 10778292.0, "step": 2625 }, { "entropy": 0.25794244520366194, "epoch": 0.6131250728523138, "grad_norm": 0.578125, "learning_rate": 1.999020023690475e-05, "loss": 0.3079, "mean_token_accuracy": 0.9154360055923462, "num_tokens": 10825350.0, "step": 2630 }, { "entropy": 0.2832651875913143, "epoch": 0.6142907098729455, "grad_norm": 0.28125, "learning_rate": 1.9990161991584253e-05, "loss": 0.3404, "mean_token_accuracy": 0.8956524491310119, "num_tokens": 10849280.0, "step": 2635 }, { "entropy": 0.2753490924835205, "epoch": 0.6154563468935773, "grad_norm": 3.078125, "learning_rate": 1.9990123671852927e-05, "loss": 0.6067, "mean_token_accuracy": 0.9064594686031342, "num_tokens": 10863035.0, "step": 2640 }, { "entropy": 0.31272173710167406, "epoch": 0.6166219839142091, "grad_norm": 0.294921875, "learning_rate": 1.9990085277711352e-05, "loss": 0.47, "mean_token_accuracy": 0.8716696619987487, "num_tokens": 10888521.0, "step": 2645 }, { "entropy": 0.2767592526972294, "epoch": 0.6177876209348409, "grad_norm": 0.7734375, "learning_rate": 1.99900468091601e-05, "loss": 0.4332, "mean_token_accuracy": 0.8956589341163635, "num_tokens": 10918722.0, "step": 2650 }, { "entropy": 0.1582114040851593, "epoch": 0.6189532579554726, "grad_norm": 1.7734375, "learning_rate": 1.9990008266199747e-05, "loss": 0.2572, "mean_token_accuracy": 0.9478217661380768, "num_tokens": 10950005.0, "step": 2655 }, { "entropy": 0.3565956801176071, "epoch": 0.6201188949761044, "grad_norm": 1.8515625, "learning_rate": 1.998996964883086e-05, "loss": 0.5539, "mean_token_accuracy": 0.8841744959354401, "num_tokens": 10971433.0, "step": 2660 }, { "entropy": 0.2978044833987951, "epoch": 0.6212845319967362, "grad_norm": 3.59375, "learning_rate": 1.9989930957054028e-05, "loss": 0.5773, "mean_token_accuracy": 0.9038799822330474, "num_tokens": 10987337.0, "step": 2665 }, { "entropy": 0.3411064647138119, "epoch": 0.6224501690173679, "grad_norm": 0.154296875, "learning_rate": 1.9989892190869816e-05, "loss": 0.6672, "mean_token_accuracy": 0.8761695384979248, "num_tokens": 11004420.0, "step": 2670 }, { "entropy": 0.31689151339232924, "epoch": 0.6236158060379998, "grad_norm": 6.0, "learning_rate": 1.9989853350278804e-05, "loss": 0.7847, "mean_token_accuracy": 0.8588998973369598, "num_tokens": 11022152.0, "step": 2675 }, { "entropy": 0.3129567734897137, "epoch": 0.6247814430586316, "grad_norm": 0.625, "learning_rate": 1.9989814435281576e-05, "loss": 0.5403, "mean_token_accuracy": 0.890529477596283, "num_tokens": 11060882.0, "step": 2680 }, { "entropy": 0.26123612076044084, "epoch": 0.6259470800792634, "grad_norm": 1.234375, "learning_rate": 1.998977544587871e-05, "loss": 0.3597, "mean_token_accuracy": 0.9167574405670166, "num_tokens": 11078695.0, "step": 2685 }, { "entropy": 0.3337280437350273, "epoch": 0.6271127170998951, "grad_norm": 0.59375, "learning_rate": 1.9989736382070787e-05, "loss": 0.5563, "mean_token_accuracy": 0.8964059770107269, "num_tokens": 11104248.0, "step": 2690 }, { "entropy": 0.20731370337307453, "epoch": 0.6282783541205269, "grad_norm": 1.8984375, "learning_rate": 1.9989697243858388e-05, "loss": 0.3084, "mean_token_accuracy": 0.9278476953506469, "num_tokens": 11130600.0, "step": 2695 }, { "entropy": 0.3032124895602465, "epoch": 0.6294439911411587, "grad_norm": 3.0625, "learning_rate": 1.99896580312421e-05, "loss": 0.6284, "mean_token_accuracy": 0.8884129106998444, "num_tokens": 11148703.0, "step": 2700 }, { "entropy": 0.22488728277385234, "epoch": 0.6306096281617904, "grad_norm": 1.8046875, "learning_rate": 1.9989618744222506e-05, "loss": 0.4241, "mean_token_accuracy": 0.9202291667461395, "num_tokens": 11169931.0, "step": 2705 }, { "entropy": 0.25353550985455514, "epoch": 0.6317752651824222, "grad_norm": 0.57421875, "learning_rate": 1.998957938280019e-05, "loss": 0.3501, "mean_token_accuracy": 0.9135826289653778, "num_tokens": 11207948.0, "step": 2710 }, { "entropy": 0.27142262272536755, "epoch": 0.632940902203054, "grad_norm": 0.6796875, "learning_rate": 1.998953994697574e-05, "loss": 0.5832, "mean_token_accuracy": 0.8925474107265472, "num_tokens": 11225668.0, "step": 2715 }, { "entropy": 0.2797514094039798, "epoch": 0.6341065392236858, "grad_norm": 1.2578125, "learning_rate": 1.9989500436749746e-05, "loss": 0.4563, "mean_token_accuracy": 0.9044257879257203, "num_tokens": 11253386.0, "step": 2720 }, { "entropy": 0.3232995979487896, "epoch": 0.6352721762443175, "grad_norm": 2.625, "learning_rate": 1.9989460852122798e-05, "loss": 0.6513, "mean_token_accuracy": 0.8970522165298462, "num_tokens": 11264773.0, "step": 2725 }, { "entropy": 0.32866962999105453, "epoch": 0.6364378132649493, "grad_norm": 1.0625, "learning_rate": 1.998942119309548e-05, "loss": 0.5501, "mean_token_accuracy": 0.8841784775257111, "num_tokens": 11293142.0, "step": 2730 }, { "entropy": 0.3042488098144531, "epoch": 0.6376034502855811, "grad_norm": 1.7890625, "learning_rate": 1.9989381459668392e-05, "loss": 0.6684, "mean_token_accuracy": 0.8961536705493927, "num_tokens": 11303088.0, "step": 2735 }, { "entropy": 0.3178449098020792, "epoch": 0.6387690873062128, "grad_norm": 0.8203125, "learning_rate": 1.9989341651842117e-05, "loss": 0.3019, "mean_token_accuracy": 0.9054975390434266, "num_tokens": 11329525.0, "step": 2740 }, { "entropy": 0.27289107590913775, "epoch": 0.6399347243268446, "grad_norm": 3.828125, "learning_rate": 1.9989301769617258e-05, "loss": 0.5949, "mean_token_accuracy": 0.899017083644867, "num_tokens": 11341596.0, "step": 2745 }, { "entropy": 0.3231787905097008, "epoch": 0.6411003613474764, "grad_norm": 1.7578125, "learning_rate": 1.99892618129944e-05, "loss": 0.7994, "mean_token_accuracy": 0.8684586107730865, "num_tokens": 11350540.0, "step": 2750 }, { "entropy": 0.29003828167915346, "epoch": 0.6422659983681082, "grad_norm": 0.486328125, "learning_rate": 1.998922178197415e-05, "loss": 0.3397, "mean_token_accuracy": 0.9051499962806702, "num_tokens": 11389605.0, "step": 2755 }, { "entropy": 0.29986562281847, "epoch": 0.6434316353887399, "grad_norm": 0.302734375, "learning_rate": 1.9989181676557097e-05, "loss": 0.6057, "mean_token_accuracy": 0.8865071713924408, "num_tokens": 11411773.0, "step": 2760 }, { "entropy": 0.2845982387661934, "epoch": 0.6445972724093717, "grad_norm": 0.44921875, "learning_rate": 1.998914149674384e-05, "loss": 0.4129, "mean_token_accuracy": 0.9082658767700196, "num_tokens": 11432083.0, "step": 2765 }, { "entropy": 0.4338554725050926, "epoch": 0.6457629094300035, "grad_norm": 1.5546875, "learning_rate": 1.998910124253498e-05, "loss": 0.7722, "mean_token_accuracy": 0.8590993523597718, "num_tokens": 11447520.0, "step": 2770 }, { "entropy": 0.3870300319045782, "epoch": 0.6469285464506352, "grad_norm": 2.765625, "learning_rate": 1.9989060913931117e-05, "loss": 0.6756, "mean_token_accuracy": 0.8583252966403961, "num_tokens": 11471002.0, "step": 2775 }, { "entropy": 0.23096679151058197, "epoch": 0.648094183471267, "grad_norm": 2.984375, "learning_rate": 1.998902051093285e-05, "loss": 0.4019, "mean_token_accuracy": 0.9051249861717224, "num_tokens": 11496091.0, "step": 2780 }, { "entropy": 0.401183944940567, "epoch": 0.6492598204918988, "grad_norm": 1.8828125, "learning_rate": 1.9988980033540787e-05, "loss": 0.6471, "mean_token_accuracy": 0.856478476524353, "num_tokens": 11528974.0, "step": 2785 }, { "entropy": 0.2962135147303343, "epoch": 0.6504254575125306, "grad_norm": 6.5, "learning_rate": 1.9988939481755523e-05, "loss": 0.6008, "mean_token_accuracy": 0.8978751420974731, "num_tokens": 11552542.0, "step": 2790 }, { "entropy": 0.42872381433844564, "epoch": 0.6515910945331623, "grad_norm": 2.6875, "learning_rate": 1.998889885557767e-05, "loss": 0.6457, "mean_token_accuracy": 0.8847284972667694, "num_tokens": 11563152.0, "step": 2795 }, { "entropy": 0.307503542304039, "epoch": 0.6527567315537941, "grad_norm": 0.7734375, "learning_rate": 1.9988858155007832e-05, "loss": 0.4287, "mean_token_accuracy": 0.9065074861049652, "num_tokens": 11587159.0, "step": 2800 }, { "entropy": 0.3470982015132904, "epoch": 0.6539223685744259, "grad_norm": 3.515625, "learning_rate": 1.9988817380046615e-05, "loss": 0.7755, "mean_token_accuracy": 0.8728764653205872, "num_tokens": 11609631.0, "step": 2805 }, { "entropy": 0.2724352993071079, "epoch": 0.6550880055950578, "grad_norm": 2.609375, "learning_rate": 1.9988776530694624e-05, "loss": 0.3999, "mean_token_accuracy": 0.925034087896347, "num_tokens": 11626566.0, "step": 2810 }, { "entropy": 0.19500094205141066, "epoch": 0.6562536426156895, "grad_norm": 0.46484375, "learning_rate": 1.9988735606952473e-05, "loss": 0.229, "mean_token_accuracy": 0.9306256473064423, "num_tokens": 11652736.0, "step": 2815 }, { "entropy": 0.29740233570337293, "epoch": 0.6574192796363213, "grad_norm": 1.375, "learning_rate": 1.9988694608820775e-05, "loss": 0.567, "mean_token_accuracy": 0.891633152961731, "num_tokens": 11669815.0, "step": 2820 }, { "entropy": 0.21288203895092012, "epoch": 0.6585849166569531, "grad_norm": 0.734375, "learning_rate": 1.9988653536300132e-05, "loss": 0.351, "mean_token_accuracy": 0.9191200852394104, "num_tokens": 11693951.0, "step": 2825 }, { "entropy": 0.3570013351738453, "epoch": 0.6597505536775848, "grad_norm": 12.0, "learning_rate": 1.9988612389391163e-05, "loss": 0.7257, "mean_token_accuracy": 0.9116259157657624, "num_tokens": 11735005.0, "step": 2830 }, { "entropy": 0.3212395556271076, "epoch": 0.6609161906982166, "grad_norm": 2.015625, "learning_rate": 1.998857116809448e-05, "loss": 0.7315, "mean_token_accuracy": 0.8878246247768402, "num_tokens": 11747768.0, "step": 2835 }, { "entropy": 0.3063099093735218, "epoch": 0.6620818277188484, "grad_norm": 0.9765625, "learning_rate": 1.9988529872410698e-05, "loss": 0.5298, "mean_token_accuracy": 0.8972079992294312, "num_tokens": 11765542.0, "step": 2840 }, { "entropy": 0.23337812647223471, "epoch": 0.6632474647394802, "grad_norm": 0.5703125, "learning_rate": 1.9988488502340432e-05, "loss": 0.2289, "mean_token_accuracy": 0.932334941625595, "num_tokens": 11800468.0, "step": 2845 }, { "entropy": 0.43383695781230924, "epoch": 0.6644131017601119, "grad_norm": 2.0625, "learning_rate": 1.99884470578843e-05, "loss": 0.6844, "mean_token_accuracy": 0.8706629991531372, "num_tokens": 11818230.0, "step": 2850 }, { "entropy": 0.31050637289881705, "epoch": 0.6655787387807437, "grad_norm": 0.486328125, "learning_rate": 1.9988405539042918e-05, "loss": 0.4206, "mean_token_accuracy": 0.9051454186439514, "num_tokens": 11837491.0, "step": 2855 }, { "entropy": 0.3793765440583229, "epoch": 0.6667443758013755, "grad_norm": 5.21875, "learning_rate": 1.9988363945816906e-05, "loss": 0.7112, "mean_token_accuracy": 0.8710412621498108, "num_tokens": 11865376.0, "step": 2860 }, { "entropy": 0.29791102930903435, "epoch": 0.6679100128220072, "grad_norm": 5.3125, "learning_rate": 1.9988322278206887e-05, "loss": 0.5251, "mean_token_accuracy": 0.9106599807739257, "num_tokens": 11880384.0, "step": 2865 }, { "entropy": 0.25474109277129175, "epoch": 0.669075649842639, "grad_norm": 3.6875, "learning_rate": 1.9988280536213477e-05, "loss": 0.5121, "mean_token_accuracy": 0.906974321603775, "num_tokens": 11896588.0, "step": 2870 }, { "entropy": 0.22310059182345868, "epoch": 0.6702412868632708, "grad_norm": 0.4140625, "learning_rate": 1.9988238719837306e-05, "loss": 0.3435, "mean_token_accuracy": 0.922565633058548, "num_tokens": 11932465.0, "step": 2875 }, { "entropy": 0.29564819037914275, "epoch": 0.6714069238839026, "grad_norm": 2.5625, "learning_rate": 1.9988196829078988e-05, "loss": 0.6536, "mean_token_accuracy": 0.894516795873642, "num_tokens": 11951000.0, "step": 2880 }, { "entropy": 0.25558542367070913, "epoch": 0.6725725609045343, "grad_norm": 0.984375, "learning_rate": 1.9988154863939156e-05, "loss": 0.3796, "mean_token_accuracy": 0.9356054246425629, "num_tokens": 11973901.0, "step": 2885 }, { "entropy": 0.26891485378146174, "epoch": 0.6737381979251661, "grad_norm": 2.25, "learning_rate": 1.998811282441843e-05, "loss": 0.4838, "mean_token_accuracy": 0.9037762641906738, "num_tokens": 11997570.0, "step": 2890 }, { "entropy": 0.23099879249930383, "epoch": 0.6749038349457979, "grad_norm": 2.484375, "learning_rate": 1.998807071051744e-05, "loss": 0.3147, "mean_token_accuracy": 0.9223685622215271, "num_tokens": 12023241.0, "step": 2895 }, { "entropy": 0.3084828436374664, "epoch": 0.6760694719664296, "grad_norm": 1.65625, "learning_rate": 1.9988028522236814e-05, "loss": 0.5966, "mean_token_accuracy": 0.8926734149456024, "num_tokens": 12042716.0, "step": 2900 }, { "entropy": 0.22942945584654809, "epoch": 0.6772351089870614, "grad_norm": 1.7734375, "learning_rate": 1.9987986259577178e-05, "loss": 0.3961, "mean_token_accuracy": 0.934667432308197, "num_tokens": 12066142.0, "step": 2905 }, { "entropy": 0.23905463069677352, "epoch": 0.6784007460076932, "grad_norm": 0.55078125, "learning_rate": 1.9987943922539168e-05, "loss": 0.35, "mean_token_accuracy": 0.9203935861587524, "num_tokens": 12091703.0, "step": 2910 }, { "entropy": 0.3106345657259226, "epoch": 0.679566383028325, "grad_norm": 1.2890625, "learning_rate": 1.9987901511123412e-05, "loss": 0.7652, "mean_token_accuracy": 0.8731933474540711, "num_tokens": 12108139.0, "step": 2915 }, { "entropy": 0.3008536197245121, "epoch": 0.6807320200489567, "grad_norm": 2.359375, "learning_rate": 1.9987859025330537e-05, "loss": 0.4116, "mean_token_accuracy": 0.9075113773345947, "num_tokens": 12139750.0, "step": 2920 }, { "entropy": 0.2751265406608582, "epoch": 0.6818976570695885, "grad_norm": 2.46875, "learning_rate": 1.9987816465161186e-05, "loss": 0.429, "mean_token_accuracy": 0.9219581127166748, "num_tokens": 12154831.0, "step": 2925 }, { "entropy": 0.2258956879377365, "epoch": 0.6830632940902203, "grad_norm": 0.9609375, "learning_rate": 1.998777383061599e-05, "loss": 0.4781, "mean_token_accuracy": 0.9113677144050598, "num_tokens": 12176507.0, "step": 2930 }, { "entropy": 0.3542913876473904, "epoch": 0.684228931110852, "grad_norm": 0.322265625, "learning_rate": 1.998773112169558e-05, "loss": 0.4442, "mean_token_accuracy": 0.8889063417911529, "num_tokens": 12204987.0, "step": 2935 }, { "entropy": 0.26746199689805505, "epoch": 0.6853945681314838, "grad_norm": 2.546875, "learning_rate": 1.99876883384006e-05, "loss": 0.7624, "mean_token_accuracy": 0.8774264693260193, "num_tokens": 12219026.0, "step": 2940 }, { "entropy": 0.2980736643075943, "epoch": 0.6865602051521156, "grad_norm": 1.359375, "learning_rate": 1.9987645480731687e-05, "loss": 0.5191, "mean_token_accuracy": 0.9098766207695007, "num_tokens": 12229932.0, "step": 2945 }, { "entropy": 0.27693705186247825, "epoch": 0.6877258421727475, "grad_norm": 2.4375, "learning_rate": 1.9987602548689475e-05, "loss": 0.5249, "mean_token_accuracy": 0.9166098952293396, "num_tokens": 12252411.0, "step": 2950 }, { "entropy": 0.28829210847616193, "epoch": 0.6888914791933792, "grad_norm": 2.375, "learning_rate": 1.998755954227461e-05, "loss": 0.6741, "mean_token_accuracy": 0.890589964389801, "num_tokens": 12262482.0, "step": 2955 }, { "entropy": 0.26005504429340365, "epoch": 0.690057116214011, "grad_norm": 0.5, "learning_rate": 1.9987516461487726e-05, "loss": 0.3838, "mean_token_accuracy": 0.8957611501216889, "num_tokens": 12285838.0, "step": 2960 }, { "entropy": 0.28918950296938417, "epoch": 0.6912227532346428, "grad_norm": 1.15625, "learning_rate": 1.9987473306329473e-05, "loss": 0.3924, "mean_token_accuracy": 0.9049074351787567, "num_tokens": 12304686.0, "step": 2965 }, { "entropy": 0.3453959345817566, "epoch": 0.6923883902552745, "grad_norm": 4.8125, "learning_rate": 1.998743007680049e-05, "loss": 0.9069, "mean_token_accuracy": 0.8660809576511384, "num_tokens": 12313370.0, "step": 2970 }, { "entropy": 0.3491327941417694, "epoch": 0.6935540272759063, "grad_norm": 1.3359375, "learning_rate": 1.9987386772901426e-05, "loss": 0.7424, "mean_token_accuracy": 0.8711540699005127, "num_tokens": 12326798.0, "step": 2975 }, { "entropy": 0.46053214073181153, "epoch": 0.6947196642965381, "grad_norm": 2.359375, "learning_rate": 1.998734339463292e-05, "loss": 0.9168, "mean_token_accuracy": 0.8372248828411102, "num_tokens": 12336458.0, "step": 2980 }, { "entropy": 0.359159516915679, "epoch": 0.6958853013171699, "grad_norm": 2.21875, "learning_rate": 1.9987299941995625e-05, "loss": 0.5595, "mean_token_accuracy": 0.8771745800971985, "num_tokens": 12354223.0, "step": 2985 }, { "entropy": 0.3393791884183884, "epoch": 0.6970509383378016, "grad_norm": 3.078125, "learning_rate": 1.9987256414990183e-05, "loss": 0.69, "mean_token_accuracy": 0.8888447761535645, "num_tokens": 12374416.0, "step": 2990 }, { "entropy": 0.2696688212454319, "epoch": 0.6982165753584334, "grad_norm": 0.91015625, "learning_rate": 1.998721281361725e-05, "loss": 0.3425, "mean_token_accuracy": 0.9043423593044281, "num_tokens": 12399187.0, "step": 2995 }, { "entropy": 0.2706952027976513, "epoch": 0.6993822123790652, "grad_norm": 0.79296875, "learning_rate": 1.9987169137877474e-05, "loss": 0.467, "mean_token_accuracy": 0.9108605206012725, "num_tokens": 12416919.0, "step": 3000 }, { "entropy": 0.32974872663617133, "epoch": 0.7005478493996969, "grad_norm": 0.5625, "learning_rate": 1.9987125387771502e-05, "loss": 0.4047, "mean_token_accuracy": 0.9061824440956116, "num_tokens": 12441006.0, "step": 3005 }, { "entropy": 0.2989892097190022, "epoch": 0.7017134864203287, "grad_norm": 0.498046875, "learning_rate": 1.9987081563299992e-05, "loss": 0.3412, "mean_token_accuracy": 0.8854645431041718, "num_tokens": 12476290.0, "step": 3010 }, { "entropy": 0.3127955436706543, "epoch": 0.7028791234409605, "grad_norm": 1.09375, "learning_rate": 1.9987037664463593e-05, "loss": 0.706, "mean_token_accuracy": 0.8708417236804962, "num_tokens": 12495372.0, "step": 3015 }, { "entropy": 0.22802112326025964, "epoch": 0.7040447604615923, "grad_norm": 0.44921875, "learning_rate": 1.9986993691262963e-05, "loss": 0.2649, "mean_token_accuracy": 0.9163160860538483, "num_tokens": 12521478.0, "step": 3020 }, { "entropy": 0.29017159678041937, "epoch": 0.705210397482224, "grad_norm": 0.185546875, "learning_rate": 1.9986949643698752e-05, "loss": 0.4145, "mean_token_accuracy": 0.9037138223648071, "num_tokens": 12542998.0, "step": 3025 }, { "entropy": 0.2329173669219017, "epoch": 0.7063760345028558, "grad_norm": 1.125, "learning_rate": 1.9986905521771625e-05, "loss": 0.3699, "mean_token_accuracy": 0.9263135194778442, "num_tokens": 12562045.0, "step": 3030 }, { "entropy": 0.3430146735161543, "epoch": 0.7075416715234876, "grad_norm": 0.42578125, "learning_rate": 1.9986861325482236e-05, "loss": 0.4286, "mean_token_accuracy": 0.8931098103523254, "num_tokens": 12598642.0, "step": 3035 }, { "entropy": 0.296509512513876, "epoch": 0.7087073085441193, "grad_norm": 1.1640625, "learning_rate": 1.998681705483124e-05, "loss": 0.6135, "mean_token_accuracy": 0.8931901514530182, "num_tokens": 12610992.0, "step": 3040 }, { "entropy": 0.36168722808361053, "epoch": 0.7098729455647511, "grad_norm": 1.75, "learning_rate": 1.9986772709819305e-05, "loss": 0.8423, "mean_token_accuracy": 0.8605839788913727, "num_tokens": 12620250.0, "step": 3045 }, { "entropy": 0.29431844148784875, "epoch": 0.7110385825853829, "grad_norm": 0.384765625, "learning_rate": 1.998672829044709e-05, "loss": 0.5822, "mean_token_accuracy": 0.9037684023380279, "num_tokens": 12638767.0, "step": 3050 }, { "entropy": 0.324118447676301, "epoch": 0.7122042196060147, "grad_norm": 0.486328125, "learning_rate": 1.9986683796715253e-05, "loss": 0.6017, "mean_token_accuracy": 0.90307257771492, "num_tokens": 12655374.0, "step": 3055 }, { "entropy": 0.2938187211751938, "epoch": 0.7133698566266464, "grad_norm": 0.9140625, "learning_rate": 1.998663922862446e-05, "loss": 0.5992, "mean_token_accuracy": 0.9050024807453155, "num_tokens": 12666178.0, "step": 3060 }, { "entropy": 0.32323597818613053, "epoch": 0.7145354936472782, "grad_norm": 3.953125, "learning_rate": 1.9986594586175375e-05, "loss": 0.6051, "mean_token_accuracy": 0.8777613401412964, "num_tokens": 12685326.0, "step": 3065 }, { "entropy": 0.28426281437277795, "epoch": 0.71570113066791, "grad_norm": 0.20703125, "learning_rate": 1.9986549869368667e-05, "loss": 0.271, "mean_token_accuracy": 0.892994499206543, "num_tokens": 12722638.0, "step": 3070 }, { "entropy": 0.31557350754737856, "epoch": 0.7168667676885417, "grad_norm": 0.5859375, "learning_rate": 1.9986505078205e-05, "loss": 0.3999, "mean_token_accuracy": 0.9035743296146392, "num_tokens": 12749198.0, "step": 3075 }, { "entropy": 0.371865002810955, "epoch": 0.7180324047091735, "grad_norm": 2.78125, "learning_rate": 1.998646021268504e-05, "loss": 0.8639, "mean_token_accuracy": 0.8539581596851349, "num_tokens": 12759500.0, "step": 3080 }, { "entropy": 0.26819879561662674, "epoch": 0.7191980417298054, "grad_norm": 0.9765625, "learning_rate": 1.9986415272809458e-05, "loss": 0.3749, "mean_token_accuracy": 0.9239216327667237, "num_tokens": 12783515.0, "step": 3085 }, { "entropy": 0.25302067399024963, "epoch": 0.7203636787504372, "grad_norm": 0.46484375, "learning_rate": 1.9986370258578925e-05, "loss": 0.4213, "mean_token_accuracy": 0.9127112746238708, "num_tokens": 12806426.0, "step": 3090 }, { "entropy": 0.3763828493654728, "epoch": 0.7215293157710689, "grad_norm": 4.34375, "learning_rate": 1.9986325169994116e-05, "loss": 0.7125, "mean_token_accuracy": 0.8624271988868714, "num_tokens": 12826732.0, "step": 3095 }, { "entropy": 0.43457455970346925, "epoch": 0.7226949527917007, "grad_norm": 0.51171875, "learning_rate": 1.998628000705569e-05, "loss": 0.791, "mean_token_accuracy": 0.8754983246326447, "num_tokens": 12855074.0, "step": 3100 }, { "entropy": 0.3388949878513813, "epoch": 0.7238605898123325, "grad_norm": 0.76953125, "learning_rate": 1.9986234769764337e-05, "loss": 0.8004, "mean_token_accuracy": 0.8680715382099151, "num_tokens": 12865296.0, "step": 3105 }, { "entropy": 0.24326059743762016, "epoch": 0.7250262268329642, "grad_norm": 0.81640625, "learning_rate": 1.9986189458120722e-05, "loss": 0.3808, "mean_token_accuracy": 0.9010769128799438, "num_tokens": 12895080.0, "step": 3110 }, { "entropy": 0.33171582967042923, "epoch": 0.726191863853596, "grad_norm": 1.0, "learning_rate": 1.998614407212552e-05, "loss": 0.5749, "mean_token_accuracy": 0.9047894716262818, "num_tokens": 12905648.0, "step": 3115 }, { "entropy": 0.2634651020169258, "epoch": 0.7273575008742278, "grad_norm": 1.171875, "learning_rate": 1.9986098611779412e-05, "loss": 0.4922, "mean_token_accuracy": 0.9093749225139618, "num_tokens": 12921502.0, "step": 3120 }, { "entropy": 0.3110058598220348, "epoch": 0.7285231378948596, "grad_norm": 2.375, "learning_rate": 1.9986053077083074e-05, "loss": 0.8057, "mean_token_accuracy": 0.8743705987930298, "num_tokens": 12934872.0, "step": 3125 }, { "entropy": 0.3678535770624876, "epoch": 0.7296887749154913, "grad_norm": 2.015625, "learning_rate": 1.9986007468037187e-05, "loss": 0.6753, "mean_token_accuracy": 0.874816483259201, "num_tokens": 12946749.0, "step": 3130 }, { "entropy": 0.2128050871193409, "epoch": 0.7308544119361231, "grad_norm": 0.66015625, "learning_rate": 1.9985961784642426e-05, "loss": 0.2591, "mean_token_accuracy": 0.9357932329177856, "num_tokens": 12974482.0, "step": 3135 }, { "entropy": 0.31313026919960973, "epoch": 0.7320200489567549, "grad_norm": 5.125, "learning_rate": 1.9985916026899478e-05, "loss": 0.5509, "mean_token_accuracy": 0.8914876878261566, "num_tokens": 12994951.0, "step": 3140 }, { "entropy": 0.2906319335103035, "epoch": 0.7331856859773866, "grad_norm": 3.359375, "learning_rate": 1.9985870194809022e-05, "loss": 0.7678, "mean_token_accuracy": 0.8900413334369659, "num_tokens": 13005139.0, "step": 3145 }, { "entropy": 0.43225277215242386, "epoch": 0.7343513229980184, "grad_norm": 1.328125, "learning_rate": 1.9985824288371743e-05, "loss": 0.6923, "mean_token_accuracy": 0.8420580059289933, "num_tokens": 13032822.0, "step": 3150 }, { "entropy": 0.23028683103621006, "epoch": 0.7355169600186502, "grad_norm": 2.625, "learning_rate": 1.9985778307588323e-05, "loss": 0.2613, "mean_token_accuracy": 0.9250756561756134, "num_tokens": 13069596.0, "step": 3155 }, { "entropy": 0.2778568729758263, "epoch": 0.736682597039282, "grad_norm": 1.6484375, "learning_rate": 1.998573225245945e-05, "loss": 0.4831, "mean_token_accuracy": 0.918241810798645, "num_tokens": 13084383.0, "step": 3160 }, { "entropy": 0.24663490243256092, "epoch": 0.7378482340599137, "grad_norm": 0.66796875, "learning_rate": 1.998568612298581e-05, "loss": 0.2513, "mean_token_accuracy": 0.9311227321624755, "num_tokens": 13114417.0, "step": 3165 }, { "entropy": 0.28456913605332373, "epoch": 0.7390138710805455, "grad_norm": 1.453125, "learning_rate": 1.9985639919168093e-05, "loss": 0.5558, "mean_token_accuracy": 0.8984531760215759, "num_tokens": 13132970.0, "step": 3170 }, { "entropy": 0.2570043332874775, "epoch": 0.7401795081011773, "grad_norm": 8.625, "learning_rate": 1.9985593641006984e-05, "loss": 0.5133, "mean_token_accuracy": 0.9061603724956513, "num_tokens": 13156883.0, "step": 3175 }, { "entropy": 0.2510953940451145, "epoch": 0.741345145121809, "grad_norm": 3.609375, "learning_rate": 1.9985547288503175e-05, "loss": 0.3524, "mean_token_accuracy": 0.9195565164089203, "num_tokens": 13191055.0, "step": 3180 }, { "entropy": 0.2396312952041626, "epoch": 0.7425107821424408, "grad_norm": 1.5703125, "learning_rate": 1.9985500861657358e-05, "loss": 0.353, "mean_token_accuracy": 0.9258275687694549, "num_tokens": 13216177.0, "step": 3185 }, { "entropy": 0.32750511094927787, "epoch": 0.7436764191630726, "grad_norm": 2.609375, "learning_rate": 1.9985454360470224e-05, "loss": 0.4645, "mean_token_accuracy": 0.9065626561641693, "num_tokens": 13229869.0, "step": 3190 }, { "entropy": 0.33407930880784986, "epoch": 0.7448420561837044, "grad_norm": 5.0625, "learning_rate": 1.9985407784942467e-05, "loss": 0.5446, "mean_token_accuracy": 0.8834915876388549, "num_tokens": 13247975.0, "step": 3195 }, { "entropy": 0.35299213230609894, "epoch": 0.7460076932043361, "grad_norm": 1.703125, "learning_rate": 1.9985361135074782e-05, "loss": 0.6986, "mean_token_accuracy": 0.870141988992691, "num_tokens": 13260189.0, "step": 3200 }, { "entropy": 0.2146347463130951, "epoch": 0.7471733302249679, "grad_norm": 2.03125, "learning_rate": 1.998531441086786e-05, "loss": 0.3993, "mean_token_accuracy": 0.9297842562198639, "num_tokens": 13284011.0, "step": 3205 }, { "entropy": 0.31019294261932373, "epoch": 0.7483389672455997, "grad_norm": 1.3359375, "learning_rate": 1.9985267612322408e-05, "loss": 0.5904, "mean_token_accuracy": 0.8879059374332428, "num_tokens": 13301433.0, "step": 3210 }, { "entropy": 0.31197711527347566, "epoch": 0.7495046042662314, "grad_norm": 0.640625, "learning_rate": 1.9985220739439117e-05, "loss": 0.5727, "mean_token_accuracy": 0.8965538263320922, "num_tokens": 13314835.0, "step": 3215 }, { "entropy": 0.3941266030073166, "epoch": 0.7506702412868632, "grad_norm": 1.8984375, "learning_rate": 1.9985173792218683e-05, "loss": 0.9058, "mean_token_accuracy": 0.8483995854854584, "num_tokens": 13323087.0, "step": 3220 }, { "entropy": 0.2891694024205208, "epoch": 0.7518358783074951, "grad_norm": 2.1875, "learning_rate": 1.998512677066181e-05, "loss": 0.5224, "mean_token_accuracy": 0.9001904428005219, "num_tokens": 13345448.0, "step": 3225 }, { "entropy": 0.3195288822054863, "epoch": 0.7530015153281269, "grad_norm": 3.34375, "learning_rate": 1.9985079674769203e-05, "loss": 0.6119, "mean_token_accuracy": 0.892349797487259, "num_tokens": 13357508.0, "step": 3230 }, { "entropy": 0.2566119972616434, "epoch": 0.7541671523487586, "grad_norm": 2.265625, "learning_rate": 1.9985032504541555e-05, "loss": 0.4236, "mean_token_accuracy": 0.9038833737373352, "num_tokens": 13375653.0, "step": 3235 }, { "entropy": 0.36683848649263384, "epoch": 0.7553327893693904, "grad_norm": 2.640625, "learning_rate": 1.9984985259979577e-05, "loss": 0.7721, "mean_token_accuracy": 0.8706809699535369, "num_tokens": 13384215.0, "step": 3240 }, { "entropy": 0.23739213198423387, "epoch": 0.7564984263900222, "grad_norm": 0.80859375, "learning_rate": 1.998493794108397e-05, "loss": 0.4994, "mean_token_accuracy": 0.9105100572109223, "num_tokens": 13398419.0, "step": 3245 }, { "entropy": 0.3068238809704781, "epoch": 0.757664063410654, "grad_norm": 2.828125, "learning_rate": 1.9984890547855444e-05, "loss": 0.6933, "mean_token_accuracy": 0.8871238231658936, "num_tokens": 13410075.0, "step": 3250 }, { "entropy": 0.23343345075845717, "epoch": 0.7588297004312857, "grad_norm": 0.8359375, "learning_rate": 1.9984843080294695e-05, "loss": 0.4127, "mean_token_accuracy": 0.9250391602516175, "num_tokens": 13431831.0, "step": 3255 }, { "entropy": 0.35735142379999163, "epoch": 0.7599953374519175, "grad_norm": 0.67578125, "learning_rate": 1.9984795538402444e-05, "loss": 0.5364, "mean_token_accuracy": 0.8816571056842804, "num_tokens": 13453700.0, "step": 3260 }, { "entropy": 0.2540307299233973, "epoch": 0.7611609744725493, "grad_norm": 0.73828125, "learning_rate": 1.9984747922179393e-05, "loss": 0.2662, "mean_token_accuracy": 0.9232360005378724, "num_tokens": 13481739.0, "step": 3265 }, { "entropy": 0.3812587969005108, "epoch": 0.762326611493181, "grad_norm": 5.71875, "learning_rate": 1.998470023162625e-05, "loss": 0.809, "mean_token_accuracy": 0.8781751275062561, "num_tokens": 13494741.0, "step": 3270 }, { "entropy": 0.3789962977170944, "epoch": 0.7634922485138128, "grad_norm": 1.34375, "learning_rate": 1.9984652466743733e-05, "loss": 0.905, "mean_token_accuracy": 0.8371272504329681, "num_tokens": 13508467.0, "step": 3275 }, { "entropy": 0.2287605084478855, "epoch": 0.7646578855344446, "grad_norm": 3.46875, "learning_rate": 1.9984604627532547e-05, "loss": 0.4125, "mean_token_accuracy": 0.9276242315769195, "num_tokens": 13532941.0, "step": 3280 }, { "entropy": 0.23304533641785383, "epoch": 0.7658235225550764, "grad_norm": 0.5, "learning_rate": 1.9984556713993414e-05, "loss": 0.3236, "mean_token_accuracy": 0.9229490995407105, "num_tokens": 13555509.0, "step": 3285 }, { "entropy": 0.35275202319025994, "epoch": 0.7669891595757081, "grad_norm": 3.375, "learning_rate": 1.9984508726127038e-05, "loss": 0.6199, "mean_token_accuracy": 0.8747760951519012, "num_tokens": 13585680.0, "step": 3290 }, { "entropy": 0.24191362485289575, "epoch": 0.7681547965963399, "grad_norm": 0.58984375, "learning_rate": 1.9984460663934143e-05, "loss": 0.5734, "mean_token_accuracy": 0.9015963733196258, "num_tokens": 13609809.0, "step": 3295 }, { "entropy": 0.29589404761791227, "epoch": 0.7693204336169717, "grad_norm": 2.328125, "learning_rate": 1.998441252741544e-05, "loss": 0.5253, "mean_token_accuracy": 0.8925094544887543, "num_tokens": 13628829.0, "step": 3300 }, { "entropy": 0.20487794056534767, "epoch": 0.7704860706376034, "grad_norm": 1.046875, "learning_rate": 1.9984364316571652e-05, "loss": 0.2499, "mean_token_accuracy": 0.9276638627052307, "num_tokens": 13650807.0, "step": 3305 }, { "entropy": 0.2848545204848051, "epoch": 0.7716517076582352, "grad_norm": 1.6796875, "learning_rate": 1.9984316031403494e-05, "loss": 0.547, "mean_token_accuracy": 0.8881556272506714, "num_tokens": 13665337.0, "step": 3310 }, { "entropy": 0.2990790454670787, "epoch": 0.772817344678867, "grad_norm": 0.59375, "learning_rate": 1.9984267671911685e-05, "loss": 0.4545, "mean_token_accuracy": 0.9087825000286103, "num_tokens": 13686946.0, "step": 3315 }, { "entropy": 0.31242423579096795, "epoch": 0.7739829816994988, "grad_norm": 2.875, "learning_rate": 1.998421923809695e-05, "loss": 0.7536, "mean_token_accuracy": 0.8559371948242187, "num_tokens": 13699043.0, "step": 3320 }, { "entropy": 0.3204685363918543, "epoch": 0.7751486187201305, "grad_norm": 2.5, "learning_rate": 1.998417072996001e-05, "loss": 0.6342, "mean_token_accuracy": 0.880757862329483, "num_tokens": 13715301.0, "step": 3325 }, { "entropy": 0.28653821013867853, "epoch": 0.7763142557407623, "grad_norm": 0.466796875, "learning_rate": 1.9984122147501586e-05, "loss": 0.2364, "mean_token_accuracy": 0.916357421875, "num_tokens": 13739522.0, "step": 3330 }, { "entropy": 0.3267530560493469, "epoch": 0.7774798927613941, "grad_norm": 0.94921875, "learning_rate": 1.9984073490722406e-05, "loss": 0.4487, "mean_token_accuracy": 0.8852592051029206, "num_tokens": 13757223.0, "step": 3335 }, { "entropy": 0.2911109760403633, "epoch": 0.7786455297820258, "grad_norm": 1.125, "learning_rate": 1.9984024759623192e-05, "loss": 0.397, "mean_token_accuracy": 0.9045413374900818, "num_tokens": 13775390.0, "step": 3340 }, { "entropy": 0.28941122740507125, "epoch": 0.7798111668026576, "grad_norm": 2.234375, "learning_rate": 1.9983975954204674e-05, "loss": 0.6316, "mean_token_accuracy": 0.8857159197330475, "num_tokens": 13788002.0, "step": 3345 }, { "entropy": 0.2598882310092449, "epoch": 0.7809768038232894, "grad_norm": 1.515625, "learning_rate": 1.9983927074467577e-05, "loss": 0.4756, "mean_token_accuracy": 0.9066189110279084, "num_tokens": 13826780.0, "step": 3350 }, { "entropy": 0.33761544786393644, "epoch": 0.7821424408439211, "grad_norm": 1.4921875, "learning_rate": 1.9983878120412632e-05, "loss": 0.566, "mean_token_accuracy": 0.8932851791381836, "num_tokens": 13851290.0, "step": 3355 }, { "entropy": 0.2210379447788, "epoch": 0.783308077864553, "grad_norm": 1.21875, "learning_rate": 1.9983829092040568e-05, "loss": 0.2929, "mean_token_accuracy": 0.9251502156257629, "num_tokens": 13876976.0, "step": 3360 }, { "entropy": 0.20661963215097784, "epoch": 0.7844737148851848, "grad_norm": 0.353515625, "learning_rate": 1.9983779989352113e-05, "loss": 0.2915, "mean_token_accuracy": 0.9164006829261779, "num_tokens": 13916136.0, "step": 3365 }, { "entropy": 0.27498682774603367, "epoch": 0.7856393519058166, "grad_norm": 0.85546875, "learning_rate": 1.9983730812348007e-05, "loss": 0.4304, "mean_token_accuracy": 0.8965947449207305, "num_tokens": 13936758.0, "step": 3370 }, { "entropy": 0.3657851852476597, "epoch": 0.7868049889264483, "grad_norm": 6.5625, "learning_rate": 1.9983681561028977e-05, "loss": 0.5509, "mean_token_accuracy": 0.9021944999694824, "num_tokens": 13967866.0, "step": 3375 }, { "entropy": 0.32351127788424494, "epoch": 0.7879706259470801, "grad_norm": 0.7265625, "learning_rate": 1.998363223539576e-05, "loss": 0.7277, "mean_token_accuracy": 0.8853053629398346, "num_tokens": 13979343.0, "step": 3380 }, { "entropy": 0.32861794382333753, "epoch": 0.7891362629677119, "grad_norm": 0.74609375, "learning_rate": 1.9983582835449088e-05, "loss": 0.7126, "mean_token_accuracy": 0.8623862028121948, "num_tokens": 13997112.0, "step": 3385 }, { "entropy": 0.2560902625322342, "epoch": 0.7903018999883437, "grad_norm": 0.365234375, "learning_rate": 1.9983533361189702e-05, "loss": 0.5358, "mean_token_accuracy": 0.8984552204608918, "num_tokens": 14020986.0, "step": 3390 }, { "entropy": 0.38792429491877556, "epoch": 0.7914675370089754, "grad_norm": 2.25, "learning_rate": 1.9983483812618337e-05, "loss": 0.7052, "mean_token_accuracy": 0.8550672352313995, "num_tokens": 14044564.0, "step": 3395 }, { "entropy": 0.3164536517113447, "epoch": 0.7926331740296072, "grad_norm": 1.3984375, "learning_rate": 1.9983434189735735e-05, "loss": 0.4811, "mean_token_accuracy": 0.9013772666454315, "num_tokens": 14060290.0, "step": 3400 }, { "entropy": 0.33436234965920447, "epoch": 0.793798811050239, "grad_norm": 0.28515625, "learning_rate": 1.9983384492542634e-05, "loss": 0.6164, "mean_token_accuracy": 0.9005577623844147, "num_tokens": 14076258.0, "step": 3405 }, { "entropy": 0.4063506111502647, "epoch": 0.7949644480708707, "grad_norm": 8.625, "learning_rate": 1.998333472103977e-05, "loss": 0.8561, "mean_token_accuracy": 0.8626307547092438, "num_tokens": 14106977.0, "step": 3410 }, { "entropy": 0.40561274290084837, "epoch": 0.7961300850915025, "grad_norm": 1.3984375, "learning_rate": 1.9983284875227894e-05, "loss": 0.4162, "mean_token_accuracy": 0.8764112651348114, "num_tokens": 14133864.0, "step": 3415 }, { "entropy": 0.335993605107069, "epoch": 0.7972957221121343, "grad_norm": 1.515625, "learning_rate": 1.9983234955107743e-05, "loss": 0.5466, "mean_token_accuracy": 0.8508065283298493, "num_tokens": 14147764.0, "step": 3420 }, { "entropy": 0.23777510970830917, "epoch": 0.7984613591327661, "grad_norm": 1.53125, "learning_rate": 1.9983184960680068e-05, "loss": 0.2813, "mean_token_accuracy": 0.9156764447689056, "num_tokens": 14169080.0, "step": 3425 }, { "entropy": 0.3019415006041527, "epoch": 0.7996269961533978, "grad_norm": 2.046875, "learning_rate": 1.9983134891945604e-05, "loss": 0.6884, "mean_token_accuracy": 0.8911664128303528, "num_tokens": 14178988.0, "step": 3430 }, { "entropy": 0.34377400428056715, "epoch": 0.8007926331740296, "grad_norm": 1.3515625, "learning_rate": 1.9983084748905107e-05, "loss": 0.4009, "mean_token_accuracy": 0.8842729508876801, "num_tokens": 14213845.0, "step": 3435 }, { "entropy": 0.3087873324751854, "epoch": 0.8019582701946614, "grad_norm": 3.671875, "learning_rate": 1.998303453155932e-05, "loss": 0.7759, "mean_token_accuracy": 0.8876574337482452, "num_tokens": 14223321.0, "step": 3440 }, { "entropy": 0.2858346672728658, "epoch": 0.8031239072152931, "grad_norm": 1.65625, "learning_rate": 1.9982984239908995e-05, "loss": 0.6156, "mean_token_accuracy": 0.890706866979599, "num_tokens": 14242105.0, "step": 3445 }, { "entropy": 0.37374798357486727, "epoch": 0.8042895442359249, "grad_norm": 2.734375, "learning_rate": 1.9982933873954878e-05, "loss": 0.8822, "mean_token_accuracy": 0.86474609375, "num_tokens": 14252615.0, "step": 3450 }, { "entropy": 0.41026286482810975, "epoch": 0.8054551812565567, "grad_norm": 3.125, "learning_rate": 1.9982883433697723e-05, "loss": 0.8252, "mean_token_accuracy": 0.8614952623844147, "num_tokens": 14269831.0, "step": 3455 }, { "entropy": 0.30025038607418536, "epoch": 0.8066208182771885, "grad_norm": 0.390625, "learning_rate": 1.9982832919138286e-05, "loss": 0.1509, "mean_token_accuracy": 0.903667026758194, "num_tokens": 14306269.0, "step": 3460 }, { "entropy": 0.3967276046052575, "epoch": 0.8077864552978202, "grad_norm": 0.447265625, "learning_rate": 1.9982782330277308e-05, "loss": 0.621, "mean_token_accuracy": 0.8603341758251191, "num_tokens": 14335201.0, "step": 3465 }, { "entropy": 0.26602720804512503, "epoch": 0.808952092318452, "grad_norm": 1.5546875, "learning_rate": 1.9982731667115556e-05, "loss": 0.4676, "mean_token_accuracy": 0.8931980729103088, "num_tokens": 14355603.0, "step": 3470 }, { "entropy": 0.30935631804168223, "epoch": 0.8101177293390838, "grad_norm": 0.6328125, "learning_rate": 1.9982680929653777e-05, "loss": 0.441, "mean_token_accuracy": 0.9137549519538879, "num_tokens": 14374468.0, "step": 3475 }, { "entropy": 0.31746798753738403, "epoch": 0.8112833663597155, "grad_norm": 1.0390625, "learning_rate": 1.9982630117892735e-05, "loss": 0.5511, "mean_token_accuracy": 0.8939504265785218, "num_tokens": 14387992.0, "step": 3480 }, { "entropy": 0.20791075341403484, "epoch": 0.8124490033803473, "grad_norm": 0.224609375, "learning_rate": 1.998257923183318e-05, "loss": 0.2899, "mean_token_accuracy": 0.9380963206291199, "num_tokens": 14412238.0, "step": 3485 }, { "entropy": 0.22601248007267713, "epoch": 0.8136146404009791, "grad_norm": 0.84765625, "learning_rate": 1.9982528271475876e-05, "loss": 0.3418, "mean_token_accuracy": 0.9155470192432403, "num_tokens": 14435236.0, "step": 3490 }, { "entropy": 0.2866682179272175, "epoch": 0.814780277421611, "grad_norm": 0.5390625, "learning_rate": 1.998247723682158e-05, "loss": 0.436, "mean_token_accuracy": 0.912602162361145, "num_tokens": 14456632.0, "step": 3495 }, { "entropy": 0.23919593989849092, "epoch": 0.8159459144422427, "grad_norm": 2.59375, "learning_rate": 1.9982426127871056e-05, "loss": 0.3145, "mean_token_accuracy": 0.9271677911281586, "num_tokens": 14478160.0, "step": 3500 }, { "entropy": 0.264223488420248, "epoch": 0.8171115514628745, "grad_norm": 0.67578125, "learning_rate": 1.9982374944625064e-05, "loss": 0.612, "mean_token_accuracy": 0.8922808647155762, "num_tokens": 14494151.0, "step": 3505 }, { "entropy": 0.2899939067661762, "epoch": 0.8182771884835063, "grad_norm": 0.44140625, "learning_rate": 1.9982323687084365e-05, "loss": 0.5561, "mean_token_accuracy": 0.9069446921348572, "num_tokens": 14516496.0, "step": 3510 }, { "entropy": 0.24367965012788773, "epoch": 0.819442825504138, "grad_norm": 0.89453125, "learning_rate": 1.998227235524973e-05, "loss": 0.4233, "mean_token_accuracy": 0.9186331748962402, "num_tokens": 14538180.0, "step": 3515 }, { "entropy": 0.3140773274004459, "epoch": 0.8206084625247698, "grad_norm": 0.490234375, "learning_rate": 1.998222094912192e-05, "loss": 0.4142, "mean_token_accuracy": 0.9007182121276855, "num_tokens": 14571081.0, "step": 3520 }, { "entropy": 0.33965519815683365, "epoch": 0.8217740995454016, "grad_norm": 0.28515625, "learning_rate": 1.9982169468701702e-05, "loss": 0.501, "mean_token_accuracy": 0.8900741517543793, "num_tokens": 14589550.0, "step": 3525 }, { "entropy": 0.3125692706555128, "epoch": 0.8229397365660334, "grad_norm": 2.546875, "learning_rate": 1.9982117913989844e-05, "loss": 0.7301, "mean_token_accuracy": 0.882424396276474, "num_tokens": 14605935.0, "step": 3530 }, { "entropy": 0.2235667049884796, "epoch": 0.8241053735866651, "grad_norm": 0.73046875, "learning_rate": 1.9982066284987108e-05, "loss": 0.3922, "mean_token_accuracy": 0.9213648438453674, "num_tokens": 14629378.0, "step": 3535 }, { "entropy": 0.3533320169895887, "epoch": 0.8252710106072969, "grad_norm": 0.765625, "learning_rate": 1.9982014581694277e-05, "loss": 0.5098, "mean_token_accuracy": 0.8809333443641663, "num_tokens": 14659124.0, "step": 3540 }, { "entropy": 0.28666834309697153, "epoch": 0.8264366476279287, "grad_norm": 1.2265625, "learning_rate": 1.9981962804112113e-05, "loss": 0.6463, "mean_token_accuracy": 0.8949851632118225, "num_tokens": 14673210.0, "step": 3545 }, { "entropy": 0.27381937131285666, "epoch": 0.8276022846485604, "grad_norm": 3.078125, "learning_rate": 1.998191095224139e-05, "loss": 0.5301, "mean_token_accuracy": 0.9156894981861115, "num_tokens": 14688514.0, "step": 3550 }, { "entropy": 0.27920118868350985, "epoch": 0.8287679216691922, "grad_norm": 4.53125, "learning_rate": 1.9981859026082882e-05, "loss": 0.558, "mean_token_accuracy": 0.8992727637290955, "num_tokens": 14705520.0, "step": 3555 }, { "entropy": 0.3166303887963295, "epoch": 0.829933558689824, "grad_norm": 1.625, "learning_rate": 1.9981807025637365e-05, "loss": 0.4082, "mean_token_accuracy": 0.9166463553905487, "num_tokens": 14728999.0, "step": 3560 }, { "entropy": 0.26104464530944826, "epoch": 0.8310991957104558, "grad_norm": 0.259765625, "learning_rate": 1.998175495090561e-05, "loss": 0.493, "mean_token_accuracy": 0.9059361159801483, "num_tokens": 14766556.0, "step": 3565 }, { "entropy": 0.31263031214475634, "epoch": 0.8322648327310875, "grad_norm": 2.796875, "learning_rate": 1.9981702801888393e-05, "loss": 0.5876, "mean_token_accuracy": 0.8922380983829499, "num_tokens": 14786649.0, "step": 3570 }, { "entropy": 0.17848571315407752, "epoch": 0.8334304697517193, "grad_norm": 0.3359375, "learning_rate": 1.99816505785865e-05, "loss": 0.2014, "mean_token_accuracy": 0.947855943441391, "num_tokens": 14831989.0, "step": 3575 }, { "entropy": 0.3408513143658638, "epoch": 0.8345961067723511, "grad_norm": 2.84375, "learning_rate": 1.99815982810007e-05, "loss": 0.5118, "mean_token_accuracy": 0.8933568835258484, "num_tokens": 14849088.0, "step": 3580 }, { "entropy": 0.2799061857163906, "epoch": 0.8357617437929828, "grad_norm": 2.3125, "learning_rate": 1.998154590913178e-05, "loss": 0.5424, "mean_token_accuracy": 0.9027321338653564, "num_tokens": 14864188.0, "step": 3585 }, { "entropy": 0.2542919620871544, "epoch": 0.8369273808136146, "grad_norm": 1.484375, "learning_rate": 1.9981493462980514e-05, "loss": 0.424, "mean_token_accuracy": 0.9119072020053863, "num_tokens": 14881774.0, "step": 3590 }, { "entropy": 0.28481525033712385, "epoch": 0.8380930178342464, "grad_norm": 1.6875, "learning_rate": 1.998144094254769e-05, "loss": 0.4691, "mean_token_accuracy": 0.9085649073123931, "num_tokens": 14896673.0, "step": 3595 }, { "entropy": 0.28367489129304885, "epoch": 0.8392586548548782, "grad_norm": 1.546875, "learning_rate": 1.998138834783409e-05, "loss": 0.5003, "mean_token_accuracy": 0.9036788463592529, "num_tokens": 14910997.0, "step": 3600 }, { "entropy": 0.24263509139418601, "epoch": 0.8404242918755099, "grad_norm": 2.21875, "learning_rate": 1.9981335678840495e-05, "loss": 0.3992, "mean_token_accuracy": 0.9190425634384155, "num_tokens": 14935090.0, "step": 3605 }, { "entropy": 0.24624939411878585, "epoch": 0.8415899288961417, "grad_norm": 0.490234375, "learning_rate": 1.9981282935567693e-05, "loss": 0.4746, "mean_token_accuracy": 0.9015482604503632, "num_tokens": 14949307.0, "step": 3610 }, { "entropy": 0.2743656687438488, "epoch": 0.8427555659167735, "grad_norm": 0.427734375, "learning_rate": 1.998123011801647e-05, "loss": 0.2832, "mean_token_accuracy": 0.9231635749340057, "num_tokens": 14979284.0, "step": 3615 }, { "entropy": 0.18736706282943488, "epoch": 0.8439212029374052, "grad_norm": 0.87109375, "learning_rate": 1.9981177226187617e-05, "loss": 0.3228, "mean_token_accuracy": 0.9420149087905884, "num_tokens": 15012270.0, "step": 3620 }, { "entropy": 0.356676259636879, "epoch": 0.845086839958037, "grad_norm": 2.09375, "learning_rate": 1.9981124260081917e-05, "loss": 0.7459, "mean_token_accuracy": 0.8781390905380249, "num_tokens": 15023574.0, "step": 3625 }, { "entropy": 0.24901985973119736, "epoch": 0.8462524769786688, "grad_norm": 0.4375, "learning_rate": 1.998107121970016e-05, "loss": 0.3729, "mean_token_accuracy": 0.9155802965164185, "num_tokens": 15053734.0, "step": 3630 }, { "entropy": 0.26287381947040556, "epoch": 0.8474181139993007, "grad_norm": 2.234375, "learning_rate": 1.9981018105043144e-05, "loss": 0.5193, "mean_token_accuracy": 0.9083257913589478, "num_tokens": 15072791.0, "step": 3635 }, { "entropy": 0.3157116275280714, "epoch": 0.8485837510199324, "grad_norm": 1.0625, "learning_rate": 1.9980964916111654e-05, "loss": 0.3004, "mean_token_accuracy": 0.903801566362381, "num_tokens": 15107653.0, "step": 3640 }, { "entropy": 0.28021684251725676, "epoch": 0.8497493880405642, "grad_norm": 0.50390625, "learning_rate": 1.9980911652906484e-05, "loss": 0.4549, "mean_token_accuracy": 0.8997021973133087, "num_tokens": 15132932.0, "step": 3645 }, { "entropy": 0.25191900655627253, "epoch": 0.850915025061196, "grad_norm": 1.5859375, "learning_rate": 1.998085831542843e-05, "loss": 0.4318, "mean_token_accuracy": 0.9206875085830688, "num_tokens": 15146894.0, "step": 3650 }, { "entropy": 0.3430237350985408, "epoch": 0.8520806620818278, "grad_norm": 4.5, "learning_rate": 1.9980804903678287e-05, "loss": 0.5912, "mean_token_accuracy": 0.88895942568779, "num_tokens": 15174424.0, "step": 3655 }, { "entropy": 0.31596364453434944, "epoch": 0.8532462991024595, "grad_norm": 1.6640625, "learning_rate": 1.998075141765685e-05, "loss": 0.3543, "mean_token_accuracy": 0.8607090711593628, "num_tokens": 15204731.0, "step": 3660 }, { "entropy": 0.3275330767035484, "epoch": 0.8544119361230913, "grad_norm": 3.59375, "learning_rate": 1.998069785736492e-05, "loss": 0.5245, "mean_token_accuracy": 0.907353276014328, "num_tokens": 15232691.0, "step": 3665 }, { "entropy": 0.27450884664431213, "epoch": 0.8555775731437231, "grad_norm": 0.162109375, "learning_rate": 1.9980644222803296e-05, "loss": 0.4254, "mean_token_accuracy": 0.8911545634269714, "num_tokens": 15270176.0, "step": 3670 }, { "entropy": 0.2559219378978014, "epoch": 0.8567432101643548, "grad_norm": 0.73046875, "learning_rate": 1.9980590513972775e-05, "loss": 0.5514, "mean_token_accuracy": 0.911538553237915, "num_tokens": 15289751.0, "step": 3675 }, { "entropy": 0.24207828417420388, "epoch": 0.8579088471849866, "grad_norm": 1.515625, "learning_rate": 1.9980536730874154e-05, "loss": 0.2878, "mean_token_accuracy": 0.9238324701786041, "num_tokens": 15314355.0, "step": 3680 }, { "entropy": 0.44712048918008807, "epoch": 0.8590744842056184, "grad_norm": 1.640625, "learning_rate": 1.998048287350824e-05, "loss": 0.5822, "mean_token_accuracy": 0.8792366266250611, "num_tokens": 15325754.0, "step": 3685 }, { "entropy": 0.31567412763834, "epoch": 0.8602401212262502, "grad_norm": 1.9375, "learning_rate": 1.9980428941875835e-05, "loss": 0.4949, "mean_token_accuracy": 0.914052402973175, "num_tokens": 15353841.0, "step": 3690 }, { "entropy": 0.26079447716474535, "epoch": 0.8614057582468819, "grad_norm": 2.453125, "learning_rate": 1.9980374935977747e-05, "loss": 0.53, "mean_token_accuracy": 0.9045489251613616, "num_tokens": 15376130.0, "step": 3695 }, { "entropy": 0.41225661505013705, "epoch": 0.8625713952675137, "grad_norm": 1.2265625, "learning_rate": 1.9980320855814775e-05, "loss": 0.5735, "mean_token_accuracy": 0.8939766168594361, "num_tokens": 15407430.0, "step": 3700 }, { "entropy": 0.30904234796762464, "epoch": 0.8637370322881455, "grad_norm": 2.796875, "learning_rate": 1.998026670138773e-05, "loss": 0.6142, "mean_token_accuracy": 0.8968602418899536, "num_tokens": 15418732.0, "step": 3705 }, { "entropy": 0.2660053789615631, "epoch": 0.8649026693087772, "grad_norm": 0.376953125, "learning_rate": 1.9980212472697414e-05, "loss": 0.283, "mean_token_accuracy": 0.9082744538784027, "num_tokens": 15458633.0, "step": 3710 }, { "entropy": 0.2127044014632702, "epoch": 0.866068306329409, "grad_norm": 3.171875, "learning_rate": 1.9980158169744644e-05, "loss": 0.2172, "mean_token_accuracy": 0.9319159090518951, "num_tokens": 15493223.0, "step": 3715 }, { "entropy": 0.3829865030944347, "epoch": 0.8672339433500408, "grad_norm": 3.171875, "learning_rate": 1.998010379253022e-05, "loss": 0.8015, "mean_token_accuracy": 0.8439773738384246, "num_tokens": 15505228.0, "step": 3720 }, { "entropy": 0.36416123397648337, "epoch": 0.8683995803706726, "grad_norm": 2.0625, "learning_rate": 1.9980049341054963e-05, "loss": 0.6867, "mean_token_accuracy": 0.8739733338356018, "num_tokens": 15520660.0, "step": 3725 }, { "entropy": 0.2619575455784798, "epoch": 0.8695652173913043, "grad_norm": 0.41796875, "learning_rate": 1.9979994815319677e-05, "loss": 0.4769, "mean_token_accuracy": 0.9197808086872101, "num_tokens": 15550128.0, "step": 3730 }, { "entropy": 0.27543322481215, "epoch": 0.8707308544119361, "grad_norm": 2.5625, "learning_rate": 1.9979940215325178e-05, "loss": 0.5064, "mean_token_accuracy": 0.9059760510921478, "num_tokens": 15571541.0, "step": 3735 }, { "entropy": 0.25470650345087054, "epoch": 0.8718964914325679, "grad_norm": 0.53515625, "learning_rate": 1.997988554107228e-05, "loss": 0.3501, "mean_token_accuracy": 0.9108909010887146, "num_tokens": 15608560.0, "step": 3740 }, { "entropy": 0.24583430550992488, "epoch": 0.8730621284531996, "grad_norm": 0.8671875, "learning_rate": 1.99798307925618e-05, "loss": 0.382, "mean_token_accuracy": 0.9136410057544708, "num_tokens": 15630666.0, "step": 3745 }, { "entropy": 0.1376958515495062, "epoch": 0.8742277654738314, "grad_norm": 0.9296875, "learning_rate": 1.9979775969794553e-05, "loss": 0.1491, "mean_token_accuracy": 0.9549042522907257, "num_tokens": 15671696.0, "step": 3750 }, { "entropy": 0.22299788594245912, "epoch": 0.8753934024944632, "grad_norm": 0.61328125, "learning_rate": 1.9979721072771357e-05, "loss": 0.4353, "mean_token_accuracy": 0.9072445154190063, "num_tokens": 15703582.0, "step": 3755 }, { "entropy": 0.24687078446149827, "epoch": 0.876559039515095, "grad_norm": 1.265625, "learning_rate": 1.9979666101493027e-05, "loss": 0.592, "mean_token_accuracy": 0.891139543056488, "num_tokens": 15722485.0, "step": 3760 }, { "entropy": 0.3579208765178919, "epoch": 0.8777246765357267, "grad_norm": 3.34375, "learning_rate": 1.9979611055960385e-05, "loss": 0.8938, "mean_token_accuracy": 0.8332218408584595, "num_tokens": 15746229.0, "step": 3765 }, { "entropy": 0.2589207597076893, "epoch": 0.8788903135563586, "grad_norm": 0.36328125, "learning_rate": 1.997955593617426e-05, "loss": 0.5105, "mean_token_accuracy": 0.9086531221866607, "num_tokens": 15786496.0, "step": 3770 }, { "entropy": 0.22225482761859894, "epoch": 0.8800559505769904, "grad_norm": 2.1875, "learning_rate": 1.9979500742135456e-05, "loss": 0.4195, "mean_token_accuracy": 0.9165923833847046, "num_tokens": 15805106.0, "step": 3775 }, { "entropy": 0.3877521827816963, "epoch": 0.8812215875976221, "grad_norm": 1.34375, "learning_rate": 1.9979445473844813e-05, "loss": 0.5482, "mean_token_accuracy": 0.8581431388854981, "num_tokens": 15825199.0, "step": 3780 }, { "entropy": 0.24894805401563644, "epoch": 0.8823872246182539, "grad_norm": 2.171875, "learning_rate": 1.9979390131303144e-05, "loss": 0.4555, "mean_token_accuracy": 0.9111175835132599, "num_tokens": 15845149.0, "step": 3785 }, { "entropy": 0.40299307107925414, "epoch": 0.8835528616388857, "grad_norm": 3.0625, "learning_rate": 1.997933471451128e-05, "loss": 0.8629, "mean_token_accuracy": 0.8512665927410126, "num_tokens": 15862956.0, "step": 3790 }, { "entropy": 0.28117614462971685, "epoch": 0.8847184986595175, "grad_norm": 2.09375, "learning_rate": 1.997927922347005e-05, "loss": 0.4772, "mean_token_accuracy": 0.9130080938339233, "num_tokens": 15876637.0, "step": 3795 }, { "entropy": 0.5891709521412849, "epoch": 0.8858841356801492, "grad_norm": 0.73046875, "learning_rate": 1.9979223658180273e-05, "loss": 0.8647, "mean_token_accuracy": 0.7845417231321334, "num_tokens": 15909938.0, "step": 3800 }, { "entropy": 0.3107435509562492, "epoch": 0.887049772700781, "grad_norm": 0.37890625, "learning_rate": 1.9979168018642784e-05, "loss": 0.5805, "mean_token_accuracy": 0.8911435663700104, "num_tokens": 15937554.0, "step": 3805 }, { "entropy": 0.2665897116065025, "epoch": 0.8882154097214128, "grad_norm": 3.84375, "learning_rate": 1.997911230485841e-05, "loss": 0.7637, "mean_token_accuracy": 0.8828855872154235, "num_tokens": 15947609.0, "step": 3810 }, { "entropy": 0.3165190897881985, "epoch": 0.8893810467420445, "grad_norm": 2.921875, "learning_rate": 1.9979056516827984e-05, "loss": 0.6888, "mean_token_accuracy": 0.8773676693439484, "num_tokens": 15960074.0, "step": 3815 }, { "entropy": 0.2786505434662104, "epoch": 0.8905466837626763, "grad_norm": 6.0, "learning_rate": 1.9979000654552336e-05, "loss": 0.6122, "mean_token_accuracy": 0.8958684921264648, "num_tokens": 15977495.0, "step": 3820 }, { "entropy": 0.28286024890840056, "epoch": 0.8917123207833081, "grad_norm": 6.28125, "learning_rate": 1.99789447180323e-05, "loss": 0.6977, "mean_token_accuracy": 0.8930557310581207, "num_tokens": 15992438.0, "step": 3825 }, { "entropy": 0.27810373045504094, "epoch": 0.8928779578039399, "grad_norm": 0.314453125, "learning_rate": 1.9978888707268706e-05, "loss": 0.4654, "mean_token_accuracy": 0.8919014155864715, "num_tokens": 16020386.0, "step": 3830 }, { "entropy": 0.4190558884292841, "epoch": 0.8940435948245716, "grad_norm": 0.2421875, "learning_rate": 1.9978832622262397e-05, "loss": 0.6651, "mean_token_accuracy": 0.8513571143150329, "num_tokens": 16039241.0, "step": 3835 }, { "entropy": 0.25377435609698296, "epoch": 0.8952092318452034, "grad_norm": 0.8359375, "learning_rate": 1.9978776463014203e-05, "loss": 0.5405, "mean_token_accuracy": 0.9124003052711487, "num_tokens": 16052979.0, "step": 3840 }, { "entropy": 0.384748013317585, "epoch": 0.8963748688658352, "grad_norm": 0.55078125, "learning_rate": 1.9978720229524963e-05, "loss": 0.6201, "mean_token_accuracy": 0.8712624669075012, "num_tokens": 16071177.0, "step": 3845 }, { "entropy": 0.2742875192314386, "epoch": 0.8975405058864669, "grad_norm": 0.81640625, "learning_rate": 1.9978663921795515e-05, "loss": 0.5037, "mean_token_accuracy": 0.8969690918922424, "num_tokens": 16098544.0, "step": 3850 }, { "entropy": 0.3160762920975685, "epoch": 0.8987061429070987, "grad_norm": 3.375, "learning_rate": 1.9978607539826702e-05, "loss": 0.5482, "mean_token_accuracy": 0.8990673005580903, "num_tokens": 16112610.0, "step": 3855 }, { "entropy": 0.32029368802905084, "epoch": 0.8998717799277305, "grad_norm": 0.90234375, "learning_rate": 1.997855108361936e-05, "loss": 0.41, "mean_token_accuracy": 0.9168787181377411, "num_tokens": 16135541.0, "step": 3860 }, { "entropy": 0.24988686461001636, "epoch": 0.9010374169483623, "grad_norm": 0.3984375, "learning_rate": 1.9978494553174337e-05, "loss": 0.3174, "mean_token_accuracy": 0.9324497878551483, "num_tokens": 16157098.0, "step": 3865 }, { "entropy": 0.40392661690711973, "epoch": 0.902203053968994, "grad_norm": 2.328125, "learning_rate": 1.997843794849247e-05, "loss": 1.0247, "mean_token_accuracy": 0.8482144296169281, "num_tokens": 16164815.0, "step": 3870 }, { "entropy": 0.26116530895233153, "epoch": 0.9033686909896258, "grad_norm": 2.9375, "learning_rate": 1.9978381269574605e-05, "loss": 0.6495, "mean_token_accuracy": 0.9009441614151001, "num_tokens": 16184481.0, "step": 3875 }, { "entropy": 0.21197875253856183, "epoch": 0.9045343280102576, "grad_norm": 0.390625, "learning_rate": 1.997832451642159e-05, "loss": 0.2791, "mean_token_accuracy": 0.9292280495166778, "num_tokens": 16216813.0, "step": 3880 }, { "entropy": 0.3333045765757561, "epoch": 0.9056999650308893, "grad_norm": 2.34375, "learning_rate": 1.9978267689034263e-05, "loss": 0.8819, "mean_token_accuracy": 0.855522632598877, "num_tokens": 16234546.0, "step": 3885 }, { "entropy": 0.37700636237859725, "epoch": 0.9068656020515211, "grad_norm": 0.38671875, "learning_rate": 1.9978210787413478e-05, "loss": 0.7481, "mean_token_accuracy": 0.8536148369312286, "num_tokens": 16251924.0, "step": 3890 }, { "entropy": 0.31856413632631303, "epoch": 0.9080312390721529, "grad_norm": 0.71484375, "learning_rate": 1.9978153811560084e-05, "loss": 0.6041, "mean_token_accuracy": 0.8982816457748413, "num_tokens": 16263609.0, "step": 3895 }, { "entropy": 0.3457029201090336, "epoch": 0.9091968760927847, "grad_norm": 2.015625, "learning_rate": 1.9978096761474932e-05, "loss": 0.4611, "mean_token_accuracy": 0.9150507569313049, "num_tokens": 16284607.0, "step": 3900 }, { "entropy": 0.28251473493874074, "epoch": 0.9103625131134164, "grad_norm": 2.078125, "learning_rate": 1.9978039637158865e-05, "loss": 0.6731, "mean_token_accuracy": 0.8842589139938355, "num_tokens": 16302675.0, "step": 3905 }, { "entropy": 0.23792364951223136, "epoch": 0.9115281501340483, "grad_norm": 0.734375, "learning_rate": 1.9977982438612744e-05, "loss": 0.3204, "mean_token_accuracy": 0.9141345858573914, "num_tokens": 16339004.0, "step": 3910 }, { "entropy": 0.23666626550257205, "epoch": 0.9126937871546801, "grad_norm": 0.9453125, "learning_rate": 1.9977925165837415e-05, "loss": 0.3407, "mean_token_accuracy": 0.9296033978462219, "num_tokens": 16359569.0, "step": 3915 }, { "entropy": 0.22190179247409106, "epoch": 0.9138594241753119, "grad_norm": 2.28125, "learning_rate": 1.9977867818833737e-05, "loss": 0.2375, "mean_token_accuracy": 0.915921813249588, "num_tokens": 16393658.0, "step": 3920 }, { "entropy": 0.3424635842442513, "epoch": 0.9150250611959436, "grad_norm": 2.140625, "learning_rate": 1.997781039760256e-05, "loss": 0.4036, "mean_token_accuracy": 0.8847531020641327, "num_tokens": 16426979.0, "step": 3925 }, { "entropy": 0.2571659699082375, "epoch": 0.9161906982165754, "grad_norm": 2.484375, "learning_rate": 1.9977752902144742e-05, "loss": 0.5223, "mean_token_accuracy": 0.9112405836582184, "num_tokens": 16438542.0, "step": 3930 }, { "entropy": 0.3213506378233433, "epoch": 0.9173563352372072, "grad_norm": 0.46875, "learning_rate": 1.9977695332461143e-05, "loss": 0.5068, "mean_token_accuracy": 0.8930616676807404, "num_tokens": 16453504.0, "step": 3935 }, { "entropy": 0.19305912591516972, "epoch": 0.9185219722578389, "grad_norm": 0.96875, "learning_rate": 1.997763768855262e-05, "loss": 0.2318, "mean_token_accuracy": 0.932563591003418, "num_tokens": 16485783.0, "step": 3940 }, { "entropy": 0.22780342772603035, "epoch": 0.9196876092784707, "grad_norm": 1.0078125, "learning_rate": 1.9977579970420033e-05, "loss": 0.4052, "mean_token_accuracy": 0.9049673736095428, "num_tokens": 16511421.0, "step": 3945 }, { "entropy": 0.17885006498545408, "epoch": 0.9208532462991025, "grad_norm": 0.671875, "learning_rate": 1.9977522178064242e-05, "loss": 0.3811, "mean_token_accuracy": 0.9320909857749939, "num_tokens": 16542554.0, "step": 3950 }, { "entropy": 0.2305964458733797, "epoch": 0.9220188833197343, "grad_norm": 0.265625, "learning_rate": 1.9977464311486105e-05, "loss": 0.293, "mean_token_accuracy": 0.9366983592510223, "num_tokens": 16577722.0, "step": 3955 }, { "entropy": 0.22702017351984977, "epoch": 0.923184520340366, "grad_norm": 2.265625, "learning_rate": 1.9977406370686493e-05, "loss": 0.4052, "mean_token_accuracy": 0.9182761132717132, "num_tokens": 16605331.0, "step": 3960 }, { "entropy": 0.33075036108493805, "epoch": 0.9243501573609978, "grad_norm": 0.82421875, "learning_rate": 1.9977348355666265e-05, "loss": 0.6821, "mean_token_accuracy": 0.8687184751033783, "num_tokens": 16616854.0, "step": 3965 }, { "entropy": 0.2858414173126221, "epoch": 0.9255157943816296, "grad_norm": 0.99609375, "learning_rate": 1.9977290266426284e-05, "loss": 0.4689, "mean_token_accuracy": 0.91506507396698, "num_tokens": 16636421.0, "step": 3970 }, { "entropy": 0.31540155187249186, "epoch": 0.9266814314022613, "grad_norm": 0.45703125, "learning_rate": 1.9977232102967422e-05, "loss": 0.3668, "mean_token_accuracy": 0.9111514151096344, "num_tokens": 16659301.0, "step": 3975 }, { "entropy": 0.31216612458229065, "epoch": 0.9278470684228931, "grad_norm": 2.53125, "learning_rate": 1.997717386529054e-05, "loss": 0.7358, "mean_token_accuracy": 0.8802648484706879, "num_tokens": 16672943.0, "step": 3980 }, { "entropy": 0.2625709608197212, "epoch": 0.9290127054435249, "grad_norm": 1.2578125, "learning_rate": 1.997711555339651e-05, "loss": 0.3855, "mean_token_accuracy": 0.9172736048698426, "num_tokens": 16695486.0, "step": 3985 }, { "entropy": 0.3358628749847412, "epoch": 0.9301783424641566, "grad_norm": 1.265625, "learning_rate": 1.9977057167286203e-05, "loss": 0.7025, "mean_token_accuracy": 0.888811719417572, "num_tokens": 16704825.0, "step": 3990 }, { "entropy": 0.24518522769212722, "epoch": 0.9313439794847884, "grad_norm": 1.6328125, "learning_rate": 1.9976998706960488e-05, "loss": 0.4656, "mean_token_accuracy": 0.9169709384441376, "num_tokens": 16724234.0, "step": 3995 }, { "entropy": 0.25113632045686246, "epoch": 0.9325096165054202, "grad_norm": 0.419921875, "learning_rate": 1.9976940172420232e-05, "loss": 0.3557, "mean_token_accuracy": 0.9113317489624023, "num_tokens": 16757652.0, "step": 4000 }, { "entropy": 0.3050874337553978, "epoch": 0.933675253526052, "grad_norm": 1.3203125, "learning_rate": 1.9976881563666318e-05, "loss": 0.4791, "mean_token_accuracy": 0.8894456505775452, "num_tokens": 16779558.0, "step": 4005 }, { "entropy": 0.354495170339942, "epoch": 0.9348408905466837, "grad_norm": 1.4296875, "learning_rate": 1.9976822880699612e-05, "loss": 0.5045, "mean_token_accuracy": 0.8878437876701355, "num_tokens": 16805757.0, "step": 4010 }, { "entropy": 0.24523208253085613, "epoch": 0.9360065275673155, "grad_norm": 0.64453125, "learning_rate": 1.997676412352099e-05, "loss": 0.5218, "mean_token_accuracy": 0.9126578509807587, "num_tokens": 16824284.0, "step": 4015 }, { "entropy": 0.19175314232707025, "epoch": 0.9371721645879473, "grad_norm": 0.79296875, "learning_rate": 1.997670529213133e-05, "loss": 0.3261, "mean_token_accuracy": 0.9379496335983276, "num_tokens": 16847471.0, "step": 4020 }, { "entropy": 0.2729227438569069, "epoch": 0.938337801608579, "grad_norm": 0.6875, "learning_rate": 1.997664638653151e-05, "loss": 0.5073, "mean_token_accuracy": 0.9096316695213318, "num_tokens": 16869398.0, "step": 4025 }, { "entropy": 0.3152917675673962, "epoch": 0.9395034386292108, "grad_norm": 1.3046875, "learning_rate": 1.9976587406722404e-05, "loss": 0.5437, "mean_token_accuracy": 0.9035122036933899, "num_tokens": 16881743.0, "step": 4030 }, { "entropy": 0.4029148206114769, "epoch": 0.9406690756498426, "grad_norm": 1.9609375, "learning_rate": 1.99765283527049e-05, "loss": 0.6399, "mean_token_accuracy": 0.8744793474674225, "num_tokens": 16896559.0, "step": 4035 }, { "entropy": 0.4212800972163677, "epoch": 0.9418347126704744, "grad_norm": 2.6875, "learning_rate": 1.9976469224479868e-05, "loss": 0.6903, "mean_token_accuracy": 0.8723967254161835, "num_tokens": 16909529.0, "step": 4040 }, { "entropy": 0.33663665801286696, "epoch": 0.9430003496911062, "grad_norm": 1.21875, "learning_rate": 1.9976410022048198e-05, "loss": 0.7401, "mean_token_accuracy": 0.8718845307826996, "num_tokens": 16920073.0, "step": 4045 }, { "entropy": 0.24011497870087622, "epoch": 0.944165986711738, "grad_norm": 0.50390625, "learning_rate": 1.997635074541077e-05, "loss": 0.4172, "mean_token_accuracy": 0.9157796263694763, "num_tokens": 16948944.0, "step": 4050 }, { "entropy": 0.2570995256304741, "epoch": 0.9453316237323698, "grad_norm": 0.73046875, "learning_rate": 1.9976291394568465e-05, "loss": 0.3138, "mean_token_accuracy": 0.92692631483078, "num_tokens": 16973131.0, "step": 4055 }, { "entropy": 0.4393061429262161, "epoch": 0.9464972607530016, "grad_norm": 2.40625, "learning_rate": 1.997623196952217e-05, "loss": 0.8668, "mean_token_accuracy": 0.8413293391466141, "num_tokens": 16992622.0, "step": 4060 }, { "entropy": 0.34109273105859755, "epoch": 0.9476628977736333, "grad_norm": 0.69921875, "learning_rate": 1.9976172470272774e-05, "loss": 0.5098, "mean_token_accuracy": 0.9031398296356201, "num_tokens": 17013703.0, "step": 4065 }, { "entropy": 0.5193141900002957, "epoch": 0.9488285347942651, "grad_norm": 0.79296875, "learning_rate": 1.9976112896821164e-05, "loss": 0.7514, "mean_token_accuracy": 0.8595295757055282, "num_tokens": 17046681.0, "step": 4070 }, { "entropy": 0.29651210978627207, "epoch": 0.9499941718148969, "grad_norm": 0.34375, "learning_rate": 1.997605324916822e-05, "loss": 0.6418, "mean_token_accuracy": 0.8880113184452056, "num_tokens": 17068445.0, "step": 4075 }, { "entropy": 0.2569761071354151, "epoch": 0.9511598088355286, "grad_norm": 0.8515625, "learning_rate": 1.997599352731484e-05, "loss": 0.5029, "mean_token_accuracy": 0.9025446593761444, "num_tokens": 17085586.0, "step": 4080 }, { "entropy": 0.30053408779203894, "epoch": 0.9523254458561604, "grad_norm": 2.40625, "learning_rate": 1.9975933731261917e-05, "loss": 0.5302, "mean_token_accuracy": 0.8973315000534058, "num_tokens": 17108799.0, "step": 4085 }, { "entropy": 0.2752389371395111, "epoch": 0.9534910828767922, "grad_norm": 0.462890625, "learning_rate": 1.997587386101033e-05, "loss": 0.5593, "mean_token_accuracy": 0.900017648935318, "num_tokens": 17120763.0, "step": 4090 }, { "entropy": 0.19511011131107808, "epoch": 0.954656719897424, "grad_norm": 0.8984375, "learning_rate": 1.9975813916560988e-05, "loss": 0.169, "mean_token_accuracy": 0.9473197340965271, "num_tokens": 17147695.0, "step": 4095 }, { "entropy": 0.2630501437932253, "epoch": 0.9558223569180557, "grad_norm": 2.75, "learning_rate": 1.997575389791477e-05, "loss": 0.5802, "mean_token_accuracy": 0.895086270570755, "num_tokens": 17164836.0, "step": 4100 }, { "entropy": 0.3251684829592705, "epoch": 0.9569879939386875, "grad_norm": 0.36328125, "learning_rate": 1.997569380507258e-05, "loss": 0.628, "mean_token_accuracy": 0.8754357278347016, "num_tokens": 17193063.0, "step": 4105 }, { "entropy": 0.26151609756052496, "epoch": 0.9581536309593193, "grad_norm": 0.21484375, "learning_rate": 1.997563363803531e-05, "loss": 0.4691, "mean_token_accuracy": 0.9026021063327789, "num_tokens": 17220094.0, "step": 4110 }, { "entropy": 0.20274873673915864, "epoch": 0.959319267979951, "grad_norm": 1.3984375, "learning_rate": 1.997557339680386e-05, "loss": 0.5498, "mean_token_accuracy": 0.910257762670517, "num_tokens": 17234695.0, "step": 4115 }, { "entropy": 0.2564930349588394, "epoch": 0.9604849050005828, "grad_norm": 1.8046875, "learning_rate": 1.9975513081379125e-05, "loss": 0.4044, "mean_token_accuracy": 0.9224636614322662, "num_tokens": 17251544.0, "step": 4120 }, { "entropy": 0.2453150164335966, "epoch": 0.9616505420212146, "grad_norm": 1.65625, "learning_rate": 1.997545269176201e-05, "loss": 0.3385, "mean_token_accuracy": 0.9272057890892029, "num_tokens": 17277767.0, "step": 4125 }, { "entropy": 0.27376823760569097, "epoch": 0.9628161790418464, "grad_norm": 4.125, "learning_rate": 1.997539222795341e-05, "loss": 0.5076, "mean_token_accuracy": 0.8964590787887573, "num_tokens": 17299605.0, "step": 4130 }, { "entropy": 0.2696050863713026, "epoch": 0.9639818160624781, "grad_norm": 1.953125, "learning_rate": 1.9975331689954228e-05, "loss": 0.3222, "mean_token_accuracy": 0.9068039536476136, "num_tokens": 17337810.0, "step": 4135 }, { "entropy": 0.27581446021795275, "epoch": 0.9651474530831099, "grad_norm": 0.921875, "learning_rate": 1.9975271077765365e-05, "loss": 0.569, "mean_token_accuracy": 0.893970274925232, "num_tokens": 17350006.0, "step": 4140 }, { "entropy": 0.2386571519076824, "epoch": 0.9663130901037417, "grad_norm": 0.94140625, "learning_rate": 1.997521039138773e-05, "loss": 0.4095, "mean_token_accuracy": 0.9206796944141388, "num_tokens": 17364775.0, "step": 4145 }, { "entropy": 0.19428746849298478, "epoch": 0.9674787271243734, "grad_norm": 0.65234375, "learning_rate": 1.9975149630822226e-05, "loss": 0.2389, "mean_token_accuracy": 0.9308637917041779, "num_tokens": 17399499.0, "step": 4150 }, { "entropy": 0.32295563369989394, "epoch": 0.9686443641450052, "grad_norm": 1.140625, "learning_rate": 1.9975088796069758e-05, "loss": 0.5525, "mean_token_accuracy": 0.8863936126232147, "num_tokens": 17412882.0, "step": 4155 }, { "entropy": 0.2559451676905155, "epoch": 0.969810001165637, "grad_norm": 2.546875, "learning_rate": 1.9975027887131228e-05, "loss": 0.541, "mean_token_accuracy": 0.9047481536865234, "num_tokens": 17430157.0, "step": 4160 }, { "entropy": 0.2594328373670578, "epoch": 0.9709756381862688, "grad_norm": 4.09375, "learning_rate": 1.9974966904007553e-05, "loss": 0.4485, "mean_token_accuracy": 0.9149899840354919, "num_tokens": 17443985.0, "step": 4165 }, { "entropy": 0.4551702942699194, "epoch": 0.9721412752069005, "grad_norm": 0.72265625, "learning_rate": 1.997490584669964e-05, "loss": 0.7002, "mean_token_accuracy": 0.8617740720510483, "num_tokens": 17476885.0, "step": 4170 }, { "entropy": 0.3077906858175993, "epoch": 0.9733069122275323, "grad_norm": 4.5, "learning_rate": 1.9974844715208397e-05, "loss": 0.648, "mean_token_accuracy": 0.893233060836792, "num_tokens": 17492268.0, "step": 4175 }, { "entropy": 0.32362317144870756, "epoch": 0.9744725492481642, "grad_norm": 3.203125, "learning_rate": 1.9974783509534737e-05, "loss": 0.6601, "mean_token_accuracy": 0.8944636046886444, "num_tokens": 17514877.0, "step": 4180 }, { "entropy": 0.3098685838282108, "epoch": 0.975638186268796, "grad_norm": 1.015625, "learning_rate": 1.997472222967957e-05, "loss": 0.5523, "mean_token_accuracy": 0.8969592690467835, "num_tokens": 17526680.0, "step": 4185 }, { "entropy": 0.3358676999807358, "epoch": 0.9768038232894277, "grad_norm": 3.0, "learning_rate": 1.9974660875643814e-05, "loss": 0.7514, "mean_token_accuracy": 0.8687438666820526, "num_tokens": 17537720.0, "step": 4190 }, { "entropy": 0.31152590177953243, "epoch": 0.9779694603100595, "grad_norm": 1.015625, "learning_rate": 1.997459944742838e-05, "loss": 0.4645, "mean_token_accuracy": 0.892284095287323, "num_tokens": 17559300.0, "step": 4195 }, { "entropy": 0.35799047742038964, "epoch": 0.9791350973306913, "grad_norm": 3.5625, "learning_rate": 1.997453794503419e-05, "loss": 0.5022, "mean_token_accuracy": 0.8763210058212281, "num_tokens": 17592705.0, "step": 4200 }, { "entropy": 0.2442564606666565, "epoch": 0.980300734351323, "grad_norm": 0.88671875, "learning_rate": 1.9974476368462155e-05, "loss": 0.3056, "mean_token_accuracy": 0.9137523889541626, "num_tokens": 17614714.0, "step": 4205 }, { "entropy": 0.21334104053676128, "epoch": 0.9814663713719548, "grad_norm": 0.59765625, "learning_rate": 1.9974414717713196e-05, "loss": 0.3557, "mean_token_accuracy": 0.929449713230133, "num_tokens": 17634380.0, "step": 4210 }, { "entropy": 0.3569655314087868, "epoch": 0.9826320083925866, "grad_norm": 0.671875, "learning_rate": 1.997435299278823e-05, "loss": 0.3218, "mean_token_accuracy": 0.8842150449752808, "num_tokens": 17677414.0, "step": 4215 }, { "entropy": 0.42681182771921156, "epoch": 0.9837976454132183, "grad_norm": 3.90625, "learning_rate": 1.997429119368818e-05, "loss": 0.9208, "mean_token_accuracy": 0.8420846164226532, "num_tokens": 17688253.0, "step": 4220 }, { "entropy": 0.3741662811487913, "epoch": 0.9849632824338501, "grad_norm": 0.33203125, "learning_rate": 1.997422932041397e-05, "loss": 0.7088, "mean_token_accuracy": 0.8799990713596344, "num_tokens": 17705105.0, "step": 4225 }, { "entropy": 0.2380964808166027, "epoch": 0.9861289194544819, "grad_norm": 3.078125, "learning_rate": 1.9974167372966512e-05, "loss": 0.3689, "mean_token_accuracy": 0.9005732059478759, "num_tokens": 17731972.0, "step": 4230 }, { "entropy": 0.3463113531470299, "epoch": 0.9872945564751137, "grad_norm": 2.0, "learning_rate": 1.9974105351346742e-05, "loss": 0.4693, "mean_token_accuracy": 0.8790521025657654, "num_tokens": 17753540.0, "step": 4235 }, { "entropy": 0.3185799553990364, "epoch": 0.9884601934957454, "grad_norm": 0.91015625, "learning_rate": 1.9974043255555576e-05, "loss": 0.6741, "mean_token_accuracy": 0.8678267776966095, "num_tokens": 17768594.0, "step": 4240 }, { "entropy": 0.3051718398928642, "epoch": 0.9896258305163772, "grad_norm": 0.490234375, "learning_rate": 1.9973981085593947e-05, "loss": 0.5239, "mean_token_accuracy": 0.8959280610084533, "num_tokens": 17787716.0, "step": 4245 }, { "entropy": 0.26151602268218993, "epoch": 0.990791467537009, "grad_norm": 1.8203125, "learning_rate": 1.9973918841462782e-05, "loss": 0.3758, "mean_token_accuracy": 0.9223517417907715, "num_tokens": 17803913.0, "step": 4250 }, { "entropy": 0.3868280317634344, "epoch": 0.9919571045576407, "grad_norm": 3.828125, "learning_rate": 1.9973856523162996e-05, "loss": 0.6642, "mean_token_accuracy": 0.8484514772891998, "num_tokens": 17832350.0, "step": 4255 }, { "entropy": 0.2370097152888775, "epoch": 0.9931227415782725, "grad_norm": 1.4921875, "learning_rate": 1.9973794130695536e-05, "loss": 0.4053, "mean_token_accuracy": 0.9227615237236023, "num_tokens": 17845252.0, "step": 4260 }, { "entropy": 0.27250404432415964, "epoch": 0.9942883785989043, "grad_norm": 3.21875, "learning_rate": 1.997373166406132e-05, "loss": 0.5611, "mean_token_accuracy": 0.9023528039455414, "num_tokens": 17862577.0, "step": 4265 }, { "entropy": 0.31106444634497166, "epoch": 0.9954540156195361, "grad_norm": 4.0, "learning_rate": 1.997366912326129e-05, "loss": 0.5155, "mean_token_accuracy": 0.9105746448040009, "num_tokens": 17881250.0, "step": 4270 }, { "entropy": 0.23939795568585395, "epoch": 0.9966196526401678, "grad_norm": 0.3203125, "learning_rate": 1.997360650829637e-05, "loss": 0.3377, "mean_token_accuracy": 0.9150448620319367, "num_tokens": 17906259.0, "step": 4275 }, { "entropy": 0.33422945328056813, "epoch": 0.9977852896607996, "grad_norm": 1.328125, "learning_rate": 1.9973543819167493e-05, "loss": 0.6478, "mean_token_accuracy": 0.8795804023742676, "num_tokens": 17933066.0, "step": 4280 }, { "entropy": 0.27011500149965284, "epoch": 0.9989509266814314, "grad_norm": 0.82421875, "learning_rate": 1.9973481055875598e-05, "loss": 0.4263, "mean_token_accuracy": 0.9082007288932801, "num_tokens": 17963474.0, "step": 4285 }, { "entropy": 0.20046385543213951, "epoch": 1.0, "grad_norm": 2.859375, "learning_rate": 1.997341821842162e-05, "loss": 0.3831, "mean_token_accuracy": 0.9208313756518893, "num_tokens": 17980750.0, "step": 4290 }, { "entropy": 0.35562875419855117, "epoch": 1.0011656370206319, "grad_norm": 0.86328125, "learning_rate": 1.99733553068065e-05, "loss": 0.7468, "mean_token_accuracy": 0.8761965453624725, "num_tokens": 17990376.0, "step": 4295 }, { "entropy": 0.26873988620936873, "epoch": 1.0023312740412635, "grad_norm": 1.75, "learning_rate": 1.9973292321031168e-05, "loss": 0.5668, "mean_token_accuracy": 0.9023180723190307, "num_tokens": 18019003.0, "step": 4300 }, { "entropy": 0.22322348281741142, "epoch": 1.0034969110618954, "grad_norm": 2.96875, "learning_rate": 1.9973229261096567e-05, "loss": 0.4842, "mean_token_accuracy": 0.9101031005382538, "num_tokens": 18033478.0, "step": 4305 }, { "entropy": 0.27951807379722593, "epoch": 1.004662548082527, "grad_norm": 2.5625, "learning_rate": 1.997316612700364e-05, "loss": 0.5195, "mean_token_accuracy": 0.8981912076473236, "num_tokens": 18052997.0, "step": 4310 }, { "entropy": 0.16746344231069088, "epoch": 1.005828185103159, "grad_norm": 3.625, "learning_rate": 1.9973102918753323e-05, "loss": 0.3076, "mean_token_accuracy": 0.9257374405860901, "num_tokens": 18088486.0, "step": 4315 }, { "entropy": 0.33466513007879256, "epoch": 1.0069938221237906, "grad_norm": 2.078125, "learning_rate": 1.9973039636346566e-05, "loss": 0.8306, "mean_token_accuracy": 0.8667209804058075, "num_tokens": 18097090.0, "step": 4320 }, { "entropy": 0.282178159058094, "epoch": 1.0081594591444225, "grad_norm": 0.4296875, "learning_rate": 1.9972976279784304e-05, "loss": 0.4186, "mean_token_accuracy": 0.9172792494297027, "num_tokens": 18117837.0, "step": 4325 }, { "entropy": 0.48776160553097725, "epoch": 1.0093250961650542, "grad_norm": 1.4765625, "learning_rate": 1.9972912849067486e-05, "loss": 0.9886, "mean_token_accuracy": 0.8563101321458817, "num_tokens": 18140432.0, "step": 4330 }, { "entropy": 0.2983349785208702, "epoch": 1.010490733185686, "grad_norm": 1.6484375, "learning_rate": 1.997284934419706e-05, "loss": 0.4383, "mean_token_accuracy": 0.900026822090149, "num_tokens": 18162642.0, "step": 4335 }, { "entropy": 0.28172708451747897, "epoch": 1.0116563702063177, "grad_norm": 0.609375, "learning_rate": 1.997278576517397e-05, "loss": 0.6158, "mean_token_accuracy": 0.8869329035282135, "num_tokens": 18176449.0, "step": 4340 }, { "entropy": 0.2790716167539358, "epoch": 1.0128220072269496, "grad_norm": 0.9296875, "learning_rate": 1.9972722111999165e-05, "loss": 0.3866, "mean_token_accuracy": 0.8965228259563446, "num_tokens": 18204501.0, "step": 4345 }, { "entropy": 0.23924841061234475, "epoch": 1.0139876442475813, "grad_norm": 1.4453125, "learning_rate": 1.9972658384673594e-05, "loss": 0.4254, "mean_token_accuracy": 0.9159099042415619, "num_tokens": 18217738.0, "step": 4350 }, { "entropy": 0.2642524816095829, "epoch": 1.0151532812682131, "grad_norm": 1.453125, "learning_rate": 1.9972594583198206e-05, "loss": 0.4372, "mean_token_accuracy": 0.9113417148590088, "num_tokens": 18231551.0, "step": 4355 }, { "entropy": 0.2554112922400236, "epoch": 1.0163189182888448, "grad_norm": 0.6328125, "learning_rate": 1.9972530707573954e-05, "loss": 0.4601, "mean_token_accuracy": 0.9215417444705963, "num_tokens": 18251896.0, "step": 4360 }, { "entropy": 0.35437683314085006, "epoch": 1.0174845553094767, "grad_norm": 2.15625, "learning_rate": 1.997246675780179e-05, "loss": 0.7178, "mean_token_accuracy": 0.8783478498458862, "num_tokens": 18266899.0, "step": 4365 }, { "entropy": 0.13722085095942022, "epoch": 1.0186501923301083, "grad_norm": 2.46875, "learning_rate": 1.997240273388267e-05, "loss": 0.217, "mean_token_accuracy": 0.961503392457962, "num_tokens": 18303207.0, "step": 4370 }, { "entropy": 0.24477976867929102, "epoch": 1.0198158293507402, "grad_norm": 0.244140625, "learning_rate": 1.9972338635817542e-05, "loss": 0.473, "mean_token_accuracy": 0.9170270323753357, "num_tokens": 18326728.0, "step": 4375 }, { "entropy": 0.19941614847630262, "epoch": 1.0209814663713719, "grad_norm": 0.185546875, "learning_rate": 1.9972274463607367e-05, "loss": 0.3927, "mean_token_accuracy": 0.9253010809421539, "num_tokens": 18348890.0, "step": 4380 }, { "entropy": 0.38385615646839144, "epoch": 1.0221471033920038, "grad_norm": 2.328125, "learning_rate": 1.9972210217253105e-05, "loss": 0.8185, "mean_token_accuracy": 0.8647119581699372, "num_tokens": 18356656.0, "step": 4385 }, { "entropy": 0.20581890493631363, "epoch": 1.0233127404126354, "grad_norm": 3.390625, "learning_rate": 1.9972145896755707e-05, "loss": 0.5018, "mean_token_accuracy": 0.9183601677417755, "num_tokens": 18387230.0, "step": 4390 }, { "entropy": 0.3187251575291157, "epoch": 1.0244783774332673, "grad_norm": 1.40625, "learning_rate": 1.9972081502116133e-05, "loss": 0.5653, "mean_token_accuracy": 0.9041484355926513, "num_tokens": 18404146.0, "step": 4395 }, { "entropy": 0.2324225589632988, "epoch": 1.025644014453899, "grad_norm": 1.2578125, "learning_rate": 1.997201703333535e-05, "loss": 0.5529, "mean_token_accuracy": 0.9044724762439728, "num_tokens": 18424272.0, "step": 4400 }, { "entropy": 0.2801080636680126, "epoch": 1.0268096514745308, "grad_norm": 0.54296875, "learning_rate": 1.9971952490414312e-05, "loss": 0.6135, "mean_token_accuracy": 0.9049952983856201, "num_tokens": 18436748.0, "step": 4405 }, { "entropy": 0.24056704565882683, "epoch": 1.0279752884951625, "grad_norm": 1.15625, "learning_rate": 1.9971887873353983e-05, "loss": 0.3592, "mean_token_accuracy": 0.9270002782344818, "num_tokens": 18461229.0, "step": 4410 }, { "entropy": 0.2713818594813347, "epoch": 1.0291409255157944, "grad_norm": 3.234375, "learning_rate": 1.997182318215533e-05, "loss": 0.5206, "mean_token_accuracy": 0.9101826786994934, "num_tokens": 18473547.0, "step": 4415 }, { "entropy": 0.2699232131242752, "epoch": 1.030306562536426, "grad_norm": 4.40625, "learning_rate": 1.9971758416819312e-05, "loss": 0.602, "mean_token_accuracy": 0.8996315360069275, "num_tokens": 18485691.0, "step": 4420 }, { "entropy": 0.27017700783908366, "epoch": 1.031472199557058, "grad_norm": 1.796875, "learning_rate": 1.9971693577346904e-05, "loss": 0.2799, "mean_token_accuracy": 0.9148701131343842, "num_tokens": 18510802.0, "step": 4425 }, { "entropy": 0.29673150181770325, "epoch": 1.0326378365776896, "grad_norm": 3.5625, "learning_rate": 1.9971628663739063e-05, "loss": 0.4561, "mean_token_accuracy": 0.8998429119586945, "num_tokens": 18531646.0, "step": 4430 }, { "entropy": 0.23062185496091842, "epoch": 1.0338034735983215, "grad_norm": 1.515625, "learning_rate": 1.997156367599676e-05, "loss": 0.3979, "mean_token_accuracy": 0.9245606422424316, "num_tokens": 18543407.0, "step": 4435 }, { "entropy": 0.2646921593695879, "epoch": 1.0349691106189534, "grad_norm": 0.37109375, "learning_rate": 1.997149861412097e-05, "loss": 0.6075, "mean_token_accuracy": 0.9036493420600891, "num_tokens": 18558389.0, "step": 4440 }, { "entropy": 0.25541475638747213, "epoch": 1.036134747639585, "grad_norm": 0.6875, "learning_rate": 1.9971433478112653e-05, "loss": 0.4424, "mean_token_accuracy": 0.9157603502273559, "num_tokens": 18575353.0, "step": 4445 }, { "entropy": 0.2649271070957184, "epoch": 1.037300384660217, "grad_norm": 3.375, "learning_rate": 1.9971368267972787e-05, "loss": 0.5452, "mean_token_accuracy": 0.9030423641204834, "num_tokens": 18585986.0, "step": 4450 }, { "entropy": 0.3601708263158798, "epoch": 1.0384660216808486, "grad_norm": 6.0, "learning_rate": 1.9971302983702342e-05, "loss": 0.8623, "mean_token_accuracy": 0.8600324988365173, "num_tokens": 18598197.0, "step": 4455 }, { "entropy": 0.24812965393066405, "epoch": 1.0396316587014804, "grad_norm": 1.9140625, "learning_rate": 1.9971237625302294e-05, "loss": 0.4411, "mean_token_accuracy": 0.9120156764984131, "num_tokens": 18621661.0, "step": 4460 }, { "entropy": 0.45811365395784376, "epoch": 1.040797295722112, "grad_norm": 1.171875, "learning_rate": 1.9971172192773612e-05, "loss": 0.6904, "mean_token_accuracy": 0.8586636424064636, "num_tokens": 18643842.0, "step": 4465 }, { "entropy": 0.2506137236952782, "epoch": 1.041962932742744, "grad_norm": 2.734375, "learning_rate": 1.997110668611728e-05, "loss": 0.3648, "mean_token_accuracy": 0.9153142750263215, "num_tokens": 18668595.0, "step": 4470 }, { "entropy": 0.2909720130264759, "epoch": 1.0431285697633756, "grad_norm": 4.25, "learning_rate": 1.9971041105334267e-05, "loss": 0.7996, "mean_token_accuracy": 0.8663729548454284, "num_tokens": 18679674.0, "step": 4475 }, { "entropy": 0.33627558797597884, "epoch": 1.0442942067840075, "grad_norm": 2.59375, "learning_rate": 1.997097545042556e-05, "loss": 0.6941, "mean_token_accuracy": 0.8793821096420288, "num_tokens": 18688713.0, "step": 4480 }, { "entropy": 0.25198244825005534, "epoch": 1.0454598438046392, "grad_norm": 1.3984375, "learning_rate": 1.9970909721392123e-05, "loss": 0.3858, "mean_token_accuracy": 0.9182367920875549, "num_tokens": 18712780.0, "step": 4485 }, { "entropy": 0.21226853989064692, "epoch": 1.046625480825271, "grad_norm": 1.9921875, "learning_rate": 1.9970843918234953e-05, "loss": 0.3497, "mean_token_accuracy": 0.9377843379974365, "num_tokens": 18735852.0, "step": 4490 }, { "entropy": 0.3144254297018051, "epoch": 1.0477911178459027, "grad_norm": 1.953125, "learning_rate": 1.997077804095502e-05, "loss": 0.6218, "mean_token_accuracy": 0.8912947416305542, "num_tokens": 18747949.0, "step": 4495 }, { "entropy": 0.2468037974089384, "epoch": 1.0489567548665346, "grad_norm": 1.3828125, "learning_rate": 1.9970712089553312e-05, "loss": 0.3826, "mean_token_accuracy": 0.9294294059276581, "num_tokens": 18764902.0, "step": 4500 }, { "entropy": 0.3669187381863594, "epoch": 1.0501223918871663, "grad_norm": 2.46875, "learning_rate": 1.997064606403081e-05, "loss": 0.6487, "mean_token_accuracy": 0.8842283308506012, "num_tokens": 18775403.0, "step": 4505 }, { "entropy": 0.222046391479671, "epoch": 1.0512880289077982, "grad_norm": 1.4375, "learning_rate": 1.99705799643885e-05, "loss": 0.2955, "mean_token_accuracy": 0.9272170722484588, "num_tokens": 18806888.0, "step": 4510 }, { "entropy": 0.24561046734452247, "epoch": 1.0524536659284298, "grad_norm": 0.65625, "learning_rate": 1.9970513790627363e-05, "loss": 0.4845, "mean_token_accuracy": 0.9211707890033722, "num_tokens": 18820932.0, "step": 4515 }, { "entropy": 0.24179197587072848, "epoch": 1.0536193029490617, "grad_norm": 1.4140625, "learning_rate": 1.997044754274839e-05, "loss": 0.4912, "mean_token_accuracy": 0.9171702861785889, "num_tokens": 18836876.0, "step": 4520 }, { "entropy": 0.2697492055594921, "epoch": 1.0547849399696934, "grad_norm": 1.8828125, "learning_rate": 1.997038122075257e-05, "loss": 0.479, "mean_token_accuracy": 0.9130570411682128, "num_tokens": 18851087.0, "step": 4525 }, { "entropy": 0.35710594058036804, "epoch": 1.0559505769903252, "grad_norm": 1.34375, "learning_rate": 1.9970314824640892e-05, "loss": 0.58, "mean_token_accuracy": 0.8559439688920975, "num_tokens": 18883555.0, "step": 4530 }, { "entropy": 0.2868450716137886, "epoch": 1.057116214010957, "grad_norm": 0.2265625, "learning_rate": 1.9970248354414342e-05, "loss": 0.6734, "mean_token_accuracy": 0.8887735664844513, "num_tokens": 18901186.0, "step": 4535 }, { "entropy": 0.23753126077353953, "epoch": 1.0582818510315888, "grad_norm": 1.8046875, "learning_rate": 1.9970181810073913e-05, "loss": 0.4314, "mean_token_accuracy": 0.9144180476665497, "num_tokens": 18922703.0, "step": 4540 }, { "entropy": 0.2350205697119236, "epoch": 1.0594474880522204, "grad_norm": 0.72265625, "learning_rate": 1.9970115191620596e-05, "loss": 0.3227, "mean_token_accuracy": 0.9272505462169647, "num_tokens": 18938623.0, "step": 4545 }, { "entropy": 0.29542321562767027, "epoch": 1.0606131250728523, "grad_norm": 0.73828125, "learning_rate": 1.997004849905539e-05, "loss": 0.5377, "mean_token_accuracy": 0.8950551569461822, "num_tokens": 18954884.0, "step": 4550 }, { "entropy": 0.2510738968849182, "epoch": 1.061778762093484, "grad_norm": 2.296875, "learning_rate": 1.996998173237928e-05, "loss": 0.4795, "mean_token_accuracy": 0.9140961229801178, "num_tokens": 18970242.0, "step": 4555 }, { "entropy": 0.3822752878069878, "epoch": 1.0629443991141159, "grad_norm": 4.21875, "learning_rate": 1.9969914891593272e-05, "loss": 0.6895, "mean_token_accuracy": 0.8611569821834564, "num_tokens": 18990243.0, "step": 4560 }, { "entropy": 0.28315904922783375, "epoch": 1.0641100361347475, "grad_norm": 3.125, "learning_rate": 1.9969847976698355e-05, "loss": 0.4318, "mean_token_accuracy": 0.9066355526447296, "num_tokens": 19006220.0, "step": 4565 }, { "entropy": 0.1834595672786236, "epoch": 1.0652756731553794, "grad_norm": 0.69921875, "learning_rate": 1.996978098769553e-05, "loss": 0.2685, "mean_token_accuracy": 0.940112954378128, "num_tokens": 19029958.0, "step": 4570 }, { "entropy": 0.2601656034588814, "epoch": 1.0664413101760113, "grad_norm": 3.203125, "learning_rate": 1.99697139245858e-05, "loss": 0.6005, "mean_token_accuracy": 0.8940555512905121, "num_tokens": 19045954.0, "step": 4575 }, { "entropy": 0.31239586509764194, "epoch": 1.067606947196643, "grad_norm": 3.8125, "learning_rate": 1.9969646787370154e-05, "loss": 0.4227, "mean_token_accuracy": 0.9133547961711883, "num_tokens": 19068136.0, "step": 4580 }, { "entropy": 0.24956375658512114, "epoch": 1.0687725842172748, "grad_norm": 1.8046875, "learning_rate": 1.9969579576049603e-05, "loss": 0.4452, "mean_token_accuracy": 0.9185151219367981, "num_tokens": 19090732.0, "step": 4585 }, { "entropy": 0.2317537029273808, "epoch": 1.0699382212379065, "grad_norm": 3.328125, "learning_rate": 1.9969512290625145e-05, "loss": 0.3482, "mean_token_accuracy": 0.9122158706188201, "num_tokens": 19117200.0, "step": 4590 }, { "entropy": 0.32455471605062486, "epoch": 1.0711038582585384, "grad_norm": 3.328125, "learning_rate": 1.9969444931097782e-05, "loss": 0.6336, "mean_token_accuracy": 0.8840024530887604, "num_tokens": 19131499.0, "step": 4595 }, { "entropy": 0.31562632527202367, "epoch": 1.07226949527917, "grad_norm": 1.1875, "learning_rate": 1.9969377497468524e-05, "loss": 0.4875, "mean_token_accuracy": 0.8833531379699707, "num_tokens": 19174599.0, "step": 4600 }, { "entropy": 0.19211040288209916, "epoch": 1.073435132299802, "grad_norm": 1.4140625, "learning_rate": 1.996930998973837e-05, "loss": 0.2376, "mean_token_accuracy": 0.935130262374878, "num_tokens": 19194152.0, "step": 4605 }, { "entropy": 0.2705673351883888, "epoch": 1.0746007693204336, "grad_norm": 2.171875, "learning_rate": 1.9969242407908336e-05, "loss": 0.4673, "mean_token_accuracy": 0.893635481595993, "num_tokens": 19210561.0, "step": 4610 }, { "entropy": 0.22310225944966078, "epoch": 1.0757664063410655, "grad_norm": 0.3984375, "learning_rate": 1.996917475197942e-05, "loss": 0.3425, "mean_token_accuracy": 0.934300822019577, "num_tokens": 19249267.0, "step": 4615 }, { "entropy": 0.21514166854321956, "epoch": 1.0769320433616971, "grad_norm": 0.271484375, "learning_rate": 1.996910702195263e-05, "loss": 0.2464, "mean_token_accuracy": 0.9217098355293274, "num_tokens": 19279689.0, "step": 4620 }, { "entropy": 0.421810145303607, "epoch": 1.078097680382329, "grad_norm": 2.953125, "learning_rate": 1.9969039217828988e-05, "loss": 0.5759, "mean_token_accuracy": 0.8948563575744629, "num_tokens": 19309629.0, "step": 4625 }, { "entropy": 0.30465604811906816, "epoch": 1.0792633174029607, "grad_norm": 3.203125, "learning_rate": 1.9968971339609494e-05, "loss": 0.5313, "mean_token_accuracy": 0.9070425748825073, "num_tokens": 19332872.0, "step": 4630 }, { "entropy": 0.21195731237530707, "epoch": 1.0804289544235925, "grad_norm": 0.89453125, "learning_rate": 1.9968903387295162e-05, "loss": 0.2566, "mean_token_accuracy": 0.943495512008667, "num_tokens": 19354020.0, "step": 4635 }, { "entropy": 0.28489233925938606, "epoch": 1.0815945914442242, "grad_norm": 2.125, "learning_rate": 1.9968835360887007e-05, "loss": 0.4362, "mean_token_accuracy": 0.9148045003414154, "num_tokens": 19373694.0, "step": 4640 }, { "entropy": 0.20208041854202746, "epoch": 1.082760228464856, "grad_norm": 4.0, "learning_rate": 1.9968767260386043e-05, "loss": 0.4345, "mean_token_accuracy": 0.9049011528491974, "num_tokens": 19394838.0, "step": 4645 }, { "entropy": 0.27856975942850115, "epoch": 1.0839258654854877, "grad_norm": 3.546875, "learning_rate": 1.996869908579329e-05, "loss": 0.5698, "mean_token_accuracy": 0.9010435700416565, "num_tokens": 19410829.0, "step": 4650 }, { "entropy": 0.36488936431705954, "epoch": 1.0850915025061196, "grad_norm": 2.578125, "learning_rate": 1.9968630837109756e-05, "loss": 0.7248, "mean_token_accuracy": 0.8689317822456359, "num_tokens": 19423986.0, "step": 4655 }, { "entropy": 0.247793560475111, "epoch": 1.0862571395267513, "grad_norm": 0.466796875, "learning_rate": 1.9968562514336464e-05, "loss": 0.4697, "mean_token_accuracy": 0.9112800538539887, "num_tokens": 19446363.0, "step": 4660 }, { "entropy": 0.26282162182033064, "epoch": 1.0874227765473832, "grad_norm": 3.40625, "learning_rate": 1.996849411747443e-05, "loss": 0.5178, "mean_token_accuracy": 0.9006511569023132, "num_tokens": 19479115.0, "step": 4665 }, { "entropy": 0.2753569819033146, "epoch": 1.0885884135680148, "grad_norm": 2.046875, "learning_rate": 1.9968425646524682e-05, "loss": 0.6274, "mean_token_accuracy": 0.8891318142414093, "num_tokens": 19489958.0, "step": 4670 }, { "entropy": 0.2802404096350074, "epoch": 1.0897540505886467, "grad_norm": 3.6875, "learning_rate": 1.9968357101488227e-05, "loss": 0.4688, "mean_token_accuracy": 0.9078739762306214, "num_tokens": 19522416.0, "step": 4675 }, { "entropy": 0.3198259465396404, "epoch": 1.0909196876092784, "grad_norm": 2.265625, "learning_rate": 1.9968288482366097e-05, "loss": 0.5392, "mean_token_accuracy": 0.8862555027008057, "num_tokens": 19545618.0, "step": 4680 }, { "entropy": 0.23124271370470523, "epoch": 1.0920853246299103, "grad_norm": 0.236328125, "learning_rate": 1.9968219789159314e-05, "loss": 0.5302, "mean_token_accuracy": 0.9184695065021515, "num_tokens": 19568029.0, "step": 4685 }, { "entropy": 0.20576266888529063, "epoch": 1.093250961650542, "grad_norm": 2.03125, "learning_rate": 1.9968151021868906e-05, "loss": 0.3619, "mean_token_accuracy": 0.9128438830375671, "num_tokens": 19595024.0, "step": 4690 }, { "entropy": 0.2861837536096573, "epoch": 1.0944165986711738, "grad_norm": 3.234375, "learning_rate": 1.9968082180495887e-05, "loss": 0.5211, "mean_token_accuracy": 0.9105264604091644, "num_tokens": 19609621.0, "step": 4695 }, { "entropy": 0.36588003784418105, "epoch": 1.0955822356918055, "grad_norm": 4.8125, "learning_rate": 1.9968013265041293e-05, "loss": 0.5441, "mean_token_accuracy": 0.8853360414505005, "num_tokens": 19628917.0, "step": 4700 }, { "entropy": 0.2513444259762764, "epoch": 1.0967478727124373, "grad_norm": 1.515625, "learning_rate": 1.996794427550615e-05, "loss": 0.3729, "mean_token_accuracy": 0.9267160654067993, "num_tokens": 19649010.0, "step": 4705 }, { "entropy": 0.26096164882183076, "epoch": 1.0979135097330692, "grad_norm": 2.203125, "learning_rate": 1.9967875211891483e-05, "loss": 0.5006, "mean_token_accuracy": 0.9162486016750335, "num_tokens": 19667982.0, "step": 4710 }, { "entropy": 0.34309937320649625, "epoch": 1.0990791467537009, "grad_norm": 1.40625, "learning_rate": 1.9967806074198323e-05, "loss": 0.512, "mean_token_accuracy": 0.8793802082538604, "num_tokens": 19688359.0, "step": 4715 }, { "entropy": 0.2940907657146454, "epoch": 1.1002447837743328, "grad_norm": 1.1953125, "learning_rate": 1.9967736862427707e-05, "loss": 0.452, "mean_token_accuracy": 0.9185880184173584, "num_tokens": 19702524.0, "step": 4720 }, { "entropy": 0.2799779180437326, "epoch": 1.1014104207949644, "grad_norm": 0.66015625, "learning_rate": 1.996766757658066e-05, "loss": 0.4462, "mean_token_accuracy": 0.889285272359848, "num_tokens": 19732863.0, "step": 4725 }, { "entropy": 0.38467673361301424, "epoch": 1.1025760578155963, "grad_norm": 2.078125, "learning_rate": 1.9967598216658217e-05, "loss": 0.6861, "mean_token_accuracy": 0.8668268322944641, "num_tokens": 19756176.0, "step": 4730 }, { "entropy": 0.3017943359911442, "epoch": 1.103741694836228, "grad_norm": 0.43359375, "learning_rate": 1.9967528782661413e-05, "loss": 0.6322, "mean_token_accuracy": 0.9024563610553742, "num_tokens": 19774248.0, "step": 4735 }, { "entropy": 0.26271404810249804, "epoch": 1.1049073318568599, "grad_norm": 1.8203125, "learning_rate": 1.9967459274591286e-05, "loss": 0.304, "mean_token_accuracy": 0.906380957365036, "num_tokens": 19794918.0, "step": 4740 }, { "entropy": 0.4135320819914341, "epoch": 1.1060729688774915, "grad_norm": 1.78125, "learning_rate": 1.996738969244887e-05, "loss": 0.5758, "mean_token_accuracy": 0.8603252410888672, "num_tokens": 19818348.0, "step": 4745 }, { "entropy": 0.32035259939730165, "epoch": 1.1072386058981234, "grad_norm": 0.451171875, "learning_rate": 1.9967320036235198e-05, "loss": 0.4718, "mean_token_accuracy": 0.8999540865421295, "num_tokens": 19835870.0, "step": 4750 }, { "entropy": 0.2962488478049636, "epoch": 1.108404242918755, "grad_norm": 0.267578125, "learning_rate": 1.9967250305951317e-05, "loss": 0.2977, "mean_token_accuracy": 0.9045934975147247, "num_tokens": 19869134.0, "step": 4755 }, { "entropy": 0.2296114858239889, "epoch": 1.109569879939387, "grad_norm": 1.9453125, "learning_rate": 1.9967180501598257e-05, "loss": 0.3174, "mean_token_accuracy": 0.9314020931720733, "num_tokens": 19895029.0, "step": 4760 }, { "entropy": 0.265631403028965, "epoch": 1.1107355169600186, "grad_norm": 1.6640625, "learning_rate": 1.9967110623177072e-05, "loss": 0.5776, "mean_token_accuracy": 0.8974580585956573, "num_tokens": 19907777.0, "step": 4765 }, { "entropy": 0.41383567191660403, "epoch": 1.1119011539806505, "grad_norm": 3.203125, "learning_rate": 1.9967040670688792e-05, "loss": 0.803, "mean_token_accuracy": 0.8468187242746353, "num_tokens": 19933236.0, "step": 4770 }, { "entropy": 0.21881723627448083, "epoch": 1.1130667910012821, "grad_norm": 0.330078125, "learning_rate": 1.9966970644134467e-05, "loss": 0.2632, "mean_token_accuracy": 0.9160628378391266, "num_tokens": 19971416.0, "step": 4775 }, { "entropy": 0.21530752796679736, "epoch": 1.114232428021914, "grad_norm": 1.828125, "learning_rate": 1.9966900543515137e-05, "loss": 0.3804, "mean_token_accuracy": 0.9274242103099823, "num_tokens": 20001318.0, "step": 4780 }, { "entropy": 0.2666290858760476, "epoch": 1.1153980650425457, "grad_norm": 3.59375, "learning_rate": 1.9966830368831852e-05, "loss": 0.5918, "mean_token_accuracy": 0.8934563815593719, "num_tokens": 20019964.0, "step": 4785 }, { "entropy": 0.3114991795271635, "epoch": 1.1165637020631776, "grad_norm": 3.25, "learning_rate": 1.9966760120085654e-05, "loss": 0.6155, "mean_token_accuracy": 0.8941902697086335, "num_tokens": 20040676.0, "step": 4790 }, { "entropy": 0.1996636178344488, "epoch": 1.1177293390838092, "grad_norm": 0.412109375, "learning_rate": 1.996668979727759e-05, "loss": 0.2037, "mean_token_accuracy": 0.9335273861885071, "num_tokens": 20078019.0, "step": 4795 }, { "entropy": 0.3700167939066887, "epoch": 1.118894976104441, "grad_norm": 2.40625, "learning_rate": 1.996661940040871e-05, "loss": 0.8312, "mean_token_accuracy": 0.8724365890026092, "num_tokens": 20089226.0, "step": 4800 }, { "entropy": 0.285097012668848, "epoch": 1.1200606131250728, "grad_norm": 1.625, "learning_rate": 1.9966548929480072e-05, "loss": 0.6656, "mean_token_accuracy": 0.8899259626865387, "num_tokens": 20100172.0, "step": 4805 }, { "entropy": 0.28418309725821017, "epoch": 1.1212262501457047, "grad_norm": 2.546875, "learning_rate": 1.9966478384492713e-05, "loss": 0.42, "mean_token_accuracy": 0.9119695782661438, "num_tokens": 20125120.0, "step": 4810 }, { "entropy": 0.24601214230060578, "epoch": 1.1223918871663363, "grad_norm": 1.5546875, "learning_rate": 1.9966407765447694e-05, "loss": 0.4867, "mean_token_accuracy": 0.8957095205783844, "num_tokens": 20145895.0, "step": 4815 }, { "entropy": 0.2464228618890047, "epoch": 1.1235575241869682, "grad_norm": 1.3046875, "learning_rate": 1.9966337072346063e-05, "loss": 0.2837, "mean_token_accuracy": 0.9086942136287689, "num_tokens": 20174555.0, "step": 4820 }, { "entropy": 0.29267266392707825, "epoch": 1.1247231612075999, "grad_norm": 1.9609375, "learning_rate": 1.996626630518888e-05, "loss": 0.4338, "mean_token_accuracy": 0.9187252819538116, "num_tokens": 20194808.0, "step": 4825 }, { "entropy": 0.2548982771113515, "epoch": 1.1258887982282317, "grad_norm": 1.6171875, "learning_rate": 1.9966195463977193e-05, "loss": 0.3892, "mean_token_accuracy": 0.9128529787063598, "num_tokens": 20220293.0, "step": 4830 }, { "entropy": 0.43977917432785035, "epoch": 1.1270544352488634, "grad_norm": 2.3125, "learning_rate": 1.9966124548712064e-05, "loss": 0.7689, "mean_token_accuracy": 0.8565711915493012, "num_tokens": 20246105.0, "step": 4835 }, { "entropy": 0.22334610745310784, "epoch": 1.1282200722694953, "grad_norm": 0.8046875, "learning_rate": 1.9966053559394548e-05, "loss": 0.3492, "mean_token_accuracy": 0.9118761956691742, "num_tokens": 20274785.0, "step": 4840 }, { "entropy": 0.2549382247030735, "epoch": 1.1293857092901272, "grad_norm": 1.1015625, "learning_rate": 1.9965982496025705e-05, "loss": 0.3671, "mean_token_accuracy": 0.9273378610610962, "num_tokens": 20297017.0, "step": 4845 }, { "entropy": 0.26040482223033906, "epoch": 1.1305513463107588, "grad_norm": 1.6015625, "learning_rate": 1.9965911358606592e-05, "loss": 0.5535, "mean_token_accuracy": 0.9075103044509888, "num_tokens": 20308131.0, "step": 4850 }, { "entropy": 0.2684775315225124, "epoch": 1.1317169833313905, "grad_norm": 0.46875, "learning_rate": 1.9965840147138273e-05, "loss": 0.4772, "mean_token_accuracy": 0.9086149275302887, "num_tokens": 20332934.0, "step": 4855 }, { "entropy": 0.26287127695977686, "epoch": 1.1328826203520224, "grad_norm": 1.59375, "learning_rate": 1.996576886162181e-05, "loss": 0.3848, "mean_token_accuracy": 0.9158682942390441, "num_tokens": 20355529.0, "step": 4860 }, { "entropy": 0.2752557501196861, "epoch": 1.1340482573726542, "grad_norm": 4.1875, "learning_rate": 1.996569750205826e-05, "loss": 0.6068, "mean_token_accuracy": 0.8869381487369538, "num_tokens": 20374204.0, "step": 4865 }, { "entropy": 0.19664042815566063, "epoch": 1.135213894393286, "grad_norm": 5.21875, "learning_rate": 1.9965626068448694e-05, "loss": 0.5432, "mean_token_accuracy": 0.9174998879432679, "num_tokens": 20390154.0, "step": 4870 }, { "entropy": 0.21189433336257935, "epoch": 1.1363795314139178, "grad_norm": 2.0625, "learning_rate": 1.9965554560794173e-05, "loss": 0.433, "mean_token_accuracy": 0.9120612025260926, "num_tokens": 20411386.0, "step": 4875 }, { "entropy": 0.27202789932489396, "epoch": 1.1375451684345494, "grad_norm": 3.171875, "learning_rate": 1.9965482979095766e-05, "loss": 0.4594, "mean_token_accuracy": 0.8918597221374511, "num_tokens": 20429358.0, "step": 4880 }, { "entropy": 0.24662722423672676, "epoch": 1.1387108054551813, "grad_norm": 0.462890625, "learning_rate": 1.996541132335454e-05, "loss": 0.3717, "mean_token_accuracy": 0.9205475330352784, "num_tokens": 20457099.0, "step": 4885 }, { "entropy": 0.33891555294394493, "epoch": 1.139876442475813, "grad_norm": 1.171875, "learning_rate": 1.9965339593571562e-05, "loss": 0.545, "mean_token_accuracy": 0.8842925369739533, "num_tokens": 20478711.0, "step": 4890 }, { "entropy": 0.3443553917109966, "epoch": 1.1410420794964449, "grad_norm": 1.09375, "learning_rate": 1.9965267789747902e-05, "loss": 0.5574, "mean_token_accuracy": 0.8829470694065094, "num_tokens": 20499220.0, "step": 4895 }, { "entropy": 0.2517301285639405, "epoch": 1.1422077165170765, "grad_norm": 0.419921875, "learning_rate": 1.9965195911884632e-05, "loss": 0.5739, "mean_token_accuracy": 0.8933994770050049, "num_tokens": 20518241.0, "step": 4900 }, { "entropy": 0.23689071610569953, "epoch": 1.1433733535377084, "grad_norm": 2.5625, "learning_rate": 1.996512395998282e-05, "loss": 0.4953, "mean_token_accuracy": 0.921712464094162, "num_tokens": 20532325.0, "step": 4905 }, { "entropy": 0.2861016098409891, "epoch": 1.14453899055834, "grad_norm": 4.15625, "learning_rate": 1.9965051934043545e-05, "loss": 0.4887, "mean_token_accuracy": 0.9079478204250335, "num_tokens": 20555088.0, "step": 4910 }, { "entropy": 0.35219028070569036, "epoch": 1.145704627578972, "grad_norm": 0.318359375, "learning_rate": 1.996497983406788e-05, "loss": 0.6508, "mean_token_accuracy": 0.8878498017787934, "num_tokens": 20577242.0, "step": 4915 }, { "entropy": 0.17518207393586635, "epoch": 1.1468702645996036, "grad_norm": 1.1171875, "learning_rate": 1.9964907660056894e-05, "loss": 0.3614, "mean_token_accuracy": 0.9309583485126496, "num_tokens": 20599132.0, "step": 4920 }, { "entropy": 0.3855657190084457, "epoch": 1.1480359016202355, "grad_norm": 0.490234375, "learning_rate": 1.9964835412011667e-05, "loss": 0.6255, "mean_token_accuracy": 0.8820390582084656, "num_tokens": 20611755.0, "step": 4925 }, { "entropy": 0.19828929752111435, "epoch": 1.1492015386408672, "grad_norm": 1.453125, "learning_rate": 1.996476308993328e-05, "loss": 0.4292, "mean_token_accuracy": 0.92506183385849, "num_tokens": 20631471.0, "step": 4930 }, { "entropy": 0.24127081036567688, "epoch": 1.150367175661499, "grad_norm": 1.2578125, "learning_rate": 1.9964690693822805e-05, "loss": 0.3354, "mean_token_accuracy": 0.9134461104869842, "num_tokens": 20653340.0, "step": 4935 }, { "entropy": 0.25572728682309387, "epoch": 1.1515328126821307, "grad_norm": 3.75, "learning_rate": 1.996461822368132e-05, "loss": 0.3822, "mean_token_accuracy": 0.9080056071281433, "num_tokens": 20691385.0, "step": 4940 }, { "entropy": 0.3221442990005016, "epoch": 1.1526984497027626, "grad_norm": 0.59765625, "learning_rate": 1.9964545679509917e-05, "loss": 0.674, "mean_token_accuracy": 0.8798101782798767, "num_tokens": 20703113.0, "step": 4945 }, { "entropy": 0.3144264120608568, "epoch": 1.1538640867233942, "grad_norm": 0.376953125, "learning_rate": 1.996447306130967e-05, "loss": 0.4135, "mean_token_accuracy": 0.8957395732402802, "num_tokens": 20730511.0, "step": 4950 }, { "entropy": 0.3189440816640854, "epoch": 1.1550297237440261, "grad_norm": 2.15625, "learning_rate": 1.996440036908166e-05, "loss": 0.3236, "mean_token_accuracy": 0.9054770946502686, "num_tokens": 20759904.0, "step": 4955 }, { "entropy": 0.2833704300224781, "epoch": 1.156195360764658, "grad_norm": 0.458984375, "learning_rate": 1.9964327602826977e-05, "loss": 0.5359, "mean_token_accuracy": 0.9107962608337402, "num_tokens": 20793843.0, "step": 4960 }, { "entropy": 0.28943902999162674, "epoch": 1.1573609977852897, "grad_norm": 1.890625, "learning_rate": 1.99642547625467e-05, "loss": 0.6363, "mean_token_accuracy": 0.8925088405609131, "num_tokens": 20803894.0, "step": 4965 }, { "entropy": 0.31220489740371704, "epoch": 1.1585266348059213, "grad_norm": 0.9375, "learning_rate": 1.996418184824192e-05, "loss": 0.4212, "mean_token_accuracy": 0.9167026937007904, "num_tokens": 20827567.0, "step": 4970 }, { "entropy": 0.2499225214123726, "epoch": 1.1596922718265532, "grad_norm": 3.40625, "learning_rate": 1.996410885991372e-05, "loss": 0.5375, "mean_token_accuracy": 0.9086990296840668, "num_tokens": 20847726.0, "step": 4975 }, { "entropy": 0.1842884048819542, "epoch": 1.160857908847185, "grad_norm": 1.71875, "learning_rate": 1.996403579756319e-05, "loss": 0.307, "mean_token_accuracy": 0.9375099897384643, "num_tokens": 20876274.0, "step": 4980 }, { "entropy": 0.2099345738068223, "epoch": 1.1620235458678168, "grad_norm": 0.2470703125, "learning_rate": 1.996396266119142e-05, "loss": 0.3601, "mean_token_accuracy": 0.9239444255828857, "num_tokens": 20908707.0, "step": 4985 }, { "entropy": 0.2952556751668453, "epoch": 1.1631891828884484, "grad_norm": 2.6875, "learning_rate": 1.9963889450799503e-05, "loss": 0.538, "mean_token_accuracy": 0.9095062971115112, "num_tokens": 20923367.0, "step": 4990 }, { "entropy": 0.32244512140750886, "epoch": 1.1643548199090803, "grad_norm": 0.390625, "learning_rate": 1.9963816166388527e-05, "loss": 0.3306, "mean_token_accuracy": 0.8819191873073577, "num_tokens": 20956465.0, "step": 4995 }, { "entropy": 0.28892406783998015, "epoch": 1.1655204569297122, "grad_norm": 0.75, "learning_rate": 1.9963742807959587e-05, "loss": 0.5125, "mean_token_accuracy": 0.8838415026664734, "num_tokens": 20980348.0, "step": 5000 }, { "entropy": 0.31543378755450246, "epoch": 1.1666860939503438, "grad_norm": 2.625, "learning_rate": 1.9963669375513773e-05, "loss": 0.6096, "mean_token_accuracy": 0.8829708397388458, "num_tokens": 20997289.0, "step": 5005 }, { "entropy": 0.2742725571617484, "epoch": 1.1678517309709757, "grad_norm": 0.54296875, "learning_rate": 1.9963595869052185e-05, "loss": 0.5468, "mean_token_accuracy": 0.8958122670650482, "num_tokens": 21022460.0, "step": 5010 }, { "entropy": 0.4142576478421688, "epoch": 1.1690173679916074, "grad_norm": 1.59375, "learning_rate": 1.9963522288575915e-05, "loss": 0.6416, "mean_token_accuracy": 0.8969951689243316, "num_tokens": 21049444.0, "step": 5015 }, { "entropy": 0.3312222182750702, "epoch": 1.1701830050122393, "grad_norm": 3.203125, "learning_rate": 1.9963448634086063e-05, "loss": 0.7295, "mean_token_accuracy": 0.8790445506572724, "num_tokens": 21058389.0, "step": 5020 }, { "entropy": 0.3322805389761925, "epoch": 1.171348642032871, "grad_norm": 0.78515625, "learning_rate": 1.996337490558373e-05, "loss": 0.5016, "mean_token_accuracy": 0.9036110222339631, "num_tokens": 21069310.0, "step": 5025 }, { "entropy": 0.2525691881775856, "epoch": 1.1725142790535028, "grad_norm": 3.140625, "learning_rate": 1.9963301103070007e-05, "loss": 0.314, "mean_token_accuracy": 0.8964376389980316, "num_tokens": 21103588.0, "step": 5030 }, { "entropy": 0.2757076404988766, "epoch": 1.1736799160741345, "grad_norm": 2.296875, "learning_rate": 1.9963227226546e-05, "loss": 0.5008, "mean_token_accuracy": 0.9100005924701691, "num_tokens": 21115956.0, "step": 5035 }, { "entropy": 0.24521611034870147, "epoch": 1.1748455530947663, "grad_norm": 3.453125, "learning_rate": 1.996315327601281e-05, "loss": 0.5026, "mean_token_accuracy": 0.9170917510986328, "num_tokens": 21129529.0, "step": 5040 }, { "entropy": 0.38413129895925524, "epoch": 1.176011190115398, "grad_norm": 2.71875, "learning_rate": 1.996307925147154e-05, "loss": 0.7491, "mean_token_accuracy": 0.8473885953426361, "num_tokens": 21149606.0, "step": 5045 }, { "entropy": 0.2860517233610153, "epoch": 1.17717682713603, "grad_norm": 1.4453125, "learning_rate": 1.9963005152923297e-05, "loss": 0.6112, "mean_token_accuracy": 0.906189215183258, "num_tokens": 21160383.0, "step": 5050 }, { "entropy": 0.25727585405111314, "epoch": 1.1783424641566616, "grad_norm": 0.625, "learning_rate": 1.9962930980369178e-05, "loss": 0.3184, "mean_token_accuracy": 0.9098371982574462, "num_tokens": 21184716.0, "step": 5055 }, { "entropy": 0.2614014007151127, "epoch": 1.1795081011772934, "grad_norm": 2.640625, "learning_rate": 1.9962856733810295e-05, "loss": 0.6666, "mean_token_accuracy": 0.8902623951435089, "num_tokens": 21197902.0, "step": 5060 }, { "entropy": 0.28945720940828323, "epoch": 1.180673738197925, "grad_norm": 2.6875, "learning_rate": 1.9962782413247753e-05, "loss": 0.6894, "mean_token_accuracy": 0.8888876676559448, "num_tokens": 21208021.0, "step": 5065 }, { "entropy": 0.18397099822759627, "epoch": 1.181839375218557, "grad_norm": 0.7890625, "learning_rate": 1.9962708018682663e-05, "loss": 0.25, "mean_token_accuracy": 0.9296178877353668, "num_tokens": 21243231.0, "step": 5070 }, { "entropy": 0.2348667562007904, "epoch": 1.1830050122391886, "grad_norm": 2.09375, "learning_rate": 1.996263355011613e-05, "loss": 0.4452, "mean_token_accuracy": 0.9184268653392792, "num_tokens": 21262906.0, "step": 5075 }, { "entropy": 0.21639835610985755, "epoch": 1.1841706492598205, "grad_norm": 6.71875, "learning_rate": 1.9962559007549265e-05, "loss": 0.3633, "mean_token_accuracy": 0.9239116966724396, "num_tokens": 21276857.0, "step": 5080 }, { "entropy": 0.32814215682446957, "epoch": 1.1853362862804522, "grad_norm": 0.53515625, "learning_rate": 1.9962484390983182e-05, "loss": 0.3529, "mean_token_accuracy": 0.9116757154464722, "num_tokens": 21307183.0, "step": 5085 }, { "entropy": 0.25477695763111113, "epoch": 1.186501923301084, "grad_norm": 3.421875, "learning_rate": 1.9962409700418993e-05, "loss": 0.4097, "mean_token_accuracy": 0.9303404748439789, "num_tokens": 21334651.0, "step": 5090 }, { "entropy": 0.2113716546446085, "epoch": 1.1876675603217157, "grad_norm": 0.43359375, "learning_rate": 1.9962334935857813e-05, "loss": 0.3035, "mean_token_accuracy": 0.9402331054210663, "num_tokens": 21359790.0, "step": 5095 }, { "entropy": 0.23614692371338605, "epoch": 1.1888331973423476, "grad_norm": 3.09375, "learning_rate": 1.9962260097300752e-05, "loss": 0.23, "mean_token_accuracy": 0.9286585927009583, "num_tokens": 21403308.0, "step": 5100 }, { "entropy": 0.29054297506809235, "epoch": 1.1899988343629793, "grad_norm": 4.6875, "learning_rate": 1.9962185184748934e-05, "loss": 0.5162, "mean_token_accuracy": 0.9029807567596435, "num_tokens": 21417092.0, "step": 5105 }, { "entropy": 0.30761214941740034, "epoch": 1.1911644713836111, "grad_norm": 0.51171875, "learning_rate": 1.996211019820347e-05, "loss": 0.433, "mean_token_accuracy": 0.9027220129966735, "num_tokens": 21444507.0, "step": 5110 }, { "entropy": 0.3023512065410614, "epoch": 1.192330108404243, "grad_norm": 0.392578125, "learning_rate": 1.9962035137665475e-05, "loss": 0.7219, "mean_token_accuracy": 0.8693067491054535, "num_tokens": 21458658.0, "step": 5115 }, { "entropy": 0.38721234649419783, "epoch": 1.1934957454248747, "grad_norm": 0.6484375, "learning_rate": 1.996196000313608e-05, "loss": 0.8143, "mean_token_accuracy": 0.8422433733940125, "num_tokens": 21475926.0, "step": 5120 }, { "entropy": 0.31085776090621947, "epoch": 1.1946613824455063, "grad_norm": 2.03125, "learning_rate": 1.9961884794616393e-05, "loss": 0.4117, "mean_token_accuracy": 0.8949326932430267, "num_tokens": 21494892.0, "step": 5125 }, { "entropy": 0.36102984845638275, "epoch": 1.1958270194661382, "grad_norm": 0.9609375, "learning_rate": 1.996180951210754e-05, "loss": 0.5535, "mean_token_accuracy": 0.8549659192562103, "num_tokens": 21525176.0, "step": 5130 }, { "entropy": 0.319421348720789, "epoch": 1.1969926564867701, "grad_norm": 3.328125, "learning_rate": 1.9961734155610647e-05, "loss": 0.4569, "mean_token_accuracy": 0.899403166770935, "num_tokens": 21540302.0, "step": 5135 }, { "entropy": 0.3183010257780552, "epoch": 1.1981582935074018, "grad_norm": 0.6953125, "learning_rate": 1.9961658725126835e-05, "loss": 0.4923, "mean_token_accuracy": 0.8722436010837555, "num_tokens": 21565453.0, "step": 5140 }, { "entropy": 0.3733881928026676, "epoch": 1.1993239305280337, "grad_norm": 2.28125, "learning_rate": 1.9961583220657228e-05, "loss": 0.6894, "mean_token_accuracy": 0.8586771845817566, "num_tokens": 21581316.0, "step": 5145 }, { "entropy": 0.27310655564069747, "epoch": 1.2004895675486653, "grad_norm": 0.76953125, "learning_rate": 1.9961507642202953e-05, "loss": 0.4088, "mean_token_accuracy": 0.9163376450538635, "num_tokens": 21601574.0, "step": 5150 }, { "entropy": 0.2638257570564747, "epoch": 1.2016552045692972, "grad_norm": 2.890625, "learning_rate": 1.9961431989765137e-05, "loss": 0.4367, "mean_token_accuracy": 0.9224029779434204, "num_tokens": 21638480.0, "step": 5155 }, { "entropy": 0.25862696319818496, "epoch": 1.2028208415899289, "grad_norm": 2.390625, "learning_rate": 1.9961356263344903e-05, "loss": 0.4565, "mean_token_accuracy": 0.896664959192276, "num_tokens": 21660075.0, "step": 5160 }, { "entropy": 0.28866573721170424, "epoch": 1.2039864786105607, "grad_norm": 0.82421875, "learning_rate": 1.996128046294339e-05, "loss": 0.7012, "mean_token_accuracy": 0.8824924468994141, "num_tokens": 21671753.0, "step": 5165 }, { "entropy": 0.3241452187299728, "epoch": 1.2051521156311924, "grad_norm": 1.625, "learning_rate": 1.9961204588561723e-05, "loss": 0.5501, "mean_token_accuracy": 0.8935443520545959, "num_tokens": 21684460.0, "step": 5170 }, { "entropy": 0.21291088834404945, "epoch": 1.2063177526518243, "grad_norm": 0.3125, "learning_rate": 1.996112864020103e-05, "loss": 0.3549, "mean_token_accuracy": 0.9189221918582916, "num_tokens": 21709414.0, "step": 5175 }, { "entropy": 0.2792430866509676, "epoch": 1.207483389672456, "grad_norm": 4.09375, "learning_rate": 1.9961052617862447e-05, "loss": 0.4852, "mean_token_accuracy": 0.910968005657196, "num_tokens": 21728226.0, "step": 5180 }, { "entropy": 0.27534177508205177, "epoch": 1.2086490266930878, "grad_norm": 4.34375, "learning_rate": 1.9960976521547112e-05, "loss": 0.7243, "mean_token_accuracy": 0.8686468303203583, "num_tokens": 21748996.0, "step": 5185 }, { "entropy": 0.2846062559634447, "epoch": 1.2098146637137195, "grad_norm": 2.140625, "learning_rate": 1.9960900351256154e-05, "loss": 0.4689, "mean_token_accuracy": 0.8924234092235566, "num_tokens": 21769334.0, "step": 5190 }, { "entropy": 0.305383824557066, "epoch": 1.2109803007343514, "grad_norm": 2.890625, "learning_rate": 1.996082410699071e-05, "loss": 0.5887, "mean_token_accuracy": 0.9004212081432342, "num_tokens": 21783300.0, "step": 5195 }, { "entropy": 0.3224423822015524, "epoch": 1.212145937754983, "grad_norm": 2.25, "learning_rate": 1.9960747788751916e-05, "loss": 0.6121, "mean_token_accuracy": 0.8869613647460938, "num_tokens": 21814313.0, "step": 5200 }, { "entropy": 0.23042884096503258, "epoch": 1.213311574775615, "grad_norm": 1.21875, "learning_rate": 1.9960671396540908e-05, "loss": 0.3764, "mean_token_accuracy": 0.9182167291641236, "num_tokens": 21835573.0, "step": 5205 }, { "entropy": 0.3127889070659876, "epoch": 1.2144772117962466, "grad_norm": 7.875, "learning_rate": 1.996059493035883e-05, "loss": 0.4681, "mean_token_accuracy": 0.885968142747879, "num_tokens": 21854284.0, "step": 5210 }, { "entropy": 0.27360687740147116, "epoch": 1.2156428488168785, "grad_norm": 0.62109375, "learning_rate": 1.9960518390206824e-05, "loss": 0.5724, "mean_token_accuracy": 0.8911721289157868, "num_tokens": 21877998.0, "step": 5215 }, { "entropy": 0.2860093414783478, "epoch": 1.2168084858375101, "grad_norm": 6.65625, "learning_rate": 1.9960441776086025e-05, "loss": 0.691, "mean_token_accuracy": 0.8818713963031769, "num_tokens": 21893428.0, "step": 5220 }, { "entropy": 0.24104835242033004, "epoch": 1.217974122858142, "grad_norm": 2.0, "learning_rate": 1.996036508799758e-05, "loss": 0.3223, "mean_token_accuracy": 0.9073219478130341, "num_tokens": 21913016.0, "step": 5225 }, { "entropy": 0.2667414344847202, "epoch": 1.2191397598787737, "grad_norm": 3.328125, "learning_rate": 1.9960288325942628e-05, "loss": 0.6249, "mean_token_accuracy": 0.899876493215561, "num_tokens": 21926214.0, "step": 5230 }, { "entropy": 0.34385873265564443, "epoch": 1.2203053968994055, "grad_norm": 1.5390625, "learning_rate": 1.9960211489922318e-05, "loss": 0.4407, "mean_token_accuracy": 0.8673520445823669, "num_tokens": 21955376.0, "step": 5235 }, { "entropy": 0.20846946127712726, "epoch": 1.2214710339200372, "grad_norm": 2.359375, "learning_rate": 1.9960134579937796e-05, "loss": 0.2482, "mean_token_accuracy": 0.9406843543052673, "num_tokens": 21979736.0, "step": 5240 }, { "entropy": 0.27167031932622193, "epoch": 1.222636670940669, "grad_norm": 0.2001953125, "learning_rate": 1.9960057595990205e-05, "loss": 0.6442, "mean_token_accuracy": 0.8813684284687042, "num_tokens": 21997966.0, "step": 5245 }, { "entropy": 0.36240064799785615, "epoch": 1.223802307961301, "grad_norm": 1.8671875, "learning_rate": 1.9959980538080696e-05, "loss": 0.6134, "mean_token_accuracy": 0.898576021194458, "num_tokens": 22018016.0, "step": 5250 }, { "entropy": 0.26327665261924266, "epoch": 1.2249679449819326, "grad_norm": 4.21875, "learning_rate": 1.9959903406210415e-05, "loss": 0.5013, "mean_token_accuracy": 0.9144660592079162, "num_tokens": 22032216.0, "step": 5255 }, { "entropy": 0.27930256724357605, "epoch": 1.2261335820025643, "grad_norm": 5.03125, "learning_rate": 1.9959826200380514e-05, "loss": 0.5056, "mean_token_accuracy": 0.9050133049488067, "num_tokens": 22044317.0, "step": 5260 }, { "entropy": 0.252337658405304, "epoch": 1.2272992190231962, "grad_norm": 1.0078125, "learning_rate": 1.9959748920592147e-05, "loss": 0.3191, "mean_token_accuracy": 0.929460322856903, "num_tokens": 22063934.0, "step": 5265 }, { "entropy": 0.251625837571919, "epoch": 1.228464856043828, "grad_norm": 2.671875, "learning_rate": 1.995967156684646e-05, "loss": 0.2712, "mean_token_accuracy": 0.9225124597549439, "num_tokens": 22087458.0, "step": 5270 }, { "entropy": 0.3579515844583511, "epoch": 1.2296304930644597, "grad_norm": 1.0234375, "learning_rate": 1.995959413914461e-05, "loss": 0.6229, "mean_token_accuracy": 0.8749721348285675, "num_tokens": 22112344.0, "step": 5275 }, { "entropy": 0.2837061192840338, "epoch": 1.2307961300850916, "grad_norm": 1.515625, "learning_rate": 1.9959516637487758e-05, "loss": 0.3442, "mean_token_accuracy": 0.8971266686916352, "num_tokens": 22139433.0, "step": 5280 }, { "entropy": 0.28239931985735894, "epoch": 1.2319617671057232, "grad_norm": 3.21875, "learning_rate": 1.9959439061877045e-05, "loss": 0.5502, "mean_token_accuracy": 0.8951669692993164, "num_tokens": 22151982.0, "step": 5285 }, { "entropy": 0.2798332400619984, "epoch": 1.2331274041263551, "grad_norm": 2.78125, "learning_rate": 1.9959361412313644e-05, "loss": 0.5572, "mean_token_accuracy": 0.8993366360664368, "num_tokens": 22172368.0, "step": 5290 }, { "entropy": 0.21451436076313257, "epoch": 1.2342930411469868, "grad_norm": 2.609375, "learning_rate": 1.99592836887987e-05, "loss": 0.4159, "mean_token_accuracy": 0.9279858708381653, "num_tokens": 22198315.0, "step": 5295 }, { "entropy": 0.24950225427746772, "epoch": 1.2354586781676187, "grad_norm": 2.453125, "learning_rate": 1.9959205891333377e-05, "loss": 0.3212, "mean_token_accuracy": 0.9208178699016571, "num_tokens": 22226935.0, "step": 5300 }, { "entropy": 0.2923054873943329, "epoch": 1.2366243151882503, "grad_norm": 2.328125, "learning_rate": 1.995912801991884e-05, "loss": 0.6991, "mean_token_accuracy": 0.881962114572525, "num_tokens": 22237313.0, "step": 5305 }, { "entropy": 0.20678225150331855, "epoch": 1.2377899522088822, "grad_norm": 1.0625, "learning_rate": 1.995905007455624e-05, "loss": 0.2865, "mean_token_accuracy": 0.9330097615718842, "num_tokens": 22264438.0, "step": 5310 }, { "entropy": 0.24442143216729165, "epoch": 1.2389555892295139, "grad_norm": 0.435546875, "learning_rate": 1.9958972055246745e-05, "loss": 0.3176, "mean_token_accuracy": 0.9177972376346588, "num_tokens": 22290522.0, "step": 5315 }, { "entropy": 0.24426558166742324, "epoch": 1.2401212262501458, "grad_norm": 2.15625, "learning_rate": 1.995889396199152e-05, "loss": 0.4477, "mean_token_accuracy": 0.9186632454395294, "num_tokens": 22325579.0, "step": 5320 }, { "entropy": 0.3336485348641872, "epoch": 1.2412868632707774, "grad_norm": 3.234375, "learning_rate": 1.995881579479173e-05, "loss": 0.6578, "mean_token_accuracy": 0.8619132041931152, "num_tokens": 22338730.0, "step": 5325 }, { "entropy": 0.22589795142412186, "epoch": 1.2424525002914093, "grad_norm": 1.453125, "learning_rate": 1.9958737553648534e-05, "loss": 0.393, "mean_token_accuracy": 0.9276214420795441, "num_tokens": 22358940.0, "step": 5330 }, { "entropy": 0.3180350840091705, "epoch": 1.243618137312041, "grad_norm": 4.1875, "learning_rate": 1.99586592385631e-05, "loss": 0.516, "mean_token_accuracy": 0.9004910588264465, "num_tokens": 22373909.0, "step": 5335 }, { "entropy": 0.22264700792729855, "epoch": 1.2447837743326728, "grad_norm": 2.03125, "learning_rate": 1.9958580849536605e-05, "loss": 0.3238, "mean_token_accuracy": 0.9395315706729889, "num_tokens": 22399772.0, "step": 5340 }, { "entropy": 0.2370637021958828, "epoch": 1.2459494113533045, "grad_norm": 0.4375, "learning_rate": 1.9958502386570205e-05, "loss": 0.4085, "mean_token_accuracy": 0.9249156236648559, "num_tokens": 22419605.0, "step": 5345 }, { "entropy": 0.20505453906953336, "epoch": 1.2471150483739364, "grad_norm": 0.890625, "learning_rate": 1.9958423849665083e-05, "loss": 0.363, "mean_token_accuracy": 0.9317021369934082, "num_tokens": 22447540.0, "step": 5350 }, { "entropy": 0.24239457100629808, "epoch": 1.248280685394568, "grad_norm": 0.8828125, "learning_rate": 1.9958345238822398e-05, "loss": 0.5966, "mean_token_accuracy": 0.9081678748130798, "num_tokens": 22459109.0, "step": 5355 }, { "entropy": 0.2920355159789324, "epoch": 1.2494463224152, "grad_norm": 2.484375, "learning_rate": 1.995826655404333e-05, "loss": 0.3709, "mean_token_accuracy": 0.9113083839416504, "num_tokens": 22479242.0, "step": 5360 }, { "entropy": 0.18992096073925496, "epoch": 1.2506119594358318, "grad_norm": 2.828125, "learning_rate": 1.9958187795329052e-05, "loss": 0.3623, "mean_token_accuracy": 0.932866907119751, "num_tokens": 22498918.0, "step": 5365 }, { "entropy": 0.2889205154031515, "epoch": 1.2517775964564635, "grad_norm": 1.1796875, "learning_rate": 1.9958108962680734e-05, "loss": 0.3862, "mean_token_accuracy": 0.870019656419754, "num_tokens": 22546690.0, "step": 5370 }, { "entropy": 0.2940490383654833, "epoch": 1.2529432334770951, "grad_norm": 0.37109375, "learning_rate": 1.9958030056099554e-05, "loss": 0.4688, "mean_token_accuracy": 0.9204876840114593, "num_tokens": 22573571.0, "step": 5375 }, { "entropy": 0.2702594131231308, "epoch": 1.254108870497727, "grad_norm": 4.71875, "learning_rate": 1.995795107558669e-05, "loss": 0.7299, "mean_token_accuracy": 0.8841011583805084, "num_tokens": 22585871.0, "step": 5380 }, { "entropy": 0.31155082285404206, "epoch": 1.255274507518359, "grad_norm": 4.78125, "learning_rate": 1.9957872021143315e-05, "loss": 0.4556, "mean_token_accuracy": 0.8956121504306793, "num_tokens": 22600566.0, "step": 5385 }, { "entropy": 0.24358467012643814, "epoch": 1.2564401445389906, "grad_norm": 2.546875, "learning_rate": 1.9957792892770614e-05, "loss": 0.6497, "mean_token_accuracy": 0.9009511232376098, "num_tokens": 22611432.0, "step": 5390 }, { "entropy": 0.23342385441064833, "epoch": 1.2576057815596222, "grad_norm": 4.78125, "learning_rate": 1.9957713690469765e-05, "loss": 0.3691, "mean_token_accuracy": 0.9175182461738587, "num_tokens": 22640108.0, "step": 5395 }, { "entropy": 0.20412432737648487, "epoch": 1.258771418580254, "grad_norm": 0.65234375, "learning_rate": 1.9957634414241947e-05, "loss": 0.2297, "mean_token_accuracy": 0.9177448451519012, "num_tokens": 22676472.0, "step": 5400 }, { "entropy": 0.22115157768130303, "epoch": 1.259937055600886, "grad_norm": 4.09375, "learning_rate": 1.995755506408834e-05, "loss": 0.5732, "mean_token_accuracy": 0.9084285914897918, "num_tokens": 22699491.0, "step": 5405 }, { "entropy": 0.23604003936052323, "epoch": 1.2611026926215176, "grad_norm": 4.0625, "learning_rate": 1.9957475640010134e-05, "loss": 0.4428, "mean_token_accuracy": 0.9152187645435333, "num_tokens": 22718024.0, "step": 5410 }, { "entropy": 0.18227957040071488, "epoch": 1.2622683296421493, "grad_norm": 1.7109375, "learning_rate": 1.9957396142008508e-05, "loss": 0.2653, "mean_token_accuracy": 0.9294990241527558, "num_tokens": 22736249.0, "step": 5415 }, { "entropy": 0.242751706764102, "epoch": 1.2634339666627812, "grad_norm": 3.046875, "learning_rate": 1.9957316570084653e-05, "loss": 0.499, "mean_token_accuracy": 0.9050007402896881, "num_tokens": 22757046.0, "step": 5420 }, { "entropy": 0.24649465046823024, "epoch": 1.264599603683413, "grad_norm": 0.283203125, "learning_rate": 1.9957236924239747e-05, "loss": 0.4445, "mean_token_accuracy": 0.91389000415802, "num_tokens": 22787756.0, "step": 5425 }, { "entropy": 0.27705603912472726, "epoch": 1.2657652407040447, "grad_norm": 3.390625, "learning_rate": 1.9957157204474985e-05, "loss": 0.5771, "mean_token_accuracy": 0.8832474410533905, "num_tokens": 22800485.0, "step": 5430 }, { "entropy": 0.2584805965423584, "epoch": 1.2669308777246766, "grad_norm": 1.6640625, "learning_rate": 1.995707741079155e-05, "loss": 0.4395, "mean_token_accuracy": 0.9243484199047088, "num_tokens": 22812750.0, "step": 5435 }, { "entropy": 0.20337900295853614, "epoch": 1.2680965147453083, "grad_norm": 1.2265625, "learning_rate": 1.995699754319064e-05, "loss": 0.3228, "mean_token_accuracy": 0.9291253626346588, "num_tokens": 22837745.0, "step": 5440 }, { "entropy": 0.38868461102247237, "epoch": 1.2692621517659401, "grad_norm": 2.515625, "learning_rate": 1.9956917601673437e-05, "loss": 0.6526, "mean_token_accuracy": 0.8844451904296875, "num_tokens": 22856058.0, "step": 5445 }, { "entropy": 0.21811575591564178, "epoch": 1.2704277887865718, "grad_norm": 2.96875, "learning_rate": 1.9956837586241138e-05, "loss": 0.4726, "mean_token_accuracy": 0.9248144567012787, "num_tokens": 22876617.0, "step": 5450 }, { "entropy": 0.2282697781920433, "epoch": 1.2715934258072037, "grad_norm": 1.203125, "learning_rate": 1.9956757496894935e-05, "loss": 0.242, "mean_token_accuracy": 0.9245195806026458, "num_tokens": 22917771.0, "step": 5455 }, { "entropy": 0.30149990916252134, "epoch": 1.2727590628278354, "grad_norm": 0.51171875, "learning_rate": 1.9956677333636024e-05, "loss": 0.6098, "mean_token_accuracy": 0.8778238594532013, "num_tokens": 22932042.0, "step": 5460 }, { "entropy": 0.25813721865415573, "epoch": 1.2739246998484672, "grad_norm": 3.734375, "learning_rate": 1.9956597096465594e-05, "loss": 0.6734, "mean_token_accuracy": 0.8907998919486999, "num_tokens": 22942112.0, "step": 5465 }, { "entropy": 0.1798843963071704, "epoch": 1.275090336869099, "grad_norm": 1.6328125, "learning_rate": 1.995651678538485e-05, "loss": 0.3044, "mean_token_accuracy": 0.9292000591754913, "num_tokens": 22969289.0, "step": 5470 }, { "entropy": 0.30131426751613616, "epoch": 1.2762559738897308, "grad_norm": 5.6875, "learning_rate": 1.9956436400394984e-05, "loss": 0.6164, "mean_token_accuracy": 0.8864310383796692, "num_tokens": 22980735.0, "step": 5475 }, { "entropy": 0.29428491592407224, "epoch": 1.2774216109103624, "grad_norm": 3.546875, "learning_rate": 1.99563559414972e-05, "loss": 0.6925, "mean_token_accuracy": 0.8707791328430176, "num_tokens": 22994196.0, "step": 5480 }, { "entropy": 0.2878478910773993, "epoch": 1.2785872479309943, "grad_norm": 4.34375, "learning_rate": 1.995627540869269e-05, "loss": 0.5568, "mean_token_accuracy": 0.8926146745681762, "num_tokens": 23014522.0, "step": 5485 }, { "entropy": 0.263381066173315, "epoch": 1.279752884951626, "grad_norm": 1.2421875, "learning_rate": 1.995619480198266e-05, "loss": 0.4982, "mean_token_accuracy": 0.9074977576732636, "num_tokens": 23033023.0, "step": 5490 }, { "entropy": 0.295852642133832, "epoch": 1.2809185219722579, "grad_norm": 0.63671875, "learning_rate": 1.9956114121368314e-05, "loss": 0.3381, "mean_token_accuracy": 0.9136807262897492, "num_tokens": 23058135.0, "step": 5495 }, { "entropy": 0.2935203604400158, "epoch": 1.2820841589928897, "grad_norm": 0.6015625, "learning_rate": 1.9956033366850847e-05, "loss": 0.5256, "mean_token_accuracy": 0.8971765220165253, "num_tokens": 23070936.0, "step": 5500 }, { "entropy": 0.2395356010645628, "epoch": 1.2832497960135214, "grad_norm": 4.96875, "learning_rate": 1.995595253843147e-05, "loss": 0.5813, "mean_token_accuracy": 0.88962482213974, "num_tokens": 23089938.0, "step": 5505 }, { "entropy": 0.419968231767416, "epoch": 1.284415433034153, "grad_norm": 4.0, "learning_rate": 1.9955871636111386e-05, "loss": 0.5162, "mean_token_accuracy": 0.8872650682926178, "num_tokens": 23110821.0, "step": 5510 }, { "entropy": 0.2630806386470795, "epoch": 1.285581070054785, "grad_norm": 5.125, "learning_rate": 1.9955790659891804e-05, "loss": 0.5462, "mean_token_accuracy": 0.9106649100780487, "num_tokens": 23125481.0, "step": 5515 }, { "entropy": 0.38911786004900933, "epoch": 1.2867467070754168, "grad_norm": 2.640625, "learning_rate": 1.995570960977393e-05, "loss": 0.7633, "mean_token_accuracy": 0.8669470906257629, "num_tokens": 23139471.0, "step": 5520 }, { "entropy": 0.26729471050202847, "epoch": 1.2879123440960485, "grad_norm": 5.09375, "learning_rate": 1.9955628485758968e-05, "loss": 0.4119, "mean_token_accuracy": 0.9227097749710083, "num_tokens": 23156969.0, "step": 5525 }, { "entropy": 0.194175586104393, "epoch": 1.2890779811166801, "grad_norm": 0.490234375, "learning_rate": 1.9955547287848136e-05, "loss": 0.3486, "mean_token_accuracy": 0.914922684431076, "num_tokens": 23186931.0, "step": 5530 }, { "entropy": 0.2041798748075962, "epoch": 1.290243618137312, "grad_norm": 2.296875, "learning_rate": 1.9955466016042637e-05, "loss": 0.3021, "mean_token_accuracy": 0.9194576680660248, "num_tokens": 23218246.0, "step": 5535 }, { "entropy": 0.20812569046393037, "epoch": 1.291409255157944, "grad_norm": 0.27734375, "learning_rate": 1.995538467034369e-05, "loss": 0.2525, "mean_token_accuracy": 0.9183135628700256, "num_tokens": 23252019.0, "step": 5540 }, { "entropy": 0.3059225469827652, "epoch": 1.2925748921785756, "grad_norm": 5.25, "learning_rate": 1.99553032507525e-05, "loss": 0.7214, "mean_token_accuracy": 0.8827582836151123, "num_tokens": 23260474.0, "step": 5545 }, { "entropy": 0.42859417796134947, "epoch": 1.2937405291992072, "grad_norm": 4.78125, "learning_rate": 1.9955221757270287e-05, "loss": 0.6725, "mean_token_accuracy": 0.8570225417613984, "num_tokens": 23275961.0, "step": 5550 }, { "entropy": 0.33137104101479053, "epoch": 1.2949061662198391, "grad_norm": 2.140625, "learning_rate": 1.9955140189898262e-05, "loss": 0.5805, "mean_token_accuracy": 0.8772634506225586, "num_tokens": 23288893.0, "step": 5555 }, { "entropy": 0.2654805898666382, "epoch": 1.296071803240471, "grad_norm": 5.4375, "learning_rate": 1.995505854863765e-05, "loss": 0.5707, "mean_token_accuracy": 0.9057930707931519, "num_tokens": 23299913.0, "step": 5560 }, { "entropy": 0.26006845086812974, "epoch": 1.2972374402611027, "grad_norm": 3.40625, "learning_rate": 1.995497683348966e-05, "loss": 0.378, "mean_token_accuracy": 0.9138223767280579, "num_tokens": 23323199.0, "step": 5565 }, { "entropy": 0.2738994713872671, "epoch": 1.2984030772817345, "grad_norm": 1.7578125, "learning_rate": 1.995489504445551e-05, "loss": 0.3923, "mean_token_accuracy": 0.9132127881050109, "num_tokens": 23340559.0, "step": 5570 }, { "entropy": 0.34965813905000687, "epoch": 1.2995687143023662, "grad_norm": 0.59765625, "learning_rate": 1.995481318153642e-05, "loss": 0.8147, "mean_token_accuracy": 0.8552458584308624, "num_tokens": 23358466.0, "step": 5575 }, { "entropy": 0.23392754904925822, "epoch": 1.300734351322998, "grad_norm": 1.171875, "learning_rate": 1.9954731244733618e-05, "loss": 0.3513, "mean_token_accuracy": 0.9144553184509278, "num_tokens": 23381477.0, "step": 5580 }, { "entropy": 0.23823288679122925, "epoch": 1.3018999883436297, "grad_norm": 2.96875, "learning_rate": 1.995464923404832e-05, "loss": 0.4633, "mean_token_accuracy": 0.9204339981079102, "num_tokens": 23403290.0, "step": 5585 }, { "entropy": 0.2805923163890839, "epoch": 1.3030656253642616, "grad_norm": 1.296875, "learning_rate": 1.995456714948175e-05, "loss": 0.5294, "mean_token_accuracy": 0.9073796331882477, "num_tokens": 23413307.0, "step": 5590 }, { "entropy": 0.36135985255241393, "epoch": 1.3042312623848933, "grad_norm": 7.03125, "learning_rate": 1.995448499103513e-05, "loss": 0.8949, "mean_token_accuracy": 0.8443944990634918, "num_tokens": 23423224.0, "step": 5595 }, { "entropy": 0.3975703451782465, "epoch": 1.3053968994055252, "grad_norm": 0.291015625, "learning_rate": 1.9954402758709687e-05, "loss": 0.6798, "mean_token_accuracy": 0.869120967388153, "num_tokens": 23449403.0, "step": 5600 }, { "entropy": 0.26895866096019744, "epoch": 1.3065625364261568, "grad_norm": 0.66015625, "learning_rate": 1.9954320452506648e-05, "loss": 0.573, "mean_token_accuracy": 0.9046859323978425, "num_tokens": 23477768.0, "step": 5605 }, { "entropy": 0.2528399731963873, "epoch": 1.3077281734467887, "grad_norm": 0.65234375, "learning_rate": 1.995423807242724e-05, "loss": 0.4023, "mean_token_accuracy": 0.9266422271728516, "num_tokens": 23504880.0, "step": 5610 }, { "entropy": 0.2804453056305647, "epoch": 1.3088938104674204, "grad_norm": 3.671875, "learning_rate": 1.9954155618472687e-05, "loss": 0.488, "mean_token_accuracy": 0.9003461420536041, "num_tokens": 23532967.0, "step": 5615 }, { "entropy": 0.26205198690295217, "epoch": 1.3100594474880523, "grad_norm": 0.88671875, "learning_rate": 1.9954073090644227e-05, "loss": 0.3315, "mean_token_accuracy": 0.9261995434761048, "num_tokens": 23548656.0, "step": 5620 }, { "entropy": 0.18076314106583596, "epoch": 1.311225084508684, "grad_norm": 0.337890625, "learning_rate": 1.9953990488943086e-05, "loss": 0.2923, "mean_token_accuracy": 0.9447229325771331, "num_tokens": 23576455.0, "step": 5625 }, { "entropy": 0.267459000647068, "epoch": 1.3123907215293158, "grad_norm": 0.9296875, "learning_rate": 1.9953907813370494e-05, "loss": 0.3465, "mean_token_accuracy": 0.9184731841087341, "num_tokens": 23591955.0, "step": 5630 }, { "entropy": 0.3997308075428009, "epoch": 1.3135563585499477, "grad_norm": 5.15625, "learning_rate": 1.9953825063927684e-05, "loss": 0.7307, "mean_token_accuracy": 0.8641902565956116, "num_tokens": 23600933.0, "step": 5635 }, { "entropy": 0.4334285452961922, "epoch": 1.3147219955705793, "grad_norm": 1.1640625, "learning_rate": 1.995374224061589e-05, "loss": 0.5922, "mean_token_accuracy": 0.8764832258224488, "num_tokens": 23615985.0, "step": 5640 }, { "entropy": 0.4441813049837947, "epoch": 1.315887632591211, "grad_norm": 0.40625, "learning_rate": 1.9953659343436352e-05, "loss": 0.8662, "mean_token_accuracy": 0.8815126717090607, "num_tokens": 23644565.0, "step": 5645 }, { "entropy": 0.27254144847393036, "epoch": 1.3170532696118429, "grad_norm": 2.78125, "learning_rate": 1.99535763723903e-05, "loss": 0.4853, "mean_token_accuracy": 0.9063039720058441, "num_tokens": 23663586.0, "step": 5650 }, { "entropy": 0.37559669390320777, "epoch": 1.3182189066324748, "grad_norm": 4.84375, "learning_rate": 1.9953493327478976e-05, "loss": 0.7446, "mean_token_accuracy": 0.8823855638504028, "num_tokens": 23672706.0, "step": 5655 }, { "entropy": 0.20802447032183408, "epoch": 1.3193845436531064, "grad_norm": 0.228515625, "learning_rate": 1.9953410208703614e-05, "loss": 0.4037, "mean_token_accuracy": 0.919158661365509, "num_tokens": 23697302.0, "step": 5660 }, { "entropy": 0.3208934962749481, "epoch": 1.320550180673738, "grad_norm": 2.546875, "learning_rate": 1.9953327016065455e-05, "loss": 0.686, "mean_token_accuracy": 0.8717946231365203, "num_tokens": 23707498.0, "step": 5665 }, { "entropy": 0.3227735310792923, "epoch": 1.32171581769437, "grad_norm": 2.609375, "learning_rate": 1.9953243749565742e-05, "loss": 0.8797, "mean_token_accuracy": 0.8639743089675903, "num_tokens": 23716875.0, "step": 5670 }, { "entropy": 0.27695200871676207, "epoch": 1.3228814547150018, "grad_norm": 6.625, "learning_rate": 1.9953160409205714e-05, "loss": 0.3936, "mean_token_accuracy": 0.9095885276794433, "num_tokens": 23758465.0, "step": 5675 }, { "entropy": 0.22672717571258544, "epoch": 1.3240470917356335, "grad_norm": 1.8984375, "learning_rate": 1.9953076994986613e-05, "loss": 0.3288, "mean_token_accuracy": 0.9335966229438781, "num_tokens": 23774175.0, "step": 5680 }, { "entropy": 0.2518630506470799, "epoch": 1.3252127287562652, "grad_norm": 0.318359375, "learning_rate": 1.9952993506909687e-05, "loss": 0.3842, "mean_token_accuracy": 0.9146360695362091, "num_tokens": 23806579.0, "step": 5685 }, { "entropy": 0.2931936949491501, "epoch": 1.326378365776897, "grad_norm": 3.859375, "learning_rate": 1.9952909944976175e-05, "loss": 0.5051, "mean_token_accuracy": 0.8879533410072327, "num_tokens": 23821716.0, "step": 5690 }, { "entropy": 0.2536670383065939, "epoch": 1.327544002797529, "grad_norm": 2.9375, "learning_rate": 1.995282630918733e-05, "loss": 0.4721, "mean_token_accuracy": 0.9183348596096039, "num_tokens": 23836439.0, "step": 5695 }, { "entropy": 0.25506643392145634, "epoch": 1.3287096398181606, "grad_norm": 0.765625, "learning_rate": 1.995274259954439e-05, "loss": 0.4695, "mean_token_accuracy": 0.91966432929039, "num_tokens": 23850720.0, "step": 5700 }, { "entropy": 0.2509689211845398, "epoch": 1.3298752768387925, "grad_norm": 4.875, "learning_rate": 1.9952658816048612e-05, "loss": 0.5005, "mean_token_accuracy": 0.8925885140895844, "num_tokens": 23869638.0, "step": 5705 }, { "entropy": 0.2714238610118628, "epoch": 1.3310409138594241, "grad_norm": 2.609375, "learning_rate": 1.995257495870124e-05, "loss": 0.4868, "mean_token_accuracy": 0.917405641078949, "num_tokens": 23886300.0, "step": 5710 }, { "entropy": 0.23347382061183453, "epoch": 1.332206550880056, "grad_norm": 2.25, "learning_rate": 1.9952491027503527e-05, "loss": 0.4869, "mean_token_accuracy": 0.9239233791828155, "num_tokens": 23922439.0, "step": 5715 }, { "entropy": 0.24450993463397025, "epoch": 1.3333721879006877, "grad_norm": 4.21875, "learning_rate": 1.9952407022456722e-05, "loss": 0.4458, "mean_token_accuracy": 0.9130438029766083, "num_tokens": 23940457.0, "step": 5720 }, { "entropy": 0.15998954940587282, "epoch": 1.3345378249213196, "grad_norm": 0.69140625, "learning_rate": 1.9952322943562085e-05, "loss": 0.2078, "mean_token_accuracy": 0.9507815659046173, "num_tokens": 23970127.0, "step": 5725 }, { "entropy": 0.29249856173992156, "epoch": 1.3357034619419512, "grad_norm": 2.34375, "learning_rate": 1.995223879082086e-05, "loss": 0.5888, "mean_token_accuracy": 0.9081490993499756, "num_tokens": 23980439.0, "step": 5730 }, { "entropy": 0.2913537845015526, "epoch": 1.336869098962583, "grad_norm": 0.3125, "learning_rate": 1.9952154564234307e-05, "loss": 0.5545, "mean_token_accuracy": 0.8999676525592804, "num_tokens": 24006720.0, "step": 5735 }, { "entropy": 0.2000728841871023, "epoch": 1.3380347359832148, "grad_norm": 0.310546875, "learning_rate": 1.995207026380368e-05, "loss": 0.4729, "mean_token_accuracy": 0.9112296581268311, "num_tokens": 24030735.0, "step": 5740 }, { "entropy": 0.40132756531238556, "epoch": 1.3392003730038466, "grad_norm": 2.34375, "learning_rate": 1.9951985889530237e-05, "loss": 0.7819, "mean_token_accuracy": 0.8558760762214661, "num_tokens": 24044026.0, "step": 5745 }, { "entropy": 0.31329273283481596, "epoch": 1.3403660100244783, "grad_norm": 3.421875, "learning_rate": 1.995190144141524e-05, "loss": 0.5908, "mean_token_accuracy": 0.8867635846138, "num_tokens": 24059893.0, "step": 5750 }, { "entropy": 0.2145456612110138, "epoch": 1.3415316470451102, "grad_norm": 3.09375, "learning_rate": 1.9951816919459937e-05, "loss": 0.4101, "mean_token_accuracy": 0.9166670083999634, "num_tokens": 24084954.0, "step": 5755 }, { "entropy": 0.26943669021129607, "epoch": 1.3426972840657418, "grad_norm": 3.296875, "learning_rate": 1.9951732323665602e-05, "loss": 0.4286, "mean_token_accuracy": 0.9149986863136291, "num_tokens": 24097300.0, "step": 5760 }, { "entropy": 0.4137574560940266, "epoch": 1.3438629210863737, "grad_norm": 1.9375, "learning_rate": 1.9951647654033487e-05, "loss": 0.7597, "mean_token_accuracy": 0.8728756129741668, "num_tokens": 24119361.0, "step": 5765 }, { "entropy": 0.22645775750279426, "epoch": 1.3450285581070056, "grad_norm": 0.388671875, "learning_rate": 1.995156291056486e-05, "loss": 0.4117, "mean_token_accuracy": 0.9256348073482513, "num_tokens": 24144326.0, "step": 5770 }, { "entropy": 0.3713815161027014, "epoch": 1.3461941951276373, "grad_norm": 1.515625, "learning_rate": 1.9951478093260984e-05, "loss": 0.3942, "mean_token_accuracy": 0.8806808292865753, "num_tokens": 24179091.0, "step": 5775 }, { "entropy": 0.3129763476550579, "epoch": 1.347359832148269, "grad_norm": 2.90625, "learning_rate": 1.995139320212312e-05, "loss": 0.6692, "mean_token_accuracy": 0.8922302544116973, "num_tokens": 24191822.0, "step": 5780 }, { "entropy": 0.2801304005086422, "epoch": 1.3485254691689008, "grad_norm": 1.1015625, "learning_rate": 1.9951308237152534e-05, "loss": 0.5596, "mean_token_accuracy": 0.8922988891601562, "num_tokens": 24206746.0, "step": 5785 }, { "entropy": 0.3644409075379372, "epoch": 1.3496911061895327, "grad_norm": 5.3125, "learning_rate": 1.9951223198350494e-05, "loss": 0.6947, "mean_token_accuracy": 0.8766247451305389, "num_tokens": 24220264.0, "step": 5790 }, { "entropy": 0.2567942202091217, "epoch": 1.3508567432101644, "grad_norm": 2.484375, "learning_rate": 1.9951138085718274e-05, "loss": 0.4551, "mean_token_accuracy": 0.9105157017707824, "num_tokens": 24254482.0, "step": 5795 }, { "entropy": 0.2791744988411665, "epoch": 1.352022380230796, "grad_norm": 0.91015625, "learning_rate": 1.9951052899257138e-05, "loss": 0.4648, "mean_token_accuracy": 0.9042120218276978, "num_tokens": 24278510.0, "step": 5800 }, { "entropy": 0.23117900416254997, "epoch": 1.353188017251428, "grad_norm": 0.298828125, "learning_rate": 1.9950967638968352e-05, "loss": 0.1542, "mean_token_accuracy": 0.9249145030975342, "num_tokens": 24320079.0, "step": 5805 }, { "entropy": 0.2455504924058914, "epoch": 1.3543536542720598, "grad_norm": 0.28515625, "learning_rate": 1.9950882304853195e-05, "loss": 0.2634, "mean_token_accuracy": 0.9251305639743805, "num_tokens": 24351086.0, "step": 5810 }, { "entropy": 0.32813689541071656, "epoch": 1.3555192912926914, "grad_norm": 0.353515625, "learning_rate": 1.9950796896912937e-05, "loss": 0.4891, "mean_token_accuracy": 0.8899535357952117, "num_tokens": 24400487.0, "step": 5815 }, { "entropy": 0.24501474015414715, "epoch": 1.356684928313323, "grad_norm": 0.51171875, "learning_rate": 1.995071141514885e-05, "loss": 0.5379, "mean_token_accuracy": 0.9088938236236572, "num_tokens": 24427331.0, "step": 5820 }, { "entropy": 0.3071946881711483, "epoch": 1.357850565333955, "grad_norm": 1.578125, "learning_rate": 1.9950625859562208e-05, "loss": 0.5551, "mean_token_accuracy": 0.8883362352848053, "num_tokens": 24438312.0, "step": 5825 }, { "entropy": 0.19779857322573663, "epoch": 1.3590162023545869, "grad_norm": 1.359375, "learning_rate": 1.995054023015429e-05, "loss": 0.3946, "mean_token_accuracy": 0.9297601401805877, "num_tokens": 24451773.0, "step": 5830 }, { "entropy": 0.22592936605215072, "epoch": 1.3601818393752185, "grad_norm": 0.41796875, "learning_rate": 1.995045452692637e-05, "loss": 0.4492, "mean_token_accuracy": 0.9175360441207886, "num_tokens": 24483192.0, "step": 5835 }, { "entropy": 0.2569009017199278, "epoch": 1.3613474763958504, "grad_norm": 4.9375, "learning_rate": 1.9950368749879726e-05, "loss": 0.5167, "mean_token_accuracy": 0.9054693639278412, "num_tokens": 24500215.0, "step": 5840 }, { "entropy": 0.1963417749851942, "epoch": 1.362513113416482, "grad_norm": 1.65625, "learning_rate": 1.995028289901564e-05, "loss": 0.301, "mean_token_accuracy": 0.9437475621700286, "num_tokens": 24527876.0, "step": 5845 }, { "entropy": 0.35903219059109687, "epoch": 1.363678750437114, "grad_norm": 3.640625, "learning_rate": 1.9950196974335392e-05, "loss": 0.5245, "mean_token_accuracy": 0.8531217724084854, "num_tokens": 24556353.0, "step": 5850 }, { "entropy": 0.23062839526683093, "epoch": 1.3648443874577456, "grad_norm": 0.23828125, "learning_rate": 1.9950110975840256e-05, "loss": 0.3947, "mean_token_accuracy": 0.9208827018737793, "num_tokens": 24585285.0, "step": 5855 }, { "entropy": 0.3217948623001575, "epoch": 1.3660100244783775, "grad_norm": 3.09375, "learning_rate": 1.9950024903531525e-05, "loss": 0.5242, "mean_token_accuracy": 0.8897144675254822, "num_tokens": 24602223.0, "step": 5860 }, { "entropy": 0.25288745760917664, "epoch": 1.3671756614990092, "grad_norm": 0.357421875, "learning_rate": 1.994993875741048e-05, "loss": 0.3786, "mean_token_accuracy": 0.9193249821662903, "num_tokens": 24622073.0, "step": 5865 }, { "entropy": 0.22469098567962648, "epoch": 1.368341298519641, "grad_norm": 3.578125, "learning_rate": 1.9949852537478396e-05, "loss": 0.411, "mean_token_accuracy": 0.9251877844333649, "num_tokens": 24638083.0, "step": 5870 }, { "entropy": 0.2176618270576, "epoch": 1.3695069355402727, "grad_norm": 2.5625, "learning_rate": 1.9949766243736567e-05, "loss": 0.3303, "mean_token_accuracy": 0.934071135520935, "num_tokens": 24653018.0, "step": 5875 }, { "entropy": 0.2513718821108341, "epoch": 1.3706725725609046, "grad_norm": 3.609375, "learning_rate": 1.9949679876186282e-05, "loss": 0.4386, "mean_token_accuracy": 0.9089162170886993, "num_tokens": 24674972.0, "step": 5880 }, { "entropy": 0.28676617220044137, "epoch": 1.3718382095815362, "grad_norm": 4.40625, "learning_rate": 1.9949593434828826e-05, "loss": 0.6229, "mean_token_accuracy": 0.8899110794067383, "num_tokens": 24689403.0, "step": 5885 }, { "entropy": 0.23323919475078583, "epoch": 1.3730038466021681, "grad_norm": 0.64453125, "learning_rate": 1.9949506919665483e-05, "loss": 0.4513, "mean_token_accuracy": 0.9205905556678772, "num_tokens": 24715786.0, "step": 5890 }, { "entropy": 0.2819278556853533, "epoch": 1.3741694836227998, "grad_norm": 0.70703125, "learning_rate": 1.994942033069755e-05, "loss": 0.3828, "mean_token_accuracy": 0.9168027639389038, "num_tokens": 24734448.0, "step": 5895 }, { "entropy": 0.3614561915397644, "epoch": 1.3753351206434317, "grad_norm": 3.484375, "learning_rate": 1.9949333667926315e-05, "loss": 0.633, "mean_token_accuracy": 0.8805983364582062, "num_tokens": 24753262.0, "step": 5900 }, { "entropy": 0.24026509281247854, "epoch": 1.3765007576640635, "grad_norm": 3.578125, "learning_rate": 1.994924693135307e-05, "loss": 0.2847, "mean_token_accuracy": 0.9050538659095764, "num_tokens": 24788999.0, "step": 5905 }, { "entropy": 0.21759585216641425, "epoch": 1.3776663946846952, "grad_norm": 6.65625, "learning_rate": 1.9949160120979106e-05, "loss": 0.4946, "mean_token_accuracy": 0.9079568982124329, "num_tokens": 24808413.0, "step": 5910 }, { "entropy": 0.19574114717543126, "epoch": 1.3788320317053269, "grad_norm": 0.52734375, "learning_rate": 1.9949073236805727e-05, "loss": 0.3247, "mean_token_accuracy": 0.9310945808887482, "num_tokens": 24832143.0, "step": 5915 }, { "entropy": 0.2865985196083784, "epoch": 1.3799976687259587, "grad_norm": 2.75, "learning_rate": 1.994898627883422e-05, "loss": 0.418, "mean_token_accuracy": 0.9289934575557709, "num_tokens": 24849135.0, "step": 5920 }, { "entropy": 0.31515090465545653, "epoch": 1.3811633057465906, "grad_norm": 0.62109375, "learning_rate": 1.9948899247065882e-05, "loss": 0.5842, "mean_token_accuracy": 0.8844206809997559, "num_tokens": 24875602.0, "step": 5925 }, { "entropy": 0.336475083976984, "epoch": 1.3823289427672223, "grad_norm": 1.3125, "learning_rate": 1.9948812141502015e-05, "loss": 0.609, "mean_token_accuracy": 0.8807213425636291, "num_tokens": 24896407.0, "step": 5930 }, { "entropy": 0.23444087468087674, "epoch": 1.383494579787854, "grad_norm": 0.90625, "learning_rate": 1.994872496214391e-05, "loss": 0.2867, "mean_token_accuracy": 0.9178794384002685, "num_tokens": 24928449.0, "step": 5935 }, { "entropy": 0.2076961003243923, "epoch": 1.3846602168084858, "grad_norm": 1.265625, "learning_rate": 1.994863770899288e-05, "loss": 0.4733, "mean_token_accuracy": 0.9192006587982178, "num_tokens": 24943725.0, "step": 5940 }, { "entropy": 0.25617918446660043, "epoch": 1.3858258538291177, "grad_norm": 3.9375, "learning_rate": 1.9948550382050217e-05, "loss": 0.3418, "mean_token_accuracy": 0.8931475222110749, "num_tokens": 24962709.0, "step": 5945 }, { "entropy": 0.2509648621082306, "epoch": 1.3869914908497494, "grad_norm": 4.5, "learning_rate": 1.9948462981317224e-05, "loss": 0.5186, "mean_token_accuracy": 0.9041432499885559, "num_tokens": 24975572.0, "step": 5950 }, { "entropy": 0.28281467258930204, "epoch": 1.388157127870381, "grad_norm": 4.71875, "learning_rate": 1.9948375506795203e-05, "loss": 0.6613, "mean_token_accuracy": 0.8867592930793762, "num_tokens": 24985140.0, "step": 5955 }, { "entropy": 0.1830880269408226, "epoch": 1.389322764891013, "grad_norm": 3.3125, "learning_rate": 1.9948287958485462e-05, "loss": 0.3379, "mean_token_accuracy": 0.9403021216392518, "num_tokens": 25003798.0, "step": 5960 }, { "entropy": 0.20799617059528827, "epoch": 1.3904884019116448, "grad_norm": 1.1796875, "learning_rate": 1.9948200336389306e-05, "loss": 0.3895, "mean_token_accuracy": 0.918929374217987, "num_tokens": 25038105.0, "step": 5965 }, { "entropy": 0.216236861795187, "epoch": 1.3916540389322765, "grad_norm": 1.875, "learning_rate": 1.9948112640508038e-05, "loss": 0.5133, "mean_token_accuracy": 0.9075867176055908, "num_tokens": 25054679.0, "step": 5970 }, { "entropy": 0.22477278523147107, "epoch": 1.3928196759529083, "grad_norm": 3.546875, "learning_rate": 1.994802487084297e-05, "loss": 0.5587, "mean_token_accuracy": 0.9134108006954194, "num_tokens": 25079947.0, "step": 5975 }, { "entropy": 0.23681939952075481, "epoch": 1.39398531297354, "grad_norm": 1.5625, "learning_rate": 1.9947937027395407e-05, "loss": 0.3169, "mean_token_accuracy": 0.9177536785602569, "num_tokens": 25106368.0, "step": 5980 }, { "entropy": 0.22461127489805222, "epoch": 1.3951509499941719, "grad_norm": 0.54296875, "learning_rate": 1.994784911016666e-05, "loss": 0.3875, "mean_token_accuracy": 0.9304892122745514, "num_tokens": 25132141.0, "step": 5985 }, { "entropy": 0.24312784448266028, "epoch": 1.3963165870148035, "grad_norm": 4.75, "learning_rate": 1.9947761119158046e-05, "loss": 0.4565, "mean_token_accuracy": 0.910781592130661, "num_tokens": 25163258.0, "step": 5990 }, { "entropy": 0.2798545081168413, "epoch": 1.3974822240354354, "grad_norm": 2.25, "learning_rate": 1.9947673054370867e-05, "loss": 0.3462, "mean_token_accuracy": 0.884056442975998, "num_tokens": 25185321.0, "step": 5995 }, { "entropy": 0.2866420477628708, "epoch": 1.398647861056067, "grad_norm": 5.78125, "learning_rate": 1.9947584915806444e-05, "loss": 0.7835, "mean_token_accuracy": 0.8801892518997192, "num_tokens": 25195532.0, "step": 6000 }, { "entropy": 0.2600490044802427, "epoch": 1.399813498076699, "grad_norm": 4.09375, "learning_rate": 1.9947496703466088e-05, "loss": 0.3731, "mean_token_accuracy": 0.9133882701396943, "num_tokens": 25218885.0, "step": 6005 }, { "entropy": 0.18085310496389867, "epoch": 1.4009791350973306, "grad_norm": 3.140625, "learning_rate": 1.9947408417351114e-05, "loss": 0.2987, "mean_token_accuracy": 0.9389593541622162, "num_tokens": 25248167.0, "step": 6010 }, { "entropy": 0.401553612947464, "epoch": 1.4021447721179625, "grad_norm": 2.9375, "learning_rate": 1.9947320057462836e-05, "loss": 0.6126, "mean_token_accuracy": 0.8375705778598785, "num_tokens": 25272827.0, "step": 6015 }, { "entropy": 0.27304190024733543, "epoch": 1.4033104091385942, "grad_norm": 2.34375, "learning_rate": 1.994723162380258e-05, "loss": 0.4852, "mean_token_accuracy": 0.9171205282211303, "num_tokens": 25284313.0, "step": 6020 }, { "entropy": 0.3541749894618988, "epoch": 1.404476046159226, "grad_norm": 1.1953125, "learning_rate": 1.9947143116371656e-05, "loss": 0.61, "mean_token_accuracy": 0.8762133181095123, "num_tokens": 25303657.0, "step": 6025 }, { "entropy": 0.21734450608491898, "epoch": 1.4056416831798577, "grad_norm": 2.34375, "learning_rate": 1.994705453517139e-05, "loss": 0.4161, "mean_token_accuracy": 0.9308688163757324, "num_tokens": 25316085.0, "step": 6030 }, { "entropy": 0.21970502883195878, "epoch": 1.4068073202004896, "grad_norm": 0.74609375, "learning_rate": 1.9946965880203098e-05, "loss": 0.3519, "mean_token_accuracy": 0.9339965403079986, "num_tokens": 25336522.0, "step": 6035 }, { "entropy": 0.2662087522447109, "epoch": 1.4079729572211215, "grad_norm": 2.328125, "learning_rate": 1.9946877151468103e-05, "loss": 0.5657, "mean_token_accuracy": 0.8919131517410278, "num_tokens": 25352103.0, "step": 6040 }, { "entropy": 0.3002671368420124, "epoch": 1.4091385942417531, "grad_norm": 0.63671875, "learning_rate": 1.9946788348967732e-05, "loss": 0.4606, "mean_token_accuracy": 0.9111246347427369, "num_tokens": 25371971.0, "step": 6045 }, { "entropy": 0.2485044565051794, "epoch": 1.4103042312623848, "grad_norm": 3.15625, "learning_rate": 1.9946699472703305e-05, "loss": 0.4732, "mean_token_accuracy": 0.9000851452350617, "num_tokens": 25397621.0, "step": 6050 }, { "entropy": 0.3733428567647934, "epoch": 1.4114698682830167, "grad_norm": 2.0625, "learning_rate": 1.9946610522676148e-05, "loss": 0.4188, "mean_token_accuracy": 0.9120821356773376, "num_tokens": 25408478.0, "step": 6055 }, { "entropy": 0.18461912628263236, "epoch": 1.4126355053036486, "grad_norm": 2.4375, "learning_rate": 1.9946521498887587e-05, "loss": 0.2562, "mean_token_accuracy": 0.9476583361625671, "num_tokens": 25430854.0, "step": 6060 }, { "entropy": 0.3762295678257942, "epoch": 1.4138011423242802, "grad_norm": 4.28125, "learning_rate": 1.9946432401338952e-05, "loss": 0.6702, "mean_token_accuracy": 0.8688174486160278, "num_tokens": 25443403.0, "step": 6065 }, { "entropy": 0.22310646399855613, "epoch": 1.4149667793449119, "grad_norm": 0.9375, "learning_rate": 1.994634323003157e-05, "loss": 0.2445, "mean_token_accuracy": 0.9442370891571045, "num_tokens": 25461873.0, "step": 6070 }, { "entropy": 0.1819608300924301, "epoch": 1.4161324163655438, "grad_norm": 2.703125, "learning_rate": 1.994625398496677e-05, "loss": 0.319, "mean_token_accuracy": 0.9387645661830902, "num_tokens": 25479569.0, "step": 6075 }, { "entropy": 0.2639749272726476, "epoch": 1.4172980533861756, "grad_norm": 2.203125, "learning_rate": 1.9946164666145887e-05, "loss": 0.4714, "mean_token_accuracy": 0.9076428413391113, "num_tokens": 25498608.0, "step": 6080 }, { "entropy": 0.33660699874162675, "epoch": 1.4184636904068073, "grad_norm": 1.9453125, "learning_rate": 1.9946075273570246e-05, "loss": 0.712, "mean_token_accuracy": 0.8807506680488586, "num_tokens": 25508015.0, "step": 6085 }, { "entropy": 0.24338987190276384, "epoch": 1.419629327427439, "grad_norm": 4.90625, "learning_rate": 1.9945985807241183e-05, "loss": 0.4234, "mean_token_accuracy": 0.9177171468734742, "num_tokens": 25536317.0, "step": 6090 }, { "entropy": 0.1747225273400545, "epoch": 1.4207949644480709, "grad_norm": 0.38671875, "learning_rate": 1.9945896267160033e-05, "loss": 0.1504, "mean_token_accuracy": 0.9511634230613708, "num_tokens": 25581641.0, "step": 6095 }, { "entropy": 0.24279556684195996, "epoch": 1.4219606014687027, "grad_norm": 6.4375, "learning_rate": 1.994580665332813e-05, "loss": 0.3479, "mean_token_accuracy": 0.9131585538387299, "num_tokens": 25609683.0, "step": 6100 }, { "entropy": 0.21206288058310746, "epoch": 1.4231262384893344, "grad_norm": 5.125, "learning_rate": 1.9945716965746815e-05, "loss": 0.4123, "mean_token_accuracy": 0.9267059683799743, "num_tokens": 25631691.0, "step": 6105 }, { "entropy": 0.20703665874898433, "epoch": 1.4242918755099663, "grad_norm": 0.53515625, "learning_rate": 1.9945627204417417e-05, "loss": 0.3589, "mean_token_accuracy": 0.9129282414913178, "num_tokens": 25647570.0, "step": 6110 }, { "entropy": 0.2758414391428232, "epoch": 1.425457512530598, "grad_norm": 4.03125, "learning_rate": 1.994553736934128e-05, "loss": 0.4752, "mean_token_accuracy": 0.9155567049980163, "num_tokens": 25666958.0, "step": 6115 }, { "entropy": 0.24596691727638245, "epoch": 1.4266231495512298, "grad_norm": 1.9140625, "learning_rate": 1.9945447460519744e-05, "loss": 0.4994, "mean_token_accuracy": 0.907034718990326, "num_tokens": 25683703.0, "step": 6120 }, { "entropy": 0.22788504436612128, "epoch": 1.4277887865718615, "grad_norm": 2.21875, "learning_rate": 1.9945357477954146e-05, "loss": 0.4048, "mean_token_accuracy": 0.9154785871505737, "num_tokens": 25704111.0, "step": 6125 }, { "entropy": 0.2857265181839466, "epoch": 1.4289544235924934, "grad_norm": 3.890625, "learning_rate": 1.994526742164583e-05, "loss": 0.5323, "mean_token_accuracy": 0.9008615195751191, "num_tokens": 25715542.0, "step": 6130 }, { "entropy": 0.33440128406509756, "epoch": 1.430120060613125, "grad_norm": 0.38671875, "learning_rate": 1.9945177291596138e-05, "loss": 0.5732, "mean_token_accuracy": 0.8830467939376831, "num_tokens": 25748108.0, "step": 6135 }, { "entropy": 0.20241359770298004, "epoch": 1.431285697633757, "grad_norm": 1.734375, "learning_rate": 1.9945087087806418e-05, "loss": 0.3625, "mean_token_accuracy": 0.9333572685718536, "num_tokens": 25762188.0, "step": 6140 }, { "entropy": 0.2195176776498556, "epoch": 1.4324513346543886, "grad_norm": 2.984375, "learning_rate": 1.9944996810278004e-05, "loss": 0.3942, "mean_token_accuracy": 0.9176979422569275, "num_tokens": 25779783.0, "step": 6145 }, { "entropy": 0.24785535782575607, "epoch": 1.4336169716750204, "grad_norm": 0.44921875, "learning_rate": 1.9944906459012256e-05, "loss": 0.3539, "mean_token_accuracy": 0.917992216348648, "num_tokens": 25808857.0, "step": 6150 }, { "entropy": 0.2307575661689043, "epoch": 1.434782608695652, "grad_norm": 1.5078125, "learning_rate": 1.9944816034010514e-05, "loss": 0.3912, "mean_token_accuracy": 0.9268999874591828, "num_tokens": 25827633.0, "step": 6155 }, { "entropy": 0.3657380998134613, "epoch": 1.435948245716284, "grad_norm": 3.875, "learning_rate": 1.994472553527413e-05, "loss": 0.6706, "mean_token_accuracy": 0.8987206935882568, "num_tokens": 25835309.0, "step": 6160 }, { "entropy": 0.2581876091659069, "epoch": 1.4371138827369156, "grad_norm": 1.6484375, "learning_rate": 1.9944634962804447e-05, "loss": 0.3631, "mean_token_accuracy": 0.9164925396442414, "num_tokens": 25854498.0, "step": 6165 }, { "entropy": 0.23693926855921746, "epoch": 1.4382795197575475, "grad_norm": 3.921875, "learning_rate": 1.9944544316602822e-05, "loss": 0.4131, "mean_token_accuracy": 0.9203273713588714, "num_tokens": 25867321.0, "step": 6170 }, { "entropy": 0.21460621878504754, "epoch": 1.4394451567781794, "grad_norm": 3.53125, "learning_rate": 1.99444535966706e-05, "loss": 0.3414, "mean_token_accuracy": 0.9152821362018585, "num_tokens": 25883601.0, "step": 6175 }, { "entropy": 0.18589368872344494, "epoch": 1.440610793798811, "grad_norm": 0.55078125, "learning_rate": 1.9944362803009143e-05, "loss": 0.2293, "mean_token_accuracy": 0.9320219933986664, "num_tokens": 25907701.0, "step": 6180 }, { "entropy": 0.19626751802861692, "epoch": 1.4417764308194427, "grad_norm": 2.46875, "learning_rate": 1.99442719356198e-05, "loss": 0.325, "mean_token_accuracy": 0.9340706586837768, "num_tokens": 25923590.0, "step": 6185 }, { "entropy": 0.22295918092131614, "epoch": 1.4429420678400746, "grad_norm": 4.78125, "learning_rate": 1.9944180994503924e-05, "loss": 0.3643, "mean_token_accuracy": 0.9329910993576049, "num_tokens": 25949742.0, "step": 6190 }, { "entropy": 0.2750854782760143, "epoch": 1.4441077048607065, "grad_norm": 7.5, "learning_rate": 1.994408997966287e-05, "loss": 0.5914, "mean_token_accuracy": 0.894541758298874, "num_tokens": 25966558.0, "step": 6195 }, { "entropy": 0.2991751916706562, "epoch": 1.4452733418813382, "grad_norm": 4.09375, "learning_rate": 1.9943998891098002e-05, "loss": 0.541, "mean_token_accuracy": 0.9036483585834503, "num_tokens": 25980801.0, "step": 6200 }, { "entropy": 0.24568053856492042, "epoch": 1.4464389789019698, "grad_norm": 2.40625, "learning_rate": 1.9943907728810675e-05, "loss": 0.3813, "mean_token_accuracy": 0.9138169586658478, "num_tokens": 25998947.0, "step": 6205 }, { "entropy": 0.35819384828209877, "epoch": 1.4476046159226017, "grad_norm": 7.03125, "learning_rate": 1.9943816492802245e-05, "loss": 0.7506, "mean_token_accuracy": 0.8739102721214295, "num_tokens": 26018713.0, "step": 6210 }, { "entropy": 0.31613691374659536, "epoch": 1.4487702529432336, "grad_norm": 0.96875, "learning_rate": 1.9943725183074078e-05, "loss": 0.5115, "mean_token_accuracy": 0.8758203983306885, "num_tokens": 26046018.0, "step": 6215 }, { "entropy": 0.29431228525936604, "epoch": 1.4499358899638652, "grad_norm": 0.6171875, "learning_rate": 1.994363379962753e-05, "loss": 0.5854, "mean_token_accuracy": 0.8824435234069824, "num_tokens": 26072080.0, "step": 6220 }, { "entropy": 0.2346148520708084, "epoch": 1.451101526984497, "grad_norm": 0.95703125, "learning_rate": 1.9943542342463967e-05, "loss": 0.3645, "mean_token_accuracy": 0.9219622015953064, "num_tokens": 26095299.0, "step": 6225 }, { "entropy": 0.2745875285938382, "epoch": 1.4522671640051288, "grad_norm": 4.21875, "learning_rate": 1.9943450811584754e-05, "loss": 0.4315, "mean_token_accuracy": 0.9061646342277527, "num_tokens": 26114670.0, "step": 6230 }, { "entropy": 0.2439524047076702, "epoch": 1.4534328010257607, "grad_norm": 1.8203125, "learning_rate": 1.994335920699125e-05, "loss": 0.49, "mean_token_accuracy": 0.9195599555969238, "num_tokens": 26127294.0, "step": 6235 }, { "entropy": 0.2234463717788458, "epoch": 1.4545984380463923, "grad_norm": 0.90234375, "learning_rate": 1.9943267528684825e-05, "loss": 0.2566, "mean_token_accuracy": 0.9297012031078339, "num_tokens": 26157921.0, "step": 6240 }, { "entropy": 0.2692530658096075, "epoch": 1.4557640750670242, "grad_norm": 0.2294921875, "learning_rate": 1.994317577666685e-05, "loss": 0.57, "mean_token_accuracy": 0.9084669768810272, "num_tokens": 26184039.0, "step": 6245 }, { "entropy": 0.24896195121109485, "epoch": 1.4569297120876559, "grad_norm": 4.59375, "learning_rate": 1.9943083950938688e-05, "loss": 0.249, "mean_token_accuracy": 0.9177485883235932, "num_tokens": 26210767.0, "step": 6250 }, { "entropy": 0.3322709981352091, "epoch": 1.4580953491082878, "grad_norm": 4.9375, "learning_rate": 1.994299205150171e-05, "loss": 0.553, "mean_token_accuracy": 0.882332730293274, "num_tokens": 26232162.0, "step": 6255 }, { "entropy": 0.28127521388232707, "epoch": 1.4592609861289194, "grad_norm": 5.96875, "learning_rate": 1.9942900078357285e-05, "loss": 0.484, "mean_token_accuracy": 0.9012116551399231, "num_tokens": 26249149.0, "step": 6260 }, { "entropy": 0.27540549375116824, "epoch": 1.4604266231495513, "grad_norm": 5.03125, "learning_rate": 1.9942808031506783e-05, "loss": 0.5053, "mean_token_accuracy": 0.8840371131896972, "num_tokens": 26262178.0, "step": 6265 }, { "entropy": 0.35837407857179643, "epoch": 1.461592260170183, "grad_norm": 3.28125, "learning_rate": 1.9942715910951584e-05, "loss": 0.5515, "mean_token_accuracy": 0.8817245364189148, "num_tokens": 26290230.0, "step": 6270 }, { "entropy": 0.18010530173778533, "epoch": 1.4627578971908148, "grad_norm": 0.1943359375, "learning_rate": 1.9942623716693053e-05, "loss": 0.2224, "mean_token_accuracy": 0.9430947363376617, "num_tokens": 26329113.0, "step": 6275 }, { "entropy": 0.22894675768911837, "epoch": 1.4639235342114465, "grad_norm": 2.625, "learning_rate": 1.994253144873257e-05, "loss": 0.3407, "mean_token_accuracy": 0.9291040778160096, "num_tokens": 26354065.0, "step": 6280 }, { "entropy": 0.3529716327786446, "epoch": 1.4650891712320784, "grad_norm": 3.890625, "learning_rate": 1.9942439107071508e-05, "loss": 0.6958, "mean_token_accuracy": 0.8839489579200744, "num_tokens": 26364152.0, "step": 6285 }, { "entropy": 0.29959765523672105, "epoch": 1.46625480825271, "grad_norm": 2.515625, "learning_rate": 1.9942346691711245e-05, "loss": 0.6005, "mean_token_accuracy": 0.9027487754821777, "num_tokens": 26373463.0, "step": 6290 }, { "entropy": 0.2790189601480961, "epoch": 1.467420445273342, "grad_norm": 0.48828125, "learning_rate": 1.9942254202653156e-05, "loss": 0.3838, "mean_token_accuracy": 0.896321427822113, "num_tokens": 26397161.0, "step": 6295 }, { "entropy": 0.29518115893006325, "epoch": 1.4685860822939736, "grad_norm": 0.44921875, "learning_rate": 1.994216163989863e-05, "loss": 0.6156, "mean_token_accuracy": 0.8970506012439727, "num_tokens": 26418252.0, "step": 6300 }, { "entropy": 0.24554601386189462, "epoch": 1.4697517193146055, "grad_norm": 4.25, "learning_rate": 1.9942069003449035e-05, "loss": 0.4689, "mean_token_accuracy": 0.9052075982093811, "num_tokens": 26439931.0, "step": 6305 }, { "entropy": 0.17828930951654912, "epoch": 1.4709173563352371, "grad_norm": 1.6796875, "learning_rate": 1.994197629330576e-05, "loss": 0.1967, "mean_token_accuracy": 0.9368055939674378, "num_tokens": 26481238.0, "step": 6310 }, { "entropy": 0.23349117934703828, "epoch": 1.472082993355869, "grad_norm": 3.078125, "learning_rate": 1.9941883509470185e-05, "loss": 0.4269, "mean_token_accuracy": 0.9104501307010651, "num_tokens": 26495108.0, "step": 6315 }, { "entropy": 0.18809721656143666, "epoch": 1.4732486303765007, "grad_norm": 0.40234375, "learning_rate": 1.9941790651943694e-05, "loss": 0.3237, "mean_token_accuracy": 0.9333221316337585, "num_tokens": 26522784.0, "step": 6320 }, { "entropy": 0.3458826899528503, "epoch": 1.4744142673971325, "grad_norm": 4.78125, "learning_rate": 1.9941697720727674e-05, "loss": 0.6462, "mean_token_accuracy": 0.8908734023571014, "num_tokens": 26532735.0, "step": 6325 }, { "entropy": 0.23778062015771867, "epoch": 1.4755799044177644, "grad_norm": 1.3125, "learning_rate": 1.9941604715823505e-05, "loss": 0.4793, "mean_token_accuracy": 0.9031484723091125, "num_tokens": 26554290.0, "step": 6330 }, { "entropy": 0.2718735795468092, "epoch": 1.476745541438396, "grad_norm": 7.4375, "learning_rate": 1.994151163723258e-05, "loss": 0.2499, "mean_token_accuracy": 0.9027468800544739, "num_tokens": 26595658.0, "step": 6335 }, { "entropy": 0.2860164247453213, "epoch": 1.4779111784590278, "grad_norm": 0.4296875, "learning_rate": 1.994141848495628e-05, "loss": 0.4303, "mean_token_accuracy": 0.9010000109672547, "num_tokens": 26618053.0, "step": 6340 }, { "entropy": 0.33337589353322983, "epoch": 1.4790768154796596, "grad_norm": 3.078125, "learning_rate": 1.9941325258996e-05, "loss": 0.5813, "mean_token_accuracy": 0.8955310165882111, "num_tokens": 26627218.0, "step": 6345 }, { "entropy": 0.2800572752952576, "epoch": 1.4802424525002915, "grad_norm": 3.125, "learning_rate": 1.9941231959353127e-05, "loss": 0.5138, "mean_token_accuracy": 0.9197395205497741, "num_tokens": 26646367.0, "step": 6350 }, { "entropy": 0.24946743622422218, "epoch": 1.4814080895209232, "grad_norm": 1.7734375, "learning_rate": 1.9941138586029057e-05, "loss": 0.4069, "mean_token_accuracy": 0.8970791816711425, "num_tokens": 26666179.0, "step": 6355 }, { "entropy": 0.2526698425412178, "epoch": 1.4825737265415548, "grad_norm": 2.09375, "learning_rate": 1.9941045139025175e-05, "loss": 0.3809, "mean_token_accuracy": 0.919477504491806, "num_tokens": 26682431.0, "step": 6360 }, { "entropy": 0.2440191276371479, "epoch": 1.4837393635621867, "grad_norm": 0.404296875, "learning_rate": 1.994095161834288e-05, "loss": 0.2937, "mean_token_accuracy": 0.916581517457962, "num_tokens": 26708653.0, "step": 6365 }, { "entropy": 0.2495467372238636, "epoch": 1.4849050005828186, "grad_norm": 3.40625, "learning_rate": 1.9940858023983564e-05, "loss": 0.4609, "mean_token_accuracy": 0.9264641582965851, "num_tokens": 26729923.0, "step": 6370 }, { "entropy": 0.31366735994815825, "epoch": 1.4860706376034503, "grad_norm": 2.453125, "learning_rate": 1.9940764355948624e-05, "loss": 0.5459, "mean_token_accuracy": 0.8930998623371125, "num_tokens": 26747852.0, "step": 6375 }, { "entropy": 0.17747897058725357, "epoch": 1.4872362746240821, "grad_norm": 0.296875, "learning_rate": 1.9940670614239455e-05, "loss": 0.4019, "mean_token_accuracy": 0.9336903989315033, "num_tokens": 26771977.0, "step": 6380 }, { "entropy": 0.31067068725824354, "epoch": 1.4884019116447138, "grad_norm": 2.171875, "learning_rate": 1.9940576798857458e-05, "loss": 0.5135, "mean_token_accuracy": 0.8752427935600281, "num_tokens": 26803778.0, "step": 6385 }, { "entropy": 0.2708073750138283, "epoch": 1.4895675486653457, "grad_norm": 0.34765625, "learning_rate": 1.994048290980403e-05, "loss": 0.5891, "mean_token_accuracy": 0.901687479019165, "num_tokens": 26825704.0, "step": 6390 }, { "entropy": 0.2440513141453266, "epoch": 1.4907331856859773, "grad_norm": 0.86328125, "learning_rate": 1.994038894708057e-05, "loss": 0.3371, "mean_token_accuracy": 0.9266144573688507, "num_tokens": 26850678.0, "step": 6395 }, { "entropy": 0.21062698140740393, "epoch": 1.4918988227066092, "grad_norm": 2.140625, "learning_rate": 1.994029491068848e-05, "loss": 0.2388, "mean_token_accuracy": 0.9315842866897583, "num_tokens": 26871585.0, "step": 6400 }, { "entropy": 0.37964350283145903, "epoch": 1.493064459727241, "grad_norm": 5.0625, "learning_rate": 1.994020080062916e-05, "loss": 0.8883, "mean_token_accuracy": 0.8595846474170685, "num_tokens": 26879318.0, "step": 6405 }, { "entropy": 0.26095555648207663, "epoch": 1.4942300967478728, "grad_norm": 3.59375, "learning_rate": 1.9940106616904018e-05, "loss": 0.5295, "mean_token_accuracy": 0.8947687804698944, "num_tokens": 26898969.0, "step": 6410 }, { "entropy": 0.25472576431930066, "epoch": 1.4953957337685044, "grad_norm": 3.28125, "learning_rate": 1.9940012359514456e-05, "loss": 0.3701, "mean_token_accuracy": 0.9185781836509704, "num_tokens": 26921627.0, "step": 6415 }, { "entropy": 0.3112226489931345, "epoch": 1.4965613707891363, "grad_norm": 3.0, "learning_rate": 1.993991802846188e-05, "loss": 0.4863, "mean_token_accuracy": 0.8925800144672393, "num_tokens": 26943250.0, "step": 6420 }, { "entropy": 0.2728789247572422, "epoch": 1.497727007809768, "grad_norm": 0.75, "learning_rate": 1.9939823623747695e-05, "loss": 0.4913, "mean_token_accuracy": 0.9044696092605591, "num_tokens": 26966039.0, "step": 6425 }, { "entropy": 0.22179466150701047, "epoch": 1.4988926448303999, "grad_norm": 0.703125, "learning_rate": 1.993972914537331e-05, "loss": 0.465, "mean_token_accuracy": 0.9178914129734039, "num_tokens": 26984909.0, "step": 6430 }, { "entropy": 0.22185338092967868, "epoch": 1.5000582818510315, "grad_norm": 3.640625, "learning_rate": 1.9939634593340133e-05, "loss": 0.3028, "mean_token_accuracy": 0.9346797227859497, "num_tokens": 27017339.0, "step": 6435 }, { "entropy": 0.29497579857707024, "epoch": 1.5012239188716634, "grad_norm": 5.78125, "learning_rate": 1.9939539967649576e-05, "loss": 0.7158, "mean_token_accuracy": 0.876299786567688, "num_tokens": 27029104.0, "step": 6440 }, { "entropy": 0.2623417802155018, "epoch": 1.5023895558922953, "grad_norm": 0.6875, "learning_rate": 1.9939445268303047e-05, "loss": 0.5434, "mean_token_accuracy": 0.8918785274028778, "num_tokens": 27047768.0, "step": 6445 }, { "entropy": 0.20947806648910045, "epoch": 1.503555192912927, "grad_norm": 0.357421875, "learning_rate": 1.993935049530196e-05, "loss": 0.4116, "mean_token_accuracy": 0.9175561010837555, "num_tokens": 27073094.0, "step": 6450 }, { "entropy": 0.3465562131255865, "epoch": 1.5047208299335586, "grad_norm": 2.015625, "learning_rate": 1.9939255648647732e-05, "loss": 0.542, "mean_token_accuracy": 0.8854370713233948, "num_tokens": 27096021.0, "step": 6455 }, { "entropy": 0.27758454382419584, "epoch": 1.5058864669541905, "grad_norm": 3.421875, "learning_rate": 1.993916072834177e-05, "loss": 0.5268, "mean_token_accuracy": 0.8995888471603394, "num_tokens": 27123583.0, "step": 6460 }, { "entropy": 0.24073500484228133, "epoch": 1.5070521039748224, "grad_norm": 2.6875, "learning_rate": 1.993906573438549e-05, "loss": 0.383, "mean_token_accuracy": 0.9229654431343078, "num_tokens": 27142117.0, "step": 6465 }, { "entropy": 0.2729795090854168, "epoch": 1.508217740995454, "grad_norm": 0.59765625, "learning_rate": 1.9938970666780312e-05, "loss": 0.6103, "mean_token_accuracy": 0.8879354178905488, "num_tokens": 27155818.0, "step": 6470 }, { "entropy": 0.18979131281375886, "epoch": 1.5093833780160857, "grad_norm": 0.7890625, "learning_rate": 1.9938875525527658e-05, "loss": 0.192, "mean_token_accuracy": 0.9271632313728333, "num_tokens": 27183172.0, "step": 6475 }, { "entropy": 0.21792179644107817, "epoch": 1.5105490150367176, "grad_norm": 1.1875, "learning_rate": 1.9938780310628935e-05, "loss": 0.3931, "mean_token_accuracy": 0.9339845836162567, "num_tokens": 27196815.0, "step": 6480 }, { "entropy": 0.2289236381649971, "epoch": 1.5117146520573495, "grad_norm": 5.625, "learning_rate": 1.9938685022085573e-05, "loss": 0.452, "mean_token_accuracy": 0.9143205046653747, "num_tokens": 27207717.0, "step": 6485 }, { "entropy": 0.33930057361721994, "epoch": 1.512880289077981, "grad_norm": 5.90625, "learning_rate": 1.9938589659898987e-05, "loss": 0.5654, "mean_token_accuracy": 0.8818209409713745, "num_tokens": 27233471.0, "step": 6490 }, { "entropy": 0.2666054558008909, "epoch": 1.5140459260986128, "grad_norm": 0.427734375, "learning_rate": 1.9938494224070605e-05, "loss": 0.6512, "mean_token_accuracy": 0.8890865564346313, "num_tokens": 27253476.0, "step": 6495 }, { "entropy": 0.2990569584071636, "epoch": 1.5152115631192447, "grad_norm": 6.03125, "learning_rate": 1.993839871460184e-05, "loss": 0.5722, "mean_token_accuracy": 0.9082420587539672, "num_tokens": 27270163.0, "step": 6500 }, { "entropy": 0.25313875190913676, "epoch": 1.5163772001398765, "grad_norm": 5.625, "learning_rate": 1.9938303131494127e-05, "loss": 0.42, "mean_token_accuracy": 0.9069484531879425, "num_tokens": 27287541.0, "step": 6505 }, { "entropy": 0.2387036457657814, "epoch": 1.5175428371605082, "grad_norm": 4.8125, "learning_rate": 1.9938207474748886e-05, "loss": 0.5459, "mean_token_accuracy": 0.9052523851394654, "num_tokens": 27302864.0, "step": 6510 }, { "entropy": 0.2785582607612014, "epoch": 1.5187084741811399, "grad_norm": 4.03125, "learning_rate": 1.9938111744367545e-05, "loss": 0.4533, "mean_token_accuracy": 0.8836054503917694, "num_tokens": 27338949.0, "step": 6515 }, { "entropy": 0.2182111766189337, "epoch": 1.5198741112017717, "grad_norm": 0.59375, "learning_rate": 1.9938015940351528e-05, "loss": 0.4181, "mean_token_accuracy": 0.9164810359477997, "num_tokens": 27356468.0, "step": 6520 }, { "entropy": 0.3113727495074272, "epoch": 1.5210397482224036, "grad_norm": 2.546875, "learning_rate": 1.9937920062702267e-05, "loss": 0.501, "mean_token_accuracy": 0.905505508184433, "num_tokens": 27366690.0, "step": 6525 }, { "entropy": 0.23484750241041183, "epoch": 1.5222053852430353, "grad_norm": 7.5, "learning_rate": 1.993782411142119e-05, "loss": 0.3325, "mean_token_accuracy": 0.9258572041988373, "num_tokens": 27396712.0, "step": 6530 }, { "entropy": 0.22683720849454403, "epoch": 1.523371022263667, "grad_norm": 4.96875, "learning_rate": 1.9937728086509732e-05, "loss": 0.2282, "mean_token_accuracy": 0.9178965389728546, "num_tokens": 27428456.0, "step": 6535 }, { "entropy": 0.2929589316248894, "epoch": 1.5245366592842988, "grad_norm": 1.4765625, "learning_rate": 1.9937631987969318e-05, "loss": 0.5289, "mean_token_accuracy": 0.8907686352729798, "num_tokens": 27446640.0, "step": 6540 }, { "entropy": 0.22855143621563911, "epoch": 1.5257022963049307, "grad_norm": 2.8125, "learning_rate": 1.9937535815801385e-05, "loss": 0.3996, "mean_token_accuracy": 0.907921028137207, "num_tokens": 27460744.0, "step": 6545 }, { "entropy": 0.3677578168921173, "epoch": 1.5268679333255624, "grad_norm": 5.28125, "learning_rate": 1.993743957000737e-05, "loss": 0.612, "mean_token_accuracy": 0.8666451275348663, "num_tokens": 27492855.0, "step": 6550 }, { "entropy": 0.3148959677666426, "epoch": 1.5280335703461942, "grad_norm": 4.46875, "learning_rate": 1.9937343250588698e-05, "loss": 0.4407, "mean_token_accuracy": 0.8961562097072602, "num_tokens": 27518578.0, "step": 6555 }, { "entropy": 0.2055843833833933, "epoch": 1.5291992073668261, "grad_norm": 0.6953125, "learning_rate": 1.9937246857546817e-05, "loss": 0.3653, "mean_token_accuracy": 0.928001344203949, "num_tokens": 27534989.0, "step": 6560 }, { "entropy": 0.3105059742927551, "epoch": 1.5303648443874578, "grad_norm": 3.046875, "learning_rate": 1.9937150390883156e-05, "loss": 0.7427, "mean_token_accuracy": 0.8818167328834534, "num_tokens": 27545231.0, "step": 6565 }, { "entropy": 0.21620201570913194, "epoch": 1.5315304814080895, "grad_norm": 6.46875, "learning_rate": 1.9937053850599163e-05, "loss": 0.3257, "mean_token_accuracy": 0.9393852114677429, "num_tokens": 27566195.0, "step": 6570 }, { "entropy": 0.21292497627437115, "epoch": 1.5326961184287213, "grad_norm": 0.349609375, "learning_rate": 1.9936957236696264e-05, "loss": 0.4618, "mean_token_accuracy": 0.9288613259792328, "num_tokens": 27589114.0, "step": 6575 }, { "entropy": 0.23033034205436706, "epoch": 1.5338617554493532, "grad_norm": 4.8125, "learning_rate": 1.993686054917591e-05, "loss": 0.449, "mean_token_accuracy": 0.9169645845890045, "num_tokens": 27610688.0, "step": 6580 }, { "entropy": 0.24333803839981555, "epoch": 1.5350273924699849, "grad_norm": 6.71875, "learning_rate": 1.9936763788039543e-05, "loss": 0.3681, "mean_token_accuracy": 0.9251050651073456, "num_tokens": 27628200.0, "step": 6585 }, { "entropy": 0.24377419464290143, "epoch": 1.5361930294906165, "grad_norm": 2.625, "learning_rate": 1.9936666953288598e-05, "loss": 0.3051, "mean_token_accuracy": 0.9210731863975525, "num_tokens": 27651910.0, "step": 6590 }, { "entropy": 0.23668837770819665, "epoch": 1.5373586665112484, "grad_norm": 1.015625, "learning_rate": 1.9936570044924526e-05, "loss": 0.4558, "mean_token_accuracy": 0.9113388657569885, "num_tokens": 27665162.0, "step": 6595 }, { "entropy": 0.2739979222416878, "epoch": 1.5385243035318803, "grad_norm": 3.34375, "learning_rate": 1.9936473062948765e-05, "loss": 0.6057, "mean_token_accuracy": 0.9049014866352081, "num_tokens": 27676458.0, "step": 6600 }, { "entropy": 0.32901543751358986, "epoch": 1.539689940552512, "grad_norm": 2.734375, "learning_rate": 1.993637600736277e-05, "loss": 0.3299, "mean_token_accuracy": 0.8835425734519958, "num_tokens": 27711994.0, "step": 6605 }, { "entropy": 0.292670027539134, "epoch": 1.5408555775731436, "grad_norm": 0.5078125, "learning_rate": 1.9936278878167985e-05, "loss": 0.565, "mean_token_accuracy": 0.8792059600353241, "num_tokens": 27732770.0, "step": 6610 }, { "entropy": 0.2861447758972645, "epoch": 1.5420212145937755, "grad_norm": 3.96875, "learning_rate": 1.9936181675365856e-05, "loss": 0.5155, "mean_token_accuracy": 0.9069945335388183, "num_tokens": 27756632.0, "step": 6615 }, { "entropy": 0.22006136178970337, "epoch": 1.5431868516144074, "grad_norm": 1.4921875, "learning_rate": 1.9936084398957834e-05, "loss": 0.3175, "mean_token_accuracy": 0.9396511971950531, "num_tokens": 27778830.0, "step": 6620 }, { "entropy": 0.33254698645323516, "epoch": 1.544352488635039, "grad_norm": 4.21875, "learning_rate": 1.993598704894537e-05, "loss": 0.5365, "mean_token_accuracy": 0.8744619786739349, "num_tokens": 27805975.0, "step": 6625 }, { "entropy": 0.3152335677295923, "epoch": 1.5455181256556707, "grad_norm": 0.48828125, "learning_rate": 1.9935889625329913e-05, "loss": 0.5482, "mean_token_accuracy": 0.9010307848453522, "num_tokens": 27828444.0, "step": 6630 }, { "entropy": 0.2763508759438992, "epoch": 1.5466837626763026, "grad_norm": 1.78125, "learning_rate": 1.9935792128112918e-05, "loss": 0.416, "mean_token_accuracy": 0.899656081199646, "num_tokens": 27856252.0, "step": 6635 }, { "entropy": 0.29827424213290216, "epoch": 1.5478493996969345, "grad_norm": 0.5, "learning_rate": 1.993569455729584e-05, "loss": 0.462, "mean_token_accuracy": 0.8946270823478699, "num_tokens": 27880426.0, "step": 6640 }, { "entropy": 0.20729903429746627, "epoch": 1.5490150367175661, "grad_norm": 1.1796875, "learning_rate": 1.993559691288013e-05, "loss": 0.2037, "mean_token_accuracy": 0.9211410582065582, "num_tokens": 27901734.0, "step": 6645 }, { "entropy": 0.34493443965911863, "epoch": 1.5501806737381978, "grad_norm": 8.625, "learning_rate": 1.993549919486725e-05, "loss": 0.7576, "mean_token_accuracy": 0.864366763830185, "num_tokens": 27921161.0, "step": 6650 }, { "entropy": 0.29178492836654185, "epoch": 1.5513463107588297, "grad_norm": 10.25, "learning_rate": 1.993540140325865e-05, "loss": 0.5045, "mean_token_accuracy": 0.885959267616272, "num_tokens": 27954097.0, "step": 6655 }, { "entropy": 0.19429666809737683, "epoch": 1.5525119477794616, "grad_norm": 0.49609375, "learning_rate": 1.9935303538055796e-05, "loss": 0.2965, "mean_token_accuracy": 0.9343727469444275, "num_tokens": 27981520.0, "step": 6660 }, { "entropy": 0.25541278421878816, "epoch": 1.5536775848000932, "grad_norm": 4.0, "learning_rate": 1.993520559926014e-05, "loss": 0.4689, "mean_token_accuracy": 0.9169036328792572, "num_tokens": 27994168.0, "step": 6665 }, { "entropy": 0.32678827494382856, "epoch": 1.5548432218207249, "grad_norm": 3.375, "learning_rate": 1.9935107586873145e-05, "loss": 0.661, "mean_token_accuracy": 0.8745863318443299, "num_tokens": 28006200.0, "step": 6670 }, { "entropy": 0.2589764386415482, "epoch": 1.5560088588413568, "grad_norm": 1.0859375, "learning_rate": 1.9935009500896273e-05, "loss": 0.378, "mean_token_accuracy": 0.9126704931259155, "num_tokens": 28022450.0, "step": 6675 }, { "entropy": 0.28034481406211853, "epoch": 1.5571744958619886, "grad_norm": 1.0859375, "learning_rate": 1.9934911341330986e-05, "loss": 0.4009, "mean_token_accuracy": 0.9167719483375549, "num_tokens": 28045500.0, "step": 6680 }, { "entropy": 0.30468578450381756, "epoch": 1.5583401328826203, "grad_norm": 1.015625, "learning_rate": 1.9934813108178752e-05, "loss": 0.3279, "mean_token_accuracy": 0.9120621025562287, "num_tokens": 28088547.0, "step": 6685 }, { "entropy": 0.20318310568109155, "epoch": 1.5595057699032522, "grad_norm": 0.23046875, "learning_rate": 1.9934714801441032e-05, "loss": 0.4988, "mean_token_accuracy": 0.9183111727237702, "num_tokens": 28120417.0, "step": 6690 }, { "entropy": 0.19271549321711062, "epoch": 1.560671406923884, "grad_norm": 5.25, "learning_rate": 1.9934616421119287e-05, "loss": 0.2812, "mean_token_accuracy": 0.9212694585323333, "num_tokens": 28146606.0, "step": 6695 }, { "entropy": 0.23064907528460027, "epoch": 1.5618370439445157, "grad_norm": 3.609375, "learning_rate": 1.9934517967214993e-05, "loss": 0.2526, "mean_token_accuracy": 0.9413141012191772, "num_tokens": 28177886.0, "step": 6700 }, { "entropy": 0.2984270632266998, "epoch": 1.5630026809651474, "grad_norm": 4.125, "learning_rate": 1.9934419439729615e-05, "loss": 0.5552, "mean_token_accuracy": 0.9000501334667206, "num_tokens": 28196847.0, "step": 6705 }, { "entropy": 0.1965206727385521, "epoch": 1.5641683179857793, "grad_norm": 1.84375, "learning_rate": 1.993432083866462e-05, "loss": 0.2889, "mean_token_accuracy": 0.9421004772186279, "num_tokens": 28219554.0, "step": 6710 }, { "entropy": 0.3038263775408268, "epoch": 1.5653339550064111, "grad_norm": 4.59375, "learning_rate": 1.993422216402148e-05, "loss": 0.4245, "mean_token_accuracy": 0.9077933967113495, "num_tokens": 28241847.0, "step": 6715 }, { "entropy": 0.22943210899829863, "epoch": 1.5664995920270428, "grad_norm": 3.265625, "learning_rate": 1.9934123415801666e-05, "loss": 0.4105, "mean_token_accuracy": 0.9183744013309478, "num_tokens": 28270049.0, "step": 6720 }, { "entropy": 0.3342425040900707, "epoch": 1.5676652290476745, "grad_norm": 3.53125, "learning_rate": 1.993402459400665e-05, "loss": 0.6575, "mean_token_accuracy": 0.8717486083507537, "num_tokens": 28291347.0, "step": 6725 }, { "entropy": 0.20970200896263122, "epoch": 1.5688308660683064, "grad_norm": 0.419921875, "learning_rate": 1.9933925698637905e-05, "loss": 0.2657, "mean_token_accuracy": 0.928368890285492, "num_tokens": 28314009.0, "step": 6730 }, { "entropy": 0.3026204094290733, "epoch": 1.5699965030889382, "grad_norm": 5.34375, "learning_rate": 1.993382672969691e-05, "loss": 0.6261, "mean_token_accuracy": 0.8962989449501038, "num_tokens": 28323739.0, "step": 6735 }, { "entropy": 0.23009593058377503, "epoch": 1.57116214010957, "grad_norm": 0.703125, "learning_rate": 1.9933727687185137e-05, "loss": 0.3074, "mean_token_accuracy": 0.9207689642906189, "num_tokens": 28364853.0, "step": 6740 }, { "entropy": 0.35999022983014584, "epoch": 1.5723277771302016, "grad_norm": 1.03125, "learning_rate": 1.993362857110406e-05, "loss": 0.576, "mean_token_accuracy": 0.8650194406509399, "num_tokens": 28389330.0, "step": 6745 }, { "entropy": 0.27033420875668523, "epoch": 1.5734934141508334, "grad_norm": 2.671875, "learning_rate": 1.9933529381455163e-05, "loss": 0.4767, "mean_token_accuracy": 0.9208850085735321, "num_tokens": 28411791.0, "step": 6750 }, { "entropy": 0.19342294819653033, "epoch": 1.5746590511714653, "grad_norm": 2.0, "learning_rate": 1.993343011823992e-05, "loss": 0.3283, "mean_token_accuracy": 0.9273386657238006, "num_tokens": 28434134.0, "step": 6755 }, { "entropy": 0.22842872738838196, "epoch": 1.575824688192097, "grad_norm": 0.48046875, "learning_rate": 1.993333078145982e-05, "loss": 0.4746, "mean_token_accuracy": 0.9192339181900024, "num_tokens": 28455393.0, "step": 6760 }, { "entropy": 0.20059427116066217, "epoch": 1.5769903252127286, "grad_norm": 4.1875, "learning_rate": 1.993323137111633e-05, "loss": 0.4625, "mean_token_accuracy": 0.9268925726413727, "num_tokens": 28485132.0, "step": 6765 }, { "entropy": 0.25664917901158335, "epoch": 1.5781559622333605, "grad_norm": 5.28125, "learning_rate": 1.9933131887210944e-05, "loss": 0.499, "mean_token_accuracy": 0.9118769764900208, "num_tokens": 28496012.0, "step": 6770 }, { "entropy": 0.29543297439813615, "epoch": 1.5793215992539924, "grad_norm": 3.53125, "learning_rate": 1.9933032329745144e-05, "loss": 0.513, "mean_token_accuracy": 0.8905039012432099, "num_tokens": 28508468.0, "step": 6775 }, { "entropy": 0.27861638814210893, "epoch": 1.580487236274624, "grad_norm": 1.984375, "learning_rate": 1.993293269872041e-05, "loss": 0.7512, "mean_token_accuracy": 0.8770630955696106, "num_tokens": 28520357.0, "step": 6780 }, { "entropy": 0.26711825728416444, "epoch": 1.5816528732952557, "grad_norm": 3.234375, "learning_rate": 1.993283299413823e-05, "loss": 0.6114, "mean_token_accuracy": 0.8977750062942504, "num_tokens": 28529643.0, "step": 6785 }, { "entropy": 0.3120358551852405, "epoch": 1.5828185103158876, "grad_norm": 0.337890625, "learning_rate": 1.993273321600009e-05, "loss": 0.5817, "mean_token_accuracy": 0.8871180176734924, "num_tokens": 28549177.0, "step": 6790 }, { "entropy": 0.2286653283983469, "epoch": 1.5839841473365195, "grad_norm": 0.671875, "learning_rate": 1.9932633364307478e-05, "loss": 0.3054, "mean_token_accuracy": 0.9169903874397278, "num_tokens": 28585854.0, "step": 6795 }, { "entropy": 0.37026937305927277, "epoch": 1.5851497843571511, "grad_norm": 5.0, "learning_rate": 1.9932533439061886e-05, "loss": 0.516, "mean_token_accuracy": 0.8859709143638611, "num_tokens": 28603616.0, "step": 6800 }, { "entropy": 0.4093021884560585, "epoch": 1.5863154213777828, "grad_norm": 3.6875, "learning_rate": 1.9932433440264798e-05, "loss": 0.6387, "mean_token_accuracy": 0.8732196629047394, "num_tokens": 28623869.0, "step": 6805 }, { "entropy": 0.2910556077957153, "epoch": 1.5874810583984147, "grad_norm": 1.8828125, "learning_rate": 1.993233336791771e-05, "loss": 0.5225, "mean_token_accuracy": 0.8892961859703064, "num_tokens": 28653785.0, "step": 6810 }, { "entropy": 0.2758349239826202, "epoch": 1.5886466954190466, "grad_norm": 6.09375, "learning_rate": 1.9932233222022113e-05, "loss": 0.6505, "mean_token_accuracy": 0.8887870073318481, "num_tokens": 28664354.0, "step": 6815 }, { "entropy": 0.2991601226851344, "epoch": 1.5898123324396782, "grad_norm": 8.4375, "learning_rate": 1.9932133002579502e-05, "loss": 0.6383, "mean_token_accuracy": 0.8858659088611602, "num_tokens": 28687769.0, "step": 6820 }, { "entropy": 0.263477098941803, "epoch": 1.59097796946031, "grad_norm": 1.3359375, "learning_rate": 1.9932032709591368e-05, "loss": 0.2739, "mean_token_accuracy": 0.9084848940372467, "num_tokens": 28720836.0, "step": 6825 }, { "entropy": 0.19756391048431396, "epoch": 1.592143606480942, "grad_norm": 1.9609375, "learning_rate": 1.9931932343059208e-05, "loss": 0.414, "mean_token_accuracy": 0.9262231230735779, "num_tokens": 28736798.0, "step": 6830 }, { "entropy": 0.2347656786441803, "epoch": 1.5933092435015737, "grad_norm": 0.69921875, "learning_rate": 1.993183190298452e-05, "loss": 0.4251, "mean_token_accuracy": 0.9266911625862122, "num_tokens": 28752927.0, "step": 6835 }, { "entropy": 0.2072462685406208, "epoch": 1.5944748805222053, "grad_norm": 3.71875, "learning_rate": 1.9931731389368797e-05, "loss": 0.4016, "mean_token_accuracy": 0.9348695755004883, "num_tokens": 28788562.0, "step": 6840 }, { "entropy": 0.22971604485064745, "epoch": 1.5956405175428372, "grad_norm": 3.03125, "learning_rate": 1.9931630802213543e-05, "loss": 0.4706, "mean_token_accuracy": 0.9057029843330383, "num_tokens": 28809929.0, "step": 6845 }, { "entropy": 0.22827026546001433, "epoch": 1.596806154563469, "grad_norm": 3.953125, "learning_rate": 1.9931530141520258e-05, "loss": 0.5347, "mean_token_accuracy": 0.9116665422916412, "num_tokens": 28827185.0, "step": 6850 }, { "entropy": 0.2197150869295001, "epoch": 1.5979717915841007, "grad_norm": 0.5625, "learning_rate": 1.993142940729044e-05, "loss": 0.5541, "mean_token_accuracy": 0.8902185618877411, "num_tokens": 28860613.0, "step": 6855 }, { "entropy": 0.2837262708693743, "epoch": 1.5991374286047324, "grad_norm": 0.60546875, "learning_rate": 1.9931328599525593e-05, "loss": 0.67, "mean_token_accuracy": 0.8655347049236297, "num_tokens": 28875950.0, "step": 6860 }, { "entropy": 0.24237373284995556, "epoch": 1.6003030656253643, "grad_norm": 0.47265625, "learning_rate": 1.993122771822722e-05, "loss": 0.4202, "mean_token_accuracy": 0.9144159615039825, "num_tokens": 28901490.0, "step": 6865 }, { "entropy": 0.24070470854640008, "epoch": 1.6014687026459962, "grad_norm": 1.703125, "learning_rate": 1.9931126763396823e-05, "loss": 0.3715, "mean_token_accuracy": 0.9270369708538055, "num_tokens": 28917847.0, "step": 6870 }, { "entropy": 0.3077314287424088, "epoch": 1.6026343396666278, "grad_norm": 3.515625, "learning_rate": 1.9931025735035908e-05, "loss": 0.4596, "mean_token_accuracy": 0.8983159244060517, "num_tokens": 28941625.0, "step": 6875 }, { "entropy": 0.2766427606344223, "epoch": 1.6037999766872595, "grad_norm": 2.15625, "learning_rate": 1.9930924633145983e-05, "loss": 0.5664, "mean_token_accuracy": 0.9030839681625367, "num_tokens": 28952505.0, "step": 6880 }, { "entropy": 0.21303706653416157, "epoch": 1.6049656137078914, "grad_norm": 0.36328125, "learning_rate": 1.9930823457728556e-05, "loss": 0.3332, "mean_token_accuracy": 0.9256323635578155, "num_tokens": 28980831.0, "step": 6885 }, { "entropy": 0.26443149968981744, "epoch": 1.6061312507285233, "grad_norm": 4.28125, "learning_rate": 1.9930722208785137e-05, "loss": 0.4801, "mean_token_accuracy": 0.9085823595523834, "num_tokens": 28993269.0, "step": 6890 }, { "entropy": 0.3455100655555725, "epoch": 1.607296887749155, "grad_norm": 4.875, "learning_rate": 1.993062088631723e-05, "loss": 0.5618, "mean_token_accuracy": 0.8891852796077728, "num_tokens": 29002688.0, "step": 6895 }, { "entropy": 0.29940509535372256, "epoch": 1.6084625247697866, "grad_norm": 0.27734375, "learning_rate": 1.9930519490326354e-05, "loss": 0.474, "mean_token_accuracy": 0.9166371762752533, "num_tokens": 29033884.0, "step": 6900 }, { "entropy": 0.13665635734796525, "epoch": 1.6096281617904185, "grad_norm": 2.1875, "learning_rate": 1.993041802081401e-05, "loss": 0.1412, "mean_token_accuracy": 0.9601432979106903, "num_tokens": 29071820.0, "step": 6905 }, { "entropy": 0.24027777165174485, "epoch": 1.6107937988110503, "grad_norm": 0.357421875, "learning_rate": 1.9930316477781722e-05, "loss": 0.4429, "mean_token_accuracy": 0.9213906168937683, "num_tokens": 29091290.0, "step": 6910 }, { "entropy": 0.37214391976594924, "epoch": 1.611959435831682, "grad_norm": 1.09375, "learning_rate": 1.9930214861230998e-05, "loss": 0.5121, "mean_token_accuracy": 0.886109858751297, "num_tokens": 29123342.0, "step": 6915 }, { "entropy": 0.12675322201102973, "epoch": 1.6131250728523137, "grad_norm": 0.43359375, "learning_rate": 1.9930113171163356e-05, "loss": 0.0864, "mean_token_accuracy": 0.9702485501766205, "num_tokens": 29174885.0, "step": 6920 }, { "entropy": 0.2880819053389132, "epoch": 1.6142907098729455, "grad_norm": 0.828125, "learning_rate": 1.9930011407580312e-05, "loss": 0.3827, "mean_token_accuracy": 0.8916058301925659, "num_tokens": 29214406.0, "step": 6925 }, { "entropy": 0.23583326674997807, "epoch": 1.6154563468935774, "grad_norm": 6.3125, "learning_rate": 1.9929909570483383e-05, "loss": 0.3784, "mean_token_accuracy": 0.9003340721130371, "num_tokens": 29238310.0, "step": 6930 }, { "entropy": 0.23994187042117118, "epoch": 1.616621983914209, "grad_norm": 1.6171875, "learning_rate": 1.9929807659874085e-05, "loss": 0.4249, "mean_token_accuracy": 0.9134172797203064, "num_tokens": 29258683.0, "step": 6935 }, { "entropy": 0.38729229420423505, "epoch": 1.6177876209348407, "grad_norm": 4.15625, "learning_rate": 1.992970567575394e-05, "loss": 0.5001, "mean_token_accuracy": 0.9179852843284607, "num_tokens": 29280874.0, "step": 6940 }, { "entropy": 0.30107422918081284, "epoch": 1.6189532579554726, "grad_norm": 2.328125, "learning_rate": 1.992960361812447e-05, "loss": 0.7504, "mean_token_accuracy": 0.8799044847488403, "num_tokens": 29289798.0, "step": 6945 }, { "entropy": 0.25648994743824005, "epoch": 1.6201188949761045, "grad_norm": 2.4375, "learning_rate": 1.9929501486987196e-05, "loss": 0.5502, "mean_token_accuracy": 0.9077392220497131, "num_tokens": 29300408.0, "step": 6950 }, { "entropy": 0.2755729131400585, "epoch": 1.6212845319967362, "grad_norm": 2.59375, "learning_rate": 1.992939928234364e-05, "loss": 0.6221, "mean_token_accuracy": 0.9050759017467499, "num_tokens": 29313660.0, "step": 6955 }, { "entropy": 0.1966082751750946, "epoch": 1.6224501690173678, "grad_norm": 0.3203125, "learning_rate": 1.9929297004195328e-05, "loss": 0.3258, "mean_token_accuracy": 0.9240009307861328, "num_tokens": 29343988.0, "step": 6960 }, { "entropy": 0.2664320237934589, "epoch": 1.623615806038, "grad_norm": 4.25, "learning_rate": 1.9929194652543784e-05, "loss": 0.3882, "mean_token_accuracy": 0.9111813545227051, "num_tokens": 29363067.0, "step": 6965 }, { "entropy": 0.4264032058417797, "epoch": 1.6247814430586316, "grad_norm": 2.46875, "learning_rate": 1.9929092227390528e-05, "loss": 0.6451, "mean_token_accuracy": 0.8786540269851685, "num_tokens": 29386786.0, "step": 6970 }, { "entropy": 0.33386958464980127, "epoch": 1.6259470800792633, "grad_norm": 6.25, "learning_rate": 1.9928989728737097e-05, "loss": 0.6114, "mean_token_accuracy": 0.8769558131694793, "num_tokens": 29417271.0, "step": 6975 }, { "entropy": 0.29403403401374817, "epoch": 1.6271127170998951, "grad_norm": 4.25, "learning_rate": 1.9928887156585017e-05, "loss": 0.4335, "mean_token_accuracy": 0.9137054145336151, "num_tokens": 29442906.0, "step": 6980 }, { "entropy": 0.23713123053312302, "epoch": 1.628278354120527, "grad_norm": 2.703125, "learning_rate": 1.9928784510935814e-05, "loss": 0.5125, "mean_token_accuracy": 0.919365006685257, "num_tokens": 29454928.0, "step": 6985 }, { "entropy": 0.36967662423849107, "epoch": 1.6294439911411587, "grad_norm": 11.6875, "learning_rate": 1.992868179179102e-05, "loss": 0.9495, "mean_token_accuracy": 0.8575898349285126, "num_tokens": 29462426.0, "step": 6990 }, { "entropy": 0.24759368523955344, "epoch": 1.6306096281617903, "grad_norm": 5.59375, "learning_rate": 1.9928578999152168e-05, "loss": 0.435, "mean_token_accuracy": 0.9145466089248657, "num_tokens": 29481420.0, "step": 6995 }, { "entropy": 0.3167073156684637, "epoch": 1.6317752651824222, "grad_norm": 7.0, "learning_rate": 1.992847613302079e-05, "loss": 0.7354, "mean_token_accuracy": 0.8781661272048951, "num_tokens": 29493112.0, "step": 7000 }, { "entropy": 0.23697575367987156, "epoch": 1.632940902203054, "grad_norm": 3.578125, "learning_rate": 1.992837319339842e-05, "loss": 0.3644, "mean_token_accuracy": 0.9137583911418915, "num_tokens": 29521475.0, "step": 7005 }, { "entropy": 0.28280975893139837, "epoch": 1.6341065392236858, "grad_norm": 0.69921875, "learning_rate": 1.9928270180286594e-05, "loss": 0.4446, "mean_token_accuracy": 0.9044004201889038, "num_tokens": 29544330.0, "step": 7010 }, { "entropy": 0.294763021543622, "epoch": 1.6352721762443174, "grad_norm": 4.6875, "learning_rate": 1.9928167093686848e-05, "loss": 0.45, "mean_token_accuracy": 0.8993681728839874, "num_tokens": 29567647.0, "step": 7015 }, { "entropy": 0.29370351433753966, "epoch": 1.6364378132649493, "grad_norm": 0.78515625, "learning_rate": 1.9928063933600716e-05, "loss": 0.6359, "mean_token_accuracy": 0.893934679031372, "num_tokens": 29579581.0, "step": 7020 }, { "entropy": 0.21342548448592424, "epoch": 1.6376034502855812, "grad_norm": 3.703125, "learning_rate": 1.992796070002974e-05, "loss": 0.4403, "mean_token_accuracy": 0.9128808677196503, "num_tokens": 29601334.0, "step": 7025 }, { "entropy": 0.27890672609210015, "epoch": 1.6387690873062128, "grad_norm": 0.330078125, "learning_rate": 1.9927857392975456e-05, "loss": 0.4488, "mean_token_accuracy": 0.8986436545848846, "num_tokens": 29631924.0, "step": 7030 }, { "entropy": 0.24148422684520482, "epoch": 1.6399347243268445, "grad_norm": 7.25, "learning_rate": 1.9927754012439407e-05, "loss": 0.4239, "mean_token_accuracy": 0.9171118617057801, "num_tokens": 29660277.0, "step": 7035 }, { "entropy": 0.2776141263544559, "epoch": 1.6411003613474764, "grad_norm": 4.96875, "learning_rate": 1.9927650558423136e-05, "loss": 0.524, "mean_token_accuracy": 0.905617070198059, "num_tokens": 29671778.0, "step": 7040 }, { "entropy": 0.19468433856964112, "epoch": 1.6422659983681083, "grad_norm": 1.3046875, "learning_rate": 1.9927547030928182e-05, "loss": 0.3035, "mean_token_accuracy": 0.9407100915908814, "num_tokens": 29688438.0, "step": 7045 }, { "entropy": 0.3036914974451065, "epoch": 1.64343163538874, "grad_norm": 1.4765625, "learning_rate": 1.9927443429956092e-05, "loss": 0.5092, "mean_token_accuracy": 0.8930172383785248, "num_tokens": 29718105.0, "step": 7050 }, { "entropy": 0.270086932182312, "epoch": 1.6445972724093716, "grad_norm": 6.625, "learning_rate": 1.9927339755508406e-05, "loss": 0.5219, "mean_token_accuracy": 0.907888799905777, "num_tokens": 29732103.0, "step": 7055 }, { "entropy": 0.2985291965305805, "epoch": 1.6457629094300035, "grad_norm": 0.76953125, "learning_rate": 1.9927236007586674e-05, "loss": 0.3884, "mean_token_accuracy": 0.9194758594036102, "num_tokens": 29763323.0, "step": 7060 }, { "entropy": 0.24300597086548806, "epoch": 1.6469285464506354, "grad_norm": 0.72265625, "learning_rate": 1.9927132186192443e-05, "loss": 0.5663, "mean_token_accuracy": 0.9005017936229706, "num_tokens": 29783730.0, "step": 7065 }, { "entropy": 0.20479331091046332, "epoch": 1.648094183471267, "grad_norm": 0.5390625, "learning_rate": 1.9927028291327262e-05, "loss": 0.4199, "mean_token_accuracy": 0.9226056575775147, "num_tokens": 29805567.0, "step": 7070 }, { "entropy": 0.24875131323933602, "epoch": 1.6492598204918987, "grad_norm": 0.369140625, "learning_rate": 1.9926924322992677e-05, "loss": 0.4545, "mean_token_accuracy": 0.90796457529068, "num_tokens": 29833232.0, "step": 7075 }, { "entropy": 0.4148341499269009, "epoch": 1.6504254575125306, "grad_norm": 1.2265625, "learning_rate": 1.9926820281190237e-05, "loss": 0.5433, "mean_token_accuracy": 0.8647517114877701, "num_tokens": 29851404.0, "step": 7080 }, { "entropy": 0.32308564633131026, "epoch": 1.6515910945331624, "grad_norm": 6.15625, "learning_rate": 1.99267161659215e-05, "loss": 0.7448, "mean_token_accuracy": 0.8654117107391357, "num_tokens": 29859904.0, "step": 7085 }, { "entropy": 0.22259309757500886, "epoch": 1.652756731553794, "grad_norm": 9.3125, "learning_rate": 1.9926611977188013e-05, "loss": 0.4774, "mean_token_accuracy": 0.9207153677940368, "num_tokens": 29885795.0, "step": 7090 }, { "entropy": 0.20272818468511106, "epoch": 1.6539223685744258, "grad_norm": 0.375, "learning_rate": 1.992650771499133e-05, "loss": 0.4031, "mean_token_accuracy": 0.9296795845031738, "num_tokens": 29906835.0, "step": 7095 }, { "entropy": 0.31789565905928613, "epoch": 1.6550880055950579, "grad_norm": 3.890625, "learning_rate": 1.992640337933301e-05, "loss": 0.4696, "mean_token_accuracy": 0.8921868026256561, "num_tokens": 29929819.0, "step": 7100 }, { "entropy": 0.30611699670553205, "epoch": 1.6562536426156895, "grad_norm": 0.8671875, "learning_rate": 1.9926298970214605e-05, "loss": 0.6593, "mean_token_accuracy": 0.8659330070018768, "num_tokens": 29943553.0, "step": 7105 }, { "entropy": 0.2864277273416519, "epoch": 1.6574192796363212, "grad_norm": 1.375, "learning_rate": 1.992619448763767e-05, "loss": 0.6515, "mean_token_accuracy": 0.8865693151950836, "num_tokens": 29959174.0, "step": 7110 }, { "entropy": 0.3368852075189352, "epoch": 1.658584916656953, "grad_norm": 5.34375, "learning_rate": 1.992608993160377e-05, "loss": 0.661, "mean_token_accuracy": 0.8898778975009918, "num_tokens": 29970033.0, "step": 7115 }, { "entropy": 0.205504653416574, "epoch": 1.659750553677585, "grad_norm": 0.62890625, "learning_rate": 1.9925985302114458e-05, "loss": 0.3969, "mean_token_accuracy": 0.9380353331565857, "num_tokens": 30002128.0, "step": 7120 }, { "entropy": 0.36266718367114664, "epoch": 1.6609161906982166, "grad_norm": 1.6640625, "learning_rate": 1.9925880599171297e-05, "loss": 0.6024, "mean_token_accuracy": 0.8730053007602692, "num_tokens": 30031590.0, "step": 7125 }, { "entropy": 0.4454704590141773, "epoch": 1.6620818277188483, "grad_norm": 3.8125, "learning_rate": 1.9925775822775848e-05, "loss": 0.743, "mean_token_accuracy": 0.8447412520647049, "num_tokens": 30047795.0, "step": 7130 }, { "entropy": 0.24732003286480903, "epoch": 1.6632474647394802, "grad_norm": 2.75, "learning_rate": 1.992567097292967e-05, "loss": 0.4618, "mean_token_accuracy": 0.9004866123199463, "num_tokens": 30060337.0, "step": 7135 }, { "entropy": 0.299695748090744, "epoch": 1.664413101760112, "grad_norm": 0.50390625, "learning_rate": 1.992556604963433e-05, "loss": 0.3696, "mean_token_accuracy": 0.9032521545886993, "num_tokens": 30097236.0, "step": 7140 }, { "entropy": 0.29932774901390075, "epoch": 1.6655787387807437, "grad_norm": 4.5, "learning_rate": 1.9925461052891394e-05, "loss": 0.5304, "mean_token_accuracy": 0.8978306174278259, "num_tokens": 30123892.0, "step": 7145 }, { "entropy": 0.2939157888293266, "epoch": 1.6667443758013754, "grad_norm": 5.28125, "learning_rate": 1.9925355982702422e-05, "loss": 0.485, "mean_token_accuracy": 0.9099433124065399, "num_tokens": 30140095.0, "step": 7150 }, { "entropy": 0.3226566888391972, "epoch": 1.6679100128220072, "grad_norm": 5.71875, "learning_rate": 1.9925250839068985e-05, "loss": 0.6285, "mean_token_accuracy": 0.878937691450119, "num_tokens": 30153066.0, "step": 7155 }, { "entropy": 0.317183431237936, "epoch": 1.6690756498426391, "grad_norm": 0.5625, "learning_rate": 1.992514562199265e-05, "loss": 0.5418, "mean_token_accuracy": 0.8883817255496979, "num_tokens": 30172082.0, "step": 7160 }, { "entropy": 0.36269724033772943, "epoch": 1.6702412868632708, "grad_norm": 4.625, "learning_rate": 1.992504033147499e-05, "loss": 0.6548, "mean_token_accuracy": 0.8761373639106751, "num_tokens": 30194571.0, "step": 7165 }, { "entropy": 0.30474191829562186, "epoch": 1.6714069238839024, "grad_norm": 2.65625, "learning_rate": 1.9924934967517566e-05, "loss": 0.494, "mean_token_accuracy": 0.9052575767040253, "num_tokens": 30205882.0, "step": 7170 }, { "entropy": 0.27710217610001564, "epoch": 1.6725725609045343, "grad_norm": 0.49609375, "learning_rate": 1.9924829530121955e-05, "loss": 0.5382, "mean_token_accuracy": 0.8841014266014099, "num_tokens": 30227634.0, "step": 7175 }, { "entropy": 0.35291581228375435, "epoch": 1.6737381979251662, "grad_norm": 3.5625, "learning_rate": 1.992472401928973e-05, "loss": 0.5598, "mean_token_accuracy": 0.8761431634426117, "num_tokens": 30247877.0, "step": 7180 }, { "entropy": 0.20774665847420692, "epoch": 1.6749038349457979, "grad_norm": 0.4140625, "learning_rate": 1.992461843502246e-05, "loss": 0.1989, "mean_token_accuracy": 0.9413784801959991, "num_tokens": 30284908.0, "step": 7185 }, { "entropy": 0.1597726047039032, "epoch": 1.6760694719664295, "grad_norm": 1.0234375, "learning_rate": 1.9924512777321724e-05, "loss": 0.2075, "mean_token_accuracy": 0.9432705879211426, "num_tokens": 30313340.0, "step": 7190 }, { "entropy": 0.2070239342749119, "epoch": 1.6772351089870614, "grad_norm": 3.625, "learning_rate": 1.9924407046189097e-05, "loss": 0.3967, "mean_token_accuracy": 0.937429141998291, "num_tokens": 30327617.0, "step": 7195 }, { "entropy": 0.4867036297917366, "epoch": 1.6784007460076933, "grad_norm": 5.40625, "learning_rate": 1.992430124162615e-05, "loss": 0.8424, "mean_token_accuracy": 0.8647673785686493, "num_tokens": 30345129.0, "step": 7200 }, { "entropy": 0.3646535977721214, "epoch": 1.679566383028325, "grad_norm": 1.03125, "learning_rate": 1.992419536363447e-05, "loss": 0.6495, "mean_token_accuracy": 0.8817200243473053, "num_tokens": 30361294.0, "step": 7205 }, { "entropy": 0.21611667182296515, "epoch": 1.6807320200489566, "grad_norm": 4.3125, "learning_rate": 1.992408941221563e-05, "loss": 0.5187, "mean_token_accuracy": 0.901872044801712, "num_tokens": 30382658.0, "step": 7210 }, { "entropy": 0.2701580785214901, "epoch": 1.6818976570695885, "grad_norm": 2.28125, "learning_rate": 1.9923983387371207e-05, "loss": 0.4991, "mean_token_accuracy": 0.906497186422348, "num_tokens": 30401438.0, "step": 7215 }, { "entropy": 0.3351647056639194, "epoch": 1.6830632940902204, "grad_norm": 0.5703125, "learning_rate": 1.992387728910279e-05, "loss": 0.5747, "mean_token_accuracy": 0.8796181321144104, "num_tokens": 30421983.0, "step": 7220 }, { "entropy": 0.3249632440507412, "epoch": 1.684228931110852, "grad_norm": 1.15625, "learning_rate": 1.992377111741195e-05, "loss": 0.6371, "mean_token_accuracy": 0.8933514654636383, "num_tokens": 30433163.0, "step": 7225 }, { "entropy": 0.15651183463633062, "epoch": 1.6853945681314837, "grad_norm": 1.265625, "learning_rate": 1.9923664872300284e-05, "loss": 0.265, "mean_token_accuracy": 0.9518414855003356, "num_tokens": 30461301.0, "step": 7230 }, { "entropy": 0.29888438284397123, "epoch": 1.6865602051521156, "grad_norm": 4.59375, "learning_rate": 1.9923558553769363e-05, "loss": 0.4514, "mean_token_accuracy": 0.8969875752925873, "num_tokens": 30490325.0, "step": 7235 }, { "entropy": 0.2710587065666914, "epoch": 1.6877258421727475, "grad_norm": 1.9453125, "learning_rate": 1.9923452161820785e-05, "loss": 0.431, "mean_token_accuracy": 0.9071939885616302, "num_tokens": 30507491.0, "step": 7240 }, { "entropy": 0.2726780742406845, "epoch": 1.6888914791933791, "grad_norm": 6.0, "learning_rate": 1.9923345696456126e-05, "loss": 0.7774, "mean_token_accuracy": 0.8846882879734039, "num_tokens": 30517367.0, "step": 7245 }, { "entropy": 0.31769172176718713, "epoch": 1.690057116214011, "grad_norm": 2.875, "learning_rate": 1.9923239157676978e-05, "loss": 0.5541, "mean_token_accuracy": 0.898011964559555, "num_tokens": 30538251.0, "step": 7250 }, { "entropy": 0.40227584093809127, "epoch": 1.6912227532346429, "grad_norm": 3.71875, "learning_rate": 1.992313254548493e-05, "loss": 0.7193, "mean_token_accuracy": 0.8725230753421783, "num_tokens": 30556048.0, "step": 7255 }, { "entropy": 0.2761504050344229, "epoch": 1.6923883902552745, "grad_norm": 0.6875, "learning_rate": 1.9923025859881567e-05, "loss": 0.6691, "mean_token_accuracy": 0.8683241248130799, "num_tokens": 30576298.0, "step": 7260 }, { "entropy": 0.27000463437289, "epoch": 1.6935540272759062, "grad_norm": 3.09375, "learning_rate": 1.992291910086849e-05, "loss": 0.4296, "mean_token_accuracy": 0.909073394536972, "num_tokens": 30603726.0, "step": 7265 }, { "entropy": 0.2889959217980504, "epoch": 1.694719664296538, "grad_norm": 4.59375, "learning_rate": 1.9922812268447283e-05, "loss": 0.4767, "mean_token_accuracy": 0.9106442332267761, "num_tokens": 30641205.0, "step": 7270 }, { "entropy": 0.2706956649199128, "epoch": 1.69588530131717, "grad_norm": 1.8359375, "learning_rate": 1.992270536261954e-05, "loss": 0.471, "mean_token_accuracy": 0.8838472962379456, "num_tokens": 30671221.0, "step": 7275 }, { "entropy": 0.2506210308521986, "epoch": 1.6970509383378016, "grad_norm": 1.7109375, "learning_rate": 1.9922598383386854e-05, "loss": 0.2305, "mean_token_accuracy": 0.8959549248218537, "num_tokens": 30697512.0, "step": 7280 }, { "entropy": 0.2710033968091011, "epoch": 1.6982165753584333, "grad_norm": 9.5, "learning_rate": 1.9922491330750824e-05, "loss": 0.6513, "mean_token_accuracy": 0.8868758141994476, "num_tokens": 30714445.0, "step": 7285 }, { "entropy": 0.2399160273373127, "epoch": 1.6993822123790652, "grad_norm": 2.890625, "learning_rate": 1.9922384204713044e-05, "loss": 0.4042, "mean_token_accuracy": 0.9155187606811523, "num_tokens": 30732133.0, "step": 7290 }, { "entropy": 0.23681249283254147, "epoch": 1.700547849399697, "grad_norm": 0.4453125, "learning_rate": 1.992227700527511e-05, "loss": 0.2615, "mean_token_accuracy": 0.9078514516353607, "num_tokens": 30763956.0, "step": 7295 }, { "entropy": 0.15826738066971302, "epoch": 1.7017134864203287, "grad_norm": 0.46484375, "learning_rate": 1.9922169732438624e-05, "loss": 0.2281, "mean_token_accuracy": 0.9464801192283631, "num_tokens": 30803369.0, "step": 7300 }, { "entropy": 0.3240881063044071, "epoch": 1.7028791234409604, "grad_norm": 0.59765625, "learning_rate": 1.9922062386205187e-05, "loss": 0.4112, "mean_token_accuracy": 0.9000102639198303, "num_tokens": 30823285.0, "step": 7305 }, { "entropy": 0.1817174531519413, "epoch": 1.7040447604615923, "grad_norm": 0.7734375, "learning_rate": 1.9921954966576392e-05, "loss": 0.2731, "mean_token_accuracy": 0.9324170172214508, "num_tokens": 30843868.0, "step": 7310 }, { "entropy": 0.26547779366374014, "epoch": 1.7052103974822241, "grad_norm": 4.3125, "learning_rate": 1.992184747355385e-05, "loss": 0.563, "mean_token_accuracy": 0.8946157336235047, "num_tokens": 30857862.0, "step": 7315 }, { "entropy": 0.19620814472436904, "epoch": 1.7063760345028558, "grad_norm": 2.46875, "learning_rate": 1.9921739907139153e-05, "loss": 0.3691, "mean_token_accuracy": 0.9361901104450225, "num_tokens": 30870783.0, "step": 7320 }, { "entropy": 0.21691591143608094, "epoch": 1.7075416715234875, "grad_norm": 1.734375, "learning_rate": 1.9921632267333915e-05, "loss": 0.5116, "mean_token_accuracy": 0.914357042312622, "num_tokens": 30883953.0, "step": 7325 }, { "entropy": 0.2352630764245987, "epoch": 1.7087073085441193, "grad_norm": 4.15625, "learning_rate": 1.9921524554139736e-05, "loss": 0.4476, "mean_token_accuracy": 0.916910320520401, "num_tokens": 30899288.0, "step": 7330 }, { "entropy": 0.15292470753192902, "epoch": 1.7098729455647512, "grad_norm": 0.423828125, "learning_rate": 1.9921416767558227e-05, "loss": 0.2157, "mean_token_accuracy": 0.945728212594986, "num_tokens": 30931355.0, "step": 7335 }, { "entropy": 0.2981669930741191, "epoch": 1.7110385825853829, "grad_norm": 1.6640625, "learning_rate": 1.992130890759099e-05, "loss": 0.5402, "mean_token_accuracy": 0.8897511839866639, "num_tokens": 30950440.0, "step": 7340 }, { "entropy": 0.23863260447978973, "epoch": 1.7122042196060145, "grad_norm": 5.53125, "learning_rate": 1.9921200974239632e-05, "loss": 0.5498, "mean_token_accuracy": 0.9120193362236023, "num_tokens": 30962258.0, "step": 7345 }, { "entropy": 0.19053780883550644, "epoch": 1.7133698566266464, "grad_norm": 0.7265625, "learning_rate": 1.992109296750577e-05, "loss": 0.2194, "mean_token_accuracy": 0.934450340270996, "num_tokens": 30997839.0, "step": 7350 }, { "entropy": 0.18329034112393855, "epoch": 1.7145354936472783, "grad_norm": 2.59375, "learning_rate": 1.9920984887391005e-05, "loss": 0.4037, "mean_token_accuracy": 0.9329216539859772, "num_tokens": 31025334.0, "step": 7355 }, { "entropy": 0.19702217131853103, "epoch": 1.71570113066791, "grad_norm": 3.234375, "learning_rate": 1.9920876733896957e-05, "loss": 0.3438, "mean_token_accuracy": 0.9348407089710236, "num_tokens": 31050064.0, "step": 7360 }, { "entropy": 0.320951434969902, "epoch": 1.7168667676885416, "grad_norm": 3.3125, "learning_rate": 1.9920768507025235e-05, "loss": 0.5197, "mean_token_accuracy": 0.8957397937774658, "num_tokens": 31060139.0, "step": 7365 }, { "entropy": 0.19964747801423072, "epoch": 1.7180324047091735, "grad_norm": 3.953125, "learning_rate": 1.9920660206777457e-05, "loss": 0.3856, "mean_token_accuracy": 0.9334841907024384, "num_tokens": 31072742.0, "step": 7370 }, { "entropy": 0.17681464105844497, "epoch": 1.7191980417298054, "grad_norm": 0.435546875, "learning_rate": 1.992055183315523e-05, "loss": 0.211, "mean_token_accuracy": 0.9506916999816895, "num_tokens": 31104957.0, "step": 7375 }, { "entropy": 0.22340110838413238, "epoch": 1.720363678750437, "grad_norm": 6.9375, "learning_rate": 1.9920443386160177e-05, "loss": 0.4826, "mean_token_accuracy": 0.9081883072853089, "num_tokens": 31123653.0, "step": 7380 }, { "entropy": 0.18643004707992078, "epoch": 1.721529315771069, "grad_norm": 0.65234375, "learning_rate": 1.992033486579391e-05, "loss": 0.2215, "mean_token_accuracy": 0.9491021037101746, "num_tokens": 31157290.0, "step": 7385 }, { "entropy": 0.2977891772985458, "epoch": 1.7226949527917008, "grad_norm": 4.625, "learning_rate": 1.992022627205805e-05, "loss": 0.6306, "mean_token_accuracy": 0.8962555050849914, "num_tokens": 31167243.0, "step": 7390 }, { "entropy": 0.1719819199293852, "epoch": 1.7238605898123325, "grad_norm": 0.8203125, "learning_rate": 1.9920117604954213e-05, "loss": 0.1391, "mean_token_accuracy": 0.9344309747219086, "num_tokens": 31205604.0, "step": 7395 }, { "entropy": 0.2909649141132832, "epoch": 1.7250262268329641, "grad_norm": 5.40625, "learning_rate": 1.9920008864484026e-05, "loss": 0.4333, "mean_token_accuracy": 0.9086243093013764, "num_tokens": 31219789.0, "step": 7400 }, { "entropy": 0.31774648912250997, "epoch": 1.726191863853596, "grad_norm": 0.9140625, "learning_rate": 1.9919900050649106e-05, "loss": 0.6789, "mean_token_accuracy": 0.8734631478786469, "num_tokens": 31234682.0, "step": 7405 }, { "entropy": 0.2277542307972908, "epoch": 1.727357500874228, "grad_norm": 0.609375, "learning_rate": 1.9919791163451078e-05, "loss": 0.349, "mean_token_accuracy": 0.9085290551185607, "num_tokens": 31252277.0, "step": 7410 }, { "entropy": 0.23079803325235843, "epoch": 1.7285231378948596, "grad_norm": 5.0, "learning_rate": 1.991968220289156e-05, "loss": 0.3643, "mean_token_accuracy": 0.9307330787181854, "num_tokens": 31275731.0, "step": 7415 }, { "entropy": 0.3496759317815304, "epoch": 1.7296887749154912, "grad_norm": 6.5625, "learning_rate": 1.991957316897218e-05, "loss": 0.7099, "mean_token_accuracy": 0.8756554901599884, "num_tokens": 31296718.0, "step": 7420 }, { "entropy": 0.32018161565065384, "epoch": 1.730854411936123, "grad_norm": 0.83984375, "learning_rate": 1.9919464061694573e-05, "loss": 0.513, "mean_token_accuracy": 0.8961473703384399, "num_tokens": 31309746.0, "step": 7425 }, { "entropy": 0.2859540117904544, "epoch": 1.732020048956755, "grad_norm": 2.78125, "learning_rate": 1.991935488106035e-05, "loss": 0.6537, "mean_token_accuracy": 0.887366658449173, "num_tokens": 31327616.0, "step": 7430 }, { "entropy": 0.31319847926497457, "epoch": 1.7331856859773866, "grad_norm": 1.53125, "learning_rate": 1.991924562707115e-05, "loss": 0.5387, "mean_token_accuracy": 0.8870364725589752, "num_tokens": 31349824.0, "step": 7435 }, { "entropy": 0.3842922620475292, "epoch": 1.7343513229980183, "grad_norm": 0.490234375, "learning_rate": 1.9919136299728597e-05, "loss": 0.4692, "mean_token_accuracy": 0.8711851298809051, "num_tokens": 31373296.0, "step": 7440 }, { "entropy": 0.24882024489343166, "epoch": 1.7355169600186502, "grad_norm": 4.25, "learning_rate": 1.991902689903432e-05, "loss": 0.4052, "mean_token_accuracy": 0.9191643178462983, "num_tokens": 31396449.0, "step": 7445 }, { "entropy": 0.299553207680583, "epoch": 1.736682597039282, "grad_norm": 3.921875, "learning_rate": 1.991891742498996e-05, "loss": 0.454, "mean_token_accuracy": 0.9090847849845887, "num_tokens": 31416553.0, "step": 7450 }, { "entropy": 0.22123729214072227, "epoch": 1.7378482340599137, "grad_norm": 5.78125, "learning_rate": 1.9918807877597137e-05, "loss": 0.3916, "mean_token_accuracy": 0.9285433471202851, "num_tokens": 31431941.0, "step": 7455 }, { "entropy": 0.2913926810026169, "epoch": 1.7390138710805454, "grad_norm": 2.265625, "learning_rate": 1.991869825685749e-05, "loss": 0.6338, "mean_token_accuracy": 0.8911004424095154, "num_tokens": 31441855.0, "step": 7460 }, { "entropy": 0.29126920998096467, "epoch": 1.7401795081011773, "grad_norm": 5.125, "learning_rate": 1.9918588562772658e-05, "loss": 0.4673, "mean_token_accuracy": 0.9109508275985718, "num_tokens": 31452820.0, "step": 7465 }, { "entropy": 0.23168888092041015, "epoch": 1.7413451451218092, "grad_norm": 3.203125, "learning_rate": 1.991847879534427e-05, "loss": 0.4493, "mean_token_accuracy": 0.9215346693992614, "num_tokens": 31465636.0, "step": 7470 }, { "entropy": 0.2087232932448387, "epoch": 1.7425107821424408, "grad_norm": 1.328125, "learning_rate": 1.991836895457397e-05, "loss": 0.36, "mean_token_accuracy": 0.9289436519145966, "num_tokens": 31481900.0, "step": 7475 }, { "entropy": 0.27741892635822296, "epoch": 1.7436764191630725, "grad_norm": 4.0625, "learning_rate": 1.9918259040463387e-05, "loss": 0.5337, "mean_token_accuracy": 0.9090081572532653, "num_tokens": 31493712.0, "step": 7480 }, { "entropy": 0.5755782432854175, "epoch": 1.7448420561837044, "grad_norm": 1.359375, "learning_rate": 1.9918149053014165e-05, "loss": 1.1664, "mean_token_accuracy": 0.8490794718265533, "num_tokens": 31521846.0, "step": 7485 }, { "entropy": 0.26485144533216953, "epoch": 1.7460076932043362, "grad_norm": 8.6875, "learning_rate": 1.9918038992227942e-05, "loss": 0.5802, "mean_token_accuracy": 0.9102464139461517, "num_tokens": 31535230.0, "step": 7490 }, { "entropy": 0.32285096058622004, "epoch": 1.747173330224968, "grad_norm": 6.6875, "learning_rate": 1.9917928858106363e-05, "loss": 0.5593, "mean_token_accuracy": 0.8858695566654206, "num_tokens": 31560034.0, "step": 7495 }, { "entropy": 0.312009833753109, "epoch": 1.7483389672455996, "grad_norm": 5.0625, "learning_rate": 1.9917818650651062e-05, "loss": 0.5231, "mean_token_accuracy": 0.8962408840656281, "num_tokens": 31572987.0, "step": 7500 }, { "entropy": 0.1852190401405096, "epoch": 1.7495046042662314, "grad_norm": 2.53125, "learning_rate": 1.9917708369863695e-05, "loss": 0.3109, "mean_token_accuracy": 0.9369418799877167, "num_tokens": 31593919.0, "step": 7505 }, { "entropy": 0.28790936544537543, "epoch": 1.7506702412868633, "grad_norm": 10.25, "learning_rate": 1.9917598015745897e-05, "loss": 0.4879, "mean_token_accuracy": 0.8942620098590851, "num_tokens": 31614322.0, "step": 7510 }, { "entropy": 0.2300134082324803, "epoch": 1.751835878307495, "grad_norm": 0.56640625, "learning_rate": 1.9917487588299315e-05, "loss": 0.4391, "mean_token_accuracy": 0.9022059321403504, "num_tokens": 31645430.0, "step": 7515 }, { "entropy": 0.2961542010307312, "epoch": 1.7530015153281269, "grad_norm": 4.1875, "learning_rate": 1.99173770875256e-05, "loss": 0.3211, "mean_token_accuracy": 0.9309226989746093, "num_tokens": 31668774.0, "step": 7520 }, { "entropy": 0.29538979530334475, "epoch": 1.7541671523487588, "grad_norm": 0.94140625, "learning_rate": 1.9917266513426395e-05, "loss": 0.4159, "mean_token_accuracy": 0.9046856939792634, "num_tokens": 31690334.0, "step": 7525 }, { "entropy": 0.26101839244365693, "epoch": 1.7553327893693904, "grad_norm": 3.71875, "learning_rate": 1.9917155866003348e-05, "loss": 0.4387, "mean_token_accuracy": 0.919017630815506, "num_tokens": 31713885.0, "step": 7530 }, { "entropy": 0.2481499405577779, "epoch": 1.756498426390022, "grad_norm": 1.1796875, "learning_rate": 1.9917045145258113e-05, "loss": 0.3334, "mean_token_accuracy": 0.9115027070045472, "num_tokens": 31738068.0, "step": 7535 }, { "entropy": 0.24259122014045714, "epoch": 1.757664063410654, "grad_norm": 1.2890625, "learning_rate": 1.9916934351192337e-05, "loss": 0.4097, "mean_token_accuracy": 0.9250048696994781, "num_tokens": 31751725.0, "step": 7540 }, { "entropy": 0.2630830302834511, "epoch": 1.7588297004312858, "grad_norm": 6.03125, "learning_rate": 1.9916823483807677e-05, "loss": 0.6452, "mean_token_accuracy": 0.8848126351833343, "num_tokens": 31766007.0, "step": 7545 }, { "entropy": 0.29085163548588755, "epoch": 1.7599953374519175, "grad_norm": 7.65625, "learning_rate": 1.9916712543105784e-05, "loss": 0.591, "mean_token_accuracy": 0.8972097933292389, "num_tokens": 31777744.0, "step": 7550 }, { "entropy": 0.21113423565402628, "epoch": 1.7611609744725492, "grad_norm": 4.65625, "learning_rate": 1.991660152908831e-05, "loss": 0.3933, "mean_token_accuracy": 0.919322258234024, "num_tokens": 31808311.0, "step": 7555 }, { "entropy": 0.2468874402344227, "epoch": 1.762326611493181, "grad_norm": 3.3125, "learning_rate": 1.9916490441756916e-05, "loss": 0.3811, "mean_token_accuracy": 0.9185133218765259, "num_tokens": 31829676.0, "step": 7560 }, { "entropy": 0.22626171559095382, "epoch": 1.763492248513813, "grad_norm": 6.3125, "learning_rate": 1.991637928111325e-05, "loss": 0.4106, "mean_token_accuracy": 0.921566492319107, "num_tokens": 31851995.0, "step": 7565 }, { "entropy": 0.33512529246509076, "epoch": 1.7646578855344446, "grad_norm": 7.46875, "learning_rate": 1.9916268047158976e-05, "loss": 0.6447, "mean_token_accuracy": 0.8610825300216675, "num_tokens": 31876760.0, "step": 7570 }, { "entropy": 0.3237742781639099, "epoch": 1.7658235225550762, "grad_norm": 0.6328125, "learning_rate": 1.9916156739895748e-05, "loss": 0.555, "mean_token_accuracy": 0.885119891166687, "num_tokens": 31900782.0, "step": 7575 }, { "entropy": 0.2589297104626894, "epoch": 1.7669891595757081, "grad_norm": 0.8828125, "learning_rate": 1.9916045359325234e-05, "loss": 0.3046, "mean_token_accuracy": 0.9248237907886505, "num_tokens": 31933210.0, "step": 7580 }, { "entropy": 0.18404842913150787, "epoch": 1.76815479659634, "grad_norm": 0.8125, "learning_rate": 1.9915933905449087e-05, "loss": 0.1783, "mean_token_accuracy": 0.9453945815563202, "num_tokens": 31967439.0, "step": 7585 }, { "entropy": 0.14019958060234786, "epoch": 1.7693204336169717, "grad_norm": 2.015625, "learning_rate": 1.9915822378268973e-05, "loss": 0.2196, "mean_token_accuracy": 0.9397667944431305, "num_tokens": 31994838.0, "step": 7590 }, { "entropy": 0.24855429008603097, "epoch": 1.7704860706376033, "grad_norm": 1.9375, "learning_rate": 1.991571077778655e-05, "loss": 0.5965, "mean_token_accuracy": 0.8934241354465484, "num_tokens": 32009912.0, "step": 7595 }, { "entropy": 0.18048623353242874, "epoch": 1.7716517076582352, "grad_norm": 2.671875, "learning_rate": 1.9915599104003486e-05, "loss": 0.171, "mean_token_accuracy": 0.9239882111549378, "num_tokens": 32039346.0, "step": 7600 }, { "entropy": 0.20122657679021358, "epoch": 1.772817344678867, "grad_norm": 0.65234375, "learning_rate": 1.991548735692145e-05, "loss": 0.3101, "mean_token_accuracy": 0.9241252362728118, "num_tokens": 32068148.0, "step": 7605 }, { "entropy": 0.24722633212804795, "epoch": 1.7739829816994988, "grad_norm": 2.5625, "learning_rate": 1.99153755365421e-05, "loss": 0.4331, "mean_token_accuracy": 0.9065563023090363, "num_tokens": 32088079.0, "step": 7610 }, { "entropy": 0.2641620749607682, "epoch": 1.7751486187201304, "grad_norm": 1.1875, "learning_rate": 1.991526364286711e-05, "loss": 0.5494, "mean_token_accuracy": 0.8998633921146393, "num_tokens": 32112963.0, "step": 7615 }, { "entropy": 0.28498165756464006, "epoch": 1.7763142557407623, "grad_norm": 4.0, "learning_rate": 1.9915151675898144e-05, "loss": 0.6748, "mean_token_accuracy": 0.8952282905578614, "num_tokens": 32123851.0, "step": 7620 }, { "entropy": 0.33082397319376466, "epoch": 1.7774798927613942, "grad_norm": 5.75, "learning_rate": 1.9915039635636876e-05, "loss": 0.6175, "mean_token_accuracy": 0.8862141251564026, "num_tokens": 32138905.0, "step": 7625 }, { "entropy": 0.2725643038749695, "epoch": 1.7786455297820258, "grad_norm": 2.9375, "learning_rate": 1.9914927522084975e-05, "loss": 0.4621, "mean_token_accuracy": 0.9120152533054352, "num_tokens": 32159510.0, "step": 7630 }, { "entropy": 0.21518983766436578, "epoch": 1.7798111668026575, "grad_norm": 6.71875, "learning_rate": 1.991481533524411e-05, "loss": 0.3764, "mean_token_accuracy": 0.9207785665988922, "num_tokens": 32180441.0, "step": 7635 }, { "entropy": 0.2944613240659237, "epoch": 1.7809768038232894, "grad_norm": 5.4375, "learning_rate": 1.991470307511596e-05, "loss": 0.5341, "mean_token_accuracy": 0.8910565853118897, "num_tokens": 32195329.0, "step": 7640 }, { "entropy": 0.37638199105858805, "epoch": 1.7821424408439213, "grad_norm": 10.75, "learning_rate": 1.9914590741702188e-05, "loss": 0.818, "mean_token_accuracy": 0.8501861274242402, "num_tokens": 32212617.0, "step": 7645 }, { "entropy": 0.20262998938560486, "epoch": 1.783308077864553, "grad_norm": 0.94140625, "learning_rate": 1.9914478335004482e-05, "loss": 0.2963, "mean_token_accuracy": 0.9370038688182831, "num_tokens": 32233596.0, "step": 7650 }, { "entropy": 0.24313185326755046, "epoch": 1.7844737148851848, "grad_norm": 1.2265625, "learning_rate": 1.9914365855024508e-05, "loss": 0.3603, "mean_token_accuracy": 0.9154565274715424, "num_tokens": 32252957.0, "step": 7655 }, { "entropy": 0.3318315625190735, "epoch": 1.7856393519058167, "grad_norm": 4.96875, "learning_rate": 1.9914253301763953e-05, "loss": 0.6415, "mean_token_accuracy": 0.8702471375465393, "num_tokens": 32267022.0, "step": 7660 }, { "entropy": 0.2816934622824192, "epoch": 1.7868049889264483, "grad_norm": 0.8984375, "learning_rate": 1.9914140675224483e-05, "loss": 0.4164, "mean_token_accuracy": 0.9272620856761933, "num_tokens": 32280466.0, "step": 7665 }, { "entropy": 0.22036347948014737, "epoch": 1.78797062594708, "grad_norm": 1.140625, "learning_rate": 1.991402797540779e-05, "loss": 0.4315, "mean_token_accuracy": 0.9199424505233764, "num_tokens": 32294548.0, "step": 7670 }, { "entropy": 0.34325801730155947, "epoch": 1.7891362629677119, "grad_norm": 4.8125, "learning_rate": 1.9913915202315544e-05, "loss": 0.5674, "mean_token_accuracy": 0.874156790971756, "num_tokens": 32308005.0, "step": 7675 }, { "entropy": 0.22082845419645308, "epoch": 1.7903018999883438, "grad_norm": 1.53125, "learning_rate": 1.9913802355949436e-05, "loss": 0.4455, "mean_token_accuracy": 0.9140099585056305, "num_tokens": 32323466.0, "step": 7680 }, { "entropy": 0.2676783286035061, "epoch": 1.7914675370089754, "grad_norm": 5.3125, "learning_rate": 1.9913689436311142e-05, "loss": 0.5678, "mean_token_accuracy": 0.8990582466125489, "num_tokens": 32336418.0, "step": 7685 }, { "entropy": 0.38137408494949343, "epoch": 1.792633174029607, "grad_norm": 5.71875, "learning_rate": 1.991357644340235e-05, "loss": 0.7375, "mean_token_accuracy": 0.8788779199123382, "num_tokens": 32344359.0, "step": 7690 }, { "entropy": 0.3954359903931618, "epoch": 1.793798811050239, "grad_norm": 9.75, "learning_rate": 1.9913463377224738e-05, "loss": 0.7942, "mean_token_accuracy": 0.8664931178092956, "num_tokens": 32354899.0, "step": 7695 }, { "entropy": 0.27664031460881233, "epoch": 1.7949644480708709, "grad_norm": 1.015625, "learning_rate": 1.9913350237780002e-05, "loss": 0.3941, "mean_token_accuracy": 0.9219299912452698, "num_tokens": 32367076.0, "step": 7700 }, { "entropy": 0.17025743499398233, "epoch": 1.7961300850915025, "grad_norm": 7.0625, "learning_rate": 1.991323702506982e-05, "loss": 0.2769, "mean_token_accuracy": 0.9408093333244324, "num_tokens": 32397739.0, "step": 7705 }, { "entropy": 0.3191656589508057, "epoch": 1.7972957221121342, "grad_norm": 3.640625, "learning_rate": 1.9913123739095885e-05, "loss": 0.6224, "mean_token_accuracy": 0.8828663647174835, "num_tokens": 32408754.0, "step": 7710 }, { "entropy": 0.266577872633934, "epoch": 1.798461359132766, "grad_norm": 2.359375, "learning_rate": 1.9913010379859885e-05, "loss": 0.4473, "mean_token_accuracy": 0.907596331834793, "num_tokens": 32426950.0, "step": 7715 }, { "entropy": 0.2586742855608463, "epoch": 1.799626996153398, "grad_norm": 8.0625, "learning_rate": 1.991289694736351e-05, "loss": 0.4857, "mean_token_accuracy": 0.9078260838985444, "num_tokens": 32442630.0, "step": 7720 }, { "entropy": 0.21624571941792964, "epoch": 1.8007926331740296, "grad_norm": 3.484375, "learning_rate": 1.9912783441608457e-05, "loss": 0.2872, "mean_token_accuracy": 0.9356445431709289, "num_tokens": 32471635.0, "step": 7725 }, { "entropy": 0.27893735766410827, "epoch": 1.8019582701946613, "grad_norm": 6.03125, "learning_rate": 1.9912669862596407e-05, "loss": 0.5273, "mean_token_accuracy": 0.9174108982086182, "num_tokens": 32485082.0, "step": 7730 }, { "entropy": 0.28767716884613037, "epoch": 1.8031239072152931, "grad_norm": 6.34375, "learning_rate": 1.9912556210329065e-05, "loss": 0.61, "mean_token_accuracy": 0.8947061181068421, "num_tokens": 32496872.0, "step": 7735 }, { "entropy": 0.2328474037349224, "epoch": 1.804289544235925, "grad_norm": 2.859375, "learning_rate": 1.9912442484808117e-05, "loss": 0.2921, "mean_token_accuracy": 0.9219469904899598, "num_tokens": 32528904.0, "step": 7740 }, { "entropy": 0.23528299368917943, "epoch": 1.8054551812565567, "grad_norm": 0.828125, "learning_rate": 1.9912328686035266e-05, "loss": 0.2192, "mean_token_accuracy": 0.9276223361492157, "num_tokens": 32554343.0, "step": 7745 }, { "entropy": 0.2867948904633522, "epoch": 1.8066208182771883, "grad_norm": 1.9921875, "learning_rate": 1.991221481401221e-05, "loss": 0.4437, "mean_token_accuracy": 0.9267707407474518, "num_tokens": 32563606.0, "step": 7750 }, { "entropy": 0.3321516253054142, "epoch": 1.8077864552978202, "grad_norm": 9.25, "learning_rate": 1.9912100868740635e-05, "loss": 0.507, "mean_token_accuracy": 0.8905054688453674, "num_tokens": 32592476.0, "step": 7755 }, { "entropy": 0.2881818190217018, "epoch": 1.808952092318452, "grad_norm": 4.09375, "learning_rate": 1.991198685022225e-05, "loss": 0.527, "mean_token_accuracy": 0.8902274250984192, "num_tokens": 32610498.0, "step": 7760 }, { "entropy": 0.2766866091638803, "epoch": 1.8101177293390838, "grad_norm": 0.5859375, "learning_rate": 1.991187275845875e-05, "loss": 0.3293, "mean_token_accuracy": 0.9011591911315918, "num_tokens": 32636296.0, "step": 7765 }, { "entropy": 0.2094460479915142, "epoch": 1.8112833663597154, "grad_norm": 0.314453125, "learning_rate": 1.9911758593451845e-05, "loss": 0.3593, "mean_token_accuracy": 0.9299488604068756, "num_tokens": 32671123.0, "step": 7770 }, { "entropy": 0.3513875052332878, "epoch": 1.8124490033803473, "grad_norm": 5.28125, "learning_rate": 1.991164435520323e-05, "loss": 0.677, "mean_token_accuracy": 0.8863472640514374, "num_tokens": 32679984.0, "step": 7775 }, { "entropy": 0.20216977708041667, "epoch": 1.8136146404009792, "grad_norm": 4.09375, "learning_rate": 1.9911530043714607e-05, "loss": 0.4019, "mean_token_accuracy": 0.9245426237583161, "num_tokens": 32696537.0, "step": 7780 }, { "entropy": 0.2335958130657673, "epoch": 1.8147802774216109, "grad_norm": 5.8125, "learning_rate": 1.991141565898769e-05, "loss": 0.4809, "mean_token_accuracy": 0.9251964628696442, "num_tokens": 32708173.0, "step": 7785 }, { "entropy": 0.21332778688520193, "epoch": 1.8159459144422427, "grad_norm": 5.3125, "learning_rate": 1.9911301201024174e-05, "loss": 0.4063, "mean_token_accuracy": 0.9151061058044434, "num_tokens": 32736539.0, "step": 7790 }, { "entropy": 0.27434416934847833, "epoch": 1.8171115514628746, "grad_norm": 7.65625, "learning_rate": 1.991118666982577e-05, "loss": 0.4694, "mean_token_accuracy": 0.9003516376018524, "num_tokens": 32762365.0, "step": 7795 }, { "entropy": 0.23991770818829536, "epoch": 1.8182771884835063, "grad_norm": 0.5703125, "learning_rate": 1.9911072065394186e-05, "loss": 0.5031, "mean_token_accuracy": 0.9172163844108582, "num_tokens": 32777557.0, "step": 7800 }, { "entropy": 0.2554063454270363, "epoch": 1.819442825504138, "grad_norm": 0.7734375, "learning_rate": 1.9910957387731133e-05, "loss": 0.5378, "mean_token_accuracy": 0.9102888762950897, "num_tokens": 32791647.0, "step": 7805 }, { "entropy": 0.21996467523276805, "epoch": 1.8206084625247698, "grad_norm": 5.4375, "learning_rate": 1.9910842636838317e-05, "loss": 0.3305, "mean_token_accuracy": 0.9108978271484375, "num_tokens": 32820950.0, "step": 7810 }, { "entropy": 0.16689216084778308, "epoch": 1.8217740995454017, "grad_norm": 0.470703125, "learning_rate": 1.9910727812717453e-05, "loss": 0.2346, "mean_token_accuracy": 0.9388123393058777, "num_tokens": 32852381.0, "step": 7815 }, { "entropy": 0.20206578820943832, "epoch": 1.8229397365660334, "grad_norm": 3.171875, "learning_rate": 1.9910612915370246e-05, "loss": 0.2481, "mean_token_accuracy": 0.9353121876716614, "num_tokens": 32880441.0, "step": 7820 }, { "entropy": 0.17326834462583066, "epoch": 1.824105373586665, "grad_norm": 0.447265625, "learning_rate": 1.991049794479842e-05, "loss": 0.2222, "mean_token_accuracy": 0.943918788433075, "num_tokens": 32906058.0, "step": 7825 }, { "entropy": 0.24531815834343434, "epoch": 1.825271010607297, "grad_norm": 6.09375, "learning_rate": 1.991038290100368e-05, "loss": 0.3671, "mean_token_accuracy": 0.9264979779720306, "num_tokens": 32925098.0, "step": 7830 }, { "entropy": 0.15863981209695338, "epoch": 1.8264366476279288, "grad_norm": 0.97265625, "learning_rate": 1.9910267783987747e-05, "loss": 0.2095, "mean_token_accuracy": 0.9451212227344513, "num_tokens": 32950290.0, "step": 7835 }, { "entropy": 0.30518715754151343, "epoch": 1.8276022846485604, "grad_norm": 5.78125, "learning_rate": 1.9910152593752335e-05, "loss": 0.6845, "mean_token_accuracy": 0.8782230734825134, "num_tokens": 32964196.0, "step": 7840 }, { "entropy": 0.23406622558832169, "epoch": 1.828767921669192, "grad_norm": 0.412109375, "learning_rate": 1.9910037330299165e-05, "loss": 0.4499, "mean_token_accuracy": 0.9033817291259766, "num_tokens": 32989094.0, "step": 7845 }, { "entropy": 0.3372978284955025, "epoch": 1.829933558689824, "grad_norm": 6.96875, "learning_rate": 1.9909921993629954e-05, "loss": 0.5547, "mean_token_accuracy": 0.876672887802124, "num_tokens": 32998403.0, "step": 7850 }, { "entropy": 0.28051941096782684, "epoch": 1.8310991957104559, "grad_norm": 7.6875, "learning_rate": 1.9909806583746417e-05, "loss": 0.6219, "mean_token_accuracy": 0.9012742102146148, "num_tokens": 33007291.0, "step": 7855 }, { "entropy": 0.41001638807356355, "epoch": 1.8322648327310875, "grad_norm": 0.703125, "learning_rate": 1.990969110065028e-05, "loss": 0.4781, "mean_token_accuracy": 0.8795218944549561, "num_tokens": 33040644.0, "step": 7860 }, { "entropy": 0.2866490498185158, "epoch": 1.8334304697517192, "grad_norm": 2.109375, "learning_rate": 1.9909575544343265e-05, "loss": 0.7216, "mean_token_accuracy": 0.8838229179382324, "num_tokens": 33059451.0, "step": 7865 }, { "entropy": 0.3855082355439663, "epoch": 1.834596106772351, "grad_norm": 0.474609375, "learning_rate": 1.9909459914827094e-05, "loss": 0.6194, "mean_token_accuracy": 0.8708645105361938, "num_tokens": 33094571.0, "step": 7870 }, { "entropy": 0.33956164717674253, "epoch": 1.835761743792983, "grad_norm": 16.75, "learning_rate": 1.990934421210349e-05, "loss": 0.9041, "mean_token_accuracy": 0.8637166440486908, "num_tokens": 33102764.0, "step": 7875 }, { "entropy": 0.21683834567666055, "epoch": 1.8369273808136146, "grad_norm": 5.65625, "learning_rate": 1.9909228436174184e-05, "loss": 0.4211, "mean_token_accuracy": 0.9251314640045166, "num_tokens": 33128588.0, "step": 7880 }, { "entropy": 0.240068347658962, "epoch": 1.8380930178342463, "grad_norm": 7.03125, "learning_rate": 1.9909112587040895e-05, "loss": 0.4946, "mean_token_accuracy": 0.888495671749115, "num_tokens": 33155358.0, "step": 7885 }, { "entropy": 0.32991256155073645, "epoch": 1.8392586548548782, "grad_norm": 1.8046875, "learning_rate": 1.9908996664705356e-05, "loss": 0.6577, "mean_token_accuracy": 0.8739120721817016, "num_tokens": 33173942.0, "step": 7890 }, { "entropy": 0.2920090861618519, "epoch": 1.84042429187551, "grad_norm": 8.0625, "learning_rate": 1.990888066916929e-05, "loss": 0.5685, "mean_token_accuracy": 0.8976893186569214, "num_tokens": 33183850.0, "step": 7895 }, { "entropy": 0.21282969750463962, "epoch": 1.8415899288961417, "grad_norm": 6.4375, "learning_rate": 1.9908764600434432e-05, "loss": 0.327, "mean_token_accuracy": 0.9291573703289032, "num_tokens": 33209034.0, "step": 7900 }, { "entropy": 0.20020358134061098, "epoch": 1.8427555659167734, "grad_norm": 1.2265625, "learning_rate": 1.9908648458502508e-05, "loss": 0.2122, "mean_token_accuracy": 0.9414223849773407, "num_tokens": 33242582.0, "step": 7905 }, { "entropy": 0.263228052854538, "epoch": 1.8439212029374052, "grad_norm": 4.28125, "learning_rate": 1.9908532243375254e-05, "loss": 0.4269, "mean_token_accuracy": 0.9060485661029816, "num_tokens": 33253504.0, "step": 7910 }, { "entropy": 0.2593220267444849, "epoch": 1.8450868399580371, "grad_norm": 3.3125, "learning_rate": 1.9908415955054403e-05, "loss": 0.4668, "mean_token_accuracy": 0.8901701092720031, "num_tokens": 33269108.0, "step": 7915 }, { "entropy": 0.20201030634343625, "epoch": 1.8462524769786688, "grad_norm": 1.875, "learning_rate": 1.9908299593541686e-05, "loss": 0.4202, "mean_token_accuracy": 0.9283289551734925, "num_tokens": 33286890.0, "step": 7920 }, { "entropy": 0.22947550304234027, "epoch": 1.8474181139993007, "grad_norm": 2.296875, "learning_rate": 1.990818315883884e-05, "loss": 0.3026, "mean_token_accuracy": 0.913777369260788, "num_tokens": 33313771.0, "step": 7925 }, { "entropy": 0.481933256983757, "epoch": 1.8485837510199326, "grad_norm": 1.0546875, "learning_rate": 1.99080666509476e-05, "loss": 0.6726, "mean_token_accuracy": 0.82081817984581, "num_tokens": 33335889.0, "step": 7930 }, { "entropy": 0.29660770744085313, "epoch": 1.8497493880405642, "grad_norm": 3.53125, "learning_rate": 1.9907950069869704e-05, "loss": 0.572, "mean_token_accuracy": 0.8943454444408416, "num_tokens": 33346056.0, "step": 7935 }, { "entropy": 0.2565585695207119, "epoch": 1.8509150250611959, "grad_norm": 5.90625, "learning_rate": 1.9907833415606893e-05, "loss": 0.475, "mean_token_accuracy": 0.911063802242279, "num_tokens": 33356506.0, "step": 7940 }, { "entropy": 0.19112351574003697, "epoch": 1.8520806620818278, "grad_norm": 0.9609375, "learning_rate": 1.99077166881609e-05, "loss": 0.2182, "mean_token_accuracy": 0.9242577075958252, "num_tokens": 33380844.0, "step": 7945 }, { "entropy": 0.24360812529921533, "epoch": 1.8532462991024596, "grad_norm": 3.40625, "learning_rate": 1.990759988753347e-05, "loss": 0.4411, "mean_token_accuracy": 0.9256996273994446, "num_tokens": 33404874.0, "step": 7950 }, { "entropy": 0.2820186048746109, "epoch": 1.8544119361230913, "grad_norm": 3.296875, "learning_rate": 1.9907483013726347e-05, "loss": 0.6969, "mean_token_accuracy": 0.8906833052635192, "num_tokens": 33414812.0, "step": 7955 }, { "entropy": 0.24926723837852477, "epoch": 1.855577573143723, "grad_norm": 6.9375, "learning_rate": 1.9907366066741272e-05, "loss": 0.462, "mean_token_accuracy": 0.9122967898845673, "num_tokens": 33426282.0, "step": 7960 }, { "entropy": 0.2691373065114021, "epoch": 1.8567432101643548, "grad_norm": 7.15625, "learning_rate": 1.9907249046579984e-05, "loss": 0.7012, "mean_token_accuracy": 0.8873024821281433, "num_tokens": 33435803.0, "step": 7965 }, { "entropy": 0.22381224762648344, "epoch": 1.8579088471849867, "grad_norm": 0.40625, "learning_rate": 1.9907131953244234e-05, "loss": 0.4412, "mean_token_accuracy": 0.9174625515937805, "num_tokens": 33461242.0, "step": 7970 }, { "entropy": 0.24329218193888663, "epoch": 1.8590744842056184, "grad_norm": 7.1875, "learning_rate": 1.9907014786735764e-05, "loss": 0.5472, "mean_token_accuracy": 0.9130038440227508, "num_tokens": 33488300.0, "step": 7975 }, { "entropy": 0.2573273852467537, "epoch": 1.86024012122625, "grad_norm": 1.171875, "learning_rate": 1.9906897547056326e-05, "loss": 0.3734, "mean_token_accuracy": 0.9164382576942444, "num_tokens": 33510876.0, "step": 7980 }, { "entropy": 0.25100057646632196, "epoch": 1.861405758246882, "grad_norm": 0.8125, "learning_rate": 1.9906780234207662e-05, "loss": 0.4022, "mean_token_accuracy": 0.9272303938865661, "num_tokens": 33529188.0, "step": 7985 }, { "entropy": 0.16136855445802212, "epoch": 1.8625713952675138, "grad_norm": 0.640625, "learning_rate": 1.9906662848191525e-05, "loss": 0.1747, "mean_token_accuracy": 0.9569616258144379, "num_tokens": 33560189.0, "step": 7990 }, { "entropy": 0.22136666998267174, "epoch": 1.8637370322881455, "grad_norm": 1.1171875, "learning_rate": 1.990654538900967e-05, "loss": 0.3632, "mean_token_accuracy": 0.9338773906230926, "num_tokens": 33572755.0, "step": 7995 }, { "entropy": 0.20970266573131086, "epoch": 1.8649026693087771, "grad_norm": 8.8125, "learning_rate": 1.9906427856663837e-05, "loss": 0.4116, "mean_token_accuracy": 0.9170647919178009, "num_tokens": 33593609.0, "step": 8000 }, { "entropy": 0.282043930888176, "epoch": 1.866068306329409, "grad_norm": 3.546875, "learning_rate": 1.9906310251155786e-05, "loss": 0.4717, "mean_token_accuracy": 0.9160214245319367, "num_tokens": 33603633.0, "step": 8005 }, { "entropy": 0.22577755972743035, "epoch": 1.867233943350041, "grad_norm": 5.78125, "learning_rate": 1.990619257248727e-05, "loss": 0.4227, "mean_token_accuracy": 0.9012669861316681, "num_tokens": 33616958.0, "step": 8010 }, { "entropy": 0.2106149446219206, "epoch": 1.8683995803706726, "grad_norm": 4.53125, "learning_rate": 1.9906074820660042e-05, "loss": 0.1722, "mean_token_accuracy": 0.9327871203422546, "num_tokens": 33652503.0, "step": 8015 }, { "entropy": 0.24818255249410867, "epoch": 1.8695652173913042, "grad_norm": 6.84375, "learning_rate": 1.9905956995675864e-05, "loss": 0.4292, "mean_token_accuracy": 0.9094979107379914, "num_tokens": 33678232.0, "step": 8020 }, { "entropy": 0.26563177108764646, "epoch": 1.870730854411936, "grad_norm": 0.51953125, "learning_rate": 1.9905839097536486e-05, "loss": 0.3926, "mean_token_accuracy": 0.912151038646698, "num_tokens": 33701126.0, "step": 8025 }, { "entropy": 0.2256366118788719, "epoch": 1.871896491432568, "grad_norm": 0.765625, "learning_rate": 1.990572112624367e-05, "loss": 0.7137, "mean_token_accuracy": 0.8795951902866364, "num_tokens": 33716219.0, "step": 8030 }, { "entropy": 0.27713018376380205, "epoch": 1.8730621284531996, "grad_norm": 1.1875, "learning_rate": 1.990560308179917e-05, "loss": 0.4991, "mean_token_accuracy": 0.9037650108337403, "num_tokens": 33735725.0, "step": 8035 }, { "entropy": 0.20286475904285908, "epoch": 1.8742277654738313, "grad_norm": 0.4921875, "learning_rate": 1.990548496420475e-05, "loss": 0.3685, "mean_token_accuracy": 0.9244837462902069, "num_tokens": 33767813.0, "step": 8040 }, { "entropy": 0.1981831956654787, "epoch": 1.8753934024944632, "grad_norm": 0.427734375, "learning_rate": 1.990536677346217e-05, "loss": 0.384, "mean_token_accuracy": 0.9329765677452088, "num_tokens": 33786229.0, "step": 8045 }, { "entropy": 0.2955906016752124, "epoch": 1.876559039515095, "grad_norm": 6.75, "learning_rate": 1.99052485095732e-05, "loss": 0.4128, "mean_token_accuracy": 0.9093118727207183, "num_tokens": 33807214.0, "step": 8050 }, { "entropy": 0.3324073665775359, "epoch": 1.8777246765357267, "grad_norm": 6.5, "learning_rate": 1.9905130172539587e-05, "loss": 0.5357, "mean_token_accuracy": 0.9027265429496765, "num_tokens": 33825464.0, "step": 8055 }, { "entropy": 0.26243541240692136, "epoch": 1.8788903135563586, "grad_norm": 0.73046875, "learning_rate": 1.9905011762363114e-05, "loss": 0.5523, "mean_token_accuracy": 0.9006957054138184, "num_tokens": 33854687.0, "step": 8060 }, { "entropy": 0.1906348394230008, "epoch": 1.8800559505769905, "grad_norm": 0.427734375, "learning_rate": 1.9904893279045535e-05, "loss": 0.2821, "mean_token_accuracy": 0.9366668999195099, "num_tokens": 33887491.0, "step": 8065 }, { "entropy": 0.30027063712477686, "epoch": 1.8812215875976221, "grad_norm": 4.625, "learning_rate": 1.9904774722588617e-05, "loss": 0.4721, "mean_token_accuracy": 0.9096989035606384, "num_tokens": 33926836.0, "step": 8070 }, { "entropy": 0.25279470570385454, "epoch": 1.8823872246182538, "grad_norm": 3.09375, "learning_rate": 1.9904656092994134e-05, "loss": 0.5085, "mean_token_accuracy": 0.9071327090263367, "num_tokens": 33942968.0, "step": 8075 }, { "entropy": 0.35142759159207343, "epoch": 1.8835528616388857, "grad_norm": 7.4375, "learning_rate": 1.990453739026385e-05, "loss": 0.5752, "mean_token_accuracy": 0.8828444004058837, "num_tokens": 33959545.0, "step": 8080 }, { "entropy": 0.21633378714323043, "epoch": 1.8847184986595176, "grad_norm": 7.96875, "learning_rate": 1.9904418614399537e-05, "loss": 0.4535, "mean_token_accuracy": 0.9193155586719512, "num_tokens": 33972285.0, "step": 8085 }, { "entropy": 0.3462574012577534, "epoch": 1.8858841356801492, "grad_norm": 2.265625, "learning_rate": 1.9904299765402965e-05, "loss": 0.55, "mean_token_accuracy": 0.8902999520301819, "num_tokens": 33990022.0, "step": 8090 }, { "entropy": 0.2331024192273617, "epoch": 1.887049772700781, "grad_norm": 6.53125, "learning_rate": 1.990418084327591e-05, "loss": 0.4441, "mean_token_accuracy": 0.917637175321579, "num_tokens": 34012120.0, "step": 8095 }, { "entropy": 0.2364817973226309, "epoch": 1.8882154097214128, "grad_norm": 4.25, "learning_rate": 1.990406184802014e-05, "loss": 0.365, "mean_token_accuracy": 0.9218486249446869, "num_tokens": 34029217.0, "step": 8100 }, { "entropy": 0.15866220146417617, "epoch": 1.8893810467420447, "grad_norm": 4.71875, "learning_rate": 1.990394277963743e-05, "loss": 0.2626, "mean_token_accuracy": 0.9401340663433075, "num_tokens": 34057228.0, "step": 8105 }, { "entropy": 0.2617270015180111, "epoch": 1.8905466837626763, "grad_norm": 2.75, "learning_rate": 1.990382363812956e-05, "loss": 0.4059, "mean_token_accuracy": 0.9226888716220856, "num_tokens": 34072082.0, "step": 8110 }, { "entropy": 0.22469778992235662, "epoch": 1.891712320783308, "grad_norm": 5.84375, "learning_rate": 1.9903704423498305e-05, "loss": 0.4612, "mean_token_accuracy": 0.9184304594993591, "num_tokens": 34091194.0, "step": 8115 }, { "entropy": 0.2838134203106165, "epoch": 1.8928779578039399, "grad_norm": 0.9140625, "learning_rate": 1.9903585135745442e-05, "loss": 0.579, "mean_token_accuracy": 0.8832531034946441, "num_tokens": 34110127.0, "step": 8120 }, { "entropy": 0.30173076689243317, "epoch": 1.8940435948245717, "grad_norm": 5.0, "learning_rate": 1.9903465774872744e-05, "loss": 0.6166, "mean_token_accuracy": 0.8917320251464844, "num_tokens": 34119487.0, "step": 8125 }, { "entropy": 0.3589056760072708, "epoch": 1.8952092318452034, "grad_norm": 1.9765625, "learning_rate": 1.9903346340881998e-05, "loss": 0.4016, "mean_token_accuracy": 0.9043947756290436, "num_tokens": 34140987.0, "step": 8130 }, { "entropy": 0.21588889956474305, "epoch": 1.896374868865835, "grad_norm": 9.5625, "learning_rate": 1.9903226833774985e-05, "loss": 0.4883, "mean_token_accuracy": 0.9089700102806091, "num_tokens": 34160289.0, "step": 8135 }, { "entropy": 0.3851318970322609, "epoch": 1.897540505886467, "grad_norm": 2.734375, "learning_rate": 1.9903107253553484e-05, "loss": 0.7253, "mean_token_accuracy": 0.8677813231945037, "num_tokens": 34180349.0, "step": 8140 }, { "entropy": 0.2314096439629793, "epoch": 1.8987061429070988, "grad_norm": 3.09375, "learning_rate": 1.990298760021928e-05, "loss": 0.2881, "mean_token_accuracy": 0.9264096140861511, "num_tokens": 34199167.0, "step": 8145 }, { "entropy": 0.23651491105556488, "epoch": 1.8998717799277305, "grad_norm": 1.125, "learning_rate": 1.9902867873774155e-05, "loss": 0.4517, "mean_token_accuracy": 0.901793110370636, "num_tokens": 34213910.0, "step": 8150 }, { "entropy": 0.28956699259579183, "epoch": 1.9010374169483621, "grad_norm": 0.671875, "learning_rate": 1.9902748074219896e-05, "loss": 0.3724, "mean_token_accuracy": 0.910913062095642, "num_tokens": 34232143.0, "step": 8155 }, { "entropy": 0.26046003252267835, "epoch": 1.902203053968994, "grad_norm": 6.09375, "learning_rate": 1.9902628201558288e-05, "loss": 0.5784, "mean_token_accuracy": 0.8950169742107391, "num_tokens": 34251076.0, "step": 8160 }, { "entropy": 0.22758226692676545, "epoch": 1.903368690989626, "grad_norm": 3.3125, "learning_rate": 1.9902508255791122e-05, "loss": 0.3644, "mean_token_accuracy": 0.9190545618534088, "num_tokens": 34272139.0, "step": 8165 }, { "entropy": 0.2524715639650822, "epoch": 1.9045343280102576, "grad_norm": 5.125, "learning_rate": 1.9902388236920182e-05, "loss": 0.4574, "mean_token_accuracy": 0.9164434134960174, "num_tokens": 34282761.0, "step": 8170 }, { "entropy": 0.21465678084641696, "epoch": 1.9056999650308892, "grad_norm": 1.21875, "learning_rate": 1.990226814494726e-05, "loss": 0.2894, "mean_token_accuracy": 0.9352861166000366, "num_tokens": 34314814.0, "step": 8175 }, { "entropy": 0.20759768709540366, "epoch": 1.9068656020515211, "grad_norm": 1.03125, "learning_rate": 1.990214797987415e-05, "loss": 0.2024, "mean_token_accuracy": 0.936842006444931, "num_tokens": 34355228.0, "step": 8180 }, { "entropy": 0.22588071897625922, "epoch": 1.908031239072153, "grad_norm": 5.84375, "learning_rate": 1.9902027741702636e-05, "loss": 0.4937, "mean_token_accuracy": 0.924340796470642, "num_tokens": 34375090.0, "step": 8185 }, { "entropy": 0.21736494824290276, "epoch": 1.9091968760927847, "grad_norm": 2.328125, "learning_rate": 1.9901907430434516e-05, "loss": 0.3859, "mean_token_accuracy": 0.9157032668590546, "num_tokens": 34390572.0, "step": 8190 }, { "entropy": 0.2940547376871109, "epoch": 1.9103625131134163, "grad_norm": 3.625, "learning_rate": 1.9901787046071587e-05, "loss": 0.6023, "mean_token_accuracy": 0.8895644903182983, "num_tokens": 34410885.0, "step": 8195 }, { "entropy": 0.26008196324110033, "epoch": 1.9115281501340484, "grad_norm": 5.34375, "learning_rate": 1.9901666588615636e-05, "loss": 0.3953, "mean_token_accuracy": 0.9115445196628571, "num_tokens": 34436963.0, "step": 8200 }, { "entropy": 0.3289894135668874, "epoch": 1.91269378715468, "grad_norm": 0.458984375, "learning_rate": 1.9901546058068467e-05, "loss": 0.4833, "mean_token_accuracy": 0.8767846524715424, "num_tokens": 34466493.0, "step": 8205 }, { "entropy": 0.2564432403072715, "epoch": 1.9138594241753117, "grad_norm": 2.328125, "learning_rate": 1.990142545443187e-05, "loss": 0.4118, "mean_token_accuracy": 0.9165556848049163, "num_tokens": 34485513.0, "step": 8210 }, { "entropy": 0.2016835320740938, "epoch": 1.9150250611959436, "grad_norm": 0.4609375, "learning_rate": 1.990130477770765e-05, "loss": 0.352, "mean_token_accuracy": 0.9331004083156585, "num_tokens": 34516163.0, "step": 8215 }, { "entropy": 0.19940345510840415, "epoch": 1.9161906982165755, "grad_norm": 5.03125, "learning_rate": 1.99011840278976e-05, "loss": 0.4198, "mean_token_accuracy": 0.9229754269123077, "num_tokens": 34543852.0, "step": 8220 }, { "entropy": 0.24907640200108289, "epoch": 1.9173563352372072, "grad_norm": 0.59765625, "learning_rate": 1.990106320500353e-05, "loss": 0.4389, "mean_token_accuracy": 0.9167008459568023, "num_tokens": 34572670.0, "step": 8225 }, { "entropy": 0.2844414710998535, "epoch": 1.9185219722578388, "grad_norm": 3.0, "learning_rate": 1.9900942309027234e-05, "loss": 0.4554, "mean_token_accuracy": 0.8996649920940399, "num_tokens": 34592548.0, "step": 8230 }, { "entropy": 0.2390742838382721, "epoch": 1.9196876092784707, "grad_norm": 0.796875, "learning_rate": 1.9900821339970516e-05, "loss": 0.499, "mean_token_accuracy": 0.9069253146648407, "num_tokens": 34615695.0, "step": 8235 }, { "entropy": 0.2000205848366022, "epoch": 1.9208532462991026, "grad_norm": 0.455078125, "learning_rate": 1.9900700297835183e-05, "loss": 0.1793, "mean_token_accuracy": 0.93028022646904, "num_tokens": 34643986.0, "step": 8240 }, { "entropy": 0.3019634872674942, "epoch": 1.9220188833197343, "grad_norm": 4.75, "learning_rate": 1.9900579182623034e-05, "loss": 0.4367, "mean_token_accuracy": 0.9008702993392944, "num_tokens": 34665139.0, "step": 8245 }, { "entropy": 0.2354019209742546, "epoch": 1.923184520340366, "grad_norm": 1.1640625, "learning_rate": 1.990045799433588e-05, "loss": 0.4314, "mean_token_accuracy": 0.9259082734584808, "num_tokens": 34681508.0, "step": 8250 }, { "entropy": 0.28061345368623736, "epoch": 1.9243501573609978, "grad_norm": 3.921875, "learning_rate": 1.9900336732975528e-05, "loss": 0.4488, "mean_token_accuracy": 0.9200140237808228, "num_tokens": 34708622.0, "step": 8255 }, { "entropy": 0.34055967777967455, "epoch": 1.9255157943816297, "grad_norm": 10.125, "learning_rate": 1.990021539854378e-05, "loss": 0.7395, "mean_token_accuracy": 0.8829732298851013, "num_tokens": 34717903.0, "step": 8260 }, { "entropy": 0.2822610292583704, "epoch": 1.9266814314022613, "grad_norm": 0.328125, "learning_rate": 1.9900093991042453e-05, "loss": 0.568, "mean_token_accuracy": 0.8975172460079193, "num_tokens": 34744369.0, "step": 8265 }, { "entropy": 0.20326493717730046, "epoch": 1.927847068422893, "grad_norm": 0.423828125, "learning_rate": 1.9899972510473356e-05, "loss": 0.4511, "mean_token_accuracy": 0.9232929706573486, "num_tokens": 34766479.0, "step": 8270 }, { "entropy": 0.3232834428548813, "epoch": 1.9290127054435249, "grad_norm": 3.421875, "learning_rate": 1.98998509568383e-05, "loss": 0.5616, "mean_token_accuracy": 0.885380220413208, "num_tokens": 34781112.0, "step": 8275 }, { "entropy": 0.17031664066016675, "epoch": 1.9301783424641568, "grad_norm": 0.373046875, "learning_rate": 1.9899729330139092e-05, "loss": 0.2872, "mean_token_accuracy": 0.9361357748508453, "num_tokens": 34812316.0, "step": 8280 }, { "entropy": 0.17128741517663001, "epoch": 1.9313439794847884, "grad_norm": 1.8046875, "learning_rate": 1.9899607630377553e-05, "loss": 0.2588, "mean_token_accuracy": 0.9484428107738495, "num_tokens": 34827639.0, "step": 8285 }, { "entropy": 0.2159100666642189, "epoch": 1.93250961650542, "grad_norm": 0.6484375, "learning_rate": 1.9899485857555492e-05, "loss": 0.3604, "mean_token_accuracy": 0.9325559318065644, "num_tokens": 34846897.0, "step": 8290 }, { "entropy": 0.24534763917326927, "epoch": 1.933675253526052, "grad_norm": 5.90625, "learning_rate": 1.9899364011674732e-05, "loss": 0.4428, "mean_token_accuracy": 0.9214135646820069, "num_tokens": 34858892.0, "step": 8295 }, { "entropy": 0.2856233362108469, "epoch": 1.9348408905466838, "grad_norm": 2.03125, "learning_rate": 1.9899242092737083e-05, "loss": 0.5202, "mean_token_accuracy": 0.887435781955719, "num_tokens": 34873653.0, "step": 8300 }, { "entropy": 0.2192686103284359, "epoch": 1.9360065275673155, "grad_norm": 1.4609375, "learning_rate": 1.9899120100744366e-05, "loss": 0.3997, "mean_token_accuracy": 0.9170324444770813, "num_tokens": 34902515.0, "step": 8305 }, { "entropy": 0.24926665425300598, "epoch": 1.9371721645879472, "grad_norm": 2.9375, "learning_rate": 1.98989980356984e-05, "loss": 0.492, "mean_token_accuracy": 0.908800745010376, "num_tokens": 34914633.0, "step": 8310 }, { "entropy": 0.5896746765822172, "epoch": 1.938337801608579, "grad_norm": 5.75, "learning_rate": 1.9898875897601006e-05, "loss": 0.9085, "mean_token_accuracy": 0.8313942730426789, "num_tokens": 34960403.0, "step": 8315 }, { "entropy": 0.21931098848581315, "epoch": 1.939503438629211, "grad_norm": 2.828125, "learning_rate": 1.9898753686454e-05, "loss": 0.3373, "mean_token_accuracy": 0.9282977223396301, "num_tokens": 34976041.0, "step": 8320 }, { "entropy": 0.19581068167462945, "epoch": 1.9406690756498426, "grad_norm": 5.46875, "learning_rate": 1.989863140225921e-05, "loss": 0.3874, "mean_token_accuracy": 0.9340884864330292, "num_tokens": 35013548.0, "step": 8325 }, { "entropy": 0.22232236564159394, "epoch": 1.9418347126704743, "grad_norm": 6.84375, "learning_rate": 1.9898509045018457e-05, "loss": 0.5121, "mean_token_accuracy": 0.9140840828418731, "num_tokens": 35031301.0, "step": 8330 }, { "entropy": 0.36525106728076934, "epoch": 1.9430003496911064, "grad_norm": 6.0, "learning_rate": 1.989838661473357e-05, "loss": 0.7153, "mean_token_accuracy": 0.8757256925106048, "num_tokens": 35039659.0, "step": 8335 }, { "entropy": 0.24257199615240096, "epoch": 1.944165986711738, "grad_norm": 8.3125, "learning_rate": 1.9898264111406368e-05, "loss": 0.5561, "mean_token_accuracy": 0.8935992956161499, "num_tokens": 35050633.0, "step": 8340 }, { "entropy": 0.16945633180439473, "epoch": 1.9453316237323697, "grad_norm": 3.78125, "learning_rate": 1.9898141535038682e-05, "loss": 0.2938, "mean_token_accuracy": 0.9406558275222778, "num_tokens": 35073446.0, "step": 8345 }, { "entropy": 0.19805988781154155, "epoch": 1.9464972607530016, "grad_norm": 5.15625, "learning_rate": 1.9898018885632338e-05, "loss": 0.3331, "mean_token_accuracy": 0.9338889300823212, "num_tokens": 35090802.0, "step": 8350 }, { "entropy": 0.2623408816754818, "epoch": 1.9476628977736334, "grad_norm": 1.2578125, "learning_rate": 1.9897896163189165e-05, "loss": 0.3252, "mean_token_accuracy": 0.9054560959339142, "num_tokens": 35121812.0, "step": 8355 }, { "entropy": 0.2672824915498495, "epoch": 1.948828534794265, "grad_norm": 0.9765625, "learning_rate": 1.9897773367710993e-05, "loss": 0.3856, "mean_token_accuracy": 0.9044731080532074, "num_tokens": 35157778.0, "step": 8360 }, { "entropy": 0.3403429910540581, "epoch": 1.9499941718148968, "grad_norm": 13.75, "learning_rate": 1.9897650499199654e-05, "loss": 0.8828, "mean_token_accuracy": 0.8561728596687317, "num_tokens": 35174497.0, "step": 8365 }, { "entropy": 0.20890892669558525, "epoch": 1.9511598088355286, "grad_norm": 6.25, "learning_rate": 1.989752755765698e-05, "loss": 0.4039, "mean_token_accuracy": 0.927830719947815, "num_tokens": 35187734.0, "step": 8370 }, { "entropy": 0.29691824987530707, "epoch": 1.9523254458561605, "grad_norm": 7.78125, "learning_rate": 1.9897404543084804e-05, "loss": 0.515, "mean_token_accuracy": 0.8886303842067719, "num_tokens": 35202524.0, "step": 8375 }, { "entropy": 0.2688115481287241, "epoch": 1.9534910828767922, "grad_norm": 5.71875, "learning_rate": 1.989728145548496e-05, "loss": 0.4996, "mean_token_accuracy": 0.9002360701560974, "num_tokens": 35217656.0, "step": 8380 }, { "entropy": 0.24384147226810454, "epoch": 1.9546567198974238, "grad_norm": 0.9296875, "learning_rate": 1.9897158294859282e-05, "loss": 0.5896, "mean_token_accuracy": 0.9057703733444213, "num_tokens": 35231596.0, "step": 8385 }, { "entropy": 0.17410086654126644, "epoch": 1.9558223569180557, "grad_norm": 0.6015625, "learning_rate": 1.9897035061209608e-05, "loss": 0.2515, "mean_token_accuracy": 0.9428678154945374, "num_tokens": 35258131.0, "step": 8390 }, { "entropy": 0.27097597122192385, "epoch": 1.9569879939386876, "grad_norm": 11.1875, "learning_rate": 1.989691175453778e-05, "loss": 0.516, "mean_token_accuracy": 0.9027996301651001, "num_tokens": 35279654.0, "step": 8395 }, { "entropy": 0.2009141666814685, "epoch": 1.9581536309593193, "grad_norm": 5.40625, "learning_rate": 1.9896788374845628e-05, "loss": 0.4027, "mean_token_accuracy": 0.9286759912967681, "num_tokens": 35298667.0, "step": 8400 }, { "entropy": 0.27596412152051925, "epoch": 1.959319267979951, "grad_norm": 5.40625, "learning_rate": 1.9896664922134995e-05, "loss": 0.6089, "mean_token_accuracy": 0.8886427342891693, "num_tokens": 35312271.0, "step": 8405 }, { "entropy": 0.26666624546051027, "epoch": 1.9604849050005828, "grad_norm": 1.7578125, "learning_rate": 1.9896541396407727e-05, "loss": 0.4534, "mean_token_accuracy": 0.8949091792106628, "num_tokens": 35326375.0, "step": 8410 }, { "entropy": 0.25538104176521303, "epoch": 1.9616505420212147, "grad_norm": 0.71484375, "learning_rate": 1.9896417797665663e-05, "loss": 0.4964, "mean_token_accuracy": 0.912453830242157, "num_tokens": 35346842.0, "step": 8415 }, { "entropy": 0.25083237886428833, "epoch": 1.9628161790418464, "grad_norm": 4.53125, "learning_rate": 1.989629412591064e-05, "loss": 0.5104, "mean_token_accuracy": 0.9088790953159332, "num_tokens": 35366749.0, "step": 8420 }, { "entropy": 0.31124439314007757, "epoch": 1.963981816062478, "grad_norm": 4.96875, "learning_rate": 1.989617038114451e-05, "loss": 0.6628, "mean_token_accuracy": 0.8773128747940063, "num_tokens": 35379874.0, "step": 8425 }, { "entropy": 0.2417039457708597, "epoch": 1.96514745308311, "grad_norm": 2.8125, "learning_rate": 1.9896046563369114e-05, "loss": 0.4759, "mean_token_accuracy": 0.9157770335674286, "num_tokens": 35395582.0, "step": 8430 }, { "entropy": 0.24686218202114105, "epoch": 1.9663130901037418, "grad_norm": 9.375, "learning_rate": 1.9895922672586302e-05, "loss": 0.5481, "mean_token_accuracy": 0.9166857957839966, "num_tokens": 35407840.0, "step": 8435 }, { "entropy": 0.3242442309856415, "epoch": 1.9674787271243734, "grad_norm": 0.73046875, "learning_rate": 1.9895798708797917e-05, "loss": 0.5274, "mean_token_accuracy": 0.8759232670068741, "num_tokens": 35431071.0, "step": 8440 }, { "entropy": 0.22474229484796523, "epoch": 1.968644364145005, "grad_norm": 5.1875, "learning_rate": 1.9895674672005812e-05, "loss": 0.3409, "mean_token_accuracy": 0.9216860592365265, "num_tokens": 35451477.0, "step": 8445 }, { "entropy": 0.27756930217146875, "epoch": 1.969810001165637, "grad_norm": 1.09375, "learning_rate": 1.9895550562211833e-05, "loss": 0.4468, "mean_token_accuracy": 0.9098743259906769, "num_tokens": 35465074.0, "step": 8450 }, { "entropy": 0.24696202836930753, "epoch": 1.9709756381862689, "grad_norm": 0.80078125, "learning_rate": 1.9895426379417828e-05, "loss": 0.2022, "mean_token_accuracy": 0.9224382996559143, "num_tokens": 35489306.0, "step": 8455 }, { "entropy": 0.2881707139313221, "epoch": 1.9721412752069005, "grad_norm": 2.6875, "learning_rate": 1.9895302123625656e-05, "loss": 0.6088, "mean_token_accuracy": 0.8947436928749084, "num_tokens": 35498686.0, "step": 8460 }, { "entropy": 0.3253455236554146, "epoch": 1.9733069122275322, "grad_norm": 3.65625, "learning_rate": 1.9895177794837167e-05, "loss": 0.6071, "mean_token_accuracy": 0.8912819802761078, "num_tokens": 35508013.0, "step": 8465 }, { "entropy": 0.2203302625566721, "epoch": 1.9744725492481643, "grad_norm": 0.96875, "learning_rate": 1.9895053393054214e-05, "loss": 0.378, "mean_token_accuracy": 0.9241073846817016, "num_tokens": 35522649.0, "step": 8470 }, { "entropy": 0.18942125625908374, "epoch": 1.975638186268796, "grad_norm": 3.859375, "learning_rate": 1.9894928918278652e-05, "loss": 0.2913, "mean_token_accuracy": 0.9361551582813263, "num_tokens": 35540177.0, "step": 8475 }, { "entropy": 0.24017982184886932, "epoch": 1.9768038232894276, "grad_norm": 4.28125, "learning_rate": 1.989480437051234e-05, "loss": 0.4545, "mean_token_accuracy": 0.9062156975269318, "num_tokens": 35553338.0, "step": 8480 }, { "entropy": 0.2074093535542488, "epoch": 1.9779694603100595, "grad_norm": 2.640625, "learning_rate": 1.9894679749757126e-05, "loss": 0.3295, "mean_token_accuracy": 0.9237248361110687, "num_tokens": 35571435.0, "step": 8485 }, { "entropy": 0.42545375488698484, "epoch": 1.9791350973306914, "grad_norm": 5.4375, "learning_rate": 1.989455505601488e-05, "loss": 0.7273, "mean_token_accuracy": 0.8888332307338714, "num_tokens": 35597466.0, "step": 8490 }, { "entropy": 0.20387993920594455, "epoch": 1.980300734351323, "grad_norm": 0.9375, "learning_rate": 1.9894430289287453e-05, "loss": 0.286, "mean_token_accuracy": 0.9397811949253082, "num_tokens": 35621826.0, "step": 8495 }, { "entropy": 0.25947265028953553, "epoch": 1.9814663713719547, "grad_norm": 4.59375, "learning_rate": 1.9894305449576713e-05, "loss": 0.3965, "mean_token_accuracy": 0.92665935754776, "num_tokens": 35633794.0, "step": 8500 }, { "entropy": 0.30034587234258653, "epoch": 1.9826320083925866, "grad_norm": 0.59765625, "learning_rate": 1.9894180536884514e-05, "loss": 0.5105, "mean_token_accuracy": 0.9011545956134797, "num_tokens": 35656258.0, "step": 8505 }, { "entropy": 0.17847388423979282, "epoch": 1.9837976454132185, "grad_norm": 5.65625, "learning_rate": 1.9894055551212725e-05, "loss": 0.2273, "mean_token_accuracy": 0.925702440738678, "num_tokens": 35683494.0, "step": 8510 }, { "entropy": 0.17927013970911504, "epoch": 1.9849632824338501, "grad_norm": 1.5625, "learning_rate": 1.9893930492563203e-05, "loss": 0.2927, "mean_token_accuracy": 0.9423952460289001, "num_tokens": 35713313.0, "step": 8515 }, { "entropy": 0.30927509516477586, "epoch": 1.9861289194544818, "grad_norm": 1.234375, "learning_rate": 1.9893805360937818e-05, "loss": 0.5729, "mean_token_accuracy": 0.8941884458065033, "num_tokens": 35735043.0, "step": 8520 }, { "entropy": 0.23369169272482396, "epoch": 1.9872945564751137, "grad_norm": 4.65625, "learning_rate": 1.9893680156338434e-05, "loss": 0.4675, "mean_token_accuracy": 0.9103543817996979, "num_tokens": 35752057.0, "step": 8525 }, { "entropy": 0.27408935129642487, "epoch": 1.9884601934957455, "grad_norm": 2.265625, "learning_rate": 1.9893554878766918e-05, "loss": 0.5507, "mean_token_accuracy": 0.8969059348106384, "num_tokens": 35765013.0, "step": 8530 }, { "entropy": 0.2391377042979002, "epoch": 1.9896258305163772, "grad_norm": 3.21875, "learning_rate": 1.9893429528225143e-05, "loss": 0.375, "mean_token_accuracy": 0.925329464673996, "num_tokens": 35780367.0, "step": 8535 }, { "entropy": 0.2847207933664322, "epoch": 1.9907914675370089, "grad_norm": 2.171875, "learning_rate": 1.989330410471497e-05, "loss": 0.5161, "mean_token_accuracy": 0.9022741556167603, "num_tokens": 35791583.0, "step": 8540 }, { "entropy": 0.26735531222075226, "epoch": 1.9919571045576407, "grad_norm": 0.63671875, "learning_rate": 1.989317860823827e-05, "loss": 0.4382, "mean_token_accuracy": 0.9113502144813538, "num_tokens": 35812087.0, "step": 8545 }, { "entropy": 0.28238332718610765, "epoch": 1.9931227415782726, "grad_norm": 6.96875, "learning_rate": 1.989305303879692e-05, "loss": 0.647, "mean_token_accuracy": 0.8785060703754425, "num_tokens": 35822522.0, "step": 8550 }, { "entropy": 0.24268617071211337, "epoch": 1.9942883785989043, "grad_norm": 5.875, "learning_rate": 1.989292739639279e-05, "loss": 0.3323, "mean_token_accuracy": 0.9189518809318542, "num_tokens": 35843104.0, "step": 8555 }, { "entropy": 0.28384055122733115, "epoch": 1.995454015619536, "grad_norm": 3.75, "learning_rate": 1.9892801681027754e-05, "loss": 0.5158, "mean_token_accuracy": 0.8955554962158203, "num_tokens": 35856347.0, "step": 8560 }, { "entropy": 0.21551661379635334, "epoch": 1.9966196526401678, "grad_norm": 1.5546875, "learning_rate": 1.9892675892703685e-05, "loss": 0.3563, "mean_token_accuracy": 0.9304292321205139, "num_tokens": 35880168.0, "step": 8565 }, { "entropy": 0.2399544682353735, "epoch": 1.9977852896607997, "grad_norm": 0.6640625, "learning_rate": 1.989255003142246e-05, "loss": 0.4766, "mean_token_accuracy": 0.9071892321109771, "num_tokens": 35901547.0, "step": 8570 }, { "entropy": 0.1419397760182619, "epoch": 1.9989509266814314, "grad_norm": 0.423828125, "learning_rate": 1.9892424097185953e-05, "loss": 0.1211, "mean_token_accuracy": 0.9566732287406922, "num_tokens": 35938802.0, "step": 8575 }, { "entropy": 0.2179360402127107, "epoch": 2.0, "grad_norm": 14.5625, "learning_rate": 1.989229808999604e-05, "loss": 0.4882, "mean_token_accuracy": 0.9187219209141202, "num_tokens": 35961500.0, "step": 8580 }, { "entropy": 0.23930302262306213, "epoch": 2.0011656370206317, "grad_norm": 1.0859375, "learning_rate": 3.0769230769230774e-06, "loss": 0.2686, "mean_token_accuracy": 0.918672627210617, "num_tokens": 33462.0, "step": 8585 }, { "entropy": 0.17616038862615824, "epoch": 2.0023312740412638, "grad_norm": 6.09375, "learning_rate": 6.923076923076923e-06, "loss": 0.2979, "mean_token_accuracy": 0.9223396599292755, "num_tokens": 68428.0, "step": 8590 }, { "entropy": 0.22200995236635207, "epoch": 2.0034969110618954, "grad_norm": 1.875, "learning_rate": 1.0769230769230771e-05, "loss": 0.3672, "mean_token_accuracy": 0.9198292434215546, "num_tokens": 80228.0, "step": 8595 }, { "entropy": 0.19995370060205458, "epoch": 2.004662548082527, "grad_norm": 1.734375, "learning_rate": 1.4615384615384617e-05, "loss": 0.3429, "mean_token_accuracy": 0.9279805183410644, "num_tokens": 99552.0, "step": 8600 }, { "entropy": 0.19114257879555224, "epoch": 2.0058281851031587, "grad_norm": 0.98828125, "learning_rate": 1.8461538461538465e-05, "loss": 0.3238, "mean_token_accuracy": 0.9297996819019317, "num_tokens": 115995.0, "step": 8605 }, { "entropy": 0.12970165833830832, "epoch": 2.006993822123791, "grad_norm": 0.9296875, "learning_rate": 2.230769230769231e-05, "loss": 0.2439, "mean_token_accuracy": 0.9543951272964477, "num_tokens": 136920.0, "step": 8610 }, { "entropy": 0.2722026415169239, "epoch": 2.0081594591444225, "grad_norm": 2.0625, "learning_rate": 2.6153846153846157e-05, "loss": 0.4356, "mean_token_accuracy": 0.9024666368961334, "num_tokens": 158942.0, "step": 8615 }, { "entropy": 0.21159663870930673, "epoch": 2.009325096165054, "grad_norm": 6.28125, "learning_rate": 3e-05, "loss": 0.3646, "mean_token_accuracy": 0.9342296838760376, "num_tokens": 178917.0, "step": 8620 }, { "entropy": 0.18026983588933945, "epoch": 2.010490733185686, "grad_norm": 2.203125, "learning_rate": 3.384615384615385e-05, "loss": 0.2755, "mean_token_accuracy": 0.93057501912117, "num_tokens": 195722.0, "step": 8625 }, { "entropy": 0.26880242880433797, "epoch": 2.011656370206318, "grad_norm": 11.375, "learning_rate": 3.769230769230769e-05, "loss": 0.5401, "mean_token_accuracy": 0.9098369419574738, "num_tokens": 219314.0, "step": 8630 }, { "entropy": 0.2589976190589368, "epoch": 2.0128220072269496, "grad_norm": 6.75, "learning_rate": 4.1538461538461544e-05, "loss": 0.492, "mean_token_accuracy": 0.909549605846405, "num_tokens": 247321.0, "step": 8635 }, { "entropy": 0.2384048379957676, "epoch": 2.0139876442475813, "grad_norm": 7.28125, "learning_rate": 4.538461538461539e-05, "loss": 0.3351, "mean_token_accuracy": 0.9222664594650268, "num_tokens": 267350.0, "step": 8640 }, { "entropy": 0.1944281917065382, "epoch": 2.015153281268213, "grad_norm": 3.375, "learning_rate": 4.923076923076924e-05, "loss": 0.2959, "mean_token_accuracy": 0.9433736026287078, "num_tokens": 282504.0, "step": 8645 }, { "entropy": 0.28175385296344757, "epoch": 2.016318918288845, "grad_norm": 14.75, "learning_rate": 4.999999994035399e-05, "loss": 0.6312, "mean_token_accuracy": 0.8823612987995147, "num_tokens": 300273.0, "step": 8650 }, { "entropy": 0.22739908546209336, "epoch": 2.0174845553094767, "grad_norm": 1.65625, "learning_rate": 4.999999969804205e-05, "loss": 0.3442, "mean_token_accuracy": 0.9122324585914612, "num_tokens": 323539.0, "step": 8655 }, { "entropy": 0.2628894064575434, "epoch": 2.0186501923301083, "grad_norm": 1.4765625, "learning_rate": 4.9999999269336304e-05, "loss": 0.427, "mean_token_accuracy": 0.9008684635162354, "num_tokens": 348652.0, "step": 8660 }, { "entropy": 0.2020172208547592, "epoch": 2.01981582935074, "grad_norm": 0.65625, "learning_rate": 4.999999865423678e-05, "loss": 0.3984, "mean_token_accuracy": 0.9196797311306, "num_tokens": 369047.0, "step": 8665 }, { "entropy": 0.21287547498941423, "epoch": 2.020981466371372, "grad_norm": 7.15625, "learning_rate": 4.9999997852743475e-05, "loss": 0.5729, "mean_token_accuracy": 0.8993529319763184, "num_tokens": 388371.0, "step": 8670 }, { "entropy": 0.19923520274460316, "epoch": 2.0221471033920038, "grad_norm": 1.1875, "learning_rate": 4.9999996864856396e-05, "loss": 0.2562, "mean_token_accuracy": 0.9336078345775605, "num_tokens": 412629.0, "step": 8675 }, { "entropy": 0.30621002092957494, "epoch": 2.0233127404126354, "grad_norm": 7.9375, "learning_rate": 4.999999569057556e-05, "loss": 0.5006, "mean_token_accuracy": 0.903810465335846, "num_tokens": 432090.0, "step": 8680 }, { "entropy": 0.21448403932154178, "epoch": 2.024478377433267, "grad_norm": 5.21875, "learning_rate": 4.9999994329900996e-05, "loss": 0.4303, "mean_token_accuracy": 0.9238200068473816, "num_tokens": 456098.0, "step": 8685 }, { "entropy": 0.2428566601127386, "epoch": 2.025644014453899, "grad_norm": 0.373046875, "learning_rate": 4.999999278283271e-05, "loss": 0.3631, "mean_token_accuracy": 0.9116427540779114, "num_tokens": 495551.0, "step": 8690 }, { "entropy": 0.29837720319628713, "epoch": 2.026809651474531, "grad_norm": 3.84375, "learning_rate": 4.999999104937073e-05, "loss": 0.3483, "mean_token_accuracy": 0.9145365118980407, "num_tokens": 511921.0, "step": 8695 }, { "entropy": 0.20137713775038718, "epoch": 2.0279752884951625, "grad_norm": 5.34375, "learning_rate": 4.9999989129515084e-05, "loss": 0.3361, "mean_token_accuracy": 0.9304125428199768, "num_tokens": 531784.0, "step": 8700 }, { "entropy": 0.2114957518875599, "epoch": 2.029140925515794, "grad_norm": 1.8515625, "learning_rate": 4.9999987023265806e-05, "loss": 0.4151, "mean_token_accuracy": 0.9195410490036011, "num_tokens": 547546.0, "step": 8705 }, { "entropy": 0.20500697158277034, "epoch": 2.0303065625364263, "grad_norm": 1.59375, "learning_rate": 4.9999984730622904e-05, "loss": 0.3087, "mean_token_accuracy": 0.922219431400299, "num_tokens": 564629.0, "step": 8710 }, { "entropy": 0.19661780446767807, "epoch": 2.031472199557058, "grad_norm": 4.9375, "learning_rate": 4.9999982251586444e-05, "loss": 0.419, "mean_token_accuracy": 0.9265513479709625, "num_tokens": 579540.0, "step": 8715 }, { "entropy": 0.3375698685646057, "epoch": 2.0326378365776896, "grad_norm": 5.78125, "learning_rate": 4.999997958615644e-05, "loss": 0.5782, "mean_token_accuracy": 0.8879574716091156, "num_tokens": 596627.0, "step": 8720 }, { "entropy": 0.21210809499025346, "epoch": 2.0338034735983217, "grad_norm": 2.34375, "learning_rate": 4.999997673433294e-05, "loss": 0.3076, "mean_token_accuracy": 0.9169882595539093, "num_tokens": 617829.0, "step": 8725 }, { "entropy": 0.2801529258489609, "epoch": 2.0349691106189534, "grad_norm": 7.75, "learning_rate": 4.9999973696115984e-05, "loss": 0.8278, "mean_token_accuracy": 0.8704663276672363, "num_tokens": 627421.0, "step": 8730 }, { "entropy": 0.31733031272888185, "epoch": 2.036134747639585, "grad_norm": 7.65625, "learning_rate": 4.9999970471505634e-05, "loss": 0.6683, "mean_token_accuracy": 0.8924909234046936, "num_tokens": 639800.0, "step": 8735 }, { "entropy": 0.21051425635814666, "epoch": 2.0373003846602167, "grad_norm": 6.84375, "learning_rate": 4.999996706050191e-05, "loss": 0.4327, "mean_token_accuracy": 0.9305059254169464, "num_tokens": 654829.0, "step": 8740 }, { "entropy": 0.2580501724034548, "epoch": 2.038466021680849, "grad_norm": 0.5234375, "learning_rate": 4.999996346310487e-05, "loss": 0.3248, "mean_token_accuracy": 0.9246833562850952, "num_tokens": 681095.0, "step": 8745 }, { "entropy": 0.3147982403635979, "epoch": 2.0396316587014804, "grad_norm": 4.09375, "learning_rate": 4.9999959679314586e-05, "loss": 0.5683, "mean_token_accuracy": 0.902126133441925, "num_tokens": 700848.0, "step": 8750 }, { "entropy": 0.21838752701878547, "epoch": 2.040797295722112, "grad_norm": 1.0625, "learning_rate": 4.99999557091311e-05, "loss": 0.3909, "mean_token_accuracy": 0.9119821190834045, "num_tokens": 720120.0, "step": 8755 }, { "entropy": 0.2715337313711643, "epoch": 2.0419629327427438, "grad_norm": 3.921875, "learning_rate": 4.999995155255447e-05, "loss": 0.5753, "mean_token_accuracy": 0.8893308877944947, "num_tokens": 740168.0, "step": 8760 }, { "entropy": 0.37294210195541383, "epoch": 2.043128569763376, "grad_norm": 4.34375, "learning_rate": 4.9999947209584754e-05, "loss": 0.6569, "mean_token_accuracy": 0.8944745361804962, "num_tokens": 751619.0, "step": 8765 }, { "entropy": 0.25972907468676565, "epoch": 2.0442942067840075, "grad_norm": 4.8125, "learning_rate": 4.9999942680222036e-05, "loss": 0.4362, "mean_token_accuracy": 0.9104464769363403, "num_tokens": 763407.0, "step": 8770 }, { "entropy": 0.24379628524184227, "epoch": 2.045459843804639, "grad_norm": 2.34375, "learning_rate": 4.999993796446637e-05, "loss": 0.3387, "mean_token_accuracy": 0.9292517900466919, "num_tokens": 781722.0, "step": 8775 }, { "entropy": 0.3385071136057377, "epoch": 2.046625480825271, "grad_norm": 1.203125, "learning_rate": 4.9999933062317826e-05, "loss": 0.7102, "mean_token_accuracy": 0.8811876237392425, "num_tokens": 801495.0, "step": 8780 }, { "entropy": 0.31932273507118225, "epoch": 2.047791117845903, "grad_norm": 7.53125, "learning_rate": 4.9999927973776475e-05, "loss": 0.8247, "mean_token_accuracy": 0.8673913955688477, "num_tokens": 809945.0, "step": 8785 }, { "entropy": 0.40632414594292643, "epoch": 2.0489567548665346, "grad_norm": 5.0, "learning_rate": 4.999992269884241e-05, "loss": 0.7773, "mean_token_accuracy": 0.867892587184906, "num_tokens": 821067.0, "step": 8790 }, { "entropy": 0.2714473832398653, "epoch": 2.0501223918871663, "grad_norm": 1.2734375, "learning_rate": 4.9999917237515684e-05, "loss": 0.3201, "mean_token_accuracy": 0.9276492118835449, "num_tokens": 843342.0, "step": 8795 }, { "entropy": 0.25552888177335265, "epoch": 2.051288028907798, "grad_norm": 4.15625, "learning_rate": 4.9999911589796386e-05, "loss": 0.5356, "mean_token_accuracy": 0.9006185114383698, "num_tokens": 867328.0, "step": 8800 }, { "entropy": 0.42955063804984095, "epoch": 2.05245366592843, "grad_norm": 0.96875, "learning_rate": 4.999990575568462e-05, "loss": 0.9482, "mean_token_accuracy": 0.8779349982738495, "num_tokens": 894639.0, "step": 8805 }, { "entropy": 0.25447643976658585, "epoch": 2.0536193029490617, "grad_norm": 2.453125, "learning_rate": 4.999989973518046e-05, "loss": 0.5433, "mean_token_accuracy": 0.90226930975914, "num_tokens": 914097.0, "step": 8810 }, { "entropy": 0.3173729632049799, "epoch": 2.0547849399696934, "grad_norm": 0.625, "learning_rate": 4.999989352828398e-05, "loss": 0.4612, "mean_token_accuracy": 0.8994117856025696, "num_tokens": 937133.0, "step": 8815 }, { "entropy": 0.27526839822530746, "epoch": 2.055950576990325, "grad_norm": 1.4375, "learning_rate": 4.999988713499529e-05, "loss": 0.6232, "mean_token_accuracy": 0.8864486396312714, "num_tokens": 954495.0, "step": 8820 }, { "entropy": 0.19231984689831733, "epoch": 2.057116214010957, "grad_norm": 5.53125, "learning_rate": 4.999988055531449e-05, "loss": 0.3043, "mean_token_accuracy": 0.928051370382309, "num_tokens": 985578.0, "step": 8825 }, { "entropy": 0.22631950937211515, "epoch": 2.058281851031589, "grad_norm": 0.6640625, "learning_rate": 4.999987378924166e-05, "loss": 0.4999, "mean_token_accuracy": 0.9139285802841186, "num_tokens": 1007174.0, "step": 8830 }, { "entropy": 0.24679026156663894, "epoch": 2.0594474880522204, "grad_norm": 4.3125, "learning_rate": 4.999986683677691e-05, "loss": 0.4819, "mean_token_accuracy": 0.9124704420566558, "num_tokens": 1022221.0, "step": 8835 }, { "entropy": 0.3226664915680885, "epoch": 2.060613125072852, "grad_norm": 4.1875, "learning_rate": 4.999985969792036e-05, "loss": 0.6056, "mean_token_accuracy": 0.9017972528934479, "num_tokens": 1034663.0, "step": 8840 }, { "entropy": 0.35001400858163834, "epoch": 2.061778762093484, "grad_norm": 2.0625, "learning_rate": 4.999985237267209e-05, "loss": 0.5449, "mean_token_accuracy": 0.8659915328025818, "num_tokens": 1068813.0, "step": 8845 }, { "entropy": 0.28483418338000777, "epoch": 2.062944399114116, "grad_norm": 2.15625, "learning_rate": 4.999984486103222e-05, "loss": 0.3729, "mean_token_accuracy": 0.9030164957046509, "num_tokens": 1085834.0, "step": 8850 }, { "entropy": 0.2964754268527031, "epoch": 2.0641100361347475, "grad_norm": 1.8046875, "learning_rate": 4.999983716300086e-05, "loss": 0.4947, "mean_token_accuracy": 0.9007986903190612, "num_tokens": 1107579.0, "step": 8855 }, { "entropy": 0.22273534834384917, "epoch": 2.065275673155379, "grad_norm": 1.890625, "learning_rate": 4.999982927857814e-05, "loss": 0.4777, "mean_token_accuracy": 0.9066579163074493, "num_tokens": 1140203.0, "step": 8860 }, { "entropy": 0.2912256710231304, "epoch": 2.0664413101760113, "grad_norm": 2.265625, "learning_rate": 4.999982120776415e-05, "loss": 0.5221, "mean_token_accuracy": 0.8961217284202576, "num_tokens": 1152463.0, "step": 8865 }, { "entropy": 0.2787528317421675, "epoch": 2.067606947196643, "grad_norm": 0.59765625, "learning_rate": 4.999981295055903e-05, "loss": 0.363, "mean_token_accuracy": 0.8967165648937225, "num_tokens": 1184273.0, "step": 8870 }, { "entropy": 0.27786948047578336, "epoch": 2.0687725842172746, "grad_norm": 1.34375, "learning_rate": 4.99998045069629e-05, "loss": 0.5324, "mean_token_accuracy": 0.9002691745758057, "num_tokens": 1203672.0, "step": 8875 }, { "entropy": 0.25195387527346613, "epoch": 2.0699382212379067, "grad_norm": 4.5, "learning_rate": 4.999979587697589e-05, "loss": 0.678, "mean_token_accuracy": 0.890131413936615, "num_tokens": 1215263.0, "step": 8880 }, { "entropy": 0.21976680774241686, "epoch": 2.0711038582585384, "grad_norm": 2.375, "learning_rate": 4.9999787060598106e-05, "loss": 0.2744, "mean_token_accuracy": 0.9232886552810669, "num_tokens": 1249096.0, "step": 8885 }, { "entropy": 0.2308237187564373, "epoch": 2.07226949527917, "grad_norm": 4.25, "learning_rate": 4.999977805782971e-05, "loss": 0.4887, "mean_token_accuracy": 0.9085934281349182, "num_tokens": 1279525.0, "step": 8890 }, { "entropy": 0.26388890072703364, "epoch": 2.0734351322998017, "grad_norm": 1.609375, "learning_rate": 4.999976886867082e-05, "loss": 0.4447, "mean_token_accuracy": 0.9158779978752136, "num_tokens": 1301732.0, "step": 8895 }, { "entropy": 0.20551118552684783, "epoch": 2.074600769320434, "grad_norm": 0.9140625, "learning_rate": 4.999975949312157e-05, "loss": 0.3894, "mean_token_accuracy": 0.9103192627429962, "num_tokens": 1319107.0, "step": 8900 }, { "entropy": 0.335835388302803, "epoch": 2.0757664063410655, "grad_norm": 1.46875, "learning_rate": 4.999974993118211e-05, "loss": 0.4564, "mean_token_accuracy": 0.8883191406726837, "num_tokens": 1344284.0, "step": 8905 }, { "entropy": 0.2874822109937668, "epoch": 2.076932043361697, "grad_norm": 5.90625, "learning_rate": 4.999974018285258e-05, "loss": 0.6495, "mean_token_accuracy": 0.8915266275405884, "num_tokens": 1353734.0, "step": 8910 }, { "entropy": 0.22463925033807755, "epoch": 2.078097680382329, "grad_norm": 1.765625, "learning_rate": 4.9999730248133115e-05, "loss": 0.3548, "mean_token_accuracy": 0.9258445799350739, "num_tokens": 1383642.0, "step": 8915 }, { "entropy": 0.3080735132098198, "epoch": 2.079263317402961, "grad_norm": 2.59375, "learning_rate": 4.999972012702388e-05, "loss": 0.531, "mean_token_accuracy": 0.9089854300022125, "num_tokens": 1395352.0, "step": 8920 }, { "entropy": 0.28653545752167703, "epoch": 2.0804289544235925, "grad_norm": 1.0, "learning_rate": 4.999970981952501e-05, "loss": 0.4904, "mean_token_accuracy": 0.8980461597442627, "num_tokens": 1409925.0, "step": 8925 }, { "entropy": 0.28012751042842865, "epoch": 2.081594591444224, "grad_norm": 6.5625, "learning_rate": 4.999969932563667e-05, "loss": 0.7632, "mean_token_accuracy": 0.8697837233543396, "num_tokens": 1419995.0, "step": 8930 }, { "entropy": 0.23538232855498792, "epoch": 2.082760228464856, "grad_norm": 0.65234375, "learning_rate": 4.999968864535901e-05, "loss": 0.4925, "mean_token_accuracy": 0.9042471885681153, "num_tokens": 1449443.0, "step": 8935 }, { "entropy": 0.30510496273636817, "epoch": 2.083925865485488, "grad_norm": 5.0625, "learning_rate": 4.99996777786922e-05, "loss": 0.5945, "mean_token_accuracy": 0.8913271486759186, "num_tokens": 1473675.0, "step": 8940 }, { "entropy": 0.3335975080728531, "epoch": 2.0850915025061196, "grad_norm": 1.640625, "learning_rate": 4.9999666725636384e-05, "loss": 0.4122, "mean_token_accuracy": 0.9008392214775085, "num_tokens": 1492252.0, "step": 8945 }, { "entropy": 0.40691106468439103, "epoch": 2.0862571395267513, "grad_norm": 2.59375, "learning_rate": 4.999965548619174e-05, "loss": 0.909, "mean_token_accuracy": 0.8463218748569489, "num_tokens": 1507068.0, "step": 8950 }, { "entropy": 0.27208645939826964, "epoch": 2.087422776547383, "grad_norm": 1.3046875, "learning_rate": 4.999964406035843e-05, "loss": 0.3961, "mean_token_accuracy": 0.9165595531463623, "num_tokens": 1539096.0, "step": 8955 }, { "entropy": 0.2435051068663597, "epoch": 2.088588413568015, "grad_norm": 4.125, "learning_rate": 4.9999632448136634e-05, "loss": 0.5914, "mean_token_accuracy": 0.9110420107841491, "num_tokens": 1561118.0, "step": 8960 }, { "entropy": 0.17810285724699498, "epoch": 2.0897540505886467, "grad_norm": 0.55859375, "learning_rate": 4.999962064952651e-05, "loss": 0.2275, "mean_token_accuracy": 0.9336501121520996, "num_tokens": 1587774.0, "step": 8965 }, { "entropy": 0.3193057609722018, "epoch": 2.0909196876092784, "grad_norm": 0.81640625, "learning_rate": 4.9999608664528244e-05, "loss": 0.3811, "mean_token_accuracy": 0.8954979538917541, "num_tokens": 1619172.0, "step": 8970 }, { "entropy": 0.22312114909291267, "epoch": 2.09208532462991, "grad_norm": 0.71875, "learning_rate": 4.999959649314202e-05, "loss": 0.5581, "mean_token_accuracy": 0.9176813662052155, "num_tokens": 1648290.0, "step": 8975 }, { "entropy": 0.2536992236971855, "epoch": 2.093250961650542, "grad_norm": 3.15625, "learning_rate": 4.999958413536799e-05, "loss": 0.4422, "mean_token_accuracy": 0.9238847613334655, "num_tokens": 1673427.0, "step": 8980 }, { "entropy": 0.22880229726433754, "epoch": 2.094416598671174, "grad_norm": 2.984375, "learning_rate": 4.999957159120639e-05, "loss": 0.4996, "mean_token_accuracy": 0.918055659532547, "num_tokens": 1694895.0, "step": 8985 }, { "entropy": 0.27447298876941206, "epoch": 2.0955822356918055, "grad_norm": 1.6640625, "learning_rate": 4.999955886065735e-05, "loss": 0.4274, "mean_token_accuracy": 0.8964529633522034, "num_tokens": 1720601.0, "step": 8990 }, { "entropy": 0.3642170369625092, "epoch": 2.096747872712437, "grad_norm": 5.40625, "learning_rate": 4.9999545943721105e-05, "loss": 0.9018, "mean_token_accuracy": 0.8594711780548095, "num_tokens": 1729024.0, "step": 8995 }, { "entropy": 0.3267651729285717, "epoch": 2.0979135097330692, "grad_norm": 2.578125, "learning_rate": 4.999953284039782e-05, "loss": 0.7126, "mean_token_accuracy": 0.8791795670986176, "num_tokens": 1741471.0, "step": 9000 }, { "entropy": 0.28976150676608087, "epoch": 2.099079146753701, "grad_norm": 3.03125, "learning_rate": 4.99995195506877e-05, "loss": 0.4418, "mean_token_accuracy": 0.8930846333503724, "num_tokens": 1766579.0, "step": 9005 }, { "entropy": 0.24256085567176341, "epoch": 2.1002447837743325, "grad_norm": 4.125, "learning_rate": 4.999950607459095e-05, "loss": 0.3859, "mean_token_accuracy": 0.9247425496578217, "num_tokens": 1784147.0, "step": 9010 }, { "entropy": 0.2715363338589668, "epoch": 2.1014104207949647, "grad_norm": 1.9453125, "learning_rate": 4.999949241210776e-05, "loss": 0.5245, "mean_token_accuracy": 0.8970009207725524, "num_tokens": 1799052.0, "step": 9015 }, { "entropy": 0.2423691965639591, "epoch": 2.1025760578155963, "grad_norm": 2.546875, "learning_rate": 4.999947856323834e-05, "loss": 0.3323, "mean_token_accuracy": 0.9294469296932221, "num_tokens": 1827445.0, "step": 9020 }, { "entropy": 0.36271433904767036, "epoch": 2.103741694836228, "grad_norm": 6.0, "learning_rate": 4.9999464527982886e-05, "loss": 0.8631, "mean_token_accuracy": 0.8470502078533173, "num_tokens": 1844015.0, "step": 9025 }, { "entropy": 0.3287566237151623, "epoch": 2.1049073318568596, "grad_norm": 5.3125, "learning_rate": 4.9999450306341627e-05, "loss": 0.6193, "mean_token_accuracy": 0.8893731594085693, "num_tokens": 1862324.0, "step": 9030 }, { "entropy": 0.2500194745138288, "epoch": 2.1060729688774917, "grad_norm": 2.5625, "learning_rate": 4.999943589831476e-05, "loss": 0.4734, "mean_token_accuracy": 0.9056059777736664, "num_tokens": 1882622.0, "step": 9035 }, { "entropy": 0.2787175789475441, "epoch": 2.1072386058981234, "grad_norm": 4.34375, "learning_rate": 4.99994213039025e-05, "loss": 0.64, "mean_token_accuracy": 0.8964097678661347, "num_tokens": 1893194.0, "step": 9040 }, { "entropy": 0.30563063621520997, "epoch": 2.108404242918755, "grad_norm": 2.515625, "learning_rate": 4.999940652310507e-05, "loss": 0.6266, "mean_token_accuracy": 0.889850401878357, "num_tokens": 1903993.0, "step": 9045 }, { "entropy": 0.2862144157290459, "epoch": 2.1095698799393867, "grad_norm": 2.859375, "learning_rate": 4.9999391555922695e-05, "loss": 0.7127, "mean_token_accuracy": 0.8805917859077453, "num_tokens": 1913810.0, "step": 9050 }, { "entropy": 0.3218904435634613, "epoch": 2.110735516960019, "grad_norm": 6.28125, "learning_rate": 4.999937640235558e-05, "loss": 0.578, "mean_token_accuracy": 0.9038245022296906, "num_tokens": 1926767.0, "step": 9055 }, { "entropy": 0.4377041935920715, "epoch": 2.1119011539806505, "grad_norm": 5.125, "learning_rate": 4.9999361062403974e-05, "loss": 1.034, "mean_token_accuracy": 0.8412764191627502, "num_tokens": 1933914.0, "step": 9060 }, { "entropy": 0.33127313032746314, "epoch": 2.113066791001282, "grad_norm": 2.46875, "learning_rate": 4.999934553606809e-05, "loss": 0.5326, "mean_token_accuracy": 0.9018651664257049, "num_tokens": 1944254.0, "step": 9065 }, { "entropy": 0.28445643484592437, "epoch": 2.114232428021914, "grad_norm": 2.25, "learning_rate": 4.999932982334816e-05, "loss": 0.4395, "mean_token_accuracy": 0.9009204924106597, "num_tokens": 1987487.0, "step": 9070 }, { "entropy": 0.27651037350296975, "epoch": 2.115398065042546, "grad_norm": 3.265625, "learning_rate": 4.999931392424442e-05, "loss": 0.501, "mean_token_accuracy": 0.9066934764385224, "num_tokens": 2009000.0, "step": 9075 }, { "entropy": 0.3002724215388298, "epoch": 2.1165637020631776, "grad_norm": 3.515625, "learning_rate": 4.999929783875712e-05, "loss": 0.4387, "mean_token_accuracy": 0.9174822568893433, "num_tokens": 2022851.0, "step": 9080 }, { "entropy": 0.3476183444261551, "epoch": 2.1177293390838092, "grad_norm": 2.375, "learning_rate": 4.999928156688648e-05, "loss": 0.6287, "mean_token_accuracy": 0.8917272806167602, "num_tokens": 2033218.0, "step": 9085 }, { "entropy": 0.19192668609321117, "epoch": 2.118894976104441, "grad_norm": 0.7421875, "learning_rate": 4.999926510863276e-05, "loss": 0.296, "mean_token_accuracy": 0.9337211489677429, "num_tokens": 2052410.0, "step": 9090 }, { "entropy": 0.3516515165567398, "epoch": 2.120060613125073, "grad_norm": 6.40625, "learning_rate": 4.999924846399619e-05, "loss": 0.7027, "mean_token_accuracy": 0.8735112309455871, "num_tokens": 2071429.0, "step": 9095 }, { "entropy": 0.18202774375677108, "epoch": 2.1212262501457047, "grad_norm": 3.984375, "learning_rate": 4.999923163297703e-05, "loss": 0.2499, "mean_token_accuracy": 0.9505364894866943, "num_tokens": 2103427.0, "step": 9100 }, { "entropy": 0.35498632192611695, "epoch": 2.1223918871663363, "grad_norm": 2.6875, "learning_rate": 4.999921461557552e-05, "loss": 0.7245, "mean_token_accuracy": 0.8860435366630555, "num_tokens": 2112754.0, "step": 9105 }, { "entropy": 0.30842496901750566, "epoch": 2.123557524186968, "grad_norm": 0.98046875, "learning_rate": 4.999919741179193e-05, "loss": 0.6197, "mean_token_accuracy": 0.8909704387187958, "num_tokens": 2131569.0, "step": 9110 }, { "entropy": 0.30220485720783474, "epoch": 2.1247231612076, "grad_norm": 0.68359375, "learning_rate": 4.99991800216265e-05, "loss": 0.4954, "mean_token_accuracy": 0.8969570279121399, "num_tokens": 2160157.0, "step": 9115 }, { "entropy": 0.2391287475824356, "epoch": 2.1258887982282317, "grad_norm": 1.5, "learning_rate": 4.99991624450795e-05, "loss": 0.2911, "mean_token_accuracy": 0.9212132573127747, "num_tokens": 2183922.0, "step": 9120 }, { "entropy": 0.26402276530861857, "epoch": 2.1270544352488634, "grad_norm": 4.5625, "learning_rate": 4.999914468215119e-05, "loss": 0.6939, "mean_token_accuracy": 0.8836956083774566, "num_tokens": 2198004.0, "step": 9125 }, { "entropy": 0.24503479078412055, "epoch": 2.128220072269495, "grad_norm": 0.765625, "learning_rate": 4.9999126732841825e-05, "loss": 0.3761, "mean_token_accuracy": 0.9078773021697998, "num_tokens": 2222287.0, "step": 9130 }, { "entropy": 0.24612213671207428, "epoch": 2.129385709290127, "grad_norm": 1.4453125, "learning_rate": 4.9999108597151684e-05, "loss": 0.5347, "mean_token_accuracy": 0.8830598950386047, "num_tokens": 2249866.0, "step": 9135 }, { "entropy": 0.29214831814169884, "epoch": 2.130551346310759, "grad_norm": 0.44921875, "learning_rate": 4.999909027508104e-05, "loss": 0.6057, "mean_token_accuracy": 0.8993490874767304, "num_tokens": 2272427.0, "step": 9140 }, { "entropy": 0.31067397333681585, "epoch": 2.1317169833313905, "grad_norm": 6.65625, "learning_rate": 4.999907176663015e-05, "loss": 0.6869, "mean_token_accuracy": 0.8875278353691101, "num_tokens": 2286353.0, "step": 9145 }, { "entropy": 0.37334904372692107, "epoch": 2.1328826203520226, "grad_norm": 3.9375, "learning_rate": 4.999905307179931e-05, "loss": 0.9226, "mean_token_accuracy": 0.858127874135971, "num_tokens": 2294175.0, "step": 9150 }, { "entropy": 0.31301565021276473, "epoch": 2.1340482573726542, "grad_norm": 4.28125, "learning_rate": 4.9999034190588776e-05, "loss": 0.6624, "mean_token_accuracy": 0.8904252409934997, "num_tokens": 2307730.0, "step": 9155 }, { "entropy": 0.2692046908661723, "epoch": 2.135213894393286, "grad_norm": 5.625, "learning_rate": 4.9999015122998855e-05, "loss": 0.4516, "mean_token_accuracy": 0.9097493112087249, "num_tokens": 2336014.0, "step": 9160 }, { "entropy": 0.32199760563671587, "epoch": 2.1363795314139176, "grad_norm": 0.75, "learning_rate": 4.999899586902982e-05, "loss": 0.5622, "mean_token_accuracy": 0.8987860381603241, "num_tokens": 2350363.0, "step": 9165 }, { "entropy": 0.2867584332823753, "epoch": 2.1375451684345497, "grad_norm": 1.5703125, "learning_rate": 4.9998976428681946e-05, "loss": 0.5252, "mean_token_accuracy": 0.9095795571804046, "num_tokens": 2361743.0, "step": 9170 }, { "entropy": 0.23813503086566926, "epoch": 2.1387108054551813, "grad_norm": 5.0625, "learning_rate": 4.999895680195554e-05, "loss": 0.3408, "mean_token_accuracy": 0.9232898116111755, "num_tokens": 2377516.0, "step": 9175 }, { "entropy": 0.20952856484800578, "epoch": 2.139876442475813, "grad_norm": 2.09375, "learning_rate": 4.999893698885089e-05, "loss": 0.2643, "mean_token_accuracy": 0.9201667666435241, "num_tokens": 2412415.0, "step": 9180 }, { "entropy": 0.2775612033903599, "epoch": 2.1410420794964447, "grad_norm": 2.046875, "learning_rate": 4.9998916989368286e-05, "loss": 0.5957, "mean_token_accuracy": 0.9061172902584076, "num_tokens": 2434137.0, "step": 9185 }, { "entropy": 0.33088565692305566, "epoch": 2.1422077165170768, "grad_norm": 2.46875, "learning_rate": 4.999889680350804e-05, "loss": 0.6631, "mean_token_accuracy": 0.8853426575660706, "num_tokens": 2448098.0, "step": 9190 }, { "entropy": 0.35131275504827497, "epoch": 2.1433733535377084, "grad_norm": 4.625, "learning_rate": 4.999887643127044e-05, "loss": 0.7744, "mean_token_accuracy": 0.8763688683509827, "num_tokens": 2457682.0, "step": 9195 }, { "entropy": 0.35075569823384284, "epoch": 2.14453899055834, "grad_norm": 6.3125, "learning_rate": 4.999885587265578e-05, "loss": 0.6041, "mean_token_accuracy": 0.8755062699317933, "num_tokens": 2470650.0, "step": 9200 }, { "entropy": 0.24803269598633051, "epoch": 2.1457046275789717, "grad_norm": 1.140625, "learning_rate": 4.999883512766439e-05, "loss": 0.4269, "mean_token_accuracy": 0.9069607615470886, "num_tokens": 2495284.0, "step": 9205 }, { "entropy": 0.2419408529996872, "epoch": 2.146870264599604, "grad_norm": 3.234375, "learning_rate": 4.999881419629657e-05, "loss": 0.522, "mean_token_accuracy": 0.8945335686206818, "num_tokens": 2515051.0, "step": 9210 }, { "entropy": 0.2983535796403885, "epoch": 2.1480359016202355, "grad_norm": 1.4921875, "learning_rate": 4.999879307855263e-05, "loss": 0.5812, "mean_token_accuracy": 0.8816781997680664, "num_tokens": 2534616.0, "step": 9215 }, { "entropy": 0.2238714762032032, "epoch": 2.149201538640867, "grad_norm": 0.85546875, "learning_rate": 4.9998771774432886e-05, "loss": 0.2949, "mean_token_accuracy": 0.9409159839153289, "num_tokens": 2568484.0, "step": 9220 }, { "entropy": 0.4029300630092621, "epoch": 2.150367175661499, "grad_norm": 5.03125, "learning_rate": 4.999875028393765e-05, "loss": 0.9551, "mean_token_accuracy": 0.8563934624195099, "num_tokens": 2576709.0, "step": 9225 }, { "entropy": 0.23757186979055406, "epoch": 2.151532812682131, "grad_norm": 1.9375, "learning_rate": 4.999872860706725e-05, "loss": 0.5217, "mean_token_accuracy": 0.9103751957416535, "num_tokens": 2594634.0, "step": 9230 }, { "entropy": 0.2861368380486965, "epoch": 2.1526984497027626, "grad_norm": 0.78515625, "learning_rate": 4.999870674382202e-05, "loss": 0.6282, "mean_token_accuracy": 0.8889648199081421, "num_tokens": 2611422.0, "step": 9235 }, { "entropy": 0.27242892384529116, "epoch": 2.1538640867233942, "grad_norm": 1.4453125, "learning_rate": 4.999868469420226e-05, "loss": 0.5124, "mean_token_accuracy": 0.8922030091285705, "num_tokens": 2626832.0, "step": 9240 }, { "entropy": 0.2865546464920044, "epoch": 2.155029723744026, "grad_norm": 1.5390625, "learning_rate": 4.999866245820831e-05, "loss": 0.6028, "mean_token_accuracy": 0.8990983128547668, "num_tokens": 2641289.0, "step": 9245 }, { "entropy": 0.2518985107541084, "epoch": 2.156195360764658, "grad_norm": 1.4921875, "learning_rate": 4.999864003584051e-05, "loss": 0.5041, "mean_token_accuracy": 0.9163178861141205, "num_tokens": 2655191.0, "step": 9250 }, { "entropy": 0.4007525980472565, "epoch": 2.1573609977852897, "grad_norm": 2.546875, "learning_rate": 4.9998617427099185e-05, "loss": 0.7206, "mean_token_accuracy": 0.8826470613479614, "num_tokens": 2665464.0, "step": 9255 }, { "entropy": 0.32439158745110036, "epoch": 2.1585266348059213, "grad_norm": 1.3828125, "learning_rate": 4.999859463198468e-05, "loss": 0.41, "mean_token_accuracy": 0.8896235227584839, "num_tokens": 2688611.0, "step": 9260 }, { "entropy": 0.2774831034243107, "epoch": 2.159692271826553, "grad_norm": 3.75, "learning_rate": 4.999857165049733e-05, "loss": 0.517, "mean_token_accuracy": 0.9092964768409729, "num_tokens": 2700618.0, "step": 9265 }, { "entropy": 0.22982106544077396, "epoch": 2.160857908847185, "grad_norm": 0.77734375, "learning_rate": 4.999854848263747e-05, "loss": 0.3502, "mean_token_accuracy": 0.9041347980499268, "num_tokens": 2729608.0, "step": 9270 }, { "entropy": 0.25315537825226786, "epoch": 2.1620235458678168, "grad_norm": 1.3046875, "learning_rate": 4.999852512840546e-05, "loss": 0.3534, "mean_token_accuracy": 0.9024879693984985, "num_tokens": 2754626.0, "step": 9275 }, { "entropy": 0.42271759584546087, "epoch": 2.1631891828884484, "grad_norm": 5.25, "learning_rate": 4.999850158780164e-05, "loss": 0.6212, "mean_token_accuracy": 0.8725330352783203, "num_tokens": 2773491.0, "step": 9280 }, { "entropy": 0.3202828958630562, "epoch": 2.1643548199090805, "grad_norm": 1.7421875, "learning_rate": 4.999847786082637e-05, "loss": 0.5417, "mean_token_accuracy": 0.8816564381122589, "num_tokens": 2786083.0, "step": 9285 }, { "entropy": 0.23811395298689603, "epoch": 2.165520456929712, "grad_norm": 4.03125, "learning_rate": 4.9998453947479986e-05, "loss": 0.4888, "mean_token_accuracy": 0.9188705086708069, "num_tokens": 2814594.0, "step": 9290 }, { "entropy": 0.3060807779431343, "epoch": 2.166686093950344, "grad_norm": 4.65625, "learning_rate": 4.999842984776285e-05, "loss": 0.2856, "mean_token_accuracy": 0.9265418946743011, "num_tokens": 2847092.0, "step": 9295 }, { "entropy": 0.4221868872642517, "epoch": 2.1678517309709755, "grad_norm": 3.109375, "learning_rate": 4.999840556167534e-05, "loss": 0.7217, "mean_token_accuracy": 0.8905176043510437, "num_tokens": 2856372.0, "step": 9300 }, { "entropy": 0.3070937130600214, "epoch": 2.1690173679916076, "grad_norm": 5.96875, "learning_rate": 4.999838108921779e-05, "loss": 0.4854, "mean_token_accuracy": 0.9074550926685333, "num_tokens": 2880328.0, "step": 9305 }, { "entropy": 0.406447908654809, "epoch": 2.1701830050122393, "grad_norm": 7.78125, "learning_rate": 4.999835643039059e-05, "loss": 0.7267, "mean_token_accuracy": 0.8538496255874634, "num_tokens": 2901644.0, "step": 9310 }, { "entropy": 0.36476370841264727, "epoch": 2.171348642032871, "grad_norm": 4.5, "learning_rate": 4.9998331585194094e-05, "loss": 0.6199, "mean_token_accuracy": 0.8748889327049255, "num_tokens": 2920190.0, "step": 9315 }, { "entropy": 0.39984625913202765, "epoch": 2.1725142790535026, "grad_norm": 4.0625, "learning_rate": 4.9998306553628685e-05, "loss": 0.8104, "mean_token_accuracy": 0.8600794672966003, "num_tokens": 2933890.0, "step": 9320 }, { "entropy": 0.29543534517288206, "epoch": 2.1736799160741347, "grad_norm": 4.34375, "learning_rate": 4.9998281335694715e-05, "loss": 0.5628, "mean_token_accuracy": 0.902722305059433, "num_tokens": 2954062.0, "step": 9325 }, { "entropy": 0.23410847783088684, "epoch": 2.1748455530947663, "grad_norm": 1.6171875, "learning_rate": 4.999825593139257e-05, "loss": 0.2356, "mean_token_accuracy": 0.9225547075271606, "num_tokens": 2985408.0, "step": 9330 }, { "entropy": 0.27478082813322546, "epoch": 2.176011190115398, "grad_norm": 5.125, "learning_rate": 4.999823034072264e-05, "loss": 0.582, "mean_token_accuracy": 0.8869442343711853, "num_tokens": 3002720.0, "step": 9335 }, { "entropy": 0.23570253998041152, "epoch": 2.1771768271360297, "grad_norm": 0.6796875, "learning_rate": 4.999820456368529e-05, "loss": 0.3097, "mean_token_accuracy": 0.9236724317073822, "num_tokens": 3029395.0, "step": 9340 }, { "entropy": 0.22580685541033746, "epoch": 2.1783424641566618, "grad_norm": 0.75390625, "learning_rate": 4.999817860028092e-05, "loss": 0.2503, "mean_token_accuracy": 0.9057783663272858, "num_tokens": 3063358.0, "step": 9345 }, { "entropy": 0.2577318917959929, "epoch": 2.1795081011772934, "grad_norm": 1.0625, "learning_rate": 4.99981524505099e-05, "loss": 0.4401, "mean_token_accuracy": 0.8984253942966461, "num_tokens": 3086596.0, "step": 9350 }, { "entropy": 0.29805635251104834, "epoch": 2.180673738197925, "grad_norm": 6.1875, "learning_rate": 4.9998126114372625e-05, "loss": 0.431, "mean_token_accuracy": 0.9108638763427734, "num_tokens": 3104755.0, "step": 9355 }, { "entropy": 0.2779708236455917, "epoch": 2.1818393752185568, "grad_norm": 5.625, "learning_rate": 4.99980995918695e-05, "loss": 0.7074, "mean_token_accuracy": 0.880797129869461, "num_tokens": 3115412.0, "step": 9360 }, { "entropy": 0.2565476704388857, "epoch": 2.183005012239189, "grad_norm": 9.375, "learning_rate": 4.9998072883000916e-05, "loss": 0.5649, "mean_token_accuracy": 0.9002249300479889, "num_tokens": 3128809.0, "step": 9365 }, { "entropy": 0.24895308613777162, "epoch": 2.1841706492598205, "grad_norm": 2.859375, "learning_rate": 4.999804598776726e-05, "loss": 0.5438, "mean_token_accuracy": 0.8996934533119202, "num_tokens": 3140578.0, "step": 9370 }, { "entropy": 0.2575036585330963, "epoch": 2.185336286280452, "grad_norm": 3.78125, "learning_rate": 4.999801890616894e-05, "loss": 0.3789, "mean_token_accuracy": 0.9098849713802337, "num_tokens": 3163367.0, "step": 9375 }, { "entropy": 0.3303291242569685, "epoch": 2.186501923301084, "grad_norm": 4.71875, "learning_rate": 4.999799163820636e-05, "loss": 0.6691, "mean_token_accuracy": 0.8991015374660491, "num_tokens": 3177936.0, "step": 9380 }, { "entropy": 0.23629399687051772, "epoch": 2.187667560321716, "grad_norm": 1.515625, "learning_rate": 4.999796418387993e-05, "loss": 0.3031, "mean_token_accuracy": 0.9319840252399445, "num_tokens": 3202018.0, "step": 9385 }, { "entropy": 0.2849699892103672, "epoch": 2.1888331973423476, "grad_norm": 3.265625, "learning_rate": 4.9997936543190055e-05, "loss": 0.4566, "mean_token_accuracy": 0.9006444752216339, "num_tokens": 3215472.0, "step": 9390 }, { "entropy": 0.2143367573618889, "epoch": 2.1899988343629793, "grad_norm": 0.498046875, "learning_rate": 4.9997908716137144e-05, "loss": 0.4926, "mean_token_accuracy": 0.9119088351726532, "num_tokens": 3240374.0, "step": 9395 }, { "entropy": 0.28837391287088393, "epoch": 2.191164471383611, "grad_norm": 3.484375, "learning_rate": 4.999788070272162e-05, "loss": 0.6007, "mean_token_accuracy": 0.8912509143352508, "num_tokens": 3260575.0, "step": 9400 }, { "entropy": 0.28325546756386755, "epoch": 2.192330108404243, "grad_norm": 0.70703125, "learning_rate": 4.99978525029439e-05, "loss": 0.4228, "mean_token_accuracy": 0.9063177049160004, "num_tokens": 3289344.0, "step": 9405 }, { "entropy": 0.3486835271120071, "epoch": 2.1934957454248747, "grad_norm": 0.859375, "learning_rate": 4.999782411680439e-05, "loss": 0.5126, "mean_token_accuracy": 0.8930414140224456, "num_tokens": 3313588.0, "step": 9410 }, { "entropy": 0.24004043117165566, "epoch": 2.1946613824455063, "grad_norm": 0.671875, "learning_rate": 4.999779554430354e-05, "loss": 0.5383, "mean_token_accuracy": 0.8893763542175293, "num_tokens": 3335335.0, "step": 9415 }, { "entropy": 0.34101747423410417, "epoch": 2.1958270194661385, "grad_norm": 5.0, "learning_rate": 4.9997766785441744e-05, "loss": 0.6741, "mean_token_accuracy": 0.8797512233257294, "num_tokens": 3344468.0, "step": 9420 }, { "entropy": 0.2273413881659508, "epoch": 2.19699265648677, "grad_norm": 4.09375, "learning_rate": 4.999773784021946e-05, "loss": 0.3916, "mean_token_accuracy": 0.922112226486206, "num_tokens": 3368729.0, "step": 9425 }, { "entropy": 0.3727362662553787, "epoch": 2.1981582935074018, "grad_norm": 3.21875, "learning_rate": 4.99977087086371e-05, "loss": 0.6149, "mean_token_accuracy": 0.8818917512893677, "num_tokens": 3382191.0, "step": 9430 }, { "entropy": 0.2546384513378143, "epoch": 2.1993239305280334, "grad_norm": 2.46875, "learning_rate": 4.999767939069511e-05, "loss": 0.3529, "mean_token_accuracy": 0.9229681074619294, "num_tokens": 3397221.0, "step": 9435 }, { "entropy": 0.2805795904248953, "epoch": 2.2004895675486655, "grad_norm": 4.34375, "learning_rate": 4.999764988639393e-05, "loss": 0.3557, "mean_token_accuracy": 0.9172143399715423, "num_tokens": 3420792.0, "step": 9440 }, { "entropy": 0.16744249165058137, "epoch": 2.201655204569297, "grad_norm": 2.5625, "learning_rate": 4.999762019573398e-05, "loss": 0.174, "mean_token_accuracy": 0.9522073805332184, "num_tokens": 3456134.0, "step": 9445 }, { "entropy": 0.29852482452988627, "epoch": 2.202820841589929, "grad_norm": 4.96875, "learning_rate": 4.999759031871572e-05, "loss": 0.425, "mean_token_accuracy": 0.8770089387893677, "num_tokens": 3492906.0, "step": 9450 }, { "entropy": 0.38035735934972764, "epoch": 2.2039864786105605, "grad_norm": 3.421875, "learning_rate": 4.9997560255339594e-05, "loss": 0.6763, "mean_token_accuracy": 0.8728886842727661, "num_tokens": 3511065.0, "step": 9455 }, { "entropy": 0.27133081033825873, "epoch": 2.2051521156311926, "grad_norm": 1.734375, "learning_rate": 4.999753000560604e-05, "loss": 0.4138, "mean_token_accuracy": 0.9164373219013214, "num_tokens": 3541530.0, "step": 9460 }, { "entropy": 0.19618333354592324, "epoch": 2.2063177526518243, "grad_norm": 1.1484375, "learning_rate": 4.999749956951552e-05, "loss": 0.3624, "mean_token_accuracy": 0.9286963403224945, "num_tokens": 3584076.0, "step": 9465 }, { "entropy": 0.28177291825413703, "epoch": 2.207483389672456, "grad_norm": 3.53125, "learning_rate": 4.999746894706848e-05, "loss": 0.5682, "mean_token_accuracy": 0.9091279208660126, "num_tokens": 3597496.0, "step": 9470 }, { "entropy": 0.2513377882540226, "epoch": 2.2086490266930876, "grad_norm": 2.78125, "learning_rate": 4.999743813826539e-05, "loss": 0.5108, "mean_token_accuracy": 0.9064539134502411, "num_tokens": 3615165.0, "step": 9475 }, { "entropy": 0.1830581733956933, "epoch": 2.2098146637137197, "grad_norm": 2.46875, "learning_rate": 4.999740714310669e-05, "loss": 0.3692, "mean_token_accuracy": 0.9332661211490632, "num_tokens": 3645536.0, "step": 9480 }, { "entropy": 0.2112976111471653, "epoch": 2.2109803007343514, "grad_norm": 1.5234375, "learning_rate": 4.999737596159286e-05, "loss": 0.265, "mean_token_accuracy": 0.933832836151123, "num_tokens": 3681490.0, "step": 9485 }, { "entropy": 0.18377817682921888, "epoch": 2.212145937754983, "grad_norm": 0.61328125, "learning_rate": 4.999734459372436e-05, "loss": 0.2467, "mean_token_accuracy": 0.9410947382450103, "num_tokens": 3720285.0, "step": 9490 }, { "entropy": 0.2656033456325531, "epoch": 2.2133115747756147, "grad_norm": 4.1875, "learning_rate": 4.9997313039501645e-05, "loss": 0.3963, "mean_token_accuracy": 0.9108889162540436, "num_tokens": 3740548.0, "step": 9495 }, { "entropy": 0.23448058478534223, "epoch": 2.214477211796247, "grad_norm": 0.73046875, "learning_rate": 4.99972812989252e-05, "loss": 0.3424, "mean_token_accuracy": 0.915939325094223, "num_tokens": 3765814.0, "step": 9500 }, { "entropy": 0.2662409141659737, "epoch": 2.2156428488168785, "grad_norm": 3.1875, "learning_rate": 4.9997249371995495e-05, "loss": 0.4384, "mean_token_accuracy": 0.903855949640274, "num_tokens": 3788788.0, "step": 9505 }, { "entropy": 0.3477610141038895, "epoch": 2.21680848583751, "grad_norm": 6.4375, "learning_rate": 4.999721725871301e-05, "loss": 0.8293, "mean_token_accuracy": 0.8557975172996521, "num_tokens": 3797775.0, "step": 9510 }, { "entropy": 0.2112328600138426, "epoch": 2.2179741228581418, "grad_norm": 1.015625, "learning_rate": 4.999718495907821e-05, "loss": 0.4428, "mean_token_accuracy": 0.9236923813819885, "num_tokens": 3821084.0, "step": 9515 }, { "entropy": 0.2622928474098444, "epoch": 2.219139759878774, "grad_norm": 4.375, "learning_rate": 4.999715247309159e-05, "loss": 0.4995, "mean_token_accuracy": 0.9056877613067627, "num_tokens": 3843608.0, "step": 9520 }, { "entropy": 0.27753055542707444, "epoch": 2.2203053968994055, "grad_norm": 4.53125, "learning_rate": 4.999711980075363e-05, "loss": 0.4655, "mean_token_accuracy": 0.9027316212654114, "num_tokens": 3870966.0, "step": 9525 }, { "entropy": 0.24506778195500373, "epoch": 2.221471033920037, "grad_norm": 7.65625, "learning_rate": 4.999708694206481e-05, "loss": 0.4823, "mean_token_accuracy": 0.893078601360321, "num_tokens": 3898239.0, "step": 9530 }, { "entropy": 0.2446131432428956, "epoch": 2.222636670940669, "grad_norm": 0.78515625, "learning_rate": 4.999705389702564e-05, "loss": 0.2505, "mean_token_accuracy": 0.9135530173778534, "num_tokens": 3927262.0, "step": 9535 }, { "entropy": 0.2848976690322161, "epoch": 2.223802307961301, "grad_norm": 3.875, "learning_rate": 4.999702066563658e-05, "loss": 0.4168, "mean_token_accuracy": 0.9176702558994293, "num_tokens": 3970657.0, "step": 9540 }, { "entropy": 0.26413310021162034, "epoch": 2.2249679449819326, "grad_norm": 0.6015625, "learning_rate": 4.9996987247898165e-05, "loss": 0.4095, "mean_token_accuracy": 0.9062711656093597, "num_tokens": 3998608.0, "step": 9545 }, { "entropy": 0.26131284348666667, "epoch": 2.2261335820025643, "grad_norm": 3.34375, "learning_rate": 4.9996953643810865e-05, "loss": 0.6044, "mean_token_accuracy": 0.8963152229785919, "num_tokens": 4026551.0, "step": 9550 }, { "entropy": 0.3163293443620205, "epoch": 2.2272992190231964, "grad_norm": 4.125, "learning_rate": 4.999691985337519e-05, "loss": 0.5343, "mean_token_accuracy": 0.9052989423274994, "num_tokens": 4042248.0, "step": 9555 }, { "entropy": 0.2981589982286096, "epoch": 2.228464856043828, "grad_norm": 2.21875, "learning_rate": 4.999688587659165e-05, "loss": 0.3268, "mean_token_accuracy": 0.9134493827819824, "num_tokens": 4070174.0, "step": 9560 }, { "entropy": 0.4530819907784462, "epoch": 2.2296304930644597, "grad_norm": 15.125, "learning_rate": 4.999685171346073e-05, "loss": 0.7796, "mean_token_accuracy": 0.8395863771438599, "num_tokens": 4097335.0, "step": 9565 }, { "entropy": 0.3206008315086365, "epoch": 2.2307961300850914, "grad_norm": 4.6875, "learning_rate": 4.999681736398297e-05, "loss": 0.6619, "mean_token_accuracy": 0.8948346257209778, "num_tokens": 4108162.0, "step": 9570 }, { "entropy": 0.22897542864084244, "epoch": 2.2319617671057235, "grad_norm": 3.96875, "learning_rate": 4.9996782828158864e-05, "loss": 0.5222, "mean_token_accuracy": 0.8944558799266815, "num_tokens": 4127142.0, "step": 9575 }, { "entropy": 0.30314663872122766, "epoch": 2.233127404126355, "grad_norm": 1.6875, "learning_rate": 4.999674810598892e-05, "loss": 0.6904, "mean_token_accuracy": 0.8810445845127106, "num_tokens": 4147146.0, "step": 9580 }, { "entropy": 0.27426308766007423, "epoch": 2.234293041146987, "grad_norm": 4.875, "learning_rate": 4.9996713197473674e-05, "loss": 0.4958, "mean_token_accuracy": 0.8857026934623718, "num_tokens": 4162818.0, "step": 9585 }, { "entropy": 0.24289622381329537, "epoch": 2.2354586781676185, "grad_norm": 3.65625, "learning_rate": 4.999667810261364e-05, "loss": 0.3609, "mean_token_accuracy": 0.9134008288383484, "num_tokens": 4180167.0, "step": 9590 }, { "entropy": 0.27631061524152756, "epoch": 2.2366243151882506, "grad_norm": 2.59375, "learning_rate": 4.9996642821409337e-05, "loss": 0.5898, "mean_token_accuracy": 0.9002882838249207, "num_tokens": 4192730.0, "step": 9595 }, { "entropy": 0.3148947186768055, "epoch": 2.237789952208882, "grad_norm": 1.984375, "learning_rate": 4.999660735386129e-05, "loss": 0.5189, "mean_token_accuracy": 0.905198335647583, "num_tokens": 4213631.0, "step": 9600 }, { "entropy": 0.2848841480910778, "epoch": 2.238955589229514, "grad_norm": 1.046875, "learning_rate": 4.999657169997003e-05, "loss": 0.4329, "mean_token_accuracy": 0.9120867669582366, "num_tokens": 4242635.0, "step": 9605 }, { "entropy": 0.2845593631267548, "epoch": 2.2401212262501455, "grad_norm": 0.4921875, "learning_rate": 4.9996535859736094e-05, "loss": 0.4789, "mean_token_accuracy": 0.8860286891460418, "num_tokens": 4269157.0, "step": 9610 }, { "entropy": 0.19470894411206247, "epoch": 2.2412868632707776, "grad_norm": 1.7890625, "learning_rate": 4.999649983316002e-05, "loss": 0.3254, "mean_token_accuracy": 0.9371213376522064, "num_tokens": 4294105.0, "step": 9615 }, { "entropy": 0.227097824588418, "epoch": 2.2424525002914093, "grad_norm": 0.5859375, "learning_rate": 4.999646362024232e-05, "loss": 0.3103, "mean_token_accuracy": 0.9307858169078826, "num_tokens": 4312123.0, "step": 9620 }, { "entropy": 0.4397062212228775, "epoch": 2.243618137312041, "grad_norm": 1.6484375, "learning_rate": 4.9996427220983566e-05, "loss": 0.685, "mean_token_accuracy": 0.8548967897891998, "num_tokens": 4332766.0, "step": 9625 }, { "entropy": 0.32833139449357984, "epoch": 2.2447837743326726, "grad_norm": 1.65625, "learning_rate": 4.999639063538428e-05, "loss": 0.5871, "mean_token_accuracy": 0.9062894225120545, "num_tokens": 4343414.0, "step": 9630 }, { "entropy": 0.2734944049268961, "epoch": 2.2459494113533047, "grad_norm": 7.65625, "learning_rate": 4.9996353863445016e-05, "loss": 0.4236, "mean_token_accuracy": 0.8899337708950043, "num_tokens": 4374207.0, "step": 9635 }, { "entropy": 0.358840149641037, "epoch": 2.2471150483739364, "grad_norm": 4.84375, "learning_rate": 4.999631690516632e-05, "loss": 0.8609, "mean_token_accuracy": 0.862494820356369, "num_tokens": 4382229.0, "step": 9640 }, { "entropy": 0.30931427925825117, "epoch": 2.248280685394568, "grad_norm": 3.875, "learning_rate": 4.9996279760548746e-05, "loss": 0.5868, "mean_token_accuracy": 0.8713773310184478, "num_tokens": 4402365.0, "step": 9645 }, { "entropy": 0.45645159631967547, "epoch": 2.2494463224151997, "grad_norm": 4.34375, "learning_rate": 4.9996242429592846e-05, "loss": 1.3055, "mean_token_accuracy": 0.8058351427316666, "num_tokens": 4412470.0, "step": 9650 }, { "entropy": 0.38287107944488524, "epoch": 2.250611959435832, "grad_norm": 4.15625, "learning_rate": 4.999620491229917e-05, "loss": 0.7953, "mean_token_accuracy": 0.8735459566116333, "num_tokens": 4420732.0, "step": 9655 }, { "entropy": 0.19908476956188678, "epoch": 2.2517775964564635, "grad_norm": 3.5625, "learning_rate": 4.9996167208668285e-05, "loss": 0.2668, "mean_token_accuracy": 0.9488315343856811, "num_tokens": 4449369.0, "step": 9660 }, { "entropy": 0.2650495123118162, "epoch": 2.252943233477095, "grad_norm": 3.96875, "learning_rate": 4.9996129318700754e-05, "loss": 0.3855, "mean_token_accuracy": 0.8953422248363495, "num_tokens": 4469436.0, "step": 9665 }, { "entropy": 0.267975103110075, "epoch": 2.254108870497727, "grad_norm": 1.546875, "learning_rate": 4.999609124239714e-05, "loss": 0.4036, "mean_token_accuracy": 0.9119938552379608, "num_tokens": 4490774.0, "step": 9670 }, { "entropy": 0.18695454075932502, "epoch": 2.255274507518359, "grad_norm": 9.4375, "learning_rate": 4.999605297975801e-05, "loss": 0.3051, "mean_token_accuracy": 0.9388129413127899, "num_tokens": 4522761.0, "step": 9675 }, { "entropy": 0.21596661265939474, "epoch": 2.2564401445389906, "grad_norm": 3.484375, "learning_rate": 4.999601453078394e-05, "loss": 0.4132, "mean_token_accuracy": 0.9199613273143769, "num_tokens": 4559702.0, "step": 9680 }, { "entropy": 0.19011754989624025, "epoch": 2.257605781559622, "grad_norm": 7.84375, "learning_rate": 4.999597589547549e-05, "loss": 0.4957, "mean_token_accuracy": 0.9160790145397186, "num_tokens": 4585921.0, "step": 9685 }, { "entropy": 0.2749990925192833, "epoch": 2.2587714185802543, "grad_norm": 2.21875, "learning_rate": 4.999593707383324e-05, "loss": 0.4338, "mean_token_accuracy": 0.9007754683494568, "num_tokens": 4602847.0, "step": 9690 }, { "entropy": 0.3145416095852852, "epoch": 2.259937055600886, "grad_norm": 2.234375, "learning_rate": 4.999589806585778e-05, "loss": 0.4691, "mean_token_accuracy": 0.8774303197860718, "num_tokens": 4639511.0, "step": 9695 }, { "entropy": 0.2634598663076758, "epoch": 2.2611026926215176, "grad_norm": 6.375, "learning_rate": 4.999585887154969e-05, "loss": 0.3744, "mean_token_accuracy": 0.90503711104393, "num_tokens": 4667015.0, "step": 9700 }, { "entropy": 0.32402964755892755, "epoch": 2.2622683296421493, "grad_norm": 1.5390625, "learning_rate": 4.9995819490909544e-05, "loss": 0.6425, "mean_token_accuracy": 0.8781411290168762, "num_tokens": 4681786.0, "step": 9705 }, { "entropy": 0.384226082265377, "epoch": 2.263433966662781, "grad_norm": 3.890625, "learning_rate": 4.999577992393794e-05, "loss": 0.436, "mean_token_accuracy": 0.8902345955371856, "num_tokens": 4698503.0, "step": 9710 }, { "entropy": 0.2572923541069031, "epoch": 2.264599603683413, "grad_norm": 3.15625, "learning_rate": 4.9995740170635454e-05, "loss": 0.5012, "mean_token_accuracy": 0.9116134166717529, "num_tokens": 4719487.0, "step": 9715 }, { "entropy": 0.26376195102930067, "epoch": 2.2657652407040447, "grad_norm": 3.34375, "learning_rate": 4.999570023100269e-05, "loss": 0.6524, "mean_token_accuracy": 0.8757637798786163, "num_tokens": 4731607.0, "step": 9720 }, { "entropy": 0.3332611836493015, "epoch": 2.2669308777246764, "grad_norm": 4.71875, "learning_rate": 4.999566010504024e-05, "loss": 0.6479, "mean_token_accuracy": 0.8807567059993744, "num_tokens": 4747368.0, "step": 9725 }, { "entropy": 0.25054799765348434, "epoch": 2.2680965147453085, "grad_norm": 4.875, "learning_rate": 4.9995619792748704e-05, "loss": 0.3999, "mean_token_accuracy": 0.9023214876651764, "num_tokens": 4770577.0, "step": 9730 }, { "entropy": 0.30612233132123945, "epoch": 2.26926215176594, "grad_norm": 4.90625, "learning_rate": 4.999557929412868e-05, "loss": 0.6414, "mean_token_accuracy": 0.8935961663722992, "num_tokens": 4780746.0, "step": 9735 }, { "entropy": 0.27663658782839773, "epoch": 2.270427788786572, "grad_norm": 5.75, "learning_rate": 4.9995538609180786e-05, "loss": 0.5946, "mean_token_accuracy": 0.9024072468280793, "num_tokens": 4803752.0, "step": 9740 }, { "entropy": 0.28249269388616083, "epoch": 2.2715934258072035, "grad_norm": 0.73046875, "learning_rate": 4.9995497737905604e-05, "loss": 0.4113, "mean_token_accuracy": 0.8761835098266602, "num_tokens": 4827859.0, "step": 9745 }, { "entropy": 0.32249484956264496, "epoch": 2.2727590628278356, "grad_norm": 4.625, "learning_rate": 4.999545668030377e-05, "loss": 0.4384, "mean_token_accuracy": 0.8888987183570862, "num_tokens": 4842531.0, "step": 9750 }, { "entropy": 0.269062814116478, "epoch": 2.2739246998484672, "grad_norm": 0.494140625, "learning_rate": 4.999541543637587e-05, "loss": 0.2706, "mean_token_accuracy": 0.9015085399150848, "num_tokens": 4867038.0, "step": 9755 }, { "entropy": 0.3270197004079819, "epoch": 2.275090336869099, "grad_norm": 2.671875, "learning_rate": 4.999537400612254e-05, "loss": 0.7103, "mean_token_accuracy": 0.8657562255859375, "num_tokens": 4887793.0, "step": 9760 }, { "entropy": 0.3015760459005833, "epoch": 2.2762559738897306, "grad_norm": 2.203125, "learning_rate": 4.999533238954438e-05, "loss": 0.4924, "mean_token_accuracy": 0.8944982051849365, "num_tokens": 4903059.0, "step": 9765 }, { "entropy": 0.3116227902472019, "epoch": 2.2774216109103627, "grad_norm": 0.71875, "learning_rate": 4.999529058664203e-05, "loss": 0.5879, "mean_token_accuracy": 0.8805232584476471, "num_tokens": 4929379.0, "step": 9770 }, { "entropy": 0.21581790670752526, "epoch": 2.2785872479309943, "grad_norm": 4.125, "learning_rate": 4.9995248597416095e-05, "loss": 0.409, "mean_token_accuracy": 0.925852793455124, "num_tokens": 4951193.0, "step": 9775 }, { "entropy": 0.2722294881939888, "epoch": 2.279752884951626, "grad_norm": 1.3125, "learning_rate": 4.9995206421867214e-05, "loss": 0.526, "mean_token_accuracy": 0.888937509059906, "num_tokens": 4975705.0, "step": 9780 }, { "entropy": 0.3000844083726406, "epoch": 2.280918521972258, "grad_norm": 1.671875, "learning_rate": 4.999516405999601e-05, "loss": 0.4401, "mean_token_accuracy": 0.8820734798908234, "num_tokens": 4995047.0, "step": 9785 }, { "entropy": 0.4159097492694855, "epoch": 2.2820841589928897, "grad_norm": 1.8203125, "learning_rate": 4.999512151180312e-05, "loss": 0.7088, "mean_token_accuracy": 0.850404542684555, "num_tokens": 5011382.0, "step": 9790 }, { "entropy": 0.3981002628803253, "epoch": 2.2832497960135214, "grad_norm": 1.2890625, "learning_rate": 4.999507877728917e-05, "loss": 0.6603, "mean_token_accuracy": 0.8550622880458831, "num_tokens": 5038138.0, "step": 9795 }, { "entropy": 0.32265768349170687, "epoch": 2.284415433034153, "grad_norm": 3.0625, "learning_rate": 4.99950358564548e-05, "loss": 0.5588, "mean_token_accuracy": 0.8854742705821991, "num_tokens": 5065301.0, "step": 9800 }, { "entropy": 0.29461741372942923, "epoch": 2.2855810700547847, "grad_norm": 4.625, "learning_rate": 4.9994992749300656e-05, "loss": 0.6825, "mean_token_accuracy": 0.8878226995468139, "num_tokens": 5082478.0, "step": 9805 }, { "entropy": 0.29813172612339256, "epoch": 2.286746707075417, "grad_norm": 4.5625, "learning_rate": 4.999494945582738e-05, "loss": 0.6086, "mean_token_accuracy": 0.8924034178256989, "num_tokens": 5101083.0, "step": 9810 }, { "entropy": 0.2548986107110977, "epoch": 2.2879123440960485, "grad_norm": 0.85546875, "learning_rate": 4.99949059760356e-05, "loss": 0.3604, "mean_token_accuracy": 0.9100441932678223, "num_tokens": 5128706.0, "step": 9815 }, { "entropy": 0.45110637992620467, "epoch": 2.28907798111668, "grad_norm": 6.15625, "learning_rate": 4.9994862309925995e-05, "loss": 0.6881, "mean_token_accuracy": 0.884794396162033, "num_tokens": 5171832.0, "step": 9820 }, { "entropy": 0.33002062514424324, "epoch": 2.2902436181373123, "grad_norm": 2.390625, "learning_rate": 4.999481845749919e-05, "loss": 0.6272, "mean_token_accuracy": 0.883812814950943, "num_tokens": 5184622.0, "step": 9825 }, { "entropy": 0.23434590175747871, "epoch": 2.291409255157944, "grad_norm": 0.70703125, "learning_rate": 4.999477441875585e-05, "loss": 0.2984, "mean_token_accuracy": 0.9214715182781219, "num_tokens": 5212319.0, "step": 9830 }, { "entropy": 0.20613327473402024, "epoch": 2.2925748921785756, "grad_norm": 2.1875, "learning_rate": 4.9994730193696634e-05, "loss": 0.2315, "mean_token_accuracy": 0.9313937187194824, "num_tokens": 5234725.0, "step": 9835 }, { "entropy": 0.22340739741921425, "epoch": 2.2937405291992072, "grad_norm": 7.78125, "learning_rate": 4.9994685782322195e-05, "loss": 0.5222, "mean_token_accuracy": 0.9099694013595581, "num_tokens": 5248701.0, "step": 9840 }, { "entropy": 0.33365772254765036, "epoch": 2.294906166219839, "grad_norm": 5.28125, "learning_rate": 4.9994641184633195e-05, "loss": 0.6683, "mean_token_accuracy": 0.8572822451591492, "num_tokens": 5264275.0, "step": 9845 }, { "entropy": 0.1759272739291191, "epoch": 2.296071803240471, "grad_norm": 1.71875, "learning_rate": 4.999459640063031e-05, "loss": 0.2545, "mean_token_accuracy": 0.9185454368591308, "num_tokens": 5291729.0, "step": 9850 }, { "entropy": 0.27733614556491376, "epoch": 2.2972374402611027, "grad_norm": 6.375, "learning_rate": 4.999455143031419e-05, "loss": 0.5294, "mean_token_accuracy": 0.8734292328357697, "num_tokens": 5312219.0, "step": 9855 }, { "entropy": 0.2136335577815771, "epoch": 2.2984030772817343, "grad_norm": 6.03125, "learning_rate": 4.999450627368552e-05, "loss": 0.4283, "mean_token_accuracy": 0.9121579229831696, "num_tokens": 5353777.0, "step": 9860 }, { "entropy": 0.26821391507983205, "epoch": 2.2995687143023664, "grad_norm": 2.65625, "learning_rate": 4.999446093074497e-05, "loss": 0.298, "mean_token_accuracy": 0.9142944037914276, "num_tokens": 5370503.0, "step": 9865 }, { "entropy": 0.22883135303854943, "epoch": 2.300734351322998, "grad_norm": 1.7265625, "learning_rate": 4.999441540149321e-05, "loss": 0.3735, "mean_token_accuracy": 0.9307141840457916, "num_tokens": 5394825.0, "step": 9870 }, { "entropy": 0.22144275531172752, "epoch": 2.3018999883436297, "grad_norm": 2.90625, "learning_rate": 4.999436968593093e-05, "loss": 0.4087, "mean_token_accuracy": 0.9137353122234344, "num_tokens": 5410431.0, "step": 9875 }, { "entropy": 0.27658817134797575, "epoch": 2.3030656253642614, "grad_norm": 4.90625, "learning_rate": 4.9994323784058805e-05, "loss": 0.5001, "mean_token_accuracy": 0.9053739786148072, "num_tokens": 5430503.0, "step": 9880 }, { "entropy": 0.28773905634880065, "epoch": 2.3042312623848935, "grad_norm": 5.84375, "learning_rate": 4.999427769587752e-05, "loss": 0.6671, "mean_token_accuracy": 0.8880759894847869, "num_tokens": 5441651.0, "step": 9885 }, { "entropy": 0.2942317187786102, "epoch": 2.305396899405525, "grad_norm": 2.921875, "learning_rate": 4.999423142138775e-05, "loss": 0.6182, "mean_token_accuracy": 0.8829988837242126, "num_tokens": 5467590.0, "step": 9890 }, { "entropy": 0.37470342963933945, "epoch": 2.306562536426157, "grad_norm": 4.625, "learning_rate": 4.9994184960590215e-05, "loss": 0.8504, "mean_token_accuracy": 0.8617240965366364, "num_tokens": 5475251.0, "step": 9895 }, { "entropy": 0.2544895693659782, "epoch": 2.3077281734467885, "grad_norm": 2.234375, "learning_rate": 4.999413831348558e-05, "loss": 0.324, "mean_token_accuracy": 0.9116761982440948, "num_tokens": 5489862.0, "step": 9900 }, { "entropy": 0.23354592993855477, "epoch": 2.3088938104674206, "grad_norm": 1.609375, "learning_rate": 4.999409148007455e-05, "loss": 0.4133, "mean_token_accuracy": 0.908677488565445, "num_tokens": 5515027.0, "step": 9905 }, { "entropy": 0.3375227749347687, "epoch": 2.3100594474880523, "grad_norm": 5.4375, "learning_rate": 4.999404446035782e-05, "loss": 0.6789, "mean_token_accuracy": 0.8693827629089356, "num_tokens": 5524493.0, "step": 9910 }, { "entropy": 0.3225863866508007, "epoch": 2.311225084508684, "grad_norm": 2.96875, "learning_rate": 4.9993997254336104e-05, "loss": 0.5651, "mean_token_accuracy": 0.8935315608978271, "num_tokens": 5538810.0, "step": 9915 }, { "entropy": 0.3349943034350872, "epoch": 2.312390721529316, "grad_norm": 5.3125, "learning_rate": 4.99939498620101e-05, "loss": 0.718, "mean_token_accuracy": 0.868977564573288, "num_tokens": 5554161.0, "step": 9920 }, { "entropy": 0.33131011128425597, "epoch": 2.3135563585499477, "grad_norm": 4.84375, "learning_rate": 4.99939022833805e-05, "loss": 0.6606, "mean_token_accuracy": 0.886441296339035, "num_tokens": 5564690.0, "step": 9925 }, { "entropy": 0.3090111125260592, "epoch": 2.3147219955705793, "grad_norm": 1.140625, "learning_rate": 4.9993854518448033e-05, "loss": 0.4239, "mean_token_accuracy": 0.8963955879211426, "num_tokens": 5594772.0, "step": 9930 }, { "entropy": 0.366110710054636, "epoch": 2.315887632591211, "grad_norm": 6.15625, "learning_rate": 4.99938065672134e-05, "loss": 0.7542, "mean_token_accuracy": 0.8663625955581665, "num_tokens": 5604796.0, "step": 9935 }, { "entropy": 0.2388034023344517, "epoch": 2.3170532696118427, "grad_norm": 2.671875, "learning_rate": 4.9993758429677324e-05, "loss": 0.4317, "mean_token_accuracy": 0.9069565892219543, "num_tokens": 5621314.0, "step": 9940 }, { "entropy": 0.2781688906252384, "epoch": 2.3182189066324748, "grad_norm": 3.265625, "learning_rate": 4.999371010584051e-05, "loss": 0.6039, "mean_token_accuracy": 0.9053974390029907, "num_tokens": 5634592.0, "step": 9945 }, { "entropy": 0.40243220534175633, "epoch": 2.3193845436531064, "grad_norm": 0.98828125, "learning_rate": 4.999366159570369e-05, "loss": 0.679, "mean_token_accuracy": 0.8980438828468322, "num_tokens": 5670943.0, "step": 9950 }, { "entropy": 0.2519418615847826, "epoch": 2.320550180673738, "grad_norm": 1.4921875, "learning_rate": 4.999361289926759e-05, "loss": 0.4197, "mean_token_accuracy": 0.9068368792533874, "num_tokens": 5690810.0, "step": 9955 }, { "entropy": 0.27322563864290716, "epoch": 2.32171581769437, "grad_norm": 1.4765625, "learning_rate": 4.999356401653292e-05, "loss": 0.4227, "mean_token_accuracy": 0.8880882680416107, "num_tokens": 5715025.0, "step": 9960 }, { "entropy": 0.26188406348228455, "epoch": 2.322881454715002, "grad_norm": 1.3828125, "learning_rate": 4.999351494750043e-05, "loss": 0.5393, "mean_token_accuracy": 0.8850970089435577, "num_tokens": 5728900.0, "step": 9965 }, { "entropy": 0.2880741149187088, "epoch": 2.3240470917356335, "grad_norm": 2.5, "learning_rate": 4.9993465692170834e-05, "loss": 0.5976, "mean_token_accuracy": 0.8882794141769409, "num_tokens": 5749009.0, "step": 9970 }, { "entropy": 0.24220139384269715, "epoch": 2.325212728756265, "grad_norm": 3.0625, "learning_rate": 4.999341625054487e-05, "loss": 0.3774, "mean_token_accuracy": 0.9123858332633972, "num_tokens": 5775137.0, "step": 9975 }, { "entropy": 0.4156968787312508, "epoch": 2.326378365776897, "grad_norm": 2.21875, "learning_rate": 4.9993366622623286e-05, "loss": 0.7635, "mean_token_accuracy": 0.8408162415027618, "num_tokens": 5796075.0, "step": 9980 }, { "entropy": 0.28073378577828406, "epoch": 2.327544002797529, "grad_norm": 1.5703125, "learning_rate": 4.9993316808406807e-05, "loss": 0.3659, "mean_token_accuracy": 0.9161297023296356, "num_tokens": 5823916.0, "step": 9985 }, { "entropy": 0.2217119388282299, "epoch": 2.3287096398181606, "grad_norm": 1.890625, "learning_rate": 4.999326680789619e-05, "loss": 0.2558, "mean_token_accuracy": 0.9355304777622223, "num_tokens": 5847977.0, "step": 9990 }, { "entropy": 0.29391138851642606, "epoch": 2.3298752768387923, "grad_norm": 0.80078125, "learning_rate": 4.999321662109218e-05, "loss": 0.4333, "mean_token_accuracy": 0.9192675650119781, "num_tokens": 5883540.0, "step": 9995 }, { "entropy": 0.3789336267858744, "epoch": 2.3310409138594244, "grad_norm": 5.28125, "learning_rate": 4.999316624799551e-05, "loss": 0.754, "mean_token_accuracy": 0.8675079941749573, "num_tokens": 5896164.0, "step": 10000 }, { "entropy": 0.35315763801336286, "epoch": 2.332206550880056, "grad_norm": 5.1875, "learning_rate": 4.999311568860694e-05, "loss": 0.7894, "mean_token_accuracy": 0.8786959171295166, "num_tokens": 5922708.0, "step": 10005 }, { "entropy": 0.346806064248085, "epoch": 2.3333721879006877, "grad_norm": 5.75, "learning_rate": 4.999306494292722e-05, "loss": 0.6594, "mean_token_accuracy": 0.8650177419185638, "num_tokens": 5942033.0, "step": 10010 }, { "entropy": 0.34452898651361463, "epoch": 2.3345378249213193, "grad_norm": 4.90625, "learning_rate": 4.999301401095712e-05, "loss": 0.7611, "mean_token_accuracy": 0.8749968349933624, "num_tokens": 5950976.0, "step": 10015 }, { "entropy": 0.22225252017378808, "epoch": 2.3357034619419514, "grad_norm": 0.65234375, "learning_rate": 4.999296289269739e-05, "loss": 0.4898, "mean_token_accuracy": 0.9027444779872894, "num_tokens": 5982814.0, "step": 10020 }, { "entropy": 0.23755434323102237, "epoch": 2.336869098962583, "grad_norm": 2.0, "learning_rate": 4.999291158814879e-05, "loss": 0.3463, "mean_token_accuracy": 0.9088250041007996, "num_tokens": 6005082.0, "step": 10025 }, { "entropy": 0.29438409954309464, "epoch": 2.3380347359832148, "grad_norm": 3.78125, "learning_rate": 4.999286009731209e-05, "loss": 0.6279, "mean_token_accuracy": 0.8902266621589661, "num_tokens": 6018880.0, "step": 10030 }, { "entropy": 0.3445430710911751, "epoch": 2.3392003730038464, "grad_norm": 2.21875, "learning_rate": 4.9992808420188056e-05, "loss": 0.4228, "mean_token_accuracy": 0.8743822872638702, "num_tokens": 6045068.0, "step": 10035 }, { "entropy": 0.24550675339996814, "epoch": 2.3403660100244785, "grad_norm": 1.7109375, "learning_rate": 4.9992756556777457e-05, "loss": 0.385, "mean_token_accuracy": 0.9098658263683319, "num_tokens": 6087604.0, "step": 10040 }, { "entropy": 0.21833256110548974, "epoch": 2.34153164704511, "grad_norm": 4.40625, "learning_rate": 4.999270450708106e-05, "loss": 0.3856, "mean_token_accuracy": 0.9172523081302643, "num_tokens": 6112153.0, "step": 10045 }, { "entropy": 0.18027405068278313, "epoch": 2.342697284065742, "grad_norm": 0.76953125, "learning_rate": 4.999265227109966e-05, "loss": 0.2369, "mean_token_accuracy": 0.9305807530879975, "num_tokens": 6142263.0, "step": 10050 }, { "entropy": 0.3012556668370962, "epoch": 2.343862921086374, "grad_norm": 2.5, "learning_rate": 4.999259984883402e-05, "loss": 0.3072, "mean_token_accuracy": 0.9214911222457886, "num_tokens": 6162712.0, "step": 10055 }, { "entropy": 0.2414890991523862, "epoch": 2.3450285581070056, "grad_norm": 3.8125, "learning_rate": 4.9992547240284926e-05, "loss": 0.4834, "mean_token_accuracy": 0.8984517753124237, "num_tokens": 6183240.0, "step": 10060 }, { "entropy": 0.29529370069503785, "epoch": 2.3461941951276373, "grad_norm": 0.8984375, "learning_rate": 4.9992494445453166e-05, "loss": 0.4558, "mean_token_accuracy": 0.8840475618839264, "num_tokens": 6220703.0, "step": 10065 }, { "entropy": 0.22695901058614254, "epoch": 2.347359832148269, "grad_norm": 1.4921875, "learning_rate": 4.9992441464339516e-05, "loss": 0.3135, "mean_token_accuracy": 0.928707069158554, "num_tokens": 6248979.0, "step": 10070 }, { "entropy": 0.2766278892755508, "epoch": 2.3485254691689006, "grad_norm": 1.953125, "learning_rate": 4.999238829694478e-05, "loss": 0.549, "mean_token_accuracy": 0.9088833332061768, "num_tokens": 6260919.0, "step": 10075 }, { "entropy": 0.1924386665225029, "epoch": 2.3496911061895327, "grad_norm": 1.03125, "learning_rate": 4.9992334943269746e-05, "loss": 0.2784, "mean_token_accuracy": 0.929617577791214, "num_tokens": 6286410.0, "step": 10080 }, { "entropy": 0.32888844013214114, "epoch": 2.3508567432101644, "grad_norm": 2.234375, "learning_rate": 4.9992281403315206e-05, "loss": 0.6281, "mean_token_accuracy": 0.8847958862781524, "num_tokens": 6296942.0, "step": 10085 }, { "entropy": 0.2987643375992775, "epoch": 2.352022380230796, "grad_norm": 4.0, "learning_rate": 4.999222767708196e-05, "loss": 0.5998, "mean_token_accuracy": 0.8935287356376648, "num_tokens": 6314166.0, "step": 10090 }, { "entropy": 0.15821853913366796, "epoch": 2.353188017251428, "grad_norm": 0.953125, "learning_rate": 4.999217376457082e-05, "loss": 0.1544, "mean_token_accuracy": 0.9383697509765625, "num_tokens": 6351218.0, "step": 10095 }, { "entropy": 0.3634513683617115, "epoch": 2.35435365427206, "grad_norm": 5.09375, "learning_rate": 4.9992119665782564e-05, "loss": 0.6301, "mean_token_accuracy": 0.8913884639739991, "num_tokens": 6369883.0, "step": 10100 }, { "entropy": 0.30150746181607246, "epoch": 2.3555192912926914, "grad_norm": 2.0625, "learning_rate": 4.999206538071802e-05, "loss": 0.5422, "mean_token_accuracy": 0.899857884645462, "num_tokens": 6390839.0, "step": 10105 }, { "entropy": 0.34220693577080963, "epoch": 2.356684928313323, "grad_norm": 3.71875, "learning_rate": 4.999201090937799e-05, "loss": 0.5697, "mean_token_accuracy": 0.8890272676944733, "num_tokens": 6409615.0, "step": 10110 }, { "entropy": 0.35875163078308103, "epoch": 2.3578505653339548, "grad_norm": 0.9296875, "learning_rate": 4.99919562517633e-05, "loss": 0.6268, "mean_token_accuracy": 0.8982849538326263, "num_tokens": 6437909.0, "step": 10115 }, { "entropy": 0.33476879745721816, "epoch": 2.359016202354587, "grad_norm": 8.875, "learning_rate": 4.9991901407874744e-05, "loss": 0.5472, "mean_token_accuracy": 0.8878929436206817, "num_tokens": 6455517.0, "step": 10120 }, { "entropy": 0.2315022237598896, "epoch": 2.3601818393752185, "grad_norm": 3.265625, "learning_rate": 4.999184637771315e-05, "loss": 0.3871, "mean_token_accuracy": 0.9159708678722381, "num_tokens": 6469407.0, "step": 10125 }, { "entropy": 0.3491888826712966, "epoch": 2.36134747639585, "grad_norm": 5.4375, "learning_rate": 4.9991791161279335e-05, "loss": 0.3751, "mean_token_accuracy": 0.8821563601493836, "num_tokens": 6502505.0, "step": 10130 }, { "entropy": 0.23251214995980263, "epoch": 2.3625131134164823, "grad_norm": 3.25, "learning_rate": 4.999173575857413e-05, "loss": 0.3658, "mean_token_accuracy": 0.9205073654651642, "num_tokens": 6529179.0, "step": 10135 }, { "entropy": 0.22121235951781273, "epoch": 2.363678750437114, "grad_norm": 1.21875, "learning_rate": 4.999168016959836e-05, "loss": 0.365, "mean_token_accuracy": 0.9118550777435303, "num_tokens": 6547764.0, "step": 10140 }, { "entropy": 0.3553631380200386, "epoch": 2.3648443874577456, "grad_norm": 4.34375, "learning_rate": 4.999162439435284e-05, "loss": 0.5203, "mean_token_accuracy": 0.8921437501907349, "num_tokens": 6566759.0, "step": 10145 }, { "entropy": 0.2485006831586361, "epoch": 2.3660100244783773, "grad_norm": 1.125, "learning_rate": 4.999156843283842e-05, "loss": 0.5423, "mean_token_accuracy": 0.9096591651439667, "num_tokens": 6582040.0, "step": 10150 }, { "entropy": 0.24684040918946265, "epoch": 2.3671756614990094, "grad_norm": 5.78125, "learning_rate": 4.9991512285055925e-05, "loss": 0.3533, "mean_token_accuracy": 0.9087679147720337, "num_tokens": 6612175.0, "step": 10155 }, { "entropy": 0.25654774941504, "epoch": 2.368341298519641, "grad_norm": 0.65234375, "learning_rate": 4.999145595100619e-05, "loss": 0.4405, "mean_token_accuracy": 0.9002897024154664, "num_tokens": 6633403.0, "step": 10160 }, { "entropy": 0.32544124498963356, "epoch": 2.3695069355402727, "grad_norm": 4.4375, "learning_rate": 4.9991399430690065e-05, "loss": 0.5944, "mean_token_accuracy": 0.8853745698928833, "num_tokens": 6651214.0, "step": 10165 }, { "entropy": 0.286426867544651, "epoch": 2.3706725725609044, "grad_norm": 3.234375, "learning_rate": 4.9991342724108373e-05, "loss": 0.5173, "mean_token_accuracy": 0.9167122066020965, "num_tokens": 6661904.0, "step": 10170 }, { "entropy": 0.23976799920201303, "epoch": 2.3718382095815365, "grad_norm": 0.90625, "learning_rate": 4.999128583126199e-05, "loss": 0.3663, "mean_token_accuracy": 0.9295875430107117, "num_tokens": 6689129.0, "step": 10175 }, { "entropy": 0.2436413548886776, "epoch": 2.373003846602168, "grad_norm": 6.21875, "learning_rate": 4.999122875215173e-05, "loss": 0.3969, "mean_token_accuracy": 0.9143813967704773, "num_tokens": 6704633.0, "step": 10180 }, { "entropy": 0.20087213069200516, "epoch": 2.3741694836228, "grad_norm": 3.9375, "learning_rate": 4.9991171486778475e-05, "loss": 0.2498, "mean_token_accuracy": 0.9344460725784302, "num_tokens": 6732545.0, "step": 10185 }, { "entropy": 0.2517100729048252, "epoch": 2.3753351206434314, "grad_norm": 3.34375, "learning_rate": 4.999111403514306e-05, "loss": 0.4214, "mean_token_accuracy": 0.9093203961849212, "num_tokens": 6753968.0, "step": 10190 }, { "entropy": 0.18460363671183586, "epoch": 2.3765007576640635, "grad_norm": 2.5, "learning_rate": 4.999105639724635e-05, "loss": 0.2715, "mean_token_accuracy": 0.9433306694030762, "num_tokens": 6778317.0, "step": 10195 }, { "entropy": 0.2778391644358635, "epoch": 2.377666394684695, "grad_norm": 2.328125, "learning_rate": 4.99909985730892e-05, "loss": 0.5476, "mean_token_accuracy": 0.9003312945365906, "num_tokens": 6790474.0, "step": 10200 }, { "entropy": 0.2117137961089611, "epoch": 2.378832031705327, "grad_norm": 2.78125, "learning_rate": 4.9990940562672466e-05, "loss": 0.3378, "mean_token_accuracy": 0.9290489733219147, "num_tokens": 6809944.0, "step": 10205 }, { "entropy": 0.25846754014492035, "epoch": 2.3799976687259585, "grad_norm": 3.9375, "learning_rate": 4.999088236599703e-05, "loss": 0.4026, "mean_token_accuracy": 0.9089631319046021, "num_tokens": 6826269.0, "step": 10210 }, { "entropy": 0.24787395931780337, "epoch": 2.3811633057465906, "grad_norm": 6.09375, "learning_rate": 4.999082398306375e-05, "loss": 0.4468, "mean_token_accuracy": 0.9027640461921692, "num_tokens": 6842642.0, "step": 10215 }, { "entropy": 0.31294687986373904, "epoch": 2.3823289427672223, "grad_norm": 2.734375, "learning_rate": 4.99907654138735e-05, "loss": 0.41, "mean_token_accuracy": 0.9093664288520813, "num_tokens": 6871017.0, "step": 10220 }, { "entropy": 0.4047424577176571, "epoch": 2.383494579787854, "grad_norm": 5.8125, "learning_rate": 4.999070665842714e-05, "loss": 0.6061, "mean_token_accuracy": 0.8631377577781677, "num_tokens": 6899208.0, "step": 10225 }, { "entropy": 0.32310840412974356, "epoch": 2.384660216808486, "grad_norm": 4.15625, "learning_rate": 4.9990647716725565e-05, "loss": 0.5589, "mean_token_accuracy": 0.8941978096961976, "num_tokens": 6915248.0, "step": 10230 }, { "entropy": 0.17672128304839135, "epoch": 2.3858258538291177, "grad_norm": 0.85546875, "learning_rate": 4.9990588588769636e-05, "loss": 0.1898, "mean_token_accuracy": 0.932169246673584, "num_tokens": 6948634.0, "step": 10235 }, { "entropy": 0.2601334646344185, "epoch": 2.3869914908497494, "grad_norm": 1.890625, "learning_rate": 4.9990529274560256e-05, "loss": 0.5304, "mean_token_accuracy": 0.9085682332515717, "num_tokens": 6959783.0, "step": 10240 }, { "entropy": 0.2183239098638296, "epoch": 2.388157127870381, "grad_norm": 0.70703125, "learning_rate": 4.999046977409829e-05, "loss": 0.3807, "mean_token_accuracy": 0.9079346299171448, "num_tokens": 6980023.0, "step": 10245 }, { "entropy": 0.2789805203676224, "epoch": 2.3893227648910127, "grad_norm": 4.3125, "learning_rate": 4.999041008738464e-05, "loss": 0.4494, "mean_token_accuracy": 0.9105753004550934, "num_tokens": 7001779.0, "step": 10250 }, { "entropy": 0.23990599066019058, "epoch": 2.390488401911645, "grad_norm": 1.2578125, "learning_rate": 4.999035021442018e-05, "loss": 0.2052, "mean_token_accuracy": 0.9130673289299012, "num_tokens": 7042231.0, "step": 10255 }, { "entropy": 0.33709273431450126, "epoch": 2.3916540389322765, "grad_norm": 3.28125, "learning_rate": 4.999029015520582e-05, "loss": 0.4839, "mean_token_accuracy": 0.9022141516208648, "num_tokens": 7064958.0, "step": 10260 }, { "entropy": 0.28133830726146697, "epoch": 2.392819675952908, "grad_norm": 3.34375, "learning_rate": 4.999022990974244e-05, "loss": 0.5064, "mean_token_accuracy": 0.9095320522785186, "num_tokens": 7075220.0, "step": 10265 }, { "entropy": 0.25721886157989504, "epoch": 2.3939853129735402, "grad_norm": 0.90625, "learning_rate": 4.9990169478030955e-05, "loss": 0.4456, "mean_token_accuracy": 0.9056904733180999, "num_tokens": 7103947.0, "step": 10270 }, { "entropy": 0.2558513440191746, "epoch": 2.395150949994172, "grad_norm": 5.90625, "learning_rate": 4.999010886007225e-05, "loss": 0.6014, "mean_token_accuracy": 0.9011744916439056, "num_tokens": 7115900.0, "step": 10275 }, { "entropy": 0.3650269165635109, "epoch": 2.3963165870148035, "grad_norm": 5.625, "learning_rate": 4.9990048055867236e-05, "loss": 0.8017, "mean_token_accuracy": 0.8734424829483032, "num_tokens": 7123489.0, "step": 10280 }, { "entropy": 0.41591184586286545, "epoch": 2.397482224035435, "grad_norm": 2.15625, "learning_rate": 4.998998706541682e-05, "loss": 0.7947, "mean_token_accuracy": 0.8584976613521575, "num_tokens": 7135823.0, "step": 10285 }, { "entropy": 0.27260911352932454, "epoch": 2.3986478610560673, "grad_norm": 0.8671875, "learning_rate": 4.998992588872191e-05, "loss": 0.4248, "mean_token_accuracy": 0.9116097688674927, "num_tokens": 7155099.0, "step": 10290 }, { "entropy": 0.3118731968104839, "epoch": 2.399813498076699, "grad_norm": 3.78125, "learning_rate": 4.9989864525783426e-05, "loss": 0.673, "mean_token_accuracy": 0.8843840837478638, "num_tokens": 7167022.0, "step": 10295 }, { "entropy": 0.2853694766759872, "epoch": 2.4009791350973306, "grad_norm": 1.9375, "learning_rate": 4.998980297660227e-05, "loss": 0.5487, "mean_token_accuracy": 0.9019811570644378, "num_tokens": 7178667.0, "step": 10300 }, { "entropy": 0.176923687197268, "epoch": 2.4021447721179623, "grad_norm": 3.109375, "learning_rate": 4.998974124117937e-05, "loss": 0.3027, "mean_token_accuracy": 0.9455612003803253, "num_tokens": 7213035.0, "step": 10305 }, { "entropy": 0.25026210620999334, "epoch": 2.4033104091385944, "grad_norm": 1.265625, "learning_rate": 4.9989679319515637e-05, "loss": 0.3769, "mean_token_accuracy": 0.902446985244751, "num_tokens": 7230402.0, "step": 10310 }, { "entropy": 0.31051740124821664, "epoch": 2.404476046159226, "grad_norm": 0.765625, "learning_rate": 4.9989617211612e-05, "loss": 0.6664, "mean_token_accuracy": 0.8678971707820893, "num_tokens": 7262503.0, "step": 10315 }, { "entropy": 0.3076247863471508, "epoch": 2.4056416831798577, "grad_norm": 2.671875, "learning_rate": 4.998955491746938e-05, "loss": 0.5008, "mean_token_accuracy": 0.8952358782291412, "num_tokens": 7276921.0, "step": 10320 }, { "entropy": 0.28582123182713987, "epoch": 2.4068073202004894, "grad_norm": 0.62109375, "learning_rate": 4.9989492437088724e-05, "loss": 0.4781, "mean_token_accuracy": 0.9080114364624023, "num_tokens": 7299222.0, "step": 10325 }, { "entropy": 0.22197759114205837, "epoch": 2.4079729572211215, "grad_norm": 3.84375, "learning_rate": 4.998942977047094e-05, "loss": 0.4596, "mean_token_accuracy": 0.9132648468017578, "num_tokens": 7330389.0, "step": 10330 }, { "entropy": 0.31743494495749475, "epoch": 2.409138594241753, "grad_norm": 4.84375, "learning_rate": 4.998936691761698e-05, "loss": 0.642, "mean_token_accuracy": 0.8967314422130584, "num_tokens": 7341450.0, "step": 10335 }, { "entropy": 0.23332953825592995, "epoch": 2.410304231262385, "grad_norm": 0.62890625, "learning_rate": 4.9989303878527774e-05, "loss": 0.3062, "mean_token_accuracy": 0.9363838791847229, "num_tokens": 7378009.0, "step": 10340 }, { "entropy": 0.2116334456950426, "epoch": 2.4114698682830165, "grad_norm": 1.0234375, "learning_rate": 4.998924065320426e-05, "loss": 0.4065, "mean_token_accuracy": 0.9076114535331726, "num_tokens": 7395486.0, "step": 10345 }, { "entropy": 0.37385508567094805, "epoch": 2.4126355053036486, "grad_norm": 5.125, "learning_rate": 4.9989177241647376e-05, "loss": 0.8335, "mean_token_accuracy": 0.8719491124153137, "num_tokens": 7403633.0, "step": 10350 }, { "entropy": 0.2373753260821104, "epoch": 2.4138011423242802, "grad_norm": 1.609375, "learning_rate": 4.998911364385808e-05, "loss": 0.3948, "mean_token_accuracy": 0.9229023635387421, "num_tokens": 7427889.0, "step": 10355 }, { "entropy": 0.26355861928313973, "epoch": 2.414966779344912, "grad_norm": 0.546875, "learning_rate": 4.998904985983732e-05, "loss": 0.3508, "mean_token_accuracy": 0.9301092326641083, "num_tokens": 7452324.0, "step": 10360 }, { "entropy": 0.2737158641219139, "epoch": 2.416132416365544, "grad_norm": 0.8828125, "learning_rate": 4.998898588958604e-05, "loss": 0.414, "mean_token_accuracy": 0.9086792588233947, "num_tokens": 7470897.0, "step": 10365 }, { "entropy": 0.19756668284535409, "epoch": 2.4172980533861756, "grad_norm": 2.671875, "learning_rate": 4.9988921733105196e-05, "loss": 0.35, "mean_token_accuracy": 0.9091159641742707, "num_tokens": 7499445.0, "step": 10370 }, { "entropy": 0.23579902928322555, "epoch": 2.4184636904068073, "grad_norm": 9.0625, "learning_rate": 4.998885739039574e-05, "loss": 0.5282, "mean_token_accuracy": 0.9076445281505585, "num_tokens": 7532011.0, "step": 10375 }, { "entropy": 0.32499233111739156, "epoch": 2.419629327427439, "grad_norm": 1.421875, "learning_rate": 4.998879286145863e-05, "loss": 0.5701, "mean_token_accuracy": 0.8866388320922851, "num_tokens": 7552376.0, "step": 10380 }, { "entropy": 0.24429988935589791, "epoch": 2.4207949644480706, "grad_norm": 4.125, "learning_rate": 4.998872814629485e-05, "loss": 0.4168, "mean_token_accuracy": 0.9179956972599029, "num_tokens": 7576024.0, "step": 10385 }, { "entropy": 0.2302600122988224, "epoch": 2.4219606014687027, "grad_norm": 1.4375, "learning_rate": 4.998866324490534e-05, "loss": 0.1357, "mean_token_accuracy": 0.9121902585029602, "num_tokens": 7622278.0, "step": 10390 }, { "entropy": 0.2962619811296463, "epoch": 2.4231262384893344, "grad_norm": 6.5625, "learning_rate": 4.9988598157291076e-05, "loss": 0.5138, "mean_token_accuracy": 0.8958117604255676, "num_tokens": 7644126.0, "step": 10395 }, { "entropy": 0.2693315804004669, "epoch": 2.424291875509966, "grad_norm": 0.69140625, "learning_rate": 4.998853288345303e-05, "loss": 0.4465, "mean_token_accuracy": 0.9200535476207733, "num_tokens": 7666542.0, "step": 10400 }, { "entropy": 0.25582237504422667, "epoch": 2.425457512530598, "grad_norm": 6.4375, "learning_rate": 4.9988467423392184e-05, "loss": 0.3563, "mean_token_accuracy": 0.9283755540847778, "num_tokens": 7687016.0, "step": 10405 }, { "entropy": 0.32354746013879776, "epoch": 2.42662314955123, "grad_norm": 5.0, "learning_rate": 4.99884017771095e-05, "loss": 0.658, "mean_token_accuracy": 0.870765084028244, "num_tokens": 7710754.0, "step": 10410 }, { "entropy": 0.28638599887490274, "epoch": 2.4277887865718615, "grad_norm": 5.59375, "learning_rate": 4.998833594460596e-05, "loss": 0.4625, "mean_token_accuracy": 0.9126111924648285, "num_tokens": 7722276.0, "step": 10415 }, { "entropy": 0.27470932640135287, "epoch": 2.428954423592493, "grad_norm": 0.8984375, "learning_rate": 4.9988269925882545e-05, "loss": 0.2832, "mean_token_accuracy": 0.9213086009025574, "num_tokens": 7746462.0, "step": 10420 }, { "entropy": 0.2574032604694366, "epoch": 2.4301200606131252, "grad_norm": 3.703125, "learning_rate": 4.998820372094024e-05, "loss": 0.5251, "mean_token_accuracy": 0.906989449262619, "num_tokens": 7766979.0, "step": 10425 }, { "entropy": 0.29713969230651854, "epoch": 2.431285697633757, "grad_norm": 4.96875, "learning_rate": 4.9988137329780045e-05, "loss": 0.4792, "mean_token_accuracy": 0.9020000696182251, "num_tokens": 7788246.0, "step": 10430 }, { "entropy": 0.29970705658197405, "epoch": 2.4324513346543886, "grad_norm": 6.25, "learning_rate": 4.998807075240293e-05, "loss": 0.5492, "mean_token_accuracy": 0.8868921577930451, "num_tokens": 7806839.0, "step": 10435 }, { "entropy": 0.24076540470123292, "epoch": 2.4336169716750202, "grad_norm": 1.2421875, "learning_rate": 4.99880039888099e-05, "loss": 0.2813, "mean_token_accuracy": 0.9311257004737854, "num_tokens": 7834109.0, "step": 10440 }, { "entropy": 0.36454578340053556, "epoch": 2.4347826086956523, "grad_norm": 4.25, "learning_rate": 4.998793703900195e-05, "loss": 0.7615, "mean_token_accuracy": 0.8765661299228669, "num_tokens": 7842120.0, "step": 10445 }, { "entropy": 0.27470249533653257, "epoch": 2.435948245716284, "grad_norm": 5.0625, "learning_rate": 4.9987869902980075e-05, "loss": 0.5855, "mean_token_accuracy": 0.8889828503131867, "num_tokens": 7853213.0, "step": 10450 }, { "entropy": 0.2980054959654808, "epoch": 2.4371138827369156, "grad_norm": 6.375, "learning_rate": 4.998780258074527e-05, "loss": 0.5448, "mean_token_accuracy": 0.8802853941917419, "num_tokens": 7871802.0, "step": 10455 }, { "entropy": 0.21270959489047528, "epoch": 2.4382795197575473, "grad_norm": 2.921875, "learning_rate": 4.998773507229855e-05, "loss": 0.3678, "mean_token_accuracy": 0.9348537504673005, "num_tokens": 7892438.0, "step": 10460 }, { "entropy": 0.21355258971452712, "epoch": 2.4394451567781794, "grad_norm": 2.875, "learning_rate": 4.9987667377640924e-05, "loss": 0.3796, "mean_token_accuracy": 0.9237074673175811, "num_tokens": 7931868.0, "step": 10465 }, { "entropy": 0.24989582411944866, "epoch": 2.440610793798811, "grad_norm": 6.84375, "learning_rate": 4.9987599496773385e-05, "loss": 0.473, "mean_token_accuracy": 0.9103607594966888, "num_tokens": 7953162.0, "step": 10470 }, { "entropy": 0.2442274335771799, "epoch": 2.4417764308194427, "grad_norm": 2.59375, "learning_rate": 4.998753142969696e-05, "loss": 0.5058, "mean_token_accuracy": 0.914693397283554, "num_tokens": 7983835.0, "step": 10475 }, { "entropy": 0.23022452220320702, "epoch": 2.4429420678400744, "grad_norm": 1.0390625, "learning_rate": 4.9987463176412664e-05, "loss": 0.2837, "mean_token_accuracy": 0.9182737469673157, "num_tokens": 8009550.0, "step": 10480 }, { "entropy": 0.3430052354931831, "epoch": 2.4441077048607065, "grad_norm": 2.859375, "learning_rate": 4.99873947369215e-05, "loss": 0.6588, "mean_token_accuracy": 0.8753048777580261, "num_tokens": 8021958.0, "step": 10485 }, { "entropy": 0.29453707188367845, "epoch": 2.445273341881338, "grad_norm": 4.4375, "learning_rate": 4.9987326111224506e-05, "loss": 0.4625, "mean_token_accuracy": 0.9004810392856598, "num_tokens": 8042746.0, "step": 10490 }, { "entropy": 0.2460491180419922, "epoch": 2.44643897890197, "grad_norm": 0.99609375, "learning_rate": 4.998725729932269e-05, "loss": 0.3338, "mean_token_accuracy": 0.9131915867328644, "num_tokens": 8061108.0, "step": 10495 }, { "entropy": 0.36034695208072665, "epoch": 2.447604615922602, "grad_norm": 1.578125, "learning_rate": 4.9987188301217095e-05, "loss": 0.6873, "mean_token_accuracy": 0.8622980952262879, "num_tokens": 8081809.0, "step": 10500 }, { "entropy": 0.30376859456300737, "epoch": 2.4487702529432336, "grad_norm": 3.359375, "learning_rate": 4.9987119116908734e-05, "loss": 0.4464, "mean_token_accuracy": 0.9128959953784943, "num_tokens": 8121401.0, "step": 10505 }, { "entropy": 0.2379370667040348, "epoch": 2.4499358899638652, "grad_norm": 4.125, "learning_rate": 4.9987049746398645e-05, "loss": 0.4646, "mean_token_accuracy": 0.9152859389781952, "num_tokens": 8145718.0, "step": 10510 }, { "entropy": 0.2736461482942104, "epoch": 2.451101526984497, "grad_norm": 4.78125, "learning_rate": 4.9986980189687865e-05, "loss": 0.3996, "mean_token_accuracy": 0.9089068353176117, "num_tokens": 8173641.0, "step": 10515 }, { "entropy": 0.24744717925786971, "epoch": 2.4522671640051286, "grad_norm": 1.234375, "learning_rate": 4.998691044677743e-05, "loss": 0.3728, "mean_token_accuracy": 0.9204553902149201, "num_tokens": 8200222.0, "step": 10520 }, { "entropy": 0.2574811242520809, "epoch": 2.4534328010257607, "grad_norm": 4.21875, "learning_rate": 4.998684051766838e-05, "loss": 0.5571, "mean_token_accuracy": 0.9103271067142487, "num_tokens": 8214116.0, "step": 10525 }, { "entropy": 0.31053818836808206, "epoch": 2.4545984380463923, "grad_norm": 4.0, "learning_rate": 4.998677040236175e-05, "loss": 0.6819, "mean_token_accuracy": 0.8785336196422577, "num_tokens": 8224261.0, "step": 10530 }, { "entropy": 0.35476598888635635, "epoch": 2.455764075067024, "grad_norm": 6.1875, "learning_rate": 4.99867001008586e-05, "loss": 0.6586, "mean_token_accuracy": 0.8944872498512269, "num_tokens": 8232938.0, "step": 10535 }, { "entropy": 0.2782547645270824, "epoch": 2.456929712087656, "grad_norm": 4.09375, "learning_rate": 4.998662961315996e-05, "loss": 0.5864, "mean_token_accuracy": 0.8916796863079071, "num_tokens": 8243533.0, "step": 10540 }, { "entropy": 0.34424693398177625, "epoch": 2.4580953491082878, "grad_norm": 64.5, "learning_rate": 4.9986558939266906e-05, "loss": 0.2403, "mean_token_accuracy": 0.9354278743267059, "num_tokens": 8285890.0, "step": 10545 }, { "entropy": 0.3282710451632738, "epoch": 2.4592609861289194, "grad_norm": 3.875, "learning_rate": 4.9986488079180464e-05, "loss": 0.441, "mean_token_accuracy": 0.8940746068954468, "num_tokens": 8310274.0, "step": 10550 }, { "entropy": 0.2680108778178692, "epoch": 2.460426623149551, "grad_norm": 3.671875, "learning_rate": 4.9986417032901714e-05, "loss": 0.5245, "mean_token_accuracy": 0.8968818008899688, "num_tokens": 8336585.0, "step": 10555 }, { "entropy": 0.2213997296988964, "epoch": 2.461592260170183, "grad_norm": 4.03125, "learning_rate": 4.99863458004317e-05, "loss": 0.3793, "mean_token_accuracy": 0.911593621969223, "num_tokens": 8369433.0, "step": 10560 }, { "entropy": 0.2799895711243153, "epoch": 2.462757897190815, "grad_norm": 1.8125, "learning_rate": 4.998627438177149e-05, "loss": 0.4496, "mean_token_accuracy": 0.897063159942627, "num_tokens": 8390854.0, "step": 10565 }, { "entropy": 0.3811504438519478, "epoch": 2.4639235342114465, "grad_norm": 7.8125, "learning_rate": 4.998620277692215e-05, "loss": 0.5295, "mean_token_accuracy": 0.8757772445678711, "num_tokens": 8417799.0, "step": 10570 }, { "entropy": 0.27391971051692965, "epoch": 2.465089171232078, "grad_norm": 2.734375, "learning_rate": 4.998613098588475e-05, "loss": 0.4542, "mean_token_accuracy": 0.9075638949871063, "num_tokens": 8432407.0, "step": 10575 }, { "entropy": 0.31272627636790273, "epoch": 2.4662548082527103, "grad_norm": 0.76171875, "learning_rate": 4.998605900866035e-05, "loss": 0.4501, "mean_token_accuracy": 0.8941393792629242, "num_tokens": 8454557.0, "step": 10580 }, { "entropy": 0.2570257782936096, "epoch": 2.467420445273342, "grad_norm": 4.78125, "learning_rate": 4.998598684525003e-05, "loss": 0.5738, "mean_token_accuracy": 0.9066219568252564, "num_tokens": 8467066.0, "step": 10585 }, { "entropy": 0.22484007887542248, "epoch": 2.4685860822939736, "grad_norm": 2.4375, "learning_rate": 4.9985914495654865e-05, "loss": 0.2423, "mean_token_accuracy": 0.9273537278175354, "num_tokens": 8492449.0, "step": 10590 }, { "entropy": 0.2611137468367815, "epoch": 2.4697517193146052, "grad_norm": 0.6015625, "learning_rate": 4.998584195987595e-05, "loss": 0.3819, "mean_token_accuracy": 0.9144336521625519, "num_tokens": 8520080.0, "step": 10595 }, { "entropy": 0.346079520508647, "epoch": 2.4709173563352373, "grad_norm": 0.85546875, "learning_rate": 4.998576923791433e-05, "loss": 0.5432, "mean_token_accuracy": 0.8557630747556686, "num_tokens": 8545422.0, "step": 10600 }, { "entropy": 0.3471344619989395, "epoch": 2.472082993355869, "grad_norm": 1.953125, "learning_rate": 4.998569632977112e-05, "loss": 0.7202, "mean_token_accuracy": 0.8664121389389038, "num_tokens": 8558882.0, "step": 10605 }, { "entropy": 0.2631093353033066, "epoch": 2.4732486303765007, "grad_norm": 4.46875, "learning_rate": 4.9985623235447405e-05, "loss": 0.6804, "mean_token_accuracy": 0.8926944673061371, "num_tokens": 8570144.0, "step": 10610 }, { "entropy": 0.23091779723763467, "epoch": 2.4744142673971323, "grad_norm": 1.390625, "learning_rate": 4.998554995494426e-05, "loss": 0.3858, "mean_token_accuracy": 0.9179224729537964, "num_tokens": 8605575.0, "step": 10615 }, { "entropy": 0.2587498303502798, "epoch": 2.4755799044177644, "grad_norm": 1.328125, "learning_rate": 4.998547648826279e-05, "loss": 0.5111, "mean_token_accuracy": 0.9113299548625946, "num_tokens": 8634576.0, "step": 10620 }, { "entropy": 0.3614876292645931, "epoch": 2.476745541438396, "grad_norm": 2.234375, "learning_rate": 4.998540283540408e-05, "loss": 0.4811, "mean_token_accuracy": 0.892818284034729, "num_tokens": 8653804.0, "step": 10625 }, { "entropy": 0.30118090808391573, "epoch": 2.4779111784590278, "grad_norm": 0.458984375, "learning_rate": 4.998532899636925e-05, "loss": 0.523, "mean_token_accuracy": 0.8992532551288605, "num_tokens": 8681806.0, "step": 10630 }, { "entropy": 0.29017008394002913, "epoch": 2.47907681547966, "grad_norm": 2.03125, "learning_rate": 4.998525497115937e-05, "loss": 0.5133, "mean_token_accuracy": 0.8909639179706573, "num_tokens": 8694925.0, "step": 10635 }, { "entropy": 0.23841165266931058, "epoch": 2.4802424525002915, "grad_norm": 3.46875, "learning_rate": 4.9985180759775566e-05, "loss": 0.3843, "mean_token_accuracy": 0.9136591196060181, "num_tokens": 8714809.0, "step": 10640 }, { "entropy": 0.2732238098978996, "epoch": 2.481408089520923, "grad_norm": 5.09375, "learning_rate": 4.9985106362218935e-05, "loss": 0.3983, "mean_token_accuracy": 0.924947077035904, "num_tokens": 8738299.0, "step": 10645 }, { "entropy": 0.3165955483913422, "epoch": 2.482573726541555, "grad_norm": 4.25, "learning_rate": 4.998503177849059e-05, "loss": 0.5408, "mean_token_accuracy": 0.9061666548252105, "num_tokens": 8747927.0, "step": 10650 }, { "entropy": 0.2890742287039757, "epoch": 2.4837393635621865, "grad_norm": 2.953125, "learning_rate": 4.9984957008591644e-05, "loss": 0.4619, "mean_token_accuracy": 0.9118583619594574, "num_tokens": 8770724.0, "step": 10655 }, { "entropy": 0.2947452884167433, "epoch": 2.4849050005828186, "grad_norm": 4.1875, "learning_rate": 4.9984882052523206e-05, "loss": 0.6179, "mean_token_accuracy": 0.8985697567462921, "num_tokens": 8786494.0, "step": 10660 }, { "entropy": 0.22197375893592836, "epoch": 2.4860706376034503, "grad_norm": 0.96484375, "learning_rate": 4.9984806910286406e-05, "loss": 0.4445, "mean_token_accuracy": 0.9120029747486115, "num_tokens": 8803727.0, "step": 10665 }, { "entropy": 0.26896577328443527, "epoch": 2.487236274624082, "grad_norm": 5.0, "learning_rate": 4.9984731581882355e-05, "loss": 0.5693, "mean_token_accuracy": 0.8987299025058746, "num_tokens": 8814973.0, "step": 10670 }, { "entropy": 0.23743227310478687, "epoch": 2.488401911644714, "grad_norm": 2.96875, "learning_rate": 4.998465606731217e-05, "loss": 0.4218, "mean_token_accuracy": 0.9243048131465912, "num_tokens": 8831458.0, "step": 10675 }, { "entropy": 0.21038546413183212, "epoch": 2.4895675486653457, "grad_norm": 3.71875, "learning_rate": 4.9984580366576996e-05, "loss": 0.454, "mean_token_accuracy": 0.922557246685028, "num_tokens": 8847184.0, "step": 10680 }, { "entropy": 0.23371059447526932, "epoch": 2.4907331856859773, "grad_norm": 4.90625, "learning_rate": 4.998450447967794e-05, "loss": 0.392, "mean_token_accuracy": 0.8991166591644287, "num_tokens": 8866368.0, "step": 10685 }, { "entropy": 0.2095944918692112, "epoch": 2.491898822706609, "grad_norm": 6.3125, "learning_rate": 4.998442840661616e-05, "loss": 0.5321, "mean_token_accuracy": 0.9088472187519073, "num_tokens": 8891458.0, "step": 10690 }, { "entropy": 0.275274308398366, "epoch": 2.493064459727241, "grad_norm": 1.4296875, "learning_rate": 4.998435214739276e-05, "loss": 0.3744, "mean_token_accuracy": 0.9091500878334046, "num_tokens": 8917181.0, "step": 10695 }, { "entropy": 0.19943911097943784, "epoch": 2.4942300967478728, "grad_norm": 4.15625, "learning_rate": 4.99842757020089e-05, "loss": 0.3194, "mean_token_accuracy": 0.9300190031528472, "num_tokens": 8948477.0, "step": 10700 }, { "entropy": 0.22412441447377204, "epoch": 2.4953957337685044, "grad_norm": 2.46875, "learning_rate": 4.9984199070465707e-05, "loss": 0.3621, "mean_token_accuracy": 0.9259128868579865, "num_tokens": 8970395.0, "step": 10705 }, { "entropy": 0.2479204297065735, "epoch": 2.496561370789136, "grad_norm": 0.62109375, "learning_rate": 4.998412225276433e-05, "loss": 0.3383, "mean_token_accuracy": 0.9196534276008606, "num_tokens": 8995002.0, "step": 10710 }, { "entropy": 0.1580579474568367, "epoch": 2.497727007809768, "grad_norm": 1.3125, "learning_rate": 4.998404524890592e-05, "loss": 0.144, "mean_token_accuracy": 0.954412579536438, "num_tokens": 9034707.0, "step": 10715 }, { "entropy": 0.24564942046999932, "epoch": 2.4988926448304, "grad_norm": 0.625, "learning_rate": 4.998396805889161e-05, "loss": 0.4918, "mean_token_accuracy": 0.9128167867660523, "num_tokens": 9055018.0, "step": 10720 }, { "entropy": 0.26528125405311587, "epoch": 2.5000582818510315, "grad_norm": 4.28125, "learning_rate": 4.998389068272256e-05, "loss": 0.4535, "mean_token_accuracy": 0.9169175028800964, "num_tokens": 9069174.0, "step": 10725 }, { "entropy": 0.2899410419166088, "epoch": 2.5012239188716636, "grad_norm": 1.6171875, "learning_rate": 4.998381312039992e-05, "loss": 0.5064, "mean_token_accuracy": 0.9001691699028015, "num_tokens": 9087041.0, "step": 10730 }, { "entropy": 0.19549113065004348, "epoch": 2.5023895558922953, "grad_norm": 2.1875, "learning_rate": 4.998373537192486e-05, "loss": 0.2986, "mean_token_accuracy": 0.9320485651493072, "num_tokens": 9104595.0, "step": 10735 }, { "entropy": 0.29923654198646543, "epoch": 2.503555192912927, "grad_norm": 2.078125, "learning_rate": 4.998365743729852e-05, "loss": 0.464, "mean_token_accuracy": 0.917408549785614, "num_tokens": 9116161.0, "step": 10740 }, { "entropy": 0.2908829629421234, "epoch": 2.5047208299335586, "grad_norm": 3.4375, "learning_rate": 4.998357931652208e-05, "loss": 0.5, "mean_token_accuracy": 0.9038378894329071, "num_tokens": 9133475.0, "step": 10745 }, { "entropy": 0.325673321634531, "epoch": 2.5058864669541903, "grad_norm": 3.6875, "learning_rate": 4.998350100959669e-05, "loss": 0.5424, "mean_token_accuracy": 0.8966376125812531, "num_tokens": 9146126.0, "step": 10750 }, { "entropy": 0.24616522938013077, "epoch": 2.5070521039748224, "grad_norm": 1.5625, "learning_rate": 4.9983422516523524e-05, "loss": 0.2964, "mean_token_accuracy": 0.9170722723007202, "num_tokens": 9177231.0, "step": 10755 }, { "entropy": 0.30409452468156817, "epoch": 2.508217740995454, "grad_norm": 0.46484375, "learning_rate": 4.9983343837303755e-05, "loss": 0.4707, "mean_token_accuracy": 0.8934083580970764, "num_tokens": 9201339.0, "step": 10760 }, { "entropy": 0.3546369731426239, "epoch": 2.5093833780160857, "grad_norm": 2.796875, "learning_rate": 4.998326497193855e-05, "loss": 0.7408, "mean_token_accuracy": 0.8691607177257538, "num_tokens": 9220413.0, "step": 10765 }, { "entropy": 0.3403468161821365, "epoch": 2.510549015036718, "grad_norm": 4.0625, "learning_rate": 4.998318592042909e-05, "loss": 0.7227, "mean_token_accuracy": 0.8796669840812683, "num_tokens": 9229228.0, "step": 10770 }, { "entropy": 0.19227586612105368, "epoch": 2.5117146520573495, "grad_norm": 0.58984375, "learning_rate": 4.998310668277655e-05, "loss": 0.2376, "mean_token_accuracy": 0.9470361351966858, "num_tokens": 9262657.0, "step": 10775 }, { "entropy": 0.2915323942899704, "epoch": 2.512880289077981, "grad_norm": 5.125, "learning_rate": 4.998302725898211e-05, "loss": 0.6645, "mean_token_accuracy": 0.884123957157135, "num_tokens": 9276618.0, "step": 10780 }, { "entropy": 0.2815995916724205, "epoch": 2.5140459260986128, "grad_norm": 4.5, "learning_rate": 4.9982947649046965e-05, "loss": 0.3543, "mean_token_accuracy": 0.9117548644542695, "num_tokens": 9297003.0, "step": 10785 }, { "entropy": 0.270504492521286, "epoch": 2.5152115631192444, "grad_norm": 6.03125, "learning_rate": 4.998286785297229e-05, "loss": 0.47, "mean_token_accuracy": 0.909794807434082, "num_tokens": 9316305.0, "step": 10790 }, { "entropy": 0.260904598236084, "epoch": 2.5163772001398765, "grad_norm": 3.1875, "learning_rate": 4.9982787870759285e-05, "loss": 0.6683, "mean_token_accuracy": 0.891952395439148, "num_tokens": 9327397.0, "step": 10795 }, { "entropy": 0.3234328027814627, "epoch": 2.517542837160508, "grad_norm": 3.515625, "learning_rate": 4.998270770240914e-05, "loss": 0.4856, "mean_token_accuracy": 0.8988953709602356, "num_tokens": 9362296.0, "step": 10800 }, { "entropy": 0.3260183773934841, "epoch": 2.51870847418114, "grad_norm": 0.6328125, "learning_rate": 4.998262734792304e-05, "loss": 0.5501, "mean_token_accuracy": 0.8724212884902954, "num_tokens": 9386671.0, "step": 10805 }, { "entropy": 0.23665817752480506, "epoch": 2.519874111201772, "grad_norm": 2.40625, "learning_rate": 4.99825468073022e-05, "loss": 0.2604, "mean_token_accuracy": 0.9227067410945893, "num_tokens": 9406591.0, "step": 10810 }, { "entropy": 0.3936871213838458, "epoch": 2.5210397482224036, "grad_norm": 1.4921875, "learning_rate": 4.998246608054781e-05, "loss": 0.7339, "mean_token_accuracy": 0.8664057910442352, "num_tokens": 9436139.0, "step": 10815 }, { "entropy": 0.35211950838565825, "epoch": 2.5222053852430353, "grad_norm": 2.90625, "learning_rate": 4.9982385167661075e-05, "loss": 0.6026, "mean_token_accuracy": 0.894438910484314, "num_tokens": 9448411.0, "step": 10820 }, { "entropy": 0.2504681311547756, "epoch": 2.523371022263667, "grad_norm": 2.09375, "learning_rate": 4.998230406864319e-05, "loss": 0.4038, "mean_token_accuracy": 0.9226616859436035, "num_tokens": 9472537.0, "step": 10825 }, { "entropy": 0.3123652219772339, "epoch": 2.5245366592842986, "grad_norm": 0.75390625, "learning_rate": 4.998222278349539e-05, "loss": 0.4818, "mean_token_accuracy": 0.8915808320045471, "num_tokens": 9500630.0, "step": 10830 }, { "entropy": 0.30130189210176467, "epoch": 2.5257022963049307, "grad_norm": 5.5, "learning_rate": 4.9982141312218875e-05, "loss": 0.5678, "mean_token_accuracy": 0.897449004650116, "num_tokens": 9514002.0, "step": 10835 }, { "entropy": 0.25896636620163915, "epoch": 2.5268679333255624, "grad_norm": 1.9765625, "learning_rate": 4.998205965481486e-05, "loss": 0.4619, "mean_token_accuracy": 0.902980488538742, "num_tokens": 9529717.0, "step": 10840 }, { "entropy": 0.40680873990058897, "epoch": 2.528033570346194, "grad_norm": 2.59375, "learning_rate": 4.998197781128455e-05, "loss": 0.649, "mean_token_accuracy": 0.8700441300868988, "num_tokens": 9542535.0, "step": 10845 }, { "entropy": 0.26954654008150103, "epoch": 2.529199207366826, "grad_norm": 5.71875, "learning_rate": 4.9981895781629186e-05, "loss": 0.5731, "mean_token_accuracy": 0.8942017436027527, "num_tokens": 9553660.0, "step": 10850 }, { "entropy": 0.6126003712415695, "epoch": 2.530364844387458, "grad_norm": 2.078125, "learning_rate": 4.9981813565849985e-05, "loss": 0.8439, "mean_token_accuracy": 0.8094934105873108, "num_tokens": 9583350.0, "step": 10855 }, { "entropy": 0.256740565598011, "epoch": 2.5315304814080895, "grad_norm": 7.625, "learning_rate": 4.998173116394816e-05, "loss": 0.5263, "mean_token_accuracy": 0.8800818026065826, "num_tokens": 9603578.0, "step": 10860 }, { "entropy": 0.24329185411334037, "epoch": 2.5326961184287216, "grad_norm": 2.25, "learning_rate": 4.9981648575924956e-05, "loss": 0.4655, "mean_token_accuracy": 0.9080992102622986, "num_tokens": 9615426.0, "step": 10865 }, { "entropy": 0.18521431982517242, "epoch": 2.533861755449353, "grad_norm": 2.4375, "learning_rate": 4.99815658017816e-05, "loss": 0.2018, "mean_token_accuracy": 0.9450451016426087, "num_tokens": 9643789.0, "step": 10870 }, { "entropy": 0.25476645715534685, "epoch": 2.535027392469985, "grad_norm": 1.8359375, "learning_rate": 4.9981482841519325e-05, "loss": 0.2879, "mean_token_accuracy": 0.9163881063461303, "num_tokens": 9669216.0, "step": 10875 }, { "entropy": 0.206333290040493, "epoch": 2.5361930294906165, "grad_norm": 5.6875, "learning_rate": 4.998139969513936e-05, "loss": 0.2351, "mean_token_accuracy": 0.9330012440681458, "num_tokens": 9701874.0, "step": 10880 }, { "entropy": 0.3087175332009792, "epoch": 2.537358666511248, "grad_norm": 0.416015625, "learning_rate": 4.998131636264296e-05, "loss": 0.6032, "mean_token_accuracy": 0.8812095761299134, "num_tokens": 9743844.0, "step": 10885 }, { "entropy": 0.30272372961044314, "epoch": 2.5385243035318803, "grad_norm": 4.90625, "learning_rate": 4.9981232844031357e-05, "loss": 0.6009, "mean_token_accuracy": 0.8849609076976777, "num_tokens": 9762894.0, "step": 10890 }, { "entropy": 0.18856640644371508, "epoch": 2.539689940552512, "grad_norm": 4.15625, "learning_rate": 4.998114913930579e-05, "loss": 0.3114, "mean_token_accuracy": 0.9368799209594727, "num_tokens": 9781940.0, "step": 10895 }, { "entropy": 0.2122984491288662, "epoch": 2.5408555775731436, "grad_norm": 0.91015625, "learning_rate": 4.998106524846753e-05, "loss": 0.293, "mean_token_accuracy": 0.9302926242351532, "num_tokens": 9826919.0, "step": 10900 }, { "entropy": 0.3222149532288313, "epoch": 2.5420212145937757, "grad_norm": 1.0, "learning_rate": 4.998098117151782e-05, "loss": 0.4287, "mean_token_accuracy": 0.900217491388321, "num_tokens": 9844075.0, "step": 10905 }, { "entropy": 0.23695617094635962, "epoch": 2.5431868516144074, "grad_norm": 0.412109375, "learning_rate": 4.998089690845789e-05, "loss": 0.3804, "mean_token_accuracy": 0.9084706485271454, "num_tokens": 9869917.0, "step": 10910 }, { "entropy": 0.28979590311646464, "epoch": 2.544352488635039, "grad_norm": 9.875, "learning_rate": 4.998081245928903e-05, "loss": 0.5758, "mean_token_accuracy": 0.8905119478702546, "num_tokens": 9885346.0, "step": 10915 }, { "entropy": 0.2431908842176199, "epoch": 2.5455181256556707, "grad_norm": 1.390625, "learning_rate": 4.998072782401248e-05, "loss": 0.3191, "mean_token_accuracy": 0.8980298042297363, "num_tokens": 9912476.0, "step": 10920 }, { "entropy": 0.23238140121102333, "epoch": 2.5466837626763024, "grad_norm": 3.671875, "learning_rate": 4.99806430026295e-05, "loss": 0.4056, "mean_token_accuracy": 0.909502238035202, "num_tokens": 9927690.0, "step": 10925 }, { "entropy": 0.23011131063103676, "epoch": 2.5478493996969345, "grad_norm": 0.94140625, "learning_rate": 4.9980557995141364e-05, "loss": 0.569, "mean_token_accuracy": 0.887377279996872, "num_tokens": 9945641.0, "step": 10930 }, { "entropy": 0.27550188899040223, "epoch": 2.549015036717566, "grad_norm": 3.078125, "learning_rate": 4.998047280154934e-05, "loss": 0.4363, "mean_token_accuracy": 0.9162426710128784, "num_tokens": 9959509.0, "step": 10935 }, { "entropy": 0.29937262982130053, "epoch": 2.550180673738198, "grad_norm": 6.25, "learning_rate": 4.998038742185469e-05, "loss": 0.5956, "mean_token_accuracy": 0.9001171708106994, "num_tokens": 9972523.0, "step": 10940 }, { "entropy": 0.20945103876292706, "epoch": 2.55134631075883, "grad_norm": 2.03125, "learning_rate": 4.998030185605869e-05, "loss": 0.2486, "mean_token_accuracy": 0.9239367246627808, "num_tokens": 10003818.0, "step": 10945 }, { "entropy": 0.2474493345245719, "epoch": 2.5525119477794616, "grad_norm": 0.46875, "learning_rate": 4.9980216104162627e-05, "loss": 0.3619, "mean_token_accuracy": 0.9106339871883392, "num_tokens": 10029914.0, "step": 10950 }, { "entropy": 0.2488720454275608, "epoch": 2.553677584800093, "grad_norm": 5.3125, "learning_rate": 4.998013016616776e-05, "loss": 0.5409, "mean_token_accuracy": 0.8993008673191071, "num_tokens": 10041210.0, "step": 10955 }, { "entropy": 0.20905282385647297, "epoch": 2.554843221820725, "grad_norm": 2.734375, "learning_rate": 4.998004404207539e-05, "loss": 0.2344, "mean_token_accuracy": 0.9224939465522766, "num_tokens": 10065172.0, "step": 10960 }, { "entropy": 0.27664083521813154, "epoch": 2.5560088588413565, "grad_norm": 5.3125, "learning_rate": 4.997995773188679e-05, "loss": 0.443, "mean_token_accuracy": 0.8938161969184876, "num_tokens": 10083266.0, "step": 10965 }, { "entropy": 0.2526406615972519, "epoch": 2.5571744958619886, "grad_norm": 4.65625, "learning_rate": 4.997987123560325e-05, "loss": 0.5609, "mean_token_accuracy": 0.8953101098537445, "num_tokens": 10093397.0, "step": 10970 }, { "entropy": 0.2602951280772686, "epoch": 2.5583401328826203, "grad_norm": 4.75, "learning_rate": 4.997978455322605e-05, "loss": 0.5184, "mean_token_accuracy": 0.8793947219848632, "num_tokens": 10124435.0, "step": 10975 }, { "entropy": 0.2883452221751213, "epoch": 2.559505769903252, "grad_norm": 1.4140625, "learning_rate": 4.99796976847565e-05, "loss": 0.4473, "mean_token_accuracy": 0.9102245748043061, "num_tokens": 10147972.0, "step": 10980 }, { "entropy": 0.25813480094075203, "epoch": 2.560671406923884, "grad_norm": 5.1875, "learning_rate": 4.9979610630195886e-05, "loss": 0.5154, "mean_token_accuracy": 0.9058370411396026, "num_tokens": 10163425.0, "step": 10985 }, { "entropy": 0.2699072379618883, "epoch": 2.5618370439445157, "grad_norm": 0.60546875, "learning_rate": 4.9979523389545514e-05, "loss": 0.276, "mean_token_accuracy": 0.9388366758823394, "num_tokens": 10189255.0, "step": 10990 }, { "entropy": 0.19736984223127366, "epoch": 2.5630026809651474, "grad_norm": 3.109375, "learning_rate": 4.9979435962806664e-05, "loss": 0.2373, "mean_token_accuracy": 0.9463130176067353, "num_tokens": 10214635.0, "step": 10995 }, { "entropy": 0.24606726691126823, "epoch": 2.5641683179857795, "grad_norm": 5.15625, "learning_rate": 4.997934834998067e-05, "loss": 0.3809, "mean_token_accuracy": 0.9238426387310028, "num_tokens": 10234190.0, "step": 11000 }, { "entropy": 0.3558468287810683, "epoch": 2.565333955006411, "grad_norm": 5.6875, "learning_rate": 4.997926055106881e-05, "loss": 0.678, "mean_token_accuracy": 0.8645303964614868, "num_tokens": 10254717.0, "step": 11005 }, { "entropy": 0.24228348731994628, "epoch": 2.566499592027043, "grad_norm": 1.40625, "learning_rate": 4.9979172566072404e-05, "loss": 0.3428, "mean_token_accuracy": 0.9213858485221863, "num_tokens": 10280458.0, "step": 11010 }, { "entropy": 0.1987355647608638, "epoch": 2.5676652290476745, "grad_norm": 5.875, "learning_rate": 4.997908439499277e-05, "loss": 0.4407, "mean_token_accuracy": 0.9114084959030151, "num_tokens": 10302557.0, "step": 11015 }, { "entropy": 0.25376520678400993, "epoch": 2.568830866068306, "grad_norm": 2.890625, "learning_rate": 4.9978996037831215e-05, "loss": 0.572, "mean_token_accuracy": 0.9063213229179382, "num_tokens": 10312816.0, "step": 11020 }, { "entropy": 0.1864607885479927, "epoch": 2.5699965030889382, "grad_norm": 1.7578125, "learning_rate": 4.9978907494589066e-05, "loss": 0.2917, "mean_token_accuracy": 0.9306336164474487, "num_tokens": 10337096.0, "step": 11025 }, { "entropy": 0.21803455986082554, "epoch": 2.57116214010957, "grad_norm": 4.1875, "learning_rate": 4.997881876526763e-05, "loss": 0.4001, "mean_token_accuracy": 0.9231861233711243, "num_tokens": 10360434.0, "step": 11030 }, { "entropy": 0.37254651412367823, "epoch": 2.5723277771302016, "grad_norm": 0.83984375, "learning_rate": 4.997872984986825e-05, "loss": 0.5981, "mean_token_accuracy": 0.8917929172515869, "num_tokens": 10385687.0, "step": 11035 }, { "entropy": 0.2153875719755888, "epoch": 2.5734934141508337, "grad_norm": 0.58203125, "learning_rate": 4.997864074839222e-05, "loss": 0.2672, "mean_token_accuracy": 0.9379128754138947, "num_tokens": 10423671.0, "step": 11040 }, { "entropy": 0.21650544218719006, "epoch": 2.5746590511714653, "grad_norm": 3.71875, "learning_rate": 4.9978551460840895e-05, "loss": 0.3612, "mean_token_accuracy": 0.9156126797199249, "num_tokens": 10443832.0, "step": 11045 }, { "entropy": 0.24414923898875712, "epoch": 2.575824688192097, "grad_norm": 5.3125, "learning_rate": 4.99784619872156e-05, "loss": 0.4182, "mean_token_accuracy": 0.9150607287883759, "num_tokens": 10464733.0, "step": 11050 }, { "entropy": 0.2163931004703045, "epoch": 2.5769903252127286, "grad_norm": 1.8984375, "learning_rate": 4.997837232751767e-05, "loss": 0.313, "mean_token_accuracy": 0.919309800863266, "num_tokens": 10483052.0, "step": 11055 }, { "entropy": 0.22960688844323157, "epoch": 2.5781559622333603, "grad_norm": 4.0, "learning_rate": 4.997828248174844e-05, "loss": 0.3334, "mean_token_accuracy": 0.928122466802597, "num_tokens": 10497890.0, "step": 11060 }, { "entropy": 0.21383021138608455, "epoch": 2.5793215992539924, "grad_norm": 0.875, "learning_rate": 4.997819244990925e-05, "loss": 0.2845, "mean_token_accuracy": 0.929515129327774, "num_tokens": 10517123.0, "step": 11065 }, { "entropy": 0.1973729684948921, "epoch": 2.580487236274624, "grad_norm": 0.60546875, "learning_rate": 4.997810223200144e-05, "loss": 0.3806, "mean_token_accuracy": 0.9143800556659698, "num_tokens": 10536675.0, "step": 11070 }, { "entropy": 0.30555715411901474, "epoch": 2.5816528732952557, "grad_norm": 1.5859375, "learning_rate": 4.997801182802635e-05, "loss": 0.5375, "mean_token_accuracy": 0.8875338017940522, "num_tokens": 10564485.0, "step": 11075 }, { "entropy": 0.23367904797196387, "epoch": 2.582818510315888, "grad_norm": 3.34375, "learning_rate": 4.997792123798535e-05, "loss": 0.2932, "mean_token_accuracy": 0.9282478809356689, "num_tokens": 10592613.0, "step": 11080 }, { "entropy": 0.246141454577446, "epoch": 2.5839841473365195, "grad_norm": 5.0625, "learning_rate": 4.997783046187977e-05, "loss": 0.4879, "mean_token_accuracy": 0.898841404914856, "num_tokens": 10620697.0, "step": 11085 }, { "entropy": 0.3161863937973976, "epoch": 2.585149784357151, "grad_norm": 3.96875, "learning_rate": 4.997773949971097e-05, "loss": 0.5144, "mean_token_accuracy": 0.901045823097229, "num_tokens": 10630312.0, "step": 11090 }, { "entropy": 0.27218768149614336, "epoch": 2.586315421377783, "grad_norm": 0.67578125, "learning_rate": 4.997764835148031e-05, "loss": 0.3202, "mean_token_accuracy": 0.9121147453784942, "num_tokens": 10652774.0, "step": 11095 }, { "entropy": 0.31504967212677004, "epoch": 2.5874810583984145, "grad_norm": 3.21875, "learning_rate": 4.997755701718914e-05, "loss": 0.7055, "mean_token_accuracy": 0.8863767445087433, "num_tokens": 10661302.0, "step": 11100 }, { "entropy": 0.278329698368907, "epoch": 2.5886466954190466, "grad_norm": 2.8125, "learning_rate": 4.9977465496838835e-05, "loss": 0.5463, "mean_token_accuracy": 0.8992919802665711, "num_tokens": 10680681.0, "step": 11105 }, { "entropy": 0.2613543540239334, "epoch": 2.5898123324396782, "grad_norm": 0.9921875, "learning_rate": 4.997737379043074e-05, "loss": 0.507, "mean_token_accuracy": 0.9085290372371674, "num_tokens": 10693761.0, "step": 11110 }, { "entropy": 0.26452816352248193, "epoch": 2.59097796946031, "grad_norm": 3.65625, "learning_rate": 4.9977281897966246e-05, "loss": 0.4636, "mean_token_accuracy": 0.905288678407669, "num_tokens": 10706089.0, "step": 11115 }, { "entropy": 0.25550087746232747, "epoch": 2.592143606480942, "grad_norm": 1.0703125, "learning_rate": 4.997718981944671e-05, "loss": 0.3773, "mean_token_accuracy": 0.9075290858745575, "num_tokens": 10740894.0, "step": 11120 }, { "entropy": 0.26064078956842424, "epoch": 2.5933092435015737, "grad_norm": 0.6484375, "learning_rate": 4.99770975548735e-05, "loss": 0.5987, "mean_token_accuracy": 0.8941443741321564, "num_tokens": 10758815.0, "step": 11125 }, { "entropy": 0.222178865224123, "epoch": 2.5944748805222053, "grad_norm": 5.71875, "learning_rate": 4.997700510424801e-05, "loss": 0.4556, "mean_token_accuracy": 0.9190146684646606, "num_tokens": 10775070.0, "step": 11130 }, { "entropy": 0.33019725382328036, "epoch": 2.5956405175428374, "grad_norm": 3.296875, "learning_rate": 4.9976912467571605e-05, "loss": 0.5227, "mean_token_accuracy": 0.8845232129096985, "num_tokens": 10789487.0, "step": 11135 }, { "entropy": 0.3958972916007042, "epoch": 2.596806154563469, "grad_norm": 4.28125, "learning_rate": 4.997681964484566e-05, "loss": 0.8221, "mean_token_accuracy": 0.8452759385108948, "num_tokens": 10802456.0, "step": 11140 }, { "entropy": 0.2777945823967457, "epoch": 2.5979717915841007, "grad_norm": 0.625, "learning_rate": 4.997672663607157e-05, "loss": 0.3402, "mean_token_accuracy": 0.9086602687835693, "num_tokens": 10831872.0, "step": 11145 }, { "entropy": 0.3942961137741804, "epoch": 2.5991374286047324, "grad_norm": 5.5, "learning_rate": 4.997663344125072e-05, "loss": 0.6087, "mean_token_accuracy": 0.8671041011810303, "num_tokens": 10846779.0, "step": 11150 }, { "entropy": 0.3075502276420593, "epoch": 2.600303065625364, "grad_norm": 1.3359375, "learning_rate": 4.9976540060384506e-05, "loss": 0.547, "mean_token_accuracy": 0.8964202046394348, "num_tokens": 10862884.0, "step": 11155 }, { "entropy": 0.27614784575998785, "epoch": 2.601468702645996, "grad_norm": 0.6953125, "learning_rate": 4.997644649347431e-05, "loss": 0.5464, "mean_token_accuracy": 0.9018550217151642, "num_tokens": 10885264.0, "step": 11160 }, { "entropy": 0.1691011071205139, "epoch": 2.602634339666628, "grad_norm": 0.466796875, "learning_rate": 4.9976352740521536e-05, "loss": 0.2883, "mean_token_accuracy": 0.93958500623703, "num_tokens": 10913916.0, "step": 11165 }, { "entropy": 0.247661954164505, "epoch": 2.6037999766872595, "grad_norm": 3.421875, "learning_rate": 4.997625880152757e-05, "loss": 0.347, "mean_token_accuracy": 0.9147677958011627, "num_tokens": 10946733.0, "step": 11170 }, { "entropy": 0.3043744258582592, "epoch": 2.6049656137078916, "grad_norm": 3.421875, "learning_rate": 4.997616467649382e-05, "loss": 0.6684, "mean_token_accuracy": 0.8797974646091461, "num_tokens": 10958671.0, "step": 11175 }, { "entropy": 0.16336182728409768, "epoch": 2.6061312507285233, "grad_norm": 1.234375, "learning_rate": 4.99760703654217e-05, "loss": 0.1688, "mean_token_accuracy": 0.9447501301765442, "num_tokens": 10984420.0, "step": 11180 }, { "entropy": 0.2802146412432194, "epoch": 2.607296887749155, "grad_norm": 6.375, "learning_rate": 4.997597586831259e-05, "loss": 0.4913, "mean_token_accuracy": 0.8942979097366333, "num_tokens": 10998949.0, "step": 11185 }, { "entropy": 0.34349353760480883, "epoch": 2.6084625247697866, "grad_norm": 2.03125, "learning_rate": 4.9975881185167926e-05, "loss": 0.5006, "mean_token_accuracy": 0.9019432067871094, "num_tokens": 11017593.0, "step": 11190 }, { "entropy": 0.26689242795109747, "epoch": 2.6096281617904182, "grad_norm": 1.96875, "learning_rate": 4.99757863159891e-05, "loss": 0.5055, "mean_token_accuracy": 0.908399498462677, "num_tokens": 11034394.0, "step": 11195 }, { "entropy": 0.15737572349607945, "epoch": 2.6107937988110503, "grad_norm": 0.7421875, "learning_rate": 4.997569126077754e-05, "loss": 0.2025, "mean_token_accuracy": 0.9473043262958527, "num_tokens": 11080961.0, "step": 11200 }, { "entropy": 0.19748403411358595, "epoch": 2.611959435831682, "grad_norm": 0.828125, "learning_rate": 4.9975596019534666e-05, "loss": 0.2864, "mean_token_accuracy": 0.9375809073448181, "num_tokens": 11102483.0, "step": 11205 }, { "entropy": 0.30650331676006315, "epoch": 2.6131250728523137, "grad_norm": 7.21875, "learning_rate": 4.997550059226188e-05, "loss": 0.7507, "mean_token_accuracy": 0.8718282222747803, "num_tokens": 11121897.0, "step": 11210 }, { "entropy": 0.24957393128424882, "epoch": 2.6142907098729458, "grad_norm": 5.21875, "learning_rate": 4.9975404978960626e-05, "loss": 0.4278, "mean_token_accuracy": 0.902422821521759, "num_tokens": 11142371.0, "step": 11215 }, { "entropy": 0.2951029822230339, "epoch": 2.6154563468935774, "grad_norm": 4.84375, "learning_rate": 4.997530917963231e-05, "loss": 0.6542, "mean_token_accuracy": 0.8763157844543457, "num_tokens": 11151119.0, "step": 11220 }, { "entropy": 0.2446484286338091, "epoch": 2.616621983914209, "grad_norm": 7.6875, "learning_rate": 4.997521319427838e-05, "loss": 0.5281, "mean_token_accuracy": 0.9024474620819092, "num_tokens": 11173901.0, "step": 11225 }, { "entropy": 0.2768650382757187, "epoch": 2.6177876209348407, "grad_norm": 5.25, "learning_rate": 4.9975117022900256e-05, "loss": 0.543, "mean_token_accuracy": 0.9006280660629272, "num_tokens": 11192978.0, "step": 11230 }, { "entropy": 0.2325539279729128, "epoch": 2.6189532579554724, "grad_norm": 5.0, "learning_rate": 4.997502066549936e-05, "loss": 0.4164, "mean_token_accuracy": 0.9057820856571197, "num_tokens": 11209052.0, "step": 11235 }, { "entropy": 0.2782548785209656, "epoch": 2.6201188949761045, "grad_norm": 4.5625, "learning_rate": 4.9974924122077154e-05, "loss": 0.6466, "mean_token_accuracy": 0.8900192558765412, "num_tokens": 11224304.0, "step": 11240 }, { "entropy": 0.18933181073516608, "epoch": 2.621284531996736, "grad_norm": 4.25, "learning_rate": 4.9974827392635064e-05, "loss": 0.2213, "mean_token_accuracy": 0.9298280954360962, "num_tokens": 11252668.0, "step": 11245 }, { "entropy": 0.2963331826031208, "epoch": 2.622450169017368, "grad_norm": 2.125, "learning_rate": 4.997473047717454e-05, "loss": 0.6452, "mean_token_accuracy": 0.8773207426071167, "num_tokens": 11262336.0, "step": 11250 }, { "entropy": 0.3698550321161747, "epoch": 2.623615806038, "grad_norm": 2.109375, "learning_rate": 4.997463337569701e-05, "loss": 0.4824, "mean_token_accuracy": 0.8727827727794647, "num_tokens": 11283605.0, "step": 11255 }, { "entropy": 0.29769267812371253, "epoch": 2.6247814430586316, "grad_norm": 0.50390625, "learning_rate": 4.997453608820394e-05, "loss": 0.4903, "mean_token_accuracy": 0.8827204525470733, "num_tokens": 11311531.0, "step": 11260 }, { "entropy": 0.33199090249836444, "epoch": 2.6259470800792633, "grad_norm": 7.5625, "learning_rate": 4.9974438614696775e-05, "loss": 0.4359, "mean_token_accuracy": 0.9025272786617279, "num_tokens": 11335227.0, "step": 11265 }, { "entropy": 0.2078126695007086, "epoch": 2.6271127170998954, "grad_norm": 0.78125, "learning_rate": 4.997434095517697e-05, "loss": 0.3611, "mean_token_accuracy": 0.9238122761249542, "num_tokens": 11370878.0, "step": 11270 }, { "entropy": 0.2649641171097755, "epoch": 2.628278354120527, "grad_norm": 1.953125, "learning_rate": 4.9974243109645966e-05, "loss": 0.4839, "mean_token_accuracy": 0.9140565156936645, "num_tokens": 11386737.0, "step": 11275 }, { "entropy": 0.26932487562298774, "epoch": 2.6294439911411587, "grad_norm": 6.71875, "learning_rate": 4.997414507810525e-05, "loss": 0.494, "mean_token_accuracy": 0.9060956716537476, "num_tokens": 11398109.0, "step": 11280 }, { "entropy": 0.3336785912513733, "epoch": 2.6306096281617903, "grad_norm": 5.96875, "learning_rate": 4.997404686055626e-05, "loss": 0.6023, "mean_token_accuracy": 0.8886629402637481, "num_tokens": 11418540.0, "step": 11285 }, { "entropy": 0.27329583168029786, "epoch": 2.631775265182422, "grad_norm": 2.9375, "learning_rate": 4.9973948457000476e-05, "loss": 0.5387, "mean_token_accuracy": 0.895511794090271, "num_tokens": 11430073.0, "step": 11290 }, { "entropy": 0.3408983126282692, "epoch": 2.632940902203054, "grad_norm": 10.625, "learning_rate": 4.9973849867439346e-05, "loss": 0.6893, "mean_token_accuracy": 0.8785698235034942, "num_tokens": 11449043.0, "step": 11295 }, { "entropy": 0.2924422096461058, "epoch": 2.6341065392236858, "grad_norm": 7.5, "learning_rate": 4.997375109187437e-05, "loss": 0.5025, "mean_token_accuracy": 0.8872777104377747, "num_tokens": 11464992.0, "step": 11300 }, { "entropy": 0.2665913224220276, "epoch": 2.6352721762443174, "grad_norm": 6.59375, "learning_rate": 4.9973652130306994e-05, "loss": 0.7234, "mean_token_accuracy": 0.8719295203685761, "num_tokens": 11474815.0, "step": 11305 }, { "entropy": 0.288311780244112, "epoch": 2.6364378132649495, "grad_norm": 1.5234375, "learning_rate": 4.9973552982738705e-05, "loss": 0.4171, "mean_token_accuracy": 0.9102599263191223, "num_tokens": 11490700.0, "step": 11310 }, { "entropy": 0.26549833118915556, "epoch": 2.637603450285581, "grad_norm": 0.69140625, "learning_rate": 4.9973453649170974e-05, "loss": 0.4156, "mean_token_accuracy": 0.9002930223941803, "num_tokens": 11512555.0, "step": 11315 }, { "entropy": 0.2690326914191246, "epoch": 2.638769087306213, "grad_norm": 1.3828125, "learning_rate": 4.9973354129605296e-05, "loss": 0.5101, "mean_token_accuracy": 0.9137054443359375, "num_tokens": 11526262.0, "step": 11320 }, { "entropy": 0.23037639632821083, "epoch": 2.6399347243268445, "grad_norm": 2.6875, "learning_rate": 4.9973254424043144e-05, "loss": 0.4401, "mean_token_accuracy": 0.9020182192325592, "num_tokens": 11541236.0, "step": 11325 }, { "entropy": 0.18799546770751477, "epoch": 2.641100361347476, "grad_norm": 5.28125, "learning_rate": 4.997315453248601e-05, "loss": 0.2115, "mean_token_accuracy": 0.9374987721443176, "num_tokens": 11575904.0, "step": 11330 }, { "entropy": 0.21427544951438904, "epoch": 2.6422659983681083, "grad_norm": 3.390625, "learning_rate": 4.997305445493538e-05, "loss": 0.3937, "mean_token_accuracy": 0.9206536293029786, "num_tokens": 11597724.0, "step": 11335 }, { "entropy": 0.2719633191823959, "epoch": 2.64343163538874, "grad_norm": 0.51953125, "learning_rate": 4.997295419139274e-05, "loss": 0.3489, "mean_token_accuracy": 0.9026310503482818, "num_tokens": 11624071.0, "step": 11340 }, { "entropy": 0.2608350694179535, "epoch": 2.6445972724093716, "grad_norm": 1.375, "learning_rate": 4.99728537418596e-05, "loss": 0.5964, "mean_token_accuracy": 0.8827403604984283, "num_tokens": 11637682.0, "step": 11345 }, { "entropy": 0.2973989363759756, "epoch": 2.6457629094300037, "grad_norm": 5.03125, "learning_rate": 4.997275310633745e-05, "loss": 0.4141, "mean_token_accuracy": 0.9107711672782898, "num_tokens": 11661335.0, "step": 11350 }, { "entropy": 0.2372315250337124, "epoch": 2.6469285464506354, "grad_norm": 1.1875, "learning_rate": 4.997265228482779e-05, "loss": 0.2801, "mean_token_accuracy": 0.9254857778549195, "num_tokens": 11680929.0, "step": 11355 }, { "entropy": 0.2743291571736336, "epoch": 2.648094183471267, "grad_norm": 6.84375, "learning_rate": 4.997255127733212e-05, "loss": 0.6052, "mean_token_accuracy": 0.8960570693016052, "num_tokens": 11691031.0, "step": 11360 }, { "entropy": 0.18517222441732883, "epoch": 2.6492598204918987, "grad_norm": 3.0625, "learning_rate": 4.9972450083851965e-05, "loss": 0.3446, "mean_token_accuracy": 0.9325447082519531, "num_tokens": 11710257.0, "step": 11365 }, { "entropy": 0.31101489067077637, "epoch": 2.6504254575125303, "grad_norm": 1.09375, "learning_rate": 4.9972348704388805e-05, "loss": 0.5734, "mean_token_accuracy": 0.897227269411087, "num_tokens": 11720796.0, "step": 11370 }, { "entropy": 0.2728283330798149, "epoch": 2.6515910945331624, "grad_norm": 3.578125, "learning_rate": 4.997224713894417e-05, "loss": 0.638, "mean_token_accuracy": 0.8861160099506378, "num_tokens": 11738627.0, "step": 11375 }, { "entropy": 0.16300744675099849, "epoch": 2.652756731553794, "grad_norm": 4.59375, "learning_rate": 4.997214538751958e-05, "loss": 0.2844, "mean_token_accuracy": 0.9426489353179932, "num_tokens": 11785879.0, "step": 11380 }, { "entropy": 0.25653175823390484, "epoch": 2.6539223685744258, "grad_norm": 3.3125, "learning_rate": 4.997204345011653e-05, "loss": 0.3171, "mean_token_accuracy": 0.8947100043296814, "num_tokens": 11824201.0, "step": 11385 }, { "entropy": 0.34817626476287844, "epoch": 2.655088005595058, "grad_norm": 4.40625, "learning_rate": 4.997194132673656e-05, "loss": 0.7191, "mean_token_accuracy": 0.8866174995899201, "num_tokens": 11832982.0, "step": 11390 }, { "entropy": 0.28169769085943697, "epoch": 2.6562536426156895, "grad_norm": 0.71875, "learning_rate": 4.9971839017381185e-05, "loss": 0.3534, "mean_token_accuracy": 0.9082461953163147, "num_tokens": 11854447.0, "step": 11395 }, { "entropy": 0.2898616187274456, "epoch": 2.657419279636321, "grad_norm": 1.3203125, "learning_rate": 4.997173652205193e-05, "loss": 0.3934, "mean_token_accuracy": 0.9070635378360749, "num_tokens": 11877699.0, "step": 11400 }, { "entropy": 0.3388654485344887, "epoch": 2.6585849166569533, "grad_norm": 5.59375, "learning_rate": 4.997163384075033e-05, "loss": 0.7418, "mean_token_accuracy": 0.8552682518959045, "num_tokens": 11889240.0, "step": 11405 }, { "entropy": 0.3616279847919941, "epoch": 2.659750553677585, "grad_norm": 7.875, "learning_rate": 4.997153097347791e-05, "loss": 0.5852, "mean_token_accuracy": 0.8689880043268203, "num_tokens": 11919570.0, "step": 11410 }, { "entropy": 0.2243417389690876, "epoch": 2.6609161906982166, "grad_norm": 1.1015625, "learning_rate": 4.99714279202362e-05, "loss": 0.5047, "mean_token_accuracy": 0.9167357087135315, "num_tokens": 11938429.0, "step": 11415 }, { "entropy": 0.25835750699043275, "epoch": 2.6620818277188483, "grad_norm": 3.78125, "learning_rate": 4.997132468102674e-05, "loss": 0.4717, "mean_token_accuracy": 0.889584195613861, "num_tokens": 11950186.0, "step": 11420 }, { "entropy": 0.2650266006588936, "epoch": 2.66324746473948, "grad_norm": 2.71875, "learning_rate": 4.997122125585108e-05, "loss": 0.5369, "mean_token_accuracy": 0.8962382078170776, "num_tokens": 11969804.0, "step": 11425 }, { "entropy": 0.34565363321453335, "epoch": 2.664413101760112, "grad_norm": 0.5625, "learning_rate": 4.9971117644710745e-05, "loss": 0.5914, "mean_token_accuracy": 0.861111444234848, "num_tokens": 12004415.0, "step": 11430 }, { "entropy": 0.3819424480199814, "epoch": 2.6655787387807437, "grad_norm": 3.53125, "learning_rate": 4.99710138476073e-05, "loss": 0.7971, "mean_token_accuracy": 0.8668591558933259, "num_tokens": 12014170.0, "step": 11435 }, { "entropy": 0.22254518344998359, "epoch": 2.6667443758013754, "grad_norm": 4.75, "learning_rate": 4.997090986454227e-05, "loss": 0.4434, "mean_token_accuracy": 0.927557897567749, "num_tokens": 12029565.0, "step": 11440 }, { "entropy": 0.25394832119345667, "epoch": 2.6679100128220075, "grad_norm": 0.388671875, "learning_rate": 4.997080569551721e-05, "loss": 0.3403, "mean_token_accuracy": 0.9082087457180024, "num_tokens": 12055656.0, "step": 11445 }, { "entropy": 0.24122598469257356, "epoch": 2.669075649842639, "grad_norm": 1.09375, "learning_rate": 4.9970701340533694e-05, "loss": 0.3257, "mean_token_accuracy": 0.9278246402740479, "num_tokens": 12073906.0, "step": 11450 }, { "entropy": 0.22207538709044455, "epoch": 2.670241286863271, "grad_norm": 1.1796875, "learning_rate": 4.997059679959326e-05, "loss": 0.3995, "mean_token_accuracy": 0.9133785367012024, "num_tokens": 12086888.0, "step": 11455 }, { "entropy": 0.2285417139530182, "epoch": 2.6714069238839024, "grad_norm": 1.015625, "learning_rate": 4.997049207269747e-05, "loss": 0.378, "mean_token_accuracy": 0.9328377544879913, "num_tokens": 12112474.0, "step": 11460 }, { "entropy": 0.23440224751830102, "epoch": 2.672572560904534, "grad_norm": 2.171875, "learning_rate": 4.997038715984789e-05, "loss": 0.3692, "mean_token_accuracy": 0.920467472076416, "num_tokens": 12134446.0, "step": 11465 }, { "entropy": 0.34350576922297477, "epoch": 2.673738197925166, "grad_norm": 6.71875, "learning_rate": 4.997028206104607e-05, "loss": 0.6887, "mean_token_accuracy": 0.8498728811740875, "num_tokens": 12152362.0, "step": 11470 }, { "entropy": 0.2824136093258858, "epoch": 2.674903834945798, "grad_norm": 3.0, "learning_rate": 4.99701767762936e-05, "loss": 0.4878, "mean_token_accuracy": 0.9034132361412048, "num_tokens": 12162776.0, "step": 11475 }, { "entropy": 0.22825291007757187, "epoch": 2.6760694719664295, "grad_norm": 0.890625, "learning_rate": 4.997007130559203e-05, "loss": 0.507, "mean_token_accuracy": 0.9103431522846221, "num_tokens": 12178504.0, "step": 11480 }, { "entropy": 0.2933326005935669, "epoch": 2.6772351089870616, "grad_norm": 2.8125, "learning_rate": 4.9969965648942944e-05, "loss": 0.644, "mean_token_accuracy": 0.8840182542800903, "num_tokens": 12187928.0, "step": 11485 }, { "entropy": 0.2727994412183762, "epoch": 2.6784007460076933, "grad_norm": 0.69921875, "learning_rate": 4.996985980634792e-05, "loss": 0.4404, "mean_token_accuracy": 0.898718672990799, "num_tokens": 12208235.0, "step": 11490 }, { "entropy": 0.24456576406955718, "epoch": 2.679566383028325, "grad_norm": 4.4375, "learning_rate": 4.9969753777808524e-05, "loss": 0.358, "mean_token_accuracy": 0.9193500220775604, "num_tokens": 12233805.0, "step": 11495 }, { "entropy": 0.24613263513892888, "epoch": 2.6807320200489566, "grad_norm": 1.7265625, "learning_rate": 4.996964756332634e-05, "loss": 0.5134, "mean_token_accuracy": 0.9112427711486817, "num_tokens": 12254031.0, "step": 11500 }, { "entropy": 0.33642361164093015, "epoch": 2.6818976570695883, "grad_norm": 8.3125, "learning_rate": 4.9969541162902964e-05, "loss": 0.5481, "mean_token_accuracy": 0.8862937986850739, "num_tokens": 12272378.0, "step": 11505 }, { "entropy": 0.2044012688100338, "epoch": 2.6830632940902204, "grad_norm": 1.8671875, "learning_rate": 4.996943457653997e-05, "loss": 0.3709, "mean_token_accuracy": 0.9268832981586457, "num_tokens": 12291946.0, "step": 11510 }, { "entropy": 0.24931253343820572, "epoch": 2.684228931110852, "grad_norm": 1.40625, "learning_rate": 4.9969327804238956e-05, "loss": 0.4929, "mean_token_accuracy": 0.9052222788333892, "num_tokens": 12303818.0, "step": 11515 }, { "entropy": 0.20881590992212296, "epoch": 2.6853945681314837, "grad_norm": 2.46875, "learning_rate": 4.99692208460015e-05, "loss": 0.2622, "mean_token_accuracy": 0.9323864638805389, "num_tokens": 12324390.0, "step": 11520 }, { "entropy": 0.2012042384594679, "epoch": 2.686560205152116, "grad_norm": 0.59765625, "learning_rate": 4.996911370182921e-05, "loss": 0.1416, "mean_token_accuracy": 0.9535678446292877, "num_tokens": 12359545.0, "step": 11525 }, { "entropy": 0.3009677939116955, "epoch": 2.6877258421727475, "grad_norm": 1.171875, "learning_rate": 4.996900637172369e-05, "loss": 0.5281, "mean_token_accuracy": 0.875782185792923, "num_tokens": 12373919.0, "step": 11530 }, { "entropy": 0.22892187163233757, "epoch": 2.688891479193379, "grad_norm": 4.625, "learning_rate": 4.996889885568652e-05, "loss": 0.4858, "mean_token_accuracy": 0.8961142063140869, "num_tokens": 12389644.0, "step": 11535 }, { "entropy": 0.1797600243240595, "epoch": 2.6900571162140112, "grad_norm": 0.92578125, "learning_rate": 4.996879115371931e-05, "loss": 0.3608, "mean_token_accuracy": 0.9254206895828248, "num_tokens": 12427754.0, "step": 11540 }, { "entropy": 0.23216586112976073, "epoch": 2.691222753234643, "grad_norm": 5.15625, "learning_rate": 4.996868326582368e-05, "loss": 0.4581, "mean_token_accuracy": 0.9157405376434327, "num_tokens": 12449553.0, "step": 11545 }, { "entropy": 0.27715420797467233, "epoch": 2.6923883902552745, "grad_norm": 4.0, "learning_rate": 4.996857519200122e-05, "loss": 0.3604, "mean_token_accuracy": 0.909223598241806, "num_tokens": 12469756.0, "step": 11550 }, { "entropy": 0.2619450569152832, "epoch": 2.693554027275906, "grad_norm": 5.90625, "learning_rate": 4.9968466932253564e-05, "loss": 0.5006, "mean_token_accuracy": 0.9076269268989563, "num_tokens": 12490124.0, "step": 11555 }, { "entropy": 0.2845116063952446, "epoch": 2.694719664296538, "grad_norm": 0.90234375, "learning_rate": 4.99683584865823e-05, "loss": 0.3196, "mean_token_accuracy": 0.9181459307670593, "num_tokens": 12512201.0, "step": 11560 }, { "entropy": 0.3422266826033592, "epoch": 2.69588530131717, "grad_norm": 4.75, "learning_rate": 4.9968249854989054e-05, "loss": 0.6682, "mean_token_accuracy": 0.8535523563623428, "num_tokens": 12528970.0, "step": 11565 }, { "entropy": 0.47459706813097, "epoch": 2.6970509383378016, "grad_norm": 0.703125, "learning_rate": 4.996814103747546e-05, "loss": 0.8362, "mean_token_accuracy": 0.8407033622264862, "num_tokens": 12556540.0, "step": 11570 }, { "entropy": 0.22336104661226272, "epoch": 2.6982165753584333, "grad_norm": 1.2578125, "learning_rate": 4.996803203404313e-05, "loss": 0.305, "mean_token_accuracy": 0.9277842044830322, "num_tokens": 12579634.0, "step": 11575 }, { "entropy": 0.30175293795764446, "epoch": 2.6993822123790654, "grad_norm": 9.1875, "learning_rate": 4.996792284469368e-05, "loss": 0.4639, "mean_token_accuracy": 0.9062645494937897, "num_tokens": 12595242.0, "step": 11580 }, { "entropy": 0.22965750470757484, "epoch": 2.700547849399697, "grad_norm": 3.6875, "learning_rate": 4.996781346942875e-05, "loss": 0.4088, "mean_token_accuracy": 0.9102014899253845, "num_tokens": 12622902.0, "step": 11585 }, { "entropy": 0.3243948698043823, "epoch": 2.7017134864203287, "grad_norm": 5.40625, "learning_rate": 4.996770390824998e-05, "loss": 0.5887, "mean_token_accuracy": 0.8744899332523346, "num_tokens": 12633690.0, "step": 11590 }, { "entropy": 0.23037907853722572, "epoch": 2.7028791234409604, "grad_norm": 2.609375, "learning_rate": 4.996759416115898e-05, "loss": 0.4174, "mean_token_accuracy": 0.9255319714546204, "num_tokens": 12645802.0, "step": 11595 }, { "entropy": 0.3350222710520029, "epoch": 2.704044760461592, "grad_norm": 0.640625, "learning_rate": 4.99674842281574e-05, "loss": 0.4574, "mean_token_accuracy": 0.9005424678325653, "num_tokens": 12671417.0, "step": 11600 }, { "entropy": 0.20586702935397624, "epoch": 2.705210397482224, "grad_norm": 3.71875, "learning_rate": 4.996737410924688e-05, "loss": 0.3393, "mean_token_accuracy": 0.9213223278522491, "num_tokens": 12697041.0, "step": 11605 }, { "entropy": 0.21617397107183933, "epoch": 2.706376034502856, "grad_norm": 0.703125, "learning_rate": 4.996726380442906e-05, "loss": 0.3141, "mean_token_accuracy": 0.9104293942451477, "num_tokens": 12727529.0, "step": 11610 }, { "entropy": 0.20006494000554084, "epoch": 2.7075416715234875, "grad_norm": 2.1875, "learning_rate": 4.996715331370558e-05, "loss": 0.2495, "mean_token_accuracy": 0.9327197790145874, "num_tokens": 12756256.0, "step": 11615 }, { "entropy": 0.16451627649366857, "epoch": 2.7087073085441196, "grad_norm": 1.7109375, "learning_rate": 4.9967042637078104e-05, "loss": 0.279, "mean_token_accuracy": 0.9395867764949799, "num_tokens": 12777458.0, "step": 11620 }, { "entropy": 0.17803059592843057, "epoch": 2.7098729455647512, "grad_norm": 2.609375, "learning_rate": 4.996693177454827e-05, "loss": 0.2705, "mean_token_accuracy": 0.9386874735355377, "num_tokens": 12794989.0, "step": 11625 }, { "entropy": 0.3340757980942726, "epoch": 2.711038582585383, "grad_norm": 6.6875, "learning_rate": 4.996682072611772e-05, "loss": 0.578, "mean_token_accuracy": 0.8854405045509338, "num_tokens": 12806814.0, "step": 11630 }, { "entropy": 0.30254448503255843, "epoch": 2.7122042196060145, "grad_norm": 6.9375, "learning_rate": 4.996670949178813e-05, "loss": 0.5859, "mean_token_accuracy": 0.8814816117286682, "num_tokens": 12819725.0, "step": 11635 }, { "entropy": 0.2472503509372473, "epoch": 2.713369856626646, "grad_norm": 1.296875, "learning_rate": 4.996659807156115e-05, "loss": 0.2973, "mean_token_accuracy": 0.9119050920009613, "num_tokens": 12843042.0, "step": 11640 }, { "entropy": 0.25286680161952974, "epoch": 2.7145354936472783, "grad_norm": 4.6875, "learning_rate": 4.9966486465438437e-05, "loss": 0.4863, "mean_token_accuracy": 0.9123594641685486, "num_tokens": 12854570.0, "step": 11645 }, { "entropy": 0.13867413848638535, "epoch": 2.71570113066791, "grad_norm": 0.9921875, "learning_rate": 4.9966374673421665e-05, "loss": 0.1569, "mean_token_accuracy": 0.9393254101276398, "num_tokens": 12888729.0, "step": 11650 }, { "entropy": 0.27010712325572966, "epoch": 2.7168667676885416, "grad_norm": 2.015625, "learning_rate": 4.9966262695512494e-05, "loss": 0.4142, "mean_token_accuracy": 0.906298041343689, "num_tokens": 12914069.0, "step": 11655 }, { "entropy": 0.2274789983406663, "epoch": 2.7180324047091737, "grad_norm": 2.625, "learning_rate": 4.99661505317126e-05, "loss": 0.469, "mean_token_accuracy": 0.8987535774707794, "num_tokens": 12934396.0, "step": 11660 }, { "entropy": 0.23609692528843879, "epoch": 2.7191980417298054, "grad_norm": 1.78125, "learning_rate": 4.9966038182023646e-05, "loss": 0.4503, "mean_token_accuracy": 0.9209963023662567, "num_tokens": 12950142.0, "step": 11665 }, { "entropy": 0.21072908565402032, "epoch": 2.720363678750437, "grad_norm": 1.1484375, "learning_rate": 4.9965925646447316e-05, "loss": 0.2943, "mean_token_accuracy": 0.9385083317756653, "num_tokens": 12986129.0, "step": 11670 }, { "entropy": 0.2646354716271162, "epoch": 2.721529315771069, "grad_norm": 2.375, "learning_rate": 4.996581292498528e-05, "loss": 0.3633, "mean_token_accuracy": 0.9122869968414307, "num_tokens": 13011072.0, "step": 11675 }, { "entropy": 0.3415803790092468, "epoch": 2.722694952791701, "grad_norm": 4.375, "learning_rate": 4.9965700017639226e-05, "loss": 0.5222, "mean_token_accuracy": 0.8990362405776977, "num_tokens": 13029584.0, "step": 11680 }, { "entropy": 0.24723002538084984, "epoch": 2.7238605898123325, "grad_norm": 4.375, "learning_rate": 4.996558692441084e-05, "loss": 0.4214, "mean_token_accuracy": 0.9199587941169739, "num_tokens": 13056668.0, "step": 11685 }, { "entropy": 0.16878330241888762, "epoch": 2.725026226832964, "grad_norm": 0.45703125, "learning_rate": 4.99654736453018e-05, "loss": 0.3618, "mean_token_accuracy": 0.9300837516784668, "num_tokens": 13081977.0, "step": 11690 }, { "entropy": 0.25726300925016404, "epoch": 2.726191863853596, "grad_norm": 3.921875, "learning_rate": 4.9965360180313804e-05, "loss": 0.4976, "mean_token_accuracy": 0.9045258402824402, "num_tokens": 13092587.0, "step": 11695 }, { "entropy": 0.4012131579220295, "epoch": 2.727357500874228, "grad_norm": 4.6875, "learning_rate": 4.996524652944853e-05, "loss": 0.6169, "mean_token_accuracy": 0.8621207654476166, "num_tokens": 13115618.0, "step": 11700 }, { "entropy": 0.26291630491614343, "epoch": 2.7285231378948596, "grad_norm": 5.71875, "learning_rate": 4.9965132692707686e-05, "loss": 0.5346, "mean_token_accuracy": 0.9023758828639984, "num_tokens": 13126747.0, "step": 11705 }, { "entropy": 0.2871785953640938, "epoch": 2.7296887749154912, "grad_norm": 3.796875, "learning_rate": 4.996501867009296e-05, "loss": 0.5377, "mean_token_accuracy": 0.908946031332016, "num_tokens": 13136640.0, "step": 11710 }, { "entropy": 0.34161866158246995, "epoch": 2.7308544119361233, "grad_norm": 4.46875, "learning_rate": 4.9964904461606066e-05, "loss": 0.6964, "mean_token_accuracy": 0.8712542295455933, "num_tokens": 13144769.0, "step": 11715 }, { "entropy": 0.24447038322687148, "epoch": 2.732020048956755, "grad_norm": 9.6875, "learning_rate": 4.996479006724869e-05, "loss": 0.5576, "mean_token_accuracy": 0.9012001335620881, "num_tokens": 13165742.0, "step": 11720 }, { "entropy": 0.19883279278874397, "epoch": 2.7331856859773866, "grad_norm": 7.59375, "learning_rate": 4.996467548702255e-05, "loss": 0.4156, "mean_token_accuracy": 0.9189273893833161, "num_tokens": 13179850.0, "step": 11725 }, { "entropy": 0.1922806277871132, "epoch": 2.7343513229980183, "grad_norm": 1.5390625, "learning_rate": 4.9964560720929355e-05, "loss": 0.2315, "mean_token_accuracy": 0.9458309173583984, "num_tokens": 13207309.0, "step": 11730 }, { "entropy": 0.2514540385454893, "epoch": 2.73551696001865, "grad_norm": 0.70703125, "learning_rate": 4.99644457689708e-05, "loss": 0.3651, "mean_token_accuracy": 0.9023769974708558, "num_tokens": 13239563.0, "step": 11735 }, { "entropy": 0.19213453009724618, "epoch": 2.736682597039282, "grad_norm": 4.25, "learning_rate": 4.996433063114862e-05, "loss": 0.3256, "mean_token_accuracy": 0.9336883068084717, "num_tokens": 13262413.0, "step": 11740 }, { "entropy": 0.3445202559232712, "epoch": 2.7378482340599137, "grad_norm": 6.0625, "learning_rate": 4.996421530746452e-05, "loss": 0.5881, "mean_token_accuracy": 0.8524076044559479, "num_tokens": 13278522.0, "step": 11745 }, { "entropy": 0.17778736725449562, "epoch": 2.7390138710805454, "grad_norm": 1.421875, "learning_rate": 4.9964099797920224e-05, "loss": 0.2799, "mean_token_accuracy": 0.9420393466949463, "num_tokens": 13299599.0, "step": 11750 }, { "entropy": 0.33492958918213844, "epoch": 2.7401795081011775, "grad_norm": 1.6328125, "learning_rate": 4.9963984102517456e-05, "loss": 0.4627, "mean_token_accuracy": 0.8947258800268173, "num_tokens": 13327734.0, "step": 11755 }, { "entropy": 0.24462985396385192, "epoch": 2.741345145121809, "grad_norm": 1.8828125, "learning_rate": 4.996386822125794e-05, "loss": 0.3535, "mean_token_accuracy": 0.9099364161491394, "num_tokens": 13344731.0, "step": 11760 }, { "entropy": 0.18358724601566792, "epoch": 2.742510782142441, "grad_norm": 1.6484375, "learning_rate": 4.996375215414339e-05, "loss": 0.2308, "mean_token_accuracy": 0.9195159077644348, "num_tokens": 13379085.0, "step": 11765 }, { "entropy": 0.15232093259692192, "epoch": 2.7436764191630725, "grad_norm": 2.703125, "learning_rate": 4.996363590117556e-05, "loss": 0.2483, "mean_token_accuracy": 0.9433568239212036, "num_tokens": 13397236.0, "step": 11770 }, { "entropy": 0.3235445529222488, "epoch": 2.744842056183704, "grad_norm": 15.1875, "learning_rate": 4.996351946235616e-05, "loss": 0.7299, "mean_token_accuracy": 0.8768341243267059, "num_tokens": 13404816.0, "step": 11775 }, { "entropy": 0.27746813744306564, "epoch": 2.7460076932043362, "grad_norm": 3.53125, "learning_rate": 4.996340283768695e-05, "loss": 0.3638, "mean_token_accuracy": 0.8914911150932312, "num_tokens": 13434499.0, "step": 11780 }, { "entropy": 0.2395051196217537, "epoch": 2.747173330224968, "grad_norm": 0.609375, "learning_rate": 4.996328602716965e-05, "loss": 0.295, "mean_token_accuracy": 0.9167731881141663, "num_tokens": 13462582.0, "step": 11785 }, { "entropy": 0.20807163119316102, "epoch": 2.7483389672455996, "grad_norm": 0.7578125, "learning_rate": 4.996316903080602e-05, "loss": 0.25, "mean_token_accuracy": 0.933128297328949, "num_tokens": 13481898.0, "step": 11790 }, { "entropy": 0.27841252386569976, "epoch": 2.7495046042662317, "grad_norm": 2.09375, "learning_rate": 4.9963051848597785e-05, "loss": 0.3034, "mean_token_accuracy": 0.8982349395751953, "num_tokens": 13505194.0, "step": 11795 }, { "entropy": 0.22490386068820953, "epoch": 2.7506702412868633, "grad_norm": 3.859375, "learning_rate": 4.996293448054671e-05, "loss": 0.424, "mean_token_accuracy": 0.9229201734066009, "num_tokens": 13523744.0, "step": 11800 }, { "entropy": 0.23477228432893754, "epoch": 2.751835878307495, "grad_norm": 6.25, "learning_rate": 4.9962816926654525e-05, "loss": 0.4653, "mean_token_accuracy": 0.9090815782546997, "num_tokens": 13535843.0, "step": 11805 }, { "entropy": 0.2388323299586773, "epoch": 2.753001515328127, "grad_norm": 1.3125, "learning_rate": 4.9962699186923e-05, "loss": 0.2872, "mean_token_accuracy": 0.9178419411182404, "num_tokens": 13585893.0, "step": 11810 }, { "entropy": 0.19899034015834333, "epoch": 2.7541671523487588, "grad_norm": 2.328125, "learning_rate": 4.996258126135388e-05, "loss": 0.2889, "mean_token_accuracy": 0.9384098529815674, "num_tokens": 13608252.0, "step": 11815 }, { "entropy": 0.28082182705402375, "epoch": 2.7553327893693904, "grad_norm": 5.875, "learning_rate": 4.996246314994894e-05, "loss": 0.5502, "mean_token_accuracy": 0.8987651646137238, "num_tokens": 13618617.0, "step": 11820 }, { "entropy": 0.28569440320134165, "epoch": 2.756498426390022, "grad_norm": 3.046875, "learning_rate": 4.9962344852709926e-05, "loss": 0.6317, "mean_token_accuracy": 0.8887876272201538, "num_tokens": 13633228.0, "step": 11825 }, { "entropy": 0.21487718708813192, "epoch": 2.7576640634106537, "grad_norm": 2.28125, "learning_rate": 4.9962226369638604e-05, "loss": 0.4465, "mean_token_accuracy": 0.9066687405109406, "num_tokens": 13655881.0, "step": 11830 }, { "entropy": 0.29074698835611346, "epoch": 2.758829700431286, "grad_norm": 2.15625, "learning_rate": 4.996210770073674e-05, "loss": 0.3076, "mean_token_accuracy": 0.9369310677051544, "num_tokens": 13678361.0, "step": 11835 }, { "entropy": 0.2757992595434189, "epoch": 2.7599953374519175, "grad_norm": 2.109375, "learning_rate": 4.996198884600611e-05, "loss": 0.4553, "mean_token_accuracy": 0.9155687928199768, "num_tokens": 13689044.0, "step": 11840 }, { "entropy": 0.23771463334560394, "epoch": 2.761160974472549, "grad_norm": 4.59375, "learning_rate": 4.9961869805448483e-05, "loss": 0.28, "mean_token_accuracy": 0.9290770351886749, "num_tokens": 13702931.0, "step": 11845 }, { "entropy": 0.21810585632920265, "epoch": 2.7623266114931813, "grad_norm": 0.91796875, "learning_rate": 4.996175057906563e-05, "loss": 0.2698, "mean_token_accuracy": 0.9399433553218841, "num_tokens": 13744305.0, "step": 11850 }, { "entropy": 0.20287301018834114, "epoch": 2.763492248513813, "grad_norm": 1.015625, "learning_rate": 4.996163116685933e-05, "loss": 0.2539, "mean_token_accuracy": 0.9406480073928833, "num_tokens": 13763052.0, "step": 11855 }, { "entropy": 0.22470155581831933, "epoch": 2.7646578855344446, "grad_norm": 5.03125, "learning_rate": 4.996151156883137e-05, "loss": 0.417, "mean_token_accuracy": 0.9212198674678802, "num_tokens": 13781973.0, "step": 11860 }, { "entropy": 0.20656557697802783, "epoch": 2.7658235225550762, "grad_norm": 1.4765625, "learning_rate": 4.996139178498353e-05, "loss": 0.283, "mean_token_accuracy": 0.9304579198360443, "num_tokens": 13811417.0, "step": 11865 }, { "entropy": 0.22034696750342847, "epoch": 2.766989159575708, "grad_norm": 0.5234375, "learning_rate": 4.9961271815317594e-05, "loss": 0.3176, "mean_token_accuracy": 0.91060671210289, "num_tokens": 13835697.0, "step": 11870 }, { "entropy": 0.2568760313093662, "epoch": 2.76815479659634, "grad_norm": 5.25, "learning_rate": 4.996115165983535e-05, "loss": 0.3898, "mean_token_accuracy": 0.9200484037399292, "num_tokens": 13849117.0, "step": 11875 }, { "entropy": 0.3268521726131439, "epoch": 2.7693204336169717, "grad_norm": 5.46875, "learning_rate": 4.99610313185386e-05, "loss": 0.5835, "mean_token_accuracy": 0.8949032425880432, "num_tokens": 13857116.0, "step": 11880 }, { "entropy": 0.2411774557083845, "epoch": 2.7704860706376033, "grad_norm": 0.86328125, "learning_rate": 4.996091079142913e-05, "loss": 0.3025, "mean_token_accuracy": 0.9226352453231812, "num_tokens": 13897630.0, "step": 11885 }, { "entropy": 0.19736510664224624, "epoch": 2.7716517076582354, "grad_norm": 2.4375, "learning_rate": 4.996079007850873e-05, "loss": 0.2608, "mean_token_accuracy": 0.9343535602092743, "num_tokens": 13921097.0, "step": 11890 }, { "entropy": 0.2812109630554914, "epoch": 2.772817344678867, "grad_norm": 6.5, "learning_rate": 4.9960669179779205e-05, "loss": 0.3654, "mean_token_accuracy": 0.9012946605682373, "num_tokens": 13944408.0, "step": 11895 }, { "entropy": 0.2541120745241642, "epoch": 2.7739829816994988, "grad_norm": 2.828125, "learning_rate": 4.996054809524237e-05, "loss": 0.4858, "mean_token_accuracy": 0.9016520738601684, "num_tokens": 13959937.0, "step": 11900 }, { "entropy": 0.2620885193347931, "epoch": 2.7751486187201304, "grad_norm": 4.375, "learning_rate": 4.996042682490002e-05, "loss": 0.3841, "mean_token_accuracy": 0.910321581363678, "num_tokens": 13975779.0, "step": 11905 }, { "entropy": 0.2132838958874345, "epoch": 2.776314255740762, "grad_norm": 6.0, "learning_rate": 4.996030536875396e-05, "loss": 0.3785, "mean_token_accuracy": 0.9210842311382293, "num_tokens": 13997478.0, "step": 11910 }, { "entropy": 0.21150433290749787, "epoch": 2.777479892761394, "grad_norm": 4.5, "learning_rate": 4.996018372680601e-05, "loss": 0.3659, "mean_token_accuracy": 0.9263874173164368, "num_tokens": 14017546.0, "step": 11915 }, { "entropy": 0.24220035672187806, "epoch": 2.778645529782026, "grad_norm": 7.0625, "learning_rate": 4.996006189905798e-05, "loss": 0.3332, "mean_token_accuracy": 0.9200899660587311, "num_tokens": 14038106.0, "step": 11920 }, { "entropy": 0.24553507566452026, "epoch": 2.7798111668026575, "grad_norm": 4.78125, "learning_rate": 4.995993988551168e-05, "loss": 0.3851, "mean_token_accuracy": 0.9134992480278015, "num_tokens": 14059942.0, "step": 11925 }, { "entropy": 0.19736606925725936, "epoch": 2.7809768038232896, "grad_norm": 0.57421875, "learning_rate": 4.9959817686168945e-05, "loss": 0.3491, "mean_token_accuracy": 0.9388951897621155, "num_tokens": 14084931.0, "step": 11930 }, { "entropy": 0.36639479398727415, "epoch": 2.7821424408439213, "grad_norm": 7.1875, "learning_rate": 4.995969530103158e-05, "loss": 0.5662, "mean_token_accuracy": 0.8725821018218994, "num_tokens": 14100686.0, "step": 11935 }, { "entropy": 0.2210806304588914, "epoch": 2.783308077864553, "grad_norm": 5.09375, "learning_rate": 4.9959572730101416e-05, "loss": 0.3382, "mean_token_accuracy": 0.9269168257713318, "num_tokens": 14127188.0, "step": 11940 }, { "entropy": 0.2088917564600706, "epoch": 2.784473714885185, "grad_norm": 3.84375, "learning_rate": 4.995944997338029e-05, "loss": 0.3704, "mean_token_accuracy": 0.9320711851119995, "num_tokens": 14145313.0, "step": 11945 }, { "entropy": 0.2135708898305893, "epoch": 2.7856393519058167, "grad_norm": 1.046875, "learning_rate": 4.9959327030870016e-05, "loss": 0.3219, "mean_token_accuracy": 0.9356307864189148, "num_tokens": 14168427.0, "step": 11950 }, { "entropy": 0.20443826280534266, "epoch": 2.7868049889264483, "grad_norm": 1.7890625, "learning_rate": 4.9959203902572446e-05, "loss": 0.2976, "mean_token_accuracy": 0.9165285766124726, "num_tokens": 14188997.0, "step": 11955 }, { "entropy": 0.23184158205986022, "epoch": 2.78797062594708, "grad_norm": 2.171875, "learning_rate": 4.99590805884894e-05, "loss": 0.3338, "mean_token_accuracy": 0.923123162984848, "num_tokens": 14218749.0, "step": 11960 }, { "entropy": 0.30974789895117283, "epoch": 2.7891362629677117, "grad_norm": 0.53515625, "learning_rate": 4.995895708862272e-05, "loss": 0.6093, "mean_token_accuracy": 0.8700804620981216, "num_tokens": 14247068.0, "step": 11965 }, { "entropy": 0.2754995569586754, "epoch": 2.7903018999883438, "grad_norm": 4.875, "learning_rate": 4.9958833402974255e-05, "loss": 0.7198, "mean_token_accuracy": 0.8932251870632172, "num_tokens": 14260782.0, "step": 11970 }, { "entropy": 0.2595931053161621, "epoch": 2.7914675370089754, "grad_norm": 3.546875, "learning_rate": 4.995870953154585e-05, "loss": 0.5462, "mean_token_accuracy": 0.9064160704612731, "num_tokens": 14270942.0, "step": 11975 }, { "entropy": 0.3655819445848465, "epoch": 2.792633174029607, "grad_norm": 5.5, "learning_rate": 4.995858547433934e-05, "loss": 0.7201, "mean_token_accuracy": 0.8726738154888153, "num_tokens": 14279499.0, "step": 11980 }, { "entropy": 0.23394382521510124, "epoch": 2.793798811050239, "grad_norm": 0.380859375, "learning_rate": 4.995846123135658e-05, "loss": 0.405, "mean_token_accuracy": 0.9207181870937348, "num_tokens": 14301846.0, "step": 11985 }, { "entropy": 0.29523569345474243, "epoch": 2.794964448070871, "grad_norm": 1.640625, "learning_rate": 4.9958336802599426e-05, "loss": 0.572, "mean_token_accuracy": 0.8870051562786102, "num_tokens": 14314057.0, "step": 11990 }, { "entropy": 0.1730364289134741, "epoch": 2.7961300850915025, "grad_norm": 3.609375, "learning_rate": 4.995821218806973e-05, "loss": 0.2169, "mean_token_accuracy": 0.9259053111076355, "num_tokens": 14334832.0, "step": 11995 }, { "entropy": 0.24064609967172146, "epoch": 2.797295722112134, "grad_norm": 5.625, "learning_rate": 4.995808738776936e-05, "loss": 0.3937, "mean_token_accuracy": 0.9231860220432282, "num_tokens": 14349962.0, "step": 12000 }, { "entropy": 0.20124710947275162, "epoch": 2.798461359132766, "grad_norm": 5.125, "learning_rate": 4.9957962401700165e-05, "loss": 0.3264, "mean_token_accuracy": 0.9315138995647431, "num_tokens": 14375096.0, "step": 12005 }, { "entropy": 0.28214637413620947, "epoch": 2.799626996153398, "grad_norm": 3.734375, "learning_rate": 4.9957837229864006e-05, "loss": 0.4613, "mean_token_accuracy": 0.9064045310020447, "num_tokens": 14390327.0, "step": 12010 }, { "entropy": 0.2217831529676914, "epoch": 2.8007926331740296, "grad_norm": 4.4375, "learning_rate": 4.995771187226277e-05, "loss": 0.3759, "mean_token_accuracy": 0.9254004776477813, "num_tokens": 14408761.0, "step": 12015 }, { "entropy": 0.22049227058887483, "epoch": 2.8019582701946613, "grad_norm": 6.28125, "learning_rate": 4.99575863288983e-05, "loss": 0.4114, "mean_token_accuracy": 0.9224657356739044, "num_tokens": 14419772.0, "step": 12020 }, { "entropy": 0.19187535513192416, "epoch": 2.8031239072152934, "grad_norm": 0.6484375, "learning_rate": 4.9957460599772484e-05, "loss": 0.256, "mean_token_accuracy": 0.9346370995044708, "num_tokens": 14445762.0, "step": 12025 }, { "entropy": 0.1792112410068512, "epoch": 2.804289544235925, "grad_norm": 4.90625, "learning_rate": 4.9957334684887195e-05, "loss": 0.2564, "mean_token_accuracy": 0.9480877816677094, "num_tokens": 14466302.0, "step": 12030 }, { "entropy": 0.23121217228472232, "epoch": 2.8054551812565567, "grad_norm": 5.53125, "learning_rate": 4.995720858424431e-05, "loss": 0.3067, "mean_token_accuracy": 0.9157763183116913, "num_tokens": 14485887.0, "step": 12035 }, { "entropy": 0.2949991822242737, "epoch": 2.8066208182771883, "grad_norm": 3.34375, "learning_rate": 4.9957082297845706e-05, "loss": 0.4807, "mean_token_accuracy": 0.9019329190254212, "num_tokens": 14499730.0, "step": 12040 }, { "entropy": 0.2044808973558247, "epoch": 2.80778645529782, "grad_norm": 7.1875, "learning_rate": 4.9956955825693267e-05, "loss": 0.3778, "mean_token_accuracy": 0.9192876040935516, "num_tokens": 14530684.0, "step": 12045 }, { "entropy": 0.2982170686125755, "epoch": 2.808952092318452, "grad_norm": 5.59375, "learning_rate": 4.995682916778889e-05, "loss": 0.5481, "mean_token_accuracy": 0.8952968299388886, "num_tokens": 14546286.0, "step": 12050 }, { "entropy": 0.2020443793386221, "epoch": 2.8101177293390838, "grad_norm": 3.03125, "learning_rate": 4.995670232413444e-05, "loss": 0.378, "mean_token_accuracy": 0.9278689801692963, "num_tokens": 14561116.0, "step": 12055 }, { "entropy": 0.42926357612013816, "epoch": 2.8112833663597154, "grad_norm": 1.3125, "learning_rate": 4.9956575294731836e-05, "loss": 0.6813, "mean_token_accuracy": 0.8541025578975677, "num_tokens": 14586798.0, "step": 12060 }, { "entropy": 0.24253510124981403, "epoch": 2.8124490033803475, "grad_norm": 7.25, "learning_rate": 4.9956448079582946e-05, "loss": 0.5191, "mean_token_accuracy": 0.9105861961841584, "num_tokens": 14606560.0, "step": 12065 }, { "entropy": 0.32413134127855303, "epoch": 2.813614640400979, "grad_norm": 5.3125, "learning_rate": 4.995632067868969e-05, "loss": 0.3997, "mean_token_accuracy": 0.8935954630374908, "num_tokens": 14631434.0, "step": 12070 }, { "entropy": 0.19401369988918304, "epoch": 2.814780277421611, "grad_norm": 5.9375, "learning_rate": 4.995619309205395e-05, "loss": 0.4599, "mean_token_accuracy": 0.9210140883922577, "num_tokens": 14653039.0, "step": 12075 }, { "entropy": 0.2677511714398861, "epoch": 2.815945914442243, "grad_norm": 5.28125, "learning_rate": 4.995606531967764e-05, "loss": 0.5118, "mean_token_accuracy": 0.9019227385520935, "num_tokens": 14663412.0, "step": 12080 }, { "entropy": 0.21213569901883603, "epoch": 2.8171115514628746, "grad_norm": 5.125, "learning_rate": 4.995593736156266e-05, "loss": 0.2504, "mean_token_accuracy": 0.9331638395786286, "num_tokens": 14694986.0, "step": 12085 }, { "entropy": 0.22414507642388343, "epoch": 2.8182771884835063, "grad_norm": 4.3125, "learning_rate": 4.995580921771091e-05, "loss": 0.4155, "mean_token_accuracy": 0.9196042895317078, "num_tokens": 14707633.0, "step": 12090 }, { "entropy": 0.31253494918346403, "epoch": 2.819442825504138, "grad_norm": 6.03125, "learning_rate": 4.9955680888124324e-05, "loss": 0.735, "mean_token_accuracy": 0.8750035762786865, "num_tokens": 14716172.0, "step": 12095 }, { "entropy": 0.2718751896172762, "epoch": 2.8206084625247696, "grad_norm": 6.1875, "learning_rate": 4.9955552372804796e-05, "loss": 0.36, "mean_token_accuracy": 0.9038479328155518, "num_tokens": 14746025.0, "step": 12100 }, { "entropy": 0.25656766891479493, "epoch": 2.8217740995454017, "grad_norm": 5.21875, "learning_rate": 4.9955423671754254e-05, "loss": 0.4172, "mean_token_accuracy": 0.9210475564002991, "num_tokens": 14758139.0, "step": 12105 }, { "entropy": 0.22321480922400952, "epoch": 2.8229397365660334, "grad_norm": 5.5, "learning_rate": 4.995529478497461e-05, "loss": 0.4913, "mean_token_accuracy": 0.9010767698287964, "num_tokens": 14777336.0, "step": 12110 }, { "entropy": 0.25825431272387506, "epoch": 2.824105373586665, "grad_norm": 3.25, "learning_rate": 4.9955165712467774e-05, "loss": 0.5626, "mean_token_accuracy": 0.8835008561611175, "num_tokens": 14797899.0, "step": 12115 }, { "entropy": 0.3582488939166069, "epoch": 2.825271010607297, "grad_norm": 5.84375, "learning_rate": 4.995503645423569e-05, "loss": 0.8463, "mean_token_accuracy": 0.8668466031551361, "num_tokens": 14807031.0, "step": 12120 }, { "entropy": 0.2489135056734085, "epoch": 2.826436647627929, "grad_norm": 5.15625, "learning_rate": 4.995490701028028e-05, "loss": 0.4786, "mean_token_accuracy": 0.9056442618370056, "num_tokens": 14819392.0, "step": 12125 }, { "entropy": 0.19951955564320087, "epoch": 2.8276022846485604, "grad_norm": 6.1875, "learning_rate": 4.9954777380603476e-05, "loss": 0.3804, "mean_token_accuracy": 0.922314727306366, "num_tokens": 14837431.0, "step": 12130 }, { "entropy": 0.26699568033218385, "epoch": 2.828767921669192, "grad_norm": 6.0, "learning_rate": 4.995464756520721e-05, "loss": 0.4694, "mean_token_accuracy": 0.918603527545929, "num_tokens": 14848180.0, "step": 12135 }, { "entropy": 0.46731987968087196, "epoch": 2.8299335586898238, "grad_norm": 6.21875, "learning_rate": 4.9954517564093406e-05, "loss": 0.8151, "mean_token_accuracy": 0.8681573092937469, "num_tokens": 14867566.0, "step": 12140 }, { "entropy": 0.30520070940256117, "epoch": 2.831099195710456, "grad_norm": 5.71875, "learning_rate": 4.9954387377264024e-05, "loss": 0.6726, "mean_token_accuracy": 0.8926642715930939, "num_tokens": 14876554.0, "step": 12145 }, { "entropy": 0.26313832104206086, "epoch": 2.8322648327310875, "grad_norm": 6.65625, "learning_rate": 4.995425700472098e-05, "loss": 0.6288, "mean_token_accuracy": 0.880269593000412, "num_tokens": 14893948.0, "step": 12150 }, { "entropy": 0.26269229091703894, "epoch": 2.833430469751719, "grad_norm": 0.64453125, "learning_rate": 4.995412644646625e-05, "loss": 0.3725, "mean_token_accuracy": 0.9041543662548065, "num_tokens": 14928840.0, "step": 12155 }, { "entropy": 0.24252827167510987, "epoch": 2.8345961067723513, "grad_norm": 3.0625, "learning_rate": 4.9953995702501746e-05, "loss": 0.3369, "mean_token_accuracy": 0.9155629277229309, "num_tokens": 14945946.0, "step": 12160 }, { "entropy": 0.22055025771260262, "epoch": 2.835761743792983, "grad_norm": 1.234375, "learning_rate": 4.9953864772829444e-05, "loss": 0.3357, "mean_token_accuracy": 0.901597386598587, "num_tokens": 14965542.0, "step": 12165 }, { "entropy": 0.18117874898016453, "epoch": 2.8369273808136146, "grad_norm": 1.125, "learning_rate": 4.9953733657451286e-05, "loss": 0.2497, "mean_token_accuracy": 0.9453995823860168, "num_tokens": 14983251.0, "step": 12170 }, { "entropy": 0.18524248637259005, "epoch": 2.8380930178342463, "grad_norm": 0.59765625, "learning_rate": 4.9953602356369225e-05, "loss": 0.3304, "mean_token_accuracy": 0.9263292074203491, "num_tokens": 15008932.0, "step": 12175 }, { "entropy": 0.2279165990650654, "epoch": 2.839258654854878, "grad_norm": 1.7421875, "learning_rate": 4.995347086958522e-05, "loss": 0.3454, "mean_token_accuracy": 0.9205220639705658, "num_tokens": 15038147.0, "step": 12180 }, { "entropy": 0.23292369097471238, "epoch": 2.84042429187551, "grad_norm": 11.1875, "learning_rate": 4.9953339197101235e-05, "loss": 0.5381, "mean_token_accuracy": 0.9021409928798676, "num_tokens": 15050882.0, "step": 12185 }, { "entropy": 0.17848289832472802, "epoch": 2.8415899288961417, "grad_norm": 0.515625, "learning_rate": 4.9953207338919235e-05, "loss": 0.2198, "mean_token_accuracy": 0.94959676861763, "num_tokens": 15077393.0, "step": 12190 }, { "entropy": 0.2969540163874626, "epoch": 2.8427555659167734, "grad_norm": 7.15625, "learning_rate": 4.995307529504117e-05, "loss": 0.6852, "mean_token_accuracy": 0.8878288805484772, "num_tokens": 15091979.0, "step": 12195 }, { "entropy": 0.20284539386630057, "epoch": 2.8439212029374055, "grad_norm": 3.53125, "learning_rate": 4.995294306546904e-05, "loss": 0.3946, "mean_token_accuracy": 0.9282336890697479, "num_tokens": 15114542.0, "step": 12200 }, { "entropy": 0.20276891775429248, "epoch": 2.845086839958037, "grad_norm": 7.0625, "learning_rate": 4.995281065020479e-05, "loss": 0.3412, "mean_token_accuracy": 0.9302069902420044, "num_tokens": 15138680.0, "step": 12205 }, { "entropy": 0.19186981171369552, "epoch": 2.846252476978669, "grad_norm": 7.4375, "learning_rate": 4.99526780492504e-05, "loss": 0.524, "mean_token_accuracy": 0.91390061378479, "num_tokens": 15169710.0, "step": 12210 }, { "entropy": 0.2559481278061867, "epoch": 2.847418113999301, "grad_norm": 2.328125, "learning_rate": 4.995254526260786e-05, "loss": 0.5895, "mean_token_accuracy": 0.8948764860630035, "num_tokens": 15180608.0, "step": 12215 }, { "entropy": 0.2940599586814642, "epoch": 2.8485837510199326, "grad_norm": 5.1875, "learning_rate": 4.995241229027913e-05, "loss": 0.4582, "mean_token_accuracy": 0.892432689666748, "num_tokens": 15213859.0, "step": 12220 }, { "entropy": 0.22507907301187516, "epoch": 2.849749388040564, "grad_norm": 1.9296875, "learning_rate": 4.995227913226621e-05, "loss": 0.3666, "mean_token_accuracy": 0.9292806625366211, "num_tokens": 15235352.0, "step": 12225 }, { "entropy": 0.23611784502863883, "epoch": 2.850915025061196, "grad_norm": 7.0625, "learning_rate": 4.9952145788571074e-05, "loss": 0.311, "mean_token_accuracy": 0.9268670618534088, "num_tokens": 15264026.0, "step": 12230 }, { "entropy": 0.22443998456001282, "epoch": 2.8520806620818275, "grad_norm": 4.96875, "learning_rate": 4.995201225919572e-05, "loss": 0.4376, "mean_token_accuracy": 0.9172143876552582, "num_tokens": 15274274.0, "step": 12235 }, { "entropy": 0.2179919883608818, "epoch": 2.8532462991024596, "grad_norm": 3.9375, "learning_rate": 4.995187854414213e-05, "loss": 0.3341, "mean_token_accuracy": 0.9146465182304382, "num_tokens": 15291454.0, "step": 12240 }, { "entropy": 0.12379531040787697, "epoch": 2.8544119361230913, "grad_norm": 0.9296875, "learning_rate": 4.9951744643412304e-05, "loss": 0.1584, "mean_token_accuracy": 0.964534604549408, "num_tokens": 15315876.0, "step": 12245 }, { "entropy": 0.14848388805985452, "epoch": 2.855577573143723, "grad_norm": 1.34375, "learning_rate": 4.995161055700824e-05, "loss": 0.1625, "mean_token_accuracy": 0.954338264465332, "num_tokens": 15343677.0, "step": 12250 }, { "entropy": 0.2934903770685196, "epoch": 2.856743210164355, "grad_norm": 2.453125, "learning_rate": 4.995147628493193e-05, "loss": 0.5294, "mean_token_accuracy": 0.9005158245563507, "num_tokens": 15353076.0, "step": 12255 }, { "entropy": 0.18981227725744249, "epoch": 2.8579088471849867, "grad_norm": 4.09375, "learning_rate": 4.995134182718538e-05, "loss": 0.2938, "mean_token_accuracy": 0.9368978083133698, "num_tokens": 15372999.0, "step": 12260 }, { "entropy": 0.21589052006602288, "epoch": 2.8590744842056184, "grad_norm": 0.6328125, "learning_rate": 4.99512071837706e-05, "loss": 0.3413, "mean_token_accuracy": 0.9222669243812561, "num_tokens": 15394936.0, "step": 12265 }, { "entropy": 0.41172001510858536, "epoch": 2.86024012122625, "grad_norm": 5.3125, "learning_rate": 4.9951072354689585e-05, "loss": 0.6215, "mean_token_accuracy": 0.8982397258281708, "num_tokens": 15428941.0, "step": 12270 }, { "entropy": 0.21318286955356597, "epoch": 2.8614057582468817, "grad_norm": 5.53125, "learning_rate": 4.9950937339944365e-05, "loss": 0.4136, "mean_token_accuracy": 0.926407641172409, "num_tokens": 15446441.0, "step": 12275 }, { "entropy": 0.2053906600922346, "epoch": 2.862571395267514, "grad_norm": 1.75, "learning_rate": 4.9950802139536937e-05, "loss": 0.3031, "mean_token_accuracy": 0.9017199754714966, "num_tokens": 15468009.0, "step": 12280 }, { "entropy": 0.11248552985489368, "epoch": 2.8637370322881455, "grad_norm": 1.296875, "learning_rate": 4.9950666753469325e-05, "loss": 0.141, "mean_token_accuracy": 0.967561000585556, "num_tokens": 15517507.0, "step": 12285 }, { "entropy": 0.30170712172985076, "epoch": 2.864902669308777, "grad_norm": 6.0625, "learning_rate": 4.995053118174353e-05, "loss": 0.5021, "mean_token_accuracy": 0.882989478111267, "num_tokens": 15528528.0, "step": 12290 }, { "entropy": 0.35192948803305624, "epoch": 2.8660683063294092, "grad_norm": 1.4140625, "learning_rate": 4.9950395424361604e-05, "loss": 0.6302, "mean_token_accuracy": 0.8598722696304322, "num_tokens": 15557410.0, "step": 12295 }, { "entropy": 0.32548448368906974, "epoch": 2.867233943350041, "grad_norm": 2.953125, "learning_rate": 4.9950259481325554e-05, "loss": 0.6216, "mean_token_accuracy": 0.8807716012001038, "num_tokens": 15579910.0, "step": 12300 }, { "entropy": 0.27001427859067917, "epoch": 2.8683995803706726, "grad_norm": 3.46875, "learning_rate": 4.99501233526374e-05, "loss": 0.4658, "mean_token_accuracy": 0.8968069672584533, "num_tokens": 15602595.0, "step": 12305 }, { "entropy": 0.22677491679787637, "epoch": 2.869565217391304, "grad_norm": 7.28125, "learning_rate": 4.994998703829919e-05, "loss": 0.5024, "mean_token_accuracy": 0.9079581201076508, "num_tokens": 15614836.0, "step": 12310 }, { "entropy": 0.22498192228376865, "epoch": 2.870730854411936, "grad_norm": 4.53125, "learning_rate": 4.994985053831295e-05, "loss": 0.4057, "mean_token_accuracy": 0.9227579593658447, "num_tokens": 15629028.0, "step": 12315 }, { "entropy": 0.21907037869095802, "epoch": 2.871896491432568, "grad_norm": 1.90625, "learning_rate": 4.99497138526807e-05, "loss": 0.2842, "mean_token_accuracy": 0.9375577330589294, "num_tokens": 15670225.0, "step": 12320 }, { "entropy": 0.21589868012815713, "epoch": 2.8730621284531996, "grad_norm": 3.078125, "learning_rate": 4.99495769814045e-05, "loss": 0.3292, "mean_token_accuracy": 0.9117425501346588, "num_tokens": 15700279.0, "step": 12325 }, { "entropy": 0.19196809008717536, "epoch": 2.8742277654738313, "grad_norm": 6.0, "learning_rate": 4.994943992448638e-05, "loss": 0.3625, "mean_token_accuracy": 0.9180894255638122, "num_tokens": 15714776.0, "step": 12330 }, { "entropy": 0.19620308466255665, "epoch": 2.8753934024944634, "grad_norm": 2.890625, "learning_rate": 4.994930268192839e-05, "loss": 0.3987, "mean_token_accuracy": 0.9193916916847229, "num_tokens": 15741913.0, "step": 12335 }, { "entropy": 0.28176852613687514, "epoch": 2.876559039515095, "grad_norm": 1.0, "learning_rate": 4.9949165253732565e-05, "loss": 0.4444, "mean_token_accuracy": 0.904612535238266, "num_tokens": 15763861.0, "step": 12340 }, { "entropy": 0.18242901414632798, "epoch": 2.8777246765357267, "grad_norm": 6.21875, "learning_rate": 4.994902763990097e-05, "loss": 0.4322, "mean_token_accuracy": 0.911017370223999, "num_tokens": 15778174.0, "step": 12345 }, { "entropy": 0.29833044596016406, "epoch": 2.878890313556359, "grad_norm": 0.99609375, "learning_rate": 4.994888984043564e-05, "loss": 0.3599, "mean_token_accuracy": 0.9066588163375855, "num_tokens": 15797512.0, "step": 12350 }, { "entropy": 0.19635452032089235, "epoch": 2.8800559505769905, "grad_norm": 2.765625, "learning_rate": 4.994875185533864e-05, "loss": 0.254, "mean_token_accuracy": 0.9306394994258881, "num_tokens": 15827727.0, "step": 12355 }, { "entropy": 0.27425159960985185, "epoch": 2.881221587597622, "grad_norm": 3.671875, "learning_rate": 4.994861368461203e-05, "loss": 0.5238, "mean_token_accuracy": 0.9004498362541199, "num_tokens": 15841398.0, "step": 12360 }, { "entropy": 0.2255168441683054, "epoch": 2.882387224618254, "grad_norm": 4.53125, "learning_rate": 4.994847532825786e-05, "loss": 0.4827, "mean_token_accuracy": 0.9102801382541656, "num_tokens": 15858611.0, "step": 12365 }, { "entropy": 0.25632177069783213, "epoch": 2.8835528616388855, "grad_norm": 0.77734375, "learning_rate": 4.9948336786278204e-05, "loss": 0.5552, "mean_token_accuracy": 0.9066780388355256, "num_tokens": 15878703.0, "step": 12370 }, { "entropy": 0.27798714153468607, "epoch": 2.8847184986595176, "grad_norm": 5.90625, "learning_rate": 4.994819805867512e-05, "loss": 0.4013, "mean_token_accuracy": 0.9049887299537659, "num_tokens": 15903115.0, "step": 12375 }, { "entropy": 0.27522594928741456, "epoch": 2.8858841356801492, "grad_norm": 7.125, "learning_rate": 4.994805914545068e-05, "loss": 0.5589, "mean_token_accuracy": 0.8919727146625519, "num_tokens": 15912967.0, "step": 12380 }, { "entropy": 0.2859261706471443, "epoch": 2.887049772700781, "grad_norm": 4.34375, "learning_rate": 4.994792004660696e-05, "loss": 0.6367, "mean_token_accuracy": 0.8898451209068299, "num_tokens": 15922730.0, "step": 12385 }, { "entropy": 0.21028335131704806, "epoch": 2.888215409721413, "grad_norm": 4.71875, "learning_rate": 4.9947780762146024e-05, "loss": 0.2133, "mean_token_accuracy": 0.9456324815750122, "num_tokens": 15952867.0, "step": 12390 }, { "entropy": 0.2555417686700821, "epoch": 2.8893810467420447, "grad_norm": 0.59375, "learning_rate": 4.994764129206996e-05, "loss": 0.4049, "mean_token_accuracy": 0.9144223809242249, "num_tokens": 15974845.0, "step": 12395 }, { "entropy": 0.2397582620382309, "epoch": 2.8905466837626763, "grad_norm": 1.7734375, "learning_rate": 4.994750163638084e-05, "loss": 0.2899, "mean_token_accuracy": 0.9156318068504333, "num_tokens": 15993164.0, "step": 12400 }, { "entropy": 0.2651869673281908, "epoch": 2.891712320783308, "grad_norm": 0.87109375, "learning_rate": 4.9947361795080746e-05, "loss": 0.3185, "mean_token_accuracy": 0.9095015168190003, "num_tokens": 16026317.0, "step": 12405 }, { "entropy": 0.19921966940164565, "epoch": 2.8928779578039396, "grad_norm": 3.21875, "learning_rate": 4.994722176817177e-05, "loss": 0.2915, "mean_token_accuracy": 0.9439963281154633, "num_tokens": 16043945.0, "step": 12410 }, { "entropy": 0.27657449916005133, "epoch": 2.8940435948245717, "grad_norm": 8.3125, "learning_rate": 4.994708155565599e-05, "loss": 0.2701, "mean_token_accuracy": 0.9275880873203277, "num_tokens": 16071942.0, "step": 12415 }, { "entropy": 0.22481039054691793, "epoch": 2.8952092318452034, "grad_norm": 0.57421875, "learning_rate": 4.9946941157535504e-05, "loss": 0.3073, "mean_token_accuracy": 0.9280721783638001, "num_tokens": 16104638.0, "step": 12420 }, { "entropy": 0.1833694539964199, "epoch": 2.896374868865835, "grad_norm": 4.125, "learning_rate": 4.994680057381241e-05, "loss": 0.3048, "mean_token_accuracy": 0.9380892634391784, "num_tokens": 16128815.0, "step": 12425 }, { "entropy": 0.2773961193859577, "epoch": 2.897540505886467, "grad_norm": 1.125, "learning_rate": 4.994665980448879e-05, "loss": 0.4982, "mean_token_accuracy": 0.9037495434284211, "num_tokens": 16149828.0, "step": 12430 }, { "entropy": 0.3096873864531517, "epoch": 2.898706142907099, "grad_norm": 3.71875, "learning_rate": 4.994651884956675e-05, "loss": 0.4956, "mean_token_accuracy": 0.8928325831890106, "num_tokens": 16167468.0, "step": 12435 }, { "entropy": 0.3326524205505848, "epoch": 2.8998717799277305, "grad_norm": 0.6875, "learning_rate": 4.9946377709048404e-05, "loss": 0.5563, "mean_token_accuracy": 0.8875405877828598, "num_tokens": 16195599.0, "step": 12440 }, { "entropy": 0.23935375213623047, "epoch": 2.901037416948362, "grad_norm": 1.3359375, "learning_rate": 4.994623638293584e-05, "loss": 0.4938, "mean_token_accuracy": 0.9130324602127076, "num_tokens": 16212825.0, "step": 12445 }, { "entropy": 0.20335310474038124, "epoch": 2.902203053968994, "grad_norm": 0.828125, "learning_rate": 4.994609487123118e-05, "loss": 0.2376, "mean_token_accuracy": 0.9485228300094605, "num_tokens": 16248299.0, "step": 12450 }, { "entropy": 0.20197730883955956, "epoch": 2.903368690989626, "grad_norm": 6.5625, "learning_rate": 4.994595317393651e-05, "loss": 0.3259, "mean_token_accuracy": 0.9246832191944122, "num_tokens": 16269044.0, "step": 12455 }, { "entropy": 0.2346881277859211, "epoch": 2.9045343280102576, "grad_norm": 4.4375, "learning_rate": 4.994581129105397e-05, "loss": 0.3763, "mean_token_accuracy": 0.9140804708003998, "num_tokens": 16281323.0, "step": 12460 }, { "entropy": 0.16434445828199387, "epoch": 2.9056999650308892, "grad_norm": 0.87890625, "learning_rate": 4.9945669222585656e-05, "loss": 0.2848, "mean_token_accuracy": 0.9422541975975036, "num_tokens": 16312296.0, "step": 12465 }, { "entropy": 0.24725492149591446, "epoch": 2.9068656020515213, "grad_norm": 5.96875, "learning_rate": 4.9945526968533694e-05, "loss": 0.5421, "mean_token_accuracy": 0.9044016242027283, "num_tokens": 16323044.0, "step": 12470 }, { "entropy": 0.2235550694167614, "epoch": 2.908031239072153, "grad_norm": 6.40625, "learning_rate": 4.9945384528900214e-05, "loss": 0.4786, "mean_token_accuracy": 0.9164490222930908, "num_tokens": 16335856.0, "step": 12475 }, { "entropy": 0.23978302478790284, "epoch": 2.9091968760927847, "grad_norm": 6.21875, "learning_rate": 4.994524190368733e-05, "loss": 0.303, "mean_token_accuracy": 0.9199592411518097, "num_tokens": 16358188.0, "step": 12480 }, { "entropy": 0.2748368114233017, "epoch": 2.9103625131134163, "grad_norm": 4.71875, "learning_rate": 4.994509909289716e-05, "loss": 0.4885, "mean_token_accuracy": 0.9092333793640137, "num_tokens": 16368468.0, "step": 12485 }, { "entropy": 0.21827991865575314, "epoch": 2.9115281501340484, "grad_norm": 3.9375, "learning_rate": 4.9944956096531856e-05, "loss": 0.263, "mean_token_accuracy": 0.9356384515762329, "num_tokens": 16415219.0, "step": 12490 }, { "entropy": 0.24743718206882476, "epoch": 2.91269378715468, "grad_norm": 2.9375, "learning_rate": 4.994481291459354e-05, "loss": 0.5131, "mean_token_accuracy": 0.8959756314754486, "num_tokens": 16427255.0, "step": 12495 }, { "entropy": 0.2174059823155403, "epoch": 2.9138594241753117, "grad_norm": 9.6875, "learning_rate": 4.9944669547084335e-05, "loss": 0.4839, "mean_token_accuracy": 0.9092626631259918, "num_tokens": 16447024.0, "step": 12500 }, { "entropy": 0.18768667057156563, "epoch": 2.9150250611959434, "grad_norm": 2.484375, "learning_rate": 4.9944525994006395e-05, "loss": 0.2285, "mean_token_accuracy": 0.9509588420391083, "num_tokens": 16464340.0, "step": 12505 }, { "entropy": 0.1717726208269596, "epoch": 2.9161906982165755, "grad_norm": 5.625, "learning_rate": 4.994438225536185e-05, "loss": 0.2924, "mean_token_accuracy": 0.9374354660511017, "num_tokens": 16482817.0, "step": 12510 }, { "entropy": 0.2385638602077961, "epoch": 2.917356335237207, "grad_norm": 7.59375, "learning_rate": 4.9944238331152856e-05, "loss": 0.2932, "mean_token_accuracy": 0.9031573534011841, "num_tokens": 16508927.0, "step": 12515 }, { "entropy": 0.15317164026200772, "epoch": 2.918521972257839, "grad_norm": 1.6796875, "learning_rate": 4.9944094221381546e-05, "loss": 0.1075, "mean_token_accuracy": 0.9532822012901306, "num_tokens": 16543572.0, "step": 12520 }, { "entropy": 0.23142515420913695, "epoch": 2.919687609278471, "grad_norm": 0.80859375, "learning_rate": 4.9943949926050085e-05, "loss": 0.2399, "mean_token_accuracy": 0.9371400833129883, "num_tokens": 16576364.0, "step": 12525 }, { "entropy": 0.2312719516456127, "epoch": 2.9208532462991026, "grad_norm": 1.21875, "learning_rate": 4.99438054451606e-05, "loss": 0.3978, "mean_token_accuracy": 0.9079957783222199, "num_tokens": 16606211.0, "step": 12530 }, { "entropy": 0.19937541633844375, "epoch": 2.9220188833197343, "grad_norm": 2.703125, "learning_rate": 4.994366077871526e-05, "loss": 0.4725, "mean_token_accuracy": 0.9072546362876892, "num_tokens": 16627977.0, "step": 12535 }, { "entropy": 0.306121326982975, "epoch": 2.923184520340366, "grad_norm": 8.375, "learning_rate": 4.9943515926716225e-05, "loss": 0.4601, "mean_token_accuracy": 0.8869144976139068, "num_tokens": 16650660.0, "step": 12540 }, { "entropy": 0.2306956544518471, "epoch": 2.9243501573609976, "grad_norm": 4.96875, "learning_rate": 4.994337088916566e-05, "loss": 0.3131, "mean_token_accuracy": 0.9148689210414886, "num_tokens": 16675489.0, "step": 12545 }, { "entropy": 0.21608939282596112, "epoch": 2.9255157943816297, "grad_norm": 1.0546875, "learning_rate": 4.994322566606571e-05, "loss": 0.3405, "mean_token_accuracy": 0.9272489190101624, "num_tokens": 16707904.0, "step": 12550 }, { "entropy": 0.23308916240930558, "epoch": 2.9266814314022613, "grad_norm": 1.5078125, "learning_rate": 4.994308025741855e-05, "loss": 0.2674, "mean_token_accuracy": 0.9193484723567963, "num_tokens": 16742210.0, "step": 12555 }, { "entropy": 0.37350448220968246, "epoch": 2.927847068422893, "grad_norm": 3.046875, "learning_rate": 4.994293466322635e-05, "loss": 0.7759, "mean_token_accuracy": 0.8571257382631302, "num_tokens": 16761736.0, "step": 12560 }, { "entropy": 0.2372276846319437, "epoch": 2.929012705443525, "grad_norm": 5.40625, "learning_rate": 4.994278888349128e-05, "loss": 0.4803, "mean_token_accuracy": 0.9188945889472961, "num_tokens": 16776070.0, "step": 12565 }, { "entropy": 0.26332382038235663, "epoch": 2.9301783424641568, "grad_norm": 8.25, "learning_rate": 4.99426429182155e-05, "loss": 0.396, "mean_token_accuracy": 0.908951860666275, "num_tokens": 16797430.0, "step": 12570 }, { "entropy": 0.23919696919620037, "epoch": 2.9313439794847884, "grad_norm": 1.3515625, "learning_rate": 4.994249676740121e-05, "loss": 0.3972, "mean_token_accuracy": 0.9126874089241028, "num_tokens": 16818531.0, "step": 12575 }, { "entropy": 0.16979910265654324, "epoch": 2.93250961650542, "grad_norm": 0.65625, "learning_rate": 4.994235043105058e-05, "loss": 0.3149, "mean_token_accuracy": 0.9449944615364074, "num_tokens": 16843185.0, "step": 12580 }, { "entropy": 0.26157992482185366, "epoch": 2.9336752535260517, "grad_norm": 2.640625, "learning_rate": 4.994220390916579e-05, "loss": 0.4379, "mean_token_accuracy": 0.9097245037555695, "num_tokens": 16856276.0, "step": 12585 }, { "entropy": 0.3574902083724737, "epoch": 2.934840890546684, "grad_norm": 6.84375, "learning_rate": 4.9942057201749024e-05, "loss": 0.8333, "mean_token_accuracy": 0.8867159873247147, "num_tokens": 16897168.0, "step": 12590 }, { "entropy": 0.26900804713368415, "epoch": 2.9360065275673155, "grad_norm": 1.46875, "learning_rate": 4.9941910308802474e-05, "loss": 0.4726, "mean_token_accuracy": 0.911369127035141, "num_tokens": 16913957.0, "step": 12595 }, { "entropy": 0.20102691166102887, "epoch": 2.937172164587947, "grad_norm": 1.1640625, "learning_rate": 4.9941763230328325e-05, "loss": 0.4644, "mean_token_accuracy": 0.9198138236999511, "num_tokens": 16933970.0, "step": 12600 }, { "entropy": 0.2640152137726545, "epoch": 2.9383378016085793, "grad_norm": 3.71875, "learning_rate": 4.994161596632877e-05, "loss": 0.2972, "mean_token_accuracy": 0.9337838649749756, "num_tokens": 16955313.0, "step": 12605 }, { "entropy": 0.22733655273914338, "epoch": 2.939503438629211, "grad_norm": 6.53125, "learning_rate": 4.9941468516806015e-05, "loss": 0.4618, "mean_token_accuracy": 0.9171272695064545, "num_tokens": 16967898.0, "step": 12610 }, { "entropy": 0.1735091220587492, "epoch": 2.9406690756498426, "grad_norm": 3.9375, "learning_rate": 4.9941320881762244e-05, "loss": 0.3008, "mean_token_accuracy": 0.9320429384708404, "num_tokens": 16993312.0, "step": 12615 }, { "entropy": 0.23960115239024163, "epoch": 2.9418347126704743, "grad_norm": 4.0, "learning_rate": 4.994117306119967e-05, "loss": 0.442, "mean_token_accuracy": 0.9177091896533967, "num_tokens": 17005693.0, "step": 12620 }, { "entropy": 0.23045891597867013, "epoch": 2.9430003496911064, "grad_norm": 0.91015625, "learning_rate": 4.9941025055120496e-05, "loss": 0.3941, "mean_token_accuracy": 0.9129961967468262, "num_tokens": 17027734.0, "step": 12625 }, { "entropy": 0.23398528508841993, "epoch": 2.944165986711738, "grad_norm": 1.2109375, "learning_rate": 4.994087686352692e-05, "loss": 0.3152, "mean_token_accuracy": 0.9212701916694641, "num_tokens": 17056712.0, "step": 12630 }, { "entropy": 0.2694935627281666, "epoch": 2.9453316237323697, "grad_norm": 5.75, "learning_rate": 4.994072848642116e-05, "loss": 0.4259, "mean_token_accuracy": 0.8920988500118255, "num_tokens": 17070954.0, "step": 12635 }, { "entropy": 0.20029729194939136, "epoch": 2.9464972607530013, "grad_norm": 7.5625, "learning_rate": 4.994057992380542e-05, "loss": 0.4291, "mean_token_accuracy": 0.9236281037330627, "num_tokens": 17091261.0, "step": 12640 }, { "entropy": 0.23001744896173476, "epoch": 2.9476628977736334, "grad_norm": 0.5546875, "learning_rate": 4.994043117568194e-05, "loss": 0.3814, "mean_token_accuracy": 0.9207939684391022, "num_tokens": 17112862.0, "step": 12645 }, { "entropy": 0.22075748890638353, "epoch": 2.948828534794265, "grad_norm": 1.5546875, "learning_rate": 4.99402822420529e-05, "loss": 0.5102, "mean_token_accuracy": 0.9173259139060974, "num_tokens": 17125175.0, "step": 12650 }, { "entropy": 0.21131634674966335, "epoch": 2.9499941718148968, "grad_norm": 7.71875, "learning_rate": 4.994013312292055e-05, "loss": 0.4822, "mean_token_accuracy": 0.9125283896923065, "num_tokens": 17142485.0, "step": 12655 }, { "entropy": 0.21084157526493072, "epoch": 2.951159808835529, "grad_norm": 0.94140625, "learning_rate": 4.99399838182871e-05, "loss": 0.4204, "mean_token_accuracy": 0.9137578189373017, "num_tokens": 17170566.0, "step": 12660 }, { "entropy": 0.34762868024408816, "epoch": 2.9523254458561605, "grad_norm": 8.0625, "learning_rate": 4.9939834328154786e-05, "loss": 0.5129, "mean_token_accuracy": 0.8656055808067322, "num_tokens": 17193691.0, "step": 12665 }, { "entropy": 0.2598606400191784, "epoch": 2.953491082876792, "grad_norm": 2.4375, "learning_rate": 4.993968465252583e-05, "loss": 0.3879, "mean_token_accuracy": 0.9147687435150147, "num_tokens": 17207788.0, "step": 12670 }, { "entropy": 0.2572662975639105, "epoch": 2.954656719897424, "grad_norm": 4.4375, "learning_rate": 4.9939534791402464e-05, "loss": 0.4912, "mean_token_accuracy": 0.8920653343200684, "num_tokens": 17234544.0, "step": 12675 }, { "entropy": 0.2845245450735092, "epoch": 2.9558223569180555, "grad_norm": 6.03125, "learning_rate": 4.9939384744786924e-05, "loss": 0.4841, "mean_token_accuracy": 0.8954847276210784, "num_tokens": 17254682.0, "step": 12680 }, { "entropy": 0.20575838908553123, "epoch": 2.9569879939386876, "grad_norm": 6.9375, "learning_rate": 4.993923451268144e-05, "loss": 0.4489, "mean_token_accuracy": 0.9146290481090545, "num_tokens": 17270762.0, "step": 12685 }, { "entropy": 0.23746194317936897, "epoch": 2.9581536309593193, "grad_norm": 5.375, "learning_rate": 4.993908409508827e-05, "loss": 0.4141, "mean_token_accuracy": 0.9214969217777252, "num_tokens": 17282257.0, "step": 12690 }, { "entropy": 0.25392072908580304, "epoch": 2.959319267979951, "grad_norm": 6.28125, "learning_rate": 4.9938933492009646e-05, "loss": 0.3244, "mean_token_accuracy": 0.930941891670227, "num_tokens": 17296030.0, "step": 12695 }, { "entropy": 0.30570143423974516, "epoch": 2.960484905000583, "grad_norm": 7.9375, "learning_rate": 4.993878270344781e-05, "loss": 0.4522, "mean_token_accuracy": 0.9098774671554566, "num_tokens": 17317480.0, "step": 12700 }, { "entropy": 0.4143160991370678, "epoch": 2.9616505420212147, "grad_norm": 5.28125, "learning_rate": 4.993863172940502e-05, "loss": 0.7403, "mean_token_accuracy": 0.8491365373134613, "num_tokens": 17342663.0, "step": 12705 }, { "entropy": 0.2186424985527992, "epoch": 2.9628161790418464, "grad_norm": 2.34375, "learning_rate": 4.9938480569883525e-05, "loss": 0.3605, "mean_token_accuracy": 0.932968944311142, "num_tokens": 17355353.0, "step": 12710 }, { "entropy": 0.14407789390534162, "epoch": 2.963981816062478, "grad_norm": 0.7578125, "learning_rate": 4.993832922488557e-05, "loss": 0.1612, "mean_token_accuracy": 0.9588046312332154, "num_tokens": 17394368.0, "step": 12715 }, { "entropy": 0.20839739665389062, "epoch": 2.9651474530831097, "grad_norm": 3.6875, "learning_rate": 4.993817769441341e-05, "loss": 0.4005, "mean_token_accuracy": 0.9120141923427582, "num_tokens": 17406822.0, "step": 12720 }, { "entropy": 0.18153918944299222, "epoch": 2.966313090103742, "grad_norm": 0.63671875, "learning_rate": 4.993802597846933e-05, "loss": 0.2279, "mean_token_accuracy": 0.9474944174289703, "num_tokens": 17428079.0, "step": 12725 }, { "entropy": 0.19154103100299835, "epoch": 2.9674787271243734, "grad_norm": 4.09375, "learning_rate": 4.993787407705556e-05, "loss": 0.2736, "mean_token_accuracy": 0.9374960124492645, "num_tokens": 17447557.0, "step": 12730 }, { "entropy": 0.1938954271376133, "epoch": 2.968644364145005, "grad_norm": 1.4609375, "learning_rate": 4.993772199017439e-05, "loss": 0.2667, "mean_token_accuracy": 0.9410579323768615, "num_tokens": 17463243.0, "step": 12735 }, { "entropy": 0.261495478451252, "epoch": 2.969810001165637, "grad_norm": 5.53125, "learning_rate": 4.993756971782807e-05, "loss": 0.5759, "mean_token_accuracy": 0.896853107213974, "num_tokens": 17472768.0, "step": 12740 }, { "entropy": 0.187485178001225, "epoch": 2.970975638186269, "grad_norm": 4.5625, "learning_rate": 4.993741726001888e-05, "loss": 0.4988, "mean_token_accuracy": 0.9167073547840119, "num_tokens": 17496526.0, "step": 12745 }, { "entropy": 0.18627914749085903, "epoch": 2.9721412752069005, "grad_norm": 0.7734375, "learning_rate": 4.99372646167491e-05, "loss": 0.2441, "mean_token_accuracy": 0.9456724762916565, "num_tokens": 17534475.0, "step": 12750 }, { "entropy": 0.1453916199505329, "epoch": 2.973306912227532, "grad_norm": 4.46875, "learning_rate": 4.993711178802099e-05, "loss": 0.253, "mean_token_accuracy": 0.9423149466514588, "num_tokens": 17559219.0, "step": 12755 }, { "entropy": 0.19363697096705437, "epoch": 2.9744725492481643, "grad_norm": 1.4296875, "learning_rate": 4.993695877383684e-05, "loss": 0.3023, "mean_token_accuracy": 0.9373092234134675, "num_tokens": 17576680.0, "step": 12760 }, { "entropy": 0.2151080034673214, "epoch": 2.975638186268796, "grad_norm": 0.451171875, "learning_rate": 4.9936805574198925e-05, "loss": 0.2392, "mean_token_accuracy": 0.9359769761562348, "num_tokens": 17606807.0, "step": 12765 }, { "entropy": 0.2870198667049408, "epoch": 2.9768038232894276, "grad_norm": 4.28125, "learning_rate": 4.993665218910953e-05, "loss": 0.5378, "mean_token_accuracy": 0.8911948084831238, "num_tokens": 17624236.0, "step": 12770 }, { "entropy": 0.24478442054241895, "epoch": 2.9779694603100593, "grad_norm": 0.54296875, "learning_rate": 4.9936498618570956e-05, "loss": 0.4086, "mean_token_accuracy": 0.9145197212696076, "num_tokens": 17661090.0, "step": 12775 }, { "entropy": 0.22513844072818756, "epoch": 2.9791350973306914, "grad_norm": 8.875, "learning_rate": 4.9936344862585474e-05, "loss": 0.4294, "mean_token_accuracy": 0.9144389748573303, "num_tokens": 17673989.0, "step": 12780 }, { "entropy": 0.2467608630657196, "epoch": 2.980300734351323, "grad_norm": 7.8125, "learning_rate": 4.993619092115539e-05, "loss": 0.4108, "mean_token_accuracy": 0.9199643731117249, "num_tokens": 17694785.0, "step": 12785 }, { "entropy": 0.19479428604245186, "epoch": 2.9814663713719547, "grad_norm": 2.890625, "learning_rate": 4.9936036794282984e-05, "loss": 0.3532, "mean_token_accuracy": 0.9233082294464111, "num_tokens": 17708483.0, "step": 12790 }, { "entropy": 0.2902163352817297, "epoch": 2.982632008392587, "grad_norm": 14.625, "learning_rate": 4.993588248197058e-05, "loss": 0.4875, "mean_token_accuracy": 0.8970877230167389, "num_tokens": 17722973.0, "step": 12795 }, { "entropy": 0.2668305665254593, "epoch": 2.9837976454132185, "grad_norm": 9.0625, "learning_rate": 4.993572798422045e-05, "loss": 0.6425, "mean_token_accuracy": 0.8881903946399688, "num_tokens": 17732394.0, "step": 12800 }, { "entropy": 0.2405627638101578, "epoch": 2.98496328243385, "grad_norm": 6.9375, "learning_rate": 4.9935573301034916e-05, "loss": 0.4664, "mean_token_accuracy": 0.9050011456012725, "num_tokens": 17745099.0, "step": 12805 }, { "entropy": 0.3110461011528969, "epoch": 2.986128919454482, "grad_norm": 7.4375, "learning_rate": 4.993541843241628e-05, "loss": 0.5163, "mean_token_accuracy": 0.9056003808975219, "num_tokens": 17757821.0, "step": 12810 }, { "entropy": 0.19741073474287987, "epoch": 2.9872945564751134, "grad_norm": 2.6875, "learning_rate": 4.9935263378366854e-05, "loss": 0.1996, "mean_token_accuracy": 0.9366816639900207, "num_tokens": 17783464.0, "step": 12815 }, { "entropy": 0.2888368025422096, "epoch": 2.9884601934957455, "grad_norm": 5.28125, "learning_rate": 4.993510813888894e-05, "loss": 0.5255, "mean_token_accuracy": 0.9123888671398163, "num_tokens": 17791795.0, "step": 12820 }, { "entropy": 0.20527897477149964, "epoch": 2.989625830516377, "grad_norm": 1.59375, "learning_rate": 4.9934952713984865e-05, "loss": 0.2343, "mean_token_accuracy": 0.9191898763179779, "num_tokens": 17815861.0, "step": 12825 }, { "entropy": 0.23771159574389458, "epoch": 2.990791467537009, "grad_norm": 8.125, "learning_rate": 4.993479710365695e-05, "loss": 0.4917, "mean_token_accuracy": 0.9103189170360565, "num_tokens": 17834933.0, "step": 12830 }, { "entropy": 0.2657750204205513, "epoch": 2.991957104557641, "grad_norm": 10.5, "learning_rate": 4.9934641307907495e-05, "loss": 0.5174, "mean_token_accuracy": 0.8834215998649597, "num_tokens": 17845860.0, "step": 12835 }, { "entropy": 0.16792013086378574, "epoch": 2.9931227415782726, "grad_norm": 1.3984375, "learning_rate": 4.993448532673884e-05, "loss": 0.1993, "mean_token_accuracy": 0.9521631598472595, "num_tokens": 17866162.0, "step": 12840 }, { "entropy": 0.22297072932124137, "epoch": 2.9942883785989043, "grad_norm": 4.75, "learning_rate": 4.99343291601533e-05, "loss": 0.3531, "mean_token_accuracy": 0.9260120809078216, "num_tokens": 17879511.0, "step": 12845 }, { "entropy": 0.19128753133118154, "epoch": 2.995454015619536, "grad_norm": 3.328125, "learning_rate": 4.9934172808153224e-05, "loss": 0.2676, "mean_token_accuracy": 0.9386341869831085, "num_tokens": 17899042.0, "step": 12850 }, { "entropy": 0.20055814646184444, "epoch": 2.9966196526401676, "grad_norm": 3.5625, "learning_rate": 4.993401627074092e-05, "loss": 0.3542, "mean_token_accuracy": 0.9243052423000335, "num_tokens": 17919406.0, "step": 12855 }, { "entropy": 0.25509237721562383, "epoch": 2.9977852896607997, "grad_norm": 6.375, "learning_rate": 4.993385954791873e-05, "loss": 0.5408, "mean_token_accuracy": 0.8878538310527802, "num_tokens": 17934902.0, "step": 12860 }, { "entropy": 0.2672757588326931, "epoch": 2.9989509266814314, "grad_norm": 6.0, "learning_rate": 4.9933702639689e-05, "loss": 0.487, "mean_token_accuracy": 0.9051804602146148, "num_tokens": 17950700.0, "step": 12865 }, { "entropy": 0.1652403457297219, "epoch": 3.0, "grad_norm": 0.609375, "learning_rate": 4.993354554605405e-05, "loss": 0.2207, "mean_token_accuracy": 0.9246720737881131, "num_tokens": 17980750.0, "step": 12870 }, { "entropy": 0.1433550551533699, "epoch": 3.0011656370206317, "grad_norm": 4.75, "learning_rate": 4.9933388267016246e-05, "loss": 0.1786, "mean_token_accuracy": 0.9587549924850464, "num_tokens": 17995622.0, "step": 12875 }, { "entropy": 0.21168565675616263, "epoch": 3.0023312740412638, "grad_norm": 4.46875, "learning_rate": 4.993323080257792e-05, "loss": 0.3623, "mean_token_accuracy": 0.9260616600513458, "num_tokens": 18015010.0, "step": 12880 }, { "entropy": 0.212300780788064, "epoch": 3.0034969110618954, "grad_norm": 9.5, "learning_rate": 4.993307315274142e-05, "loss": 0.3162, "mean_token_accuracy": 0.9363721132278442, "num_tokens": 18029427.0, "step": 12885 }, { "entropy": 0.1385662153363228, "epoch": 3.004662548082527, "grad_norm": 8.1875, "learning_rate": 4.99329153175091e-05, "loss": 0.2069, "mean_token_accuracy": 0.942263925075531, "num_tokens": 18051017.0, "step": 12890 }, { "entropy": 0.12007400654256344, "epoch": 3.0058281851031587, "grad_norm": 2.296875, "learning_rate": 4.9932757296883306e-05, "loss": 0.2103, "mean_token_accuracy": 0.9540059566497803, "num_tokens": 18065526.0, "step": 12895 }, { "entropy": 0.13780878372490407, "epoch": 3.006993822123791, "grad_norm": 2.40625, "learning_rate": 4.993259909086641e-05, "loss": 0.1755, "mean_token_accuracy": 0.9513382017612457, "num_tokens": 18082091.0, "step": 12900 }, { "entropy": 0.09654690567404031, "epoch": 3.0081594591444225, "grad_norm": 0.71875, "learning_rate": 4.993244069946076e-05, "loss": 0.1812, "mean_token_accuracy": 0.9618777215480805, "num_tokens": 18117232.0, "step": 12905 }, { "entropy": 0.0864265939220786, "epoch": 3.009325096165054, "grad_norm": 0.8046875, "learning_rate": 4.9932282122668715e-05, "loss": 0.0861, "mean_token_accuracy": 0.9753880679607392, "num_tokens": 18151282.0, "step": 12910 }, { "entropy": 0.1271039350889623, "epoch": 3.010490733185686, "grad_norm": 0.326171875, "learning_rate": 4.993212336049265e-05, "loss": 0.2512, "mean_token_accuracy": 0.9457149386405945, "num_tokens": 18170371.0, "step": 12915 }, { "entropy": 0.1285300085321069, "epoch": 3.011656370206318, "grad_norm": 3.109375, "learning_rate": 4.993196441293492e-05, "loss": 0.1651, "mean_token_accuracy": 0.9519018471240998, "num_tokens": 18191983.0, "step": 12920 }, { "entropy": 0.11733712311834096, "epoch": 3.0128220072269496, "grad_norm": 1.03125, "learning_rate": 4.993180527999791e-05, "loss": 0.064, "mean_token_accuracy": 0.958944684267044, "num_tokens": 18223670.0, "step": 12925 }, { "entropy": 0.11386382738128305, "epoch": 3.0139876442475813, "grad_norm": 0.8515625, "learning_rate": 4.9931645961683984e-05, "loss": 0.1375, "mean_token_accuracy": 0.956270831823349, "num_tokens": 18251671.0, "step": 12930 }, { "entropy": 0.10516616962850094, "epoch": 3.015153281268213, "grad_norm": 1.0078125, "learning_rate": 4.9931486457995515e-05, "loss": 0.1278, "mean_token_accuracy": 0.9688599169254303, "num_tokens": 18271125.0, "step": 12935 }, { "entropy": 0.08430785313248634, "epoch": 3.016318918288845, "grad_norm": 5.5, "learning_rate": 4.993132676893488e-05, "loss": 0.1188, "mean_token_accuracy": 0.9666883111000061, "num_tokens": 18312079.0, "step": 12940 }, { "entropy": 0.16214878372848035, "epoch": 3.0174845553094767, "grad_norm": 5.625, "learning_rate": 4.993116689450447e-05, "loss": 0.1848, "mean_token_accuracy": 0.9511231660842896, "num_tokens": 18323144.0, "step": 12945 }, { "entropy": 0.12150358017534017, "epoch": 3.0186501923301083, "grad_norm": 4.1875, "learning_rate": 4.993100683470667e-05, "loss": 0.1386, "mean_token_accuracy": 0.9623299181461334, "num_tokens": 18343224.0, "step": 12950 }, { "entropy": 0.17119587287306787, "epoch": 3.01981582935074, "grad_norm": 7.5, "learning_rate": 4.9930846589543855e-05, "loss": 0.2687, "mean_token_accuracy": 0.9386284410953522, "num_tokens": 18383414.0, "step": 12955 }, { "entropy": 0.10301913348957896, "epoch": 3.020981466371372, "grad_norm": 4.25, "learning_rate": 4.993068615901842e-05, "loss": 0.1244, "mean_token_accuracy": 0.9650148451328278, "num_tokens": 18406781.0, "step": 12960 }, { "entropy": 0.11102181086316705, "epoch": 3.0221471033920038, "grad_norm": 3.78125, "learning_rate": 4.993052554313275e-05, "loss": 0.1959, "mean_token_accuracy": 0.9532433927059174, "num_tokens": 18426553.0, "step": 12965 }, { "entropy": 0.1458127958700061, "epoch": 3.0233127404126354, "grad_norm": 13.9375, "learning_rate": 4.9930364741889264e-05, "loss": 0.2835, "mean_token_accuracy": 0.9430015027523041, "num_tokens": 18446721.0, "step": 12970 }, { "entropy": 0.10947179980576038, "epoch": 3.024478377433267, "grad_norm": 7.34375, "learning_rate": 4.9930203755290334e-05, "loss": 0.1352, "mean_token_accuracy": 0.9588992714881897, "num_tokens": 18467326.0, "step": 12975 }, { "entropy": 0.18623995631933213, "epoch": 3.025644014453899, "grad_norm": 4.9375, "learning_rate": 4.993004258333837e-05, "loss": 0.2801, "mean_token_accuracy": 0.9345187544822693, "num_tokens": 18485722.0, "step": 12980 }, { "entropy": 0.10628864876925945, "epoch": 3.026809651474531, "grad_norm": 5.25, "learning_rate": 4.992988122603578e-05, "loss": 0.1406, "mean_token_accuracy": 0.963499253988266, "num_tokens": 18509525.0, "step": 12985 }, { "entropy": 0.1489834614098072, "epoch": 3.0279752884951625, "grad_norm": 1.828125, "learning_rate": 4.992971968338496e-05, "loss": 0.1771, "mean_token_accuracy": 0.9506646573543549, "num_tokens": 18541551.0, "step": 12990 }, { "entropy": 0.11751662027090788, "epoch": 3.029140925515794, "grad_norm": 0.609375, "learning_rate": 4.992955795538832e-05, "loss": 0.1912, "mean_token_accuracy": 0.9536070585250854, "num_tokens": 18559322.0, "step": 12995 }, { "entropy": 0.12452268935739993, "epoch": 3.0303065625364263, "grad_norm": 5.59375, "learning_rate": 4.992939604204828e-05, "loss": 0.1767, "mean_token_accuracy": 0.9622524440288543, "num_tokens": 18582602.0, "step": 13000 }, { "entropy": 0.10873323529958726, "epoch": 3.031472199557058, "grad_norm": 1.890625, "learning_rate": 4.992923394336726e-05, "loss": 0.1736, "mean_token_accuracy": 0.9589035868644714, "num_tokens": 18601698.0, "step": 13005 }, { "entropy": 0.09877428058534861, "epoch": 3.0326378365776896, "grad_norm": 1.53125, "learning_rate": 4.992907165934766e-05, "loss": 0.1921, "mean_token_accuracy": 0.9654108107089996, "num_tokens": 18629199.0, "step": 13010 }, { "entropy": 0.16017772555351256, "epoch": 3.0338034735983217, "grad_norm": 7.53125, "learning_rate": 4.99289091899919e-05, "loss": 0.2393, "mean_token_accuracy": 0.9472411513328552, "num_tokens": 18638162.0, "step": 13015 }, { "entropy": 0.11799342557787895, "epoch": 3.0349691106189534, "grad_norm": 6.15625, "learning_rate": 4.9928746535302415e-05, "loss": 0.1876, "mean_token_accuracy": 0.9611060082912445, "num_tokens": 18652672.0, "step": 13020 }, { "entropy": 0.12905988562852144, "epoch": 3.036134747639585, "grad_norm": 12.6875, "learning_rate": 4.992858369528163e-05, "loss": 0.2115, "mean_token_accuracy": 0.9487624883651733, "num_tokens": 18668323.0, "step": 13025 }, { "entropy": 0.11400379352271557, "epoch": 3.0373003846602167, "grad_norm": 8.625, "learning_rate": 4.992842066993196e-05, "loss": 0.2537, "mean_token_accuracy": 0.954912292957306, "num_tokens": 18685966.0, "step": 13030 }, { "entropy": 0.140830048173666, "epoch": 3.038466021680849, "grad_norm": 5.03125, "learning_rate": 4.992825745925585e-05, "loss": 0.2546, "mean_token_accuracy": 0.9469396531581878, "num_tokens": 18698156.0, "step": 13035 }, { "entropy": 0.19331585317850114, "epoch": 3.0396316587014804, "grad_norm": 5.625, "learning_rate": 4.9928094063255734e-05, "loss": 0.3055, "mean_token_accuracy": 0.9267224192619323, "num_tokens": 18705380.0, "step": 13040 }, { "entropy": 0.16906597539782525, "epoch": 3.040797295722112, "grad_norm": 8.75, "learning_rate": 4.9927930481934034e-05, "loss": 0.2464, "mean_token_accuracy": 0.9498988151550293, "num_tokens": 18716545.0, "step": 13045 }, { "entropy": 0.1345493733882904, "epoch": 3.0419629327427438, "grad_norm": 0.75, "learning_rate": 4.99277667152932e-05, "loss": 0.2407, "mean_token_accuracy": 0.9507641017436981, "num_tokens": 18735728.0, "step": 13050 }, { "entropy": 0.20635399986058472, "epoch": 3.043128569763376, "grad_norm": 12.0625, "learning_rate": 4.992760276333567e-05, "loss": 0.2186, "mean_token_accuracy": 0.9326454043388367, "num_tokens": 18767891.0, "step": 13055 }, { "entropy": 0.09413407100364565, "epoch": 3.0442942067840075, "grad_norm": 7.1875, "learning_rate": 4.9927438626063894e-05, "loss": 0.1287, "mean_token_accuracy": 0.9676704227924346, "num_tokens": 18790347.0, "step": 13060 }, { "entropy": 0.12085700780153275, "epoch": 3.045459843804639, "grad_norm": 0.37890625, "learning_rate": 4.992727430348031e-05, "loss": 0.1564, "mean_token_accuracy": 0.9590084195137024, "num_tokens": 18841046.0, "step": 13065 }, { "entropy": 0.15510401986539363, "epoch": 3.046625480825271, "grad_norm": 4.78125, "learning_rate": 4.992710979558738e-05, "loss": 0.2184, "mean_token_accuracy": 0.9496059834957122, "num_tokens": 18872274.0, "step": 13070 }, { "entropy": 0.150761703774333, "epoch": 3.047791117845903, "grad_norm": 6.96875, "learning_rate": 4.992694510238755e-05, "loss": 0.26, "mean_token_accuracy": 0.9450468838214874, "num_tokens": 18892617.0, "step": 13075 }, { "entropy": 0.14352366626262664, "epoch": 3.0489567548665346, "grad_norm": 1.9140625, "learning_rate": 4.992678022388328e-05, "loss": 0.21, "mean_token_accuracy": 0.9523520827293396, "num_tokens": 18903601.0, "step": 13080 }, { "entropy": 0.1325456030666828, "epoch": 3.0501223918871663, "grad_norm": 8.75, "learning_rate": 4.992661516007702e-05, "loss": 0.2799, "mean_token_accuracy": 0.9441640317440033, "num_tokens": 18921826.0, "step": 13085 }, { "entropy": 0.16167889218777418, "epoch": 3.051288028907798, "grad_norm": 4.5, "learning_rate": 4.992644991097125e-05, "loss": 0.297, "mean_token_accuracy": 0.9417232036590576, "num_tokens": 18967125.0, "step": 13090 }, { "entropy": 0.13939319923520088, "epoch": 3.05245366592843, "grad_norm": 1.3125, "learning_rate": 4.992628447656841e-05, "loss": 0.1361, "mean_token_accuracy": 0.9541107892990113, "num_tokens": 18993343.0, "step": 13095 }, { "entropy": 0.1326286420226097, "epoch": 3.0536193029490617, "grad_norm": 0.99609375, "learning_rate": 4.9926118856870976e-05, "loss": 0.1413, "mean_token_accuracy": 0.9580071270465851, "num_tokens": 19023224.0, "step": 13100 }, { "entropy": 0.14778729975223542, "epoch": 3.0547849399696934, "grad_norm": 3.84375, "learning_rate": 4.9925953051881426e-05, "loss": 0.162, "mean_token_accuracy": 0.9471778392791748, "num_tokens": 19037845.0, "step": 13105 }, { "entropy": 0.16834700480103493, "epoch": 3.055950576990325, "grad_norm": 0.8515625, "learning_rate": 4.9925787061602226e-05, "loss": 0.2342, "mean_token_accuracy": 0.9260630905628204, "num_tokens": 19062642.0, "step": 13110 }, { "entropy": 0.11567596178501845, "epoch": 3.057116214010957, "grad_norm": 5.1875, "learning_rate": 4.992562088603585e-05, "loss": 0.2426, "mean_token_accuracy": 0.9500459372997284, "num_tokens": 19079460.0, "step": 13115 }, { "entropy": 0.14092737063765526, "epoch": 3.058281851031589, "grad_norm": 10.875, "learning_rate": 4.992545452518478e-05, "loss": 0.3468, "mean_token_accuracy": 0.9354611694812774, "num_tokens": 19089866.0, "step": 13120 }, { "entropy": 0.14295823201537133, "epoch": 3.0594474880522204, "grad_norm": 6.9375, "learning_rate": 4.992528797905149e-05, "loss": 0.2638, "mean_token_accuracy": 0.9444294333457947, "num_tokens": 19101734.0, "step": 13125 }, { "entropy": 0.11939622908830642, "epoch": 3.060613125072852, "grad_norm": 7.1875, "learning_rate": 4.992512124763847e-05, "loss": 0.176, "mean_token_accuracy": 0.9504463911056519, "num_tokens": 19116952.0, "step": 13130 }, { "entropy": 0.14332516640424728, "epoch": 3.061778762093484, "grad_norm": 8.6875, "learning_rate": 4.99249543309482e-05, "loss": 0.2215, "mean_token_accuracy": 0.9468707919120789, "num_tokens": 19132506.0, "step": 13135 }, { "entropy": 0.1258048728108406, "epoch": 3.062944399114116, "grad_norm": 11.375, "learning_rate": 4.992478722898318e-05, "loss": 0.2369, "mean_token_accuracy": 0.9458638608455658, "num_tokens": 19144849.0, "step": 13140 }, { "entropy": 0.10387789681553841, "epoch": 3.0641100361347475, "grad_norm": 6.71875, "learning_rate": 4.9924619941745886e-05, "loss": 0.133, "mean_token_accuracy": 0.9705339252948761, "num_tokens": 19182754.0, "step": 13145 }, { "entropy": 0.09643221404403449, "epoch": 3.065275673155379, "grad_norm": 8.1875, "learning_rate": 4.9924452469238826e-05, "loss": 0.1519, "mean_token_accuracy": 0.964071798324585, "num_tokens": 19203256.0, "step": 13150 }, { "entropy": 0.19322482496500015, "epoch": 3.0664413101760113, "grad_norm": 2.015625, "learning_rate": 4.99242848114645e-05, "loss": 0.3931, "mean_token_accuracy": 0.9145723462104798, "num_tokens": 19212959.0, "step": 13155 }, { "entropy": 0.25037995502352717, "epoch": 3.067606947196643, "grad_norm": 8.4375, "learning_rate": 4.992411696842538e-05, "loss": 0.4867, "mean_token_accuracy": 0.9011429846286774, "num_tokens": 19231303.0, "step": 13160 }, { "entropy": 0.35061059817671775, "epoch": 3.0687725842172746, "grad_norm": 5.34375, "learning_rate": 4.9923948940124006e-05, "loss": 0.6799, "mean_token_accuracy": 0.9077353715896607, "num_tokens": 19254614.0, "step": 13165 }, { "entropy": 0.1632023498415947, "epoch": 3.0699382212379067, "grad_norm": 1.859375, "learning_rate": 4.9923780726562856e-05, "loss": 0.2547, "mean_token_accuracy": 0.9486920773983002, "num_tokens": 19274238.0, "step": 13170 }, { "entropy": 0.15164818074554204, "epoch": 3.0711038582585384, "grad_norm": 6.15625, "learning_rate": 4.9923612327744454e-05, "loss": 0.238, "mean_token_accuracy": 0.938932454586029, "num_tokens": 19296587.0, "step": 13175 }, { "entropy": 0.13162093460559846, "epoch": 3.07226949527917, "grad_norm": 5.0625, "learning_rate": 4.9923443743671314e-05, "loss": 0.1807, "mean_token_accuracy": 0.9539216995239258, "num_tokens": 19318280.0, "step": 13180 }, { "entropy": 0.16326101645827293, "epoch": 3.0734351322998017, "grad_norm": 0.69921875, "learning_rate": 4.992327497434593e-05, "loss": 0.275, "mean_token_accuracy": 0.9378827571868896, "num_tokens": 19335662.0, "step": 13185 }, { "entropy": 0.14299081321805715, "epoch": 3.074600769320434, "grad_norm": 0.61328125, "learning_rate": 4.992310601977084e-05, "loss": 0.2718, "mean_token_accuracy": 0.9428001344203949, "num_tokens": 19364430.0, "step": 13190 }, { "entropy": 0.11728681959211826, "epoch": 3.0757664063410655, "grad_norm": 3.90625, "learning_rate": 4.992293687994855e-05, "loss": 0.2007, "mean_token_accuracy": 0.9604819655418396, "num_tokens": 19377704.0, "step": 13195 }, { "entropy": 0.13060683608055115, "epoch": 3.076932043361697, "grad_norm": 4.71875, "learning_rate": 4.992276755488158e-05, "loss": 0.2601, "mean_token_accuracy": 0.9461053490638733, "num_tokens": 19401149.0, "step": 13200 }, { "entropy": 0.16250937394797801, "epoch": 3.078097680382329, "grad_norm": 0.96875, "learning_rate": 4.992259804457247e-05, "loss": 0.2137, "mean_token_accuracy": 0.9518900394439698, "num_tokens": 19435512.0, "step": 13205 }, { "entropy": 0.12088603749871255, "epoch": 3.079263317402961, "grad_norm": 4.3125, "learning_rate": 4.992242834902373e-05, "loss": 0.1753, "mean_token_accuracy": 0.9508533656597138, "num_tokens": 19453057.0, "step": 13210 }, { "entropy": 0.12925427481532098, "epoch": 3.0804289544235925, "grad_norm": 2.53125, "learning_rate": 4.99222584682379e-05, "loss": 0.165, "mean_token_accuracy": 0.958453357219696, "num_tokens": 19473192.0, "step": 13215 }, { "entropy": 0.13786442931741477, "epoch": 3.081594591444224, "grad_norm": 1.71875, "learning_rate": 4.992208840221751e-05, "loss": 0.1691, "mean_token_accuracy": 0.9522160708904266, "num_tokens": 19495497.0, "step": 13220 }, { "entropy": 0.11975713893771171, "epoch": 3.082760228464856, "grad_norm": 1.4296875, "learning_rate": 4.9921918150965106e-05, "loss": 0.1748, "mean_token_accuracy": 0.9600012302398682, "num_tokens": 19524516.0, "step": 13225 }, { "entropy": 0.20472723469138146, "epoch": 3.083925865485488, "grad_norm": 10.0, "learning_rate": 4.9921747714483216e-05, "loss": 0.3746, "mean_token_accuracy": 0.9212909162044525, "num_tokens": 19535087.0, "step": 13230 }, { "entropy": 0.12324888203293086, "epoch": 3.0850915025061196, "grad_norm": 3.671875, "learning_rate": 4.9921577092774384e-05, "loss": 0.2137, "mean_token_accuracy": 0.9506260097026825, "num_tokens": 19556253.0, "step": 13235 }, { "entropy": 0.16078415960073472, "epoch": 3.0862571395267513, "grad_norm": 0.7578125, "learning_rate": 4.992140628584116e-05, "loss": 0.2537, "mean_token_accuracy": 0.9428843200206757, "num_tokens": 19578151.0, "step": 13240 }, { "entropy": 0.1569695755839348, "epoch": 3.087422776547383, "grad_norm": 6.3125, "learning_rate": 4.992123529368608e-05, "loss": 0.2207, "mean_token_accuracy": 0.9451090037822724, "num_tokens": 19587923.0, "step": 13245 }, { "entropy": 0.12610364258289336, "epoch": 3.088588413568015, "grad_norm": 7.1875, "learning_rate": 4.99210641163117e-05, "loss": 0.1984, "mean_token_accuracy": 0.9510172843933106, "num_tokens": 19615303.0, "step": 13250 }, { "entropy": 0.12419973127543926, "epoch": 3.0897540505886467, "grad_norm": 0.81640625, "learning_rate": 4.992089275372057e-05, "loss": 0.1556, "mean_token_accuracy": 0.9587218821048736, "num_tokens": 19639739.0, "step": 13255 }, { "entropy": 0.12563247513026, "epoch": 3.0909196876092784, "grad_norm": 2.3125, "learning_rate": 4.992072120591524e-05, "loss": 0.1285, "mean_token_accuracy": 0.9584830939769745, "num_tokens": 19669972.0, "step": 13260 }, { "entropy": 0.11786816529929638, "epoch": 3.09208532462991, "grad_norm": 5.0, "learning_rate": 4.992054947289829e-05, "loss": 0.2424, "mean_token_accuracy": 0.9408693611621857, "num_tokens": 19694543.0, "step": 13265 }, { "entropy": 0.10978089049458503, "epoch": 3.093250961650542, "grad_norm": 2.15625, "learning_rate": 4.992037755467226e-05, "loss": 0.1406, "mean_token_accuracy": 0.9633432149887085, "num_tokens": 19715806.0, "step": 13270 }, { "entropy": 0.09481975287199021, "epoch": 3.094416598671174, "grad_norm": 3.6875, "learning_rate": 4.992020545123972e-05, "loss": 0.1058, "mean_token_accuracy": 0.9747069358825684, "num_tokens": 19749863.0, "step": 13275 }, { "entropy": 0.12356273904442787, "epoch": 3.0955822356918055, "grad_norm": 1.40625, "learning_rate": 4.992003316260323e-05, "loss": 0.1161, "mean_token_accuracy": 0.9546688258647918, "num_tokens": 19782518.0, "step": 13280 }, { "entropy": 0.11005722619593143, "epoch": 3.096747872712437, "grad_norm": 0.79296875, "learning_rate": 4.991986068876537e-05, "loss": 0.23, "mean_token_accuracy": 0.9487053215503692, "num_tokens": 19804363.0, "step": 13285 }, { "entropy": 0.14497708119452, "epoch": 3.0979135097330692, "grad_norm": 1.65625, "learning_rate": 4.99196880297287e-05, "loss": 0.1645, "mean_token_accuracy": 0.9520582973957061, "num_tokens": 19824122.0, "step": 13290 }, { "entropy": 0.0952205179259181, "epoch": 3.099079146753701, "grad_norm": 4.625, "learning_rate": 4.991951518549581e-05, "loss": 0.1178, "mean_token_accuracy": 0.9668820440769196, "num_tokens": 19860008.0, "step": 13295 }, { "entropy": 0.11507775709033012, "epoch": 3.1002447837743325, "grad_norm": 0.87109375, "learning_rate": 4.9919342156069266e-05, "loss": 0.211, "mean_token_accuracy": 0.9555065870285034, "num_tokens": 19877177.0, "step": 13300 }, { "entropy": 0.13037493210285903, "epoch": 3.1014104207949647, "grad_norm": 5.1875, "learning_rate": 4.991916894145164e-05, "loss": 0.1271, "mean_token_accuracy": 0.9565082132816315, "num_tokens": 19906565.0, "step": 13305 }, { "entropy": 0.11838086917996407, "epoch": 3.1025760578155963, "grad_norm": 5.4375, "learning_rate": 4.991899554164554e-05, "loss": 0.2005, "mean_token_accuracy": 0.960281252861023, "num_tokens": 19922383.0, "step": 13310 }, { "entropy": 0.14991481192409992, "epoch": 3.103741694836228, "grad_norm": 6.09375, "learning_rate": 4.991882195665353e-05, "loss": 0.2267, "mean_token_accuracy": 0.9503510117530822, "num_tokens": 19933976.0, "step": 13315 }, { "entropy": 0.11397014874964953, "epoch": 3.1049073318568596, "grad_norm": 3.203125, "learning_rate": 4.991864818647821e-05, "loss": 0.1354, "mean_token_accuracy": 0.9622531473636627, "num_tokens": 19956871.0, "step": 13320 }, { "entropy": 0.14367529824376107, "epoch": 3.1060729688774917, "grad_norm": 7.6875, "learning_rate": 4.991847423112217e-05, "loss": 0.2265, "mean_token_accuracy": 0.9521259009838104, "num_tokens": 19968259.0, "step": 13325 }, { "entropy": 0.11058022249490022, "epoch": 3.1072386058981234, "grad_norm": 2.390625, "learning_rate": 4.991830009058799e-05, "loss": 0.1461, "mean_token_accuracy": 0.9625348687171936, "num_tokens": 19995564.0, "step": 13330 }, { "entropy": 0.1090804586187005, "epoch": 3.108404242918755, "grad_norm": 3.078125, "learning_rate": 4.991812576487829e-05, "loss": 0.2279, "mean_token_accuracy": 0.9549605369567871, "num_tokens": 20013769.0, "step": 13335 }, { "entropy": 0.13848309740424156, "epoch": 3.1095698799393867, "grad_norm": 10.0, "learning_rate": 4.991795125399564e-05, "loss": 0.2078, "mean_token_accuracy": 0.9510918617248535, "num_tokens": 20035255.0, "step": 13340 }, { "entropy": 0.0821663798764348, "epoch": 3.110735516960019, "grad_norm": 1.28125, "learning_rate": 4.9917776557942675e-05, "loss": 0.0816, "mean_token_accuracy": 0.9688577592372895, "num_tokens": 20059178.0, "step": 13345 }, { "entropy": 0.13422894291579723, "epoch": 3.1119011539806505, "grad_norm": 1.0234375, "learning_rate": 4.991760167672198e-05, "loss": 0.2109, "mean_token_accuracy": 0.9463982820510864, "num_tokens": 20075794.0, "step": 13350 }, { "entropy": 0.16679993383586406, "epoch": 3.113066791001282, "grad_norm": 2.890625, "learning_rate": 4.9917426610336165e-05, "loss": 0.1872, "mean_token_accuracy": 0.9396183788776398, "num_tokens": 20091022.0, "step": 13355 }, { "entropy": 0.12409867160022259, "epoch": 3.114232428021914, "grad_norm": 1.5234375, "learning_rate": 4.9917251358787845e-05, "loss": 0.228, "mean_token_accuracy": 0.9440020442008972, "num_tokens": 20104163.0, "step": 13360 }, { "entropy": 0.1242295227944851, "epoch": 3.115398065042546, "grad_norm": 4.0625, "learning_rate": 4.991707592207963e-05, "loss": 0.2196, "mean_token_accuracy": 0.9523638188838959, "num_tokens": 20123125.0, "step": 13365 }, { "entropy": 0.19654411226511, "epoch": 3.1165637020631776, "grad_norm": 0.703125, "learning_rate": 4.991690030021413e-05, "loss": 0.198, "mean_token_accuracy": 0.9465866386890411, "num_tokens": 20145750.0, "step": 13370 }, { "entropy": 0.1174004428088665, "epoch": 3.1177293390838092, "grad_norm": 6.8125, "learning_rate": 4.991672449319398e-05, "loss": 0.1953, "mean_token_accuracy": 0.9528234481811524, "num_tokens": 20168354.0, "step": 13375 }, { "entropy": 0.11760329715907573, "epoch": 3.118894976104441, "grad_norm": 5.125, "learning_rate": 4.9916548501021784e-05, "loss": 0.152, "mean_token_accuracy": 0.9535988092422485, "num_tokens": 20191537.0, "step": 13380 }, { "entropy": 0.151387694850564, "epoch": 3.120060613125073, "grad_norm": 4.9375, "learning_rate": 4.991637232370018e-05, "loss": 0.164, "mean_token_accuracy": 0.9565434157848358, "num_tokens": 20211723.0, "step": 13385 }, { "entropy": 0.13986520580947398, "epoch": 3.1212262501457047, "grad_norm": 7.25, "learning_rate": 4.991619596123178e-05, "loss": 0.2171, "mean_token_accuracy": 0.9584638893604278, "num_tokens": 20238357.0, "step": 13390 }, { "entropy": 0.14957279935479165, "epoch": 3.1223918871663363, "grad_norm": 1.65625, "learning_rate": 4.991601941361923e-05, "loss": 0.1502, "mean_token_accuracy": 0.9497217297554016, "num_tokens": 20253687.0, "step": 13395 }, { "entropy": 0.13997821658849716, "epoch": 3.123557524186968, "grad_norm": 1.40625, "learning_rate": 4.991584268086516e-05, "loss": 0.2023, "mean_token_accuracy": 0.9500046491622924, "num_tokens": 20280327.0, "step": 13400 }, { "entropy": 0.1721389342099428, "epoch": 3.1247231612076, "grad_norm": 6.28125, "learning_rate": 4.9915665762972196e-05, "loss": 0.3787, "mean_token_accuracy": 0.9287561297416687, "num_tokens": 20297460.0, "step": 13405 }, { "entropy": 0.12821940444409846, "epoch": 3.1258887982282317, "grad_norm": 0.62109375, "learning_rate": 4.991548865994298e-05, "loss": 0.1801, "mean_token_accuracy": 0.9497880697250366, "num_tokens": 20333983.0, "step": 13410 }, { "entropy": 0.09938725344836712, "epoch": 3.1270544352488634, "grad_norm": 2.671875, "learning_rate": 4.991531137178016e-05, "loss": 0.1094, "mean_token_accuracy": 0.971398013830185, "num_tokens": 20355626.0, "step": 13415 }, { "entropy": 0.1275652002543211, "epoch": 3.128220072269495, "grad_norm": 2.0, "learning_rate": 4.991513389848637e-05, "loss": 0.1089, "mean_token_accuracy": 0.9720631957054138, "num_tokens": 20375383.0, "step": 13420 }, { "entropy": 0.13102855496108531, "epoch": 3.129385709290127, "grad_norm": 0.388671875, "learning_rate": 4.991495624006426e-05, "loss": 0.1487, "mean_token_accuracy": 0.9604451298713684, "num_tokens": 20397434.0, "step": 13425 }, { "entropy": 0.15394360907375812, "epoch": 3.130551346310759, "grad_norm": 5.21875, "learning_rate": 4.9914778396516484e-05, "loss": 0.2195, "mean_token_accuracy": 0.9505378365516662, "num_tokens": 20412780.0, "step": 13430 }, { "entropy": 0.15660555139184, "epoch": 3.1317169833313905, "grad_norm": 6.21875, "learning_rate": 4.9914600367845685e-05, "loss": 0.2906, "mean_token_accuracy": 0.9402198016643524, "num_tokens": 20422638.0, "step": 13435 }, { "entropy": 0.17960682585835458, "epoch": 3.1328826203520226, "grad_norm": 1.875, "learning_rate": 4.991442215405452e-05, "loss": 0.318, "mean_token_accuracy": 0.9269765436649322, "num_tokens": 20439090.0, "step": 13440 }, { "entropy": 0.20290196798741816, "epoch": 3.1340482573726542, "grad_norm": 0.72265625, "learning_rate": 4.991424375514565e-05, "loss": 0.2952, "mean_token_accuracy": 0.9280561923980712, "num_tokens": 20478063.0, "step": 13445 }, { "entropy": 0.11316476315259934, "epoch": 3.135213894393286, "grad_norm": 2.453125, "learning_rate": 4.991406517112173e-05, "loss": 0.1718, "mean_token_accuracy": 0.9608360230922699, "num_tokens": 20498608.0, "step": 13450 }, { "entropy": 0.13394585978239776, "epoch": 3.1363795314139176, "grad_norm": 8.5625, "learning_rate": 4.991388640198543e-05, "loss": 0.1833, "mean_token_accuracy": 0.9532162606716156, "num_tokens": 20520011.0, "step": 13455 }, { "entropy": 0.1493657087907195, "epoch": 3.1375451684345497, "grad_norm": 0.9375, "learning_rate": 4.9913707447739416e-05, "loss": 0.2914, "mean_token_accuracy": 0.937654048204422, "num_tokens": 20535373.0, "step": 13460 }, { "entropy": 0.15093789100646973, "epoch": 3.1387108054551813, "grad_norm": 4.125, "learning_rate": 4.9913528308386345e-05, "loss": 0.2278, "mean_token_accuracy": 0.9449706077575684, "num_tokens": 20547044.0, "step": 13465 }, { "entropy": 0.12902355715632438, "epoch": 3.139876442475813, "grad_norm": 4.1875, "learning_rate": 4.99133489839289e-05, "loss": 0.2036, "mean_token_accuracy": 0.9510998904705048, "num_tokens": 20566906.0, "step": 13470 }, { "entropy": 0.15702604167163373, "epoch": 3.1410420794964447, "grad_norm": 3.3125, "learning_rate": 4.9913169474369754e-05, "loss": 0.2774, "mean_token_accuracy": 0.9410692870616912, "num_tokens": 20578617.0, "step": 13475 }, { "entropy": 0.14392093010246754, "epoch": 3.1422077165170768, "grad_norm": 3.59375, "learning_rate": 4.991298977971158e-05, "loss": 0.2114, "mean_token_accuracy": 0.9481341958045959, "num_tokens": 20591977.0, "step": 13480 }, { "entropy": 0.1626926977187395, "epoch": 3.1433733535377084, "grad_norm": 2.109375, "learning_rate": 4.9912809899957055e-05, "loss": 0.2463, "mean_token_accuracy": 0.9358658790588379, "num_tokens": 20613827.0, "step": 13485 }, { "entropy": 0.08454335257411003, "epoch": 3.14453899055834, "grad_norm": 1.5625, "learning_rate": 4.9912629835108864e-05, "loss": 0.0539, "mean_token_accuracy": 0.9670248806476593, "num_tokens": 20649688.0, "step": 13490 }, { "entropy": 0.09732284527271987, "epoch": 3.1457046275789717, "grad_norm": 2.015625, "learning_rate": 4.99124495851697e-05, "loss": 0.1081, "mean_token_accuracy": 0.960671192407608, "num_tokens": 20682789.0, "step": 13495 }, { "entropy": 0.09042824842035771, "epoch": 3.146870264599604, "grad_norm": 1.984375, "learning_rate": 4.991226915014223e-05, "loss": 0.0762, "mean_token_accuracy": 0.9731867849826813, "num_tokens": 20710191.0, "step": 13500 }, { "entropy": 0.17311727702617646, "epoch": 3.1480359016202355, "grad_norm": 1.0859375, "learning_rate": 4.9912088530029166e-05, "loss": 0.2479, "mean_token_accuracy": 0.9498288691043854, "num_tokens": 20725259.0, "step": 13505 }, { "entropy": 0.15625217892229556, "epoch": 3.149201538640867, "grad_norm": 6.53125, "learning_rate": 4.9911907724833196e-05, "loss": 0.2065, "mean_token_accuracy": 0.954789811372757, "num_tokens": 20733671.0, "step": 13510 }, { "entropy": 0.12325193397700787, "epoch": 3.150367175661499, "grad_norm": 3.109375, "learning_rate": 4.991172673455701e-05, "loss": 0.2265, "mean_token_accuracy": 0.9545148432254791, "num_tokens": 20744720.0, "step": 13515 }, { "entropy": 0.15157404392957688, "epoch": 3.151532812682131, "grad_norm": 5.0625, "learning_rate": 4.9911545559203306e-05, "loss": 0.2189, "mean_token_accuracy": 0.9399513125419616, "num_tokens": 20765379.0, "step": 13520 }, { "entropy": 0.09883678015321493, "epoch": 3.1526984497027626, "grad_norm": 0.7734375, "learning_rate": 4.991136419877479e-05, "loss": 0.1561, "mean_token_accuracy": 0.9537435114383698, "num_tokens": 20800491.0, "step": 13525 }, { "entropy": 0.1349384069442749, "epoch": 3.1538640867233942, "grad_norm": 4.40625, "learning_rate": 4.991118265327417e-05, "loss": 0.1982, "mean_token_accuracy": 0.9464501976966858, "num_tokens": 20822363.0, "step": 13530 }, { "entropy": 0.13095712885260583, "epoch": 3.155029723744026, "grad_norm": 2.46875, "learning_rate": 4.991100092270415e-05, "loss": 0.2211, "mean_token_accuracy": 0.9556451499462127, "num_tokens": 20845713.0, "step": 13535 }, { "entropy": 0.2365718200802803, "epoch": 3.156195360764658, "grad_norm": 4.65625, "learning_rate": 4.991081900706744e-05, "loss": 0.3579, "mean_token_accuracy": 0.8964258372783661, "num_tokens": 20880390.0, "step": 13540 }, { "entropy": 0.1382368978112936, "epoch": 3.1573609977852897, "grad_norm": 2.5, "learning_rate": 4.9910636906366746e-05, "loss": 0.2208, "mean_token_accuracy": 0.9520070552825928, "num_tokens": 20890351.0, "step": 13545 }, { "entropy": 0.12153224758803845, "epoch": 3.1585266348059213, "grad_norm": 3.84375, "learning_rate": 4.9910454620604794e-05, "loss": 0.2088, "mean_token_accuracy": 0.9552608907222748, "num_tokens": 20907586.0, "step": 13550 }, { "entropy": 0.17177230566740037, "epoch": 3.159692271826553, "grad_norm": 9.6875, "learning_rate": 4.99102721497843e-05, "loss": 0.3257, "mean_token_accuracy": 0.9317836463451385, "num_tokens": 20923892.0, "step": 13555 }, { "entropy": 0.11779150888323783, "epoch": 3.160857908847185, "grad_norm": 1.359375, "learning_rate": 4.991008949390797e-05, "loss": 0.1493, "mean_token_accuracy": 0.9623130559921265, "num_tokens": 20955137.0, "step": 13560 }, { "entropy": 0.12503595724701883, "epoch": 3.1620235458678168, "grad_norm": 6.71875, "learning_rate": 4.990990665297855e-05, "loss": 0.1711, "mean_token_accuracy": 0.9561767220497132, "num_tokens": 20971485.0, "step": 13565 }, { "entropy": 0.15457735285162927, "epoch": 3.1631891828884484, "grad_norm": 4.28125, "learning_rate": 4.9909723626998755e-05, "loss": 0.3141, "mean_token_accuracy": 0.9366472482681274, "num_tokens": 20981271.0, "step": 13570 }, { "entropy": 0.21778239756822587, "epoch": 3.1643548199090805, "grad_norm": 3.734375, "learning_rate": 4.990954041597131e-05, "loss": 0.2955, "mean_token_accuracy": 0.9206785082817077, "num_tokens": 21002375.0, "step": 13575 }, { "entropy": 0.0996066389605403, "epoch": 3.165520456929712, "grad_norm": 0.7578125, "learning_rate": 4.990935701989896e-05, "loss": 0.1601, "mean_token_accuracy": 0.9643292784690857, "num_tokens": 21032367.0, "step": 13580 }, { "entropy": 0.13476105220615864, "epoch": 3.166686093950344, "grad_norm": 5.90625, "learning_rate": 4.990917343878443e-05, "loss": 0.2373, "mean_token_accuracy": 0.9470726490020752, "num_tokens": 21042480.0, "step": 13585 }, { "entropy": 0.11117649544030428, "epoch": 3.1678517309709755, "grad_norm": 4.71875, "learning_rate": 4.990898967263046e-05, "loss": 0.1549, "mean_token_accuracy": 0.9646702468395233, "num_tokens": 21069694.0, "step": 13590 }, { "entropy": 0.12479297295212746, "epoch": 3.1690173679916076, "grad_norm": 5.65625, "learning_rate": 4.990880572143979e-05, "loss": 0.1796, "mean_token_accuracy": 0.9550163507461548, "num_tokens": 21080793.0, "step": 13595 }, { "entropy": 0.12391726523637772, "epoch": 3.1701830050122393, "grad_norm": 2.546875, "learning_rate": 4.9908621585215154e-05, "loss": 0.1415, "mean_token_accuracy": 0.9581521868705749, "num_tokens": 21098312.0, "step": 13600 }, { "entropy": 0.13280257768929005, "epoch": 3.171348642032871, "grad_norm": 0.69921875, "learning_rate": 4.990843726395932e-05, "loss": 0.2267, "mean_token_accuracy": 0.9563954889774322, "num_tokens": 21119709.0, "step": 13605 }, { "entropy": 0.12048251070082187, "epoch": 3.1725142790535026, "grad_norm": 3.328125, "learning_rate": 4.990825275767501e-05, "loss": 0.1771, "mean_token_accuracy": 0.9502053916454315, "num_tokens": 21144999.0, "step": 13610 }, { "entropy": 0.10526430867612362, "epoch": 3.1736799160741347, "grad_norm": 3.640625, "learning_rate": 4.9908068066365e-05, "loss": 0.2226, "mean_token_accuracy": 0.9513380050659179, "num_tokens": 21159827.0, "step": 13615 }, { "entropy": 0.14900004006922246, "epoch": 3.1748455530947663, "grad_norm": 1.015625, "learning_rate": 4.990788319003202e-05, "loss": 0.2445, "mean_token_accuracy": 0.9436604678630829, "num_tokens": 21179584.0, "step": 13620 }, { "entropy": 0.1654419906437397, "epoch": 3.176011190115398, "grad_norm": 9.9375, "learning_rate": 4.990769812867885e-05, "loss": 0.2853, "mean_token_accuracy": 0.9375516057014466, "num_tokens": 21190250.0, "step": 13625 }, { "entropy": 0.16855915486812592, "epoch": 3.1771768271360297, "grad_norm": 4.625, "learning_rate": 4.990751288230824e-05, "loss": 0.3594, "mean_token_accuracy": 0.9368915677070617, "num_tokens": 21200016.0, "step": 13630 }, { "entropy": 0.1235238466411829, "epoch": 3.1783424641566618, "grad_norm": 9.375, "learning_rate": 4.990732745092294e-05, "loss": 0.2636, "mean_token_accuracy": 0.9344924747943878, "num_tokens": 21217432.0, "step": 13635 }, { "entropy": 0.11267777048051357, "epoch": 3.1795081011772934, "grad_norm": 2.09375, "learning_rate": 4.9907141834525734e-05, "loss": 0.2743, "mean_token_accuracy": 0.9462158858776093, "num_tokens": 21233878.0, "step": 13640 }, { "entropy": 0.08724936954677105, "epoch": 3.180673738197925, "grad_norm": 7.78125, "learning_rate": 4.9906956033119376e-05, "loss": 0.1146, "mean_token_accuracy": 0.9678235054016113, "num_tokens": 21258870.0, "step": 13645 }, { "entropy": 0.13224483132362366, "epoch": 3.1818393752185568, "grad_norm": 0.75, "learning_rate": 4.990677004670665e-05, "loss": 0.2061, "mean_token_accuracy": 0.9559370160102845, "num_tokens": 21279265.0, "step": 13650 }, { "entropy": 0.12577038034796714, "epoch": 3.183005012239189, "grad_norm": 5.8125, "learning_rate": 4.990658387529032e-05, "loss": 0.2115, "mean_token_accuracy": 0.9527509272098541, "num_tokens": 21292607.0, "step": 13655 }, { "entropy": 0.09519344065338373, "epoch": 3.1841706492598205, "grad_norm": 2.109375, "learning_rate": 4.990639751887316e-05, "loss": 0.1431, "mean_token_accuracy": 0.9555594265460968, "num_tokens": 21323628.0, "step": 13660 }, { "entropy": 0.14371351413428785, "epoch": 3.185336286280452, "grad_norm": 6.4375, "learning_rate": 4.9906210977457956e-05, "loss": 0.2347, "mean_token_accuracy": 0.9484319508075714, "num_tokens": 21345259.0, "step": 13665 }, { "entropy": 0.1446783158928156, "epoch": 3.186501923301084, "grad_norm": 9.4375, "learning_rate": 4.9906024251047486e-05, "loss": 0.2042, "mean_token_accuracy": 0.9526873469352722, "num_tokens": 21371830.0, "step": 13670 }, { "entropy": 0.11993208415806293, "epoch": 3.187667560321716, "grad_norm": 0.87890625, "learning_rate": 4.990583733964453e-05, "loss": 0.2079, "mean_token_accuracy": 0.9530717611312867, "num_tokens": 21403937.0, "step": 13675 }, { "entropy": 0.1540200922638178, "epoch": 3.1888331973423476, "grad_norm": 1.4140625, "learning_rate": 4.9905650243251886e-05, "loss": 0.1084, "mean_token_accuracy": 0.9452480733394623, "num_tokens": 21447968.0, "step": 13680 }, { "entropy": 0.12013146840035915, "epoch": 3.1899988343629793, "grad_norm": 1.5546875, "learning_rate": 4.9905462961872334e-05, "loss": 0.1844, "mean_token_accuracy": 0.9520611464977264, "num_tokens": 21463123.0, "step": 13685 }, { "entropy": 0.08509744182229043, "epoch": 3.191164471383611, "grad_norm": 0.68359375, "learning_rate": 4.990527549550867e-05, "loss": 0.1185, "mean_token_accuracy": 0.9626811742782593, "num_tokens": 21492569.0, "step": 13690 }, { "entropy": 0.13770532943308353, "epoch": 3.192330108404243, "grad_norm": 2.0, "learning_rate": 4.990508784416369e-05, "loss": 0.1866, "mean_token_accuracy": 0.9388071298599243, "num_tokens": 21513409.0, "step": 13695 }, { "entropy": 0.11494051553308963, "epoch": 3.1934957454248747, "grad_norm": 1.265625, "learning_rate": 4.9904900007840195e-05, "loss": 0.1976, "mean_token_accuracy": 0.9597686767578125, "num_tokens": 21528668.0, "step": 13700 }, { "entropy": 0.12358577623963356, "epoch": 3.1946613824455063, "grad_norm": 4.9375, "learning_rate": 4.990471198654098e-05, "loss": 0.2058, "mean_token_accuracy": 0.9521121263504029, "num_tokens": 21550385.0, "step": 13705 }, { "entropy": 0.09476232714951038, "epoch": 3.1958270194661385, "grad_norm": 2.03125, "learning_rate": 4.9904523780268854e-05, "loss": 0.1059, "mean_token_accuracy": 0.9660671591758728, "num_tokens": 21572739.0, "step": 13710 }, { "entropy": 0.13330335095524787, "epoch": 3.19699265648677, "grad_norm": 13.0, "learning_rate": 4.990433538902662e-05, "loss": 0.271, "mean_token_accuracy": 0.9459414899349212, "num_tokens": 21585685.0, "step": 13715 }, { "entropy": 0.12506634294986724, "epoch": 3.1981582935074018, "grad_norm": 3.765625, "learning_rate": 4.9904146812817087e-05, "loss": 0.1427, "mean_token_accuracy": 0.9513930737972259, "num_tokens": 21603696.0, "step": 13720 }, { "entropy": 0.09047595858573913, "epoch": 3.1993239305280334, "grad_norm": 0.87890625, "learning_rate": 4.990395805164307e-05, "loss": 0.0768, "mean_token_accuracy": 0.9803003549575806, "num_tokens": 21631611.0, "step": 13725 }, { "entropy": 0.1679716443642974, "epoch": 3.2004895675486655, "grad_norm": 2.625, "learning_rate": 4.990376910550738e-05, "loss": 0.1202, "mean_token_accuracy": 0.9510182023048401, "num_tokens": 21684068.0, "step": 13730 }, { "entropy": 0.08525995686650276, "epoch": 3.201655204569297, "grad_norm": 2.578125, "learning_rate": 4.990357997441284e-05, "loss": 0.1319, "mean_token_accuracy": 0.9639061510562896, "num_tokens": 21717942.0, "step": 13735 }, { "entropy": 0.09524129014462232, "epoch": 3.202820841589929, "grad_norm": 3.625, "learning_rate": 4.990339065836226e-05, "loss": 0.1859, "mean_token_accuracy": 0.9589514017105103, "num_tokens": 21738190.0, "step": 13740 }, { "entropy": 0.07734522055834532, "epoch": 3.2039864786105605, "grad_norm": 1.4921875, "learning_rate": 4.990320115735848e-05, "loss": 0.1033, "mean_token_accuracy": 0.9685503840446472, "num_tokens": 21763096.0, "step": 13745 }, { "entropy": 0.0874601237475872, "epoch": 3.2051521156311926, "grad_norm": 0.640625, "learning_rate": 4.9903011471404304e-05, "loss": 0.1462, "mean_token_accuracy": 0.9692184507846833, "num_tokens": 21788974.0, "step": 13750 }, { "entropy": 0.1801584392786026, "epoch": 3.2063177526518243, "grad_norm": 5.15625, "learning_rate": 4.9902821600502575e-05, "loss": 0.2558, "mean_token_accuracy": 0.930813604593277, "num_tokens": 21800267.0, "step": 13755 }, { "entropy": 0.11756899803876877, "epoch": 3.207483389672456, "grad_norm": 7.96875, "learning_rate": 4.990263154465612e-05, "loss": 0.2071, "mean_token_accuracy": 0.9543874800205231, "num_tokens": 21811432.0, "step": 13760 }, { "entropy": 0.11073267944157124, "epoch": 3.2086490266930876, "grad_norm": 1.3515625, "learning_rate": 4.990244130386778e-05, "loss": 0.126, "mean_token_accuracy": 0.9537279605865479, "num_tokens": 21840039.0, "step": 13765 }, { "entropy": 0.14336490221321582, "epoch": 3.2098146637137197, "grad_norm": 3.96875, "learning_rate": 4.990225087814038e-05, "loss": 0.165, "mean_token_accuracy": 0.9559800326824188, "num_tokens": 21852623.0, "step": 13770 }, { "entropy": 0.13291695453226565, "epoch": 3.2109803007343514, "grad_norm": 3.0625, "learning_rate": 4.990206026747677e-05, "loss": 0.1719, "mean_token_accuracy": 0.9552733778953553, "num_tokens": 21863643.0, "step": 13775 }, { "entropy": 0.1144310999661684, "epoch": 3.212145937754983, "grad_norm": 2.328125, "learning_rate": 4.9901869471879784e-05, "loss": 0.1422, "mean_token_accuracy": 0.9642179667949676, "num_tokens": 21886692.0, "step": 13780 }, { "entropy": 0.19398325234651564, "epoch": 3.2133115747756147, "grad_norm": 1.5703125, "learning_rate": 4.990167849135227e-05, "loss": 0.2933, "mean_token_accuracy": 0.9346424221992493, "num_tokens": 21901027.0, "step": 13785 }, { "entropy": 0.14228848069906236, "epoch": 3.214477211796247, "grad_norm": 0.87890625, "learning_rate": 4.990148732589708e-05, "loss": 0.2278, "mean_token_accuracy": 0.9492959976196289, "num_tokens": 21924230.0, "step": 13790 }, { "entropy": 0.11809794455766678, "epoch": 3.2156428488168785, "grad_norm": 4.03125, "learning_rate": 4.990129597551706e-05, "loss": 0.1691, "mean_token_accuracy": 0.9494635581970214, "num_tokens": 21944816.0, "step": 13795 }, { "entropy": 0.14083162061870097, "epoch": 3.21680848583751, "grad_norm": 1.703125, "learning_rate": 4.9901104440215064e-05, "loss": 0.203, "mean_token_accuracy": 0.9425077676773072, "num_tokens": 21957162.0, "step": 13800 }, { "entropy": 0.19819015860557557, "epoch": 3.2179741228581418, "grad_norm": 3.1875, "learning_rate": 4.9900912719993944e-05, "loss": 0.3117, "mean_token_accuracy": 0.9249706268310547, "num_tokens": 21970654.0, "step": 13805 }, { "entropy": 0.10589619725942612, "epoch": 3.219139759878774, "grad_norm": 1.703125, "learning_rate": 4.9900720814856574e-05, "loss": 0.1353, "mean_token_accuracy": 0.961876267194748, "num_tokens": 21989719.0, "step": 13810 }, { "entropy": 0.14038040973246096, "epoch": 3.2203053968994055, "grad_norm": 2.171875, "learning_rate": 4.990052872480579e-05, "loss": 0.1447, "mean_token_accuracy": 0.9591590285301208, "num_tokens": 22008316.0, "step": 13815 }, { "entropy": 0.1346709206700325, "epoch": 3.221471033920037, "grad_norm": 9.625, "learning_rate": 4.9900336449844486e-05, "loss": 0.3254, "mean_token_accuracy": 0.9380843579769135, "num_tokens": 22026814.0, "step": 13820 }, { "entropy": 0.12028101049363613, "epoch": 3.222636670940669, "grad_norm": 1.8046875, "learning_rate": 4.9900143989975515e-05, "loss": 0.1735, "mean_token_accuracy": 0.9532825112342834, "num_tokens": 22047666.0, "step": 13825 }, { "entropy": 0.16554685439914466, "epoch": 3.223802307961301, "grad_norm": 2.21875, "learning_rate": 4.989995134520173e-05, "loss": 0.2984, "mean_token_accuracy": 0.9322503209114075, "num_tokens": 22068533.0, "step": 13830 }, { "entropy": 0.09840068519115448, "epoch": 3.2249679449819326, "grad_norm": 6.0625, "learning_rate": 4.9899758515526035e-05, "loss": 0.1664, "mean_token_accuracy": 0.9617088854312896, "num_tokens": 22092569.0, "step": 13835 }, { "entropy": 0.10480661168694497, "epoch": 3.2261335820025643, "grad_norm": 7.96875, "learning_rate": 4.989956550095129e-05, "loss": 0.1949, "mean_token_accuracy": 0.9511193692684173, "num_tokens": 22110234.0, "step": 13840 }, { "entropy": 0.13478904515504836, "epoch": 3.2272992190231964, "grad_norm": 4.59375, "learning_rate": 4.989937230148036e-05, "loss": 0.1947, "mean_token_accuracy": 0.9495131015777588, "num_tokens": 22126238.0, "step": 13845 }, { "entropy": 0.18446614295244218, "epoch": 3.228464856043828, "grad_norm": 1.2109375, "learning_rate": 4.989917891711615e-05, "loss": 0.33, "mean_token_accuracy": 0.9250796020030976, "num_tokens": 22162234.0, "step": 13850 }, { "entropy": 0.14001993387937545, "epoch": 3.2296304930644597, "grad_norm": 4.78125, "learning_rate": 4.989898534786153e-05, "loss": 0.2614, "mean_token_accuracy": 0.945748233795166, "num_tokens": 22172904.0, "step": 13855 }, { "entropy": 0.1121131750755012, "epoch": 3.2307961300850914, "grad_norm": 0.5703125, "learning_rate": 4.989879159371939e-05, "loss": 0.1443, "mean_token_accuracy": 0.9647936940193176, "num_tokens": 22200250.0, "step": 13860 }, { "entropy": 0.17312668301165104, "epoch": 3.2319617671057235, "grad_norm": 4.0625, "learning_rate": 4.989859765469261e-05, "loss": 0.2362, "mean_token_accuracy": 0.9496867656707764, "num_tokens": 22212661.0, "step": 13865 }, { "entropy": 0.11042605116963386, "epoch": 3.233127404126355, "grad_norm": 4.78125, "learning_rate": 4.989840353078411e-05, "loss": 0.165, "mean_token_accuracy": 0.9618758082389831, "num_tokens": 22230362.0, "step": 13870 }, { "entropy": 0.12297814935445786, "epoch": 3.234293041146987, "grad_norm": 7.96875, "learning_rate": 4.989820922199675e-05, "loss": 0.1651, "mean_token_accuracy": 0.9570007860660553, "num_tokens": 22266384.0, "step": 13875 }, { "entropy": 0.09149080365896226, "epoch": 3.2354586781676185, "grad_norm": 1.5, "learning_rate": 4.9898014728333444e-05, "loss": 0.1237, "mean_token_accuracy": 0.967199444770813, "num_tokens": 22296680.0, "step": 13880 }, { "entropy": 0.1321562185883522, "epoch": 3.2366243151882506, "grad_norm": 1.4140625, "learning_rate": 4.98978200497971e-05, "loss": 0.1815, "mean_token_accuracy": 0.9545339822769165, "num_tokens": 22316879.0, "step": 13885 }, { "entropy": 0.12758674062788486, "epoch": 3.237789952208882, "grad_norm": 11.0, "learning_rate": 4.989762518639059e-05, "loss": 0.2647, "mean_token_accuracy": 0.9446513593196869, "num_tokens": 22331553.0, "step": 13890 }, { "entropy": 0.11283358000218868, "epoch": 3.238955589229514, "grad_norm": 0.5703125, "learning_rate": 4.9897430138116863e-05, "loss": 0.2038, "mean_token_accuracy": 0.9567483723163605, "num_tokens": 22350593.0, "step": 13895 }, { "entropy": 0.1608588956296444, "epoch": 3.2401212262501455, "grad_norm": 4.6875, "learning_rate": 4.9897234904978803e-05, "loss": 0.2525, "mean_token_accuracy": 0.9486888587474823, "num_tokens": 22360472.0, "step": 13900 }, { "entropy": 0.15052568428218366, "epoch": 3.2412868632707776, "grad_norm": 4.46875, "learning_rate": 4.989703948697932e-05, "loss": 0.1805, "mean_token_accuracy": 0.9498693823814393, "num_tokens": 22375242.0, "step": 13905 }, { "entropy": 0.1559467740356922, "epoch": 3.2424525002914093, "grad_norm": 8.125, "learning_rate": 4.989684388412133e-05, "loss": 0.2759, "mean_token_accuracy": 0.9473506152629853, "num_tokens": 22385223.0, "step": 13910 }, { "entropy": 0.12847465351223947, "epoch": 3.243618137312041, "grad_norm": 2.53125, "learning_rate": 4.989664809640775e-05, "loss": 0.2491, "mean_token_accuracy": 0.9525147080421448, "num_tokens": 22397441.0, "step": 13915 }, { "entropy": 0.13161057084798813, "epoch": 3.2447837743326726, "grad_norm": 1.1171875, "learning_rate": 4.989645212384151e-05, "loss": 0.199, "mean_token_accuracy": 0.9566210269927978, "num_tokens": 22424867.0, "step": 13920 }, { "entropy": 0.08073518313467502, "epoch": 3.2459494113533047, "grad_norm": 7.5, "learning_rate": 4.9896255966425516e-05, "loss": 0.1307, "mean_token_accuracy": 0.9721478164196015, "num_tokens": 22445947.0, "step": 13925 }, { "entropy": 0.17617430575191975, "epoch": 3.2471150483739364, "grad_norm": 5.0625, "learning_rate": 4.989605962416271e-05, "loss": 0.2539, "mean_token_accuracy": 0.9380265295505523, "num_tokens": 22461954.0, "step": 13930 }, { "entropy": 0.09188916571438313, "epoch": 3.248280685394568, "grad_norm": 6.09375, "learning_rate": 4.989586309705599e-05, "loss": 0.1049, "mean_token_accuracy": 0.965660160779953, "num_tokens": 22492788.0, "step": 13935 }, { "entropy": 0.12470996119081974, "epoch": 3.2494463224151997, "grad_norm": 0.67578125, "learning_rate": 4.989566638510833e-05, "loss": 0.156, "mean_token_accuracy": 0.9591312229633331, "num_tokens": 22517242.0, "step": 13940 }, { "entropy": 0.124464544467628, "epoch": 3.250611959435832, "grad_norm": 6.6875, "learning_rate": 4.9895469488322624e-05, "loss": 0.18, "mean_token_accuracy": 0.9532681405544281, "num_tokens": 22534897.0, "step": 13945 }, { "entropy": 0.21237018406391145, "epoch": 3.2517775964564635, "grad_norm": 2.8125, "learning_rate": 4.989527240670183e-05, "loss": 0.3355, "mean_token_accuracy": 0.9278428435325623, "num_tokens": 22566016.0, "step": 13950 }, { "entropy": 0.16752017810940742, "epoch": 3.252943233477095, "grad_norm": 8.0625, "learning_rate": 4.9895075140248884e-05, "loss": 0.2726, "mean_token_accuracy": 0.9438336849212646, "num_tokens": 22575102.0, "step": 13955 }, { "entropy": 0.12253744266927243, "epoch": 3.254108870497727, "grad_norm": 5.90625, "learning_rate": 4.9894877688966726e-05, "loss": 0.2037, "mean_token_accuracy": 0.9605319619178772, "num_tokens": 22587999.0, "step": 13960 }, { "entropy": 0.10806054174900055, "epoch": 3.255274507518359, "grad_norm": 5.75, "learning_rate": 4.989468005285829e-05, "loss": 0.1474, "mean_token_accuracy": 0.9701383888721467, "num_tokens": 22612878.0, "step": 13965 }, { "entropy": 0.07961149625480175, "epoch": 3.2564401445389906, "grad_norm": 2.625, "learning_rate": 4.989448223192654e-05, "loss": 0.1128, "mean_token_accuracy": 0.9735390186309815, "num_tokens": 22635307.0, "step": 13970 }, { "entropy": 0.09045897722244263, "epoch": 3.257605781559622, "grad_norm": 6.90625, "learning_rate": 4.989428422617441e-05, "loss": 0.0872, "mean_token_accuracy": 0.9661646127700806, "num_tokens": 22670036.0, "step": 13975 }, { "entropy": 0.14846278242766858, "epoch": 3.2587714185802543, "grad_norm": 1.078125, "learning_rate": 4.989408603560486e-05, "loss": 0.2354, "mean_token_accuracy": 0.9485298454761505, "num_tokens": 22691930.0, "step": 13980 }, { "entropy": 0.14384981635957955, "epoch": 3.259937055600886, "grad_norm": 5.125, "learning_rate": 4.989388766022085e-05, "loss": 0.197, "mean_token_accuracy": 0.9554014205932617, "num_tokens": 22711935.0, "step": 13985 }, { "entropy": 0.1226340863853693, "epoch": 3.2611026926215176, "grad_norm": 2.265625, "learning_rate": 4.989368910002534e-05, "loss": 0.1169, "mean_token_accuracy": 0.958068335056305, "num_tokens": 22736033.0, "step": 13990 }, { "entropy": 0.09807408321648836, "epoch": 3.2622683296421493, "grad_norm": 0.80859375, "learning_rate": 4.9893490355021275e-05, "loss": 0.11, "mean_token_accuracy": 0.9633495807647705, "num_tokens": 22765974.0, "step": 13995 }, { "entropy": 0.13382991403341293, "epoch": 3.263433966662781, "grad_norm": 4.96875, "learning_rate": 4.989329142521163e-05, "loss": 0.2118, "mean_token_accuracy": 0.9530467092990875, "num_tokens": 22785509.0, "step": 14000 }, { "entropy": 0.12002462726086378, "epoch": 3.264599603683413, "grad_norm": 4.1875, "learning_rate": 4.989309231059937e-05, "loss": 0.2169, "mean_token_accuracy": 0.9570177972316742, "num_tokens": 22803325.0, "step": 14005 }, { "entropy": 0.14383577033877373, "epoch": 3.2657652407040447, "grad_norm": 7.59375, "learning_rate": 4.989289301118746e-05, "loss": 0.2408, "mean_token_accuracy": 0.9505583703517914, "num_tokens": 22812269.0, "step": 14010 }, { "entropy": 0.15477120950818063, "epoch": 3.2669308777246764, "grad_norm": 4.40625, "learning_rate": 4.989269352697888e-05, "loss": 0.2265, "mean_token_accuracy": 0.9491691708564758, "num_tokens": 22822670.0, "step": 14015 }, { "entropy": 0.12411811240017415, "epoch": 3.2680965147453085, "grad_norm": 1.046875, "learning_rate": 4.98924938579766e-05, "loss": 0.1797, "mean_token_accuracy": 0.9555955529212952, "num_tokens": 22841556.0, "step": 14020 }, { "entropy": 0.15641675479710102, "epoch": 3.26926215176594, "grad_norm": 6.40625, "learning_rate": 4.989229400418359e-05, "loss": 0.2924, "mean_token_accuracy": 0.9350105941295623, "num_tokens": 22851956.0, "step": 14025 }, { "entropy": 0.17089218497276307, "epoch": 3.270427788786572, "grad_norm": 1.2421875, "learning_rate": 4.9892093965602846e-05, "loss": 0.2689, "mean_token_accuracy": 0.9299245476722717, "num_tokens": 22873141.0, "step": 14030 }, { "entropy": 0.10465769805014133, "epoch": 3.2715934258072035, "grad_norm": 6.125, "learning_rate": 4.9891893742237336e-05, "loss": 0.1531, "mean_token_accuracy": 0.9625381529331207, "num_tokens": 22892855.0, "step": 14035 }, { "entropy": 0.12475298270583153, "epoch": 3.2727590628278356, "grad_norm": 3.015625, "learning_rate": 4.989169333409006e-05, "loss": 0.1886, "mean_token_accuracy": 0.9612498998641967, "num_tokens": 22908889.0, "step": 14040 }, { "entropy": 0.12309679705649615, "epoch": 3.2739246998484672, "grad_norm": 1.109375, "learning_rate": 4.9891492741163986e-05, "loss": 0.1482, "mean_token_accuracy": 0.9679423987865448, "num_tokens": 22926291.0, "step": 14045 }, { "entropy": 0.16394661888480186, "epoch": 3.275090336869099, "grad_norm": 3.515625, "learning_rate": 4.989129196346213e-05, "loss": 0.2189, "mean_token_accuracy": 0.939568281173706, "num_tokens": 22953734.0, "step": 14050 }, { "entropy": 0.22891110852360724, "epoch": 3.2762559738897306, "grad_norm": 3.828125, "learning_rate": 4.989109100098746e-05, "loss": 0.2433, "mean_token_accuracy": 0.9389278292655945, "num_tokens": 22973890.0, "step": 14055 }, { "entropy": 0.12039781026542187, "epoch": 3.2774216109103627, "grad_norm": 7.0, "learning_rate": 4.9890889853743e-05, "loss": 0.2894, "mean_token_accuracy": 0.9522871553897858, "num_tokens": 22987566.0, "step": 14060 }, { "entropy": 0.102488574385643, "epoch": 3.2785872479309943, "grad_norm": 0.8828125, "learning_rate": 4.9890688521731726e-05, "loss": 0.1132, "mean_token_accuracy": 0.9691874027252197, "num_tokens": 23013277.0, "step": 14065 }, { "entropy": 0.11040122248232365, "epoch": 3.279752884951626, "grad_norm": 1.1953125, "learning_rate": 4.989048700495665e-05, "loss": 0.1639, "mean_token_accuracy": 0.9644237160682678, "num_tokens": 23027335.0, "step": 14070 }, { "entropy": 0.13591878786683081, "epoch": 3.280918521972258, "grad_norm": 1.8125, "learning_rate": 4.989028530342078e-05, "loss": 0.1813, "mean_token_accuracy": 0.9562951922416687, "num_tokens": 23040430.0, "step": 14075 }, { "entropy": 0.10207988247275353, "epoch": 3.2820841589928897, "grad_norm": 0.7578125, "learning_rate": 4.989008341712712e-05, "loss": 0.1693, "mean_token_accuracy": 0.9547303438186645, "num_tokens": 23060680.0, "step": 14080 }, { "entropy": 0.12369899153709411, "epoch": 3.2832497960135214, "grad_norm": 1.2265625, "learning_rate": 4.9889881346078675e-05, "loss": 0.1882, "mean_token_accuracy": 0.9508943438529969, "num_tokens": 23078759.0, "step": 14085 }, { "entropy": 0.10847660899162292, "epoch": 3.284415433034153, "grad_norm": 0.5625, "learning_rate": 4.9889679090278466e-05, "loss": 0.1459, "mean_token_accuracy": 0.9644874215126038, "num_tokens": 23102055.0, "step": 14090 }, { "entropy": 0.2650509625673294, "epoch": 3.2855810700547847, "grad_norm": 9.625, "learning_rate": 4.988947664972951e-05, "loss": 0.4359, "mean_token_accuracy": 0.9194409489631653, "num_tokens": 23124775.0, "step": 14095 }, { "entropy": 0.14016255624592305, "epoch": 3.286746707075417, "grad_norm": 3.515625, "learning_rate": 4.9889274024434826e-05, "loss": 0.1227, "mean_token_accuracy": 0.948961591720581, "num_tokens": 23163545.0, "step": 14100 }, { "entropy": 0.11483432129025459, "epoch": 3.2879123440960485, "grad_norm": 2.78125, "learning_rate": 4.988907121439742e-05, "loss": 0.1191, "mean_token_accuracy": 0.9661029517650604, "num_tokens": 23188930.0, "step": 14105 }, { "entropy": 0.12453803811222315, "epoch": 3.28907798111668, "grad_norm": 6.0, "learning_rate": 4.988886821962034e-05, "loss": 0.1842, "mean_token_accuracy": 0.9471354961395264, "num_tokens": 23216209.0, "step": 14110 }, { "entropy": 0.13249999955296515, "epoch": 3.2902436181373123, "grad_norm": 2.046875, "learning_rate": 4.988866504010659e-05, "loss": 0.1485, "mean_token_accuracy": 0.9522889494895935, "num_tokens": 23233724.0, "step": 14115 }, { "entropy": 0.1101855117827654, "epoch": 3.291409255157944, "grad_norm": 7.875, "learning_rate": 4.988846167585922e-05, "loss": 0.1761, "mean_token_accuracy": 0.9595109105110169, "num_tokens": 23255614.0, "step": 14120 }, { "entropy": 0.11480397321283817, "epoch": 3.2925748921785756, "grad_norm": 5.34375, "learning_rate": 4.9888258126881246e-05, "loss": 0.1468, "mean_token_accuracy": 0.961454713344574, "num_tokens": 23275000.0, "step": 14125 }, { "entropy": 0.11717780642211437, "epoch": 3.2937405291992072, "grad_norm": 4.65625, "learning_rate": 4.9888054393175715e-05, "loss": 0.1914, "mean_token_accuracy": 0.9546636343002319, "num_tokens": 23286380.0, "step": 14130 }, { "entropy": 0.10818963898345828, "epoch": 3.294906166219839, "grad_norm": 1.7890625, "learning_rate": 4.9887850474745654e-05, "loss": 0.151, "mean_token_accuracy": 0.9663783788681031, "num_tokens": 23306723.0, "step": 14135 }, { "entropy": 0.241145624127239, "epoch": 3.296071803240471, "grad_norm": 0.5625, "learning_rate": 4.988764637159412e-05, "loss": 0.4151, "mean_token_accuracy": 0.9382971048355102, "num_tokens": 23345607.0, "step": 14140 }, { "entropy": 0.12085740342736244, "epoch": 3.2972374402611027, "grad_norm": 3.390625, "learning_rate": 4.9887442083724146e-05, "loss": 0.2171, "mean_token_accuracy": 0.9586883842945099, "num_tokens": 23358320.0, "step": 14145 }, { "entropy": 0.16930502615869045, "epoch": 3.2984030772817343, "grad_norm": 7.3125, "learning_rate": 4.988723761113877e-05, "loss": 0.2626, "mean_token_accuracy": 0.9430509269237518, "num_tokens": 23378309.0, "step": 14150 }, { "entropy": 0.08466316685080529, "epoch": 3.2995687143023664, "grad_norm": 0.79296875, "learning_rate": 4.988703295384105e-05, "loss": 0.1232, "mean_token_accuracy": 0.9719673752784729, "num_tokens": 23415091.0, "step": 14155 }, { "entropy": 0.09591401666402817, "epoch": 3.300734351322998, "grad_norm": 1.3671875, "learning_rate": 4.9886828111834046e-05, "loss": 0.2293, "mean_token_accuracy": 0.9556362986564636, "num_tokens": 23433837.0, "step": 14160 }, { "entropy": 0.2667720340192318, "epoch": 3.3018999883436297, "grad_norm": 1.296875, "learning_rate": 4.98866230851208e-05, "loss": 0.3849, "mean_token_accuracy": 0.93751819729805, "num_tokens": 23455339.0, "step": 14165 }, { "entropy": 0.12662522951141, "epoch": 3.3030656253642614, "grad_norm": 0.498046875, "learning_rate": 4.988641787370437e-05, "loss": 0.1087, "mean_token_accuracy": 0.9500738441944122, "num_tokens": 23479400.0, "step": 14170 }, { "entropy": 0.09370872657746077, "epoch": 3.3042312623848935, "grad_norm": 1.8359375, "learning_rate": 4.988621247758782e-05, "loss": 0.0845, "mean_token_accuracy": 0.9714587390422821, "num_tokens": 23501072.0, "step": 14175 }, { "entropy": 0.0931572213768959, "epoch": 3.305396899405525, "grad_norm": 5.21875, "learning_rate": 4.988600689677422e-05, "loss": 0.0987, "mean_token_accuracy": 0.9701979279518127, "num_tokens": 23531872.0, "step": 14180 }, { "entropy": 0.17157430276274682, "epoch": 3.306562536426157, "grad_norm": 6.40625, "learning_rate": 4.988580113126662e-05, "loss": 0.3283, "mean_token_accuracy": 0.9170258045196533, "num_tokens": 23550255.0, "step": 14185 }, { "entropy": 0.12436717860400677, "epoch": 3.3077281734467885, "grad_norm": 1.7578125, "learning_rate": 4.9885595181068094e-05, "loss": 0.1538, "mean_token_accuracy": 0.9461873650550843, "num_tokens": 23591479.0, "step": 14190 }, { "entropy": 0.11454317420721054, "epoch": 3.3088938104674206, "grad_norm": 4.125, "learning_rate": 4.988538904618172e-05, "loss": 0.1739, "mean_token_accuracy": 0.956846272945404, "num_tokens": 23619937.0, "step": 14195 }, { "entropy": 0.08084500934928655, "epoch": 3.3100594474880523, "grad_norm": 0.79296875, "learning_rate": 4.988518272661057e-05, "loss": 0.0701, "mean_token_accuracy": 0.9723656892776489, "num_tokens": 23654470.0, "step": 14200 }, { "entropy": 0.1474765609949827, "epoch": 3.311225084508684, "grad_norm": 2.65625, "learning_rate": 4.988497622235771e-05, "loss": 0.2552, "mean_token_accuracy": 0.941499924659729, "num_tokens": 23674024.0, "step": 14205 }, { "entropy": 0.14339925050735475, "epoch": 3.312390721529316, "grad_norm": 6.96875, "learning_rate": 4.988476953342623e-05, "loss": 0.2872, "mean_token_accuracy": 0.94207683801651, "num_tokens": 23683173.0, "step": 14210 }, { "entropy": 0.1142255749553442, "epoch": 3.3135563585499477, "grad_norm": 1.21875, "learning_rate": 4.988456265981921e-05, "loss": 0.1305, "mean_token_accuracy": 0.9632910430431366, "num_tokens": 23708330.0, "step": 14215 }, { "entropy": 0.09450707472860813, "epoch": 3.3147219955705793, "grad_norm": 0.8125, "learning_rate": 4.988435560153972e-05, "loss": 0.1312, "mean_token_accuracy": 0.9680880188941956, "num_tokens": 23739180.0, "step": 14220 }, { "entropy": 0.11961795259267091, "epoch": 3.315887632591211, "grad_norm": 0.640625, "learning_rate": 4.988414835859087e-05, "loss": 0.1746, "mean_token_accuracy": 0.9507930636405945, "num_tokens": 23766179.0, "step": 14225 }, { "entropy": 0.10384149886667729, "epoch": 3.3170532696118427, "grad_norm": 4.21875, "learning_rate": 4.988394093097575e-05, "loss": 0.1533, "mean_token_accuracy": 0.9634926319122314, "num_tokens": 23781500.0, "step": 14230 }, { "entropy": 0.11352912969887256, "epoch": 3.3182189066324748, "grad_norm": 6.875, "learning_rate": 4.9883733318697436e-05, "loss": 0.1626, "mean_token_accuracy": 0.9677087485790252, "num_tokens": 23795117.0, "step": 14235 }, { "entropy": 0.14532850068062544, "epoch": 3.3193845436531064, "grad_norm": 6.1875, "learning_rate": 4.9883525521759034e-05, "loss": 0.2539, "mean_token_accuracy": 0.9434155464172364, "num_tokens": 23814943.0, "step": 14240 }, { "entropy": 0.09582647122442722, "epoch": 3.320550180673738, "grad_norm": 3.0, "learning_rate": 4.9883317540163634e-05, "loss": 0.0985, "mean_token_accuracy": 0.9703680038452148, "num_tokens": 23843871.0, "step": 14245 }, { "entropy": 0.13182559311389924, "epoch": 3.32171581769437, "grad_norm": 5.875, "learning_rate": 4.988310937391435e-05, "loss": 0.2936, "mean_token_accuracy": 0.9366560280323029, "num_tokens": 23864184.0, "step": 14250 }, { "entropy": 0.15998778566718103, "epoch": 3.322881454715002, "grad_norm": 1.078125, "learning_rate": 4.9882901023014284e-05, "loss": 0.249, "mean_token_accuracy": 0.9381362736225128, "num_tokens": 23884339.0, "step": 14255 }, { "entropy": 0.11376429684460163, "epoch": 3.3240470917356335, "grad_norm": 2.078125, "learning_rate": 4.9882692487466534e-05, "loss": 0.1179, "mean_token_accuracy": 0.968958580493927, "num_tokens": 23901021.0, "step": 14260 }, { "entropy": 0.09357526563107968, "epoch": 3.325212728756265, "grad_norm": 4.96875, "learning_rate": 4.988248376727421e-05, "loss": 0.0891, "mean_token_accuracy": 0.9657317698001862, "num_tokens": 23940219.0, "step": 14265 }, { "entropy": 0.13367250710725784, "epoch": 3.326378365776897, "grad_norm": 6.71875, "learning_rate": 4.988227486244044e-05, "loss": 0.2013, "mean_token_accuracy": 0.9435231447219848, "num_tokens": 23962461.0, "step": 14270 }, { "entropy": 0.13902662843465804, "epoch": 3.327544002797529, "grad_norm": 1.7109375, "learning_rate": 4.988206577296832e-05, "loss": 0.2072, "mean_token_accuracy": 0.9430397391319275, "num_tokens": 23979910.0, "step": 14275 }, { "entropy": 0.11999710947275162, "epoch": 3.3287096398181606, "grad_norm": 4.78125, "learning_rate": 4.9881856498860976e-05, "loss": 0.1741, "mean_token_accuracy": 0.9627709746360779, "num_tokens": 23993337.0, "step": 14280 }, { "entropy": 0.16236310750246047, "epoch": 3.3298752768387923, "grad_norm": 5.03125, "learning_rate": 4.988164704012153e-05, "loss": 0.2812, "mean_token_accuracy": 0.9466241359710693, "num_tokens": 24011775.0, "step": 14285 }, { "entropy": 0.13325685746967791, "epoch": 3.3310409138594244, "grad_norm": 2.375, "learning_rate": 4.98814373967531e-05, "loss": 0.1634, "mean_token_accuracy": 0.9539561152458191, "num_tokens": 24038307.0, "step": 14290 }, { "entropy": 0.1551351621747017, "epoch": 3.332206550880056, "grad_norm": 1.90625, "learning_rate": 4.988122756875881e-05, "loss": 0.2918, "mean_token_accuracy": 0.9345535576343537, "num_tokens": 24058169.0, "step": 14295 }, { "entropy": 0.11559103038161993, "epoch": 3.3333721879006877, "grad_norm": 2.703125, "learning_rate": 4.988101755614181e-05, "loss": 0.1565, "mean_token_accuracy": 0.9542689323425293, "num_tokens": 24083609.0, "step": 14300 }, { "entropy": 0.11027558147907257, "epoch": 3.3345378249213193, "grad_norm": 0.9140625, "learning_rate": 4.9880807358905205e-05, "loss": 0.1631, "mean_token_accuracy": 0.9572655200958252, "num_tokens": 24112635.0, "step": 14305 }, { "entropy": 0.12529565021395683, "epoch": 3.3357034619419514, "grad_norm": 6.71875, "learning_rate": 4.9880596977052146e-05, "loss": 0.208, "mean_token_accuracy": 0.9428850889205933, "num_tokens": 24127007.0, "step": 14310 }, { "entropy": 0.12923308033496142, "epoch": 3.336869098962583, "grad_norm": 0.578125, "learning_rate": 4.988038641058577e-05, "loss": 0.1843, "mean_token_accuracy": 0.9490243077278138, "num_tokens": 24155835.0, "step": 14315 }, { "entropy": 0.14287668615579605, "epoch": 3.3380347359832148, "grad_norm": 2.453125, "learning_rate": 4.98801756595092e-05, "loss": 0.188, "mean_token_accuracy": 0.9477911353111267, "num_tokens": 24178011.0, "step": 14320 }, { "entropy": 0.11310774460434914, "epoch": 3.3392003730038464, "grad_norm": 0.99609375, "learning_rate": 4.9879964723825586e-05, "loss": 0.1576, "mean_token_accuracy": 0.9627729594707489, "num_tokens": 24207402.0, "step": 14325 }, { "entropy": 0.12244891636073589, "epoch": 3.3403660100244785, "grad_norm": 1.203125, "learning_rate": 4.9879753603538095e-05, "loss": 0.1238, "mean_token_accuracy": 0.9554929971694947, "num_tokens": 24236148.0, "step": 14330 }, { "entropy": 0.17563813105225562, "epoch": 3.34153164704511, "grad_norm": 5.5625, "learning_rate": 4.987954229864984e-05, "loss": 0.2167, "mean_token_accuracy": 0.9393636643886566, "num_tokens": 24252465.0, "step": 14335 }, { "entropy": 0.13802102711051703, "epoch": 3.342697284065742, "grad_norm": 7.8125, "learning_rate": 4.9879330809164006e-05, "loss": 0.2489, "mean_token_accuracy": 0.9434120118618011, "num_tokens": 24264730.0, "step": 14340 }, { "entropy": 0.08878612257540226, "epoch": 3.343862921086374, "grad_norm": 0.59765625, "learning_rate": 4.987911913508372e-05, "loss": 0.1215, "mean_token_accuracy": 0.9721697270870209, "num_tokens": 24300175.0, "step": 14345 }, { "entropy": 0.10990535113960505, "epoch": 3.3450285581070056, "grad_norm": 6.75, "learning_rate": 4.9878907276412154e-05, "loss": 0.2078, "mean_token_accuracy": 0.9509715735912323, "num_tokens": 24318581.0, "step": 14350 }, { "entropy": 0.10521319657564163, "epoch": 3.3461941951276373, "grad_norm": 7.03125, "learning_rate": 4.987869523315245e-05, "loss": 0.1905, "mean_token_accuracy": 0.9640790045261383, "num_tokens": 24337633.0, "step": 14355 }, { "entropy": 0.14670916181057692, "epoch": 3.347359832148269, "grad_norm": 7.34375, "learning_rate": 4.9878483005307793e-05, "loss": 0.2265, "mean_token_accuracy": 0.9481269121170044, "num_tokens": 24355450.0, "step": 14360 }, { "entropy": 0.1167627077549696, "epoch": 3.3485254691689006, "grad_norm": 3.015625, "learning_rate": 4.987827059288133e-05, "loss": 0.1541, "mean_token_accuracy": 0.9560252726078033, "num_tokens": 24372481.0, "step": 14365 }, { "entropy": 0.08436639718711376, "epoch": 3.3496911061895327, "grad_norm": 2.03125, "learning_rate": 4.9878057995876235e-05, "loss": 0.1231, "mean_token_accuracy": 0.9654347062110901, "num_tokens": 24393139.0, "step": 14370 }, { "entropy": 0.15002177990972995, "epoch": 3.3508567432101644, "grad_norm": 4.84375, "learning_rate": 4.9877845214295685e-05, "loss": 0.1722, "mean_token_accuracy": 0.9522580683231354, "num_tokens": 24414782.0, "step": 14375 }, { "entropy": 0.16212302520871164, "epoch": 3.352022380230796, "grad_norm": 7.625, "learning_rate": 4.987763224814284e-05, "loss": 0.4094, "mean_token_accuracy": 0.9197640836238861, "num_tokens": 24427675.0, "step": 14380 }, { "entropy": 0.15881211534142495, "epoch": 3.353188017251428, "grad_norm": 5.03125, "learning_rate": 4.987741909742088e-05, "loss": 0.2321, "mean_token_accuracy": 0.9404888391494751, "num_tokens": 24444212.0, "step": 14385 }, { "entropy": 0.10046768113970757, "epoch": 3.35435365427206, "grad_norm": 0.5390625, "learning_rate": 4.987720576213299e-05, "loss": 0.1566, "mean_token_accuracy": 0.9620472431182862, "num_tokens": 24470465.0, "step": 14390 }, { "entropy": 0.1034924827516079, "epoch": 3.3555192912926914, "grad_norm": 1.2109375, "learning_rate": 4.987699224228234e-05, "loss": 0.212, "mean_token_accuracy": 0.9532204508781433, "num_tokens": 24483496.0, "step": 14395 }, { "entropy": 0.1358404979109764, "epoch": 3.356684928313323, "grad_norm": 2.0625, "learning_rate": 4.987677853787212e-05, "loss": 0.2205, "mean_token_accuracy": 0.9371620178222656, "num_tokens": 24494400.0, "step": 14400 }, { "entropy": 0.09902511592954397, "epoch": 3.3578505653339548, "grad_norm": 0.85546875, "learning_rate": 4.987656464890552e-05, "loss": 0.1139, "mean_token_accuracy": 0.9678740739822388, "num_tokens": 24535866.0, "step": 14405 }, { "entropy": 0.14913907796144485, "epoch": 3.359016202354587, "grad_norm": 6.5625, "learning_rate": 4.987635057538572e-05, "loss": 0.2283, "mean_token_accuracy": 0.9517066776752472, "num_tokens": 24543669.0, "step": 14410 }, { "entropy": 0.15391396917402744, "epoch": 3.3601818393752185, "grad_norm": 7.0, "learning_rate": 4.987613631731592e-05, "loss": 0.1576, "mean_token_accuracy": 0.941257655620575, "num_tokens": 24571538.0, "step": 14415 }, { "entropy": 0.10840295329689979, "epoch": 3.36134747639585, "grad_norm": 6.71875, "learning_rate": 4.987592187469932e-05, "loss": 0.2069, "mean_token_accuracy": 0.9558283507823944, "num_tokens": 24583454.0, "step": 14420 }, { "entropy": 0.1372427899390459, "epoch": 3.3625131134164823, "grad_norm": 0.6953125, "learning_rate": 4.98757072475391e-05, "loss": 0.1648, "mean_token_accuracy": 0.957892256975174, "num_tokens": 24600439.0, "step": 14425 }, { "entropy": 0.15529920905828476, "epoch": 3.363678750437114, "grad_norm": 8.375, "learning_rate": 4.987549243583848e-05, "loss": 0.2872, "mean_token_accuracy": 0.9361419081687927, "num_tokens": 24615485.0, "step": 14430 }, { "entropy": 0.14903669357299804, "epoch": 3.3648443874577456, "grad_norm": 0.9921875, "learning_rate": 4.9875277439600644e-05, "loss": 0.2364, "mean_token_accuracy": 0.9379843652248383, "num_tokens": 24644974.0, "step": 14435 }, { "entropy": 0.1466895282268524, "epoch": 3.3660100244783773, "grad_norm": 4.40625, "learning_rate": 4.9875062258828815e-05, "loss": 0.1642, "mean_token_accuracy": 0.9672025084495545, "num_tokens": 24669348.0, "step": 14440 }, { "entropy": 0.1412310428917408, "epoch": 3.3671756614990094, "grad_norm": 3.234375, "learning_rate": 4.98748468935262e-05, "loss": 0.2244, "mean_token_accuracy": 0.9454042732715606, "num_tokens": 24685191.0, "step": 14445 }, { "entropy": 0.10494980327785015, "epoch": 3.368341298519641, "grad_norm": 1.0390625, "learning_rate": 4.987463134369599e-05, "loss": 0.1908, "mean_token_accuracy": 0.9530590176582336, "num_tokens": 24709482.0, "step": 14450 }, { "entropy": 0.15416168235242367, "epoch": 3.3695069355402727, "grad_norm": 5.28125, "learning_rate": 4.987441560934142e-05, "loss": 0.2631, "mean_token_accuracy": 0.9420225262641907, "num_tokens": 24718356.0, "step": 14455 }, { "entropy": 0.14280086159706115, "epoch": 3.3706725725609044, "grad_norm": 4.625, "learning_rate": 4.9874199690465705e-05, "loss": 0.3272, "mean_token_accuracy": 0.9363387167453766, "num_tokens": 24727686.0, "step": 14460 }, { "entropy": 0.11754573006182908, "epoch": 3.3718382095815365, "grad_norm": 3.078125, "learning_rate": 4.987398358707206e-05, "loss": 0.1205, "mean_token_accuracy": 0.9647015869617462, "num_tokens": 24747975.0, "step": 14465 }, { "entropy": 0.31885806322097776, "epoch": 3.373003846602168, "grad_norm": 5.9375, "learning_rate": 4.98737672991637e-05, "loss": 0.6347, "mean_token_accuracy": 0.8757389962673188, "num_tokens": 24774500.0, "step": 14470 }, { "entropy": 0.13117293193936347, "epoch": 3.3741694836228, "grad_norm": 8.25, "learning_rate": 4.987355082674387e-05, "loss": 0.2232, "mean_token_accuracy": 0.9562461674213409, "num_tokens": 24794508.0, "step": 14475 }, { "entropy": 0.08942113667726517, "epoch": 3.3753351206434314, "grad_norm": 1.3046875, "learning_rate": 4.987333416981578e-05, "loss": 0.1175, "mean_token_accuracy": 0.9634603500366211, "num_tokens": 24815483.0, "step": 14480 }, { "entropy": 0.09471773691475391, "epoch": 3.3765007576640635, "grad_norm": 3.046875, "learning_rate": 4.987311732838267e-05, "loss": 0.0834, "mean_token_accuracy": 0.9728473007678986, "num_tokens": 24840356.0, "step": 14485 }, { "entropy": 0.1439421821385622, "epoch": 3.377666394684695, "grad_norm": 2.96875, "learning_rate": 4.9872900302447766e-05, "loss": 0.1813, "mean_token_accuracy": 0.9585707783699036, "num_tokens": 24867551.0, "step": 14490 }, { "entropy": 0.13271769210696222, "epoch": 3.378832031705327, "grad_norm": 3.453125, "learning_rate": 4.9872683092014315e-05, "loss": 0.1605, "mean_token_accuracy": 0.9479785680770874, "num_tokens": 24886496.0, "step": 14495 }, { "entropy": 0.11814107708632945, "epoch": 3.3799976687259585, "grad_norm": 2.640625, "learning_rate": 4.987246569708555e-05, "loss": 0.1814, "mean_token_accuracy": 0.9549219727516174, "num_tokens": 24896522.0, "step": 14500 }, { "entropy": 0.09513367228209972, "epoch": 3.3811633057465906, "grad_norm": 2.859375, "learning_rate": 4.9872248117664706e-05, "loss": 0.12, "mean_token_accuracy": 0.9677752256393433, "num_tokens": 24914316.0, "step": 14505 }, { "entropy": 0.0879001509398222, "epoch": 3.3823289427672223, "grad_norm": 8.75, "learning_rate": 4.987203035375503e-05, "loss": 0.1097, "mean_token_accuracy": 0.9770538091659546, "num_tokens": 24941795.0, "step": 14510 }, { "entropy": 0.11058845482766629, "epoch": 3.383494579787854, "grad_norm": 0.61328125, "learning_rate": 4.987181240535978e-05, "loss": 0.1229, "mean_token_accuracy": 0.9565304040908813, "num_tokens": 24980354.0, "step": 14515 }, { "entropy": 0.11777700698003173, "epoch": 3.384660216808486, "grad_norm": 0.328125, "learning_rate": 4.98715942724822e-05, "loss": 0.2174, "mean_token_accuracy": 0.955057030916214, "num_tokens": 25003226.0, "step": 14520 }, { "entropy": 0.10959577150642871, "epoch": 3.3858258538291177, "grad_norm": 4.03125, "learning_rate": 4.9871375955125535e-05, "loss": 0.12, "mean_token_accuracy": 0.9680540144443512, "num_tokens": 25031189.0, "step": 14525 }, { "entropy": 0.16477933339774609, "epoch": 3.3869914908497494, "grad_norm": 6.1875, "learning_rate": 4.9871157453293057e-05, "loss": 0.2975, "mean_token_accuracy": 0.9351774334907532, "num_tokens": 25053474.0, "step": 14530 }, { "entropy": 0.14881827160716057, "epoch": 3.388157127870381, "grad_norm": 0.73046875, "learning_rate": 4.987093876698801e-05, "loss": 0.1589, "mean_token_accuracy": 0.9479979932308197, "num_tokens": 25098699.0, "step": 14535 }, { "entropy": 0.14751329086720943, "epoch": 3.3893227648910127, "grad_norm": 2.15625, "learning_rate": 4.9870719896213654e-05, "loss": 0.2256, "mean_token_accuracy": 0.9514884114265442, "num_tokens": 25119598.0, "step": 14540 }, { "entropy": 0.13617292381823062, "epoch": 3.390488401911645, "grad_norm": 5.25, "learning_rate": 4.987050084097326e-05, "loss": 0.2124, "mean_token_accuracy": 0.9466132402420044, "num_tokens": 25139817.0, "step": 14545 }, { "entropy": 0.13683177419006826, "epoch": 3.3916540389322765, "grad_norm": 1.0625, "learning_rate": 4.987028160127009e-05, "loss": 0.287, "mean_token_accuracy": 0.9442753791809082, "num_tokens": 25160490.0, "step": 14550 }, { "entropy": 0.15925323218107224, "epoch": 3.392819675952908, "grad_norm": 2.53125, "learning_rate": 4.9870062177107425e-05, "loss": 0.1181, "mean_token_accuracy": 0.9543979823589325, "num_tokens": 25182754.0, "step": 14555 }, { "entropy": 0.09092184137552976, "epoch": 3.3939853129735402, "grad_norm": 0.5859375, "learning_rate": 4.986984256848852e-05, "loss": 0.105, "mean_token_accuracy": 0.9717929124832153, "num_tokens": 25211791.0, "step": 14560 }, { "entropy": 0.10400575455278158, "epoch": 3.395150949994172, "grad_norm": 0.8984375, "learning_rate": 4.986962277541665e-05, "loss": 0.1191, "mean_token_accuracy": 0.966571044921875, "num_tokens": 25232556.0, "step": 14565 }, { "entropy": 0.10869345776736736, "epoch": 3.3963165870148035, "grad_norm": 3.4375, "learning_rate": 4.986940279789511e-05, "loss": 0.26, "mean_token_accuracy": 0.9437666535377502, "num_tokens": 25243127.0, "step": 14570 }, { "entropy": 0.14265891797840596, "epoch": 3.397482224035435, "grad_norm": 8.8125, "learning_rate": 4.986918263592717e-05, "loss": 0.2428, "mean_token_accuracy": 0.947214013338089, "num_tokens": 25259552.0, "step": 14575 }, { "entropy": 0.12260825484991074, "epoch": 3.3986478610560673, "grad_norm": 5.34375, "learning_rate": 4.986896228951611e-05, "loss": 0.1781, "mean_token_accuracy": 0.9559799790382385, "num_tokens": 25282016.0, "step": 14580 }, { "entropy": 0.16009515076875686, "epoch": 3.399813498076699, "grad_norm": 7.4375, "learning_rate": 4.9868741758665216e-05, "loss": 0.2073, "mean_token_accuracy": 0.9487410664558411, "num_tokens": 25298413.0, "step": 14585 }, { "entropy": 0.10344462431967258, "epoch": 3.4009791350973306, "grad_norm": 1.4296875, "learning_rate": 4.986852104337778e-05, "loss": 0.097, "mean_token_accuracy": 0.9678920030593872, "num_tokens": 25323048.0, "step": 14590 }, { "entropy": 0.11637456268072129, "epoch": 3.4021447721179623, "grad_norm": 4.25, "learning_rate": 4.986830014365709e-05, "loss": 0.1875, "mean_token_accuracy": 0.9532294034957886, "num_tokens": 25344447.0, "step": 14595 }, { "entropy": 0.12720008194446564, "epoch": 3.4033104091385944, "grad_norm": 3.5625, "learning_rate": 4.9868079059506443e-05, "loss": 0.1928, "mean_token_accuracy": 0.9561345875263214, "num_tokens": 25362513.0, "step": 14600 }, { "entropy": 0.10051075369119644, "epoch": 3.404476046159226, "grad_norm": 0.66015625, "learning_rate": 4.986785779092914e-05, "loss": 0.1544, "mean_token_accuracy": 0.9672173917293548, "num_tokens": 25382170.0, "step": 14605 }, { "entropy": 0.21919383555650712, "epoch": 3.4056416831798577, "grad_norm": 2.125, "learning_rate": 4.986763633792847e-05, "loss": 0.2736, "mean_token_accuracy": 0.931623786687851, "num_tokens": 25395526.0, "step": 14610 }, { "entropy": 0.13503132686018943, "epoch": 3.4068073202004894, "grad_norm": 8.25, "learning_rate": 4.986741470050774e-05, "loss": 0.2221, "mean_token_accuracy": 0.9475953102111816, "num_tokens": 25407123.0, "step": 14615 }, { "entropy": 0.10371251739561557, "epoch": 3.4079729572211215, "grad_norm": 3.328125, "learning_rate": 4.986719287867025e-05, "loss": 0.1929, "mean_token_accuracy": 0.9592093110084534, "num_tokens": 25431016.0, "step": 14620 }, { "entropy": 0.09616609346121549, "epoch": 3.409138594241753, "grad_norm": 0.58984375, "learning_rate": 4.9866970872419324e-05, "loss": 0.1478, "mean_token_accuracy": 0.9624132871627807, "num_tokens": 25449463.0, "step": 14625 }, { "entropy": 0.10277366153895855, "epoch": 3.410304231262385, "grad_norm": 1.515625, "learning_rate": 4.986674868175826e-05, "loss": 0.2068, "mean_token_accuracy": 0.9529636323451995, "num_tokens": 25473260.0, "step": 14630 }, { "entropy": 0.1336559236049652, "epoch": 3.4114698682830165, "grad_norm": 4.40625, "learning_rate": 4.986652630669036e-05, "loss": 0.1747, "mean_token_accuracy": 0.9519469976425171, "num_tokens": 25488285.0, "step": 14635 }, { "entropy": 0.12686145789921283, "epoch": 3.4126355053036486, "grad_norm": 0.5625, "learning_rate": 4.9866303747218966e-05, "loss": 0.2085, "mean_token_accuracy": 0.9547830998897553, "num_tokens": 25511643.0, "step": 14640 }, { "entropy": 0.16887935139238835, "epoch": 3.4138011423242802, "grad_norm": 0.62109375, "learning_rate": 4.986608100334738e-05, "loss": 0.264, "mean_token_accuracy": 0.9450030744075775, "num_tokens": 25529182.0, "step": 14645 }, { "entropy": 0.13547539040446283, "epoch": 3.414966779344912, "grad_norm": 2.859375, "learning_rate": 4.9865858075078925e-05, "loss": 0.1122, "mean_token_accuracy": 0.9562264323234558, "num_tokens": 25549237.0, "step": 14650 }, { "entropy": 0.10412461049854756, "epoch": 3.416132416365544, "grad_norm": 2.140625, "learning_rate": 4.986563496241693e-05, "loss": 0.1686, "mean_token_accuracy": 0.9625400602817535, "num_tokens": 25572688.0, "step": 14655 }, { "entropy": 0.20502724535763264, "epoch": 3.4172980533861756, "grad_norm": 5.125, "learning_rate": 4.986541166536471e-05, "loss": 0.3598, "mean_token_accuracy": 0.9178643345832824, "num_tokens": 25601340.0, "step": 14660 }, { "entropy": 0.1213500926271081, "epoch": 3.4184636904068073, "grad_norm": 0.66796875, "learning_rate": 4.9865188183925614e-05, "loss": 0.0992, "mean_token_accuracy": 0.955825799703598, "num_tokens": 25640563.0, "step": 14665 }, { "entropy": 0.12477913033217192, "epoch": 3.419629327427439, "grad_norm": 6.0625, "learning_rate": 4.9864964518102955e-05, "loss": 0.1823, "mean_token_accuracy": 0.9556873321533204, "num_tokens": 25665320.0, "step": 14670 }, { "entropy": 0.11235918961465359, "epoch": 3.4207949644480706, "grad_norm": 3.703125, "learning_rate": 4.986474066790008e-05, "loss": 0.1455, "mean_token_accuracy": 0.9635839581489563, "num_tokens": 25677602.0, "step": 14675 }, { "entropy": 0.11041145771741867, "epoch": 3.4219606014687027, "grad_norm": 2.578125, "learning_rate": 4.986451663332033e-05, "loss": 0.1105, "mean_token_accuracy": 0.9651696503162384, "num_tokens": 25701403.0, "step": 14680 }, { "entropy": 0.11710798554122448, "epoch": 3.4231262384893344, "grad_norm": 3.171875, "learning_rate": 4.9864292414367035e-05, "loss": 0.1645, "mean_token_accuracy": 0.9639812111854553, "num_tokens": 25714663.0, "step": 14685 }, { "entropy": 0.15187625922262668, "epoch": 3.424291875509966, "grad_norm": 7.8125, "learning_rate": 4.986406801104354e-05, "loss": 0.2054, "mean_token_accuracy": 0.9400863349437714, "num_tokens": 25725105.0, "step": 14690 }, { "entropy": 0.17266749329864978, "epoch": 3.425457512530598, "grad_norm": 0.62109375, "learning_rate": 4.98638434233532e-05, "loss": 0.2918, "mean_token_accuracy": 0.9378050565719604, "num_tokens": 25754734.0, "step": 14695 }, { "entropy": 0.10930528789758683, "epoch": 3.42662314955123, "grad_norm": 2.734375, "learning_rate": 4.986361865129935e-05, "loss": 0.0861, "mean_token_accuracy": 0.9631672918796539, "num_tokens": 25782665.0, "step": 14700 }, { "entropy": 0.09926526248455048, "epoch": 3.4277887865718615, "grad_norm": 0.99609375, "learning_rate": 4.986339369488536e-05, "loss": 0.101, "mean_token_accuracy": 0.9671981871128082, "num_tokens": 25804543.0, "step": 14705 }, { "entropy": 0.12382793109863996, "epoch": 3.428954423592493, "grad_norm": 12.375, "learning_rate": 4.986316855411457e-05, "loss": 0.2453, "mean_token_accuracy": 0.9416598856449128, "num_tokens": 25824472.0, "step": 14710 }, { "entropy": 0.13590311929583548, "epoch": 3.4301200606131252, "grad_norm": 7.21875, "learning_rate": 4.986294322899035e-05, "loss": 0.3006, "mean_token_accuracy": 0.9415552854537964, "num_tokens": 25841402.0, "step": 14715 }, { "entropy": 0.1355541491881013, "epoch": 3.431285697633757, "grad_norm": 0.58203125, "learning_rate": 4.986271771951604e-05, "loss": 0.1188, "mean_token_accuracy": 0.9523917257785797, "num_tokens": 25862438.0, "step": 14720 }, { "entropy": 0.1305895209312439, "epoch": 3.4324513346543886, "grad_norm": 1.421875, "learning_rate": 4.986249202569501e-05, "loss": 0.2254, "mean_token_accuracy": 0.9471198916435242, "num_tokens": 25876083.0, "step": 14725 }, { "entropy": 0.15410227999091147, "epoch": 3.4336169716750202, "grad_norm": 3.671875, "learning_rate": 4.986226614753064e-05, "loss": 0.3039, "mean_token_accuracy": 0.9457458674907684, "num_tokens": 25883947.0, "step": 14730 }, { "entropy": 0.13728398755192756, "epoch": 3.4347826086956523, "grad_norm": 3.0, "learning_rate": 4.9862040085026286e-05, "loss": 0.203, "mean_token_accuracy": 0.9486221551895142, "num_tokens": 25901558.0, "step": 14735 }, { "entropy": 0.08139637187123298, "epoch": 3.435948245716284, "grad_norm": 0.6875, "learning_rate": 4.986181383818532e-05, "loss": 0.1275, "mean_token_accuracy": 0.9661568462848663, "num_tokens": 25936465.0, "step": 14740 }, { "entropy": 0.21345613710582256, "epoch": 3.4371138827369156, "grad_norm": 1.9765625, "learning_rate": 4.986158740701112e-05, "loss": 0.2902, "mean_token_accuracy": 0.9319839298725128, "num_tokens": 25964623.0, "step": 14745 }, { "entropy": 0.12739054299890995, "epoch": 3.4382795197575473, "grad_norm": 5.21875, "learning_rate": 4.986136079150705e-05, "loss": 0.2276, "mean_token_accuracy": 0.9527396023273468, "num_tokens": 25974800.0, "step": 14750 }, { "entropy": 0.12611738108098508, "epoch": 3.4394451567781794, "grad_norm": 0.6953125, "learning_rate": 4.98611339916765e-05, "loss": 0.1493, "mean_token_accuracy": 0.964297991991043, "num_tokens": 25995660.0, "step": 14755 }, { "entropy": 0.09796325005590915, "epoch": 3.440610793798811, "grad_norm": 12.0625, "learning_rate": 4.9860907007522853e-05, "loss": 0.1836, "mean_token_accuracy": 0.9559937536716461, "num_tokens": 26013294.0, "step": 14760 }, { "entropy": 0.10678284913301468, "epoch": 3.4417764308194427, "grad_norm": 1.4453125, "learning_rate": 4.98606798390495e-05, "loss": 0.1968, "mean_token_accuracy": 0.9488790988922119, "num_tokens": 26032299.0, "step": 14765 }, { "entropy": 0.0958446266129613, "epoch": 3.4429420678400744, "grad_norm": 1.1015625, "learning_rate": 4.9860452486259806e-05, "loss": 0.1872, "mean_token_accuracy": 0.9606073677539826, "num_tokens": 26055375.0, "step": 14770 }, { "entropy": 0.17819878049194812, "epoch": 3.4441077048607065, "grad_norm": 12.125, "learning_rate": 4.9860224949157175e-05, "loss": 0.2588, "mean_token_accuracy": 0.9511879086494446, "num_tokens": 26075362.0, "step": 14775 }, { "entropy": 0.10445505864918232, "epoch": 3.445273341881338, "grad_norm": 0.7421875, "learning_rate": 4.985999722774501e-05, "loss": 0.0783, "mean_token_accuracy": 0.96468505859375, "num_tokens": 26102862.0, "step": 14780 }, { "entropy": 0.1406596891582012, "epoch": 3.44643897890197, "grad_norm": 14.75, "learning_rate": 4.985976932202668e-05, "loss": 0.2339, "mean_token_accuracy": 0.9482826471328736, "num_tokens": 26120838.0, "step": 14785 }, { "entropy": 0.18683748096227645, "epoch": 3.447604615922602, "grad_norm": 0.97265625, "learning_rate": 4.9859541232005616e-05, "loss": 0.2159, "mean_token_accuracy": 0.9524526178836823, "num_tokens": 26134765.0, "step": 14790 }, { "entropy": 0.11773408502340317, "epoch": 3.4487702529432336, "grad_norm": 3.109375, "learning_rate": 4.985931295768519e-05, "loss": 0.2107, "mean_token_accuracy": 0.9583574175834656, "num_tokens": 26147152.0, "step": 14795 }, { "entropy": 0.16266305055469274, "epoch": 3.4499358899638652, "grad_norm": 2.125, "learning_rate": 4.985908449906882e-05, "loss": 0.1388, "mean_token_accuracy": 0.9389150321483613, "num_tokens": 26179377.0, "step": 14800 }, { "entropy": 0.14590467549860478, "epoch": 3.451101526984497, "grad_norm": 1.171875, "learning_rate": 4.985885585615991e-05, "loss": 0.2239, "mean_token_accuracy": 0.9484518945217133, "num_tokens": 26204972.0, "step": 14805 }, { "entropy": 0.08754877429455518, "epoch": 3.4522671640051286, "grad_norm": 1.3359375, "learning_rate": 4.985862702896188e-05, "loss": 0.0818, "mean_token_accuracy": 0.9678203165531158, "num_tokens": 26229849.0, "step": 14810 }, { "entropy": 0.1233688484877348, "epoch": 3.4534328010257607, "grad_norm": 5.40625, "learning_rate": 4.985839801747812e-05, "loss": 0.17, "mean_token_accuracy": 0.9536608278751373, "num_tokens": 26253483.0, "step": 14815 }, { "entropy": 0.12931798473000528, "epoch": 3.4545984380463923, "grad_norm": 7.75, "learning_rate": 4.9858168821712065e-05, "loss": 0.2308, "mean_token_accuracy": 0.9489045977592468, "num_tokens": 26269779.0, "step": 14820 }, { "entropy": 0.08671133350580931, "epoch": 3.455764075067024, "grad_norm": 0.66796875, "learning_rate": 4.9857939441667125e-05, "loss": 0.0994, "mean_token_accuracy": 0.9656893134117126, "num_tokens": 26300644.0, "step": 14825 }, { "entropy": 0.09761807462200522, "epoch": 3.456929712087656, "grad_norm": 6.125, "learning_rate": 4.985770987734672e-05, "loss": 0.1859, "mean_token_accuracy": 0.9515304028987884, "num_tokens": 26319996.0, "step": 14830 }, { "entropy": 0.2023943942040205, "epoch": 3.4580953491082878, "grad_norm": 4.6875, "learning_rate": 4.985748012875427e-05, "loss": 0.2977, "mean_token_accuracy": 0.9330312609672546, "num_tokens": 26341742.0, "step": 14835 }, { "entropy": 0.19122447371482848, "epoch": 3.4592609861289194, "grad_norm": 3.34375, "learning_rate": 4.985725019589321e-05, "loss": 0.2801, "mean_token_accuracy": 0.9373561441898346, "num_tokens": 26367285.0, "step": 14840 }, { "entropy": 0.124190529063344, "epoch": 3.460426623149551, "grad_norm": 1.9453125, "learning_rate": 4.985702007876696e-05, "loss": 0.186, "mean_token_accuracy": 0.9488656878471374, "num_tokens": 26378772.0, "step": 14845 }, { "entropy": 0.13527803476899863, "epoch": 3.461592260170183, "grad_norm": 6.375, "learning_rate": 4.985678977737895e-05, "loss": 0.208, "mean_token_accuracy": 0.9437292635440826, "num_tokens": 26396490.0, "step": 14850 }, { "entropy": 0.09913870450109244, "epoch": 3.462757897190815, "grad_norm": 1.40625, "learning_rate": 4.985655929173263e-05, "loss": 0.1981, "mean_token_accuracy": 0.950927072763443, "num_tokens": 26413952.0, "step": 14855 }, { "entropy": 0.10881685335189104, "epoch": 3.4639235342114465, "grad_norm": 0.546875, "learning_rate": 4.985632862183142e-05, "loss": 0.1164, "mean_token_accuracy": 0.9676409482955932, "num_tokens": 26441690.0, "step": 14860 }, { "entropy": 0.11910134218633175, "epoch": 3.465089171232078, "grad_norm": 5.0, "learning_rate": 4.9856097767678764e-05, "loss": 0.1886, "mean_token_accuracy": 0.9555825769901276, "num_tokens": 26465495.0, "step": 14865 }, { "entropy": 0.12688817270100117, "epoch": 3.4662548082527103, "grad_norm": 11.5625, "learning_rate": 4.9855866729278114e-05, "loss": 0.1772, "mean_token_accuracy": 0.951020085811615, "num_tokens": 26486507.0, "step": 14870 }, { "entropy": 0.17504406850785018, "epoch": 3.467420445273342, "grad_norm": 4.34375, "learning_rate": 4.985563550663289e-05, "loss": 0.2582, "mean_token_accuracy": 0.9411736011505127, "num_tokens": 26503076.0, "step": 14875 }, { "entropy": 0.18092816174030305, "epoch": 3.4685860822939736, "grad_norm": 4.5, "learning_rate": 4.9855404099746574e-05, "loss": 0.3976, "mean_token_accuracy": 0.929013067483902, "num_tokens": 26511528.0, "step": 14880 }, { "entropy": 0.09570451155304908, "epoch": 3.4697517193146052, "grad_norm": 0.65234375, "learning_rate": 4.985517250862259e-05, "loss": 0.1375, "mean_token_accuracy": 0.9654882431030274, "num_tokens": 26541793.0, "step": 14885 }, { "entropy": 0.10951367020606995, "epoch": 3.4709173563352373, "grad_norm": 8.0, "learning_rate": 4.98549407332644e-05, "loss": 0.1434, "mean_token_accuracy": 0.9649019360542297, "num_tokens": 26557800.0, "step": 14890 }, { "entropy": 0.20391902434639633, "epoch": 3.472082993355869, "grad_norm": 4.125, "learning_rate": 4.985470877367546e-05, "loss": 0.4358, "mean_token_accuracy": 0.9307071447372437, "num_tokens": 26584988.0, "step": 14895 }, { "entropy": 0.1240554254502058, "epoch": 3.4732486303765007, "grad_norm": 4.15625, "learning_rate": 4.985447662985924e-05, "loss": 0.2078, "mean_token_accuracy": 0.9551225781440735, "num_tokens": 26611675.0, "step": 14900 }, { "entropy": 0.07346545197069645, "epoch": 3.4744142673971323, "grad_norm": 2.46875, "learning_rate": 4.985424430181918e-05, "loss": 0.0766, "mean_token_accuracy": 0.9802018702030182, "num_tokens": 26637716.0, "step": 14905 }, { "entropy": 0.10290342718362808, "epoch": 3.4755799044177644, "grad_norm": 0.7734375, "learning_rate": 4.9854011789558764e-05, "loss": 0.13, "mean_token_accuracy": 0.963810783624649, "num_tokens": 26661275.0, "step": 14910 }, { "entropy": 0.1287055004388094, "epoch": 3.476745541438396, "grad_norm": 1.75, "learning_rate": 4.985377909308144e-05, "loss": 0.1955, "mean_token_accuracy": 0.9553277611732482, "num_tokens": 26677624.0, "step": 14915 }, { "entropy": 0.11266841646283865, "epoch": 3.4779111784590278, "grad_norm": 3.546875, "learning_rate": 4.98535462123907e-05, "loss": 0.126, "mean_token_accuracy": 0.9733292937278748, "num_tokens": 26693952.0, "step": 14920 }, { "entropy": 0.1386932224035263, "epoch": 3.47907681547966, "grad_norm": 1.0078125, "learning_rate": 4.985331314749e-05, "loss": 0.2552, "mean_token_accuracy": 0.9495406329631806, "num_tokens": 26714106.0, "step": 14925 }, { "entropy": 0.10363805964589119, "epoch": 3.4802424525002915, "grad_norm": 4.0625, "learning_rate": 4.985307989838282e-05, "loss": 0.1596, "mean_token_accuracy": 0.9670630037784577, "num_tokens": 26737082.0, "step": 14930 }, { "entropy": 0.08988064369186759, "epoch": 3.481408089520923, "grad_norm": 4.34375, "learning_rate": 4.985284646507264e-05, "loss": 0.1481, "mean_token_accuracy": 0.9675386667251586, "num_tokens": 26760018.0, "step": 14935 }, { "entropy": 0.0987139768898487, "epoch": 3.482573726541555, "grad_norm": 0.6328125, "learning_rate": 4.9852612847562936e-05, "loss": 0.1844, "mean_token_accuracy": 0.9589445412158966, "num_tokens": 26779761.0, "step": 14940 }, { "entropy": 0.12049501091241836, "epoch": 3.4837393635621865, "grad_norm": 9.5625, "learning_rate": 4.98523790458572e-05, "loss": 0.2723, "mean_token_accuracy": 0.9480070292949676, "num_tokens": 26789666.0, "step": 14945 }, { "entropy": 0.08247858472168446, "epoch": 3.4849050005828186, "grad_norm": 1.1875, "learning_rate": 4.985214505995891e-05, "loss": 0.1239, "mean_token_accuracy": 0.9717278420925141, "num_tokens": 26813109.0, "step": 14950 }, { "entropy": 0.12373597361147404, "epoch": 3.4860706376034503, "grad_norm": 0.71484375, "learning_rate": 4.9851910889871554e-05, "loss": 0.1562, "mean_token_accuracy": 0.9588066041469574, "num_tokens": 26843536.0, "step": 14955 }, { "entropy": 0.16234209034591912, "epoch": 3.487236274624082, "grad_norm": 1.515625, "learning_rate": 4.985167653559864e-05, "loss": 0.2294, "mean_token_accuracy": 0.9414169251918793, "num_tokens": 26870752.0, "step": 14960 }, { "entropy": 0.1290282540023327, "epoch": 3.488401911644714, "grad_norm": 6.125, "learning_rate": 4.9851441997143646e-05, "loss": 0.2052, "mean_token_accuracy": 0.9569578051567078, "num_tokens": 26890897.0, "step": 14965 }, { "entropy": 0.12936341762542725, "epoch": 3.4895675486653457, "grad_norm": 6.0625, "learning_rate": 4.985120727451007e-05, "loss": 0.1832, "mean_token_accuracy": 0.9518023371696472, "num_tokens": 26911675.0, "step": 14970 }, { "entropy": 0.11797473523765803, "epoch": 3.4907331856859773, "grad_norm": 2.9375, "learning_rate": 4.985097236770142e-05, "loss": 0.1362, "mean_token_accuracy": 0.9681229948997497, "num_tokens": 26931840.0, "step": 14975 }, { "entropy": 0.12061390187591314, "epoch": 3.491898822706609, "grad_norm": 7.0, "learning_rate": 4.985073727672119e-05, "loss": 0.2357, "mean_token_accuracy": 0.9532876312732697, "num_tokens": 26944348.0, "step": 14980 }, { "entropy": 0.13178243041038512, "epoch": 3.493064459727241, "grad_norm": 6.21875, "learning_rate": 4.9850502001572905e-05, "loss": 0.212, "mean_token_accuracy": 0.9557155132293701, "num_tokens": 26962502.0, "step": 14985 }, { "entropy": 0.09238446112722158, "epoch": 3.4942300967478728, "grad_norm": 4.4375, "learning_rate": 4.9850266542260044e-05, "loss": 0.1369, "mean_token_accuracy": 0.9672260642051697, "num_tokens": 26979970.0, "step": 14990 }, { "entropy": 0.11178217800334096, "epoch": 3.4953957337685044, "grad_norm": 0.61328125, "learning_rate": 4.985003089878614e-05, "loss": 0.1052, "mean_token_accuracy": 0.9628475129604339, "num_tokens": 27000359.0, "step": 14995 }, { "entropy": 0.08186651319265366, "epoch": 3.496561370789136, "grad_norm": 0.390625, "learning_rate": 4.9849795071154696e-05, "loss": 0.0904, "mean_token_accuracy": 0.9762087106704712, "num_tokens": 27037109.0, "step": 15000 }, { "entropy": 0.11758413314819335, "epoch": 3.497727007809768, "grad_norm": 6.15625, "learning_rate": 4.9849559059369236e-05, "loss": 0.1745, "mean_token_accuracy": 0.9557598829269409, "num_tokens": 27053484.0, "step": 15005 }, { "entropy": 0.09997645281255245, "epoch": 3.4988926448304, "grad_norm": 1.0546875, "learning_rate": 4.984932286343327e-05, "loss": 0.152, "mean_token_accuracy": 0.9670864582061768, "num_tokens": 27076982.0, "step": 15010 }, { "entropy": 0.10639596097171307, "epoch": 3.5000582818510315, "grad_norm": 5.59375, "learning_rate": 4.984908648335033e-05, "loss": 0.1861, "mean_token_accuracy": 0.957602858543396, "num_tokens": 27090182.0, "step": 15015 }, { "entropy": 0.10824050679802895, "epoch": 3.5012239188716636, "grad_norm": 3.765625, "learning_rate": 4.984884991912394e-05, "loss": 0.1751, "mean_token_accuracy": 0.9627082347869873, "num_tokens": 27101437.0, "step": 15020 }, { "entropy": 0.1901700021699071, "epoch": 3.5023895558922953, "grad_norm": 4.78125, "learning_rate": 4.984861317075762e-05, "loss": 0.2803, "mean_token_accuracy": 0.9506976902484894, "num_tokens": 27115517.0, "step": 15025 }, { "entropy": 0.08258118499070406, "epoch": 3.503555192912927, "grad_norm": 1.0234375, "learning_rate": 4.98483762382549e-05, "loss": 0.0934, "mean_token_accuracy": 0.9727859675884247, "num_tokens": 27139907.0, "step": 15030 }, { "entropy": 0.1136773657053709, "epoch": 3.5047208299335586, "grad_norm": 6.59375, "learning_rate": 4.984813912161932e-05, "loss": 0.1184, "mean_token_accuracy": 0.9658208072185517, "num_tokens": 27170803.0, "step": 15035 }, { "entropy": 0.1513181956484914, "epoch": 3.5058864669541903, "grad_norm": 4.90625, "learning_rate": 4.984790182085442e-05, "loss": 0.2491, "mean_token_accuracy": 0.943081659078598, "num_tokens": 27201574.0, "step": 15040 }, { "entropy": 0.0724037921987474, "epoch": 3.5070521039748224, "grad_norm": 1.4765625, "learning_rate": 4.984766433596372e-05, "loss": 0.0629, "mean_token_accuracy": 0.9772098004817963, "num_tokens": 27232309.0, "step": 15045 }, { "entropy": 0.1291754100471735, "epoch": 3.508217740995454, "grad_norm": 5.46875, "learning_rate": 4.984742666695078e-05, "loss": 0.2822, "mean_token_accuracy": 0.9417845368385315, "num_tokens": 27242969.0, "step": 15050 }, { "entropy": 0.12059564888477325, "epoch": 3.5093833780160857, "grad_norm": 4.34375, "learning_rate": 4.984718881381914e-05, "loss": 0.2573, "mean_token_accuracy": 0.9473318874835968, "num_tokens": 27256073.0, "step": 15055 }, { "entropy": 0.11320340782403945, "epoch": 3.510549015036718, "grad_norm": 1.3984375, "learning_rate": 4.984695077657234e-05, "loss": 0.2205, "mean_token_accuracy": 0.9602530896663666, "num_tokens": 27268498.0, "step": 15060 }, { "entropy": 0.17097560949623586, "epoch": 3.5117146520573495, "grad_norm": 6.0, "learning_rate": 4.984671255521393e-05, "loss": 0.265, "mean_token_accuracy": 0.944575160741806, "num_tokens": 27287806.0, "step": 15065 }, { "entropy": 0.11076088473200799, "epoch": 3.512880289077981, "grad_norm": 4.0625, "learning_rate": 4.984647414974747e-05, "loss": 0.0947, "mean_token_accuracy": 0.9570927262306214, "num_tokens": 27312827.0, "step": 15070 }, { "entropy": 0.15406885892152786, "epoch": 3.5140459260986128, "grad_norm": 1.390625, "learning_rate": 4.9846235560176505e-05, "loss": 0.2675, "mean_token_accuracy": 0.9428991496562957, "num_tokens": 27334152.0, "step": 15075 }, { "entropy": 0.126622261852026, "epoch": 3.5152115631192444, "grad_norm": 2.578125, "learning_rate": 4.98459967865046e-05, "loss": 0.2088, "mean_token_accuracy": 0.9512693524360657, "num_tokens": 27344318.0, "step": 15080 }, { "entropy": 0.16098451465368271, "epoch": 3.5163772001398765, "grad_norm": 3.921875, "learning_rate": 4.984575782873532e-05, "loss": 0.2481, "mean_token_accuracy": 0.9495172142982483, "num_tokens": 27367074.0, "step": 15085 }, { "entropy": 0.178832789324224, "epoch": 3.517542837160508, "grad_norm": 13.25, "learning_rate": 4.9845518686872215e-05, "loss": 0.2985, "mean_token_accuracy": 0.9411785483360291, "num_tokens": 27383701.0, "step": 15090 }, { "entropy": 0.13766136653721334, "epoch": 3.51870847418114, "grad_norm": 0.8671875, "learning_rate": 4.984527936091885e-05, "loss": 0.2352, "mean_token_accuracy": 0.9489962339401246, "num_tokens": 27399990.0, "step": 15095 }, { "entropy": 0.14877081848680973, "epoch": 3.519874111201772, "grad_norm": 10.8125, "learning_rate": 4.984503985087882e-05, "loss": 0.21, "mean_token_accuracy": 0.9492302298545837, "num_tokens": 27413742.0, "step": 15100 }, { "entropy": 0.15225835423916578, "epoch": 3.5210397482224036, "grad_norm": 10.9375, "learning_rate": 4.9844800156755665e-05, "loss": 0.2496, "mean_token_accuracy": 0.9365932464599609, "num_tokens": 27426570.0, "step": 15105 }, { "entropy": 0.11053324565291404, "epoch": 3.5222053852430353, "grad_norm": 1.3984375, "learning_rate": 4.9844560278552976e-05, "loss": 0.1407, "mean_token_accuracy": 0.9656516313552856, "num_tokens": 27454187.0, "step": 15110 }, { "entropy": 0.0999959884211421, "epoch": 3.523371022263667, "grad_norm": 0.80078125, "learning_rate": 4.9844320216274326e-05, "loss": 0.1142, "mean_token_accuracy": 0.9690761923789978, "num_tokens": 27475207.0, "step": 15115 }, { "entropy": 0.1034471170976758, "epoch": 3.5245366592842986, "grad_norm": 5.46875, "learning_rate": 4.9844079969923295e-05, "loss": 0.1016, "mean_token_accuracy": 0.9620785713195801, "num_tokens": 27494393.0, "step": 15120 }, { "entropy": 0.12301951255649328, "epoch": 3.5257022963049307, "grad_norm": 2.59375, "learning_rate": 4.984383953950346e-05, "loss": 0.1582, "mean_token_accuracy": 0.9564571857452393, "num_tokens": 27522349.0, "step": 15125 }, { "entropy": 0.1520868118852377, "epoch": 3.5268679333255624, "grad_norm": 6.875, "learning_rate": 4.984359892501842e-05, "loss": 0.32, "mean_token_accuracy": 0.9273856461048127, "num_tokens": 27533402.0, "step": 15130 }, { "entropy": 0.1576168665662408, "epoch": 3.528033570346194, "grad_norm": 4.4375, "learning_rate": 4.9843358126471746e-05, "loss": 0.2757, "mean_token_accuracy": 0.9358682692050934, "num_tokens": 27555411.0, "step": 15135 }, { "entropy": 0.16003640741109848, "epoch": 3.529199207366826, "grad_norm": 8.4375, "learning_rate": 4.984311714386704e-05, "loss": 0.2395, "mean_token_accuracy": 0.9503751337528229, "num_tokens": 27563017.0, "step": 15140 }, { "entropy": 0.1150142002850771, "epoch": 3.530364844387458, "grad_norm": 0.67578125, "learning_rate": 4.984287597720789e-05, "loss": 0.2614, "mean_token_accuracy": 0.952860701084137, "num_tokens": 27586379.0, "step": 15145 }, { "entropy": 0.1206765715032816, "epoch": 3.5315304814080895, "grad_norm": 5.84375, "learning_rate": 4.98426346264979e-05, "loss": 0.2821, "mean_token_accuracy": 0.9396198153495788, "num_tokens": 27598286.0, "step": 15150 }, { "entropy": 0.14196645841002464, "epoch": 3.5326961184287216, "grad_norm": 14.6875, "learning_rate": 4.984239309174066e-05, "loss": 0.3251, "mean_token_accuracy": 0.9355909168720246, "num_tokens": 27608408.0, "step": 15155 }, { "entropy": 0.11324986163526773, "epoch": 3.533861755449353, "grad_norm": 4.75, "learning_rate": 4.984215137293978e-05, "loss": 0.1432, "mean_token_accuracy": 0.963469636440277, "num_tokens": 27627909.0, "step": 15160 }, { "entropy": 0.10810782052576542, "epoch": 3.535027392469985, "grad_norm": 2.75, "learning_rate": 4.984190947009885e-05, "loss": 0.1521, "mean_token_accuracy": 0.9573879361152648, "num_tokens": 27644139.0, "step": 15165 }, { "entropy": 0.1313298497349024, "epoch": 3.5361930294906165, "grad_norm": 1.4453125, "learning_rate": 4.9841667383221494e-05, "loss": 0.2283, "mean_token_accuracy": 0.9522750377655029, "num_tokens": 27654841.0, "step": 15170 }, { "entropy": 0.08479751572012902, "epoch": 3.537358666511248, "grad_norm": 1.984375, "learning_rate": 4.984142511231131e-05, "loss": 0.0905, "mean_token_accuracy": 0.966051709651947, "num_tokens": 27679994.0, "step": 15175 }, { "entropy": 0.10010197125375271, "epoch": 3.5385243035318803, "grad_norm": 1.4765625, "learning_rate": 4.9841182657371913e-05, "loss": 0.134, "mean_token_accuracy": 0.96122225522995, "num_tokens": 27700305.0, "step": 15180 }, { "entropy": 0.12406671978533268, "epoch": 3.539689940552512, "grad_norm": 4.59375, "learning_rate": 4.984094001840693e-05, "loss": 0.1602, "mean_token_accuracy": 0.9643674075603486, "num_tokens": 27722193.0, "step": 15185 }, { "entropy": 0.2311142822727561, "epoch": 3.5408555775731436, "grad_norm": 2.53125, "learning_rate": 4.984069719541996e-05, "loss": 0.4184, "mean_token_accuracy": 0.9141157269477844, "num_tokens": 27746084.0, "step": 15190 }, { "entropy": 0.10267463922500611, "epoch": 3.5420212145937757, "grad_norm": 1.25, "learning_rate": 4.984045418841464e-05, "loss": 0.1895, "mean_token_accuracy": 0.9577461659908295, "num_tokens": 27762974.0, "step": 15195 }, { "entropy": 0.142679588124156, "epoch": 3.5431868516144074, "grad_norm": 2.96875, "learning_rate": 4.984021099739458e-05, "loss": 0.2069, "mean_token_accuracy": 0.9549462914466857, "num_tokens": 27781410.0, "step": 15200 }, { "entropy": 0.09814808573573827, "epoch": 3.544352488635039, "grad_norm": 4.375, "learning_rate": 4.983996762236342e-05, "loss": 0.1375, "mean_token_accuracy": 0.9615116119384766, "num_tokens": 27803041.0, "step": 15205 }, { "entropy": 0.15655723661184312, "epoch": 3.5455181256556707, "grad_norm": 3.875, "learning_rate": 4.9839724063324775e-05, "loss": 0.2196, "mean_token_accuracy": 0.9429301500320435, "num_tokens": 27824317.0, "step": 15210 }, { "entropy": 0.12263799570500851, "epoch": 3.5466837626763024, "grad_norm": 7.53125, "learning_rate": 4.9839480320282295e-05, "loss": 0.1773, "mean_token_accuracy": 0.9546642303466797, "num_tokens": 27839225.0, "step": 15215 }, { "entropy": 0.08694457476958632, "epoch": 3.5478493996969345, "grad_norm": 0.6640625, "learning_rate": 4.98392363932396e-05, "loss": 0.0964, "mean_token_accuracy": 0.9705213010311127, "num_tokens": 27874861.0, "step": 15220 }, { "entropy": 0.13864086512476206, "epoch": 3.549015036717566, "grad_norm": 7.25, "learning_rate": 4.983899228220032e-05, "loss": 0.2244, "mean_token_accuracy": 0.9488819360733032, "num_tokens": 27902160.0, "step": 15225 }, { "entropy": 0.1462076909840107, "epoch": 3.550180673738198, "grad_norm": 4.65625, "learning_rate": 4.983874798716812e-05, "loss": 0.1449, "mean_token_accuracy": 0.9579621255397797, "num_tokens": 27921363.0, "step": 15230 }, { "entropy": 0.15108269155025483, "epoch": 3.55134631075883, "grad_norm": 10.4375, "learning_rate": 4.983850350814662e-05, "loss": 0.2535, "mean_token_accuracy": 0.9454373240470886, "num_tokens": 27939954.0, "step": 15235 }, { "entropy": 0.1413711801171303, "epoch": 3.5525119477794616, "grad_norm": 7.78125, "learning_rate": 4.983825884513948e-05, "loss": 0.1943, "mean_token_accuracy": 0.9570164740085602, "num_tokens": 27960307.0, "step": 15240 }, { "entropy": 0.16688954643905163, "epoch": 3.553677584800093, "grad_norm": 4.40625, "learning_rate": 4.983801399815035e-05, "loss": 0.1522, "mean_token_accuracy": 0.9553694128990173, "num_tokens": 27971180.0, "step": 15245 }, { "entropy": 0.11758850514888763, "epoch": 3.554843221820725, "grad_norm": 4.3125, "learning_rate": 4.9837768967182855e-05, "loss": 0.2191, "mean_token_accuracy": 0.9551753103733063, "num_tokens": 27981929.0, "step": 15250 }, { "entropy": 0.08926377333700657, "epoch": 3.5560088588413565, "grad_norm": 2.5625, "learning_rate": 4.983752375224068e-05, "loss": 0.1264, "mean_token_accuracy": 0.970320975780487, "num_tokens": 27997156.0, "step": 15255 }, { "entropy": 0.07973122633993626, "epoch": 3.5571744958619886, "grad_norm": 5.46875, "learning_rate": 4.9837278353327466e-05, "loss": 0.1264, "mean_token_accuracy": 0.9706411957740784, "num_tokens": 28015681.0, "step": 15260 }, { "entropy": 0.12640931233763694, "epoch": 3.5583401328826203, "grad_norm": 7.28125, "learning_rate": 4.9837032770446876e-05, "loss": 0.257, "mean_token_accuracy": 0.9483764410018921, "num_tokens": 28029400.0, "step": 15265 }, { "entropy": 0.16228528693318367, "epoch": 3.559505769903252, "grad_norm": 1.2734375, "learning_rate": 4.9836787003602575e-05, "loss": 0.3166, "mean_token_accuracy": 0.9338264346122742, "num_tokens": 28045344.0, "step": 15270 }, { "entropy": 0.11942652426660061, "epoch": 3.560671406923884, "grad_norm": 9.375, "learning_rate": 4.983654105279822e-05, "loss": 0.1878, "mean_token_accuracy": 0.9502759516239166, "num_tokens": 28066127.0, "step": 15275 }, { "entropy": 0.11859394274652005, "epoch": 3.5618370439445157, "grad_norm": 0.55078125, "learning_rate": 4.983629491803748e-05, "loss": 0.1743, "mean_token_accuracy": 0.9581486761569977, "num_tokens": 28081534.0, "step": 15280 }, { "entropy": 0.13153154766187072, "epoch": 3.5630026809651474, "grad_norm": 6.375, "learning_rate": 4.983604859932404e-05, "loss": 0.203, "mean_token_accuracy": 0.9452811002731323, "num_tokens": 28098550.0, "step": 15285 }, { "entropy": 0.11013969406485558, "epoch": 3.5641683179857795, "grad_norm": 5.03125, "learning_rate": 4.983580209666155e-05, "loss": 0.1825, "mean_token_accuracy": 0.9638396859169006, "num_tokens": 28111009.0, "step": 15290 }, { "entropy": 0.162632117792964, "epoch": 3.565333955006411, "grad_norm": 6.71875, "learning_rate": 4.9835555410053696e-05, "loss": 0.2456, "mean_token_accuracy": 0.943684995174408, "num_tokens": 28129844.0, "step": 15295 }, { "entropy": 0.158411131054163, "epoch": 3.566499592027043, "grad_norm": 3.921875, "learning_rate": 4.983530853950416e-05, "loss": 0.2244, "mean_token_accuracy": 0.9570217251777648, "num_tokens": 28152006.0, "step": 15300 }, { "entropy": 0.10298453513532876, "epoch": 3.5676652290476745, "grad_norm": 3.625, "learning_rate": 4.983506148501662e-05, "loss": 0.154, "mean_token_accuracy": 0.9592744171619415, "num_tokens": 28176371.0, "step": 15305 }, { "entropy": 0.12455929182469845, "epoch": 3.568830866068306, "grad_norm": 3.0, "learning_rate": 4.983481424659476e-05, "loss": 0.1722, "mean_token_accuracy": 0.9571922421455383, "num_tokens": 28194824.0, "step": 15310 }, { "entropy": 0.17322831777855754, "epoch": 3.5699965030889382, "grad_norm": 8.4375, "learning_rate": 4.983456682424227e-05, "loss": 0.3505, "mean_token_accuracy": 0.9319654107093811, "num_tokens": 28220220.0, "step": 15315 }, { "entropy": 0.16658639833331107, "epoch": 3.57116214010957, "grad_norm": 4.0625, "learning_rate": 4.983431921796284e-05, "loss": 0.3279, "mean_token_accuracy": 0.9347608745098114, "num_tokens": 28238594.0, "step": 15320 }, { "entropy": 0.22185911796987057, "epoch": 3.5723277771302016, "grad_norm": 1.1484375, "learning_rate": 4.983407142776015e-05, "loss": 0.2466, "mean_token_accuracy": 0.937779426574707, "num_tokens": 28269567.0, "step": 15325 }, { "entropy": 0.12034954987466336, "epoch": 3.5734934141508337, "grad_norm": 2.484375, "learning_rate": 4.983382345363791e-05, "loss": 0.1659, "mean_token_accuracy": 0.9600392401218414, "num_tokens": 28286618.0, "step": 15330 }, { "entropy": 0.13138836286962033, "epoch": 3.5746590511714653, "grad_norm": 6.5, "learning_rate": 4.98335752955998e-05, "loss": 0.1817, "mean_token_accuracy": 0.9593860268592834, "num_tokens": 28314646.0, "step": 15335 }, { "entropy": 0.0884293843060732, "epoch": 3.575824688192097, "grad_norm": 0.65234375, "learning_rate": 4.983332695364955e-05, "loss": 0.0927, "mean_token_accuracy": 0.973773580789566, "num_tokens": 28348623.0, "step": 15340 }, { "entropy": 0.12610990218818188, "epoch": 3.5769903252127286, "grad_norm": 4.84375, "learning_rate": 4.983307842779083e-05, "loss": 0.1848, "mean_token_accuracy": 0.9506776928901672, "num_tokens": 28359937.0, "step": 15345 }, { "entropy": 0.11546969823539258, "epoch": 3.5781559622333603, "grad_norm": 2.96875, "learning_rate": 4.9832829718027364e-05, "loss": 0.1492, "mean_token_accuracy": 0.9545735061168671, "num_tokens": 28371610.0, "step": 15350 }, { "entropy": 0.14369152104482055, "epoch": 3.5793215992539924, "grad_norm": 4.5, "learning_rate": 4.9832580824362856e-05, "loss": 0.2731, "mean_token_accuracy": 0.9323119401931763, "num_tokens": 28414897.0, "step": 15355 }, { "entropy": 0.11302361041307449, "epoch": 3.580487236274624, "grad_norm": 4.9375, "learning_rate": 4.9832331746801017e-05, "loss": 0.1324, "mean_token_accuracy": 0.961307954788208, "num_tokens": 28439232.0, "step": 15360 }, { "entropy": 0.11460042167454958, "epoch": 3.5816528732952557, "grad_norm": 1.875, "learning_rate": 4.9832082485345573e-05, "loss": 0.1597, "mean_token_accuracy": 0.9591732680797577, "num_tokens": 28472770.0, "step": 15365 }, { "entropy": 0.1156302273273468, "epoch": 3.582818510315888, "grad_norm": 9.125, "learning_rate": 4.983183304000022e-05, "loss": 0.2583, "mean_token_accuracy": 0.9537204146385193, "num_tokens": 28483640.0, "step": 15370 }, { "entropy": 0.11941173635423183, "epoch": 3.5839841473365195, "grad_norm": 0.9140625, "learning_rate": 4.9831583410768695e-05, "loss": 0.089, "mean_token_accuracy": 0.9625566601753235, "num_tokens": 28515232.0, "step": 15375 }, { "entropy": 0.16708160936832428, "epoch": 3.585149784357151, "grad_norm": 3.96875, "learning_rate": 4.983133359765471e-05, "loss": 0.3082, "mean_token_accuracy": 0.9396808683872223, "num_tokens": 28525110.0, "step": 15380 }, { "entropy": 0.10022514667361974, "epoch": 3.586315421377783, "grad_norm": 0.59765625, "learning_rate": 4.9831083600661995e-05, "loss": 0.116, "mean_token_accuracy": 0.9687018394470215, "num_tokens": 28547249.0, "step": 15385 }, { "entropy": 0.10627367310225963, "epoch": 3.5874810583984145, "grad_norm": 0.96875, "learning_rate": 4.983083341979427e-05, "loss": 0.151, "mean_token_accuracy": 0.9632999539375305, "num_tokens": 28562994.0, "step": 15390 }, { "entropy": 0.0709003258496523, "epoch": 3.5886466954190466, "grad_norm": 0.76953125, "learning_rate": 4.983058305505528e-05, "loss": 0.058, "mean_token_accuracy": 0.9755187392234802, "num_tokens": 28598074.0, "step": 15395 }, { "entropy": 0.12664932943880558, "epoch": 3.5898123324396782, "grad_norm": 9.875, "learning_rate": 4.983033250644875e-05, "loss": 0.1625, "mean_token_accuracy": 0.9476115763187408, "num_tokens": 28619669.0, "step": 15400 }, { "entropy": 0.11317505575716495, "epoch": 3.59097796946031, "grad_norm": 3.640625, "learning_rate": 4.983008177397841e-05, "loss": 0.1903, "mean_token_accuracy": 0.9582743942737579, "num_tokens": 28630736.0, "step": 15405 }, { "entropy": 0.20718681290745736, "epoch": 3.592143606480942, "grad_norm": 5.875, "learning_rate": 4.9829830857648016e-05, "loss": 0.3451, "mean_token_accuracy": 0.9288463413715362, "num_tokens": 28638695.0, "step": 15410 }, { "entropy": 0.10314843617379665, "epoch": 3.5933092435015737, "grad_norm": 0.79296875, "learning_rate": 4.98295797574613e-05, "loss": 0.1336, "mean_token_accuracy": 0.9661559939384461, "num_tokens": 28673027.0, "step": 15415 }, { "entropy": 0.10093957595527173, "epoch": 3.5944748805222053, "grad_norm": 5.5625, "learning_rate": 4.982932847342199e-05, "loss": 0.1342, "mean_token_accuracy": 0.9647237479686737, "num_tokens": 28685524.0, "step": 15420 }, { "entropy": 0.1997634179890156, "epoch": 3.5956405175428374, "grad_norm": 4.0, "learning_rate": 4.982907700553386e-05, "loss": 0.3577, "mean_token_accuracy": 0.9233972787857055, "num_tokens": 28707734.0, "step": 15425 }, { "entropy": 0.20143613442778588, "epoch": 3.596806154563469, "grad_norm": 5.78125, "learning_rate": 4.982882535380064e-05, "loss": 0.248, "mean_token_accuracy": 0.9392694950103759, "num_tokens": 28717460.0, "step": 15430 }, { "entropy": 0.15649377331137657, "epoch": 3.5979717915841007, "grad_norm": 0.8046875, "learning_rate": 4.98285735182261e-05, "loss": 0.2217, "mean_token_accuracy": 0.9479660391807556, "num_tokens": 28739023.0, "step": 15435 }, { "entropy": 0.10691715404391289, "epoch": 3.5991374286047324, "grad_norm": 1.703125, "learning_rate": 4.982832149881398e-05, "loss": 0.1454, "mean_token_accuracy": 0.9623469948768616, "num_tokens": 28752932.0, "step": 15440 }, { "entropy": 0.07475876267999411, "epoch": 3.600303065625364, "grad_norm": 1.7578125, "learning_rate": 4.982806929556804e-05, "loss": 0.1132, "mean_token_accuracy": 0.9730147421360016, "num_tokens": 28784051.0, "step": 15445 }, { "entropy": 0.24029637901112438, "epoch": 3.601468702645996, "grad_norm": 11.625, "learning_rate": 4.982781690849205e-05, "loss": 0.5745, "mean_token_accuracy": 0.9266455054283143, "num_tokens": 28817951.0, "step": 15450 }, { "entropy": 0.1538819193840027, "epoch": 3.602634339666628, "grad_norm": 9.4375, "learning_rate": 4.9827564337589764e-05, "loss": 0.264, "mean_token_accuracy": 0.9450575411319733, "num_tokens": 28836860.0, "step": 15455 }, { "entropy": 0.1413545010611415, "epoch": 3.6037999766872595, "grad_norm": 1.0546875, "learning_rate": 4.982731158286495e-05, "loss": 0.1918, "mean_token_accuracy": 0.9462571084499359, "num_tokens": 28868174.0, "step": 15460 }, { "entropy": 0.08998983614146709, "epoch": 3.6049656137078916, "grad_norm": 1.484375, "learning_rate": 4.982705864432139e-05, "loss": 0.0978, "mean_token_accuracy": 0.9719910442829132, "num_tokens": 28896694.0, "step": 15465 }, { "entropy": 0.09497287701815367, "epoch": 3.6061312507285233, "grad_norm": 2.921875, "learning_rate": 4.982680552196284e-05, "loss": 0.1236, "mean_token_accuracy": 0.9718452095985413, "num_tokens": 28930529.0, "step": 15470 }, { "entropy": 0.09434767477214337, "epoch": 3.607296887749155, "grad_norm": 4.6875, "learning_rate": 4.982655221579308e-05, "loss": 0.1505, "mean_token_accuracy": 0.9634428501129151, "num_tokens": 28946170.0, "step": 15475 }, { "entropy": 0.11451233848929405, "epoch": 3.6084625247697866, "grad_norm": 2.234375, "learning_rate": 4.982629872581588e-05, "loss": 0.1832, "mean_token_accuracy": 0.9608821451663971, "num_tokens": 28969256.0, "step": 15480 }, { "entropy": 0.09396213125437498, "epoch": 3.6096281617904182, "grad_norm": 0.65625, "learning_rate": 4.9826045052035026e-05, "loss": 0.1419, "mean_token_accuracy": 0.9580700159072876, "num_tokens": 28991020.0, "step": 15485 }, { "entropy": 0.10509989410638809, "epoch": 3.6107937988110503, "grad_norm": 1.3046875, "learning_rate": 4.9825791194454304e-05, "loss": 0.1666, "mean_token_accuracy": 0.9542161762714386, "num_tokens": 29011662.0, "step": 15490 }, { "entropy": 0.17095814682543278, "epoch": 3.611959435831682, "grad_norm": 3.75, "learning_rate": 4.9825537153077496e-05, "loss": 0.2074, "mean_token_accuracy": 0.936936765909195, "num_tokens": 29032831.0, "step": 15495 }, { "entropy": 0.08246345948427916, "epoch": 3.6131250728523137, "grad_norm": 3.75, "learning_rate": 4.982528292790839e-05, "loss": 0.1081, "mean_token_accuracy": 0.9697481095790863, "num_tokens": 29053059.0, "step": 15500 }, { "entropy": 0.11737008113414049, "epoch": 3.6142907098729458, "grad_norm": 0.353515625, "learning_rate": 4.9825028518950776e-05, "loss": 0.1684, "mean_token_accuracy": 0.9596979439258575, "num_tokens": 29072514.0, "step": 15505 }, { "entropy": 0.14000843446701766, "epoch": 3.6154563468935774, "grad_norm": 4.03125, "learning_rate": 4.9824773926208455e-05, "loss": 0.1935, "mean_token_accuracy": 0.9475890219211578, "num_tokens": 29088807.0, "step": 15510 }, { "entropy": 0.08790351636707783, "epoch": 3.616621983914209, "grad_norm": 3.3125, "learning_rate": 4.982451914968521e-05, "loss": 0.1464, "mean_token_accuracy": 0.9602734923362732, "num_tokens": 29102568.0, "step": 15515 }, { "entropy": 0.12344647385179996, "epoch": 3.6177876209348407, "grad_norm": 1.421875, "learning_rate": 4.982426418938485e-05, "loss": 0.1908, "mean_token_accuracy": 0.9598969042301178, "num_tokens": 29127406.0, "step": 15520 }, { "entropy": 0.13656870573759078, "epoch": 3.6189532579554724, "grad_norm": 1.96875, "learning_rate": 4.9824009045311174e-05, "loss": 0.2374, "mean_token_accuracy": 0.946445894241333, "num_tokens": 29139309.0, "step": 15525 }, { "entropy": 0.12615330442786216, "epoch": 3.6201188949761045, "grad_norm": 4.1875, "learning_rate": 4.982375371746799e-05, "loss": 0.1767, "mean_token_accuracy": 0.9628724992275238, "num_tokens": 29158047.0, "step": 15530 }, { "entropy": 0.13344797156751156, "epoch": 3.621284531996736, "grad_norm": 6.34375, "learning_rate": 4.9823498205859096e-05, "loss": 0.2194, "mean_token_accuracy": 0.953150063753128, "num_tokens": 29172389.0, "step": 15535 }, { "entropy": 0.1036421962082386, "epoch": 3.622450169017368, "grad_norm": 4.875, "learning_rate": 4.9823242510488315e-05, "loss": 0.1068, "mean_token_accuracy": 0.9691031992435455, "num_tokens": 29195919.0, "step": 15540 }, { "entropy": 0.11982145886868238, "epoch": 3.623615806038, "grad_norm": 1.03125, "learning_rate": 4.9822986631359455e-05, "loss": 0.1182, "mean_token_accuracy": 0.9620106518268585, "num_tokens": 29225367.0, "step": 15545 }, { "entropy": 0.10043896548449993, "epoch": 3.6247814430586316, "grad_norm": 1.9296875, "learning_rate": 4.982273056847632e-05, "loss": 0.1494, "mean_token_accuracy": 0.962291032075882, "num_tokens": 29245781.0, "step": 15550 }, { "entropy": 0.12442281097173691, "epoch": 3.6259470800792633, "grad_norm": 5.25, "learning_rate": 4.982247432184275e-05, "loss": 0.1443, "mean_token_accuracy": 0.9543093621730805, "num_tokens": 29272352.0, "step": 15555 }, { "entropy": 0.09449044466018677, "epoch": 3.6271127170998954, "grad_norm": 3.03125, "learning_rate": 4.982221789146254e-05, "loss": 0.1245, "mean_token_accuracy": 0.965887975692749, "num_tokens": 29292819.0, "step": 15560 }, { "entropy": 0.11279590725898743, "epoch": 3.628278354120527, "grad_norm": 7.09375, "learning_rate": 4.982196127733954e-05, "loss": 0.2447, "mean_token_accuracy": 0.9535762310028076, "num_tokens": 29317518.0, "step": 15565 }, { "entropy": 0.07748936731368303, "epoch": 3.6294439911411587, "grad_norm": 2.46875, "learning_rate": 4.982170447947755e-05, "loss": 0.0748, "mean_token_accuracy": 0.9684795379638672, "num_tokens": 29338480.0, "step": 15570 }, { "entropy": 0.17385158613324164, "epoch": 3.6306096281617903, "grad_norm": 0.90234375, "learning_rate": 4.982144749788042e-05, "loss": 0.3076, "mean_token_accuracy": 0.922554886341095, "num_tokens": 29357470.0, "step": 15575 }, { "entropy": 0.11460159979760647, "epoch": 3.631775265182422, "grad_norm": 6.5625, "learning_rate": 4.9821190332551975e-05, "loss": 0.1568, "mean_token_accuracy": 0.9668726980686188, "num_tokens": 29372212.0, "step": 15580 }, { "entropy": 0.15606100708246232, "epoch": 3.632940902203054, "grad_norm": 5.15625, "learning_rate": 4.982093298349605e-05, "loss": 0.2635, "mean_token_accuracy": 0.944490659236908, "num_tokens": 29394169.0, "step": 15585 }, { "entropy": 0.0882917718961835, "epoch": 3.6341065392236858, "grad_norm": 4.3125, "learning_rate": 4.9820675450716484e-05, "loss": 0.1031, "mean_token_accuracy": 0.9658193111419677, "num_tokens": 29408975.0, "step": 15590 }, { "entropy": 0.07895313240587712, "epoch": 3.6352721762443174, "grad_norm": 5.8125, "learning_rate": 4.982041773421712e-05, "loss": 0.0786, "mean_token_accuracy": 0.9723894357681274, "num_tokens": 29434543.0, "step": 15595 }, { "entropy": 0.15614924132823943, "epoch": 3.6364378132649495, "grad_norm": 1.2109375, "learning_rate": 4.982015983400179e-05, "loss": 0.3187, "mean_token_accuracy": 0.9334610044956207, "num_tokens": 29452989.0, "step": 15600 }, { "entropy": 0.08418496306985616, "epoch": 3.637603450285581, "grad_norm": 1.0546875, "learning_rate": 4.981990175007435e-05, "loss": 0.1543, "mean_token_accuracy": 0.9651324570178985, "num_tokens": 29471768.0, "step": 15605 }, { "entropy": 0.099199612531811, "epoch": 3.638769087306213, "grad_norm": 1.15625, "learning_rate": 4.981964348243864e-05, "loss": 0.1925, "mean_token_accuracy": 0.9549100041389466, "num_tokens": 29498761.0, "step": 15610 }, { "entropy": 0.2713124416768551, "epoch": 3.6399347243268445, "grad_norm": 7.75, "learning_rate": 4.981938503109852e-05, "loss": 0.5981, "mean_token_accuracy": 0.9235803484916687, "num_tokens": 29516944.0, "step": 15615 }, { "entropy": 0.23586437962949275, "epoch": 3.641100361347476, "grad_norm": 1.8984375, "learning_rate": 4.981912639605784e-05, "loss": 0.4017, "mean_token_accuracy": 0.9161858439445496, "num_tokens": 29533561.0, "step": 15620 }, { "entropy": 0.13091112915426492, "epoch": 3.6422659983681083, "grad_norm": 6.8125, "learning_rate": 4.981886757732045e-05, "loss": 0.2171, "mean_token_accuracy": 0.9459374427795411, "num_tokens": 29563131.0, "step": 15625 }, { "entropy": 0.3564877349883318, "epoch": 3.64343163538874, "grad_norm": 2.203125, "learning_rate": 4.9818608574890226e-05, "loss": 0.4436, "mean_token_accuracy": 0.9162199199199677, "num_tokens": 29591151.0, "step": 15630 }, { "entropy": 0.08597991708666086, "epoch": 3.6445972724093716, "grad_norm": 1.2578125, "learning_rate": 4.981834938877101e-05, "loss": 0.0575, "mean_token_accuracy": 0.9764993786811829, "num_tokens": 29621998.0, "step": 15635 }, { "entropy": 0.14188469909131526, "epoch": 3.6457629094300037, "grad_norm": 1.921875, "learning_rate": 4.981809001896668e-05, "loss": 0.2125, "mean_token_accuracy": 0.9538449347019196, "num_tokens": 29636259.0, "step": 15640 }, { "entropy": 0.15098804775625468, "epoch": 3.6469285464506354, "grad_norm": 0.5703125, "learning_rate": 4.9817830465481106e-05, "loss": 0.1794, "mean_token_accuracy": 0.9561287760734558, "num_tokens": 29658153.0, "step": 15645 }, { "entropy": 0.11412147097289563, "epoch": 3.648094183471267, "grad_norm": 1.3984375, "learning_rate": 4.981757072831815e-05, "loss": 0.1546, "mean_token_accuracy": 0.9611155807971954, "num_tokens": 29675553.0, "step": 15650 }, { "entropy": 0.11966407895088196, "epoch": 3.6492598204918987, "grad_norm": 4.375, "learning_rate": 4.981731080748169e-05, "loss": 0.1912, "mean_token_accuracy": 0.9574562072753906, "num_tokens": 29698339.0, "step": 15655 }, { "entropy": 0.1513479059562087, "epoch": 3.6504254575125303, "grad_norm": 4.03125, "learning_rate": 4.981705070297559e-05, "loss": 0.1603, "mean_token_accuracy": 0.9487121284008027, "num_tokens": 29733698.0, "step": 15660 }, { "entropy": 0.11037451829761266, "epoch": 3.6515910945331624, "grad_norm": 2.609375, "learning_rate": 4.981679041480375e-05, "loss": 0.207, "mean_token_accuracy": 0.9578937530517578, "num_tokens": 29749055.0, "step": 15665 }, { "entropy": 0.09969578366726636, "epoch": 3.652756731553794, "grad_norm": 0.8046875, "learning_rate": 4.981652994297003e-05, "loss": 0.0758, "mean_token_accuracy": 0.97415571808815, "num_tokens": 29789590.0, "step": 15670 }, { "entropy": 0.13219497799873353, "epoch": 3.6539223685744258, "grad_norm": 7.96875, "learning_rate": 4.981626928747832e-05, "loss": 0.2559, "mean_token_accuracy": 0.9514385044574738, "num_tokens": 29804384.0, "step": 15675 }, { "entropy": 0.12863097973167897, "epoch": 3.655088005595058, "grad_norm": 2.125, "learning_rate": 4.981600844833253e-05, "loss": 0.1296, "mean_token_accuracy": 0.9587861657142639, "num_tokens": 29822822.0, "step": 15680 }, { "entropy": 0.13467212654650212, "epoch": 3.6562536426156895, "grad_norm": 6.625, "learning_rate": 4.9815747425536505e-05, "loss": 0.2266, "mean_token_accuracy": 0.953845477104187, "num_tokens": 29839053.0, "step": 15685 }, { "entropy": 0.1336902890354395, "epoch": 3.657419279636321, "grad_norm": 1.9921875, "learning_rate": 4.981548621909418e-05, "loss": 0.1921, "mean_token_accuracy": 0.9556374251842499, "num_tokens": 29853597.0, "step": 15690 }, { "entropy": 0.1312162894755602, "epoch": 3.6585849166569533, "grad_norm": 5.34375, "learning_rate": 4.9815224829009424e-05, "loss": 0.1911, "mean_token_accuracy": 0.9566432774066925, "num_tokens": 29881783.0, "step": 15695 }, { "entropy": 0.1050132367759943, "epoch": 3.659750553677585, "grad_norm": 1.1953125, "learning_rate": 4.9814963255286144e-05, "loss": 0.16, "mean_token_accuracy": 0.9637480795383453, "num_tokens": 29905353.0, "step": 15700 }, { "entropy": 0.0904469602741301, "epoch": 3.6609161906982166, "grad_norm": 1.640625, "learning_rate": 4.981470149792824e-05, "loss": 0.1005, "mean_token_accuracy": 0.9642527401447296, "num_tokens": 29931329.0, "step": 15705 }, { "entropy": 0.09534500148147344, "epoch": 3.6620818277188483, "grad_norm": 5.59375, "learning_rate": 4.981443955693962e-05, "loss": 0.1104, "mean_token_accuracy": 0.9741943538188934, "num_tokens": 29962728.0, "step": 15710 }, { "entropy": 0.09082780564203859, "epoch": 3.66324746473948, "grad_norm": 0.5078125, "learning_rate": 4.981417743232417e-05, "loss": 0.1501, "mean_token_accuracy": 0.9552464723587036, "num_tokens": 29990387.0, "step": 15715 }, { "entropy": 0.126339378207922, "epoch": 3.664413101760112, "grad_norm": 9.0625, "learning_rate": 4.981391512408582e-05, "loss": 0.2424, "mean_token_accuracy": 0.9529932260513305, "num_tokens": 30000074.0, "step": 15720 }, { "entropy": 0.1250433325767517, "epoch": 3.6655787387807437, "grad_norm": 6.5625, "learning_rate": 4.981365263222848e-05, "loss": 0.141, "mean_token_accuracy": 0.9634235501289368, "num_tokens": 30015666.0, "step": 15725 }, { "entropy": 0.11333463415503502, "epoch": 3.6667443758013754, "grad_norm": 3.640625, "learning_rate": 4.981338995675605e-05, "loss": 0.2123, "mean_token_accuracy": 0.9570781469345093, "num_tokens": 30026972.0, "step": 15730 }, { "entropy": 0.18440841864794494, "epoch": 3.6679100128220075, "grad_norm": 6.3125, "learning_rate": 4.981312709767246e-05, "loss": 0.3143, "mean_token_accuracy": 0.9335676729679108, "num_tokens": 30058406.0, "step": 15735 }, { "entropy": 0.171936047822237, "epoch": 3.669075649842639, "grad_norm": 7.78125, "learning_rate": 4.981286405498163e-05, "loss": 0.23, "mean_token_accuracy": 0.9359781265258789, "num_tokens": 30080231.0, "step": 15740 }, { "entropy": 0.12102098194882274, "epoch": 3.670241286863271, "grad_norm": 1.3125, "learning_rate": 4.981260082868747e-05, "loss": 0.1511, "mean_token_accuracy": 0.9653957903385162, "num_tokens": 30116521.0, "step": 15745 }, { "entropy": 0.18943593911826612, "epoch": 3.6714069238839024, "grad_norm": 0.8828125, "learning_rate": 4.981233741879391e-05, "loss": 0.289, "mean_token_accuracy": 0.9413058817386627, "num_tokens": 30144200.0, "step": 15750 }, { "entropy": 0.11288008131086827, "epoch": 3.672572560904534, "grad_norm": 5.3125, "learning_rate": 4.981207382530489e-05, "loss": 0.1753, "mean_token_accuracy": 0.9527966439723968, "num_tokens": 30171349.0, "step": 15755 }, { "entropy": 0.09640868995338678, "epoch": 3.673738197925166, "grad_norm": 6.46875, "learning_rate": 4.981181004822432e-05, "loss": 0.1566, "mean_token_accuracy": 0.9636990368366242, "num_tokens": 30192008.0, "step": 15760 }, { "entropy": 0.2270708303898573, "epoch": 3.674903834945798, "grad_norm": 4.71875, "learning_rate": 4.981154608755615e-05, "loss": 0.4298, "mean_token_accuracy": 0.912353515625, "num_tokens": 30217346.0, "step": 15765 }, { "entropy": 0.08794712722301483, "epoch": 3.6760694719664295, "grad_norm": 5.28125, "learning_rate": 4.981128194330431e-05, "loss": 0.1921, "mean_token_accuracy": 0.9609225809574127, "num_tokens": 30229956.0, "step": 15770 }, { "entropy": 0.11557548865675926, "epoch": 3.6772351089870616, "grad_norm": 4.8125, "learning_rate": 4.981101761547274e-05, "loss": 0.1623, "mean_token_accuracy": 0.9657979011535645, "num_tokens": 30249241.0, "step": 15775 }, { "entropy": 0.08836949989199638, "epoch": 3.6784007460076933, "grad_norm": 4.78125, "learning_rate": 4.9810753104065376e-05, "loss": 0.0928, "mean_token_accuracy": 0.9697605192661285, "num_tokens": 30277573.0, "step": 15780 }, { "entropy": 0.08709455393254757, "epoch": 3.679566383028325, "grad_norm": 0.92578125, "learning_rate": 4.981048840908618e-05, "loss": 0.1042, "mean_token_accuracy": 0.9733495712280273, "num_tokens": 30302216.0, "step": 15785 }, { "entropy": 0.1160502802580595, "epoch": 3.6807320200489566, "grad_norm": 3.515625, "learning_rate": 4.981022353053907e-05, "loss": 0.1789, "mean_token_accuracy": 0.9632501363754272, "num_tokens": 30313531.0, "step": 15790 }, { "entropy": 0.09411624427884817, "epoch": 3.6818976570695883, "grad_norm": 3.296875, "learning_rate": 4.9809958468428015e-05, "loss": 0.0966, "mean_token_accuracy": 0.9665277242660523, "num_tokens": 30329907.0, "step": 15795 }, { "entropy": 0.19231935106217862, "epoch": 3.6830632940902204, "grad_norm": 9.25, "learning_rate": 4.980969322275697e-05, "loss": 0.3127, "mean_token_accuracy": 0.940207052230835, "num_tokens": 30338833.0, "step": 15800 }, { "entropy": 0.12116588074713945, "epoch": 3.684228931110852, "grad_norm": 7.9375, "learning_rate": 4.980942779352988e-05, "loss": 0.2015, "mean_token_accuracy": 0.9503274917602539, "num_tokens": 30352552.0, "step": 15805 }, { "entropy": 0.11752695105969906, "epoch": 3.6853945681314837, "grad_norm": 7.75, "learning_rate": 4.9809162180750705e-05, "loss": 0.2157, "mean_token_accuracy": 0.9551667273044586, "num_tokens": 30363063.0, "step": 15810 }, { "entropy": 0.12046983316540719, "epoch": 3.686560205152116, "grad_norm": 4.375, "learning_rate": 4.980889638442341e-05, "loss": 0.1791, "mean_token_accuracy": 0.9582276821136475, "num_tokens": 30382257.0, "step": 15815 }, { "entropy": 0.11929085087031126, "epoch": 3.6877258421727475, "grad_norm": 7.75, "learning_rate": 4.980863040455196e-05, "loss": 0.1394, "mean_token_accuracy": 0.9591721177101136, "num_tokens": 30407940.0, "step": 15820 }, { "entropy": 0.10976488478481769, "epoch": 3.688891479193379, "grad_norm": 1.296875, "learning_rate": 4.9808364241140316e-05, "loss": 0.1348, "mean_token_accuracy": 0.965981525182724, "num_tokens": 30436227.0, "step": 15825 }, { "entropy": 0.08953040465712547, "epoch": 3.6900571162140112, "grad_norm": 1.375, "learning_rate": 4.980809789419244e-05, "loss": 0.0962, "mean_token_accuracy": 0.9648583948612213, "num_tokens": 30458523.0, "step": 15830 }, { "entropy": 0.0966216598637402, "epoch": 3.691222753234643, "grad_norm": 2.015625, "learning_rate": 4.980783136371232e-05, "loss": 0.126, "mean_token_accuracy": 0.9684146821498871, "num_tokens": 30475490.0, "step": 15835 }, { "entropy": 0.10115543827414512, "epoch": 3.6923883902552745, "grad_norm": 1.7109375, "learning_rate": 4.980756464970392e-05, "loss": 0.1456, "mean_token_accuracy": 0.9590452373027801, "num_tokens": 30492963.0, "step": 15840 }, { "entropy": 0.11767181642353534, "epoch": 3.693554027275906, "grad_norm": 8.9375, "learning_rate": 4.9807297752171225e-05, "loss": 0.1732, "mean_token_accuracy": 0.9512511551380157, "num_tokens": 30520731.0, "step": 15845 }, { "entropy": 0.07681280169636011, "epoch": 3.694719664296538, "grad_norm": 5.34375, "learning_rate": 4.9807030671118205e-05, "loss": 0.1331, "mean_token_accuracy": 0.9656332790851593, "num_tokens": 30546042.0, "step": 15850 }, { "entropy": 0.09752353429794311, "epoch": 3.69588530131717, "grad_norm": 8.4375, "learning_rate": 4.980676340654884e-05, "loss": 0.1193, "mean_token_accuracy": 0.9660360097885132, "num_tokens": 30561427.0, "step": 15855 }, { "entropy": 0.09423494134098291, "epoch": 3.6970509383378016, "grad_norm": 7.34375, "learning_rate": 4.980649595846713e-05, "loss": 0.1271, "mean_token_accuracy": 0.958463329076767, "num_tokens": 30586406.0, "step": 15860 }, { "entropy": 0.11503859348595143, "epoch": 3.6982165753584333, "grad_norm": 2.046875, "learning_rate": 4.9806228326877056e-05, "loss": 0.1439, "mean_token_accuracy": 0.9584307610988617, "num_tokens": 30596957.0, "step": 15865 }, { "entropy": 0.09770829975605011, "epoch": 3.6993822123790654, "grad_norm": 6.96875, "learning_rate": 4.980596051178261e-05, "loss": 0.1714, "mean_token_accuracy": 0.9600382685661316, "num_tokens": 30608791.0, "step": 15870 }, { "entropy": 0.13881364446133376, "epoch": 3.700547849399697, "grad_norm": 2.640625, "learning_rate": 4.9805692513187774e-05, "loss": 0.2144, "mean_token_accuracy": 0.9591614782810212, "num_tokens": 30638592.0, "step": 15875 }, { "entropy": 0.08609495665878057, "epoch": 3.7017134864203287, "grad_norm": 0.703125, "learning_rate": 4.980542433109656e-05, "loss": 0.1357, "mean_token_accuracy": 0.9631729364395142, "num_tokens": 30659261.0, "step": 15880 }, { "entropy": 0.08448141608387232, "epoch": 3.7028791234409604, "grad_norm": 4.875, "learning_rate": 4.980515596551296e-05, "loss": 0.0884, "mean_token_accuracy": 0.9661660194396973, "num_tokens": 30687231.0, "step": 15885 }, { "entropy": 0.08586684390902519, "epoch": 3.704044760461592, "grad_norm": 1.7265625, "learning_rate": 4.980488741644098e-05, "loss": 0.1603, "mean_token_accuracy": 0.9657577693462371, "num_tokens": 30714234.0, "step": 15890 }, { "entropy": 0.1457499351352453, "epoch": 3.705210397482224, "grad_norm": 2.8125, "learning_rate": 4.980461868388462e-05, "loss": 0.1848, "mean_token_accuracy": 0.9565096437931061, "num_tokens": 30732637.0, "step": 15895 }, { "entropy": 0.16683135665953158, "epoch": 3.706376034502856, "grad_norm": 2.40625, "learning_rate": 4.9804349767847877e-05, "loss": 0.2308, "mean_token_accuracy": 0.9465544700622559, "num_tokens": 30743422.0, "step": 15900 }, { "entropy": 0.08053812086582184, "epoch": 3.7075416715234875, "grad_norm": 13.3125, "learning_rate": 4.9804080668334784e-05, "loss": 0.1439, "mean_token_accuracy": 0.9672502219676972, "num_tokens": 30763615.0, "step": 15905 }, { "entropy": 0.09644758738577366, "epoch": 3.7087073085441196, "grad_norm": 3.21875, "learning_rate": 4.980381138534934e-05, "loss": 0.0996, "mean_token_accuracy": 0.9660190284252167, "num_tokens": 30777896.0, "step": 15910 }, { "entropy": 0.09025746416300535, "epoch": 3.7098729455647512, "grad_norm": 1.46875, "learning_rate": 4.9803541918895565e-05, "loss": 0.1656, "mean_token_accuracy": 0.9610617697238922, "num_tokens": 30791889.0, "step": 15915 }, { "entropy": 0.09136622017249466, "epoch": 3.711038582585383, "grad_norm": 2.5, "learning_rate": 4.980327226897747e-05, "loss": 0.0989, "mean_token_accuracy": 0.9618954002857208, "num_tokens": 30811042.0, "step": 15920 }, { "entropy": 0.0829636923968792, "epoch": 3.7122042196060145, "grad_norm": 2.0, "learning_rate": 4.9803002435599076e-05, "loss": 0.0862, "mean_token_accuracy": 0.9666243731975556, "num_tokens": 30829122.0, "step": 15925 }, { "entropy": 0.11056047528982163, "epoch": 3.713369856626646, "grad_norm": 1.390625, "learning_rate": 4.980273241876442e-05, "loss": 0.1123, "mean_token_accuracy": 0.9589561939239502, "num_tokens": 30850324.0, "step": 15930 }, { "entropy": 0.10144733544439077, "epoch": 3.7145354936472783, "grad_norm": 0.61328125, "learning_rate": 4.980246221847751e-05, "loss": 0.2017, "mean_token_accuracy": 0.9625826895236969, "num_tokens": 30873421.0, "step": 15935 }, { "entropy": 0.11644894815981388, "epoch": 3.71570113066791, "grad_norm": 9.5625, "learning_rate": 4.9802191834742395e-05, "loss": 0.1696, "mean_token_accuracy": 0.9588033437728882, "num_tokens": 30885393.0, "step": 15940 }, { "entropy": 0.07029726225882768, "epoch": 3.7168667676885416, "grad_norm": 2.671875, "learning_rate": 4.980192126756309e-05, "loss": 0.0853, "mean_token_accuracy": 0.965891820192337, "num_tokens": 30915056.0, "step": 15945 }, { "entropy": 0.08881367389112711, "epoch": 3.7180324047091737, "grad_norm": 0.470703125, "learning_rate": 4.9801650516943636e-05, "loss": 0.1229, "mean_token_accuracy": 0.9748471915721894, "num_tokens": 30940177.0, "step": 15950 }, { "entropy": 0.12720200717449187, "epoch": 3.7191980417298054, "grad_norm": 2.03125, "learning_rate": 4.980137958288808e-05, "loss": 0.1362, "mean_token_accuracy": 0.9587613344192505, "num_tokens": 30956685.0, "step": 15955 }, { "entropy": 0.10853961212560534, "epoch": 3.720363678750437, "grad_norm": 6.25, "learning_rate": 4.980110846540044e-05, "loss": 0.1353, "mean_token_accuracy": 0.9610925018787384, "num_tokens": 30976571.0, "step": 15960 }, { "entropy": 0.2672706731595099, "epoch": 3.721529315771069, "grad_norm": 0.498046875, "learning_rate": 4.980083716448477e-05, "loss": 0.4138, "mean_token_accuracy": 0.9393226504325867, "num_tokens": 31008250.0, "step": 15965 }, { "entropy": 0.18283664286136628, "epoch": 3.722694952791701, "grad_norm": 1.078125, "learning_rate": 4.980056568014512e-05, "loss": 0.2432, "mean_token_accuracy": 0.9456781387329102, "num_tokens": 31030726.0, "step": 15970 }, { "entropy": 0.10231209546327591, "epoch": 3.7238605898123325, "grad_norm": 3.703125, "learning_rate": 4.9800294012385534e-05, "loss": 0.1536, "mean_token_accuracy": 0.964515072107315, "num_tokens": 31044932.0, "step": 15975 }, { "entropy": 0.09682576023042203, "epoch": 3.725026226832964, "grad_norm": 2.125, "learning_rate": 4.9800022161210066e-05, "loss": 0.1251, "mean_token_accuracy": 0.9686741828918457, "num_tokens": 31070171.0, "step": 15980 }, { "entropy": 0.09693482723087073, "epoch": 3.726191863853596, "grad_norm": 6.0, "learning_rate": 4.9799750126622766e-05, "loss": 0.0948, "mean_token_accuracy": 0.96324702501297, "num_tokens": 31095575.0, "step": 15985 }, { "entropy": 0.07029144568368792, "epoch": 3.727357500874228, "grad_norm": 0.703125, "learning_rate": 4.979947790862769e-05, "loss": 0.0542, "mean_token_accuracy": 0.9817209541797638, "num_tokens": 31131076.0, "step": 15990 }, { "entropy": 0.11360028982162476, "epoch": 3.7285231378948596, "grad_norm": 3.84375, "learning_rate": 4.97992055072289e-05, "loss": 0.1979, "mean_token_accuracy": 0.9619390547275544, "num_tokens": 31161312.0, "step": 15995 }, { "entropy": 0.09870643690228462, "epoch": 3.7296887749154912, "grad_norm": 8.75, "learning_rate": 4.979893292243045e-05, "loss": 0.1504, "mean_token_accuracy": 0.9549363851547241, "num_tokens": 31177428.0, "step": 16000 }, { "entropy": 0.09069200027734041, "epoch": 3.7308544119361233, "grad_norm": 7.0, "learning_rate": 4.9798660154236425e-05, "loss": 0.1552, "mean_token_accuracy": 0.9646099269390106, "num_tokens": 31192384.0, "step": 16005 }, { "entropy": 0.10570933558046818, "epoch": 3.732020048956755, "grad_norm": 4.46875, "learning_rate": 4.979838720265087e-05, "loss": 0.2227, "mean_token_accuracy": 0.9609293758869171, "num_tokens": 31203928.0, "step": 16010 }, { "entropy": 0.08798787742853165, "epoch": 3.7331856859773866, "grad_norm": 1.828125, "learning_rate": 4.979811406767787e-05, "loss": 0.0826, "mean_token_accuracy": 0.9753459453582763, "num_tokens": 31232416.0, "step": 16015 }, { "entropy": 0.09456034004688263, "epoch": 3.7343513229980183, "grad_norm": 0.4140625, "learning_rate": 4.979784074932149e-05, "loss": 0.099, "mean_token_accuracy": 0.9756748259067536, "num_tokens": 31260445.0, "step": 16020 }, { "entropy": 0.08893920592963696, "epoch": 3.73551696001865, "grad_norm": 4.71875, "learning_rate": 4.97975672475858e-05, "loss": 0.1274, "mean_token_accuracy": 0.9707845091819763, "num_tokens": 31280077.0, "step": 16025 }, { "entropy": 0.11007880065590143, "epoch": 3.736682597039282, "grad_norm": 7.15625, "learning_rate": 4.97972935624749e-05, "loss": 0.158, "mean_token_accuracy": 0.967463493347168, "num_tokens": 31300264.0, "step": 16030 }, { "entropy": 0.11120393015444278, "epoch": 3.7378482340599137, "grad_norm": 1.625, "learning_rate": 4.979701969399284e-05, "loss": 0.1794, "mean_token_accuracy": 0.9625828862190247, "num_tokens": 31313073.0, "step": 16035 }, { "entropy": 0.12304305955767632, "epoch": 3.7390138710805454, "grad_norm": 5.03125, "learning_rate": 4.979674564214373e-05, "loss": 0.2017, "mean_token_accuracy": 0.9605163991451263, "num_tokens": 31323788.0, "step": 16040 }, { "entropy": 0.10260249320417643, "epoch": 3.7401795081011775, "grad_norm": 1.3046875, "learning_rate": 4.979647140693165e-05, "loss": 0.1287, "mean_token_accuracy": 0.9643874883651733, "num_tokens": 31345258.0, "step": 16045 }, { "entropy": 0.15542761236429214, "epoch": 3.741345145121809, "grad_norm": 1.1953125, "learning_rate": 4.979619698836068e-05, "loss": 0.2069, "mean_token_accuracy": 0.9525025904178619, "num_tokens": 31363352.0, "step": 16050 }, { "entropy": 0.10133458431810141, "epoch": 3.742510782142441, "grad_norm": 4.9375, "learning_rate": 4.979592238643492e-05, "loss": 0.143, "mean_token_accuracy": 0.9657983779907227, "num_tokens": 31392370.0, "step": 16055 }, { "entropy": 0.11571516077965498, "epoch": 3.7436764191630725, "grad_norm": 0.51171875, "learning_rate": 4.979564760115846e-05, "loss": 0.1645, "mean_token_accuracy": 0.9564596891403199, "num_tokens": 31417487.0, "step": 16060 }, { "entropy": 0.10357567742466926, "epoch": 3.744842056183704, "grad_norm": 1.3515625, "learning_rate": 4.979537263253541e-05, "loss": 0.2739, "mean_token_accuracy": 0.9473852813243866, "num_tokens": 31430943.0, "step": 16065 }, { "entropy": 0.12955602668225766, "epoch": 3.7460076932043362, "grad_norm": 7.78125, "learning_rate": 4.9795097480569854e-05, "loss": 0.2137, "mean_token_accuracy": 0.9514732956886292, "num_tokens": 31439010.0, "step": 16070 }, { "entropy": 0.1175201378762722, "epoch": 3.747173330224968, "grad_norm": 9.75, "learning_rate": 4.97948221452659e-05, "loss": 0.1842, "mean_token_accuracy": 0.9582869231700897, "num_tokens": 31448832.0, "step": 16075 }, { "entropy": 0.08986708372831345, "epoch": 3.7483389672455996, "grad_norm": 0.77734375, "learning_rate": 4.979454662662766e-05, "loss": 0.078, "mean_token_accuracy": 0.9791659891605378, "num_tokens": 31474476.0, "step": 16080 }, { "entropy": 0.12361742258071899, "epoch": 3.7495046042662317, "grad_norm": 5.15625, "learning_rate": 4.979427092465924e-05, "loss": 0.163, "mean_token_accuracy": 0.9587664127349853, "num_tokens": 31497137.0, "step": 16085 }, { "entropy": 0.09666433380916714, "epoch": 3.7506702412868633, "grad_norm": 5.03125, "learning_rate": 4.9793995039364736e-05, "loss": 0.1255, "mean_token_accuracy": 0.9669358968734741, "num_tokens": 31520290.0, "step": 16090 }, { "entropy": 0.1968850590288639, "epoch": 3.751835878307495, "grad_norm": 3.828125, "learning_rate": 4.979371897074829e-05, "loss": 0.3538, "mean_token_accuracy": 0.9254569470882416, "num_tokens": 31538203.0, "step": 16095 }, { "entropy": 0.11581053957343102, "epoch": 3.753001515328127, "grad_norm": 6.875, "learning_rate": 4.9793442718814e-05, "loss": 0.2938, "mean_token_accuracy": 0.9382094621658326, "num_tokens": 31549347.0, "step": 16100 }, { "entropy": 0.11443403884768485, "epoch": 3.7541671523487588, "grad_norm": 6.96875, "learning_rate": 4.979316628356599e-05, "loss": 0.1811, "mean_token_accuracy": 0.9656529247760772, "num_tokens": 31562419.0, "step": 16105 }, { "entropy": 0.10604585809633135, "epoch": 3.7553327893693904, "grad_norm": 0.390625, "learning_rate": 4.979288966500837e-05, "loss": 0.1336, "mean_token_accuracy": 0.966714721918106, "num_tokens": 31580920.0, "step": 16110 }, { "entropy": 0.1127666326239705, "epoch": 3.756498426390022, "grad_norm": 6.625, "learning_rate": 4.9792612863145284e-05, "loss": 0.1463, "mean_token_accuracy": 0.9673124790191651, "num_tokens": 31594740.0, "step": 16115 }, { "entropy": 0.14349405989050865, "epoch": 3.7576640634106537, "grad_norm": 4.40625, "learning_rate": 4.9792335877980844e-05, "loss": 0.2122, "mean_token_accuracy": 0.9504148721694946, "num_tokens": 31604473.0, "step": 16120 }, { "entropy": 0.16313403341919183, "epoch": 3.758829700431286, "grad_norm": 3.046875, "learning_rate": 4.9792058709519194e-05, "loss": 0.2319, "mean_token_accuracy": 0.9375991344451904, "num_tokens": 31633345.0, "step": 16125 }, { "entropy": 0.1955052137374878, "epoch": 3.7599953374519175, "grad_norm": 5.03125, "learning_rate": 4.979178135776446e-05, "loss": 0.3376, "mean_token_accuracy": 0.9329494535923004, "num_tokens": 31642905.0, "step": 16130 }, { "entropy": 0.12828119061887264, "epoch": 3.761160974472549, "grad_norm": 4.75, "learning_rate": 4.979150382272078e-05, "loss": 0.1884, "mean_token_accuracy": 0.9502387046813965, "num_tokens": 31678138.0, "step": 16135 }, { "entropy": 0.1014595903456211, "epoch": 3.7623266114931813, "grad_norm": 5.8125, "learning_rate": 4.979122610439229e-05, "loss": 0.1236, "mean_token_accuracy": 0.9615878939628602, "num_tokens": 31705191.0, "step": 16140 }, { "entropy": 0.09820052236318588, "epoch": 3.763492248513813, "grad_norm": 13.0625, "learning_rate": 4.979094820278312e-05, "loss": 0.195, "mean_token_accuracy": 0.9605471730232239, "num_tokens": 31727042.0, "step": 16145 }, { "entropy": 0.113966304063797, "epoch": 3.7646578855344446, "grad_norm": 4.5625, "learning_rate": 4.9790670117897434e-05, "loss": 0.1288, "mean_token_accuracy": 0.9665563642978668, "num_tokens": 31747200.0, "step": 16150 }, { "entropy": 0.18605943005532027, "epoch": 3.7658235225550762, "grad_norm": 8.5, "learning_rate": 4.979039184973937e-05, "loss": 0.3071, "mean_token_accuracy": 0.9375176846981048, "num_tokens": 31767394.0, "step": 16155 }, { "entropy": 0.10228492282330989, "epoch": 3.766989159575708, "grad_norm": 3.859375, "learning_rate": 4.979011339831308e-05, "loss": 0.1293, "mean_token_accuracy": 0.9703926205635071, "num_tokens": 31782097.0, "step": 16160 }, { "entropy": 0.13661051522940398, "epoch": 3.76815479659634, "grad_norm": 1.125, "learning_rate": 4.9789834763622715e-05, "loss": 0.1347, "mean_token_accuracy": 0.9575334310531616, "num_tokens": 31810446.0, "step": 16165 }, { "entropy": 0.0924876093864441, "epoch": 3.7693204336169717, "grad_norm": 1.921875, "learning_rate": 4.9789555945672426e-05, "loss": 0.1324, "mean_token_accuracy": 0.9688528537750244, "num_tokens": 31823541.0, "step": 16170 }, { "entropy": 0.13186742961406708, "epoch": 3.7704860706376033, "grad_norm": 9.5, "learning_rate": 4.978927694446637e-05, "loss": 0.249, "mean_token_accuracy": 0.9531232118606567, "num_tokens": 31832728.0, "step": 16175 }, { "entropy": 0.10348946945741773, "epoch": 3.7716517076582354, "grad_norm": 2.265625, "learning_rate": 4.9788997760008713e-05, "loss": 0.1732, "mean_token_accuracy": 0.9628577351570129, "num_tokens": 31850856.0, "step": 16180 }, { "entropy": 0.1540275054052472, "epoch": 3.772817344678867, "grad_norm": 1.5859375, "learning_rate": 4.9788718392303624e-05, "loss": 0.2778, "mean_token_accuracy": 0.9363715410232544, "num_tokens": 31881657.0, "step": 16185 }, { "entropy": 0.11113776378333569, "epoch": 3.7739829816994988, "grad_norm": 0.59375, "learning_rate": 4.978843884135525e-05, "loss": 0.1466, "mean_token_accuracy": 0.9614432454109192, "num_tokens": 31905652.0, "step": 16190 }, { "entropy": 0.11307852622121572, "epoch": 3.7751486187201304, "grad_norm": 3.71875, "learning_rate": 4.9788159107167774e-05, "loss": 0.1413, "mean_token_accuracy": 0.9594118535518646, "num_tokens": 31927717.0, "step": 16195 }, { "entropy": 0.1015424283221364, "epoch": 3.776314255740762, "grad_norm": 0.55078125, "learning_rate": 4.978787918974536e-05, "loss": 0.1726, "mean_token_accuracy": 0.9541314899921417, "num_tokens": 31948045.0, "step": 16200 }, { "entropy": 0.09296782370656728, "epoch": 3.777479892761394, "grad_norm": 3.90625, "learning_rate": 4.9787599089092194e-05, "loss": 0.112, "mean_token_accuracy": 0.96722252368927, "num_tokens": 31975131.0, "step": 16205 }, { "entropy": 0.07211828418076038, "epoch": 3.778645529782026, "grad_norm": 3.875, "learning_rate": 4.9787318805212436e-05, "loss": 0.0967, "mean_token_accuracy": 0.9726602792739868, "num_tokens": 32002367.0, "step": 16210 }, { "entropy": 0.09185118321329355, "epoch": 3.7798111668026575, "grad_norm": 3.59375, "learning_rate": 4.978703833811028e-05, "loss": 0.1118, "mean_token_accuracy": 0.9684987306594849, "num_tokens": 32026660.0, "step": 16215 }, { "entropy": 0.0935923472046852, "epoch": 3.7809768038232896, "grad_norm": 0.8671875, "learning_rate": 4.978675768778989e-05, "loss": 0.1291, "mean_token_accuracy": 0.9658624827861786, "num_tokens": 32059305.0, "step": 16220 }, { "entropy": 0.08868596963584423, "epoch": 3.7821424408439213, "grad_norm": 3.0, "learning_rate": 4.978647685425547e-05, "loss": 0.1118, "mean_token_accuracy": 0.973666113615036, "num_tokens": 32089774.0, "step": 16225 }, { "entropy": 0.1755599969998002, "epoch": 3.783308077864553, "grad_norm": 5.9375, "learning_rate": 4.97861958375112e-05, "loss": 0.2921, "mean_token_accuracy": 0.940058308839798, "num_tokens": 32102936.0, "step": 16230 }, { "entropy": 0.08324588984251022, "epoch": 3.784473714885185, "grad_norm": 0.53125, "learning_rate": 4.978591463756127e-05, "loss": 0.1106, "mean_token_accuracy": 0.9641181349754333, "num_tokens": 32136473.0, "step": 16235 }, { "entropy": 0.10371949858963489, "epoch": 3.7856393519058167, "grad_norm": 4.875, "learning_rate": 4.978563325440988e-05, "loss": 0.1634, "mean_token_accuracy": 0.9677753865718841, "num_tokens": 32147038.0, "step": 16240 }, { "entropy": 0.08506152108311653, "epoch": 3.7868049889264483, "grad_norm": 0.453125, "learning_rate": 4.978535168806121e-05, "loss": 0.1288, "mean_token_accuracy": 0.973295658826828, "num_tokens": 32173810.0, "step": 16245 }, { "entropy": 0.10174280107021332, "epoch": 3.78797062594708, "grad_norm": 8.375, "learning_rate": 4.978506993851947e-05, "loss": 0.1985, "mean_token_accuracy": 0.9564942598342896, "num_tokens": 32186356.0, "step": 16250 }, { "entropy": 0.12382913529872894, "epoch": 3.7891362629677117, "grad_norm": 6.28125, "learning_rate": 4.9784788005788866e-05, "loss": 0.1887, "mean_token_accuracy": 0.9540922164916992, "num_tokens": 32201292.0, "step": 16255 }, { "entropy": 0.12052395232021809, "epoch": 3.7903018999883438, "grad_norm": 7.15625, "learning_rate": 4.978450588987359e-05, "loss": 0.1952, "mean_token_accuracy": 0.9510783553123474, "num_tokens": 32215733.0, "step": 16260 }, { "entropy": 0.09282411560416222, "epoch": 3.7914675370089754, "grad_norm": 3.15625, "learning_rate": 4.9784223590777854e-05, "loss": 0.082, "mean_token_accuracy": 0.9700927495956421, "num_tokens": 32236636.0, "step": 16265 }, { "entropy": 0.1389080710709095, "epoch": 3.792633174029607, "grad_norm": 5.46875, "learning_rate": 4.978394110850587e-05, "loss": 0.2553, "mean_token_accuracy": 0.9495690643787384, "num_tokens": 32245289.0, "step": 16270 }, { "entropy": 0.10744792697951197, "epoch": 3.793798811050239, "grad_norm": 7.4375, "learning_rate": 4.978365844306184e-05, "loss": 0.1681, "mean_token_accuracy": 0.9592471480369568, "num_tokens": 32262720.0, "step": 16275 }, { "entropy": 0.10109451450407506, "epoch": 3.794964448070871, "grad_norm": 3.734375, "learning_rate": 4.978337559445e-05, "loss": 0.1505, "mean_token_accuracy": 0.9673482775688171, "num_tokens": 32283036.0, "step": 16280 }, { "entropy": 0.10058681219816208, "epoch": 3.7961300850915025, "grad_norm": 5.03125, "learning_rate": 4.978309256267455e-05, "loss": 0.1738, "mean_token_accuracy": 0.9640094101428985, "num_tokens": 32293183.0, "step": 16285 }, { "entropy": 0.07011372428387404, "epoch": 3.797295722112134, "grad_norm": 1.0078125, "learning_rate": 4.978280934773971e-05, "loss": 0.0912, "mean_token_accuracy": 0.9802879512310028, "num_tokens": 32311618.0, "step": 16290 }, { "entropy": 0.09535004459321499, "epoch": 3.798461359132766, "grad_norm": 1.0234375, "learning_rate": 4.978252594964971e-05, "loss": 0.1341, "mean_token_accuracy": 0.9680737018585205, "num_tokens": 32328529.0, "step": 16295 }, { "entropy": 0.14605763144791126, "epoch": 3.799626996153398, "grad_norm": 1.734375, "learning_rate": 4.9782242368408775e-05, "loss": 0.1694, "mean_token_accuracy": 0.9519593000411988, "num_tokens": 32365656.0, "step": 16300 }, { "entropy": 0.09797099642455578, "epoch": 3.8007926331740296, "grad_norm": 2.046875, "learning_rate": 4.978195860402114e-05, "loss": 0.1491, "mean_token_accuracy": 0.9651980400085449, "num_tokens": 32377851.0, "step": 16305 }, { "entropy": 0.14139844849705696, "epoch": 3.8019582701946613, "grad_norm": 4.25, "learning_rate": 4.9781674656491016e-05, "loss": 0.2437, "mean_token_accuracy": 0.9429092288017273, "num_tokens": 32394288.0, "step": 16310 }, { "entropy": 0.09763580206781626, "epoch": 3.8031239072152934, "grad_norm": 0.71875, "learning_rate": 4.9781390525822655e-05, "loss": 0.1644, "mean_token_accuracy": 0.9650467038154602, "num_tokens": 32412578.0, "step": 16315 }, { "entropy": 0.10009733010083437, "epoch": 3.804289544235925, "grad_norm": 3.890625, "learning_rate": 4.9781106212020295e-05, "loss": 0.1664, "mean_token_accuracy": 0.9597991704940796, "num_tokens": 32426556.0, "step": 16320 }, { "entropy": 0.10506035517901183, "epoch": 3.8054551812565567, "grad_norm": 5.03125, "learning_rate": 4.9780821715088166e-05, "loss": 0.1962, "mean_token_accuracy": 0.9597808599472046, "num_tokens": 32438924.0, "step": 16325 }, { "entropy": 0.10086954236030579, "epoch": 3.8066208182771883, "grad_norm": 3.703125, "learning_rate": 4.9780537035030515e-05, "loss": 0.155, "mean_token_accuracy": 0.9629509508609772, "num_tokens": 32463099.0, "step": 16330 }, { "entropy": 0.09417558461427689, "epoch": 3.80778645529782, "grad_norm": 3.578125, "learning_rate": 4.9780252171851584e-05, "loss": 0.1001, "mean_token_accuracy": 0.9691354632377625, "num_tokens": 32481659.0, "step": 16335 }, { "entropy": 0.09145726338028907, "epoch": 3.808952092318452, "grad_norm": 1.2734375, "learning_rate": 4.9779967125555623e-05, "loss": 0.169, "mean_token_accuracy": 0.9573414266109467, "num_tokens": 32497033.0, "step": 16340 }, { "entropy": 0.10342365652322769, "epoch": 3.8101177293390838, "grad_norm": 1.2421875, "learning_rate": 4.977968189614688e-05, "loss": 0.133, "mean_token_accuracy": 0.9625088393688201, "num_tokens": 32516572.0, "step": 16345 }, { "entropy": 0.09053916297852993, "epoch": 3.8112833663597154, "grad_norm": 5.8125, "learning_rate": 4.977939648362961e-05, "loss": 0.117, "mean_token_accuracy": 0.965921676158905, "num_tokens": 32538406.0, "step": 16350 }, { "entropy": 0.12717281207442283, "epoch": 3.8124490033803475, "grad_norm": 0.9609375, "learning_rate": 4.977911088800807e-05, "loss": 0.2096, "mean_token_accuracy": 0.9550489544868469, "num_tokens": 32572408.0, "step": 16355 }, { "entropy": 0.10790450349450112, "epoch": 3.813614640400979, "grad_norm": 8.625, "learning_rate": 4.977882510928652e-05, "loss": 0.1321, "mean_token_accuracy": 0.9582541406154632, "num_tokens": 32586738.0, "step": 16360 }, { "entropy": 0.11157775335013867, "epoch": 3.814780277421611, "grad_norm": 5.46875, "learning_rate": 4.977853914746922e-05, "loss": 0.1499, "mean_token_accuracy": 0.9643312692642212, "num_tokens": 32598309.0, "step": 16365 }, { "entropy": 0.10543946204707026, "epoch": 3.815945914442243, "grad_norm": 7.3125, "learning_rate": 4.9778253002560434e-05, "loss": 0.1986, "mean_token_accuracy": 0.9579822719097137, "num_tokens": 32626437.0, "step": 16370 }, { "entropy": 0.07996916975826025, "epoch": 3.8171115514628746, "grad_norm": 2.484375, "learning_rate": 4.9777966674564426e-05, "loss": 0.128, "mean_token_accuracy": 0.9667159378528595, "num_tokens": 32647112.0, "step": 16375 }, { "entropy": 0.15802016891539097, "epoch": 3.8182771884835063, "grad_norm": 3.609375, "learning_rate": 4.9777680163485465e-05, "loss": 0.2629, "mean_token_accuracy": 0.9438641905784607, "num_tokens": 32663012.0, "step": 16380 }, { "entropy": 0.08313553333282471, "epoch": 3.819442825504138, "grad_norm": 3.390625, "learning_rate": 4.977739346932783e-05, "loss": 0.0886, "mean_token_accuracy": 0.973399305343628, "num_tokens": 32685324.0, "step": 16385 }, { "entropy": 0.09466358497738839, "epoch": 3.8206084625247696, "grad_norm": 4.0625, "learning_rate": 4.977710659209579e-05, "loss": 0.1754, "mean_token_accuracy": 0.9577878236770629, "num_tokens": 32703309.0, "step": 16390 }, { "entropy": 0.09490567035973071, "epoch": 3.8217740995454017, "grad_norm": 1.09375, "learning_rate": 4.977681953179363e-05, "loss": 0.1239, "mean_token_accuracy": 0.9690981268882751, "num_tokens": 32725065.0, "step": 16395 }, { "entropy": 0.08475140482187271, "epoch": 3.8229397365660334, "grad_norm": 3.4375, "learning_rate": 4.9776532288425616e-05, "loss": 0.1152, "mean_token_accuracy": 0.9685661673545838, "num_tokens": 32742199.0, "step": 16400 }, { "entropy": 0.1452885389328003, "epoch": 3.824105373586665, "grad_norm": 3.21875, "learning_rate": 4.977624486199605e-05, "loss": 0.1667, "mean_token_accuracy": 0.9582402050495148, "num_tokens": 32753828.0, "step": 16405 }, { "entropy": 0.12028260957449674, "epoch": 3.825271010607297, "grad_norm": 2.6875, "learning_rate": 4.97759572525092e-05, "loss": 0.1261, "mean_token_accuracy": 0.9706174254417419, "num_tokens": 32782189.0, "step": 16410 }, { "entropy": 0.07044358663260937, "epoch": 3.826436647627929, "grad_norm": 1.0859375, "learning_rate": 4.9775669459969364e-05, "loss": 0.0639, "mean_token_accuracy": 0.9793649017810822, "num_tokens": 32816185.0, "step": 16415 }, { "entropy": 0.08048469722270965, "epoch": 3.8276022846485604, "grad_norm": 0.5859375, "learning_rate": 4.977538148438084e-05, "loss": 0.1187, "mean_token_accuracy": 0.9718262076377868, "num_tokens": 32835221.0, "step": 16420 }, { "entropy": 0.08252397803589702, "epoch": 3.828767921669192, "grad_norm": 1.0625, "learning_rate": 4.977509332574791e-05, "loss": 0.1253, "mean_token_accuracy": 0.9690129697322846, "num_tokens": 32869527.0, "step": 16425 }, { "entropy": 0.06499332496896386, "epoch": 3.8299335586898238, "grad_norm": 0.275390625, "learning_rate": 4.977480498407488e-05, "loss": 0.0647, "mean_token_accuracy": 0.979715633392334, "num_tokens": 32894850.0, "step": 16430 }, { "entropy": 0.13496083430945874, "epoch": 3.831099195710456, "grad_norm": 6.75, "learning_rate": 4.977451645936604e-05, "loss": 0.2355, "mean_token_accuracy": 0.9405577182769775, "num_tokens": 32926236.0, "step": 16435 }, { "entropy": 0.07254137042909861, "epoch": 3.8322648327310875, "grad_norm": 1.5078125, "learning_rate": 4.97742277516257e-05, "loss": 0.0907, "mean_token_accuracy": 0.9758977711200714, "num_tokens": 32950365.0, "step": 16440 }, { "entropy": 0.11544609442353249, "epoch": 3.833430469751719, "grad_norm": 4.125, "learning_rate": 4.977393886085816e-05, "loss": 0.1671, "mean_token_accuracy": 0.9559564173221589, "num_tokens": 32976927.0, "step": 16445 }, { "entropy": 0.1054431926459074, "epoch": 3.8345961067723513, "grad_norm": 4.625, "learning_rate": 4.977364978706773e-05, "loss": 0.1556, "mean_token_accuracy": 0.9585894703865051, "num_tokens": 32991700.0, "step": 16450 }, { "entropy": 0.10639844667166472, "epoch": 3.835761743792983, "grad_norm": 0.68359375, "learning_rate": 4.977336053025873e-05, "loss": 0.0992, "mean_token_accuracy": 0.9651355922222138, "num_tokens": 33013040.0, "step": 16455 }, { "entropy": 0.09124952163547277, "epoch": 3.8369273808136146, "grad_norm": 8.75, "learning_rate": 4.977307109043546e-05, "loss": 0.1246, "mean_token_accuracy": 0.9677917957305908, "num_tokens": 33033465.0, "step": 16460 }, { "entropy": 0.09483155831694604, "epoch": 3.8380930178342463, "grad_norm": 3.46875, "learning_rate": 4.9772781467602235e-05, "loss": 0.1548, "mean_token_accuracy": 0.9650797605514526, "num_tokens": 33046989.0, "step": 16465 }, { "entropy": 0.17525366619229316, "epoch": 3.839258654854878, "grad_norm": 0.64453125, "learning_rate": 4.977249166176339e-05, "loss": 0.3451, "mean_token_accuracy": 0.9297631740570068, "num_tokens": 33061848.0, "step": 16470 }, { "entropy": 0.08751837406307458, "epoch": 3.84042429187551, "grad_norm": 4.1875, "learning_rate": 4.9772201672923225e-05, "loss": 0.095, "mean_token_accuracy": 0.9782902419567108, "num_tokens": 33080890.0, "step": 16475 }, { "entropy": 0.07149492613971234, "epoch": 3.8415899288961417, "grad_norm": 1.328125, "learning_rate": 4.977191150108608e-05, "loss": 0.1256, "mean_token_accuracy": 0.9729875683784485, "num_tokens": 33095960.0, "step": 16480 }, { "entropy": 0.08944402430206537, "epoch": 3.8427555659167734, "grad_norm": 0.94921875, "learning_rate": 4.977162114625627e-05, "loss": 0.0872, "mean_token_accuracy": 0.9720249235630035, "num_tokens": 33126818.0, "step": 16485 }, { "entropy": 0.09480830989778041, "epoch": 3.8439212029374055, "grad_norm": 1.6640625, "learning_rate": 4.9771330608438136e-05, "loss": 0.1766, "mean_token_accuracy": 0.9600850522518158, "num_tokens": 33139661.0, "step": 16490 }, { "entropy": 0.08334624543786048, "epoch": 3.845086839958037, "grad_norm": 7.75, "learning_rate": 4.977103988763601e-05, "loss": 0.1408, "mean_token_accuracy": 0.9648266136646271, "num_tokens": 33154190.0, "step": 16495 }, { "entropy": 0.10106358705088496, "epoch": 3.846252476978669, "grad_norm": 1.328125, "learning_rate": 4.977074898385421e-05, "loss": 0.1453, "mean_token_accuracy": 0.9647497057914733, "num_tokens": 33185980.0, "step": 16500 }, { "entropy": 0.10884370524436235, "epoch": 3.847418113999301, "grad_norm": 0.90625, "learning_rate": 4.97704578970971e-05, "loss": 0.0902, "mean_token_accuracy": 0.9579053461551666, "num_tokens": 33212368.0, "step": 16505 }, { "entropy": 0.07987378798425197, "epoch": 3.8485837510199326, "grad_norm": 2.921875, "learning_rate": 4.9770166627369e-05, "loss": 0.0731, "mean_token_accuracy": 0.975167590379715, "num_tokens": 33240176.0, "step": 16510 }, { "entropy": 0.0864196315407753, "epoch": 3.849749388040564, "grad_norm": 1.8359375, "learning_rate": 4.9769875174674265e-05, "loss": 0.1128, "mean_token_accuracy": 0.9740726411342621, "num_tokens": 33252259.0, "step": 16515 }, { "entropy": 0.09710243921726942, "epoch": 3.850915025061196, "grad_norm": 3.921875, "learning_rate": 4.976958353901723e-05, "loss": 0.1266, "mean_token_accuracy": 0.9676422059535981, "num_tokens": 33281031.0, "step": 16520 }, { "entropy": 0.1039828835055232, "epoch": 3.8520806620818275, "grad_norm": 3.640625, "learning_rate": 4.9769291720402254e-05, "loss": 0.0985, "mean_token_accuracy": 0.9699716866016388, "num_tokens": 33308560.0, "step": 16525 }, { "entropy": 0.11291272137314082, "epoch": 3.8532462991024596, "grad_norm": 7.21875, "learning_rate": 4.976899971883368e-05, "loss": 0.1184, "mean_token_accuracy": 0.9514906764030456, "num_tokens": 33336129.0, "step": 16530 }, { "entropy": 0.12264008279889822, "epoch": 3.8544119361230913, "grad_norm": 4.75, "learning_rate": 4.976870753431587e-05, "loss": 0.185, "mean_token_accuracy": 0.9556744515895843, "num_tokens": 33354277.0, "step": 16535 }, { "entropy": 0.09388621672987937, "epoch": 3.855577573143723, "grad_norm": 6.34375, "learning_rate": 4.9768415166853175e-05, "loss": 0.1608, "mean_token_accuracy": 0.9654055655002594, "num_tokens": 33364865.0, "step": 16540 }, { "entropy": 0.14195291325449944, "epoch": 3.856743210164355, "grad_norm": 1.8203125, "learning_rate": 4.976812261644996e-05, "loss": 0.2461, "mean_token_accuracy": 0.9434653699398041, "num_tokens": 33375626.0, "step": 16545 }, { "entropy": 0.10139231681823731, "epoch": 3.8579088471849867, "grad_norm": 1.28125, "learning_rate": 4.976782988311058e-05, "loss": 0.1317, "mean_token_accuracy": 0.9686038613319397, "num_tokens": 33392944.0, "step": 16550 }, { "entropy": 0.10185308828949928, "epoch": 3.8590744842056184, "grad_norm": 4.3125, "learning_rate": 4.9767536966839414e-05, "loss": 0.1471, "mean_token_accuracy": 0.9665589034557343, "num_tokens": 33412158.0, "step": 16555 }, { "entropy": 0.09140795618295669, "epoch": 3.86024012122625, "grad_norm": 1.296875, "learning_rate": 4.976724386764081e-05, "loss": 0.1415, "mean_token_accuracy": 0.9718575298786163, "num_tokens": 33424867.0, "step": 16560 }, { "entropy": 0.09763126391917468, "epoch": 3.8614057582468817, "grad_norm": 9.1875, "learning_rate": 4.976695058551916e-05, "loss": 0.1007, "mean_token_accuracy": 0.9644618034362793, "num_tokens": 33443510.0, "step": 16565 }, { "entropy": 0.15136264748871325, "epoch": 3.862571395267514, "grad_norm": 5.4375, "learning_rate": 4.976665712047882e-05, "loss": 0.1573, "mean_token_accuracy": 0.9575160503387451, "num_tokens": 33452758.0, "step": 16570 }, { "entropy": 0.0832042837049812, "epoch": 3.8637370322881455, "grad_norm": 1.140625, "learning_rate": 4.976636347252417e-05, "loss": 0.1049, "mean_token_accuracy": 0.9735616624355317, "num_tokens": 33487033.0, "step": 16575 }, { "entropy": 0.130936024710536, "epoch": 3.864902669308777, "grad_norm": 11.75, "learning_rate": 4.976606964165959e-05, "loss": 0.3414, "mean_token_accuracy": 0.9347775518894196, "num_tokens": 33512831.0, "step": 16580 }, { "entropy": 0.09509001523256302, "epoch": 3.8660683063294092, "grad_norm": 0.85546875, "learning_rate": 4.9765775627889466e-05, "loss": 0.0911, "mean_token_accuracy": 0.9706315398216248, "num_tokens": 33540628.0, "step": 16585 }, { "entropy": 0.09184669237583876, "epoch": 3.867233943350041, "grad_norm": 3.25, "learning_rate": 4.9765481431218176e-05, "loss": 0.1339, "mean_token_accuracy": 0.9704009413719177, "num_tokens": 33557555.0, "step": 16590 }, { "entropy": 0.10518418364226818, "epoch": 3.8683995803706726, "grad_norm": 1.3671875, "learning_rate": 4.976518705165011e-05, "loss": 0.1223, "mean_token_accuracy": 0.9689244747161865, "num_tokens": 33582110.0, "step": 16595 }, { "entropy": 0.10216858349740505, "epoch": 3.869565217391304, "grad_norm": 4.1875, "learning_rate": 4.976489248918966e-05, "loss": 0.2179, "mean_token_accuracy": 0.9570053875446319, "num_tokens": 33592239.0, "step": 16600 }, { "entropy": 0.11914208475500346, "epoch": 3.870730854411936, "grad_norm": 1.375, "learning_rate": 4.976459774384121e-05, "loss": 0.165, "mean_token_accuracy": 0.9575796544551849, "num_tokens": 33607680.0, "step": 16605 }, { "entropy": 0.07731572240591049, "epoch": 3.871896491432568, "grad_norm": 1.3671875, "learning_rate": 4.976430281560917e-05, "loss": 0.1315, "mean_token_accuracy": 0.9717673599720001, "num_tokens": 33626618.0, "step": 16610 }, { "entropy": 0.1043876113370061, "epoch": 3.8730621284531996, "grad_norm": 2.484375, "learning_rate": 4.976400770449792e-05, "loss": 0.1537, "mean_token_accuracy": 0.9649846971035003, "num_tokens": 33645847.0, "step": 16615 }, { "entropy": 0.18505688859149813, "epoch": 3.8742277654738313, "grad_norm": 0.91796875, "learning_rate": 4.9763712410511874e-05, "loss": 0.2365, "mean_token_accuracy": 0.9463666260242463, "num_tokens": 33676871.0, "step": 16620 }, { "entropy": 0.08650575876235962, "epoch": 3.8753934024944634, "grad_norm": 4.75, "learning_rate": 4.9763416933655425e-05, "loss": 0.1445, "mean_token_accuracy": 0.967893385887146, "num_tokens": 33691010.0, "step": 16625 }, { "entropy": 0.16968498565256596, "epoch": 3.876559039515095, "grad_norm": 1.3203125, "learning_rate": 4.976312127393299e-05, "loss": 0.1554, "mean_token_accuracy": 0.9321939647197723, "num_tokens": 33724208.0, "step": 16630 }, { "entropy": 0.07792753130197524, "epoch": 3.8777246765357267, "grad_norm": 5.0, "learning_rate": 4.976282543134897e-05, "loss": 0.1136, "mean_token_accuracy": 0.9741957724094391, "num_tokens": 33742948.0, "step": 16635 }, { "entropy": 0.08758415430784225, "epoch": 3.878890313556359, "grad_norm": 1.0390625, "learning_rate": 4.976252940590778e-05, "loss": 0.1177, "mean_token_accuracy": 0.972603303194046, "num_tokens": 33766005.0, "step": 16640 }, { "entropy": 0.1088608767837286, "epoch": 3.8800559505769905, "grad_norm": 5.0625, "learning_rate": 4.9762233197613837e-05, "loss": 0.2298, "mean_token_accuracy": 0.9552813589572906, "num_tokens": 33779897.0, "step": 16645 }, { "entropy": 0.09404038712382316, "epoch": 3.881221587597622, "grad_norm": 3.140625, "learning_rate": 4.976193680647154e-05, "loss": 0.1389, "mean_token_accuracy": 0.9661530613899231, "num_tokens": 33795574.0, "step": 16650 }, { "entropy": 0.12309183105826378, "epoch": 3.882387224618254, "grad_norm": 5.0625, "learning_rate": 4.9761640232485334e-05, "loss": 0.2158, "mean_token_accuracy": 0.9480292320251464, "num_tokens": 33805200.0, "step": 16655 }, { "entropy": 0.07593281920999288, "epoch": 3.8835528616388855, "grad_norm": 2.703125, "learning_rate": 4.976134347565963e-05, "loss": 0.0476, "mean_token_accuracy": 0.9800086498260498, "num_tokens": 33846497.0, "step": 16660 }, { "entropy": 0.07238452583551407, "epoch": 3.8847184986595176, "grad_norm": 2.3125, "learning_rate": 4.976104653599884e-05, "loss": 0.0828, "mean_token_accuracy": 0.9750954031944274, "num_tokens": 33890541.0, "step": 16665 }, { "entropy": 0.11538115590810775, "epoch": 3.8858841356801492, "grad_norm": 4.03125, "learning_rate": 4.976074941350741e-05, "loss": 0.1476, "mean_token_accuracy": 0.9668866634368897, "num_tokens": 33904145.0, "step": 16670 }, { "entropy": 0.1241513341665268, "epoch": 3.887049772700781, "grad_norm": 0.73046875, "learning_rate": 4.976045210818977e-05, "loss": 0.1267, "mean_token_accuracy": 0.966703736782074, "num_tokens": 33924578.0, "step": 16675 }, { "entropy": 0.0924358457326889, "epoch": 3.888215409721413, "grad_norm": 8.9375, "learning_rate": 4.976015462005035e-05, "loss": 0.1662, "mean_token_accuracy": 0.9649093568325042, "num_tokens": 33941729.0, "step": 16680 }, { "entropy": 0.11906333230435848, "epoch": 3.8893810467420447, "grad_norm": 1.4921875, "learning_rate": 4.975985694909358e-05, "loss": 0.1623, "mean_token_accuracy": 0.9645924746990204, "num_tokens": 33965840.0, "step": 16685 }, { "entropy": 0.07723705116659403, "epoch": 3.8905466837626763, "grad_norm": 2.03125, "learning_rate": 4.975955909532391e-05, "loss": 0.0603, "mean_token_accuracy": 0.9721529066562653, "num_tokens": 33993315.0, "step": 16690 }, { "entropy": 0.15692530367523433, "epoch": 3.891712320783308, "grad_norm": 0.890625, "learning_rate": 4.9759261058745756e-05, "loss": 0.1724, "mean_token_accuracy": 0.9513945758342743, "num_tokens": 34021121.0, "step": 16695 }, { "entropy": 0.06238477006554603, "epoch": 3.8928779578039396, "grad_norm": 4.59375, "learning_rate": 4.97589628393636e-05, "loss": 0.0723, "mean_token_accuracy": 0.9769518494606018, "num_tokens": 34037255.0, "step": 16700 }, { "entropy": 0.0868049081414938, "epoch": 3.8940435948245717, "grad_norm": 3.234375, "learning_rate": 4.9758664437181856e-05, "loss": 0.1231, "mean_token_accuracy": 0.9709369957447052, "num_tokens": 34049033.0, "step": 16705 }, { "entropy": 0.12496979609131813, "epoch": 3.8952092318452034, "grad_norm": 3.375, "learning_rate": 4.9758365852205e-05, "loss": 0.2056, "mean_token_accuracy": 0.9534848451614379, "num_tokens": 34059272.0, "step": 16710 }, { "entropy": 0.08685902096331119, "epoch": 3.896374868865835, "grad_norm": 4.21875, "learning_rate": 4.975806708443746e-05, "loss": 0.1175, "mean_token_accuracy": 0.970902019739151, "num_tokens": 34072449.0, "step": 16715 }, { "entropy": 0.07332117343321443, "epoch": 3.897540505886467, "grad_norm": 5.65625, "learning_rate": 4.975776813388371e-05, "loss": 0.118, "mean_token_accuracy": 0.9707030892372132, "num_tokens": 34099071.0, "step": 16720 }, { "entropy": 0.1274700254201889, "epoch": 3.898706142907099, "grad_norm": 7.5625, "learning_rate": 4.9757469000548194e-05, "loss": 0.2366, "mean_token_accuracy": 0.9488331198692321, "num_tokens": 34115790.0, "step": 16725 }, { "entropy": 0.18221108466386796, "epoch": 3.8998717799277305, "grad_norm": 0.84375, "learning_rate": 4.9757169684435385e-05, "loss": 0.142, "mean_token_accuracy": 0.9406643450260163, "num_tokens": 34150985.0, "step": 16730 }, { "entropy": 0.10956341233104468, "epoch": 3.901037416948362, "grad_norm": 4.6875, "learning_rate": 4.975687018554974e-05, "loss": 0.1113, "mean_token_accuracy": 0.9680880844593048, "num_tokens": 34177333.0, "step": 16735 }, { "entropy": 0.093040444329381, "epoch": 3.902203053968994, "grad_norm": 7.5625, "learning_rate": 4.975657050389572e-05, "loss": 0.0971, "mean_token_accuracy": 0.9710028231143951, "num_tokens": 34205723.0, "step": 16740 }, { "entropy": 0.08593805404379964, "epoch": 3.903368690989626, "grad_norm": 6.875, "learning_rate": 4.9756270639477804e-05, "loss": 0.1844, "mean_token_accuracy": 0.9584748089313507, "num_tokens": 34221572.0, "step": 16745 }, { "entropy": 0.11587801575660706, "epoch": 3.9045343280102576, "grad_norm": 3.90625, "learning_rate": 4.9755970592300454e-05, "loss": 0.1936, "mean_token_accuracy": 0.9567054510116577, "num_tokens": 34231895.0, "step": 16750 }, { "entropy": 0.08428452573716641, "epoch": 3.9056999650308892, "grad_norm": 2.6875, "learning_rate": 4.975567036236815e-05, "loss": 0.2177, "mean_token_accuracy": 0.9556152582168579, "num_tokens": 34261270.0, "step": 16755 }, { "entropy": 0.08318189261481165, "epoch": 3.9068656020515213, "grad_norm": 0.90625, "learning_rate": 4.975536994968537e-05, "loss": 0.0855, "mean_token_accuracy": 0.9774995982646942, "num_tokens": 34288938.0, "step": 16760 }, { "entropy": 0.07968087457120418, "epoch": 3.908031239072153, "grad_norm": 0.6796875, "learning_rate": 4.975506935425659e-05, "loss": 0.1019, "mean_token_accuracy": 0.9670353651046752, "num_tokens": 34311606.0, "step": 16765 }, { "entropy": 0.11844033598899842, "epoch": 3.9091968760927847, "grad_norm": 0.7421875, "learning_rate": 4.975476857608629e-05, "loss": 0.1238, "mean_token_accuracy": 0.962626975774765, "num_tokens": 34334720.0, "step": 16770 }, { "entropy": 0.11227045767009258, "epoch": 3.9103625131134163, "grad_norm": 8.3125, "learning_rate": 4.9754467615178965e-05, "loss": 0.1875, "mean_token_accuracy": 0.9611839234828949, "num_tokens": 34365228.0, "step": 16775 }, { "entropy": 0.09844578094780446, "epoch": 3.9115281501340484, "grad_norm": 5.53125, "learning_rate": 4.975416647153909e-05, "loss": 0.1339, "mean_token_accuracy": 0.9626381635665894, "num_tokens": 34385551.0, "step": 16780 }, { "entropy": 0.08762693898752331, "epoch": 3.91269378715468, "grad_norm": 1.2578125, "learning_rate": 4.975386514517116e-05, "loss": 0.1465, "mean_token_accuracy": 0.9660818219184876, "num_tokens": 34426491.0, "step": 16785 }, { "entropy": 0.11643909402191639, "epoch": 3.9138594241753117, "grad_norm": 3.0625, "learning_rate": 4.975356363607967e-05, "loss": 0.1443, "mean_token_accuracy": 0.9647420525550843, "num_tokens": 34445386.0, "step": 16790 }, { "entropy": 0.07333908714354039, "epoch": 3.9150250611959434, "grad_norm": 2.234375, "learning_rate": 4.975326194426913e-05, "loss": 0.0984, "mean_token_accuracy": 0.9713580191135407, "num_tokens": 34463617.0, "step": 16795 }, { "entropy": 0.13890503272414206, "epoch": 3.9161906982165755, "grad_norm": 2.921875, "learning_rate": 4.9752960069744e-05, "loss": 0.17, "mean_token_accuracy": 0.9563362002372742, "num_tokens": 34491601.0, "step": 16800 }, { "entropy": 0.0995341569185257, "epoch": 3.917356335237207, "grad_norm": 9.625, "learning_rate": 4.975265801250882e-05, "loss": 0.2026, "mean_token_accuracy": 0.9588484287261962, "num_tokens": 34500896.0, "step": 16805 }, { "entropy": 0.13597904723137616, "epoch": 3.918521972257839, "grad_norm": 1.40625, "learning_rate": 4.9752355772568084e-05, "loss": 0.2381, "mean_token_accuracy": 0.9384510278701782, "num_tokens": 34515025.0, "step": 16810 }, { "entropy": 0.11005090977996587, "epoch": 3.919687609278471, "grad_norm": 8.8125, "learning_rate": 4.9752053349926284e-05, "loss": 0.1877, "mean_token_accuracy": 0.9612878561019897, "num_tokens": 34535079.0, "step": 16815 }, { "entropy": 0.1047309897840023, "epoch": 3.9208532462991026, "grad_norm": 6.03125, "learning_rate": 4.9751750744587955e-05, "loss": 0.1837, "mean_token_accuracy": 0.9651427209377289, "num_tokens": 34563356.0, "step": 16820 }, { "entropy": 0.09921911545097828, "epoch": 3.9220188833197343, "grad_norm": 1.8359375, "learning_rate": 4.975144795655758e-05, "loss": 0.124, "mean_token_accuracy": 0.9721204698085785, "num_tokens": 34576625.0, "step": 16825 }, { "entropy": 0.15272005051374435, "epoch": 3.923184520340366, "grad_norm": 2.234375, "learning_rate": 4.97511449858397e-05, "loss": 0.1905, "mean_token_accuracy": 0.9491037011146546, "num_tokens": 34615813.0, "step": 16830 }, { "entropy": 0.08232778124511242, "epoch": 3.9243501573609976, "grad_norm": 1.359375, "learning_rate": 4.975084183243882e-05, "loss": 0.1059, "mean_token_accuracy": 0.9762903690338135, "num_tokens": 34626874.0, "step": 16835 }, { "entropy": 0.0976740401238203, "epoch": 3.9255157943816297, "grad_norm": 5.96875, "learning_rate": 4.975053849635946e-05, "loss": 0.1366, "mean_token_accuracy": 0.9610688865184784, "num_tokens": 34641904.0, "step": 16840 }, { "entropy": 0.07497776001691818, "epoch": 3.9266814314022613, "grad_norm": 0.61328125, "learning_rate": 4.9750234977606135e-05, "loss": 0.1081, "mean_token_accuracy": 0.9722188532352447, "num_tokens": 34663203.0, "step": 16845 }, { "entropy": 0.10160439331084489, "epoch": 3.927847068422893, "grad_norm": 0.82421875, "learning_rate": 4.9749931276183394e-05, "loss": 0.1481, "mean_token_accuracy": 0.9629201531410218, "num_tokens": 34682215.0, "step": 16850 }, { "entropy": 0.08226745296269655, "epoch": 3.929012705443525, "grad_norm": 1.328125, "learning_rate": 4.974962739209574e-05, "loss": 0.0816, "mean_token_accuracy": 0.9744136035442352, "num_tokens": 34698813.0, "step": 16855 }, { "entropy": 0.16998137403279542, "epoch": 3.9301783424641568, "grad_norm": 1.546875, "learning_rate": 4.974932332534773e-05, "loss": 0.1684, "mean_token_accuracy": 0.9295632243156433, "num_tokens": 34727365.0, "step": 16860 }, { "entropy": 0.11339530013501645, "epoch": 3.9313439794847884, "grad_norm": 3.453125, "learning_rate": 4.974901907594388e-05, "loss": 0.2006, "mean_token_accuracy": 0.9534684479236603, "num_tokens": 34737824.0, "step": 16865 }, { "entropy": 0.08613226562738419, "epoch": 3.93250961650542, "grad_norm": 2.046875, "learning_rate": 4.9748714643888736e-05, "loss": 0.0918, "mean_token_accuracy": 0.9665167927742004, "num_tokens": 34767020.0, "step": 16870 }, { "entropy": 0.08076264001429082, "epoch": 3.9336752535260517, "grad_norm": 2.421875, "learning_rate": 4.9748410029186824e-05, "loss": 0.0912, "mean_token_accuracy": 0.978045392036438, "num_tokens": 34781216.0, "step": 16875 }, { "entropy": 0.11103322636336088, "epoch": 3.934840890546684, "grad_norm": 6.34375, "learning_rate": 4.974810523184271e-05, "loss": 0.1654, "mean_token_accuracy": 0.9646231949329376, "num_tokens": 34795320.0, "step": 16880 }, { "entropy": 0.09688258673995734, "epoch": 3.9360065275673155, "grad_norm": 0.8515625, "learning_rate": 4.974780025186091e-05, "loss": 0.141, "mean_token_accuracy": 0.9685312807559967, "num_tokens": 34816339.0, "step": 16885 }, { "entropy": 0.07648933194577694, "epoch": 3.937172164587947, "grad_norm": 1.6328125, "learning_rate": 4.9747495089246e-05, "loss": 0.0967, "mean_token_accuracy": 0.978608375787735, "num_tokens": 34833298.0, "step": 16890 }, { "entropy": 0.08467755373567343, "epoch": 3.9383378016085793, "grad_norm": 0.5, "learning_rate": 4.974718974400251e-05, "loss": 0.0979, "mean_token_accuracy": 0.9738011240959168, "num_tokens": 34862645.0, "step": 16895 }, { "entropy": 0.11560116838663817, "epoch": 3.939503438629211, "grad_norm": 1.25, "learning_rate": 4.9746884216135e-05, "loss": 0.1156, "mean_token_accuracy": 0.9693970024585724, "num_tokens": 34885171.0, "step": 16900 }, { "entropy": 0.08131022192537785, "epoch": 3.9406690756498426, "grad_norm": 4.03125, "learning_rate": 4.974657850564802e-05, "loss": 0.1359, "mean_token_accuracy": 0.9690242052078247, "num_tokens": 34902737.0, "step": 16905 }, { "entropy": 0.08744903868064284, "epoch": 3.9418347126704743, "grad_norm": 1.59375, "learning_rate": 4.974627261254614e-05, "loss": 0.1904, "mean_token_accuracy": 0.9598405420780182, "num_tokens": 34922558.0, "step": 16910 }, { "entropy": 0.09291955698281526, "epoch": 3.9430003496911064, "grad_norm": 5.53125, "learning_rate": 4.974596653683392e-05, "loss": 0.1861, "mean_token_accuracy": 0.9584287703037262, "num_tokens": 34940507.0, "step": 16915 }, { "entropy": 0.09390394520014525, "epoch": 3.944165986711738, "grad_norm": 4.59375, "learning_rate": 4.9745660278515916e-05, "loss": 0.1347, "mean_token_accuracy": 0.9714871406555176, "num_tokens": 34960577.0, "step": 16920 }, { "entropy": 0.13802707754075527, "epoch": 3.9453316237323697, "grad_norm": 1.7734375, "learning_rate": 4.97453538375967e-05, "loss": 0.1361, "mean_token_accuracy": 0.9666981935501099, "num_tokens": 34979635.0, "step": 16925 }, { "entropy": 0.07457001972943544, "epoch": 3.9464972607530013, "grad_norm": 0.6875, "learning_rate": 4.974504721408084e-05, "loss": 0.1188, "mean_token_accuracy": 0.9700695991516113, "num_tokens": 35005275.0, "step": 16930 }, { "entropy": 0.09323781207203866, "epoch": 3.9476628977736334, "grad_norm": 7.4375, "learning_rate": 4.974474040797291e-05, "loss": 0.1387, "mean_token_accuracy": 0.9643499076366424, "num_tokens": 35024268.0, "step": 16935 }, { "entropy": 0.10064857602119445, "epoch": 3.948828534794265, "grad_norm": 3.96875, "learning_rate": 4.974443341927748e-05, "loss": 0.204, "mean_token_accuracy": 0.9560014724731445, "num_tokens": 35034041.0, "step": 16940 }, { "entropy": 0.11008808445185422, "epoch": 3.9499941718148968, "grad_norm": 7.28125, "learning_rate": 4.974412624799913e-05, "loss": 0.1293, "mean_token_accuracy": 0.9637737035751343, "num_tokens": 35054443.0, "step": 16945 }, { "entropy": 0.1091458585113287, "epoch": 3.951159808835529, "grad_norm": 2.984375, "learning_rate": 4.9743818894142445e-05, "loss": 0.1661, "mean_token_accuracy": 0.9582258284091949, "num_tokens": 35072895.0, "step": 16950 }, { "entropy": 0.09630255661904812, "epoch": 3.9523254458561605, "grad_norm": 2.28125, "learning_rate": 4.974351135771201e-05, "loss": 0.1452, "mean_token_accuracy": 0.9703953325748443, "num_tokens": 35084442.0, "step": 16955 }, { "entropy": 0.08614437934011221, "epoch": 3.953491082876792, "grad_norm": 3.1875, "learning_rate": 4.9743203638712394e-05, "loss": 0.1202, "mean_token_accuracy": 0.9727041780948639, "num_tokens": 35118913.0, "step": 16960 }, { "entropy": 0.09501389507204294, "epoch": 3.954656719897424, "grad_norm": 6.75, "learning_rate": 4.9742895737148204e-05, "loss": 0.1839, "mean_token_accuracy": 0.9623470664024353, "num_tokens": 35142002.0, "step": 16965 }, { "entropy": 0.10413766689598561, "epoch": 3.9558223569180555, "grad_norm": 3.21875, "learning_rate": 4.974258765302402e-05, "loss": 0.1568, "mean_token_accuracy": 0.9626488566398621, "num_tokens": 35160878.0, "step": 16970 }, { "entropy": 0.13350182436406613, "epoch": 3.9569879939386876, "grad_norm": 4.6875, "learning_rate": 4.974227938634444e-05, "loss": 0.1619, "mean_token_accuracy": 0.9620459258556366, "num_tokens": 35170449.0, "step": 16975 }, { "entropy": 0.09258143836632371, "epoch": 3.9581536309593193, "grad_norm": 11.0, "learning_rate": 4.974197093711407e-05, "loss": 0.1253, "mean_token_accuracy": 0.9680348873138428, "num_tokens": 35195509.0, "step": 16980 }, { "entropy": 0.12735018711537122, "epoch": 3.959319267979951, "grad_norm": 2.53125, "learning_rate": 4.974166230533749e-05, "loss": 0.1687, "mean_token_accuracy": 0.9616983890533447, "num_tokens": 35209393.0, "step": 16985 }, { "entropy": 0.08677174393087625, "epoch": 3.960484905000583, "grad_norm": 8.25, "learning_rate": 4.974135349101932e-05, "loss": 0.1203, "mean_token_accuracy": 0.9645900964736939, "num_tokens": 35226604.0, "step": 16990 }, { "entropy": 0.07679356653243304, "epoch": 3.9616505420212147, "grad_norm": 1.640625, "learning_rate": 4.9741044494164155e-05, "loss": 0.0642, "mean_token_accuracy": 0.9766354739665986, "num_tokens": 35255439.0, "step": 16995 }, { "entropy": 0.12029009442776442, "epoch": 3.9628161790418464, "grad_norm": 2.5625, "learning_rate": 4.97407353147766e-05, "loss": 0.1779, "mean_token_accuracy": 0.9526813089847564, "num_tokens": 35273550.0, "step": 17000 }, { "entropy": 0.10844596447423101, "epoch": 3.963981816062478, "grad_norm": 4.90625, "learning_rate": 4.9740425952861275e-05, "loss": 0.1563, "mean_token_accuracy": 0.964635568857193, "num_tokens": 35287014.0, "step": 17005 }, { "entropy": 0.10331882648169995, "epoch": 3.9651474530831097, "grad_norm": 1.078125, "learning_rate": 4.9740116408422786e-05, "loss": 0.1502, "mean_token_accuracy": 0.9674645781517028, "num_tokens": 35308920.0, "step": 17010 }, { "entropy": 0.08422896657139063, "epoch": 3.966313090103742, "grad_norm": 1.59375, "learning_rate": 4.973980668146575e-05, "loss": 0.1126, "mean_token_accuracy": 0.9708844125270844, "num_tokens": 35324331.0, "step": 17015 }, { "entropy": 0.09905137531459332, "epoch": 3.9674787271243734, "grad_norm": 4.6875, "learning_rate": 4.973949677199479e-05, "loss": 0.1579, "mean_token_accuracy": 0.9681069612503052, "num_tokens": 35337312.0, "step": 17020 }, { "entropy": 0.10593837816268206, "epoch": 3.968644364145005, "grad_norm": 8.75, "learning_rate": 4.9739186680014525e-05, "loss": 0.1572, "mean_token_accuracy": 0.9576498866081238, "num_tokens": 35369925.0, "step": 17025 }, { "entropy": 0.1549563642591238, "epoch": 3.969810001165637, "grad_norm": 3.921875, "learning_rate": 4.973887640552958e-05, "loss": 0.2213, "mean_token_accuracy": 0.9465202629566193, "num_tokens": 35394038.0, "step": 17030 }, { "entropy": 0.0999618673697114, "epoch": 3.970975638186269, "grad_norm": 0.8203125, "learning_rate": 4.973856594854457e-05, "loss": 0.1193, "mean_token_accuracy": 0.9646108150482178, "num_tokens": 35422201.0, "step": 17035 }, { "entropy": 0.0862261445261538, "epoch": 3.9721412752069005, "grad_norm": 0.94140625, "learning_rate": 4.973825530906414e-05, "loss": 0.0418, "mean_token_accuracy": 0.9761014401912689, "num_tokens": 35458470.0, "step": 17040 }, { "entropy": 0.0714691722765565, "epoch": 3.973306912227532, "grad_norm": 0.87109375, "learning_rate": 4.9737944487092914e-05, "loss": 0.0984, "mean_token_accuracy": 0.9735678911209107, "num_tokens": 35478787.0, "step": 17045 }, { "entropy": 0.12230119397863745, "epoch": 3.9744725492481643, "grad_norm": 0.75, "learning_rate": 4.973763348263553e-05, "loss": 0.1428, "mean_token_accuracy": 0.9668144583702087, "num_tokens": 35505383.0, "step": 17050 }, { "entropy": 0.1031965110450983, "epoch": 3.975638186268796, "grad_norm": 3.875, "learning_rate": 4.973732229569662e-05, "loss": 0.1709, "mean_token_accuracy": 0.9613557398319245, "num_tokens": 35518250.0, "step": 17055 }, { "entropy": 0.09054919630289078, "epoch": 3.9768038232894276, "grad_norm": 4.78125, "learning_rate": 4.9737010926280844e-05, "loss": 0.0919, "mean_token_accuracy": 0.968346893787384, "num_tokens": 35553291.0, "step": 17060 }, { "entropy": 0.1308944163378328, "epoch": 3.9779694603100593, "grad_norm": 0.69921875, "learning_rate": 4.9736699374392816e-05, "loss": 0.1891, "mean_token_accuracy": 0.9522222697734832, "num_tokens": 35577233.0, "step": 17065 }, { "entropy": 0.09694569632411003, "epoch": 3.9791350973306914, "grad_norm": 5.125, "learning_rate": 4.9736387640037195e-05, "loss": 0.1191, "mean_token_accuracy": 0.969031709432602, "num_tokens": 35610320.0, "step": 17070 }, { "entropy": 0.08072161357849836, "epoch": 3.980300734351323, "grad_norm": 3.25, "learning_rate": 4.9736075723218636e-05, "loss": 0.1532, "mean_token_accuracy": 0.9647042334079743, "num_tokens": 35626307.0, "step": 17075 }, { "entropy": 0.07369415275752544, "epoch": 3.9814663713719547, "grad_norm": 1.1015625, "learning_rate": 4.973576362394178e-05, "loss": 0.0809, "mean_token_accuracy": 0.9759980022907258, "num_tokens": 35644968.0, "step": 17080 }, { "entropy": 0.10177280581556261, "epoch": 3.982632008392587, "grad_norm": 1.9765625, "learning_rate": 4.973545134221128e-05, "loss": 0.0818, "mean_token_accuracy": 0.9739609241485596, "num_tokens": 35666050.0, "step": 17085 }, { "entropy": 0.06352849621325732, "epoch": 3.9837976454132185, "grad_norm": 0.921875, "learning_rate": 4.973513887803181e-05, "loss": 0.0537, "mean_token_accuracy": 0.9793843626976013, "num_tokens": 35689090.0, "step": 17090 }, { "entropy": 0.09791185222566127, "epoch": 3.98496328243385, "grad_norm": 4.53125, "learning_rate": 4.973482623140801e-05, "loss": 0.1461, "mean_token_accuracy": 0.9674197316169739, "num_tokens": 35698247.0, "step": 17095 }, { "entropy": 0.06252257125452161, "epoch": 3.986128919454482, "grad_norm": 0.59375, "learning_rate": 4.973451340234454e-05, "loss": 0.076, "mean_token_accuracy": 0.9785351634025574, "num_tokens": 35722602.0, "step": 17100 }, { "entropy": 0.09273346532136202, "epoch": 3.9872945564751134, "grad_norm": 0.5546875, "learning_rate": 4.973420039084608e-05, "loss": 0.1375, "mean_token_accuracy": 0.965330445766449, "num_tokens": 35759998.0, "step": 17105 }, { "entropy": 0.10029075648635626, "epoch": 3.9884601934957455, "grad_norm": 6.0, "learning_rate": 4.97338871969173e-05, "loss": 0.1704, "mean_token_accuracy": 0.962207305431366, "num_tokens": 35778283.0, "step": 17110 }, { "entropy": 0.16393357664346694, "epoch": 3.989625830516377, "grad_norm": 7.5625, "learning_rate": 4.973357382056285e-05, "loss": 0.2825, "mean_token_accuracy": 0.9334946393966674, "num_tokens": 35790208.0, "step": 17115 }, { "entropy": 0.10811770148575306, "epoch": 3.990791467537009, "grad_norm": 2.953125, "learning_rate": 4.9733260261787415e-05, "loss": 0.1301, "mean_token_accuracy": 0.9654451310634613, "num_tokens": 35807988.0, "step": 17120 }, { "entropy": 0.11802220270037651, "epoch": 3.991957104557641, "grad_norm": 0.734375, "learning_rate": 4.973294652059568e-05, "loss": 0.121, "mean_token_accuracy": 0.9620232224464417, "num_tokens": 35830113.0, "step": 17125 }, { "entropy": 0.12250925246626139, "epoch": 3.9931227415782726, "grad_norm": 5.1875, "learning_rate": 4.9732632596992304e-05, "loss": 0.1242, "mean_token_accuracy": 0.9520814001560212, "num_tokens": 35852219.0, "step": 17130 }, { "entropy": 0.11308533251285553, "epoch": 3.9942883785989043, "grad_norm": 5.9375, "learning_rate": 4.973231849098197e-05, "loss": 0.1257, "mean_token_accuracy": 0.9568483293056488, "num_tokens": 35869300.0, "step": 17135 }, { "entropy": 0.0839522771537304, "epoch": 3.995454015619536, "grad_norm": 3.046875, "learning_rate": 4.973200420256938e-05, "loss": 0.1611, "mean_token_accuracy": 0.9654144763946533, "num_tokens": 35886692.0, "step": 17140 }, { "entropy": 0.09023104514926672, "epoch": 3.9966196526401676, "grad_norm": 4.96875, "learning_rate": 4.9731689731759204e-05, "loss": 0.0774, "mean_token_accuracy": 0.9785473227500916, "num_tokens": 35910204.0, "step": 17145 }, { "entropy": 0.11231829710304737, "epoch": 3.9977852896607997, "grad_norm": 2.640625, "learning_rate": 4.973137507855614e-05, "loss": 0.0889, "mean_token_accuracy": 0.9724850535392762, "num_tokens": 35927582.0, "step": 17150 }, { "entropy": 0.0904229398816824, "epoch": 3.9989509266814314, "grad_norm": 3.125, "learning_rate": 4.9731060242964875e-05, "loss": 0.1323, "mean_token_accuracy": 0.967452985048294, "num_tokens": 35941857.0, "step": 17155 }, { "entropy": 0.09246301154295604, "epoch": 4.0, "grad_norm": 2.234375, "learning_rate": 4.97307452249901e-05, "loss": 0.107, "mean_token_accuracy": 0.9710255795054965, "num_tokens": 35961500.0, "step": 17160 }, { "entropy": 0.07402282971888781, "epoch": 4.001165637020632, "grad_norm": 1.84375, "learning_rate": 4.973043002463653e-05, "loss": 0.0689, "mean_token_accuracy": 0.9799567878246307, "num_tokens": 35977206.0, "step": 17165 }, { "entropy": 0.054261915292590855, "epoch": 4.002331274041263, "grad_norm": 2.078125, "learning_rate": 4.973011464190884e-05, "loss": 0.0509, "mean_token_accuracy": 0.9852329611778259, "num_tokens": 35999402.0, "step": 17170 }, { "entropy": 0.07321383291855454, "epoch": 4.003496911061895, "grad_norm": 1.34375, "learning_rate": 4.9729799076811756e-05, "loss": 0.0939, "mean_token_accuracy": 0.9778148233890533, "num_tokens": 36018866.0, "step": 17175 }, { "entropy": 0.07864432521164418, "epoch": 4.0046625480825275, "grad_norm": 4.71875, "learning_rate": 4.972948332934997e-05, "loss": 0.1028, "mean_token_accuracy": 0.9760511696338654, "num_tokens": 36037205.0, "step": 17180 }, { "entropy": 0.04691657172515988, "epoch": 4.005828185103159, "grad_norm": 0.337890625, "learning_rate": 4.972916739952819e-05, "loss": 0.0351, "mean_token_accuracy": 0.9869137167930603, "num_tokens": 36064889.0, "step": 17185 }, { "entropy": 0.0748681684024632, "epoch": 4.006993822123791, "grad_norm": 5.0625, "learning_rate": 4.972885128735113e-05, "loss": 0.0701, "mean_token_accuracy": 0.9769394755363464, "num_tokens": 36098320.0, "step": 17190 }, { "entropy": 0.08668388687074184, "epoch": 4.008159459144422, "grad_norm": 2.75, "learning_rate": 4.972853499282351e-05, "loss": 0.1585, "mean_token_accuracy": 0.9724346101284027, "num_tokens": 36117696.0, "step": 17195 }, { "entropy": 0.07840844243764877, "epoch": 4.009325096165054, "grad_norm": 3.40625, "learning_rate": 4.9728218515950034e-05, "loss": 0.0892, "mean_token_accuracy": 0.9731666326522828, "num_tokens": 36134581.0, "step": 17200 }, { "entropy": 0.07370718382298946, "epoch": 4.010490733185686, "grad_norm": 0.734375, "learning_rate": 4.972790185673544e-05, "loss": 0.0803, "mean_token_accuracy": 0.9789455652236938, "num_tokens": 36153047.0, "step": 17205 }, { "entropy": 0.083255111053586, "epoch": 4.0116563702063175, "grad_norm": 3.453125, "learning_rate": 4.972758501518442e-05, "loss": 0.0689, "mean_token_accuracy": 0.9771815776824951, "num_tokens": 36185090.0, "step": 17210 }, { "entropy": 0.05564054492861033, "epoch": 4.01282200722695, "grad_norm": 0.97265625, "learning_rate": 4.972726799130173e-05, "loss": 0.0613, "mean_token_accuracy": 0.9788758993148804, "num_tokens": 36198374.0, "step": 17215 }, { "entropy": 0.061925210803747174, "epoch": 4.013987644247582, "grad_norm": 0.8828125, "learning_rate": 4.9726950785092074e-05, "loss": 0.077, "mean_token_accuracy": 0.9766283810138703, "num_tokens": 36219825.0, "step": 17220 }, { "entropy": 0.05524555239826441, "epoch": 4.015153281268213, "grad_norm": 3.859375, "learning_rate": 4.9726633396560195e-05, "loss": 0.0552, "mean_token_accuracy": 0.9815401554107666, "num_tokens": 36243277.0, "step": 17225 }, { "entropy": 0.05170870460569858, "epoch": 4.016318918288845, "grad_norm": 0.86328125, "learning_rate": 4.9726315825710824e-05, "loss": 0.0542, "mean_token_accuracy": 0.9843271493911743, "num_tokens": 36258311.0, "step": 17230 }, { "entropy": 0.08237220542505383, "epoch": 4.017484555309476, "grad_norm": 1.1953125, "learning_rate": 4.972599807254869e-05, "loss": 0.0775, "mean_token_accuracy": 0.9783424258232116, "num_tokens": 36277676.0, "step": 17235 }, { "entropy": 0.09206105470657348, "epoch": 4.018650192330108, "grad_norm": 2.28125, "learning_rate": 4.972568013707854e-05, "loss": 0.115, "mean_token_accuracy": 0.9744808137416839, "num_tokens": 36285581.0, "step": 17240 }, { "entropy": 0.061206897348165513, "epoch": 4.01981582935074, "grad_norm": 2.5625, "learning_rate": 4.9725362019305113e-05, "loss": 0.0575, "mean_token_accuracy": 0.9826982498168946, "num_tokens": 36299733.0, "step": 17245 }, { "entropy": 0.07337401024997234, "epoch": 4.020981466371372, "grad_norm": 3.25, "learning_rate": 4.9725043719233145e-05, "loss": 0.1017, "mean_token_accuracy": 0.9742031693458557, "num_tokens": 36309070.0, "step": 17250 }, { "entropy": 0.07909293696284295, "epoch": 4.022147103392004, "grad_norm": 1.046875, "learning_rate": 4.972472523686739e-05, "loss": 0.095, "mean_token_accuracy": 0.9729432463645935, "num_tokens": 36320642.0, "step": 17255 }, { "entropy": 0.07407073359936475, "epoch": 4.023312740412636, "grad_norm": 3.109375, "learning_rate": 4.972440657221259e-05, "loss": 0.0553, "mean_token_accuracy": 0.9820687472820282, "num_tokens": 36339974.0, "step": 17260 }, { "entropy": 0.05830348208546639, "epoch": 4.024478377433267, "grad_norm": 0.97265625, "learning_rate": 4.9724087725273504e-05, "loss": 0.0615, "mean_token_accuracy": 0.9839694261550903, "num_tokens": 36352595.0, "step": 17265 }, { "entropy": 0.07738137934356928, "epoch": 4.025644014453899, "grad_norm": 0.451171875, "learning_rate": 4.972376869605489e-05, "loss": 0.0682, "mean_token_accuracy": 0.9771198868751526, "num_tokens": 36371942.0, "step": 17270 }, { "entropy": 0.07826981553807855, "epoch": 4.02680965147453, "grad_norm": 1.9765625, "learning_rate": 4.972344948456149e-05, "loss": 0.0744, "mean_token_accuracy": 0.9794186532497406, "num_tokens": 36402138.0, "step": 17275 }, { "entropy": 0.07734978701919318, "epoch": 4.0279752884951625, "grad_norm": 0.59375, "learning_rate": 4.9723130090798083e-05, "loss": 0.0745, "mean_token_accuracy": 0.9787839591503144, "num_tokens": 36425296.0, "step": 17280 }, { "entropy": 0.06703787222504616, "epoch": 4.029140925515795, "grad_norm": 0.734375, "learning_rate": 4.972281051476941e-05, "loss": 0.0682, "mean_token_accuracy": 0.981098610162735, "num_tokens": 36439694.0, "step": 17285 }, { "entropy": 0.06402443414554, "epoch": 4.030306562536426, "grad_norm": 0.205078125, "learning_rate": 4.9722490756480256e-05, "loss": 0.0877, "mean_token_accuracy": 0.9755900740623474, "num_tokens": 36469001.0, "step": 17290 }, { "entropy": 0.05503675839863718, "epoch": 4.031472199557058, "grad_norm": 4.875, "learning_rate": 4.972217081593538e-05, "loss": 0.0577, "mean_token_accuracy": 0.982936006784439, "num_tokens": 36506712.0, "step": 17295 }, { "entropy": 0.05822970187291503, "epoch": 4.03263783657769, "grad_norm": 0.60546875, "learning_rate": 4.9721850693139555e-05, "loss": 0.0667, "mean_token_accuracy": 0.9810732364654541, "num_tokens": 36533546.0, "step": 17300 }, { "entropy": 0.060767234954983, "epoch": 4.033803473598321, "grad_norm": 1.1171875, "learning_rate": 4.972153038809755e-05, "loss": 0.067, "mean_token_accuracy": 0.9782026529312133, "num_tokens": 36552953.0, "step": 17305 }, { "entropy": 0.10037664864212274, "epoch": 4.034969110618953, "grad_norm": 0.8359375, "learning_rate": 4.9721209900814144e-05, "loss": 0.0946, "mean_token_accuracy": 0.9746870577335358, "num_tokens": 36582455.0, "step": 17310 }, { "entropy": 0.07259051175788045, "epoch": 4.0361347476395855, "grad_norm": 5.28125, "learning_rate": 4.972088923129412e-05, "loss": 0.0942, "mean_token_accuracy": 0.9792055189609528, "num_tokens": 36600081.0, "step": 17315 }, { "entropy": 0.0708609121851623, "epoch": 4.037300384660217, "grad_norm": 4.71875, "learning_rate": 4.972056837954226e-05, "loss": 0.1032, "mean_token_accuracy": 0.97678844332695, "num_tokens": 36616329.0, "step": 17320 }, { "entropy": 0.07505875267088413, "epoch": 4.038466021680849, "grad_norm": 3.09375, "learning_rate": 4.972024734556334e-05, "loss": 0.1161, "mean_token_accuracy": 0.9763146281242371, "num_tokens": 36636315.0, "step": 17325 }, { "entropy": 0.07826881892979146, "epoch": 4.03963165870148, "grad_norm": 2.109375, "learning_rate": 4.971992612936215e-05, "loss": 0.0653, "mean_token_accuracy": 0.981414407491684, "num_tokens": 36651632.0, "step": 17330 }, { "entropy": 0.06351137179881335, "epoch": 4.040797295722112, "grad_norm": 0.47265625, "learning_rate": 4.9719604730943485e-05, "loss": 0.0714, "mean_token_accuracy": 0.9803904712200164, "num_tokens": 36678307.0, "step": 17335 }, { "entropy": 0.05514317499473691, "epoch": 4.041962932742744, "grad_norm": 3.28125, "learning_rate": 4.971928315031213e-05, "loss": 0.0582, "mean_token_accuracy": 0.9824005544185639, "num_tokens": 36705678.0, "step": 17340 }, { "entropy": 0.08399317860603332, "epoch": 4.043128569763375, "grad_norm": 3.71875, "learning_rate": 4.971896138747289e-05, "loss": 0.1178, "mean_token_accuracy": 0.9715703725814819, "num_tokens": 36717184.0, "step": 17345 }, { "entropy": 0.06782543286681175, "epoch": 4.0442942067840075, "grad_norm": 1.5078125, "learning_rate": 4.971863944243055e-05, "loss": 0.0828, "mean_token_accuracy": 0.9791005492210388, "num_tokens": 36728421.0, "step": 17350 }, { "entropy": 0.06589771201834083, "epoch": 4.04545984380464, "grad_norm": 2.03125, "learning_rate": 4.9718317315189926e-05, "loss": 0.0638, "mean_token_accuracy": 0.9831419050693512, "num_tokens": 36745939.0, "step": 17355 }, { "entropy": 0.06487294095568359, "epoch": 4.046625480825271, "grad_norm": 0.37109375, "learning_rate": 4.971799500575581e-05, "loss": 0.0922, "mean_token_accuracy": 0.9786526322364807, "num_tokens": 36768446.0, "step": 17360 }, { "entropy": 0.0821248460561037, "epoch": 4.047791117845903, "grad_norm": 6.84375, "learning_rate": 4.971767251413301e-05, "loss": 0.1065, "mean_token_accuracy": 0.9760565519332886, "num_tokens": 36781290.0, "step": 17365 }, { "entropy": 0.08352366182953119, "epoch": 4.048956754866534, "grad_norm": 0.5703125, "learning_rate": 4.9717349840326344e-05, "loss": 0.0809, "mean_token_accuracy": 0.977725625038147, "num_tokens": 36803795.0, "step": 17370 }, { "entropy": 0.07390208840370179, "epoch": 4.050122391887166, "grad_norm": 1.9453125, "learning_rate": 4.971702698434061e-05, "loss": 0.0779, "mean_token_accuracy": 0.9757682621479035, "num_tokens": 36816091.0, "step": 17375 }, { "entropy": 0.0756676783785224, "epoch": 4.051288028907798, "grad_norm": 7.28125, "learning_rate": 4.9716703946180626e-05, "loss": 0.0949, "mean_token_accuracy": 0.9772179245948791, "num_tokens": 36827385.0, "step": 17380 }, { "entropy": 0.06811714279465378, "epoch": 4.05245366592843, "grad_norm": 5.96875, "learning_rate": 4.971638072585121e-05, "loss": 0.0769, "mean_token_accuracy": 0.9762770175933838, "num_tokens": 36846237.0, "step": 17385 }, { "entropy": 0.07330656386911868, "epoch": 4.053619302949062, "grad_norm": 2.578125, "learning_rate": 4.971605732335719e-05, "loss": 0.0601, "mean_token_accuracy": 0.9823174595832824, "num_tokens": 36863604.0, "step": 17390 }, { "entropy": 0.08408041261136531, "epoch": 4.054784939969694, "grad_norm": 5.4375, "learning_rate": 4.971573373870338e-05, "loss": 0.0875, "mean_token_accuracy": 0.9787028014659882, "num_tokens": 36873623.0, "step": 17395 }, { "entropy": 0.09213002622127534, "epoch": 4.055950576990325, "grad_norm": 6.6875, "learning_rate": 4.97154099718946e-05, "loss": 0.1429, "mean_token_accuracy": 0.9639403820037842, "num_tokens": 36882941.0, "step": 17400 }, { "entropy": 0.04812620813027024, "epoch": 4.057116214010957, "grad_norm": 1.3203125, "learning_rate": 4.971508602293569e-05, "loss": 0.057, "mean_token_accuracy": 0.9856364369392395, "num_tokens": 36906085.0, "step": 17405 }, { "entropy": 0.08739807661622763, "epoch": 4.058281851031588, "grad_norm": 3.53125, "learning_rate": 4.971476189183148e-05, "loss": 0.1254, "mean_token_accuracy": 0.9713817596435547, "num_tokens": 36930703.0, "step": 17410 }, { "entropy": 0.07933282367885112, "epoch": 4.05944748805222, "grad_norm": 1.9765625, "learning_rate": 4.971443757858679e-05, "loss": 0.0951, "mean_token_accuracy": 0.9795554041862488, "num_tokens": 36939229.0, "step": 17415 }, { "entropy": 0.054528007935732606, "epoch": 4.0606131250728525, "grad_norm": 1.546875, "learning_rate": 4.971411308320646e-05, "loss": 0.0366, "mean_token_accuracy": 0.987425422668457, "num_tokens": 36965884.0, "step": 17420 }, { "entropy": 0.06623607650399208, "epoch": 4.061778762093484, "grad_norm": 1.8046875, "learning_rate": 4.971378840569534e-05, "loss": 0.0669, "mean_token_accuracy": 0.9831463754177093, "num_tokens": 36983099.0, "step": 17425 }, { "entropy": 0.06812118962407113, "epoch": 4.062944399114116, "grad_norm": 2.0, "learning_rate": 4.971346354605826e-05, "loss": 0.0707, "mean_token_accuracy": 0.9800431668758393, "num_tokens": 37001866.0, "step": 17430 }, { "entropy": 0.05892649330198765, "epoch": 4.064110036134748, "grad_norm": 1.6953125, "learning_rate": 4.971313850430007e-05, "loss": 0.0749, "mean_token_accuracy": 0.9826079845428467, "num_tokens": 37019444.0, "step": 17435 }, { "entropy": 0.06661005020141601, "epoch": 4.065275673155379, "grad_norm": 1.328125, "learning_rate": 4.971281328042562e-05, "loss": 0.0779, "mean_token_accuracy": 0.9750029504299164, "num_tokens": 37041691.0, "step": 17440 }, { "entropy": 0.07503235340118408, "epoch": 4.066441310176011, "grad_norm": 1.1328125, "learning_rate": 4.971248787443975e-05, "loss": 0.0753, "mean_token_accuracy": 0.9808429062366486, "num_tokens": 37064095.0, "step": 17445 }, { "entropy": 0.05903323907405138, "epoch": 4.067606947196643, "grad_norm": 1.2421875, "learning_rate": 4.971216228634732e-05, "loss": 0.0432, "mean_token_accuracy": 0.9815847158432007, "num_tokens": 37084047.0, "step": 17450 }, { "entropy": 0.07304359413683414, "epoch": 4.068772584217275, "grad_norm": 1.1015625, "learning_rate": 4.971183651615318e-05, "loss": 0.0475, "mean_token_accuracy": 0.9817767202854156, "num_tokens": 37108418.0, "step": 17455 }, { "entropy": 0.06335212141275406, "epoch": 4.069938221237907, "grad_norm": 2.109375, "learning_rate": 4.971151056386219e-05, "loss": 0.0475, "mean_token_accuracy": 0.9812131524085999, "num_tokens": 37141312.0, "step": 17460 }, { "entropy": 0.10107212541624903, "epoch": 4.071103858258538, "grad_norm": 4.40625, "learning_rate": 4.9711184429479215e-05, "loss": 0.1212, "mean_token_accuracy": 0.9684840321540833, "num_tokens": 37154069.0, "step": 17465 }, { "entropy": 0.09124523922801017, "epoch": 4.07226949527917, "grad_norm": 4.5625, "learning_rate": 4.971085811300911e-05, "loss": 0.1407, "mean_token_accuracy": 0.9691009283065796, "num_tokens": 37163700.0, "step": 17470 }, { "entropy": 0.05337847024202347, "epoch": 4.073435132299802, "grad_norm": 0.8984375, "learning_rate": 4.971053161445674e-05, "loss": 0.0633, "mean_token_accuracy": 0.9758882224559784, "num_tokens": 37192961.0, "step": 17475 }, { "entropy": 0.0389715145342052, "epoch": 4.074600769320433, "grad_norm": 2.875, "learning_rate": 4.971020493382698e-05, "loss": 0.0377, "mean_token_accuracy": 0.9893102467060089, "num_tokens": 37223557.0, "step": 17480 }, { "entropy": 0.09736668840050697, "epoch": 4.0757664063410655, "grad_norm": 3.5, "learning_rate": 4.97098780711247e-05, "loss": 0.0921, "mean_token_accuracy": 0.9757306635379791, "num_tokens": 37241031.0, "step": 17485 }, { "entropy": 0.07404592633247375, "epoch": 4.076932043361698, "grad_norm": 1.828125, "learning_rate": 4.9709551026354775e-05, "loss": 0.0784, "mean_token_accuracy": 0.9740024745464325, "num_tokens": 37274714.0, "step": 17490 }, { "entropy": 0.06818081270903349, "epoch": 4.078097680382329, "grad_norm": 4.1875, "learning_rate": 4.970922379952208e-05, "loss": 0.0871, "mean_token_accuracy": 0.9766332447528839, "num_tokens": 37285581.0, "step": 17495 }, { "entropy": 0.056024549342691896, "epoch": 4.079263317402961, "grad_norm": 0.38671875, "learning_rate": 4.97088963906315e-05, "loss": 0.0482, "mean_token_accuracy": 0.9804753720760345, "num_tokens": 37305223.0, "step": 17500 }, { "entropy": 0.06913188584148884, "epoch": 4.080428954423592, "grad_norm": 7.875, "learning_rate": 4.97085687996879e-05, "loss": 0.097, "mean_token_accuracy": 0.9744450330734253, "num_tokens": 37325723.0, "step": 17505 }, { "entropy": 0.04517441475763917, "epoch": 4.081594591444224, "grad_norm": 1.609375, "learning_rate": 4.9708241026696186e-05, "loss": 0.0373, "mean_token_accuracy": 0.9858837187290191, "num_tokens": 37353645.0, "step": 17510 }, { "entropy": 0.06553271301090717, "epoch": 4.082760228464856, "grad_norm": 2.34375, "learning_rate": 4.9707913071661225e-05, "loss": 0.0637, "mean_token_accuracy": 0.983744639158249, "num_tokens": 37375003.0, "step": 17515 }, { "entropy": 0.06103522032499313, "epoch": 4.0839258654854875, "grad_norm": 2.4375, "learning_rate": 4.970758493458793e-05, "loss": 0.0575, "mean_token_accuracy": 0.9773332476615906, "num_tokens": 37398649.0, "step": 17520 }, { "entropy": 0.06978270523250103, "epoch": 4.08509150250612, "grad_norm": 2.640625, "learning_rate": 4.970725661548118e-05, "loss": 0.0539, "mean_token_accuracy": 0.9787450671195984, "num_tokens": 37421294.0, "step": 17525 }, { "entropy": 0.06557549051940441, "epoch": 4.086257139526752, "grad_norm": 0.291015625, "learning_rate": 4.970692811434587e-05, "loss": 0.0543, "mean_token_accuracy": 0.9738318443298339, "num_tokens": 37458108.0, "step": 17530 }, { "entropy": 0.0798257420770824, "epoch": 4.087422776547383, "grad_norm": 3.21875, "learning_rate": 4.97065994311869e-05, "loss": 0.0796, "mean_token_accuracy": 0.9754477143287659, "num_tokens": 37477809.0, "step": 17535 }, { "entropy": 0.04820524742826819, "epoch": 4.088588413568015, "grad_norm": 0.51953125, "learning_rate": 4.9706270566009174e-05, "loss": 0.0464, "mean_token_accuracy": 0.9842685222625732, "num_tokens": 37509032.0, "step": 17540 }, { "entropy": 0.07802566029131412, "epoch": 4.089754050588646, "grad_norm": 0.6328125, "learning_rate": 4.9705941518817594e-05, "loss": 0.1019, "mean_token_accuracy": 0.9741903364658355, "num_tokens": 37532273.0, "step": 17545 }, { "entropy": 0.06755186822265387, "epoch": 4.090919687609278, "grad_norm": 0.6796875, "learning_rate": 4.970561228961707e-05, "loss": 0.0649, "mean_token_accuracy": 0.9817712068557739, "num_tokens": 37546529.0, "step": 17550 }, { "entropy": 0.06730409227311611, "epoch": 4.0920853246299105, "grad_norm": 0.462890625, "learning_rate": 4.970528287841251e-05, "loss": 0.0784, "mean_token_accuracy": 0.9731522858142853, "num_tokens": 37567044.0, "step": 17555 }, { "entropy": 0.05120256347581744, "epoch": 4.093250961650542, "grad_norm": 1.0078125, "learning_rate": 4.9704953285208825e-05, "loss": 0.055, "mean_token_accuracy": 0.983589905500412, "num_tokens": 37594124.0, "step": 17560 }, { "entropy": 0.04717886643484235, "epoch": 4.094416598671174, "grad_norm": 0.8828125, "learning_rate": 4.9704623510010926e-05, "loss": 0.0396, "mean_token_accuracy": 0.9903761804103851, "num_tokens": 37624057.0, "step": 17565 }, { "entropy": 0.0668092598207295, "epoch": 4.095582235691806, "grad_norm": 0.357421875, "learning_rate": 4.9704293552823736e-05, "loss": 0.0836, "mean_token_accuracy": 0.975571358203888, "num_tokens": 37645861.0, "step": 17570 }, { "entropy": 0.08568244017660617, "epoch": 4.096747872712437, "grad_norm": 2.4375, "learning_rate": 4.970396341365217e-05, "loss": 0.1166, "mean_token_accuracy": 0.9714062631130218, "num_tokens": 37655194.0, "step": 17575 }, { "entropy": 0.08841509446501732, "epoch": 4.097913509733069, "grad_norm": 3.578125, "learning_rate": 4.970363309250115e-05, "loss": 0.1411, "mean_token_accuracy": 0.9614146292209625, "num_tokens": 37664421.0, "step": 17580 }, { "entropy": 0.07518529873341322, "epoch": 4.099079146753701, "grad_norm": 1.4765625, "learning_rate": 4.970330258937561e-05, "loss": 0.066, "mean_token_accuracy": 0.9781446993350983, "num_tokens": 37686842.0, "step": 17585 }, { "entropy": 0.061594282276928425, "epoch": 4.1002447837743325, "grad_norm": 0.484375, "learning_rate": 4.970297190428047e-05, "loss": 0.0369, "mean_token_accuracy": 0.9831397414207459, "num_tokens": 37710171.0, "step": 17590 }, { "entropy": 0.075823188200593, "epoch": 4.101410420794965, "grad_norm": 2.40625, "learning_rate": 4.9702641037220674e-05, "loss": 0.1047, "mean_token_accuracy": 0.9743071496486664, "num_tokens": 37720106.0, "step": 17595 }, { "entropy": 0.04717852883040905, "epoch": 4.102576057815596, "grad_norm": 0.61328125, "learning_rate": 4.970230998820114e-05, "loss": 0.0397, "mean_token_accuracy": 0.9836111307144165, "num_tokens": 37761881.0, "step": 17600 }, { "entropy": 0.07047794573009014, "epoch": 4.103741694836228, "grad_norm": 0.7578125, "learning_rate": 4.970197875722681e-05, "loss": 0.0958, "mean_token_accuracy": 0.9757523596286773, "num_tokens": 37779264.0, "step": 17605 }, { "entropy": 0.06992449425160885, "epoch": 4.10490733185686, "grad_norm": 4.53125, "learning_rate": 4.9701647344302624e-05, "loss": 0.082, "mean_token_accuracy": 0.9756997048854827, "num_tokens": 37808887.0, "step": 17610 }, { "entropy": 0.05605229511857033, "epoch": 4.106072968877491, "grad_norm": 1.3203125, "learning_rate": 4.970131574943352e-05, "loss": 0.0675, "mean_token_accuracy": 0.9793569207191467, "num_tokens": 37832410.0, "step": 17615 }, { "entropy": 0.07954696863889694, "epoch": 4.107238605898123, "grad_norm": 2.5, "learning_rate": 4.970098397262445e-05, "loss": 0.0884, "mean_token_accuracy": 0.9773578584194184, "num_tokens": 37852432.0, "step": 17620 }, { "entropy": 0.06925328876823186, "epoch": 4.1084042429187555, "grad_norm": 0.8125, "learning_rate": 4.970065201388036e-05, "loss": 0.0791, "mean_token_accuracy": 0.9751325309276581, "num_tokens": 37866193.0, "step": 17625 }, { "entropy": 0.0733806163072586, "epoch": 4.109569879939387, "grad_norm": 1.328125, "learning_rate": 4.97003198732062e-05, "loss": 0.1035, "mean_token_accuracy": 0.972558718919754, "num_tokens": 37875542.0, "step": 17630 }, { "entropy": 0.06492809355258941, "epoch": 4.110735516960019, "grad_norm": 0.2041015625, "learning_rate": 4.9699987550606916e-05, "loss": 0.0683, "mean_token_accuracy": 0.9816878855228424, "num_tokens": 37915793.0, "step": 17635 }, { "entropy": 0.06843568123877049, "epoch": 4.11190115398065, "grad_norm": 3.046875, "learning_rate": 4.969965504608747e-05, "loss": 0.0727, "mean_token_accuracy": 0.980006742477417, "num_tokens": 37934429.0, "step": 17640 }, { "entropy": 0.07521101422607898, "epoch": 4.113066791001282, "grad_norm": 1.4609375, "learning_rate": 4.969932235965281e-05, "loss": 0.0788, "mean_token_accuracy": 0.9817327499389649, "num_tokens": 37948166.0, "step": 17645 }, { "entropy": 0.07668667826801538, "epoch": 4.114232428021914, "grad_norm": 0.6328125, "learning_rate": 4.969898949130791e-05, "loss": 0.0861, "mean_token_accuracy": 0.976869261264801, "num_tokens": 37964025.0, "step": 17650 }, { "entropy": 0.0827770703472197, "epoch": 4.1153980650425455, "grad_norm": 1.609375, "learning_rate": 4.969865644105773e-05, "loss": 0.0854, "mean_token_accuracy": 0.9789432466030121, "num_tokens": 37982005.0, "step": 17655 }, { "entropy": 0.06624055663123726, "epoch": 4.116563702063178, "grad_norm": 0.400390625, "learning_rate": 4.969832320890724e-05, "loss": 0.0507, "mean_token_accuracy": 0.9831128180027008, "num_tokens": 38002010.0, "step": 17660 }, { "entropy": 0.06770137306302786, "epoch": 4.11772933908381, "grad_norm": 2.296875, "learning_rate": 4.96979897948614e-05, "loss": 0.0553, "mean_token_accuracy": 0.9802413940429687, "num_tokens": 38018709.0, "step": 17665 }, { "entropy": 0.08137813555076719, "epoch": 4.118894976104441, "grad_norm": 2.484375, "learning_rate": 4.969765619892518e-05, "loss": 0.0963, "mean_token_accuracy": 0.9716586530208587, "num_tokens": 38040955.0, "step": 17670 }, { "entropy": 0.06633272357285022, "epoch": 4.120060613125073, "grad_norm": 1.4765625, "learning_rate": 4.9697322421103564e-05, "loss": 0.0787, "mean_token_accuracy": 0.9814836859703064, "num_tokens": 38063011.0, "step": 17675 }, { "entropy": 0.0662471629679203, "epoch": 4.121226250145704, "grad_norm": 0.56640625, "learning_rate": 4.969698846140152e-05, "loss": 0.0929, "mean_token_accuracy": 0.9769213080406189, "num_tokens": 38077718.0, "step": 17680 }, { "entropy": 0.08769634570926428, "epoch": 4.122391887166336, "grad_norm": 4.21875, "learning_rate": 4.969665431982404e-05, "loss": 0.0851, "mean_token_accuracy": 0.9764888525009155, "num_tokens": 38090501.0, "step": 17685 }, { "entropy": 0.051108538545668124, "epoch": 4.123557524186968, "grad_norm": 5.9375, "learning_rate": 4.96963199963761e-05, "loss": 0.0593, "mean_token_accuracy": 0.9822099328041076, "num_tokens": 38115330.0, "step": 17690 }, { "entropy": 0.06495916079729795, "epoch": 4.1247231612076, "grad_norm": 0.34375, "learning_rate": 4.9695985491062674e-05, "loss": 0.0726, "mean_token_accuracy": 0.9816126704216004, "num_tokens": 38129691.0, "step": 17695 }, { "entropy": 0.07993561625480652, "epoch": 4.125888798228232, "grad_norm": 0.6796875, "learning_rate": 4.9695650803888764e-05, "loss": 0.0707, "mean_token_accuracy": 0.9806294143199921, "num_tokens": 38148764.0, "step": 17700 }, { "entropy": 0.06839036452583969, "epoch": 4.127054435248864, "grad_norm": 3.03125, "learning_rate": 4.969531593485937e-05, "loss": 0.0456, "mean_token_accuracy": 0.9780635833740234, "num_tokens": 38196213.0, "step": 17705 }, { "entropy": 0.08365984680131078, "epoch": 4.128220072269495, "grad_norm": 2.53125, "learning_rate": 4.969498088397946e-05, "loss": 0.0896, "mean_token_accuracy": 0.9761638879776001, "num_tokens": 38211434.0, "step": 17710 }, { "entropy": 0.063323774933815, "epoch": 4.129385709290127, "grad_norm": 2.0625, "learning_rate": 4.969464565125404e-05, "loss": 0.0562, "mean_token_accuracy": 0.9819753110408783, "num_tokens": 38228816.0, "step": 17715 }, { "entropy": 0.08252546712756156, "epoch": 4.130551346310758, "grad_norm": 2.203125, "learning_rate": 4.969431023668812e-05, "loss": 0.1077, "mean_token_accuracy": 0.9741408407688141, "num_tokens": 38238471.0, "step": 17720 }, { "entropy": 0.06147185619920492, "epoch": 4.1317169833313905, "grad_norm": 4.0, "learning_rate": 4.969397464028669e-05, "loss": 0.0671, "mean_token_accuracy": 0.9822175681591034, "num_tokens": 38256366.0, "step": 17725 }, { "entropy": 0.057942528277635574, "epoch": 4.132882620352023, "grad_norm": 1.703125, "learning_rate": 4.969363886205476e-05, "loss": 0.0512, "mean_token_accuracy": 0.9869576156139374, "num_tokens": 38283809.0, "step": 17730 }, { "entropy": 0.0631272815167904, "epoch": 4.134048257372654, "grad_norm": 2.53125, "learning_rate": 4.969330290199733e-05, "loss": 0.0823, "mean_token_accuracy": 0.981304931640625, "num_tokens": 38296792.0, "step": 17735 }, { "entropy": 0.21083402447402477, "epoch": 4.135213894393286, "grad_norm": 1.171875, "learning_rate": 4.969296676011941e-05, "loss": 0.4323, "mean_token_accuracy": 0.95402672290802, "num_tokens": 38323429.0, "step": 17740 }, { "entropy": 0.05955973602831364, "epoch": 4.136379531413918, "grad_norm": 1.8203125, "learning_rate": 4.969263043642602e-05, "loss": 0.0549, "mean_token_accuracy": 0.9837433099746704, "num_tokens": 38355751.0, "step": 17745 }, { "entropy": 0.07743876222521066, "epoch": 4.137545168434549, "grad_norm": 1.3515625, "learning_rate": 4.969229393092218e-05, "loss": 0.0934, "mean_token_accuracy": 0.9781845211982727, "num_tokens": 38374521.0, "step": 17750 }, { "entropy": 0.059616895485669376, "epoch": 4.138710805455181, "grad_norm": 3.875, "learning_rate": 4.969195724361289e-05, "loss": 0.0718, "mean_token_accuracy": 0.9815534234046936, "num_tokens": 38401132.0, "step": 17755 }, { "entropy": 0.07294055828824639, "epoch": 4.139876442475813, "grad_norm": 2.484375, "learning_rate": 4.969162037450318e-05, "loss": 0.1015, "mean_token_accuracy": 0.9737964630126953, "num_tokens": 38423254.0, "step": 17760 }, { "entropy": 0.09278177451342344, "epoch": 4.141042079496445, "grad_norm": 3.484375, "learning_rate": 4.969128332359808e-05, "loss": 0.0911, "mean_token_accuracy": 0.9769309103488922, "num_tokens": 38453027.0, "step": 17765 }, { "entropy": 0.08387061543762683, "epoch": 4.142207716517077, "grad_norm": 1.359375, "learning_rate": 4.969094609090261e-05, "loss": 0.1126, "mean_token_accuracy": 0.9712442636489869, "num_tokens": 38462528.0, "step": 17770 }, { "entropy": 0.07613757401704788, "epoch": 4.143373353537708, "grad_norm": 5.34375, "learning_rate": 4.969060867642179e-05, "loss": 0.0924, "mean_token_accuracy": 0.97659130692482, "num_tokens": 38473163.0, "step": 17775 }, { "entropy": 0.09136496745049953, "epoch": 4.14453899055834, "grad_norm": 3.75, "learning_rate": 4.969027108016065e-05, "loss": 0.1467, "mean_token_accuracy": 0.9635896980762482, "num_tokens": 38493707.0, "step": 17780 }, { "entropy": 0.1104026323184371, "epoch": 4.145704627578972, "grad_norm": 1.734375, "learning_rate": 4.9689933302124255e-05, "loss": 0.1351, "mean_token_accuracy": 0.9675468266010284, "num_tokens": 38515217.0, "step": 17785 }, { "entropy": 0.0909642169252038, "epoch": 4.146870264599603, "grad_norm": 1.625, "learning_rate": 4.968959534231761e-05, "loss": 0.1052, "mean_token_accuracy": 0.9722971677780151, "num_tokens": 38533758.0, "step": 17790 }, { "entropy": 0.09088087901473045, "epoch": 4.1480359016202355, "grad_norm": 1.3046875, "learning_rate": 4.968925720074576e-05, "loss": 0.1113, "mean_token_accuracy": 0.9731328010559082, "num_tokens": 38563161.0, "step": 17795 }, { "entropy": 0.08471710272133351, "epoch": 4.149201538640868, "grad_norm": 2.25, "learning_rate": 4.9688918877413756e-05, "loss": 0.1387, "mean_token_accuracy": 0.9690379083156586, "num_tokens": 38572656.0, "step": 17800 }, { "entropy": 0.08337556011974812, "epoch": 4.150367175661499, "grad_norm": 1.7421875, "learning_rate": 4.968858037232663e-05, "loss": 0.1503, "mean_token_accuracy": 0.9631303191184998, "num_tokens": 38582815.0, "step": 17805 }, { "entropy": 0.07923096120357513, "epoch": 4.151532812682131, "grad_norm": 4.625, "learning_rate": 4.968824168548945e-05, "loss": 0.0877, "mean_token_accuracy": 0.9760537326335907, "num_tokens": 38602476.0, "step": 17810 }, { "entropy": 0.07994878720492124, "epoch": 4.152698449702762, "grad_norm": 4.5625, "learning_rate": 4.968790281690725e-05, "loss": 0.081, "mean_token_accuracy": 0.9776585280895234, "num_tokens": 38616516.0, "step": 17815 }, { "entropy": 0.069281514454633, "epoch": 4.153864086723394, "grad_norm": 0.5546875, "learning_rate": 4.9687563766585086e-05, "loss": 0.1041, "mean_token_accuracy": 0.9738664746284484, "num_tokens": 38643610.0, "step": 17820 }, { "entropy": 0.06928181694820523, "epoch": 4.155029723744026, "grad_norm": 3.421875, "learning_rate": 4.9687224534528015e-05, "loss": 0.0961, "mean_token_accuracy": 0.9771531283855438, "num_tokens": 38665170.0, "step": 17825 }, { "entropy": 0.08383396286517382, "epoch": 4.156195360764658, "grad_norm": 0.79296875, "learning_rate": 4.96868851207411e-05, "loss": 0.0858, "mean_token_accuracy": 0.9715451300144196, "num_tokens": 38683037.0, "step": 17830 }, { "entropy": 0.1132634285837412, "epoch": 4.15736099778529, "grad_norm": 4.25, "learning_rate": 4.968654552522939e-05, "loss": 0.1926, "mean_token_accuracy": 0.9579161942005158, "num_tokens": 38702652.0, "step": 17835 }, { "entropy": 0.08075078241527081, "epoch": 4.158526634805922, "grad_norm": 3.484375, "learning_rate": 4.968620574799796e-05, "loss": 0.0914, "mean_token_accuracy": 0.9716982245445251, "num_tokens": 38725463.0, "step": 17840 }, { "entropy": 0.060110028833150864, "epoch": 4.159692271826553, "grad_norm": 4.53125, "learning_rate": 4.968586578905188e-05, "loss": 0.0584, "mean_token_accuracy": 0.9838882625102997, "num_tokens": 38744794.0, "step": 17845 }, { "entropy": 0.051263061724603175, "epoch": 4.160857908847185, "grad_norm": 0.330078125, "learning_rate": 4.9685525648396205e-05, "loss": 0.0322, "mean_token_accuracy": 0.9860285639762878, "num_tokens": 38773689.0, "step": 17850 }, { "entropy": 0.04591662436723709, "epoch": 4.162023545867816, "grad_norm": 0.197265625, "learning_rate": 4.968518532603601e-05, "loss": 0.0386, "mean_token_accuracy": 0.9850606024265289, "num_tokens": 38799162.0, "step": 17855 }, { "entropy": 0.091443323623389, "epoch": 4.163189182888448, "grad_norm": 10.6875, "learning_rate": 4.968484482197639e-05, "loss": 0.1426, "mean_token_accuracy": 0.9635275840759278, "num_tokens": 38814549.0, "step": 17860 }, { "entropy": 0.05765872802585363, "epoch": 4.1643548199090805, "grad_norm": 1.71875, "learning_rate": 4.9684504136222386e-05, "loss": 0.0688, "mean_token_accuracy": 0.982217013835907, "num_tokens": 38837038.0, "step": 17865 }, { "entropy": 0.06157895103096962, "epoch": 4.165520456929712, "grad_norm": 0.73046875, "learning_rate": 4.968416326877911e-05, "loss": 0.0598, "mean_token_accuracy": 0.9825385868549347, "num_tokens": 38855466.0, "step": 17870 }, { "entropy": 0.05540731241926551, "epoch": 4.166686093950344, "grad_norm": 1.078125, "learning_rate": 4.9683822219651636e-05, "loss": 0.0447, "mean_token_accuracy": 0.9826449275016784, "num_tokens": 38884459.0, "step": 17875 }, { "entropy": 0.13959228973835708, "epoch": 4.167851730970976, "grad_norm": 2.328125, "learning_rate": 4.9683480988845045e-05, "loss": 0.1112, "mean_token_accuracy": 0.9676654815673829, "num_tokens": 38916660.0, "step": 17880 }, { "entropy": 0.07438411880284548, "epoch": 4.169017367991607, "grad_norm": 0.91015625, "learning_rate": 4.968313957636442e-05, "loss": 0.0582, "mean_token_accuracy": 0.9798523426055908, "num_tokens": 38929734.0, "step": 17885 }, { "entropy": 0.07969954237341881, "epoch": 4.170183005012239, "grad_norm": 5.125, "learning_rate": 4.968279798221487e-05, "loss": 0.1093, "mean_token_accuracy": 0.9738732576370239, "num_tokens": 38943465.0, "step": 17890 }, { "entropy": 0.06127149565145373, "epoch": 4.171348642032871, "grad_norm": 2.9375, "learning_rate": 4.9682456206401476e-05, "loss": 0.0771, "mean_token_accuracy": 0.9770678341388702, "num_tokens": 38962693.0, "step": 17895 }, { "entropy": 0.07192683843895793, "epoch": 4.172514279053503, "grad_norm": 1.9609375, "learning_rate": 4.968211424892934e-05, "loss": 0.049, "mean_token_accuracy": 0.9862643003463745, "num_tokens": 38986065.0, "step": 17900 }, { "entropy": 0.06356288734823465, "epoch": 4.173679916074135, "grad_norm": 0.6484375, "learning_rate": 4.968177210980355e-05, "loss": 0.0678, "mean_token_accuracy": 0.9798180103302002, "num_tokens": 39013369.0, "step": 17905 }, { "entropy": 0.0715787623077631, "epoch": 4.174845553094766, "grad_norm": 3.09375, "learning_rate": 4.9681429789029216e-05, "loss": 0.0793, "mean_token_accuracy": 0.9760327398777008, "num_tokens": 39028300.0, "step": 17910 }, { "entropy": 0.06488925497978926, "epoch": 4.176011190115398, "grad_norm": 0.5859375, "learning_rate": 4.9681087286611445e-05, "loss": 0.072, "mean_token_accuracy": 0.9803975045680999, "num_tokens": 39044881.0, "step": 17915 }, { "entropy": 0.05183368735015392, "epoch": 4.17717682713603, "grad_norm": 0.69140625, "learning_rate": 4.968074460255534e-05, "loss": 0.0576, "mean_token_accuracy": 0.9789083540439606, "num_tokens": 39066214.0, "step": 17920 }, { "entropy": 0.06136147491633892, "epoch": 4.178342464156661, "grad_norm": 0.9375, "learning_rate": 4.968040173686601e-05, "loss": 0.0694, "mean_token_accuracy": 0.9841551721096039, "num_tokens": 39094170.0, "step": 17925 }, { "entropy": 0.048913817014545204, "epoch": 4.179508101177293, "grad_norm": 2.515625, "learning_rate": 4.968005868954857e-05, "loss": 0.0519, "mean_token_accuracy": 0.9821430623531342, "num_tokens": 39121933.0, "step": 17930 }, { "entropy": 0.08498697010800242, "epoch": 4.1806737381979255, "grad_norm": 5.34375, "learning_rate": 4.967971546060814e-05, "loss": 0.1469, "mean_token_accuracy": 0.9696698248386383, "num_tokens": 39155746.0, "step": 17935 }, { "entropy": 0.06178603731095791, "epoch": 4.181839375218557, "grad_norm": 0.5859375, "learning_rate": 4.967937205004983e-05, "loss": 0.0313, "mean_token_accuracy": 0.9859042346477509, "num_tokens": 39182421.0, "step": 17940 }, { "entropy": 0.07212042324244976, "epoch": 4.183005012239189, "grad_norm": 4.8125, "learning_rate": 4.9679028457878764e-05, "loss": 0.0923, "mean_token_accuracy": 0.9786004006862641, "num_tokens": 39202742.0, "step": 17945 }, { "entropy": 0.07441701972857118, "epoch": 4.18417064925982, "grad_norm": 0.5859375, "learning_rate": 4.967868468410006e-05, "loss": 0.0483, "mean_token_accuracy": 0.9796132147312164, "num_tokens": 39247317.0, "step": 17950 }, { "entropy": 0.07962618097662925, "epoch": 4.185336286280452, "grad_norm": 3.21875, "learning_rate": 4.967834072871886e-05, "loss": 0.0798, "mean_token_accuracy": 0.978544956445694, "num_tokens": 39258240.0, "step": 17955 }, { "entropy": 0.06471439627930523, "epoch": 4.186501923301084, "grad_norm": 4.03125, "learning_rate": 4.9677996591740277e-05, "loss": 0.0547, "mean_token_accuracy": 0.9785925149917603, "num_tokens": 39281630.0, "step": 17960 }, { "entropy": 0.06293684775009752, "epoch": 4.1876675603217155, "grad_norm": 0.7109375, "learning_rate": 4.967765227316945e-05, "loss": 0.0562, "mean_token_accuracy": 0.9819996774196624, "num_tokens": 39308837.0, "step": 17965 }, { "entropy": 0.06763023445382714, "epoch": 4.188833197342348, "grad_norm": 0.482421875, "learning_rate": 4.96773077730115e-05, "loss": 0.0221, "mean_token_accuracy": 0.9898829996585846, "num_tokens": 39345381.0, "step": 17970 }, { "entropy": 0.0727597183547914, "epoch": 4.18999883436298, "grad_norm": 0.337890625, "learning_rate": 4.967696309127159e-05, "loss": 0.0545, "mean_token_accuracy": 0.9819984555244445, "num_tokens": 39364933.0, "step": 17975 }, { "entropy": 0.0631346826441586, "epoch": 4.191164471383611, "grad_norm": 1.4921875, "learning_rate": 4.967661822795485e-05, "loss": 0.0549, "mean_token_accuracy": 0.9786123156547546, "num_tokens": 39384010.0, "step": 17980 }, { "entropy": 0.054526901617646215, "epoch": 4.192330108404243, "grad_norm": 0.48828125, "learning_rate": 4.96762731830664e-05, "loss": 0.053, "mean_token_accuracy": 0.9847536087036133, "num_tokens": 39411097.0, "step": 17985 }, { "entropy": 0.09764510169625282, "epoch": 4.193495745424874, "grad_norm": 3.484375, "learning_rate": 4.9675927956611415e-05, "loss": 0.1006, "mean_token_accuracy": 0.9744187951087951, "num_tokens": 39419977.0, "step": 17990 }, { "entropy": 0.05835923943668604, "epoch": 4.194661382445506, "grad_norm": 0.29296875, "learning_rate": 4.9675582548595024e-05, "loss": 0.0573, "mean_token_accuracy": 0.9823970139026642, "num_tokens": 39442667.0, "step": 17995 }, { "entropy": 0.06732565024867654, "epoch": 4.1958270194661385, "grad_norm": 2.4375, "learning_rate": 4.9675236959022385e-05, "loss": 0.0605, "mean_token_accuracy": 0.9767836272716522, "num_tokens": 39474692.0, "step": 18000 }, { "entropy": 0.06998653383925557, "epoch": 4.19699265648677, "grad_norm": 3.421875, "learning_rate": 4.967489118789866e-05, "loss": 0.0964, "mean_token_accuracy": 0.9786720693111419, "num_tokens": 39493889.0, "step": 18005 }, { "entropy": 0.055152688175439835, "epoch": 4.198158293507402, "grad_norm": 3.296875, "learning_rate": 4.967454523522898e-05, "loss": 0.0858, "mean_token_accuracy": 0.9816052854061127, "num_tokens": 39510892.0, "step": 18010 }, { "entropy": 0.059907327964901926, "epoch": 4.199323930528034, "grad_norm": 0.65625, "learning_rate": 4.967419910101853e-05, "loss": 0.0483, "mean_token_accuracy": 0.9820407211780549, "num_tokens": 39536981.0, "step": 18015 }, { "entropy": 0.13438040837645532, "epoch": 4.200489567548665, "grad_norm": 4.25, "learning_rate": 4.9673852785272456e-05, "loss": 0.2446, "mean_token_accuracy": 0.9476132929325104, "num_tokens": 39555531.0, "step": 18020 }, { "entropy": 0.059680545888841155, "epoch": 4.201655204569297, "grad_norm": 2.34375, "learning_rate": 4.9673506287995926e-05, "loss": 0.0636, "mean_token_accuracy": 0.9831001102924347, "num_tokens": 39572444.0, "step": 18025 }, { "entropy": 0.07699852250516415, "epoch": 4.202820841589929, "grad_norm": 4.71875, "learning_rate": 4.967315960919411e-05, "loss": 0.1543, "mean_token_accuracy": 0.9660988092422486, "num_tokens": 39584117.0, "step": 18030 }, { "entropy": 0.08583300039172173, "epoch": 4.2039864786105605, "grad_norm": 2.015625, "learning_rate": 4.967281274887217e-05, "loss": 0.0709, "mean_token_accuracy": 0.9798553586006165, "num_tokens": 39594900.0, "step": 18035 }, { "entropy": 0.0706296787597239, "epoch": 4.205152115631193, "grad_norm": 0.28515625, "learning_rate": 4.967246570703529e-05, "loss": 0.0989, "mean_token_accuracy": 0.9756020784378052, "num_tokens": 39616958.0, "step": 18040 }, { "entropy": 0.05387895340099931, "epoch": 4.206317752651824, "grad_norm": 0.5859375, "learning_rate": 4.967211848368863e-05, "loss": 0.0769, "mean_token_accuracy": 0.9803040981292724, "num_tokens": 39635355.0, "step": 18045 }, { "entropy": 0.07473008846864104, "epoch": 4.207483389672456, "grad_norm": 1.375, "learning_rate": 4.967177107883738e-05, "loss": 0.0567, "mean_token_accuracy": 0.9801276922225952, "num_tokens": 39659844.0, "step": 18050 }, { "entropy": 0.10034794714301824, "epoch": 4.208649026693088, "grad_norm": 3.4375, "learning_rate": 4.967142349248671e-05, "loss": 0.1122, "mean_token_accuracy": 0.964007842540741, "num_tokens": 39690318.0, "step": 18055 }, { "entropy": 0.07286425046622753, "epoch": 4.209814663713719, "grad_norm": 1.3203125, "learning_rate": 4.967107572464182e-05, "loss": 0.0714, "mean_token_accuracy": 0.9797383069992065, "num_tokens": 39701986.0, "step": 18060 }, { "entropy": 0.047049607150256634, "epoch": 4.210980300734351, "grad_norm": 1.40625, "learning_rate": 4.967072777530788e-05, "loss": 0.069, "mean_token_accuracy": 0.9827369391918183, "num_tokens": 39723435.0, "step": 18065 }, { "entropy": 0.054467049427330494, "epoch": 4.2121459377549835, "grad_norm": 1.734375, "learning_rate": 4.967037964449008e-05, "loss": 0.037, "mean_token_accuracy": 0.9811189293861389, "num_tokens": 39741765.0, "step": 18070 }, { "entropy": 0.06904594264924527, "epoch": 4.213311574775615, "grad_norm": 1.109375, "learning_rate": 4.967003133219361e-05, "loss": 0.0568, "mean_token_accuracy": 0.9788180887699127, "num_tokens": 39760359.0, "step": 18075 }, { "entropy": 0.07464666366577148, "epoch": 4.214477211796247, "grad_norm": 2.703125, "learning_rate": 4.966968283842368e-05, "loss": 0.0931, "mean_token_accuracy": 0.9809688687324524, "num_tokens": 39775682.0, "step": 18080 }, { "entropy": 0.2211771672591567, "epoch": 4.215642848816878, "grad_norm": 2.90625, "learning_rate": 4.9669334163185466e-05, "loss": 0.3894, "mean_token_accuracy": 0.9507070541381836, "num_tokens": 39802482.0, "step": 18085 }, { "entropy": 0.0948345759883523, "epoch": 4.21680848583751, "grad_norm": 0.59375, "learning_rate": 4.9668985306484175e-05, "loss": 0.1101, "mean_token_accuracy": 0.9713608205318451, "num_tokens": 39811315.0, "step": 18090 }, { "entropy": 0.11147435549646616, "epoch": 4.217974122858142, "grad_norm": 3.21875, "learning_rate": 4.966863626832502e-05, "loss": 0.1236, "mean_token_accuracy": 0.9693335950374603, "num_tokens": 39821734.0, "step": 18095 }, { "entropy": 0.05048245368525386, "epoch": 4.219139759878773, "grad_norm": 2.109375, "learning_rate": 4.966828704871319e-05, "loss": 0.0381, "mean_token_accuracy": 0.9865420877933502, "num_tokens": 39846251.0, "step": 18100 }, { "entropy": 0.09908029530197382, "epoch": 4.2203053968994055, "grad_norm": 2.4375, "learning_rate": 4.9667937647653894e-05, "loss": 0.1254, "mean_token_accuracy": 0.9632340371608734, "num_tokens": 39880560.0, "step": 18105 }, { "entropy": 0.07764818742871285, "epoch": 4.221471033920038, "grad_norm": 2.609375, "learning_rate": 4.966758806515235e-05, "loss": 0.0853, "mean_token_accuracy": 0.9731331884860992, "num_tokens": 39902255.0, "step": 18110 }, { "entropy": 0.061224596202373506, "epoch": 4.222636670940669, "grad_norm": 2.375, "learning_rate": 4.966723830121377e-05, "loss": 0.0735, "mean_token_accuracy": 0.9794311761856079, "num_tokens": 39921371.0, "step": 18115 }, { "entropy": 0.07678266037255525, "epoch": 4.223802307961301, "grad_norm": 4.28125, "learning_rate": 4.966688835584336e-05, "loss": 0.1015, "mean_token_accuracy": 0.9721475660800933, "num_tokens": 39940535.0, "step": 18120 }, { "entropy": 0.08287368789315223, "epoch": 4.224967944981932, "grad_norm": 5.25, "learning_rate": 4.966653822904634e-05, "loss": 0.1251, "mean_token_accuracy": 0.9700684547424316, "num_tokens": 39949215.0, "step": 18125 }, { "entropy": 0.0665527991950512, "epoch": 4.226133582002564, "grad_norm": 3.984375, "learning_rate": 4.966618792082794e-05, "loss": 0.0779, "mean_token_accuracy": 0.980407428741455, "num_tokens": 39963174.0, "step": 18130 }, { "entropy": 0.09068935289978981, "epoch": 4.227299219023196, "grad_norm": 1.71875, "learning_rate": 4.9665837431193387e-05, "loss": 0.0771, "mean_token_accuracy": 0.9779536366462708, "num_tokens": 39975323.0, "step": 18135 }, { "entropy": 0.0727505961433053, "epoch": 4.228464856043828, "grad_norm": 3.046875, "learning_rate": 4.9665486760147895e-05, "loss": 0.073, "mean_token_accuracy": 0.9798406302928925, "num_tokens": 40001191.0, "step": 18140 }, { "entropy": 0.06337338108569383, "epoch": 4.22963049306446, "grad_norm": 2.46875, "learning_rate": 4.96651359076967e-05, "loss": 0.0603, "mean_token_accuracy": 0.9807958424091339, "num_tokens": 40015378.0, "step": 18145 }, { "entropy": 0.061356103606522085, "epoch": 4.230796130085092, "grad_norm": 2.46875, "learning_rate": 4.9664784873845025e-05, "loss": 0.075, "mean_token_accuracy": 0.9822879672050476, "num_tokens": 40029798.0, "step": 18150 }, { "entropy": 0.06835889825597405, "epoch": 4.231961767105723, "grad_norm": 1.5390625, "learning_rate": 4.966443365859812e-05, "loss": 0.0821, "mean_token_accuracy": 0.9771220684051514, "num_tokens": 40044355.0, "step": 18155 }, { "entropy": 0.07135962946340442, "epoch": 4.233127404126355, "grad_norm": 0.38671875, "learning_rate": 4.96640822619612e-05, "loss": 0.067, "mean_token_accuracy": 0.9825757741928101, "num_tokens": 40061672.0, "step": 18160 }, { "entropy": 0.05592022901400924, "epoch": 4.234293041146987, "grad_norm": 0.55859375, "learning_rate": 4.9663730683939524e-05, "loss": 0.0706, "mean_token_accuracy": 0.984505432844162, "num_tokens": 40085250.0, "step": 18165 }, { "entropy": 0.08148605488240719, "epoch": 4.2354586781676185, "grad_norm": 2.09375, "learning_rate": 4.966337892453833e-05, "loss": 0.0727, "mean_token_accuracy": 0.9765782058238983, "num_tokens": 40114042.0, "step": 18170 }, { "entropy": 0.0687420979142189, "epoch": 4.2366243151882506, "grad_norm": 1.484375, "learning_rate": 4.9663026983762855e-05, "loss": 0.0751, "mean_token_accuracy": 0.981961190700531, "num_tokens": 40129373.0, "step": 18175 }, { "entropy": 0.07640882469713688, "epoch": 4.237789952208882, "grad_norm": 10.4375, "learning_rate": 4.966267486161836e-05, "loss": 0.1158, "mean_token_accuracy": 0.9757577538490295, "num_tokens": 40149001.0, "step": 18180 }, { "entropy": 0.067472736351192, "epoch": 4.238955589229514, "grad_norm": 6.15625, "learning_rate": 4.9662322558110084e-05, "loss": 0.0897, "mean_token_accuracy": 0.9813668429851532, "num_tokens": 40162262.0, "step": 18185 }, { "entropy": 0.051194218918681145, "epoch": 4.240121226250146, "grad_norm": 3.4375, "learning_rate": 4.966197007324329e-05, "loss": 0.0542, "mean_token_accuracy": 0.9833743572235107, "num_tokens": 40189423.0, "step": 18190 }, { "entropy": 0.06599115077406167, "epoch": 4.241286863270777, "grad_norm": 2.59375, "learning_rate": 4.966161740702323e-05, "loss": 0.0717, "mean_token_accuracy": 0.9787083387374877, "num_tokens": 40212858.0, "step": 18195 }, { "entropy": 0.09262879192829132, "epoch": 4.242452500291409, "grad_norm": 2.015625, "learning_rate": 4.966126455945516e-05, "loss": 0.0772, "mean_token_accuracy": 0.9747631430625916, "num_tokens": 40231966.0, "step": 18200 }, { "entropy": 0.06321851387619973, "epoch": 4.243618137312041, "grad_norm": 4.625, "learning_rate": 4.966091153054434e-05, "loss": 0.0869, "mean_token_accuracy": 0.9782408058643342, "num_tokens": 40243075.0, "step": 18205 }, { "entropy": 0.07191295428201556, "epoch": 4.244783774332673, "grad_norm": 4.21875, "learning_rate": 4.9660558320296045e-05, "loss": 0.0713, "mean_token_accuracy": 0.9761281490325928, "num_tokens": 40268059.0, "step": 18210 }, { "entropy": 0.06418008059263229, "epoch": 4.245949411353305, "grad_norm": 3.171875, "learning_rate": 4.966020492871553e-05, "loss": 0.0995, "mean_token_accuracy": 0.9752312481403351, "num_tokens": 40288817.0, "step": 18215 }, { "entropy": 0.0693417014554143, "epoch": 4.247115048373936, "grad_norm": 2.421875, "learning_rate": 4.9659851355808076e-05, "loss": 0.0671, "mean_token_accuracy": 0.9787382364273072, "num_tokens": 40318263.0, "step": 18220 }, { "entropy": 0.06724477596580983, "epoch": 4.248280685394568, "grad_norm": 1.125, "learning_rate": 4.965949760157894e-05, "loss": 0.0847, "mean_token_accuracy": 0.9804198741912842, "num_tokens": 40328010.0, "step": 18225 }, { "entropy": 0.1094695933163166, "epoch": 4.2494463224152, "grad_norm": 5.09375, "learning_rate": 4.9659143666033416e-05, "loss": 0.1179, "mean_token_accuracy": 0.9737821757793427, "num_tokens": 40340341.0, "step": 18230 }, { "entropy": 0.053947434015572074, "epoch": 4.250611959435831, "grad_norm": 0.7578125, "learning_rate": 4.965878954917676e-05, "loss": 0.0439, "mean_token_accuracy": 0.9878208816051484, "num_tokens": 40379611.0, "step": 18235 }, { "entropy": 0.06869078604504467, "epoch": 4.2517775964564635, "grad_norm": 0.2734375, "learning_rate": 4.965843525101427e-05, "loss": 0.0668, "mean_token_accuracy": 0.9799235224723816, "num_tokens": 40396991.0, "step": 18240 }, { "entropy": 0.06360419914126396, "epoch": 4.252943233477096, "grad_norm": 0.88671875, "learning_rate": 4.965808077155123e-05, "loss": 0.0698, "mean_token_accuracy": 0.9827209532260894, "num_tokens": 40411549.0, "step": 18245 }, { "entropy": 0.21973246708512306, "epoch": 4.254108870497727, "grad_norm": 0.59375, "learning_rate": 4.9657726110792914e-05, "loss": 0.3485, "mean_token_accuracy": 0.9430919229984284, "num_tokens": 40453795.0, "step": 18250 }, { "entropy": 0.07258025612682104, "epoch": 4.255274507518359, "grad_norm": 0.546875, "learning_rate": 4.965737126874461e-05, "loss": 0.0478, "mean_token_accuracy": 0.987526661157608, "num_tokens": 40470031.0, "step": 18255 }, { "entropy": 0.06506192674860359, "epoch": 4.25644014453899, "grad_norm": 0.89453125, "learning_rate": 4.9657016245411614e-05, "loss": 0.07, "mean_token_accuracy": 0.9759842813014984, "num_tokens": 40494247.0, "step": 18260 }, { "entropy": 0.07043304983526469, "epoch": 4.257605781559622, "grad_norm": 0.51171875, "learning_rate": 4.965666104079923e-05, "loss": 0.0668, "mean_token_accuracy": 0.9803885996341706, "num_tokens": 40507863.0, "step": 18265 }, { "entropy": 0.08717799168080091, "epoch": 4.258771418580254, "grad_norm": 0.478515625, "learning_rate": 4.965630565491274e-05, "loss": 0.0781, "mean_token_accuracy": 0.9728128552436829, "num_tokens": 40523582.0, "step": 18270 }, { "entropy": 0.09992110282182694, "epoch": 4.2599370556008855, "grad_norm": 1.546875, "learning_rate": 4.965595008775745e-05, "loss": 0.1153, "mean_token_accuracy": 0.9688679814338684, "num_tokens": 40534326.0, "step": 18275 }, { "entropy": 0.060247833095490935, "epoch": 4.261102692621518, "grad_norm": 1.4609375, "learning_rate": 4.9655594339338654e-05, "loss": 0.0546, "mean_token_accuracy": 0.9846014022827149, "num_tokens": 40552515.0, "step": 18280 }, { "entropy": 0.07441337686032057, "epoch": 4.26226832964215, "grad_norm": 2.71875, "learning_rate": 4.965523840966167e-05, "loss": 0.0724, "mean_token_accuracy": 0.9776188433170319, "num_tokens": 40574150.0, "step": 18285 }, { "entropy": 0.08602970764040947, "epoch": 4.263433966662781, "grad_norm": 1.640625, "learning_rate": 4.96548822987318e-05, "loss": 0.1114, "mean_token_accuracy": 0.9711106956005097, "num_tokens": 40583114.0, "step": 18290 }, { "entropy": 0.05823810677975416, "epoch": 4.264599603683413, "grad_norm": 6.0, "learning_rate": 4.965452600655435e-05, "loss": 0.0864, "mean_token_accuracy": 0.9759131968021393, "num_tokens": 40611013.0, "step": 18295 }, { "entropy": 0.06771047422662377, "epoch": 4.265765240704045, "grad_norm": 2.09375, "learning_rate": 4.965416953313463e-05, "loss": 0.0728, "mean_token_accuracy": 0.9816241145133973, "num_tokens": 40627138.0, "step": 18300 }, { "entropy": 0.059956477768719194, "epoch": 4.266930877724676, "grad_norm": 2.40625, "learning_rate": 4.9653812878477976e-05, "loss": 0.0676, "mean_token_accuracy": 0.9846004903316498, "num_tokens": 40650411.0, "step": 18305 }, { "entropy": 0.061690607108175755, "epoch": 4.2680965147453085, "grad_norm": 1.0234375, "learning_rate": 4.965345604258968e-05, "loss": 0.0511, "mean_token_accuracy": 0.9836044013500214, "num_tokens": 40676939.0, "step": 18310 }, { "entropy": 0.06359901251271367, "epoch": 4.26926215176594, "grad_norm": 0.2158203125, "learning_rate": 4.9653099025475076e-05, "loss": 0.0546, "mean_token_accuracy": 0.9798293471336365, "num_tokens": 40711483.0, "step": 18315 }, { "entropy": 0.08764311112463474, "epoch": 4.270427788786572, "grad_norm": 0.203125, "learning_rate": 4.965274182713949e-05, "loss": 0.1347, "mean_token_accuracy": 0.9690339505672455, "num_tokens": 40730163.0, "step": 18320 }, { "entropy": 0.07748756930232048, "epoch": 4.271593425807204, "grad_norm": 1.171875, "learning_rate": 4.965238444758824e-05, "loss": 0.0988, "mean_token_accuracy": 0.9777417302131652, "num_tokens": 40739083.0, "step": 18325 }, { "entropy": 0.06385232815518975, "epoch": 4.272759062827835, "grad_norm": 0.396484375, "learning_rate": 4.9652026886826666e-05, "loss": 0.0722, "mean_token_accuracy": 0.9804993093013763, "num_tokens": 40764097.0, "step": 18330 }, { "entropy": 0.06116134990006685, "epoch": 4.273924699848467, "grad_norm": 0.44921875, "learning_rate": 4.965166914486008e-05, "loss": 0.065, "mean_token_accuracy": 0.9793049991130829, "num_tokens": 40785512.0, "step": 18335 }, { "entropy": 0.08398058190941811, "epoch": 4.275090336869099, "grad_norm": 4.03125, "learning_rate": 4.9651311221693845e-05, "loss": 0.0871, "mean_token_accuracy": 0.9712727069854736, "num_tokens": 40806382.0, "step": 18340 }, { "entropy": 0.17974007017910482, "epoch": 4.2762559738897306, "grad_norm": 4.03125, "learning_rate": 4.9650953117333275e-05, "loss": 0.2688, "mean_token_accuracy": 0.9447404503822326, "num_tokens": 40828370.0, "step": 18345 }, { "entropy": 0.059681543800979855, "epoch": 4.277421610910363, "grad_norm": 0.388671875, "learning_rate": 4.9650594831783724e-05, "loss": 0.0428, "mean_token_accuracy": 0.9817675530910492, "num_tokens": 40866610.0, "step": 18350 }, { "entropy": 0.09934973865747451, "epoch": 4.278587247930994, "grad_norm": 7.09375, "learning_rate": 4.9650236365050525e-05, "loss": 0.1247, "mean_token_accuracy": 0.9634607434272766, "num_tokens": 40879155.0, "step": 18355 }, { "entropy": 0.06509402473457157, "epoch": 4.279752884951626, "grad_norm": 0.341796875, "learning_rate": 4.9649877717139026e-05, "loss": 0.0763, "mean_token_accuracy": 0.9754965901374817, "num_tokens": 40898689.0, "step": 18360 }, { "entropy": 0.0826711606234312, "epoch": 4.280918521972258, "grad_norm": 2.46875, "learning_rate": 4.964951888805458e-05, "loss": 0.0876, "mean_token_accuracy": 0.9787150621414185, "num_tokens": 40917534.0, "step": 18365 }, { "entropy": 0.13180441725999117, "epoch": 4.282084158992889, "grad_norm": 2.453125, "learning_rate": 4.9649159877802524e-05, "loss": 0.1874, "mean_token_accuracy": 0.9560307621955871, "num_tokens": 40944410.0, "step": 18370 }, { "entropy": 0.06450794208794833, "epoch": 4.283249796013521, "grad_norm": 2.453125, "learning_rate": 4.964880068638823e-05, "loss": 0.0735, "mean_token_accuracy": 0.9780965685844422, "num_tokens": 40960755.0, "step": 18375 }, { "entropy": 0.04702851264737547, "epoch": 4.2844154330341535, "grad_norm": 0.2119140625, "learning_rate": 4.964844131381704e-05, "loss": 0.0457, "mean_token_accuracy": 0.986723518371582, "num_tokens": 40990086.0, "step": 18380 }, { "entropy": 0.11209305711090564, "epoch": 4.285581070054785, "grad_norm": 1.59375, "learning_rate": 4.9648081760094324e-05, "loss": 0.1346, "mean_token_accuracy": 0.967307448387146, "num_tokens": 41010686.0, "step": 18385 }, { "entropy": 0.08510778667405247, "epoch": 4.286746707075417, "grad_norm": 0.69921875, "learning_rate": 4.964772202522543e-05, "loss": 0.1131, "mean_token_accuracy": 0.9747722148895264, "num_tokens": 41029033.0, "step": 18390 }, { "entropy": 0.06896943571045995, "epoch": 4.287912344096048, "grad_norm": 5.53125, "learning_rate": 4.9647362109215735e-05, "loss": 0.0983, "mean_token_accuracy": 0.9716754376888275, "num_tokens": 41046478.0, "step": 18395 }, { "entropy": 0.04878347143530846, "epoch": 4.28907798111668, "grad_norm": 0.482421875, "learning_rate": 4.96470020120706e-05, "loss": 0.0798, "mean_token_accuracy": 0.9811624825000763, "num_tokens": 41078644.0, "step": 18400 }, { "entropy": 0.05125298034399748, "epoch": 4.290243618137312, "grad_norm": 1.4609375, "learning_rate": 4.964664173379539e-05, "loss": 0.0611, "mean_token_accuracy": 0.9861126601696014, "num_tokens": 41099848.0, "step": 18405 }, { "entropy": 0.07087703254073859, "epoch": 4.2914092551579435, "grad_norm": 0.53515625, "learning_rate": 4.9646281274395484e-05, "loss": 0.0763, "mean_token_accuracy": 0.9810361623764038, "num_tokens": 41127755.0, "step": 18410 }, { "entropy": 0.06897338693961501, "epoch": 4.292574892178576, "grad_norm": 0.3671875, "learning_rate": 4.964592063387625e-05, "loss": 0.0599, "mean_token_accuracy": 0.9793107509613037, "num_tokens": 41151591.0, "step": 18415 }, { "entropy": 0.17628241926431656, "epoch": 4.293740529199208, "grad_norm": 0.74609375, "learning_rate": 4.964555981224308e-05, "loss": 0.192, "mean_token_accuracy": 0.937102484703064, "num_tokens": 41186067.0, "step": 18420 }, { "entropy": 0.07771440567448736, "epoch": 4.294906166219839, "grad_norm": 4.8125, "learning_rate": 4.964519880950134e-05, "loss": 0.0837, "mean_token_accuracy": 0.9757640421390533, "num_tokens": 41206154.0, "step": 18425 }, { "entropy": 0.047894996032118796, "epoch": 4.296071803240471, "grad_norm": 2.03125, "learning_rate": 4.9644837625656425e-05, "loss": 0.0532, "mean_token_accuracy": 0.9868818938732147, "num_tokens": 41224421.0, "step": 18430 }, { "entropy": 0.05917087513953447, "epoch": 4.297237440261103, "grad_norm": 0.5078125, "learning_rate": 4.964447626071371e-05, "loss": 0.039, "mean_token_accuracy": 0.9861246168613433, "num_tokens": 41260158.0, "step": 18435 }, { "entropy": 0.06869417782872915, "epoch": 4.298403077281734, "grad_norm": 3.609375, "learning_rate": 4.964411471467859e-05, "loss": 0.0962, "mean_token_accuracy": 0.9750606179237366, "num_tokens": 41276685.0, "step": 18440 }, { "entropy": 0.050992142967879774, "epoch": 4.299568714302366, "grad_norm": 0.296875, "learning_rate": 4.964375298755645e-05, "loss": 0.0411, "mean_token_accuracy": 0.9854860126972198, "num_tokens": 41313001.0, "step": 18445 }, { "entropy": 0.05416600527241826, "epoch": 4.300734351322998, "grad_norm": 3.3125, "learning_rate": 4.9643391079352684e-05, "loss": 0.0433, "mean_token_accuracy": 0.9865663528442383, "num_tokens": 41350294.0, "step": 18450 }, { "entropy": 0.06011302322149277, "epoch": 4.30189998834363, "grad_norm": 1.3046875, "learning_rate": 4.96430289900727e-05, "loss": 0.0603, "mean_token_accuracy": 0.9830720722675323, "num_tokens": 41372750.0, "step": 18455 }, { "entropy": 0.07380872648209333, "epoch": 4.303065625364262, "grad_norm": 2.0, "learning_rate": 4.964266671972189e-05, "loss": 0.0509, "mean_token_accuracy": 0.9841031551361084, "num_tokens": 41393418.0, "step": 18460 }, { "entropy": 0.06437693070620298, "epoch": 4.304231262384893, "grad_norm": 2.5625, "learning_rate": 4.964230426830564e-05, "loss": 0.0539, "mean_token_accuracy": 0.9840012729167938, "num_tokens": 41413683.0, "step": 18465 }, { "entropy": 0.08422975167632103, "epoch": 4.305396899405525, "grad_norm": 0.828125, "learning_rate": 4.9641941635829384e-05, "loss": 0.0663, "mean_token_accuracy": 0.9783717930316925, "num_tokens": 41437122.0, "step": 18470 }, { "entropy": 0.0769905123859644, "epoch": 4.306562536426157, "grad_norm": 1.78125, "learning_rate": 4.964157882229852e-05, "loss": 0.1124, "mean_token_accuracy": 0.9747823476791382, "num_tokens": 41446558.0, "step": 18475 }, { "entropy": 0.0603149157948792, "epoch": 4.3077281734467885, "grad_norm": 4.46875, "learning_rate": 4.9641215827718444e-05, "loss": 0.078, "mean_token_accuracy": 0.9764757096767426, "num_tokens": 41466108.0, "step": 18480 }, { "entropy": 0.05723539115861058, "epoch": 4.308893810467421, "grad_norm": 0.50390625, "learning_rate": 4.9640852652094586e-05, "loss": 0.0428, "mean_token_accuracy": 0.9779875040054321, "num_tokens": 41494076.0, "step": 18485 }, { "entropy": 0.08054400235414505, "epoch": 4.310059447488052, "grad_norm": 0.546875, "learning_rate": 4.964048929543235e-05, "loss": 0.093, "mean_token_accuracy": 0.9737628102302551, "num_tokens": 41517453.0, "step": 18490 }, { "entropy": 0.07607092708349228, "epoch": 4.311225084508684, "grad_norm": 0.70703125, "learning_rate": 4.9640125757737156e-05, "loss": 0.0751, "mean_token_accuracy": 0.9802489399909973, "num_tokens": 41545604.0, "step": 18495 }, { "entropy": 0.06756752827204764, "epoch": 4.312390721529316, "grad_norm": 0.1767578125, "learning_rate": 4.9639762039014434e-05, "loss": 0.0619, "mean_token_accuracy": 0.9819507598876953, "num_tokens": 41571454.0, "step": 18500 }, { "entropy": 0.07994569651782513, "epoch": 4.313556358549947, "grad_norm": 3.953125, "learning_rate": 4.9639398139269597e-05, "loss": 0.105, "mean_token_accuracy": 0.9740553617477417, "num_tokens": 41586939.0, "step": 18505 }, { "entropy": 0.07430811729282141, "epoch": 4.314721995570579, "grad_norm": 0.97265625, "learning_rate": 4.963903405850807e-05, "loss": 0.1093, "mean_token_accuracy": 0.9766282320022583, "num_tokens": 41598356.0, "step": 18510 }, { "entropy": 0.0700147021561861, "epoch": 4.315887632591211, "grad_norm": 3.328125, "learning_rate": 4.9638669796735295e-05, "loss": 0.0764, "mean_token_accuracy": 0.9803567469120026, "num_tokens": 41611133.0, "step": 18515 }, { "entropy": 0.08196193277835846, "epoch": 4.317053269611843, "grad_norm": 0.74609375, "learning_rate": 4.9638305353956694e-05, "loss": 0.083, "mean_token_accuracy": 0.975335168838501, "num_tokens": 41632870.0, "step": 18520 }, { "entropy": 0.07142581269145012, "epoch": 4.318218906632475, "grad_norm": 4.3125, "learning_rate": 4.96379407301777e-05, "loss": 0.0885, "mean_token_accuracy": 0.9793270707130433, "num_tokens": 41654489.0, "step": 18525 }, { "entropy": 0.05450365114957094, "epoch": 4.319384543653106, "grad_norm": 0.76953125, "learning_rate": 4.9637575925403755e-05, "loss": 0.059, "mean_token_accuracy": 0.9803845226764679, "num_tokens": 41680852.0, "step": 18530 }, { "entropy": 0.08670583032071591, "epoch": 4.320550180673738, "grad_norm": 3.25, "learning_rate": 4.96372109396403e-05, "loss": 0.0813, "mean_token_accuracy": 0.9761635422706604, "num_tokens": 41699379.0, "step": 18535 }, { "entropy": 0.08022739067673683, "epoch": 4.32171581769437, "grad_norm": 1.1171875, "learning_rate": 4.963684577289277e-05, "loss": 0.0909, "mean_token_accuracy": 0.9803108692169189, "num_tokens": 41710174.0, "step": 18540 }, { "entropy": 0.05595881547778845, "epoch": 4.322881454715001, "grad_norm": 2.71875, "learning_rate": 4.963648042516661e-05, "loss": 0.0616, "mean_token_accuracy": 0.9813150882720947, "num_tokens": 41738717.0, "step": 18545 }, { "entropy": 0.06690775705501437, "epoch": 4.3240470917356335, "grad_norm": 2.546875, "learning_rate": 4.963611489646728e-05, "loss": 0.1002, "mean_token_accuracy": 0.9740199089050293, "num_tokens": 41755906.0, "step": 18550 }, { "entropy": 0.07001672107726335, "epoch": 4.325212728756266, "grad_norm": 0.640625, "learning_rate": 4.9635749186800225e-05, "loss": 0.0494, "mean_token_accuracy": 0.9815467417240142, "num_tokens": 41784497.0, "step": 18555 }, { "entropy": 0.053368914686143396, "epoch": 4.326378365776897, "grad_norm": 1.5625, "learning_rate": 4.963538329617089e-05, "loss": 0.0466, "mean_token_accuracy": 0.9881910800933837, "num_tokens": 41803148.0, "step": 18560 }, { "entropy": 0.07704963702708482, "epoch": 4.327544002797529, "grad_norm": 1.265625, "learning_rate": 4.963501722458474e-05, "loss": 0.0593, "mean_token_accuracy": 0.9782488167285919, "num_tokens": 41823408.0, "step": 18565 }, { "entropy": 0.07265666145831347, "epoch": 4.328709639818161, "grad_norm": 0.416015625, "learning_rate": 4.9634650972047235e-05, "loss": 0.0633, "mean_token_accuracy": 0.9832420825958252, "num_tokens": 41841017.0, "step": 18570 }, { "entropy": 0.06052091708406806, "epoch": 4.329875276838792, "grad_norm": 3.328125, "learning_rate": 4.963428453856383e-05, "loss": 0.046, "mean_token_accuracy": 0.9867750465869903, "num_tokens": 41875414.0, "step": 18575 }, { "entropy": 0.08009387571364641, "epoch": 4.331040913859424, "grad_norm": 2.5625, "learning_rate": 4.963391792413999e-05, "loss": 0.0915, "mean_token_accuracy": 0.9757187187671661, "num_tokens": 41902076.0, "step": 18580 }, { "entropy": 0.08106917953118682, "epoch": 4.332206550880056, "grad_norm": 1.296875, "learning_rate": 4.9633551128781186e-05, "loss": 0.0679, "mean_token_accuracy": 0.980489581823349, "num_tokens": 41933448.0, "step": 18585 }, { "entropy": 0.09040620159357786, "epoch": 4.333372187900688, "grad_norm": 3.53125, "learning_rate": 4.963318415249289e-05, "loss": 0.1102, "mean_token_accuracy": 0.9753665149211883, "num_tokens": 41953266.0, "step": 18590 }, { "entropy": 0.06375911897048354, "epoch": 4.33453782492132, "grad_norm": 6.1875, "learning_rate": 4.963281699528055e-05, "loss": 0.0859, "mean_token_accuracy": 0.9811161577701568, "num_tokens": 41983894.0, "step": 18595 }, { "entropy": 0.08802753314375877, "epoch": 4.335703461941951, "grad_norm": 3.421875, "learning_rate": 4.963244965714968e-05, "loss": 0.0661, "mean_token_accuracy": 0.9772323071956635, "num_tokens": 41997012.0, "step": 18600 }, { "entropy": 0.060184755455702545, "epoch": 4.336869098962583, "grad_norm": 1.2734375, "learning_rate": 4.9632082138105726e-05, "loss": 0.0496, "mean_token_accuracy": 0.983251416683197, "num_tokens": 42017776.0, "step": 18605 }, { "entropy": 0.06990799438208342, "epoch": 4.338034735983215, "grad_norm": 3.484375, "learning_rate": 4.963171443815418e-05, "loss": 0.0899, "mean_token_accuracy": 0.9801129817962646, "num_tokens": 42051407.0, "step": 18610 }, { "entropy": 0.06780791487544775, "epoch": 4.339200373003846, "grad_norm": 4.21875, "learning_rate": 4.963134655730053e-05, "loss": 0.0529, "mean_token_accuracy": 0.9823335826396942, "num_tokens": 42077316.0, "step": 18615 }, { "entropy": 0.09103262685239315, "epoch": 4.3403660100244785, "grad_norm": 0.7109375, "learning_rate": 4.963097849555025e-05, "loss": 0.0988, "mean_token_accuracy": 0.9731744289398193, "num_tokens": 42097680.0, "step": 18620 }, { "entropy": 0.06253818608820438, "epoch": 4.34153164704511, "grad_norm": 3.375, "learning_rate": 4.963061025290884e-05, "loss": 0.0494, "mean_token_accuracy": 0.9833706498146058, "num_tokens": 42118703.0, "step": 18625 }, { "entropy": 0.08283813260495662, "epoch": 4.342697284065742, "grad_norm": 2.234375, "learning_rate": 4.963024182938179e-05, "loss": 0.0689, "mean_token_accuracy": 0.9750663280487061, "num_tokens": 42135563.0, "step": 18630 }, { "entropy": 0.07473634798079729, "epoch": 4.343862921086374, "grad_norm": 2.390625, "learning_rate": 4.962987322497458e-05, "loss": 0.0715, "mean_token_accuracy": 0.9804068505764008, "num_tokens": 42154972.0, "step": 18635 }, { "entropy": 0.08067142516374588, "epoch": 4.345028558107005, "grad_norm": 1.5078125, "learning_rate": 4.9629504439692717e-05, "loss": 0.1013, "mean_token_accuracy": 0.977970826625824, "num_tokens": 42166016.0, "step": 18640 }, { "entropy": 0.045968357706442475, "epoch": 4.346194195127637, "grad_norm": 1.1171875, "learning_rate": 4.96291354735417e-05, "loss": 0.0356, "mean_token_accuracy": 0.9882686376571655, "num_tokens": 42192263.0, "step": 18645 }, { "entropy": 0.05396311990916729, "epoch": 4.347359832148269, "grad_norm": 0.73828125, "learning_rate": 4.962876632652703e-05, "loss": 0.045, "mean_token_accuracy": 0.984005081653595, "num_tokens": 42225107.0, "step": 18650 }, { "entropy": 0.07691260352730751, "epoch": 4.348525469168901, "grad_norm": 2.640625, "learning_rate": 4.962839699865421e-05, "loss": 0.0457, "mean_token_accuracy": 0.9825477838516236, "num_tokens": 42247555.0, "step": 18655 }, { "entropy": 0.07492669681087136, "epoch": 4.349691106189533, "grad_norm": 1.3671875, "learning_rate": 4.962802748992875e-05, "loss": 0.1, "mean_token_accuracy": 0.9735464215278625, "num_tokens": 42266583.0, "step": 18660 }, { "entropy": 0.04886567974463105, "epoch": 4.350856743210164, "grad_norm": 1.734375, "learning_rate": 4.962765780035616e-05, "loss": 0.0496, "mean_token_accuracy": 0.9842284440994262, "num_tokens": 42289094.0, "step": 18665 }, { "entropy": 0.08082684567198158, "epoch": 4.352022380230796, "grad_norm": 0.703125, "learning_rate": 4.962728792994196e-05, "loss": 0.0916, "mean_token_accuracy": 0.9775077998638153, "num_tokens": 42320993.0, "step": 18670 }, { "entropy": 0.14611582197248935, "epoch": 4.353188017251428, "grad_norm": 2.015625, "learning_rate": 4.962691787869164e-05, "loss": 0.2335, "mean_token_accuracy": 0.9521386444568634, "num_tokens": 42350358.0, "step": 18675 }, { "entropy": 0.057045502867549655, "epoch": 4.354353654272059, "grad_norm": 0.318359375, "learning_rate": 4.962654764661074e-05, "loss": 0.0453, "mean_token_accuracy": 0.9859548568725586, "num_tokens": 42391759.0, "step": 18680 }, { "entropy": 0.0750160675495863, "epoch": 4.355519291292691, "grad_norm": 0.30078125, "learning_rate": 4.962617723370478e-05, "loss": 0.0515, "mean_token_accuracy": 0.9814619541168212, "num_tokens": 42420280.0, "step": 18685 }, { "entropy": 0.06027145287953317, "epoch": 4.3566849283133235, "grad_norm": 0.30859375, "learning_rate": 4.962580663997928e-05, "loss": 0.0426, "mean_token_accuracy": 0.9794057428836822, "num_tokens": 42453195.0, "step": 18690 }, { "entropy": 0.0616011893376708, "epoch": 4.357850565333955, "grad_norm": 0.66796875, "learning_rate": 4.9625435865439756e-05, "loss": 0.0592, "mean_token_accuracy": 0.9814781188964844, "num_tokens": 42469008.0, "step": 18695 }, { "entropy": 0.06871538469567895, "epoch": 4.359016202354587, "grad_norm": 4.3125, "learning_rate": 4.9625064910091753e-05, "loss": 0.1116, "mean_token_accuracy": 0.9721579551696777, "num_tokens": 42484841.0, "step": 18700 }, { "entropy": 0.08792795054614544, "epoch": 4.360181839375219, "grad_norm": 7.78125, "learning_rate": 4.962469377394079e-05, "loss": 0.1013, "mean_token_accuracy": 0.9731840312480926, "num_tokens": 42513259.0, "step": 18705 }, { "entropy": 0.12590097589418292, "epoch": 4.36134747639585, "grad_norm": 0.5625, "learning_rate": 4.962432245699241e-05, "loss": 0.1736, "mean_token_accuracy": 0.9640897929668426, "num_tokens": 42537247.0, "step": 18710 }, { "entropy": 0.0786399308592081, "epoch": 4.362513113416482, "grad_norm": 2.625, "learning_rate": 4.962395095925214e-05, "loss": 0.104, "mean_token_accuracy": 0.9724407613277435, "num_tokens": 42555543.0, "step": 18715 }, { "entropy": 0.0821190400980413, "epoch": 4.3636787504371135, "grad_norm": 0.390625, "learning_rate": 4.962357928072553e-05, "loss": 0.0986, "mean_token_accuracy": 0.9734818339347839, "num_tokens": 42578885.0, "step": 18720 }, { "entropy": 0.08184778075665236, "epoch": 4.364844387457746, "grad_norm": 2.046875, "learning_rate": 4.962320742141812e-05, "loss": 0.0699, "mean_token_accuracy": 0.9773243069648743, "num_tokens": 42604667.0, "step": 18725 }, { "entropy": 0.07144193323329091, "epoch": 4.366010024478378, "grad_norm": 0.412109375, "learning_rate": 4.962283538133545e-05, "loss": 0.0712, "mean_token_accuracy": 0.9800425589084625, "num_tokens": 42618866.0, "step": 18730 }, { "entropy": 0.04785723239183426, "epoch": 4.367175661499009, "grad_norm": 3.78125, "learning_rate": 4.962246316048307e-05, "loss": 0.0507, "mean_token_accuracy": 0.9854453265666961, "num_tokens": 42638384.0, "step": 18735 }, { "entropy": 0.08500594049692153, "epoch": 4.368341298519641, "grad_norm": 2.875, "learning_rate": 4.9622090758866534e-05, "loss": 0.1412, "mean_token_accuracy": 0.9683994054794312, "num_tokens": 42646730.0, "step": 18740 }, { "entropy": 0.06168932262808084, "epoch": 4.369506935540273, "grad_norm": 4.0, "learning_rate": 4.962171817649139e-05, "loss": 0.0715, "mean_token_accuracy": 0.9813813984394073, "num_tokens": 42665053.0, "step": 18745 }, { "entropy": 0.06118791922926903, "epoch": 4.370672572560904, "grad_norm": 3.796875, "learning_rate": 4.962134541336319e-05, "loss": 0.0952, "mean_token_accuracy": 0.9771844387054444, "num_tokens": 42675690.0, "step": 18750 }, { "entropy": 0.08084941320121289, "epoch": 4.3718382095815365, "grad_norm": 1.8984375, "learning_rate": 4.9620972469487515e-05, "loss": 0.1111, "mean_token_accuracy": 0.9746397316455842, "num_tokens": 42689042.0, "step": 18755 }, { "entropy": 0.061972387880086896, "epoch": 4.373003846602168, "grad_norm": 2.265625, "learning_rate": 4.96205993448699e-05, "loss": 0.0656, "mean_token_accuracy": 0.9821341514587403, "num_tokens": 42711886.0, "step": 18760 }, { "entropy": 0.06485986206680536, "epoch": 4.3741694836228, "grad_norm": 0.6015625, "learning_rate": 4.962022603951592e-05, "loss": 0.0781, "mean_token_accuracy": 0.9823011875152587, "num_tokens": 42743918.0, "step": 18765 }, { "entropy": 0.06315469332039356, "epoch": 4.375335120643432, "grad_norm": 0.52734375, "learning_rate": 4.961985255343113e-05, "loss": 0.0627, "mean_token_accuracy": 0.9816456615924836, "num_tokens": 42758201.0, "step": 18770 }, { "entropy": 0.06303026769310235, "epoch": 4.376500757664063, "grad_norm": 1.71875, "learning_rate": 4.961947888662112e-05, "loss": 0.0685, "mean_token_accuracy": 0.9812012195587159, "num_tokens": 42775902.0, "step": 18775 }, { "entropy": 0.05505295917391777, "epoch": 4.377666394684695, "grad_norm": 0.609375, "learning_rate": 4.961910503909145e-05, "loss": 0.0615, "mean_token_accuracy": 0.9768405258655548, "num_tokens": 42796697.0, "step": 18780 }, { "entropy": 0.060315626300871374, "epoch": 4.378832031705327, "grad_norm": 1.0546875, "learning_rate": 4.96187310108477e-05, "loss": 0.065, "mean_token_accuracy": 0.9849414944648742, "num_tokens": 42813530.0, "step": 18785 }, { "entropy": 0.11221035532653331, "epoch": 4.3799976687259585, "grad_norm": 3.953125, "learning_rate": 4.961835680189543e-05, "loss": 0.1438, "mean_token_accuracy": 0.9659707307815552, "num_tokens": 42835504.0, "step": 18790 }, { "entropy": 0.054355106130242345, "epoch": 4.381163305746591, "grad_norm": 3.734375, "learning_rate": 4.961798241224024e-05, "loss": 0.0747, "mean_token_accuracy": 0.9837758183479309, "num_tokens": 42851192.0, "step": 18795 }, { "entropy": 0.05948843127116561, "epoch": 4.382328942767222, "grad_norm": 5.34375, "learning_rate": 4.9617607841887707e-05, "loss": 0.0572, "mean_token_accuracy": 0.9820643961429596, "num_tokens": 42873119.0, "step": 18800 }, { "entropy": 0.07751112319529056, "epoch": 4.383494579787854, "grad_norm": 1.21875, "learning_rate": 4.961723309084341e-05, "loss": 0.1139, "mean_token_accuracy": 0.9708074569702149, "num_tokens": 42882581.0, "step": 18805 }, { "entropy": 0.08554555289447308, "epoch": 4.384660216808486, "grad_norm": 1.6328125, "learning_rate": 4.961685815911295e-05, "loss": 0.1082, "mean_token_accuracy": 0.9752114892005921, "num_tokens": 42891543.0, "step": 18810 }, { "entropy": 0.062315055076032876, "epoch": 4.385825853829117, "grad_norm": 0.6953125, "learning_rate": 4.961648304670191e-05, "loss": 0.0535, "mean_token_accuracy": 0.9814134836196899, "num_tokens": 42919010.0, "step": 18815 }, { "entropy": 0.068437035754323, "epoch": 4.386991490849749, "grad_norm": 2.71875, "learning_rate": 4.961610775361588e-05, "loss": 0.0919, "mean_token_accuracy": 0.9802298843860626, "num_tokens": 42933957.0, "step": 18820 }, { "entropy": 0.07225360907614231, "epoch": 4.3881571278703815, "grad_norm": 2.65625, "learning_rate": 4.961573227986045e-05, "loss": 0.0631, "mean_token_accuracy": 0.9835760712623596, "num_tokens": 42962988.0, "step": 18825 }, { "entropy": 0.06295080268755555, "epoch": 4.389322764891013, "grad_norm": 0.330078125, "learning_rate": 4.9615356625441246e-05, "loss": 0.1016, "mean_token_accuracy": 0.9730377972126008, "num_tokens": 42983746.0, "step": 18830 }, { "entropy": 0.06006499025970698, "epoch": 4.390488401911645, "grad_norm": 2.359375, "learning_rate": 4.961498079036384e-05, "loss": 0.0629, "mean_token_accuracy": 0.980323189496994, "num_tokens": 43002500.0, "step": 18835 }, { "entropy": 0.06451915800571442, "epoch": 4.391654038932277, "grad_norm": 3.734375, "learning_rate": 4.961460477463385e-05, "loss": 0.0428, "mean_token_accuracy": 0.9831142783164978, "num_tokens": 43030396.0, "step": 18840 }, { "entropy": 0.10234779641032218, "epoch": 4.392819675952908, "grad_norm": 3.9375, "learning_rate": 4.961422857825689e-05, "loss": 0.1293, "mean_token_accuracy": 0.9682917714118957, "num_tokens": 43037539.0, "step": 18845 }, { "entropy": 0.1020116938278079, "epoch": 4.39398531297354, "grad_norm": 1.4921875, "learning_rate": 4.961385220123855e-05, "loss": 0.1034, "mean_token_accuracy": 0.9642068803310394, "num_tokens": 43050680.0, "step": 18850 }, { "entropy": 0.04995893612504006, "epoch": 4.395150949994171, "grad_norm": 0.43359375, "learning_rate": 4.961347564358446e-05, "loss": 0.0376, "mean_token_accuracy": 0.9891486525535583, "num_tokens": 43078610.0, "step": 18855 }, { "entropy": 0.07869170345366001, "epoch": 4.3963165870148035, "grad_norm": 2.796875, "learning_rate": 4.961309890530023e-05, "loss": 0.0852, "mean_token_accuracy": 0.9785373866558075, "num_tokens": 43098034.0, "step": 18860 }, { "entropy": 0.06370147401466966, "epoch": 4.397482224035436, "grad_norm": 2.53125, "learning_rate": 4.9612721986391474e-05, "loss": 0.0721, "mean_token_accuracy": 0.9796221852302551, "num_tokens": 43115615.0, "step": 18865 }, { "entropy": 0.06276164511218667, "epoch": 4.398647861056067, "grad_norm": 0.3515625, "learning_rate": 4.961234488686382e-05, "loss": 0.0751, "mean_token_accuracy": 0.9809258460998536, "num_tokens": 43138300.0, "step": 18870 }, { "entropy": 0.07407207041978836, "epoch": 4.399813498076699, "grad_norm": 1.5859375, "learning_rate": 4.961196760672288e-05, "loss": 0.0753, "mean_token_accuracy": 0.9747194409370422, "num_tokens": 43161841.0, "step": 18875 }, { "entropy": 0.05361275505274534, "epoch": 4.400979135097331, "grad_norm": 3.609375, "learning_rate": 4.961159014597428e-05, "loss": 0.0564, "mean_token_accuracy": 0.9803205966949463, "num_tokens": 43179908.0, "step": 18880 }, { "entropy": 0.08528228178620338, "epoch": 4.402144772117962, "grad_norm": 1.984375, "learning_rate": 4.961121250462366e-05, "loss": 0.1197, "mean_token_accuracy": 0.9708484292030335, "num_tokens": 43188893.0, "step": 18885 }, { "entropy": 0.0671801614575088, "epoch": 4.403310409138594, "grad_norm": 1.75, "learning_rate": 4.961083468267664e-05, "loss": 0.0874, "mean_token_accuracy": 0.9767676532268524, "num_tokens": 43227304.0, "step": 18890 }, { "entropy": 0.053513195179402825, "epoch": 4.404476046159226, "grad_norm": 0.80078125, "learning_rate": 4.961045668013887e-05, "loss": 0.0373, "mean_token_accuracy": 0.98809694647789, "num_tokens": 43252473.0, "step": 18895 }, { "entropy": 0.08175447061657906, "epoch": 4.405641683179858, "grad_norm": 2.765625, "learning_rate": 4.961007849701596e-05, "loss": 0.1035, "mean_token_accuracy": 0.9751053869724273, "num_tokens": 43264002.0, "step": 18900 }, { "entropy": 0.08174102194607258, "epoch": 4.40680732020049, "grad_norm": 1.1171875, "learning_rate": 4.960970013331358e-05, "loss": 0.0854, "mean_token_accuracy": 0.9750945091247558, "num_tokens": 43278951.0, "step": 18905 }, { "entropy": 0.07742302156984807, "epoch": 4.407972957221121, "grad_norm": 2.1875, "learning_rate": 4.960932158903735e-05, "loss": 0.0814, "mean_token_accuracy": 0.9777226269245147, "num_tokens": 43292822.0, "step": 18910 }, { "entropy": 0.048268615175038575, "epoch": 4.409138594241753, "grad_norm": 0.5, "learning_rate": 4.960894286419293e-05, "loss": 0.0507, "mean_token_accuracy": 0.9873724281787872, "num_tokens": 43326562.0, "step": 18915 }, { "entropy": 0.05129559133201837, "epoch": 4.410304231262385, "grad_norm": 0.4375, "learning_rate": 4.9608563958785945e-05, "loss": 0.0574, "mean_token_accuracy": 0.9859373986721038, "num_tokens": 43359792.0, "step": 18920 }, { "entropy": 0.06564451195299625, "epoch": 4.4114698682830165, "grad_norm": 5.90625, "learning_rate": 4.9608184872822065e-05, "loss": 0.0946, "mean_token_accuracy": 0.978387588262558, "num_tokens": 43383462.0, "step": 18925 }, { "entropy": 0.08527331128716469, "epoch": 4.412635505303649, "grad_norm": 2.28125, "learning_rate": 4.960780560630694e-05, "loss": 0.0894, "mean_token_accuracy": 0.9752187550067901, "num_tokens": 43414584.0, "step": 18930 }, { "entropy": 0.07463353797793389, "epoch": 4.41380114232428, "grad_norm": 3.15625, "learning_rate": 4.9607426159246226e-05, "loss": 0.0691, "mean_token_accuracy": 0.9808345139026642, "num_tokens": 43445164.0, "step": 18935 }, { "entropy": 0.06146889589726925, "epoch": 4.414966779344912, "grad_norm": 1.4296875, "learning_rate": 4.960704653164557e-05, "loss": 0.051, "mean_token_accuracy": 0.9783270239830018, "num_tokens": 43463080.0, "step": 18940 }, { "entropy": 0.07272496372461319, "epoch": 4.416132416365544, "grad_norm": 1.046875, "learning_rate": 4.960666672351064e-05, "loss": 0.1171, "mean_token_accuracy": 0.971141928434372, "num_tokens": 43472428.0, "step": 18945 }, { "entropy": 0.05846181372180581, "epoch": 4.417298053386175, "grad_norm": 0.6640625, "learning_rate": 4.96062867348471e-05, "loss": 0.0519, "mean_token_accuracy": 0.9783796727657318, "num_tokens": 43496040.0, "step": 18950 }, { "entropy": 0.07526832111179829, "epoch": 4.418463690406807, "grad_norm": 0.51953125, "learning_rate": 4.960590656566062e-05, "loss": 0.0657, "mean_token_accuracy": 0.9777651190757751, "num_tokens": 43515069.0, "step": 18955 }, { "entropy": 0.06531362514942884, "epoch": 4.419629327427439, "grad_norm": 1.03125, "learning_rate": 4.960552621595686e-05, "loss": 0.0776, "mean_token_accuracy": 0.9797972917556763, "num_tokens": 43534735.0, "step": 18960 }, { "entropy": 0.07656772956252098, "epoch": 4.420794964448071, "grad_norm": 2.171875, "learning_rate": 4.96051456857415e-05, "loss": 0.0941, "mean_token_accuracy": 0.9756794929504394, "num_tokens": 43543891.0, "step": 18965 }, { "entropy": 0.06007896903902292, "epoch": 4.421960601468703, "grad_norm": 0.2470703125, "learning_rate": 4.960476497502021e-05, "loss": 0.0712, "mean_token_accuracy": 0.9770024001598359, "num_tokens": 43567670.0, "step": 18970 }, { "entropy": 0.07055467199534178, "epoch": 4.423126238489335, "grad_norm": 1.734375, "learning_rate": 4.960438408379867e-05, "loss": 0.0692, "mean_token_accuracy": 0.9769983887672424, "num_tokens": 43583601.0, "step": 18975 }, { "entropy": 0.0902567700482905, "epoch": 4.424291875509966, "grad_norm": 1.7109375, "learning_rate": 4.960400301208255e-05, "loss": 0.0891, "mean_token_accuracy": 0.9708002507686615, "num_tokens": 43609479.0, "step": 18980 }, { "entropy": 0.07092046402394772, "epoch": 4.425457512530598, "grad_norm": 6.65625, "learning_rate": 4.9603621759877544e-05, "loss": 0.0864, "mean_token_accuracy": 0.9759444415569305, "num_tokens": 43631194.0, "step": 18985 }, { "entropy": 0.09915064247325063, "epoch": 4.426623149551229, "grad_norm": 7.53125, "learning_rate": 4.9603240327189335e-05, "loss": 0.0668, "mean_token_accuracy": 0.9716208279132843, "num_tokens": 43653279.0, "step": 18990 }, { "entropy": 0.05771286068484187, "epoch": 4.4277887865718615, "grad_norm": 1.5546875, "learning_rate": 4.9602858714023605e-05, "loss": 0.0702, "mean_token_accuracy": 0.9814035415649414, "num_tokens": 43673620.0, "step": 18995 }, { "entropy": 0.06412562178447842, "epoch": 4.428954423592494, "grad_norm": 6.0, "learning_rate": 4.960247692038605e-05, "loss": 0.1015, "mean_token_accuracy": 0.9743063271045684, "num_tokens": 43690001.0, "step": 19000 }, { "entropy": 0.07472214587032795, "epoch": 4.430120060613125, "grad_norm": 3.359375, "learning_rate": 4.960209494628236e-05, "loss": 0.0712, "mean_token_accuracy": 0.9787097334861755, "num_tokens": 43704465.0, "step": 19005 }, { "entropy": 0.06662010606378317, "epoch": 4.431285697633757, "grad_norm": 0.84375, "learning_rate": 4.960171279171823e-05, "loss": 0.0735, "mean_token_accuracy": 0.9809480428695678, "num_tokens": 43716993.0, "step": 19010 }, { "entropy": 0.08009600536897779, "epoch": 4.432451334654389, "grad_norm": 0.25, "learning_rate": 4.9601330456699366e-05, "loss": 0.0857, "mean_token_accuracy": 0.9711295068264008, "num_tokens": 43750588.0, "step": 19015 }, { "entropy": 0.09564240127801896, "epoch": 4.43361697167502, "grad_norm": 2.328125, "learning_rate": 4.960094794123146e-05, "loss": 0.1229, "mean_token_accuracy": 0.970706331729889, "num_tokens": 43761979.0, "step": 19020 }, { "entropy": 0.06584495399147272, "epoch": 4.434782608695652, "grad_norm": 0.68359375, "learning_rate": 4.9600565245320215e-05, "loss": 0.0742, "mean_token_accuracy": 0.9821219086647034, "num_tokens": 43793503.0, "step": 19025 }, { "entropy": 0.049910994991660115, "epoch": 4.4359482457162835, "grad_norm": 0.66015625, "learning_rate": 4.9600182368971344e-05, "loss": 0.0401, "mean_token_accuracy": 0.98340545296669, "num_tokens": 43822739.0, "step": 19030 }, { "entropy": 0.10595990158617496, "epoch": 4.437113882736916, "grad_norm": 2.578125, "learning_rate": 4.959979931219056e-05, "loss": 0.0965, "mean_token_accuracy": 0.9769360482692718, "num_tokens": 43834491.0, "step": 19035 }, { "entropy": 0.09281698856502771, "epoch": 4.438279519757548, "grad_norm": 3.921875, "learning_rate": 4.959941607498356e-05, "loss": 0.1186, "mean_token_accuracy": 0.9695560753345489, "num_tokens": 43867073.0, "step": 19040 }, { "entropy": 0.10635361950844527, "epoch": 4.439445156778179, "grad_norm": 5.375, "learning_rate": 4.959903265735607e-05, "loss": 0.1358, "mean_token_accuracy": 0.9646057069301606, "num_tokens": 43880104.0, "step": 19045 }, { "entropy": 0.05380313564091921, "epoch": 4.440610793798811, "grad_norm": 1.6875, "learning_rate": 4.959864905931381e-05, "loss": 0.0628, "mean_token_accuracy": 0.9821853876113892, "num_tokens": 43896475.0, "step": 19050 }, { "entropy": 0.18970977468416095, "epoch": 4.441776430819443, "grad_norm": 1.5859375, "learning_rate": 4.959826528086249e-05, "loss": 0.2268, "mean_token_accuracy": 0.9328379809856415, "num_tokens": 43935237.0, "step": 19055 }, { "entropy": 0.0632144408300519, "epoch": 4.442942067840074, "grad_norm": 0.470703125, "learning_rate": 4.9597881322007845e-05, "loss": 0.0726, "mean_token_accuracy": 0.9753654658794403, "num_tokens": 43953674.0, "step": 19060 }, { "entropy": 0.0784110258333385, "epoch": 4.4441077048607065, "grad_norm": 1.6484375, "learning_rate": 4.959749718275559e-05, "loss": 0.0503, "mean_token_accuracy": 0.9830631732940673, "num_tokens": 43973634.0, "step": 19065 }, { "entropy": 0.1564923981204629, "epoch": 4.445273341881338, "grad_norm": 6.34375, "learning_rate": 4.9597112863111455e-05, "loss": 0.143, "mean_token_accuracy": 0.9643102347850799, "num_tokens": 43984468.0, "step": 19070 }, { "entropy": 0.051792218908667566, "epoch": 4.44643897890197, "grad_norm": 0.92578125, "learning_rate": 4.9596728363081177e-05, "loss": 0.0531, "mean_token_accuracy": 0.9854408144950867, "num_tokens": 43999903.0, "step": 19075 }, { "entropy": 0.08109848536550998, "epoch": 4.447604615922602, "grad_norm": 3.4375, "learning_rate": 4.9596343682670475e-05, "loss": 0.0734, "mean_token_accuracy": 0.9807873725891113, "num_tokens": 44015648.0, "step": 19080 }, { "entropy": 0.05638847313821316, "epoch": 4.448770252943233, "grad_norm": 1.53125, "learning_rate": 4.9595958821885104e-05, "loss": 0.0578, "mean_token_accuracy": 0.9812259316444397, "num_tokens": 44035719.0, "step": 19085 }, { "entropy": 0.09323077034205199, "epoch": 4.449935889963865, "grad_norm": 0.390625, "learning_rate": 4.959557378073079e-05, "loss": 0.0712, "mean_token_accuracy": 0.9786225438117981, "num_tokens": 44058887.0, "step": 19090 }, { "entropy": 0.061530550755560395, "epoch": 4.451101526984497, "grad_norm": 2.6875, "learning_rate": 4.9595188559213276e-05, "loss": 0.0649, "mean_token_accuracy": 0.9821036100387573, "num_tokens": 44079240.0, "step": 19095 }, { "entropy": 0.14186363713815808, "epoch": 4.452267164005129, "grad_norm": 0.458984375, "learning_rate": 4.959480315733831e-05, "loss": 0.2516, "mean_token_accuracy": 0.9597993791103363, "num_tokens": 44110096.0, "step": 19100 }, { "entropy": 0.05938525218516588, "epoch": 4.453432801025761, "grad_norm": 2.46875, "learning_rate": 4.9594417575111634e-05, "loss": 0.0548, "mean_token_accuracy": 0.9803711473941803, "num_tokens": 44143221.0, "step": 19105 }, { "entropy": 0.05355137772858143, "epoch": 4.454598438046393, "grad_norm": 4.1875, "learning_rate": 4.9594031812539006e-05, "loss": 0.0585, "mean_token_accuracy": 0.9849870860576629, "num_tokens": 44163401.0, "step": 19110 }, { "entropy": 0.08820384666323662, "epoch": 4.455764075067024, "grad_norm": 1.328125, "learning_rate": 4.959364586962617e-05, "loss": 0.0717, "mean_token_accuracy": 0.9808596074581146, "num_tokens": 44179516.0, "step": 19115 }, { "entropy": 0.058700266759842634, "epoch": 4.456929712087656, "grad_norm": 2.15625, "learning_rate": 4.959325974637888e-05, "loss": 0.0403, "mean_token_accuracy": 0.9841213166713715, "num_tokens": 44225512.0, "step": 19120 }, { "entropy": 0.07730643711984157, "epoch": 4.458095349108287, "grad_norm": 2.40625, "learning_rate": 4.959287344280291e-05, "loss": 0.0904, "mean_token_accuracy": 0.9748827397823334, "num_tokens": 44237791.0, "step": 19125 }, { "entropy": 0.05126442005857825, "epoch": 4.459260986128919, "grad_norm": 4.6875, "learning_rate": 4.9592486958904e-05, "loss": 0.0536, "mean_token_accuracy": 0.9859380066394806, "num_tokens": 44281173.0, "step": 19130 }, { "entropy": 0.15797716118395327, "epoch": 4.4604266231495515, "grad_norm": 1.109375, "learning_rate": 4.959210029468793e-05, "loss": 0.315, "mean_token_accuracy": 0.9346661984920501, "num_tokens": 44301965.0, "step": 19135 }, { "entropy": 0.06716629974544049, "epoch": 4.461592260170183, "grad_norm": 2.25, "learning_rate": 4.959171345016045e-05, "loss": 0.0949, "mean_token_accuracy": 0.9775259852409363, "num_tokens": 44313084.0, "step": 19140 }, { "entropy": 0.06034324299544096, "epoch": 4.462757897190815, "grad_norm": 2.015625, "learning_rate": 4.9591326425327334e-05, "loss": 0.0657, "mean_token_accuracy": 0.9777418315410614, "num_tokens": 44328344.0, "step": 19145 }, { "entropy": 0.07359747290611267, "epoch": 4.463923534211447, "grad_norm": 2.796875, "learning_rate": 4.959093922019435e-05, "loss": 0.0932, "mean_token_accuracy": 0.9751519203186035, "num_tokens": 44346989.0, "step": 19150 }, { "entropy": 0.088407745026052, "epoch": 4.465089171232078, "grad_norm": 4.75, "learning_rate": 4.9590551834767285e-05, "loss": 0.0916, "mean_token_accuracy": 0.973981785774231, "num_tokens": 44359350.0, "step": 19155 }, { "entropy": 0.05561266434378922, "epoch": 4.46625480825271, "grad_norm": 0.53125, "learning_rate": 4.959016426905191e-05, "loss": 0.0556, "mean_token_accuracy": 0.9856752693653107, "num_tokens": 44384065.0, "step": 19160 }, { "entropy": 0.07562111020088196, "epoch": 4.4674204452733415, "grad_norm": 1.2265625, "learning_rate": 4.958977652305399e-05, "loss": 0.0683, "mean_token_accuracy": 0.9820806801319122, "num_tokens": 44395092.0, "step": 19165 }, { "entropy": 0.05588637031614781, "epoch": 4.468586082293974, "grad_norm": 5.65625, "learning_rate": 4.958938859677932e-05, "loss": 0.066, "mean_token_accuracy": 0.9826294183731079, "num_tokens": 44418945.0, "step": 19170 }, { "entropy": 0.060627684276551005, "epoch": 4.469751719314606, "grad_norm": 1.109375, "learning_rate": 4.95890004902337e-05, "loss": 0.0648, "mean_token_accuracy": 0.9832158327102661, "num_tokens": 44443933.0, "step": 19175 }, { "entropy": 0.05912665966898203, "epoch": 4.470917356335237, "grad_norm": 2.53125, "learning_rate": 4.958861220342288e-05, "loss": 0.075, "mean_token_accuracy": 0.9819551467895508, "num_tokens": 44458854.0, "step": 19180 }, { "entropy": 0.08281111363321543, "epoch": 4.472082993355869, "grad_norm": 0.51171875, "learning_rate": 4.9588223736352674e-05, "loss": 0.0715, "mean_token_accuracy": 0.9780626833438874, "num_tokens": 44472980.0, "step": 19185 }, { "entropy": 0.0640386096201837, "epoch": 4.473248630376501, "grad_norm": 1.9921875, "learning_rate": 4.958783508902887e-05, "loss": 0.0512, "mean_token_accuracy": 0.9844207406044007, "num_tokens": 44500670.0, "step": 19190 }, { "entropy": 0.06259249579161405, "epoch": 4.474414267397132, "grad_norm": 3.078125, "learning_rate": 4.958744626145727e-05, "loss": 0.0545, "mean_token_accuracy": 0.9792797148227692, "num_tokens": 44525316.0, "step": 19195 }, { "entropy": 0.11668573003262281, "epoch": 4.475579904417764, "grad_norm": 1.6484375, "learning_rate": 4.958705725364366e-05, "loss": 0.133, "mean_token_accuracy": 0.9686145961284638, "num_tokens": 44536734.0, "step": 19200 }, { "entropy": 0.059713400527834894, "epoch": 4.476745541438396, "grad_norm": 2.296875, "learning_rate": 4.958666806559385e-05, "loss": 0.087, "mean_token_accuracy": 0.9807201862335205, "num_tokens": 44564065.0, "step": 19205 }, { "entropy": 0.09271037932485342, "epoch": 4.477911178459028, "grad_norm": 0.36328125, "learning_rate": 4.9586278697313636e-05, "loss": 0.1193, "mean_token_accuracy": 0.9705138206481934, "num_tokens": 44605995.0, "step": 19210 }, { "entropy": 0.07151696030050517, "epoch": 4.47907681547966, "grad_norm": 6.6875, "learning_rate": 4.958588914880883e-05, "loss": 0.0852, "mean_token_accuracy": 0.9804966628551484, "num_tokens": 44635863.0, "step": 19215 }, { "entropy": 0.08933782912790775, "epoch": 4.480242452500291, "grad_norm": 2.4375, "learning_rate": 4.958549942008524e-05, "loss": 0.1262, "mean_token_accuracy": 0.9740311861038208, "num_tokens": 44650230.0, "step": 19220 }, { "entropy": 0.09002497904002667, "epoch": 4.481408089520923, "grad_norm": 2.53125, "learning_rate": 4.958510951114868e-05, "loss": 0.0666, "mean_token_accuracy": 0.9774248003959656, "num_tokens": 44668524.0, "step": 19225 }, { "entropy": 0.0647845333442092, "epoch": 4.482573726541555, "grad_norm": 1.34375, "learning_rate": 4.9584719422004945e-05, "loss": 0.0598, "mean_token_accuracy": 0.9832403361797333, "num_tokens": 44694794.0, "step": 19230 }, { "entropy": 0.0654996738769114, "epoch": 4.4837393635621865, "grad_norm": 0.62109375, "learning_rate": 4.958432915265988e-05, "loss": 0.0382, "mean_token_accuracy": 0.9838578164577484, "num_tokens": 44720301.0, "step": 19235 }, { "entropy": 0.0657532449811697, "epoch": 4.484905000582819, "grad_norm": 0.83984375, "learning_rate": 4.958393870311929e-05, "loss": 0.0597, "mean_token_accuracy": 0.980295842885971, "num_tokens": 44734794.0, "step": 19240 }, { "entropy": 0.08420066647231579, "epoch": 4.486070637603451, "grad_norm": 3.671875, "learning_rate": 4.958354807338899e-05, "loss": 0.1104, "mean_token_accuracy": 0.9734029173851013, "num_tokens": 44745211.0, "step": 19245 }, { "entropy": 0.0867409948259592, "epoch": 4.487236274624082, "grad_norm": 2.9375, "learning_rate": 4.958315726347482e-05, "loss": 0.0558, "mean_token_accuracy": 0.9785344064235687, "num_tokens": 44770497.0, "step": 19250 }, { "entropy": 0.08636245997622609, "epoch": 4.488401911644714, "grad_norm": 0.2041015625, "learning_rate": 4.9582766273382604e-05, "loss": 0.1164, "mean_token_accuracy": 0.9738179206848144, "num_tokens": 44783268.0, "step": 19255 }, { "entropy": 0.07308413162827491, "epoch": 4.489567548665345, "grad_norm": 1.9296875, "learning_rate": 4.958237510311817e-05, "loss": 0.0694, "mean_token_accuracy": 0.9750112354755401, "num_tokens": 44804127.0, "step": 19260 }, { "entropy": 0.0728783905506134, "epoch": 4.490733185685977, "grad_norm": 0.5390625, "learning_rate": 4.958198375268734e-05, "loss": 0.0839, "mean_token_accuracy": 0.9792826414108277, "num_tokens": 44823592.0, "step": 19265 }, { "entropy": 0.0877895756624639, "epoch": 4.4918988227066095, "grad_norm": 2.21875, "learning_rate": 4.9581592222095974e-05, "loss": 0.1242, "mean_token_accuracy": 0.9677175223827362, "num_tokens": 44841262.0, "step": 19270 }, { "entropy": 0.08363472241908312, "epoch": 4.493064459727241, "grad_norm": 2.046875, "learning_rate": 4.9581200511349886e-05, "loss": 0.1032, "mean_token_accuracy": 0.97632537484169, "num_tokens": 44852453.0, "step": 19275 }, { "entropy": 0.07496081218123436, "epoch": 4.494230096747873, "grad_norm": 0.921875, "learning_rate": 4.9580808620454934e-05, "loss": 0.0769, "mean_token_accuracy": 0.9786527574062347, "num_tokens": 44865486.0, "step": 19280 }, { "entropy": 0.04869019603356719, "epoch": 4.495395733768505, "grad_norm": 0.4140625, "learning_rate": 4.9580416549416945e-05, "loss": 0.0513, "mean_token_accuracy": 0.9858419060707092, "num_tokens": 44892406.0, "step": 19285 }, { "entropy": 0.08506630435585975, "epoch": 4.496561370789136, "grad_norm": 1.8359375, "learning_rate": 4.958002429824179e-05, "loss": 0.0807, "mean_token_accuracy": 0.9732596397399902, "num_tokens": 44904489.0, "step": 19290 }, { "entropy": 0.05554007384926081, "epoch": 4.497727007809768, "grad_norm": 0.80078125, "learning_rate": 4.957963186693529e-05, "loss": 0.0315, "mean_token_accuracy": 0.9892236590385437, "num_tokens": 44936231.0, "step": 19295 }, { "entropy": 0.07873046463355422, "epoch": 4.498892644830399, "grad_norm": 2.359375, "learning_rate": 4.9579239255503316e-05, "loss": 0.068, "mean_token_accuracy": 0.9715416669845581, "num_tokens": 44961471.0, "step": 19300 }, { "entropy": 0.07022126615047455, "epoch": 4.5000582818510315, "grad_norm": 0.7421875, "learning_rate": 4.957884646395171e-05, "loss": 0.0775, "mean_token_accuracy": 0.9786169409751893, "num_tokens": 44975004.0, "step": 19305 }, { "entropy": 0.07414238564670086, "epoch": 4.501223918871664, "grad_norm": 1.3359375, "learning_rate": 4.9578453492286345e-05, "loss": 0.1074, "mean_token_accuracy": 0.9741037786006927, "num_tokens": 44986316.0, "step": 19310 }, { "entropy": 0.06832624040544033, "epoch": 4.502389555892295, "grad_norm": 0.490234375, "learning_rate": 4.957806034051307e-05, "loss": 0.0715, "mean_token_accuracy": 0.9820472538471222, "num_tokens": 45025979.0, "step": 19315 }, { "entropy": 0.06494354400783778, "epoch": 4.503555192912927, "grad_norm": 0.54296875, "learning_rate": 4.957766700863774e-05, "loss": 0.0804, "mean_token_accuracy": 0.9823660254478455, "num_tokens": 45042042.0, "step": 19320 }, { "entropy": 0.08731096163392067, "epoch": 4.504720829933559, "grad_norm": 4.5, "learning_rate": 4.9577273496666236e-05, "loss": 0.1369, "mean_token_accuracy": 0.9645826160907746, "num_tokens": 45052645.0, "step": 19325 }, { "entropy": 0.06000461746007204, "epoch": 4.50588646695419, "grad_norm": 0.408203125, "learning_rate": 4.957687980460442e-05, "loss": 0.0751, "mean_token_accuracy": 0.9814544498920441, "num_tokens": 45086655.0, "step": 19330 }, { "entropy": 0.056810198538005355, "epoch": 4.507052103974822, "grad_norm": 0.68359375, "learning_rate": 4.957648593245816e-05, "loss": 0.0602, "mean_token_accuracy": 0.9821222722530365, "num_tokens": 45103298.0, "step": 19335 }, { "entropy": 0.10935597391799093, "epoch": 4.508217740995454, "grad_norm": 2.609375, "learning_rate": 4.9576091880233335e-05, "loss": 0.148, "mean_token_accuracy": 0.965090674161911, "num_tokens": 45126200.0, "step": 19340 }, { "entropy": 0.06502020470798016, "epoch": 4.509383378016086, "grad_norm": 0.337890625, "learning_rate": 4.957569764793581e-05, "loss": 0.0791, "mean_token_accuracy": 0.9803296446800231, "num_tokens": 45149472.0, "step": 19345 }, { "entropy": 0.07320923134684562, "epoch": 4.510549015036718, "grad_norm": 2.375, "learning_rate": 4.957530323557147e-05, "loss": 0.0541, "mean_token_accuracy": 0.9775784730911254, "num_tokens": 45184300.0, "step": 19350 }, { "entropy": 0.07345627807080746, "epoch": 4.511714652057349, "grad_norm": 3.171875, "learning_rate": 4.95749086431462e-05, "loss": 0.1122, "mean_token_accuracy": 0.972129487991333, "num_tokens": 45194944.0, "step": 19355 }, { "entropy": 0.08000601641833782, "epoch": 4.512880289077981, "grad_norm": 5.4375, "learning_rate": 4.957451387066588e-05, "loss": 0.106, "mean_token_accuracy": 0.9739298522472382, "num_tokens": 45204799.0, "step": 19360 }, { "entropy": 0.08380871191620827, "epoch": 4.514045926098613, "grad_norm": 3.28125, "learning_rate": 4.957411891813639e-05, "loss": 0.1116, "mean_token_accuracy": 0.9741986215114593, "num_tokens": 45222726.0, "step": 19365 }, { "entropy": 0.06374263260513544, "epoch": 4.515211563119244, "grad_norm": 1.9140625, "learning_rate": 4.9573723785563636e-05, "loss": 0.0625, "mean_token_accuracy": 0.9786804854869843, "num_tokens": 45242007.0, "step": 19370 }, { "entropy": 0.07915383875370026, "epoch": 4.5163772001398765, "grad_norm": 1.2734375, "learning_rate": 4.9573328472953496e-05, "loss": 0.0798, "mean_token_accuracy": 0.9795429646968842, "num_tokens": 45259231.0, "step": 19375 }, { "entropy": 0.07513550948351622, "epoch": 4.517542837160509, "grad_norm": 2.328125, "learning_rate": 4.957293298031187e-05, "loss": 0.0564, "mean_token_accuracy": 0.9819276630878448, "num_tokens": 45276816.0, "step": 19380 }, { "entropy": 0.08630263023078441, "epoch": 4.51870847418114, "grad_norm": 2.0, "learning_rate": 4.957253730764466e-05, "loss": 0.1117, "mean_token_accuracy": 0.9731955528259277, "num_tokens": 45286266.0, "step": 19385 }, { "entropy": 0.07398798689246178, "epoch": 4.519874111201772, "grad_norm": 3.96875, "learning_rate": 4.957214145495775e-05, "loss": 0.0862, "mean_token_accuracy": 0.9772976040840149, "num_tokens": 45304714.0, "step": 19390 }, { "entropy": 0.06771981194615365, "epoch": 4.521039748222403, "grad_norm": 2.0, "learning_rate": 4.957174542225706e-05, "loss": 0.0927, "mean_token_accuracy": 0.9755495727062226, "num_tokens": 45320062.0, "step": 19395 }, { "entropy": 0.08253844156861305, "epoch": 4.522205385243035, "grad_norm": 0.62109375, "learning_rate": 4.957134920954849e-05, "loss": 0.0593, "mean_token_accuracy": 0.9802878797054291, "num_tokens": 45344591.0, "step": 19400 }, { "entropy": 0.06176201160997152, "epoch": 4.523371022263667, "grad_norm": 3.78125, "learning_rate": 4.957095281683794e-05, "loss": 0.0679, "mean_token_accuracy": 0.9821565389633179, "num_tokens": 45362801.0, "step": 19405 }, { "entropy": 0.08072059694677591, "epoch": 4.524536659284299, "grad_norm": 1.171875, "learning_rate": 4.957055624413134e-05, "loss": 0.0863, "mean_token_accuracy": 0.9759260535240173, "num_tokens": 45378106.0, "step": 19410 }, { "entropy": 0.062263531237840654, "epoch": 4.525702296304931, "grad_norm": 3.125, "learning_rate": 4.957015949143458e-05, "loss": 0.0757, "mean_token_accuracy": 0.9812383353710175, "num_tokens": 45403774.0, "step": 19415 }, { "entropy": 0.09101590849459171, "epoch": 4.526867933325562, "grad_norm": 2.09375, "learning_rate": 4.956976255875359e-05, "loss": 0.1214, "mean_token_accuracy": 0.9659562647342682, "num_tokens": 45412925.0, "step": 19420 }, { "entropy": 0.062494827434420586, "epoch": 4.528033570346194, "grad_norm": 1.6875, "learning_rate": 4.956936544609429e-05, "loss": 0.0654, "mean_token_accuracy": 0.9782341003417969, "num_tokens": 45430972.0, "step": 19425 }, { "entropy": 0.06619368139654398, "epoch": 4.529199207366826, "grad_norm": 0.5625, "learning_rate": 4.956896815346259e-05, "loss": 0.0622, "mean_token_accuracy": 0.9839639365673065, "num_tokens": 45459186.0, "step": 19430 }, { "entropy": 0.06992116756737232, "epoch": 4.530364844387458, "grad_norm": 2.515625, "learning_rate": 4.956857068086443e-05, "loss": 0.1111, "mean_token_accuracy": 0.9732448756694794, "num_tokens": 45477375.0, "step": 19435 }, { "entropy": 0.08637938443571329, "epoch": 4.5315304814080895, "grad_norm": 4.0625, "learning_rate": 4.9568173028305724e-05, "loss": 0.0684, "mean_token_accuracy": 0.9800038695335388, "num_tokens": 45502943.0, "step": 19440 }, { "entropy": 0.05814673639833927, "epoch": 4.5326961184287216, "grad_norm": 0.8125, "learning_rate": 4.956777519579241e-05, "loss": 0.0598, "mean_token_accuracy": 0.9838175058364869, "num_tokens": 45518792.0, "step": 19445 }, { "entropy": 0.07371747437864543, "epoch": 4.533861755449353, "grad_norm": 6.6875, "learning_rate": 4.956737718333042e-05, "loss": 0.077, "mean_token_accuracy": 0.98111212849617, "num_tokens": 45540871.0, "step": 19450 }, { "entropy": 0.07126566041260958, "epoch": 4.535027392469985, "grad_norm": 3.390625, "learning_rate": 4.9566978990925686e-05, "loss": 0.0665, "mean_token_accuracy": 0.9774880886077881, "num_tokens": 45563615.0, "step": 19455 }, { "entropy": 0.07104693334549665, "epoch": 4.536193029490617, "grad_norm": 2.125, "learning_rate": 4.9566580618584136e-05, "loss": 0.0682, "mean_token_accuracy": 0.9758515536785126, "num_tokens": 45594215.0, "step": 19460 }, { "entropy": 0.07539936387911439, "epoch": 4.537358666511248, "grad_norm": 0.578125, "learning_rate": 4.956618206631172e-05, "loss": 0.0955, "mean_token_accuracy": 0.9738450884819031, "num_tokens": 45620724.0, "step": 19465 }, { "entropy": 0.091930534504354, "epoch": 4.53852430353188, "grad_norm": 5.78125, "learning_rate": 4.956578333411439e-05, "loss": 0.0978, "mean_token_accuracy": 0.9745730400085449, "num_tokens": 45631916.0, "step": 19470 }, { "entropy": 0.08296403437852859, "epoch": 4.5396899405525115, "grad_norm": 1.875, "learning_rate": 4.956538442199808e-05, "loss": 0.0521, "mean_token_accuracy": 0.9824115693569183, "num_tokens": 45649779.0, "step": 19475 }, { "entropy": 0.07108269976451993, "epoch": 4.540855577573144, "grad_norm": 1.7734375, "learning_rate": 4.956498532996874e-05, "loss": 0.0539, "mean_token_accuracy": 0.9788315117359161, "num_tokens": 45674396.0, "step": 19480 }, { "entropy": 0.11804137602448464, "epoch": 4.542021214593776, "grad_norm": 1.59375, "learning_rate": 4.956458605803232e-05, "loss": 0.138, "mean_token_accuracy": 0.9672506809234619, "num_tokens": 45701462.0, "step": 19485 }, { "entropy": 0.06310545764863491, "epoch": 4.543186851614407, "grad_norm": 1.2265625, "learning_rate": 4.956418660619477e-05, "loss": 0.0675, "mean_token_accuracy": 0.9810591280460358, "num_tokens": 45712067.0, "step": 19490 }, { "entropy": 0.06863165087997913, "epoch": 4.544352488635039, "grad_norm": 0.72265625, "learning_rate": 4.9563786974462064e-05, "loss": 0.0905, "mean_token_accuracy": 0.97963907122612, "num_tokens": 45727858.0, "step": 19495 }, { "entropy": 0.0643789792433381, "epoch": 4.545518125655671, "grad_norm": 1.4453125, "learning_rate": 4.9563387162840144e-05, "loss": 0.0696, "mean_token_accuracy": 0.9800073564052582, "num_tokens": 45740208.0, "step": 19500 }, { "entropy": 0.06495386781170964, "epoch": 4.546683762676302, "grad_norm": 3.109375, "learning_rate": 4.9562987171334976e-05, "loss": 0.0618, "mean_token_accuracy": 0.9782070398330689, "num_tokens": 45764350.0, "step": 19505 }, { "entropy": 0.0650036720559001, "epoch": 4.5478493996969345, "grad_norm": 2.9375, "learning_rate": 4.956258699995253e-05, "loss": 0.0644, "mean_token_accuracy": 0.9787128925323486, "num_tokens": 45786544.0, "step": 19510 }, { "entropy": 0.12152950577437878, "epoch": 4.549015036717567, "grad_norm": 3.84375, "learning_rate": 4.956218664869876e-05, "loss": 0.2017, "mean_token_accuracy": 0.9524850487709046, "num_tokens": 45809166.0, "step": 19515 }, { "entropy": 0.0705368846654892, "epoch": 4.550180673738198, "grad_norm": 2.640625, "learning_rate": 4.956178611757966e-05, "loss": 0.093, "mean_token_accuracy": 0.9743614614009857, "num_tokens": 45820714.0, "step": 19520 }, { "entropy": 0.04608556115999818, "epoch": 4.55134631075883, "grad_norm": 0.78125, "learning_rate": 4.9561385406601167e-05, "loss": 0.0247, "mean_token_accuracy": 0.9896496951580047, "num_tokens": 45845944.0, "step": 19525 }, { "entropy": 0.07118664849549532, "epoch": 4.552511947779461, "grad_norm": 0.65625, "learning_rate": 4.956098451576929e-05, "loss": 0.1066, "mean_token_accuracy": 0.970186847448349, "num_tokens": 45860384.0, "step": 19530 }, { "entropy": 0.035560993710532784, "epoch": 4.553677584800093, "grad_norm": 0.59375, "learning_rate": 4.956058344508999e-05, "loss": 0.0216, "mean_token_accuracy": 0.9867688715457916, "num_tokens": 45896561.0, "step": 19535 }, { "entropy": 0.07229228690266609, "epoch": 4.554843221820725, "grad_norm": 0.8359375, "learning_rate": 4.9560182194569246e-05, "loss": 0.0968, "mean_token_accuracy": 0.9724124073982239, "num_tokens": 45916494.0, "step": 19540 }, { "entropy": 0.05498791430145502, "epoch": 4.5560088588413565, "grad_norm": 1.0703125, "learning_rate": 4.955978076421305e-05, "loss": 0.0327, "mean_token_accuracy": 0.9827622950077057, "num_tokens": 45937511.0, "step": 19545 }, { "entropy": 0.08500229343771934, "epoch": 4.557174495861989, "grad_norm": 3.5, "learning_rate": 4.9559379154027386e-05, "loss": 0.1174, "mean_token_accuracy": 0.9725946724414826, "num_tokens": 45948842.0, "step": 19550 }, { "entropy": 0.06914058709517121, "epoch": 4.55834013288262, "grad_norm": 0.4921875, "learning_rate": 4.955897736401824e-05, "loss": 0.0767, "mean_token_accuracy": 0.9804995000362396, "num_tokens": 45964465.0, "step": 19555 }, { "entropy": 0.048480591550469396, "epoch": 4.559505769903252, "grad_norm": 1.1171875, "learning_rate": 4.9558575394191605e-05, "loss": 0.0468, "mean_token_accuracy": 0.987060832977295, "num_tokens": 45998617.0, "step": 19560 }, { "entropy": 0.067578933108598, "epoch": 4.560671406923884, "grad_norm": 3.296875, "learning_rate": 4.955817324455347e-05, "loss": 0.0586, "mean_token_accuracy": 0.9790487349033355, "num_tokens": 46022443.0, "step": 19565 }, { "entropy": 0.08472463395446539, "epoch": 4.561837043944516, "grad_norm": 4.625, "learning_rate": 4.955777091510984e-05, "loss": 0.1074, "mean_token_accuracy": 0.9704361200332642, "num_tokens": 46035920.0, "step": 19570 }, { "entropy": 0.06885777739807963, "epoch": 4.563002680965147, "grad_norm": 7.25, "learning_rate": 4.95573684058667e-05, "loss": 0.0704, "mean_token_accuracy": 0.9791424155235291, "num_tokens": 46058228.0, "step": 19575 }, { "entropy": 0.09059565905481577, "epoch": 4.5641683179857795, "grad_norm": 7.84375, "learning_rate": 4.955696571683007e-05, "loss": 0.0842, "mean_token_accuracy": 0.9713367521762848, "num_tokens": 46095283.0, "step": 19580 }, { "entropy": 0.06202486571855843, "epoch": 4.565333955006411, "grad_norm": 1.9765625, "learning_rate": 4.9556562848005946e-05, "loss": 0.0633, "mean_token_accuracy": 0.9819358885288239, "num_tokens": 46127335.0, "step": 19585 }, { "entropy": 0.047707998938858506, "epoch": 4.566499592027043, "grad_norm": 5.3125, "learning_rate": 4.9556159799400334e-05, "loss": 0.0512, "mean_token_accuracy": 0.981537401676178, "num_tokens": 46157178.0, "step": 19590 }, { "entropy": 0.06663997173309326, "epoch": 4.567665229047675, "grad_norm": 5.875, "learning_rate": 4.955575657101924e-05, "loss": 0.1047, "mean_token_accuracy": 0.9750019073486328, "num_tokens": 46168435.0, "step": 19595 }, { "entropy": 0.06492017675191164, "epoch": 4.568830866068306, "grad_norm": 1.046875, "learning_rate": 4.955535316286869e-05, "loss": 0.0837, "mean_token_accuracy": 0.9799909651279449, "num_tokens": 46183144.0, "step": 19600 }, { "entropy": 0.06846091412007808, "epoch": 4.569996503088938, "grad_norm": 1.3125, "learning_rate": 4.955494957495469e-05, "loss": 0.0861, "mean_token_accuracy": 0.9765637576580047, "num_tokens": 46208042.0, "step": 19605 }, { "entropy": 0.08362425286322832, "epoch": 4.5711621401095694, "grad_norm": 1.2890625, "learning_rate": 4.955454580728327e-05, "loss": 0.0351, "mean_token_accuracy": 0.9863261938095093, "num_tokens": 46242450.0, "step": 19610 }, { "entropy": 0.0774501122534275, "epoch": 4.5723277771302016, "grad_norm": 1.265625, "learning_rate": 4.955414185986043e-05, "loss": 0.0665, "mean_token_accuracy": 0.9804924190044403, "num_tokens": 46256250.0, "step": 19615 }, { "entropy": 0.08892765715718269, "epoch": 4.573493414150834, "grad_norm": 6.15625, "learning_rate": 4.95537377326922e-05, "loss": 0.0906, "mean_token_accuracy": 0.9789716601371765, "num_tokens": 46265601.0, "step": 19620 }, { "entropy": 0.05659576002508402, "epoch": 4.574659051171465, "grad_norm": 0.5, "learning_rate": 4.955333342578462e-05, "loss": 0.0825, "mean_token_accuracy": 0.9786292016506195, "num_tokens": 46284258.0, "step": 19625 }, { "entropy": 0.05378645788878202, "epoch": 4.575824688192097, "grad_norm": 0.42578125, "learning_rate": 4.955292893914371e-05, "loss": 0.0686, "mean_token_accuracy": 0.9818499684333801, "num_tokens": 46306611.0, "step": 19630 }, { "entropy": 0.08624094808474184, "epoch": 4.576990325212729, "grad_norm": 4.65625, "learning_rate": 4.955252427277549e-05, "loss": 0.0798, "mean_token_accuracy": 0.9779546380043029, "num_tokens": 46325031.0, "step": 19635 }, { "entropy": 0.06809303583577275, "epoch": 4.57815596223336, "grad_norm": 0.88671875, "learning_rate": 4.955211942668602e-05, "loss": 0.0799, "mean_token_accuracy": 0.9798089146614075, "num_tokens": 46343106.0, "step": 19640 }, { "entropy": 0.05596675332635641, "epoch": 4.579321599253992, "grad_norm": 4.0625, "learning_rate": 4.955171440088131e-05, "loss": 0.0629, "mean_token_accuracy": 0.9851307094097137, "num_tokens": 46369488.0, "step": 19645 }, { "entropy": 0.05978215290233493, "epoch": 4.5804872362746245, "grad_norm": 3.0625, "learning_rate": 4.955130919536741e-05, "loss": 0.0598, "mean_token_accuracy": 0.983077323436737, "num_tokens": 46398374.0, "step": 19650 }, { "entropy": 0.07102181990630925, "epoch": 4.581652873295256, "grad_norm": 1.3828125, "learning_rate": 4.955090381015037e-05, "loss": 0.0563, "mean_token_accuracy": 0.9782206952571869, "num_tokens": 46417471.0, "step": 19655 }, { "entropy": 0.08769596517086028, "epoch": 4.582818510315888, "grad_norm": 3.5, "learning_rate": 4.955049824523623e-05, "loss": 0.1106, "mean_token_accuracy": 0.9734553158283233, "num_tokens": 46428623.0, "step": 19660 }, { "entropy": 0.07133472822606564, "epoch": 4.583984147336519, "grad_norm": 5.75, "learning_rate": 4.9550092500631034e-05, "loss": 0.0735, "mean_token_accuracy": 0.9771177768707275, "num_tokens": 46451312.0, "step": 19665 }, { "entropy": 0.05361058982089162, "epoch": 4.585149784357151, "grad_norm": 1.7734375, "learning_rate": 4.9549686576340834e-05, "loss": 0.0456, "mean_token_accuracy": 0.9819006502628327, "num_tokens": 46484205.0, "step": 19670 }, { "entropy": 0.1158721529878676, "epoch": 4.586315421377783, "grad_norm": 1.84375, "learning_rate": 4.954928047237168e-05, "loss": 0.1289, "mean_token_accuracy": 0.9689590871334076, "num_tokens": 46511834.0, "step": 19675 }, { "entropy": 0.06347185205668211, "epoch": 4.5874810583984145, "grad_norm": 1.03125, "learning_rate": 4.9548874188729645e-05, "loss": 0.0572, "mean_token_accuracy": 0.9769329488277435, "num_tokens": 46532517.0, "step": 19680 }, { "entropy": 0.15378685537725686, "epoch": 4.588646695419047, "grad_norm": 1.4453125, "learning_rate": 4.954846772542076e-05, "loss": 0.1941, "mean_token_accuracy": 0.9541963636875153, "num_tokens": 46547581.0, "step": 19685 }, { "entropy": 0.04796793041750789, "epoch": 4.589812332439678, "grad_norm": 1.3671875, "learning_rate": 4.95480610824511e-05, "loss": 0.0561, "mean_token_accuracy": 0.9836558759212494, "num_tokens": 46571459.0, "step": 19690 }, { "entropy": 0.08416934944689274, "epoch": 4.59097796946031, "grad_norm": 0.78515625, "learning_rate": 4.9547654259826734e-05, "loss": 0.0743, "mean_token_accuracy": 0.9806716084480286, "num_tokens": 46591900.0, "step": 19695 }, { "entropy": 0.07612838819622994, "epoch": 4.592143606480942, "grad_norm": 3.828125, "learning_rate": 4.954724725755372e-05, "loss": 0.0916, "mean_token_accuracy": 0.9744833171367645, "num_tokens": 46606849.0, "step": 19700 }, { "entropy": 0.05052490308880806, "epoch": 4.593309243501574, "grad_norm": 1.15625, "learning_rate": 4.954684007563813e-05, "loss": 0.039, "mean_token_accuracy": 0.9903412938117981, "num_tokens": 46626272.0, "step": 19705 }, { "entropy": 0.06455773105844856, "epoch": 4.594474880522205, "grad_norm": 2.53125, "learning_rate": 4.9546432714086035e-05, "loss": 0.0762, "mean_token_accuracy": 0.9837546706199646, "num_tokens": 46652485.0, "step": 19710 }, { "entropy": 0.06334900464862585, "epoch": 4.595640517542837, "grad_norm": 0.326171875, "learning_rate": 4.9546025172903505e-05, "loss": 0.0846, "mean_token_accuracy": 0.9812196731567383, "num_tokens": 46676087.0, "step": 19715 }, { "entropy": 0.04339192071929574, "epoch": 4.596806154563469, "grad_norm": 0.392578125, "learning_rate": 4.954561745209662e-05, "loss": 0.0296, "mean_token_accuracy": 0.986208838224411, "num_tokens": 46728355.0, "step": 19720 }, { "entropy": 0.09071781933307647, "epoch": 4.597971791584101, "grad_norm": 1.3359375, "learning_rate": 4.954520955167147e-05, "loss": 0.1165, "mean_token_accuracy": 0.9729561984539032, "num_tokens": 46737441.0, "step": 19725 }, { "entropy": 0.08092315215617418, "epoch": 4.599137428604733, "grad_norm": 6.46875, "learning_rate": 4.954480147163412e-05, "loss": 0.0779, "mean_token_accuracy": 0.9785025775432586, "num_tokens": 46754296.0, "step": 19730 }, { "entropy": 0.08478829376399517, "epoch": 4.600303065625364, "grad_norm": 0.78515625, "learning_rate": 4.954439321199067e-05, "loss": 0.1018, "mean_token_accuracy": 0.9742565453052521, "num_tokens": 46775708.0, "step": 19735 }, { "entropy": 0.07357489094138145, "epoch": 4.601468702645996, "grad_norm": 1.609375, "learning_rate": 4.95439847727472e-05, "loss": 0.1158, "mean_token_accuracy": 0.9726702034473419, "num_tokens": 46794542.0, "step": 19740 }, { "entropy": 0.07723245853558183, "epoch": 4.602634339666627, "grad_norm": 0.32421875, "learning_rate": 4.95435761539098e-05, "loss": 0.0923, "mean_token_accuracy": 0.97293501496315, "num_tokens": 46811394.0, "step": 19745 }, { "entropy": 0.06583732040598989, "epoch": 4.6037999766872595, "grad_norm": 0.419921875, "learning_rate": 4.954316735548456e-05, "loss": 0.0813, "mean_token_accuracy": 0.979914003610611, "num_tokens": 46829430.0, "step": 19750 }, { "entropy": 0.0757858750410378, "epoch": 4.604965613707892, "grad_norm": 6.0, "learning_rate": 4.954275837747759e-05, "loss": 0.0785, "mean_token_accuracy": 0.9767545580863952, "num_tokens": 46860533.0, "step": 19755 }, { "entropy": 0.05514599541202188, "epoch": 4.606131250728523, "grad_norm": 4.96875, "learning_rate": 4.954234921989498e-05, "loss": 0.083, "mean_token_accuracy": 0.9799856185913086, "num_tokens": 46896620.0, "step": 19760 }, { "entropy": 0.060221548471599815, "epoch": 4.607296887749155, "grad_norm": 1.203125, "learning_rate": 4.954193988274282e-05, "loss": 0.0673, "mean_token_accuracy": 0.9807502627372742, "num_tokens": 46916848.0, "step": 19765 }, { "entropy": 0.06733870087191463, "epoch": 4.608462524769787, "grad_norm": 4.625, "learning_rate": 4.954153036602723e-05, "loss": 0.0848, "mean_token_accuracy": 0.976883488893509, "num_tokens": 46933000.0, "step": 19770 }, { "entropy": 0.07344697508960962, "epoch": 4.609628161790418, "grad_norm": 1.875, "learning_rate": 4.954112066975431e-05, "loss": 0.0892, "mean_token_accuracy": 0.9760464787483215, "num_tokens": 46947494.0, "step": 19775 }, { "entropy": 0.3117006901651621, "epoch": 4.61079379881105, "grad_norm": 1.8203125, "learning_rate": 4.9540710793930174e-05, "loss": 0.4114, "mean_token_accuracy": 0.9219294607639312, "num_tokens": 46976816.0, "step": 19780 }, { "entropy": 0.07747607827186584, "epoch": 4.611959435831682, "grad_norm": 2.796875, "learning_rate": 4.954030073856093e-05, "loss": 0.1049, "mean_token_accuracy": 0.9715212643146515, "num_tokens": 46991104.0, "step": 19785 }, { "entropy": 0.06775341653265059, "epoch": 4.613125072852314, "grad_norm": 2.53125, "learning_rate": 4.9539890503652684e-05, "loss": 0.0557, "mean_token_accuracy": 0.977581012248993, "num_tokens": 47011058.0, "step": 19790 }, { "entropy": 0.06645539095625282, "epoch": 4.614290709872946, "grad_norm": 0.65234375, "learning_rate": 4.953948008921157e-05, "loss": 0.0174, "mean_token_accuracy": 0.9802887737751007, "num_tokens": 47052454.0, "step": 19795 }, { "entropy": 0.05235816705971956, "epoch": 4.615456346893577, "grad_norm": 2.734375, "learning_rate": 4.9539069495243694e-05, "loss": 0.0522, "mean_token_accuracy": 0.9832578837871552, "num_tokens": 47074809.0, "step": 19800 }, { "entropy": 0.04748070854693651, "epoch": 4.616621983914209, "grad_norm": 2.09375, "learning_rate": 4.953865872175519e-05, "loss": 0.0417, "mean_token_accuracy": 0.9883626997470856, "num_tokens": 47107110.0, "step": 19805 }, { "entropy": 0.08402322083711625, "epoch": 4.617787620934841, "grad_norm": 6.53125, "learning_rate": 4.953824776875217e-05, "loss": 0.1434, "mean_token_accuracy": 0.9647410392761231, "num_tokens": 47116318.0, "step": 19810 }, { "entropy": 0.08315610075369477, "epoch": 4.618953257955472, "grad_norm": 0.390625, "learning_rate": 4.953783663624077e-05, "loss": 0.0971, "mean_token_accuracy": 0.9753001630306244, "num_tokens": 47156487.0, "step": 19815 }, { "entropy": 0.06097006350755692, "epoch": 4.6201188949761045, "grad_norm": 4.4375, "learning_rate": 4.953742532422713e-05, "loss": 0.0812, "mean_token_accuracy": 0.9757486581802368, "num_tokens": 47182047.0, "step": 19820 }, { "entropy": 0.06232939455658197, "epoch": 4.621284531996736, "grad_norm": 1.5, "learning_rate": 4.953701383271736e-05, "loss": 0.0785, "mean_token_accuracy": 0.9787400305271149, "num_tokens": 47206737.0, "step": 19825 }, { "entropy": 0.07709433417767286, "epoch": 4.622450169017368, "grad_norm": 2.234375, "learning_rate": 4.953660216171762e-05, "loss": 0.1025, "mean_token_accuracy": 0.9743568658828735, "num_tokens": 47221486.0, "step": 19830 }, { "entropy": 0.07625074442476035, "epoch": 4.623615806038, "grad_norm": 2.9375, "learning_rate": 4.953619031123403e-05, "loss": 0.0868, "mean_token_accuracy": 0.975380665063858, "num_tokens": 47235041.0, "step": 19835 }, { "entropy": 0.06530127339065075, "epoch": 4.624781443058632, "grad_norm": 1.5078125, "learning_rate": 4.953577828127274e-05, "loss": 0.0443, "mean_token_accuracy": 0.982153731584549, "num_tokens": 47258958.0, "step": 19840 }, { "entropy": 0.08391881920397282, "epoch": 4.625947080079263, "grad_norm": 0.73828125, "learning_rate": 4.9535366071839894e-05, "loss": 0.0941, "mean_token_accuracy": 0.9758387804031372, "num_tokens": 47269042.0, "step": 19845 }, { "entropy": 0.08590067960321904, "epoch": 4.627112717099895, "grad_norm": 2.5, "learning_rate": 4.953495368294164e-05, "loss": 0.117, "mean_token_accuracy": 0.9718995273113251, "num_tokens": 47279024.0, "step": 19850 }, { "entropy": 0.08923376137390733, "epoch": 4.628278354120527, "grad_norm": 1.609375, "learning_rate": 4.953454111458413e-05, "loss": 0.0982, "mean_token_accuracy": 0.970050984621048, "num_tokens": 47294540.0, "step": 19855 }, { "entropy": 0.05908918278291821, "epoch": 4.629443991141159, "grad_norm": 0.396484375, "learning_rate": 4.95341283667735e-05, "loss": 0.0755, "mean_token_accuracy": 0.9823239386081696, "num_tokens": 47332055.0, "step": 19860 }, { "entropy": 0.041251880768686536, "epoch": 4.630609628161791, "grad_norm": 2.25, "learning_rate": 4.9533715439515914e-05, "loss": 0.0329, "mean_token_accuracy": 0.9873129487037658, "num_tokens": 47365297.0, "step": 19865 }, { "entropy": 0.0673784639686346, "epoch": 4.631775265182422, "grad_norm": 1.875, "learning_rate": 4.953330233281754e-05, "loss": 0.082, "mean_token_accuracy": 0.9806714713573456, "num_tokens": 47375480.0, "step": 19870 }, { "entropy": 0.06148081440478563, "epoch": 4.632940902203054, "grad_norm": 0.72265625, "learning_rate": 4.953288904668453e-05, "loss": 0.0695, "mean_token_accuracy": 0.982768303155899, "num_tokens": 47401613.0, "step": 19875 }, { "entropy": 0.06011179555207491, "epoch": 4.634106539223685, "grad_norm": 2.078125, "learning_rate": 4.953247558112304e-05, "loss": 0.051, "mean_token_accuracy": 0.9858065843582153, "num_tokens": 47428467.0, "step": 19880 }, { "entropy": 0.08135449420660734, "epoch": 4.635272176244317, "grad_norm": 1.3203125, "learning_rate": 4.953206193613924e-05, "loss": 0.1072, "mean_token_accuracy": 0.9746085166931152, "num_tokens": 47458428.0, "step": 19885 }, { "entropy": 0.039868217147886756, "epoch": 4.6364378132649495, "grad_norm": 0.96875, "learning_rate": 4.95316481117393e-05, "loss": 0.0313, "mean_token_accuracy": 0.9860462665557861, "num_tokens": 47484331.0, "step": 19890 }, { "entropy": 0.09878461733460427, "epoch": 4.637603450285581, "grad_norm": 7.78125, "learning_rate": 4.9531234107929396e-05, "loss": 0.1179, "mean_token_accuracy": 0.9705485343933106, "num_tokens": 47494448.0, "step": 19895 }, { "entropy": 0.043826198158785704, "epoch": 4.638769087306213, "grad_norm": 3.53125, "learning_rate": 4.9530819924715696e-05, "loss": 0.0339, "mean_token_accuracy": 0.9882486462593079, "num_tokens": 47530441.0, "step": 19900 }, { "entropy": 0.07388420086354017, "epoch": 4.639934724326845, "grad_norm": 2.390625, "learning_rate": 4.953040556210437e-05, "loss": 0.081, "mean_token_accuracy": 0.9756742894649506, "num_tokens": 47546029.0, "step": 19905 }, { "entropy": 0.06827450687997043, "epoch": 4.641100361347476, "grad_norm": 0.341796875, "learning_rate": 4.95299910201016e-05, "loss": 0.0461, "mean_token_accuracy": 0.97547847032547, "num_tokens": 47588347.0, "step": 19910 }, { "entropy": 0.05796924643218517, "epoch": 4.642265998368108, "grad_norm": 0.5234375, "learning_rate": 4.952957629871358e-05, "loss": 0.0663, "mean_token_accuracy": 0.9812373161315918, "num_tokens": 47607383.0, "step": 19915 }, { "entropy": 0.050525398924946785, "epoch": 4.64343163538874, "grad_norm": 1.2265625, "learning_rate": 4.952916139794648e-05, "loss": 0.0771, "mean_token_accuracy": 0.9827237486839294, "num_tokens": 47622336.0, "step": 19920 }, { "entropy": 0.07169684544205665, "epoch": 4.644597272409372, "grad_norm": 1.0625, "learning_rate": 4.952874631780648e-05, "loss": 0.0782, "mean_token_accuracy": 0.9775870382785797, "num_tokens": 47645151.0, "step": 19925 }, { "entropy": 0.06724204597994685, "epoch": 4.645762909430004, "grad_norm": 7.375, "learning_rate": 4.952833105829979e-05, "loss": 0.0897, "mean_token_accuracy": 0.9776890814304352, "num_tokens": 47666927.0, "step": 19930 }, { "entropy": 0.06391678284853697, "epoch": 4.646928546450635, "grad_norm": 0.93359375, "learning_rate": 4.952791561943259e-05, "loss": 0.0661, "mean_token_accuracy": 0.984687602519989, "num_tokens": 47684001.0, "step": 19935 }, { "entropy": 0.09616228733211755, "epoch": 4.648094183471267, "grad_norm": 1.375, "learning_rate": 4.952750000121108e-05, "loss": 0.1775, "mean_token_accuracy": 0.9602535605430603, "num_tokens": 47693787.0, "step": 19940 }, { "entropy": 0.07046541702002287, "epoch": 4.649259820491899, "grad_norm": 3.140625, "learning_rate": 4.952708420364145e-05, "loss": 0.1064, "mean_token_accuracy": 0.9723043143749237, "num_tokens": 47707582.0, "step": 19945 }, { "entropy": 0.13581852428615093, "epoch": 4.65042545751253, "grad_norm": 1.78125, "learning_rate": 4.952666822672991e-05, "loss": 0.2336, "mean_token_accuracy": 0.9517211794853211, "num_tokens": 47726482.0, "step": 19950 }, { "entropy": 0.060800166614353654, "epoch": 4.651591094533162, "grad_norm": 0.1865234375, "learning_rate": 4.952625207048265e-05, "loss": 0.0302, "mean_token_accuracy": 0.9864785373210907, "num_tokens": 47768498.0, "step": 19955 }, { "entropy": 0.08810051530599594, "epoch": 4.652756731553794, "grad_norm": 5.375, "learning_rate": 4.952583573490589e-05, "loss": 0.135, "mean_token_accuracy": 0.9718170285224914, "num_tokens": 47787658.0, "step": 19960 }, { "entropy": 0.05881752036511898, "epoch": 4.653922368574426, "grad_norm": 5.6875, "learning_rate": 4.952541922000583e-05, "loss": 0.0759, "mean_token_accuracy": 0.9819856464862824, "num_tokens": 47817274.0, "step": 19965 }, { "entropy": 0.06032843859866262, "epoch": 4.655088005595058, "grad_norm": 4.625, "learning_rate": 4.9525002525788685e-05, "loss": 0.055, "mean_token_accuracy": 0.981800502538681, "num_tokens": 47839225.0, "step": 19970 }, { "entropy": 0.10100402384996414, "epoch": 4.65625364261569, "grad_norm": 1.359375, "learning_rate": 4.952458565226066e-05, "loss": 0.1203, "mean_token_accuracy": 0.9719790756702423, "num_tokens": 47859646.0, "step": 19975 }, { "entropy": 0.05944112166762352, "epoch": 4.657419279636321, "grad_norm": 3.171875, "learning_rate": 4.952416859942798e-05, "loss": 0.0631, "mean_token_accuracy": 0.9825767993927002, "num_tokens": 47875250.0, "step": 19980 }, { "entropy": 0.0668319322168827, "epoch": 4.658584916656953, "grad_norm": 0.66796875, "learning_rate": 4.952375136729686e-05, "loss": 0.0748, "mean_token_accuracy": 0.9800704658031464, "num_tokens": 47897665.0, "step": 19985 }, { "entropy": 0.06467094738036394, "epoch": 4.6597505536775845, "grad_norm": 1.1484375, "learning_rate": 4.952333395587352e-05, "loss": 0.0524, "mean_token_accuracy": 0.9808013200759887, "num_tokens": 47916914.0, "step": 19990 }, { "entropy": 0.07919787243008614, "epoch": 4.660916190698217, "grad_norm": 3.515625, "learning_rate": 4.952291636516419e-05, "loss": 0.0853, "mean_token_accuracy": 0.9790023684501648, "num_tokens": 47926765.0, "step": 19995 }, { "entropy": 0.06067050509154796, "epoch": 4.662081827718849, "grad_norm": 2.640625, "learning_rate": 4.9522498595175093e-05, "loss": 0.0907, "mean_token_accuracy": 0.9813133835792541, "num_tokens": 47939441.0, "step": 20000 }, { "entropy": 0.04763990985229612, "epoch": 4.66324746473948, "grad_norm": 1.546875, "learning_rate": 4.952208064591246e-05, "loss": 0.0322, "mean_token_accuracy": 0.9868841648101807, "num_tokens": 47968277.0, "step": 20005 }, { "entropy": 0.07200488224625587, "epoch": 4.664413101760112, "grad_norm": 2.28125, "learning_rate": 4.952166251738252e-05, "loss": 0.0819, "mean_token_accuracy": 0.9793427646160126, "num_tokens": 47981850.0, "step": 20010 }, { "entropy": 0.07122592218220233, "epoch": 4.665578738780743, "grad_norm": 1.953125, "learning_rate": 4.95212442095915e-05, "loss": 0.0957, "mean_token_accuracy": 0.9757452428340911, "num_tokens": 47994291.0, "step": 20015 }, { "entropy": 0.07337831128388643, "epoch": 4.666744375801375, "grad_norm": 0.7265625, "learning_rate": 4.9520825722545664e-05, "loss": 0.0756, "mean_token_accuracy": 0.9799136698246003, "num_tokens": 48015925.0, "step": 20020 }, { "entropy": 0.06564246322959662, "epoch": 4.6679100128220075, "grad_norm": 3.703125, "learning_rate": 4.9520407056251235e-05, "loss": 0.0842, "mean_token_accuracy": 0.9770355999469758, "num_tokens": 48031165.0, "step": 20025 }, { "entropy": 0.09497220404446124, "epoch": 4.669075649842639, "grad_norm": 7.375, "learning_rate": 4.951998821071445e-05, "loss": 0.1242, "mean_token_accuracy": 0.9703881442546844, "num_tokens": 48047560.0, "step": 20030 }, { "entropy": 0.07135007679462432, "epoch": 4.670241286863271, "grad_norm": 0.5390625, "learning_rate": 4.951956918594157e-05, "loss": 0.0942, "mean_token_accuracy": 0.979320478439331, "num_tokens": 48060649.0, "step": 20035 }, { "entropy": 0.0695052114315331, "epoch": 4.671406923883903, "grad_norm": 0.212890625, "learning_rate": 4.951914998193883e-05, "loss": 0.0675, "mean_token_accuracy": 0.9792129814624786, "num_tokens": 48078205.0, "step": 20040 }, { "entropy": 0.09717717897146941, "epoch": 4.672572560904534, "grad_norm": 6.875, "learning_rate": 4.951873059871248e-05, "loss": 0.1239, "mean_token_accuracy": 0.9706105470657349, "num_tokens": 48093089.0, "step": 20045 }, { "entropy": 0.05780693581327796, "epoch": 4.673738197925166, "grad_norm": 0.99609375, "learning_rate": 4.9518311036268785e-05, "loss": 0.0629, "mean_token_accuracy": 0.9817413330078125, "num_tokens": 48122792.0, "step": 20050 }, { "entropy": 0.08879671581089496, "epoch": 4.674903834945798, "grad_norm": 3.3125, "learning_rate": 4.9517891294613995e-05, "loss": 0.0988, "mean_token_accuracy": 0.9761231005191803, "num_tokens": 48145476.0, "step": 20055 }, { "entropy": 0.054804504942148925, "epoch": 4.6760694719664295, "grad_norm": 0.30078125, "learning_rate": 4.9517471373754374e-05, "loss": 0.0455, "mean_token_accuracy": 0.9853194117546081, "num_tokens": 48179724.0, "step": 20060 }, { "entropy": 0.06867078095674514, "epoch": 4.677235108987062, "grad_norm": 4.59375, "learning_rate": 4.951705127369617e-05, "loss": 0.1, "mean_token_accuracy": 0.9770641207695008, "num_tokens": 48190098.0, "step": 20065 }, { "entropy": 0.0939449267461896, "epoch": 4.678400746007693, "grad_norm": 2.65625, "learning_rate": 4.951663099444567e-05, "loss": 0.102, "mean_token_accuracy": 0.9720357954502106, "num_tokens": 48207731.0, "step": 20070 }, { "entropy": 0.11643363423645496, "epoch": 4.679566383028325, "grad_norm": 3.03125, "learning_rate": 4.951621053600912e-05, "loss": 0.1497, "mean_token_accuracy": 0.9660777747631073, "num_tokens": 48236231.0, "step": 20075 }, { "entropy": 0.058504576422274114, "epoch": 4.680732020048957, "grad_norm": 0.890625, "learning_rate": 4.95157898983928e-05, "loss": 0.0385, "mean_token_accuracy": 0.9740236282348633, "num_tokens": 48270224.0, "step": 20080 }, { "entropy": 0.05779395885765552, "epoch": 4.681897657069588, "grad_norm": 0.30078125, "learning_rate": 4.9515369081602984e-05, "loss": 0.0523, "mean_token_accuracy": 0.9832865595817566, "num_tokens": 48292194.0, "step": 20085 }, { "entropy": 0.07101827822625636, "epoch": 4.68306329409022, "grad_norm": 0.60546875, "learning_rate": 4.951494808564593e-05, "loss": 0.0612, "mean_token_accuracy": 0.9771855533123016, "num_tokens": 48321133.0, "step": 20090 }, { "entropy": 0.07946205716580153, "epoch": 4.684228931110852, "grad_norm": 3.921875, "learning_rate": 4.951452691052794e-05, "loss": 0.1363, "mean_token_accuracy": 0.969723004102707, "num_tokens": 48342279.0, "step": 20095 }, { "entropy": 0.06795702120289207, "epoch": 4.685394568131484, "grad_norm": 1.78125, "learning_rate": 4.951410555625527e-05, "loss": 0.074, "mean_token_accuracy": 0.9802046597003937, "num_tokens": 48357912.0, "step": 20100 }, { "entropy": 0.07630017213523388, "epoch": 4.686560205152116, "grad_norm": 1.3046875, "learning_rate": 4.951368402283423e-05, "loss": 0.0866, "mean_token_accuracy": 0.9766553997993469, "num_tokens": 48371624.0, "step": 20105 }, { "entropy": 0.06927125807851553, "epoch": 4.687725842172748, "grad_norm": 1.8671875, "learning_rate": 4.9513262310271084e-05, "loss": 0.0483, "mean_token_accuracy": 0.9857329249382019, "num_tokens": 48389135.0, "step": 20110 }, { "entropy": 0.054950883705168964, "epoch": 4.688891479193379, "grad_norm": 3.265625, "learning_rate": 4.951284041857213e-05, "loss": 0.0493, "mean_token_accuracy": 0.982935881614685, "num_tokens": 48410987.0, "step": 20115 }, { "entropy": 0.0935236718505621, "epoch": 4.690057116214011, "grad_norm": 7.3125, "learning_rate": 4.9512418347743664e-05, "loss": 0.1299, "mean_token_accuracy": 0.9671928882598877, "num_tokens": 48423657.0, "step": 20120 }, { "entropy": 0.059754582960158585, "epoch": 4.691222753234642, "grad_norm": 2.140625, "learning_rate": 4.9511996097791965e-05, "loss": 0.0525, "mean_token_accuracy": 0.9828765392303467, "num_tokens": 48445677.0, "step": 20125 }, { "entropy": 0.06901389230042695, "epoch": 4.6923883902552745, "grad_norm": 0.78515625, "learning_rate": 4.951157366872334e-05, "loss": 0.0586, "mean_token_accuracy": 0.9815558969974518, "num_tokens": 48478911.0, "step": 20130 }, { "entropy": 0.1916733231395483, "epoch": 4.693554027275907, "grad_norm": 2.203125, "learning_rate": 4.951115106054408e-05, "loss": 0.3589, "mean_token_accuracy": 0.9531080842018127, "num_tokens": 48507415.0, "step": 20135 }, { "entropy": 0.053913332894444464, "epoch": 4.694719664296538, "grad_norm": 1.7734375, "learning_rate": 4.9510728273260496e-05, "loss": 0.0681, "mean_token_accuracy": 0.9818377435207367, "num_tokens": 48526630.0, "step": 20140 }, { "entropy": 0.05371881201863289, "epoch": 4.69588530131717, "grad_norm": 0.388671875, "learning_rate": 4.951030530687889e-05, "loss": 0.0482, "mean_token_accuracy": 0.9868282377719879, "num_tokens": 48546033.0, "step": 20145 }, { "entropy": 0.06295309253036976, "epoch": 4.697050938337801, "grad_norm": 5.59375, "learning_rate": 4.9509882161405566e-05, "loss": 0.0881, "mean_token_accuracy": 0.9804576992988586, "num_tokens": 48558724.0, "step": 20150 }, { "entropy": 0.061114361975342035, "epoch": 4.698216575358433, "grad_norm": 2.140625, "learning_rate": 4.950945883684683e-05, "loss": 0.0577, "mean_token_accuracy": 0.9806105256080627, "num_tokens": 48586711.0, "step": 20155 }, { "entropy": 0.0676982618868351, "epoch": 4.699382212379065, "grad_norm": 1.015625, "learning_rate": 4.9509035333209005e-05, "loss": 0.0669, "mean_token_accuracy": 0.9803455650806427, "num_tokens": 48599778.0, "step": 20160 }, { "entropy": 0.057492456119507554, "epoch": 4.700547849399697, "grad_norm": 0.82421875, "learning_rate": 4.95086116504984e-05, "loss": 0.0555, "mean_token_accuracy": 0.9830128908157348, "num_tokens": 48618248.0, "step": 20165 }, { "entropy": 0.08579396829009056, "epoch": 4.701713486420329, "grad_norm": 4.28125, "learning_rate": 4.950818778872133e-05, "loss": 0.0526, "mean_token_accuracy": 0.9830702006816864, "num_tokens": 48645960.0, "step": 20170 }, { "entropy": 0.08069932255893945, "epoch": 4.702879123440961, "grad_norm": 6.09375, "learning_rate": 4.950776374788412e-05, "loss": 0.0841, "mean_token_accuracy": 0.9744628429412842, "num_tokens": 48664011.0, "step": 20175 }, { "entropy": 0.04855256769806147, "epoch": 4.704044760461592, "grad_norm": 0.55078125, "learning_rate": 4.9507339527993095e-05, "loss": 0.0426, "mean_token_accuracy": 0.982448011636734, "num_tokens": 48684718.0, "step": 20180 }, { "entropy": 0.09109218874946237, "epoch": 4.705210397482224, "grad_norm": 7.34375, "learning_rate": 4.9506915129054576e-05, "loss": 0.17, "mean_token_accuracy": 0.9593676149845123, "num_tokens": 48702995.0, "step": 20185 }, { "entropy": 0.07370678829029202, "epoch": 4.706376034502856, "grad_norm": 4.09375, "learning_rate": 4.950649055107489e-05, "loss": 0.0964, "mean_token_accuracy": 0.9742493212223053, "num_tokens": 48719147.0, "step": 20190 }, { "entropy": 0.08082408849149943, "epoch": 4.7075416715234875, "grad_norm": 3.125, "learning_rate": 4.9506065794060375e-05, "loss": 0.0533, "mean_token_accuracy": 0.9826491594314575, "num_tokens": 48741412.0, "step": 20195 }, { "entropy": 0.05809831535443664, "epoch": 4.70870730854412, "grad_norm": 0.4296875, "learning_rate": 4.950564085801736e-05, "loss": 0.0644, "mean_token_accuracy": 0.978083735704422, "num_tokens": 48779819.0, "step": 20200 }, { "entropy": 0.06636287728324533, "epoch": 4.709872945564751, "grad_norm": 5.59375, "learning_rate": 4.9505215742952184e-05, "loss": 0.0837, "mean_token_accuracy": 0.9786609828472137, "num_tokens": 48806468.0, "step": 20205 }, { "entropy": 0.05410275729373097, "epoch": 4.711038582585383, "grad_norm": 1.375, "learning_rate": 4.950479044887118e-05, "loss": 0.0475, "mean_token_accuracy": 0.9868753671646118, "num_tokens": 48822312.0, "step": 20210 }, { "entropy": 0.2814826850313693, "epoch": 4.712204219606015, "grad_norm": 0.37109375, "learning_rate": 4.9504364975780696e-05, "loss": 0.5149, "mean_token_accuracy": 0.9334210813045501, "num_tokens": 48866242.0, "step": 20215 }, { "entropy": 0.059157754946500066, "epoch": 4.713369856626646, "grad_norm": 2.4375, "learning_rate": 4.9503939323687073e-05, "loss": 0.0591, "mean_token_accuracy": 0.9809535264968872, "num_tokens": 48892857.0, "step": 20220 }, { "entropy": 0.0832524687051773, "epoch": 4.714535493647278, "grad_norm": 0.59375, "learning_rate": 4.9503513492596666e-05, "loss": 0.1003, "mean_token_accuracy": 0.9713628351688385, "num_tokens": 48904319.0, "step": 20225 }, { "entropy": 0.05422810595482588, "epoch": 4.7157011306679095, "grad_norm": 0.44140625, "learning_rate": 4.9503087482515817e-05, "loss": 0.0475, "mean_token_accuracy": 0.9819365739822388, "num_tokens": 48936321.0, "step": 20230 }, { "entropy": 0.07771264165639877, "epoch": 4.716866767688542, "grad_norm": 4.46875, "learning_rate": 4.9502661293450874e-05, "loss": 0.1248, "mean_token_accuracy": 0.9761758804321289, "num_tokens": 48945222.0, "step": 20235 }, { "entropy": 0.07859923299401998, "epoch": 4.718032404709174, "grad_norm": 4.375, "learning_rate": 4.95022349254082e-05, "loss": 0.0766, "mean_token_accuracy": 0.9763111054897309, "num_tokens": 48958764.0, "step": 20240 }, { "entropy": 0.06803965084254741, "epoch": 4.719198041729806, "grad_norm": 4.03125, "learning_rate": 4.950180837839416e-05, "loss": 0.0812, "mean_token_accuracy": 0.9793858110904694, "num_tokens": 48975984.0, "step": 20245 }, { "entropy": 0.048299599625170234, "epoch": 4.720363678750437, "grad_norm": 0.2314453125, "learning_rate": 4.950138165241509e-05, "loss": 0.0246, "mean_token_accuracy": 0.9865788578987121, "num_tokens": 49004389.0, "step": 20250 }, { "entropy": 0.061194864101707935, "epoch": 4.721529315771069, "grad_norm": 0.3125, "learning_rate": 4.950095474747738e-05, "loss": 0.0622, "mean_token_accuracy": 0.9849075496196746, "num_tokens": 49029294.0, "step": 20255 }, { "entropy": 0.08798045851290226, "epoch": 4.7226949527917, "grad_norm": 4.0625, "learning_rate": 4.9500527663587375e-05, "loss": 0.1386, "mean_token_accuracy": 0.9668548405170441, "num_tokens": 49038976.0, "step": 20260 }, { "entropy": 0.05687556634657085, "epoch": 4.7238605898123325, "grad_norm": 2.25, "learning_rate": 4.950010040075146e-05, "loss": 0.0464, "mean_token_accuracy": 0.9875887632369995, "num_tokens": 49060951.0, "step": 20265 }, { "entropy": 0.06489252224564553, "epoch": 4.725026226832965, "grad_norm": 3.71875, "learning_rate": 4.9499672958975995e-05, "loss": 0.0709, "mean_token_accuracy": 0.979289311170578, "num_tokens": 49074482.0, "step": 20270 }, { "entropy": 0.06801861561834813, "epoch": 4.726191863853596, "grad_norm": 5.40625, "learning_rate": 4.949924533826736e-05, "loss": 0.1091, "mean_token_accuracy": 0.9740906596183777, "num_tokens": 49087207.0, "step": 20275 }, { "entropy": 0.10587560161948203, "epoch": 4.727357500874228, "grad_norm": 1.1015625, "learning_rate": 4.949881753863193e-05, "loss": 0.1184, "mean_token_accuracy": 0.9726275205612183, "num_tokens": 49112264.0, "step": 20280 }, { "entropy": 0.10358101055026055, "epoch": 4.728523137894859, "grad_norm": 2.0, "learning_rate": 4.9498389560076084e-05, "loss": 0.159, "mean_token_accuracy": 0.9651169836521148, "num_tokens": 49132458.0, "step": 20285 }, { "entropy": 0.07152350768446922, "epoch": 4.729688774915491, "grad_norm": 2.671875, "learning_rate": 4.9497961402606204e-05, "loss": 0.0984, "mean_token_accuracy": 0.9762173771858216, "num_tokens": 49144811.0, "step": 20290 }, { "entropy": 0.10446326434612274, "epoch": 4.730854411936123, "grad_norm": 0.6171875, "learning_rate": 4.949753306622867e-05, "loss": 0.0961, "mean_token_accuracy": 0.9760794460773468, "num_tokens": 49158186.0, "step": 20295 }, { "entropy": 0.0776900127530098, "epoch": 4.7320200489567545, "grad_norm": 2.75, "learning_rate": 4.949710455094987e-05, "loss": 0.1157, "mean_token_accuracy": 0.9754393517971038, "num_tokens": 49167881.0, "step": 20300 }, { "entropy": 0.04937164485454559, "epoch": 4.733185685977387, "grad_norm": 0.73828125, "learning_rate": 4.94966758567762e-05, "loss": 0.0273, "mean_token_accuracy": 0.9871737420558929, "num_tokens": 49189512.0, "step": 20305 }, { "entropy": 0.06925871726125479, "epoch": 4.734351322998019, "grad_norm": 1.1328125, "learning_rate": 4.949624698371405e-05, "loss": 0.0801, "mean_token_accuracy": 0.9783924877643585, "num_tokens": 49204116.0, "step": 20310 }, { "entropy": 0.09072446711361408, "epoch": 4.73551696001865, "grad_norm": 1.1875, "learning_rate": 4.949581793176981e-05, "loss": 0.0981, "mean_token_accuracy": 0.969156926870346, "num_tokens": 49215356.0, "step": 20315 }, { "entropy": 0.07261850405484438, "epoch": 4.736682597039282, "grad_norm": 3.296875, "learning_rate": 4.9495388700949885e-05, "loss": 0.0857, "mean_token_accuracy": 0.9742332696914673, "num_tokens": 49232369.0, "step": 20320 }, { "entropy": 0.16985770147293805, "epoch": 4.737848234059914, "grad_norm": 0.76171875, "learning_rate": 4.9494959291260676e-05, "loss": 0.2871, "mean_token_accuracy": 0.932180005311966, "num_tokens": 49258087.0, "step": 20325 }, { "entropy": 0.07392840981483459, "epoch": 4.739013871080545, "grad_norm": 1.171875, "learning_rate": 4.949452970270857e-05, "loss": 0.1103, "mean_token_accuracy": 0.9722067236900329, "num_tokens": 49267408.0, "step": 20330 }, { "entropy": 0.060131664481014015, "epoch": 4.7401795081011775, "grad_norm": 2.875, "learning_rate": 4.9494099935299996e-05, "loss": 0.0497, "mean_token_accuracy": 0.9814348340034484, "num_tokens": 49292748.0, "step": 20335 }, { "entropy": 0.12636512629687785, "epoch": 4.741345145121809, "grad_norm": 4.3125, "learning_rate": 4.9493669989041353e-05, "loss": 0.165, "mean_token_accuracy": 0.9643090963363647, "num_tokens": 49305455.0, "step": 20340 }, { "entropy": 0.05997271528467536, "epoch": 4.742510782142441, "grad_norm": 1.1953125, "learning_rate": 4.949323986393905e-05, "loss": 0.0747, "mean_token_accuracy": 0.9823045790195465, "num_tokens": 49321546.0, "step": 20345 }, { "entropy": 0.05400382243096828, "epoch": 4.743676419163073, "grad_norm": 2.28125, "learning_rate": 4.9492809559999495e-05, "loss": 0.0464, "mean_token_accuracy": 0.9806383967399597, "num_tokens": 49370181.0, "step": 20350 }, { "entropy": 0.06353923268616199, "epoch": 4.744842056183704, "grad_norm": 0.318359375, "learning_rate": 4.949237907722912e-05, "loss": 0.0788, "mean_token_accuracy": 0.977184635400772, "num_tokens": 49392823.0, "step": 20355 }, { "entropy": 0.09873983627185226, "epoch": 4.746007693204336, "grad_norm": 0.88671875, "learning_rate": 4.9491948415634335e-05, "loss": 0.0789, "mean_token_accuracy": 0.974556165933609, "num_tokens": 49408279.0, "step": 20360 }, { "entropy": 0.09930532816797495, "epoch": 4.7471733302249675, "grad_norm": 0.42578125, "learning_rate": 4.949151757522155e-05, "loss": 0.123, "mean_token_accuracy": 0.9696231663227082, "num_tokens": 49429537.0, "step": 20365 }, { "entropy": 0.0745665643364191, "epoch": 4.7483389672456, "grad_norm": 1.296875, "learning_rate": 4.949108655599721e-05, "loss": 0.054, "mean_token_accuracy": 0.9802153050899506, "num_tokens": 49455342.0, "step": 20370 }, { "entropy": 0.0746733145788312, "epoch": 4.749504604266232, "grad_norm": 0.9765625, "learning_rate": 4.949065535796774e-05, "loss": 0.0982, "mean_token_accuracy": 0.9758549451828002, "num_tokens": 49476536.0, "step": 20375 }, { "entropy": 0.05807813517749309, "epoch": 4.750670241286863, "grad_norm": 2.828125, "learning_rate": 4.949022398113955e-05, "loss": 0.0743, "mean_token_accuracy": 0.9838265895843505, "num_tokens": 49494710.0, "step": 20380 }, { "entropy": 0.07137834914028644, "epoch": 4.751835878307495, "grad_norm": 2.46875, "learning_rate": 4.9489792425519097e-05, "loss": 0.1034, "mean_token_accuracy": 0.9739278852939606, "num_tokens": 49505900.0, "step": 20385 }, { "entropy": 0.06287243217229843, "epoch": 4.753001515328127, "grad_norm": 3.421875, "learning_rate": 4.948936069111281e-05, "loss": 0.0759, "mean_token_accuracy": 0.9815413355827332, "num_tokens": 49519016.0, "step": 20390 }, { "entropy": 0.05989686464890838, "epoch": 4.754167152348758, "grad_norm": 0.56640625, "learning_rate": 4.948892877792711e-05, "loss": 0.056, "mean_token_accuracy": 0.9855867683887481, "num_tokens": 49537367.0, "step": 20395 }, { "entropy": 0.09508889261633158, "epoch": 4.75533278936939, "grad_norm": 1.765625, "learning_rate": 4.9488496685968455e-05, "loss": 0.1299, "mean_token_accuracy": 0.9706778466701508, "num_tokens": 49561307.0, "step": 20400 }, { "entropy": 0.07447026818990707, "epoch": 4.7564984263900225, "grad_norm": 4.53125, "learning_rate": 4.948806441524328e-05, "loss": 0.0922, "mean_token_accuracy": 0.9730834066867828, "num_tokens": 49578999.0, "step": 20405 }, { "entropy": 0.08297924473881721, "epoch": 4.757664063410654, "grad_norm": 2.0, "learning_rate": 4.9487631965758034e-05, "loss": 0.109, "mean_token_accuracy": 0.9753465235233307, "num_tokens": 49601086.0, "step": 20410 }, { "entropy": 0.07900863215327263, "epoch": 4.758829700431286, "grad_norm": 4.6875, "learning_rate": 4.948719933751916e-05, "loss": 0.0983, "mean_token_accuracy": 0.974524050951004, "num_tokens": 49611278.0, "step": 20415 }, { "entropy": 0.08208957873284817, "epoch": 4.759995337451917, "grad_norm": 2.09375, "learning_rate": 4.9486766530533126e-05, "loss": 0.0715, "mean_token_accuracy": 0.9752253651618957, "num_tokens": 49631030.0, "step": 20420 }, { "entropy": 0.057911211252212526, "epoch": 4.761160974472549, "grad_norm": 0.490234375, "learning_rate": 4.9486333544806365e-05, "loss": 0.0377, "mean_token_accuracy": 0.983147144317627, "num_tokens": 49649629.0, "step": 20425 }, { "entropy": 0.08789655100554228, "epoch": 4.762326611493181, "grad_norm": 2.328125, "learning_rate": 4.948590038034535e-05, "loss": 0.102, "mean_token_accuracy": 0.9745977938175201, "num_tokens": 49660399.0, "step": 20430 }, { "entropy": 0.08837173730134965, "epoch": 4.7634922485138125, "grad_norm": 1.5, "learning_rate": 4.9485467037156525e-05, "loss": 0.1278, "mean_token_accuracy": 0.9708876311779022, "num_tokens": 49669243.0, "step": 20435 }, { "entropy": 0.05032849675044417, "epoch": 4.764657885534445, "grad_norm": 0.546875, "learning_rate": 4.948503351524636e-05, "loss": 0.043, "mean_token_accuracy": 0.9866231560707093, "num_tokens": 49700542.0, "step": 20440 }, { "entropy": 0.08655855841934681, "epoch": 4.765823522555077, "grad_norm": 1.1015625, "learning_rate": 4.948459981462132e-05, "loss": 0.0835, "mean_token_accuracy": 0.9743392825126648, "num_tokens": 49718787.0, "step": 20445 }, { "entropy": 0.06486239023506642, "epoch": 4.766989159575708, "grad_norm": 1.3515625, "learning_rate": 4.948416593528787e-05, "loss": 0.082, "mean_token_accuracy": 0.9798474609851837, "num_tokens": 49734457.0, "step": 20450 }, { "entropy": 0.0723851252347231, "epoch": 4.76815479659634, "grad_norm": 2.078125, "learning_rate": 4.948373187725249e-05, "loss": 0.0684, "mean_token_accuracy": 0.9775489926338196, "num_tokens": 49753956.0, "step": 20455 }, { "entropy": 0.08052669223397971, "epoch": 4.769320433616972, "grad_norm": 1.28125, "learning_rate": 4.948329764052163e-05, "loss": 0.0704, "mean_token_accuracy": 0.9762837886810303, "num_tokens": 49773734.0, "step": 20460 }, { "entropy": 0.08943664506077767, "epoch": 4.770486070637603, "grad_norm": 0.5859375, "learning_rate": 4.948286322510179e-05, "loss": 0.083, "mean_token_accuracy": 0.973814857006073, "num_tokens": 49787380.0, "step": 20465 }, { "entropy": 0.07294655237346888, "epoch": 4.771651707658235, "grad_norm": 2.859375, "learning_rate": 4.9482428630999426e-05, "loss": 0.0817, "mean_token_accuracy": 0.979939204454422, "num_tokens": 49805226.0, "step": 20470 }, { "entropy": 0.09040804943069816, "epoch": 4.772817344678867, "grad_norm": 3.59375, "learning_rate": 4.948199385822103e-05, "loss": 0.1118, "mean_token_accuracy": 0.9751499712467193, "num_tokens": 49817884.0, "step": 20475 }, { "entropy": 0.05256835455074906, "epoch": 4.773982981699499, "grad_norm": 0.50390625, "learning_rate": 4.948155890677309e-05, "loss": 0.0451, "mean_token_accuracy": 0.9860180079936981, "num_tokens": 49850987.0, "step": 20480 }, { "entropy": 0.10846010688692331, "epoch": 4.775148618720131, "grad_norm": 3.1875, "learning_rate": 4.948112377666208e-05, "loss": 0.1126, "mean_token_accuracy": 0.9711306512355804, "num_tokens": 49867014.0, "step": 20485 }, { "entropy": 0.05567761724814772, "epoch": 4.776314255740762, "grad_norm": 0.53515625, "learning_rate": 4.94806884678945e-05, "loss": 0.0604, "mean_token_accuracy": 0.9824544131755829, "num_tokens": 49899711.0, "step": 20490 }, { "entropy": 0.07059495002031327, "epoch": 4.777479892761394, "grad_norm": 1.90625, "learning_rate": 4.9480252980476825e-05, "loss": 0.0977, "mean_token_accuracy": 0.9781529903411865, "num_tokens": 49932999.0, "step": 20495 }, { "entropy": 0.0721295103430748, "epoch": 4.778645529782025, "grad_norm": 2.875, "learning_rate": 4.947981731441557e-05, "loss": 0.0914, "mean_token_accuracy": 0.9754843413829803, "num_tokens": 49942908.0, "step": 20500 }, { "entropy": 0.11595875062048436, "epoch": 4.7798111668026575, "grad_norm": 6.34375, "learning_rate": 4.947938146971721e-05, "loss": 0.2174, "mean_token_accuracy": 0.9521984398365021, "num_tokens": 49961131.0, "step": 20505 }, { "entropy": 0.06587783191353083, "epoch": 4.78097680382329, "grad_norm": 4.78125, "learning_rate": 4.9478945446388255e-05, "loss": 0.0809, "mean_token_accuracy": 0.9789382636547088, "num_tokens": 49974497.0, "step": 20510 }, { "entropy": 0.059248296450823544, "epoch": 4.782142440843921, "grad_norm": 2.875, "learning_rate": 4.947850924443521e-05, "loss": 0.0772, "mean_token_accuracy": 0.9822989463806152, "num_tokens": 49993498.0, "step": 20515 }, { "entropy": 0.09005047529935836, "epoch": 4.783308077864553, "grad_norm": 2.46875, "learning_rate": 4.9478072863864576e-05, "loss": 0.1128, "mean_token_accuracy": 0.9668475985527039, "num_tokens": 50002253.0, "step": 20520 }, { "entropy": 0.09272575750946999, "epoch": 4.784473714885185, "grad_norm": 2.859375, "learning_rate": 4.947763630468286e-05, "loss": 0.1423, "mean_token_accuracy": 0.9673610985279083, "num_tokens": 50010592.0, "step": 20525 }, { "entropy": 0.07717711478471756, "epoch": 4.785639351905816, "grad_norm": 0.96484375, "learning_rate": 4.947719956689657e-05, "loss": 0.0943, "mean_token_accuracy": 0.9727614760398865, "num_tokens": 50041275.0, "step": 20530 }, { "entropy": 0.12004315797239543, "epoch": 4.786804988926448, "grad_norm": 1.296875, "learning_rate": 4.947676265051222e-05, "loss": 0.1233, "mean_token_accuracy": 0.9705375373363495, "num_tokens": 50068454.0, "step": 20535 }, { "entropy": 0.07685479950159788, "epoch": 4.7879706259470804, "grad_norm": 5.90625, "learning_rate": 4.947632555553633e-05, "loss": 0.0732, "mean_token_accuracy": 0.9742613315582276, "num_tokens": 50090905.0, "step": 20540 }, { "entropy": 0.08232810776680707, "epoch": 4.789136262967712, "grad_norm": 12.875, "learning_rate": 4.9475888281975404e-05, "loss": 0.1358, "mean_token_accuracy": 0.9695055544376373, "num_tokens": 50104591.0, "step": 20545 }, { "entropy": 0.09904810027219355, "epoch": 4.790301899988344, "grad_norm": 0.2236328125, "learning_rate": 4.947545082983597e-05, "loss": 0.1956, "mean_token_accuracy": 0.9490303874015809, "num_tokens": 50151001.0, "step": 20550 }, { "entropy": 0.05315098352730274, "epoch": 4.791467537008975, "grad_norm": 2.640625, "learning_rate": 4.9475013199124556e-05, "loss": 0.0487, "mean_token_accuracy": 0.9870794475078583, "num_tokens": 50176426.0, "step": 20555 }, { "entropy": 0.08232418056577444, "epoch": 4.792633174029607, "grad_norm": 3.15625, "learning_rate": 4.947457538984769e-05, "loss": 0.0556, "mean_token_accuracy": 0.9761513948440552, "num_tokens": 50205312.0, "step": 20560 }, { "entropy": 0.06682766638696194, "epoch": 4.793798811050239, "grad_norm": 4.125, "learning_rate": 4.947413740201189e-05, "loss": 0.0881, "mean_token_accuracy": 0.9708187401294708, "num_tokens": 50234640.0, "step": 20565 }, { "entropy": 0.08927183225750923, "epoch": 4.79496444807087, "grad_norm": 1.1875, "learning_rate": 4.9473699235623686e-05, "loss": 0.1019, "mean_token_accuracy": 0.9742757558822632, "num_tokens": 50256381.0, "step": 20570 }, { "entropy": 0.07163733001798392, "epoch": 4.7961300850915025, "grad_norm": 0.74609375, "learning_rate": 4.947326089068962e-05, "loss": 0.0766, "mean_token_accuracy": 0.9792376101016999, "num_tokens": 50272402.0, "step": 20575 }, { "entropy": 0.0808179883286357, "epoch": 4.797295722112135, "grad_norm": 3.796875, "learning_rate": 4.9472822367216225e-05, "loss": 0.0901, "mean_token_accuracy": 0.9793714106082916, "num_tokens": 50296722.0, "step": 20580 }, { "entropy": 0.10730183199048042, "epoch": 4.798461359132766, "grad_norm": 4.71875, "learning_rate": 4.9472383665210045e-05, "loss": 0.1544, "mean_token_accuracy": 0.9621846616268158, "num_tokens": 50315927.0, "step": 20585 }, { "entropy": 0.056595608685165645, "epoch": 4.799626996153398, "grad_norm": 2.265625, "learning_rate": 4.947194478467761e-05, "loss": 0.0463, "mean_token_accuracy": 0.9833418965339661, "num_tokens": 50339939.0, "step": 20590 }, { "entropy": 0.07467662245035171, "epoch": 4.80079263317403, "grad_norm": 2.96875, "learning_rate": 4.9471505725625475e-05, "loss": 0.0945, "mean_token_accuracy": 0.9767918467521668, "num_tokens": 50349137.0, "step": 20595 }, { "entropy": 0.09103550501167774, "epoch": 4.801958270194661, "grad_norm": 1.234375, "learning_rate": 4.947106648806018e-05, "loss": 0.1074, "mean_token_accuracy": 0.9768331527709961, "num_tokens": 50371173.0, "step": 20600 }, { "entropy": 0.07896652333438396, "epoch": 4.803123907215293, "grad_norm": 0.9296875, "learning_rate": 4.947062707198829e-05, "loss": 0.1035, "mean_token_accuracy": 0.9723316550254821, "num_tokens": 50397114.0, "step": 20605 }, { "entropy": 0.09035487687215209, "epoch": 4.804289544235925, "grad_norm": 2.734375, "learning_rate": 4.947018747741633e-05, "loss": 0.1021, "mean_token_accuracy": 0.9717907607555389, "num_tokens": 50426955.0, "step": 20610 }, { "entropy": 0.08067240752279758, "epoch": 4.805455181256557, "grad_norm": 1.5078125, "learning_rate": 4.946974770435088e-05, "loss": 0.0624, "mean_token_accuracy": 0.974557638168335, "num_tokens": 50446613.0, "step": 20615 }, { "entropy": 0.08309468347579241, "epoch": 4.806620818277189, "grad_norm": 1.03125, "learning_rate": 4.946930775279848e-05, "loss": 0.1288, "mean_token_accuracy": 0.9713143050670624, "num_tokens": 50457957.0, "step": 20620 }, { "entropy": 0.06522488570772111, "epoch": 4.80778645529782, "grad_norm": 1.3984375, "learning_rate": 4.946886762276571e-05, "loss": 0.0816, "mean_token_accuracy": 0.976836746931076, "num_tokens": 50479228.0, "step": 20625 }, { "entropy": 0.06677600918337703, "epoch": 4.808952092318452, "grad_norm": 2.03125, "learning_rate": 4.946842731425911e-05, "loss": 0.0917, "mean_token_accuracy": 0.9727399408817291, "num_tokens": 50503005.0, "step": 20630 }, { "entropy": 0.08386764228343964, "epoch": 4.810117729339083, "grad_norm": 5.59375, "learning_rate": 4.9467986827285265e-05, "loss": 0.1182, "mean_token_accuracy": 0.9724303543567657, "num_tokens": 50514092.0, "step": 20635 }, { "entropy": 0.05374229941517115, "epoch": 4.811283366359715, "grad_norm": 0.44140625, "learning_rate": 4.946754616185073e-05, "loss": 0.0518, "mean_token_accuracy": 0.9885022401809692, "num_tokens": 50554374.0, "step": 20640 }, { "entropy": 0.06895175650715828, "epoch": 4.8124490033803475, "grad_norm": 1.25, "learning_rate": 4.946710531796209e-05, "loss": 0.1335, "mean_token_accuracy": 0.9721293866634368, "num_tokens": 50565314.0, "step": 20645 }, { "entropy": 0.07366420496255159, "epoch": 4.813614640400979, "grad_norm": 1.59375, "learning_rate": 4.94666642956259e-05, "loss": 0.0786, "mean_token_accuracy": 0.9837552666664123, "num_tokens": 50577374.0, "step": 20650 }, { "entropy": 0.09850245006382466, "epoch": 4.814780277421611, "grad_norm": 1.359375, "learning_rate": 4.946622309484875e-05, "loss": 0.0968, "mean_token_accuracy": 0.9730374455451966, "num_tokens": 50587536.0, "step": 20655 }, { "entropy": 0.08902466334402562, "epoch": 4.815945914442243, "grad_norm": 4.34375, "learning_rate": 4.9465781715637224e-05, "loss": 0.1062, "mean_token_accuracy": 0.9702731013298035, "num_tokens": 50597004.0, "step": 20660 }, { "entropy": 0.06709796143695712, "epoch": 4.817111551462874, "grad_norm": 1.75, "learning_rate": 4.946534015799789e-05, "loss": 0.0396, "mean_token_accuracy": 0.983069121837616, "num_tokens": 50624783.0, "step": 20665 }, { "entropy": 0.06530861468054354, "epoch": 4.818277188483506, "grad_norm": 1.625, "learning_rate": 4.946489842193733e-05, "loss": 0.0506, "mean_token_accuracy": 0.9851066172122955, "num_tokens": 50652000.0, "step": 20670 }, { "entropy": 0.08375565335154533, "epoch": 4.819442825504138, "grad_norm": 2.578125, "learning_rate": 4.946445650746214e-05, "loss": 0.1049, "mean_token_accuracy": 0.9721269071102142, "num_tokens": 50661199.0, "step": 20675 }, { "entropy": 0.0636015109717846, "epoch": 4.82060846252477, "grad_norm": 1.453125, "learning_rate": 4.946401441457891e-05, "loss": 0.1162, "mean_token_accuracy": 0.9724109470844269, "num_tokens": 50671843.0, "step": 20680 }, { "entropy": 0.07089730594307184, "epoch": 4.821774099545402, "grad_norm": 2.890625, "learning_rate": 4.946357214329423e-05, "loss": 0.0719, "mean_token_accuracy": 0.9801780402660369, "num_tokens": 50686035.0, "step": 20685 }, { "entropy": 0.06434464119374753, "epoch": 4.822939736566033, "grad_norm": 5.28125, "learning_rate": 4.9463129693614705e-05, "loss": 0.0852, "mean_token_accuracy": 0.9769482016563416, "num_tokens": 50707631.0, "step": 20690 }, { "entropy": 0.07662344705313444, "epoch": 4.824105373586665, "grad_norm": 4.15625, "learning_rate": 4.946268706554691e-05, "loss": 0.1046, "mean_token_accuracy": 0.9777352511882782, "num_tokens": 50724343.0, "step": 20695 }, { "entropy": 0.06888558939099312, "epoch": 4.825271010607297, "grad_norm": 2.34375, "learning_rate": 4.946224425909746e-05, "loss": 0.0598, "mean_token_accuracy": 0.9821949362754822, "num_tokens": 50745360.0, "step": 20700 }, { "entropy": 0.06832386320456862, "epoch": 4.826436647627928, "grad_norm": 0.349609375, "learning_rate": 4.946180127427296e-05, "loss": 0.0733, "mean_token_accuracy": 0.980415141582489, "num_tokens": 50771964.0, "step": 20705 }, { "entropy": 0.0478357121348381, "epoch": 4.8276022846485604, "grad_norm": 1.1875, "learning_rate": 4.9461358111080015e-05, "loss": 0.0616, "mean_token_accuracy": 0.9851399242877961, "num_tokens": 50815419.0, "step": 20710 }, { "entropy": 0.07840246148407459, "epoch": 4.8287679216691926, "grad_norm": 0.255859375, "learning_rate": 4.946091476952522e-05, "loss": 0.0939, "mean_token_accuracy": 0.9758677661418915, "num_tokens": 50839360.0, "step": 20715 }, { "entropy": 0.12673660293221473, "epoch": 4.829933558689824, "grad_norm": 4.1875, "learning_rate": 4.94604712496152e-05, "loss": 0.1517, "mean_token_accuracy": 0.963898116350174, "num_tokens": 50855945.0, "step": 20720 }, { "entropy": 0.04601201200857759, "epoch": 4.831099195710456, "grad_norm": 0.41796875, "learning_rate": 4.9460027551356566e-05, "loss": 0.0403, "mean_token_accuracy": 0.9830052733421326, "num_tokens": 50883110.0, "step": 20725 }, { "entropy": 0.06605788934975862, "epoch": 4.832264832731088, "grad_norm": 0.333984375, "learning_rate": 4.945958367475593e-05, "loss": 0.0815, "mean_token_accuracy": 0.9799921751022339, "num_tokens": 50905024.0, "step": 20730 }, { "entropy": 0.06048436416313052, "epoch": 4.833430469751719, "grad_norm": 1.875, "learning_rate": 4.945913961981992e-05, "loss": 0.0724, "mean_token_accuracy": 0.9796711564064026, "num_tokens": 50920962.0, "step": 20735 }, { "entropy": 0.08121685888618231, "epoch": 4.834596106772351, "grad_norm": 1.03125, "learning_rate": 4.9458695386555135e-05, "loss": 0.0782, "mean_token_accuracy": 0.9797780096530915, "num_tokens": 50937892.0, "step": 20740 }, { "entropy": 0.0714781729504466, "epoch": 4.8357617437929825, "grad_norm": 3.59375, "learning_rate": 4.945825097496823e-05, "loss": 0.0972, "mean_token_accuracy": 0.975777804851532, "num_tokens": 50950534.0, "step": 20745 }, { "entropy": 0.050937081407755616, "epoch": 4.836927380813615, "grad_norm": 0.5859375, "learning_rate": 4.945780638506581e-05, "loss": 0.0491, "mean_token_accuracy": 0.9856567502021789, "num_tokens": 50983069.0, "step": 20750 }, { "entropy": 0.05653784843161702, "epoch": 4.838093017834247, "grad_norm": 2.125, "learning_rate": 4.945736161685451e-05, "loss": 0.0419, "mean_token_accuracy": 0.9845134735107421, "num_tokens": 51009508.0, "step": 20755 }, { "entropy": 0.06965322587639093, "epoch": 4.839258654854878, "grad_norm": 1.203125, "learning_rate": 4.945691667034096e-05, "loss": 0.1131, "mean_token_accuracy": 0.968089246749878, "num_tokens": 51029970.0, "step": 20760 }, { "entropy": 0.04931186120957136, "epoch": 4.84042429187551, "grad_norm": 0.57421875, "learning_rate": 4.9456471545531804e-05, "loss": 0.0373, "mean_token_accuracy": 0.9879566371440888, "num_tokens": 51050398.0, "step": 20765 }, { "entropy": 0.06215945780277252, "epoch": 4.841589928896141, "grad_norm": 1.2578125, "learning_rate": 4.945602624243368e-05, "loss": 0.0886, "mean_token_accuracy": 0.9776066720485688, "num_tokens": 51063243.0, "step": 20770 }, { "entropy": 0.08728420175611973, "epoch": 4.842755565916773, "grad_norm": 4.15625, "learning_rate": 4.945558076105321e-05, "loss": 0.0949, "mean_token_accuracy": 0.9721355080604553, "num_tokens": 51073906.0, "step": 20775 }, { "entropy": 0.06952288392931223, "epoch": 4.8439212029374055, "grad_norm": 0.333984375, "learning_rate": 4.9455135101397056e-05, "loss": 0.0831, "mean_token_accuracy": 0.9805684685707092, "num_tokens": 51100643.0, "step": 20780 }, { "entropy": 0.04499766388908029, "epoch": 4.845086839958037, "grad_norm": 0.625, "learning_rate": 4.945468926347185e-05, "loss": 0.0246, "mean_token_accuracy": 0.9909033119678498, "num_tokens": 51136680.0, "step": 20785 }, { "entropy": 0.04354067463427782, "epoch": 4.846252476978669, "grad_norm": 0.3359375, "learning_rate": 4.945424324728425e-05, "loss": 0.0345, "mean_token_accuracy": 0.9890659034252167, "num_tokens": 51159069.0, "step": 20790 }, { "entropy": 0.08422119580209256, "epoch": 4.847418113999301, "grad_norm": 1.0625, "learning_rate": 4.945379705284091e-05, "loss": 0.0883, "mean_token_accuracy": 0.9788319587707519, "num_tokens": 51170620.0, "step": 20795 }, { "entropy": 0.07503297440707683, "epoch": 4.848583751019932, "grad_norm": 4.46875, "learning_rate": 4.945335068014847e-05, "loss": 0.1176, "mean_token_accuracy": 0.9673508882522583, "num_tokens": 51180690.0, "step": 20800 }, { "entropy": 0.09586451314389706, "epoch": 4.849749388040564, "grad_norm": 1.3828125, "learning_rate": 4.945290412921359e-05, "loss": 0.1468, "mean_token_accuracy": 0.9734158456325531, "num_tokens": 51213063.0, "step": 20805 }, { "entropy": 0.06957128308713437, "epoch": 4.850915025061196, "grad_norm": 2.09375, "learning_rate": 4.945245740004293e-05, "loss": 0.0854, "mean_token_accuracy": 0.9737328290939331, "num_tokens": 51233378.0, "step": 20810 }, { "entropy": 0.04465410923585296, "epoch": 4.8520806620818275, "grad_norm": 0.66796875, "learning_rate": 4.9452010492643165e-05, "loss": 0.0513, "mean_token_accuracy": 0.9855859398841857, "num_tokens": 51257918.0, "step": 20815 }, { "entropy": 0.06682367827743292, "epoch": 4.85324629910246, "grad_norm": 1.671875, "learning_rate": 4.945156340702093e-05, "loss": 0.0605, "mean_token_accuracy": 0.9807843506336212, "num_tokens": 51278448.0, "step": 20820 }, { "entropy": 0.09898395230993629, "epoch": 4.854411936123091, "grad_norm": 0.32421875, "learning_rate": 4.945111614318292e-05, "loss": 0.0838, "mean_token_accuracy": 0.9787111759185791, "num_tokens": 51297431.0, "step": 20825 }, { "entropy": 0.07027254728600382, "epoch": 4.855577573143723, "grad_norm": 3.46875, "learning_rate": 4.945066870113579e-05, "loss": 0.0683, "mean_token_accuracy": 0.9835409700870514, "num_tokens": 51320978.0, "step": 20830 }, { "entropy": 0.0634617348201573, "epoch": 4.856743210164355, "grad_norm": 1.0234375, "learning_rate": 4.945022108088621e-05, "loss": 0.0545, "mean_token_accuracy": 0.9819484114646911, "num_tokens": 51348363.0, "step": 20835 }, { "entropy": 0.06744776256382465, "epoch": 4.857908847184986, "grad_norm": 4.59375, "learning_rate": 4.944977328244086e-05, "loss": 0.0571, "mean_token_accuracy": 0.9829931437969208, "num_tokens": 51369185.0, "step": 20840 }, { "entropy": 0.05282375058159232, "epoch": 4.859074484205618, "grad_norm": 0.466796875, "learning_rate": 4.944932530580643e-05, "loss": 0.0313, "mean_token_accuracy": 0.98763307929039, "num_tokens": 51400901.0, "step": 20845 }, { "entropy": 0.06535831596702338, "epoch": 4.8602401212262505, "grad_norm": 0.81640625, "learning_rate": 4.9448877150989576e-05, "loss": 0.0602, "mean_token_accuracy": 0.9842148721218109, "num_tokens": 51422228.0, "step": 20850 }, { "entropy": 0.062378438748419286, "epoch": 4.861405758246882, "grad_norm": 0.85546875, "learning_rate": 4.944842881799699e-05, "loss": 0.0462, "mean_token_accuracy": 0.9865955471992492, "num_tokens": 51439003.0, "step": 20855 }, { "entropy": 0.07847417313605547, "epoch": 4.862571395267514, "grad_norm": 4.875, "learning_rate": 4.9447980306835364e-05, "loss": 0.0648, "mean_token_accuracy": 0.9656040012836457, "num_tokens": 51466913.0, "step": 20860 }, { "entropy": 0.07198535539209842, "epoch": 4.863737032288146, "grad_norm": 3.609375, "learning_rate": 4.944753161751138e-05, "loss": 0.0434, "mean_token_accuracy": 0.9815111875534057, "num_tokens": 51496979.0, "step": 20865 }, { "entropy": 0.16263010874390602, "epoch": 4.864902669308777, "grad_norm": 0.55078125, "learning_rate": 4.9447082750031724e-05, "loss": 0.3119, "mean_token_accuracy": 0.9357062309980393, "num_tokens": 51539514.0, "step": 20870 }, { "entropy": 0.08168750740587712, "epoch": 4.866068306329409, "grad_norm": 3.96875, "learning_rate": 4.94466337044031e-05, "loss": 0.1601, "mean_token_accuracy": 0.9629373013973236, "num_tokens": 51548829.0, "step": 20875 }, { "entropy": 0.06368317557498812, "epoch": 4.8672339433500404, "grad_norm": 0.9453125, "learning_rate": 4.9446184480632194e-05, "loss": 0.0763, "mean_token_accuracy": 0.9779473185539246, "num_tokens": 51573706.0, "step": 20880 }, { "entropy": 0.07042047139257193, "epoch": 4.8683995803706726, "grad_norm": 1.7890625, "learning_rate": 4.944573507872572e-05, "loss": 0.0907, "mean_token_accuracy": 0.9742065310478211, "num_tokens": 51586256.0, "step": 20885 }, { "entropy": 0.07600197829306125, "epoch": 4.869565217391305, "grad_norm": 0.890625, "learning_rate": 4.944528549869036e-05, "loss": 0.1134, "mean_token_accuracy": 0.9720952391624451, "num_tokens": 51597109.0, "step": 20890 }, { "entropy": 0.10058935396373272, "epoch": 4.870730854411936, "grad_norm": 2.546875, "learning_rate": 4.9444835740532825e-05, "loss": 0.1386, "mean_token_accuracy": 0.968506908416748, "num_tokens": 51604876.0, "step": 20895 }, { "entropy": 0.04899873323738575, "epoch": 4.871896491432568, "grad_norm": 0.44921875, "learning_rate": 4.9444385804259826e-05, "loss": 0.0424, "mean_token_accuracy": 0.9887868106365204, "num_tokens": 51637614.0, "step": 20900 }, { "entropy": 0.09722777623683214, "epoch": 4.873062128453199, "grad_norm": 0.30859375, "learning_rate": 4.944393568987807e-05, "loss": 0.0429, "mean_token_accuracy": 0.9840863108634949, "num_tokens": 51671476.0, "step": 20905 }, { "entropy": 0.07712397929280997, "epoch": 4.874227765473831, "grad_norm": 0.5703125, "learning_rate": 4.944348539739427e-05, "loss": 0.0866, "mean_token_accuracy": 0.9797841608524323, "num_tokens": 51684155.0, "step": 20910 }, { "entropy": 0.0837567220441997, "epoch": 4.875393402494463, "grad_norm": 4.625, "learning_rate": 4.944303492681514e-05, "loss": 0.0708, "mean_token_accuracy": 0.9777788579463959, "num_tokens": 51707822.0, "step": 20915 }, { "entropy": 0.1396466862410307, "epoch": 4.876559039515095, "grad_norm": 2.0625, "learning_rate": 4.944258427814739e-05, "loss": 0.2607, "mean_token_accuracy": 0.9586719572544098, "num_tokens": 51734253.0, "step": 20920 }, { "entropy": 0.0678241515532136, "epoch": 4.877724676535727, "grad_norm": 8.9375, "learning_rate": 4.9442133451397745e-05, "loss": 0.0741, "mean_token_accuracy": 0.9762378096580505, "num_tokens": 51760598.0, "step": 20925 }, { "entropy": 0.07093913480639458, "epoch": 4.878890313556359, "grad_norm": 1.4375, "learning_rate": 4.944168244657293e-05, "loss": 0.0865, "mean_token_accuracy": 0.9774516761302948, "num_tokens": 51772195.0, "step": 20930 }, { "entropy": 0.05839260285720229, "epoch": 4.88005595057699, "grad_norm": 2.96875, "learning_rate": 4.9441231263679664e-05, "loss": 0.0666, "mean_token_accuracy": 0.9820129334926605, "num_tokens": 51795228.0, "step": 20935 }, { "entropy": 0.07823163829743862, "epoch": 4.881221587597622, "grad_norm": 7.625, "learning_rate": 4.9440779902724684e-05, "loss": 0.1215, "mean_token_accuracy": 0.9703441679477691, "num_tokens": 51815122.0, "step": 20940 }, { "entropy": 0.05640836558304727, "epoch": 4.882387224618254, "grad_norm": 0.953125, "learning_rate": 4.944032836371472e-05, "loss": 0.0493, "mean_token_accuracy": 0.9837121963500977, "num_tokens": 51840298.0, "step": 20945 }, { "entropy": 0.06406810432672501, "epoch": 4.8835528616388855, "grad_norm": 3.0, "learning_rate": 4.94398766466565e-05, "loss": 0.104, "mean_token_accuracy": 0.9748159289360047, "num_tokens": 51850125.0, "step": 20950 }, { "entropy": 0.06545968130230903, "epoch": 4.884718498659518, "grad_norm": 2.140625, "learning_rate": 4.9439424751556754e-05, "loss": 0.0919, "mean_token_accuracy": 0.9777773797512055, "num_tokens": 51861624.0, "step": 20955 }, { "entropy": 0.059191903471946715, "epoch": 4.885884135680149, "grad_norm": 0.40625, "learning_rate": 4.943897267842223e-05, "loss": 0.0471, "mean_token_accuracy": 0.9841900050640107, "num_tokens": 51885430.0, "step": 20960 }, { "entropy": 0.08465072922408581, "epoch": 4.887049772700781, "grad_norm": 1.1875, "learning_rate": 4.9438520427259674e-05, "loss": 0.0909, "mean_token_accuracy": 0.974427980184555, "num_tokens": 51920408.0, "step": 20965 }, { "entropy": 0.07176385279744864, "epoch": 4.888215409721413, "grad_norm": 1.5703125, "learning_rate": 4.943806799807581e-05, "loss": 0.0573, "mean_token_accuracy": 0.9810054063796997, "num_tokens": 51948620.0, "step": 20970 }, { "entropy": 0.058643194288015364, "epoch": 4.889381046742044, "grad_norm": 1.7734375, "learning_rate": 4.9437615390877404e-05, "loss": 0.054, "mean_token_accuracy": 0.9795984923839569, "num_tokens": 51971871.0, "step": 20975 }, { "entropy": 0.07647083997726441, "epoch": 4.890546683762676, "grad_norm": 0.90625, "learning_rate": 4.943716260567119e-05, "loss": 0.0915, "mean_token_accuracy": 0.9773392856121064, "num_tokens": 51982972.0, "step": 20980 }, { "entropy": 0.06268258430063725, "epoch": 4.891712320783308, "grad_norm": 1.8984375, "learning_rate": 4.943670964246394e-05, "loss": 0.0938, "mean_token_accuracy": 0.9779832363128662, "num_tokens": 51994332.0, "step": 20985 }, { "entropy": 0.05670452257618308, "epoch": 4.89287795780394, "grad_norm": 0.45703125, "learning_rate": 4.943625650126238e-05, "loss": 0.055, "mean_token_accuracy": 0.9824697732925415, "num_tokens": 52012043.0, "step": 20990 }, { "entropy": 0.09376739151775837, "epoch": 4.894043594824572, "grad_norm": 0.6328125, "learning_rate": 4.943580318207329e-05, "loss": 0.102, "mean_token_accuracy": 0.972071361541748, "num_tokens": 52034314.0, "step": 20995 }, { "entropy": 0.06972389779984951, "epoch": 4.895209231845204, "grad_norm": 1.3203125, "learning_rate": 4.943534968490342e-05, "loss": 0.1101, "mean_token_accuracy": 0.9718354284763336, "num_tokens": 52045098.0, "step": 21000 }, { "entropy": 0.06242169812321663, "epoch": 4.896374868865835, "grad_norm": 4.09375, "learning_rate": 4.943489600975953e-05, "loss": 0.0828, "mean_token_accuracy": 0.9797560632228851, "num_tokens": 52068145.0, "step": 21005 }, { "entropy": 0.06226806389167905, "epoch": 4.897540505886467, "grad_norm": 6.9375, "learning_rate": 4.94344421566484e-05, "loss": 0.0678, "mean_token_accuracy": 0.9791005849838257, "num_tokens": 52097424.0, "step": 21010 }, { "entropy": 0.08740999130532146, "epoch": 4.898706142907098, "grad_norm": 7.84375, "learning_rate": 4.9433988125576783e-05, "loss": 0.1157, "mean_token_accuracy": 0.9717265069484711, "num_tokens": 52138832.0, "step": 21015 }, { "entropy": 0.061702551785856485, "epoch": 4.8998717799277305, "grad_norm": 3.109375, "learning_rate": 4.943353391655145e-05, "loss": 0.0524, "mean_token_accuracy": 0.9776601493358612, "num_tokens": 52164329.0, "step": 21020 }, { "entropy": 0.06630434533581138, "epoch": 4.901037416948363, "grad_norm": 0.1982421875, "learning_rate": 4.943307952957918e-05, "loss": 0.0794, "mean_token_accuracy": 0.9799892425537109, "num_tokens": 52179810.0, "step": 21025 }, { "entropy": 0.08890583254396915, "epoch": 4.902203053968994, "grad_norm": 1.0546875, "learning_rate": 4.943262496466675e-05, "loss": 0.1093, "mean_token_accuracy": 0.9698486030101776, "num_tokens": 52194861.0, "step": 21030 }, { "entropy": 0.0646402221173048, "epoch": 4.903368690989626, "grad_norm": 3.703125, "learning_rate": 4.943217022182093e-05, "loss": 0.0697, "mean_token_accuracy": 0.9834041774272919, "num_tokens": 52207180.0, "step": 21035 }, { "entropy": 0.06280097691342235, "epoch": 4.904534328010257, "grad_norm": 2.453125, "learning_rate": 4.9431715301048504e-05, "loss": 0.0596, "mean_token_accuracy": 0.9817287743091583, "num_tokens": 52232325.0, "step": 21040 }, { "entropy": 0.07492497358471155, "epoch": 4.905699965030889, "grad_norm": 0.97265625, "learning_rate": 4.943126020235626e-05, "loss": 0.0795, "mean_token_accuracy": 0.974094831943512, "num_tokens": 52244692.0, "step": 21045 }, { "entropy": 0.07054590750485659, "epoch": 4.906865602051521, "grad_norm": 0.75, "learning_rate": 4.943080492575097e-05, "loss": 0.1018, "mean_token_accuracy": 0.9751786947250366, "num_tokens": 52257785.0, "step": 21050 }, { "entropy": 0.06916769053786993, "epoch": 4.9080312390721526, "grad_norm": 1.625, "learning_rate": 4.943034947123944e-05, "loss": 0.0913, "mean_token_accuracy": 0.9798057615756989, "num_tokens": 52269539.0, "step": 21055 }, { "entropy": 0.04995636437088251, "epoch": 4.909196876092785, "grad_norm": 0.58203125, "learning_rate": 4.9429893838828464e-05, "loss": 0.0399, "mean_token_accuracy": 0.981740140914917, "num_tokens": 52319714.0, "step": 21060 }, { "entropy": 0.08283067289739847, "epoch": 4.910362513113417, "grad_norm": 2.59375, "learning_rate": 4.942943802852482e-05, "loss": 0.0966, "mean_token_accuracy": 0.9736198604106903, "num_tokens": 52332795.0, "step": 21065 }, { "entropy": 0.14895931966602802, "epoch": 4.911528150134048, "grad_norm": 1.953125, "learning_rate": 4.942898204033532e-05, "loss": 0.2523, "mean_token_accuracy": 0.9456245481967926, "num_tokens": 52351199.0, "step": 21070 }, { "entropy": 0.06533634569495916, "epoch": 4.91269378715468, "grad_norm": 2.59375, "learning_rate": 4.942852587426674e-05, "loss": 0.0497, "mean_token_accuracy": 0.9837863922119141, "num_tokens": 52376245.0, "step": 21075 }, { "entropy": 0.10989860650151968, "epoch": 4.913859424175312, "grad_norm": 0.984375, "learning_rate": 4.94280695303259e-05, "loss": 0.1259, "mean_token_accuracy": 0.9637296438217163, "num_tokens": 52436860.0, "step": 21080 }, { "entropy": 0.08045022562146187, "epoch": 4.915025061195943, "grad_norm": 3.03125, "learning_rate": 4.9427613008519616e-05, "loss": 0.0736, "mean_token_accuracy": 0.9755086541175843, "num_tokens": 52452073.0, "step": 21085 }, { "entropy": 0.0864185519516468, "epoch": 4.9161906982165755, "grad_norm": 2.53125, "learning_rate": 4.9427156308854674e-05, "loss": 0.1074, "mean_token_accuracy": 0.9733963072299957, "num_tokens": 52461200.0, "step": 21090 }, { "entropy": 0.09636174971237779, "epoch": 4.917356335237207, "grad_norm": 1.265625, "learning_rate": 4.942669943133789e-05, "loss": 0.1164, "mean_token_accuracy": 0.9673805236816406, "num_tokens": 52477657.0, "step": 21095 }, { "entropy": 0.08892916599288583, "epoch": 4.918521972257839, "grad_norm": 0.88671875, "learning_rate": 4.9426242375976085e-05, "loss": 0.1248, "mean_token_accuracy": 0.9690983772277832, "num_tokens": 52495784.0, "step": 21100 }, { "entropy": 0.07809329750016332, "epoch": 4.919687609278471, "grad_norm": 0.3125, "learning_rate": 4.942578514277606e-05, "loss": 0.1099, "mean_token_accuracy": 0.9738022804260253, "num_tokens": 52512573.0, "step": 21105 }, { "entropy": 0.04925914811901748, "epoch": 4.920853246299102, "grad_norm": 4.71875, "learning_rate": 4.9425327731744644e-05, "loss": 0.0413, "mean_token_accuracy": 0.9875961005687713, "num_tokens": 52550932.0, "step": 21110 }, { "entropy": 0.08235758668743073, "epoch": 4.922018883319734, "grad_norm": 0.41015625, "learning_rate": 4.942487014288866e-05, "loss": 0.0886, "mean_token_accuracy": 0.9814740777015686, "num_tokens": 52587741.0, "step": 21115 }, { "entropy": 0.06665782891213894, "epoch": 4.923184520340366, "grad_norm": 1.578125, "learning_rate": 4.9424412376214915e-05, "loss": 0.0696, "mean_token_accuracy": 0.977788758277893, "num_tokens": 52600393.0, "step": 21120 }, { "entropy": 0.11595227997750043, "epoch": 4.924350157360998, "grad_norm": 1.8359375, "learning_rate": 4.9423954431730254e-05, "loss": 0.0991, "mean_token_accuracy": 0.9738423705101014, "num_tokens": 52610343.0, "step": 21125 }, { "entropy": 0.08311420790851116, "epoch": 4.92551579438163, "grad_norm": 1.3671875, "learning_rate": 4.94234963094415e-05, "loss": 0.0999, "mean_token_accuracy": 0.9729685187339783, "num_tokens": 52628675.0, "step": 21130 }, { "entropy": 0.06154662910848856, "epoch": 4.926681431402262, "grad_norm": 0.45703125, "learning_rate": 4.9423038009355474e-05, "loss": 0.0643, "mean_token_accuracy": 0.9839459717273712, "num_tokens": 52648559.0, "step": 21135 }, { "entropy": 0.06962651396170258, "epoch": 4.927847068422893, "grad_norm": 1.0859375, "learning_rate": 4.942257953147903e-05, "loss": 0.0914, "mean_token_accuracy": 0.9786814033985138, "num_tokens": 52670788.0, "step": 21140 }, { "entropy": 0.06154291275888681, "epoch": 4.929012705443525, "grad_norm": 1.828125, "learning_rate": 4.9422120875818986e-05, "loss": 0.0548, "mean_token_accuracy": 0.9831968486309052, "num_tokens": 52695532.0, "step": 21145 }, { "entropy": 0.0829665838740766, "epoch": 4.930178342464156, "grad_norm": 1.25, "learning_rate": 4.9421662042382185e-05, "loss": 0.1213, "mean_token_accuracy": 0.9721298694610596, "num_tokens": 52713871.0, "step": 21150 }, { "entropy": 0.10626598820090294, "epoch": 4.931343979484788, "grad_norm": 6.25, "learning_rate": 4.942120303117547e-05, "loss": 0.1169, "mean_token_accuracy": 0.9673919379711151, "num_tokens": 52741429.0, "step": 21155 }, { "entropy": 0.07647773548960686, "epoch": 4.9325096165054205, "grad_norm": 1.6875, "learning_rate": 4.94207438422057e-05, "loss": 0.0673, "mean_token_accuracy": 0.9810298800468444, "num_tokens": 52755321.0, "step": 21160 }, { "entropy": 0.05966353416442871, "epoch": 4.933675253526052, "grad_norm": 2.390625, "learning_rate": 4.9420284475479697e-05, "loss": 0.065, "mean_token_accuracy": 0.9799255132675171, "num_tokens": 52780757.0, "step": 21165 }, { "entropy": 0.07079061567783355, "epoch": 4.934840890546684, "grad_norm": 3.4375, "learning_rate": 4.941982493100433e-05, "loss": 0.0882, "mean_token_accuracy": 0.9788235485553741, "num_tokens": 52791577.0, "step": 21170 }, { "entropy": 0.07353595411404967, "epoch": 4.936006527567315, "grad_norm": 1.125, "learning_rate": 4.941936520878644e-05, "loss": 0.0684, "mean_token_accuracy": 0.982946801185608, "num_tokens": 52821422.0, "step": 21175 }, { "entropy": 0.05350867630913854, "epoch": 4.937172164587947, "grad_norm": 3.0, "learning_rate": 4.9418905308832884e-05, "loss": 0.0615, "mean_token_accuracy": 0.9831634402275086, "num_tokens": 52841396.0, "step": 21180 }, { "entropy": 0.07068223357200623, "epoch": 4.938337801608579, "grad_norm": 0.37890625, "learning_rate": 4.941844523115053e-05, "loss": 0.066, "mean_token_accuracy": 0.9852615475654602, "num_tokens": 52853780.0, "step": 21185 }, { "entropy": 0.14305189084261655, "epoch": 4.9395034386292105, "grad_norm": 4.59375, "learning_rate": 4.941798497574623e-05, "loss": 0.1897, "mean_token_accuracy": 0.9685503423213959, "num_tokens": 52889834.0, "step": 21190 }, { "entropy": 0.05313537791371346, "epoch": 4.940669075649843, "grad_norm": 0.345703125, "learning_rate": 4.941752454262685e-05, "loss": 0.0439, "mean_token_accuracy": 0.985480272769928, "num_tokens": 52918093.0, "step": 21195 }, { "entropy": 0.07132979445159435, "epoch": 4.941834712670475, "grad_norm": 2.59375, "learning_rate": 4.9417063931799245e-05, "loss": 0.1044, "mean_token_accuracy": 0.9747491538524627, "num_tokens": 52927212.0, "step": 21200 }, { "entropy": 0.058989010052755475, "epoch": 4.943000349691106, "grad_norm": 0.59765625, "learning_rate": 4.94166031432703e-05, "loss": 0.0747, "mean_token_accuracy": 0.9773088097572327, "num_tokens": 52952905.0, "step": 21205 }, { "entropy": 0.06833735201507807, "epoch": 4.944165986711738, "grad_norm": 0.6875, "learning_rate": 4.941614217704688e-05, "loss": 0.0705, "mean_token_accuracy": 0.9795933485031127, "num_tokens": 52976311.0, "step": 21210 }, { "entropy": 0.07373546492308378, "epoch": 4.94533162373237, "grad_norm": 1.5390625, "learning_rate": 4.941568103313585e-05, "loss": 0.0805, "mean_token_accuracy": 0.9760990738868713, "num_tokens": 52990179.0, "step": 21215 }, { "entropy": 0.0755191221833229, "epoch": 4.946497260753001, "grad_norm": 5.9375, "learning_rate": 4.9415219711544104e-05, "loss": 0.0962, "mean_token_accuracy": 0.9685922086238861, "num_tokens": 53008541.0, "step": 21220 }, { "entropy": 0.07566210273653269, "epoch": 4.947662897773633, "grad_norm": 2.984375, "learning_rate": 4.94147582122785e-05, "loss": 0.1012, "mean_token_accuracy": 0.973409628868103, "num_tokens": 53019565.0, "step": 21225 }, { "entropy": 0.07277205716818572, "epoch": 4.948828534794265, "grad_norm": 0.67578125, "learning_rate": 4.941429653534594e-05, "loss": 0.0632, "mean_token_accuracy": 0.9744443714618682, "num_tokens": 53042532.0, "step": 21230 }, { "entropy": 0.0649688365869224, "epoch": 4.949994171814897, "grad_norm": 2.3125, "learning_rate": 4.9413834680753296e-05, "loss": 0.0657, "mean_token_accuracy": 0.9841285765171051, "num_tokens": 53063708.0, "step": 21235 }, { "entropy": 0.06395247289910913, "epoch": 4.951159808835529, "grad_norm": 1.265625, "learning_rate": 4.941337264850745e-05, "loss": 0.0476, "mean_token_accuracy": 0.9863583087921143, "num_tokens": 53104339.0, "step": 21240 }, { "entropy": 0.07488085981458426, "epoch": 4.95232544585616, "grad_norm": 3.25, "learning_rate": 4.941291043861531e-05, "loss": 0.1062, "mean_token_accuracy": 0.9732397139072418, "num_tokens": 53122662.0, "step": 21245 }, { "entropy": 0.06674009589478373, "epoch": 4.953491082876792, "grad_norm": 3.65625, "learning_rate": 4.941244805108375e-05, "loss": 0.059, "mean_token_accuracy": 0.9823914647102356, "num_tokens": 53149847.0, "step": 21250 }, { "entropy": 0.08388477927073837, "epoch": 4.954656719897424, "grad_norm": 2.71875, "learning_rate": 4.9411985485919674e-05, "loss": 0.0803, "mean_token_accuracy": 0.9737024128437042, "num_tokens": 53183327.0, "step": 21255 }, { "entropy": 0.07561515085399151, "epoch": 4.9558223569180555, "grad_norm": 1.859375, "learning_rate": 4.941152274312998e-05, "loss": 0.0669, "mean_token_accuracy": 0.9786112248897553, "num_tokens": 53208441.0, "step": 21260 }, { "entropy": 0.05005005598068237, "epoch": 4.956987993938688, "grad_norm": 0.45703125, "learning_rate": 4.941105982272156e-05, "loss": 0.049, "mean_token_accuracy": 0.9870032131671905, "num_tokens": 53225543.0, "step": 21265 }, { "entropy": 0.07580123171210289, "epoch": 4.95815363095932, "grad_norm": 5.90625, "learning_rate": 4.941059672470133e-05, "loss": 0.1021, "mean_token_accuracy": 0.9714035093784332, "num_tokens": 53246905.0, "step": 21270 }, { "entropy": 0.05712772505357862, "epoch": 4.959319267979951, "grad_norm": 0.31640625, "learning_rate": 4.941013344907619e-05, "loss": 0.0367, "mean_token_accuracy": 0.9858934164047242, "num_tokens": 53276952.0, "step": 21275 }, { "entropy": 0.0628195583820343, "epoch": 4.960484905000583, "grad_norm": 0.86328125, "learning_rate": 4.9409669995853035e-05, "loss": 0.0744, "mean_token_accuracy": 0.9781266987323761, "num_tokens": 53292563.0, "step": 21280 }, { "entropy": 0.07548150643706322, "epoch": 4.961650542021214, "grad_norm": 4.6875, "learning_rate": 4.9409206365038794e-05, "loss": 0.0886, "mean_token_accuracy": 0.9792620122432709, "num_tokens": 53313173.0, "step": 21285 }, { "entropy": 0.06093268636614084, "epoch": 4.962816179041846, "grad_norm": 1.296875, "learning_rate": 4.940874255664037e-05, "loss": 0.0688, "mean_token_accuracy": 0.9816692888736724, "num_tokens": 53329420.0, "step": 21290 }, { "entropy": 0.09625336267054081, "epoch": 4.9639818160624785, "grad_norm": 3.953125, "learning_rate": 4.940827857066469e-05, "loss": 0.1071, "mean_token_accuracy": 0.9731559336185456, "num_tokens": 53345471.0, "step": 21295 }, { "entropy": 0.04683486856520176, "epoch": 4.96514745308311, "grad_norm": 2.546875, "learning_rate": 4.940781440711866e-05, "loss": 0.0374, "mean_token_accuracy": 0.9829618215560914, "num_tokens": 53384019.0, "step": 21300 }, { "entropy": 0.08568495102226734, "epoch": 4.966313090103742, "grad_norm": 0.4921875, "learning_rate": 4.9407350066009204e-05, "loss": 0.081, "mean_token_accuracy": 0.9735833466053009, "num_tokens": 53398294.0, "step": 21305 }, { "entropy": 0.05991802159696817, "epoch": 4.967478727124373, "grad_norm": 3.453125, "learning_rate": 4.940688554734326e-05, "loss": 0.0576, "mean_token_accuracy": 0.9740480959415436, "num_tokens": 53430339.0, "step": 21310 }, { "entropy": 0.1412233560346067, "epoch": 4.968644364145005, "grad_norm": 0.6015625, "learning_rate": 4.940642085112773e-05, "loss": 0.2138, "mean_token_accuracy": 0.9502263188362121, "num_tokens": 53471198.0, "step": 21315 }, { "entropy": 0.04991894001141191, "epoch": 4.969810001165637, "grad_norm": 1.390625, "learning_rate": 4.9405955977369564e-05, "loss": 0.0592, "mean_token_accuracy": 0.9838922679424286, "num_tokens": 53490441.0, "step": 21320 }, { "entropy": 0.07745058406144381, "epoch": 4.970975638186268, "grad_norm": 0.4140625, "learning_rate": 4.940549092607569e-05, "loss": 0.1122, "mean_token_accuracy": 0.9719259083271027, "num_tokens": 53506639.0, "step": 21325 }, { "entropy": 0.09155577477067708, "epoch": 4.9721412752069005, "grad_norm": 4.75, "learning_rate": 4.940502569725303e-05, "loss": 0.1207, "mean_token_accuracy": 0.9676483154296875, "num_tokens": 53520060.0, "step": 21330 }, { "entropy": 0.07278778329491616, "epoch": 4.973306912227533, "grad_norm": 0.76171875, "learning_rate": 4.940456029090854e-05, "loss": 0.0631, "mean_token_accuracy": 0.9748271405696869, "num_tokens": 53534161.0, "step": 21335 }, { "entropy": 0.08302396573126317, "epoch": 4.974472549248164, "grad_norm": 1.5859375, "learning_rate": 4.9404094707049145e-05, "loss": 0.0896, "mean_token_accuracy": 0.9752457678318024, "num_tokens": 53544994.0, "step": 21340 }, { "entropy": 0.09030197001993656, "epoch": 4.975638186268796, "grad_norm": 1.5546875, "learning_rate": 4.94036289456818e-05, "loss": 0.105, "mean_token_accuracy": 0.9733418822288513, "num_tokens": 53555728.0, "step": 21345 }, { "entropy": 0.09017610475420952, "epoch": 4.976803823289428, "grad_norm": 1.3125, "learning_rate": 4.940316300681344e-05, "loss": 0.0968, "mean_token_accuracy": 0.9723630130290986, "num_tokens": 53567499.0, "step": 21350 }, { "entropy": 0.08766529373824597, "epoch": 4.977969460310059, "grad_norm": 2.875, "learning_rate": 4.940269689045101e-05, "loss": 0.0966, "mean_token_accuracy": 0.9772234261035919, "num_tokens": 53576899.0, "step": 21355 }, { "entropy": 0.06158297499641776, "epoch": 4.979135097330691, "grad_norm": 6.0, "learning_rate": 4.940223059660147e-05, "loss": 0.0737, "mean_token_accuracy": 0.982252448797226, "num_tokens": 53599827.0, "step": 21360 }, { "entropy": 0.06548569137230516, "epoch": 4.980300734351323, "grad_norm": 1.25, "learning_rate": 4.940176412527177e-05, "loss": 0.0958, "mean_token_accuracy": 0.9723960101604462, "num_tokens": 53617066.0, "step": 21365 }, { "entropy": 0.07234325166791677, "epoch": 4.981466371371955, "grad_norm": 3.828125, "learning_rate": 4.940129747646886e-05, "loss": 0.0842, "mean_token_accuracy": 0.9767987549304962, "num_tokens": 53634950.0, "step": 21370 }, { "entropy": 0.08200656361877919, "epoch": 4.982632008392587, "grad_norm": 2.953125, "learning_rate": 4.940083065019972e-05, "loss": 0.1237, "mean_token_accuracy": 0.9713518559932709, "num_tokens": 53643613.0, "step": 21375 }, { "entropy": 0.044780909549444914, "epoch": 4.983797645413218, "grad_norm": 0.56640625, "learning_rate": 4.940036364647129e-05, "loss": 0.0449, "mean_token_accuracy": 0.9856958091259003, "num_tokens": 53669670.0, "step": 21380 }, { "entropy": 0.0722307562828064, "epoch": 4.98496328243385, "grad_norm": 4.59375, "learning_rate": 4.939989646529053e-05, "loss": 0.0864, "mean_token_accuracy": 0.9787289142608643, "num_tokens": 53680604.0, "step": 21385 }, { "entropy": 0.20885901637375354, "epoch": 4.986128919454482, "grad_norm": 2.953125, "learning_rate": 4.939942910666442e-05, "loss": 0.3076, "mean_token_accuracy": 0.9541841089725495, "num_tokens": 53702324.0, "step": 21390 }, { "entropy": 0.0543774152174592, "epoch": 4.987294556475113, "grad_norm": 0.83203125, "learning_rate": 4.939896157059992e-05, "loss": 0.0448, "mean_token_accuracy": 0.9863493800163269, "num_tokens": 53727959.0, "step": 21395 }, { "entropy": 0.1413085490465164, "epoch": 4.9884601934957455, "grad_norm": 3.546875, "learning_rate": 4.939849385710402e-05, "loss": 0.2737, "mean_token_accuracy": 0.9440780580043793, "num_tokens": 53747497.0, "step": 21400 }, { "entropy": 0.07893666541203856, "epoch": 4.989625830516378, "grad_norm": 1.640625, "learning_rate": 4.939802596618366e-05, "loss": 0.0688, "mean_token_accuracy": 0.9808478355407715, "num_tokens": 53763449.0, "step": 21405 }, { "entropy": 0.08251105006784201, "epoch": 4.990791467537009, "grad_norm": 4.5625, "learning_rate": 4.9397557897845856e-05, "loss": 0.1024, "mean_token_accuracy": 0.9750867128372193, "num_tokens": 53776240.0, "step": 21410 }, { "entropy": 0.06223855372518301, "epoch": 4.991957104557641, "grad_norm": 0.6640625, "learning_rate": 4.9397089652097556e-05, "loss": 0.0695, "mean_token_accuracy": 0.9808686673641205, "num_tokens": 53797279.0, "step": 21415 }, { "entropy": 0.05682404525578022, "epoch": 4.993122741578272, "grad_norm": 1.1015625, "learning_rate": 4.939662122894576e-05, "loss": 0.0451, "mean_token_accuracy": 0.9859870493412017, "num_tokens": 53812726.0, "step": 21420 }, { "entropy": 0.08272222261875868, "epoch": 4.994288378598904, "grad_norm": 3.046875, "learning_rate": 4.939615262839745e-05, "loss": 0.0622, "mean_token_accuracy": 0.9833892941474914, "num_tokens": 53827890.0, "step": 21425 }, { "entropy": 0.05443599112331867, "epoch": 4.995454015619536, "grad_norm": 1.0625, "learning_rate": 4.939568385045961e-05, "loss": 0.0402, "mean_token_accuracy": 0.9873492121696472, "num_tokens": 53844135.0, "step": 21430 }, { "entropy": 0.054293790087103846, "epoch": 4.996619652640168, "grad_norm": 2.765625, "learning_rate": 4.939521489513923e-05, "loss": 0.0642, "mean_token_accuracy": 0.982794600725174, "num_tokens": 53868623.0, "step": 21435 }, { "entropy": 0.07399072125554085, "epoch": 4.9977852896608, "grad_norm": 5.15625, "learning_rate": 4.9394745762443304e-05, "loss": 0.0764, "mean_token_accuracy": 0.9782315909862518, "num_tokens": 53881341.0, "step": 21440 }, { "entropy": 0.06524700932204723, "epoch": 4.998950926681431, "grad_norm": 1.65625, "learning_rate": 4.9394276452378827e-05, "loss": 0.0789, "mean_token_accuracy": 0.9781299471855164, "num_tokens": 53897448.0, "step": 21445 }, { "entropy": 0.06756970638202296, "epoch": 5.0, "grad_norm": 0.97265625, "learning_rate": 4.9393806964952806e-05, "loss": 0.0411, "mean_token_accuracy": 0.9809864560763041, "num_tokens": 53942250.0, "step": 21450 }, { "entropy": 0.0722856305539608, "epoch": 5.001165637020632, "grad_norm": 3.796875, "learning_rate": 4.939333730017223e-05, "loss": 0.1013, "mean_token_accuracy": 0.9739380717277527, "num_tokens": 53953317.0, "step": 21455 }, { "entropy": 0.07048999462276698, "epoch": 5.002331274041263, "grad_norm": 0.29296875, "learning_rate": 4.93928674580441e-05, "loss": 0.047, "mean_token_accuracy": 0.9865036189556122, "num_tokens": 54001504.0, "step": 21460 }, { "entropy": 0.07579383421689272, "epoch": 5.003496911061895, "grad_norm": 0.4921875, "learning_rate": 4.9392397438575435e-05, "loss": 0.0639, "mean_token_accuracy": 0.9817220330238342, "num_tokens": 54013211.0, "step": 21465 }, { "entropy": 0.04977593747898936, "epoch": 5.0046625480825275, "grad_norm": 2.109375, "learning_rate": 4.9391927241773226e-05, "loss": 0.0548, "mean_token_accuracy": 0.9827068090438843, "num_tokens": 54036428.0, "step": 21470 }, { "entropy": 0.07021276131272317, "epoch": 5.005828185103159, "grad_norm": 1.328125, "learning_rate": 4.939145686764451e-05, "loss": 0.0799, "mean_token_accuracy": 0.9790279030799866, "num_tokens": 54050241.0, "step": 21475 }, { "entropy": 0.054964121477678415, "epoch": 5.006993822123791, "grad_norm": 0.06689453125, "learning_rate": 4.939098631619628e-05, "loss": 0.059, "mean_token_accuracy": 0.9842049717903137, "num_tokens": 54069745.0, "step": 21480 }, { "entropy": 0.06855918923392892, "epoch": 5.008159459144422, "grad_norm": 0.1396484375, "learning_rate": 4.939051558743556e-05, "loss": 0.0477, "mean_token_accuracy": 0.9819868087768555, "num_tokens": 54088512.0, "step": 21485 }, { "entropy": 0.08289080634713172, "epoch": 5.009325096165054, "grad_norm": 1.3984375, "learning_rate": 4.939004468136937e-05, "loss": 0.0555, "mean_token_accuracy": 0.980892276763916, "num_tokens": 54106277.0, "step": 21490 }, { "entropy": 0.06591970575973391, "epoch": 5.010490733185686, "grad_norm": 2.34375, "learning_rate": 4.9389573598004724e-05, "loss": 0.0699, "mean_token_accuracy": 0.981263279914856, "num_tokens": 54123103.0, "step": 21495 }, { "entropy": 0.056767011620104314, "epoch": 5.0116563702063175, "grad_norm": 0.375, "learning_rate": 4.9389102337348656e-05, "loss": 0.0564, "mean_token_accuracy": 0.9821022152900696, "num_tokens": 54142021.0, "step": 21500 }, { "entropy": 0.044302819995209575, "epoch": 5.01282200722695, "grad_norm": 0.6171875, "learning_rate": 4.938863089940819e-05, "loss": 0.0351, "mean_token_accuracy": 0.9887114048004151, "num_tokens": 54173043.0, "step": 21505 }, { "entropy": 0.05516482591629028, "epoch": 5.013987644247582, "grad_norm": 2.546875, "learning_rate": 4.938815928419036e-05, "loss": 0.066, "mean_token_accuracy": 0.9820286273956299, "num_tokens": 54189916.0, "step": 21510 }, { "entropy": 0.046309492690488695, "epoch": 5.015153281268213, "grad_norm": 0.462890625, "learning_rate": 4.938768749170219e-05, "loss": 0.0435, "mean_token_accuracy": 0.988088858127594, "num_tokens": 54223024.0, "step": 21515 }, { "entropy": 0.05068000312894583, "epoch": 5.016318918288845, "grad_norm": 1.375, "learning_rate": 4.9387215521950716e-05, "loss": 0.0439, "mean_token_accuracy": 0.9892255902290344, "num_tokens": 54244942.0, "step": 21520 }, { "entropy": 0.05760006736963987, "epoch": 5.017484555309476, "grad_norm": 1.3046875, "learning_rate": 4.938674337494299e-05, "loss": 0.0699, "mean_token_accuracy": 0.9819241404533386, "num_tokens": 54259058.0, "step": 21525 }, { "entropy": 0.13578815935179592, "epoch": 5.018650192330108, "grad_norm": 4.5, "learning_rate": 4.938627105068603e-05, "loss": 0.2186, "mean_token_accuracy": 0.9652626574039459, "num_tokens": 54290301.0, "step": 21530 }, { "entropy": 0.07299588825553656, "epoch": 5.01981582935074, "grad_norm": 1.703125, "learning_rate": 4.9385798549186895e-05, "loss": 0.078, "mean_token_accuracy": 0.9776978015899658, "num_tokens": 54322229.0, "step": 21535 }, { "entropy": 0.08828615508973599, "epoch": 5.020981466371372, "grad_norm": 1.9453125, "learning_rate": 4.938532587045263e-05, "loss": 0.0788, "mean_token_accuracy": 0.9777750134468078, "num_tokens": 54333646.0, "step": 21540 }, { "entropy": 0.06954600978642703, "epoch": 5.022147103392004, "grad_norm": 3.09375, "learning_rate": 4.9384853014490274e-05, "loss": 0.0536, "mean_token_accuracy": 0.9855652451515198, "num_tokens": 54347404.0, "step": 21545 }, { "entropy": 0.05489847809076309, "epoch": 5.023312740412636, "grad_norm": 0.1826171875, "learning_rate": 4.9384379981306884e-05, "loss": 0.0521, "mean_token_accuracy": 0.9847877144813537, "num_tokens": 54371568.0, "step": 21550 }, { "entropy": 0.06676510954275727, "epoch": 5.024478377433267, "grad_norm": 0.26953125, "learning_rate": 4.9383906770909517e-05, "loss": 0.0564, "mean_token_accuracy": 0.9804906964302063, "num_tokens": 54390510.0, "step": 21555 }, { "entropy": 0.07030079662799835, "epoch": 5.025644014453899, "grad_norm": 0.357421875, "learning_rate": 4.938343338330522e-05, "loss": 0.0762, "mean_token_accuracy": 0.9829931259155273, "num_tokens": 54401644.0, "step": 21560 }, { "entropy": 0.0746897492557764, "epoch": 5.02680965147453, "grad_norm": 3.5, "learning_rate": 4.938295981850107e-05, "loss": 0.0891, "mean_token_accuracy": 0.9763796985149383, "num_tokens": 54410346.0, "step": 21565 }, { "entropy": 0.051309975795447826, "epoch": 5.0279752884951625, "grad_norm": 1.25, "learning_rate": 4.93824860765041e-05, "loss": 0.0611, "mean_token_accuracy": 0.9852312088012696, "num_tokens": 54430243.0, "step": 21570 }, { "entropy": 0.09667696682736278, "epoch": 5.029140925515795, "grad_norm": 4.0, "learning_rate": 4.93820121573214e-05, "loss": 0.0751, "mean_token_accuracy": 0.9779737412929534, "num_tokens": 54445389.0, "step": 21575 }, { "entropy": 0.05687328353524208, "epoch": 5.030306562536426, "grad_norm": 1.4453125, "learning_rate": 4.938153806096003e-05, "loss": 0.0431, "mean_token_accuracy": 0.9846319437026978, "num_tokens": 54463785.0, "step": 21580 }, { "entropy": 0.05415755575522781, "epoch": 5.031472199557058, "grad_norm": 2.015625, "learning_rate": 4.938106378742705e-05, "loss": 0.0694, "mean_token_accuracy": 0.9817207515239715, "num_tokens": 54477292.0, "step": 21585 }, { "entropy": 0.04446221003308892, "epoch": 5.03263783657769, "grad_norm": 0.169921875, "learning_rate": 4.938058933672954e-05, "loss": 0.0237, "mean_token_accuracy": 0.9912239372730255, "num_tokens": 54506063.0, "step": 21590 }, { "entropy": 0.07755004474893212, "epoch": 5.033803473598321, "grad_norm": 3.375, "learning_rate": 4.938011470887457e-05, "loss": 0.0653, "mean_token_accuracy": 0.9806104242801666, "num_tokens": 54523112.0, "step": 21595 }, { "entropy": 0.0687340309843421, "epoch": 5.034969110618953, "grad_norm": 0.8671875, "learning_rate": 4.937963990386923e-05, "loss": 0.0669, "mean_token_accuracy": 0.9804644048213959, "num_tokens": 54548072.0, "step": 21600 }, { "entropy": 0.05275446167215705, "epoch": 5.0361347476395855, "grad_norm": 2.265625, "learning_rate": 4.937916492172059e-05, "loss": 0.0324, "mean_token_accuracy": 0.9847627699375152, "num_tokens": 54569162.0, "step": 21605 }, { "entropy": 0.07009538058191538, "epoch": 5.037300384660217, "grad_norm": 3.703125, "learning_rate": 4.937868976243573e-05, "loss": 0.0723, "mean_token_accuracy": 0.9828846335411072, "num_tokens": 54583817.0, "step": 21610 }, { "entropy": 0.051291110832244156, "epoch": 5.038466021680849, "grad_norm": 1.75, "learning_rate": 4.937821442602174e-05, "loss": 0.0302, "mean_token_accuracy": 0.9848531365394593, "num_tokens": 54604573.0, "step": 21615 }, { "entropy": 0.061459016799926755, "epoch": 5.03963165870148, "grad_norm": 2.375, "learning_rate": 4.937773891248571e-05, "loss": 0.0828, "mean_token_accuracy": 0.9769286692142487, "num_tokens": 54630544.0, "step": 21620 }, { "entropy": 0.059759095683693884, "epoch": 5.040797295722112, "grad_norm": 1.8515625, "learning_rate": 4.937726322183472e-05, "loss": 0.0399, "mean_token_accuracy": 0.985287070274353, "num_tokens": 54649093.0, "step": 21625 }, { "entropy": 0.0626804206520319, "epoch": 5.041962932742744, "grad_norm": 0.4453125, "learning_rate": 4.937678735407587e-05, "loss": 0.0669, "mean_token_accuracy": 0.9840811491012573, "num_tokens": 54662488.0, "step": 21630 }, { "entropy": 0.048044389486312865, "epoch": 5.043128569763375, "grad_norm": 0.330078125, "learning_rate": 4.937631130921627e-05, "loss": 0.0298, "mean_token_accuracy": 0.9894433438777923, "num_tokens": 54684676.0, "step": 21635 }, { "entropy": 0.062034656014293435, "epoch": 5.0442942067840075, "grad_norm": 3.28125, "learning_rate": 4.937583508726299e-05, "loss": 0.0422, "mean_token_accuracy": 0.9874476134777069, "num_tokens": 54706977.0, "step": 21640 }, { "entropy": 0.04591988567262888, "epoch": 5.04545984380464, "grad_norm": 1.25, "learning_rate": 4.937535868822315e-05, "loss": 0.0414, "mean_token_accuracy": 0.9855768322944641, "num_tokens": 54738130.0, "step": 21645 }, { "entropy": 0.056564598623663184, "epoch": 5.046625480825271, "grad_norm": 1.8671875, "learning_rate": 4.937488211210386e-05, "loss": 0.0585, "mean_token_accuracy": 0.9842835962772369, "num_tokens": 54759483.0, "step": 21650 }, { "entropy": 0.06345367161557078, "epoch": 5.047791117845903, "grad_norm": 4.84375, "learning_rate": 4.9374405358912213e-05, "loss": 0.063, "mean_token_accuracy": 0.9799680531024932, "num_tokens": 54785815.0, "step": 21655 }, { "entropy": 0.05606531724333763, "epoch": 5.048956754866534, "grad_norm": 3.390625, "learning_rate": 4.937392842865532e-05, "loss": 0.0552, "mean_token_accuracy": 0.9835524022579193, "num_tokens": 54803493.0, "step": 21660 }, { "entropy": 0.08263342985883355, "epoch": 5.050122391887166, "grad_norm": 2.015625, "learning_rate": 4.937345132134029e-05, "loss": 0.0752, "mean_token_accuracy": 0.9773358285427094, "num_tokens": 54817570.0, "step": 21665 }, { "entropy": 0.0704351432621479, "epoch": 5.051288028907798, "grad_norm": 1.2734375, "learning_rate": 4.9372974036974254e-05, "loss": 0.0853, "mean_token_accuracy": 0.9777139842510223, "num_tokens": 54826412.0, "step": 21670 }, { "entropy": 0.061422979831695555, "epoch": 5.05245366592843, "grad_norm": 1.4375, "learning_rate": 4.9372496575564306e-05, "loss": 0.0723, "mean_token_accuracy": 0.98321133852005, "num_tokens": 54837967.0, "step": 21675 }, { "entropy": 0.0678645808249712, "epoch": 5.053619302949062, "grad_norm": 2.890625, "learning_rate": 4.937201893711758e-05, "loss": 0.0739, "mean_token_accuracy": 0.9775955259799958, "num_tokens": 54855917.0, "step": 21680 }, { "entropy": 0.051627715677022935, "epoch": 5.054784939969694, "grad_norm": 0.41796875, "learning_rate": 4.9371541121641194e-05, "loss": 0.0282, "mean_token_accuracy": 0.9913087129592896, "num_tokens": 54883469.0, "step": 21685 }, { "entropy": 0.04737357590347528, "epoch": 5.055950576990325, "grad_norm": 0.40234375, "learning_rate": 4.937106312914228e-05, "loss": 0.0206, "mean_token_accuracy": 0.9885388255119324, "num_tokens": 54904299.0, "step": 21690 }, { "entropy": 0.0449544788338244, "epoch": 5.057116214010957, "grad_norm": 0.53515625, "learning_rate": 4.937058495962796e-05, "loss": 0.0403, "mean_token_accuracy": 0.9887046158313751, "num_tokens": 54928834.0, "step": 21695 }, { "entropy": 0.05512272519990802, "epoch": 5.058281851031588, "grad_norm": 1.578125, "learning_rate": 4.937010661310536e-05, "loss": 0.0604, "mean_token_accuracy": 0.9825908660888671, "num_tokens": 54945161.0, "step": 21700 }, { "entropy": 0.06513633020222187, "epoch": 5.05944748805222, "grad_norm": 0.396484375, "learning_rate": 4.936962808958161e-05, "loss": 0.0521, "mean_token_accuracy": 0.9835683524608612, "num_tokens": 54964927.0, "step": 21705 }, { "entropy": 0.10186992576345802, "epoch": 5.0606131250728525, "grad_norm": 0.20703125, "learning_rate": 4.9369149389063865e-05, "loss": 0.1718, "mean_token_accuracy": 0.9644091308116913, "num_tokens": 55003059.0, "step": 21710 }, { "entropy": 0.06891092918813228, "epoch": 5.061778762093484, "grad_norm": 0.52734375, "learning_rate": 4.936867051155924e-05, "loss": 0.0591, "mean_token_accuracy": 0.981193619966507, "num_tokens": 55023452.0, "step": 21715 }, { "entropy": 0.0633782428689301, "epoch": 5.062944399114116, "grad_norm": 0.35546875, "learning_rate": 4.936819145707489e-05, "loss": 0.07, "mean_token_accuracy": 0.9851223707199097, "num_tokens": 55044041.0, "step": 21720 }, { "entropy": 0.06447986625134945, "epoch": 5.064110036134748, "grad_norm": 2.671875, "learning_rate": 4.936771222561796e-05, "loss": 0.0551, "mean_token_accuracy": 0.9847223103046417, "num_tokens": 55057044.0, "step": 21725 }, { "entropy": 0.06763316094875335, "epoch": 5.065275673155379, "grad_norm": 3.53125, "learning_rate": 4.936723281719558e-05, "loss": 0.0749, "mean_token_accuracy": 0.982142984867096, "num_tokens": 55069209.0, "step": 21730 }, { "entropy": 0.04672593493014574, "epoch": 5.066441310176011, "grad_norm": 0.5625, "learning_rate": 4.9366753231814914e-05, "loss": 0.0278, "mean_token_accuracy": 0.9850932538509369, "num_tokens": 55111866.0, "step": 21735 }, { "entropy": 0.06341411881148815, "epoch": 5.067606947196643, "grad_norm": 2.75, "learning_rate": 4.936627346948312e-05, "loss": 0.0683, "mean_token_accuracy": 0.9762008249759674, "num_tokens": 55137517.0, "step": 21740 }, { "entropy": 0.03943348862230778, "epoch": 5.068772584217275, "grad_norm": 0.09619140625, "learning_rate": 4.936579353020732e-05, "loss": 0.0318, "mean_token_accuracy": 0.9900983273983002, "num_tokens": 55178143.0, "step": 21745 }, { "entropy": 0.07379267876967788, "epoch": 5.069938221237907, "grad_norm": 2.171875, "learning_rate": 4.936531341399471e-05, "loss": 0.0689, "mean_token_accuracy": 0.981987190246582, "num_tokens": 55194920.0, "step": 21750 }, { "entropy": 0.06244112215936184, "epoch": 5.071103858258538, "grad_norm": 1.765625, "learning_rate": 4.9364833120852414e-05, "loss": 0.0667, "mean_token_accuracy": 0.9802671313285828, "num_tokens": 55212891.0, "step": 21755 }, { "entropy": 0.06464984249323606, "epoch": 5.07226949527917, "grad_norm": 0.55859375, "learning_rate": 4.9364352650787624e-05, "loss": 0.0414, "mean_token_accuracy": 0.9837022185325622, "num_tokens": 55233297.0, "step": 21760 }, { "entropy": 0.05454264315776527, "epoch": 5.073435132299802, "grad_norm": 0.2578125, "learning_rate": 4.936387200380748e-05, "loss": 0.039, "mean_token_accuracy": 0.9821477770805359, "num_tokens": 55269859.0, "step": 21765 }, { "entropy": 0.05952314343303442, "epoch": 5.074600769320433, "grad_norm": 0.2578125, "learning_rate": 4.9363391179919174e-05, "loss": 0.0572, "mean_token_accuracy": 0.9817304372787475, "num_tokens": 55291450.0, "step": 21770 }, { "entropy": 0.06436049994081258, "epoch": 5.0757664063410655, "grad_norm": 1.5546875, "learning_rate": 4.936291017912985e-05, "loss": 0.0667, "mean_token_accuracy": 0.9814639568328858, "num_tokens": 55308812.0, "step": 21775 }, { "entropy": 0.04403821406885981, "epoch": 5.076932043361698, "grad_norm": 0.23828125, "learning_rate": 4.93624290014467e-05, "loss": 0.0216, "mean_token_accuracy": 0.9926260411739349, "num_tokens": 55343260.0, "step": 21780 }, { "entropy": 0.054392436426132916, "epoch": 5.078097680382329, "grad_norm": 2.234375, "learning_rate": 4.936194764687688e-05, "loss": 0.0588, "mean_token_accuracy": 0.9852981567382812, "num_tokens": 55359488.0, "step": 21785 }, { "entropy": 0.07421798389405013, "epoch": 5.079263317402961, "grad_norm": 3.671875, "learning_rate": 4.936146611542759e-05, "loss": 0.0612, "mean_token_accuracy": 0.9803422451019287, "num_tokens": 55377919.0, "step": 21790 }, { "entropy": 0.06554331891238689, "epoch": 5.080428954423592, "grad_norm": 1.65625, "learning_rate": 4.9360984407105996e-05, "loss": 0.0911, "mean_token_accuracy": 0.9787883877754211, "num_tokens": 55388305.0, "step": 21795 }, { "entropy": 0.055925901234149936, "epoch": 5.081594591444224, "grad_norm": 0.59375, "learning_rate": 4.936050252191928e-05, "loss": 0.0363, "mean_token_accuracy": 0.9892447233200073, "num_tokens": 55421711.0, "step": 21800 }, { "entropy": 0.098357552010566, "epoch": 5.082760228464856, "grad_norm": 2.359375, "learning_rate": 4.936002045987465e-05, "loss": 0.0815, "mean_token_accuracy": 0.978368467092514, "num_tokens": 55435380.0, "step": 21805 }, { "entropy": 0.07150723561644554, "epoch": 5.0839258654854875, "grad_norm": 2.59375, "learning_rate": 4.935953822097926e-05, "loss": 0.0867, "mean_token_accuracy": 0.9787911772727966, "num_tokens": 55444640.0, "step": 21810 }, { "entropy": 0.05865759551525116, "epoch": 5.08509150250612, "grad_norm": 2.234375, "learning_rate": 4.935905580524032e-05, "loss": 0.0516, "mean_token_accuracy": 0.9856998682022095, "num_tokens": 55470590.0, "step": 21815 }, { "entropy": 0.1110074575059116, "epoch": 5.086257139526752, "grad_norm": 0.63671875, "learning_rate": 4.935857321266502e-05, "loss": 0.1294, "mean_token_accuracy": 0.9701458215713501, "num_tokens": 55503985.0, "step": 21820 }, { "entropy": 0.048689931537956, "epoch": 5.087422776547383, "grad_norm": 1.765625, "learning_rate": 4.935809044326056e-05, "loss": 0.0222, "mean_token_accuracy": 0.9833405792713166, "num_tokens": 55538140.0, "step": 21825 }, { "entropy": 0.044951011799275874, "epoch": 5.088588413568015, "grad_norm": 2.578125, "learning_rate": 4.935760749703413e-05, "loss": 0.0427, "mean_token_accuracy": 0.9884091496467591, "num_tokens": 55553734.0, "step": 21830 }, { "entropy": 0.06277897786349058, "epoch": 5.089754050588646, "grad_norm": 1.1875, "learning_rate": 4.9357124373992945e-05, "loss": 0.055, "mean_token_accuracy": 0.9851854026317597, "num_tokens": 55570993.0, "step": 21835 }, { "entropy": 0.06481649186462164, "epoch": 5.090919687609278, "grad_norm": 1.375, "learning_rate": 4.93566410741442e-05, "loss": 0.0596, "mean_token_accuracy": 0.9827612698078155, "num_tokens": 55585785.0, "step": 21840 }, { "entropy": 0.07002595514059066, "epoch": 5.0920853246299105, "grad_norm": 2.5, "learning_rate": 4.93561575974951e-05, "loss": 0.0665, "mean_token_accuracy": 0.9836658596992492, "num_tokens": 55599870.0, "step": 21845 }, { "entropy": 0.07330641448497772, "epoch": 5.093250961650542, "grad_norm": 1.2265625, "learning_rate": 4.935567394405286e-05, "loss": 0.0718, "mean_token_accuracy": 0.9804113328456878, "num_tokens": 55610084.0, "step": 21850 }, { "entropy": 0.05826578103005886, "epoch": 5.094416598671174, "grad_norm": 0.8125, "learning_rate": 4.935519011382469e-05, "loss": 0.0482, "mean_token_accuracy": 0.9851272463798523, "num_tokens": 55626799.0, "step": 21855 }, { "entropy": 0.062410115357488395, "epoch": 5.095582235691806, "grad_norm": 0.408203125, "learning_rate": 4.935470610681781e-05, "loss": 0.0464, "mean_token_accuracy": 0.982362687587738, "num_tokens": 55659544.0, "step": 21860 }, { "entropy": 0.05943414494395256, "epoch": 5.096747872712437, "grad_norm": 1.875, "learning_rate": 4.9354221923039416e-05, "loss": 0.055, "mean_token_accuracy": 0.9825399518013, "num_tokens": 55678976.0, "step": 21865 }, { "entropy": 0.04930404406040907, "epoch": 5.097913509733069, "grad_norm": 1.6953125, "learning_rate": 4.9353737562496755e-05, "loss": 0.041, "mean_token_accuracy": 0.987514328956604, "num_tokens": 55698566.0, "step": 21870 }, { "entropy": 0.08179815951734781, "epoch": 5.099079146753701, "grad_norm": 0.97265625, "learning_rate": 4.935325302519703e-05, "loss": 0.0625, "mean_token_accuracy": 0.9812222301959992, "num_tokens": 55712921.0, "step": 21875 }, { "entropy": 0.06906782537698745, "epoch": 5.1002447837743325, "grad_norm": 3.328125, "learning_rate": 4.935276831114748e-05, "loss": 0.0543, "mean_token_accuracy": 0.9829290509223938, "num_tokens": 55735759.0, "step": 21880 }, { "entropy": 0.0615530351176858, "epoch": 5.101410420794965, "grad_norm": 0.322265625, "learning_rate": 4.9352283420355325e-05, "loss": 0.0769, "mean_token_accuracy": 0.9852207362651825, "num_tokens": 55753062.0, "step": 21885 }, { "entropy": 0.053300064988434315, "epoch": 5.102576057815596, "grad_norm": 0.25390625, "learning_rate": 4.9351798352827805e-05, "loss": 0.0468, "mean_token_accuracy": 0.9863043606281281, "num_tokens": 55770524.0, "step": 21890 }, { "entropy": 0.052490772865712645, "epoch": 5.103741694836228, "grad_norm": 0.287109375, "learning_rate": 4.935131310857214e-05, "loss": 0.0378, "mean_token_accuracy": 0.9889246761798859, "num_tokens": 55811622.0, "step": 21895 }, { "entropy": 0.06909960927441716, "epoch": 5.10490733185686, "grad_norm": 0.1484375, "learning_rate": 4.935082768759557e-05, "loss": 0.0536, "mean_token_accuracy": 0.98121337890625, "num_tokens": 55826654.0, "step": 21900 }, { "entropy": 0.05015625609084964, "epoch": 5.106072968877491, "grad_norm": 0.91015625, "learning_rate": 4.935034208990533e-05, "loss": 0.0453, "mean_token_accuracy": 0.9881203591823577, "num_tokens": 55849840.0, "step": 21905 }, { "entropy": 0.06175452824681997, "epoch": 5.107238605898123, "grad_norm": 2.90625, "learning_rate": 4.934985631550867e-05, "loss": 0.0678, "mean_token_accuracy": 0.9852860569953918, "num_tokens": 55863086.0, "step": 21910 }, { "entropy": 0.06861697025597095, "epoch": 5.1084042429187555, "grad_norm": 0.322265625, "learning_rate": 4.9349370364412836e-05, "loss": 0.0611, "mean_token_accuracy": 0.9846416294574738, "num_tokens": 55878134.0, "step": 21915 }, { "entropy": 0.07329159006476402, "epoch": 5.109569879939387, "grad_norm": 3.40625, "learning_rate": 4.934888423662506e-05, "loss": 0.0769, "mean_token_accuracy": 0.9795122087001801, "num_tokens": 55888333.0, "step": 21920 }, { "entropy": 0.05855609718710184, "epoch": 5.110735516960019, "grad_norm": 0.1689453125, "learning_rate": 4.93483979321526e-05, "loss": 0.0544, "mean_token_accuracy": 0.9840858101844787, "num_tokens": 55911561.0, "step": 21925 }, { "entropy": 0.07522995788604021, "epoch": 5.11190115398065, "grad_norm": 1.4453125, "learning_rate": 4.9347911451002705e-05, "loss": 0.0447, "mean_token_accuracy": 0.9842344880104065, "num_tokens": 55933302.0, "step": 21930 }, { "entropy": 0.0620239726267755, "epoch": 5.113066791001282, "grad_norm": 0.373046875, "learning_rate": 4.934742479318263e-05, "loss": 0.0538, "mean_token_accuracy": 0.9858266294002533, "num_tokens": 55956064.0, "step": 21935 }, { "entropy": 0.08195541994646191, "epoch": 5.114232428021914, "grad_norm": 0.421875, "learning_rate": 4.9346937958699626e-05, "loss": 0.0474, "mean_token_accuracy": 0.9853470683097839, "num_tokens": 55975040.0, "step": 21940 }, { "entropy": 0.05026424927636981, "epoch": 5.1153980650425455, "grad_norm": 0.3203125, "learning_rate": 4.934645094756096e-05, "loss": 0.0451, "mean_token_accuracy": 0.9877060413360595, "num_tokens": 56000378.0, "step": 21945 }, { "entropy": 0.0795700391754508, "epoch": 5.116563702063178, "grad_norm": 0.9375, "learning_rate": 4.93459637597739e-05, "loss": 0.0946, "mean_token_accuracy": 0.9759146511554718, "num_tokens": 56012810.0, "step": 21950 }, { "entropy": 0.08439991511404514, "epoch": 5.11772933908381, "grad_norm": 1.1328125, "learning_rate": 4.93454763953457e-05, "loss": 0.1105, "mean_token_accuracy": 0.9726395010948181, "num_tokens": 56021050.0, "step": 21955 }, { "entropy": 0.08431314816698432, "epoch": 5.118894976104441, "grad_norm": 1.1796875, "learning_rate": 4.934498885428363e-05, "loss": 0.068, "mean_token_accuracy": 0.9810858368873596, "num_tokens": 56035566.0, "step": 21960 }, { "entropy": 0.05840439219027758, "epoch": 5.120060613125073, "grad_norm": 1.9375, "learning_rate": 4.9344501136594963e-05, "loss": 0.0594, "mean_token_accuracy": 0.9811022222042084, "num_tokens": 56050265.0, "step": 21965 }, { "entropy": 0.05464536147192121, "epoch": 5.121226250145704, "grad_norm": 0.49609375, "learning_rate": 4.934401324228696e-05, "loss": 0.0233, "mean_token_accuracy": 0.9924610316753387, "num_tokens": 56081923.0, "step": 21970 }, { "entropy": 0.05458259219303727, "epoch": 5.122391887166336, "grad_norm": 0.2255859375, "learning_rate": 4.934352517136691e-05, "loss": 0.0527, "mean_token_accuracy": 0.9853946685791015, "num_tokens": 56106631.0, "step": 21975 }, { "entropy": 0.04985369872301817, "epoch": 5.123557524186968, "grad_norm": 3.203125, "learning_rate": 4.934303692384209e-05, "loss": 0.0455, "mean_token_accuracy": 0.9850191473960876, "num_tokens": 56135671.0, "step": 21980 }, { "entropy": 0.09102193363942206, "epoch": 5.1247231612076, "grad_norm": 0.38671875, "learning_rate": 4.9342548499719774e-05, "loss": 0.0796, "mean_token_accuracy": 0.9770224332809448, "num_tokens": 56176870.0, "step": 21985 }, { "entropy": 0.061427733348682526, "epoch": 5.125888798228232, "grad_norm": 0.28515625, "learning_rate": 4.9342059899007246e-05, "loss": 0.0186, "mean_token_accuracy": 0.9816450893878936, "num_tokens": 56215200.0, "step": 21990 }, { "entropy": 0.0698608798906207, "epoch": 5.127054435248864, "grad_norm": 0.3984375, "learning_rate": 4.934157112171179e-05, "loss": 0.0609, "mean_token_accuracy": 0.9810095489025116, "num_tokens": 56234551.0, "step": 21995 }, { "entropy": 0.06343160159885883, "epoch": 5.128220072269495, "grad_norm": 2.03125, "learning_rate": 4.934108216784071e-05, "loss": 0.0509, "mean_token_accuracy": 0.9850274443626403, "num_tokens": 56249797.0, "step": 22000 }, { "entropy": 0.060403543058782816, "epoch": 5.129385709290127, "grad_norm": 1.8671875, "learning_rate": 4.9340593037401276e-05, "loss": 0.0612, "mean_token_accuracy": 0.9820111095905304, "num_tokens": 56272150.0, "step": 22005 }, { "entropy": 0.07690092343837023, "epoch": 5.130551346310758, "grad_norm": 1.15625, "learning_rate": 4.9340103730400787e-05, "loss": 0.0506, "mean_token_accuracy": 0.98413365483284, "num_tokens": 56294752.0, "step": 22010 }, { "entropy": 0.05405779052525759, "epoch": 5.1317169833313905, "grad_norm": 0.447265625, "learning_rate": 4.933961424684655e-05, "loss": 0.0251, "mean_token_accuracy": 0.9848962187767029, "num_tokens": 56325302.0, "step": 22015 }, { "entropy": 0.07608631141483783, "epoch": 5.132882620352023, "grad_norm": 2.28125, "learning_rate": 4.933912458674586e-05, "loss": 0.0802, "mean_token_accuracy": 0.9797267496585846, "num_tokens": 56334909.0, "step": 22020 }, { "entropy": 0.05106787588447333, "epoch": 5.134048257372654, "grad_norm": 2.828125, "learning_rate": 4.933863475010601e-05, "loss": 0.0476, "mean_token_accuracy": 0.985850727558136, "num_tokens": 56363979.0, "step": 22025 }, { "entropy": 0.043447258742526175, "epoch": 5.135213894393286, "grad_norm": 0.48046875, "learning_rate": 4.9338144736934305e-05, "loss": 0.0217, "mean_token_accuracy": 0.9914530575275421, "num_tokens": 56391611.0, "step": 22030 }, { "entropy": 0.057629916444420816, "epoch": 5.136379531413918, "grad_norm": 3.3125, "learning_rate": 4.9337654547238064e-05, "loss": 0.0464, "mean_token_accuracy": 0.9802502512931823, "num_tokens": 56411985.0, "step": 22035 }, { "entropy": 0.061352443508803846, "epoch": 5.137545168434549, "grad_norm": 3.9375, "learning_rate": 4.933716418102459e-05, "loss": 0.0643, "mean_token_accuracy": 0.98461874127388, "num_tokens": 56434340.0, "step": 22040 }, { "entropy": 0.07554442938417197, "epoch": 5.138710805455181, "grad_norm": 0.625, "learning_rate": 4.9336673638301184e-05, "loss": 0.0761, "mean_token_accuracy": 0.974240243434906, "num_tokens": 56452406.0, "step": 22045 }, { "entropy": 0.049168387427926064, "epoch": 5.139876442475813, "grad_norm": 1.09375, "learning_rate": 4.9336182919075174e-05, "loss": 0.0452, "mean_token_accuracy": 0.9868712604045868, "num_tokens": 56473667.0, "step": 22050 }, { "entropy": 0.0560465251095593, "epoch": 5.141042079496445, "grad_norm": 2.140625, "learning_rate": 4.933569202335388e-05, "loss": 0.0573, "mean_token_accuracy": 0.9840469479560852, "num_tokens": 56490114.0, "step": 22055 }, { "entropy": 0.04852314190939069, "epoch": 5.142207716517077, "grad_norm": 0.30859375, "learning_rate": 4.9335200951144614e-05, "loss": 0.031, "mean_token_accuracy": 0.9890325367450714, "num_tokens": 56525717.0, "step": 22060 }, { "entropy": 0.054032295290380714, "epoch": 5.143373353537708, "grad_norm": 3.140625, "learning_rate": 4.9334709702454694e-05, "loss": 0.0533, "mean_token_accuracy": 0.9830582797527313, "num_tokens": 56546030.0, "step": 22065 }, { "entropy": 0.05986177334561944, "epoch": 5.14453899055834, "grad_norm": 1.84375, "learning_rate": 4.9334218277291464e-05, "loss": 0.0437, "mean_token_accuracy": 0.9865305006504059, "num_tokens": 56565431.0, "step": 22070 }, { "entropy": 0.062705637793988, "epoch": 5.145704627578972, "grad_norm": 0.31640625, "learning_rate": 4.933372667566223e-05, "loss": 0.0362, "mean_token_accuracy": 0.9859632730484009, "num_tokens": 56593182.0, "step": 22075 }, { "entropy": 0.06045402865856886, "epoch": 5.146870264599603, "grad_norm": 0.2021484375, "learning_rate": 4.933323489757433e-05, "loss": 0.0598, "mean_token_accuracy": 0.9816952824592591, "num_tokens": 56611334.0, "step": 22080 }, { "entropy": 0.06225607329979539, "epoch": 5.1480359016202355, "grad_norm": 0.42578125, "learning_rate": 4.933274294303511e-05, "loss": 0.0728, "mean_token_accuracy": 0.982220607995987, "num_tokens": 56626533.0, "step": 22085 }, { "entropy": 0.08001007661223411, "epoch": 5.149201538640868, "grad_norm": 0.71484375, "learning_rate": 4.933225081205189e-05, "loss": 0.0868, "mean_token_accuracy": 0.9747688889503479, "num_tokens": 56644744.0, "step": 22090 }, { "entropy": 0.0839627580717206, "epoch": 5.150367175661499, "grad_norm": 3.1875, "learning_rate": 4.9331758504632014e-05, "loss": 0.054, "mean_token_accuracy": 0.9822079181671143, "num_tokens": 56663768.0, "step": 22095 }, { "entropy": 0.06215059943497181, "epoch": 5.151532812682131, "grad_norm": 2.015625, "learning_rate": 4.933126602078282e-05, "loss": 0.0696, "mean_token_accuracy": 0.9818137347698211, "num_tokens": 56675332.0, "step": 22100 }, { "entropy": 0.04475753773003817, "epoch": 5.152698449702762, "grad_norm": 2.6875, "learning_rate": 4.9330773360511654e-05, "loss": 0.0283, "mean_token_accuracy": 0.990894901752472, "num_tokens": 56708402.0, "step": 22105 }, { "entropy": 0.07335485322400928, "epoch": 5.153864086723394, "grad_norm": 2.59375, "learning_rate": 4.9330280523825866e-05, "loss": 0.0796, "mean_token_accuracy": 0.9783159673213959, "num_tokens": 56725192.0, "step": 22110 }, { "entropy": 0.06051680203527212, "epoch": 5.155029723744026, "grad_norm": 1.3515625, "learning_rate": 4.932978751073281e-05, "loss": 0.0596, "mean_token_accuracy": 0.9841292977333069, "num_tokens": 56747636.0, "step": 22115 }, { "entropy": 0.059200151823461056, "epoch": 5.156195360764658, "grad_norm": 0.296875, "learning_rate": 4.932929432123982e-05, "loss": 0.0475, "mean_token_accuracy": 0.9841175198554992, "num_tokens": 56779254.0, "step": 22120 }, { "entropy": 0.06318260245025158, "epoch": 5.15736099778529, "grad_norm": 0.24609375, "learning_rate": 4.932880095535425e-05, "loss": 0.0533, "mean_token_accuracy": 0.9854140460491181, "num_tokens": 56800037.0, "step": 22125 }, { "entropy": 0.04333141590468585, "epoch": 5.158526634805922, "grad_norm": 0.17578125, "learning_rate": 4.932830741308348e-05, "loss": 0.0252, "mean_token_accuracy": 0.9886651396751404, "num_tokens": 56831723.0, "step": 22130 }, { "entropy": 0.07760660853236914, "epoch": 5.159692271826553, "grad_norm": 1.2890625, "learning_rate": 4.9327813694434854e-05, "loss": 0.0732, "mean_token_accuracy": 0.9771431624889374, "num_tokens": 56846308.0, "step": 22135 }, { "entropy": 0.04463453097268939, "epoch": 5.160857908847185, "grad_norm": 0.23046875, "learning_rate": 4.932731979941573e-05, "loss": 0.0455, "mean_token_accuracy": 0.9894105195999146, "num_tokens": 56874076.0, "step": 22140 }, { "entropy": 0.05769450534135103, "epoch": 5.162023545867816, "grad_norm": 0.51953125, "learning_rate": 4.9326825728033483e-05, "loss": 0.0588, "mean_token_accuracy": 0.980986213684082, "num_tokens": 56888664.0, "step": 22145 }, { "entropy": 0.18658456727862358, "epoch": 5.163189182888448, "grad_norm": 2.6875, "learning_rate": 4.932633148029547e-05, "loss": 0.3163, "mean_token_accuracy": 0.9311531364917756, "num_tokens": 56916775.0, "step": 22150 }, { "entropy": 0.05970644308254123, "epoch": 5.1643548199090805, "grad_norm": 1.8203125, "learning_rate": 4.932583705620908e-05, "loss": 0.0571, "mean_token_accuracy": 0.9825261533260345, "num_tokens": 56933407.0, "step": 22155 }, { "entropy": 0.05941506642848253, "epoch": 5.165520456929712, "grad_norm": 0.68359375, "learning_rate": 4.932534245578166e-05, "loss": 0.0497, "mean_token_accuracy": 0.9831038832664489, "num_tokens": 56953493.0, "step": 22160 }, { "entropy": 0.07277057701721787, "epoch": 5.166686093950344, "grad_norm": 1.109375, "learning_rate": 4.93248476790206e-05, "loss": 0.0711, "mean_token_accuracy": 0.9815229892730712, "num_tokens": 56969466.0, "step": 22165 }, { "entropy": 0.06678186506032943, "epoch": 5.167851730970976, "grad_norm": 4.0625, "learning_rate": 4.9324352725933284e-05, "loss": 0.069, "mean_token_accuracy": 0.9849565625190735, "num_tokens": 56980588.0, "step": 22170 }, { "entropy": 0.06671747919172048, "epoch": 5.169017367991607, "grad_norm": 1.703125, "learning_rate": 4.932385759652707e-05, "loss": 0.0443, "mean_token_accuracy": 0.9861550271511078, "num_tokens": 57008493.0, "step": 22175 }, { "entropy": 0.07351988535374403, "epoch": 5.170183005012239, "grad_norm": 0.3359375, "learning_rate": 4.932336229080937e-05, "loss": 0.0636, "mean_token_accuracy": 0.9827918767929077, "num_tokens": 57029159.0, "step": 22180 }, { "entropy": 0.04658596199005842, "epoch": 5.171348642032871, "grad_norm": 2.046875, "learning_rate": 4.932286680878754e-05, "loss": 0.0289, "mean_token_accuracy": 0.9881357610225677, "num_tokens": 57065409.0, "step": 22185 }, { "entropy": 0.05278063863515854, "epoch": 5.172514279053503, "grad_norm": 0.365234375, "learning_rate": 4.9322371150468994e-05, "loss": 0.033, "mean_token_accuracy": 0.9894371032714844, "num_tokens": 57094324.0, "step": 22190 }, { "entropy": 0.06439951695501804, "epoch": 5.173679916074135, "grad_norm": 2.125, "learning_rate": 4.932187531586111e-05, "loss": 0.0621, "mean_token_accuracy": 0.980733460187912, "num_tokens": 57106651.0, "step": 22195 }, { "entropy": 0.059175492450594905, "epoch": 5.174845553094766, "grad_norm": 2.46875, "learning_rate": 4.932137930497128e-05, "loss": 0.0418, "mean_token_accuracy": 0.9881093919277191, "num_tokens": 57153428.0, "step": 22200 }, { "entropy": 0.05662951730191708, "epoch": 5.176011190115398, "grad_norm": 2.21875, "learning_rate": 4.932088311780691e-05, "loss": 0.0491, "mean_token_accuracy": 0.9842721045017242, "num_tokens": 57166150.0, "step": 22205 }, { "entropy": 0.06289763897657394, "epoch": 5.17717682713603, "grad_norm": 0.376953125, "learning_rate": 4.9320386754375394e-05, "loss": 0.0476, "mean_token_accuracy": 0.9786098062992096, "num_tokens": 57189967.0, "step": 22210 }, { "entropy": 0.07652818039059639, "epoch": 5.178342464156661, "grad_norm": 4.78125, "learning_rate": 4.931989021468413e-05, "loss": 0.0657, "mean_token_accuracy": 0.9819384813308716, "num_tokens": 57200537.0, "step": 22215 }, { "entropy": 0.06508733816444874, "epoch": 5.179508101177293, "grad_norm": 1.5390625, "learning_rate": 4.9319393498740516e-05, "loss": 0.053, "mean_token_accuracy": 0.9842216610908509, "num_tokens": 57211796.0, "step": 22220 }, { "entropy": 0.06640651468187571, "epoch": 5.1806737381979255, "grad_norm": 0.431640625, "learning_rate": 4.931889660655198e-05, "loss": 0.0582, "mean_token_accuracy": 0.9838066399097443, "num_tokens": 57231428.0, "step": 22225 }, { "entropy": 0.06103291492909193, "epoch": 5.181839375218557, "grad_norm": 3.78125, "learning_rate": 4.9318399538125916e-05, "loss": 0.0691, "mean_token_accuracy": 0.9828658163547516, "num_tokens": 57250840.0, "step": 22230 }, { "entropy": 0.07042492870241404, "epoch": 5.183005012239189, "grad_norm": 1.3359375, "learning_rate": 4.931790229346973e-05, "loss": 0.053, "mean_token_accuracy": 0.984697699546814, "num_tokens": 57273304.0, "step": 22235 }, { "entropy": 0.044906450994312765, "epoch": 5.18417064925982, "grad_norm": 0.341796875, "learning_rate": 4.931740487259085e-05, "loss": 0.0268, "mean_token_accuracy": 0.9909862637519836, "num_tokens": 57312085.0, "step": 22240 }, { "entropy": 0.06908139307051897, "epoch": 5.185336286280452, "grad_norm": 3.09375, "learning_rate": 4.93169072754967e-05, "loss": 0.0437, "mean_token_accuracy": 0.978874671459198, "num_tokens": 57345440.0, "step": 22245 }, { "entropy": 0.056862791441380976, "epoch": 5.186501923301084, "grad_norm": 1.578125, "learning_rate": 4.931640950219468e-05, "loss": 0.0577, "mean_token_accuracy": 0.9856437981128693, "num_tokens": 57372140.0, "step": 22250 }, { "entropy": 0.05833668913692236, "epoch": 5.1876675603217155, "grad_norm": 0.486328125, "learning_rate": 4.931591155269222e-05, "loss": 0.0491, "mean_token_accuracy": 0.9865939795970917, "num_tokens": 57391472.0, "step": 22255 }, { "entropy": 0.17140091247856618, "epoch": 5.188833197342348, "grad_norm": 1.1328125, "learning_rate": 4.931541342699675e-05, "loss": 0.2663, "mean_token_accuracy": 0.9569355547428131, "num_tokens": 57425631.0, "step": 22260 }, { "entropy": 0.2862912990152836, "epoch": 5.18999883436298, "grad_norm": 0.7109375, "learning_rate": 4.931491512511569e-05, "loss": 0.4858, "mean_token_accuracy": 0.9360194027423858, "num_tokens": 57453226.0, "step": 22265 }, { "entropy": 0.06808519745245575, "epoch": 5.191164471383611, "grad_norm": 1.3984375, "learning_rate": 4.9314416647056485e-05, "loss": 0.0803, "mean_token_accuracy": 0.9762260675430298, "num_tokens": 57468653.0, "step": 22270 }, { "entropy": 0.06890982948243618, "epoch": 5.192330108404243, "grad_norm": 0.93359375, "learning_rate": 4.9313917992826544e-05, "loss": 0.0558, "mean_token_accuracy": 0.9842790246009827, "num_tokens": 57482450.0, "step": 22275 }, { "entropy": 0.07730196844786405, "epoch": 5.193495745424874, "grad_norm": 4.375, "learning_rate": 4.9313419162433326e-05, "loss": 0.0841, "mean_token_accuracy": 0.9743519246578216, "num_tokens": 57493336.0, "step": 22280 }, { "entropy": 0.05996605232357979, "epoch": 5.194661382445506, "grad_norm": 1.140625, "learning_rate": 4.931292015588426e-05, "loss": 0.049, "mean_token_accuracy": 0.9876279890537262, "num_tokens": 57515055.0, "step": 22285 }, { "entropy": 0.05181253822520375, "epoch": 5.1958270194661385, "grad_norm": 0.490234375, "learning_rate": 4.931242097318678e-05, "loss": 0.0515, "mean_token_accuracy": 0.985153716802597, "num_tokens": 57545326.0, "step": 22290 }, { "entropy": 0.0514564954675734, "epoch": 5.19699265648677, "grad_norm": 2.078125, "learning_rate": 4.9311921614348343e-05, "loss": 0.0348, "mean_token_accuracy": 0.986234825849533, "num_tokens": 57569516.0, "step": 22295 }, { "entropy": 0.08702254965901375, "epoch": 5.198158293507402, "grad_norm": 5.84375, "learning_rate": 4.931142207937638e-05, "loss": 0.1145, "mean_token_accuracy": 0.9728547394275665, "num_tokens": 57577612.0, "step": 22300 }, { "entropy": 0.09497029753401875, "epoch": 5.199323930528034, "grad_norm": 0.40625, "learning_rate": 4.931092236827836e-05, "loss": 0.0684, "mean_token_accuracy": 0.9809476554393768, "num_tokens": 57588909.0, "step": 22305 }, { "entropy": 0.0746473042294383, "epoch": 5.200489567548665, "grad_norm": 2.5625, "learning_rate": 4.931042248106172e-05, "loss": 0.0562, "mean_token_accuracy": 0.9825978338718414, "num_tokens": 57606043.0, "step": 22310 }, { "entropy": 0.06316058505326509, "epoch": 5.201655204569297, "grad_norm": 2.796875, "learning_rate": 4.930992241773391e-05, "loss": 0.046, "mean_token_accuracy": 0.9862751066684723, "num_tokens": 57627499.0, "step": 22315 }, { "entropy": 0.08704807367175818, "epoch": 5.202820841589929, "grad_norm": 0.51953125, "learning_rate": 4.93094221783024e-05, "loss": 0.0776, "mean_token_accuracy": 0.9783826053142548, "num_tokens": 57642818.0, "step": 22320 }, { "entropy": 0.06668164748698473, "epoch": 5.2039864786105605, "grad_norm": 2.625, "learning_rate": 4.930892176277464e-05, "loss": 0.0598, "mean_token_accuracy": 0.9841929137706756, "num_tokens": 57658432.0, "step": 22325 }, { "entropy": 0.054752088710665704, "epoch": 5.205152115631193, "grad_norm": 0.640625, "learning_rate": 4.93084211711581e-05, "loss": 0.0473, "mean_token_accuracy": 0.9867014527320862, "num_tokens": 57682037.0, "step": 22330 }, { "entropy": 0.06804192513227462, "epoch": 5.206317752651824, "grad_norm": 2.390625, "learning_rate": 4.9307920403460235e-05, "loss": 0.0617, "mean_token_accuracy": 0.9819234848022461, "num_tokens": 57693989.0, "step": 22335 }, { "entropy": 0.06305139996111393, "epoch": 5.207483389672456, "grad_norm": 2.8125, "learning_rate": 4.9307419459688514e-05, "loss": 0.0603, "mean_token_accuracy": 0.9847787320613861, "num_tokens": 57717654.0, "step": 22340 }, { "entropy": 0.08745529353618622, "epoch": 5.208649026693088, "grad_norm": 0.2021484375, "learning_rate": 4.930691833985042e-05, "loss": 0.0526, "mean_token_accuracy": 0.9825003862380981, "num_tokens": 57738130.0, "step": 22345 }, { "entropy": 0.0616550050675869, "epoch": 5.209814663713719, "grad_norm": 0.2470703125, "learning_rate": 4.93064170439534e-05, "loss": 0.0402, "mean_token_accuracy": 0.9874732315540313, "num_tokens": 57756033.0, "step": 22350 }, { "entropy": 0.07887524664402008, "epoch": 5.210980300734351, "grad_norm": 1.546875, "learning_rate": 4.930591557200496e-05, "loss": 0.0592, "mean_token_accuracy": 0.9847594499588013, "num_tokens": 57766455.0, "step": 22355 }, { "entropy": 0.07828300539404154, "epoch": 5.2121459377549835, "grad_norm": 0.890625, "learning_rate": 4.930541392401255e-05, "loss": 0.043, "mean_token_accuracy": 0.9833212852478027, "num_tokens": 57788545.0, "step": 22360 }, { "entropy": 0.06497166268527507, "epoch": 5.213311574775615, "grad_norm": 0.8515625, "learning_rate": 4.930491209998366e-05, "loss": 0.052, "mean_token_accuracy": 0.9866826653480529, "num_tokens": 57813550.0, "step": 22365 }, { "entropy": 0.07942379545420408, "epoch": 5.214477211796247, "grad_norm": 3.34375, "learning_rate": 4.930441009992578e-05, "loss": 0.0693, "mean_token_accuracy": 0.9833049654960633, "num_tokens": 57832481.0, "step": 22370 }, { "entropy": 0.07472335118800402, "epoch": 5.215642848816878, "grad_norm": 1.7265625, "learning_rate": 4.9303907923846394e-05, "loss": 0.0856, "mean_token_accuracy": 0.9751461148262024, "num_tokens": 57843819.0, "step": 22375 }, { "entropy": 0.0662139642983675, "epoch": 5.21680848583751, "grad_norm": 2.328125, "learning_rate": 4.930340557175298e-05, "loss": 0.0667, "mean_token_accuracy": 0.9782337188720703, "num_tokens": 57859104.0, "step": 22380 }, { "entropy": 0.05948888058774173, "epoch": 5.217974122858142, "grad_norm": 0.396484375, "learning_rate": 4.930290304365304e-05, "loss": 0.0567, "mean_token_accuracy": 0.9833056330680847, "num_tokens": 57880523.0, "step": 22385 }, { "entropy": 0.08422890277579427, "epoch": 5.219139759878773, "grad_norm": 3.265625, "learning_rate": 4.9302400339554066e-05, "loss": 0.0725, "mean_token_accuracy": 0.9808120965957642, "num_tokens": 57898052.0, "step": 22390 }, { "entropy": 0.06969092637300492, "epoch": 5.2203053968994055, "grad_norm": 3.453125, "learning_rate": 4.930189745946355e-05, "loss": 0.0783, "mean_token_accuracy": 0.9814900815486908, "num_tokens": 57910841.0, "step": 22395 }, { "entropy": 0.05859537925571203, "epoch": 5.221471033920038, "grad_norm": 0.212890625, "learning_rate": 4.9301394403388984e-05, "loss": 0.0469, "mean_token_accuracy": 0.9872027635574341, "num_tokens": 57933764.0, "step": 22400 }, { "entropy": 0.050899960659444334, "epoch": 5.222636670940669, "grad_norm": 1.25, "learning_rate": 4.930089117133788e-05, "loss": 0.0283, "mean_token_accuracy": 0.9913576006889343, "num_tokens": 57972262.0, "step": 22405 }, { "entropy": 0.08815276809036732, "epoch": 5.223802307961301, "grad_norm": 2.5, "learning_rate": 4.9300387763317745e-05, "loss": 0.0823, "mean_token_accuracy": 0.9758888900279998, "num_tokens": 57980445.0, "step": 22410 }, { "entropy": 0.07962552271783352, "epoch": 5.224967944981932, "grad_norm": 0.31640625, "learning_rate": 4.9299884179336074e-05, "loss": 0.048, "mean_token_accuracy": 0.9821713328361511, "num_tokens": 57995875.0, "step": 22415 }, { "entropy": 0.06803807709366083, "epoch": 5.226133582002564, "grad_norm": 1.5859375, "learning_rate": 4.9299380419400384e-05, "loss": 0.0664, "mean_token_accuracy": 0.9840512633323669, "num_tokens": 58007268.0, "step": 22420 }, { "entropy": 0.08102755229920149, "epoch": 5.227299219023196, "grad_norm": 0.85546875, "learning_rate": 4.9298876483518185e-05, "loss": 0.068, "mean_token_accuracy": 0.9746978640556335, "num_tokens": 58026770.0, "step": 22425 }, { "entropy": 0.058991825021803376, "epoch": 5.228464856043828, "grad_norm": 1.1171875, "learning_rate": 4.9298372371696996e-05, "loss": 0.0457, "mean_token_accuracy": 0.9835934102535248, "num_tokens": 58047531.0, "step": 22430 }, { "entropy": 0.07415626281872392, "epoch": 5.22963049306446, "grad_norm": 3.53125, "learning_rate": 4.929786808394432e-05, "loss": 0.0326, "mean_token_accuracy": 0.9845768630504608, "num_tokens": 58078331.0, "step": 22435 }, { "entropy": 0.06825162693858147, "epoch": 5.230796130085092, "grad_norm": 2.15625, "learning_rate": 4.929736362026769e-05, "loss": 0.0658, "mean_token_accuracy": 0.9820273876190185, "num_tokens": 58090377.0, "step": 22440 }, { "entropy": 0.0566140066832304, "epoch": 5.231961767105723, "grad_norm": 0.423828125, "learning_rate": 4.9296858980674625e-05, "loss": 0.0568, "mean_token_accuracy": 0.9854518592357635, "num_tokens": 58102852.0, "step": 22445 }, { "entropy": 0.0516730266623199, "epoch": 5.233127404126355, "grad_norm": 1.34375, "learning_rate": 4.929635416517265e-05, "loss": 0.0535, "mean_token_accuracy": 0.9852344691753387, "num_tokens": 58128816.0, "step": 22450 }, { "entropy": 0.058171987719833854, "epoch": 5.234293041146987, "grad_norm": 1.03125, "learning_rate": 4.9295849173769294e-05, "loss": 0.0562, "mean_token_accuracy": 0.9863945484161377, "num_tokens": 58149787.0, "step": 22455 }, { "entropy": 0.054046990163624284, "epoch": 5.2354586781676185, "grad_norm": 1.1015625, "learning_rate": 4.929534400647208e-05, "loss": 0.0463, "mean_token_accuracy": 0.9873308002948761, "num_tokens": 58169062.0, "step": 22460 }, { "entropy": 0.05721415225416422, "epoch": 5.2366243151882506, "grad_norm": 3.015625, "learning_rate": 4.929483866328855e-05, "loss": 0.0374, "mean_token_accuracy": 0.9847243249416351, "num_tokens": 58184895.0, "step": 22465 }, { "entropy": 0.07032016962766648, "epoch": 5.237789952208882, "grad_norm": 4.0, "learning_rate": 4.929433314422622e-05, "loss": 0.0711, "mean_token_accuracy": 0.9818163990974427, "num_tokens": 58199417.0, "step": 22470 }, { "entropy": 0.0443870535120368, "epoch": 5.238955589229514, "grad_norm": 0.97265625, "learning_rate": 4.929382744929266e-05, "loss": 0.0304, "mean_token_accuracy": 0.9896786749362946, "num_tokens": 58218118.0, "step": 22475 }, { "entropy": 0.07258482100442051, "epoch": 5.240121226250146, "grad_norm": 1.8515625, "learning_rate": 4.929332157849539e-05, "loss": 0.0535, "mean_token_accuracy": 0.984838330745697, "num_tokens": 58236418.0, "step": 22480 }, { "entropy": 0.053557580430060626, "epoch": 5.241286863270777, "grad_norm": 0.408203125, "learning_rate": 4.929281553184195e-05, "loss": 0.0396, "mean_token_accuracy": 0.9851312339305878, "num_tokens": 58263109.0, "step": 22485 }, { "entropy": 0.04420396983623505, "epoch": 5.242452500291409, "grad_norm": 0.1337890625, "learning_rate": 4.92923093093399e-05, "loss": 0.0157, "mean_token_accuracy": 0.9877875030040741, "num_tokens": 58316098.0, "step": 22490 }, { "entropy": 0.06833171583712101, "epoch": 5.243618137312041, "grad_norm": 4.21875, "learning_rate": 4.929180291099678e-05, "loss": 0.0734, "mean_token_accuracy": 0.9797233819961548, "num_tokens": 58329411.0, "step": 22495 }, { "entropy": 0.06508544906973839, "epoch": 5.244783774332673, "grad_norm": 0.7578125, "learning_rate": 4.929129633682015e-05, "loss": 0.0473, "mean_token_accuracy": 0.9827897489070893, "num_tokens": 58361368.0, "step": 22500 }, { "entropy": 0.05918069491162896, "epoch": 5.245949411353305, "grad_norm": 0.296875, "learning_rate": 4.9290789586817544e-05, "loss": 0.038, "mean_token_accuracy": 0.982359778881073, "num_tokens": 58391277.0, "step": 22505 }, { "entropy": 0.051390673080459234, "epoch": 5.247115048373936, "grad_norm": 0.1689453125, "learning_rate": 4.9290282660996535e-05, "loss": 0.0292, "mean_token_accuracy": 0.987204658985138, "num_tokens": 58446856.0, "step": 22510 }, { "entropy": 0.0984171137213707, "epoch": 5.248280685394568, "grad_norm": 2.578125, "learning_rate": 4.9289775559364684e-05, "loss": 0.0913, "mean_token_accuracy": 0.9747403621673584, "num_tokens": 58455680.0, "step": 22515 }, { "entropy": 0.05602652542293072, "epoch": 5.2494463224152, "grad_norm": 1.890625, "learning_rate": 4.928926828192954e-05, "loss": 0.0699, "mean_token_accuracy": 0.9806085467338562, "num_tokens": 58480807.0, "step": 22520 }, { "entropy": 0.07297561299055814, "epoch": 5.250611959435831, "grad_norm": 0.8046875, "learning_rate": 4.9288760828698674e-05, "loss": 0.0684, "mean_token_accuracy": 0.9783793568611145, "num_tokens": 58502349.0, "step": 22525 }, { "entropy": 0.07672806866467, "epoch": 5.2517775964564635, "grad_norm": 1.9453125, "learning_rate": 4.9288253199679654e-05, "loss": 0.0867, "mean_token_accuracy": 0.9797773718833923, "num_tokens": 58522445.0, "step": 22530 }, { "entropy": 0.06881959196180105, "epoch": 5.252943233477096, "grad_norm": 1.125, "learning_rate": 4.928774539488005e-05, "loss": 0.053, "mean_token_accuracy": 0.9879900455474854, "num_tokens": 58542959.0, "step": 22535 }, { "entropy": 0.07772445082664489, "epoch": 5.254108870497727, "grad_norm": 1.015625, "learning_rate": 4.928723741430743e-05, "loss": 0.0797, "mean_token_accuracy": 0.9801374554634095, "num_tokens": 58553910.0, "step": 22540 }, { "entropy": 0.06708589978516102, "epoch": 5.255274507518359, "grad_norm": 0.81640625, "learning_rate": 4.928672925796937e-05, "loss": 0.0683, "mean_token_accuracy": 0.981471985578537, "num_tokens": 58567114.0, "step": 22545 }, { "entropy": 0.059767977148294446, "epoch": 5.25644014453899, "grad_norm": 0.7734375, "learning_rate": 4.9286220925873457e-05, "loss": 0.0509, "mean_token_accuracy": 0.9889122486114502, "num_tokens": 58583769.0, "step": 22550 }, { "entropy": 0.06370611824095249, "epoch": 5.257605781559622, "grad_norm": 1.8984375, "learning_rate": 4.9285712418027254e-05, "loss": 0.0852, "mean_token_accuracy": 0.9790844082832336, "num_tokens": 58594274.0, "step": 22555 }, { "entropy": 0.08938689008355141, "epoch": 5.258771418580254, "grad_norm": 3.890625, "learning_rate": 4.928520373443836e-05, "loss": 0.0872, "mean_token_accuracy": 0.9796117067337036, "num_tokens": 58602989.0, "step": 22560 }, { "entropy": 0.0646333851851523, "epoch": 5.2599370556008855, "grad_norm": 0.73828125, "learning_rate": 4.928469487511434e-05, "loss": 0.0503, "mean_token_accuracy": 0.9790112435817718, "num_tokens": 58622536.0, "step": 22565 }, { "entropy": 0.06210710275918245, "epoch": 5.261102692621518, "grad_norm": 0.515625, "learning_rate": 4.9284185840062805e-05, "loss": 0.0313, "mean_token_accuracy": 0.9861810505390167, "num_tokens": 58647178.0, "step": 22570 }, { "entropy": 0.07191750202327966, "epoch": 5.26226832964215, "grad_norm": 4.03125, "learning_rate": 4.928367662929133e-05, "loss": 0.0743, "mean_token_accuracy": 0.9779257357120514, "num_tokens": 58660942.0, "step": 22575 }, { "entropy": 0.07680397816002368, "epoch": 5.263433966662781, "grad_norm": 2.265625, "learning_rate": 4.928316724280751e-05, "loss": 0.0628, "mean_token_accuracy": 0.9832376003265381, "num_tokens": 58679318.0, "step": 22580 }, { "entropy": 0.04947316385805607, "epoch": 5.264599603683413, "grad_norm": 0.373046875, "learning_rate": 4.928265768061895e-05, "loss": 0.0281, "mean_token_accuracy": 0.9875920355319977, "num_tokens": 58706921.0, "step": 22585 }, { "entropy": 0.06746490080840886, "epoch": 5.265765240704045, "grad_norm": 1.9375, "learning_rate": 4.928214794273324e-05, "loss": 0.044, "mean_token_accuracy": 0.9858042418956756, "num_tokens": 58731640.0, "step": 22590 }, { "entropy": 0.06307427566498518, "epoch": 5.266930877724676, "grad_norm": 0.388671875, "learning_rate": 4.928163802915798e-05, "loss": 0.042, "mean_token_accuracy": 0.9826907038688659, "num_tokens": 58753369.0, "step": 22595 }, { "entropy": 0.05262499302625656, "epoch": 5.2680965147453085, "grad_norm": 0.609375, "learning_rate": 4.928112793990078e-05, "loss": 0.0234, "mean_token_accuracy": 0.9888963580131531, "num_tokens": 58793583.0, "step": 22600 }, { "entropy": 0.06587472511455417, "epoch": 5.26926215176594, "grad_norm": 0.62109375, "learning_rate": 4.928061767496924e-05, "loss": 0.0519, "mean_token_accuracy": 0.9828464448451996, "num_tokens": 58823115.0, "step": 22605 }, { "entropy": 0.06749656610190868, "epoch": 5.270427788786572, "grad_norm": 2.78125, "learning_rate": 4.928010723437097e-05, "loss": 0.0627, "mean_token_accuracy": 0.9848908543586731, "num_tokens": 58838981.0, "step": 22610 }, { "entropy": 0.06774584236554801, "epoch": 5.271593425807204, "grad_norm": 3.125, "learning_rate": 4.927959661811359e-05, "loss": 0.0602, "mean_token_accuracy": 0.9852911114692688, "num_tokens": 58862052.0, "step": 22615 }, { "entropy": 0.06854531690478324, "epoch": 5.272759062827835, "grad_norm": 0.94921875, "learning_rate": 4.92790858262047e-05, "loss": 0.0544, "mean_token_accuracy": 0.9801726341247559, "num_tokens": 58886661.0, "step": 22620 }, { "entropy": 0.058332338370382784, "epoch": 5.273924699848467, "grad_norm": 2.375, "learning_rate": 4.9278574858651923e-05, "loss": 0.0617, "mean_token_accuracy": 0.9846881628036499, "num_tokens": 58899831.0, "step": 22625 }, { "entropy": 0.06559796500951051, "epoch": 5.275090336869099, "grad_norm": 0.97265625, "learning_rate": 4.927806371546288e-05, "loss": 0.0361, "mean_token_accuracy": 0.9890841841697693, "num_tokens": 58927345.0, "step": 22630 }, { "entropy": 0.06151863979175687, "epoch": 5.2762559738897306, "grad_norm": 4.15625, "learning_rate": 4.927755239664519e-05, "loss": 0.0494, "mean_token_accuracy": 0.985187166929245, "num_tokens": 58952386.0, "step": 22635 }, { "entropy": 0.0975575815886259, "epoch": 5.277421610910363, "grad_norm": 0.984375, "learning_rate": 4.927704090220649e-05, "loss": 0.1269, "mean_token_accuracy": 0.9716195166110992, "num_tokens": 58972365.0, "step": 22640 }, { "entropy": 0.06953037157654762, "epoch": 5.278587247930994, "grad_norm": 1.484375, "learning_rate": 4.927652923215439e-05, "loss": 0.0553, "mean_token_accuracy": 0.9839057862758637, "num_tokens": 58982805.0, "step": 22645 }, { "entropy": 0.051419579051434995, "epoch": 5.279752884951626, "grad_norm": 2.234375, "learning_rate": 4.927601738649652e-05, "loss": 0.0405, "mean_token_accuracy": 0.9825731217861176, "num_tokens": 59008791.0, "step": 22650 }, { "entropy": 0.05781197277829051, "epoch": 5.280918521972258, "grad_norm": 0.4921875, "learning_rate": 4.9275505365240525e-05, "loss": 0.0301, "mean_token_accuracy": 0.9908199787139893, "num_tokens": 59039802.0, "step": 22655 }, { "entropy": 0.1533332671970129, "epoch": 5.282084158992889, "grad_norm": 2.828125, "learning_rate": 4.927499316839403e-05, "loss": 0.213, "mean_token_accuracy": 0.9515707314014434, "num_tokens": 59063712.0, "step": 22660 }, { "entropy": 0.156291969679296, "epoch": 5.283249796013521, "grad_norm": 0.46484375, "learning_rate": 4.927448079596468e-05, "loss": 0.2686, "mean_token_accuracy": 0.943569415807724, "num_tokens": 59085381.0, "step": 22665 }, { "entropy": 0.08054910823702813, "epoch": 5.2844154330341535, "grad_norm": 3.484375, "learning_rate": 4.927396824796011e-05, "loss": 0.0653, "mean_token_accuracy": 0.9849894046783447, "num_tokens": 59096600.0, "step": 22670 }, { "entropy": 0.04917644914239645, "epoch": 5.285581070054785, "grad_norm": 1.0546875, "learning_rate": 4.9273455524387966e-05, "loss": 0.0503, "mean_token_accuracy": 0.9843254506587982, "num_tokens": 59119465.0, "step": 22675 }, { "entropy": 0.041685186047106984, "epoch": 5.286746707075417, "grad_norm": 0.69140625, "learning_rate": 4.927294262525589e-05, "loss": 0.042, "mean_token_accuracy": 0.984868735074997, "num_tokens": 59146252.0, "step": 22680 }, { "entropy": 0.04622435262426734, "epoch": 5.287912344096048, "grad_norm": 1.3828125, "learning_rate": 4.9272429550571536e-05, "loss": 0.0284, "mean_token_accuracy": 0.987972092628479, "num_tokens": 59171522.0, "step": 22685 }, { "entropy": 0.07690389379858971, "epoch": 5.28907798111668, "grad_norm": 1.78125, "learning_rate": 4.9271916300342544e-05, "loss": 0.0789, "mean_token_accuracy": 0.9799573600292206, "num_tokens": 59185612.0, "step": 22690 }, { "entropy": 0.07589298877865076, "epoch": 5.290243618137312, "grad_norm": 1.515625, "learning_rate": 4.927140287457658e-05, "loss": 0.0701, "mean_token_accuracy": 0.9791806638240814, "num_tokens": 59195994.0, "step": 22695 }, { "entropy": 0.06374965868890285, "epoch": 5.2914092551579435, "grad_norm": 1.40625, "learning_rate": 4.927088927328129e-05, "loss": 0.0695, "mean_token_accuracy": 0.9842319011688232, "num_tokens": 59220450.0, "step": 22700 }, { "entropy": 0.08326494041830301, "epoch": 5.292574892178576, "grad_norm": 1.1328125, "learning_rate": 4.927037549646434e-05, "loss": 0.0613, "mean_token_accuracy": 0.9815594732761384, "num_tokens": 59237551.0, "step": 22705 }, { "entropy": 0.053751620929688215, "epoch": 5.293740529199208, "grad_norm": 0.45703125, "learning_rate": 4.926986154413338e-05, "loss": 0.0348, "mean_token_accuracy": 0.9854931950569152, "num_tokens": 59267330.0, "step": 22710 }, { "entropy": 0.076590671017766, "epoch": 5.294906166219839, "grad_norm": 3.375, "learning_rate": 4.926934741629609e-05, "loss": 0.0948, "mean_token_accuracy": 0.977790892124176, "num_tokens": 59279092.0, "step": 22715 }, { "entropy": 0.0669423419982195, "epoch": 5.296071803240471, "grad_norm": 0.455078125, "learning_rate": 4.9268833112960126e-05, "loss": 0.0762, "mean_token_accuracy": 0.9806538939476013, "num_tokens": 59290726.0, "step": 22720 }, { "entropy": 0.06741415970027446, "epoch": 5.297237440261103, "grad_norm": 1.7421875, "learning_rate": 4.926831863413316e-05, "loss": 0.0538, "mean_token_accuracy": 0.9821211993694305, "num_tokens": 59311371.0, "step": 22725 }, { "entropy": 0.0793554861098528, "epoch": 5.298403077281734, "grad_norm": 1.515625, "learning_rate": 4.9267803979822856e-05, "loss": 0.0552, "mean_token_accuracy": 0.9833518862724304, "num_tokens": 59333334.0, "step": 22730 }, { "entropy": 0.07032034313306212, "epoch": 5.299568714302366, "grad_norm": 0.6171875, "learning_rate": 4.92672891500369e-05, "loss": 0.0468, "mean_token_accuracy": 0.9863258957862854, "num_tokens": 59355525.0, "step": 22735 }, { "entropy": 0.06814271304756403, "epoch": 5.300734351322998, "grad_norm": 1.0625, "learning_rate": 4.9266774144782965e-05, "loss": 0.0625, "mean_token_accuracy": 0.9831117630004883, "num_tokens": 59368173.0, "step": 22740 }, { "entropy": 0.08565502576529979, "epoch": 5.30189998834363, "grad_norm": 0.65234375, "learning_rate": 4.926625896406873e-05, "loss": 0.0617, "mean_token_accuracy": 0.9830956637859345, "num_tokens": 59387615.0, "step": 22745 }, { "entropy": 0.07533656526356936, "epoch": 5.303065625364262, "grad_norm": 0.7578125, "learning_rate": 4.926574360790187e-05, "loss": 0.0657, "mean_token_accuracy": 0.9792934238910675, "num_tokens": 59406251.0, "step": 22750 }, { "entropy": 0.0985400104895234, "epoch": 5.304231262384893, "grad_norm": 0.9921875, "learning_rate": 4.926522807629008e-05, "loss": 0.1293, "mean_token_accuracy": 0.9710362374782562, "num_tokens": 59442852.0, "step": 22755 }, { "entropy": 0.05292136138305068, "epoch": 5.305396899405525, "grad_norm": 0.703125, "learning_rate": 4.9264712369241044e-05, "loss": 0.0253, "mean_token_accuracy": 0.9833535492420197, "num_tokens": 59483889.0, "step": 22760 }, { "entropy": 0.06867659520357847, "epoch": 5.306562536426157, "grad_norm": 1.0, "learning_rate": 4.926419648676245e-05, "loss": 0.0453, "mean_token_accuracy": 0.9813343346118927, "num_tokens": 59503823.0, "step": 22765 }, { "entropy": 0.05698272874578834, "epoch": 5.3077281734467885, "grad_norm": 0.419921875, "learning_rate": 4.926368042886199e-05, "loss": 0.0302, "mean_token_accuracy": 0.9861780762672424, "num_tokens": 59529127.0, "step": 22770 }, { "entropy": 0.0628203245345503, "epoch": 5.308893810467421, "grad_norm": 0.337890625, "learning_rate": 4.926316419554737e-05, "loss": 0.06, "mean_token_accuracy": 0.9819603025913238, "num_tokens": 59549039.0, "step": 22775 }, { "entropy": 0.08705004677176476, "epoch": 5.310059447488052, "grad_norm": 2.578125, "learning_rate": 4.926264778682627e-05, "loss": 0.105, "mean_token_accuracy": 0.9772071361541748, "num_tokens": 59556652.0, "step": 22780 }, { "entropy": 0.05819199327379465, "epoch": 5.311225084508684, "grad_norm": 1.3203125, "learning_rate": 4.9262131202706404e-05, "loss": 0.0728, "mean_token_accuracy": 0.9819917857646943, "num_tokens": 59567756.0, "step": 22785 }, { "entropy": 0.07961954735219479, "epoch": 5.312390721529316, "grad_norm": 1.671875, "learning_rate": 4.926161444319547e-05, "loss": 0.0685, "mean_token_accuracy": 0.98085697889328, "num_tokens": 59578826.0, "step": 22790 }, { "entropy": 0.08680852949619293, "epoch": 5.313556358549947, "grad_norm": 2.75, "learning_rate": 4.926109750830117e-05, "loss": 0.0905, "mean_token_accuracy": 0.9772542119026184, "num_tokens": 59587586.0, "step": 22795 }, { "entropy": 0.08079705536365508, "epoch": 5.314721995570579, "grad_norm": 2.171875, "learning_rate": 4.9260580398031217e-05, "loss": 0.054, "mean_token_accuracy": 0.9830148041248321, "num_tokens": 59599512.0, "step": 22800 }, { "entropy": 0.0764385698363185, "epoch": 5.315887632591211, "grad_norm": 1.9453125, "learning_rate": 4.926006311239333e-05, "loss": 0.0765, "mean_token_accuracy": 0.9792965352535248, "num_tokens": 59617321.0, "step": 22805 }, { "entropy": 0.05908251665532589, "epoch": 5.317053269611843, "grad_norm": 1.203125, "learning_rate": 4.9259545651395206e-05, "loss": 0.0499, "mean_token_accuracy": 0.9890253007411957, "num_tokens": 59637540.0, "step": 22810 }, { "entropy": 0.10033303014934063, "epoch": 5.318218906632475, "grad_norm": 2.296875, "learning_rate": 4.925902801504457e-05, "loss": 0.0497, "mean_token_accuracy": 0.9801102578639984, "num_tokens": 59649451.0, "step": 22815 }, { "entropy": 0.07409988492727279, "epoch": 5.319384543653106, "grad_norm": 2.203125, "learning_rate": 4.925851020334914e-05, "loss": 0.0672, "mean_token_accuracy": 0.9834694027900696, "num_tokens": 59671666.0, "step": 22820 }, { "entropy": 0.05841764649376273, "epoch": 5.320550180673738, "grad_norm": 3.078125, "learning_rate": 4.925799221631664e-05, "loss": 0.0625, "mean_token_accuracy": 0.9858494102954865, "num_tokens": 59689708.0, "step": 22825 }, { "entropy": 0.07404088694602251, "epoch": 5.32171581769437, "grad_norm": 1.3984375, "learning_rate": 4.925747405395479e-05, "loss": 0.0631, "mean_token_accuracy": 0.9827221155166626, "num_tokens": 59701112.0, "step": 22830 }, { "entropy": 0.07440090011805296, "epoch": 5.322881454715001, "grad_norm": 3.109375, "learning_rate": 4.925695571627131e-05, "loss": 0.0681, "mean_token_accuracy": 0.9816492676734925, "num_tokens": 59712390.0, "step": 22835 }, { "entropy": 0.09319182969629765, "epoch": 5.3240470917356335, "grad_norm": 6.5, "learning_rate": 4.925643720327395e-05, "loss": 0.0705, "mean_token_accuracy": 0.9809212386608124, "num_tokens": 59730225.0, "step": 22840 }, { "entropy": 0.06552286930382252, "epoch": 5.325212728756266, "grad_norm": 2.09375, "learning_rate": 4.9255918514970424e-05, "loss": 0.0519, "mean_token_accuracy": 0.9848517298698425, "num_tokens": 59747533.0, "step": 22845 }, { "entropy": 0.06686258316040039, "epoch": 5.326378365776897, "grad_norm": 3.609375, "learning_rate": 4.9255399651368465e-05, "loss": 0.0496, "mean_token_accuracy": 0.9866720378398895, "num_tokens": 59772803.0, "step": 22850 }, { "entropy": 0.08167769797146321, "epoch": 5.327544002797529, "grad_norm": 0.62890625, "learning_rate": 4.9254880612475816e-05, "loss": 0.0716, "mean_token_accuracy": 0.9794778764247895, "num_tokens": 59786475.0, "step": 22855 }, { "entropy": 0.06763834794983267, "epoch": 5.328709639818161, "grad_norm": 0.3203125, "learning_rate": 4.925436139830022e-05, "loss": 0.0275, "mean_token_accuracy": 0.9890903234481812, "num_tokens": 59815060.0, "step": 22860 }, { "entropy": 0.04429660914465785, "epoch": 5.329875276838792, "grad_norm": 1.328125, "learning_rate": 4.925384200884942e-05, "loss": 0.0434, "mean_token_accuracy": 0.9886953115463257, "num_tokens": 59855529.0, "step": 22865 }, { "entropy": 0.06582138538360596, "epoch": 5.331040913859424, "grad_norm": 1.5078125, "learning_rate": 4.925332244413115e-05, "loss": 0.0621, "mean_token_accuracy": 0.981554490327835, "num_tokens": 59869039.0, "step": 22870 }, { "entropy": 0.07861397005617618, "epoch": 5.332206550880056, "grad_norm": 1.765625, "learning_rate": 4.9252802704153176e-05, "loss": 0.0727, "mean_token_accuracy": 0.9805832087993622, "num_tokens": 59883296.0, "step": 22875 }, { "entropy": 0.23328736871480943, "epoch": 5.333372187900688, "grad_norm": 0.94921875, "learning_rate": 4.925228278892323e-05, "loss": 0.3953, "mean_token_accuracy": 0.9536088466644287, "num_tokens": 59902762.0, "step": 22880 }, { "entropy": 0.10579044306650758, "epoch": 5.33453782492132, "grad_norm": 0.52734375, "learning_rate": 4.925176269844907e-05, "loss": 0.051, "mean_token_accuracy": 0.9797735571861267, "num_tokens": 59924881.0, "step": 22885 }, { "entropy": 0.06493972707539797, "epoch": 5.335703461941951, "grad_norm": 1.75, "learning_rate": 4.925124243273845e-05, "loss": 0.0758, "mean_token_accuracy": 0.9782446086406708, "num_tokens": 59934987.0, "step": 22890 }, { "entropy": 0.06513304226100444, "epoch": 5.336869098962583, "grad_norm": 1.1953125, "learning_rate": 4.925072199179913e-05, "loss": 0.0462, "mean_token_accuracy": 0.9851133108139039, "num_tokens": 59959324.0, "step": 22895 }, { "entropy": 0.05224155634641647, "epoch": 5.338034735983215, "grad_norm": 0.76953125, "learning_rate": 4.925020137563887e-05, "loss": 0.044, "mean_token_accuracy": 0.9833649158477783, "num_tokens": 59983184.0, "step": 22900 }, { "entropy": 0.056037331186234954, "epoch": 5.339200373003846, "grad_norm": 0.64453125, "learning_rate": 4.924968058426545e-05, "loss": 0.0449, "mean_token_accuracy": 0.9889161825180054, "num_tokens": 60001573.0, "step": 22905 }, { "entropy": 0.04825269635766745, "epoch": 5.3403660100244785, "grad_norm": 0.3515625, "learning_rate": 4.9249159617686604e-05, "loss": 0.0468, "mean_token_accuracy": 0.9882710158824921, "num_tokens": 60022740.0, "step": 22910 }, { "entropy": 0.1773822302930057, "epoch": 5.34153164704511, "grad_norm": 5.46875, "learning_rate": 4.9248638475910115e-05, "loss": 0.2673, "mean_token_accuracy": 0.9561372995376587, "num_tokens": 60052310.0, "step": 22915 }, { "entropy": 0.08487709350883961, "epoch": 5.342697284065742, "grad_norm": 4.09375, "learning_rate": 4.924811715894376e-05, "loss": 0.0975, "mean_token_accuracy": 0.974593210220337, "num_tokens": 60070713.0, "step": 22920 }, { "entropy": 0.056556498538702725, "epoch": 5.343862921086374, "grad_norm": 3.625, "learning_rate": 4.924759566679531e-05, "loss": 0.0526, "mean_token_accuracy": 0.984348326921463, "num_tokens": 60096937.0, "step": 22925 }, { "entropy": 0.07066576043143868, "epoch": 5.345028558107005, "grad_norm": 2.328125, "learning_rate": 4.9247073999472536e-05, "loss": 0.0562, "mean_token_accuracy": 0.9847615897655487, "num_tokens": 60117778.0, "step": 22930 }, { "entropy": 0.10020845541730523, "epoch": 5.346194195127637, "grad_norm": 1.3515625, "learning_rate": 4.9246552156983224e-05, "loss": 0.0264, "mean_token_accuracy": 0.9856527805328369, "num_tokens": 60146277.0, "step": 22935 }, { "entropy": 0.07661400884389877, "epoch": 5.347359832148269, "grad_norm": 2.71875, "learning_rate": 4.924603013933515e-05, "loss": 0.0775, "mean_token_accuracy": 0.9822577476501465, "num_tokens": 60156866.0, "step": 22940 }, { "entropy": 0.08874692600220442, "epoch": 5.348525469168901, "grad_norm": 2.625, "learning_rate": 4.92455079465361e-05, "loss": 0.0369, "mean_token_accuracy": 0.9810423135757447, "num_tokens": 60176878.0, "step": 22945 }, { "entropy": 0.07762233018875123, "epoch": 5.349691106189533, "grad_norm": 0.921875, "learning_rate": 4.924498557859386e-05, "loss": 0.0807, "mean_token_accuracy": 0.9797823786735534, "num_tokens": 60194815.0, "step": 22950 }, { "entropy": 0.05616168519482016, "epoch": 5.350856743210164, "grad_norm": 0.96484375, "learning_rate": 4.924446303551622e-05, "loss": 0.0522, "mean_token_accuracy": 0.9837803184986115, "num_tokens": 60214031.0, "step": 22955 }, { "entropy": 0.06755325347185134, "epoch": 5.352022380230796, "grad_norm": 2.90625, "learning_rate": 4.9243940317310964e-05, "loss": 0.0697, "mean_token_accuracy": 0.9821909844875336, "num_tokens": 60227009.0, "step": 22960 }, { "entropy": 0.07493439922109246, "epoch": 5.353188017251428, "grad_norm": 0.328125, "learning_rate": 4.92434174239859e-05, "loss": 0.0874, "mean_token_accuracy": 0.9795318305492401, "num_tokens": 60240587.0, "step": 22965 }, { "entropy": 0.059706505248323084, "epoch": 5.354353654272059, "grad_norm": 0.5703125, "learning_rate": 4.924289435554882e-05, "loss": 0.0517, "mean_token_accuracy": 0.9835958003997802, "num_tokens": 60265229.0, "step": 22970 }, { "entropy": 0.0833500050008297, "epoch": 5.355519291292691, "grad_norm": 3.140625, "learning_rate": 4.9242371112007526e-05, "loss": 0.0761, "mean_token_accuracy": 0.9778411746025085, "num_tokens": 60283665.0, "step": 22975 }, { "entropy": 0.0801816951483488, "epoch": 5.3566849283133235, "grad_norm": 3.859375, "learning_rate": 4.9241847693369816e-05, "loss": 0.0882, "mean_token_accuracy": 0.9788227140903473, "num_tokens": 60292492.0, "step": 22980 }, { "entropy": 0.055112460954114796, "epoch": 5.357850565333955, "grad_norm": 0.734375, "learning_rate": 4.924132409964349e-05, "loss": 0.0386, "mean_token_accuracy": 0.9883932769298553, "num_tokens": 60319183.0, "step": 22985 }, { "entropy": 0.06558550810441374, "epoch": 5.359016202354587, "grad_norm": 0.3203125, "learning_rate": 4.924080033083637e-05, "loss": 0.0699, "mean_token_accuracy": 0.9772551417350769, "num_tokens": 60335640.0, "step": 22990 }, { "entropy": 0.05295390598475933, "epoch": 5.360181839375219, "grad_norm": 1.25, "learning_rate": 4.9240276386956246e-05, "loss": 0.0433, "mean_token_accuracy": 0.9863731324672699, "num_tokens": 60352124.0, "step": 22995 }, { "entropy": 0.06834200341254473, "epoch": 5.36134747639585, "grad_norm": 2.453125, "learning_rate": 4.923975226801095e-05, "loss": 0.0382, "mean_token_accuracy": 0.9834942102432251, "num_tokens": 60376603.0, "step": 23000 }, { "entropy": 0.05982331410050392, "epoch": 5.362513113416482, "grad_norm": 1.8046875, "learning_rate": 4.92392279740083e-05, "loss": 0.0499, "mean_token_accuracy": 0.9867896795272827, "num_tokens": 60390601.0, "step": 23005 }, { "entropy": 0.05300872353836894, "epoch": 5.3636787504371135, "grad_norm": 0.310546875, "learning_rate": 4.9238703504956084e-05, "loss": 0.052, "mean_token_accuracy": 0.9840744435787201, "num_tokens": 60413557.0, "step": 23010 }, { "entropy": 0.11512393001466989, "epoch": 5.364844387457746, "grad_norm": 0.466796875, "learning_rate": 4.9238178860862155e-05, "loss": 0.1433, "mean_token_accuracy": 0.9661681652069092, "num_tokens": 60447690.0, "step": 23015 }, { "entropy": 0.08497421033680438, "epoch": 5.366010024478378, "grad_norm": 1.75, "learning_rate": 4.923765404173432e-05, "loss": 0.0769, "mean_token_accuracy": 0.9765770733356476, "num_tokens": 60457956.0, "step": 23020 }, { "entropy": 0.06545198997482657, "epoch": 5.367175661499009, "grad_norm": 3.234375, "learning_rate": 4.923712904758041e-05, "loss": 0.0718, "mean_token_accuracy": 0.9820848286151886, "num_tokens": 60477779.0, "step": 23025 }, { "entropy": 0.05556817147880792, "epoch": 5.368341298519641, "grad_norm": 0.86328125, "learning_rate": 4.923660387840826e-05, "loss": 0.0521, "mean_token_accuracy": 0.9859776258468628, "num_tokens": 60496484.0, "step": 23030 }, { "entropy": 0.06978419441729784, "epoch": 5.369506935540273, "grad_norm": 0.10986328125, "learning_rate": 4.923607853422568e-05, "loss": 0.0377, "mean_token_accuracy": 0.983120220899582, "num_tokens": 60527179.0, "step": 23035 }, { "entropy": 0.05648170947097242, "epoch": 5.370672572560904, "grad_norm": 0.66796875, "learning_rate": 4.9235553015040525e-05, "loss": 0.0363, "mean_token_accuracy": 0.9899722754955291, "num_tokens": 60554130.0, "step": 23040 }, { "entropy": 0.06253222562372684, "epoch": 5.3718382095815365, "grad_norm": 2.59375, "learning_rate": 4.923502732086062e-05, "loss": 0.0654, "mean_token_accuracy": 0.9840362310409546, "num_tokens": 60568544.0, "step": 23045 }, { "entropy": 0.06851723911240697, "epoch": 5.373003846602168, "grad_norm": 1.3515625, "learning_rate": 4.923450145169381e-05, "loss": 0.0727, "mean_token_accuracy": 0.9818732261657714, "num_tokens": 60581662.0, "step": 23050 }, { "entropy": 0.061725224601104856, "epoch": 5.3741694836228, "grad_norm": 1.390625, "learning_rate": 4.923397540754793e-05, "loss": 0.0405, "mean_token_accuracy": 0.9862514436244965, "num_tokens": 60607397.0, "step": 23055 }, { "entropy": 0.04291647081263363, "epoch": 5.375335120643432, "grad_norm": 0.1533203125, "learning_rate": 4.923344918843084e-05, "loss": 0.0226, "mean_token_accuracy": 0.9882668256759644, "num_tokens": 60637436.0, "step": 23060 }, { "entropy": 0.07198273623362184, "epoch": 5.376500757664063, "grad_norm": 1.7890625, "learning_rate": 4.923292279435036e-05, "loss": 0.0546, "mean_token_accuracy": 0.9835138738155365, "num_tokens": 60662709.0, "step": 23065 }, { "entropy": 0.04424673020839691, "epoch": 5.377666394684695, "grad_norm": 3.5625, "learning_rate": 4.923239622531436e-05, "loss": 0.0484, "mean_token_accuracy": 0.9847693383693695, "num_tokens": 60690611.0, "step": 23070 }, { "entropy": 0.07281300444155932, "epoch": 5.378832031705327, "grad_norm": 0.8359375, "learning_rate": 4.923186948133068e-05, "loss": 0.0682, "mean_token_accuracy": 0.9764643311500549, "num_tokens": 60714624.0, "step": 23075 }, { "entropy": 0.0488906929269433, "epoch": 5.3799976687259585, "grad_norm": 2.96875, "learning_rate": 4.923134256240718e-05, "loss": 0.0612, "mean_token_accuracy": 0.985430383682251, "num_tokens": 60747333.0, "step": 23080 }, { "entropy": 0.07100528012961149, "epoch": 5.381163305746591, "grad_norm": 0.498046875, "learning_rate": 4.923081546855173e-05, "loss": 0.0735, "mean_token_accuracy": 0.9796480000019073, "num_tokens": 60764522.0, "step": 23085 }, { "entropy": 0.04467966118827462, "epoch": 5.382328942767222, "grad_norm": 0.57421875, "learning_rate": 4.923028819977217e-05, "loss": 0.0198, "mean_token_accuracy": 0.9906904339790344, "num_tokens": 60794466.0, "step": 23090 }, { "entropy": 0.07430108338594436, "epoch": 5.383494579787854, "grad_norm": 0.64453125, "learning_rate": 4.9229760756076365e-05, "loss": 0.044, "mean_token_accuracy": 0.9841860115528107, "num_tokens": 60823886.0, "step": 23095 }, { "entropy": 0.06416118433699011, "epoch": 5.384660216808486, "grad_norm": 2.375, "learning_rate": 4.922923313747218e-05, "loss": 0.0544, "mean_token_accuracy": 0.9824954152107239, "num_tokens": 60842778.0, "step": 23100 }, { "entropy": 0.06129303025081754, "epoch": 5.385825853829117, "grad_norm": 2.5, "learning_rate": 4.92287053439675e-05, "loss": 0.0441, "mean_token_accuracy": 0.9878499805927277, "num_tokens": 60863963.0, "step": 23105 }, { "entropy": 0.056876167096197604, "epoch": 5.386991490849749, "grad_norm": 1.109375, "learning_rate": 4.922817737557018e-05, "loss": 0.0511, "mean_token_accuracy": 0.9832296967506409, "num_tokens": 60880737.0, "step": 23110 }, { "entropy": 0.06693640761077405, "epoch": 5.3881571278703815, "grad_norm": 3.421875, "learning_rate": 4.922764923228809e-05, "loss": 0.0559, "mean_token_accuracy": 0.9846435010433197, "num_tokens": 60901158.0, "step": 23115 }, { "entropy": 0.05447808532044292, "epoch": 5.389322764891013, "grad_norm": 1.2890625, "learning_rate": 4.922712091412912e-05, "loss": 0.058, "mean_token_accuracy": 0.984618604183197, "num_tokens": 60925795.0, "step": 23120 }, { "entropy": 0.06588453128933906, "epoch": 5.390488401911645, "grad_norm": 1.5234375, "learning_rate": 4.9226592421101134e-05, "loss": 0.0641, "mean_token_accuracy": 0.986005574464798, "num_tokens": 60948082.0, "step": 23125 }, { "entropy": 0.0593166496604681, "epoch": 5.391654038932277, "grad_norm": 0.83984375, "learning_rate": 4.922606375321201e-05, "loss": 0.0496, "mean_token_accuracy": 0.9875987470149994, "num_tokens": 60959806.0, "step": 23130 }, { "entropy": 0.04983818177133799, "epoch": 5.392819675952908, "grad_norm": 0.36328125, "learning_rate": 4.922553491046965e-05, "loss": 0.0235, "mean_token_accuracy": 0.989845621585846, "num_tokens": 60990336.0, "step": 23135 }, { "entropy": 0.043969867564737794, "epoch": 5.39398531297354, "grad_norm": 4.375, "learning_rate": 4.9225005892881917e-05, "loss": 0.0296, "mean_token_accuracy": 0.9865171551704407, "num_tokens": 61029307.0, "step": 23140 }, { "entropy": 0.06361808869987726, "epoch": 5.395150949994171, "grad_norm": 1.71875, "learning_rate": 4.922447670045672e-05, "loss": 0.0494, "mean_token_accuracy": 0.9815893948078156, "num_tokens": 61059113.0, "step": 23145 }, { "entropy": 0.05684220269322395, "epoch": 5.3963165870148035, "grad_norm": 1.390625, "learning_rate": 4.922394733320193e-05, "loss": 0.0478, "mean_token_accuracy": 0.9857088804244996, "num_tokens": 61082757.0, "step": 23150 }, { "entropy": 0.05390456821769476, "epoch": 5.397482224035436, "grad_norm": 0.439453125, "learning_rate": 4.922341779112546e-05, "loss": 0.0586, "mean_token_accuracy": 0.9862619757652282, "num_tokens": 61107803.0, "step": 23155 }, { "entropy": 0.05104854414239526, "epoch": 5.398647861056067, "grad_norm": 1.484375, "learning_rate": 4.9222888074235194e-05, "loss": 0.0264, "mean_token_accuracy": 0.9864234507083893, "num_tokens": 61134142.0, "step": 23160 }, { "entropy": 0.05798858366906643, "epoch": 5.399813498076699, "grad_norm": 1.3671875, "learning_rate": 4.922235818253904e-05, "loss": 0.0315, "mean_token_accuracy": 0.9854933500289917, "num_tokens": 61162301.0, "step": 23165 }, { "entropy": 0.08851273730397224, "epoch": 5.400979135097331, "grad_norm": 2.359375, "learning_rate": 4.922182811604489e-05, "loss": 0.1097, "mean_token_accuracy": 0.9737513303756714, "num_tokens": 61169551.0, "step": 23170 }, { "entropy": 0.05317346574738622, "epoch": 5.402144772117962, "grad_norm": 0.390625, "learning_rate": 4.922129787476065e-05, "loss": 0.027, "mean_token_accuracy": 0.9891198635101318, "num_tokens": 61194654.0, "step": 23175 }, { "entropy": 0.05494374092668295, "epoch": 5.403310409138594, "grad_norm": 0.2470703125, "learning_rate": 4.922076745869423e-05, "loss": 0.0386, "mean_token_accuracy": 0.9859078824520111, "num_tokens": 61216063.0, "step": 23180 }, { "entropy": 0.07559232525527478, "epoch": 5.404476046159226, "grad_norm": 2.4375, "learning_rate": 4.9220236867853544e-05, "loss": 0.0942, "mean_token_accuracy": 0.974918645620346, "num_tokens": 61225374.0, "step": 23185 }, { "entropy": 0.05903605557978153, "epoch": 5.405641683179858, "grad_norm": 2.15625, "learning_rate": 4.9219706102246494e-05, "loss": 0.0425, "mean_token_accuracy": 0.9827019810676575, "num_tokens": 61248622.0, "step": 23190 }, { "entropy": 0.07795067802071572, "epoch": 5.40680732020049, "grad_norm": 1.6875, "learning_rate": 4.9219175161880996e-05, "loss": 0.0693, "mean_token_accuracy": 0.9779790699481964, "num_tokens": 61268201.0, "step": 23195 }, { "entropy": 0.07784415520727635, "epoch": 5.407972957221121, "grad_norm": 0.8203125, "learning_rate": 4.921864404676497e-05, "loss": 0.0689, "mean_token_accuracy": 0.9818279802799225, "num_tokens": 61284818.0, "step": 23200 }, { "entropy": 0.0624196344986558, "epoch": 5.409138594241753, "grad_norm": 2.640625, "learning_rate": 4.921811275690634e-05, "loss": 0.0509, "mean_token_accuracy": 0.985440480709076, "num_tokens": 61299220.0, "step": 23205 }, { "entropy": 0.07447193302214146, "epoch": 5.410304231262385, "grad_norm": 1.28125, "learning_rate": 4.921758129231302e-05, "loss": 0.0516, "mean_token_accuracy": 0.985534542798996, "num_tokens": 61328795.0, "step": 23210 }, { "entropy": 0.07264035008847713, "epoch": 5.4114698682830165, "grad_norm": 2.953125, "learning_rate": 4.921704965299294e-05, "loss": 0.048, "mean_token_accuracy": 0.984207808971405, "num_tokens": 61366583.0, "step": 23215 }, { "entropy": 0.06596151059493423, "epoch": 5.412635505303649, "grad_norm": 0.431640625, "learning_rate": 4.921651783895403e-05, "loss": 0.0489, "mean_token_accuracy": 0.9818928420543671, "num_tokens": 61395745.0, "step": 23220 }, { "entropy": 0.0670158120803535, "epoch": 5.41380114232428, "grad_norm": 0.212890625, "learning_rate": 4.921598585020421e-05, "loss": 0.0558, "mean_token_accuracy": 0.9837601244449615, "num_tokens": 61414740.0, "step": 23225 }, { "entropy": 0.05426953062415123, "epoch": 5.414966779344912, "grad_norm": 1.796875, "learning_rate": 4.9215453686751425e-05, "loss": 0.027, "mean_token_accuracy": 0.9845106363296509, "num_tokens": 61445656.0, "step": 23230 }, { "entropy": 0.09449506886303424, "epoch": 5.416132416365544, "grad_norm": 2.140625, "learning_rate": 4.92149213486036e-05, "loss": 0.1055, "mean_token_accuracy": 0.9766972839832306, "num_tokens": 61467238.0, "step": 23235 }, { "entropy": 0.061407574266195294, "epoch": 5.417298053386175, "grad_norm": 2.421875, "learning_rate": 4.921438883576868e-05, "loss": 0.0448, "mean_token_accuracy": 0.9856343030929565, "num_tokens": 61490562.0, "step": 23240 }, { "entropy": 0.06952263060957194, "epoch": 5.418463690406807, "grad_norm": 3.328125, "learning_rate": 4.92138561482546e-05, "loss": 0.0752, "mean_token_accuracy": 0.9781166553497315, "num_tokens": 61506678.0, "step": 23245 }, { "entropy": 0.06815708391368389, "epoch": 5.419629327427439, "grad_norm": 1.0546875, "learning_rate": 4.9213323286069306e-05, "loss": 0.0527, "mean_token_accuracy": 0.9844055712223053, "num_tokens": 61528547.0, "step": 23250 }, { "entropy": 0.07312852032482624, "epoch": 5.420794964448071, "grad_norm": 1.8984375, "learning_rate": 4.9212790249220746e-05, "loss": 0.0722, "mean_token_accuracy": 0.9812857508659363, "num_tokens": 61536967.0, "step": 23255 }, { "entropy": 0.050287547335028646, "epoch": 5.421960601468703, "grad_norm": 2.359375, "learning_rate": 4.921225703771687e-05, "loss": 0.0424, "mean_token_accuracy": 0.9859198093414306, "num_tokens": 61564543.0, "step": 23260 }, { "entropy": 0.06520504876971245, "epoch": 5.423126238489335, "grad_norm": 1.4375, "learning_rate": 4.921172365156562e-05, "loss": 0.0734, "mean_token_accuracy": 0.9789164006710053, "num_tokens": 61575211.0, "step": 23265 }, { "entropy": 0.0722085983492434, "epoch": 5.424291875509966, "grad_norm": 6.46875, "learning_rate": 4.9211190090774956e-05, "loss": 0.0831, "mean_token_accuracy": 0.9780640542507172, "num_tokens": 61590751.0, "step": 23270 }, { "entropy": 0.09001476243138314, "epoch": 5.425457512530598, "grad_norm": 2.4375, "learning_rate": 4.921065635535284e-05, "loss": 0.0827, "mean_token_accuracy": 0.9779356181621551, "num_tokens": 61598970.0, "step": 23275 }, { "entropy": 0.07482207007706165, "epoch": 5.426623149551229, "grad_norm": 1.7890625, "learning_rate": 4.921012244530721e-05, "loss": 0.0803, "mean_token_accuracy": 0.9787347793579102, "num_tokens": 61608913.0, "step": 23280 }, { "entropy": 0.05302851120941341, "epoch": 5.4277887865718615, "grad_norm": 2.25, "learning_rate": 4.920958836064605e-05, "loss": 0.0378, "mean_token_accuracy": 0.9879450976848603, "num_tokens": 61630650.0, "step": 23285 }, { "entropy": 0.06299825739115476, "epoch": 5.428954423592494, "grad_norm": 1.671875, "learning_rate": 4.920905410137732e-05, "loss": 0.0444, "mean_token_accuracy": 0.983812016248703, "num_tokens": 61653853.0, "step": 23290 }, { "entropy": 0.07575079947710037, "epoch": 5.430120060613125, "grad_norm": 0.474609375, "learning_rate": 4.920851966750897e-05, "loss": 0.0713, "mean_token_accuracy": 0.9819688856601715, "num_tokens": 61671710.0, "step": 23295 }, { "entropy": 0.06498388964682818, "epoch": 5.431285697633757, "grad_norm": 0.9140625, "learning_rate": 4.920798505904899e-05, "loss": 0.0556, "mean_token_accuracy": 0.9803281247615814, "num_tokens": 61695854.0, "step": 23300 }, { "entropy": 0.06389341745525598, "epoch": 5.432451334654389, "grad_norm": 0.703125, "learning_rate": 4.920745027600534e-05, "loss": 0.0505, "mean_token_accuracy": 0.9866482019424438, "num_tokens": 61709957.0, "step": 23305 }, { "entropy": 0.06346920877695084, "epoch": 5.43361697167502, "grad_norm": 1.3671875, "learning_rate": 4.9206915318385996e-05, "loss": 0.0555, "mean_token_accuracy": 0.982583349943161, "num_tokens": 61733369.0, "step": 23310 }, { "entropy": 0.08282421119511127, "epoch": 5.434782608695652, "grad_norm": 2.828125, "learning_rate": 4.920638018619894e-05, "loss": 0.0887, "mean_token_accuracy": 0.9766203939914704, "num_tokens": 61750862.0, "step": 23315 }, { "entropy": 0.07322307983413338, "epoch": 5.4359482457162835, "grad_norm": 2.0625, "learning_rate": 4.920584487945215e-05, "loss": 0.052, "mean_token_accuracy": 0.9795838713645935, "num_tokens": 61775206.0, "step": 23320 }, { "entropy": 0.06913946568965912, "epoch": 5.437113882736916, "grad_norm": 3.59375, "learning_rate": 4.9205309398153596e-05, "loss": 0.0503, "mean_token_accuracy": 0.9846193373203278, "num_tokens": 61792356.0, "step": 23325 }, { "entropy": 0.08056791853159666, "epoch": 5.438279519757548, "grad_norm": 4.875, "learning_rate": 4.9204773742311275e-05, "loss": 0.0916, "mean_token_accuracy": 0.9753158092498779, "num_tokens": 61802752.0, "step": 23330 }, { "entropy": 0.06826584823429585, "epoch": 5.439445156778179, "grad_norm": 2.40625, "learning_rate": 4.920423791193318e-05, "loss": 0.0527, "mean_token_accuracy": 0.9859543740749359, "num_tokens": 61817406.0, "step": 23335 }, { "entropy": 0.0683171335607767, "epoch": 5.440610793798811, "grad_norm": 0.7421875, "learning_rate": 4.920370190702729e-05, "loss": 0.0527, "mean_token_accuracy": 0.9838370501995086, "num_tokens": 61845861.0, "step": 23340 }, { "entropy": 0.05109058595262468, "epoch": 5.441776430819443, "grad_norm": 0.296875, "learning_rate": 4.92031657276016e-05, "loss": 0.0332, "mean_token_accuracy": 0.987951809167862, "num_tokens": 61871427.0, "step": 23345 }, { "entropy": 0.04325137957930565, "epoch": 5.442942067840074, "grad_norm": 0.6796875, "learning_rate": 4.9202629373664114e-05, "loss": 0.0224, "mean_token_accuracy": 0.9934803903102875, "num_tokens": 61905169.0, "step": 23350 }, { "entropy": 0.059416829235851766, "epoch": 5.4441077048607065, "grad_norm": 1.140625, "learning_rate": 4.920209284522282e-05, "loss": 0.0422, "mean_token_accuracy": 0.9885890066623688, "num_tokens": 61924906.0, "step": 23355 }, { "entropy": 0.06894025560468435, "epoch": 5.445273341881338, "grad_norm": 0.8515625, "learning_rate": 4.920155614228571e-05, "loss": 0.0368, "mean_token_accuracy": 0.9853028953075409, "num_tokens": 61949361.0, "step": 23360 }, { "entropy": 0.07303128968924284, "epoch": 5.44643897890197, "grad_norm": 5.6875, "learning_rate": 4.92010192648608e-05, "loss": 0.069, "mean_token_accuracy": 0.9811192095279694, "num_tokens": 61960365.0, "step": 23365 }, { "entropy": 0.07817558608949185, "epoch": 5.447604615922602, "grad_norm": 1.609375, "learning_rate": 4.92004822129561e-05, "loss": 0.0836, "mean_token_accuracy": 0.978845477104187, "num_tokens": 61970637.0, "step": 23370 }, { "entropy": 0.06893630996346474, "epoch": 5.448770252943233, "grad_norm": 2.453125, "learning_rate": 4.9199944986579616e-05, "loss": 0.0599, "mean_token_accuracy": 0.9823831498622895, "num_tokens": 61990173.0, "step": 23375 }, { "entropy": 0.06768325287848712, "epoch": 5.449935889963865, "grad_norm": 0.8359375, "learning_rate": 4.9199407585739344e-05, "loss": 0.0448, "mean_token_accuracy": 0.9853338956832886, "num_tokens": 62007777.0, "step": 23380 }, { "entropy": 0.04943525260314345, "epoch": 5.451101526984497, "grad_norm": 0.89453125, "learning_rate": 4.919887001044332e-05, "loss": 0.0362, "mean_token_accuracy": 0.9880561709403992, "num_tokens": 62029816.0, "step": 23385 }, { "entropy": 0.05788094364106655, "epoch": 5.452267164005129, "grad_norm": 0.349609375, "learning_rate": 4.919833226069954e-05, "loss": 0.0599, "mean_token_accuracy": 0.9839804172515869, "num_tokens": 62044288.0, "step": 23390 }, { "entropy": 0.06028408519923687, "epoch": 5.453432801025761, "grad_norm": 3.375, "learning_rate": 4.919779433651603e-05, "loss": 0.0457, "mean_token_accuracy": 0.9855101108551025, "num_tokens": 62069683.0, "step": 23395 }, { "entropy": 0.07415076242759824, "epoch": 5.454598438046393, "grad_norm": 0.111328125, "learning_rate": 4.9197256237900815e-05, "loss": 0.0554, "mean_token_accuracy": 0.9815491080284119, "num_tokens": 62098121.0, "step": 23400 }, { "entropy": 0.055291141476482154, "epoch": 5.455764075067024, "grad_norm": 0.33203125, "learning_rate": 4.919671796486191e-05, "loss": 0.026, "mean_token_accuracy": 0.9862238466739655, "num_tokens": 62136468.0, "step": 23405 }, { "entropy": 0.046719343215227124, "epoch": 5.456929712087656, "grad_norm": 0.54296875, "learning_rate": 4.919617951740735e-05, "loss": 0.0252, "mean_token_accuracy": 0.9928770422935486, "num_tokens": 62169672.0, "step": 23410 }, { "entropy": 0.0630274849012494, "epoch": 5.458095349108287, "grad_norm": 2.296875, "learning_rate": 4.919564089554516e-05, "loss": 0.0426, "mean_token_accuracy": 0.9808637201786041, "num_tokens": 62191802.0, "step": 23415 }, { "entropy": 0.06985501274466514, "epoch": 5.459260986128919, "grad_norm": 0.61328125, "learning_rate": 4.919510209928338e-05, "loss": 0.0763, "mean_token_accuracy": 0.9818107604980468, "num_tokens": 62211482.0, "step": 23420 }, { "entropy": 0.050887050852179525, "epoch": 5.4604266231495515, "grad_norm": 1.7578125, "learning_rate": 4.9194563128630034e-05, "loss": 0.0469, "mean_token_accuracy": 0.9865157008171082, "num_tokens": 62241942.0, "step": 23425 }, { "entropy": 0.07516262400895357, "epoch": 5.461592260170183, "grad_norm": 4.1875, "learning_rate": 4.9194023983593164e-05, "loss": 0.0863, "mean_token_accuracy": 0.9760660409927369, "num_tokens": 62252888.0, "step": 23430 }, { "entropy": 0.0662376806139946, "epoch": 5.462757897190815, "grad_norm": 1.8046875, "learning_rate": 4.91934846641808e-05, "loss": 0.0362, "mean_token_accuracy": 0.9875830292701722, "num_tokens": 62270136.0, "step": 23435 }, { "entropy": 0.10946454834192991, "epoch": 5.463923534211447, "grad_norm": 3.515625, "learning_rate": 4.9192945170401e-05, "loss": 0.155, "mean_token_accuracy": 0.9659136891365051, "num_tokens": 62300217.0, "step": 23440 }, { "entropy": 0.0465864603407681, "epoch": 5.465089171232078, "grad_norm": 3.328125, "learning_rate": 4.919240550226179e-05, "loss": 0.0499, "mean_token_accuracy": 0.9880475103855133, "num_tokens": 62325726.0, "step": 23445 }, { "entropy": 0.058729467820376156, "epoch": 5.46625480825271, "grad_norm": 1.2109375, "learning_rate": 4.919186565977124e-05, "loss": 0.0248, "mean_token_accuracy": 0.9913538217544555, "num_tokens": 62360995.0, "step": 23450 }, { "entropy": 0.07499024383723736, "epoch": 5.4674204452733415, "grad_norm": 1.046875, "learning_rate": 4.919132564293738e-05, "loss": 0.0848, "mean_token_accuracy": 0.9793896615505219, "num_tokens": 62370698.0, "step": 23455 }, { "entropy": 0.07226166576147079, "epoch": 5.468586082293974, "grad_norm": 2.5, "learning_rate": 4.919078545176827e-05, "loss": 0.0683, "mean_token_accuracy": 0.9799470126628875, "num_tokens": 62381984.0, "step": 23460 }, { "entropy": 0.055472942627966405, "epoch": 5.469751719314606, "grad_norm": 0.79296875, "learning_rate": 4.919024508627197e-05, "loss": 0.0485, "mean_token_accuracy": 0.9879952669143677, "num_tokens": 62399610.0, "step": 23465 }, { "entropy": 0.06007210109382868, "epoch": 5.470917356335237, "grad_norm": 0.62109375, "learning_rate": 4.918970454645653e-05, "loss": 0.0515, "mean_token_accuracy": 0.9846825122833252, "num_tokens": 62419360.0, "step": 23470 }, { "entropy": 0.055671576596796515, "epoch": 5.472082993355869, "grad_norm": 1.140625, "learning_rate": 4.918916383233001e-05, "loss": 0.053, "mean_token_accuracy": 0.9845706820487976, "num_tokens": 62433719.0, "step": 23475 }, { "entropy": 0.10006310492753982, "epoch": 5.473248630376501, "grad_norm": 3.0625, "learning_rate": 4.918862294390048e-05, "loss": 0.1352, "mean_token_accuracy": 0.9714458584785461, "num_tokens": 62449051.0, "step": 23480 }, { "entropy": 0.053108157590031624, "epoch": 5.474414267397132, "grad_norm": 0.435546875, "learning_rate": 4.9188081881176e-05, "loss": 0.054, "mean_token_accuracy": 0.98538578748703, "num_tokens": 62466128.0, "step": 23485 }, { "entropy": 0.06686123460531235, "epoch": 5.475579904417764, "grad_norm": 4.46875, "learning_rate": 4.918754064416464e-05, "loss": 0.0633, "mean_token_accuracy": 0.9822019279003144, "num_tokens": 62487018.0, "step": 23490 }, { "entropy": 0.10811963500455021, "epoch": 5.476745541438396, "grad_norm": 1.671875, "learning_rate": 4.918699923287446e-05, "loss": 0.116, "mean_token_accuracy": 0.9671446025371552, "num_tokens": 62514633.0, "step": 23495 }, { "entropy": 0.04214726220816374, "epoch": 5.477911178459028, "grad_norm": 0.703125, "learning_rate": 4.918645764731355e-05, "loss": 0.0271, "mean_token_accuracy": 0.9906268119812012, "num_tokens": 62545134.0, "step": 23500 }, { "entropy": 0.0906677044928074, "epoch": 5.47907681547966, "grad_norm": 0.9921875, "learning_rate": 4.9185915887489986e-05, "loss": 0.0309, "mean_token_accuracy": 0.9871439218521119, "num_tokens": 62571647.0, "step": 23505 }, { "entropy": 0.06742565836757422, "epoch": 5.480242452500291, "grad_norm": 1.203125, "learning_rate": 4.918537395341184e-05, "loss": 0.0593, "mean_token_accuracy": 0.9825949847698212, "num_tokens": 62585472.0, "step": 23510 }, { "entropy": 0.06784821143373847, "epoch": 5.481408089520923, "grad_norm": 4.53125, "learning_rate": 4.918483184508718e-05, "loss": 0.0866, "mean_token_accuracy": 0.9811110019683837, "num_tokens": 62613113.0, "step": 23515 }, { "entropy": 0.06738835908472537, "epoch": 5.482573726541555, "grad_norm": 2.171875, "learning_rate": 4.9184289562524114e-05, "loss": 0.0763, "mean_token_accuracy": 0.9819285452365876, "num_tokens": 62624262.0, "step": 23520 }, { "entropy": 0.06374572729691863, "epoch": 5.4837393635621865, "grad_norm": 0.55859375, "learning_rate": 4.918374710573071e-05, "loss": 0.0479, "mean_token_accuracy": 0.984711641073227, "num_tokens": 62646149.0, "step": 23525 }, { "entropy": 0.06009006816893816, "epoch": 5.484905000582819, "grad_norm": 0.68359375, "learning_rate": 4.9183204474715066e-05, "loss": 0.0556, "mean_token_accuracy": 0.9834051728248596, "num_tokens": 62672760.0, "step": 23530 }, { "entropy": 0.07242138041183352, "epoch": 5.486070637603451, "grad_norm": 0.8984375, "learning_rate": 4.918266166948527e-05, "loss": 0.0755, "mean_token_accuracy": 0.9801871180534363, "num_tokens": 62685752.0, "step": 23535 }, { "entropy": 0.08361536599695682, "epoch": 5.487236274624082, "grad_norm": 1.390625, "learning_rate": 4.918211869004942e-05, "loss": 0.0631, "mean_token_accuracy": 0.9825032353401184, "num_tokens": 62700324.0, "step": 23540 }, { "entropy": 0.09443598166108132, "epoch": 5.488401911644714, "grad_norm": 1.40625, "learning_rate": 4.91815755364156e-05, "loss": 0.0763, "mean_token_accuracy": 0.9776216208934784, "num_tokens": 62709285.0, "step": 23545 }, { "entropy": 0.05555082568898797, "epoch": 5.489567548665345, "grad_norm": 2.046875, "learning_rate": 4.918103220859193e-05, "loss": 0.0319, "mean_token_accuracy": 0.986697655916214, "num_tokens": 62729216.0, "step": 23550 }, { "entropy": 0.06571632707491518, "epoch": 5.490733185685977, "grad_norm": 1.4765625, "learning_rate": 4.918048870658649e-05, "loss": 0.0546, "mean_token_accuracy": 0.9831242263317108, "num_tokens": 62747103.0, "step": 23555 }, { "entropy": 0.07703730314970017, "epoch": 5.4918988227066095, "grad_norm": 0.3671875, "learning_rate": 4.91799450304074e-05, "loss": 0.0824, "mean_token_accuracy": 0.9807705223560333, "num_tokens": 62760945.0, "step": 23560 }, { "entropy": 0.06581084001809359, "epoch": 5.493064459727241, "grad_norm": 3.078125, "learning_rate": 4.917940118006276e-05, "loss": 0.0616, "mean_token_accuracy": 0.9824943840503693, "num_tokens": 62778982.0, "step": 23565 }, { "entropy": 0.0631984619423747, "epoch": 5.494230096747873, "grad_norm": 1.296875, "learning_rate": 4.9178857155560684e-05, "loss": 0.0596, "mean_token_accuracy": 0.9853246152400971, "num_tokens": 62791874.0, "step": 23570 }, { "entropy": 0.06704922579228878, "epoch": 5.495395733768505, "grad_norm": 2.5, "learning_rate": 4.9178312956909285e-05, "loss": 0.0628, "mean_token_accuracy": 0.9803689122200012, "num_tokens": 62810586.0, "step": 23575 }, { "entropy": 0.052586893737316134, "epoch": 5.496561370789136, "grad_norm": 3.359375, "learning_rate": 4.9177768584116666e-05, "loss": 0.0389, "mean_token_accuracy": 0.9818240642547608, "num_tokens": 62846529.0, "step": 23580 }, { "entropy": 0.10002233702689409, "epoch": 5.497727007809768, "grad_norm": 0.890625, "learning_rate": 4.917722403719096e-05, "loss": 0.0684, "mean_token_accuracy": 0.9777202427387237, "num_tokens": 62859684.0, "step": 23585 }, { "entropy": 0.07765259984880686, "epoch": 5.498892644830399, "grad_norm": 0.291015625, "learning_rate": 4.917667931614028e-05, "loss": 0.0719, "mean_token_accuracy": 0.9798839628696442, "num_tokens": 62874333.0, "step": 23590 }, { "entropy": 0.06515825875103473, "epoch": 5.5000582818510315, "grad_norm": 0.72265625, "learning_rate": 4.917613442097275e-05, "loss": 0.0428, "mean_token_accuracy": 0.9843474686145782, "num_tokens": 62899424.0, "step": 23595 }, { "entropy": 0.039471688726916906, "epoch": 5.501223918871664, "grad_norm": 0.4609375, "learning_rate": 4.917558935169649e-05, "loss": 0.0123, "mean_token_accuracy": 0.9921641826629639, "num_tokens": 62941284.0, "step": 23600 }, { "entropy": 0.060507692955434324, "epoch": 5.502389555892295, "grad_norm": 1.8671875, "learning_rate": 4.917504410831963e-05, "loss": 0.0476, "mean_token_accuracy": 0.9880904495716095, "num_tokens": 62969692.0, "step": 23605 }, { "entropy": 0.05670437887310982, "epoch": 5.503555192912927, "grad_norm": 0.97265625, "learning_rate": 4.91744986908503e-05, "loss": 0.0612, "mean_token_accuracy": 0.9807877004146576, "num_tokens": 62983795.0, "step": 23610 }, { "entropy": 0.05615283492952585, "epoch": 5.504720829933559, "grad_norm": 2.03125, "learning_rate": 4.917395309929664e-05, "loss": 0.0507, "mean_token_accuracy": 0.9852173507213593, "num_tokens": 63003615.0, "step": 23615 }, { "entropy": 0.062404044810682534, "epoch": 5.50588646695419, "grad_norm": 0.37109375, "learning_rate": 4.917340733366678e-05, "loss": 0.0561, "mean_token_accuracy": 0.9824782729148864, "num_tokens": 63042143.0, "step": 23620 }, { "entropy": 0.061724210530519484, "epoch": 5.507052103974822, "grad_norm": 1.1640625, "learning_rate": 4.917286139396886e-05, "loss": 0.0598, "mean_token_accuracy": 0.9842379450798034, "num_tokens": 63055223.0, "step": 23625 }, { "entropy": 0.06506040319800377, "epoch": 5.508217740995454, "grad_norm": 0.255859375, "learning_rate": 4.9172315280211026e-05, "loss": 0.0512, "mean_token_accuracy": 0.9812526404857635, "num_tokens": 63079411.0, "step": 23630 }, { "entropy": 0.08516685860231518, "epoch": 5.509383378016086, "grad_norm": 0.55859375, "learning_rate": 4.91717689924014e-05, "loss": 0.0768, "mean_token_accuracy": 0.9786878407001496, "num_tokens": 63106752.0, "step": 23635 }, { "entropy": 0.059132724441587924, "epoch": 5.510549015036718, "grad_norm": 2.015625, "learning_rate": 4.9171222530548154e-05, "loss": 0.0461, "mean_token_accuracy": 0.9837357699871063, "num_tokens": 63130321.0, "step": 23640 }, { "entropy": 0.09355827569961547, "epoch": 5.511714652057349, "grad_norm": 3.46875, "learning_rate": 4.9170675894659426e-05, "loss": 0.1392, "mean_token_accuracy": 0.9721016824245453, "num_tokens": 63149244.0, "step": 23645 }, { "entropy": 0.07111373730003834, "epoch": 5.512880289077981, "grad_norm": 2.546875, "learning_rate": 4.917012908474336e-05, "loss": 0.0578, "mean_token_accuracy": 0.9821716785430908, "num_tokens": 63164187.0, "step": 23650 }, { "entropy": 0.062192311882972716, "epoch": 5.514045926098613, "grad_norm": 0.51171875, "learning_rate": 4.9169582100808124e-05, "loss": 0.032, "mean_token_accuracy": 0.9870727837085724, "num_tokens": 63181897.0, "step": 23655 }, { "entropy": 0.10410164566710592, "epoch": 5.515211563119244, "grad_norm": 0.265625, "learning_rate": 4.916903494286186e-05, "loss": 0.1312, "mean_token_accuracy": 0.9689555406570435, "num_tokens": 63224033.0, "step": 23660 }, { "entropy": 0.07494805119931698, "epoch": 5.5163772001398765, "grad_norm": 1.6328125, "learning_rate": 4.9168487610912735e-05, "loss": 0.0874, "mean_token_accuracy": 0.9813202261924744, "num_tokens": 63245163.0, "step": 23665 }, { "entropy": 0.0639274563640356, "epoch": 5.517542837160509, "grad_norm": 1.0, "learning_rate": 4.916794010496891e-05, "loss": 0.0521, "mean_token_accuracy": 0.9858429729938507, "num_tokens": 63267004.0, "step": 23670 }, { "entropy": 0.08091693036258221, "epoch": 5.51870847418114, "grad_norm": 2.359375, "learning_rate": 4.916739242503855e-05, "loss": 0.0751, "mean_token_accuracy": 0.9779129147529602, "num_tokens": 63284406.0, "step": 23675 }, { "entropy": 0.055909703485667706, "epoch": 5.519874111201772, "grad_norm": 0.51953125, "learning_rate": 4.916684457112982e-05, "loss": 0.0489, "mean_token_accuracy": 0.985709261894226, "num_tokens": 63305734.0, "step": 23680 }, { "entropy": 0.06152990758419037, "epoch": 5.521039748222403, "grad_norm": 2.796875, "learning_rate": 4.916629654325088e-05, "loss": 0.06, "mean_token_accuracy": 0.9815901160240174, "num_tokens": 63318392.0, "step": 23685 }, { "entropy": 0.0552783340215683, "epoch": 5.522205385243035, "grad_norm": 1.5078125, "learning_rate": 4.9165748341409925e-05, "loss": 0.0527, "mean_token_accuracy": 0.9854685187339782, "num_tokens": 63336247.0, "step": 23690 }, { "entropy": 0.06256055925041437, "epoch": 5.523371022263667, "grad_norm": 0.7734375, "learning_rate": 4.916519996561511e-05, "loss": 0.0481, "mean_token_accuracy": 0.9842760384082794, "num_tokens": 63361742.0, "step": 23695 }, { "entropy": 0.07285189051181078, "epoch": 5.524536659284299, "grad_norm": 2.359375, "learning_rate": 4.916465141587462e-05, "loss": 0.0406, "mean_token_accuracy": 0.9825927436351776, "num_tokens": 63392024.0, "step": 23700 }, { "entropy": 0.06600981298834085, "epoch": 5.525702296304931, "grad_norm": 0.447265625, "learning_rate": 4.916410269219664e-05, "loss": 0.0289, "mean_token_accuracy": 0.9872182309627533, "num_tokens": 63425798.0, "step": 23705 }, { "entropy": 0.06496403273195028, "epoch": 5.526867933325562, "grad_norm": 1.8046875, "learning_rate": 4.916355379458933e-05, "loss": 0.0451, "mean_token_accuracy": 0.9795408308506012, "num_tokens": 63451911.0, "step": 23710 }, { "entropy": 0.08884159103035927, "epoch": 5.528033570346194, "grad_norm": 0.490234375, "learning_rate": 4.9163004723060894e-05, "loss": 0.0754, "mean_token_accuracy": 0.9773557484149933, "num_tokens": 63462839.0, "step": 23715 }, { "entropy": 0.07003546878695488, "epoch": 5.529199207366826, "grad_norm": 0.32421875, "learning_rate": 4.9162455477619517e-05, "loss": 0.0549, "mean_token_accuracy": 0.9811549305915832, "num_tokens": 63479583.0, "step": 23720 }, { "entropy": 0.06235072594136, "epoch": 5.530364844387458, "grad_norm": 1.109375, "learning_rate": 4.916190605827339e-05, "loss": 0.056, "mean_token_accuracy": 0.9822233319282532, "num_tokens": 63491907.0, "step": 23725 }, { "entropy": 0.06424791738390923, "epoch": 5.5315304814080895, "grad_norm": 3.03125, "learning_rate": 4.91613564650307e-05, "loss": 0.0396, "mean_token_accuracy": 0.9856864392757416, "num_tokens": 63509343.0, "step": 23730 }, { "entropy": 0.06360748754814267, "epoch": 5.5326961184287216, "grad_norm": 1.2109375, "learning_rate": 4.916080669789965e-05, "loss": 0.0364, "mean_token_accuracy": 0.9866265952587128, "num_tokens": 63530657.0, "step": 23735 }, { "entropy": 0.07913671238347889, "epoch": 5.533861755449353, "grad_norm": 0.486328125, "learning_rate": 4.916025675688843e-05, "loss": 0.077, "mean_token_accuracy": 0.9761590301990509, "num_tokens": 63549593.0, "step": 23740 }, { "entropy": 0.06635268032550812, "epoch": 5.535027392469985, "grad_norm": 1.3125, "learning_rate": 4.915970664200524e-05, "loss": 0.0418, "mean_token_accuracy": 0.9833239078521728, "num_tokens": 63565771.0, "step": 23745 }, { "entropy": 0.07817679923027754, "epoch": 5.536193029490617, "grad_norm": 4.125, "learning_rate": 4.91591563532583e-05, "loss": 0.0618, "mean_token_accuracy": 0.9776901960372925, "num_tokens": 63581604.0, "step": 23750 }, { "entropy": 0.09487223085016012, "epoch": 5.537358666511248, "grad_norm": 6.03125, "learning_rate": 4.915860589065579e-05, "loss": 0.0598, "mean_token_accuracy": 0.9834141552448272, "num_tokens": 63604443.0, "step": 23755 }, { "entropy": 0.12501354981213808, "epoch": 5.53852430353188, "grad_norm": 1.75, "learning_rate": 4.9158055254205934e-05, "loss": 0.1379, "mean_token_accuracy": 0.9683729887008667, "num_tokens": 63626224.0, "step": 23760 }, { "entropy": 0.058947160560637715, "epoch": 5.5396899405525115, "grad_norm": 0.5703125, "learning_rate": 4.915750444391694e-05, "loss": 0.0521, "mean_token_accuracy": 0.9822892725467682, "num_tokens": 63646271.0, "step": 23765 }, { "entropy": 0.07299381997436286, "epoch": 5.540855577573144, "grad_norm": 0.91015625, "learning_rate": 4.9156953459797024e-05, "loss": 0.0747, "mean_token_accuracy": 0.9818307161331177, "num_tokens": 63658385.0, "step": 23770 }, { "entropy": 0.06397927738726139, "epoch": 5.542021214593776, "grad_norm": 0.5, "learning_rate": 4.9156402301854395e-05, "loss": 0.0473, "mean_token_accuracy": 0.9837787330150605, "num_tokens": 63679376.0, "step": 23775 }, { "entropy": 0.06430526990443468, "epoch": 5.543186851614407, "grad_norm": 1.453125, "learning_rate": 4.915585097009727e-05, "loss": 0.0557, "mean_token_accuracy": 0.9817375063896179, "num_tokens": 63695652.0, "step": 23780 }, { "entropy": 0.08212286848574876, "epoch": 5.544352488635039, "grad_norm": 0.423828125, "learning_rate": 4.9155299464533886e-05, "loss": 0.0664, "mean_token_accuracy": 0.9791503429412842, "num_tokens": 63710844.0, "step": 23785 }, { "entropy": 0.07861670413985848, "epoch": 5.545518125655671, "grad_norm": 1.3671875, "learning_rate": 4.915474778517245e-05, "loss": 0.0593, "mean_token_accuracy": 0.9782687842845916, "num_tokens": 63729543.0, "step": 23790 }, { "entropy": 0.0697979001328349, "epoch": 5.546683762676302, "grad_norm": 0.61328125, "learning_rate": 4.9154195932021195e-05, "loss": 0.0705, "mean_token_accuracy": 0.9821433663368225, "num_tokens": 63742595.0, "step": 23795 }, { "entropy": 0.07056807223707437, "epoch": 5.5478493996969345, "grad_norm": 2.1875, "learning_rate": 4.9153643905088356e-05, "loss": 0.0828, "mean_token_accuracy": 0.979724007844925, "num_tokens": 63754975.0, "step": 23800 }, { "entropy": 0.06173288859426975, "epoch": 5.549015036717567, "grad_norm": 0.30078125, "learning_rate": 4.9153091704382154e-05, "loss": 0.0365, "mean_token_accuracy": 0.9860757470130921, "num_tokens": 63785355.0, "step": 23805 }, { "entropy": 0.06818353859707713, "epoch": 5.550180673738198, "grad_norm": 1.7265625, "learning_rate": 4.915253932991083e-05, "loss": 0.035, "mean_token_accuracy": 0.9851133286952972, "num_tokens": 63808617.0, "step": 23810 }, { "entropy": 0.08297501988708973, "epoch": 5.55134631075883, "grad_norm": 1.1875, "learning_rate": 4.9151986781682615e-05, "loss": 0.079, "mean_token_accuracy": 0.9793727993965149, "num_tokens": 63825558.0, "step": 23815 }, { "entropy": 0.10554131586104631, "epoch": 5.552511947779461, "grad_norm": 0.99609375, "learning_rate": 4.9151434059705745e-05, "loss": 0.0688, "mean_token_accuracy": 0.9800882160663604, "num_tokens": 63846822.0, "step": 23820 }, { "entropy": 0.06566274948418141, "epoch": 5.553677584800093, "grad_norm": 1.953125, "learning_rate": 4.9150881163988484e-05, "loss": 0.0818, "mean_token_accuracy": 0.9768719375133514, "num_tokens": 63864897.0, "step": 23825 }, { "entropy": 0.05825161384418607, "epoch": 5.554843221820725, "grad_norm": 2.1875, "learning_rate": 4.915032809453905e-05, "loss": 0.0797, "mean_token_accuracy": 0.9830494523048401, "num_tokens": 63885269.0, "step": 23830 }, { "entropy": 0.05645349100232124, "epoch": 5.5560088588413565, "grad_norm": 1.8125, "learning_rate": 4.91497748513657e-05, "loss": 0.0418, "mean_token_accuracy": 0.9863303542137146, "num_tokens": 63901941.0, "step": 23835 }, { "entropy": 0.06523125125095248, "epoch": 5.557174495861989, "grad_norm": 0.47265625, "learning_rate": 4.914922143447669e-05, "loss": 0.0544, "mean_token_accuracy": 0.9860906541347504, "num_tokens": 63918098.0, "step": 23840 }, { "entropy": 0.11673079691827297, "epoch": 5.55834013288262, "grad_norm": 1.4921875, "learning_rate": 4.9148667843880266e-05, "loss": 0.0873, "mean_token_accuracy": 0.9748226165771484, "num_tokens": 63927237.0, "step": 23845 }, { "entropy": 0.06276131253689528, "epoch": 5.559505769903252, "grad_norm": 1.0625, "learning_rate": 4.9148114079584684e-05, "loss": 0.0505, "mean_token_accuracy": 0.9863556146621704, "num_tokens": 63945205.0, "step": 23850 }, { "entropy": 0.06383510492742062, "epoch": 5.560671406923884, "grad_norm": 1.5546875, "learning_rate": 4.91475601415982e-05, "loss": 0.0725, "mean_token_accuracy": 0.9803566753864288, "num_tokens": 63956982.0, "step": 23855 }, { "entropy": 0.06630421318113804, "epoch": 5.561837043944516, "grad_norm": 3.546875, "learning_rate": 4.9147006029929074e-05, "loss": 0.0679, "mean_token_accuracy": 0.9794648051261902, "num_tokens": 63974757.0, "step": 23860 }, { "entropy": 0.07054558731615543, "epoch": 5.563002680965147, "grad_norm": 1.7890625, "learning_rate": 4.914645174458557e-05, "loss": 0.0624, "mean_token_accuracy": 0.9827561438083648, "num_tokens": 63986243.0, "step": 23865 }, { "entropy": 0.05838492456823587, "epoch": 5.5641683179857795, "grad_norm": 1.4140625, "learning_rate": 4.914589728557595e-05, "loss": 0.042, "mean_token_accuracy": 0.9874827086925506, "num_tokens": 64005214.0, "step": 23870 }, { "entropy": 0.07171590086072684, "epoch": 5.565333955006411, "grad_norm": 0.88671875, "learning_rate": 4.914534265290849e-05, "loss": 0.0735, "mean_token_accuracy": 0.9795938551425933, "num_tokens": 64041250.0, "step": 23875 }, { "entropy": 0.10566460490226745, "epoch": 5.566499592027043, "grad_norm": 1.4140625, "learning_rate": 4.914478784659146e-05, "loss": 0.0637, "mean_token_accuracy": 0.9845397114753723, "num_tokens": 64066255.0, "step": 23880 }, { "entropy": 0.0610381668433547, "epoch": 5.567665229047675, "grad_norm": 1.7734375, "learning_rate": 4.9144232866633124e-05, "loss": 0.0487, "mean_token_accuracy": 0.9841365456581116, "num_tokens": 64082043.0, "step": 23885 }, { "entropy": 0.074870290979743, "epoch": 5.568830866068306, "grad_norm": 0.236328125, "learning_rate": 4.9143677713041766e-05, "loss": 0.0691, "mean_token_accuracy": 0.9827987253665924, "num_tokens": 64103120.0, "step": 23890 }, { "entropy": 0.057516278512775895, "epoch": 5.569996503088938, "grad_norm": 0.490234375, "learning_rate": 4.914312238582565e-05, "loss": 0.0496, "mean_token_accuracy": 0.9857203364372253, "num_tokens": 64119929.0, "step": 23895 }, { "entropy": 0.0727980025112629, "epoch": 5.5711621401095694, "grad_norm": 2.953125, "learning_rate": 4.9142566884993074e-05, "loss": 0.0572, "mean_token_accuracy": 0.980837094783783, "num_tokens": 64138984.0, "step": 23900 }, { "entropy": 0.05668035298585892, "epoch": 5.5723277771302016, "grad_norm": 0.357421875, "learning_rate": 4.9142011210552314e-05, "loss": 0.0548, "mean_token_accuracy": 0.9852408468723297, "num_tokens": 64161740.0, "step": 23905 }, { "entropy": 0.10799715518951417, "epoch": 5.573493414150834, "grad_norm": 4.09375, "learning_rate": 4.914145536251166e-05, "loss": 0.1483, "mean_token_accuracy": 0.9697591483592987, "num_tokens": 64184455.0, "step": 23910 }, { "entropy": 0.08060030173510313, "epoch": 5.574659051171465, "grad_norm": 2.03125, "learning_rate": 4.914089934087939e-05, "loss": 0.0585, "mean_token_accuracy": 0.9793602645397186, "num_tokens": 64199983.0, "step": 23915 }, { "entropy": 0.0592457982711494, "epoch": 5.575824688192097, "grad_norm": 0.74609375, "learning_rate": 4.91403431456638e-05, "loss": 0.039, "mean_token_accuracy": 0.987294489145279, "num_tokens": 64223559.0, "step": 23920 }, { "entropy": 0.07702452093362808, "epoch": 5.576990325212729, "grad_norm": 1.359375, "learning_rate": 4.913978677687319e-05, "loss": 0.0661, "mean_token_accuracy": 0.9822618186473846, "num_tokens": 64236810.0, "step": 23925 }, { "entropy": 0.08921125689521432, "epoch": 5.57815596223336, "grad_norm": 1.8984375, "learning_rate": 4.913923023451585e-05, "loss": 0.0902, "mean_token_accuracy": 0.9755136132240295, "num_tokens": 64259739.0, "step": 23930 }, { "entropy": 0.0405919854529202, "epoch": 5.579321599253992, "grad_norm": 0.98046875, "learning_rate": 4.9138673518600086e-05, "loss": 0.0294, "mean_token_accuracy": 0.9920165717601777, "num_tokens": 64298571.0, "step": 23935 }, { "entropy": 0.06870214790105819, "epoch": 5.5804872362746245, "grad_norm": 2.34375, "learning_rate": 4.9138116629134196e-05, "loss": 0.0721, "mean_token_accuracy": 0.9808136463165283, "num_tokens": 64312476.0, "step": 23940 }, { "entropy": 0.070391511823982, "epoch": 5.581652873295256, "grad_norm": 1.203125, "learning_rate": 4.913755956612648e-05, "loss": 0.0527, "mean_token_accuracy": 0.984083354473114, "num_tokens": 64331356.0, "step": 23945 }, { "entropy": 0.08463108614087105, "epoch": 5.582818510315888, "grad_norm": 0.8515625, "learning_rate": 4.913700232958524e-05, "loss": 0.1035, "mean_token_accuracy": 0.9746655583381653, "num_tokens": 64340876.0, "step": 23950 }, { "entropy": 0.06346419043838977, "epoch": 5.583984147336519, "grad_norm": 0.921875, "learning_rate": 4.91364449195188e-05, "loss": 0.036, "mean_token_accuracy": 0.9880404353141785, "num_tokens": 64370087.0, "step": 23955 }, { "entropy": 0.09586155414581299, "epoch": 5.585149784357151, "grad_norm": 2.96875, "learning_rate": 4.913588733593546e-05, "loss": 0.0695, "mean_token_accuracy": 0.9790465831756592, "num_tokens": 64382026.0, "step": 23960 }, { "entropy": 0.06308048740029334, "epoch": 5.586315421377783, "grad_norm": 0.361328125, "learning_rate": 4.9135329578843535e-05, "loss": 0.0527, "mean_token_accuracy": 0.9852202475070954, "num_tokens": 64397827.0, "step": 23965 }, { "entropy": 0.07038025036454201, "epoch": 5.5874810583984145, "grad_norm": 0.98828125, "learning_rate": 4.913477164825135e-05, "loss": 0.0717, "mean_token_accuracy": 0.9811891615390778, "num_tokens": 64418130.0, "step": 23970 }, { "entropy": 0.08210363052785397, "epoch": 5.588646695419047, "grad_norm": 2.125, "learning_rate": 4.913421354416722e-05, "loss": 0.068, "mean_token_accuracy": 0.9791256606578826, "num_tokens": 64430360.0, "step": 23975 }, { "entropy": 0.05975691732019186, "epoch": 5.589812332439678, "grad_norm": 2.0, "learning_rate": 4.913365526659946e-05, "loss": 0.0587, "mean_token_accuracy": 0.9852451801300048, "num_tokens": 64454270.0, "step": 23980 }, { "entropy": 0.0696345467120409, "epoch": 5.59097796946031, "grad_norm": 0.46484375, "learning_rate": 4.913309681555642e-05, "loss": 0.039, "mean_token_accuracy": 0.9870121002197265, "num_tokens": 64476946.0, "step": 23985 }, { "entropy": 0.07908033691346646, "epoch": 5.592143606480942, "grad_norm": 0.609375, "learning_rate": 4.913253819104639e-05, "loss": 0.0653, "mean_token_accuracy": 0.9817940413951873, "num_tokens": 64499294.0, "step": 23990 }, { "entropy": 0.06197888310998678, "epoch": 5.593309243501574, "grad_norm": 0.796875, "learning_rate": 4.9131979393077734e-05, "loss": 0.0425, "mean_token_accuracy": 0.9863790690898895, "num_tokens": 64517333.0, "step": 23995 }, { "entropy": 0.07623457312583923, "epoch": 5.594474880522205, "grad_norm": 2.421875, "learning_rate": 4.9131420421658764e-05, "loss": 0.0801, "mean_token_accuracy": 0.9793197691440583, "num_tokens": 64550934.0, "step": 24000 }, { "entropy": 0.05599022675305605, "epoch": 5.595640517542837, "grad_norm": 0.408203125, "learning_rate": 4.913086127679782e-05, "loss": 0.0324, "mean_token_accuracy": 0.9851845264434814, "num_tokens": 64580196.0, "step": 24005 }, { "entropy": 0.07400658950209618, "epoch": 5.596806154563469, "grad_norm": 3.375, "learning_rate": 4.913030195850324e-05, "loss": 0.0727, "mean_token_accuracy": 0.9823880016803741, "num_tokens": 64591487.0, "step": 24010 }, { "entropy": 0.06357642374932766, "epoch": 5.597971791584101, "grad_norm": 0.142578125, "learning_rate": 4.9129742466783364e-05, "loss": 0.037, "mean_token_accuracy": 0.9890712201595306, "num_tokens": 64619890.0, "step": 24015 }, { "entropy": 0.08721558898687362, "epoch": 5.599137428604733, "grad_norm": 0.9453125, "learning_rate": 4.912918280164654e-05, "loss": 0.0571, "mean_token_accuracy": 0.9823896586894989, "num_tokens": 64634109.0, "step": 24020 }, { "entropy": 0.07920245192945004, "epoch": 5.600303065625364, "grad_norm": 0.3203125, "learning_rate": 4.91286229631011e-05, "loss": 0.0529, "mean_token_accuracy": 0.9839257597923279, "num_tokens": 64656244.0, "step": 24025 }, { "entropy": 0.07264061979949474, "epoch": 5.601468702645996, "grad_norm": 0.7578125, "learning_rate": 4.912806295115541e-05, "loss": 0.0694, "mean_token_accuracy": 0.9849851489067077, "num_tokens": 64667128.0, "step": 24030 }, { "entropy": 0.06038454240188003, "epoch": 5.602634339666627, "grad_norm": 0.1552734375, "learning_rate": 4.9127502765817814e-05, "loss": 0.0374, "mean_token_accuracy": 0.9803289711475373, "num_tokens": 64690511.0, "step": 24035 }, { "entropy": 0.07674487382173538, "epoch": 5.6037999766872595, "grad_norm": 2.328125, "learning_rate": 4.912694240709665e-05, "loss": 0.0843, "mean_token_accuracy": 0.978994345664978, "num_tokens": 64710216.0, "step": 24040 }, { "entropy": 0.06889344537630678, "epoch": 5.604965613707892, "grad_norm": 0.8203125, "learning_rate": 4.91263818750003e-05, "loss": 0.0371, "mean_token_accuracy": 0.9858274102210999, "num_tokens": 64732135.0, "step": 24045 }, { "entropy": 0.06218187240883708, "epoch": 5.606131250728523, "grad_norm": 1.359375, "learning_rate": 4.912582116953711e-05, "loss": 0.0547, "mean_token_accuracy": 0.9839303433895111, "num_tokens": 64760928.0, "step": 24050 }, { "entropy": 0.06635083290748298, "epoch": 5.607296887749155, "grad_norm": 1.25, "learning_rate": 4.912526029071543e-05, "loss": 0.0483, "mean_token_accuracy": 0.983081066608429, "num_tokens": 64780021.0, "step": 24055 }, { "entropy": 0.07519938629120589, "epoch": 5.608462524769787, "grad_norm": 3.34375, "learning_rate": 4.912469923854364e-05, "loss": 0.0681, "mean_token_accuracy": 0.981386798620224, "num_tokens": 64794223.0, "step": 24060 }, { "entropy": 0.08381996154785157, "epoch": 5.609628161790418, "grad_norm": 2.15625, "learning_rate": 4.9124138013030094e-05, "loss": 0.084, "mean_token_accuracy": 0.9787656486034393, "num_tokens": 64804725.0, "step": 24065 }, { "entropy": 0.04471158161759377, "epoch": 5.61079379881105, "grad_norm": 0.1953125, "learning_rate": 4.912357661418317e-05, "loss": 0.027, "mean_token_accuracy": 0.9886382281780243, "num_tokens": 64834084.0, "step": 24070 }, { "entropy": 0.08546677436679602, "epoch": 5.611959435831682, "grad_norm": 0.8046875, "learning_rate": 4.912301504201124e-05, "loss": 0.0574, "mean_token_accuracy": 0.9757747769355773, "num_tokens": 64854352.0, "step": 24075 }, { "entropy": 0.07056930642575025, "epoch": 5.613125072852314, "grad_norm": 3.671875, "learning_rate": 4.912245329652267e-05, "loss": 0.0639, "mean_token_accuracy": 0.9803074657917022, "num_tokens": 64873354.0, "step": 24080 }, { "entropy": 0.07950994074344635, "epoch": 5.614290709872946, "grad_norm": 0.80078125, "learning_rate": 4.9121891377725835e-05, "loss": 0.0858, "mean_token_accuracy": 0.9795065879821777, "num_tokens": 64883819.0, "step": 24085 }, { "entropy": 0.05531720239669084, "epoch": 5.615456346893577, "grad_norm": 1.078125, "learning_rate": 4.9121329285629125e-05, "loss": 0.041, "mean_token_accuracy": 0.988337516784668, "num_tokens": 64898970.0, "step": 24090 }, { "entropy": 0.07124143727123737, "epoch": 5.616621983914209, "grad_norm": 4.03125, "learning_rate": 4.912076702024092e-05, "loss": 0.0741, "mean_token_accuracy": 0.9810739636421204, "num_tokens": 64914653.0, "step": 24095 }, { "entropy": 0.09241018267348408, "epoch": 5.617787620934841, "grad_norm": 2.328125, "learning_rate": 4.91202045815696e-05, "loss": 0.095, "mean_token_accuracy": 0.9772006750106812, "num_tokens": 64927604.0, "step": 24100 }, { "entropy": 0.05182990748435259, "epoch": 5.618953257955472, "grad_norm": 2.21875, "learning_rate": 4.911964196962354e-05, "loss": 0.0383, "mean_token_accuracy": 0.9875986397266387, "num_tokens": 64949566.0, "step": 24105 }, { "entropy": 0.08643919341266156, "epoch": 5.6201188949761045, "grad_norm": 3.390625, "learning_rate": 4.9119079184411146e-05, "loss": 0.087, "mean_token_accuracy": 0.9766907870769501, "num_tokens": 64958339.0, "step": 24110 }, { "entropy": 0.11621445324271917, "epoch": 5.621284531996736, "grad_norm": 0.41015625, "learning_rate": 4.911851622594081e-05, "loss": 0.1477, "mean_token_accuracy": 0.9691729426383973, "num_tokens": 64991088.0, "step": 24115 }, { "entropy": 0.06573995789512992, "epoch": 5.622450169017368, "grad_norm": 0.2451171875, "learning_rate": 4.911795309422092e-05, "loss": 0.0704, "mean_token_accuracy": 0.9805086791515351, "num_tokens": 65008055.0, "step": 24120 }, { "entropy": 0.04539413256570697, "epoch": 5.623615806038, "grad_norm": 0.71484375, "learning_rate": 4.9117389789259874e-05, "loss": 0.0336, "mean_token_accuracy": 0.9910531222820282, "num_tokens": 65048328.0, "step": 24125 }, { "entropy": 0.07581941662356258, "epoch": 5.624781443058632, "grad_norm": 1.5390625, "learning_rate": 4.9116826311066075e-05, "loss": 0.1022, "mean_token_accuracy": 0.9789832353591919, "num_tokens": 65072881.0, "step": 24130 }, { "entropy": 0.0626650300808251, "epoch": 5.625947080079263, "grad_norm": 2.046875, "learning_rate": 4.911626265964792e-05, "loss": 0.0612, "mean_token_accuracy": 0.9832917273044586, "num_tokens": 65095466.0, "step": 24135 }, { "entropy": 0.06437070518732071, "epoch": 5.627112717099895, "grad_norm": 1.078125, "learning_rate": 4.9115698835013823e-05, "loss": 0.0539, "mean_token_accuracy": 0.9849261939525604, "num_tokens": 65110121.0, "step": 24140 }, { "entropy": 0.06489365249872207, "epoch": 5.628278354120527, "grad_norm": 0.515625, "learning_rate": 4.9115134837172176e-05, "loss": 0.0466, "mean_token_accuracy": 0.98541299700737, "num_tokens": 65141686.0, "step": 24145 }, { "entropy": 0.0634211104363203, "epoch": 5.629443991141159, "grad_norm": 0.77734375, "learning_rate": 4.9114570666131406e-05, "loss": 0.0446, "mean_token_accuracy": 0.9839524507522583, "num_tokens": 65159098.0, "step": 24150 }, { "entropy": 0.04414686663076282, "epoch": 5.630609628161791, "grad_norm": 0.19921875, "learning_rate": 4.911400632189991e-05, "loss": 0.0129, "mean_token_accuracy": 0.9903716087341309, "num_tokens": 65193723.0, "step": 24155 }, { "entropy": 0.06207293402403593, "epoch": 5.631775265182422, "grad_norm": 1.4765625, "learning_rate": 4.911344180448612e-05, "loss": 0.0396, "mean_token_accuracy": 0.9857115924358368, "num_tokens": 65213189.0, "step": 24160 }, { "entropy": 0.06852437127381564, "epoch": 5.632940902203054, "grad_norm": 0.48828125, "learning_rate": 4.911287711389844e-05, "loss": 0.0622, "mean_token_accuracy": 0.9851773381233215, "num_tokens": 65225625.0, "step": 24165 }, { "entropy": 0.07459627948701382, "epoch": 5.634106539223685, "grad_norm": 1.3828125, "learning_rate": 4.9112312250145296e-05, "loss": 0.0729, "mean_token_accuracy": 0.9799397945404053, "num_tokens": 65253579.0, "step": 24170 }, { "entropy": 0.09025459066033363, "epoch": 5.635272176244317, "grad_norm": 3.734375, "learning_rate": 4.911174721323512e-05, "loss": 0.1323, "mean_token_accuracy": 0.969058758020401, "num_tokens": 65260612.0, "step": 24175 }, { "entropy": 0.060559450881555676, "epoch": 5.6364378132649495, "grad_norm": 0.4296875, "learning_rate": 4.9111182003176315e-05, "loss": 0.0331, "mean_token_accuracy": 0.9849679410457611, "num_tokens": 65290280.0, "step": 24180 }, { "entropy": 0.07913712412118912, "epoch": 5.637603450285581, "grad_norm": 0.51953125, "learning_rate": 4.911061661997733e-05, "loss": 0.0844, "mean_token_accuracy": 0.9759842038154602, "num_tokens": 65305469.0, "step": 24185 }, { "entropy": 0.06736504193395376, "epoch": 5.638769087306213, "grad_norm": 0.75390625, "learning_rate": 4.9110051063646586e-05, "loss": 0.0443, "mean_token_accuracy": 0.9866178870201111, "num_tokens": 65330611.0, "step": 24190 }, { "entropy": 0.08924924843013286, "epoch": 5.639934724326845, "grad_norm": 3.59375, "learning_rate": 4.910948533419251e-05, "loss": 0.092, "mean_token_accuracy": 0.9765866577625275, "num_tokens": 65339161.0, "step": 24195 }, { "entropy": 0.09531253166496753, "epoch": 5.641100361347476, "grad_norm": 2.171875, "learning_rate": 4.910891943162356e-05, "loss": 0.0851, "mean_token_accuracy": 0.9777402102947235, "num_tokens": 65360121.0, "step": 24200 }, { "entropy": 0.07371218502521515, "epoch": 5.642265998368108, "grad_norm": 0.47265625, "learning_rate": 4.910835335594815e-05, "loss": 0.067, "mean_token_accuracy": 0.9795070946216583, "num_tokens": 65374365.0, "step": 24205 }, { "entropy": 0.06099022421985865, "epoch": 5.64343163538874, "grad_norm": 1.140625, "learning_rate": 4.9107787107174735e-05, "loss": 0.0241, "mean_token_accuracy": 0.9872618436813354, "num_tokens": 65398143.0, "step": 24210 }, { "entropy": 0.06053253598511219, "epoch": 5.644597272409372, "grad_norm": 4.53125, "learning_rate": 4.9107220685311756e-05, "loss": 0.0545, "mean_token_accuracy": 0.985034990310669, "num_tokens": 65413147.0, "step": 24215 }, { "entropy": 0.05846654055640101, "epoch": 5.645762909430004, "grad_norm": 2.515625, "learning_rate": 4.910665409036765e-05, "loss": 0.0443, "mean_token_accuracy": 0.983678150177002, "num_tokens": 65438930.0, "step": 24220 }, { "entropy": 0.056470193434506655, "epoch": 5.646928546450635, "grad_norm": 0.1728515625, "learning_rate": 4.910608732235089e-05, "loss": 0.0507, "mean_token_accuracy": 0.9855823874473572, "num_tokens": 65467306.0, "step": 24225 }, { "entropy": 0.0693120218347758, "epoch": 5.648094183471267, "grad_norm": 2.609375, "learning_rate": 4.91055203812699e-05, "loss": 0.0758, "mean_token_accuracy": 0.9780898749828338, "num_tokens": 65485922.0, "step": 24230 }, { "entropy": 0.08311540931463242, "epoch": 5.649259820491899, "grad_norm": 1.3828125, "learning_rate": 4.910495326713315e-05, "loss": 0.0886, "mean_token_accuracy": 0.978338223695755, "num_tokens": 65509037.0, "step": 24235 }, { "entropy": 0.06125993989408016, "epoch": 5.65042545751253, "grad_norm": 2.5, "learning_rate": 4.910438597994909e-05, "loss": 0.0538, "mean_token_accuracy": 0.9835680603981019, "num_tokens": 65524153.0, "step": 24240 }, { "entropy": 0.06852566041052341, "epoch": 5.651591094533162, "grad_norm": 0.56640625, "learning_rate": 4.910381851972618e-05, "loss": 0.0601, "mean_token_accuracy": 0.985653680562973, "num_tokens": 65536747.0, "step": 24245 }, { "entropy": 0.06987055232748389, "epoch": 5.652756731553794, "grad_norm": 0.392578125, "learning_rate": 4.9103250886472886e-05, "loss": 0.0751, "mean_token_accuracy": 0.9799535036087036, "num_tokens": 65556692.0, "step": 24250 }, { "entropy": 0.05222968608140945, "epoch": 5.653922368574426, "grad_norm": 2.328125, "learning_rate": 4.910268308019766e-05, "loss": 0.039, "mean_token_accuracy": 0.9869933426380157, "num_tokens": 65583964.0, "step": 24255 }, { "entropy": 0.06949933720752596, "epoch": 5.655088005595058, "grad_norm": 3.59375, "learning_rate": 4.910211510090899e-05, "loss": 0.077, "mean_token_accuracy": 0.9771763205528259, "num_tokens": 65599854.0, "step": 24260 }, { "entropy": 0.0793121935799718, "epoch": 5.65625364261569, "grad_norm": 1.875, "learning_rate": 4.910154694861533e-05, "loss": 0.0734, "mean_token_accuracy": 0.978898000717163, "num_tokens": 65613891.0, "step": 24265 }, { "entropy": 0.05992090366780758, "epoch": 5.657419279636321, "grad_norm": 2.28125, "learning_rate": 4.910097862332515e-05, "loss": 0.0604, "mean_token_accuracy": 0.9835952401161194, "num_tokens": 65642481.0, "step": 24270 }, { "entropy": 0.056962525472044945, "epoch": 5.658584916656953, "grad_norm": 2.609375, "learning_rate": 4.9100410125046934e-05, "loss": 0.0559, "mean_token_accuracy": 0.9861100137233734, "num_tokens": 65665910.0, "step": 24275 }, { "entropy": 0.0507665459997952, "epoch": 5.6597505536775845, "grad_norm": 0.8125, "learning_rate": 4.909984145378915e-05, "loss": 0.0314, "mean_token_accuracy": 0.986362773180008, "num_tokens": 65711085.0, "step": 24280 }, { "entropy": 0.08935830974951386, "epoch": 5.660916190698217, "grad_norm": 0.23828125, "learning_rate": 4.909927260956029e-05, "loss": 0.1072, "mean_token_accuracy": 0.9721687018871308, "num_tokens": 65735010.0, "step": 24285 }, { "entropy": 0.07759722573682666, "epoch": 5.662081827718849, "grad_norm": 0.251953125, "learning_rate": 4.9098703592368825e-05, "loss": 0.0358, "mean_token_accuracy": 0.9860342264175415, "num_tokens": 65757801.0, "step": 24290 }, { "entropy": 0.06456798985600472, "epoch": 5.66324746473948, "grad_norm": 0.2314453125, "learning_rate": 4.909813440222325e-05, "loss": 0.0502, "mean_token_accuracy": 0.9860153675079346, "num_tokens": 65790889.0, "step": 24295 }, { "entropy": 0.06786645632237195, "epoch": 5.664413101760112, "grad_norm": 0.37890625, "learning_rate": 4.9097565039132034e-05, "loss": 0.0504, "mean_token_accuracy": 0.9867601275444031, "num_tokens": 65810793.0, "step": 24300 }, { "entropy": 0.17485631480813027, "epoch": 5.665578738780743, "grad_norm": 1.7578125, "learning_rate": 4.909699550310369e-05, "loss": 0.34, "mean_token_accuracy": 0.95076944231987, "num_tokens": 65826864.0, "step": 24305 }, { "entropy": 0.07269075997173786, "epoch": 5.666744375801375, "grad_norm": 0.33203125, "learning_rate": 4.90964257941467e-05, "loss": 0.0508, "mean_token_accuracy": 0.9866540789604187, "num_tokens": 65845292.0, "step": 24310 }, { "entropy": 0.07192633729428052, "epoch": 5.6679100128220075, "grad_norm": 3.703125, "learning_rate": 4.909585591226956e-05, "loss": 0.0837, "mean_token_accuracy": 0.9803862571716309, "num_tokens": 65861238.0, "step": 24315 }, { "entropy": 0.05109905377030373, "epoch": 5.669075649842639, "grad_norm": 1.4921875, "learning_rate": 4.909528585748076e-05, "loss": 0.0379, "mean_token_accuracy": 0.9892950415611267, "num_tokens": 65890623.0, "step": 24320 }, { "entropy": 0.06906082537025213, "epoch": 5.670241286863271, "grad_norm": 4.40625, "learning_rate": 4.9094715629788814e-05, "loss": 0.0731, "mean_token_accuracy": 0.9789024889469147, "num_tokens": 65914480.0, "step": 24325 }, { "entropy": 0.057262393087148665, "epoch": 5.671406923883903, "grad_norm": 1.03125, "learning_rate": 4.9094145229202214e-05, "loss": 0.0688, "mean_token_accuracy": 0.9827243983745575, "num_tokens": 65941588.0, "step": 24330 }, { "entropy": 0.0782860929146409, "epoch": 5.672572560904534, "grad_norm": 0.90625, "learning_rate": 4.9093574655729475e-05, "loss": 0.0694, "mean_token_accuracy": 0.9805294811725617, "num_tokens": 65953294.0, "step": 24335 }, { "entropy": 0.05869090519845486, "epoch": 5.673738197925166, "grad_norm": 0.1943359375, "learning_rate": 4.90930039093791e-05, "loss": 0.0613, "mean_token_accuracy": 0.9824559628963471, "num_tokens": 65981631.0, "step": 24340 }, { "entropy": 0.08221575478091836, "epoch": 5.674903834945798, "grad_norm": 4.46875, "learning_rate": 4.909243299015959e-05, "loss": 0.0797, "mean_token_accuracy": 0.9766084969043731, "num_tokens": 66002579.0, "step": 24345 }, { "entropy": 0.056899439869448544, "epoch": 5.6760694719664295, "grad_norm": 0.400390625, "learning_rate": 4.909186189807948e-05, "loss": 0.0522, "mean_token_accuracy": 0.9842858016490936, "num_tokens": 66040407.0, "step": 24350 }, { "entropy": 0.06606799438595772, "epoch": 5.677235108987062, "grad_norm": 0.9765625, "learning_rate": 4.909129063314727e-05, "loss": 0.0535, "mean_token_accuracy": 0.9835278391838074, "num_tokens": 66053760.0, "step": 24355 }, { "entropy": 0.07696728855371475, "epoch": 5.678400746007693, "grad_norm": 1.3203125, "learning_rate": 4.909071919537148e-05, "loss": 0.0787, "mean_token_accuracy": 0.976880544424057, "num_tokens": 66064281.0, "step": 24360 }, { "entropy": 0.06062213182449341, "epoch": 5.679566383028325, "grad_norm": 0.7421875, "learning_rate": 4.9090147584760635e-05, "loss": 0.0449, "mean_token_accuracy": 0.988343334197998, "num_tokens": 66078714.0, "step": 24365 }, { "entropy": 0.07671241629868746, "epoch": 5.680732020048957, "grad_norm": 2.59375, "learning_rate": 4.908957580132326e-05, "loss": 0.0796, "mean_token_accuracy": 0.9792423665523529, "num_tokens": 66112610.0, "step": 24370 }, { "entropy": 0.05066694635897875, "epoch": 5.681897657069588, "grad_norm": 0.6171875, "learning_rate": 4.908900384506787e-05, "loss": 0.0239, "mean_token_accuracy": 0.9861576557159424, "num_tokens": 66142200.0, "step": 24375 }, { "entropy": 0.05256279185414314, "epoch": 5.68306329409022, "grad_norm": 0.25390625, "learning_rate": 4.908843171600301e-05, "loss": 0.0493, "mean_token_accuracy": 0.9895726382732392, "num_tokens": 66170098.0, "step": 24380 }, { "entropy": 0.08458961248397827, "epoch": 5.684228931110852, "grad_norm": 0.5546875, "learning_rate": 4.90878594141372e-05, "loss": 0.0625, "mean_token_accuracy": 0.9803022623062134, "num_tokens": 66189130.0, "step": 24385 }, { "entropy": 0.07308434527367354, "epoch": 5.685394568131484, "grad_norm": 1.7421875, "learning_rate": 4.908728693947898e-05, "loss": 0.0579, "mean_token_accuracy": 0.9815151989459991, "num_tokens": 66202993.0, "step": 24390 }, { "entropy": 0.051412354689091444, "epoch": 5.686560205152116, "grad_norm": 0.76953125, "learning_rate": 4.908671429203687e-05, "loss": 0.0537, "mean_token_accuracy": 0.9856079936027526, "num_tokens": 66232433.0, "step": 24395 }, { "entropy": 0.054090891033411026, "epoch": 5.687725842172748, "grad_norm": 0.9453125, "learning_rate": 4.908614147181944e-05, "loss": 0.0492, "mean_token_accuracy": 0.9889610230922699, "num_tokens": 66261764.0, "step": 24400 }, { "entropy": 0.06562711736187339, "epoch": 5.688891479193379, "grad_norm": 0.353515625, "learning_rate": 4.908556847883521e-05, "loss": 0.0472, "mean_token_accuracy": 0.9882732093334198, "num_tokens": 66288006.0, "step": 24405 }, { "entropy": 0.18064768142066895, "epoch": 5.690057116214011, "grad_norm": 0.306640625, "learning_rate": 4.908499531309272e-05, "loss": 0.3302, "mean_token_accuracy": 0.9339430838823318, "num_tokens": 66316631.0, "step": 24410 }, { "entropy": 0.10602510198950768, "epoch": 5.691222753234642, "grad_norm": 0.74609375, "learning_rate": 4.908442197460053e-05, "loss": 0.1215, "mean_token_accuracy": 0.9769471049308777, "num_tokens": 66336224.0, "step": 24415 }, { "entropy": 0.06400628378614784, "epoch": 5.6923883902552745, "grad_norm": 0.291015625, "learning_rate": 4.908384846336719e-05, "loss": 0.0554, "mean_token_accuracy": 0.9797611176967621, "num_tokens": 66353209.0, "step": 24420 }, { "entropy": 0.07065552687272429, "epoch": 5.693554027275907, "grad_norm": 1.5625, "learning_rate": 4.9083274779401236e-05, "loss": 0.0472, "mean_token_accuracy": 0.9867741584777832, "num_tokens": 66369700.0, "step": 24425 }, { "entropy": 0.07249236945062876, "epoch": 5.694719664296538, "grad_norm": 1.109375, "learning_rate": 4.908270092271124e-05, "loss": 0.0446, "mean_token_accuracy": 0.9849403977394104, "num_tokens": 66385062.0, "step": 24430 }, { "entropy": 0.07194965444505215, "epoch": 5.69588530131717, "grad_norm": 0.53125, "learning_rate": 4.908212689330575e-05, "loss": 0.0627, "mean_token_accuracy": 0.9791293621063233, "num_tokens": 66402553.0, "step": 24435 }, { "entropy": 0.06428283378481865, "epoch": 5.697050938337801, "grad_norm": 0.283203125, "learning_rate": 4.908155269119333e-05, "loss": 0.0495, "mean_token_accuracy": 0.9872363924980163, "num_tokens": 66428555.0, "step": 24440 }, { "entropy": 0.053512285463511944, "epoch": 5.698216575358433, "grad_norm": 1.09375, "learning_rate": 4.908097831638253e-05, "loss": 0.0335, "mean_token_accuracy": 0.9889600455760956, "num_tokens": 66450496.0, "step": 24445 }, { "entropy": 0.03742195102386177, "epoch": 5.699382212379065, "grad_norm": 0.376953125, "learning_rate": 4.9080403768881934e-05, "loss": 0.0243, "mean_token_accuracy": 0.9910203576087951, "num_tokens": 66497427.0, "step": 24450 }, { "entropy": 0.08320451527833939, "epoch": 5.700547849399697, "grad_norm": 3.390625, "learning_rate": 4.90798290487001e-05, "loss": 0.132, "mean_token_accuracy": 0.9729970693588257, "num_tokens": 66529137.0, "step": 24455 }, { "entropy": 0.07671122914180159, "epoch": 5.701713486420329, "grad_norm": 1.8125, "learning_rate": 4.9079254155845596e-05, "loss": 0.0636, "mean_token_accuracy": 0.976071572303772, "num_tokens": 66545793.0, "step": 24460 }, { "entropy": 0.0630271015688777, "epoch": 5.702879123440961, "grad_norm": 0.54296875, "learning_rate": 4.9078679090326995e-05, "loss": 0.0664, "mean_token_accuracy": 0.9832329630851746, "num_tokens": 66565804.0, "step": 24465 }, { "entropy": 0.04792722817510366, "epoch": 5.704044760461592, "grad_norm": 0.8828125, "learning_rate": 4.907810385215287e-05, "loss": 0.0242, "mean_token_accuracy": 0.989013385772705, "num_tokens": 66598404.0, "step": 24470 }, { "entropy": 0.08894434310495854, "epoch": 5.705210397482224, "grad_norm": 4.53125, "learning_rate": 4.907752844133181e-05, "loss": 0.0824, "mean_token_accuracy": 0.9780226051807404, "num_tokens": 66608057.0, "step": 24475 }, { "entropy": 0.08381523173302412, "epoch": 5.706376034502856, "grad_norm": 1.4375, "learning_rate": 4.907695285787238e-05, "loss": 0.0709, "mean_token_accuracy": 0.979823499917984, "num_tokens": 66635470.0, "step": 24480 }, { "entropy": 0.05908362194895744, "epoch": 5.7075416715234875, "grad_norm": 0.6640625, "learning_rate": 4.907637710178318e-05, "loss": 0.0199, "mean_token_accuracy": 0.9885694861412049, "num_tokens": 66669770.0, "step": 24485 }, { "entropy": 0.07350371684879065, "epoch": 5.70870730854412, "grad_norm": 0.2001953125, "learning_rate": 4.9075801173072776e-05, "loss": 0.0414, "mean_token_accuracy": 0.9812188148498535, "num_tokens": 66702048.0, "step": 24490 }, { "entropy": 0.04958838382735849, "epoch": 5.709872945564751, "grad_norm": 0.412109375, "learning_rate": 4.907522507174977e-05, "loss": 0.0265, "mean_token_accuracy": 0.9889841973781586, "num_tokens": 66732023.0, "step": 24495 }, { "entropy": 0.049441782478243115, "epoch": 5.711038582585383, "grad_norm": 0.890625, "learning_rate": 4.907464879782275e-05, "loss": 0.0297, "mean_token_accuracy": 0.9903727948665619, "num_tokens": 66763793.0, "step": 24500 }, { "entropy": 0.07775519993156195, "epoch": 5.712204219606015, "grad_norm": 1.0546875, "learning_rate": 4.90740723513003e-05, "loss": 0.0823, "mean_token_accuracy": 0.9773679137229919, "num_tokens": 66774721.0, "step": 24505 }, { "entropy": 0.06073737666010857, "epoch": 5.713369856626646, "grad_norm": 2.921875, "learning_rate": 4.9073495732191024e-05, "loss": 0.0572, "mean_token_accuracy": 0.9840463936328888, "num_tokens": 66792015.0, "step": 24510 }, { "entropy": 0.06353724235668778, "epoch": 5.714535493647278, "grad_norm": 0.41796875, "learning_rate": 4.9072918940503526e-05, "loss": 0.0465, "mean_token_accuracy": 0.9877960324287415, "num_tokens": 66814488.0, "step": 24515 }, { "entropy": 0.07438728669658304, "epoch": 5.7157011306679095, "grad_norm": 1.1171875, "learning_rate": 4.9072341976246393e-05, "loss": 0.0356, "mean_token_accuracy": 0.985117620229721, "num_tokens": 66840584.0, "step": 24520 }, { "entropy": 0.07487269369885326, "epoch": 5.716866767688542, "grad_norm": 0.2138671875, "learning_rate": 4.907176483942824e-05, "loss": 0.0563, "mean_token_accuracy": 0.9825186014175415, "num_tokens": 66862103.0, "step": 24525 }, { "entropy": 0.11075683934614063, "epoch": 5.718032404709174, "grad_norm": 1.9921875, "learning_rate": 4.907118753005767e-05, "loss": 0.0695, "mean_token_accuracy": 0.9796536266803741, "num_tokens": 66880019.0, "step": 24530 }, { "entropy": 0.07921370901167393, "epoch": 5.719198041729806, "grad_norm": 0.953125, "learning_rate": 4.9070610048143284e-05, "loss": 0.0577, "mean_token_accuracy": 0.9792716860771179, "num_tokens": 66890624.0, "step": 24535 }, { "entropy": 0.062124581448733805, "epoch": 5.720363678750437, "grad_norm": 1.4609375, "learning_rate": 4.907003239369371e-05, "loss": 0.0425, "mean_token_accuracy": 0.9866943776607513, "num_tokens": 66910862.0, "step": 24540 }, { "entropy": 0.14402158558368683, "epoch": 5.721529315771069, "grad_norm": 0.3046875, "learning_rate": 4.906945456671754e-05, "loss": 0.2548, "mean_token_accuracy": 0.9451413333415986, "num_tokens": 66931638.0, "step": 24545 }, { "entropy": 0.04773062439635396, "epoch": 5.7226949527917, "grad_norm": 0.3515625, "learning_rate": 4.90688765672234e-05, "loss": 0.0223, "mean_token_accuracy": 0.9901167631149292, "num_tokens": 66957123.0, "step": 24550 }, { "entropy": 0.07032151008024812, "epoch": 5.7238605898123325, "grad_norm": 0.7265625, "learning_rate": 4.9068298395219915e-05, "loss": 0.0258, "mean_token_accuracy": 0.9859899163246155, "num_tokens": 66983294.0, "step": 24555 }, { "entropy": 0.07757879067212343, "epoch": 5.725026226832965, "grad_norm": 1.28125, "learning_rate": 4.90677200507157e-05, "loss": 0.056, "mean_token_accuracy": 0.9782130300998688, "num_tokens": 67002262.0, "step": 24560 }, { "entropy": 0.080276258289814, "epoch": 5.726191863853596, "grad_norm": 1.2734375, "learning_rate": 4.906714153371937e-05, "loss": 0.069, "mean_token_accuracy": 0.9813676834106445, "num_tokens": 67021324.0, "step": 24565 }, { "entropy": 0.07185540087521076, "epoch": 5.727357500874228, "grad_norm": 1.03125, "learning_rate": 4.906656284423958e-05, "loss": 0.0714, "mean_token_accuracy": 0.9795701384544373, "num_tokens": 67031846.0, "step": 24570 }, { "entropy": 0.08011389188468457, "epoch": 5.728523137894859, "grad_norm": 1.15625, "learning_rate": 4.906598398228493e-05, "loss": 0.0604, "mean_token_accuracy": 0.982747620344162, "num_tokens": 67044837.0, "step": 24575 }, { "entropy": 0.06071140300482512, "epoch": 5.729688774915491, "grad_norm": 2.046875, "learning_rate": 4.9065404947864065e-05, "loss": 0.0501, "mean_token_accuracy": 0.9861461579799652, "num_tokens": 67063797.0, "step": 24580 }, { "entropy": 0.0752846309915185, "epoch": 5.730854411936123, "grad_norm": 0.88671875, "learning_rate": 4.9064825740985615e-05, "loss": 0.0759, "mean_token_accuracy": 0.980017501115799, "num_tokens": 67075399.0, "step": 24585 }, { "entropy": 0.05309299118816853, "epoch": 5.7320200489567545, "grad_norm": 1.234375, "learning_rate": 4.906424636165822e-05, "loss": 0.0353, "mean_token_accuracy": 0.9873521506786347, "num_tokens": 67104347.0, "step": 24590 }, { "entropy": 0.08549241051077842, "epoch": 5.733185685977387, "grad_norm": 1.8828125, "learning_rate": 4.906366680989052e-05, "loss": 0.0628, "mean_token_accuracy": 0.9774488568305969, "num_tokens": 67113934.0, "step": 24595 }, { "entropy": 0.058443698287010196, "epoch": 5.734351322998019, "grad_norm": 2.09375, "learning_rate": 4.906308708569115e-05, "loss": 0.0424, "mean_token_accuracy": 0.9856965959072113, "num_tokens": 67138145.0, "step": 24600 }, { "entropy": 0.05035996092483401, "epoch": 5.73551696001865, "grad_norm": 1.1875, "learning_rate": 4.906250718906877e-05, "loss": 0.0212, "mean_token_accuracy": 0.9902314424514771, "num_tokens": 67168380.0, "step": 24605 }, { "entropy": 0.08484232518821955, "epoch": 5.736682597039282, "grad_norm": 2.625, "learning_rate": 4.906192712003201e-05, "loss": 0.0933, "mean_token_accuracy": 0.9787089943885803, "num_tokens": 67178203.0, "step": 24610 }, { "entropy": 0.06751094851642847, "epoch": 5.737848234059914, "grad_norm": 0.76953125, "learning_rate": 4.906134687858953e-05, "loss": 0.0493, "mean_token_accuracy": 0.978972727060318, "num_tokens": 67192998.0, "step": 24615 }, { "entropy": 0.06727732773870229, "epoch": 5.739013871080545, "grad_norm": 1.1171875, "learning_rate": 4.9060766464749966e-05, "loss": 0.0449, "mean_token_accuracy": 0.9797036170959472, "num_tokens": 67211708.0, "step": 24620 }, { "entropy": 0.07089311145246029, "epoch": 5.7401795081011775, "grad_norm": 1.1171875, "learning_rate": 4.9060185878522e-05, "loss": 0.0451, "mean_token_accuracy": 0.987681120634079, "num_tokens": 67240395.0, "step": 24625 }, { "entropy": 0.07671239618211985, "epoch": 5.741345145121809, "grad_norm": 2.4375, "learning_rate": 4.905960511991427e-05, "loss": 0.0693, "mean_token_accuracy": 0.9838557839393616, "num_tokens": 67260047.0, "step": 24630 }, { "entropy": 0.09863593205809593, "epoch": 5.742510782142441, "grad_norm": 1.59375, "learning_rate": 4.905902418893544e-05, "loss": 0.0601, "mean_token_accuracy": 0.9812259912490845, "num_tokens": 67280789.0, "step": 24635 }, { "entropy": 0.07931637912988662, "epoch": 5.743676419163073, "grad_norm": 2.703125, "learning_rate": 4.905844308559417e-05, "loss": 0.0545, "mean_token_accuracy": 0.9830240905284882, "num_tokens": 67293564.0, "step": 24640 }, { "entropy": 0.07518193898722529, "epoch": 5.744842056183704, "grad_norm": 1.4453125, "learning_rate": 4.905786180989914e-05, "loss": 0.0534, "mean_token_accuracy": 0.9804893136024475, "num_tokens": 67311845.0, "step": 24645 }, { "entropy": 0.08153504896908999, "epoch": 5.746007693204336, "grad_norm": 1.9453125, "learning_rate": 4.9057280361859e-05, "loss": 0.0774, "mean_token_accuracy": 0.9775378167629242, "num_tokens": 67322321.0, "step": 24650 }, { "entropy": 0.11738669704645872, "epoch": 5.7471733302249675, "grad_norm": 2.25, "learning_rate": 4.9056698741482425e-05, "loss": 0.1217, "mean_token_accuracy": 0.9739999532699585, "num_tokens": 67344001.0, "step": 24655 }, { "entropy": 0.06384739736095071, "epoch": 5.7483389672456, "grad_norm": 0.396484375, "learning_rate": 4.9056116948778094e-05, "loss": 0.0589, "mean_token_accuracy": 0.9812793254852294, "num_tokens": 67360094.0, "step": 24660 }, { "entropy": 0.05202016243711114, "epoch": 5.749504604266232, "grad_norm": 0.2177734375, "learning_rate": 4.9055534983754674e-05, "loss": 0.0673, "mean_token_accuracy": 0.984355503320694, "num_tokens": 67388936.0, "step": 24665 }, { "entropy": 0.054006512835621835, "epoch": 5.750670241286863, "grad_norm": 0.44140625, "learning_rate": 4.9054952846420846e-05, "loss": 0.0334, "mean_token_accuracy": 0.9864718377590179, "num_tokens": 67415275.0, "step": 24670 }, { "entropy": 0.057743340730667114, "epoch": 5.751835878307495, "grad_norm": 1.640625, "learning_rate": 4.905437053678529e-05, "loss": 0.0464, "mean_token_accuracy": 0.9884009540081025, "num_tokens": 67431476.0, "step": 24675 }, { "entropy": 0.08277987511828541, "epoch": 5.753001515328127, "grad_norm": 1.40625, "learning_rate": 4.9053788054856695e-05, "loss": 0.0371, "mean_token_accuracy": 0.9903243362903595, "num_tokens": 67459871.0, "step": 24680 }, { "entropy": 0.06267971489578486, "epoch": 5.754167152348758, "grad_norm": 1.71875, "learning_rate": 4.9053205400643745e-05, "loss": 0.0359, "mean_token_accuracy": 0.9884569525718689, "num_tokens": 67489158.0, "step": 24685 }, { "entropy": 0.06875952733680606, "epoch": 5.75533278936939, "grad_norm": 0.890625, "learning_rate": 4.905262257415512e-05, "loss": 0.0641, "mean_token_accuracy": 0.9833064436912536, "num_tokens": 67508215.0, "step": 24690 }, { "entropy": 0.06311565637588501, "epoch": 5.7564984263900225, "grad_norm": 2.734375, "learning_rate": 4.905203957539952e-05, "loss": 0.0724, "mean_token_accuracy": 0.9840085566043854, "num_tokens": 67528107.0, "step": 24695 }, { "entropy": 0.05413348004221916, "epoch": 5.757664063410654, "grad_norm": 0.81640625, "learning_rate": 4.9051456404385634e-05, "loss": 0.0527, "mean_token_accuracy": 0.9849828660488129, "num_tokens": 67554365.0, "step": 24700 }, { "entropy": 0.06338634807616472, "epoch": 5.758829700431286, "grad_norm": 1.4453125, "learning_rate": 4.9050873061122156e-05, "loss": 0.0479, "mean_token_accuracy": 0.984088945388794, "num_tokens": 67574710.0, "step": 24705 }, { "entropy": 0.07404083982110024, "epoch": 5.759995337451917, "grad_norm": 0.85546875, "learning_rate": 4.905028954561779e-05, "loss": 0.0557, "mean_token_accuracy": 0.9842141985893249, "num_tokens": 67603496.0, "step": 24710 }, { "entropy": 0.07604032196104527, "epoch": 5.761160974472549, "grad_norm": 0.7734375, "learning_rate": 4.9049705857881236e-05, "loss": 0.0817, "mean_token_accuracy": 0.9802056968212127, "num_tokens": 67613424.0, "step": 24715 }, { "entropy": 0.05118763484060764, "epoch": 5.762326611493181, "grad_norm": 0.2216796875, "learning_rate": 4.904912199792119e-05, "loss": 0.0312, "mean_token_accuracy": 0.9855857014656066, "num_tokens": 67640651.0, "step": 24720 }, { "entropy": 0.08875577114522457, "epoch": 5.7634922485138125, "grad_norm": 2.765625, "learning_rate": 4.904853796574637e-05, "loss": 0.088, "mean_token_accuracy": 0.980398166179657, "num_tokens": 67649753.0, "step": 24725 }, { "entropy": 0.0819103766232729, "epoch": 5.764657885534445, "grad_norm": 1.3984375, "learning_rate": 4.904795376136547e-05, "loss": 0.0784, "mean_token_accuracy": 0.9783179640769959, "num_tokens": 67667271.0, "step": 24730 }, { "entropy": 0.07176627768203617, "epoch": 5.765823522555077, "grad_norm": 0.435546875, "learning_rate": 4.9047369384787216e-05, "loss": 0.0478, "mean_token_accuracy": 0.984830129146576, "num_tokens": 67695058.0, "step": 24735 }, { "entropy": 0.06707020187750459, "epoch": 5.766989159575708, "grad_norm": 2.609375, "learning_rate": 4.9046784836020315e-05, "loss": 0.0616, "mean_token_accuracy": 0.9823902189731598, "num_tokens": 67711561.0, "step": 24740 }, { "entropy": 0.07110977135598659, "epoch": 5.76815479659634, "grad_norm": 0.8203125, "learning_rate": 4.904620011507349e-05, "loss": 0.0887, "mean_token_accuracy": 0.9792767465114594, "num_tokens": 67721138.0, "step": 24745 }, { "entropy": 0.07955138608813286, "epoch": 5.769320433616972, "grad_norm": 0.5703125, "learning_rate": 4.904561522195545e-05, "loss": 0.0561, "mean_token_accuracy": 0.9796516954898834, "num_tokens": 67737594.0, "step": 24750 }, { "entropy": 0.07870314586907626, "epoch": 5.770486070637603, "grad_norm": 1.421875, "learning_rate": 4.904503015667492e-05, "loss": 0.0583, "mean_token_accuracy": 0.9815060377120972, "num_tokens": 67761885.0, "step": 24755 }, { "entropy": 0.07288762480020523, "epoch": 5.771651707658235, "grad_norm": 2.015625, "learning_rate": 4.904444491924063e-05, "loss": 0.064, "mean_token_accuracy": 0.9826235294342041, "num_tokens": 67779670.0, "step": 24760 }, { "entropy": 0.06011434905230999, "epoch": 5.772817344678867, "grad_norm": 1.1875, "learning_rate": 4.90438595096613e-05, "loss": 0.0502, "mean_token_accuracy": 0.9867224514484405, "num_tokens": 67792226.0, "step": 24765 }, { "entropy": 0.07408210225403308, "epoch": 5.773982981699499, "grad_norm": 4.8125, "learning_rate": 4.904327392794566e-05, "loss": 0.0899, "mean_token_accuracy": 0.9811750411987304, "num_tokens": 67802060.0, "step": 24770 }, { "entropy": 0.09298908114433288, "epoch": 5.775148618720131, "grad_norm": 0.54296875, "learning_rate": 4.904268817410245e-05, "loss": 0.0886, "mean_token_accuracy": 0.9772188723087311, "num_tokens": 67822059.0, "step": 24775 }, { "entropy": 0.06530466936528682, "epoch": 5.776314255740762, "grad_norm": 0.46484375, "learning_rate": 4.904210224814039e-05, "loss": 0.0591, "mean_token_accuracy": 0.983297997713089, "num_tokens": 67845758.0, "step": 24780 }, { "entropy": 0.07186365425586701, "epoch": 5.777479892761394, "grad_norm": 0.77734375, "learning_rate": 4.904151615006823e-05, "loss": 0.0581, "mean_token_accuracy": 0.983722984790802, "num_tokens": 67867822.0, "step": 24785 }, { "entropy": 0.07986092139035464, "epoch": 5.778645529782025, "grad_norm": 2.90625, "learning_rate": 4.90409298798947e-05, "loss": 0.064, "mean_token_accuracy": 0.9811047136783599, "num_tokens": 67879967.0, "step": 24790 }, { "entropy": 0.07390990536659955, "epoch": 5.7798111668026575, "grad_norm": 1.9375, "learning_rate": 4.9040343437628554e-05, "loss": 0.0457, "mean_token_accuracy": 0.9846989333629608, "num_tokens": 67901376.0, "step": 24795 }, { "entropy": 0.061554950382560494, "epoch": 5.78097680382329, "grad_norm": 0.326171875, "learning_rate": 4.903975682327853e-05, "loss": 0.023, "mean_token_accuracy": 0.986127781867981, "num_tokens": 67925039.0, "step": 24800 }, { "entropy": 0.06639809599146247, "epoch": 5.782142440843921, "grad_norm": 2.421875, "learning_rate": 4.903917003685337e-05, "loss": 0.0544, "mean_token_accuracy": 0.983638447523117, "num_tokens": 67951929.0, "step": 24805 }, { "entropy": 0.044928194023668766, "epoch": 5.783308077864553, "grad_norm": 1.4765625, "learning_rate": 4.903858307836183e-05, "loss": 0.0189, "mean_token_accuracy": 0.9886196672916412, "num_tokens": 67986379.0, "step": 24810 }, { "entropy": 0.06619902048259974, "epoch": 5.784473714885185, "grad_norm": 3.390625, "learning_rate": 4.9037995947812656e-05, "loss": 0.0578, "mean_token_accuracy": 0.9844651639461517, "num_tokens": 68004294.0, "step": 24815 }, { "entropy": 0.052559500001370905, "epoch": 5.785639351905816, "grad_norm": 0.640625, "learning_rate": 4.903740864521462e-05, "loss": 0.039, "mean_token_accuracy": 0.9900806844234467, "num_tokens": 68025024.0, "step": 24820 }, { "entropy": 0.07270161435008049, "epoch": 5.786804988926448, "grad_norm": 1.78125, "learning_rate": 4.9036821170576466e-05, "loss": 0.08, "mean_token_accuracy": 0.9809543669223786, "num_tokens": 68035351.0, "step": 24825 }, { "entropy": 0.09084619544446468, "epoch": 5.7879706259470804, "grad_norm": 1.296875, "learning_rate": 4.903623352390695e-05, "loss": 0.0839, "mean_token_accuracy": 0.9778399229049682, "num_tokens": 68043865.0, "step": 24830 }, { "entropy": 0.05886190002784133, "epoch": 5.789136262967712, "grad_norm": 1.3125, "learning_rate": 4.9035645705214836e-05, "loss": 0.0394, "mean_token_accuracy": 0.9871821939945221, "num_tokens": 68067274.0, "step": 24835 }, { "entropy": 0.05819834126159549, "epoch": 5.790301899988344, "grad_norm": 0.53515625, "learning_rate": 4.90350577145089e-05, "loss": 0.032, "mean_token_accuracy": 0.9864734590053559, "num_tokens": 68096912.0, "step": 24840 }, { "entropy": 0.0587355166207999, "epoch": 5.791467537008975, "grad_norm": 1.4453125, "learning_rate": 4.903446955179791e-05, "loss": 0.0359, "mean_token_accuracy": 0.9889220237731934, "num_tokens": 68122295.0, "step": 24845 }, { "entropy": 0.06773356515914201, "epoch": 5.792633174029607, "grad_norm": 1.59375, "learning_rate": 4.903388121709062e-05, "loss": 0.045, "mean_token_accuracy": 0.9855106472969055, "num_tokens": 68150932.0, "step": 24850 }, { "entropy": 0.1423711057752371, "epoch": 5.793798811050239, "grad_norm": 1.328125, "learning_rate": 4.9033292710395815e-05, "loss": 0.2357, "mean_token_accuracy": 0.9351215898990631, "num_tokens": 68191463.0, "step": 24855 }, { "entropy": 0.07398759815841913, "epoch": 5.79496444807087, "grad_norm": 1.6328125, "learning_rate": 4.903270403172228e-05, "loss": 0.0748, "mean_token_accuracy": 0.9809467673301697, "num_tokens": 68206785.0, "step": 24860 }, { "entropy": 0.04514298690482974, "epoch": 5.7961300850915025, "grad_norm": 1.3515625, "learning_rate": 4.9032115181078767e-05, "loss": 0.0361, "mean_token_accuracy": 0.9884274780750275, "num_tokens": 68227908.0, "step": 24865 }, { "entropy": 0.07980956807732582, "epoch": 5.797295722112135, "grad_norm": 3.546875, "learning_rate": 4.9031526158474075e-05, "loss": 0.0986, "mean_token_accuracy": 0.975833123922348, "num_tokens": 68237536.0, "step": 24870 }, { "entropy": 0.0604316022247076, "epoch": 5.798461359132766, "grad_norm": 3.765625, "learning_rate": 4.903093696391699e-05, "loss": 0.0573, "mean_token_accuracy": 0.9862626194953918, "num_tokens": 68251155.0, "step": 24875 }, { "entropy": 0.0642173401080072, "epoch": 5.799626996153398, "grad_norm": 0.921875, "learning_rate": 4.903034759741629e-05, "loss": 0.0497, "mean_token_accuracy": 0.9833175539970398, "num_tokens": 68272527.0, "step": 24880 }, { "entropy": 0.07729770168662072, "epoch": 5.80079263317403, "grad_norm": 4.5625, "learning_rate": 4.9029758058980755e-05, "loss": 0.0728, "mean_token_accuracy": 0.9799280226230621, "num_tokens": 68287262.0, "step": 24885 }, { "entropy": 0.09029734618961811, "epoch": 5.801958270194661, "grad_norm": 1.3046875, "learning_rate": 4.90291683486192e-05, "loss": 0.0831, "mean_token_accuracy": 0.9771609783172608, "num_tokens": 68295984.0, "step": 24890 }, { "entropy": 0.052832887321710584, "epoch": 5.803123907215293, "grad_norm": 0.87109375, "learning_rate": 4.902857846634039e-05, "loss": 0.0481, "mean_token_accuracy": 0.9858577847480774, "num_tokens": 68310873.0, "step": 24895 }, { "entropy": 0.07836331203579902, "epoch": 5.804289544235925, "grad_norm": 1.6953125, "learning_rate": 4.9027988412153147e-05, "loss": 0.0851, "mean_token_accuracy": 0.9786667168140412, "num_tokens": 68320263.0, "step": 24900 }, { "entropy": 0.08196291290223598, "epoch": 5.805455181256557, "grad_norm": 0.8515625, "learning_rate": 4.9027398186066256e-05, "loss": 0.0584, "mean_token_accuracy": 0.9852247297763824, "num_tokens": 68350270.0, "step": 24905 }, { "entropy": 0.13181650806218387, "epoch": 5.806620818277189, "grad_norm": 3.265625, "learning_rate": 4.9026807788088516e-05, "loss": 0.1793, "mean_token_accuracy": 0.9595660865306854, "num_tokens": 68374624.0, "step": 24910 }, { "entropy": 0.0995724380016327, "epoch": 5.80778645529782, "grad_norm": 1.5703125, "learning_rate": 4.902621721822873e-05, "loss": 0.0736, "mean_token_accuracy": 0.9790677905082703, "num_tokens": 68385069.0, "step": 24915 }, { "entropy": 0.0691030714660883, "epoch": 5.808952092318452, "grad_norm": 0.50390625, "learning_rate": 4.902562647649571e-05, "loss": 0.066, "mean_token_accuracy": 0.9839153230190277, "num_tokens": 68400909.0, "step": 24920 }, { "entropy": 0.06300949761644006, "epoch": 5.810117729339083, "grad_norm": 1.640625, "learning_rate": 4.902503556289827e-05, "loss": 0.0507, "mean_token_accuracy": 0.9794705331325531, "num_tokens": 68433503.0, "step": 24925 }, { "entropy": 0.06272582067176699, "epoch": 5.811283366359715, "grad_norm": 2.078125, "learning_rate": 4.9024444477445216e-05, "loss": 0.043, "mean_token_accuracy": 0.9844630897045136, "num_tokens": 68451031.0, "step": 24930 }, { "entropy": 0.056623499002307655, "epoch": 5.8124490033803475, "grad_norm": 1.515625, "learning_rate": 4.9023853220145355e-05, "loss": 0.0447, "mean_token_accuracy": 0.9855444729328156, "num_tokens": 68474316.0, "step": 24935 }, { "entropy": 0.05010606348514557, "epoch": 5.813614640400979, "grad_norm": 1.2734375, "learning_rate": 4.9023261791007514e-05, "loss": 0.0405, "mean_token_accuracy": 0.9896503448486328, "num_tokens": 68495309.0, "step": 24940 }, { "entropy": 0.07280549835413694, "epoch": 5.814780277421611, "grad_norm": 0.91015625, "learning_rate": 4.902267019004051e-05, "loss": 0.0646, "mean_token_accuracy": 0.9830634236335755, "num_tokens": 68507603.0, "step": 24945 }, { "entropy": 0.05676093138754368, "epoch": 5.815945914442243, "grad_norm": 1.6953125, "learning_rate": 4.902207841725315e-05, "loss": 0.057, "mean_token_accuracy": 0.9855646133422852, "num_tokens": 68523419.0, "step": 24950 }, { "entropy": 0.06060009114444256, "epoch": 5.817111551462874, "grad_norm": 0.8125, "learning_rate": 4.9021486472654285e-05, "loss": 0.0292, "mean_token_accuracy": 0.9882301330566406, "num_tokens": 68545958.0, "step": 24955 }, { "entropy": 0.07530084438621998, "epoch": 5.818277188483506, "grad_norm": 0.79296875, "learning_rate": 4.902089435625272e-05, "loss": 0.0792, "mean_token_accuracy": 0.9814654409885406, "num_tokens": 68557288.0, "step": 24960 }, { "entropy": 0.05552833992987871, "epoch": 5.819442825504138, "grad_norm": 0.46484375, "learning_rate": 4.9020302068057296e-05, "loss": 0.0355, "mean_token_accuracy": 0.9849902153015136, "num_tokens": 68591863.0, "step": 24965 }, { "entropy": 0.06605686107650399, "epoch": 5.82060846252477, "grad_norm": 0.5859375, "learning_rate": 4.9019709608076834e-05, "loss": 0.0331, "mean_token_accuracy": 0.9879513800144195, "num_tokens": 68623535.0, "step": 24970 }, { "entropy": 0.06880107838660479, "epoch": 5.821774099545402, "grad_norm": 0.58203125, "learning_rate": 4.901911697632018e-05, "loss": 0.08, "mean_token_accuracy": 0.9790106236934661, "num_tokens": 68636782.0, "step": 24975 }, { "entropy": 0.07890882007777691, "epoch": 5.822939736566033, "grad_norm": 1.6328125, "learning_rate": 4.901852417279617e-05, "loss": 0.063, "mean_token_accuracy": 0.9829611778259277, "num_tokens": 68646651.0, "step": 24980 }, { "entropy": 0.047220236714929344, "epoch": 5.824105373586665, "grad_norm": 0.1455078125, "learning_rate": 4.9017931197513625e-05, "loss": 0.046, "mean_token_accuracy": 0.9886944651603699, "num_tokens": 68672593.0, "step": 24985 }, { "entropy": 0.05217403545975685, "epoch": 5.825271010607297, "grad_norm": 0.51953125, "learning_rate": 4.9017338050481415e-05, "loss": 0.0391, "mean_token_accuracy": 0.9889337956905365, "num_tokens": 68695953.0, "step": 24990 }, { "entropy": 0.05944271394982934, "epoch": 5.826436647627928, "grad_norm": 0.8359375, "learning_rate": 4.901674473170837e-05, "loss": 0.0412, "mean_token_accuracy": 0.9885189294815063, "num_tokens": 68725653.0, "step": 24995 }, { "entropy": 0.06879550032317638, "epoch": 5.8276022846485604, "grad_norm": 1.0703125, "learning_rate": 4.901615124120333e-05, "loss": 0.0664, "mean_token_accuracy": 0.9819593906402588, "num_tokens": 68738463.0, "step": 25000 }, { "entropy": 0.0660898657515645, "epoch": 5.8287679216691926, "grad_norm": 2.234375, "learning_rate": 4.901555757897517e-05, "loss": 0.0783, "mean_token_accuracy": 0.9784412801265716, "num_tokens": 68758793.0, "step": 25005 }, { "entropy": 0.059831819776445624, "epoch": 5.829933558689824, "grad_norm": 3.34375, "learning_rate": 4.9014963745032714e-05, "loss": 0.0446, "mean_token_accuracy": 0.9873811423778533, "num_tokens": 68789321.0, "step": 25010 }, { "entropy": 0.04351752800866961, "epoch": 5.831099195710456, "grad_norm": 0.2353515625, "learning_rate": 4.9014369739384836e-05, "loss": 0.0233, "mean_token_accuracy": 0.9915945589542389, "num_tokens": 68827648.0, "step": 25015 }, { "entropy": 0.04593317694962025, "epoch": 5.832264832731088, "grad_norm": 0.7109375, "learning_rate": 4.9013775562040384e-05, "loss": 0.045, "mean_token_accuracy": 0.9876484513282776, "num_tokens": 68848301.0, "step": 25020 }, { "entropy": 0.08194901645183564, "epoch": 5.833430469751719, "grad_norm": 4.8125, "learning_rate": 4.901318121300822e-05, "loss": 0.0842, "mean_token_accuracy": 0.9750150680541992, "num_tokens": 68857732.0, "step": 25025 }, { "entropy": 0.05717827407643199, "epoch": 5.834596106772351, "grad_norm": 0.388671875, "learning_rate": 4.90125866922972e-05, "loss": 0.053, "mean_token_accuracy": 0.9848073601722718, "num_tokens": 68877921.0, "step": 25030 }, { "entropy": 0.06308312909677624, "epoch": 5.8357617437929825, "grad_norm": 0.333984375, "learning_rate": 4.901199199991621e-05, "loss": 0.05, "mean_token_accuracy": 0.9814503490924835, "num_tokens": 68894260.0, "step": 25035 }, { "entropy": 0.0596246593631804, "epoch": 5.836927380813615, "grad_norm": 2.375, "learning_rate": 4.9011397135874095e-05, "loss": 0.0486, "mean_token_accuracy": 0.985901540517807, "num_tokens": 68914302.0, "step": 25040 }, { "entropy": 0.0679067311808467, "epoch": 5.838093017834247, "grad_norm": 1.0703125, "learning_rate": 4.901080210017974e-05, "loss": 0.06, "mean_token_accuracy": 0.9837885200977325, "num_tokens": 68929950.0, "step": 25045 }, { "entropy": 0.052906651981174946, "epoch": 5.839258654854878, "grad_norm": 0.294921875, "learning_rate": 4.9010206892842004e-05, "loss": 0.0358, "mean_token_accuracy": 0.9889191925525666, "num_tokens": 68958878.0, "step": 25050 }, { "entropy": 0.07227160930633544, "epoch": 5.84042429187551, "grad_norm": 1.7421875, "learning_rate": 4.900961151386978e-05, "loss": 0.0636, "mean_token_accuracy": 0.9838977217674255, "num_tokens": 68970844.0, "step": 25055 }, { "entropy": 0.04979464411735535, "epoch": 5.841589928896141, "grad_norm": 0.203125, "learning_rate": 4.9009015963271935e-05, "loss": 0.0324, "mean_token_accuracy": 0.9865056037902832, "num_tokens": 68996822.0, "step": 25060 }, { "entropy": 0.0801444560289383, "epoch": 5.842755565916773, "grad_norm": 0.953125, "learning_rate": 4.900842024105735e-05, "loss": 0.0591, "mean_token_accuracy": 0.9850879967212677, "num_tokens": 69008715.0, "step": 25065 }, { "entropy": 0.06933778412640094, "epoch": 5.8439212029374055, "grad_norm": 1.4140625, "learning_rate": 4.90078243472349e-05, "loss": 0.0671, "mean_token_accuracy": 0.9800980925559998, "num_tokens": 69027853.0, "step": 25070 }, { "entropy": 0.0758416060358286, "epoch": 5.845086839958037, "grad_norm": 0.51953125, "learning_rate": 4.9007228281813497e-05, "loss": 0.0634, "mean_token_accuracy": 0.9792212009429931, "num_tokens": 69052064.0, "step": 25075 }, { "entropy": 0.04220882719382644, "epoch": 5.846252476978669, "grad_norm": 0.1767578125, "learning_rate": 4.9006632044802005e-05, "loss": 0.019, "mean_token_accuracy": 0.9897603571414948, "num_tokens": 69093950.0, "step": 25080 }, { "entropy": 0.0862182735465467, "epoch": 5.847418113999301, "grad_norm": 1.0859375, "learning_rate": 4.900603563620933e-05, "loss": 0.0696, "mean_token_accuracy": 0.977711945772171, "num_tokens": 69111718.0, "step": 25085 }, { "entropy": 0.05932206539437175, "epoch": 5.848583751019932, "grad_norm": 1.8984375, "learning_rate": 4.9005439056044345e-05, "loss": 0.0557, "mean_token_accuracy": 0.984957355260849, "num_tokens": 69130818.0, "step": 25090 }, { "entropy": 0.0707325934432447, "epoch": 5.849749388040564, "grad_norm": 3.09375, "learning_rate": 4.900484230431596e-05, "loss": 0.0727, "mean_token_accuracy": 0.9831813514232636, "num_tokens": 69150168.0, "step": 25095 }, { "entropy": 0.07086216192692518, "epoch": 5.850915025061196, "grad_norm": 2.515625, "learning_rate": 4.900424538103307e-05, "loss": 0.0651, "mean_token_accuracy": 0.9826680719852448, "num_tokens": 69169786.0, "step": 25100 }, { "entropy": 0.1979743585921824, "epoch": 5.8520806620818275, "grad_norm": 0.5546875, "learning_rate": 4.900364828620459e-05, "loss": 0.2013, "mean_token_accuracy": 0.9713845193386078, "num_tokens": 69203927.0, "step": 25105 }, { "entropy": 0.08615749217569828, "epoch": 5.85324629910246, "grad_norm": 4.3125, "learning_rate": 4.900305101983941e-05, "loss": 0.0884, "mean_token_accuracy": 0.9804681181907654, "num_tokens": 69212942.0, "step": 25110 }, { "entropy": 0.06537931114435196, "epoch": 5.854411936123091, "grad_norm": 2.359375, "learning_rate": 4.9002453581946426e-05, "loss": 0.0269, "mean_token_accuracy": 0.9828318059444427, "num_tokens": 69259238.0, "step": 25115 }, { "entropy": 0.06119229989126325, "epoch": 5.855577573143723, "grad_norm": 0.31640625, "learning_rate": 4.9001855972534566e-05, "loss": 0.0665, "mean_token_accuracy": 0.982484656572342, "num_tokens": 69278421.0, "step": 25120 }, { "entropy": 0.08115847948938608, "epoch": 5.856743210164355, "grad_norm": 0.51953125, "learning_rate": 4.900125819161273e-05, "loss": 0.0287, "mean_token_accuracy": 0.9896356463432312, "num_tokens": 69306027.0, "step": 25125 }, { "entropy": 0.05680079516023397, "epoch": 5.857908847184986, "grad_norm": 3.90625, "learning_rate": 4.900066023918984e-05, "loss": 0.0574, "mean_token_accuracy": 0.9836540341377258, "num_tokens": 69333581.0, "step": 25130 }, { "entropy": 0.06749532804824412, "epoch": 5.859074484205618, "grad_norm": 2.203125, "learning_rate": 4.900006211527481e-05, "loss": 0.0542, "mean_token_accuracy": 0.9833632111549377, "num_tokens": 69353480.0, "step": 25135 }, { "entropy": 0.06454726718366147, "epoch": 5.8602401212262505, "grad_norm": 0.453125, "learning_rate": 4.899946381987655e-05, "loss": 0.0358, "mean_token_accuracy": 0.9881761074066162, "num_tokens": 69387451.0, "step": 25140 }, { "entropy": 0.0817156407982111, "epoch": 5.861405758246882, "grad_norm": 1.96875, "learning_rate": 4.899886535300399e-05, "loss": 0.0876, "mean_token_accuracy": 0.9789597153663635, "num_tokens": 69406319.0, "step": 25145 }, { "entropy": 0.059768567234277724, "epoch": 5.862571395267514, "grad_norm": 0.859375, "learning_rate": 4.899826671466606e-05, "loss": 0.0551, "mean_token_accuracy": 0.9846877813339233, "num_tokens": 69422523.0, "step": 25150 }, { "entropy": 0.057445686869323255, "epoch": 5.863737032288146, "grad_norm": 0.265625, "learning_rate": 4.899766790487167e-05, "loss": 0.0507, "mean_token_accuracy": 0.9862017810344696, "num_tokens": 69456953.0, "step": 25155 }, { "entropy": 0.07467758394777775, "epoch": 5.864902669308777, "grad_norm": 0.4296875, "learning_rate": 4.899706892362976e-05, "loss": 0.0775, "mean_token_accuracy": 0.9791662037372589, "num_tokens": 69468183.0, "step": 25160 }, { "entropy": 0.07420080313459039, "epoch": 5.866068306329409, "grad_norm": 1.9453125, "learning_rate": 4.899646977094926e-05, "loss": 0.0563, "mean_token_accuracy": 0.9782842993736267, "num_tokens": 69483658.0, "step": 25165 }, { "entropy": 0.0779596921056509, "epoch": 5.8672339433500404, "grad_norm": 1.359375, "learning_rate": 4.8995870446839107e-05, "loss": 0.0725, "mean_token_accuracy": 0.9799931585788727, "num_tokens": 69492891.0, "step": 25170 }, { "entropy": 0.058550332672894, "epoch": 5.8683995803706726, "grad_norm": 0.50390625, "learning_rate": 4.899527095130823e-05, "loss": 0.0458, "mean_token_accuracy": 0.9878552556037903, "num_tokens": 69511105.0, "step": 25175 }, { "entropy": 0.11132394783198833, "epoch": 5.869565217391305, "grad_norm": 4.1875, "learning_rate": 4.899467128436558e-05, "loss": 0.0796, "mean_token_accuracy": 0.9794100940227508, "num_tokens": 69525774.0, "step": 25180 }, { "entropy": 0.057683018036186695, "epoch": 5.870730854411936, "grad_norm": 1.71875, "learning_rate": 4.8994071446020086e-05, "loss": 0.0572, "mean_token_accuracy": 0.9849148869514466, "num_tokens": 69539887.0, "step": 25185 }, { "entropy": 0.04408614346757531, "epoch": 5.871896491432568, "grad_norm": 0.578125, "learning_rate": 4.8993471436280696e-05, "loss": 0.0371, "mean_token_accuracy": 0.9894815742969513, "num_tokens": 69574395.0, "step": 25190 }, { "entropy": 0.08741942159831524, "epoch": 5.873062128453199, "grad_norm": 1.7109375, "learning_rate": 4.899287125515637e-05, "loss": 0.0734, "mean_token_accuracy": 0.980907928943634, "num_tokens": 69586689.0, "step": 25195 }, { "entropy": 0.05762271042913199, "epoch": 5.874227765473831, "grad_norm": 0.546875, "learning_rate": 4.899227090265604e-05, "loss": 0.0275, "mean_token_accuracy": 0.9872725248336792, "num_tokens": 69611654.0, "step": 25200 }, { "entropy": 0.08529213555157185, "epoch": 5.875393402494463, "grad_norm": 2.90625, "learning_rate": 4.899167037878867e-05, "loss": 0.0796, "mean_token_accuracy": 0.978586596250534, "num_tokens": 69620488.0, "step": 25205 }, { "entropy": 0.07133572660386563, "epoch": 5.876559039515095, "grad_norm": 1.4296875, "learning_rate": 4.89910696835632e-05, "loss": 0.0635, "mean_token_accuracy": 0.9817762792110443, "num_tokens": 69640920.0, "step": 25210 }, { "entropy": 0.05597841432318092, "epoch": 5.877724676535727, "grad_norm": 0.52734375, "learning_rate": 4.899046881698861e-05, "loss": 0.0475, "mean_token_accuracy": 0.9782732903957367, "num_tokens": 69663848.0, "step": 25215 }, { "entropy": 0.06624952824786305, "epoch": 5.878890313556359, "grad_norm": 2.65625, "learning_rate": 4.898986777907384e-05, "loss": 0.0535, "mean_token_accuracy": 0.9852926313877106, "num_tokens": 69692834.0, "step": 25220 }, { "entropy": 0.06026315214112401, "epoch": 5.88005595057699, "grad_norm": 0.6171875, "learning_rate": 4.8989266569827865e-05, "loss": 0.0363, "mean_token_accuracy": 0.9877964973449707, "num_tokens": 69715431.0, "step": 25225 }, { "entropy": 0.05216656900011003, "epoch": 5.881221587597622, "grad_norm": 0.5625, "learning_rate": 4.898866518925964e-05, "loss": 0.04, "mean_token_accuracy": 0.9874097645282746, "num_tokens": 69741887.0, "step": 25230 }, { "entropy": 0.08897986803203821, "epoch": 5.882387224618254, "grad_norm": 1.359375, "learning_rate": 4.8988063637378135e-05, "loss": 0.0874, "mean_token_accuracy": 0.9695273995399475, "num_tokens": 69751481.0, "step": 25235 }, { "entropy": 0.049010620545595886, "epoch": 5.8835528616388855, "grad_norm": 0.185546875, "learning_rate": 4.898746191419233e-05, "loss": 0.0305, "mean_token_accuracy": 0.9865678668022155, "num_tokens": 69786531.0, "step": 25240 }, { "entropy": 0.057374946866184474, "epoch": 5.884718498659518, "grad_norm": 1.5390625, "learning_rate": 4.898686001971118e-05, "loss": 0.0372, "mean_token_accuracy": 0.9866810023784638, "num_tokens": 69812725.0, "step": 25245 }, { "entropy": 0.06412132494151593, "epoch": 5.885884135680149, "grad_norm": 1.4140625, "learning_rate": 4.8986257953943675e-05, "loss": 0.0747, "mean_token_accuracy": 0.9813116431236267, "num_tokens": 69830628.0, "step": 25250 }, { "entropy": 0.06648487970232964, "epoch": 5.887049772700781, "grad_norm": 0.8828125, "learning_rate": 4.8985655716898794e-05, "loss": 0.0621, "mean_token_accuracy": 0.984495198726654, "num_tokens": 69843119.0, "step": 25255 }, { "entropy": 0.05889953942969441, "epoch": 5.888215409721413, "grad_norm": 1.1640625, "learning_rate": 4.89850533085855e-05, "loss": 0.0558, "mean_token_accuracy": 0.9845393478870392, "num_tokens": 69859043.0, "step": 25260 }, { "entropy": 0.06169169787317515, "epoch": 5.889381046742044, "grad_norm": 1.15625, "learning_rate": 4.898445072901279e-05, "loss": 0.0377, "mean_token_accuracy": 0.9878328561782836, "num_tokens": 69884035.0, "step": 25265 }, { "entropy": 0.06493881866335868, "epoch": 5.890546683762676, "grad_norm": 0.66796875, "learning_rate": 4.898384797818965e-05, "loss": 0.0598, "mean_token_accuracy": 0.9817538022994995, "num_tokens": 69903093.0, "step": 25270 }, { "entropy": 0.07026399262249469, "epoch": 5.891712320783308, "grad_norm": 0.6953125, "learning_rate": 4.898324505612506e-05, "loss": 0.0477, "mean_token_accuracy": 0.9830464065074921, "num_tokens": 69924802.0, "step": 25275 }, { "entropy": 0.07336622122675181, "epoch": 5.89287795780394, "grad_norm": 1.25, "learning_rate": 4.8982641962828016e-05, "loss": 0.0701, "mean_token_accuracy": 0.9820722460746765, "num_tokens": 69940112.0, "step": 25280 }, { "entropy": 0.07239239010959864, "epoch": 5.894043594824572, "grad_norm": 0.4296875, "learning_rate": 4.898203869830751e-05, "loss": 0.0312, "mean_token_accuracy": 0.986901706457138, "num_tokens": 69976722.0, "step": 25285 }, { "entropy": 0.07194672748446465, "epoch": 5.895209231845204, "grad_norm": 3.609375, "learning_rate": 4.898143526257254e-05, "loss": 0.0517, "mean_token_accuracy": 0.985188215970993, "num_tokens": 69992244.0, "step": 25290 }, { "entropy": 0.0679439775645733, "epoch": 5.896374868865835, "grad_norm": 2.234375, "learning_rate": 4.89808316556321e-05, "loss": 0.0652, "mean_token_accuracy": 0.983660078048706, "num_tokens": 70002583.0, "step": 25295 }, { "entropy": 0.045691716391593216, "epoch": 5.897540505886467, "grad_norm": 1.328125, "learning_rate": 4.898022787749518e-05, "loss": 0.0417, "mean_token_accuracy": 0.9899048924446106, "num_tokens": 70036696.0, "step": 25300 }, { "entropy": 0.0655904158949852, "epoch": 5.898706142907098, "grad_norm": 3.34375, "learning_rate": 4.8979623928170807e-05, "loss": 0.0667, "mean_token_accuracy": 0.9835781037807465, "num_tokens": 70057959.0, "step": 25305 }, { "entropy": 0.0502521482296288, "epoch": 5.8998717799277305, "grad_norm": 0.427734375, "learning_rate": 4.897901980766798e-05, "loss": 0.0388, "mean_token_accuracy": 0.98756023645401, "num_tokens": 70097961.0, "step": 25310 }, { "entropy": 0.06395963728427886, "epoch": 5.901037416948363, "grad_norm": 2.078125, "learning_rate": 4.897841551599569e-05, "loss": 0.069, "mean_token_accuracy": 0.9781058013439179, "num_tokens": 70118450.0, "step": 25315 }, { "entropy": 0.07023835629224777, "epoch": 5.902203053968994, "grad_norm": 0.83984375, "learning_rate": 4.8977811053162966e-05, "loss": 0.0714, "mean_token_accuracy": 0.9810861945152283, "num_tokens": 70137787.0, "step": 25320 }, { "entropy": 0.06961941700428724, "epoch": 5.903368690989626, "grad_norm": 1.9296875, "learning_rate": 4.897720641917881e-05, "loss": 0.0599, "mean_token_accuracy": 0.981745857000351, "num_tokens": 70165019.0, "step": 25325 }, { "entropy": 0.05609036097303033, "epoch": 5.904534328010257, "grad_norm": 0.310546875, "learning_rate": 4.897660161405225e-05, "loss": 0.0447, "mean_token_accuracy": 0.9861968576908111, "num_tokens": 70193670.0, "step": 25330 }, { "entropy": 0.07962943669408559, "epoch": 5.905699965030889, "grad_norm": 3.015625, "learning_rate": 4.89759966377923e-05, "loss": 0.075, "mean_token_accuracy": 0.978878664970398, "num_tokens": 70207721.0, "step": 25335 }, { "entropy": 0.06705031860619784, "epoch": 5.906865602051521, "grad_norm": 1.5, "learning_rate": 4.8975391490407974e-05, "loss": 0.0494, "mean_token_accuracy": 0.9814827382564545, "num_tokens": 70227807.0, "step": 25340 }, { "entropy": 0.06056810254231095, "epoch": 5.9080312390721526, "grad_norm": 0.34765625, "learning_rate": 4.89747861719083e-05, "loss": 0.0579, "mean_token_accuracy": 0.9835603713989258, "num_tokens": 70252966.0, "step": 25345 }, { "entropy": 0.07554044676944613, "epoch": 5.909196876092785, "grad_norm": 1.140625, "learning_rate": 4.8974180682302306e-05, "loss": 0.0442, "mean_token_accuracy": 0.9850922226905823, "num_tokens": 70274362.0, "step": 25350 }, { "entropy": 0.0693486931733787, "epoch": 5.910362513113417, "grad_norm": 2.265625, "learning_rate": 4.897357502159902e-05, "loss": 0.0732, "mean_token_accuracy": 0.9794533431529999, "num_tokens": 70309391.0, "step": 25355 }, { "entropy": 0.069387202616781, "epoch": 5.911528150134048, "grad_norm": 0.5, "learning_rate": 4.8972969189807475e-05, "loss": 0.0533, "mean_token_accuracy": 0.9858072459697723, "num_tokens": 70326010.0, "step": 25360 }, { "entropy": 0.03837292743846774, "epoch": 5.91269378715468, "grad_norm": 0.490234375, "learning_rate": 4.8972363186936706e-05, "loss": 0.0166, "mean_token_accuracy": 0.9922250151634217, "num_tokens": 70359986.0, "step": 25365 }, { "entropy": 0.08435314483940601, "epoch": 5.913859424175312, "grad_norm": 1.65625, "learning_rate": 4.897175701299574e-05, "loss": 0.0892, "mean_token_accuracy": 0.977598226070404, "num_tokens": 70380482.0, "step": 25370 }, { "entropy": 0.06352117350324989, "epoch": 5.915025061195943, "grad_norm": 3.328125, "learning_rate": 4.897115066799363e-05, "loss": 0.0618, "mean_token_accuracy": 0.9802210390567779, "num_tokens": 70406011.0, "step": 25375 }, { "entropy": 0.08407215159386397, "epoch": 5.9161906982165755, "grad_norm": 1.0234375, "learning_rate": 4.89705441519394e-05, "loss": 0.0715, "mean_token_accuracy": 0.9791640937328339, "num_tokens": 70416918.0, "step": 25380 }, { "entropy": 0.07379502542316914, "epoch": 5.917356335237207, "grad_norm": 4.375, "learning_rate": 4.8969937464842115e-05, "loss": 0.0648, "mean_token_accuracy": 0.9773381769657135, "num_tokens": 70435729.0, "step": 25385 }, { "entropy": 0.05902850423008203, "epoch": 5.918521972257839, "grad_norm": 0.7890625, "learning_rate": 4.89693306067108e-05, "loss": 0.0354, "mean_token_accuracy": 0.9863932430744171, "num_tokens": 70462047.0, "step": 25390 }, { "entropy": 0.06693649515509606, "epoch": 5.919687609278471, "grad_norm": 1.4453125, "learning_rate": 4.896872357755452e-05, "loss": 0.0455, "mean_token_accuracy": 0.9824758410453797, "num_tokens": 70481681.0, "step": 25395 }, { "entropy": 0.04593549780547619, "epoch": 5.920853246299102, "grad_norm": 0.427734375, "learning_rate": 4.896811637738232e-05, "loss": 0.027, "mean_token_accuracy": 0.9869483411312103, "num_tokens": 70504472.0, "step": 25400 }, { "entropy": 0.07661759108304977, "epoch": 5.922018883319734, "grad_norm": 1.203125, "learning_rate": 4.896750900620326e-05, "loss": 0.0319, "mean_token_accuracy": 0.9825215280056, "num_tokens": 70530182.0, "step": 25405 }, { "entropy": 0.09024886526167393, "epoch": 5.923184520340366, "grad_norm": 1.5625, "learning_rate": 4.89669014640264e-05, "loss": 0.0898, "mean_token_accuracy": 0.9773705959320068, "num_tokens": 70541455.0, "step": 25410 }, { "entropy": 0.08583233803510666, "epoch": 5.924350157360998, "grad_norm": 3.1875, "learning_rate": 4.896629375086077e-05, "loss": 0.0797, "mean_token_accuracy": 0.9771179258823395, "num_tokens": 70550552.0, "step": 25415 }, { "entropy": 0.06757635474205018, "epoch": 5.92551579438163, "grad_norm": 1.7265625, "learning_rate": 4.896568586671547e-05, "loss": 0.0633, "mean_token_accuracy": 0.9803970336914063, "num_tokens": 70573952.0, "step": 25420 }, { "entropy": 0.059069217182695866, "epoch": 5.926681431402262, "grad_norm": 0.94921875, "learning_rate": 4.896507781159954e-05, "loss": 0.0554, "mean_token_accuracy": 0.9838501870632171, "num_tokens": 70591204.0, "step": 25425 }, { "entropy": 0.07336567882448435, "epoch": 5.927847068422893, "grad_norm": 1.4765625, "learning_rate": 4.8964469585522066e-05, "loss": 0.0657, "mean_token_accuracy": 0.9818897664546966, "num_tokens": 70603793.0, "step": 25430 }, { "entropy": 0.08957900255918502, "epoch": 5.929012705443525, "grad_norm": 2.46875, "learning_rate": 4.89638611884921e-05, "loss": 0.0868, "mean_token_accuracy": 0.9778998076915741, "num_tokens": 70612772.0, "step": 25435 }, { "entropy": 0.05743040251545608, "epoch": 5.930178342464156, "grad_norm": 0.78125, "learning_rate": 4.896325262051872e-05, "loss": 0.0192, "mean_token_accuracy": 0.9897181570529938, "num_tokens": 70655512.0, "step": 25440 }, { "entropy": 0.06604121178388596, "epoch": 5.931343979484788, "grad_norm": 1.40625, "learning_rate": 4.8962643881611e-05, "loss": 0.0513, "mean_token_accuracy": 0.9855192124843597, "num_tokens": 70673994.0, "step": 25445 }, { "entropy": 0.08524401690810919, "epoch": 5.9325096165054205, "grad_norm": 4.5, "learning_rate": 4.8962034971778027e-05, "loss": 0.0937, "mean_token_accuracy": 0.9765569984912872, "num_tokens": 70699151.0, "step": 25450 }, { "entropy": 0.0642532754689455, "epoch": 5.933675253526052, "grad_norm": 1.8125, "learning_rate": 4.8961425891028865e-05, "loss": 0.0416, "mean_token_accuracy": 0.9848453521728515, "num_tokens": 70728641.0, "step": 25455 }, { "entropy": 0.08150929920375347, "epoch": 5.934840890546684, "grad_norm": 1.0859375, "learning_rate": 4.8960816639372606e-05, "loss": 0.0724, "mean_token_accuracy": 0.9757475137710572, "num_tokens": 70755529.0, "step": 25460 }, { "entropy": 0.06778738740831614, "epoch": 5.936006527567315, "grad_norm": 1.328125, "learning_rate": 4.8960207216818335e-05, "loss": 0.0593, "mean_token_accuracy": 0.9825225472450256, "num_tokens": 70767883.0, "step": 25465 }, { "entropy": 0.05971136726438999, "epoch": 5.937172164587947, "grad_norm": 0.71875, "learning_rate": 4.8959597623375134e-05, "loss": 0.0462, "mean_token_accuracy": 0.9880515694618225, "num_tokens": 70810408.0, "step": 25470 }, { "entropy": 0.06987338345497847, "epoch": 5.938337801608579, "grad_norm": 1.078125, "learning_rate": 4.8958987859052095e-05, "loss": 0.0533, "mean_token_accuracy": 0.9834602892398834, "num_tokens": 70826934.0, "step": 25475 }, { "entropy": 0.08081718422472477, "epoch": 5.9395034386292105, "grad_norm": 2.078125, "learning_rate": 4.895837792385832e-05, "loss": 0.0893, "mean_token_accuracy": 0.9789685130119323, "num_tokens": 70835743.0, "step": 25480 }, { "entropy": 0.05064443135634065, "epoch": 5.940669075649843, "grad_norm": 0.8828125, "learning_rate": 4.895776781780289e-05, "loss": 0.0274, "mean_token_accuracy": 0.9879032492637634, "num_tokens": 70864498.0, "step": 25485 }, { "entropy": 0.0775157648138702, "epoch": 5.941834712670475, "grad_norm": 0.353515625, "learning_rate": 4.895715754089491e-05, "loss": 0.0509, "mean_token_accuracy": 0.9845824480056763, "num_tokens": 70889378.0, "step": 25490 }, { "entropy": 0.0937342531979084, "epoch": 5.943000349691106, "grad_norm": 2.03125, "learning_rate": 4.8956547093143476e-05, "loss": 0.087, "mean_token_accuracy": 0.9744981229305267, "num_tokens": 70899630.0, "step": 25495 }, { "entropy": 0.05566701851785183, "epoch": 5.944165986711738, "grad_norm": 2.671875, "learning_rate": 4.89559364745577e-05, "loss": 0.0472, "mean_token_accuracy": 0.9846188008785248, "num_tokens": 70923647.0, "step": 25500 }, { "entropy": 0.06326587796211243, "epoch": 5.94533162373237, "grad_norm": 0.77734375, "learning_rate": 4.8955325685146666e-05, "loss": 0.0496, "mean_token_accuracy": 0.9842358469963074, "num_tokens": 70948453.0, "step": 25505 }, { "entropy": 0.060926478728652, "epoch": 5.946497260753001, "grad_norm": 3.96875, "learning_rate": 4.8954714724919506e-05, "loss": 0.055, "mean_token_accuracy": 0.984597235918045, "num_tokens": 70972669.0, "step": 25510 }, { "entropy": 0.06100227106362581, "epoch": 5.947662897773633, "grad_norm": 1.3203125, "learning_rate": 4.895410359388533e-05, "loss": 0.0426, "mean_token_accuracy": 0.9850693285465241, "num_tokens": 71001021.0, "step": 25515 }, { "entropy": 0.06199376685544848, "epoch": 5.948828534794265, "grad_norm": 1.7734375, "learning_rate": 4.895349229205323e-05, "loss": 0.0545, "mean_token_accuracy": 0.984418374300003, "num_tokens": 71021573.0, "step": 25520 }, { "entropy": 0.07707611806690692, "epoch": 5.949994171814897, "grad_norm": 2.53125, "learning_rate": 4.8952880819432333e-05, "loss": 0.0506, "mean_token_accuracy": 0.9833807289600373, "num_tokens": 71038416.0, "step": 25525 }, { "entropy": 0.08036872111260891, "epoch": 5.951159808835529, "grad_norm": 2.4375, "learning_rate": 4.895226917603175e-05, "loss": 0.0776, "mean_token_accuracy": 0.9776204705238343, "num_tokens": 71053137.0, "step": 25530 }, { "entropy": 0.07707045953720808, "epoch": 5.95232544585616, "grad_norm": 1.03125, "learning_rate": 4.8951657361860623e-05, "loss": 0.0602, "mean_token_accuracy": 0.9830064177513123, "num_tokens": 71065875.0, "step": 25535 }, { "entropy": 0.06487810891121626, "epoch": 5.953491082876792, "grad_norm": 0.75390625, "learning_rate": 4.895104537692806e-05, "loss": 0.0282, "mean_token_accuracy": 0.9820800602436066, "num_tokens": 71085386.0, "step": 25540 }, { "entropy": 0.1173665277659893, "epoch": 5.954656719897424, "grad_norm": 2.5625, "learning_rate": 4.895043322124318e-05, "loss": 0.1445, "mean_token_accuracy": 0.9714154601097107, "num_tokens": 71104444.0, "step": 25545 }, { "entropy": 0.0770050410181284, "epoch": 5.9558223569180555, "grad_norm": 0.2021484375, "learning_rate": 4.894982089481511e-05, "loss": 0.0646, "mean_token_accuracy": 0.9791462659835816, "num_tokens": 71122244.0, "step": 25550 }, { "entropy": 0.0616754699498415, "epoch": 5.956987993938688, "grad_norm": 1.09375, "learning_rate": 4.8949208397653004e-05, "loss": 0.0594, "mean_token_accuracy": 0.9835680782794952, "num_tokens": 71147606.0, "step": 25555 }, { "entropy": 0.061316430754959586, "epoch": 5.95815363095932, "grad_norm": 0.79296875, "learning_rate": 4.894859572976598e-05, "loss": 0.0545, "mean_token_accuracy": 0.9843459963798523, "num_tokens": 71162452.0, "step": 25560 }, { "entropy": 0.05889552496373653, "epoch": 5.959319267979951, "grad_norm": 1.140625, "learning_rate": 4.8947982891163164e-05, "loss": 0.045, "mean_token_accuracy": 0.982684051990509, "num_tokens": 71177001.0, "step": 25565 }, { "entropy": 0.09865431338548661, "epoch": 5.960484905000583, "grad_norm": 0.88671875, "learning_rate": 4.894736988185371e-05, "loss": 0.0539, "mean_token_accuracy": 0.9783219814300537, "num_tokens": 71206647.0, "step": 25570 }, { "entropy": 0.046166717913001774, "epoch": 5.961650542021214, "grad_norm": 0.419921875, "learning_rate": 4.894675670184675e-05, "loss": 0.0324, "mean_token_accuracy": 0.9870345771312714, "num_tokens": 71233006.0, "step": 25575 }, { "entropy": 0.06371776573359966, "epoch": 5.962816179041846, "grad_norm": 3.078125, "learning_rate": 4.894614335115143e-05, "loss": 0.0517, "mean_token_accuracy": 0.9826191842556, "num_tokens": 71250289.0, "step": 25580 }, { "entropy": 0.062395652197301386, "epoch": 5.9639818160624785, "grad_norm": 0.60546875, "learning_rate": 4.89455298297769e-05, "loss": 0.0548, "mean_token_accuracy": 0.9834360301494598, "num_tokens": 71263976.0, "step": 25585 }, { "entropy": 0.06196978203952312, "epoch": 5.96514745308311, "grad_norm": 2.4375, "learning_rate": 4.894491613773231e-05, "loss": 0.0487, "mean_token_accuracy": 0.984034126996994, "num_tokens": 71290800.0, "step": 25590 }, { "entropy": 0.07303077168762684, "epoch": 5.966313090103742, "grad_norm": 2.296875, "learning_rate": 4.89443022750268e-05, "loss": 0.0615, "mean_token_accuracy": 0.9787440538406372, "num_tokens": 71303915.0, "step": 25595 }, { "entropy": 0.07619455568492413, "epoch": 5.967478727124373, "grad_norm": 0.69921875, "learning_rate": 4.8943688241669536e-05, "loss": 0.0525, "mean_token_accuracy": 0.9836428880691528, "num_tokens": 71318505.0, "step": 25600 }, { "entropy": 0.06338299652561545, "epoch": 5.968644364145005, "grad_norm": 0.6015625, "learning_rate": 4.8943074037669654e-05, "loss": 0.0555, "mean_token_accuracy": 0.9859287023544312, "num_tokens": 71341853.0, "step": 25605 }, { "entropy": 0.0515817703679204, "epoch": 5.969810001165637, "grad_norm": 0.41015625, "learning_rate": 4.8942459663036346e-05, "loss": 0.0267, "mean_token_accuracy": 0.9896455585956574, "num_tokens": 71377062.0, "step": 25610 }, { "entropy": 0.06094341482967138, "epoch": 5.970975638186268, "grad_norm": 0.35546875, "learning_rate": 4.894184511777874e-05, "loss": 0.0573, "mean_token_accuracy": 0.9822888076305389, "num_tokens": 71390415.0, "step": 25615 }, { "entropy": 0.055656261183321475, "epoch": 5.9721412752069005, "grad_norm": 0.98046875, "learning_rate": 4.894123040190602e-05, "loss": 0.0389, "mean_token_accuracy": 0.987447464466095, "num_tokens": 71427479.0, "step": 25620 }, { "entropy": 0.05644221818074584, "epoch": 5.973306912227533, "grad_norm": 0.478515625, "learning_rate": 4.894061551542734e-05, "loss": 0.0153, "mean_token_accuracy": 0.986136132478714, "num_tokens": 71470655.0, "step": 25625 }, { "entropy": 0.045355524122715, "epoch": 5.974472549248164, "grad_norm": 1.46875, "learning_rate": 4.894000045835188e-05, "loss": 0.0374, "mean_token_accuracy": 0.9896352529525757, "num_tokens": 71499833.0, "step": 25630 }, { "entropy": 0.06681758724153042, "epoch": 5.975638186268796, "grad_norm": 0.17578125, "learning_rate": 4.8939385230688807e-05, "loss": 0.0523, "mean_token_accuracy": 0.983164769411087, "num_tokens": 71524600.0, "step": 25635 }, { "entropy": 0.0596080549992621, "epoch": 5.976803823289428, "grad_norm": 2.296875, "learning_rate": 4.893876983244729e-05, "loss": 0.0522, "mean_token_accuracy": 0.9858713269233703, "num_tokens": 71546467.0, "step": 25640 }, { "entropy": 0.08861620575189591, "epoch": 5.977969460310059, "grad_norm": 8.3125, "learning_rate": 4.893815426363652e-05, "loss": 0.0821, "mean_token_accuracy": 0.9688912212848664, "num_tokens": 71579206.0, "step": 25645 }, { "entropy": 0.08461652826517821, "epoch": 5.979135097330691, "grad_norm": 0.92578125, "learning_rate": 4.893753852426565e-05, "loss": 0.0789, "mean_token_accuracy": 0.9783106327056885, "num_tokens": 71589261.0, "step": 25650 }, { "entropy": 0.060508431307971476, "epoch": 5.980300734351323, "grad_norm": 0.7265625, "learning_rate": 4.893692261434389e-05, "loss": 0.057, "mean_token_accuracy": 0.9857955038547516, "num_tokens": 71603908.0, "step": 25655 }, { "entropy": 0.050046111829578875, "epoch": 5.981466371371955, "grad_norm": 0.83203125, "learning_rate": 4.8936306533880405e-05, "loss": 0.0253, "mean_token_accuracy": 0.986841905117035, "num_tokens": 71626686.0, "step": 25660 }, { "entropy": 0.07740709893405437, "epoch": 5.982632008392587, "grad_norm": 0.96484375, "learning_rate": 4.893569028288439e-05, "loss": 0.0644, "mean_token_accuracy": 0.984851849079132, "num_tokens": 71638219.0, "step": 25665 }, { "entropy": 0.07101895548403263, "epoch": 5.983797645413218, "grad_norm": 2.015625, "learning_rate": 4.8935073861365034e-05, "loss": 0.0591, "mean_token_accuracy": 0.97741579413414, "num_tokens": 71651552.0, "step": 25670 }, { "entropy": 0.06982565447688102, "epoch": 5.98496328243385, "grad_norm": 2.078125, "learning_rate": 4.8934457269331527e-05, "loss": 0.0715, "mean_token_accuracy": 0.9800402879714966, "num_tokens": 71662332.0, "step": 25675 }, { "entropy": 0.088308035582304, "epoch": 5.986128919454482, "grad_norm": 0.90625, "learning_rate": 4.8933840506793065e-05, "loss": 0.0857, "mean_token_accuracy": 0.9786054491996765, "num_tokens": 71671220.0, "step": 25680 }, { "entropy": 0.08883246891200543, "epoch": 5.987294556475113, "grad_norm": 0.3515625, "learning_rate": 4.8933223573758845e-05, "loss": 0.0496, "mean_token_accuracy": 0.9835711300373078, "num_tokens": 71696436.0, "step": 25685 }, { "entropy": 0.07261432018131017, "epoch": 5.9884601934957455, "grad_norm": 2.625, "learning_rate": 4.893260647023806e-05, "loss": 0.0756, "mean_token_accuracy": 0.9792898654937744, "num_tokens": 71711295.0, "step": 25690 }, { "entropy": 0.08523041075095535, "epoch": 5.989625830516378, "grad_norm": 1.7890625, "learning_rate": 4.893198919623992e-05, "loss": 0.0682, "mean_token_accuracy": 0.976850026845932, "num_tokens": 71727227.0, "step": 25695 }, { "entropy": 0.06633602287620306, "epoch": 5.990791467537009, "grad_norm": 1.9375, "learning_rate": 4.893137175177363e-05, "loss": 0.0585, "mean_token_accuracy": 0.9812263011932373, "num_tokens": 71743408.0, "step": 25700 }, { "entropy": 0.05985606387257576, "epoch": 5.991957104557641, "grad_norm": 3.203125, "learning_rate": 4.893075413684839e-05, "loss": 0.0669, "mean_token_accuracy": 0.9821926891803742, "num_tokens": 71766920.0, "step": 25705 }, { "entropy": 0.06824294216930867, "epoch": 5.993122741578272, "grad_norm": 3.171875, "learning_rate": 4.893013635147341e-05, "loss": 0.0909, "mean_token_accuracy": 0.9769490718841553, "num_tokens": 71777572.0, "step": 25710 }, { "entropy": 0.08433944657444954, "epoch": 5.994288378598904, "grad_norm": 1.203125, "learning_rate": 4.89295183956579e-05, "loss": 0.072, "mean_token_accuracy": 0.9772077202796936, "num_tokens": 71788992.0, "step": 25715 }, { "entropy": 0.06912859678268432, "epoch": 5.995454015619536, "grad_norm": 3.078125, "learning_rate": 4.892890026941109e-05, "loss": 0.0489, "mean_token_accuracy": 0.9828975081443787, "num_tokens": 71811320.0, "step": 25720 }, { "entropy": 0.0763224471360445, "epoch": 5.996619652640168, "grad_norm": 1.1171875, "learning_rate": 4.892828197274218e-05, "loss": 0.0814, "mean_token_accuracy": 0.9780821740627289, "num_tokens": 71829482.0, "step": 25725 }, { "entropy": 0.03815199537202716, "epoch": 5.9977852896608, "grad_norm": 0.93359375, "learning_rate": 4.892766350566041e-05, "loss": 0.0193, "mean_token_accuracy": 0.9918882131576539, "num_tokens": 71883754.0, "step": 25730 }, { "entropy": 0.06713078990578651, "epoch": 5.998950926681431, "grad_norm": 1.9765625, "learning_rate": 4.892704486817498e-05, "loss": 0.0522, "mean_token_accuracy": 0.9826019644737244, "num_tokens": 71895540.0, "step": 25735 }, { "entropy": 0.06625933138032754, "epoch": 6.0, "grad_norm": 2.75, "learning_rate": 4.892642606029512e-05, "loss": 0.044, "mean_token_accuracy": 0.985924243927002, "num_tokens": 71923000.0, "step": 25740 }, { "entropy": 0.05685313232243061, "epoch": 6.001165637020632, "grad_norm": 0.62109375, "learning_rate": 4.892580708203007e-05, "loss": 0.0352, "mean_token_accuracy": 0.9892299056053162, "num_tokens": 71937131.0, "step": 25745 }, { "entropy": 0.05876139011234045, "epoch": 6.002331274041263, "grad_norm": 0.57421875, "learning_rate": 4.8925187933389035e-05, "loss": 0.0284, "mean_token_accuracy": 0.9893492102622986, "num_tokens": 71959498.0, "step": 25750 }, { "entropy": 0.05102113718166947, "epoch": 6.003496911061895, "grad_norm": 1.0234375, "learning_rate": 4.892456861438128e-05, "loss": 0.0223, "mean_token_accuracy": 0.9925749123096466, "num_tokens": 71982142.0, "step": 25755 }, { "entropy": 0.05887993331998587, "epoch": 6.0046625480825275, "grad_norm": 1.1484375, "learning_rate": 4.8923949125016013e-05, "loss": 0.0386, "mean_token_accuracy": 0.9897330641746521, "num_tokens": 71994131.0, "step": 25760 }, { "entropy": 0.05944917807355523, "epoch": 6.005828185103159, "grad_norm": 0.94140625, "learning_rate": 4.892332946530249e-05, "loss": 0.0271, "mean_token_accuracy": 0.9928303897380829, "num_tokens": 72013539.0, "step": 25765 }, { "entropy": 0.0366467990912497, "epoch": 6.006993822123791, "grad_norm": 0.5625, "learning_rate": 4.892270963524994e-05, "loss": 0.016, "mean_token_accuracy": 0.9950882434844971, "num_tokens": 72038175.0, "step": 25770 }, { "entropy": 0.05455034887418151, "epoch": 6.008159459144422, "grad_norm": 3.96875, "learning_rate": 4.8922089634867606e-05, "loss": 0.0328, "mean_token_accuracy": 0.9901614844799042, "num_tokens": 72055936.0, "step": 25775 }, { "entropy": 0.05095590045675635, "epoch": 6.009325096165054, "grad_norm": 2.234375, "learning_rate": 4.892146946416474e-05, "loss": 0.0312, "mean_token_accuracy": 0.9916975438594818, "num_tokens": 72073682.0, "step": 25780 }, { "entropy": 0.060593480616807936, "epoch": 6.010490733185686, "grad_norm": 2.03125, "learning_rate": 4.892084912315059e-05, "loss": 0.0325, "mean_token_accuracy": 0.9904182434082032, "num_tokens": 72085726.0, "step": 25785 }, { "entropy": 0.05342343971133232, "epoch": 6.0116563702063175, "grad_norm": 1.015625, "learning_rate": 4.892022861183439e-05, "loss": 0.0268, "mean_token_accuracy": 0.9912295579910279, "num_tokens": 72107893.0, "step": 25790 }, { "entropy": 0.06415905207395553, "epoch": 6.01282200722695, "grad_norm": 1.9921875, "learning_rate": 4.891960793022541e-05, "loss": 0.0397, "mean_token_accuracy": 0.9880828559398651, "num_tokens": 72126518.0, "step": 25795 }, { "entropy": 0.05017009107396007, "epoch": 6.013987644247582, "grad_norm": 1.84375, "learning_rate": 4.8918987078332904e-05, "loss": 0.0242, "mean_token_accuracy": 0.9938202440738678, "num_tokens": 72154112.0, "step": 25800 }, { "entropy": 0.07650583721697331, "epoch": 6.015153281268213, "grad_norm": 3.953125, "learning_rate": 4.8918366056166114e-05, "loss": 0.0514, "mean_token_accuracy": 0.9844214498996735, "num_tokens": 72176069.0, "step": 25805 }, { "entropy": 0.05705469730310142, "epoch": 6.016318918288845, "grad_norm": 0.1572265625, "learning_rate": 4.891774486373432e-05, "loss": 0.0212, "mean_token_accuracy": 0.9931414604187012, "num_tokens": 72199150.0, "step": 25810 }, { "entropy": 0.06287220679223537, "epoch": 6.017484555309476, "grad_norm": 1.7890625, "learning_rate": 4.891712350104678e-05, "loss": 0.0337, "mean_token_accuracy": 0.9904451191425323, "num_tokens": 72218038.0, "step": 25815 }, { "entropy": 0.05191007032990456, "epoch": 6.018650192330108, "grad_norm": 0.490234375, "learning_rate": 4.891650196811275e-05, "loss": 0.0135, "mean_token_accuracy": 0.9937026917934417, "num_tokens": 72264266.0, "step": 25820 }, { "entropy": 0.05475984732620418, "epoch": 6.01981582935074, "grad_norm": 2.75, "learning_rate": 4.891588026494151e-05, "loss": 0.0107, "mean_token_accuracy": 0.9927169740200043, "num_tokens": 72305044.0, "step": 25825 }, { "entropy": 0.05737621607258916, "epoch": 6.020981466371372, "grad_norm": 0.55859375, "learning_rate": 4.8915258391542316e-05, "loss": 0.0269, "mean_token_accuracy": 0.991907000541687, "num_tokens": 72323624.0, "step": 25830 }, { "entropy": 0.06937633194029331, "epoch": 6.022147103392004, "grad_norm": 2.265625, "learning_rate": 4.8914636347924454e-05, "loss": 0.0611, "mean_token_accuracy": 0.9854352653026581, "num_tokens": 72337594.0, "step": 25835 }, { "entropy": 0.06716333478689193, "epoch": 6.023312740412636, "grad_norm": 1.75, "learning_rate": 4.89140141340972e-05, "loss": 0.0324, "mean_token_accuracy": 0.990732753276825, "num_tokens": 72347443.0, "step": 25840 }, { "entropy": 0.05915342541411519, "epoch": 6.024478377433267, "grad_norm": 0.453125, "learning_rate": 4.891339175006981e-05, "loss": 0.0364, "mean_token_accuracy": 0.9867867410182953, "num_tokens": 72384961.0, "step": 25845 }, { "entropy": 0.09259512033313513, "epoch": 6.025644014453899, "grad_norm": 1.3984375, "learning_rate": 4.891276919585161e-05, "loss": 0.0653, "mean_token_accuracy": 0.9823358118534088, "num_tokens": 72399191.0, "step": 25850 }, { "entropy": 0.08896644115447998, "epoch": 6.02680965147453, "grad_norm": 0.296875, "learning_rate": 4.891214647145184e-05, "loss": 0.0761, "mean_token_accuracy": 0.985335499048233, "num_tokens": 72434806.0, "step": 25855 }, { "entropy": 0.05383868329226971, "epoch": 6.0279752884951625, "grad_norm": 0.447265625, "learning_rate": 4.8911523576879795e-05, "loss": 0.0238, "mean_token_accuracy": 0.9926303625106812, "num_tokens": 72452382.0, "step": 25860 }, { "entropy": 0.06760429283604026, "epoch": 6.029140925515795, "grad_norm": 1.625, "learning_rate": 4.891090051214478e-05, "loss": 0.0319, "mean_token_accuracy": 0.989197313785553, "num_tokens": 72473686.0, "step": 25865 }, { "entropy": 0.06802155338227749, "epoch": 6.030306562536426, "grad_norm": 2.3125, "learning_rate": 4.891027727725607e-05, "loss": 0.0272, "mean_token_accuracy": 0.9918610870838165, "num_tokens": 72485853.0, "step": 25870 }, { "entropy": 0.0644018879160285, "epoch": 6.031472199557058, "grad_norm": 2.203125, "learning_rate": 4.890965387222297e-05, "loss": 0.0432, "mean_token_accuracy": 0.9895402014255523, "num_tokens": 72505760.0, "step": 25875 }, { "entropy": 0.06654907325282693, "epoch": 6.03263783657769, "grad_norm": 0.337890625, "learning_rate": 4.8909030297054764e-05, "loss": 0.0427, "mean_token_accuracy": 0.992247325181961, "num_tokens": 72532642.0, "step": 25880 }, { "entropy": 0.05838645258918405, "epoch": 6.033803473598321, "grad_norm": 2.9375, "learning_rate": 4.890840655176076e-05, "loss": 0.0355, "mean_token_accuracy": 0.9868348956108093, "num_tokens": 72556266.0, "step": 25885 }, { "entropy": 0.06037556882947683, "epoch": 6.034969110618953, "grad_norm": 2.234375, "learning_rate": 4.890778263635025e-05, "loss": 0.0318, "mean_token_accuracy": 0.9884869992733002, "num_tokens": 72569724.0, "step": 25890 }, { "entropy": 0.058738169819116594, "epoch": 6.0361347476395855, "grad_norm": 2.609375, "learning_rate": 4.890715855083255e-05, "loss": 0.0247, "mean_token_accuracy": 0.993329894542694, "num_tokens": 72582251.0, "step": 25895 }, { "entropy": 0.06071986351162195, "epoch": 6.037300384660217, "grad_norm": 0.859375, "learning_rate": 4.890653429521695e-05, "loss": 0.0309, "mean_token_accuracy": 0.988233745098114, "num_tokens": 72595487.0, "step": 25900 }, { "entropy": 0.05742992917075753, "epoch": 6.038466021680849, "grad_norm": 1.7421875, "learning_rate": 4.8905909869512775e-05, "loss": 0.0181, "mean_token_accuracy": 0.9917668163776397, "num_tokens": 72627186.0, "step": 25905 }, { "entropy": 0.06477688588201999, "epoch": 6.03963165870148, "grad_norm": 1.1484375, "learning_rate": 4.890528527372933e-05, "loss": 0.0485, "mean_token_accuracy": 0.9858067691326141, "num_tokens": 72637192.0, "step": 25910 }, { "entropy": 0.05110297799110412, "epoch": 6.040797295722112, "grad_norm": 0.98046875, "learning_rate": 4.8904660507875924e-05, "loss": 0.0218, "mean_token_accuracy": 0.9923175871372223, "num_tokens": 72665602.0, "step": 25915 }, { "entropy": 0.12344857305288315, "epoch": 6.041962932742744, "grad_norm": 2.546875, "learning_rate": 4.890403557196187e-05, "loss": 0.056, "mean_token_accuracy": 0.9839066147804261, "num_tokens": 72675252.0, "step": 25920 }, { "entropy": 0.06564762499183416, "epoch": 6.043128569763375, "grad_norm": 4.9375, "learning_rate": 4.8903410465996495e-05, "loss": 0.0278, "mean_token_accuracy": 0.9908899009227753, "num_tokens": 72698132.0, "step": 25925 }, { "entropy": 0.07632733806967736, "epoch": 6.0442942067840075, "grad_norm": 2.890625, "learning_rate": 4.890278518998912e-05, "loss": 0.0393, "mean_token_accuracy": 0.9884995639324188, "num_tokens": 72713793.0, "step": 25930 }, { "entropy": 0.0680435385555029, "epoch": 6.04545984380464, "grad_norm": 0.9609375, "learning_rate": 4.8902159743949073e-05, "loss": 0.0506, "mean_token_accuracy": 0.9874895095825196, "num_tokens": 72723522.0, "step": 25935 }, { "entropy": 0.07830537669360638, "epoch": 6.046625480825271, "grad_norm": 1.671875, "learning_rate": 4.890153412788567e-05, "loss": 0.0334, "mean_token_accuracy": 0.986879688501358, "num_tokens": 72737014.0, "step": 25940 }, { "entropy": 0.07722733989357948, "epoch": 6.047791117845903, "grad_norm": 2.671875, "learning_rate": 4.890090834180824e-05, "loss": 0.0561, "mean_token_accuracy": 0.9857710182666779, "num_tokens": 72745948.0, "step": 25945 }, { "entropy": 0.0765731481835246, "epoch": 6.048956754866534, "grad_norm": 0.55078125, "learning_rate": 4.8900282385726127e-05, "loss": 0.0434, "mean_token_accuracy": 0.9872212827205658, "num_tokens": 72763772.0, "step": 25950 }, { "entropy": 0.06433370187878609, "epoch": 6.050122391887166, "grad_norm": 0.98046875, "learning_rate": 4.8899656259648655e-05, "loss": 0.0257, "mean_token_accuracy": 0.988516104221344, "num_tokens": 72784836.0, "step": 25955 }, { "entropy": 0.04958404209464788, "epoch": 6.051288028907798, "grad_norm": 0.306640625, "learning_rate": 4.8899029963585155e-05, "loss": 0.0303, "mean_token_accuracy": 0.9914787411689758, "num_tokens": 72800542.0, "step": 25960 }, { "entropy": 0.05592640060931444, "epoch": 6.05245366592843, "grad_norm": 1.046875, "learning_rate": 4.889840349754498e-05, "loss": 0.0216, "mean_token_accuracy": 0.9904370605945587, "num_tokens": 72823728.0, "step": 25965 }, { "entropy": 0.06810164107009768, "epoch": 6.053619302949062, "grad_norm": 0.4609375, "learning_rate": 4.889777686153746e-05, "loss": 0.0343, "mean_token_accuracy": 0.9900489509105682, "num_tokens": 72845251.0, "step": 25970 }, { "entropy": 0.07986075691878795, "epoch": 6.054784939969694, "grad_norm": 5.34375, "learning_rate": 4.889715005557194e-05, "loss": 0.0563, "mean_token_accuracy": 0.98703653216362, "num_tokens": 72867014.0, "step": 25975 }, { "entropy": 0.058453739061951636, "epoch": 6.055950576990325, "grad_norm": 0.380859375, "learning_rate": 4.889652307965778e-05, "loss": 0.0185, "mean_token_accuracy": 0.99262655377388, "num_tokens": 72890050.0, "step": 25980 }, { "entropy": 0.06057945545762777, "epoch": 6.057116214010957, "grad_norm": 1.1796875, "learning_rate": 4.889589593380432e-05, "loss": 0.0237, "mean_token_accuracy": 0.9909035623073578, "num_tokens": 72914583.0, "step": 25985 }, { "entropy": 0.1058798679150641, "epoch": 6.058281851031588, "grad_norm": 0.2431640625, "learning_rate": 4.88952686180209e-05, "loss": 0.0489, "mean_token_accuracy": 0.9859530746936798, "num_tokens": 72948565.0, "step": 25990 }, { "entropy": 0.07051466554403304, "epoch": 6.05944748805222, "grad_norm": 2.546875, "learning_rate": 4.889464113231689e-05, "loss": 0.036, "mean_token_accuracy": 0.9863975763320922, "num_tokens": 72960750.0, "step": 25995 }, { "entropy": 0.0715387485921383, "epoch": 6.0606131250728525, "grad_norm": 1.15625, "learning_rate": 4.889401347670165e-05, "loss": 0.047, "mean_token_accuracy": 0.9889802992343902, "num_tokens": 72981087.0, "step": 26000 }, { "entropy": 0.1161620058119297, "epoch": 6.061778762093484, "grad_norm": 0.33984375, "learning_rate": 4.8893385651184524e-05, "loss": 0.0313, "mean_token_accuracy": 0.9857727110385894, "num_tokens": 73000120.0, "step": 26005 }, { "entropy": 0.06805797554552555, "epoch": 6.062944399114116, "grad_norm": 3.21875, "learning_rate": 4.889275765577488e-05, "loss": 0.0362, "mean_token_accuracy": 0.9873246729373932, "num_tokens": 73028760.0, "step": 26010 }, { "entropy": 0.0626982631161809, "epoch": 6.064110036134748, "grad_norm": 1.2890625, "learning_rate": 4.889212949048209e-05, "loss": 0.0254, "mean_token_accuracy": 0.988321989774704, "num_tokens": 73058804.0, "step": 26015 }, { "entropy": 0.053884850721806286, "epoch": 6.065275673155379, "grad_norm": 0.40625, "learning_rate": 4.889150115531551e-05, "loss": 0.0154, "mean_token_accuracy": 0.9942508041858673, "num_tokens": 73080961.0, "step": 26020 }, { "entropy": 0.05890355911105871, "epoch": 6.066441310176011, "grad_norm": 1.03125, "learning_rate": 4.889087265028452e-05, "loss": 0.0467, "mean_token_accuracy": 0.9890576004981995, "num_tokens": 73096398.0, "step": 26025 }, { "entropy": 0.0513009587302804, "epoch": 6.067606947196643, "grad_norm": 0.490234375, "learning_rate": 4.889024397539848e-05, "loss": 0.0192, "mean_token_accuracy": 0.9914946913719177, "num_tokens": 73122113.0, "step": 26030 }, { "entropy": 0.06160991545766592, "epoch": 6.068772584217275, "grad_norm": 0.765625, "learning_rate": 4.888961513066677e-05, "loss": 0.0293, "mean_token_accuracy": 0.9876994311809539, "num_tokens": 73150370.0, "step": 26035 }, { "entropy": 0.06331438571214676, "epoch": 6.069938221237907, "grad_norm": 0.49609375, "learning_rate": 4.888898611609877e-05, "loss": 0.0264, "mean_token_accuracy": 0.9909036993980408, "num_tokens": 73175420.0, "step": 26040 }, { "entropy": 0.13340820157900452, "epoch": 6.071103858258538, "grad_norm": 3.765625, "learning_rate": 4.888835693170386e-05, "loss": 0.1831, "mean_token_accuracy": 0.9607505977153779, "num_tokens": 73217582.0, "step": 26045 }, { "entropy": 0.05735396733507514, "epoch": 6.07226949527917, "grad_norm": 1.203125, "learning_rate": 4.8887727577491414e-05, "loss": 0.0231, "mean_token_accuracy": 0.9909033179283142, "num_tokens": 73246641.0, "step": 26050 }, { "entropy": 0.08030778989195823, "epoch": 6.073435132299802, "grad_norm": 2.40625, "learning_rate": 4.888709805347082e-05, "loss": 0.0286, "mean_token_accuracy": 0.9901704609394073, "num_tokens": 73258761.0, "step": 26055 }, { "entropy": 0.07896085307002068, "epoch": 6.074600769320433, "grad_norm": 4.125, "learning_rate": 4.888646835965147e-05, "loss": 0.0595, "mean_token_accuracy": 0.9851579368114471, "num_tokens": 73268333.0, "step": 26060 }, { "entropy": 0.06064739301800728, "epoch": 6.0757664063410655, "grad_norm": 0.453125, "learning_rate": 4.888583849604275e-05, "loss": 0.0227, "mean_token_accuracy": 0.9928803324699402, "num_tokens": 73287812.0, "step": 26065 }, { "entropy": 0.04188825218006968, "epoch": 6.076932043361698, "grad_norm": 0.51171875, "learning_rate": 4.888520846265405e-05, "loss": 0.0127, "mean_token_accuracy": 0.9933510839939117, "num_tokens": 73325445.0, "step": 26070 }, { "entropy": 0.05650821551680565, "epoch": 6.078097680382329, "grad_norm": 3.15625, "learning_rate": 4.888457825949478e-05, "loss": 0.0233, "mean_token_accuracy": 0.9924087584018707, "num_tokens": 73349736.0, "step": 26075 }, { "entropy": 0.0596388409845531, "epoch": 6.079263317402961, "grad_norm": 0.10205078125, "learning_rate": 4.888394788657431e-05, "loss": 0.0391, "mean_token_accuracy": 0.9876825332641601, "num_tokens": 73384001.0, "step": 26080 }, { "entropy": 0.06745455488562584, "epoch": 6.080428954423592, "grad_norm": 2.515625, "learning_rate": 4.8883317343902065e-05, "loss": 0.0347, "mean_token_accuracy": 0.9912264704704284, "num_tokens": 73405821.0, "step": 26085 }, { "entropy": 0.07572416644543409, "epoch": 6.081594591444224, "grad_norm": 0.6875, "learning_rate": 4.888268663148743e-05, "loss": 0.0218, "mean_token_accuracy": 0.9936153054237366, "num_tokens": 73425459.0, "step": 26090 }, { "entropy": 0.05982452109456062, "epoch": 6.082760228464856, "grad_norm": 1.6953125, "learning_rate": 4.8882055749339814e-05, "loss": 0.0166, "mean_token_accuracy": 0.9946097016334534, "num_tokens": 73451504.0, "step": 26095 }, { "entropy": 0.05443444773554802, "epoch": 6.0839258654854875, "grad_norm": 0.91015625, "learning_rate": 4.8881424697468635e-05, "loss": 0.0124, "mean_token_accuracy": 0.992647510766983, "num_tokens": 73491398.0, "step": 26100 }, { "entropy": 0.06279923100955784, "epoch": 6.08509150250612, "grad_norm": 1.4453125, "learning_rate": 4.88807934758833e-05, "loss": 0.0299, "mean_token_accuracy": 0.9905305862426758, "num_tokens": 73521552.0, "step": 26105 }, { "entropy": 0.07592700524255633, "epoch": 6.086257139526752, "grad_norm": 2.484375, "learning_rate": 4.8880162084593204e-05, "loss": 0.0568, "mean_token_accuracy": 0.9855045139789581, "num_tokens": 73546810.0, "step": 26110 }, { "entropy": 0.060017453879117964, "epoch": 6.087422776547383, "grad_norm": 0.1787109375, "learning_rate": 4.887953052360778e-05, "loss": 0.0246, "mean_token_accuracy": 0.9929731905460357, "num_tokens": 73570295.0, "step": 26115 }, { "entropy": 0.07486429456621409, "epoch": 6.088588413568015, "grad_norm": 2.1875, "learning_rate": 4.887889879293644e-05, "loss": 0.0276, "mean_token_accuracy": 0.9905243337154388, "num_tokens": 73582178.0, "step": 26120 }, { "entropy": 0.09151984080672264, "epoch": 6.089754050588646, "grad_norm": 3.53125, "learning_rate": 4.8878266892588605e-05, "loss": 0.034, "mean_token_accuracy": 0.9858936965465546, "num_tokens": 73597195.0, "step": 26125 }, { "entropy": 0.05330041013658047, "epoch": 6.090919687609278, "grad_norm": 0.73828125, "learning_rate": 4.887763482257369e-05, "loss": 0.0359, "mean_token_accuracy": 0.9910796642303467, "num_tokens": 73622884.0, "step": 26130 }, { "entropy": 0.05964062893763185, "epoch": 6.0920853246299105, "grad_norm": 1.9453125, "learning_rate": 4.887700258290113e-05, "loss": 0.0266, "mean_token_accuracy": 0.9897635579109192, "num_tokens": 73649204.0, "step": 26135 }, { "entropy": 0.06973530426621437, "epoch": 6.093250961650542, "grad_norm": 1.8671875, "learning_rate": 4.887637017358036e-05, "loss": 0.0227, "mean_token_accuracy": 0.9914168953895569, "num_tokens": 73661502.0, "step": 26140 }, { "entropy": 0.07038565017282963, "epoch": 6.094416598671174, "grad_norm": 0.216796875, "learning_rate": 4.887573759462079e-05, "loss": 0.0263, "mean_token_accuracy": 0.9904610097408295, "num_tokens": 73685578.0, "step": 26145 }, { "entropy": 0.04337377091869712, "epoch": 6.095582235691806, "grad_norm": 0.310546875, "learning_rate": 4.887510484603186e-05, "loss": 0.0156, "mean_token_accuracy": 0.9947210431098938, "num_tokens": 73715768.0, "step": 26150 }, { "entropy": 0.07746588047593832, "epoch": 6.096747872712437, "grad_norm": 2.34375, "learning_rate": 4.887447192782302e-05, "loss": 0.0371, "mean_token_accuracy": 0.9873392581939697, "num_tokens": 73728063.0, "step": 26155 }, { "entropy": 0.053429150208830836, "epoch": 6.097913509733069, "grad_norm": 2.421875, "learning_rate": 4.887383884000368e-05, "loss": 0.0274, "mean_token_accuracy": 0.9937780797481537, "num_tokens": 73757134.0, "step": 26160 }, { "entropy": 0.0564187285490334, "epoch": 6.099079146753701, "grad_norm": 3.15625, "learning_rate": 4.88732055825833e-05, "loss": 0.0269, "mean_token_accuracy": 0.9911048114299774, "num_tokens": 73781153.0, "step": 26165 }, { "entropy": 0.07645439747720957, "epoch": 6.1002447837743325, "grad_norm": 0.291015625, "learning_rate": 4.887257215557133e-05, "loss": 0.0337, "mean_token_accuracy": 0.9878023087978363, "num_tokens": 73796312.0, "step": 26170 }, { "entropy": 0.07418763572350144, "epoch": 6.101410420794965, "grad_norm": 0.1796875, "learning_rate": 4.8871938558977194e-05, "loss": 0.0304, "mean_token_accuracy": 0.9894303560256958, "num_tokens": 73814524.0, "step": 26175 }, { "entropy": 0.0833287613466382, "epoch": 6.102576057815596, "grad_norm": 3.328125, "learning_rate": 4.887130479281035e-05, "loss": 0.0294, "mean_token_accuracy": 0.9921909213066101, "num_tokens": 73831240.0, "step": 26180 }, { "entropy": 0.08284763377159834, "epoch": 6.103741694836228, "grad_norm": 0.208984375, "learning_rate": 4.8870670857080246e-05, "loss": 0.0206, "mean_token_accuracy": 0.9901773989200592, "num_tokens": 73853877.0, "step": 26185 }, { "entropy": 0.05893292501568794, "epoch": 6.10490733185686, "grad_norm": 1.3984375, "learning_rate": 4.887003675179634e-05, "loss": 0.034, "mean_token_accuracy": 0.9904161393642426, "num_tokens": 73883298.0, "step": 26190 }, { "entropy": 0.07303530490025878, "epoch": 6.106072968877491, "grad_norm": 0.291015625, "learning_rate": 4.8869402476968083e-05, "loss": 0.0285, "mean_token_accuracy": 0.9904853463172912, "num_tokens": 73903304.0, "step": 26195 }, { "entropy": 0.08997571468353271, "epoch": 6.107238605898123, "grad_norm": 0.96875, "learning_rate": 4.886876803260494e-05, "loss": 0.0275, "mean_token_accuracy": 0.9917174279689789, "num_tokens": 73914506.0, "step": 26200 }, { "entropy": 0.06205089651048183, "epoch": 6.1084042429187555, "grad_norm": 2.671875, "learning_rate": 4.886813341871636e-05, "loss": 0.0318, "mean_token_accuracy": 0.99107666015625, "num_tokens": 73938577.0, "step": 26205 }, { "entropy": 0.053936326131224635, "epoch": 6.109569879939387, "grad_norm": 3.140625, "learning_rate": 4.886749863531182e-05, "loss": 0.038, "mean_token_accuracy": 0.9904813170433044, "num_tokens": 73955143.0, "step": 26210 }, { "entropy": 0.06608153507113457, "epoch": 6.110735516960019, "grad_norm": 1.453125, "learning_rate": 4.886686368240077e-05, "loss": 0.0412, "mean_token_accuracy": 0.9894643902778626, "num_tokens": 73965817.0, "step": 26215 }, { "entropy": 0.04572048811241984, "epoch": 6.11190115398065, "grad_norm": 0.458984375, "learning_rate": 4.8866228559992685e-05, "loss": 0.0144, "mean_token_accuracy": 0.9954964637756347, "num_tokens": 73997786.0, "step": 26220 }, { "entropy": 0.07351892106235028, "epoch": 6.113066791001282, "grad_norm": 0.8984375, "learning_rate": 4.886559326809704e-05, "loss": 0.0449, "mean_token_accuracy": 0.9858495235443115, "num_tokens": 74016499.0, "step": 26225 }, { "entropy": 0.06393312495201826, "epoch": 6.114232428021914, "grad_norm": 1.171875, "learning_rate": 4.8864957806723296e-05, "loss": 0.0212, "mean_token_accuracy": 0.9930548965930939, "num_tokens": 74048662.0, "step": 26230 }, { "entropy": 0.0660844799131155, "epoch": 6.1153980650425455, "grad_norm": 1.046875, "learning_rate": 4.886432217588095e-05, "loss": 0.0265, "mean_token_accuracy": 0.989313280582428, "num_tokens": 74068187.0, "step": 26235 }, { "entropy": 0.055278994515538214, "epoch": 6.116563702063178, "grad_norm": 1.078125, "learning_rate": 4.886368637557946e-05, "loss": 0.0243, "mean_token_accuracy": 0.9904797196388244, "num_tokens": 74091261.0, "step": 26240 }, { "entropy": 0.07485796064138413, "epoch": 6.11772933908381, "grad_norm": 0.8671875, "learning_rate": 4.886305040582832e-05, "loss": 0.0494, "mean_token_accuracy": 0.9882545590400695, "num_tokens": 74101483.0, "step": 26245 }, { "entropy": 0.07578368950635195, "epoch": 6.118894976104441, "grad_norm": 4.09375, "learning_rate": 4.8862414266637e-05, "loss": 0.0357, "mean_token_accuracy": 0.990442156791687, "num_tokens": 74112359.0, "step": 26250 }, { "entropy": 0.05760523192584514, "epoch": 6.120060613125073, "grad_norm": 0.314453125, "learning_rate": 4.8861777958014996e-05, "loss": 0.0118, "mean_token_accuracy": 0.9920580506324768, "num_tokens": 74152099.0, "step": 26255 }, { "entropy": 0.060783774219453336, "epoch": 6.121226250145704, "grad_norm": 0.31640625, "learning_rate": 4.886114147997179e-05, "loss": 0.0185, "mean_token_accuracy": 0.9896743893623352, "num_tokens": 74183403.0, "step": 26260 }, { "entropy": 0.073437774553895, "epoch": 6.122391887166336, "grad_norm": 1.1796875, "learning_rate": 4.886050483251689e-05, "loss": 0.0379, "mean_token_accuracy": 0.9888258576393127, "num_tokens": 74195017.0, "step": 26265 }, { "entropy": 0.061823181249201296, "epoch": 6.123557524186968, "grad_norm": 3.71875, "learning_rate": 4.8859868015659764e-05, "loss": 0.0227, "mean_token_accuracy": 0.9911990523338318, "num_tokens": 74214212.0, "step": 26270 }, { "entropy": 0.044901407044380906, "epoch": 6.1247231612076, "grad_norm": 0.271484375, "learning_rate": 4.8859231029409925e-05, "loss": 0.0157, "mean_token_accuracy": 0.9923342585563659, "num_tokens": 74245056.0, "step": 26275 }, { "entropy": 0.07991915084421634, "epoch": 6.125888798228232, "grad_norm": 2.515625, "learning_rate": 4.885859387377686e-05, "loss": 0.0354, "mean_token_accuracy": 0.9882398664951324, "num_tokens": 74270399.0, "step": 26280 }, { "entropy": 0.06915805991739035, "epoch": 6.127054435248864, "grad_norm": 0.8515625, "learning_rate": 4.885795654877009e-05, "loss": 0.033, "mean_token_accuracy": 0.9874129176139832, "num_tokens": 74286158.0, "step": 26285 }, { "entropy": 0.07844992205500603, "epoch": 6.128220072269495, "grad_norm": 1.171875, "learning_rate": 4.885731905439909e-05, "loss": 0.0574, "mean_token_accuracy": 0.9869462549686432, "num_tokens": 74295775.0, "step": 26290 }, { "entropy": 0.0868721805512905, "epoch": 6.129385709290127, "grad_norm": 2.53125, "learning_rate": 4.885668139067338e-05, "loss": 0.0638, "mean_token_accuracy": 0.9849142253398895, "num_tokens": 74315238.0, "step": 26295 }, { "entropy": 0.0664596289396286, "epoch": 6.130551346310758, "grad_norm": 0.44140625, "learning_rate": 4.885604355760248e-05, "loss": 0.0162, "mean_token_accuracy": 0.9912093698978424, "num_tokens": 74345863.0, "step": 26300 }, { "entropy": 0.09681078474968671, "epoch": 6.1317169833313905, "grad_norm": 0.2890625, "learning_rate": 4.885540555519588e-05, "loss": 0.0525, "mean_token_accuracy": 0.9850651860237122, "num_tokens": 74361453.0, "step": 26305 }, { "entropy": 0.04459414193406701, "epoch": 6.132882620352023, "grad_norm": 2.171875, "learning_rate": 4.8854767383463106e-05, "loss": 0.0168, "mean_token_accuracy": 0.9929663062095642, "num_tokens": 74400020.0, "step": 26310 }, { "entropy": 0.05299360081553459, "epoch": 6.134048257372654, "grad_norm": 0.4140625, "learning_rate": 4.8854129042413674e-05, "loss": 0.0118, "mean_token_accuracy": 0.9963202357292176, "num_tokens": 74443995.0, "step": 26315 }, { "entropy": 0.08458766452968121, "epoch": 6.135213894393286, "grad_norm": 1.3203125, "learning_rate": 4.88534905320571e-05, "loss": 0.0486, "mean_token_accuracy": 0.9887842237949371, "num_tokens": 74454639.0, "step": 26320 }, { "entropy": 0.06806373670697212, "epoch": 6.136379531413918, "grad_norm": 3.265625, "learning_rate": 4.8852851852402906e-05, "loss": 0.0471, "mean_token_accuracy": 0.9882710754871369, "num_tokens": 74485326.0, "step": 26325 }, { "entropy": 0.051118244975805284, "epoch": 6.137545168434549, "grad_norm": 0.154296875, "learning_rate": 4.885221300346061e-05, "loss": 0.0257, "mean_token_accuracy": 0.9931609928607941, "num_tokens": 74511678.0, "step": 26330 }, { "entropy": 0.07300595417618752, "epoch": 6.138710805455181, "grad_norm": 1.6875, "learning_rate": 4.8851573985239753e-05, "loss": 0.0305, "mean_token_accuracy": 0.9920416593551635, "num_tokens": 74540779.0, "step": 26335 }, { "entropy": 0.050485126581043004, "epoch": 6.139876442475813, "grad_norm": 0.62890625, "learning_rate": 4.885093479774985e-05, "loss": 0.0184, "mean_token_accuracy": 0.9928144335746765, "num_tokens": 74575792.0, "step": 26340 }, { "entropy": 0.06935878098011017, "epoch": 6.141042079496445, "grad_norm": 1.6171875, "learning_rate": 4.8850295441000435e-05, "loss": 0.0382, "mean_token_accuracy": 0.9901534140110015, "num_tokens": 74594585.0, "step": 26345 }, { "entropy": 0.06734537519514561, "epoch": 6.142207716517077, "grad_norm": 3.328125, "learning_rate": 4.8849655915001044e-05, "loss": 0.0294, "mean_token_accuracy": 0.9917540729045868, "num_tokens": 74605422.0, "step": 26350 }, { "entropy": 0.060905332677066326, "epoch": 6.143373353537708, "grad_norm": 2.03125, "learning_rate": 4.884901621976121e-05, "loss": 0.0328, "mean_token_accuracy": 0.9897407293319702, "num_tokens": 74620466.0, "step": 26355 }, { "entropy": 0.074892489425838, "epoch": 6.14453899055834, "grad_norm": 0.94921875, "learning_rate": 4.884837635529048e-05, "loss": 0.0652, "mean_token_accuracy": 0.985385262966156, "num_tokens": 74633990.0, "step": 26360 }, { "entropy": 0.05228735357522964, "epoch": 6.145704627578972, "grad_norm": 0.40625, "learning_rate": 4.884773632159839e-05, "loss": 0.0195, "mean_token_accuracy": 0.9903470396995544, "num_tokens": 74665082.0, "step": 26365 }, { "entropy": 0.06522688157856464, "epoch": 6.146870264599603, "grad_norm": 0.44140625, "learning_rate": 4.8847096118694474e-05, "loss": 0.0306, "mean_token_accuracy": 0.9882582664489746, "num_tokens": 74698895.0, "step": 26370 }, { "entropy": 0.07812208170071244, "epoch": 6.1480359016202355, "grad_norm": 2.015625, "learning_rate": 4.8846455746588295e-05, "loss": 0.0563, "mean_token_accuracy": 0.9852927267551422, "num_tokens": 74715568.0, "step": 26375 }, { "entropy": 0.07707515489310027, "epoch": 6.149201538640868, "grad_norm": 1.546875, "learning_rate": 4.884581520528939e-05, "loss": 0.0423, "mean_token_accuracy": 0.9858931481838227, "num_tokens": 74728990.0, "step": 26380 }, { "entropy": 0.07694179080426693, "epoch": 6.150367175661499, "grad_norm": 2.046875, "learning_rate": 4.884517449480732e-05, "loss": 0.0536, "mean_token_accuracy": 0.9824213206768035, "num_tokens": 74739815.0, "step": 26385 }, { "entropy": 0.06638536080718041, "epoch": 6.151532812682131, "grad_norm": 2.078125, "learning_rate": 4.8844533615151633e-05, "loss": 0.0364, "mean_token_accuracy": 0.9892715454101563, "num_tokens": 74759843.0, "step": 26390 }, { "entropy": 0.07396925017237663, "epoch": 6.152698449702762, "grad_norm": 0.33984375, "learning_rate": 4.8843892566331887e-05, "loss": 0.0348, "mean_token_accuracy": 0.9897041201591492, "num_tokens": 74782611.0, "step": 26395 }, { "entropy": 0.08042656015604735, "epoch": 6.153864086723394, "grad_norm": 0.58984375, "learning_rate": 4.884325134835764e-05, "loss": 0.0454, "mean_token_accuracy": 0.9905684471130372, "num_tokens": 74809583.0, "step": 26400 }, { "entropy": 0.07436943799257278, "epoch": 6.155029723744026, "grad_norm": 0.58984375, "learning_rate": 4.884260996123845e-05, "loss": 0.0239, "mean_token_accuracy": 0.9928574979305267, "num_tokens": 74829584.0, "step": 26405 }, { "entropy": 0.04903424307703972, "epoch": 6.156195360764658, "grad_norm": 0.53125, "learning_rate": 4.88419684049839e-05, "loss": 0.0154, "mean_token_accuracy": 0.9906674683094024, "num_tokens": 74866848.0, "step": 26410 }, { "entropy": 0.08828915823251009, "epoch": 6.15736099778529, "grad_norm": 1.21875, "learning_rate": 4.884132667960352e-05, "loss": 0.0524, "mean_token_accuracy": 0.9868596613407135, "num_tokens": 74878559.0, "step": 26415 }, { "entropy": 0.05355936009436846, "epoch": 6.158526634805922, "grad_norm": 2.203125, "learning_rate": 4.8840684785106915e-05, "loss": 0.0199, "mean_token_accuracy": 0.9939637005329132, "num_tokens": 74900968.0, "step": 26420 }, { "entropy": 0.0665479020215571, "epoch": 6.159692271826553, "grad_norm": 0.6328125, "learning_rate": 4.884004272150364e-05, "loss": 0.0273, "mean_token_accuracy": 0.992696750164032, "num_tokens": 74925194.0, "step": 26425 }, { "entropy": 0.09455383159220218, "epoch": 6.160857908847185, "grad_norm": 0.79296875, "learning_rate": 4.883940048880327e-05, "loss": 0.061, "mean_token_accuracy": 0.9805425465106964, "num_tokens": 74944109.0, "step": 26430 }, { "entropy": 0.10350356921553612, "epoch": 6.162023545867816, "grad_norm": 0.26171875, "learning_rate": 4.8838758087015385e-05, "loss": 0.0501, "mean_token_accuracy": 0.9844925940036774, "num_tokens": 74962982.0, "step": 26435 }, { "entropy": 0.07926721423864365, "epoch": 6.163189182888448, "grad_norm": 2.59375, "learning_rate": 4.8838115516149566e-05, "loss": 0.0503, "mean_token_accuracy": 0.9858163237571717, "num_tokens": 74972064.0, "step": 26440 }, { "entropy": 0.05242730937898159, "epoch": 6.1643548199090805, "grad_norm": 1.8984375, "learning_rate": 4.883747277621539e-05, "loss": 0.0224, "mean_token_accuracy": 0.9898411691188812, "num_tokens": 74998421.0, "step": 26445 }, { "entropy": 0.0582451980561018, "epoch": 6.165520456929712, "grad_norm": 1.4140625, "learning_rate": 4.883682986722243e-05, "loss": 0.0268, "mean_token_accuracy": 0.9894807755947113, "num_tokens": 75022066.0, "step": 26450 }, { "entropy": 0.048820036463439465, "epoch": 6.166686093950344, "grad_norm": 1.5703125, "learning_rate": 4.88361867891803e-05, "loss": 0.0201, "mean_token_accuracy": 0.9917604267597199, "num_tokens": 75046513.0, "step": 26455 }, { "entropy": 0.04594447817653417, "epoch": 6.167851730970976, "grad_norm": 1.03125, "learning_rate": 4.883554354209857e-05, "loss": 0.0138, "mean_token_accuracy": 0.9945909082889557, "num_tokens": 75074076.0, "step": 26460 }, { "entropy": 0.06487497426569462, "epoch": 6.169017367991607, "grad_norm": 0.9296875, "learning_rate": 4.883490012598683e-05, "loss": 0.0284, "mean_token_accuracy": 0.9908344566822052, "num_tokens": 75097190.0, "step": 26465 }, { "entropy": 0.10159607119858265, "epoch": 6.170183005012239, "grad_norm": 0.97265625, "learning_rate": 4.883425654085469e-05, "loss": 0.1187, "mean_token_accuracy": 0.9753309786319733, "num_tokens": 75117363.0, "step": 26470 }, { "entropy": 0.06636620126664639, "epoch": 6.171348642032871, "grad_norm": 0.8671875, "learning_rate": 4.883361278671173e-05, "loss": 0.0328, "mean_token_accuracy": 0.9904363811016083, "num_tokens": 75129335.0, "step": 26475 }, { "entropy": 0.04916867651045322, "epoch": 6.172514279053503, "grad_norm": 1.6875, "learning_rate": 4.883296886356756e-05, "loss": 0.0244, "mean_token_accuracy": 0.9905480206012726, "num_tokens": 75156142.0, "step": 26480 }, { "entropy": 0.06334216399118305, "epoch": 6.173679916074135, "grad_norm": 0.6796875, "learning_rate": 4.883232477143178e-05, "loss": 0.0171, "mean_token_accuracy": 0.9912271916866302, "num_tokens": 75181069.0, "step": 26485 }, { "entropy": 0.05516471900045872, "epoch": 6.174845553094766, "grad_norm": 2.140625, "learning_rate": 4.8831680510313994e-05, "loss": 0.0234, "mean_token_accuracy": 0.9919645011425018, "num_tokens": 75197946.0, "step": 26490 }, { "entropy": 0.054226364567875864, "epoch": 6.176011190115398, "grad_norm": 1.890625, "learning_rate": 4.8831036080223804e-05, "loss": 0.0315, "mean_token_accuracy": 0.991038691997528, "num_tokens": 75216585.0, "step": 26495 }, { "entropy": 0.09042618926614523, "epoch": 6.17717682713603, "grad_norm": 5.09375, "learning_rate": 4.883039148117082e-05, "loss": 0.0395, "mean_token_accuracy": 0.9868484675884247, "num_tokens": 75230630.0, "step": 26500 }, { "entropy": 0.06445145141333342, "epoch": 6.178342464156661, "grad_norm": 2.234375, "learning_rate": 4.882974671316466e-05, "loss": 0.0411, "mean_token_accuracy": 0.9895972013473511, "num_tokens": 75245396.0, "step": 26505 }, { "entropy": 0.06774566173553467, "epoch": 6.179508101177293, "grad_norm": 0.91015625, "learning_rate": 4.8829101776214934e-05, "loss": 0.0328, "mean_token_accuracy": 0.9879733324050903, "num_tokens": 75255982.0, "step": 26510 }, { "entropy": 0.05292671788483858, "epoch": 6.1806737381979255, "grad_norm": 1.109375, "learning_rate": 4.882845667033127e-05, "loss": 0.027, "mean_token_accuracy": 0.9933640778064727, "num_tokens": 75283127.0, "step": 26515 }, { "entropy": 0.0652140263468027, "epoch": 6.181839375218557, "grad_norm": 1.0625, "learning_rate": 4.882781139552327e-05, "loss": 0.0242, "mean_token_accuracy": 0.9877884387969971, "num_tokens": 75313041.0, "step": 26520 }, { "entropy": 0.07414305829443038, "epoch": 6.183005012239189, "grad_norm": 1.734375, "learning_rate": 4.8827165951800565e-05, "loss": 0.0511, "mean_token_accuracy": 0.9878878772258759, "num_tokens": 75330555.0, "step": 26525 }, { "entropy": 0.09437569361180068, "epoch": 6.18417064925982, "grad_norm": 1.4921875, "learning_rate": 4.8826520339172774e-05, "loss": 0.0328, "mean_token_accuracy": 0.98801589012146, "num_tokens": 75343418.0, "step": 26530 }, { "entropy": 0.07906931964680552, "epoch": 6.185336286280452, "grad_norm": 0.3359375, "learning_rate": 4.882587455764954e-05, "loss": 0.042, "mean_token_accuracy": 0.9886034429073334, "num_tokens": 75357739.0, "step": 26535 }, { "entropy": 0.09388845460489392, "epoch": 6.186501923301084, "grad_norm": 0.142578125, "learning_rate": 4.882522860724047e-05, "loss": 0.0644, "mean_token_accuracy": 0.9776318430900574, "num_tokens": 75408177.0, "step": 26540 }, { "entropy": 0.06323309913277626, "epoch": 6.1876675603217155, "grad_norm": 1.7265625, "learning_rate": 4.8824582487955214e-05, "loss": 0.0297, "mean_token_accuracy": 0.9861397266387939, "num_tokens": 75445410.0, "step": 26545 }, { "entropy": 0.06687562046572566, "epoch": 6.188833197342348, "grad_norm": 1.703125, "learning_rate": 4.88239361998034e-05, "loss": 0.0224, "mean_token_accuracy": 0.9914937198162079, "num_tokens": 75467771.0, "step": 26550 }, { "entropy": 0.05728769712150097, "epoch": 6.18999883436298, "grad_norm": 1.1015625, "learning_rate": 4.882328974279467e-05, "loss": 0.0159, "mean_token_accuracy": 0.9917247593402863, "num_tokens": 75491896.0, "step": 26555 }, { "entropy": 0.08520476371049882, "epoch": 6.191164471383611, "grad_norm": 0.546875, "learning_rate": 4.882264311693865e-05, "loss": 0.0318, "mean_token_accuracy": 0.9897422969341279, "num_tokens": 75509972.0, "step": 26560 }, { "entropy": 0.05651366077363491, "epoch": 6.192330108404243, "grad_norm": 0.2265625, "learning_rate": 4.882199632224499e-05, "loss": 0.0128, "mean_token_accuracy": 0.9885481774806977, "num_tokens": 75542748.0, "step": 26565 }, { "entropy": 0.07149199210107327, "epoch": 6.193495745424874, "grad_norm": 0.4375, "learning_rate": 4.882134935872334e-05, "loss": 0.027, "mean_token_accuracy": 0.9882903575897217, "num_tokens": 75563403.0, "step": 26570 }, { "entropy": 0.07008785083889961, "epoch": 6.194661382445506, "grad_norm": 2.96875, "learning_rate": 4.882070222638334e-05, "loss": 0.0349, "mean_token_accuracy": 0.9898769140243531, "num_tokens": 75578117.0, "step": 26575 }, { "entropy": 0.079210801795125, "epoch": 6.1958270194661385, "grad_norm": 2.484375, "learning_rate": 4.8820054925234645e-05, "loss": 0.0386, "mean_token_accuracy": 0.986040997505188, "num_tokens": 75597784.0, "step": 26580 }, { "entropy": 0.07456495780497789, "epoch": 6.19699265648677, "grad_norm": 1.828125, "learning_rate": 4.8819407455286905e-05, "loss": 0.026, "mean_token_accuracy": 0.9922024965286255, "num_tokens": 75613004.0, "step": 26585 }, { "entropy": 0.07746983375400304, "epoch": 6.198158293507402, "grad_norm": 2.40625, "learning_rate": 4.881875981654977e-05, "loss": 0.0449, "mean_token_accuracy": 0.9894679367542267, "num_tokens": 75627893.0, "step": 26590 }, { "entropy": 0.05641837287694216, "epoch": 6.199323930528034, "grad_norm": 1.9921875, "learning_rate": 4.881811200903291e-05, "loss": 0.0275, "mean_token_accuracy": 0.9910026013851165, "num_tokens": 75651925.0, "step": 26595 }, { "entropy": 0.07631746679544449, "epoch": 6.200489567548665, "grad_norm": 1.9921875, "learning_rate": 4.881746403274597e-05, "loss": 0.0479, "mean_token_accuracy": 0.990505713224411, "num_tokens": 75661578.0, "step": 26600 }, { "entropy": 0.09214288219809533, "epoch": 6.201655204569297, "grad_norm": 2.75, "learning_rate": 4.881681588769862e-05, "loss": 0.0464, "mean_token_accuracy": 0.9885360062122345, "num_tokens": 75671242.0, "step": 26605 }, { "entropy": 0.05341388094238937, "epoch": 6.202820841589929, "grad_norm": 0.365234375, "learning_rate": 4.8816167573900524e-05, "loss": 0.0176, "mean_token_accuracy": 0.9929543256759643, "num_tokens": 75697552.0, "step": 26610 }, { "entropy": 0.06103728674352169, "epoch": 6.2039864786105605, "grad_norm": 0.96875, "learning_rate": 4.881551909136135e-05, "loss": 0.0253, "mean_token_accuracy": 0.9898181974887847, "num_tokens": 75715801.0, "step": 26615 }, { "entropy": 0.054025298729538915, "epoch": 6.205152115631193, "grad_norm": 1.0703125, "learning_rate": 4.8814870440090764e-05, "loss": 0.0293, "mean_token_accuracy": 0.9889390408992768, "num_tokens": 75733829.0, "step": 26620 }, { "entropy": 0.06002594884485006, "epoch": 6.206317752651824, "grad_norm": 2.890625, "learning_rate": 4.881422162009844e-05, "loss": 0.0377, "mean_token_accuracy": 0.9891004621982574, "num_tokens": 75752108.0, "step": 26625 }, { "entropy": 0.07102447785437108, "epoch": 6.207483389672456, "grad_norm": 7.03125, "learning_rate": 4.881357263139406e-05, "loss": 0.0342, "mean_token_accuracy": 0.9882679045200348, "num_tokens": 75766733.0, "step": 26630 }, { "entropy": 0.06536264475435019, "epoch": 6.208649026693088, "grad_norm": 2.8125, "learning_rate": 4.8812923473987295e-05, "loss": 0.0337, "mean_token_accuracy": 0.9899225354194641, "num_tokens": 75781265.0, "step": 26635 }, { "entropy": 0.05781321842223406, "epoch": 6.209814663713719, "grad_norm": 0.490234375, "learning_rate": 4.881227414788782e-05, "loss": 0.0171, "mean_token_accuracy": 0.9932263731956482, "num_tokens": 75800349.0, "step": 26640 }, { "entropy": 0.06824675220996142, "epoch": 6.210980300734351, "grad_norm": 5.125, "learning_rate": 4.881162465310533e-05, "loss": 0.0343, "mean_token_accuracy": 0.9887260556221008, "num_tokens": 75818446.0, "step": 26645 }, { "entropy": 0.0608520383015275, "epoch": 6.2121459377549835, "grad_norm": 0.40234375, "learning_rate": 4.88109749896495e-05, "loss": 0.0272, "mean_token_accuracy": 0.9919718444347382, "num_tokens": 75839774.0, "step": 26650 }, { "entropy": 0.0646435147151351, "epoch": 6.213311574775615, "grad_norm": 0.26171875, "learning_rate": 4.881032515753002e-05, "loss": 0.016, "mean_token_accuracy": 0.991169399023056, "num_tokens": 75867534.0, "step": 26655 }, { "entropy": 0.07422850346192718, "epoch": 6.214477211796247, "grad_norm": 0.455078125, "learning_rate": 4.880967515675657e-05, "loss": 0.0444, "mean_token_accuracy": 0.9853957951068878, "num_tokens": 75883164.0, "step": 26660 }, { "entropy": 0.05428111115470528, "epoch": 6.215642848816878, "grad_norm": 1.578125, "learning_rate": 4.880902498733887e-05, "loss": 0.0224, "mean_token_accuracy": 0.9905717432498932, "num_tokens": 75909765.0, "step": 26665 }, { "entropy": 0.06294290795922279, "epoch": 6.21680848583751, "grad_norm": 3.203125, "learning_rate": 4.880837464928659e-05, "loss": 0.0475, "mean_token_accuracy": 0.9853145003318786, "num_tokens": 75920338.0, "step": 26670 }, { "entropy": 0.05609114458784461, "epoch": 6.217974122858142, "grad_norm": 0.322265625, "learning_rate": 4.880772414260944e-05, "loss": 0.0273, "mean_token_accuracy": 0.9908358454704285, "num_tokens": 75951772.0, "step": 26675 }, { "entropy": 0.06486607976257801, "epoch": 6.219139759878773, "grad_norm": 2.28125, "learning_rate": 4.88070734673171e-05, "loss": 0.0405, "mean_token_accuracy": 0.9895318448543549, "num_tokens": 75963086.0, "step": 26680 }, { "entropy": 0.06027032807469368, "epoch": 6.2203053968994055, "grad_norm": 0.498046875, "learning_rate": 4.88064226234193e-05, "loss": 0.0284, "mean_token_accuracy": 0.9899581730365753, "num_tokens": 75987378.0, "step": 26685 }, { "entropy": 0.0686921939253807, "epoch": 6.221471033920038, "grad_norm": 0.984375, "learning_rate": 4.880577161092573e-05, "loss": 0.0314, "mean_token_accuracy": 0.9877607941627502, "num_tokens": 76005481.0, "step": 26690 }, { "entropy": 0.07105220509693026, "epoch": 6.222636670940669, "grad_norm": 0.1826171875, "learning_rate": 4.88051204298461e-05, "loss": 0.0438, "mean_token_accuracy": 0.9898497760295868, "num_tokens": 76023271.0, "step": 26695 }, { "entropy": 0.07426328733563423, "epoch": 6.223802307961301, "grad_norm": 2.453125, "learning_rate": 4.8804469080190126e-05, "loss": 0.0321, "mean_token_accuracy": 0.9896513879299164, "num_tokens": 76038726.0, "step": 26700 }, { "entropy": 0.07415295001119375, "epoch": 6.224967944981932, "grad_norm": 1.84375, "learning_rate": 4.880381756196751e-05, "loss": 0.0229, "mean_token_accuracy": 0.9894894480705261, "num_tokens": 76069366.0, "step": 26705 }, { "entropy": 0.0542880711145699, "epoch": 6.226133582002564, "grad_norm": 1.2421875, "learning_rate": 4.8803165875187975e-05, "loss": 0.0165, "mean_token_accuracy": 0.9950318813323975, "num_tokens": 76104221.0, "step": 26710 }, { "entropy": 0.05485262274742127, "epoch": 6.227299219023196, "grad_norm": 0.859375, "learning_rate": 4.880251401986123e-05, "loss": 0.0197, "mean_token_accuracy": 0.9925736427307129, "num_tokens": 76136927.0, "step": 26715 }, { "entropy": 0.0561104491353035, "epoch": 6.228464856043828, "grad_norm": 0.404296875, "learning_rate": 4.8801861995997004e-05, "loss": 0.0225, "mean_token_accuracy": 0.9895463585853577, "num_tokens": 76172523.0, "step": 26720 }, { "entropy": 0.05799694359302521, "epoch": 6.22963049306446, "grad_norm": 0.296875, "learning_rate": 4.880120980360502e-05, "loss": 0.0219, "mean_token_accuracy": 0.9899403989315033, "num_tokens": 76202120.0, "step": 26725 }, { "entropy": 0.054118511453270915, "epoch": 6.230796130085092, "grad_norm": 0.97265625, "learning_rate": 4.880055744269499e-05, "loss": 0.0231, "mean_token_accuracy": 0.9917508125305176, "num_tokens": 76232161.0, "step": 26730 }, { "entropy": 0.07646337822079659, "epoch": 6.231961767105723, "grad_norm": 2.109375, "learning_rate": 4.879990491327667e-05, "loss": 0.0555, "mean_token_accuracy": 0.9848457276821136, "num_tokens": 76241597.0, "step": 26735 }, { "entropy": 0.06108809132128954, "epoch": 6.233127404126355, "grad_norm": 3.875, "learning_rate": 4.879925221535976e-05, "loss": 0.0234, "mean_token_accuracy": 0.9888475894927978, "num_tokens": 76263968.0, "step": 26740 }, { "entropy": 0.04935785736888647, "epoch": 6.234293041146987, "grad_norm": 0.28125, "learning_rate": 4.8798599348954e-05, "loss": 0.029, "mean_token_accuracy": 0.9913921117782593, "num_tokens": 76293126.0, "step": 26745 }, { "entropy": 0.07111759670078754, "epoch": 6.2354586781676185, "grad_norm": 0.68359375, "learning_rate": 4.879794631406914e-05, "loss": 0.0454, "mean_token_accuracy": 0.9879423141479492, "num_tokens": 76314977.0, "step": 26750 }, { "entropy": 0.0636790843680501, "epoch": 6.2366243151882506, "grad_norm": 0.63671875, "learning_rate": 4.8797293110714906e-05, "loss": 0.0249, "mean_token_accuracy": 0.9907831728458405, "num_tokens": 76329840.0, "step": 26755 }, { "entropy": 0.05912710763514042, "epoch": 6.237789952208882, "grad_norm": 0.318359375, "learning_rate": 4.8796639738901026e-05, "loss": 0.0203, "mean_token_accuracy": 0.9898356914520263, "num_tokens": 76354041.0, "step": 26760 }, { "entropy": 0.07571594156324864, "epoch": 6.238955589229514, "grad_norm": 0.35546875, "learning_rate": 4.879598619863727e-05, "loss": 0.0382, "mean_token_accuracy": 0.9869071960449218, "num_tokens": 76372519.0, "step": 26765 }, { "entropy": 0.06836293041706085, "epoch": 6.240121226250146, "grad_norm": 1.0078125, "learning_rate": 4.879533248993337e-05, "loss": 0.0386, "mean_token_accuracy": 0.9874460101127625, "num_tokens": 76386239.0, "step": 26770 }, { "entropy": 0.08493883013725281, "epoch": 6.241286863270777, "grad_norm": 1.9453125, "learning_rate": 4.8794678612799066e-05, "loss": 0.0388, "mean_token_accuracy": 0.9879676699638367, "num_tokens": 76401255.0, "step": 26775 }, { "entropy": 0.07453780174255371, "epoch": 6.242452500291409, "grad_norm": 1.65625, "learning_rate": 4.879402456724412e-05, "loss": 0.0333, "mean_token_accuracy": 0.9884099781513214, "num_tokens": 76420058.0, "step": 26780 }, { "entropy": 0.04887163182720542, "epoch": 6.243618137312041, "grad_norm": 0.103515625, "learning_rate": 4.8793370353278276e-05, "loss": 0.0144, "mean_token_accuracy": 0.9924459099769593, "num_tokens": 76449079.0, "step": 26785 }, { "entropy": 0.058128141332417727, "epoch": 6.244783774332673, "grad_norm": 1.828125, "learning_rate": 4.8792715970911305e-05, "loss": 0.0267, "mean_token_accuracy": 0.9893514454364777, "num_tokens": 76476260.0, "step": 26790 }, { "entropy": 0.07593741156160831, "epoch": 6.245949411353305, "grad_norm": 0.87890625, "learning_rate": 4.879206142015294e-05, "loss": 0.0404, "mean_token_accuracy": 0.9897358655929566, "num_tokens": 76502102.0, "step": 26795 }, { "entropy": 0.09316719993948937, "epoch": 6.247115048373936, "grad_norm": 2.5, "learning_rate": 4.879140670101296e-05, "loss": 0.0473, "mean_token_accuracy": 0.9866836547851563, "num_tokens": 76517793.0, "step": 26800 }, { "entropy": 0.07820145031437278, "epoch": 6.248280685394568, "grad_norm": 3.65625, "learning_rate": 4.879075181350113e-05, "loss": 0.0515, "mean_token_accuracy": 0.9878412783145905, "num_tokens": 76545744.0, "step": 26805 }, { "entropy": 0.06826126556843519, "epoch": 6.2494463224152, "grad_norm": 1.2109375, "learning_rate": 4.8790096757627205e-05, "loss": 0.041, "mean_token_accuracy": 0.9902231276035309, "num_tokens": 76562994.0, "step": 26810 }, { "entropy": 0.05862971879541874, "epoch": 6.250611959435831, "grad_norm": 0.197265625, "learning_rate": 4.878944153340095e-05, "loss": 0.0188, "mean_token_accuracy": 0.9923254668712616, "num_tokens": 76592668.0, "step": 26815 }, { "entropy": 0.041546163335442546, "epoch": 6.2517775964564635, "grad_norm": 0.81640625, "learning_rate": 4.878878614083214e-05, "loss": 0.0154, "mean_token_accuracy": 0.9943924307823181, "num_tokens": 76618811.0, "step": 26820 }, { "entropy": 0.0818638127297163, "epoch": 6.252943233477096, "grad_norm": 0.6953125, "learning_rate": 4.878813057993056e-05, "loss": 0.0442, "mean_token_accuracy": 0.9860123097896576, "num_tokens": 76630583.0, "step": 26825 }, { "entropy": 0.07344717662781478, "epoch": 6.254108870497727, "grad_norm": 0.2060546875, "learning_rate": 4.878747485070597e-05, "loss": 0.0177, "mean_token_accuracy": 0.9916049182415009, "num_tokens": 76654703.0, "step": 26830 }, { "entropy": 0.06005438230931759, "epoch": 6.255274507518359, "grad_norm": 3.59375, "learning_rate": 4.878681895316815e-05, "loss": 0.0361, "mean_token_accuracy": 0.9874625086784363, "num_tokens": 76667234.0, "step": 26835 }, { "entropy": 0.05835657585412264, "epoch": 6.25644014453899, "grad_norm": 0.875, "learning_rate": 4.878616288732688e-05, "loss": 0.0314, "mean_token_accuracy": 0.9923226952552795, "num_tokens": 76680110.0, "step": 26840 }, { "entropy": 0.061682638619095086, "epoch": 6.257605781559622, "grad_norm": 1.890625, "learning_rate": 4.8785506653191956e-05, "loss": 0.024, "mean_token_accuracy": 0.9914807081222534, "num_tokens": 76701644.0, "step": 26845 }, { "entropy": 0.052669542096555236, "epoch": 6.258771418580254, "grad_norm": 0.7265625, "learning_rate": 4.878485025077315e-05, "loss": 0.0218, "mean_token_accuracy": 0.9928339958190918, "num_tokens": 76722251.0, "step": 26850 }, { "entropy": 0.05744164055213332, "epoch": 6.2599370556008855, "grad_norm": 1.0703125, "learning_rate": 4.8784193680080247e-05, "loss": 0.0268, "mean_token_accuracy": 0.9903415560722351, "num_tokens": 76741829.0, "step": 26855 }, { "entropy": 0.056614281982183455, "epoch": 6.261102692621518, "grad_norm": 1.265625, "learning_rate": 4.878353694112305e-05, "loss": 0.016, "mean_token_accuracy": 0.9917404353618622, "num_tokens": 76766115.0, "step": 26860 }, { "entropy": 0.060635652393102646, "epoch": 6.26226832964215, "grad_norm": 1.8203125, "learning_rate": 4.8782880033911346e-05, "loss": 0.0314, "mean_token_accuracy": 0.9926228582859039, "num_tokens": 76777892.0, "step": 26865 }, { "entropy": 0.08467749282717704, "epoch": 6.263433966662781, "grad_norm": 1.34375, "learning_rate": 4.878222295845493e-05, "loss": 0.053, "mean_token_accuracy": 0.984963458776474, "num_tokens": 76786498.0, "step": 26870 }, { "entropy": 0.07869560457766056, "epoch": 6.264599603683413, "grad_norm": 2.0625, "learning_rate": 4.8781565714763594e-05, "loss": 0.0531, "mean_token_accuracy": 0.9836140871047974, "num_tokens": 76806702.0, "step": 26875 }, { "entropy": 0.08117300998419523, "epoch": 6.265765240704045, "grad_norm": 3.984375, "learning_rate": 4.878090830284715e-05, "loss": 0.0588, "mean_token_accuracy": 0.9826340734958648, "num_tokens": 76818610.0, "step": 26880 }, { "entropy": 0.05971610611304641, "epoch": 6.266930877724676, "grad_norm": 0.66796875, "learning_rate": 4.878025072271539e-05, "loss": 0.0255, "mean_token_accuracy": 0.991646808385849, "num_tokens": 76845946.0, "step": 26885 }, { "entropy": 0.06977970357984305, "epoch": 6.2680965147453085, "grad_norm": 0.359375, "learning_rate": 4.877959297437814e-05, "loss": 0.0348, "mean_token_accuracy": 0.9882163524627685, "num_tokens": 76864978.0, "step": 26890 }, { "entropy": 0.07290565092116594, "epoch": 6.26926215176594, "grad_norm": 2.09375, "learning_rate": 4.877893505784518e-05, "loss": 0.0369, "mean_token_accuracy": 0.989104425907135, "num_tokens": 76892345.0, "step": 26895 }, { "entropy": 0.06487658582627773, "epoch": 6.270427788786572, "grad_norm": 2.375, "learning_rate": 4.877827697312634e-05, "loss": 0.0324, "mean_token_accuracy": 0.9904053211212158, "num_tokens": 76910852.0, "step": 26900 }, { "entropy": 0.06390552939847112, "epoch": 6.271593425807204, "grad_norm": 2.84375, "learning_rate": 4.877761872023142e-05, "loss": 0.0255, "mean_token_accuracy": 0.9912206172943115, "num_tokens": 76941311.0, "step": 26905 }, { "entropy": 0.096971770003438, "epoch": 6.272759062827835, "grad_norm": 5.3125, "learning_rate": 4.877696029917025e-05, "loss": 0.1081, "mean_token_accuracy": 0.9758206665515899, "num_tokens": 76960875.0, "step": 26910 }, { "entropy": 0.08263939693570137, "epoch": 6.273924699848467, "grad_norm": 3.78125, "learning_rate": 4.877630170995264e-05, "loss": 0.0536, "mean_token_accuracy": 0.9840788066387176, "num_tokens": 76978127.0, "step": 26915 }, { "entropy": 0.05669274376705289, "epoch": 6.275090336869099, "grad_norm": 0.8046875, "learning_rate": 4.877564295258841e-05, "loss": 0.0296, "mean_token_accuracy": 0.9915313005447388, "num_tokens": 77000427.0, "step": 26920 }, { "entropy": 0.08088835161179304, "epoch": 6.2762559738897306, "grad_norm": 2.859375, "learning_rate": 4.877498402708738e-05, "loss": 0.0379, "mean_token_accuracy": 0.9871138870716095, "num_tokens": 77021781.0, "step": 26925 }, { "entropy": 0.06899372283369302, "epoch": 6.277421610910363, "grad_norm": 0.44921875, "learning_rate": 4.877432493345938e-05, "loss": 0.0245, "mean_token_accuracy": 0.9892875671386718, "num_tokens": 77043096.0, "step": 26930 }, { "entropy": 0.07236437350511551, "epoch": 6.278587247930994, "grad_norm": 2.140625, "learning_rate": 4.877366567171424e-05, "loss": 0.0261, "mean_token_accuracy": 0.9930740773677826, "num_tokens": 77057439.0, "step": 26935 }, { "entropy": 0.06578505616635084, "epoch": 6.279752884951626, "grad_norm": 0.56640625, "learning_rate": 4.8773006241861786e-05, "loss": 0.0416, "mean_token_accuracy": 0.9892131865024567, "num_tokens": 77073294.0, "step": 26940 }, { "entropy": 0.0702615974470973, "epoch": 6.280918521972258, "grad_norm": 4.5625, "learning_rate": 4.877234664391185e-05, "loss": 0.05, "mean_token_accuracy": 0.9869410037994385, "num_tokens": 77087501.0, "step": 26945 }, { "entropy": 0.045784792955964805, "epoch": 6.282084158992889, "grad_norm": 0.91796875, "learning_rate": 4.877168687787428e-05, "loss": 0.0191, "mean_token_accuracy": 0.9897647440433502, "num_tokens": 77120136.0, "step": 26950 }, { "entropy": 0.06734316907823086, "epoch": 6.283249796013521, "grad_norm": 3.921875, "learning_rate": 4.87710269437589e-05, "loss": 0.0418, "mean_token_accuracy": 0.9890086054801941, "num_tokens": 77131782.0, "step": 26955 }, { "entropy": 0.1180669752880931, "epoch": 6.2844154330341535, "grad_norm": 0.34765625, "learning_rate": 4.877036684157556e-05, "loss": 0.1248, "mean_token_accuracy": 0.9731480836868286, "num_tokens": 77158773.0, "step": 26960 }, { "entropy": 0.07690473701804876, "epoch": 6.285581070054785, "grad_norm": 0.271484375, "learning_rate": 4.8769706571334095e-05, "loss": 0.0209, "mean_token_accuracy": 0.9888293027877808, "num_tokens": 77183757.0, "step": 26965 }, { "entropy": 0.0930105771869421, "epoch": 6.286746707075417, "grad_norm": 2.203125, "learning_rate": 4.876904613304435e-05, "loss": 0.051, "mean_token_accuracy": 0.9848386824131012, "num_tokens": 77193681.0, "step": 26970 }, { "entropy": 0.060835771076381204, "epoch": 6.287912344096048, "grad_norm": 3.15625, "learning_rate": 4.876838552671619e-05, "loss": 0.0264, "mean_token_accuracy": 0.9885562717914581, "num_tokens": 77219243.0, "step": 26975 }, { "entropy": 0.06320591568946839, "epoch": 6.28907798111668, "grad_norm": 1.1171875, "learning_rate": 4.876772475235945e-05, "loss": 0.0349, "mean_token_accuracy": 0.9873491525650024, "num_tokens": 77235210.0, "step": 26980 }, { "entropy": 0.06562241949141026, "epoch": 6.290243618137312, "grad_norm": 3.671875, "learning_rate": 4.876706380998398e-05, "loss": 0.0388, "mean_token_accuracy": 0.9866467297077179, "num_tokens": 77249292.0, "step": 26985 }, { "entropy": 0.07758224606513978, "epoch": 6.2914092551579435, "grad_norm": 2.390625, "learning_rate": 4.8766402699599646e-05, "loss": 0.0579, "mean_token_accuracy": 0.981865918636322, "num_tokens": 77260195.0, "step": 26990 }, { "entropy": 0.06526398342102765, "epoch": 6.292574892178576, "grad_norm": 0.4140625, "learning_rate": 4.8765741421216297e-05, "loss": 0.0236, "mean_token_accuracy": 0.9926089167594909, "num_tokens": 77278321.0, "step": 26995 }, { "entropy": 0.047286936268210414, "epoch": 6.293740529199208, "grad_norm": 1.6953125, "learning_rate": 4.87650799748438e-05, "loss": 0.0166, "mean_token_accuracy": 0.9924996316432952, "num_tokens": 77310297.0, "step": 27000 }, { "entropy": 0.08713566735386849, "epoch": 6.294906166219839, "grad_norm": 1.3046875, "learning_rate": 4.876441836049202e-05, "loss": 0.0707, "mean_token_accuracy": 0.9823097467422486, "num_tokens": 77319165.0, "step": 27005 }, { "entropy": 0.0667908507399261, "epoch": 6.296071803240471, "grad_norm": 0.84375, "learning_rate": 4.8763756578170814e-05, "loss": 0.0329, "mean_token_accuracy": 0.9904216408729554, "num_tokens": 77335995.0, "step": 27010 }, { "entropy": 0.07371202344074845, "epoch": 6.297237440261103, "grad_norm": 2.484375, "learning_rate": 4.8763094627890065e-05, "loss": 0.0305, "mean_token_accuracy": 0.9872647106647492, "num_tokens": 77361297.0, "step": 27015 }, { "entropy": 0.07954277824610471, "epoch": 6.298403077281734, "grad_norm": 0.5078125, "learning_rate": 4.876243250965963e-05, "loss": 0.043, "mean_token_accuracy": 0.9880691766738892, "num_tokens": 77376597.0, "step": 27020 }, { "entropy": 0.07487954199314117, "epoch": 6.299568714302366, "grad_norm": 2.90625, "learning_rate": 4.876177022348939e-05, "loss": 0.0461, "mean_token_accuracy": 0.9878607869148255, "num_tokens": 77409350.0, "step": 27025 }, { "entropy": 0.08372844085097313, "epoch": 6.300734351322998, "grad_norm": 2.03125, "learning_rate": 4.8761107769389214e-05, "loss": 0.04, "mean_token_accuracy": 0.988189697265625, "num_tokens": 77427888.0, "step": 27030 }, { "entropy": 0.07974276877939701, "epoch": 6.30189998834363, "grad_norm": 0.9609375, "learning_rate": 4.8760445147368986e-05, "loss": 0.038, "mean_token_accuracy": 0.991102111339569, "num_tokens": 77439750.0, "step": 27035 }, { "entropy": 0.06711346944794058, "epoch": 6.303065625364262, "grad_norm": 0.150390625, "learning_rate": 4.875978235743858e-05, "loss": 0.0259, "mean_token_accuracy": 0.9904236793518066, "num_tokens": 77465080.0, "step": 27040 }, { "entropy": 0.0663998268544674, "epoch": 6.304231262384893, "grad_norm": 1.765625, "learning_rate": 4.875911939960788e-05, "loss": 0.033, "mean_token_accuracy": 0.9886034607887269, "num_tokens": 77477493.0, "step": 27045 }, { "entropy": 0.06868320833891631, "epoch": 6.305396899405525, "grad_norm": 0.158203125, "learning_rate": 4.875845627388678e-05, "loss": 0.0151, "mean_token_accuracy": 0.991125100851059, "num_tokens": 77518272.0, "step": 27050 }, { "entropy": 0.05745923724025488, "epoch": 6.306562536426157, "grad_norm": 0.57421875, "learning_rate": 4.875779298028517e-05, "loss": 0.0158, "mean_token_accuracy": 0.9914752006530761, "num_tokens": 77551818.0, "step": 27055 }, { "entropy": 0.06080328058451414, "epoch": 6.3077281734467885, "grad_norm": 1.046875, "learning_rate": 4.875712951881292e-05, "loss": 0.0336, "mean_token_accuracy": 0.9897435069084167, "num_tokens": 77570567.0, "step": 27060 }, { "entropy": 0.0706357978284359, "epoch": 6.308893810467421, "grad_norm": 0.234375, "learning_rate": 4.875646588947995e-05, "loss": 0.0514, "mean_token_accuracy": 0.9898265242576599, "num_tokens": 77592122.0, "step": 27065 }, { "entropy": 0.05524565796367824, "epoch": 6.310059447488052, "grad_norm": 0.2490234375, "learning_rate": 4.875580209229613e-05, "loss": 0.0172, "mean_token_accuracy": 0.9909223794937134, "num_tokens": 77626893.0, "step": 27070 }, { "entropy": 0.1092427797615528, "epoch": 6.311225084508684, "grad_norm": 0.68359375, "learning_rate": 4.8755138127271383e-05, "loss": 0.0909, "mean_token_accuracy": 0.9715740263462067, "num_tokens": 77657199.0, "step": 27075 }, { "entropy": 0.07431203294545412, "epoch": 6.312390721529316, "grad_norm": 2.890625, "learning_rate": 4.87544739944156e-05, "loss": 0.0384, "mean_token_accuracy": 0.9846733629703521, "num_tokens": 77678532.0, "step": 27080 }, { "entropy": 0.06422578003257513, "epoch": 6.313556358549947, "grad_norm": 1.046875, "learning_rate": 4.875380969373867e-05, "loss": 0.0313, "mean_token_accuracy": 0.9897050440311432, "num_tokens": 77695040.0, "step": 27085 }, { "entropy": 0.05756510868668556, "epoch": 6.314721995570579, "grad_norm": 0.859375, "learning_rate": 4.875314522525052e-05, "loss": 0.0217, "mean_token_accuracy": 0.9928114414215088, "num_tokens": 77716217.0, "step": 27090 }, { "entropy": 0.06490825079381465, "epoch": 6.315887632591211, "grad_norm": 0.447265625, "learning_rate": 4.875248058896104e-05, "loss": 0.0166, "mean_token_accuracy": 0.9889268100261688, "num_tokens": 77744133.0, "step": 27095 }, { "entropy": 0.06178513718768954, "epoch": 6.317053269611843, "grad_norm": 0.263671875, "learning_rate": 4.8751815784880154e-05, "loss": 0.0288, "mean_token_accuracy": 0.9889122724533081, "num_tokens": 77772314.0, "step": 27100 }, { "entropy": 0.05994966216385365, "epoch": 6.318218906632475, "grad_norm": 1.484375, "learning_rate": 4.8751150813017776e-05, "loss": 0.0423, "mean_token_accuracy": 0.989544004201889, "num_tokens": 77784114.0, "step": 27105 }, { "entropy": 0.0725404830649495, "epoch": 6.319384543653106, "grad_norm": 0.3359375, "learning_rate": 4.875048567338381e-05, "loss": 0.0426, "mean_token_accuracy": 0.9853807687759399, "num_tokens": 77805973.0, "step": 27110 }, { "entropy": 0.07376164561137558, "epoch": 6.320550180673738, "grad_norm": 3.109375, "learning_rate": 4.8749820365988185e-05, "loss": 0.0517, "mean_token_accuracy": 0.9803310930728912, "num_tokens": 77829286.0, "step": 27115 }, { "entropy": 0.08759814314544201, "epoch": 6.32171581769437, "grad_norm": 0.79296875, "learning_rate": 4.874915489084081e-05, "loss": 0.0421, "mean_token_accuracy": 0.9889974951744079, "num_tokens": 77840547.0, "step": 27120 }, { "entropy": 0.08322993703186513, "epoch": 6.322881454715001, "grad_norm": 1.578125, "learning_rate": 4.874848924795163e-05, "loss": 0.0499, "mean_token_accuracy": 0.9860299587249756, "num_tokens": 77848476.0, "step": 27125 }, { "entropy": 0.06156183313578367, "epoch": 6.3240470917356335, "grad_norm": 0.361328125, "learning_rate": 4.874782343733055e-05, "loss": 0.021, "mean_token_accuracy": 0.992520546913147, "num_tokens": 77874137.0, "step": 27130 }, { "entropy": 0.04788096770644188, "epoch": 6.325212728756266, "grad_norm": 0.62890625, "learning_rate": 4.8747157458987505e-05, "loss": 0.025, "mean_token_accuracy": 0.9898177683353424, "num_tokens": 77917307.0, "step": 27135 }, { "entropy": 0.0891963217407465, "epoch": 6.326378365776897, "grad_norm": 1.4296875, "learning_rate": 4.874649131293242e-05, "loss": 0.0495, "mean_token_accuracy": 0.9910438537597657, "num_tokens": 77937080.0, "step": 27140 }, { "entropy": 0.07902921987697482, "epoch": 6.327544002797529, "grad_norm": 1.875, "learning_rate": 4.874582499917524e-05, "loss": 0.0296, "mean_token_accuracy": 0.99190593957901, "num_tokens": 77955404.0, "step": 27145 }, { "entropy": 0.06934650149196386, "epoch": 6.328709639818161, "grad_norm": 4.53125, "learning_rate": 4.874515851772589e-05, "loss": 0.0415, "mean_token_accuracy": 0.9878355026245117, "num_tokens": 77977887.0, "step": 27150 }, { "entropy": 0.042633866891264915, "epoch": 6.329875276838792, "grad_norm": 1.3046875, "learning_rate": 4.8744491868594316e-05, "loss": 0.0197, "mean_token_accuracy": 0.995466285943985, "num_tokens": 78004193.0, "step": 27155 }, { "entropy": 0.0678960201330483, "epoch": 6.331040913859424, "grad_norm": 0.5, "learning_rate": 4.8743825051790455e-05, "loss": 0.0328, "mean_token_accuracy": 0.9873064517974853, "num_tokens": 78021840.0, "step": 27160 }, { "entropy": 0.07100022733211517, "epoch": 6.332206550880056, "grad_norm": 0.3046875, "learning_rate": 4.874315806732425e-05, "loss": 0.0395, "mean_token_accuracy": 0.9875366270542145, "num_tokens": 78046750.0, "step": 27165 }, { "entropy": 0.07311735656112432, "epoch": 6.333372187900688, "grad_norm": 1.5703125, "learning_rate": 4.874249091520565e-05, "loss": 0.0356, "mean_token_accuracy": 0.9898627936840058, "num_tokens": 78062381.0, "step": 27170 }, { "entropy": 0.056971798092126845, "epoch": 6.33453782492132, "grad_norm": 0.357421875, "learning_rate": 4.87418235954446e-05, "loss": 0.0097, "mean_token_accuracy": 0.9902470290660859, "num_tokens": 78105924.0, "step": 27175 }, { "entropy": 0.06141122579574585, "epoch": 6.335703461941951, "grad_norm": 0.478515625, "learning_rate": 4.874115610805105e-05, "loss": 0.0343, "mean_token_accuracy": 0.9900152325630188, "num_tokens": 78128152.0, "step": 27180 }, { "entropy": 0.07991018062457442, "epoch": 6.336869098962583, "grad_norm": 0.265625, "learning_rate": 4.8740488453034954e-05, "loss": 0.0248, "mean_token_accuracy": 0.9912156224250793, "num_tokens": 78146706.0, "step": 27185 }, { "entropy": 0.07130510099232197, "epoch": 6.338034735983215, "grad_norm": 3.75, "learning_rate": 4.8739820630406275e-05, "loss": 0.0474, "mean_token_accuracy": 0.9853857576847076, "num_tokens": 78161648.0, "step": 27190 }, { "entropy": 0.04623548369854689, "epoch": 6.339200373003846, "grad_norm": 0.1650390625, "learning_rate": 4.8739152640174956e-05, "loss": 0.0167, "mean_token_accuracy": 0.9954453229904174, "num_tokens": 78200518.0, "step": 27195 }, { "entropy": 0.06505171973258257, "epoch": 6.3403660100244785, "grad_norm": 0.96875, "learning_rate": 4.873848448235097e-05, "loss": 0.0324, "mean_token_accuracy": 0.989247715473175, "num_tokens": 78220373.0, "step": 27200 }, { "entropy": 0.08946949765086173, "epoch": 6.34153164704511, "grad_norm": 3.375, "learning_rate": 4.873781615694428e-05, "loss": 0.0588, "mean_token_accuracy": 0.985131961107254, "num_tokens": 78228640.0, "step": 27205 }, { "entropy": 0.05788529254496098, "epoch": 6.342697284065742, "grad_norm": 0.85546875, "learning_rate": 4.873714766396484e-05, "loss": 0.0231, "mean_token_accuracy": 0.9910314798355102, "num_tokens": 78246948.0, "step": 27210 }, { "entropy": 0.05744266202673316, "epoch": 6.343862921086374, "grad_norm": 1.078125, "learning_rate": 4.8736479003422636e-05, "loss": 0.0316, "mean_token_accuracy": 0.9888568997383118, "num_tokens": 78265111.0, "step": 27215 }, { "entropy": 0.055955256521701816, "epoch": 6.345028558107005, "grad_norm": 2.359375, "learning_rate": 4.873581017532762e-05, "loss": 0.0275, "mean_token_accuracy": 0.9928835809230805, "num_tokens": 78288714.0, "step": 27220 }, { "entropy": 0.08183978088200092, "epoch": 6.346194195127637, "grad_norm": 0.404296875, "learning_rate": 4.8735141179689785e-05, "loss": 0.0569, "mean_token_accuracy": 0.9858071386814118, "num_tokens": 78320371.0, "step": 27225 }, { "entropy": 0.055092441756278275, "epoch": 6.347359832148269, "grad_norm": 0.55078125, "learning_rate": 4.8734472016519097e-05, "loss": 0.0185, "mean_token_accuracy": 0.9905835092067719, "num_tokens": 78343924.0, "step": 27230 }, { "entropy": 0.0675237711519003, "epoch": 6.348525469168901, "grad_norm": 0.59375, "learning_rate": 4.8733802685825525e-05, "loss": 0.0197, "mean_token_accuracy": 0.993051677942276, "num_tokens": 78367520.0, "step": 27235 }, { "entropy": 0.06913798209279776, "epoch": 6.349691106189533, "grad_norm": 0.8984375, "learning_rate": 4.8733133187619065e-05, "loss": 0.0627, "mean_token_accuracy": 0.988431054353714, "num_tokens": 78382339.0, "step": 27240 }, { "entropy": 0.08050851384177804, "epoch": 6.350856743210164, "grad_norm": 1.75, "learning_rate": 4.873246352190969e-05, "loss": 0.0468, "mean_token_accuracy": 0.9877587676048278, "num_tokens": 78400191.0, "step": 27245 }, { "entropy": 0.12997290436178446, "epoch": 6.352022380230796, "grad_norm": 2.921875, "learning_rate": 4.8731793688707386e-05, "loss": 0.1291, "mean_token_accuracy": 0.9744395613670349, "num_tokens": 78423703.0, "step": 27250 }, { "entropy": 0.05893737701699138, "epoch": 6.353188017251428, "grad_norm": 1.9453125, "learning_rate": 4.873112368802215e-05, "loss": 0.0417, "mean_token_accuracy": 0.9888142168521881, "num_tokens": 78438932.0, "step": 27255 }, { "entropy": 0.08109636697918177, "epoch": 6.354353654272059, "grad_norm": 0.5234375, "learning_rate": 4.873045351986396e-05, "loss": 0.0488, "mean_token_accuracy": 0.9866352796554565, "num_tokens": 78450813.0, "step": 27260 }, { "entropy": 0.09451097249984741, "epoch": 6.355519291292691, "grad_norm": 1.0546875, "learning_rate": 4.872978318424283e-05, "loss": 0.0295, "mean_token_accuracy": 0.9913021385669708, "num_tokens": 78465366.0, "step": 27265 }, { "entropy": 0.05986616881564259, "epoch": 6.3566849283133235, "grad_norm": 0.1806640625, "learning_rate": 4.872911268116873e-05, "loss": 0.0381, "mean_token_accuracy": 0.9888629734516143, "num_tokens": 78491298.0, "step": 27270 }, { "entropy": 0.06749337911605835, "epoch": 6.357850565333955, "grad_norm": 1.3359375, "learning_rate": 4.872844201065168e-05, "loss": 0.0364, "mean_token_accuracy": 0.9906283020973206, "num_tokens": 78502915.0, "step": 27275 }, { "entropy": 0.08259217841550708, "epoch": 6.359016202354587, "grad_norm": 2.21875, "learning_rate": 4.872777117270166e-05, "loss": 0.0456, "mean_token_accuracy": 0.984370744228363, "num_tokens": 78519778.0, "step": 27280 }, { "entropy": 0.07694516181945801, "epoch": 6.360181839375219, "grad_norm": 2.375, "learning_rate": 4.8727100167328685e-05, "loss": 0.0425, "mean_token_accuracy": 0.9869538724422455, "num_tokens": 78528363.0, "step": 27285 }, { "entropy": 0.0720290282741189, "epoch": 6.36134747639585, "grad_norm": 1.3203125, "learning_rate": 4.872642899454277e-05, "loss": 0.035, "mean_token_accuracy": 0.9898767948150635, "num_tokens": 78541818.0, "step": 27290 }, { "entropy": 0.07878648396581411, "epoch": 6.362513113416482, "grad_norm": 0.38671875, "learning_rate": 4.872575765435391e-05, "loss": 0.0379, "mean_token_accuracy": 0.9881724655628205, "num_tokens": 78555497.0, "step": 27295 }, { "entropy": 0.0814511626958847, "epoch": 6.3636787504371135, "grad_norm": 0.62109375, "learning_rate": 4.8725086146772115e-05, "loss": 0.0375, "mean_token_accuracy": 0.9854908883571625, "num_tokens": 78567574.0, "step": 27300 }, { "entropy": 0.08116347342729568, "epoch": 6.364844387457746, "grad_norm": 0.8515625, "learning_rate": 4.87244144718074e-05, "loss": 0.0599, "mean_token_accuracy": 0.9835920453071594, "num_tokens": 78577065.0, "step": 27305 }, { "entropy": 0.08684050869196654, "epoch": 6.366010024478378, "grad_norm": 0.6640625, "learning_rate": 4.8723742629469794e-05, "loss": 0.0309, "mean_token_accuracy": 0.9864567041397094, "num_tokens": 78596233.0, "step": 27310 }, { "entropy": 0.07369791343808174, "epoch": 6.367175661499009, "grad_norm": 1.765625, "learning_rate": 4.87230706197693e-05, "loss": 0.0591, "mean_token_accuracy": 0.986175411939621, "num_tokens": 78607212.0, "step": 27315 }, { "entropy": 0.06452398095279932, "epoch": 6.368341298519641, "grad_norm": 3.203125, "learning_rate": 4.872239844271593e-05, "loss": 0.032, "mean_token_accuracy": 0.988612276315689, "num_tokens": 78631724.0, "step": 27320 }, { "entropy": 0.05261941840872168, "epoch": 6.369506935540273, "grad_norm": 0.75390625, "learning_rate": 4.872172609831973e-05, "loss": 0.0289, "mean_token_accuracy": 0.9906743228435516, "num_tokens": 78660307.0, "step": 27325 }, { "entropy": 0.07566324677318334, "epoch": 6.370672572560904, "grad_norm": 0.921875, "learning_rate": 4.8721053586590714e-05, "loss": 0.0241, "mean_token_accuracy": 0.9894809603691102, "num_tokens": 78681145.0, "step": 27330 }, { "entropy": 0.06389159299433231, "epoch": 6.3718382095815365, "grad_norm": 1.09375, "learning_rate": 4.872038090753892e-05, "loss": 0.0185, "mean_token_accuracy": 0.9917346715927124, "num_tokens": 78703037.0, "step": 27335 }, { "entropy": 0.07629008330404759, "epoch": 6.373003846602168, "grad_norm": 0.953125, "learning_rate": 4.8719708061174355e-05, "loss": 0.0275, "mean_token_accuracy": 0.9885548293590546, "num_tokens": 78715880.0, "step": 27340 }, { "entropy": 0.08970846701413393, "epoch": 6.3741694836228, "grad_norm": 3.90625, "learning_rate": 4.871903504750708e-05, "loss": 0.052, "mean_token_accuracy": 0.9867014706134796, "num_tokens": 78730263.0, "step": 27345 }, { "entropy": 0.07629141733050346, "epoch": 6.375335120643432, "grad_norm": 1.6640625, "learning_rate": 4.8718361866547113e-05, "loss": 0.0521, "mean_token_accuracy": 0.9869326591491699, "num_tokens": 78743333.0, "step": 27350 }, { "entropy": 0.07764709144830703, "epoch": 6.376500757664063, "grad_norm": 2.671875, "learning_rate": 4.871768851830449e-05, "loss": 0.0502, "mean_token_accuracy": 0.9871854364871979, "num_tokens": 78754458.0, "step": 27355 }, { "entropy": 0.08753283023834228, "epoch": 6.377666394684695, "grad_norm": 0.8125, "learning_rate": 4.871701500278927e-05, "loss": 0.056, "mean_token_accuracy": 0.9862245202064515, "num_tokens": 78763480.0, "step": 27360 }, { "entropy": 0.062451109476387504, "epoch": 6.378832031705327, "grad_norm": 0.58203125, "learning_rate": 4.8716341320011485e-05, "loss": 0.0352, "mean_token_accuracy": 0.9882869064807892, "num_tokens": 78776564.0, "step": 27365 }, { "entropy": 0.07153270859271288, "epoch": 6.3799976687259585, "grad_norm": 2.0, "learning_rate": 4.871566746998117e-05, "loss": 0.0467, "mean_token_accuracy": 0.987527585029602, "num_tokens": 78790359.0, "step": 27370 }, { "entropy": 0.06521230619400739, "epoch": 6.381163305746591, "grad_norm": 1.5078125, "learning_rate": 4.871499345270839e-05, "loss": 0.0223, "mean_token_accuracy": 0.9910327970981598, "num_tokens": 78815759.0, "step": 27375 }, { "entropy": 0.0744168077595532, "epoch": 6.382328942767222, "grad_norm": 0.92578125, "learning_rate": 4.871431926820319e-05, "loss": 0.0368, "mean_token_accuracy": 0.9912038207054138, "num_tokens": 78833711.0, "step": 27380 }, { "entropy": 0.05522217508405447, "epoch": 6.383494579787854, "grad_norm": 1.4296875, "learning_rate": 4.871364491647562e-05, "loss": 0.0297, "mean_token_accuracy": 0.9883031487464905, "num_tokens": 78858418.0, "step": 27385 }, { "entropy": 0.08326612692326307, "epoch": 6.384660216808486, "grad_norm": 0.33203125, "learning_rate": 4.871297039753575e-05, "loss": 0.0475, "mean_token_accuracy": 0.9871909439563751, "num_tokens": 78875926.0, "step": 27390 }, { "entropy": 0.06716305706650019, "epoch": 6.385825853829117, "grad_norm": 2.078125, "learning_rate": 4.871229571139361e-05, "loss": 0.0264, "mean_token_accuracy": 0.9889111697673798, "num_tokens": 78889549.0, "step": 27395 }, { "entropy": 0.06953895352780819, "epoch": 6.386991490849749, "grad_norm": 4.78125, "learning_rate": 4.871162085805928e-05, "loss": 0.0367, "mean_token_accuracy": 0.9867134690284729, "num_tokens": 78932439.0, "step": 27400 }, { "entropy": 0.05694300038740039, "epoch": 6.3881571278703815, "grad_norm": 2.0, "learning_rate": 4.871094583754283e-05, "loss": 0.021, "mean_token_accuracy": 0.9930303812026977, "num_tokens": 78948125.0, "step": 27405 }, { "entropy": 0.050066758040338756, "epoch": 6.389322764891013, "grad_norm": 0.2060546875, "learning_rate": 4.871027064985431e-05, "loss": 0.0175, "mean_token_accuracy": 0.9904357731342316, "num_tokens": 78982256.0, "step": 27410 }, { "entropy": 0.06765003893524409, "epoch": 6.390488401911645, "grad_norm": 1.828125, "learning_rate": 4.8709595295003786e-05, "loss": 0.0428, "mean_token_accuracy": 0.9883161067962647, "num_tokens": 79007275.0, "step": 27415 }, { "entropy": 0.05757339298725128, "epoch": 6.391654038932277, "grad_norm": 0.416015625, "learning_rate": 4.8708919773001335e-05, "loss": 0.0299, "mean_token_accuracy": 0.9906012296676636, "num_tokens": 79025587.0, "step": 27420 }, { "entropy": 0.04727498330175876, "epoch": 6.392819675952908, "grad_norm": 1.625, "learning_rate": 4.8708244083857044e-05, "loss": 0.0229, "mean_token_accuracy": 0.993421071767807, "num_tokens": 79050082.0, "step": 27425 }, { "entropy": 0.06903645731508731, "epoch": 6.39398531297354, "grad_norm": 0.44140625, "learning_rate": 4.870756822758097e-05, "loss": 0.0483, "mean_token_accuracy": 0.9889946520328522, "num_tokens": 79065697.0, "step": 27430 }, { "entropy": 0.09842782281339169, "epoch": 6.395150949994171, "grad_norm": 2.5, "learning_rate": 4.870689220418319e-05, "loss": 0.0994, "mean_token_accuracy": 0.9789209306240082, "num_tokens": 79084356.0, "step": 27435 }, { "entropy": 0.05659789480268955, "epoch": 6.3963165870148035, "grad_norm": 0.34765625, "learning_rate": 4.87062160136738e-05, "loss": 0.0254, "mean_token_accuracy": 0.9913104176521301, "num_tokens": 79106442.0, "step": 27440 }, { "entropy": 0.05858164490200579, "epoch": 6.397482224035436, "grad_norm": 0.8359375, "learning_rate": 4.8705539656062874e-05, "loss": 0.0177, "mean_token_accuracy": 0.9934798002243042, "num_tokens": 79136838.0, "step": 27445 }, { "entropy": 0.04104721006006003, "epoch": 6.398647861056067, "grad_norm": 5.03125, "learning_rate": 4.870486313136049e-05, "loss": 0.0285, "mean_token_accuracy": 0.9911053597927093, "num_tokens": 79159904.0, "step": 27450 }, { "entropy": 0.05417780401185155, "epoch": 6.399813498076699, "grad_norm": 3.140625, "learning_rate": 4.870418643957675e-05, "loss": 0.0331, "mean_token_accuracy": 0.9912892341613769, "num_tokens": 79182211.0, "step": 27455 }, { "entropy": 0.07004196234047413, "epoch": 6.400979135097331, "grad_norm": 0.369140625, "learning_rate": 4.870350958072173e-05, "loss": 0.0425, "mean_token_accuracy": 0.9870667815208435, "num_tokens": 79200837.0, "step": 27460 }, { "entropy": 0.07679205816239118, "epoch": 6.402144772117962, "grad_norm": 1.65625, "learning_rate": 4.870283255480554e-05, "loss": 0.048, "mean_token_accuracy": 0.9877889931201935, "num_tokens": 79225260.0, "step": 27465 }, { "entropy": 0.06059764139354229, "epoch": 6.403310409138594, "grad_norm": 2.546875, "learning_rate": 4.8702155361838265e-05, "loss": 0.0436, "mean_token_accuracy": 0.9852454006671906, "num_tokens": 79236989.0, "step": 27470 }, { "entropy": 0.07255728393793107, "epoch": 6.404476046159226, "grad_norm": 1.234375, "learning_rate": 4.8701478001830006e-05, "loss": 0.0516, "mean_token_accuracy": 0.9845960021018982, "num_tokens": 79249351.0, "step": 27475 }, { "entropy": 0.09389685951173306, "epoch": 6.405641683179858, "grad_norm": 1.7578125, "learning_rate": 4.870080047479086e-05, "loss": 0.036, "mean_token_accuracy": 0.99000563621521, "num_tokens": 79267778.0, "step": 27480 }, { "entropy": 0.062156752310693265, "epoch": 6.40680732020049, "grad_norm": 3.875, "learning_rate": 4.870012278073093e-05, "loss": 0.0437, "mean_token_accuracy": 0.9860499918460846, "num_tokens": 79284570.0, "step": 27485 }, { "entropy": 0.06316896006464959, "epoch": 6.407972957221121, "grad_norm": 0.51171875, "learning_rate": 4.869944491966033e-05, "loss": 0.0252, "mean_token_accuracy": 0.9924411833286285, "num_tokens": 79301528.0, "step": 27490 }, { "entropy": 0.06141756055876613, "epoch": 6.409138594241753, "grad_norm": 0.2275390625, "learning_rate": 4.8698766891589146e-05, "loss": 0.034, "mean_token_accuracy": 0.987896740436554, "num_tokens": 79334446.0, "step": 27495 }, { "entropy": 0.0701604936271906, "epoch": 6.410304231262385, "grad_norm": 1.5234375, "learning_rate": 4.869808869652752e-05, "loss": 0.0435, "mean_token_accuracy": 0.9874770641326904, "num_tokens": 79345554.0, "step": 27500 }, { "entropy": 0.050602398626506326, "epoch": 6.4114698682830165, "grad_norm": 0.6640625, "learning_rate": 4.869741033448554e-05, "loss": 0.0382, "mean_token_accuracy": 0.9874916076660156, "num_tokens": 79368799.0, "step": 27505 }, { "entropy": 0.06348597463220358, "epoch": 6.412635505303649, "grad_norm": 2.703125, "learning_rate": 4.869673180547333e-05, "loss": 0.0301, "mean_token_accuracy": 0.9886243999004364, "num_tokens": 79393263.0, "step": 27510 }, { "entropy": 0.05499905683100224, "epoch": 6.41380114232428, "grad_norm": 0.8359375, "learning_rate": 4.869605310950102e-05, "loss": 0.0266, "mean_token_accuracy": 0.9914214432239532, "num_tokens": 79414468.0, "step": 27515 }, { "entropy": 0.07658125725574791, "epoch": 6.414966779344912, "grad_norm": 1.8203125, "learning_rate": 4.86953742465787e-05, "loss": 0.0358, "mean_token_accuracy": 0.989252644777298, "num_tokens": 79432609.0, "step": 27520 }, { "entropy": 0.0815320173278451, "epoch": 6.416132416365544, "grad_norm": 3.28125, "learning_rate": 4.869469521671652e-05, "loss": 0.0501, "mean_token_accuracy": 0.9842431008815765, "num_tokens": 79443455.0, "step": 27525 }, { "entropy": 0.07601859644055367, "epoch": 6.417298053386175, "grad_norm": 2.59375, "learning_rate": 4.8694016019924596e-05, "loss": 0.0503, "mean_token_accuracy": 0.9878701090812683, "num_tokens": 79456083.0, "step": 27530 }, { "entropy": 0.06871461626142264, "epoch": 6.418463690406807, "grad_norm": 0.2255859375, "learning_rate": 4.869333665621306e-05, "loss": 0.0361, "mean_token_accuracy": 0.984860771894455, "num_tokens": 79477148.0, "step": 27535 }, { "entropy": 0.06119240690022707, "epoch": 6.419629327427439, "grad_norm": 1.3828125, "learning_rate": 4.869265712559203e-05, "loss": 0.0278, "mean_token_accuracy": 0.9898676156997681, "num_tokens": 79508657.0, "step": 27540 }, { "entropy": 0.08302809465676546, "epoch": 6.420794964448071, "grad_norm": 1.3359375, "learning_rate": 4.869197742807166e-05, "loss": 0.0471, "mean_token_accuracy": 0.9878687500953675, "num_tokens": 79519230.0, "step": 27545 }, { "entropy": 0.08217885680496692, "epoch": 6.421960601468703, "grad_norm": 1.078125, "learning_rate": 4.8691297563662064e-05, "loss": 0.0377, "mean_token_accuracy": 0.9881730794906616, "num_tokens": 79529426.0, "step": 27550 }, { "entropy": 0.07849093452095986, "epoch": 6.423126238489335, "grad_norm": 3.140625, "learning_rate": 4.86906175323734e-05, "loss": 0.0419, "mean_token_accuracy": 0.9865849614143372, "num_tokens": 79539544.0, "step": 27555 }, { "entropy": 0.06886114776134492, "epoch": 6.424291875509966, "grad_norm": 2.078125, "learning_rate": 4.868993733421578e-05, "loss": 0.0396, "mean_token_accuracy": 0.9901333630084992, "num_tokens": 79552090.0, "step": 27560 }, { "entropy": 0.06919397301971912, "epoch": 6.425457512530598, "grad_norm": 0.90625, "learning_rate": 4.868925696919937e-05, "loss": 0.0192, "mean_token_accuracy": 0.9921321332454681, "num_tokens": 79570648.0, "step": 27565 }, { "entropy": 0.057403205148875716, "epoch": 6.426623149551229, "grad_norm": 1.359375, "learning_rate": 4.868857643733431e-05, "loss": 0.0176, "mean_token_accuracy": 0.9934298872947693, "num_tokens": 79589140.0, "step": 27570 }, { "entropy": 0.0688036672770977, "epoch": 6.4277887865718615, "grad_norm": 0.384765625, "learning_rate": 4.868789573863075e-05, "loss": 0.0297, "mean_token_accuracy": 0.9875211656093598, "num_tokens": 79616950.0, "step": 27575 }, { "entropy": 0.07488919515162706, "epoch": 6.428954423592494, "grad_norm": 2.28125, "learning_rate": 4.8687214873098836e-05, "loss": 0.0421, "mean_token_accuracy": 0.9881720423698426, "num_tokens": 79633601.0, "step": 27580 }, { "entropy": 0.05924738338217139, "epoch": 6.430120060613125, "grad_norm": 1.078125, "learning_rate": 4.8686533840748714e-05, "loss": 0.0269, "mean_token_accuracy": 0.988178825378418, "num_tokens": 79659234.0, "step": 27585 }, { "entropy": 0.07275372557342052, "epoch": 6.431285697633757, "grad_norm": 1.5546875, "learning_rate": 4.868585264159056e-05, "loss": 0.0413, "mean_token_accuracy": 0.9883818924427032, "num_tokens": 79669822.0, "step": 27590 }, { "entropy": 0.08007224500179291, "epoch": 6.432451334654389, "grad_norm": 2.6875, "learning_rate": 4.868517127563451e-05, "loss": 0.0564, "mean_token_accuracy": 0.985856169462204, "num_tokens": 79681441.0, "step": 27595 }, { "entropy": 0.09746268084272743, "epoch": 6.43361697167502, "grad_norm": 0.50390625, "learning_rate": 4.8684489742890735e-05, "loss": 0.1101, "mean_token_accuracy": 0.9700672090053558, "num_tokens": 79707024.0, "step": 27600 }, { "entropy": 0.08292770921252668, "epoch": 6.434782608695652, "grad_norm": 0.7578125, "learning_rate": 4.868380804336939e-05, "loss": 0.044, "mean_token_accuracy": 0.9842049777507782, "num_tokens": 79735662.0, "step": 27605 }, { "entropy": 0.06323742168024182, "epoch": 6.4359482457162835, "grad_norm": 1.625, "learning_rate": 4.868312617708065e-05, "loss": 0.036, "mean_token_accuracy": 0.9877136886119843, "num_tokens": 79753008.0, "step": 27610 }, { "entropy": 0.054892979096621275, "epoch": 6.437113882736916, "grad_norm": 0.578125, "learning_rate": 4.868244414403468e-05, "loss": 0.0304, "mean_token_accuracy": 0.9911454975605011, "num_tokens": 79774283.0, "step": 27615 }, { "entropy": 0.060715962387621406, "epoch": 6.438279519757548, "grad_norm": 0.7578125, "learning_rate": 4.868176194424165e-05, "loss": 0.022, "mean_token_accuracy": 0.9909097194671631, "num_tokens": 79802163.0, "step": 27620 }, { "entropy": 0.07650736030191183, "epoch": 6.439445156778179, "grad_norm": 2.203125, "learning_rate": 4.8681079577711733e-05, "loss": 0.0462, "mean_token_accuracy": 0.9869279861450195, "num_tokens": 79817232.0, "step": 27625 }, { "entropy": 0.05185747491195798, "epoch": 6.440610793798811, "grad_norm": 0.546875, "learning_rate": 4.8680397044455095e-05, "loss": 0.017, "mean_token_accuracy": 0.9912841856479645, "num_tokens": 79850502.0, "step": 27630 }, { "entropy": 0.07319100480526686, "epoch": 6.441776430819443, "grad_norm": 2.921875, "learning_rate": 4.867971434448192e-05, "loss": 0.0441, "mean_token_accuracy": 0.9871128141880036, "num_tokens": 79865854.0, "step": 27635 }, { "entropy": 0.059058552328497174, "epoch": 6.442942067840074, "grad_norm": 0.328125, "learning_rate": 4.867903147780239e-05, "loss": 0.0147, "mean_token_accuracy": 0.991374397277832, "num_tokens": 79893417.0, "step": 27640 }, { "entropy": 0.06549884639680385, "epoch": 6.4441077048607065, "grad_norm": 1.296875, "learning_rate": 4.867834844442669e-05, "loss": 0.0292, "mean_token_accuracy": 0.9903046309947967, "num_tokens": 79910987.0, "step": 27645 }, { "entropy": 0.061291656084358694, "epoch": 6.445273341881338, "grad_norm": 2.75, "learning_rate": 4.8677665244365e-05, "loss": 0.025, "mean_token_accuracy": 0.9875168144702912, "num_tokens": 79938899.0, "step": 27650 }, { "entropy": 0.1144998598843813, "epoch": 6.44643897890197, "grad_norm": 0.271484375, "learning_rate": 4.8676981877627516e-05, "loss": 0.0399, "mean_token_accuracy": 0.9874226093292237, "num_tokens": 79964068.0, "step": 27655 }, { "entropy": 0.06761856637895107, "epoch": 6.447604615922602, "grad_norm": 0.78125, "learning_rate": 4.867629834422441e-05, "loss": 0.0201, "mean_token_accuracy": 0.9924596011638641, "num_tokens": 79990446.0, "step": 27660 }, { "entropy": 0.07056233957409859, "epoch": 6.448770252943233, "grad_norm": 0.55078125, "learning_rate": 4.867561464416589e-05, "loss": 0.0274, "mean_token_accuracy": 0.9904316186904907, "num_tokens": 80013369.0, "step": 27665 }, { "entropy": 0.05911567583680153, "epoch": 6.449935889963865, "grad_norm": 0.5, "learning_rate": 4.8674930777462146e-05, "loss": 0.0177, "mean_token_accuracy": 0.9893089294433594, "num_tokens": 80043038.0, "step": 27670 }, { "entropy": 0.061127489618957045, "epoch": 6.451101526984497, "grad_norm": 0.439453125, "learning_rate": 4.867424674412337e-05, "loss": 0.0178, "mean_token_accuracy": 0.9948661983013153, "num_tokens": 80074697.0, "step": 27675 }, { "entropy": 0.08972432166337967, "epoch": 6.452267164005129, "grad_norm": 4.40625, "learning_rate": 4.867356254415979e-05, "loss": 0.059, "mean_token_accuracy": 0.9820292592048645, "num_tokens": 80091290.0, "step": 27680 }, { "entropy": 0.056455508526414636, "epoch": 6.453432801025761, "grad_norm": 1.4921875, "learning_rate": 4.867287817758157e-05, "loss": 0.0406, "mean_token_accuracy": 0.9857260882854462, "num_tokens": 80120900.0, "step": 27685 }, { "entropy": 0.16505700461566447, "epoch": 6.454598438046393, "grad_norm": 1.078125, "learning_rate": 4.867219364439893e-05, "loss": 0.2416, "mean_token_accuracy": 0.9665460169315339, "num_tokens": 80142769.0, "step": 27690 }, { "entropy": 0.05633875224739313, "epoch": 6.455764075067024, "grad_norm": 0.5, "learning_rate": 4.867150894462208e-05, "loss": 0.0103, "mean_token_accuracy": 0.9942323207855225, "num_tokens": 80169351.0, "step": 27695 }, { "entropy": 0.06843089256435633, "epoch": 6.456929712087656, "grad_norm": 0.8359375, "learning_rate": 4.867082407826123e-05, "loss": 0.045, "mean_token_accuracy": 0.9859276533126831, "num_tokens": 80181397.0, "step": 27700 }, { "entropy": 0.07748326919972896, "epoch": 6.458095349108287, "grad_norm": 1.4140625, "learning_rate": 4.867013904532659e-05, "loss": 0.0426, "mean_token_accuracy": 0.98861523270607, "num_tokens": 80191864.0, "step": 27705 }, { "entropy": 0.07460166187956929, "epoch": 6.459260986128919, "grad_norm": 1.4921875, "learning_rate": 4.8669453845828375e-05, "loss": 0.0461, "mean_token_accuracy": 0.9854443371295929, "num_tokens": 80209907.0, "step": 27710 }, { "entropy": 0.06753936400637031, "epoch": 6.4604266231495515, "grad_norm": 2.359375, "learning_rate": 4.8668768479776805e-05, "loss": 0.0257, "mean_token_accuracy": 0.988853371143341, "num_tokens": 80239473.0, "step": 27715 }, { "entropy": 0.08805846348404885, "epoch": 6.461592260170183, "grad_norm": 0.6015625, "learning_rate": 4.8668082947182104e-05, "loss": 0.0344, "mean_token_accuracy": 0.9911517918109893, "num_tokens": 80251800.0, "step": 27720 }, { "entropy": 0.05560419810935855, "epoch": 6.462757897190815, "grad_norm": 2.078125, "learning_rate": 4.866739724805448e-05, "loss": 0.0204, "mean_token_accuracy": 0.9919722378253937, "num_tokens": 80281701.0, "step": 27725 }, { "entropy": 0.04412915213033557, "epoch": 6.463923534211447, "grad_norm": 0.328125, "learning_rate": 4.8666711382404174e-05, "loss": 0.0236, "mean_token_accuracy": 0.9937281429767608, "num_tokens": 80316543.0, "step": 27730 }, { "entropy": 0.06135749835520983, "epoch": 6.465089171232078, "grad_norm": 0.86328125, "learning_rate": 4.8666025350241394e-05, "loss": 0.0355, "mean_token_accuracy": 0.9881977498531341, "num_tokens": 80330948.0, "step": 27735 }, { "entropy": 0.060386568494141105, "epoch": 6.46625480825271, "grad_norm": 0.9921875, "learning_rate": 4.866533915157639e-05, "loss": 0.019, "mean_token_accuracy": 0.9925783932209015, "num_tokens": 80356120.0, "step": 27740 }, { "entropy": 0.08376160068437458, "epoch": 6.4674204452733415, "grad_norm": 2.140625, "learning_rate": 4.866465278641938e-05, "loss": 0.0304, "mean_token_accuracy": 0.9902998149394989, "num_tokens": 80370511.0, "step": 27745 }, { "entropy": 0.07058606464415788, "epoch": 6.468586082293974, "grad_norm": 3.671875, "learning_rate": 4.866396625478061e-05, "loss": 0.0397, "mean_token_accuracy": 0.9877101600170135, "num_tokens": 80391356.0, "step": 27750 }, { "entropy": 0.09883666504174471, "epoch": 6.469751719314606, "grad_norm": 0.59375, "learning_rate": 4.866327955667032e-05, "loss": 0.0804, "mean_token_accuracy": 0.9778140246868133, "num_tokens": 80414428.0, "step": 27755 }, { "entropy": 0.06909555848687887, "epoch": 6.470917356335237, "grad_norm": 1.2421875, "learning_rate": 4.866259269209873e-05, "loss": 0.0526, "mean_token_accuracy": 0.9838371276855469, "num_tokens": 80432420.0, "step": 27760 }, { "entropy": 0.07560903541743755, "epoch": 6.472082993355869, "grad_norm": 2.6875, "learning_rate": 4.866190566107609e-05, "loss": 0.0531, "mean_token_accuracy": 0.9859624147415161, "num_tokens": 80441410.0, "step": 27765 }, { "entropy": 0.06336052715778351, "epoch": 6.473248630376501, "grad_norm": 0.71484375, "learning_rate": 4.866121846361266e-05, "loss": 0.0276, "mean_token_accuracy": 0.9916476845741272, "num_tokens": 80455283.0, "step": 27770 }, { "entropy": 0.05840042941272259, "epoch": 6.474414267397132, "grad_norm": 0.5, "learning_rate": 4.8660531099718666e-05, "loss": 0.024, "mean_token_accuracy": 0.9917756974697113, "num_tokens": 80476389.0, "step": 27775 }, { "entropy": 0.061544339545071124, "epoch": 6.475579904417764, "grad_norm": 0.55078125, "learning_rate": 4.865984356940437e-05, "loss": 0.0275, "mean_token_accuracy": 0.9913029193878173, "num_tokens": 80494281.0, "step": 27780 }, { "entropy": 0.07006441093981267, "epoch": 6.476745541438396, "grad_norm": 1.2734375, "learning_rate": 4.865915587268002e-05, "loss": 0.0356, "mean_token_accuracy": 0.988695627450943, "num_tokens": 80516753.0, "step": 27785 }, { "entropy": 0.06810265779495239, "epoch": 6.477911178459028, "grad_norm": 2.515625, "learning_rate": 4.865846800955587e-05, "loss": 0.0402, "mean_token_accuracy": 0.9881290256977081, "num_tokens": 80528537.0, "step": 27790 }, { "entropy": 0.06130484715104103, "epoch": 6.47907681547966, "grad_norm": 1.3984375, "learning_rate": 4.865777998004218e-05, "loss": 0.0268, "mean_token_accuracy": 0.9925431370735168, "num_tokens": 80554545.0, "step": 27795 }, { "entropy": 0.08006125790998339, "epoch": 6.480242452500291, "grad_norm": 0.232421875, "learning_rate": 4.865709178414921e-05, "loss": 0.0241, "mean_token_accuracy": 0.9917748987674713, "num_tokens": 80572200.0, "step": 27800 }, { "entropy": 0.09746038131415843, "epoch": 6.481408089520923, "grad_norm": 1.1484375, "learning_rate": 4.865640342188722e-05, "loss": 0.0326, "mean_token_accuracy": 0.9908531725406646, "num_tokens": 80602063.0, "step": 27805 }, { "entropy": 0.08190167564898729, "epoch": 6.482573726541555, "grad_norm": 2.765625, "learning_rate": 4.865571489326647e-05, "loss": 0.0393, "mean_token_accuracy": 0.9881881058216095, "num_tokens": 80619369.0, "step": 27810 }, { "entropy": 0.06538397213444114, "epoch": 6.4837393635621865, "grad_norm": 2.46875, "learning_rate": 4.8655026198297235e-05, "loss": 0.0428, "mean_token_accuracy": 0.9888033509254456, "num_tokens": 80641276.0, "step": 27815 }, { "entropy": 0.06671316362917423, "epoch": 6.484905000582819, "grad_norm": 1.6875, "learning_rate": 4.865433733698979e-05, "loss": 0.0388, "mean_token_accuracy": 0.9907189249992371, "num_tokens": 80654599.0, "step": 27820 }, { "entropy": 0.08046202724799514, "epoch": 6.486070637603451, "grad_norm": 1.5234375, "learning_rate": 4.8653648309354385e-05, "loss": 0.0443, "mean_token_accuracy": 0.9860710978507996, "num_tokens": 80675635.0, "step": 27825 }, { "entropy": 0.06237729825079441, "epoch": 6.487236274624082, "grad_norm": 1.703125, "learning_rate": 4.865295911540131e-05, "loss": 0.0445, "mean_token_accuracy": 0.9872939884662628, "num_tokens": 80690700.0, "step": 27830 }, { "entropy": 0.07674776520580054, "epoch": 6.488401911644714, "grad_norm": 4.28125, "learning_rate": 4.865226975514085e-05, "loss": 0.0397, "mean_token_accuracy": 0.9856312572956085, "num_tokens": 80714560.0, "step": 27835 }, { "entropy": 0.11107652802020311, "epoch": 6.489567548665345, "grad_norm": 0.298828125, "learning_rate": 4.8651580228583264e-05, "loss": 0.0416, "mean_token_accuracy": 0.987369966506958, "num_tokens": 80744196.0, "step": 27840 }, { "entropy": 0.07360722348093987, "epoch": 6.490733185685977, "grad_norm": 0.7734375, "learning_rate": 4.8650890535738844e-05, "loss": 0.0301, "mean_token_accuracy": 0.9905798494815826, "num_tokens": 80763300.0, "step": 27845 }, { "entropy": 0.0837981840595603, "epoch": 6.4918988227066095, "grad_norm": 1.4765625, "learning_rate": 4.865020067661788e-05, "loss": 0.043, "mean_token_accuracy": 0.9866504311561585, "num_tokens": 80777853.0, "step": 27850 }, { "entropy": 0.07153965644538403, "epoch": 6.493064459727241, "grad_norm": 3.3125, "learning_rate": 4.864951065123065e-05, "loss": 0.0294, "mean_token_accuracy": 0.9883228242397308, "num_tokens": 80796982.0, "step": 27855 }, { "entropy": 0.07343727601692081, "epoch": 6.494230096747873, "grad_norm": 1.4296875, "learning_rate": 4.864882045958745e-05, "loss": 0.0518, "mean_token_accuracy": 0.9857357382774353, "num_tokens": 80831583.0, "step": 27860 }, { "entropy": 0.06243647811934352, "epoch": 6.495395733768505, "grad_norm": 0.59765625, "learning_rate": 4.8648130101698565e-05, "loss": 0.03, "mean_token_accuracy": 0.9885487198829651, "num_tokens": 80852559.0, "step": 27865 }, { "entropy": 0.06593287736177444, "epoch": 6.496561370789136, "grad_norm": 3.25, "learning_rate": 4.864743957757429e-05, "loss": 0.0309, "mean_token_accuracy": 0.9913531363010406, "num_tokens": 80869445.0, "step": 27870 }, { "entropy": 0.06547895520925522, "epoch": 6.497727007809768, "grad_norm": 0.69921875, "learning_rate": 4.8646748887224926e-05, "loss": 0.0183, "mean_token_accuracy": 0.9927921533584595, "num_tokens": 80894845.0, "step": 27875 }, { "entropy": 0.05576323997229338, "epoch": 6.498892644830399, "grad_norm": 0.50390625, "learning_rate": 4.864605803066077e-05, "loss": 0.0451, "mean_token_accuracy": 0.9896509110927582, "num_tokens": 80930602.0, "step": 27880 }, { "entropy": 0.05927497670054436, "epoch": 6.5000582818510315, "grad_norm": 4.28125, "learning_rate": 4.864536700789212e-05, "loss": 0.0416, "mean_token_accuracy": 0.9875927090644836, "num_tokens": 80951872.0, "step": 27885 }, { "entropy": 0.057063156738877296, "epoch": 6.501223918871664, "grad_norm": 0.216796875, "learning_rate": 4.864467581892929e-05, "loss": 0.0295, "mean_token_accuracy": 0.9899273931980133, "num_tokens": 80977099.0, "step": 27890 }, { "entropy": 0.08189502377063036, "epoch": 6.502389555892295, "grad_norm": 1.1484375, "learning_rate": 4.8643984463782584e-05, "loss": 0.0484, "mean_token_accuracy": 0.9839639067649841, "num_tokens": 80989763.0, "step": 27895 }, { "entropy": 0.08151528304442764, "epoch": 6.503555192912927, "grad_norm": 2.015625, "learning_rate": 4.86432929424623e-05, "loss": 0.044, "mean_token_accuracy": 0.9860894799232482, "num_tokens": 81006752.0, "step": 27900 }, { "entropy": 0.05777513347566128, "epoch": 6.504720829933559, "grad_norm": 1.3984375, "learning_rate": 4.864260125497877e-05, "loss": 0.025, "mean_token_accuracy": 0.9919376611709595, "num_tokens": 81029463.0, "step": 27905 }, { "entropy": 0.052465210948139426, "epoch": 6.50588646695419, "grad_norm": 0.640625, "learning_rate": 4.864190940134228e-05, "loss": 0.02, "mean_token_accuracy": 0.9926451206207275, "num_tokens": 81056765.0, "step": 27910 }, { "entropy": 0.053254136629402635, "epoch": 6.507052103974822, "grad_norm": 0.2490234375, "learning_rate": 4.864121738156317e-05, "loss": 0.023, "mean_token_accuracy": 0.9922501981258393, "num_tokens": 81082674.0, "step": 27915 }, { "entropy": 0.059919987060129645, "epoch": 6.508217740995454, "grad_norm": 0.2490234375, "learning_rate": 4.8640525195651754e-05, "loss": 0.0229, "mean_token_accuracy": 0.988700020313263, "num_tokens": 81117981.0, "step": 27920 }, { "entropy": 0.06886968342587352, "epoch": 6.509383378016086, "grad_norm": 1.546875, "learning_rate": 4.863983284361835e-05, "loss": 0.0388, "mean_token_accuracy": 0.9891575574874878, "num_tokens": 81142768.0, "step": 27925 }, { "entropy": 0.06728349346667528, "epoch": 6.510549015036718, "grad_norm": 0.474609375, "learning_rate": 4.863914032547328e-05, "loss": 0.0352, "mean_token_accuracy": 0.9899146974086761, "num_tokens": 81164421.0, "step": 27930 }, { "entropy": 0.07344648716971278, "epoch": 6.511714652057349, "grad_norm": 0.365234375, "learning_rate": 4.863844764122687e-05, "loss": 0.0304, "mean_token_accuracy": 0.9872526228427887, "num_tokens": 81193947.0, "step": 27935 }, { "entropy": 0.052133775874972345, "epoch": 6.512880289077981, "grad_norm": 0.224609375, "learning_rate": 4.863775479088946e-05, "loss": 0.0254, "mean_token_accuracy": 0.9909688651561737, "num_tokens": 81221469.0, "step": 27940 }, { "entropy": 0.06797988787293434, "epoch": 6.514045926098613, "grad_norm": 1.84375, "learning_rate": 4.863706177447138e-05, "loss": 0.0468, "mean_token_accuracy": 0.9872564733028412, "num_tokens": 81232706.0, "step": 27945 }, { "entropy": 0.07692260686308146, "epoch": 6.515211563119244, "grad_norm": 1.578125, "learning_rate": 4.8636368591982944e-05, "loss": 0.0416, "mean_token_accuracy": 0.9876651465892792, "num_tokens": 81244650.0, "step": 27950 }, { "entropy": 0.07264660159125924, "epoch": 6.5163772001398765, "grad_norm": 1.15625, "learning_rate": 4.863567524343451e-05, "loss": 0.0346, "mean_token_accuracy": 0.9892261922359467, "num_tokens": 81260414.0, "step": 27955 }, { "entropy": 0.06311188600957393, "epoch": 6.517542837160509, "grad_norm": 0.484375, "learning_rate": 4.8634981728836404e-05, "loss": 0.0203, "mean_token_accuracy": 0.9921561360359192, "num_tokens": 81282441.0, "step": 27960 }, { "entropy": 0.0678584418259561, "epoch": 6.51870847418114, "grad_norm": 1.8046875, "learning_rate": 4.863428804819898e-05, "loss": 0.0386, "mean_token_accuracy": 0.9875958859920502, "num_tokens": 81300933.0, "step": 27965 }, { "entropy": 0.06609183494001628, "epoch": 6.519874111201772, "grad_norm": 1.03125, "learning_rate": 4.863359420153257e-05, "loss": 0.0206, "mean_token_accuracy": 0.9928387343883515, "num_tokens": 81318505.0, "step": 27970 }, { "entropy": 0.06018575457856059, "epoch": 6.521039748222403, "grad_norm": 3.21875, "learning_rate": 4.863290018884752e-05, "loss": 0.0296, "mean_token_accuracy": 0.9921539068222046, "num_tokens": 81342932.0, "step": 27975 }, { "entropy": 0.09531013956293463, "epoch": 6.522205385243035, "grad_norm": 4.25, "learning_rate": 4.863220601015419e-05, "loss": 0.0503, "mean_token_accuracy": 0.9850192785263061, "num_tokens": 81361075.0, "step": 27980 }, { "entropy": 0.07941121272742749, "epoch": 6.523371022263667, "grad_norm": 2.515625, "learning_rate": 4.863151166546292e-05, "loss": 0.0497, "mean_token_accuracy": 0.9868826925754547, "num_tokens": 81369715.0, "step": 27985 }, { "entropy": 0.06422848673537374, "epoch": 6.524536659284299, "grad_norm": 0.3359375, "learning_rate": 4.863081715478407e-05, "loss": 0.0259, "mean_token_accuracy": 0.9898504614830017, "num_tokens": 81400861.0, "step": 27990 }, { "entropy": 0.06149542648345232, "epoch": 6.525702296304931, "grad_norm": 2.296875, "learning_rate": 4.8630122478127995e-05, "loss": 0.0255, "mean_token_accuracy": 0.9882985472679138, "num_tokens": 81432239.0, "step": 27995 }, { "entropy": 0.07662085331976413, "epoch": 6.526867933325562, "grad_norm": 2.359375, "learning_rate": 4.8629427635505055e-05, "loss": 0.0576, "mean_token_accuracy": 0.9864896059036254, "num_tokens": 81444100.0, "step": 28000 }, { "entropy": 0.07400370575487614, "epoch": 6.528033570346194, "grad_norm": 1.40625, "learning_rate": 4.8628732626925613e-05, "loss": 0.0409, "mean_token_accuracy": 0.9867150366306305, "num_tokens": 81463262.0, "step": 28005 }, { "entropy": 0.06608630102127791, "epoch": 6.529199207366826, "grad_norm": 6.15625, "learning_rate": 4.862803745240002e-05, "loss": 0.0449, "mean_token_accuracy": 0.9875483572483063, "num_tokens": 81484054.0, "step": 28010 }, { "entropy": 0.06695858966559172, "epoch": 6.530364844387458, "grad_norm": 0.361328125, "learning_rate": 4.862734211193866e-05, "loss": 0.0257, "mean_token_accuracy": 0.9882546544075013, "num_tokens": 81510572.0, "step": 28015 }, { "entropy": 0.06558367498219013, "epoch": 6.5315304814080895, "grad_norm": 1.0859375, "learning_rate": 4.8626646605551876e-05, "loss": 0.0299, "mean_token_accuracy": 0.9903232753276825, "num_tokens": 81534531.0, "step": 28020 }, { "entropy": 0.06639163857325911, "epoch": 6.5326961184287216, "grad_norm": 1.4921875, "learning_rate": 4.862595093325007e-05, "loss": 0.0307, "mean_token_accuracy": 0.9907230257987976, "num_tokens": 81555787.0, "step": 28025 }, { "entropy": 0.06413227356970311, "epoch": 6.533861755449353, "grad_norm": 0.75, "learning_rate": 4.8625255095043595e-05, "loss": 0.0281, "mean_token_accuracy": 0.9888515174388885, "num_tokens": 81568551.0, "step": 28030 }, { "entropy": 0.061216467432677744, "epoch": 6.535027392469985, "grad_norm": 1.3984375, "learning_rate": 4.862455909094284e-05, "loss": 0.0185, "mean_token_accuracy": 0.9905256271362305, "num_tokens": 81600493.0, "step": 28035 }, { "entropy": 0.08351697884500027, "epoch": 6.536193029490617, "grad_norm": 1.59375, "learning_rate": 4.862386292095817e-05, "loss": 0.0363, "mean_token_accuracy": 0.9892892718315125, "num_tokens": 81612813.0, "step": 28040 }, { "entropy": 0.06674035713076591, "epoch": 6.537358666511248, "grad_norm": 0.87890625, "learning_rate": 4.8623166585099974e-05, "loss": 0.0298, "mean_token_accuracy": 0.9903652191162109, "num_tokens": 81627906.0, "step": 28045 }, { "entropy": 0.06334521155804396, "epoch": 6.53852430353188, "grad_norm": 0.8046875, "learning_rate": 4.862247008337864e-05, "loss": 0.0342, "mean_token_accuracy": 0.9886260449886322, "num_tokens": 81641286.0, "step": 28050 }, { "entropy": 0.06510356459766627, "epoch": 6.5396899405525115, "grad_norm": 3.046875, "learning_rate": 4.8621773415804546e-05, "loss": 0.0293, "mean_token_accuracy": 0.9888158679008484, "num_tokens": 81657218.0, "step": 28055 }, { "entropy": 0.06543186036869883, "epoch": 6.540855577573144, "grad_norm": 0.376953125, "learning_rate": 4.862107658238808e-05, "loss": 0.0394, "mean_token_accuracy": 0.988468474149704, "num_tokens": 81679608.0, "step": 28060 }, { "entropy": 0.08569641970098019, "epoch": 6.542021214593776, "grad_norm": 2.859375, "learning_rate": 4.862037958313964e-05, "loss": 0.027, "mean_token_accuracy": 0.988139945268631, "num_tokens": 81708410.0, "step": 28065 }, { "entropy": 0.0672588437795639, "epoch": 6.543186851614407, "grad_norm": 0.90234375, "learning_rate": 4.861968241806961e-05, "loss": 0.0339, "mean_token_accuracy": 0.9860668420791626, "num_tokens": 81732815.0, "step": 28070 }, { "entropy": 0.07395201176404953, "epoch": 6.544352488635039, "grad_norm": 1.0234375, "learning_rate": 4.861898508718838e-05, "loss": 0.046, "mean_token_accuracy": 0.9865674555301667, "num_tokens": 81743650.0, "step": 28075 }, { "entropy": 0.06611506836488842, "epoch": 6.545518125655671, "grad_norm": 3.6875, "learning_rate": 4.8618287590506376e-05, "loss": 0.038, "mean_token_accuracy": 0.9851749837398529, "num_tokens": 81766370.0, "step": 28080 }, { "entropy": 0.03880115207284689, "epoch": 6.546683762676302, "grad_norm": 0.337890625, "learning_rate": 4.8617589928033966e-05, "loss": 0.0166, "mean_token_accuracy": 0.994249552488327, "num_tokens": 81796871.0, "step": 28085 }, { "entropy": 0.06620198376476764, "epoch": 6.5478493996969345, "grad_norm": 0.2041015625, "learning_rate": 4.861689209978158e-05, "loss": 0.0294, "mean_token_accuracy": 0.9911263763904572, "num_tokens": 81816362.0, "step": 28090 }, { "entropy": 0.06923009911552072, "epoch": 6.549015036717567, "grad_norm": 1.3203125, "learning_rate": 4.8616194105759606e-05, "loss": 0.0107, "mean_token_accuracy": 0.9917510986328125, "num_tokens": 81855342.0, "step": 28095 }, { "entropy": 0.06260980144143105, "epoch": 6.550180673738198, "grad_norm": 1.46875, "learning_rate": 4.861549594597846e-05, "loss": 0.0252, "mean_token_accuracy": 0.9904425501823425, "num_tokens": 81871751.0, "step": 28100 }, { "entropy": 0.0857225801795721, "epoch": 6.55134631075883, "grad_norm": 1.265625, "learning_rate": 4.861479762044856e-05, "loss": 0.0714, "mean_token_accuracy": 0.983836007118225, "num_tokens": 81881802.0, "step": 28105 }, { "entropy": 0.06633962662890554, "epoch": 6.552511947779461, "grad_norm": 0.8359375, "learning_rate": 4.861409912918029e-05, "loss": 0.0307, "mean_token_accuracy": 0.9916288614273071, "num_tokens": 81898680.0, "step": 28110 }, { "entropy": 0.08444469049572945, "epoch": 6.553677584800093, "grad_norm": 2.15625, "learning_rate": 4.86134004721841e-05, "loss": 0.0417, "mean_token_accuracy": 0.9863658368587493, "num_tokens": 81918320.0, "step": 28115 }, { "entropy": 0.08844755683094263, "epoch": 6.554843221820725, "grad_norm": 1.3515625, "learning_rate": 4.8612701649470385e-05, "loss": 0.0625, "mean_token_accuracy": 0.9853982567787171, "num_tokens": 81938033.0, "step": 28120 }, { "entropy": 0.08071080623194575, "epoch": 6.5560088588413565, "grad_norm": 1.75, "learning_rate": 4.8612002661049584e-05, "loss": 0.0346, "mean_token_accuracy": 0.9872838437557221, "num_tokens": 81957727.0, "step": 28125 }, { "entropy": 0.11229084599763155, "epoch": 6.557174495861989, "grad_norm": 2.203125, "learning_rate": 4.8611303506932104e-05, "loss": 0.1357, "mean_token_accuracy": 0.9701371610164642, "num_tokens": 81993913.0, "step": 28130 }, { "entropy": 0.06542156785726547, "epoch": 6.55834013288262, "grad_norm": 0.291015625, "learning_rate": 4.861060418712837e-05, "loss": 0.0297, "mean_token_accuracy": 0.9911216259002685, "num_tokens": 82017676.0, "step": 28135 }, { "entropy": 0.0545942329801619, "epoch": 6.559505769903252, "grad_norm": 0.2412109375, "learning_rate": 4.860990470164883e-05, "loss": 0.0177, "mean_token_accuracy": 0.9939680576324463, "num_tokens": 82041966.0, "step": 28140 }, { "entropy": 0.13166623264551164, "epoch": 6.560671406923884, "grad_norm": 0.466796875, "learning_rate": 4.8609205050503895e-05, "loss": 0.1511, "mean_token_accuracy": 0.9682539939880371, "num_tokens": 82075203.0, "step": 28145 }, { "entropy": 0.07135718585923315, "epoch": 6.561837043944516, "grad_norm": 0.8359375, "learning_rate": 4.8608505233704e-05, "loss": 0.0502, "mean_token_accuracy": 0.9858476459980011, "num_tokens": 82098960.0, "step": 28150 }, { "entropy": 0.08739682212471962, "epoch": 6.563002680965147, "grad_norm": 0.455078125, "learning_rate": 4.8607805251259584e-05, "loss": 0.0293, "mean_token_accuracy": 0.9880187332630157, "num_tokens": 82131078.0, "step": 28155 }, { "entropy": 0.058482589572668074, "epoch": 6.5641683179857795, "grad_norm": 1.0390625, "learning_rate": 4.8607105103181086e-05, "loss": 0.0258, "mean_token_accuracy": 0.990253335237503, "num_tokens": 82167502.0, "step": 28160 }, { "entropy": 0.08452012352645397, "epoch": 6.565333955006411, "grad_norm": 1.84375, "learning_rate": 4.860640478947895e-05, "loss": 0.0601, "mean_token_accuracy": 0.9837320148944855, "num_tokens": 82177026.0, "step": 28165 }, { "entropy": 0.06720562288537621, "epoch": 6.566499592027043, "grad_norm": 2.453125, "learning_rate": 4.860570431016361e-05, "loss": 0.0301, "mean_token_accuracy": 0.9914529860019684, "num_tokens": 82213975.0, "step": 28170 }, { "entropy": 0.04906615521758795, "epoch": 6.567665229047675, "grad_norm": 0.443359375, "learning_rate": 4.8605003665245516e-05, "loss": 0.0165, "mean_token_accuracy": 0.9915839433670044, "num_tokens": 82249364.0, "step": 28175 }, { "entropy": 0.06979955788701772, "epoch": 6.568830866068306, "grad_norm": 2.5625, "learning_rate": 4.860430285473511e-05, "loss": 0.0487, "mean_token_accuracy": 0.9855699956417083, "num_tokens": 82267527.0, "step": 28180 }, { "entropy": 0.06681377086788416, "epoch": 6.569996503088938, "grad_norm": 0.1005859375, "learning_rate": 4.860360187864285e-05, "loss": 0.044, "mean_token_accuracy": 0.9887908697128296, "num_tokens": 82287597.0, "step": 28185 }, { "entropy": 0.08111393954604865, "epoch": 6.5711621401095694, "grad_norm": 1.4296875, "learning_rate": 4.8602900736979185e-05, "loss": 0.0386, "mean_token_accuracy": 0.986400431394577, "num_tokens": 82306790.0, "step": 28190 }, { "entropy": 0.07878834493458271, "epoch": 6.5723277771302016, "grad_norm": 2.875, "learning_rate": 4.860219942975457e-05, "loss": 0.0545, "mean_token_accuracy": 0.9820395827293396, "num_tokens": 82323050.0, "step": 28195 }, { "entropy": 0.03850213307887316, "epoch": 6.573493414150834, "grad_norm": 0.2392578125, "learning_rate": 4.860149795697946e-05, "loss": 0.0104, "mean_token_accuracy": 0.9948647737503051, "num_tokens": 82357011.0, "step": 28200 }, { "entropy": 0.06725431568920612, "epoch": 6.574659051171465, "grad_norm": 2.6875, "learning_rate": 4.860079631866432e-05, "loss": 0.0364, "mean_token_accuracy": 0.9887121975421905, "num_tokens": 82372585.0, "step": 28205 }, { "entropy": 0.05918426923453808, "epoch": 6.575824688192097, "grad_norm": 1.2421875, "learning_rate": 4.860009451481961e-05, "loss": 0.0211, "mean_token_accuracy": 0.9932548403739929, "num_tokens": 82408282.0, "step": 28210 }, { "entropy": 0.05952431866899133, "epoch": 6.576990325212729, "grad_norm": 0.15625, "learning_rate": 4.8599392545455797e-05, "loss": 0.0282, "mean_token_accuracy": 0.9910712659358978, "num_tokens": 82440321.0, "step": 28215 }, { "entropy": 0.07454060800373555, "epoch": 6.57815596223336, "grad_norm": 1.984375, "learning_rate": 4.859869041058335e-05, "loss": 0.0395, "mean_token_accuracy": 0.9880998790264129, "num_tokens": 82453169.0, "step": 28220 }, { "entropy": 0.08988696299493312, "epoch": 6.579321599253992, "grad_norm": 2.34375, "learning_rate": 4.859798811021273e-05, "loss": 0.0599, "mean_token_accuracy": 0.9867674469947815, "num_tokens": 82471385.0, "step": 28225 }, { "entropy": 0.07569172158837319, "epoch": 6.5804872362746245, "grad_norm": 1.671875, "learning_rate": 4.859728564435441e-05, "loss": 0.0423, "mean_token_accuracy": 0.9849360704421997, "num_tokens": 82482967.0, "step": 28230 }, { "entropy": 0.06420651264488697, "epoch": 6.581652873295256, "grad_norm": 0.66796875, "learning_rate": 4.8596583013018885e-05, "loss": 0.0427, "mean_token_accuracy": 0.9870175302028656, "num_tokens": 82495378.0, "step": 28235 }, { "entropy": 0.07353771440684795, "epoch": 6.582818510315888, "grad_norm": 1.5234375, "learning_rate": 4.8595880216216604e-05, "loss": 0.0427, "mean_token_accuracy": 0.9905295133590698, "num_tokens": 82506088.0, "step": 28240 }, { "entropy": 0.05138236228376627, "epoch": 6.583984147336519, "grad_norm": 1.46875, "learning_rate": 4.8595177253958064e-05, "loss": 0.024, "mean_token_accuracy": 0.9912134110927582, "num_tokens": 82535805.0, "step": 28245 }, { "entropy": 0.057565122842788696, "epoch": 6.585149784357151, "grad_norm": 1.0078125, "learning_rate": 4.859447412625374e-05, "loss": 0.0339, "mean_token_accuracy": 0.9907476007938385, "num_tokens": 82549312.0, "step": 28250 }, { "entropy": 0.07886847332119942, "epoch": 6.586315421377783, "grad_norm": 4.09375, "learning_rate": 4.859377083311413e-05, "loss": 0.0484, "mean_token_accuracy": 0.9851777493953705, "num_tokens": 82558924.0, "step": 28255 }, { "entropy": 0.06298988554626703, "epoch": 6.5874810583984145, "grad_norm": 2.921875, "learning_rate": 4.85930673745497e-05, "loss": 0.0399, "mean_token_accuracy": 0.9884525418281556, "num_tokens": 82582432.0, "step": 28260 }, { "entropy": 0.08036144189536572, "epoch": 6.588646695419047, "grad_norm": 1.8125, "learning_rate": 4.859236375057095e-05, "loss": 0.0465, "mean_token_accuracy": 0.987638258934021, "num_tokens": 82602132.0, "step": 28265 }, { "entropy": 0.06753709372133017, "epoch": 6.589812332439678, "grad_norm": 1.3125, "learning_rate": 4.859165996118838e-05, "loss": 0.0328, "mean_token_accuracy": 0.9888114511966706, "num_tokens": 82618780.0, "step": 28270 }, { "entropy": 0.05341739971190691, "epoch": 6.59097796946031, "grad_norm": 0.28125, "learning_rate": 4.8590956006412476e-05, "loss": 0.0255, "mean_token_accuracy": 0.9922696411609649, "num_tokens": 82648437.0, "step": 28275 }, { "entropy": 0.0667111149057746, "epoch": 6.592143606480942, "grad_norm": 0.408203125, "learning_rate": 4.859025188625374e-05, "loss": 0.0302, "mean_token_accuracy": 0.9906487345695496, "num_tokens": 82669457.0, "step": 28280 }, { "entropy": 0.06441701222211123, "epoch": 6.593309243501574, "grad_norm": 1.7265625, "learning_rate": 4.858954760072265e-05, "loss": 0.0451, "mean_token_accuracy": 0.9849387526512146, "num_tokens": 82682232.0, "step": 28285 }, { "entropy": 0.08769273720681667, "epoch": 6.594474880522205, "grad_norm": 2.84375, "learning_rate": 4.858884314982974e-05, "loss": 0.065, "mean_token_accuracy": 0.9820234298706054, "num_tokens": 82690955.0, "step": 28290 }, { "entropy": 0.06150923212990165, "epoch": 6.595640517542837, "grad_norm": 1.5625, "learning_rate": 4.85881385335855e-05, "loss": 0.0347, "mean_token_accuracy": 0.9915253400802613, "num_tokens": 82710026.0, "step": 28295 }, { "entropy": 0.058764266222715376, "epoch": 6.596806154563469, "grad_norm": 0.94140625, "learning_rate": 4.858743375200043e-05, "loss": 0.0274, "mean_token_accuracy": 0.9908628404140473, "num_tokens": 82736692.0, "step": 28300 }, { "entropy": 0.07299914155155421, "epoch": 6.597971791584101, "grad_norm": 1.484375, "learning_rate": 4.858672880508506e-05, "loss": 0.0383, "mean_token_accuracy": 0.9905340611934662, "num_tokens": 82758849.0, "step": 28305 }, { "entropy": 0.05621396470814943, "epoch": 6.599137428604733, "grad_norm": 1.65625, "learning_rate": 4.858602369284987e-05, "loss": 0.0233, "mean_token_accuracy": 0.9891703724861145, "num_tokens": 82783679.0, "step": 28310 }, { "entropy": 0.06540145818144083, "epoch": 6.600303065625364, "grad_norm": 2.15625, "learning_rate": 4.8585318415305404e-05, "loss": 0.0413, "mean_token_accuracy": 0.9880593121051788, "num_tokens": 82797141.0, "step": 28315 }, { "entropy": 0.07652120906859636, "epoch": 6.601468702645996, "grad_norm": 2.203125, "learning_rate": 4.858461297246217e-05, "loss": 0.0591, "mean_token_accuracy": 0.9846980929374695, "num_tokens": 82813933.0, "step": 28320 }, { "entropy": 0.061003577709197995, "epoch": 6.602634339666627, "grad_norm": 1.15625, "learning_rate": 4.8583907364330677e-05, "loss": 0.0356, "mean_token_accuracy": 0.9893332183361053, "num_tokens": 82828846.0, "step": 28325 }, { "entropy": 0.08375898487865925, "epoch": 6.6037999766872595, "grad_norm": 2.671875, "learning_rate": 4.858320159092146e-05, "loss": 0.0502, "mean_token_accuracy": 0.9873512983322144, "num_tokens": 82837492.0, "step": 28330 }, { "entropy": 0.07190875075757504, "epoch": 6.604965613707892, "grad_norm": 1.7421875, "learning_rate": 4.858249565224503e-05, "loss": 0.0307, "mean_token_accuracy": 0.9873040735721588, "num_tokens": 82849052.0, "step": 28335 }, { "entropy": 0.0680878208950162, "epoch": 6.606131250728523, "grad_norm": 0.64453125, "learning_rate": 4.8581789548311924e-05, "loss": 0.0497, "mean_token_accuracy": 0.9851384818553924, "num_tokens": 82885860.0, "step": 28340 }, { "entropy": 0.05567469764500856, "epoch": 6.607296887749155, "grad_norm": 0.6796875, "learning_rate": 4.858108327913267e-05, "loss": 0.0194, "mean_token_accuracy": 0.9858059644699096, "num_tokens": 82911685.0, "step": 28345 }, { "entropy": 0.07293141707777977, "epoch": 6.608462524769787, "grad_norm": 3.65625, "learning_rate": 4.858037684471779e-05, "loss": 0.041, "mean_token_accuracy": 0.9894559681415558, "num_tokens": 82924003.0, "step": 28350 }, { "entropy": 0.05879962723702192, "epoch": 6.609628161790418, "grad_norm": 2.96875, "learning_rate": 4.857967024507783e-05, "loss": 0.0248, "mean_token_accuracy": 0.9917656898498535, "num_tokens": 82958399.0, "step": 28355 }, { "entropy": 0.04448529947549105, "epoch": 6.61079379881105, "grad_norm": 0.7890625, "learning_rate": 4.8578963480223326e-05, "loss": 0.0196, "mean_token_accuracy": 0.9942450284957886, "num_tokens": 82998038.0, "step": 28360 }, { "entropy": 0.06249931612983346, "epoch": 6.611959435831682, "grad_norm": 2.03125, "learning_rate": 4.857825655016481e-05, "loss": 0.0256, "mean_token_accuracy": 0.9889679789543152, "num_tokens": 83025588.0, "step": 28365 }, { "entropy": 0.08115007188171149, "epoch": 6.613125072852314, "grad_norm": 0.478515625, "learning_rate": 4.857754945491282e-05, "loss": 0.0458, "mean_token_accuracy": 0.9878861665725708, "num_tokens": 83041745.0, "step": 28370 }, { "entropy": 0.06632826328277588, "epoch": 6.614290709872946, "grad_norm": 1.6796875, "learning_rate": 4.857684219447792e-05, "loss": 0.0247, "mean_token_accuracy": 0.9895213901996612, "num_tokens": 83073646.0, "step": 28375 }, { "entropy": 0.05887887412682176, "epoch": 6.615456346893577, "grad_norm": 1.6171875, "learning_rate": 4.857613476887063e-05, "loss": 0.0214, "mean_token_accuracy": 0.9936350464820862, "num_tokens": 83101095.0, "step": 28380 }, { "entropy": 0.06550744706764818, "epoch": 6.616621983914209, "grad_norm": 1.09375, "learning_rate": 4.857542717810152e-05, "loss": 0.0333, "mean_token_accuracy": 0.98823082447052, "num_tokens": 83117537.0, "step": 28385 }, { "entropy": 0.09187786467373371, "epoch": 6.617787620934841, "grad_norm": 2.296875, "learning_rate": 4.857471942218112e-05, "loss": 0.0488, "mean_token_accuracy": 0.9861462652683258, "num_tokens": 83141574.0, "step": 28390 }, { "entropy": 0.06437947321683168, "epoch": 6.618953257955472, "grad_norm": 0.8125, "learning_rate": 4.857401150112001e-05, "loss": 0.0297, "mean_token_accuracy": 0.9912596046924591, "num_tokens": 83163578.0, "step": 28395 }, { "entropy": 0.07269996423274279, "epoch": 6.6201188949761045, "grad_norm": 0.6640625, "learning_rate": 4.8573303414928725e-05, "loss": 0.0301, "mean_token_accuracy": 0.990373021364212, "num_tokens": 83179042.0, "step": 28400 }, { "entropy": 0.22393528148531913, "epoch": 6.621284531996736, "grad_norm": 2.890625, "learning_rate": 4.857259516361783e-05, "loss": 0.3705, "mean_token_accuracy": 0.9582188367843628, "num_tokens": 83213305.0, "step": 28405 }, { "entropy": 0.06759192235767841, "epoch": 6.622450169017368, "grad_norm": 0.2099609375, "learning_rate": 4.8571886747197893e-05, "loss": 0.0361, "mean_token_accuracy": 0.9905477106571198, "num_tokens": 83229930.0, "step": 28410 }, { "entropy": 0.06327125979587436, "epoch": 6.623615806038, "grad_norm": 3.515625, "learning_rate": 4.857117816567947e-05, "loss": 0.0431, "mean_token_accuracy": 0.985412847995758, "num_tokens": 83246877.0, "step": 28415 }, { "entropy": 0.0795931302011013, "epoch": 6.624781443058632, "grad_norm": 4.5625, "learning_rate": 4.857046941907312e-05, "loss": 0.0357, "mean_token_accuracy": 0.9869856595993042, "num_tokens": 83261191.0, "step": 28420 }, { "entropy": 0.07875011954456568, "epoch": 6.625947080079263, "grad_norm": 3.65625, "learning_rate": 4.8569760507389426e-05, "loss": 0.0442, "mean_token_accuracy": 0.9851264238357544, "num_tokens": 83285754.0, "step": 28425 }, { "entropy": 0.0738863229751587, "epoch": 6.627112717099895, "grad_norm": 1.0390625, "learning_rate": 4.856905143063896e-05, "loss": 0.0335, "mean_token_accuracy": 0.9872049629688263, "num_tokens": 83300747.0, "step": 28430 }, { "entropy": 0.08566249124705791, "epoch": 6.628278354120527, "grad_norm": 3.921875, "learning_rate": 4.8568342188832276e-05, "loss": 0.0469, "mean_token_accuracy": 0.9827580094337464, "num_tokens": 83318653.0, "step": 28435 }, { "entropy": 0.06706524565815926, "epoch": 6.629443991141159, "grad_norm": 1.203125, "learning_rate": 4.856763278197996e-05, "loss": 0.0243, "mean_token_accuracy": 0.9923423230648041, "num_tokens": 83337335.0, "step": 28440 }, { "entropy": 0.06407251004129648, "epoch": 6.630609628161791, "grad_norm": 0.61328125, "learning_rate": 4.8566923210092605e-05, "loss": 0.0239, "mean_token_accuracy": 0.992389714717865, "num_tokens": 83360624.0, "step": 28445 }, { "entropy": 0.0974472158588469, "epoch": 6.631775265182422, "grad_norm": 1.8203125, "learning_rate": 4.856621347318078e-05, "loss": 0.0313, "mean_token_accuracy": 0.987954068183899, "num_tokens": 83399635.0, "step": 28450 }, { "entropy": 0.08182252198457718, "epoch": 6.632940902203054, "grad_norm": 0.5546875, "learning_rate": 4.856550357125506e-05, "loss": 0.0521, "mean_token_accuracy": 0.9852348983287811, "num_tokens": 83410321.0, "step": 28455 }, { "entropy": 0.038520860578864814, "epoch": 6.634106539223685, "grad_norm": 0.447265625, "learning_rate": 4.856479350432604e-05, "loss": 0.0218, "mean_token_accuracy": 0.9928028047084808, "num_tokens": 83451475.0, "step": 28460 }, { "entropy": 0.07194502120837569, "epoch": 6.635272176244317, "grad_norm": 2.921875, "learning_rate": 4.85640832724043e-05, "loss": 0.028, "mean_token_accuracy": 0.9891715884208679, "num_tokens": 83472506.0, "step": 28465 }, { "entropy": 0.06827894113957882, "epoch": 6.6364378132649495, "grad_norm": 3.34375, "learning_rate": 4.856337287550045e-05, "loss": 0.0306, "mean_token_accuracy": 0.9915993750095368, "num_tokens": 83494023.0, "step": 28470 }, { "entropy": 0.0858086671680212, "epoch": 6.637603450285581, "grad_norm": 1.0078125, "learning_rate": 4.856266231362506e-05, "loss": 0.0328, "mean_token_accuracy": 0.9919438421726227, "num_tokens": 83505466.0, "step": 28475 }, { "entropy": 0.08067451752722263, "epoch": 6.638769087306213, "grad_norm": 1.5078125, "learning_rate": 4.856195158678875e-05, "loss": 0.0409, "mean_token_accuracy": 0.9864005327224732, "num_tokens": 83525173.0, "step": 28480 }, { "entropy": 0.0665781082585454, "epoch": 6.639934724326845, "grad_norm": 5.0625, "learning_rate": 4.856124069500209e-05, "loss": 0.0447, "mean_token_accuracy": 0.9896150350570678, "num_tokens": 83545423.0, "step": 28485 }, { "entropy": 0.05508313453756273, "epoch": 6.641100361347476, "grad_norm": 0.205078125, "learning_rate": 4.85605296382757e-05, "loss": 0.0309, "mean_token_accuracy": 0.9883796334266662, "num_tokens": 83574737.0, "step": 28490 }, { "entropy": 0.07376216007396579, "epoch": 6.642265998368108, "grad_norm": 1.015625, "learning_rate": 4.855981841662017e-05, "loss": 0.064, "mean_token_accuracy": 0.9849434971809388, "num_tokens": 83611091.0, "step": 28495 }, { "entropy": 0.10243667252361774, "epoch": 6.64343163538874, "grad_norm": 4.0, "learning_rate": 4.855910703004612e-05, "loss": 0.0562, "mean_token_accuracy": 0.9861430406570435, "num_tokens": 83620959.0, "step": 28500 }, { "entropy": 0.07161519918590784, "epoch": 6.644597272409372, "grad_norm": 1.84375, "learning_rate": 4.855839547856415e-05, "loss": 0.0319, "mean_token_accuracy": 0.9895271420478821, "num_tokens": 83632086.0, "step": 28505 }, { "entropy": 0.09283050457015633, "epoch": 6.645762909430004, "grad_norm": 1.671875, "learning_rate": 4.855768376218487e-05, "loss": 0.0362, "mean_token_accuracy": 0.9906116843223571, "num_tokens": 83662501.0, "step": 28510 }, { "entropy": 0.08217054158449173, "epoch": 6.646928546450635, "grad_norm": 2.1875, "learning_rate": 4.855697188091889e-05, "loss": 0.024, "mean_token_accuracy": 0.9915976047515869, "num_tokens": 83681165.0, "step": 28515 }, { "entropy": 0.06361790988594293, "epoch": 6.648094183471267, "grad_norm": 1.75, "learning_rate": 4.855625983477683e-05, "loss": 0.0299, "mean_token_accuracy": 0.9915489137172699, "num_tokens": 83711612.0, "step": 28520 }, { "entropy": 0.07515859454870225, "epoch": 6.649259820491899, "grad_norm": 2.203125, "learning_rate": 4.85555476237693e-05, "loss": 0.0676, "mean_token_accuracy": 0.9828903555870057, "num_tokens": 83720807.0, "step": 28525 }, { "entropy": 0.07741277245804667, "epoch": 6.65042545751253, "grad_norm": 0.24609375, "learning_rate": 4.855483524790694e-05, "loss": 0.0434, "mean_token_accuracy": 0.9873224675655365, "num_tokens": 83734105.0, "step": 28530 }, { "entropy": 0.07927368283271789, "epoch": 6.651591094533162, "grad_norm": 0.890625, "learning_rate": 4.855412270720035e-05, "loss": 0.0381, "mean_token_accuracy": 0.9878321468830109, "num_tokens": 83744752.0, "step": 28535 }, { "entropy": 0.05704822298139334, "epoch": 6.652756731553794, "grad_norm": 0.162109375, "learning_rate": 4.8553410001660173e-05, "loss": 0.0143, "mean_token_accuracy": 0.9922565579414367, "num_tokens": 83781512.0, "step": 28540 }, { "entropy": 0.09766516759991646, "epoch": 6.653922368574426, "grad_norm": 2.03125, "learning_rate": 4.855269713129702e-05, "loss": 0.0484, "mean_token_accuracy": 0.987651264667511, "num_tokens": 83791362.0, "step": 28545 }, { "entropy": 0.0650267457589507, "epoch": 6.655088005595058, "grad_norm": 3.59375, "learning_rate": 4.855198409612153e-05, "loss": 0.053, "mean_token_accuracy": 0.9855973184108734, "num_tokens": 83813283.0, "step": 28550 }, { "entropy": 0.06372864125296474, "epoch": 6.65625364261569, "grad_norm": 0.75390625, "learning_rate": 4.855127089614433e-05, "loss": 0.0328, "mean_token_accuracy": 0.9887797713279725, "num_tokens": 83832747.0, "step": 28555 }, { "entropy": 0.037981690280139445, "epoch": 6.657419279636321, "grad_norm": 1.28125, "learning_rate": 4.855055753137606e-05, "loss": 0.0137, "mean_token_accuracy": 0.9955213725566864, "num_tokens": 83873690.0, "step": 28560 }, { "entropy": 0.06714175110682845, "epoch": 6.658584916656953, "grad_norm": 1.5625, "learning_rate": 4.854984400182736e-05, "loss": 0.0347, "mean_token_accuracy": 0.9886840522289276, "num_tokens": 83905288.0, "step": 28565 }, { "entropy": 0.07562700193375349, "epoch": 6.6597505536775845, "grad_norm": 4.0625, "learning_rate": 4.854913030750887e-05, "loss": 0.0442, "mean_token_accuracy": 0.986762797832489, "num_tokens": 83924393.0, "step": 28570 }, { "entropy": 0.05191770112141967, "epoch": 6.660916190698217, "grad_norm": 0.44140625, "learning_rate": 4.8548416448431224e-05, "loss": 0.0138, "mean_token_accuracy": 0.9877704203128814, "num_tokens": 83954953.0, "step": 28575 }, { "entropy": 0.0616036182269454, "epoch": 6.662081827718849, "grad_norm": 5.46875, "learning_rate": 4.854770242460507e-05, "loss": 0.0383, "mean_token_accuracy": 0.9877343237400055, "num_tokens": 83969020.0, "step": 28580 }, { "entropy": 0.06464450052008033, "epoch": 6.66324746473948, "grad_norm": 3.34375, "learning_rate": 4.8546988236041054e-05, "loss": 0.04, "mean_token_accuracy": 0.9866331398487092, "num_tokens": 83994057.0, "step": 28585 }, { "entropy": 0.07502462603151798, "epoch": 6.664413101760112, "grad_norm": 0.138671875, "learning_rate": 4.8546273882749825e-05, "loss": 0.0506, "mean_token_accuracy": 0.9836625099182129, "num_tokens": 84011629.0, "step": 28590 }, { "entropy": 0.06252543712034822, "epoch": 6.665578738780743, "grad_norm": 1.0078125, "learning_rate": 4.854555936474204e-05, "loss": 0.0237, "mean_token_accuracy": 0.9895300209522248, "num_tokens": 84030753.0, "step": 28595 }, { "entropy": 0.0681413403712213, "epoch": 6.666744375801375, "grad_norm": 1.015625, "learning_rate": 4.854484468202836e-05, "loss": 0.0365, "mean_token_accuracy": 0.9867991805076599, "num_tokens": 84050125.0, "step": 28600 }, { "entropy": 0.08328782804310322, "epoch": 6.6679100128220075, "grad_norm": 2.5625, "learning_rate": 4.854412983461943e-05, "loss": 0.0478, "mean_token_accuracy": 0.9859695374965668, "num_tokens": 84069321.0, "step": 28605 }, { "entropy": 0.07345206961035729, "epoch": 6.669075649842639, "grad_norm": 0.5390625, "learning_rate": 4.8543414822525904e-05, "loss": 0.0619, "mean_token_accuracy": 0.9833361506462097, "num_tokens": 84078921.0, "step": 28610 }, { "entropy": 0.04605873627588153, "epoch": 6.670241286863271, "grad_norm": 0.125, "learning_rate": 4.854269964575846e-05, "loss": 0.0225, "mean_token_accuracy": 0.9940562188625336, "num_tokens": 84104775.0, "step": 28615 }, { "entropy": 0.0720367580652237, "epoch": 6.671406923883903, "grad_norm": 2.328125, "learning_rate": 4.854198430432776e-05, "loss": 0.0367, "mean_token_accuracy": 0.991383183002472, "num_tokens": 84118238.0, "step": 28620 }, { "entropy": 0.06562741016969084, "epoch": 6.672572560904534, "grad_norm": 0.267578125, "learning_rate": 4.854126879824446e-05, "loss": 0.0207, "mean_token_accuracy": 0.9923710107803345, "num_tokens": 84146884.0, "step": 28625 }, { "entropy": 0.06544369319453835, "epoch": 6.673738197925166, "grad_norm": 1.0390625, "learning_rate": 4.854055312751924e-05, "loss": 0.0188, "mean_token_accuracy": 0.9867700815200806, "num_tokens": 84167785.0, "step": 28630 }, { "entropy": 0.05859682857990265, "epoch": 6.674903834945798, "grad_norm": 0.8984375, "learning_rate": 4.853983729216276e-05, "loss": 0.0242, "mean_token_accuracy": 0.9904904246330262, "num_tokens": 84185300.0, "step": 28635 }, { "entropy": 0.07093516178429127, "epoch": 6.6760694719664295, "grad_norm": 0.51171875, "learning_rate": 4.8539121292185704e-05, "loss": 0.027, "mean_token_accuracy": 0.9912785172462464, "num_tokens": 84199854.0, "step": 28640 }, { "entropy": 0.08791411444544792, "epoch": 6.677235108987062, "grad_norm": 0.275390625, "learning_rate": 4.853840512759874e-05, "loss": 0.052, "mean_token_accuracy": 0.9832299053668976, "num_tokens": 84215162.0, "step": 28645 }, { "entropy": 0.08093488197773695, "epoch": 6.678400746007693, "grad_norm": 0.1552734375, "learning_rate": 4.853768879841256e-05, "loss": 0.0584, "mean_token_accuracy": 0.9838843226432801, "num_tokens": 84232657.0, "step": 28650 }, { "entropy": 0.05880971970036626, "epoch": 6.679566383028325, "grad_norm": 0.61328125, "learning_rate": 4.853697230463784e-05, "loss": 0.0196, "mean_token_accuracy": 0.992617392539978, "num_tokens": 84263155.0, "step": 28655 }, { "entropy": 0.07369680106639862, "epoch": 6.680732020048957, "grad_norm": 2.28125, "learning_rate": 4.853625564628525e-05, "loss": 0.0162, "mean_token_accuracy": 0.9927162766456604, "num_tokens": 84288594.0, "step": 28660 }, { "entropy": 0.0617294343188405, "epoch": 6.681897657069588, "grad_norm": 0.341796875, "learning_rate": 4.85355388233655e-05, "loss": 0.0339, "mean_token_accuracy": 0.9909321308135987, "num_tokens": 84302921.0, "step": 28665 }, { "entropy": 0.05793065996840596, "epoch": 6.68306329409022, "grad_norm": 0.28125, "learning_rate": 4.853482183588927e-05, "loss": 0.0263, "mean_token_accuracy": 0.9909238398075104, "num_tokens": 84330261.0, "step": 28670 }, { "entropy": 0.07100810557603836, "epoch": 6.684228931110852, "grad_norm": 0.7109375, "learning_rate": 4.853410468386724e-05, "loss": 0.0401, "mean_token_accuracy": 0.9874763488769531, "num_tokens": 84347584.0, "step": 28675 }, { "entropy": 0.09726348333060741, "epoch": 6.685394568131484, "grad_norm": 2.65625, "learning_rate": 4.853338736731012e-05, "loss": 0.1012, "mean_token_accuracy": 0.9809386074543, "num_tokens": 84369145.0, "step": 28680 }, { "entropy": 0.06574146244674921, "epoch": 6.686560205152116, "grad_norm": 0.859375, "learning_rate": 4.8532669886228596e-05, "loss": 0.0259, "mean_token_accuracy": 0.9907530903816223, "num_tokens": 84387512.0, "step": 28685 }, { "entropy": 0.06407738225534558, "epoch": 6.687725842172748, "grad_norm": 0.2138671875, "learning_rate": 4.853195224063337e-05, "loss": 0.0372, "mean_token_accuracy": 0.9909645438194274, "num_tokens": 84415777.0, "step": 28690 }, { "entropy": 0.09450783599168062, "epoch": 6.688891479193379, "grad_norm": 0.2109375, "learning_rate": 4.853123443053515e-05, "loss": 0.0401, "mean_token_accuracy": 0.990818876028061, "num_tokens": 84434248.0, "step": 28695 }, { "entropy": 0.08516130037605762, "epoch": 6.690057116214011, "grad_norm": 4.25, "learning_rate": 4.8530516455944625e-05, "loss": 0.0547, "mean_token_accuracy": 0.9852405250072479, "num_tokens": 84447974.0, "step": 28700 }, { "entropy": 0.05056046452373266, "epoch": 6.691222753234642, "grad_norm": 1.6015625, "learning_rate": 4.852979831687251e-05, "loss": 0.0428, "mean_token_accuracy": 0.9874833226203918, "num_tokens": 84478389.0, "step": 28705 }, { "entropy": 0.10436076521873475, "epoch": 6.6923883902552745, "grad_norm": 2.328125, "learning_rate": 4.8529080013329515e-05, "loss": 0.0711, "mean_token_accuracy": 0.986644196510315, "num_tokens": 84497684.0, "step": 28710 }, { "entropy": 0.06655118707567453, "epoch": 6.693554027275907, "grad_norm": 2.90625, "learning_rate": 4.8528361545326345e-05, "loss": 0.0321, "mean_token_accuracy": 0.9903276085853576, "num_tokens": 84516211.0, "step": 28715 }, { "entropy": 0.060899481642991304, "epoch": 6.694719664296538, "grad_norm": 1.328125, "learning_rate": 4.8527642912873714e-05, "loss": 0.0244, "mean_token_accuracy": 0.9928938448429108, "num_tokens": 84544470.0, "step": 28720 }, { "entropy": 0.0634287366643548, "epoch": 6.69588530131717, "grad_norm": 0.66796875, "learning_rate": 4.852692411598235e-05, "loss": 0.0307, "mean_token_accuracy": 0.9917833507061005, "num_tokens": 84560366.0, "step": 28725 }, { "entropy": 0.05534190842881799, "epoch": 6.697050938337801, "grad_norm": 0.263671875, "learning_rate": 4.8526205154662954e-05, "loss": 0.0274, "mean_token_accuracy": 0.992123681306839, "num_tokens": 84584648.0, "step": 28730 }, { "entropy": 0.05186046920716762, "epoch": 6.698216575358433, "grad_norm": 0.47265625, "learning_rate": 4.852548602892626e-05, "loss": 0.0264, "mean_token_accuracy": 0.9879955351352692, "num_tokens": 84612860.0, "step": 28735 }, { "entropy": 0.09077054243534803, "epoch": 6.699382212379065, "grad_norm": 0.9765625, "learning_rate": 4.8524766738782984e-05, "loss": 0.0486, "mean_token_accuracy": 0.9869544804096222, "num_tokens": 84633335.0, "step": 28740 }, { "entropy": 0.08246908560395241, "epoch": 6.700547849399697, "grad_norm": 3.046875, "learning_rate": 4.8524047284243854e-05, "loss": 0.0512, "mean_token_accuracy": 0.9850801050662994, "num_tokens": 84643579.0, "step": 28745 }, { "entropy": 0.08006817158311605, "epoch": 6.701713486420329, "grad_norm": 2.828125, "learning_rate": 4.8523327665319597e-05, "loss": 0.0413, "mean_token_accuracy": 0.9881218016147614, "num_tokens": 84662846.0, "step": 28750 }, { "entropy": 0.0593781216070056, "epoch": 6.702879123440961, "grad_norm": 3.5625, "learning_rate": 4.8522607882020945e-05, "loss": 0.0396, "mean_token_accuracy": 0.987310266494751, "num_tokens": 84691789.0, "step": 28755 }, { "entropy": 0.08961878903210163, "epoch": 6.704044760461592, "grad_norm": 4.875, "learning_rate": 4.852188793435863e-05, "loss": 0.0451, "mean_token_accuracy": 0.9865108668804169, "num_tokens": 84719711.0, "step": 28760 }, { "entropy": 0.051850343402475116, "epoch": 6.705210397482224, "grad_norm": 0.2001953125, "learning_rate": 4.852116782234338e-05, "loss": 0.0189, "mean_token_accuracy": 0.9914988338947296, "num_tokens": 84742705.0, "step": 28765 }, { "entropy": 0.07821713835000992, "epoch": 6.706376034502856, "grad_norm": 1.9375, "learning_rate": 4.852044754598595e-05, "loss": 0.0429, "mean_token_accuracy": 0.9877715647220612, "num_tokens": 84753776.0, "step": 28770 }, { "entropy": 0.08759090341627598, "epoch": 6.7075416715234875, "grad_norm": 1.03125, "learning_rate": 4.8519727105297074e-05, "loss": 0.0473, "mean_token_accuracy": 0.9881977081298828, "num_tokens": 84763735.0, "step": 28775 }, { "entropy": 0.08235547505319118, "epoch": 6.70870730854412, "grad_norm": 2.359375, "learning_rate": 4.851900650028749e-05, "loss": 0.0516, "mean_token_accuracy": 0.9870765507221222, "num_tokens": 84781247.0, "step": 28780 }, { "entropy": 0.07376531232148409, "epoch": 6.709872945564751, "grad_norm": 0.2890625, "learning_rate": 4.8518285730967944e-05, "loss": 0.0458, "mean_token_accuracy": 0.9874989748001098, "num_tokens": 84806872.0, "step": 28785 }, { "entropy": 0.08102515386417508, "epoch": 6.711038582585383, "grad_norm": 0.91796875, "learning_rate": 4.8517564797349185e-05, "loss": 0.0224, "mean_token_accuracy": 0.9898236274719239, "num_tokens": 84834660.0, "step": 28790 }, { "entropy": 0.06574399825185537, "epoch": 6.712204219606015, "grad_norm": 1.6328125, "learning_rate": 4.851684369944197e-05, "loss": 0.0202, "mean_token_accuracy": 0.9922596752643585, "num_tokens": 84849216.0, "step": 28795 }, { "entropy": 0.07711246721446514, "epoch": 6.713369856626646, "grad_norm": 1.3828125, "learning_rate": 4.851612243725703e-05, "loss": 0.0459, "mean_token_accuracy": 0.9878857433795929, "num_tokens": 84861506.0, "step": 28800 }, { "entropy": 0.03763336762785911, "epoch": 6.714535493647278, "grad_norm": 1.5625, "learning_rate": 4.851540101080515e-05, "loss": 0.0149, "mean_token_accuracy": 0.9945106983184815, "num_tokens": 84908241.0, "step": 28805 }, { "entropy": 0.06442044954746962, "epoch": 6.7157011306679095, "grad_norm": 0.3671875, "learning_rate": 4.851467942009706e-05, "loss": 0.0265, "mean_token_accuracy": 0.9908295273780823, "num_tokens": 84928831.0, "step": 28810 }, { "entropy": 0.1289056757465005, "epoch": 6.716866767688542, "grad_norm": 1.5234375, "learning_rate": 4.851395766514355e-05, "loss": 0.229, "mean_token_accuracy": 0.9561670780181885, "num_tokens": 84960498.0, "step": 28815 }, { "entropy": 0.07786722630262374, "epoch": 6.718032404709174, "grad_norm": 2.453125, "learning_rate": 4.851323574595535e-05, "loss": 0.0418, "mean_token_accuracy": 0.9877800464630127, "num_tokens": 84978889.0, "step": 28820 }, { "entropy": 0.06431259289383888, "epoch": 6.719198041729806, "grad_norm": 0.50390625, "learning_rate": 4.8512513662543244e-05, "loss": 0.0261, "mean_token_accuracy": 0.991853529214859, "num_tokens": 85008818.0, "step": 28825 }, { "entropy": 0.05381359262391925, "epoch": 6.720363678750437, "grad_norm": 0.388671875, "learning_rate": 4.8511791414918006e-05, "loss": 0.0148, "mean_token_accuracy": 0.990543258190155, "num_tokens": 85041514.0, "step": 28830 }, { "entropy": 0.0829438241198659, "epoch": 6.721529315771069, "grad_norm": 0.48828125, "learning_rate": 4.851106900309038e-05, "loss": 0.0734, "mean_token_accuracy": 0.9828771650791168, "num_tokens": 85058710.0, "step": 28835 }, { "entropy": 0.07714390307664871, "epoch": 6.7226949527917, "grad_norm": 0.77734375, "learning_rate": 4.851034642707116e-05, "loss": 0.0398, "mean_token_accuracy": 0.9864890038967132, "num_tokens": 85075490.0, "step": 28840 }, { "entropy": 0.07695426233112812, "epoch": 6.7238605898123325, "grad_norm": 0.51953125, "learning_rate": 4.850962368687112e-05, "loss": 0.0572, "mean_token_accuracy": 0.9874246895313263, "num_tokens": 85090486.0, "step": 28845 }, { "entropy": 0.04998069824650884, "epoch": 6.725026226832965, "grad_norm": 2.34375, "learning_rate": 4.850890078250103e-05, "loss": 0.0238, "mean_token_accuracy": 0.9922325253486634, "num_tokens": 85128029.0, "step": 28850 }, { "entropy": 0.0514289460144937, "epoch": 6.726191863853596, "grad_norm": 0.447265625, "learning_rate": 4.850817771397166e-05, "loss": 0.0218, "mean_token_accuracy": 0.9909493029117584, "num_tokens": 85151400.0, "step": 28855 }, { "entropy": 0.0715357482433319, "epoch": 6.727357500874228, "grad_norm": 0.578125, "learning_rate": 4.8507454481293814e-05, "loss": 0.0366, "mean_token_accuracy": 0.9905698835849762, "num_tokens": 85175731.0, "step": 28860 }, { "entropy": 0.07424063235521317, "epoch": 6.728523137894859, "grad_norm": 0.71875, "learning_rate": 4.8506731084478254e-05, "loss": 0.0255, "mean_token_accuracy": 0.9909955024719238, "num_tokens": 85197952.0, "step": 28865 }, { "entropy": 0.12476393207907677, "epoch": 6.729688774915491, "grad_norm": 1.3671875, "learning_rate": 4.850600752353579e-05, "loss": 0.1117, "mean_token_accuracy": 0.975234842300415, "num_tokens": 85218193.0, "step": 28870 }, { "entropy": 0.08583814539015293, "epoch": 6.730854411936123, "grad_norm": 5.53125, "learning_rate": 4.8505283798477195e-05, "loss": 0.0451, "mean_token_accuracy": 0.9879949271678925, "num_tokens": 85229458.0, "step": 28875 }, { "entropy": 0.0823191050440073, "epoch": 6.7320200489567545, "grad_norm": 2.140625, "learning_rate": 4.850455990931327e-05, "loss": 0.0145, "mean_token_accuracy": 0.9918103396892548, "num_tokens": 85259716.0, "step": 28880 }, { "entropy": 0.06291626095771789, "epoch": 6.733185685977387, "grad_norm": 1.0390625, "learning_rate": 4.85038358560548e-05, "loss": 0.0315, "mean_token_accuracy": 0.9903146743774414, "num_tokens": 85270424.0, "step": 28885 }, { "entropy": 0.06514295479282736, "epoch": 6.734351322998019, "grad_norm": 0.74609375, "learning_rate": 4.850311163871259e-05, "loss": 0.0362, "mean_token_accuracy": 0.9864876389503479, "num_tokens": 85288155.0, "step": 28890 }, { "entropy": 0.051016226317733525, "epoch": 6.73551696001865, "grad_norm": 0.80078125, "learning_rate": 4.8502387257297435e-05, "loss": 0.0129, "mean_token_accuracy": 0.9911475658416748, "num_tokens": 85321624.0, "step": 28895 }, { "entropy": 0.0672379924915731, "epoch": 6.736682597039282, "grad_norm": 0.353515625, "learning_rate": 4.8501662711820136e-05, "loss": 0.0431, "mean_token_accuracy": 0.9873587965965271, "num_tokens": 85341463.0, "step": 28900 }, { "entropy": 0.07106356099247932, "epoch": 6.737848234059914, "grad_norm": 1.1640625, "learning_rate": 4.8500938002291494e-05, "loss": 0.0357, "mean_token_accuracy": 0.991501921415329, "num_tokens": 85352796.0, "step": 28905 }, { "entropy": 0.07119618002325297, "epoch": 6.739013871080545, "grad_norm": 0.470703125, "learning_rate": 4.8500213128722326e-05, "loss": 0.0365, "mean_token_accuracy": 0.9895273089408875, "num_tokens": 85368371.0, "step": 28910 }, { "entropy": 0.07297233371064067, "epoch": 6.7401795081011775, "grad_norm": 1.5625, "learning_rate": 4.849948809112344e-05, "loss": 0.0362, "mean_token_accuracy": 0.9880306363105774, "num_tokens": 85386310.0, "step": 28915 }, { "entropy": 0.05042272610589862, "epoch": 6.741345145121809, "grad_norm": 0.5390625, "learning_rate": 4.849876288950563e-05, "loss": 0.0113, "mean_token_accuracy": 0.9967696309089661, "num_tokens": 85415012.0, "step": 28920 }, { "entropy": 0.07025924921035767, "epoch": 6.742510782142441, "grad_norm": 2.859375, "learning_rate": 4.849803752387974e-05, "loss": 0.0318, "mean_token_accuracy": 0.9872465670108795, "num_tokens": 85427889.0, "step": 28925 }, { "entropy": 0.058800591714680196, "epoch": 6.743676419163073, "grad_norm": 0.1767578125, "learning_rate": 4.849731199425655e-05, "loss": 0.0409, "mean_token_accuracy": 0.9868711709976197, "num_tokens": 85453298.0, "step": 28930 }, { "entropy": 0.07147304005920888, "epoch": 6.744842056183704, "grad_norm": 1.3125, "learning_rate": 4.8496586300646905e-05, "loss": 0.0376, "mean_token_accuracy": 0.9878616869449616, "num_tokens": 85469989.0, "step": 28935 }, { "entropy": 0.03608826128765941, "epoch": 6.746007693204336, "grad_norm": 0.19921875, "learning_rate": 4.849586044306162e-05, "loss": 0.0078, "mean_token_accuracy": 0.9953897356986999, "num_tokens": 85507209.0, "step": 28940 }, { "entropy": 0.06290267184376716, "epoch": 6.7471733302249675, "grad_norm": 3.921875, "learning_rate": 4.8495134421511515e-05, "loss": 0.0496, "mean_token_accuracy": 0.9858272433280945, "num_tokens": 85518328.0, "step": 28945 }, { "entropy": 0.07403470184653997, "epoch": 6.7483389672456, "grad_norm": 2.46875, "learning_rate": 4.849440823600742e-05, "loss": 0.0458, "mean_token_accuracy": 0.9867697477340698, "num_tokens": 85533094.0, "step": 28950 }, { "entropy": 0.07181532364338636, "epoch": 6.749504604266232, "grad_norm": 3.359375, "learning_rate": 4.8493681886560154e-05, "loss": 0.0359, "mean_token_accuracy": 0.9885821998119354, "num_tokens": 85557546.0, "step": 28955 }, { "entropy": 0.0670968891121447, "epoch": 6.750670241286863, "grad_norm": 0.30078125, "learning_rate": 4.8492955373180567e-05, "loss": 0.0211, "mean_token_accuracy": 0.992115980386734, "num_tokens": 85583928.0, "step": 28960 }, { "entropy": 0.08538498897105455, "epoch": 6.751835878307495, "grad_norm": 3.4375, "learning_rate": 4.849222869587947e-05, "loss": 0.0511, "mean_token_accuracy": 0.9852666079998016, "num_tokens": 85597053.0, "step": 28965 }, { "entropy": 0.07742054816335439, "epoch": 6.753001515328127, "grad_norm": 2.921875, "learning_rate": 4.849150185466772e-05, "loss": 0.0627, "mean_token_accuracy": 0.9815660178661346, "num_tokens": 85622712.0, "step": 28970 }, { "entropy": 0.0654791722074151, "epoch": 6.754167152348758, "grad_norm": 0.20703125, "learning_rate": 4.849077484955614e-05, "loss": 0.0419, "mean_token_accuracy": 0.9875000894069672, "num_tokens": 85647033.0, "step": 28975 }, { "entropy": 0.062252599932253364, "epoch": 6.75533278936939, "grad_norm": 1.21875, "learning_rate": 4.8490047680555574e-05, "loss": 0.0224, "mean_token_accuracy": 0.9896842062473297, "num_tokens": 85670834.0, "step": 28980 }, { "entropy": 0.0574477035086602, "epoch": 6.7564984263900225, "grad_norm": 0.8671875, "learning_rate": 4.848932034767687e-05, "loss": 0.0186, "mean_token_accuracy": 0.991297823190689, "num_tokens": 85698076.0, "step": 28985 }, { "entropy": 0.06765312943607568, "epoch": 6.757664063410654, "grad_norm": 4.625, "learning_rate": 4.848859285093087e-05, "loss": 0.034, "mean_token_accuracy": 0.9909755229949951, "num_tokens": 85717080.0, "step": 28990 }, { "entropy": 0.07195999156683683, "epoch": 6.758829700431286, "grad_norm": 0.42578125, "learning_rate": 4.848786519032842e-05, "loss": 0.047, "mean_token_accuracy": 0.9886180579662323, "num_tokens": 85737727.0, "step": 28995 }, { "entropy": 0.06367543712258339, "epoch": 6.759995337451917, "grad_norm": 1.2578125, "learning_rate": 4.848713736588038e-05, "loss": 0.0382, "mean_token_accuracy": 0.9904224634170532, "num_tokens": 85752008.0, "step": 29000 }, { "entropy": 0.06906843390315771, "epoch": 6.761160974472549, "grad_norm": 0.99609375, "learning_rate": 4.84864093775976e-05, "loss": 0.0297, "mean_token_accuracy": 0.990409255027771, "num_tokens": 85766147.0, "step": 29005 }, { "entropy": 0.07358511816710234, "epoch": 6.762326611493181, "grad_norm": 4.0, "learning_rate": 4.848568122549092e-05, "loss": 0.0349, "mean_token_accuracy": 0.9889043509960175, "num_tokens": 85779365.0, "step": 29010 }, { "entropy": 0.05689704436808825, "epoch": 6.7634922485138125, "grad_norm": 2.8125, "learning_rate": 4.8484952909571215e-05, "loss": 0.0246, "mean_token_accuracy": 0.9909472405910492, "num_tokens": 85805043.0, "step": 29015 }, { "entropy": 0.061226178891956806, "epoch": 6.764657885534445, "grad_norm": 0.4609375, "learning_rate": 4.848422442984934e-05, "loss": 0.0279, "mean_token_accuracy": 0.9857422173023224, "num_tokens": 85822023.0, "step": 29020 }, { "entropy": 0.08178901560604572, "epoch": 6.765823522555077, "grad_norm": 2.140625, "learning_rate": 4.8483495786336156e-05, "loss": 0.0496, "mean_token_accuracy": 0.9885387241840362, "num_tokens": 85831669.0, "step": 29025 }, { "entropy": 0.06862952103838324, "epoch": 6.766989159575708, "grad_norm": 1.2421875, "learning_rate": 4.848276697904253e-05, "loss": 0.0373, "mean_token_accuracy": 0.989341801404953, "num_tokens": 85854445.0, "step": 29030 }, { "entropy": 0.06690819151699542, "epoch": 6.76815479659634, "grad_norm": 2.359375, "learning_rate": 4.848203800797933e-05, "loss": 0.0267, "mean_token_accuracy": 0.9925275266170501, "num_tokens": 85883012.0, "step": 29035 }, { "entropy": 0.05663586547598243, "epoch": 6.769320433616972, "grad_norm": 0.98828125, "learning_rate": 4.848130887315743e-05, "loss": 0.019, "mean_token_accuracy": 0.9927861154079437, "num_tokens": 85904037.0, "step": 29040 }, { "entropy": 0.06998476311564446, "epoch": 6.770486070637603, "grad_norm": 0.431640625, "learning_rate": 4.848057957458769e-05, "loss": 0.0492, "mean_token_accuracy": 0.9872413158416748, "num_tokens": 85917549.0, "step": 29045 }, { "entropy": 0.07485338505357504, "epoch": 6.771651707658235, "grad_norm": 2.515625, "learning_rate": 4.847985011228099e-05, "loss": 0.032, "mean_token_accuracy": 0.9896779417991638, "num_tokens": 85929221.0, "step": 29050 }, { "entropy": 0.06818428202532231, "epoch": 6.772817344678867, "grad_norm": 0.90625, "learning_rate": 4.847912048624822e-05, "loss": 0.0182, "mean_token_accuracy": 0.9925785660743713, "num_tokens": 85966358.0, "step": 29055 }, { "entropy": 0.05105516081675887, "epoch": 6.773982981699499, "grad_norm": 0.67578125, "learning_rate": 4.847839069650024e-05, "loss": 0.0181, "mean_token_accuracy": 0.9904293179512024, "num_tokens": 85995848.0, "step": 29060 }, { "entropy": 0.05594795113429427, "epoch": 6.775148618720131, "grad_norm": 0.443359375, "learning_rate": 4.847766074304795e-05, "loss": 0.0315, "mean_token_accuracy": 0.9890788733959198, "num_tokens": 86021444.0, "step": 29065 }, { "entropy": 0.2655148051679134, "epoch": 6.776314255740762, "grad_norm": 4.09375, "learning_rate": 4.847693062590223e-05, "loss": 0.4223, "mean_token_accuracy": 0.9431075990200043, "num_tokens": 86041173.0, "step": 29070 }, { "entropy": 0.06743866577744484, "epoch": 6.777479892761394, "grad_norm": 1.109375, "learning_rate": 4.847620034507396e-05, "loss": 0.0372, "mean_token_accuracy": 0.9891158044338226, "num_tokens": 86051811.0, "step": 29075 }, { "entropy": 0.062434398010373114, "epoch": 6.778645529782025, "grad_norm": 0.6015625, "learning_rate": 4.847546990057403e-05, "loss": 0.0236, "mean_token_accuracy": 0.9920885801315308, "num_tokens": 86069985.0, "step": 29080 }, { "entropy": 0.05638897055760026, "epoch": 6.7798111668026575, "grad_norm": 0.53515625, "learning_rate": 4.847473929241334e-05, "loss": 0.0227, "mean_token_accuracy": 0.9927825272083283, "num_tokens": 86100719.0, "step": 29085 }, { "entropy": 0.05632878141477704, "epoch": 6.78097680382329, "grad_norm": 1.6640625, "learning_rate": 4.847400852060278e-05, "loss": 0.0294, "mean_token_accuracy": 0.9924284636974334, "num_tokens": 86124617.0, "step": 29090 }, { "entropy": 0.07136726304888726, "epoch": 6.782142440843921, "grad_norm": 0.3828125, "learning_rate": 4.847327758515324e-05, "loss": 0.0329, "mean_token_accuracy": 0.9870093047618866, "num_tokens": 86147085.0, "step": 29095 }, { "entropy": 0.06836878564208745, "epoch": 6.783308077864553, "grad_norm": 0.63671875, "learning_rate": 4.8472546486075637e-05, "loss": 0.0482, "mean_token_accuracy": 0.9857183873653412, "num_tokens": 86159179.0, "step": 29100 }, { "entropy": 0.06574140526354313, "epoch": 6.784473714885185, "grad_norm": 1.546875, "learning_rate": 4.847181522338086e-05, "loss": 0.0339, "mean_token_accuracy": 0.9897574663162232, "num_tokens": 86176824.0, "step": 29105 }, { "entropy": 0.09539928231388331, "epoch": 6.785639351905816, "grad_norm": 2.296875, "learning_rate": 4.8471083797079814e-05, "loss": 0.0424, "mean_token_accuracy": 0.9886773109436036, "num_tokens": 86190608.0, "step": 29110 }, { "entropy": 0.05910975374281406, "epoch": 6.786804988926448, "grad_norm": 1.765625, "learning_rate": 4.84703522071834e-05, "loss": 0.0246, "mean_token_accuracy": 0.9930814146995545, "num_tokens": 86214225.0, "step": 29115 }, { "entropy": 0.10080632586032152, "epoch": 6.7879706259470804, "grad_norm": 1.9609375, "learning_rate": 4.846962045370255e-05, "loss": 0.0418, "mean_token_accuracy": 0.9877125442028045, "num_tokens": 86233522.0, "step": 29120 }, { "entropy": 0.08956030709668994, "epoch": 6.789136262967712, "grad_norm": 1.203125, "learning_rate": 4.8468888536648146e-05, "loss": 0.0597, "mean_token_accuracy": 0.9848473966121674, "num_tokens": 86252004.0, "step": 29125 }, { "entropy": 0.06908920761197805, "epoch": 6.790301899988344, "grad_norm": 0.95703125, "learning_rate": 4.8468156456031125e-05, "loss": 0.0331, "mean_token_accuracy": 0.987659215927124, "num_tokens": 86267780.0, "step": 29130 }, { "entropy": 0.08533047027885914, "epoch": 6.791467537008975, "grad_norm": 1.9453125, "learning_rate": 4.846742421186238e-05, "loss": 0.0671, "mean_token_accuracy": 0.9829756557941437, "num_tokens": 86277473.0, "step": 29135 }, { "entropy": 0.11114938296377659, "epoch": 6.792633174029607, "grad_norm": 3.796875, "learning_rate": 4.8466691804152856e-05, "loss": 0.1287, "mean_token_accuracy": 0.9729100704193115, "num_tokens": 86298359.0, "step": 29140 }, { "entropy": 0.05975300762802362, "epoch": 6.793798811050239, "grad_norm": 0.828125, "learning_rate": 4.846595923291346e-05, "loss": 0.0355, "mean_token_accuracy": 0.9907340705394745, "num_tokens": 86312783.0, "step": 29145 }, { "entropy": 0.0666638569906354, "epoch": 6.79496444807087, "grad_norm": 0.44140625, "learning_rate": 4.846522649815512e-05, "loss": 0.0402, "mean_token_accuracy": 0.9895487248897552, "num_tokens": 86330191.0, "step": 29150 }, { "entropy": 0.06604398051276802, "epoch": 6.7961300850915025, "grad_norm": 2.4375, "learning_rate": 4.846449359988876e-05, "loss": 0.0348, "mean_token_accuracy": 0.988951587677002, "num_tokens": 86346131.0, "step": 29155 }, { "entropy": 0.07143577151000499, "epoch": 6.797295722112135, "grad_norm": 3.015625, "learning_rate": 4.846376053812531e-05, "loss": 0.0434, "mean_token_accuracy": 0.9879807472229004, "num_tokens": 86357456.0, "step": 29160 }, { "entropy": 0.07275726459920406, "epoch": 6.798461359132766, "grad_norm": 0.5859375, "learning_rate": 4.846302731287569e-05, "loss": 0.0333, "mean_token_accuracy": 0.9879595756530761, "num_tokens": 86374022.0, "step": 29165 }, { "entropy": 0.06351196058094502, "epoch": 6.799626996153398, "grad_norm": 2.796875, "learning_rate": 4.846229392415085e-05, "loss": 0.0215, "mean_token_accuracy": 0.9883199453353881, "num_tokens": 86406372.0, "step": 29170 }, { "entropy": 0.06369423121213913, "epoch": 6.80079263317403, "grad_norm": 0.4765625, "learning_rate": 4.8461560371961726e-05, "loss": 0.0393, "mean_token_accuracy": 0.9890810310840606, "num_tokens": 86428485.0, "step": 29175 }, { "entropy": 0.088828045129776, "epoch": 6.801958270194661, "grad_norm": 2.390625, "learning_rate": 4.8460826656319244e-05, "loss": 0.0467, "mean_token_accuracy": 0.9871364951133728, "num_tokens": 86446053.0, "step": 29180 }, { "entropy": 0.06377274207770825, "epoch": 6.803123907215293, "grad_norm": 1.3359375, "learning_rate": 4.846009277723435e-05, "loss": 0.0247, "mean_token_accuracy": 0.9909086942672729, "num_tokens": 86472828.0, "step": 29185 }, { "entropy": 0.06937651876360178, "epoch": 6.804289544235925, "grad_norm": 0.51953125, "learning_rate": 4.845935873471799e-05, "loss": 0.0465, "mean_token_accuracy": 0.988141006231308, "num_tokens": 86488960.0, "step": 29190 }, { "entropy": 0.052994494885206224, "epoch": 6.805455181256557, "grad_norm": 0.470703125, "learning_rate": 4.84586245287811e-05, "loss": 0.023, "mean_token_accuracy": 0.9916663050651551, "num_tokens": 86524601.0, "step": 29195 }, { "entropy": 0.0682718912139535, "epoch": 6.806620818277189, "grad_norm": 1.8671875, "learning_rate": 4.845789015943464e-05, "loss": 0.0414, "mean_token_accuracy": 0.9885375797748566, "num_tokens": 86543402.0, "step": 29200 }, { "entropy": 0.06436384730041027, "epoch": 6.80778645529782, "grad_norm": 1.53125, "learning_rate": 4.845715562668956e-05, "loss": 0.0251, "mean_token_accuracy": 0.9882299304008484, "num_tokens": 86564820.0, "step": 29205 }, { "entropy": 0.0728614155203104, "epoch": 6.808952092318452, "grad_norm": 0.87890625, "learning_rate": 4.845642093055681e-05, "loss": 0.0372, "mean_token_accuracy": 0.9917413115501403, "num_tokens": 86578411.0, "step": 29210 }, { "entropy": 0.05707103759050369, "epoch": 6.810117729339083, "grad_norm": 1.7578125, "learning_rate": 4.8455686071047334e-05, "loss": 0.0209, "mean_token_accuracy": 0.9903227090835571, "num_tokens": 86598438.0, "step": 29215 }, { "entropy": 0.054839993361383677, "epoch": 6.811283366359715, "grad_norm": 1.6171875, "learning_rate": 4.845495104817211e-05, "loss": 0.0329, "mean_token_accuracy": 0.9889629364013672, "num_tokens": 86618133.0, "step": 29220 }, { "entropy": 0.0673223901540041, "epoch": 6.8124490033803475, "grad_norm": 3.671875, "learning_rate": 4.8454215861942084e-05, "loss": 0.0365, "mean_token_accuracy": 0.9903973639011383, "num_tokens": 86629789.0, "step": 29225 }, { "entropy": 0.05427380716428161, "epoch": 6.813614640400979, "grad_norm": 1.3359375, "learning_rate": 4.845348051236823e-05, "loss": 0.0224, "mean_token_accuracy": 0.9940382957458496, "num_tokens": 86664693.0, "step": 29230 }, { "entropy": 0.06686010584235191, "epoch": 6.814780277421611, "grad_norm": 0.349609375, "learning_rate": 4.84527449994615e-05, "loss": 0.0403, "mean_token_accuracy": 0.9875875532627105, "num_tokens": 86688064.0, "step": 29235 }, { "entropy": 0.07085557524114847, "epoch": 6.815945914442243, "grad_norm": 1.46875, "learning_rate": 4.845200932323287e-05, "loss": 0.0301, "mean_token_accuracy": 0.9859791278839112, "num_tokens": 86702004.0, "step": 29240 }, { "entropy": 0.0595591738820076, "epoch": 6.817111551462874, "grad_norm": 1.09375, "learning_rate": 4.845127348369331e-05, "loss": 0.0311, "mean_token_accuracy": 0.9883405685424804, "num_tokens": 86723765.0, "step": 29245 }, { "entropy": 0.060622527822852136, "epoch": 6.818277188483506, "grad_norm": 1.09375, "learning_rate": 4.845053748085379e-05, "loss": 0.0381, "mean_token_accuracy": 0.9894052445888519, "num_tokens": 86736897.0, "step": 29250 }, { "entropy": 0.06368033736944198, "epoch": 6.819442825504138, "grad_norm": 1.640625, "learning_rate": 4.8449801314725284e-05, "loss": 0.0358, "mean_token_accuracy": 0.9864093840122223, "num_tokens": 86754372.0, "step": 29255 }, { "entropy": 0.05678990054875612, "epoch": 6.82060846252477, "grad_norm": 1.5234375, "learning_rate": 4.844906498531877e-05, "loss": 0.0377, "mean_token_accuracy": 0.9898879945278167, "num_tokens": 86787915.0, "step": 29260 }, { "entropy": 0.0435803915373981, "epoch": 6.821774099545402, "grad_norm": 0.275390625, "learning_rate": 4.8448328492645236e-05, "loss": 0.0175, "mean_token_accuracy": 0.9927274286746979, "num_tokens": 86815853.0, "step": 29265 }, { "entropy": 0.06827406398952007, "epoch": 6.822939736566033, "grad_norm": 1.9609375, "learning_rate": 4.844759183671565e-05, "loss": 0.0418, "mean_token_accuracy": 0.9852350890636444, "num_tokens": 86834030.0, "step": 29270 }, { "entropy": 0.0832410730421543, "epoch": 6.824105373586665, "grad_norm": 6.15625, "learning_rate": 4.8446855017541004e-05, "loss": 0.0632, "mean_token_accuracy": 0.9872022151947022, "num_tokens": 86842714.0, "step": 29275 }, { "entropy": 0.06686999946832657, "epoch": 6.825271010607297, "grad_norm": 0.328125, "learning_rate": 4.844611803513228e-05, "loss": 0.0193, "mean_token_accuracy": 0.9939013659954071, "num_tokens": 86860947.0, "step": 29280 }, { "entropy": 0.053580792341381314, "epoch": 6.826436647627928, "grad_norm": 0.96484375, "learning_rate": 4.844538088950048e-05, "loss": 0.0213, "mean_token_accuracy": 0.9897189140319824, "num_tokens": 86889554.0, "step": 29285 }, { "entropy": 0.0600953180808574, "epoch": 6.8276022846485604, "grad_norm": 1.9609375, "learning_rate": 4.844464358065659e-05, "loss": 0.0439, "mean_token_accuracy": 0.9867989003658295, "num_tokens": 86914705.0, "step": 29290 }, { "entropy": 0.05365408333018422, "epoch": 6.8287679216691926, "grad_norm": 0.2041015625, "learning_rate": 4.8443906108611594e-05, "loss": 0.0169, "mean_token_accuracy": 0.9927012145519256, "num_tokens": 86948313.0, "step": 29295 }, { "entropy": 0.04468016289174557, "epoch": 6.829933558689824, "grad_norm": 0.78125, "learning_rate": 4.84431684733765e-05, "loss": 0.0153, "mean_token_accuracy": 0.9934628009796143, "num_tokens": 86981371.0, "step": 29300 }, { "entropy": 0.060497371945530176, "epoch": 6.831099195710456, "grad_norm": 2.359375, "learning_rate": 4.8442430674962315e-05, "loss": 0.0183, "mean_token_accuracy": 0.9893570840358734, "num_tokens": 87012118.0, "step": 29305 }, { "entropy": 0.05356447799131274, "epoch": 6.832264832731088, "grad_norm": 0.474609375, "learning_rate": 4.844169271338002e-05, "loss": 0.0093, "mean_token_accuracy": 0.9939198553562164, "num_tokens": 87045066.0, "step": 29310 }, { "entropy": 0.08580188415944576, "epoch": 6.833430469751719, "grad_norm": 1.828125, "learning_rate": 4.844095458864063e-05, "loss": 0.042, "mean_token_accuracy": 0.9881276488304138, "num_tokens": 87055976.0, "step": 29315 }, { "entropy": 0.058746106829494237, "epoch": 6.834596106772351, "grad_norm": 0.5078125, "learning_rate": 4.8440216300755156e-05, "loss": 0.0228, "mean_token_accuracy": 0.9936113774776458, "num_tokens": 87085259.0, "step": 29320 }, { "entropy": 0.06592875272035599, "epoch": 6.8357617437929825, "grad_norm": 3.25, "learning_rate": 4.8439477849734596e-05, "loss": 0.0287, "mean_token_accuracy": 0.9858220040798187, "num_tokens": 87107554.0, "step": 29325 }, { "entropy": 0.06077197715640068, "epoch": 6.836927380813615, "grad_norm": 3.15625, "learning_rate": 4.843873923558997e-05, "loss": 0.0373, "mean_token_accuracy": 0.9901536464691162, "num_tokens": 87125825.0, "step": 29330 }, { "entropy": 0.06383555121719837, "epoch": 6.838093017834247, "grad_norm": 2.015625, "learning_rate": 4.843800045833229e-05, "loss": 0.0246, "mean_token_accuracy": 0.992045420408249, "num_tokens": 87148897.0, "step": 29335 }, { "entropy": 0.06727636214345693, "epoch": 6.839258654854878, "grad_norm": 2.796875, "learning_rate": 4.8437261517972565e-05, "loss": 0.0405, "mean_token_accuracy": 0.9885002434253692, "num_tokens": 87177972.0, "step": 29340 }, { "entropy": 0.07472562920302153, "epoch": 6.84042429187551, "grad_norm": 1.75, "learning_rate": 4.843652241452183e-05, "loss": 0.0477, "mean_token_accuracy": 0.9853593826293945, "num_tokens": 87188899.0, "step": 29345 }, { "entropy": 0.045274854823946956, "epoch": 6.841589928896141, "grad_norm": 0.66015625, "learning_rate": 4.8435783147991084e-05, "loss": 0.0232, "mean_token_accuracy": 0.9936028838157653, "num_tokens": 87219880.0, "step": 29350 }, { "entropy": 0.07913060076534748, "epoch": 6.842755565916773, "grad_norm": 0.40625, "learning_rate": 4.8435043718391374e-05, "loss": 0.0466, "mean_token_accuracy": 0.9835161387920379, "num_tokens": 87233980.0, "step": 29355 }, { "entropy": 0.07882125917822122, "epoch": 6.8439212029374055, "grad_norm": 0.92578125, "learning_rate": 4.8434304125733715e-05, "loss": 0.0413, "mean_token_accuracy": 0.9837761163711548, "num_tokens": 87249683.0, "step": 29360 }, { "entropy": 0.08739354386925698, "epoch": 6.845086839958037, "grad_norm": 2.796875, "learning_rate": 4.8433564370029126e-05, "loss": 0.0455, "mean_token_accuracy": 0.9867386996746064, "num_tokens": 87260438.0, "step": 29365 }, { "entropy": 0.09364954633638263, "epoch": 6.846252476978669, "grad_norm": 1.3828125, "learning_rate": 4.843282445128866e-05, "loss": 0.0433, "mean_token_accuracy": 0.9867539525032043, "num_tokens": 87277536.0, "step": 29370 }, { "entropy": 0.09267938621342182, "epoch": 6.847418113999301, "grad_norm": 1.8125, "learning_rate": 4.843208436952333e-05, "loss": 0.0578, "mean_token_accuracy": 0.9815105080604554, "num_tokens": 87287407.0, "step": 29375 }, { "entropy": 0.08081861436367035, "epoch": 6.848583751019932, "grad_norm": 2.1875, "learning_rate": 4.8431344124744174e-05, "loss": 0.0534, "mean_token_accuracy": 0.9844126164913177, "num_tokens": 87297492.0, "step": 29380 }, { "entropy": 0.11397622730582953, "epoch": 6.849749388040564, "grad_norm": 1.71875, "learning_rate": 4.843060371696225e-05, "loss": 0.1607, "mean_token_accuracy": 0.9661115884780884, "num_tokens": 87322709.0, "step": 29385 }, { "entropy": 0.04962232355028391, "epoch": 6.850915025061196, "grad_norm": 0.337890625, "learning_rate": 4.842986314618857e-05, "loss": 0.0259, "mean_token_accuracy": 0.990403151512146, "num_tokens": 87350847.0, "step": 29390 }, { "entropy": 0.0795666191726923, "epoch": 6.8520806620818275, "grad_norm": 1.2109375, "learning_rate": 4.84291224124342e-05, "loss": 0.0304, "mean_token_accuracy": 0.9844847023487091, "num_tokens": 87374215.0, "step": 29395 }, { "entropy": 0.12139319926500321, "epoch": 6.85324629910246, "grad_norm": 0.74609375, "learning_rate": 4.842838151571017e-05, "loss": 0.0829, "mean_token_accuracy": 0.978274130821228, "num_tokens": 87383830.0, "step": 29400 }, { "entropy": 0.09214160200208425, "epoch": 6.854411936123091, "grad_norm": 1.953125, "learning_rate": 4.842764045602754e-05, "loss": 0.0208, "mean_token_accuracy": 0.9909944117069245, "num_tokens": 87410168.0, "step": 29405 }, { "entropy": 0.07607862763106824, "epoch": 6.855577573143723, "grad_norm": 2.5625, "learning_rate": 4.8426899233397346e-05, "loss": 0.0454, "mean_token_accuracy": 0.9829370498657226, "num_tokens": 87428573.0, "step": 29410 }, { "entropy": 0.06585808522067964, "epoch": 6.856743210164355, "grad_norm": 0.412109375, "learning_rate": 4.842615784783066e-05, "loss": 0.0294, "mean_token_accuracy": 0.9886744976043701, "num_tokens": 87450620.0, "step": 29415 }, { "entropy": 0.06280166539363563, "epoch": 6.857908847184986, "grad_norm": 1.6328125, "learning_rate": 4.842541629933852e-05, "loss": 0.0122, "mean_token_accuracy": 0.9930438220500946, "num_tokens": 87491928.0, "step": 29420 }, { "entropy": 0.05158753078430891, "epoch": 6.859074484205618, "grad_norm": 0.77734375, "learning_rate": 4.842467458793199e-05, "loss": 0.0226, "mean_token_accuracy": 0.9912914633750916, "num_tokens": 87512955.0, "step": 29425 }, { "entropy": 0.07368856463581323, "epoch": 6.8602401212262505, "grad_norm": 1.3984375, "learning_rate": 4.842393271362214e-05, "loss": 0.048, "mean_token_accuracy": 0.9881428837776184, "num_tokens": 87527515.0, "step": 29430 }, { "entropy": 0.07875376977026463, "epoch": 6.861405758246882, "grad_norm": 1.2109375, "learning_rate": 4.8423190676420014e-05, "loss": 0.0414, "mean_token_accuracy": 0.988014304637909, "num_tokens": 87537921.0, "step": 29435 }, { "entropy": 0.05999251157045364, "epoch": 6.862571395267514, "grad_norm": 0.333984375, "learning_rate": 4.842244847633669e-05, "loss": 0.0221, "mean_token_accuracy": 0.9940233111381531, "num_tokens": 87554565.0, "step": 29440 }, { "entropy": 0.10110383518040181, "epoch": 6.863737032288146, "grad_norm": 4.40625, "learning_rate": 4.842170611338323e-05, "loss": 0.0724, "mean_token_accuracy": 0.9790929615497589, "num_tokens": 87572406.0, "step": 29445 }, { "entropy": 0.042115158122032884, "epoch": 6.864902669308777, "grad_norm": 0.2421875, "learning_rate": 4.84209635875707e-05, "loss": 0.0132, "mean_token_accuracy": 0.9942797899246216, "num_tokens": 87612902.0, "step": 29450 }, { "entropy": 0.1387539068236947, "epoch": 6.866068306329409, "grad_norm": 7.40625, "learning_rate": 4.8420220898910174e-05, "loss": 0.1285, "mean_token_accuracy": 0.9800481796264648, "num_tokens": 87655268.0, "step": 29455 }, { "entropy": 0.05575822722166777, "epoch": 6.8672339433500404, "grad_norm": 0.40625, "learning_rate": 4.8419478047412736e-05, "loss": 0.0225, "mean_token_accuracy": 0.9901275217533112, "num_tokens": 87683649.0, "step": 29460 }, { "entropy": 0.05922813983634114, "epoch": 6.8683995803706726, "grad_norm": 0.7734375, "learning_rate": 4.841873503308946e-05, "loss": 0.0268, "mean_token_accuracy": 0.9899829208850861, "num_tokens": 87703718.0, "step": 29465 }, { "entropy": 0.09449698030948639, "epoch": 6.869565217391305, "grad_norm": 2.71875, "learning_rate": 4.8417991855951416e-05, "loss": 0.0735, "mean_token_accuracy": 0.9808722853660583, "num_tokens": 87712649.0, "step": 29470 }, { "entropy": 0.07578764576464891, "epoch": 6.870730854411936, "grad_norm": 2.296875, "learning_rate": 4.841724851600969e-05, "loss": 0.0337, "mean_token_accuracy": 0.9895140945911407, "num_tokens": 87727487.0, "step": 29475 }, { "entropy": 0.059785145334899426, "epoch": 6.871896491432568, "grad_norm": 2.0625, "learning_rate": 4.841650501327537e-05, "loss": 0.0436, "mean_token_accuracy": 0.9882297396659852, "num_tokens": 87748033.0, "step": 29480 }, { "entropy": 0.05755780637264252, "epoch": 6.873062128453199, "grad_norm": 1.6953125, "learning_rate": 4.8415761347759546e-05, "loss": 0.0304, "mean_token_accuracy": 0.9886388182640076, "num_tokens": 87765201.0, "step": 29485 }, { "entropy": 0.06052704788744449, "epoch": 6.874227765473831, "grad_norm": 1.671875, "learning_rate": 4.84150175194733e-05, "loss": 0.027, "mean_token_accuracy": 0.9908796191215515, "num_tokens": 87794500.0, "step": 29490 }, { "entropy": 0.058810063265264036, "epoch": 6.875393402494463, "grad_norm": 1.578125, "learning_rate": 4.8414273528427726e-05, "loss": 0.0276, "mean_token_accuracy": 0.9919278442859649, "num_tokens": 87824833.0, "step": 29495 }, { "entropy": 0.05831845905631781, "epoch": 6.876559039515095, "grad_norm": 1.578125, "learning_rate": 4.841352937463392e-05, "loss": 0.0241, "mean_token_accuracy": 0.991849547624588, "num_tokens": 87841593.0, "step": 29500 }, { "entropy": 0.0872832141816616, "epoch": 6.877724676535727, "grad_norm": 3.0625, "learning_rate": 4.841278505810296e-05, "loss": 0.0489, "mean_token_accuracy": 0.9851008772850036, "num_tokens": 87852284.0, "step": 29505 }, { "entropy": 0.0699134774506092, "epoch": 6.878890313556359, "grad_norm": 1.1171875, "learning_rate": 4.841204057884597e-05, "loss": 0.0457, "mean_token_accuracy": 0.9868590772151947, "num_tokens": 87863119.0, "step": 29510 }, { "entropy": 0.0703543471172452, "epoch": 6.88005595057699, "grad_norm": 1.609375, "learning_rate": 4.8411295936874054e-05, "loss": 0.0359, "mean_token_accuracy": 0.9903105676174164, "num_tokens": 87878182.0, "step": 29515 }, { "entropy": 0.08448899425566196, "epoch": 6.881221587597622, "grad_norm": 1.6171875, "learning_rate": 4.8410551132198295e-05, "loss": 0.0394, "mean_token_accuracy": 0.988205897808075, "num_tokens": 87887563.0, "step": 29520 }, { "entropy": 0.06482985820621252, "epoch": 6.882387224618254, "grad_norm": 1.4296875, "learning_rate": 4.840980616482981e-05, "loss": 0.0354, "mean_token_accuracy": 0.9864917457103729, "num_tokens": 87907092.0, "step": 29525 }, { "entropy": 0.0666579614393413, "epoch": 6.8835528616388855, "grad_norm": 1.046875, "learning_rate": 4.84090610347797e-05, "loss": 0.0329, "mean_token_accuracy": 0.9885077834129333, "num_tokens": 87926240.0, "step": 29530 }, { "entropy": 0.06210904810577631, "epoch": 6.884718498659518, "grad_norm": 2.015625, "learning_rate": 4.8408315742059087e-05, "loss": 0.0463, "mean_token_accuracy": 0.9860225260257721, "num_tokens": 87943552.0, "step": 29535 }, { "entropy": 0.05175962019711733, "epoch": 6.885884135680149, "grad_norm": 1.03125, "learning_rate": 4.8407570286679085e-05, "loss": 0.0178, "mean_token_accuracy": 0.995280122756958, "num_tokens": 87963262.0, "step": 29540 }, { "entropy": 0.07738391607999802, "epoch": 6.887049772700781, "grad_norm": 1.4296875, "learning_rate": 4.840682466865079e-05, "loss": 0.0372, "mean_token_accuracy": 0.9886998891830444, "num_tokens": 87982910.0, "step": 29545 }, { "entropy": 0.0720852023921907, "epoch": 6.888215409721413, "grad_norm": 0.212890625, "learning_rate": 4.840607888798535e-05, "loss": 0.0289, "mean_token_accuracy": 0.99115749001503, "num_tokens": 88011529.0, "step": 29550 }, { "entropy": 0.05431947279721498, "epoch": 6.889381046742044, "grad_norm": 2.265625, "learning_rate": 4.840533294469386e-05, "loss": 0.023, "mean_token_accuracy": 0.9919369578361511, "num_tokens": 88041197.0, "step": 29555 }, { "entropy": 0.08043484576046467, "epoch": 6.890546683762676, "grad_norm": 0.80078125, "learning_rate": 4.840458683878745e-05, "loss": 0.0233, "mean_token_accuracy": 0.992747563123703, "num_tokens": 88055226.0, "step": 29560 }, { "entropy": 0.06275735646486283, "epoch": 6.891712320783308, "grad_norm": 1.7578125, "learning_rate": 4.840384057027726e-05, "loss": 0.0407, "mean_token_accuracy": 0.9876458764076232, "num_tokens": 88068457.0, "step": 29565 }, { "entropy": 0.059214035701006654, "epoch": 6.89287795780394, "grad_norm": 2.09375, "learning_rate": 4.84030941391744e-05, "loss": 0.03, "mean_token_accuracy": 0.9914735913276672, "num_tokens": 88102115.0, "step": 29570 }, { "entropy": 0.08597388043999672, "epoch": 6.894043594824572, "grad_norm": 1.7421875, "learning_rate": 4.840234754549001e-05, "loss": 0.0486, "mean_token_accuracy": 0.9860342562198638, "num_tokens": 88121858.0, "step": 29575 }, { "entropy": 0.07845104150474072, "epoch": 6.895209231845204, "grad_norm": 2.796875, "learning_rate": 4.8401600789235215e-05, "loss": 0.0421, "mean_token_accuracy": 0.9856066465377807, "num_tokens": 88138167.0, "step": 29580 }, { "entropy": 0.06135543161071837, "epoch": 6.896374868865835, "grad_norm": 1.7109375, "learning_rate": 4.840085387042116e-05, "loss": 0.0334, "mean_token_accuracy": 0.9912704825401306, "num_tokens": 88161014.0, "step": 29585 }, { "entropy": 0.04807945257052779, "epoch": 6.897540505886467, "grad_norm": 1.8984375, "learning_rate": 4.840010678905898e-05, "loss": 0.0217, "mean_token_accuracy": 0.9900756359100342, "num_tokens": 88187263.0, "step": 29590 }, { "entropy": 0.06505886856466532, "epoch": 6.898706142907098, "grad_norm": 2.984375, "learning_rate": 4.839935954515981e-05, "loss": 0.0498, "mean_token_accuracy": 0.9886844336986542, "num_tokens": 88205357.0, "step": 29595 }, { "entropy": 0.07731727361679078, "epoch": 6.8998717799277305, "grad_norm": 0.271484375, "learning_rate": 4.839861213873479e-05, "loss": 0.035, "mean_token_accuracy": 0.9831252992153168, "num_tokens": 88234551.0, "step": 29600 }, { "entropy": 0.06955102737993002, "epoch": 6.901037416948363, "grad_norm": 0.3828125, "learning_rate": 4.8397864569795074e-05, "loss": 0.0688, "mean_token_accuracy": 0.9834696292877197, "num_tokens": 88248261.0, "step": 29605 }, { "entropy": 0.08642586246132851, "epoch": 6.902203053968994, "grad_norm": 2.34375, "learning_rate": 4.839711683835181e-05, "loss": 0.0707, "mean_token_accuracy": 0.9796038687229156, "num_tokens": 88270591.0, "step": 29610 }, { "entropy": 0.08347761034965515, "epoch": 6.903368690989626, "grad_norm": 1.9296875, "learning_rate": 4.839636894441614e-05, "loss": 0.0355, "mean_token_accuracy": 0.9871561110019684, "num_tokens": 88290667.0, "step": 29615 }, { "entropy": 0.07617175304330885, "epoch": 6.904534328010257, "grad_norm": 1.9921875, "learning_rate": 4.8395620887999224e-05, "loss": 0.0406, "mean_token_accuracy": 0.988088744878769, "num_tokens": 88311904.0, "step": 29620 }, { "entropy": 0.0650593439117074, "epoch": 6.905699965030889, "grad_norm": 2.15625, "learning_rate": 4.839487266911221e-05, "loss": 0.051, "mean_token_accuracy": 0.988483190536499, "num_tokens": 88334225.0, "step": 29625 }, { "entropy": 0.07525872401893138, "epoch": 6.906865602051521, "grad_norm": 1.3359375, "learning_rate": 4.8394124287766254e-05, "loss": 0.0374, "mean_token_accuracy": 0.9889621555805206, "num_tokens": 88356933.0, "step": 29630 }, { "entropy": 0.07050515934824944, "epoch": 6.9080312390721526, "grad_norm": 1.9296875, "learning_rate": 4.8393375743972526e-05, "loss": 0.0343, "mean_token_accuracy": 0.986613517999649, "num_tokens": 88368806.0, "step": 29635 }, { "entropy": 0.04098109249025583, "epoch": 6.909196876092785, "grad_norm": 0.609375, "learning_rate": 4.839262703774218e-05, "loss": 0.0122, "mean_token_accuracy": 0.9954572081565857, "num_tokens": 88396176.0, "step": 29640 }, { "entropy": 0.09802703186869621, "epoch": 6.910362513113417, "grad_norm": 1.6875, "learning_rate": 4.839187816908638e-05, "loss": 0.0434, "mean_token_accuracy": 0.9861961781978608, "num_tokens": 88406007.0, "step": 29645 }, { "entropy": 0.07688620202243328, "epoch": 6.911528150134048, "grad_norm": 1.5625, "learning_rate": 4.839112913801629e-05, "loss": 0.0568, "mean_token_accuracy": 0.9860326409339905, "num_tokens": 88416726.0, "step": 29650 }, { "entropy": 0.06696718856692314, "epoch": 6.91269378715468, "grad_norm": 2.109375, "learning_rate": 4.839037994454309e-05, "loss": 0.0358, "mean_token_accuracy": 0.9892750978469849, "num_tokens": 88436628.0, "step": 29655 }, { "entropy": 0.061067532189190386, "epoch": 6.913859424175312, "grad_norm": 2.1875, "learning_rate": 4.8389630588677934e-05, "loss": 0.0264, "mean_token_accuracy": 0.9902947187423706, "num_tokens": 88455253.0, "step": 29660 }, { "entropy": 0.06965431291610003, "epoch": 6.915025061195943, "grad_norm": 1.515625, "learning_rate": 4.838888107043202e-05, "loss": 0.0269, "mean_token_accuracy": 0.9907572269439697, "num_tokens": 88471514.0, "step": 29665 }, { "entropy": 0.0637282345443964, "epoch": 6.9161906982165755, "grad_norm": 0.89453125, "learning_rate": 4.8388131389816505e-05, "loss": 0.0372, "mean_token_accuracy": 0.9888510942459107, "num_tokens": 88487253.0, "step": 29670 }, { "entropy": 0.06704912669956684, "epoch": 6.917356335237207, "grad_norm": 1.75, "learning_rate": 4.838738154684258e-05, "loss": 0.0458, "mean_token_accuracy": 0.9872386932373047, "num_tokens": 88497642.0, "step": 29675 }, { "entropy": 0.07361556626856328, "epoch": 6.918521972257839, "grad_norm": 2.9375, "learning_rate": 4.838663154152141e-05, "loss": 0.0355, "mean_token_accuracy": 0.9899929702281952, "num_tokens": 88516389.0, "step": 29680 }, { "entropy": 0.07161520551890135, "epoch": 6.919687609278471, "grad_norm": 0.5, "learning_rate": 4.83858813738642e-05, "loss": 0.0284, "mean_token_accuracy": 0.9888733863830567, "num_tokens": 88542247.0, "step": 29685 }, { "entropy": 0.14276986196637154, "epoch": 6.920853246299102, "grad_norm": 2.21875, "learning_rate": 4.838513104388212e-05, "loss": 0.1822, "mean_token_accuracy": 0.9694925010204315, "num_tokens": 88570753.0, "step": 29690 }, { "entropy": 0.03625276731327176, "epoch": 6.922018883319734, "grad_norm": 0.1201171875, "learning_rate": 4.838438055158636e-05, "loss": 0.0113, "mean_token_accuracy": 0.9950872659683228, "num_tokens": 88612661.0, "step": 29695 }, { "entropy": 0.04458741853013635, "epoch": 6.923184520340366, "grad_norm": 0.478515625, "learning_rate": 4.8383629896988126e-05, "loss": 0.0244, "mean_token_accuracy": 0.991793304681778, "num_tokens": 88649892.0, "step": 29700 }, { "entropy": 0.05614535929635167, "epoch": 6.924350157360998, "grad_norm": 0.45703125, "learning_rate": 4.838287908009859e-05, "loss": 0.0321, "mean_token_accuracy": 0.9873059391975403, "num_tokens": 88674555.0, "step": 29705 }, { "entropy": 0.07231198623776436, "epoch": 6.92551579438163, "grad_norm": 1.296875, "learning_rate": 4.8382128100928965e-05, "loss": 0.0338, "mean_token_accuracy": 0.9906602203845978, "num_tokens": 88687291.0, "step": 29710 }, { "entropy": 0.06145105315372348, "epoch": 6.926681431402262, "grad_norm": 0.63671875, "learning_rate": 4.838137695949044e-05, "loss": 0.0335, "mean_token_accuracy": 0.9919077694416046, "num_tokens": 88716467.0, "step": 29715 }, { "entropy": 0.06858117748051881, "epoch": 6.927847068422893, "grad_norm": 2.359375, "learning_rate": 4.8380625655794216e-05, "loss": 0.0487, "mean_token_accuracy": 0.9860714495182037, "num_tokens": 88731304.0, "step": 29720 }, { "entropy": 0.06362678054720164, "epoch": 6.929012705443525, "grad_norm": 1.9453125, "learning_rate": 4.83798741898515e-05, "loss": 0.0281, "mean_token_accuracy": 0.9902177393436432, "num_tokens": 88753419.0, "step": 29725 }, { "entropy": 0.07195475585758686, "epoch": 6.930178342464156, "grad_norm": 0.83984375, "learning_rate": 4.8379122561673496e-05, "loss": 0.0417, "mean_token_accuracy": 0.9873518884181977, "num_tokens": 88762473.0, "step": 29730 }, { "entropy": 0.05631625261157751, "epoch": 6.931343979484788, "grad_norm": 0.1982421875, "learning_rate": 4.837837077127141e-05, "loss": 0.0233, "mean_token_accuracy": 0.9895540356636048, "num_tokens": 88793402.0, "step": 29735 }, { "entropy": 0.08471834966912865, "epoch": 6.9325096165054205, "grad_norm": 2.1875, "learning_rate": 4.8377618818656454e-05, "loss": 0.0406, "mean_token_accuracy": 0.9833233654499054, "num_tokens": 88811724.0, "step": 29740 }, { "entropy": 0.06793053429573774, "epoch": 6.933675253526052, "grad_norm": 2.15625, "learning_rate": 4.837686670383984e-05, "loss": 0.0441, "mean_token_accuracy": 0.9840839326381683, "num_tokens": 88825353.0, "step": 29745 }, { "entropy": 0.06626611463725567, "epoch": 6.934840890546684, "grad_norm": 3.609375, "learning_rate": 4.837611442683279e-05, "loss": 0.0518, "mean_token_accuracy": 0.9845434069633484, "num_tokens": 88835435.0, "step": 29750 }, { "entropy": 0.0747138449922204, "epoch": 6.936006527567315, "grad_norm": 0.4296875, "learning_rate": 4.8375361987646506e-05, "loss": 0.0437, "mean_token_accuracy": 0.9863695621490478, "num_tokens": 88846040.0, "step": 29755 }, { "entropy": 0.05634311148896813, "epoch": 6.937172164587947, "grad_norm": 1.546875, "learning_rate": 4.837460938629222e-05, "loss": 0.0249, "mean_token_accuracy": 0.9908220171928406, "num_tokens": 88880039.0, "step": 29760 }, { "entropy": 0.06870086174458265, "epoch": 6.938337801608579, "grad_norm": 1.71875, "learning_rate": 4.837385662278116e-05, "loss": 0.0392, "mean_token_accuracy": 0.9845155358314515, "num_tokens": 88894671.0, "step": 29765 }, { "entropy": 0.06056760400533676, "epoch": 6.9395034386292105, "grad_norm": 1.234375, "learning_rate": 4.8373103697124535e-05, "loss": 0.0272, "mean_token_accuracy": 0.9929500162601471, "num_tokens": 88918540.0, "step": 29770 }, { "entropy": 0.07818248439580203, "epoch": 6.940669075649843, "grad_norm": 3.6875, "learning_rate": 4.837235060933358e-05, "loss": 0.0348, "mean_token_accuracy": 0.9911899507045746, "num_tokens": 88947902.0, "step": 29775 }, { "entropy": 0.08559078127145767, "epoch": 6.941834712670475, "grad_norm": 2.453125, "learning_rate": 4.837159735941953e-05, "loss": 0.0427, "mean_token_accuracy": 0.988016277551651, "num_tokens": 88959447.0, "step": 29780 }, { "entropy": 0.06198177421465516, "epoch": 6.943000349691106, "grad_norm": 0.29296875, "learning_rate": 4.8370843947393604e-05, "loss": 0.0516, "mean_token_accuracy": 0.9868219137191773, "num_tokens": 88990661.0, "step": 29785 }, { "entropy": 0.11046778000891208, "epoch": 6.944165986711738, "grad_norm": 4.03125, "learning_rate": 4.837009037326705e-05, "loss": 0.0414, "mean_token_accuracy": 0.9880511343479157, "num_tokens": 89009245.0, "step": 29790 }, { "entropy": 0.0634513407945633, "epoch": 6.94533162373237, "grad_norm": 2.0625, "learning_rate": 4.836933663705109e-05, "loss": 0.0465, "mean_token_accuracy": 0.9846070647239685, "num_tokens": 89019910.0, "step": 29795 }, { "entropy": 0.057706226408481595, "epoch": 6.946497260753001, "grad_norm": 1.3828125, "learning_rate": 4.836858273875698e-05, "loss": 0.0395, "mean_token_accuracy": 0.9864011585712433, "num_tokens": 89035451.0, "step": 29800 }, { "entropy": 0.06847372064366937, "epoch": 6.947662897773633, "grad_norm": 1.890625, "learning_rate": 4.836782867839595e-05, "loss": 0.0292, "mean_token_accuracy": 0.9899077951908112, "num_tokens": 89059892.0, "step": 29805 }, { "entropy": 0.16696815257892011, "epoch": 6.948828534794265, "grad_norm": 1.546875, "learning_rate": 4.836707445597925e-05, "loss": 0.3172, "mean_token_accuracy": 0.9391492247581482, "num_tokens": 89083414.0, "step": 29810 }, { "entropy": 0.06408511018380522, "epoch": 6.949994171814897, "grad_norm": 1.21875, "learning_rate": 4.836632007151813e-05, "loss": 0.037, "mean_token_accuracy": 0.9896316051483154, "num_tokens": 89100519.0, "step": 29815 }, { "entropy": 0.06854904610663652, "epoch": 6.951159808835529, "grad_norm": 1.9375, "learning_rate": 4.8365565525023825e-05, "loss": 0.0428, "mean_token_accuracy": 0.987441337108612, "num_tokens": 89115773.0, "step": 29820 }, { "entropy": 0.07351719280704856, "epoch": 6.95232544585616, "grad_norm": 2.09375, "learning_rate": 4.8364810816507596e-05, "loss": 0.0264, "mean_token_accuracy": 0.9904137134552002, "num_tokens": 89140065.0, "step": 29825 }, { "entropy": 0.06502757361158729, "epoch": 6.953491082876792, "grad_norm": 0.3671875, "learning_rate": 4.8364055945980704e-05, "loss": 0.0302, "mean_token_accuracy": 0.9865997076034546, "num_tokens": 89167133.0, "step": 29830 }, { "entropy": 0.058092600852251056, "epoch": 6.954656719897424, "grad_norm": 2.34375, "learning_rate": 4.8363300913454396e-05, "loss": 0.0307, "mean_token_accuracy": 0.9902624368667603, "num_tokens": 89185996.0, "step": 29835 }, { "entropy": 0.07471430338919163, "epoch": 6.9558223569180555, "grad_norm": 2.65625, "learning_rate": 4.836254571893993e-05, "loss": 0.0326, "mean_token_accuracy": 0.9865080058574677, "num_tokens": 89210643.0, "step": 29840 }, { "entropy": 0.06368034984916449, "epoch": 6.956987993938688, "grad_norm": 1.21875, "learning_rate": 4.8361790362448564e-05, "loss": 0.0261, "mean_token_accuracy": 0.9888711094856262, "num_tokens": 89233234.0, "step": 29845 }, { "entropy": 0.06988657917827368, "epoch": 6.95815363095932, "grad_norm": 1.3671875, "learning_rate": 4.836103484399157e-05, "loss": 0.0356, "mean_token_accuracy": 0.9887575566768646, "num_tokens": 89252582.0, "step": 29850 }, { "entropy": 0.05337027087807655, "epoch": 6.959319267979951, "grad_norm": 0.396484375, "learning_rate": 4.836027916358021e-05, "loss": 0.0233, "mean_token_accuracy": 0.9896778404712677, "num_tokens": 89286224.0, "step": 29855 }, { "entropy": 0.06208705846220255, "epoch": 6.960484905000583, "grad_norm": 0.25390625, "learning_rate": 4.835952332122576e-05, "loss": 0.0223, "mean_token_accuracy": 0.9893724799156189, "num_tokens": 89310665.0, "step": 29860 }, { "entropy": 0.07674986347556115, "epoch": 6.961650542021214, "grad_norm": 1.828125, "learning_rate": 4.835876731693948e-05, "loss": 0.0564, "mean_token_accuracy": 0.9860752344131469, "num_tokens": 89323958.0, "step": 29865 }, { "entropy": 0.05415382776409387, "epoch": 6.962816179041846, "grad_norm": 0.47265625, "learning_rate": 4.835801115073264e-05, "loss": 0.0375, "mean_token_accuracy": 0.9895894765853882, "num_tokens": 89336539.0, "step": 29870 }, { "entropy": 0.09419967234134674, "epoch": 6.9639818160624785, "grad_norm": 1.7578125, "learning_rate": 4.8357254822616524e-05, "loss": 0.054, "mean_token_accuracy": 0.9822405338287353, "num_tokens": 89345356.0, "step": 29875 }, { "entropy": 0.08086830032989382, "epoch": 6.96514745308311, "grad_norm": 1.421875, "learning_rate": 4.835649833260242e-05, "loss": 0.0447, "mean_token_accuracy": 0.9842866778373718, "num_tokens": 89363463.0, "step": 29880 }, { "entropy": 0.07272738628089429, "epoch": 6.966313090103742, "grad_norm": 2.53125, "learning_rate": 4.835574168070158e-05, "loss": 0.047, "mean_token_accuracy": 0.9860274016857147, "num_tokens": 89373757.0, "step": 29885 }, { "entropy": 0.06665248805657029, "epoch": 6.967478727124373, "grad_norm": 1.703125, "learning_rate": 4.835498486692531e-05, "loss": 0.0366, "mean_token_accuracy": 0.9895995616912842, "num_tokens": 89402671.0, "step": 29890 }, { "entropy": 0.07039319984614849, "epoch": 6.968644364145005, "grad_norm": 1.2890625, "learning_rate": 4.8354227891284895e-05, "loss": 0.0314, "mean_token_accuracy": 0.9923740804195404, "num_tokens": 89420719.0, "step": 29895 }, { "entropy": 0.07703676130622625, "epoch": 6.969810001165637, "grad_norm": 2.28125, "learning_rate": 4.8353470753791616e-05, "loss": 0.047, "mean_token_accuracy": 0.9882120728492737, "num_tokens": 89441246.0, "step": 29900 }, { "entropy": 0.08545290175825357, "epoch": 6.970975638186268, "grad_norm": 3.5625, "learning_rate": 4.8352713454456755e-05, "loss": 0.045, "mean_token_accuracy": 0.987491762638092, "num_tokens": 89455017.0, "step": 29905 }, { "entropy": 0.08963761646300554, "epoch": 6.9721412752069005, "grad_norm": 1.0390625, "learning_rate": 4.835195599329162e-05, "loss": 0.0597, "mean_token_accuracy": 0.9822163105010986, "num_tokens": 89472799.0, "step": 29910 }, { "entropy": 0.10005468633025885, "epoch": 6.973306912227533, "grad_norm": 1.5078125, "learning_rate": 4.83511983703075e-05, "loss": 0.0376, "mean_token_accuracy": 0.9900958836078644, "num_tokens": 89487215.0, "step": 29915 }, { "entropy": 0.07748437076807022, "epoch": 6.974472549248164, "grad_norm": 2.46875, "learning_rate": 4.8350440585515685e-05, "loss": 0.0452, "mean_token_accuracy": 0.9832747519016266, "num_tokens": 89498468.0, "step": 29920 }, { "entropy": 0.06895052138715982, "epoch": 6.975638186268796, "grad_norm": 2.234375, "learning_rate": 4.834968263892748e-05, "loss": 0.0433, "mean_token_accuracy": 0.9874395668506623, "num_tokens": 89518035.0, "step": 29925 }, { "entropy": 0.06828312175348401, "epoch": 6.976803823289428, "grad_norm": 2.0625, "learning_rate": 4.83489245305542e-05, "loss": 0.0297, "mean_token_accuracy": 0.9890495955944061, "num_tokens": 89546377.0, "step": 29930 }, { "entropy": 0.06735577872022987, "epoch": 6.977969460310059, "grad_norm": 1.7265625, "learning_rate": 4.8348166260407126e-05, "loss": 0.0359, "mean_token_accuracy": 0.9898211777210235, "num_tokens": 89563052.0, "step": 29935 }, { "entropy": 0.07529946230351925, "epoch": 6.979135097330691, "grad_norm": 0.70703125, "learning_rate": 4.834740782849758e-05, "loss": 0.0206, "mean_token_accuracy": 0.9889273405075073, "num_tokens": 89585304.0, "step": 29940 }, { "entropy": 0.06895999712869524, "epoch": 6.980300734351323, "grad_norm": 1.1171875, "learning_rate": 4.8346649234836865e-05, "loss": 0.029, "mean_token_accuracy": 0.9874065518379211, "num_tokens": 89606457.0, "step": 29945 }, { "entropy": 0.07593924328684806, "epoch": 6.981466371371955, "grad_norm": 2.296875, "learning_rate": 4.8345890479436295e-05, "loss": 0.042, "mean_token_accuracy": 0.9867578148841858, "num_tokens": 89618224.0, "step": 29950 }, { "entropy": 0.059674417972564696, "epoch": 6.982632008392587, "grad_norm": 1.25, "learning_rate": 4.834513156230719e-05, "loss": 0.028, "mean_token_accuracy": 0.9930515587329865, "num_tokens": 89636322.0, "step": 29955 }, { "entropy": 0.06889725560322404, "epoch": 6.983797645413218, "grad_norm": 0.3125, "learning_rate": 4.834437248346086e-05, "loss": 0.0429, "mean_token_accuracy": 0.9854258477687836, "num_tokens": 89655119.0, "step": 29960 }, { "entropy": 0.08493516966700554, "epoch": 6.98496328243385, "grad_norm": 1.859375, "learning_rate": 4.8343613242908624e-05, "loss": 0.0396, "mean_token_accuracy": 0.9889319539070129, "num_tokens": 89670553.0, "step": 29965 }, { "entropy": 0.06519560664892196, "epoch": 6.986128919454482, "grad_norm": 2.640625, "learning_rate": 4.834285384066181e-05, "loss": 0.04, "mean_token_accuracy": 0.9877417862415314, "num_tokens": 89690403.0, "step": 29970 }, { "entropy": 0.06538761556148528, "epoch": 6.987294556475113, "grad_norm": 1.0390625, "learning_rate": 4.834209427673173e-05, "loss": 0.0407, "mean_token_accuracy": 0.9880840897560119, "num_tokens": 89702398.0, "step": 29975 }, { "entropy": 0.061143916193395854, "epoch": 6.9884601934957455, "grad_norm": 2.078125, "learning_rate": 4.8341334551129716e-05, "loss": 0.035, "mean_token_accuracy": 0.9870936453342438, "num_tokens": 89724457.0, "step": 29980 }, { "entropy": 0.0721611020155251, "epoch": 6.989625830516378, "grad_norm": 1.2421875, "learning_rate": 4.8340574663867105e-05, "loss": 0.0287, "mean_token_accuracy": 0.9912511527538299, "num_tokens": 89741400.0, "step": 29985 }, { "entropy": 0.07327480353415013, "epoch": 6.990791467537009, "grad_norm": 2.3125, "learning_rate": 4.8339814614955216e-05, "loss": 0.06, "mean_token_accuracy": 0.9843566417694092, "num_tokens": 89751318.0, "step": 29990 }, { "entropy": 0.0823413584381342, "epoch": 6.991957104557641, "grad_norm": 2.140625, "learning_rate": 4.8339054404405384e-05, "loss": 0.0423, "mean_token_accuracy": 0.9876652479171752, "num_tokens": 89762386.0, "step": 29995 }, { "entropy": 0.07767243403941393, "epoch": 6.993122741578272, "grad_norm": 1.9140625, "learning_rate": 4.8338294032228954e-05, "loss": 0.0382, "mean_token_accuracy": 0.9865681767463684, "num_tokens": 89783433.0, "step": 30000 }, { "entropy": 0.058387274667620656, "epoch": 6.994288378598904, "grad_norm": 0.6875, "learning_rate": 4.8337533498437256e-05, "loss": 0.0255, "mean_token_accuracy": 0.9925489664077759, "num_tokens": 89813128.0, "step": 30005 }, { "entropy": 0.05891055092215538, "epoch": 6.995454015619536, "grad_norm": 0.55859375, "learning_rate": 4.833677280304163e-05, "loss": 0.043, "mean_token_accuracy": 0.9888588845729828, "num_tokens": 89827575.0, "step": 30010 }, { "entropy": 0.049707501847296956, "epoch": 6.996619652640168, "grad_norm": 0.8359375, "learning_rate": 4.8336011946053426e-05, "loss": 0.0265, "mean_token_accuracy": 0.992890453338623, "num_tokens": 89855135.0, "step": 30015 }, { "entropy": 0.0824981439858675, "epoch": 6.9977852896608, "grad_norm": 0.734375, "learning_rate": 4.833525092748399e-05, "loss": 0.0532, "mean_token_accuracy": 0.983066338300705, "num_tokens": 89867098.0, "step": 30020 }, { "entropy": 0.08736152742058038, "epoch": 6.998950926681431, "grad_norm": 0.455078125, "learning_rate": 4.8334489747344656e-05, "loss": 0.0337, "mean_token_accuracy": 0.9897234261035919, "num_tokens": 89879929.0, "step": 30025 }, { "entropy": 0.15436643744922346, "epoch": 7.0, "grad_norm": 1.4375, "learning_rate": 4.8333728405646787e-05, "loss": 0.2063, "mean_token_accuracy": 0.9645602371957567, "num_tokens": 89903750.0, "step": 30030 }, { "entropy": 0.056994407624006274, "epoch": 7.001165637020632, "grad_norm": 1.4140625, "learning_rate": 4.8332966902401736e-05, "loss": 0.0151, "mean_token_accuracy": 0.9942268610000611, "num_tokens": 89917618.0, "step": 30035 }, { "entropy": 0.05067940205335617, "epoch": 7.002331274041263, "grad_norm": 1.703125, "learning_rate": 4.833220523762085e-05, "loss": 0.0105, "mean_token_accuracy": 0.9973496615886688, "num_tokens": 89942256.0, "step": 30040 }, { "entropy": 0.05828417530283332, "epoch": 7.003496911061895, "grad_norm": 0.32421875, "learning_rate": 4.833144341131549e-05, "loss": 0.0105, "mean_token_accuracy": 0.9979564428329468, "num_tokens": 89962786.0, "step": 30045 }, { "entropy": 0.03842740654945374, "epoch": 7.0046625480825275, "grad_norm": 0.5390625, "learning_rate": 4.833068142349703e-05, "loss": 0.0043, "mean_token_accuracy": 0.9983078181743622, "num_tokens": 89993518.0, "step": 30050 }, { "entropy": 0.07764594964683055, "epoch": 7.005828185103159, "grad_norm": 1.390625, "learning_rate": 4.8329919274176804e-05, "loss": 0.0146, "mean_token_accuracy": 0.9965053021907806, "num_tokens": 90002798.0, "step": 30055 }, { "entropy": 0.04811737164855003, "epoch": 7.006993822123791, "grad_norm": 0.171875, "learning_rate": 4.83291569633662e-05, "loss": 0.0149, "mean_token_accuracy": 0.9973034262657166, "num_tokens": 90031177.0, "step": 30060 }, { "entropy": 0.05407378124073148, "epoch": 7.008159459144422, "grad_norm": 0.224609375, "learning_rate": 4.832839449107658e-05, "loss": 0.0072, "mean_token_accuracy": 0.9978997349739075, "num_tokens": 90061071.0, "step": 30065 }, { "entropy": 0.051250881422311066, "epoch": 7.009325096165054, "grad_norm": 0.65625, "learning_rate": 4.832763185731931e-05, "loss": 0.0058, "mean_token_accuracy": 0.9979933738708496, "num_tokens": 90092153.0, "step": 30070 }, { "entropy": 0.05817384775727987, "epoch": 7.010490733185686, "grad_norm": 0.3515625, "learning_rate": 4.832686906210576e-05, "loss": 0.0071, "mean_token_accuracy": 0.9981231033802033, "num_tokens": 90107207.0, "step": 30075 }, { "entropy": 0.061513442732393744, "epoch": 7.0116563702063175, "grad_norm": 1.1796875, "learning_rate": 4.83261061054473e-05, "loss": 0.005, "mean_token_accuracy": 0.9992294490337372, "num_tokens": 90120331.0, "step": 30080 }, { "entropy": 0.05991012919694185, "epoch": 7.01282200722695, "grad_norm": 3.09375, "learning_rate": 4.832534298735532e-05, "loss": 0.0233, "mean_token_accuracy": 0.9938737094402313, "num_tokens": 90134757.0, "step": 30085 }, { "entropy": 0.05594187341630459, "epoch": 7.013987644247582, "grad_norm": 1.1640625, "learning_rate": 4.8324579707841206e-05, "loss": 0.0175, "mean_token_accuracy": 0.9964846134185791, "num_tokens": 90148305.0, "step": 30090 }, { "entropy": 0.04562456281855702, "epoch": 7.015153281268213, "grad_norm": 0.296875, "learning_rate": 4.832381626691632e-05, "loss": 0.0042, "mean_token_accuracy": 0.9982622146606446, "num_tokens": 90177205.0, "step": 30095 }, { "entropy": 0.05958506986498833, "epoch": 7.016318918288845, "grad_norm": 1.0859375, "learning_rate": 4.832305266459205e-05, "loss": 0.0142, "mean_token_accuracy": 0.9959238231182098, "num_tokens": 90188118.0, "step": 30100 }, { "entropy": 0.03934578532353043, "epoch": 7.017484555309476, "grad_norm": 0.53515625, "learning_rate": 4.832228890087979e-05, "loss": 0.0085, "mean_token_accuracy": 0.9973318040370941, "num_tokens": 90217972.0, "step": 30105 }, { "entropy": 0.06448184214532375, "epoch": 7.018650192330108, "grad_norm": 1.8984375, "learning_rate": 4.832152497579092e-05, "loss": 0.0131, "mean_token_accuracy": 0.9953521192073822, "num_tokens": 90237568.0, "step": 30110 }, { "entropy": 0.07385947611182928, "epoch": 7.01981582935074, "grad_norm": 1.015625, "learning_rate": 4.8320760889336846e-05, "loss": 0.0063, "mean_token_accuracy": 0.9970715284347534, "num_tokens": 90262706.0, "step": 30115 }, { "entropy": 0.07642426621168852, "epoch": 7.020981466371372, "grad_norm": 1.640625, "learning_rate": 4.8319996641528945e-05, "loss": 0.0144, "mean_token_accuracy": 0.996442312002182, "num_tokens": 90279668.0, "step": 30120 }, { "entropy": 0.05433262949809432, "epoch": 7.022147103392004, "grad_norm": 0.2890625, "learning_rate": 4.831923223237862e-05, "loss": 0.0054, "mean_token_accuracy": 0.9964205861091614, "num_tokens": 90302213.0, "step": 30125 }, { "entropy": 0.05782925793901086, "epoch": 7.023312740412636, "grad_norm": 0.458984375, "learning_rate": 4.831846766189727e-05, "loss": 0.0183, "mean_token_accuracy": 0.995810043811798, "num_tokens": 90323727.0, "step": 30130 }, { "entropy": 0.05381921608932316, "epoch": 7.024478377433267, "grad_norm": 1.046875, "learning_rate": 4.831770293009629e-05, "loss": 0.0072, "mean_token_accuracy": 0.9968258440494537, "num_tokens": 90345446.0, "step": 30135 }, { "entropy": 0.05901498403400183, "epoch": 7.025644014453899, "grad_norm": 0.62109375, "learning_rate": 4.83169380369871e-05, "loss": 0.0191, "mean_token_accuracy": 0.9951477110385895, "num_tokens": 90368135.0, "step": 30140 }, { "entropy": 0.05866720397025347, "epoch": 7.02680965147453, "grad_norm": 1.6328125, "learning_rate": 4.831617298258109e-05, "loss": 0.0148, "mean_token_accuracy": 0.996071708202362, "num_tokens": 90387942.0, "step": 30145 }, { "entropy": 0.05969673302024603, "epoch": 7.0279752884951625, "grad_norm": 0.1318359375, "learning_rate": 4.8315407766889665e-05, "loss": 0.0133, "mean_token_accuracy": 0.9976077020168305, "num_tokens": 90441451.0, "step": 30150 }, { "entropy": 0.05562372365966439, "epoch": 7.029140925515795, "grad_norm": 1.6796875, "learning_rate": 4.8314642389924246e-05, "loss": 0.0118, "mean_token_accuracy": 0.9983762323856353, "num_tokens": 90455069.0, "step": 30155 }, { "entropy": 0.05172123843804002, "epoch": 7.030306562536426, "grad_norm": 0.26171875, "learning_rate": 4.831387685169625e-05, "loss": 0.0075, "mean_token_accuracy": 0.996476697921753, "num_tokens": 90473399.0, "step": 30160 }, { "entropy": 0.04368160245940089, "epoch": 7.031472199557058, "grad_norm": 0.287109375, "learning_rate": 4.831311115221708e-05, "loss": 0.0077, "mean_token_accuracy": 0.998294198513031, "num_tokens": 90500992.0, "step": 30165 }, { "entropy": 0.08033382706344128, "epoch": 7.03263783657769, "grad_norm": 2.203125, "learning_rate": 4.8312345291498154e-05, "loss": 0.0106, "mean_token_accuracy": 0.9958215236663819, "num_tokens": 90514663.0, "step": 30170 }, { "entropy": 0.0787579096853733, "epoch": 7.033803473598321, "grad_norm": 4.125, "learning_rate": 4.83115792695509e-05, "loss": 0.0281, "mean_token_accuracy": 0.9938448548316956, "num_tokens": 90530567.0, "step": 30175 }, { "entropy": 0.05292491652071476, "epoch": 7.034969110618953, "grad_norm": 0.51953125, "learning_rate": 4.831081308638674e-05, "loss": 0.0132, "mean_token_accuracy": 0.9956141471862793, "num_tokens": 90559390.0, "step": 30180 }, { "entropy": 0.05749682649038732, "epoch": 7.0361347476395855, "grad_norm": 0.2890625, "learning_rate": 4.83100467420171e-05, "loss": 0.0096, "mean_token_accuracy": 0.9966318845748902, "num_tokens": 90593190.0, "step": 30185 }, { "entropy": 0.060346757806837556, "epoch": 7.037300384660217, "grad_norm": 0.84375, "learning_rate": 4.8309280236453395e-05, "loss": 0.016, "mean_token_accuracy": 0.9961776256561279, "num_tokens": 90607243.0, "step": 30190 }, { "entropy": 0.0677456783130765, "epoch": 7.038466021680849, "grad_norm": 2.0, "learning_rate": 4.830851356970707e-05, "loss": 0.0147, "mean_token_accuracy": 0.9954315066337586, "num_tokens": 90619100.0, "step": 30195 }, { "entropy": 0.045301594864577056, "epoch": 7.03963165870148, "grad_norm": 1.375, "learning_rate": 4.830774674178955e-05, "loss": 0.0057, "mean_token_accuracy": 0.9975920677185058, "num_tokens": 90659501.0, "step": 30200 }, { "entropy": 0.05319325625896454, "epoch": 7.040797295722112, "grad_norm": 0.71875, "learning_rate": 4.8306979752712264e-05, "loss": 0.016, "mean_token_accuracy": 0.9953602194786072, "num_tokens": 90682812.0, "step": 30205 }, { "entropy": 0.036827477253973485, "epoch": 7.041962932742744, "grad_norm": 1.4375, "learning_rate": 4.830621260248667e-05, "loss": 0.0075, "mean_token_accuracy": 0.9976148486137391, "num_tokens": 90725897.0, "step": 30210 }, { "entropy": 0.06985178105533123, "epoch": 7.043128569763375, "grad_norm": 0.90625, "learning_rate": 4.830544529112418e-05, "loss": 0.0064, "mean_token_accuracy": 0.9991308927536011, "num_tokens": 90745736.0, "step": 30215 }, { "entropy": 0.06130665661767125, "epoch": 7.0442942067840075, "grad_norm": 0.484375, "learning_rate": 4.830467781863625e-05, "loss": 0.018, "mean_token_accuracy": 0.9954739928245544, "num_tokens": 90770570.0, "step": 30220 }, { "entropy": 0.05789151154458523, "epoch": 7.04545984380464, "grad_norm": 1.21875, "learning_rate": 4.830391018503433e-05, "loss": 0.015, "mean_token_accuracy": 0.9938674390316009, "num_tokens": 90783819.0, "step": 30225 }, { "entropy": 0.10048946421593427, "epoch": 7.046625480825271, "grad_norm": 0.73046875, "learning_rate": 4.830314239032985e-05, "loss": 0.0036, "mean_token_accuracy": 0.9974006175994873, "num_tokens": 90802624.0, "step": 30230 }, { "entropy": 0.03963483748957515, "epoch": 7.047791117845903, "grad_norm": 0.349609375, "learning_rate": 4.830237443453427e-05, "loss": 0.005, "mean_token_accuracy": 0.9980258822441102, "num_tokens": 90834972.0, "step": 30235 }, { "entropy": 0.05992593262344599, "epoch": 7.048956754866534, "grad_norm": 0.97265625, "learning_rate": 4.830160631765904e-05, "loss": 0.0133, "mean_token_accuracy": 0.9962817609310151, "num_tokens": 90855001.0, "step": 30240 }, { "entropy": 0.052057741489261386, "epoch": 7.050122391887166, "grad_norm": 0.6640625, "learning_rate": 4.830083803971562e-05, "loss": 0.0037, "mean_token_accuracy": 0.999865609407425, "num_tokens": 90872978.0, "step": 30245 }, { "entropy": 0.05767310978844762, "epoch": 7.051288028907798, "grad_norm": 0.392578125, "learning_rate": 4.830006960071545e-05, "loss": 0.0115, "mean_token_accuracy": 0.9978439092636109, "num_tokens": 90904123.0, "step": 30250 }, { "entropy": 0.24345403034240007, "epoch": 7.05245366592843, "grad_norm": 5.6875, "learning_rate": 4.8299301000670006e-05, "loss": 0.3557, "mean_token_accuracy": 0.9590793073177337, "num_tokens": 90939162.0, "step": 30255 }, { "entropy": 0.08307226374745369, "epoch": 7.053619302949062, "grad_norm": 1.09375, "learning_rate": 4.829853223959073e-05, "loss": 0.0143, "mean_token_accuracy": 0.9957392334938049, "num_tokens": 90956827.0, "step": 30260 }, { "entropy": 0.050349775422364476, "epoch": 7.054784939969694, "grad_norm": 0.59765625, "learning_rate": 4.8297763317489107e-05, "loss": 0.0092, "mean_token_accuracy": 0.9935024440288543, "num_tokens": 90989495.0, "step": 30265 }, { "entropy": 0.0740093344822526, "epoch": 7.055950576990325, "grad_norm": 0.87890625, "learning_rate": 4.829699423437659e-05, "loss": 0.0069, "mean_token_accuracy": 0.9964289367198944, "num_tokens": 91002014.0, "step": 30270 }, { "entropy": 0.04686462339013815, "epoch": 7.057116214010957, "grad_norm": 1.203125, "learning_rate": 4.829622499026465e-05, "loss": 0.0107, "mean_token_accuracy": 0.997639638185501, "num_tokens": 91039458.0, "step": 30275 }, { "entropy": 0.060538587532937525, "epoch": 7.058281851031588, "grad_norm": 1.5078125, "learning_rate": 4.829545558516475e-05, "loss": 0.0121, "mean_token_accuracy": 0.9965390801429749, "num_tokens": 91062900.0, "step": 30280 }, { "entropy": 0.05883708633482456, "epoch": 7.05944748805222, "grad_norm": 4.0625, "learning_rate": 4.8294686019088374e-05, "loss": 0.0164, "mean_token_accuracy": 0.9945460319519043, "num_tokens": 91080738.0, "step": 30285 }, { "entropy": 0.05134750343859196, "epoch": 7.0606131250728525, "grad_norm": 4.40625, "learning_rate": 4.829391629204699e-05, "loss": 0.0223, "mean_token_accuracy": 0.9939786851406097, "num_tokens": 91115245.0, "step": 30290 }, { "entropy": 0.08587418049573899, "epoch": 7.061778762093484, "grad_norm": 1.1953125, "learning_rate": 4.829314640405209e-05, "loss": 0.0136, "mean_token_accuracy": 0.9956977784633636, "num_tokens": 91122351.0, "step": 30295 }, { "entropy": 0.06100914310663939, "epoch": 7.062944399114116, "grad_norm": 2.734375, "learning_rate": 4.829237635511514e-05, "loss": 0.0115, "mean_token_accuracy": 0.9947611093521118, "num_tokens": 91147872.0, "step": 30300 }, { "entropy": 0.07020860947668553, "epoch": 7.064110036134748, "grad_norm": 2.46875, "learning_rate": 4.829160614524762e-05, "loss": 0.0106, "mean_token_accuracy": 0.9977278351783753, "num_tokens": 91161175.0, "step": 30305 }, { "entropy": 0.052560966834425925, "epoch": 7.065275673155379, "grad_norm": 3.484375, "learning_rate": 4.829083577446102e-05, "loss": 0.0178, "mean_token_accuracy": 0.9957786858081817, "num_tokens": 91184931.0, "step": 30310 }, { "entropy": 0.06599666997790336, "epoch": 7.066441310176011, "grad_norm": 3.84375, "learning_rate": 4.829006524276684e-05, "loss": 0.0194, "mean_token_accuracy": 0.9962544083595276, "num_tokens": 91198511.0, "step": 30315 }, { "entropy": 0.061596107203513384, "epoch": 7.067606947196643, "grad_norm": 1.4296875, "learning_rate": 4.8289294550176545e-05, "loss": 0.013, "mean_token_accuracy": 0.994150060415268, "num_tokens": 91219731.0, "step": 30320 }, { "entropy": 0.07121949885040521, "epoch": 7.068772584217275, "grad_norm": 1.453125, "learning_rate": 4.828852369670164e-05, "loss": 0.0177, "mean_token_accuracy": 0.9964398920536042, "num_tokens": 91232224.0, "step": 30325 }, { "entropy": 0.06412322130054235, "epoch": 7.069938221237907, "grad_norm": 0.25390625, "learning_rate": 4.8287752682353626e-05, "loss": 0.0194, "mean_token_accuracy": 0.9957955598831176, "num_tokens": 91248088.0, "step": 30330 }, { "entropy": 0.07131289504468441, "epoch": 7.071103858258538, "grad_norm": 0.65625, "learning_rate": 4.828698150714399e-05, "loss": 0.013, "mean_token_accuracy": 0.9965978026390075, "num_tokens": 91257256.0, "step": 30335 }, { "entropy": 0.07580037731677294, "epoch": 7.07226949527917, "grad_norm": 0.75, "learning_rate": 4.828621017108424e-05, "loss": 0.0295, "mean_token_accuracy": 0.9948435366153717, "num_tokens": 91278451.0, "step": 30340 }, { "entropy": 0.11762396842241288, "epoch": 7.073435132299802, "grad_norm": 0.453125, "learning_rate": 4.8285438674185873e-05, "loss": 0.0959, "mean_token_accuracy": 0.97947016954422, "num_tokens": 91309859.0, "step": 30345 }, { "entropy": 0.05074176751077175, "epoch": 7.074600769320433, "grad_norm": 3.421875, "learning_rate": 4.828466701646039e-05, "loss": 0.0123, "mean_token_accuracy": 0.9963578641414642, "num_tokens": 91336540.0, "step": 30350 }, { "entropy": 0.06217975839972496, "epoch": 7.0757664063410655, "grad_norm": 0.5, "learning_rate": 4.8283895197919304e-05, "loss": 0.0191, "mean_token_accuracy": 0.9958511829376221, "num_tokens": 91347859.0, "step": 30355 }, { "entropy": 0.1270772408694029, "epoch": 7.076932043361698, "grad_norm": 1.4609375, "learning_rate": 4.8283123218574116e-05, "loss": 0.0067, "mean_token_accuracy": 0.9971695721149445, "num_tokens": 91357100.0, "step": 30360 }, { "entropy": 0.056195403542369605, "epoch": 7.078097680382329, "grad_norm": 0.2412109375, "learning_rate": 4.8282351078436345e-05, "loss": 0.0036, "mean_token_accuracy": 0.9992462873458863, "num_tokens": 91391188.0, "step": 30365 }, { "entropy": 0.06651196293532849, "epoch": 7.079263317402961, "grad_norm": 0.353515625, "learning_rate": 4.82815787775175e-05, "loss": 0.0132, "mean_token_accuracy": 0.9964940130710602, "num_tokens": 91402433.0, "step": 30370 }, { "entropy": 0.05127828456461429, "epoch": 7.080428954423592, "grad_norm": 1.3203125, "learning_rate": 4.82808063158291e-05, "loss": 0.0057, "mean_token_accuracy": 0.9980728805065155, "num_tokens": 91430005.0, "step": 30375 }, { "entropy": 0.05886371675878763, "epoch": 7.081594591444224, "grad_norm": 0.294921875, "learning_rate": 4.8280033693382664e-05, "loss": 0.0096, "mean_token_accuracy": 0.997169828414917, "num_tokens": 91456501.0, "step": 30380 }, { "entropy": 0.06507110595703125, "epoch": 7.082760228464856, "grad_norm": 0.85546875, "learning_rate": 4.827926091018971e-05, "loss": 0.0115, "mean_token_accuracy": 0.995796662569046, "num_tokens": 91476101.0, "step": 30385 }, { "entropy": 0.062191806919872764, "epoch": 7.0839258654854875, "grad_norm": 0.53125, "learning_rate": 4.8278487966261765e-05, "loss": 0.0039, "mean_token_accuracy": 0.9979954600334168, "num_tokens": 91514310.0, "step": 30390 }, { "entropy": 0.058631454780697825, "epoch": 7.08509150250612, "grad_norm": 1.2265625, "learning_rate": 4.827771486161035e-05, "loss": 0.0115, "mean_token_accuracy": 0.9947159767150879, "num_tokens": 91537150.0, "step": 30395 }, { "entropy": 0.06342284716665744, "epoch": 7.086257139526752, "grad_norm": 0.435546875, "learning_rate": 4.8276941596246994e-05, "loss": 0.0179, "mean_token_accuracy": 0.994976383447647, "num_tokens": 91558164.0, "step": 30400 }, { "entropy": 0.0707466502673924, "epoch": 7.087422776547383, "grad_norm": 1.625, "learning_rate": 4.8276168170183233e-05, "loss": 0.0087, "mean_token_accuracy": 0.998279196023941, "num_tokens": 91586951.0, "step": 30405 }, { "entropy": 0.05743299555033445, "epoch": 7.088588413568015, "grad_norm": 1.296875, "learning_rate": 4.8275394583430594e-05, "loss": 0.0123, "mean_token_accuracy": 0.9970780968666076, "num_tokens": 91619188.0, "step": 30410 }, { "entropy": 0.05649949889630079, "epoch": 7.089754050588646, "grad_norm": 0.375, "learning_rate": 4.8274620836000616e-05, "loss": 0.0058, "mean_token_accuracy": 0.9983665943145752, "num_tokens": 91636413.0, "step": 30415 }, { "entropy": 0.05755188856273889, "epoch": 7.090919687609278, "grad_norm": 1.1171875, "learning_rate": 4.827384692790484e-05, "loss": 0.005, "mean_token_accuracy": 0.9977169036865234, "num_tokens": 91657970.0, "step": 30420 }, { "entropy": 0.0699540264904499, "epoch": 7.0920853246299105, "grad_norm": 0.275390625, "learning_rate": 4.8273072859154796e-05, "loss": 0.0077, "mean_token_accuracy": 0.9979713082313537, "num_tokens": 91684805.0, "step": 30425 }, { "entropy": 0.060055936314165594, "epoch": 7.093250961650542, "grad_norm": 0.921875, "learning_rate": 4.8272298629762033e-05, "loss": 0.0112, "mean_token_accuracy": 0.998055511713028, "num_tokens": 91712342.0, "step": 30430 }, { "entropy": 0.0625823263078928, "epoch": 7.094416598671174, "grad_norm": 2.96875, "learning_rate": 4.827152423973809e-05, "loss": 0.0215, "mean_token_accuracy": 0.995892733335495, "num_tokens": 91722750.0, "step": 30435 }, { "entropy": 0.13746389281004667, "epoch": 7.095582235691806, "grad_norm": 0.53515625, "learning_rate": 4.827074968909453e-05, "loss": 0.1425, "mean_token_accuracy": 0.9803636312484741, "num_tokens": 91753933.0, "step": 30440 }, { "entropy": 0.07929829806089402, "epoch": 7.096747872712437, "grad_norm": 1.4453125, "learning_rate": 4.826997497784289e-05, "loss": 0.0238, "mean_token_accuracy": 0.9956888616085052, "num_tokens": 91775313.0, "step": 30445 }, { "entropy": 0.06876232139766217, "epoch": 7.097913509733069, "grad_norm": 2.796875, "learning_rate": 4.826920010599472e-05, "loss": 0.016, "mean_token_accuracy": 0.996614670753479, "num_tokens": 91799765.0, "step": 30450 }, { "entropy": 0.07557164933532476, "epoch": 7.099079146753701, "grad_norm": 0.65234375, "learning_rate": 4.8268425073561574e-05, "loss": 0.0144, "mean_token_accuracy": 0.9956696331501007, "num_tokens": 91812729.0, "step": 30455 }, { "entropy": 0.07304494511336088, "epoch": 7.1002447837743325, "grad_norm": 0.81640625, "learning_rate": 4.826764988055502e-05, "loss": 0.0063, "mean_token_accuracy": 0.9968676388263702, "num_tokens": 91834090.0, "step": 30460 }, { "entropy": 0.062426519207656385, "epoch": 7.101410420794965, "grad_norm": 0.26953125, "learning_rate": 4.82668745269866e-05, "loss": 0.0077, "mean_token_accuracy": 0.9980383157730103, "num_tokens": 91849929.0, "step": 30465 }, { "entropy": 0.09062138125300408, "epoch": 7.102576057815596, "grad_norm": 3.3125, "learning_rate": 4.826609901286791e-05, "loss": 0.0194, "mean_token_accuracy": 0.9943084239959716, "num_tokens": 91859970.0, "step": 30470 }, { "entropy": 0.056807669810950756, "epoch": 7.103741694836228, "grad_norm": 0.2578125, "learning_rate": 4.826532333821047e-05, "loss": 0.0083, "mean_token_accuracy": 0.9982002913951874, "num_tokens": 91886469.0, "step": 30475 }, { "entropy": 0.04765348536893725, "epoch": 7.10490733185686, "grad_norm": 0.3515625, "learning_rate": 4.826454750302587e-05, "loss": 0.0137, "mean_token_accuracy": 0.9960334420204162, "num_tokens": 91914766.0, "step": 30480 }, { "entropy": 0.08689562901854515, "epoch": 7.106072968877491, "grad_norm": 0.80859375, "learning_rate": 4.8263771507325685e-05, "loss": 0.0131, "mean_token_accuracy": 0.9975087523460389, "num_tokens": 91943211.0, "step": 30485 }, { "entropy": 0.0745643438771367, "epoch": 7.107238605898123, "grad_norm": 1.328125, "learning_rate": 4.826299535112147e-05, "loss": 0.0079, "mean_token_accuracy": 0.9984472215175628, "num_tokens": 91961538.0, "step": 30490 }, { "entropy": 0.06446758769452572, "epoch": 7.1084042429187555, "grad_norm": 0.88671875, "learning_rate": 4.826221903442481e-05, "loss": 0.0118, "mean_token_accuracy": 0.9975184500217438, "num_tokens": 91970955.0, "step": 30495 }, { "entropy": 0.042829998023808005, "epoch": 7.109569879939387, "grad_norm": 0.90625, "learning_rate": 4.826144255724727e-05, "loss": 0.0091, "mean_token_accuracy": 0.9957435607910157, "num_tokens": 92002665.0, "step": 30500 }, { "entropy": 0.08332742396742106, "epoch": 7.110735516960019, "grad_norm": 0.6640625, "learning_rate": 4.8260665919600436e-05, "loss": 0.0766, "mean_token_accuracy": 0.9868705093860626, "num_tokens": 92028846.0, "step": 30505 }, { "entropy": 0.05979088693857193, "epoch": 7.11190115398065, "grad_norm": 0.66015625, "learning_rate": 4.82598891214959e-05, "loss": 0.0177, "mean_token_accuracy": 0.9947042167186737, "num_tokens": 92044394.0, "step": 30510 }, { "entropy": 0.06625119373202323, "epoch": 7.113066791001282, "grad_norm": 0.306640625, "learning_rate": 4.8259112162945225e-05, "loss": 0.0181, "mean_token_accuracy": 0.9953732848167419, "num_tokens": 92058160.0, "step": 30515 }, { "entropy": 0.06746600233018399, "epoch": 7.114232428021914, "grad_norm": 0.3203125, "learning_rate": 4.825833504396e-05, "loss": 0.0099, "mean_token_accuracy": 0.9972993493080139, "num_tokens": 92081803.0, "step": 30520 }, { "entropy": 0.04053811193443835, "epoch": 7.1153980650425455, "grad_norm": 0.392578125, "learning_rate": 4.8257557764551826e-05, "loss": 0.0046, "mean_token_accuracy": 0.9975557744503021, "num_tokens": 92119868.0, "step": 30525 }, { "entropy": 0.05157309314236045, "epoch": 7.116563702063178, "grad_norm": 0.1845703125, "learning_rate": 4.825678032473229e-05, "loss": 0.0084, "mean_token_accuracy": 0.9957024157047272, "num_tokens": 92153375.0, "step": 30530 }, { "entropy": 0.06940113585442305, "epoch": 7.11772933908381, "grad_norm": 1.0859375, "learning_rate": 4.8256002724512964e-05, "loss": 0.0162, "mean_token_accuracy": 0.9951586663722992, "num_tokens": 92172516.0, "step": 30535 }, { "entropy": 0.07014297656714916, "epoch": 7.118894976104441, "grad_norm": 6.40625, "learning_rate": 4.825522496390547e-05, "loss": 0.0181, "mean_token_accuracy": 0.9962278366088867, "num_tokens": 92182097.0, "step": 30540 }, { "entropy": 0.052831696905195714, "epoch": 7.120060613125073, "grad_norm": 2.0625, "learning_rate": 4.8254447042921394e-05, "loss": 0.01, "mean_token_accuracy": 0.9970639288425446, "num_tokens": 92202430.0, "step": 30545 }, { "entropy": 0.061360220145434144, "epoch": 7.121226250145704, "grad_norm": 1.3203125, "learning_rate": 4.825366896157234e-05, "loss": 0.0063, "mean_token_accuracy": 0.9982588768005372, "num_tokens": 92225929.0, "step": 30550 }, { "entropy": 0.07295540943741799, "epoch": 7.122391887166336, "grad_norm": 0.890625, "learning_rate": 4.82528907198699e-05, "loss": 0.0172, "mean_token_accuracy": 0.9961545705795288, "num_tokens": 92242351.0, "step": 30555 }, { "entropy": 0.0615655729547143, "epoch": 7.123557524186968, "grad_norm": 0.78125, "learning_rate": 4.825211231782569e-05, "loss": 0.01, "mean_token_accuracy": 0.9978924512863159, "num_tokens": 92256604.0, "step": 30560 }, { "entropy": 0.07092278450727463, "epoch": 7.1247231612076, "grad_norm": 2.9375, "learning_rate": 4.825133375545132e-05, "loss": 0.0163, "mean_token_accuracy": 0.9958444178104401, "num_tokens": 92267376.0, "step": 30565 }, { "entropy": 0.05723753683269024, "epoch": 7.125888798228232, "grad_norm": 1.5625, "learning_rate": 4.825055503275838e-05, "loss": 0.0065, "mean_token_accuracy": 0.9972610890865325, "num_tokens": 92291685.0, "step": 30570 }, { "entropy": 0.056301530078053476, "epoch": 7.127054435248864, "grad_norm": 1.6796875, "learning_rate": 4.82497761497585e-05, "loss": 0.0059, "mean_token_accuracy": 0.9979709684848785, "num_tokens": 92316387.0, "step": 30575 }, { "entropy": 0.11123329075053334, "epoch": 7.128220072269495, "grad_norm": 7.5625, "learning_rate": 4.824899710646329e-05, "loss": 0.0204, "mean_token_accuracy": 0.9949623763561248, "num_tokens": 92329740.0, "step": 30580 }, { "entropy": 0.061691082268953326, "epoch": 7.129385709290127, "grad_norm": 1.3046875, "learning_rate": 4.824821790288437e-05, "loss": 0.0169, "mean_token_accuracy": 0.9934255063533783, "num_tokens": 92342801.0, "step": 30585 }, { "entropy": 0.037779290787875654, "epoch": 7.130551346310758, "grad_norm": 0.3671875, "learning_rate": 4.824743853903335e-05, "loss": 0.0083, "mean_token_accuracy": 0.9969915330410004, "num_tokens": 92373646.0, "step": 30590 }, { "entropy": 0.06015878664329648, "epoch": 7.1317169833313905, "grad_norm": 2.234375, "learning_rate": 4.8246659014921855e-05, "loss": 0.0133, "mean_token_accuracy": 0.9963972091674804, "num_tokens": 92387023.0, "step": 30595 }, { "entropy": 0.09237681282684207, "epoch": 7.132882620352023, "grad_norm": 0.55859375, "learning_rate": 4.8245879330561514e-05, "loss": 0.0115, "mean_token_accuracy": 0.994150698184967, "num_tokens": 92408623.0, "step": 30600 }, { "entropy": 0.08946602549403906, "epoch": 7.134048257372654, "grad_norm": 1.90625, "learning_rate": 4.8245099485963944e-05, "loss": 0.0068, "mean_token_accuracy": 0.9951623618602753, "num_tokens": 92419455.0, "step": 30605 }, { "entropy": 0.08082111086696386, "epoch": 7.135213894393286, "grad_norm": 0.73828125, "learning_rate": 4.824431948114079e-05, "loss": 0.0116, "mean_token_accuracy": 0.9975998342037201, "num_tokens": 92429612.0, "step": 30610 }, { "entropy": 0.06038438286632299, "epoch": 7.136379531413918, "grad_norm": 0.341796875, "learning_rate": 4.8243539316103656e-05, "loss": 0.0062, "mean_token_accuracy": 0.9981464624404908, "num_tokens": 92456992.0, "step": 30615 }, { "entropy": 0.0631004961207509, "epoch": 7.137545168434549, "grad_norm": 1.1875, "learning_rate": 4.82427589908642e-05, "loss": 0.012, "mean_token_accuracy": 0.9945440351963043, "num_tokens": 92473120.0, "step": 30620 }, { "entropy": 0.06316535836085677, "epoch": 7.138710805455181, "grad_norm": 1.796875, "learning_rate": 4.8241978505434056e-05, "loss": 0.0218, "mean_token_accuracy": 0.9941232442855835, "num_tokens": 92494820.0, "step": 30625 }, { "entropy": 0.05482297632843256, "epoch": 7.139876442475813, "grad_norm": 0.388671875, "learning_rate": 4.824119785982485e-05, "loss": 0.0082, "mean_token_accuracy": 0.9979051113128662, "num_tokens": 92519947.0, "step": 30630 }, { "entropy": 0.04249793980270624, "epoch": 7.141042079496445, "grad_norm": 1.8515625, "learning_rate": 4.824041705404822e-05, "loss": 0.0073, "mean_token_accuracy": 0.9972292304039001, "num_tokens": 92551694.0, "step": 30635 }, { "entropy": 0.052819710597395896, "epoch": 7.142207716517077, "grad_norm": 1.2421875, "learning_rate": 4.823963608811583e-05, "loss": 0.0045, "mean_token_accuracy": 0.99895738363266, "num_tokens": 92585379.0, "step": 30640 }, { "entropy": 0.06095590833574534, "epoch": 7.143373353537708, "grad_norm": 2.8125, "learning_rate": 4.823885496203931e-05, "loss": 0.0138, "mean_token_accuracy": 0.9947236835956573, "num_tokens": 92605361.0, "step": 30645 }, { "entropy": 0.07070263214409352, "epoch": 7.14453899055834, "grad_norm": 2.90625, "learning_rate": 4.82380736758303e-05, "loss": 0.0223, "mean_token_accuracy": 0.9947317957878112, "num_tokens": 92614248.0, "step": 30650 }, { "entropy": 0.04973842538893223, "epoch": 7.145704627578972, "grad_norm": 0.32421875, "learning_rate": 4.823729222950047e-05, "loss": 0.0057, "mean_token_accuracy": 0.9984661042690277, "num_tokens": 92642526.0, "step": 30655 }, { "entropy": 0.05830220179632306, "epoch": 7.146870264599603, "grad_norm": 2.765625, "learning_rate": 4.823651062306146e-05, "loss": 0.0081, "mean_token_accuracy": 0.9965169250965118, "num_tokens": 92673422.0, "step": 30660 }, { "entropy": 0.05718004386872053, "epoch": 7.1480359016202355, "grad_norm": 0.234375, "learning_rate": 4.8235728856524934e-05, "loss": 0.0093, "mean_token_accuracy": 0.9970381677150726, "num_tokens": 92694855.0, "step": 30665 }, { "entropy": 0.07355883046984672, "epoch": 7.149201538640868, "grad_norm": 0.265625, "learning_rate": 4.823494692990254e-05, "loss": 0.0143, "mean_token_accuracy": 0.995150500535965, "num_tokens": 92706677.0, "step": 30670 }, { "entropy": 0.07787001654505729, "epoch": 7.150367175661499, "grad_norm": 0.9453125, "learning_rate": 4.823416484320594e-05, "loss": 0.0156, "mean_token_accuracy": 0.9955798864364624, "num_tokens": 92728270.0, "step": 30675 }, { "entropy": 0.07409054469317197, "epoch": 7.151532812682131, "grad_norm": 0.482421875, "learning_rate": 4.82333825964468e-05, "loss": 0.0127, "mean_token_accuracy": 0.9979703485965729, "num_tokens": 92741655.0, "step": 30680 }, { "entropy": 0.06822916008532047, "epoch": 7.152698449702762, "grad_norm": 6.5, "learning_rate": 4.8232600189636775e-05, "loss": 0.0165, "mean_token_accuracy": 0.995071417093277, "num_tokens": 92769257.0, "step": 30685 }, { "entropy": 0.07879522778093814, "epoch": 7.153864086723394, "grad_norm": 1.453125, "learning_rate": 4.823181762278754e-05, "loss": 0.0063, "mean_token_accuracy": 0.9965352594852448, "num_tokens": 92786415.0, "step": 30690 }, { "entropy": 0.06394462883472443, "epoch": 7.155029723744026, "grad_norm": 2.765625, "learning_rate": 4.8231034895910766e-05, "loss": 0.0059, "mean_token_accuracy": 0.9973495721817016, "num_tokens": 92813736.0, "step": 30695 }, { "entropy": 0.07654921635985375, "epoch": 7.156195360764658, "grad_norm": 1.890625, "learning_rate": 4.8230252009018116e-05, "loss": 0.0154, "mean_token_accuracy": 0.9948624432086944, "num_tokens": 92825599.0, "step": 30700 }, { "entropy": 0.07231663931161166, "epoch": 7.15736099778529, "grad_norm": 1.4765625, "learning_rate": 4.8229468962121274e-05, "loss": 0.0146, "mean_token_accuracy": 0.9950795829296112, "num_tokens": 92837209.0, "step": 30705 }, { "entropy": 0.05991370286792517, "epoch": 7.158526634805922, "grad_norm": 4.4375, "learning_rate": 4.8228685755231916e-05, "loss": 0.0072, "mean_token_accuracy": 0.997044563293457, "num_tokens": 92852078.0, "step": 30710 }, { "entropy": 0.07421232592314482, "epoch": 7.159692271826553, "grad_norm": 0.248046875, "learning_rate": 4.822790238836171e-05, "loss": 0.0123, "mean_token_accuracy": 0.9958754122257233, "num_tokens": 92876834.0, "step": 30715 }, { "entropy": 0.05952296406030655, "epoch": 7.160857908847185, "grad_norm": 2.953125, "learning_rate": 4.822711886152235e-05, "loss": 0.0143, "mean_token_accuracy": 0.9964723646640777, "num_tokens": 92908025.0, "step": 30720 }, { "entropy": 0.06198753379285336, "epoch": 7.162023545867816, "grad_norm": 0.271484375, "learning_rate": 4.82263351747255e-05, "loss": 0.0111, "mean_token_accuracy": 0.9940212309360504, "num_tokens": 92935393.0, "step": 30725 }, { "entropy": 0.05733865359798074, "epoch": 7.163189182888448, "grad_norm": 0.296875, "learning_rate": 4.8225551327982874e-05, "loss": 0.0128, "mean_token_accuracy": 0.9962757170200348, "num_tokens": 92975482.0, "step": 30730 }, { "entropy": 0.07880000211298466, "epoch": 7.1643548199090805, "grad_norm": 3.625, "learning_rate": 4.8224767321306135e-05, "loss": 0.0196, "mean_token_accuracy": 0.9946611404418946, "num_tokens": 92983434.0, "step": 30735 }, { "entropy": 0.0455553256906569, "epoch": 7.165520456929712, "grad_norm": 0.306640625, "learning_rate": 4.822398315470699e-05, "loss": 0.0061, "mean_token_accuracy": 0.9972747147083283, "num_tokens": 93012570.0, "step": 30740 }, { "entropy": 0.06350925117731095, "epoch": 7.166686093950344, "grad_norm": 0.765625, "learning_rate": 4.822319882819713e-05, "loss": 0.0207, "mean_token_accuracy": 0.9927310764789581, "num_tokens": 93026257.0, "step": 30745 }, { "entropy": 0.07950380994006992, "epoch": 7.167851730970976, "grad_norm": 1.0234375, "learning_rate": 4.8222414341788236e-05, "loss": 0.0097, "mean_token_accuracy": 0.9974476456642151, "num_tokens": 93038257.0, "step": 30750 }, { "entropy": 0.07047421857714653, "epoch": 7.169017367991607, "grad_norm": 2.09375, "learning_rate": 4.822162969549202e-05, "loss": 0.0133, "mean_token_accuracy": 0.9962194740772248, "num_tokens": 93051971.0, "step": 30755 }, { "entropy": 0.07937701903283596, "epoch": 7.170183005012239, "grad_norm": 0.65234375, "learning_rate": 4.8220844889320184e-05, "loss": 0.0227, "mean_token_accuracy": 0.9954998016357421, "num_tokens": 93061600.0, "step": 30760 }, { "entropy": 0.06413892963901162, "epoch": 7.171348642032871, "grad_norm": 0.341796875, "learning_rate": 4.822005992328442e-05, "loss": 0.0243, "mean_token_accuracy": 0.9955684006214142, "num_tokens": 93086529.0, "step": 30765 }, { "entropy": 0.05696111330762506, "epoch": 7.172514279053503, "grad_norm": 0.8125, "learning_rate": 4.821927479739645e-05, "loss": 0.0105, "mean_token_accuracy": 0.9965245842933654, "num_tokens": 93106437.0, "step": 30770 }, { "entropy": 0.0651092673651874, "epoch": 7.173679916074135, "grad_norm": 1.34375, "learning_rate": 4.821848951166796e-05, "loss": 0.0072, "mean_token_accuracy": 0.9980071961879731, "num_tokens": 93128982.0, "step": 30775 }, { "entropy": 0.04683809150010347, "epoch": 7.174845553094766, "grad_norm": 1.734375, "learning_rate": 4.821770406611067e-05, "loss": 0.0073, "mean_token_accuracy": 0.9978996217250824, "num_tokens": 93156822.0, "step": 30780 }, { "entropy": 0.09004619605839252, "epoch": 7.176011190115398, "grad_norm": 2.71875, "learning_rate": 4.821691846073629e-05, "loss": 0.0213, "mean_token_accuracy": 0.9955195188522339, "num_tokens": 93164597.0, "step": 30785 }, { "entropy": 0.0713108105584979, "epoch": 7.17717682713603, "grad_norm": 0.90625, "learning_rate": 4.821613269555654e-05, "loss": 0.0093, "mean_token_accuracy": 0.996299684047699, "num_tokens": 93185720.0, "step": 30790 }, { "entropy": 0.08426001332700253, "epoch": 7.178342464156661, "grad_norm": 3.78125, "learning_rate": 4.821534677058314e-05, "loss": 0.0268, "mean_token_accuracy": 0.9953032076358795, "num_tokens": 93202802.0, "step": 30795 }, { "entropy": 0.05344439307227731, "epoch": 7.179508101177293, "grad_norm": 2.703125, "learning_rate": 4.821456068582779e-05, "loss": 0.0099, "mean_token_accuracy": 0.9985413789749146, "num_tokens": 93235860.0, "step": 30800 }, { "entropy": 0.06476940959692001, "epoch": 7.1806737381979255, "grad_norm": 0.84375, "learning_rate": 4.821377444130223e-05, "loss": 0.0155, "mean_token_accuracy": 0.9960738599300385, "num_tokens": 93246928.0, "step": 30805 }, { "entropy": 0.08522103652358055, "epoch": 7.181839375218557, "grad_norm": 4.34375, "learning_rate": 4.821298803701819e-05, "loss": 0.0307, "mean_token_accuracy": 0.9906064748764039, "num_tokens": 93256045.0, "step": 30810 }, { "entropy": 0.06372003946453333, "epoch": 7.183005012239189, "grad_norm": 2.703125, "learning_rate": 4.8212201472987374e-05, "loss": 0.0134, "mean_token_accuracy": 0.9962263941764832, "num_tokens": 93272200.0, "step": 30815 }, { "entropy": 0.06631615618243814, "epoch": 7.18417064925982, "grad_norm": 0.98828125, "learning_rate": 4.8211414749221515e-05, "loss": 0.0139, "mean_token_accuracy": 0.9977785289287567, "num_tokens": 93303496.0, "step": 30820 }, { "entropy": 0.07567157708108425, "epoch": 7.185336286280452, "grad_norm": 2.140625, "learning_rate": 4.821062786573236e-05, "loss": 0.0147, "mean_token_accuracy": 0.9937583327293396, "num_tokens": 93314712.0, "step": 30825 }, { "entropy": 0.08115587830543518, "epoch": 7.186501923301084, "grad_norm": 3.15625, "learning_rate": 4.8209840822531635e-05, "loss": 0.013, "mean_token_accuracy": 0.994137454032898, "num_tokens": 93326234.0, "step": 30830 }, { "entropy": 0.05947356568649411, "epoch": 7.1876675603217155, "grad_norm": 0.375, "learning_rate": 4.820905361963107e-05, "loss": 0.0178, "mean_token_accuracy": 0.995909696817398, "num_tokens": 93358945.0, "step": 30835 }, { "entropy": 0.07598287798464298, "epoch": 7.188833197342348, "grad_norm": 2.828125, "learning_rate": 4.820826625704242e-05, "loss": 0.0094, "mean_token_accuracy": 0.9963949680328369, "num_tokens": 93374097.0, "step": 30840 }, { "entropy": 0.06978492699563503, "epoch": 7.18999883436298, "grad_norm": 2.734375, "learning_rate": 4.82074787347774e-05, "loss": 0.0144, "mean_token_accuracy": 0.9953686773777009, "num_tokens": 93390116.0, "step": 30845 }, { "entropy": 0.0937451772391796, "epoch": 7.191164471383611, "grad_norm": 0.9140625, "learning_rate": 4.820669105284778e-05, "loss": 0.0216, "mean_token_accuracy": 0.9946691036224365, "num_tokens": 93400332.0, "step": 30850 }, { "entropy": 0.05569373117759824, "epoch": 7.192330108404243, "grad_norm": 0.470703125, "learning_rate": 4.820590321126528e-05, "loss": 0.0095, "mean_token_accuracy": 0.9965024709701538, "num_tokens": 93424812.0, "step": 30855 }, { "entropy": 0.05078269252553582, "epoch": 7.193495745424874, "grad_norm": 1.5546875, "learning_rate": 4.8205115210041665e-05, "loss": 0.0096, "mean_token_accuracy": 0.9979369282722473, "num_tokens": 93443474.0, "step": 30860 }, { "entropy": 0.07130427304655314, "epoch": 7.194661382445506, "grad_norm": 0.9296875, "learning_rate": 4.820432704918868e-05, "loss": 0.016, "mean_token_accuracy": 0.9947200953960419, "num_tokens": 93457213.0, "step": 30865 }, { "entropy": 0.0634447991847992, "epoch": 7.1958270194661385, "grad_norm": 1.171875, "learning_rate": 4.820353872871808e-05, "loss": 0.0107, "mean_token_accuracy": 0.9964709997177124, "num_tokens": 93475313.0, "step": 30870 }, { "entropy": 0.07064968943595887, "epoch": 7.19699265648677, "grad_norm": 0.92578125, "learning_rate": 4.820275024864162e-05, "loss": 0.014, "mean_token_accuracy": 0.9950363337993622, "num_tokens": 93484861.0, "step": 30875 }, { "entropy": 0.068212578445673, "epoch": 7.198158293507402, "grad_norm": 1.59375, "learning_rate": 4.8201961608971055e-05, "loss": 0.0108, "mean_token_accuracy": 0.9966789484024048, "num_tokens": 93497639.0, "step": 30880 }, { "entropy": 0.08882842306047678, "epoch": 7.199323930528034, "grad_norm": 2.921875, "learning_rate": 4.820117280971814e-05, "loss": 0.0233, "mean_token_accuracy": 0.9924453973770142, "num_tokens": 93514110.0, "step": 30885 }, { "entropy": 0.07468615574762225, "epoch": 7.200489567548665, "grad_norm": 3.203125, "learning_rate": 4.8200383850894645e-05, "loss": 0.0238, "mean_token_accuracy": 0.9929708957672119, "num_tokens": 93540971.0, "step": 30890 }, { "entropy": 0.07364825969561935, "epoch": 7.201655204569297, "grad_norm": 2.328125, "learning_rate": 4.819959473251234e-05, "loss": 0.0132, "mean_token_accuracy": 0.9956311583518982, "num_tokens": 93561027.0, "step": 30895 }, { "entropy": 0.05888860169798136, "epoch": 7.202820841589929, "grad_norm": 0.341796875, "learning_rate": 4.8198805454582976e-05, "loss": 0.0088, "mean_token_accuracy": 0.9973181843757629, "num_tokens": 93592104.0, "step": 30900 }, { "entropy": 0.09244950031861662, "epoch": 7.2039864786105605, "grad_norm": 0.310546875, "learning_rate": 4.819801601711834e-05, "loss": 0.0398, "mean_token_accuracy": 0.9923376441001892, "num_tokens": 93616408.0, "step": 30905 }, { "entropy": 0.03939249962568283, "epoch": 7.205152115631193, "grad_norm": 0.2275390625, "learning_rate": 4.819722642013019e-05, "loss": 0.0043, "mean_token_accuracy": 0.9981208801269531, "num_tokens": 93655274.0, "step": 30910 }, { "entropy": 0.06765931397676468, "epoch": 7.206317752651824, "grad_norm": 0.65234375, "learning_rate": 4.81964366636303e-05, "loss": 0.008, "mean_token_accuracy": 0.9975310564041138, "num_tokens": 93670281.0, "step": 30915 }, { "entropy": 0.07954985983669757, "epoch": 7.207483389672456, "grad_norm": 1.3046875, "learning_rate": 4.8195646747630455e-05, "loss": 0.0216, "mean_token_accuracy": 0.9954479992389679, "num_tokens": 93679604.0, "step": 30920 }, { "entropy": 0.05836689174175262, "epoch": 7.208649026693088, "grad_norm": 0.62109375, "learning_rate": 4.819485667214243e-05, "loss": 0.0189, "mean_token_accuracy": 0.9950154960155487, "num_tokens": 93695595.0, "step": 30925 }, { "entropy": 0.06158583052456379, "epoch": 7.209814663713719, "grad_norm": 0.80859375, "learning_rate": 4.8194066437178004e-05, "loss": 0.0084, "mean_token_accuracy": 0.9972615122795105, "num_tokens": 93727751.0, "step": 30930 }, { "entropy": 0.0682966934517026, "epoch": 7.210980300734351, "grad_norm": 1.53125, "learning_rate": 4.8193276042748966e-05, "loss": 0.0115, "mean_token_accuracy": 0.9970169782638549, "num_tokens": 93738365.0, "step": 30935 }, { "entropy": 0.07385541684925556, "epoch": 7.2121459377549835, "grad_norm": 1.203125, "learning_rate": 4.8192485488867094e-05, "loss": 0.0262, "mean_token_accuracy": 0.9942777693271637, "num_tokens": 93756002.0, "step": 30940 }, { "entropy": 0.07035349495708942, "epoch": 7.213311574775615, "grad_norm": 1.984375, "learning_rate": 4.8191694775544185e-05, "loss": 0.0183, "mean_token_accuracy": 0.9962132215499878, "num_tokens": 93788334.0, "step": 30945 }, { "entropy": 0.06736944075673819, "epoch": 7.214477211796247, "grad_norm": 0.71875, "learning_rate": 4.819090390279202e-05, "loss": 0.0074, "mean_token_accuracy": 0.9972393333911895, "num_tokens": 93816896.0, "step": 30950 }, { "entropy": 0.05105153433978558, "epoch": 7.215642848816878, "grad_norm": 2.671875, "learning_rate": 4.8190112870622406e-05, "loss": 0.0123, "mean_token_accuracy": 0.9973272025585175, "num_tokens": 93839774.0, "step": 30955 }, { "entropy": 0.0589878392405808, "epoch": 7.21680848583751, "grad_norm": 0.87109375, "learning_rate": 4.818932167904713e-05, "loss": 0.0154, "mean_token_accuracy": 0.9960299611091614, "num_tokens": 93854636.0, "step": 30960 }, { "entropy": 0.08665582574903966, "epoch": 7.217974122858142, "grad_norm": 1.65625, "learning_rate": 4.818853032807799e-05, "loss": 0.0165, "mean_token_accuracy": 0.9953882753849029, "num_tokens": 93864548.0, "step": 30965 }, { "entropy": 0.05970696024596691, "epoch": 7.219139759878773, "grad_norm": 1.6796875, "learning_rate": 4.818773881772678e-05, "loss": 0.0141, "mean_token_accuracy": 0.9952605664730072, "num_tokens": 93877575.0, "step": 30970 }, { "entropy": 0.07558971364051104, "epoch": 7.2203053968994055, "grad_norm": 0.51171875, "learning_rate": 4.818694714800531e-05, "loss": 0.0053, "mean_token_accuracy": 0.9963312029838562, "num_tokens": 93913383.0, "step": 30975 }, { "entropy": 0.06744187790900469, "epoch": 7.221471033920038, "grad_norm": 1.1484375, "learning_rate": 4.818615531892539e-05, "loss": 0.0094, "mean_token_accuracy": 0.9975740015506744, "num_tokens": 93931739.0, "step": 30980 }, { "entropy": 0.06372264893725514, "epoch": 7.222636670940669, "grad_norm": 2.625, "learning_rate": 4.818536333049881e-05, "loss": 0.014, "mean_token_accuracy": 0.9937172710895539, "num_tokens": 93954776.0, "step": 30985 }, { "entropy": 0.08202089443802833, "epoch": 7.223802307961301, "grad_norm": 1.4140625, "learning_rate": 4.81845711827374e-05, "loss": 0.0077, "mean_token_accuracy": 0.9972925662994385, "num_tokens": 93974412.0, "step": 30990 }, { "entropy": 0.14377728216350077, "epoch": 7.224967944981932, "grad_norm": 0.3203125, "learning_rate": 4.818377887565296e-05, "loss": 0.2043, "mean_token_accuracy": 0.9597078502178192, "num_tokens": 94001058.0, "step": 30995 }, { "entropy": 0.04768119920045137, "epoch": 7.226133582002564, "grad_norm": 1.171875, "learning_rate": 4.8182986409257315e-05, "loss": 0.0141, "mean_token_accuracy": 0.9953707456588745, "num_tokens": 94022836.0, "step": 31000 }, { "entropy": 0.07950283214449883, "epoch": 7.227299219023196, "grad_norm": 2.546875, "learning_rate": 4.818219378356226e-05, "loss": 0.0199, "mean_token_accuracy": 0.9949168682098388, "num_tokens": 94031304.0, "step": 31005 }, { "entropy": 0.06591652268543839, "epoch": 7.228464856043828, "grad_norm": 1.6328125, "learning_rate": 4.818140099857964e-05, "loss": 0.0139, "mean_token_accuracy": 0.9966680228710174, "num_tokens": 94052316.0, "step": 31010 }, { "entropy": 0.055729389935731885, "epoch": 7.22963049306446, "grad_norm": 0.7109375, "learning_rate": 4.8180608054321266e-05, "loss": 0.0086, "mean_token_accuracy": 0.9960857272148133, "num_tokens": 94085005.0, "step": 31015 }, { "entropy": 0.031023029517382384, "epoch": 7.230796130085092, "grad_norm": 0.12109375, "learning_rate": 4.8179814950798956e-05, "loss": 0.0048, "mean_token_accuracy": 0.998980051279068, "num_tokens": 94136607.0, "step": 31020 }, { "entropy": 0.06259532477706671, "epoch": 7.231961767105723, "grad_norm": 2.84375, "learning_rate": 4.8179021688024546e-05, "loss": 0.0134, "mean_token_accuracy": 0.9964537560939789, "num_tokens": 94150543.0, "step": 31025 }, { "entropy": 0.07360369060188532, "epoch": 7.233127404126355, "grad_norm": 0.37109375, "learning_rate": 4.817822826600986e-05, "loss": 0.0134, "mean_token_accuracy": 0.9939245641231537, "num_tokens": 94167749.0, "step": 31030 }, { "entropy": 0.0606883866712451, "epoch": 7.234293041146987, "grad_norm": 0.81640625, "learning_rate": 4.817743468476672e-05, "loss": 0.0129, "mean_token_accuracy": 0.9957722008228302, "num_tokens": 94180749.0, "step": 31035 }, { "entropy": 0.06940312702208758, "epoch": 7.2354586781676185, "grad_norm": 3.234375, "learning_rate": 4.817664094430698e-05, "loss": 0.0155, "mean_token_accuracy": 0.9943855345249176, "num_tokens": 94198809.0, "step": 31040 }, { "entropy": 0.0686707628890872, "epoch": 7.2366243151882506, "grad_norm": 0.2578125, "learning_rate": 4.817584704464246e-05, "loss": 0.0074, "mean_token_accuracy": 0.998262470960617, "num_tokens": 94218672.0, "step": 31045 }, { "entropy": 0.06387984249740838, "epoch": 7.237789952208882, "grad_norm": 1.8203125, "learning_rate": 4.817505298578501e-05, "loss": 0.0166, "mean_token_accuracy": 0.9973008513450623, "num_tokens": 94245039.0, "step": 31050 }, { "entropy": 0.07732600383460522, "epoch": 7.238955589229514, "grad_norm": 0.51171875, "learning_rate": 4.817425876774646e-05, "loss": 0.0173, "mean_token_accuracy": 0.9943410992622376, "num_tokens": 94254637.0, "step": 31055 }, { "entropy": 0.06010149214416742, "epoch": 7.240121226250146, "grad_norm": 2.734375, "learning_rate": 4.817346439053865e-05, "loss": 0.0215, "mean_token_accuracy": 0.9950605452060699, "num_tokens": 94282839.0, "step": 31060 }, { "entropy": 0.06231997692957521, "epoch": 7.241286863270777, "grad_norm": 0.384765625, "learning_rate": 4.8172669854173444e-05, "loss": 0.0128, "mean_token_accuracy": 0.9954926490783691, "num_tokens": 94304068.0, "step": 31065 }, { "entropy": 0.07848136788234114, "epoch": 7.242452500291409, "grad_norm": 4.34375, "learning_rate": 4.8171875158662665e-05, "loss": 0.0132, "mean_token_accuracy": 0.9944968461990357, "num_tokens": 94324087.0, "step": 31070 }, { "entropy": 0.06527646128088235, "epoch": 7.243618137312041, "grad_norm": 0.54296875, "learning_rate": 4.8171080304018186e-05, "loss": 0.0126, "mean_token_accuracy": 0.9959727704524994, "num_tokens": 94337863.0, "step": 31075 }, { "entropy": 0.07139136102050543, "epoch": 7.244783774332673, "grad_norm": 2.15625, "learning_rate": 4.8170285290251846e-05, "loss": 0.016, "mean_token_accuracy": 0.9945382833480835, "num_tokens": 94352721.0, "step": 31080 }, { "entropy": 0.05915911886841059, "epoch": 7.245949411353305, "grad_norm": 1.1953125, "learning_rate": 4.81694901173755e-05, "loss": 0.007, "mean_token_accuracy": 0.9973145842552185, "num_tokens": 94366680.0, "step": 31085 }, { "entropy": 0.04861733708530665, "epoch": 7.247115048373936, "grad_norm": 0.376953125, "learning_rate": 4.8168694785401016e-05, "loss": 0.007, "mean_token_accuracy": 0.9972374498844147, "num_tokens": 94394164.0, "step": 31090 }, { "entropy": 0.053044412098824975, "epoch": 7.248280685394568, "grad_norm": 0.3203125, "learning_rate": 4.816789929434024e-05, "loss": 0.0084, "mean_token_accuracy": 0.9966397881507874, "num_tokens": 94415197.0, "step": 31095 }, { "entropy": 0.07270872667431831, "epoch": 7.2494463224152, "grad_norm": 3.046875, "learning_rate": 4.816710364420504e-05, "loss": 0.0159, "mean_token_accuracy": 0.9957484424114227, "num_tokens": 94435321.0, "step": 31100 }, { "entropy": 0.06985992044210435, "epoch": 7.250611959435831, "grad_norm": 0.78515625, "learning_rate": 4.816630783500729e-05, "loss": 0.0161, "mean_token_accuracy": 0.9964280486106872, "num_tokens": 94445779.0, "step": 31105 }, { "entropy": 0.04990783054381609, "epoch": 7.2517775964564635, "grad_norm": 0.2099609375, "learning_rate": 4.8165511866758835e-05, "loss": 0.0149, "mean_token_accuracy": 0.9973464012145996, "num_tokens": 94479579.0, "step": 31110 }, { "entropy": 0.042833476420491935, "epoch": 7.252943233477096, "grad_norm": 0.6328125, "learning_rate": 4.816471573947156e-05, "loss": 0.0165, "mean_token_accuracy": 0.9955586969852448, "num_tokens": 94510088.0, "step": 31115 }, { "entropy": 0.05222136527299881, "epoch": 7.254108870497727, "grad_norm": 0.314453125, "learning_rate": 4.8163919453157335e-05, "loss": 0.009, "mean_token_accuracy": 0.9957200348377228, "num_tokens": 94546329.0, "step": 31120 }, { "entropy": 0.08356106411665679, "epoch": 7.255274507518359, "grad_norm": 0.609375, "learning_rate": 4.8163123007828024e-05, "loss": 0.0613, "mean_token_accuracy": 0.9861360251903534, "num_tokens": 94571572.0, "step": 31125 }, { "entropy": 0.03666842384263873, "epoch": 7.25644014453899, "grad_norm": 0.310546875, "learning_rate": 4.8162326403495524e-05, "loss": 0.0054, "mean_token_accuracy": 0.9983543157577515, "num_tokens": 94598814.0, "step": 31130 }, { "entropy": 0.08895245492458344, "epoch": 7.257605781559622, "grad_norm": 1.4609375, "learning_rate": 4.816152964017169e-05, "loss": 0.0154, "mean_token_accuracy": 0.995537132024765, "num_tokens": 94608335.0, "step": 31135 }, { "entropy": 0.0762575170956552, "epoch": 7.258771418580254, "grad_norm": 2.625, "learning_rate": 4.816073271786842e-05, "loss": 0.0268, "mean_token_accuracy": 0.9923174262046814, "num_tokens": 94634806.0, "step": 31140 }, { "entropy": 0.0727899644523859, "epoch": 7.2599370556008855, "grad_norm": 0.51171875, "learning_rate": 4.815993563659759e-05, "loss": 0.0164, "mean_token_accuracy": 0.9957876443862915, "num_tokens": 94646234.0, "step": 31145 }, { "entropy": 0.052048580907285216, "epoch": 7.261102692621518, "grad_norm": 0.353515625, "learning_rate": 4.8159138396371075e-05, "loss": 0.009, "mean_token_accuracy": 0.9969389081001282, "num_tokens": 94662074.0, "step": 31150 }, { "entropy": 0.054339785035699605, "epoch": 7.26226832964215, "grad_norm": 0.65234375, "learning_rate": 4.815834099720079e-05, "loss": 0.0139, "mean_token_accuracy": 0.9946308076381684, "num_tokens": 94695848.0, "step": 31155 }, { "entropy": 0.088789024297148, "epoch": 7.263433966662781, "grad_norm": 0.390625, "learning_rate": 4.81575434390986e-05, "loss": 0.0118, "mean_token_accuracy": 0.9980597138404846, "num_tokens": 94712847.0, "step": 31160 }, { "entropy": 0.04380578193813563, "epoch": 7.264599603683413, "grad_norm": 0.51953125, "learning_rate": 4.8156745722076406e-05, "loss": 0.0086, "mean_token_accuracy": 0.9967470228672027, "num_tokens": 94743216.0, "step": 31165 }, { "entropy": 0.05533214872702956, "epoch": 7.265765240704045, "grad_norm": 3.203125, "learning_rate": 4.815594784614611e-05, "loss": 0.0222, "mean_token_accuracy": 0.995652312040329, "num_tokens": 94760494.0, "step": 31170 }, { "entropy": 0.06883603855967521, "epoch": 7.266930877724676, "grad_norm": 0.58984375, "learning_rate": 4.81551498113196e-05, "loss": 0.0202, "mean_token_accuracy": 0.9935734689235687, "num_tokens": 94773816.0, "step": 31175 }, { "entropy": 0.05732705658301711, "epoch": 7.2680965147453085, "grad_norm": 0.640625, "learning_rate": 4.815435161760878e-05, "loss": 0.0162, "mean_token_accuracy": 0.9956275284290313, "num_tokens": 94790719.0, "step": 31180 }, { "entropy": 0.07628957759588957, "epoch": 7.26926215176594, "grad_norm": 0.9453125, "learning_rate": 4.815355326502556e-05, "loss": 0.0157, "mean_token_accuracy": 0.9964455723762512, "num_tokens": 94803985.0, "step": 31185 }, { "entropy": 0.061471117474138734, "epoch": 7.270427788786572, "grad_norm": 1.4765625, "learning_rate": 4.815275475358183e-05, "loss": 0.0164, "mean_token_accuracy": 0.9955154120922088, "num_tokens": 94818813.0, "step": 31190 }, { "entropy": 0.046590684168040755, "epoch": 7.271593425807204, "grad_norm": 0.294921875, "learning_rate": 4.8151956083289504e-05, "loss": 0.0191, "mean_token_accuracy": 0.9956287026405335, "num_tokens": 94848573.0, "step": 31195 }, { "entropy": 0.10268030576407909, "epoch": 7.272759062827835, "grad_norm": 5.28125, "learning_rate": 4.815115725416049e-05, "loss": 0.1076, "mean_token_accuracy": 0.9697431027889252, "num_tokens": 94877708.0, "step": 31200 }, { "entropy": 0.044332510232925414, "epoch": 7.273924699848467, "grad_norm": 0.34375, "learning_rate": 4.8150358266206705e-05, "loss": 0.007, "mean_token_accuracy": 0.9973241150379181, "num_tokens": 94895674.0, "step": 31205 }, { "entropy": 0.05917203584685922, "epoch": 7.275090336869099, "grad_norm": 0.53125, "learning_rate": 4.814955911944006e-05, "loss": 0.0068, "mean_token_accuracy": 0.9969590663909912, "num_tokens": 94913506.0, "step": 31210 }, { "entropy": 0.07185657788068056, "epoch": 7.2762559738897306, "grad_norm": 1.2734375, "learning_rate": 4.814875981387247e-05, "loss": 0.0201, "mean_token_accuracy": 0.9954153180122376, "num_tokens": 94924274.0, "step": 31215 }, { "entropy": 0.05664256140589714, "epoch": 7.277421610910363, "grad_norm": 1.046875, "learning_rate": 4.814796034951585e-05, "loss": 0.011, "mean_token_accuracy": 0.9930629730224609, "num_tokens": 94949709.0, "step": 31220 }, { "entropy": 0.05459074741229415, "epoch": 7.278587247930994, "grad_norm": 2.890625, "learning_rate": 4.814716072638213e-05, "loss": 0.0189, "mean_token_accuracy": 0.9927839577198029, "num_tokens": 94985059.0, "step": 31225 }, { "entropy": 0.045205979235470296, "epoch": 7.279752884951626, "grad_norm": 2.21875, "learning_rate": 4.8146360944483235e-05, "loss": 0.0061, "mean_token_accuracy": 0.9978555560111999, "num_tokens": 95014556.0, "step": 31230 }, { "entropy": 0.0740995816886425, "epoch": 7.280918521972258, "grad_norm": 2.3125, "learning_rate": 4.814556100383108e-05, "loss": 0.0168, "mean_token_accuracy": 0.9948262691497802, "num_tokens": 95035974.0, "step": 31235 }, { "entropy": 0.03894362915307283, "epoch": 7.282084158992889, "grad_norm": 1.453125, "learning_rate": 4.8144760904437594e-05, "loss": 0.0065, "mean_token_accuracy": 0.9980915129184723, "num_tokens": 95080772.0, "step": 31240 }, { "entropy": 0.09444695636630059, "epoch": 7.283249796013521, "grad_norm": 0.5390625, "learning_rate": 4.814396064631471e-05, "loss": 0.0087, "mean_token_accuracy": 0.9984256386756897, "num_tokens": 95097268.0, "step": 31245 }, { "entropy": 0.07851285748183727, "epoch": 7.2844154330341535, "grad_norm": 0.314453125, "learning_rate": 4.814316022947437e-05, "loss": 0.0222, "mean_token_accuracy": 0.9953002572059632, "num_tokens": 95115624.0, "step": 31250 }, { "entropy": 0.0706376725807786, "epoch": 7.285581070054785, "grad_norm": 0.296875, "learning_rate": 4.814235965392851e-05, "loss": 0.0059, "mean_token_accuracy": 0.9987712681293488, "num_tokens": 95140592.0, "step": 31255 }, { "entropy": 0.06456660237163306, "epoch": 7.286746707075417, "grad_norm": 0.373046875, "learning_rate": 4.814155891968905e-05, "loss": 0.013, "mean_token_accuracy": 0.9957951843738556, "num_tokens": 95157676.0, "step": 31260 }, { "entropy": 0.04452027985826135, "epoch": 7.287912344096048, "grad_norm": 2.53125, "learning_rate": 4.814075802676794e-05, "loss": 0.0113, "mean_token_accuracy": 0.9970516860485077, "num_tokens": 95181335.0, "step": 31265 }, { "entropy": 0.06406694650650024, "epoch": 7.28907798111668, "grad_norm": 0.9921875, "learning_rate": 4.813995697517712e-05, "loss": 0.007, "mean_token_accuracy": 0.9972039103507996, "num_tokens": 95195683.0, "step": 31270 }, { "entropy": 0.043226651614531875, "epoch": 7.290243618137312, "grad_norm": 0.67578125, "learning_rate": 4.813915576492855e-05, "loss": 0.0054, "mean_token_accuracy": 0.9987265706062317, "num_tokens": 95224524.0, "step": 31275 }, { "entropy": 0.05426203664392233, "epoch": 7.2914092551579435, "grad_norm": 0.7890625, "learning_rate": 4.8138354396034165e-05, "loss": 0.0177, "mean_token_accuracy": 0.9938402116298676, "num_tokens": 95243533.0, "step": 31280 }, { "entropy": 0.05316199092194438, "epoch": 7.292574892178576, "grad_norm": 2.75, "learning_rate": 4.813755286850591e-05, "loss": 0.0129, "mean_token_accuracy": 0.9960735082626343, "num_tokens": 95266239.0, "step": 31285 }, { "entropy": 0.08508195830509066, "epoch": 7.293740529199208, "grad_norm": 4.5625, "learning_rate": 4.813675118235574e-05, "loss": 0.0252, "mean_token_accuracy": 0.9939753770828247, "num_tokens": 95294519.0, "step": 31290 }, { "entropy": 0.08679478149861097, "epoch": 7.294906166219839, "grad_norm": 0.890625, "learning_rate": 4.813594933759561e-05, "loss": 0.016, "mean_token_accuracy": 0.9938629746437073, "num_tokens": 95313085.0, "step": 31295 }, { "entropy": 0.07704577697440981, "epoch": 7.296071803240471, "grad_norm": 0.2080078125, "learning_rate": 4.813514733423749e-05, "loss": 0.0064, "mean_token_accuracy": 0.9971446514129638, "num_tokens": 95331867.0, "step": 31300 }, { "entropy": 0.038349354080855845, "epoch": 7.297237440261103, "grad_norm": 1.5625, "learning_rate": 4.813434517229331e-05, "loss": 0.01, "mean_token_accuracy": 0.9969878733158112, "num_tokens": 95373062.0, "step": 31305 }, { "entropy": 0.05551351401954889, "epoch": 7.298403077281734, "grad_norm": 0.416015625, "learning_rate": 4.813354285177506e-05, "loss": 0.0133, "mean_token_accuracy": 0.9945835828781128, "num_tokens": 95394246.0, "step": 31310 }, { "entropy": 0.08316299207508564, "epoch": 7.299568714302366, "grad_norm": 3.328125, "learning_rate": 4.813274037269468e-05, "loss": 0.0285, "mean_token_accuracy": 0.9920492529869079, "num_tokens": 95402582.0, "step": 31315 }, { "entropy": 0.05540920048952103, "epoch": 7.300734351322998, "grad_norm": 1.3359375, "learning_rate": 4.8131937735064164e-05, "loss": 0.008, "mean_token_accuracy": 0.9974058926105499, "num_tokens": 95426015.0, "step": 31320 }, { "entropy": 0.07322446051985025, "epoch": 7.30189998834363, "grad_norm": 2.9375, "learning_rate": 4.813113493889546e-05, "loss": 0.0159, "mean_token_accuracy": 0.9926839590072631, "num_tokens": 95442392.0, "step": 31325 }, { "entropy": 0.053282452002167704, "epoch": 7.303065625364262, "grad_norm": 0.50390625, "learning_rate": 4.813033198420054e-05, "loss": 0.0042, "mean_token_accuracy": 0.9982658386230469, "num_tokens": 95460704.0, "step": 31330 }, { "entropy": 0.06922962255775929, "epoch": 7.304231262384893, "grad_norm": 1.171875, "learning_rate": 4.8129528870991386e-05, "loss": 0.0128, "mean_token_accuracy": 0.9957706928253174, "num_tokens": 95484780.0, "step": 31335 }, { "entropy": 0.058275452349334955, "epoch": 7.305396899405525, "grad_norm": 0.33984375, "learning_rate": 4.8128725599279965e-05, "loss": 0.0067, "mean_token_accuracy": 0.9980259537696838, "num_tokens": 95505425.0, "step": 31340 }, { "entropy": 0.0893326872959733, "epoch": 7.306562536426157, "grad_norm": 1.046875, "learning_rate": 4.812792216907826e-05, "loss": 0.0666, "mean_token_accuracy": 0.9879299819469451, "num_tokens": 95527201.0, "step": 31345 }, { "entropy": 0.09231711477041245, "epoch": 7.3077281734467885, "grad_norm": 1.625, "learning_rate": 4.812711858039825e-05, "loss": 0.021, "mean_token_accuracy": 0.9963175833225251, "num_tokens": 95535423.0, "step": 31350 }, { "entropy": 0.06640668958425522, "epoch": 7.308893810467421, "grad_norm": 1.3984375, "learning_rate": 4.8126314833251916e-05, "loss": 0.0044, "mean_token_accuracy": 0.9974156856536865, "num_tokens": 95568929.0, "step": 31355 }, { "entropy": 0.06366755496710538, "epoch": 7.310059447488052, "grad_norm": 4.46875, "learning_rate": 4.812551092765125e-05, "loss": 0.03, "mean_token_accuracy": 0.9931754112243653, "num_tokens": 95583716.0, "step": 31360 }, { "entropy": 0.08571641780436039, "epoch": 7.311225084508684, "grad_norm": 0.671875, "learning_rate": 4.812470686360823e-05, "loss": 0.0187, "mean_token_accuracy": 0.9947358131408691, "num_tokens": 95594442.0, "step": 31365 }, { "entropy": 0.06556187737733125, "epoch": 7.312390721529316, "grad_norm": 1.5078125, "learning_rate": 4.812390264113486e-05, "loss": 0.0104, "mean_token_accuracy": 0.9965194284915924, "num_tokens": 95608186.0, "step": 31370 }, { "entropy": 0.059163821302354334, "epoch": 7.313556358549947, "grad_norm": 2.828125, "learning_rate": 4.812309826024311e-05, "loss": 0.0167, "mean_token_accuracy": 0.9942824065685272, "num_tokens": 95624376.0, "step": 31375 }, { "entropy": 0.050896511506289245, "epoch": 7.314721995570579, "grad_norm": 2.078125, "learning_rate": 4.812229372094499e-05, "loss": 0.0067, "mean_token_accuracy": 0.9977230668067932, "num_tokens": 95649652.0, "step": 31380 }, { "entropy": 0.07648697402328253, "epoch": 7.315887632591211, "grad_norm": 1.9140625, "learning_rate": 4.81214890232525e-05, "loss": 0.0091, "mean_token_accuracy": 0.9963666439056397, "num_tokens": 95670557.0, "step": 31385 }, { "entropy": 0.05390965268015861, "epoch": 7.317053269611843, "grad_norm": 0.26953125, "learning_rate": 4.8120684167177623e-05, "loss": 0.0114, "mean_token_accuracy": 0.9976705610752106, "num_tokens": 95692401.0, "step": 31390 }, { "entropy": 0.07451889421790839, "epoch": 7.318218906632475, "grad_norm": 0.2216796875, "learning_rate": 4.811987915273237e-05, "loss": 0.0196, "mean_token_accuracy": 0.9942530870437623, "num_tokens": 95710753.0, "step": 31395 }, { "entropy": 0.0531097125262022, "epoch": 7.319384543653106, "grad_norm": 3.953125, "learning_rate": 4.8119073979928755e-05, "loss": 0.0106, "mean_token_accuracy": 0.9969158947467804, "num_tokens": 95739383.0, "step": 31400 }, { "entropy": 0.07482300028204918, "epoch": 7.320550180673738, "grad_norm": 1.28125, "learning_rate": 4.8118268648778776e-05, "loss": 0.02, "mean_token_accuracy": 0.9964318454265595, "num_tokens": 95750170.0, "step": 31405 }, { "entropy": 0.06803834708407522, "epoch": 7.32171581769437, "grad_norm": 0.73828125, "learning_rate": 4.811746315929443e-05, "loss": 0.0064, "mean_token_accuracy": 0.9968580782413483, "num_tokens": 95785014.0, "step": 31410 }, { "entropy": 0.06311820847913623, "epoch": 7.322881454715001, "grad_norm": 0.52734375, "learning_rate": 4.8116657511487745e-05, "loss": 0.0066, "mean_token_accuracy": 0.9981098234653473, "num_tokens": 95804301.0, "step": 31415 }, { "entropy": 0.06809269674122334, "epoch": 7.3240470917356335, "grad_norm": 0.9296875, "learning_rate": 4.811585170537073e-05, "loss": 0.0104, "mean_token_accuracy": 0.9975564420223236, "num_tokens": 95816307.0, "step": 31420 }, { "entropy": 0.06396800447255373, "epoch": 7.325212728756266, "grad_norm": 0.365234375, "learning_rate": 4.811504574095539e-05, "loss": 0.0122, "mean_token_accuracy": 0.9960817337036133, "num_tokens": 95838870.0, "step": 31425 }, { "entropy": 0.07934959940612316, "epoch": 7.326378365776897, "grad_norm": 0.84765625, "learning_rate": 4.811423961825377e-05, "loss": 0.0274, "mean_token_accuracy": 0.9933449506759644, "num_tokens": 95847317.0, "step": 31430 }, { "entropy": 0.06323989136144519, "epoch": 7.327544002797529, "grad_norm": 1.7421875, "learning_rate": 4.8113433337277857e-05, "loss": 0.0147, "mean_token_accuracy": 0.9955657243728637, "num_tokens": 95867740.0, "step": 31435 }, { "entropy": 0.07817384712398052, "epoch": 7.328709639818161, "grad_norm": 2.0625, "learning_rate": 4.8112626898039694e-05, "loss": 0.0086, "mean_token_accuracy": 0.9980550169944763, "num_tokens": 95882846.0, "step": 31440 }, { "entropy": 0.07506803907454014, "epoch": 7.329875276838792, "grad_norm": 2.109375, "learning_rate": 4.81118203005513e-05, "loss": 0.0244, "mean_token_accuracy": 0.9934002816677093, "num_tokens": 95902766.0, "step": 31445 }, { "entropy": 0.06314781978726387, "epoch": 7.331040913859424, "grad_norm": 0.345703125, "learning_rate": 4.811101354482471e-05, "loss": 0.0109, "mean_token_accuracy": 0.9966369867324829, "num_tokens": 95921553.0, "step": 31450 }, { "entropy": 0.05526378992944956, "epoch": 7.332206550880056, "grad_norm": 0.52734375, "learning_rate": 4.811020663087194e-05, "loss": 0.0104, "mean_token_accuracy": 0.9964692711830139, "num_tokens": 95950474.0, "step": 31455 }, { "entropy": 0.06638829614967108, "epoch": 7.333372187900688, "grad_norm": 1.46875, "learning_rate": 4.810939955870504e-05, "loss": 0.0158, "mean_token_accuracy": 0.994761723279953, "num_tokens": 95968461.0, "step": 31460 }, { "entropy": 0.04524884703569114, "epoch": 7.33453782492132, "grad_norm": 0.1982421875, "learning_rate": 4.810859232833602e-05, "loss": 0.0049, "mean_token_accuracy": 0.998431783914566, "num_tokens": 95999830.0, "step": 31465 }, { "entropy": 0.070090286526829, "epoch": 7.335703461941951, "grad_norm": 0.333984375, "learning_rate": 4.8107784939776946e-05, "loss": 0.0167, "mean_token_accuracy": 0.9949487507343292, "num_tokens": 96032091.0, "step": 31470 }, { "entropy": 0.07050778605043888, "epoch": 7.336869098962583, "grad_norm": 2.234375, "learning_rate": 4.8106977393039844e-05, "loss": 0.0265, "mean_token_accuracy": 0.9949378252029419, "num_tokens": 96044365.0, "step": 31475 }, { "entropy": 0.0740006735548377, "epoch": 7.338034735983215, "grad_norm": 1.2109375, "learning_rate": 4.810616968813675e-05, "loss": 0.0173, "mean_token_accuracy": 0.9964466452598572, "num_tokens": 96058233.0, "step": 31480 }, { "entropy": 0.07905077319592238, "epoch": 7.339200373003846, "grad_norm": 1.9765625, "learning_rate": 4.810536182507971e-05, "loss": 0.009, "mean_token_accuracy": 0.9967888534069062, "num_tokens": 96069124.0, "step": 31485 }, { "entropy": 0.05249534146860242, "epoch": 7.3403660100244785, "grad_norm": 0.2890625, "learning_rate": 4.810455380388078e-05, "loss": 0.0083, "mean_token_accuracy": 0.9959168851375579, "num_tokens": 96100217.0, "step": 31490 }, { "entropy": 0.08653380684554576, "epoch": 7.34153164704511, "grad_norm": 1.3046875, "learning_rate": 4.8103745624552e-05, "loss": 0.0142, "mean_token_accuracy": 0.9960395634174347, "num_tokens": 96110199.0, "step": 31495 }, { "entropy": 0.0720094045624137, "epoch": 7.342697284065742, "grad_norm": 1.609375, "learning_rate": 4.810293728710542e-05, "loss": 0.0172, "mean_token_accuracy": 0.9962703585624695, "num_tokens": 96129305.0, "step": 31500 }, { "entropy": 0.07461393307894468, "epoch": 7.343862921086374, "grad_norm": 2.515625, "learning_rate": 4.81021287915531e-05, "loss": 0.0143, "mean_token_accuracy": 0.9961204707622529, "num_tokens": 96150839.0, "step": 31505 }, { "entropy": 0.06459999550133944, "epoch": 7.345028558107005, "grad_norm": 0.5703125, "learning_rate": 4.8101320137907095e-05, "loss": 0.0125, "mean_token_accuracy": 0.9957730710506439, "num_tokens": 96182234.0, "step": 31510 }, { "entropy": 0.06358031257987022, "epoch": 7.346194195127637, "grad_norm": 0.65234375, "learning_rate": 4.810051132617947e-05, "loss": 0.0064, "mean_token_accuracy": 0.9975331246852874, "num_tokens": 96195513.0, "step": 31515 }, { "entropy": 0.06775630675256253, "epoch": 7.347359832148269, "grad_norm": 0.8046875, "learning_rate": 4.8099702356382264e-05, "loss": 0.013, "mean_token_accuracy": 0.9957094848155975, "num_tokens": 96212860.0, "step": 31520 }, { "entropy": 0.06747577143833042, "epoch": 7.348525469168901, "grad_norm": 0.361328125, "learning_rate": 4.809889322852756e-05, "loss": 0.0209, "mean_token_accuracy": 0.9947060763835907, "num_tokens": 96227657.0, "step": 31525 }, { "entropy": 0.06826590951532126, "epoch": 7.349691106189533, "grad_norm": 4.0, "learning_rate": 4.809808394262741e-05, "loss": 0.0212, "mean_token_accuracy": 0.9943705976009369, "num_tokens": 96239039.0, "step": 31530 }, { "entropy": 0.05712395738810301, "epoch": 7.350856743210164, "grad_norm": 0.353515625, "learning_rate": 4.809727449869389e-05, "loss": 0.0034, "mean_token_accuracy": 0.9990525543689728, "num_tokens": 96261487.0, "step": 31535 }, { "entropy": 0.06899423655122519, "epoch": 7.352022380230796, "grad_norm": 1.65625, "learning_rate": 4.809646489673907e-05, "loss": 0.0165, "mean_token_accuracy": 0.9933532297611236, "num_tokens": 96272372.0, "step": 31540 }, { "entropy": 0.06043548882007599, "epoch": 7.353188017251428, "grad_norm": 0.484375, "learning_rate": 4.809565513677502e-05, "loss": 0.0141, "mean_token_accuracy": 0.995878529548645, "num_tokens": 96303355.0, "step": 31545 }, { "entropy": 0.05828229039907455, "epoch": 7.354353654272059, "grad_norm": 0.314453125, "learning_rate": 4.809484521881381e-05, "loss": 0.0196, "mean_token_accuracy": 0.9947497963905334, "num_tokens": 96319372.0, "step": 31550 }, { "entropy": 0.0643775088712573, "epoch": 7.355519291292691, "grad_norm": 1.8984375, "learning_rate": 4.8094035142867536e-05, "loss": 0.0068, "mean_token_accuracy": 0.9980500638484955, "num_tokens": 96333139.0, "step": 31555 }, { "entropy": 0.0727043965831399, "epoch": 7.3566849283133235, "grad_norm": 4.78125, "learning_rate": 4.809322490894825e-05, "loss": 0.012, "mean_token_accuracy": 0.9963323414325714, "num_tokens": 96352988.0, "step": 31560 }, { "entropy": 0.05997302522882819, "epoch": 7.357850565333955, "grad_norm": 1.609375, "learning_rate": 4.809241451706805e-05, "loss": 0.0062, "mean_token_accuracy": 0.9979426980018615, "num_tokens": 96378601.0, "step": 31565 }, { "entropy": 0.058050360158085824, "epoch": 7.359016202354587, "grad_norm": 0.447265625, "learning_rate": 4.8091603967239025e-05, "loss": 0.0192, "mean_token_accuracy": 0.9955137133598327, "num_tokens": 96409377.0, "step": 31570 }, { "entropy": 0.06100328993052244, "epoch": 7.360181839375219, "grad_norm": 1.28125, "learning_rate": 4.809079325947325e-05, "loss": 0.0076, "mean_token_accuracy": 0.9978339493274688, "num_tokens": 96442510.0, "step": 31575 }, { "entropy": 0.04876450030133128, "epoch": 7.36134747639585, "grad_norm": 0.2041015625, "learning_rate": 4.808998239378282e-05, "loss": 0.0113, "mean_token_accuracy": 0.995351231098175, "num_tokens": 96465017.0, "step": 31580 }, { "entropy": 0.05196169055998325, "epoch": 7.362513113416482, "grad_norm": 1.5390625, "learning_rate": 4.808917137017982e-05, "loss": 0.0111, "mean_token_accuracy": 0.9975907921791076, "num_tokens": 96480390.0, "step": 31585 }, { "entropy": 0.06991583518683911, "epoch": 7.3636787504371135, "grad_norm": 1.6484375, "learning_rate": 4.8088360188676354e-05, "loss": 0.009, "mean_token_accuracy": 0.9967041075229645, "num_tokens": 96496176.0, "step": 31590 }, { "entropy": 0.0919257765635848, "epoch": 7.364844387457746, "grad_norm": 3.15625, "learning_rate": 4.8087548849284504e-05, "loss": 0.0168, "mean_token_accuracy": 0.9965686082839966, "num_tokens": 96512875.0, "step": 31595 }, { "entropy": 0.08283570520579815, "epoch": 7.366010024478378, "grad_norm": 1.5703125, "learning_rate": 4.808673735201637e-05, "loss": 0.0143, "mean_token_accuracy": 0.9972459256649018, "num_tokens": 96521569.0, "step": 31600 }, { "entropy": 0.06374187842011451, "epoch": 7.367175661499009, "grad_norm": 0.86328125, "learning_rate": 4.8085925696884074e-05, "loss": 0.0162, "mean_token_accuracy": 0.996217918395996, "num_tokens": 96531666.0, "step": 31605 }, { "entropy": 0.07336053401231765, "epoch": 7.368341298519641, "grad_norm": 0.890625, "learning_rate": 4.8085113883899704e-05, "loss": 0.0061, "mean_token_accuracy": 0.9984713077545166, "num_tokens": 96556172.0, "step": 31610 }, { "entropy": 0.06423005685210229, "epoch": 7.369506935540273, "grad_norm": 0.490234375, "learning_rate": 4.808430191307535e-05, "loss": 0.0125, "mean_token_accuracy": 0.9968792617321014, "num_tokens": 96572479.0, "step": 31615 }, { "entropy": 0.07545432206243277, "epoch": 7.370672572560904, "grad_norm": 0.87890625, "learning_rate": 4.808348978442315e-05, "loss": 0.0164, "mean_token_accuracy": 0.9944819986820221, "num_tokens": 96587508.0, "step": 31620 }, { "entropy": 0.07405749782919883, "epoch": 7.3718382095815365, "grad_norm": 3.890625, "learning_rate": 4.808267749795519e-05, "loss": 0.0209, "mean_token_accuracy": 0.9963535487651825, "num_tokens": 96599591.0, "step": 31625 }, { "entropy": 0.057722126320004466, "epoch": 7.373003846602168, "grad_norm": 2.375, "learning_rate": 4.8081865053683595e-05, "loss": 0.008, "mean_token_accuracy": 0.9980563938617706, "num_tokens": 96629812.0, "step": 31630 }, { "entropy": 0.05824717171490192, "epoch": 7.3741694836228, "grad_norm": 0.9609375, "learning_rate": 4.8081052451620476e-05, "loss": 0.0051, "mean_token_accuracy": 0.9970640122890473, "num_tokens": 96661085.0, "step": 31635 }, { "entropy": 0.04979136511683464, "epoch": 7.375335120643432, "grad_norm": 0.244140625, "learning_rate": 4.808023969177795e-05, "loss": 0.0128, "mean_token_accuracy": 0.9955516993999481, "num_tokens": 96683599.0, "step": 31640 }, { "entropy": 0.06467875819653272, "epoch": 7.376500757664063, "grad_norm": 0.384765625, "learning_rate": 4.8079426774168134e-05, "loss": 0.0219, "mean_token_accuracy": 0.9962655901908875, "num_tokens": 96706960.0, "step": 31645 }, { "entropy": 0.06323386076837778, "epoch": 7.377666394684695, "grad_norm": 1.9921875, "learning_rate": 4.807861369880316e-05, "loss": 0.0142, "mean_token_accuracy": 0.9936850070953369, "num_tokens": 96738935.0, "step": 31650 }, { "entropy": 0.06506092166528106, "epoch": 7.378832031705327, "grad_norm": 3.671875, "learning_rate": 4.807780046569513e-05, "loss": 0.0248, "mean_token_accuracy": 0.993993878364563, "num_tokens": 96757812.0, "step": 31655 }, { "entropy": 0.07635737303644419, "epoch": 7.3799976687259585, "grad_norm": 0.42578125, "learning_rate": 4.8076987074856196e-05, "loss": 0.0111, "mean_token_accuracy": 0.9954919457435608, "num_tokens": 96778442.0, "step": 31660 }, { "entropy": 0.0630334172397852, "epoch": 7.381163305746591, "grad_norm": 0.5859375, "learning_rate": 4.807617352629847e-05, "loss": 0.0134, "mean_token_accuracy": 0.9967043399810791, "num_tokens": 96801329.0, "step": 31665 }, { "entropy": 0.0476308373734355, "epoch": 7.382328942767222, "grad_norm": 1.2265625, "learning_rate": 4.80753598200341e-05, "loss": 0.0061, "mean_token_accuracy": 0.9978141784667969, "num_tokens": 96817981.0, "step": 31670 }, { "entropy": 0.07085022889077663, "epoch": 7.383494579787854, "grad_norm": 5.46875, "learning_rate": 4.8074545956075203e-05, "loss": 0.0194, "mean_token_accuracy": 0.9938320398330689, "num_tokens": 96835143.0, "step": 31675 }, { "entropy": 0.06873668488115073, "epoch": 7.384660216808486, "grad_norm": 0.466796875, "learning_rate": 4.807373193443392e-05, "loss": 0.0314, "mean_token_accuracy": 0.9937922835350037, "num_tokens": 96847103.0, "step": 31680 }, { "entropy": 0.08670074734836816, "epoch": 7.385825853829117, "grad_norm": 2.640625, "learning_rate": 4.807291775512239e-05, "loss": 0.0589, "mean_token_accuracy": 0.9884765923023224, "num_tokens": 96876792.0, "step": 31685 }, { "entropy": 0.06278931275010109, "epoch": 7.386991490849749, "grad_norm": 0.478515625, "learning_rate": 4.807210341815275e-05, "loss": 0.0054, "mean_token_accuracy": 0.9977745175361633, "num_tokens": 96907806.0, "step": 31690 }, { "entropy": 0.05830715810880065, "epoch": 7.3881571278703815, "grad_norm": 0.26953125, "learning_rate": 4.807128892353715e-05, "loss": 0.0061, "mean_token_accuracy": 0.9984943211078644, "num_tokens": 96931150.0, "step": 31695 }, { "entropy": 0.09116251096129417, "epoch": 7.389322764891013, "grad_norm": 2.328125, "learning_rate": 4.8070474271287735e-05, "loss": 0.01, "mean_token_accuracy": 0.996881228685379, "num_tokens": 96941888.0, "step": 31700 }, { "entropy": 0.06741825370118022, "epoch": 7.390488401911645, "grad_norm": 1.7578125, "learning_rate": 4.8069659461416644e-05, "loss": 0.0171, "mean_token_accuracy": 0.9934270083904266, "num_tokens": 96964333.0, "step": 31705 }, { "entropy": 0.048117708042263986, "epoch": 7.391654038932277, "grad_norm": 1.3125, "learning_rate": 4.806884449393604e-05, "loss": 0.0098, "mean_token_accuracy": 0.9970331609249115, "num_tokens": 96991219.0, "step": 31710 }, { "entropy": 0.06609851317480206, "epoch": 7.392819675952908, "grad_norm": 1.953125, "learning_rate": 4.806802936885806e-05, "loss": 0.0141, "mean_token_accuracy": 0.9967052280902863, "num_tokens": 97021576.0, "step": 31715 }, { "entropy": 0.07261874917894602, "epoch": 7.39398531297354, "grad_norm": 0.2353515625, "learning_rate": 4.806721408619487e-05, "loss": 0.0134, "mean_token_accuracy": 0.9963623523712158, "num_tokens": 97040151.0, "step": 31720 }, { "entropy": 0.06871300302445889, "epoch": 7.395150949994171, "grad_norm": 0.86328125, "learning_rate": 4.806639864595863e-05, "loss": 0.0186, "mean_token_accuracy": 0.9961176872253418, "num_tokens": 97058961.0, "step": 31725 }, { "entropy": 0.06867877654731273, "epoch": 7.3963165870148035, "grad_norm": 1.671875, "learning_rate": 4.806558304816148e-05, "loss": 0.0182, "mean_token_accuracy": 0.9950770616531373, "num_tokens": 97072234.0, "step": 31730 }, { "entropy": 0.06973572825081646, "epoch": 7.397482224035436, "grad_norm": 1.7109375, "learning_rate": 4.806476729281561e-05, "loss": 0.0197, "mean_token_accuracy": 0.9921569526195526, "num_tokens": 97088397.0, "step": 31735 }, { "entropy": 0.06464776555076242, "epoch": 7.398647861056067, "grad_norm": 0.69921875, "learning_rate": 4.806395137993316e-05, "loss": 0.0124, "mean_token_accuracy": 0.9959135890007019, "num_tokens": 97111838.0, "step": 31740 }, { "entropy": 0.04916971167549491, "epoch": 7.399813498076699, "grad_norm": 0.1845703125, "learning_rate": 4.806313530952631e-05, "loss": 0.0056, "mean_token_accuracy": 0.9973500072956085, "num_tokens": 97146271.0, "step": 31745 }, { "entropy": 0.05610126769170165, "epoch": 7.400979135097331, "grad_norm": 1.8984375, "learning_rate": 4.806231908160722e-05, "loss": 0.0084, "mean_token_accuracy": 0.9978776752948761, "num_tokens": 97175592.0, "step": 31750 }, { "entropy": 0.06999030411243438, "epoch": 7.402144772117962, "grad_norm": 0.62890625, "learning_rate": 4.806150269618807e-05, "loss": 0.0108, "mean_token_accuracy": 0.9972579479217529, "num_tokens": 97197064.0, "step": 31755 }, { "entropy": 0.06999958539381623, "epoch": 7.403310409138594, "grad_norm": 0.279296875, "learning_rate": 4.8060686153281034e-05, "loss": 0.0194, "mean_token_accuracy": 0.9947843551635742, "num_tokens": 97219096.0, "step": 31760 }, { "entropy": 0.06066929465159774, "epoch": 7.404476046159226, "grad_norm": 0.396484375, "learning_rate": 4.805986945289828e-05, "loss": 0.0126, "mean_token_accuracy": 0.9972352147102356, "num_tokens": 97240817.0, "step": 31765 }, { "entropy": 0.053822552971541884, "epoch": 7.405641683179858, "grad_norm": 3.375, "learning_rate": 4.8059052595051986e-05, "loss": 0.0106, "mean_token_accuracy": 0.9965937435626984, "num_tokens": 97266739.0, "step": 31770 }, { "entropy": 0.0714537937194109, "epoch": 7.40680732020049, "grad_norm": 1.0078125, "learning_rate": 4.805823557975433e-05, "loss": 0.0287, "mean_token_accuracy": 0.9925433158874511, "num_tokens": 97278114.0, "step": 31775 }, { "entropy": 0.055256042256951333, "epoch": 7.407972957221121, "grad_norm": 1.2265625, "learning_rate": 4.805741840701751e-05, "loss": 0.0076, "mean_token_accuracy": 0.9974486231803894, "num_tokens": 97293752.0, "step": 31780 }, { "entropy": 0.057471389323472975, "epoch": 7.409138594241753, "grad_norm": 0.40625, "learning_rate": 4.8056601076853704e-05, "loss": 0.0083, "mean_token_accuracy": 0.995576798915863, "num_tokens": 97311430.0, "step": 31785 }, { "entropy": 0.07471015406772494, "epoch": 7.410304231262385, "grad_norm": 1.0390625, "learning_rate": 4.80557835892751e-05, "loss": 0.0098, "mean_token_accuracy": 0.9975471258163452, "num_tokens": 97325200.0, "step": 31790 }, { "entropy": 0.05630330964922905, "epoch": 7.4114698682830165, "grad_norm": 1.5625, "learning_rate": 4.8054965944293876e-05, "loss": 0.0108, "mean_token_accuracy": 0.9970169067382812, "num_tokens": 97347193.0, "step": 31795 }, { "entropy": 0.05957003049552441, "epoch": 7.412635505303649, "grad_norm": 0.38671875, "learning_rate": 4.8054148141922236e-05, "loss": 0.0073, "mean_token_accuracy": 0.9971933484077453, "num_tokens": 97386951.0, "step": 31800 }, { "entropy": 0.0552212193608284, "epoch": 7.41380114232428, "grad_norm": 4.4375, "learning_rate": 4.805333018217238e-05, "loss": 0.015, "mean_token_accuracy": 0.9929179668426513, "num_tokens": 97410465.0, "step": 31805 }, { "entropy": 0.06529962951317429, "epoch": 7.414966779344912, "grad_norm": 0.2734375, "learning_rate": 4.8052512065056495e-05, "loss": 0.0116, "mean_token_accuracy": 0.9965193450450898, "num_tokens": 97437080.0, "step": 31810 }, { "entropy": 0.07579502649605274, "epoch": 7.416132416365544, "grad_norm": 2.953125, "learning_rate": 4.805169379058678e-05, "loss": 0.016, "mean_token_accuracy": 0.9949338555335998, "num_tokens": 97458811.0, "step": 31815 }, { "entropy": 0.06360966097563506, "epoch": 7.417298053386175, "grad_norm": 1.5390625, "learning_rate": 4.805087535877544e-05, "loss": 0.0194, "mean_token_accuracy": 0.994461327791214, "num_tokens": 97474463.0, "step": 31820 }, { "entropy": 0.07172002922743559, "epoch": 7.418463690406807, "grad_norm": 0.7578125, "learning_rate": 4.8050056769634684e-05, "loss": 0.0059, "mean_token_accuracy": 0.9970459818840027, "num_tokens": 97496723.0, "step": 31825 }, { "entropy": 0.07104908116161823, "epoch": 7.419629327427439, "grad_norm": 0.482421875, "learning_rate": 4.804923802317671e-05, "loss": 0.0218, "mean_token_accuracy": 0.9935750663280487, "num_tokens": 97520510.0, "step": 31830 }, { "entropy": 0.05038301609456539, "epoch": 7.420794964448071, "grad_norm": 0.390625, "learning_rate": 4.804841911941373e-05, "loss": 0.0133, "mean_token_accuracy": 0.9973720788955689, "num_tokens": 97551599.0, "step": 31835 }, { "entropy": 0.07734383950009942, "epoch": 7.421960601468703, "grad_norm": 2.515625, "learning_rate": 4.804760005835796e-05, "loss": 0.0235, "mean_token_accuracy": 0.994698303937912, "num_tokens": 97569532.0, "step": 31840 }, { "entropy": 0.0816195654682815, "epoch": 7.423126238489335, "grad_norm": 1.1875, "learning_rate": 4.80467808400216e-05, "loss": 0.0288, "mean_token_accuracy": 0.9956186056137085, "num_tokens": 97582823.0, "step": 31845 }, { "entropy": 0.07614492103457451, "epoch": 7.424291875509966, "grad_norm": 1.859375, "learning_rate": 4.8045961464416876e-05, "loss": 0.0124, "mean_token_accuracy": 0.9969462215900421, "num_tokens": 97595668.0, "step": 31850 }, { "entropy": 0.052913818042725325, "epoch": 7.425457512530598, "grad_norm": 0.6796875, "learning_rate": 4.804514193155601e-05, "loss": 0.0084, "mean_token_accuracy": 0.9972995042800903, "num_tokens": 97625129.0, "step": 31855 }, { "entropy": 0.083005213085562, "epoch": 7.426623149551229, "grad_norm": 1.3203125, "learning_rate": 4.804432224145121e-05, "loss": 0.0199, "mean_token_accuracy": 0.9942511141300201, "num_tokens": 97640906.0, "step": 31860 }, { "entropy": 0.07574100028723478, "epoch": 7.4277887865718615, "grad_norm": 2.375, "learning_rate": 4.8043502394114714e-05, "loss": 0.0111, "mean_token_accuracy": 0.9965871274471283, "num_tokens": 97663290.0, "step": 31865 }, { "entropy": 0.06771274656057358, "epoch": 7.428954423592494, "grad_norm": 0.380859375, "learning_rate": 4.804268238955874e-05, "loss": 0.0068, "mean_token_accuracy": 0.995803314447403, "num_tokens": 97677727.0, "step": 31870 }, { "entropy": 0.06518225539475679, "epoch": 7.430120060613125, "grad_norm": 2.8125, "learning_rate": 4.8041862227795506e-05, "loss": 0.0264, "mean_token_accuracy": 0.994991272687912, "num_tokens": 97695900.0, "step": 31875 }, { "entropy": 0.06668935623019934, "epoch": 7.431285697633757, "grad_norm": 1.234375, "learning_rate": 4.804104190883725e-05, "loss": 0.0189, "mean_token_accuracy": 0.9964043319225311, "num_tokens": 97708289.0, "step": 31880 }, { "entropy": 0.06050615776330233, "epoch": 7.432451334654389, "grad_norm": 0.177734375, "learning_rate": 4.804022143269621e-05, "loss": 0.0051, "mean_token_accuracy": 0.9986397385597229, "num_tokens": 97739608.0, "step": 31885 }, { "entropy": 0.05716359736397862, "epoch": 7.43361697167502, "grad_norm": 0.2421875, "learning_rate": 4.803940079938461e-05, "loss": 0.0088, "mean_token_accuracy": 0.9968883991241455, "num_tokens": 97755159.0, "step": 31890 }, { "entropy": 0.08151376890018583, "epoch": 7.434782608695652, "grad_norm": 0.53125, "learning_rate": 4.80385800089147e-05, "loss": 0.008, "mean_token_accuracy": 0.9967722475528717, "num_tokens": 97778369.0, "step": 31895 }, { "entropy": 0.060337006114423275, "epoch": 7.4359482457162835, "grad_norm": 2.421875, "learning_rate": 4.80377590612987e-05, "loss": 0.0121, "mean_token_accuracy": 0.9959405541419983, "num_tokens": 97788905.0, "step": 31900 }, { "entropy": 0.07208081735298037, "epoch": 7.437113882736916, "grad_norm": 2.0, "learning_rate": 4.8036937956548875e-05, "loss": 0.0245, "mean_token_accuracy": 0.992925900220871, "num_tokens": 97807287.0, "step": 31905 }, { "entropy": 0.05150127690285444, "epoch": 7.438279519757548, "grad_norm": 0.35546875, "learning_rate": 4.8036116694677446e-05, "loss": 0.0085, "mean_token_accuracy": 0.9969991385936737, "num_tokens": 97837029.0, "step": 31910 }, { "entropy": 0.0542297987267375, "epoch": 7.439445156778179, "grad_norm": 1.03125, "learning_rate": 4.803529527569668e-05, "loss": 0.0119, "mean_token_accuracy": 0.9952615916728973, "num_tokens": 97851701.0, "step": 31915 }, { "entropy": 0.06879463642835618, "epoch": 7.440610793798811, "grad_norm": 0.2470703125, "learning_rate": 4.803447369961881e-05, "loss": 0.0078, "mean_token_accuracy": 0.9959939181804657, "num_tokens": 97881684.0, "step": 31920 }, { "entropy": 0.048804645985364915, "epoch": 7.441776430819443, "grad_norm": 0.625, "learning_rate": 4.803365196645609e-05, "loss": 0.0143, "mean_token_accuracy": 0.9956522107124328, "num_tokens": 97902591.0, "step": 31925 }, { "entropy": 0.05077600758522749, "epoch": 7.442942067840074, "grad_norm": 2.3125, "learning_rate": 4.803283007622078e-05, "loss": 0.0121, "mean_token_accuracy": 0.9966121196746827, "num_tokens": 97917592.0, "step": 31930 }, { "entropy": 0.06004767008125782, "epoch": 7.4441077048607065, "grad_norm": 2.390625, "learning_rate": 4.803200802892513e-05, "loss": 0.017, "mean_token_accuracy": 0.9940854012966156, "num_tokens": 97930297.0, "step": 31935 }, { "entropy": 0.07400213945657015, "epoch": 7.445273341881338, "grad_norm": 1.1953125, "learning_rate": 4.803118582458139e-05, "loss": 0.0148, "mean_token_accuracy": 0.9951455473899842, "num_tokens": 97940683.0, "step": 31940 }, { "entropy": 0.0650801496580243, "epoch": 7.44643897890197, "grad_norm": 1.6171875, "learning_rate": 4.803036346320184e-05, "loss": 0.0137, "mean_token_accuracy": 0.9969987511634827, "num_tokens": 97952741.0, "step": 31945 }, { "entropy": 0.09372318238019943, "epoch": 7.447604615922602, "grad_norm": 0.69921875, "learning_rate": 4.802954094479873e-05, "loss": 0.0159, "mean_token_accuracy": 0.9942943751811981, "num_tokens": 97965865.0, "step": 31950 }, { "entropy": 0.047209294699132445, "epoch": 7.448770252943233, "grad_norm": 0.4375, "learning_rate": 4.8028718269384333e-05, "loss": 0.0053, "mean_token_accuracy": 0.9976045370101929, "num_tokens": 97999982.0, "step": 31955 }, { "entropy": 0.07388658728450537, "epoch": 7.449935889963865, "grad_norm": 3.59375, "learning_rate": 4.80278954369709e-05, "loss": 0.015, "mean_token_accuracy": 0.9949574530124664, "num_tokens": 98014580.0, "step": 31960 }, { "entropy": 0.07657665573060513, "epoch": 7.451101526984497, "grad_norm": 0.44140625, "learning_rate": 4.802707244757072e-05, "loss": 0.0116, "mean_token_accuracy": 0.9976080656051636, "num_tokens": 98035444.0, "step": 31965 }, { "entropy": 0.05655227024108171, "epoch": 7.452267164005129, "grad_norm": 0.2314453125, "learning_rate": 4.802624930119605e-05, "loss": 0.0128, "mean_token_accuracy": 0.9970692336559296, "num_tokens": 98069827.0, "step": 31970 }, { "entropy": 0.061259434837847945, "epoch": 7.453432801025761, "grad_norm": 0.2294921875, "learning_rate": 4.8025425997859176e-05, "loss": 0.0086, "mean_token_accuracy": 0.9976824820041656, "num_tokens": 98097657.0, "step": 31975 }, { "entropy": 0.03952852394431829, "epoch": 7.454598438046393, "grad_norm": 0.2021484375, "learning_rate": 4.802460253757237e-05, "loss": 0.0063, "mean_token_accuracy": 0.9980733573436738, "num_tokens": 98127437.0, "step": 31980 }, { "entropy": 0.05817561913281679, "epoch": 7.455764075067024, "grad_norm": 2.984375, "learning_rate": 4.80237789203479e-05, "loss": 0.0128, "mean_token_accuracy": 0.9965545058250427, "num_tokens": 98150403.0, "step": 31985 }, { "entropy": 0.08537760525941848, "epoch": 7.456929712087656, "grad_norm": 2.8125, "learning_rate": 4.802295514619807e-05, "loss": 0.0349, "mean_token_accuracy": 0.9901233434677124, "num_tokens": 98157269.0, "step": 31990 }, { "entropy": 0.03773313369601965, "epoch": 7.458095349108287, "grad_norm": 0.3046875, "learning_rate": 4.802213121513515e-05, "loss": 0.0051, "mean_token_accuracy": 0.9988882124423981, "num_tokens": 98195898.0, "step": 31995 }, { "entropy": 0.05709927398711443, "epoch": 7.459260986128919, "grad_norm": 0.404296875, "learning_rate": 4.802130712717142e-05, "loss": 0.0097, "mean_token_accuracy": 0.9951608777046204, "num_tokens": 98216906.0, "step": 32000 }, { "entropy": 0.07515623532235623, "epoch": 7.4604266231495515, "grad_norm": 2.171875, "learning_rate": 4.802048288231917e-05, "loss": 0.0239, "mean_token_accuracy": 0.9943017184734344, "num_tokens": 98229341.0, "step": 32005 }, { "entropy": 0.046360192447900773, "epoch": 7.461592260170183, "grad_norm": 0.25390625, "learning_rate": 4.8019658480590715e-05, "loss": 0.0134, "mean_token_accuracy": 0.9963595449924469, "num_tokens": 98262472.0, "step": 32010 }, { "entropy": 0.06408070325851441, "epoch": 7.462757897190815, "grad_norm": 0.11767578125, "learning_rate": 4.8018833921998316e-05, "loss": 0.0097, "mean_token_accuracy": 0.9963421761989594, "num_tokens": 98282518.0, "step": 32015 }, { "entropy": 0.07481765616685151, "epoch": 7.463923534211447, "grad_norm": 1.390625, "learning_rate": 4.801800920655429e-05, "loss": 0.0125, "mean_token_accuracy": 0.9957054257392883, "num_tokens": 98294251.0, "step": 32020 }, { "entropy": 0.061605995427817105, "epoch": 7.465089171232078, "grad_norm": 0.25390625, "learning_rate": 4.801718433427092e-05, "loss": 0.0159, "mean_token_accuracy": 0.9957810342311859, "num_tokens": 98320647.0, "step": 32025 }, { "entropy": 0.06579680051654577, "epoch": 7.46625480825271, "grad_norm": 0.28125, "learning_rate": 4.801635930516051e-05, "loss": 0.0063, "mean_token_accuracy": 0.9973134338855744, "num_tokens": 98339371.0, "step": 32030 }, { "entropy": 0.07847501374781132, "epoch": 7.4674204452733415, "grad_norm": 1.4609375, "learning_rate": 4.801553411923537e-05, "loss": 0.0193, "mean_token_accuracy": 0.9948892652988434, "num_tokens": 98351538.0, "step": 32035 }, { "entropy": 0.05895627625286579, "epoch": 7.468586082293974, "grad_norm": 0.703125, "learning_rate": 4.8014708776507806e-05, "loss": 0.009, "mean_token_accuracy": 0.9976952493190765, "num_tokens": 98369458.0, "step": 32040 }, { "entropy": 0.06033395798876882, "epoch": 7.469751719314606, "grad_norm": 0.2001953125, "learning_rate": 4.801388327699011e-05, "loss": 0.0098, "mean_token_accuracy": 0.9974822103977203, "num_tokens": 98394279.0, "step": 32045 }, { "entropy": 0.0509566405788064, "epoch": 7.470917356335237, "grad_norm": 0.515625, "learning_rate": 4.801305762069461e-05, "loss": 0.0082, "mean_token_accuracy": 0.9969842314720154, "num_tokens": 98415653.0, "step": 32050 }, { "entropy": 0.07163450829684734, "epoch": 7.472082993355869, "grad_norm": 1.4609375, "learning_rate": 4.80122318076336e-05, "loss": 0.009, "mean_token_accuracy": 0.9957123637199402, "num_tokens": 98444850.0, "step": 32055 }, { "entropy": 0.07031346326693892, "epoch": 7.473248630376501, "grad_norm": 2.9375, "learning_rate": 4.8011405837819403e-05, "loss": 0.017, "mean_token_accuracy": 0.994475269317627, "num_tokens": 98461675.0, "step": 32060 }, { "entropy": 0.10657533258199692, "epoch": 7.474414267397132, "grad_norm": 4.46875, "learning_rate": 4.801057971126434e-05, "loss": 0.0245, "mean_token_accuracy": 0.9923931121826172, "num_tokens": 98478969.0, "step": 32065 }, { "entropy": 0.0678681674413383, "epoch": 7.475579904417764, "grad_norm": 0.2294921875, "learning_rate": 4.8009753427980724e-05, "loss": 0.0176, "mean_token_accuracy": 0.9935955286026001, "num_tokens": 98494078.0, "step": 32070 }, { "entropy": 0.04767877887934446, "epoch": 7.476745541438396, "grad_norm": 0.318359375, "learning_rate": 4.8008926987980874e-05, "loss": 0.0062, "mean_token_accuracy": 0.9986327946186065, "num_tokens": 98534938.0, "step": 32075 }, { "entropy": 0.05674191527068615, "epoch": 7.477911178459028, "grad_norm": 0.53515625, "learning_rate": 4.800810039127712e-05, "loss": 0.0144, "mean_token_accuracy": 0.9943017423152923, "num_tokens": 98550014.0, "step": 32080 }, { "entropy": 0.07790519241243601, "epoch": 7.47907681547966, "grad_norm": 1.7109375, "learning_rate": 4.800727363788178e-05, "loss": 0.0164, "mean_token_accuracy": 0.9978551805019379, "num_tokens": 98567666.0, "step": 32085 }, { "entropy": 0.05544118182733655, "epoch": 7.480242452500291, "grad_norm": 0.74609375, "learning_rate": 4.8006446727807186e-05, "loss": 0.0105, "mean_token_accuracy": 0.9966945469379425, "num_tokens": 98609715.0, "step": 32090 }, { "entropy": 0.04747336655855179, "epoch": 7.481408089520923, "grad_norm": 0.431640625, "learning_rate": 4.800561966106567e-05, "loss": 0.0106, "mean_token_accuracy": 0.9962104082107544, "num_tokens": 98638761.0, "step": 32095 }, { "entropy": 0.08676061071455479, "epoch": 7.482573726541555, "grad_norm": 1.640625, "learning_rate": 4.8004792437669564e-05, "loss": 0.0164, "mean_token_accuracy": 0.9954579770565033, "num_tokens": 98652723.0, "step": 32100 }, { "entropy": 0.08409857619553804, "epoch": 7.4837393635621865, "grad_norm": 2.703125, "learning_rate": 4.80039650576312e-05, "loss": 0.0486, "mean_token_accuracy": 0.9907464563846589, "num_tokens": 98675617.0, "step": 32105 }, { "entropy": 0.0578452062793076, "epoch": 7.484905000582819, "grad_norm": 0.365234375, "learning_rate": 4.800313752096292e-05, "loss": 0.011, "mean_token_accuracy": 0.9951134145259857, "num_tokens": 98696778.0, "step": 32110 }, { "entropy": 0.06942653022706509, "epoch": 7.486070637603451, "grad_norm": 2.046875, "learning_rate": 4.800230982767707e-05, "loss": 0.0112, "mean_token_accuracy": 0.9965254843235016, "num_tokens": 98709420.0, "step": 32115 }, { "entropy": 0.06638341769576073, "epoch": 7.487236274624082, "grad_norm": 1.046875, "learning_rate": 4.800148197778597e-05, "loss": 0.0172, "mean_token_accuracy": 0.9953685343265534, "num_tokens": 98719108.0, "step": 32120 }, { "entropy": 0.06705099958926439, "epoch": 7.488401911644714, "grad_norm": 0.90625, "learning_rate": 4.800065397130199e-05, "loss": 0.0198, "mean_token_accuracy": 0.9936238288879394, "num_tokens": 98737993.0, "step": 32125 }, { "entropy": 0.06405184855684638, "epoch": 7.489567548665345, "grad_norm": 3.078125, "learning_rate": 4.799982580823746e-05, "loss": 0.0241, "mean_token_accuracy": 0.9920284271240234, "num_tokens": 98756576.0, "step": 32130 }, { "entropy": 0.0811619933694601, "epoch": 7.490733185685977, "grad_norm": 0.59765625, "learning_rate": 4.799899748860473e-05, "loss": 0.0228, "mean_token_accuracy": 0.9951043605804444, "num_tokens": 98773519.0, "step": 32135 }, { "entropy": 0.10115591492503881, "epoch": 7.4918988227066095, "grad_norm": 1.234375, "learning_rate": 4.799816901241616e-05, "loss": 0.0294, "mean_token_accuracy": 0.9908878207206726, "num_tokens": 98785515.0, "step": 32140 }, { "entropy": 0.06037537744268775, "epoch": 7.493064459727241, "grad_norm": 1.4765625, "learning_rate": 4.79973403796841e-05, "loss": 0.0121, "mean_token_accuracy": 0.9955259382724762, "num_tokens": 98802327.0, "step": 32145 }, { "entropy": 0.10647099614143371, "epoch": 7.494230096747873, "grad_norm": 2.390625, "learning_rate": 4.799651159042091e-05, "loss": 0.0377, "mean_token_accuracy": 0.99305180311203, "num_tokens": 98811942.0, "step": 32150 }, { "entropy": 0.06047349814325571, "epoch": 7.495395733768505, "grad_norm": 3.109375, "learning_rate": 4.799568264463894e-05, "loss": 0.0118, "mean_token_accuracy": 0.9965644776821136, "num_tokens": 98831430.0, "step": 32155 }, { "entropy": 0.08296026848256588, "epoch": 7.496561370789136, "grad_norm": 0.98828125, "learning_rate": 4.799485354235056e-05, "loss": 0.0152, "mean_token_accuracy": 0.9960346400737763, "num_tokens": 98840931.0, "step": 32160 }, { "entropy": 0.06511731203645468, "epoch": 7.497727007809768, "grad_norm": 0.333984375, "learning_rate": 4.7994024283568125e-05, "loss": 0.0079, "mean_token_accuracy": 0.9975647747516632, "num_tokens": 98857778.0, "step": 32165 }, { "entropy": 0.07737187128514052, "epoch": 7.498892644830399, "grad_norm": 1.7890625, "learning_rate": 4.7993194868304e-05, "loss": 0.0143, "mean_token_accuracy": 0.9938872277736663, "num_tokens": 98875306.0, "step": 32170 }, { "entropy": 0.06764257559552789, "epoch": 7.5000582818510315, "grad_norm": 0.314453125, "learning_rate": 4.7992365296570564e-05, "loss": 0.0074, "mean_token_accuracy": 0.9962385296821594, "num_tokens": 98907649.0, "step": 32175 }, { "entropy": 0.05583788137882948, "epoch": 7.501223918871664, "grad_norm": 0.400390625, "learning_rate": 4.799153556838018e-05, "loss": 0.0301, "mean_token_accuracy": 0.9933856427669525, "num_tokens": 98938264.0, "step": 32180 }, { "entropy": 0.09082546047866344, "epoch": 7.502389555892295, "grad_norm": 2.59375, "learning_rate": 4.799070568374522e-05, "loss": 0.0166, "mean_token_accuracy": 0.9947808086872101, "num_tokens": 98948444.0, "step": 32185 }, { "entropy": 0.06142871137708426, "epoch": 7.503555192912927, "grad_norm": 2.109375, "learning_rate": 4.7989875642678054e-05, "loss": 0.016, "mean_token_accuracy": 0.9952453315258026, "num_tokens": 98961866.0, "step": 32190 }, { "entropy": 0.05687323287129402, "epoch": 7.504720829933559, "grad_norm": 1.2109375, "learning_rate": 4.798904544519107e-05, "loss": 0.0111, "mean_token_accuracy": 0.9948042273521424, "num_tokens": 98982911.0, "step": 32195 }, { "entropy": 0.06204276205971837, "epoch": 7.50588646695419, "grad_norm": 0.9375, "learning_rate": 4.7988215091296637e-05, "loss": 0.008, "mean_token_accuracy": 0.9979773998260498, "num_tokens": 99009256.0, "step": 32200 }, { "entropy": 0.05596322454512119, "epoch": 7.507052103974822, "grad_norm": 1.9375, "learning_rate": 4.798738458100714e-05, "loss": 0.0232, "mean_token_accuracy": 0.9954159021377563, "num_tokens": 99034387.0, "step": 32205 }, { "entropy": 0.07269154787063599, "epoch": 7.508217740995454, "grad_norm": 1.1796875, "learning_rate": 4.798655391433498e-05, "loss": 0.0139, "mean_token_accuracy": 0.9955158352851867, "num_tokens": 99051466.0, "step": 32210 }, { "entropy": 0.05379718169569969, "epoch": 7.509383378016086, "grad_norm": 0.66015625, "learning_rate": 4.798572309129251e-05, "loss": 0.0102, "mean_token_accuracy": 0.9950558304786682, "num_tokens": 99081520.0, "step": 32215 }, { "entropy": 0.05792910754680634, "epoch": 7.510549015036718, "grad_norm": 0.6953125, "learning_rate": 4.798489211189215e-05, "loss": 0.0115, "mean_token_accuracy": 0.9956493020057678, "num_tokens": 99107916.0, "step": 32220 }, { "entropy": 0.06945790387690068, "epoch": 7.511714652057349, "grad_norm": 3.578125, "learning_rate": 4.798406097614627e-05, "loss": 0.0228, "mean_token_accuracy": 0.9951068162918091, "num_tokens": 99117208.0, "step": 32225 }, { "entropy": 0.05028128759004176, "epoch": 7.512880289077981, "grad_norm": 0.3515625, "learning_rate": 4.7983229684067275e-05, "loss": 0.0051, "mean_token_accuracy": 0.9985537052154541, "num_tokens": 99151148.0, "step": 32230 }, { "entropy": 0.06575621776282788, "epoch": 7.514045926098613, "grad_norm": 0.83984375, "learning_rate": 4.7982398235667556e-05, "loss": 0.0162, "mean_token_accuracy": 0.994884067773819, "num_tokens": 99170846.0, "step": 32235 }, { "entropy": 0.06433080639690161, "epoch": 7.515211563119244, "grad_norm": 0.63671875, "learning_rate": 4.798156663095952e-05, "loss": 0.0168, "mean_token_accuracy": 0.9959873199462891, "num_tokens": 99184422.0, "step": 32240 }, { "entropy": 0.06489841155707836, "epoch": 7.5163772001398765, "grad_norm": 0.8984375, "learning_rate": 4.7980734869955555e-05, "loss": 0.0097, "mean_token_accuracy": 0.9961045622825623, "num_tokens": 99198234.0, "step": 32245 }, { "entropy": 0.060334295779466626, "epoch": 7.517542837160509, "grad_norm": 2.140625, "learning_rate": 4.797990295266807e-05, "loss": 0.0151, "mean_token_accuracy": 0.9957007765769958, "num_tokens": 99220548.0, "step": 32250 }, { "entropy": 0.045729524735361335, "epoch": 7.51870847418114, "grad_norm": 0.220703125, "learning_rate": 4.797907087910947e-05, "loss": 0.0054, "mean_token_accuracy": 0.9980747818946838, "num_tokens": 99249556.0, "step": 32255 }, { "entropy": 0.06361297219991684, "epoch": 7.519874111201772, "grad_norm": 0.96875, "learning_rate": 4.797823864929216e-05, "loss": 0.0084, "mean_token_accuracy": 0.9971418261528016, "num_tokens": 99260254.0, "step": 32260 }, { "entropy": 0.06000940101221204, "epoch": 7.521039748222403, "grad_norm": 0.73046875, "learning_rate": 4.7977406263228555e-05, "loss": 0.0043, "mean_token_accuracy": 0.9969924569129944, "num_tokens": 99282455.0, "step": 32265 }, { "entropy": 0.06560409255325794, "epoch": 7.522205385243035, "grad_norm": 2.421875, "learning_rate": 4.797657372093106e-05, "loss": 0.0213, "mean_token_accuracy": 0.9930356025695801, "num_tokens": 99300204.0, "step": 32270 }, { "entropy": 0.07295639859512448, "epoch": 7.523371022263667, "grad_norm": 0.330078125, "learning_rate": 4.797574102241209e-05, "loss": 0.0071, "mean_token_accuracy": 0.9984568536281586, "num_tokens": 99317838.0, "step": 32275 }, { "entropy": 0.07099381582811475, "epoch": 7.524536659284299, "grad_norm": 3.53125, "learning_rate": 4.797490816768407e-05, "loss": 0.0134, "mean_token_accuracy": 0.9947207570075989, "num_tokens": 99348614.0, "step": 32280 }, { "entropy": 0.0586695110425353, "epoch": 7.525702296304931, "grad_norm": 3.53125, "learning_rate": 4.7974075156759415e-05, "loss": 0.0112, "mean_token_accuracy": 0.9969553589820862, "num_tokens": 99363903.0, "step": 32285 }, { "entropy": 0.06191801391541958, "epoch": 7.526867933325562, "grad_norm": 0.337890625, "learning_rate": 4.797324198965055e-05, "loss": 0.0088, "mean_token_accuracy": 0.9979340076446533, "num_tokens": 99392391.0, "step": 32290 }, { "entropy": 0.07315607350319624, "epoch": 7.528033570346194, "grad_norm": 2.28125, "learning_rate": 4.797240866636988e-05, "loss": 0.0276, "mean_token_accuracy": 0.990977531671524, "num_tokens": 99403416.0, "step": 32295 }, { "entropy": 0.06841037645936013, "epoch": 7.529199207366826, "grad_norm": 2.3125, "learning_rate": 4.797157518692986e-05, "loss": 0.0204, "mean_token_accuracy": 0.9952999293804169, "num_tokens": 99414810.0, "step": 32300 }, { "entropy": 0.05617341762408614, "epoch": 7.530364844387458, "grad_norm": 1.4375, "learning_rate": 4.79707415513429e-05, "loss": 0.0095, "mean_token_accuracy": 0.9951882123947143, "num_tokens": 99442828.0, "step": 32305 }, { "entropy": 0.06965567506849765, "epoch": 7.5315304814080895, "grad_norm": 1.109375, "learning_rate": 4.796990775962143e-05, "loss": 0.023, "mean_token_accuracy": 0.9933463513851166, "num_tokens": 99451675.0, "step": 32310 }, { "entropy": 0.07169121066108346, "epoch": 7.5326961184287216, "grad_norm": 0.31640625, "learning_rate": 4.7969073811777885e-05, "loss": 0.0132, "mean_token_accuracy": 0.9940380156040192, "num_tokens": 99477402.0, "step": 32315 }, { "entropy": 0.05028648842126131, "epoch": 7.533861755449353, "grad_norm": 0.34375, "learning_rate": 4.7968239707824715e-05, "loss": 0.0135, "mean_token_accuracy": 0.996212112903595, "num_tokens": 99498726.0, "step": 32320 }, { "entropy": 0.09644905207678675, "epoch": 7.535027392469985, "grad_norm": 2.15625, "learning_rate": 4.796740544777433e-05, "loss": 0.0125, "mean_token_accuracy": 0.9962563216686249, "num_tokens": 99516871.0, "step": 32325 }, { "entropy": 0.06047167964279652, "epoch": 7.536193029490617, "grad_norm": 0.6328125, "learning_rate": 4.7966571031639205e-05, "loss": 0.0082, "mean_token_accuracy": 0.9959788024425507, "num_tokens": 99547359.0, "step": 32330 }, { "entropy": 0.07342666406184435, "epoch": 7.537358666511248, "grad_norm": 0.54296875, "learning_rate": 4.796573645943175e-05, "loss": 0.0063, "mean_token_accuracy": 0.9982951283454895, "num_tokens": 99567827.0, "step": 32335 }, { "entropy": 0.07892006533220411, "epoch": 7.53852430353188, "grad_norm": 1.3515625, "learning_rate": 4.7964901731164425e-05, "loss": 0.005, "mean_token_accuracy": 0.9984602570533753, "num_tokens": 99590514.0, "step": 32340 }, { "entropy": 0.05502187833189964, "epoch": 7.5396899405525115, "grad_norm": 0.84765625, "learning_rate": 4.7964066846849674e-05, "loss": 0.0184, "mean_token_accuracy": 0.995114940404892, "num_tokens": 99607407.0, "step": 32345 }, { "entropy": 0.061845211498439315, "epoch": 7.540855577573144, "grad_norm": 0.671875, "learning_rate": 4.796323180649995e-05, "loss": 0.0053, "mean_token_accuracy": 0.9959377884864807, "num_tokens": 99629271.0, "step": 32350 }, { "entropy": 0.06737384842708707, "epoch": 7.542021214593776, "grad_norm": 0.373046875, "learning_rate": 4.79623966101277e-05, "loss": 0.0075, "mean_token_accuracy": 0.9972203969955444, "num_tokens": 99649342.0, "step": 32355 }, { "entropy": 0.06474924013018608, "epoch": 7.543186851614407, "grad_norm": 0.7734375, "learning_rate": 4.796156125774538e-05, "loss": 0.0147, "mean_token_accuracy": 0.9958956420421601, "num_tokens": 99664635.0, "step": 32360 }, { "entropy": 0.05531628727912903, "epoch": 7.544352488635039, "grad_norm": 0.5078125, "learning_rate": 4.796072574936545e-05, "loss": 0.0131, "mean_token_accuracy": 0.994882071018219, "num_tokens": 99692508.0, "step": 32365 }, { "entropy": 0.07443120554089547, "epoch": 7.545518125655671, "grad_norm": 2.265625, "learning_rate": 4.795989008500037e-05, "loss": 0.0348, "mean_token_accuracy": 0.9899700045585632, "num_tokens": 99714721.0, "step": 32370 }, { "entropy": 0.07098412998020649, "epoch": 7.546683762676302, "grad_norm": 4.90625, "learning_rate": 4.795905426466259e-05, "loss": 0.0211, "mean_token_accuracy": 0.9950291216373444, "num_tokens": 99732835.0, "step": 32375 }, { "entropy": 0.050486131198704244, "epoch": 7.5478493996969345, "grad_norm": 0.5859375, "learning_rate": 4.7958218288364574e-05, "loss": 0.0097, "mean_token_accuracy": 0.9978505671024323, "num_tokens": 99756728.0, "step": 32380 }, { "entropy": 0.06294616991654038, "epoch": 7.549015036717567, "grad_norm": 0.9609375, "learning_rate": 4.7957382156118804e-05, "loss": 0.0109, "mean_token_accuracy": 0.9962819039821624, "num_tokens": 99779085.0, "step": 32385 }, { "entropy": 0.049626897927373646, "epoch": 7.550180673738198, "grad_norm": 0.1328125, "learning_rate": 4.795654586793773e-05, "loss": 0.0052, "mean_token_accuracy": 0.9979463160037995, "num_tokens": 99816126.0, "step": 32390 }, { "entropy": 0.06566182002425194, "epoch": 7.55134631075883, "grad_norm": 2.703125, "learning_rate": 4.795570942383383e-05, "loss": 0.022, "mean_token_accuracy": 0.9944956481456757, "num_tokens": 99828398.0, "step": 32395 }, { "entropy": 0.06577919237315655, "epoch": 7.552511947779461, "grad_norm": 1.2890625, "learning_rate": 4.7954872823819576e-05, "loss": 0.0107, "mean_token_accuracy": 0.9972432315349579, "num_tokens": 99838581.0, "step": 32400 }, { "entropy": 0.18690302036702633, "epoch": 7.553677584800093, "grad_norm": 5.34375, "learning_rate": 4.795403606790745e-05, "loss": 0.2838, "mean_token_accuracy": 0.9480214655399323, "num_tokens": 99856832.0, "step": 32405 }, { "entropy": 0.08401225432753563, "epoch": 7.554843221820725, "grad_norm": 0.78125, "learning_rate": 4.795319915610991e-05, "loss": 0.0166, "mean_token_accuracy": 0.9959981799125671, "num_tokens": 99881690.0, "step": 32410 }, { "entropy": 0.08292626310139894, "epoch": 7.5560088588413565, "grad_norm": 0.89453125, "learning_rate": 4.7952362088439463e-05, "loss": 0.0102, "mean_token_accuracy": 0.9968954145908355, "num_tokens": 99901095.0, "step": 32415 }, { "entropy": 0.07522573880851269, "epoch": 7.557174495861989, "grad_norm": 3.515625, "learning_rate": 4.7951524864908563e-05, "loss": 0.0231, "mean_token_accuracy": 0.9964980185031891, "num_tokens": 99910958.0, "step": 32420 }, { "entropy": 0.07450153809040785, "epoch": 7.55834013288262, "grad_norm": 1.1796875, "learning_rate": 4.795068748552971e-05, "loss": 0.0196, "mean_token_accuracy": 0.9937357306480408, "num_tokens": 99922733.0, "step": 32425 }, { "entropy": 0.05930988015606999, "epoch": 7.559505769903252, "grad_norm": 0.33984375, "learning_rate": 4.7949849950315395e-05, "loss": 0.0081, "mean_token_accuracy": 0.9973505437374115, "num_tokens": 99956843.0, "step": 32430 }, { "entropy": 0.06503955796360969, "epoch": 7.560671406923884, "grad_norm": 0.84765625, "learning_rate": 4.79490122592781e-05, "loss": 0.0134, "mean_token_accuracy": 0.9955211043357849, "num_tokens": 99980564.0, "step": 32435 }, { "entropy": 0.06906016170978546, "epoch": 7.561837043944516, "grad_norm": 0.4296875, "learning_rate": 4.794817441243031e-05, "loss": 0.0108, "mean_token_accuracy": 0.9956372857093811, "num_tokens": 100001928.0, "step": 32440 }, { "entropy": 0.08328457288444042, "epoch": 7.563002680965147, "grad_norm": 2.1875, "learning_rate": 4.7947336409784524e-05, "loss": 0.016, "mean_token_accuracy": 0.9953269362449646, "num_tokens": 100010956.0, "step": 32445 }, { "entropy": 0.05307028675451875, "epoch": 7.5641683179857795, "grad_norm": 1.6015625, "learning_rate": 4.7946498251353246e-05, "loss": 0.0106, "mean_token_accuracy": 0.9949360966682435, "num_tokens": 100035076.0, "step": 32450 }, { "entropy": 0.0713282000273466, "epoch": 7.565333955006411, "grad_norm": 1.8828125, "learning_rate": 4.794565993714896e-05, "loss": 0.0202, "mean_token_accuracy": 0.9958377063274384, "num_tokens": 100052207.0, "step": 32455 }, { "entropy": 0.06920521166175604, "epoch": 7.566499592027043, "grad_norm": 0.400390625, "learning_rate": 4.794482146718417e-05, "loss": 0.0217, "mean_token_accuracy": 0.9941867649555206, "num_tokens": 100078093.0, "step": 32460 }, { "entropy": 0.05242059859447181, "epoch": 7.567665229047675, "grad_norm": 2.984375, "learning_rate": 4.794398284147139e-05, "loss": 0.0095, "mean_token_accuracy": 0.9971729397773743, "num_tokens": 100106656.0, "step": 32465 }, { "entropy": 0.06839284533634782, "epoch": 7.568830866068306, "grad_norm": 1.7109375, "learning_rate": 4.794314406002311e-05, "loss": 0.0188, "mean_token_accuracy": 0.9950945854187012, "num_tokens": 100125560.0, "step": 32470 }, { "entropy": 0.09131706580519676, "epoch": 7.569996503088938, "grad_norm": 1.2578125, "learning_rate": 4.7942305122851846e-05, "loss": 0.0125, "mean_token_accuracy": 0.997002649307251, "num_tokens": 100135364.0, "step": 32475 }, { "entropy": 0.05984076950699091, "epoch": 7.5711621401095694, "grad_norm": 0.310546875, "learning_rate": 4.794146602997011e-05, "loss": 0.0106, "mean_token_accuracy": 0.9975899815559387, "num_tokens": 100151513.0, "step": 32480 }, { "entropy": 0.08737356476485729, "epoch": 7.5723277771302016, "grad_norm": 1.625, "learning_rate": 4.794062678139041e-05, "loss": 0.0134, "mean_token_accuracy": 0.9978051900863647, "num_tokens": 100160893.0, "step": 32485 }, { "entropy": 0.07065956890583039, "epoch": 7.573493414150834, "grad_norm": 2.953125, "learning_rate": 4.793978737712526e-05, "loss": 0.0174, "mean_token_accuracy": 0.9937400400638581, "num_tokens": 100172491.0, "step": 32490 }, { "entropy": 0.05652084918692708, "epoch": 7.574659051171465, "grad_norm": 0.291015625, "learning_rate": 4.793894781718718e-05, "loss": 0.0177, "mean_token_accuracy": 0.9928201317787171, "num_tokens": 100195349.0, "step": 32495 }, { "entropy": 0.07366820741444827, "epoch": 7.575824688192097, "grad_norm": 0.61328125, "learning_rate": 4.793810810158868e-05, "loss": 0.008, "mean_token_accuracy": 0.9973739683628082, "num_tokens": 100217402.0, "step": 32500 }, { "entropy": 0.048085775785148145, "epoch": 7.576990325212729, "grad_norm": 1.5234375, "learning_rate": 4.79372682303423e-05, "loss": 0.0116, "mean_token_accuracy": 0.9953145444393158, "num_tokens": 100248272.0, "step": 32505 }, { "entropy": 0.05929643930867314, "epoch": 7.57815596223336, "grad_norm": 0.302734375, "learning_rate": 4.793642820346055e-05, "loss": 0.0134, "mean_token_accuracy": 0.9968596279621125, "num_tokens": 100268834.0, "step": 32510 }, { "entropy": 0.05762045104056597, "epoch": 7.579321599253992, "grad_norm": 0.515625, "learning_rate": 4.793558802095595e-05, "loss": 0.009, "mean_token_accuracy": 0.9958757042884827, "num_tokens": 100292134.0, "step": 32515 }, { "entropy": 0.0705728504806757, "epoch": 7.5804872362746245, "grad_norm": 0.28515625, "learning_rate": 4.793474768284104e-05, "loss": 0.0187, "mean_token_accuracy": 0.9952540934085846, "num_tokens": 100302583.0, "step": 32520 }, { "entropy": 0.0648424707353115, "epoch": 7.581652873295256, "grad_norm": 1.0234375, "learning_rate": 4.7933907189128353e-05, "loss": 0.0076, "mean_token_accuracy": 0.9974114298820496, "num_tokens": 100318585.0, "step": 32525 }, { "entropy": 0.07678204700350762, "epoch": 7.582818510315888, "grad_norm": 1.328125, "learning_rate": 4.7933066539830405e-05, "loss": 0.0094, "mean_token_accuracy": 0.9962783277034759, "num_tokens": 100330717.0, "step": 32530 }, { "entropy": 0.05130081316456199, "epoch": 7.583984147336519, "grad_norm": 0.375, "learning_rate": 4.793222573495975e-05, "loss": 0.0082, "mean_token_accuracy": 0.9958062887191772, "num_tokens": 100363015.0, "step": 32535 }, { "entropy": 0.06532498020678759, "epoch": 7.585149784357151, "grad_norm": 0.9765625, "learning_rate": 4.793138477452892e-05, "loss": 0.008, "mean_token_accuracy": 0.9959288775920868, "num_tokens": 100385936.0, "step": 32540 }, { "entropy": 0.04017478302121162, "epoch": 7.586315421377783, "grad_norm": 0.625, "learning_rate": 4.793054365855045e-05, "loss": 0.0064, "mean_token_accuracy": 0.9981935679912567, "num_tokens": 100422413.0, "step": 32545 }, { "entropy": 0.051121273264288905, "epoch": 7.5874810583984145, "grad_norm": 1.4140625, "learning_rate": 4.792970238703689e-05, "loss": 0.0052, "mean_token_accuracy": 0.9979244947433472, "num_tokens": 100452225.0, "step": 32550 }, { "entropy": 0.07465707622468472, "epoch": 7.588646695419047, "grad_norm": 0.416015625, "learning_rate": 4.7928860960000774e-05, "loss": 0.0106, "mean_token_accuracy": 0.9957206606864929, "num_tokens": 100464424.0, "step": 32555 }, { "entropy": 0.0724950728006661, "epoch": 7.589812332439678, "grad_norm": 2.203125, "learning_rate": 4.792801937745466e-05, "loss": 0.012, "mean_token_accuracy": 0.9970667362213135, "num_tokens": 100485939.0, "step": 32560 }, { "entropy": 0.05878008343279362, "epoch": 7.59097796946031, "grad_norm": 5.03125, "learning_rate": 4.792717763941109e-05, "loss": 0.0186, "mean_token_accuracy": 0.9959011018276215, "num_tokens": 100502937.0, "step": 32565 }, { "entropy": 0.07744065094739198, "epoch": 7.592143606480942, "grad_norm": 1.625, "learning_rate": 4.7926335745882615e-05, "loss": 0.0157, "mean_token_accuracy": 0.9964656233787537, "num_tokens": 100521009.0, "step": 32570 }, { "entropy": 0.04522500513121486, "epoch": 7.593309243501574, "grad_norm": 0.56640625, "learning_rate": 4.7925493696881797e-05, "loss": 0.0072, "mean_token_accuracy": 0.998151433467865, "num_tokens": 100562959.0, "step": 32575 }, { "entropy": 0.05111492648720741, "epoch": 7.594474880522205, "grad_norm": 0.609375, "learning_rate": 4.7924651492421186e-05, "loss": 0.0064, "mean_token_accuracy": 0.9987885475158691, "num_tokens": 100585745.0, "step": 32580 }, { "entropy": 0.06261113192886114, "epoch": 7.595640517542837, "grad_norm": 0.248046875, "learning_rate": 4.792380913251334e-05, "loss": 0.0114, "mean_token_accuracy": 0.9969954133033753, "num_tokens": 100605109.0, "step": 32585 }, { "entropy": 0.07228537742048502, "epoch": 7.596806154563469, "grad_norm": 1.640625, "learning_rate": 4.792296661717082e-05, "loss": 0.0086, "mean_token_accuracy": 0.9956452965736389, "num_tokens": 100625700.0, "step": 32590 }, { "entropy": 0.060614088317379355, "epoch": 7.597971791584101, "grad_norm": 2.703125, "learning_rate": 4.792212394640619e-05, "loss": 0.0185, "mean_token_accuracy": 0.994023448228836, "num_tokens": 100645065.0, "step": 32595 }, { "entropy": 0.05851653721183538, "epoch": 7.599137428604733, "grad_norm": 0.78515625, "learning_rate": 4.792128112023203e-05, "loss": 0.0117, "mean_token_accuracy": 0.997266286611557, "num_tokens": 100663742.0, "step": 32600 }, { "entropy": 0.054263474978506566, "epoch": 7.600303065625364, "grad_norm": 0.359375, "learning_rate": 4.7920438138660886e-05, "loss": 0.0077, "mean_token_accuracy": 0.9973829567432404, "num_tokens": 100697972.0, "step": 32605 }, { "entropy": 0.04637954393401742, "epoch": 7.601468702645996, "grad_norm": 1.140625, "learning_rate": 4.791959500170533e-05, "loss": 0.0073, "mean_token_accuracy": 0.9971019864082337, "num_tokens": 100734309.0, "step": 32610 }, { "entropy": 0.08320282809436322, "epoch": 7.602634339666627, "grad_norm": 1.2109375, "learning_rate": 4.791875170937794e-05, "loss": 0.0132, "mean_token_accuracy": 0.9958483457565308, "num_tokens": 100744170.0, "step": 32615 }, { "entropy": 0.06456623747944831, "epoch": 7.6037999766872595, "grad_norm": 2.46875, "learning_rate": 4.7917908261691296e-05, "loss": 0.0147, "mean_token_accuracy": 0.9935942471027375, "num_tokens": 100765054.0, "step": 32620 }, { "entropy": 0.06546109467744828, "epoch": 7.604965613707892, "grad_norm": 0.45703125, "learning_rate": 4.7917064658657974e-05, "loss": 0.0055, "mean_token_accuracy": 0.9976051330566407, "num_tokens": 100780513.0, "step": 32625 }, { "entropy": 0.05621226644143462, "epoch": 7.606131250728523, "grad_norm": 0.515625, "learning_rate": 4.7916220900290545e-05, "loss": 0.011, "mean_token_accuracy": 0.9961635172367096, "num_tokens": 100798992.0, "step": 32630 }, { "entropy": 0.07289885049685836, "epoch": 7.607296887749155, "grad_norm": 1.578125, "learning_rate": 4.7915376986601595e-05, "loss": 0.0132, "mean_token_accuracy": 0.9941843330860138, "num_tokens": 100821013.0, "step": 32635 }, { "entropy": 0.0671339999884367, "epoch": 7.608462524769787, "grad_norm": 2.71875, "learning_rate": 4.791453291760371e-05, "loss": 0.024, "mean_token_accuracy": 0.9942957162857056, "num_tokens": 100831114.0, "step": 32640 }, { "entropy": 0.04521031361073256, "epoch": 7.609628161790418, "grad_norm": 1.609375, "learning_rate": 4.791368869330948e-05, "loss": 0.0099, "mean_token_accuracy": 0.9969500958919525, "num_tokens": 100865685.0, "step": 32645 }, { "entropy": 0.06600057780742645, "epoch": 7.61079379881105, "grad_norm": 0.828125, "learning_rate": 4.791284431373148e-05, "loss": 0.0078, "mean_token_accuracy": 0.9975499629974365, "num_tokens": 100887627.0, "step": 32650 }, { "entropy": 0.06534403078258037, "epoch": 7.611959435831682, "grad_norm": 3.0, "learning_rate": 4.791199977888231e-05, "loss": 0.017, "mean_token_accuracy": 0.9944500923156738, "num_tokens": 100900901.0, "step": 32655 }, { "entropy": 0.07990739308297634, "epoch": 7.613125072852314, "grad_norm": 1.7890625, "learning_rate": 4.791115508877457e-05, "loss": 0.0136, "mean_token_accuracy": 0.9956984043121337, "num_tokens": 100911816.0, "step": 32660 }, { "entropy": 0.05803863704204559, "epoch": 7.614290709872946, "grad_norm": 0.443359375, "learning_rate": 4.7910310243420845e-05, "loss": 0.0225, "mean_token_accuracy": 0.9933572173118591, "num_tokens": 100943353.0, "step": 32665 }, { "entropy": 0.05729225566610694, "epoch": 7.615456346893577, "grad_norm": 0.419921875, "learning_rate": 4.790946524283373e-05, "loss": 0.0092, "mean_token_accuracy": 0.9972775459289551, "num_tokens": 100969851.0, "step": 32670 }, { "entropy": 0.05942230150103569, "epoch": 7.616621983914209, "grad_norm": 1.265625, "learning_rate": 4.790862008702584e-05, "loss": 0.0166, "mean_token_accuracy": 0.9958006918430329, "num_tokens": 100985383.0, "step": 32675 }, { "entropy": 0.05730917723849416, "epoch": 7.617787620934841, "grad_norm": 1.9765625, "learning_rate": 4.790777477600976e-05, "loss": 0.0119, "mean_token_accuracy": 0.9960092306137085, "num_tokens": 101016757.0, "step": 32680 }, { "entropy": 0.0662191977724433, "epoch": 7.618953257955472, "grad_norm": 0.8125, "learning_rate": 4.790692930979811e-05, "loss": 0.0171, "mean_token_accuracy": 0.993461686372757, "num_tokens": 101029341.0, "step": 32685 }, { "entropy": 0.05978534407913685, "epoch": 7.6201188949761045, "grad_norm": 3.21875, "learning_rate": 4.790608368840349e-05, "loss": 0.0207, "mean_token_accuracy": 0.9946033835411072, "num_tokens": 101042189.0, "step": 32690 }, { "entropy": 0.08330644629895687, "epoch": 7.621284531996736, "grad_norm": 1.0703125, "learning_rate": 4.790523791183852e-05, "loss": 0.0099, "mean_token_accuracy": 0.9977569222450257, "num_tokens": 101053521.0, "step": 32695 }, { "entropy": 0.07074883468449115, "epoch": 7.622450169017368, "grad_norm": 2.953125, "learning_rate": 4.7904391980115786e-05, "loss": 0.0182, "mean_token_accuracy": 0.9951606273651123, "num_tokens": 101068673.0, "step": 32700 }, { "entropy": 0.06495154052972793, "epoch": 7.623615806038, "grad_norm": 2.046875, "learning_rate": 4.790354589324793e-05, "loss": 0.0102, "mean_token_accuracy": 0.995569896697998, "num_tokens": 101091515.0, "step": 32705 }, { "entropy": 0.06703095585107803, "epoch": 7.624781443058632, "grad_norm": 2.03125, "learning_rate": 4.790269965124755e-05, "loss": 0.0184, "mean_token_accuracy": 0.9955197989940643, "num_tokens": 101111464.0, "step": 32710 }, { "entropy": 0.11674296110868454, "epoch": 7.625947080079263, "grad_norm": 6.34375, "learning_rate": 4.790185325412728e-05, "loss": 0.0926, "mean_token_accuracy": 0.987501859664917, "num_tokens": 101134805.0, "step": 32715 }, { "entropy": 0.09656250309199095, "epoch": 7.627112717099895, "grad_norm": 2.25, "learning_rate": 4.7901006701899724e-05, "loss": 0.0162, "mean_token_accuracy": 0.9959886252880097, "num_tokens": 101155209.0, "step": 32720 }, { "entropy": 0.06006800103932619, "epoch": 7.628278354120527, "grad_norm": 2.796875, "learning_rate": 4.790015999457752e-05, "loss": 0.0085, "mean_token_accuracy": 0.9979590594768524, "num_tokens": 101173279.0, "step": 32725 }, { "entropy": 0.0692501813173294, "epoch": 7.629443991141159, "grad_norm": 0.5859375, "learning_rate": 4.7899313132173284e-05, "loss": 0.0085, "mean_token_accuracy": 0.9972864508628845, "num_tokens": 101199392.0, "step": 32730 }, { "entropy": 0.23568055331707, "epoch": 7.630609628161791, "grad_norm": 2.171875, "learning_rate": 4.789846611469965e-05, "loss": 0.2858, "mean_token_accuracy": 0.9683476388454437, "num_tokens": 101219041.0, "step": 32735 }, { "entropy": 0.07892981367185711, "epoch": 7.631775265182422, "grad_norm": 0.953125, "learning_rate": 4.7897618942169245e-05, "loss": 0.0115, "mean_token_accuracy": 0.9962479889392852, "num_tokens": 101240090.0, "step": 32740 }, { "entropy": 0.0884470010176301, "epoch": 7.632940902203054, "grad_norm": 2.90625, "learning_rate": 4.7896771614594705e-05, "loss": 0.0073, "mean_token_accuracy": 0.9975970149040222, "num_tokens": 101255685.0, "step": 32745 }, { "entropy": 0.052130958810448644, "epoch": 7.634106539223685, "grad_norm": 0.32421875, "learning_rate": 4.7895924131988654e-05, "loss": 0.0062, "mean_token_accuracy": 0.997983181476593, "num_tokens": 101292950.0, "step": 32750 }, { "entropy": 0.0583219145424664, "epoch": 7.635272176244317, "grad_norm": 1.125, "learning_rate": 4.789507649436374e-05, "loss": 0.0072, "mean_token_accuracy": 0.9973702132701874, "num_tokens": 101318481.0, "step": 32755 }, { "entropy": 0.06547697465866804, "epoch": 7.6364378132649495, "grad_norm": 1.953125, "learning_rate": 4.7894228701732613e-05, "loss": 0.0129, "mean_token_accuracy": 0.9954418540000916, "num_tokens": 101339941.0, "step": 32760 }, { "entropy": 0.11741905976086855, "epoch": 7.637603450285581, "grad_norm": 0.93359375, "learning_rate": 4.789338075410789e-05, "loss": 0.0696, "mean_token_accuracy": 0.9896807491779327, "num_tokens": 101366743.0, "step": 32765 }, { "entropy": 0.06383249973878265, "epoch": 7.638769087306213, "grad_norm": 3.375, "learning_rate": 4.789253265150223e-05, "loss": 0.008, "mean_token_accuracy": 0.9968828380107879, "num_tokens": 101385178.0, "step": 32770 }, { "entropy": 0.06623956738039852, "epoch": 7.639934724326845, "grad_norm": 1.2109375, "learning_rate": 4.789168439392828e-05, "loss": 0.0125, "mean_token_accuracy": 0.9964322030544281, "num_tokens": 101400186.0, "step": 32775 }, { "entropy": 0.0739117719233036, "epoch": 7.641100361347476, "grad_norm": 0.71875, "learning_rate": 4.7890835981398686e-05, "loss": 0.0125, "mean_token_accuracy": 0.9961394846439362, "num_tokens": 101414902.0, "step": 32780 }, { "entropy": 0.05914564449340105, "epoch": 7.642265998368108, "grad_norm": 0.46875, "learning_rate": 4.78899874139261e-05, "loss": 0.0148, "mean_token_accuracy": 0.9944905817508698, "num_tokens": 101434462.0, "step": 32785 }, { "entropy": 0.06933594457805156, "epoch": 7.64343163538874, "grad_norm": 0.9453125, "learning_rate": 4.7889138691523166e-05, "loss": 0.009, "mean_token_accuracy": 0.9975636959075928, "num_tokens": 101446358.0, "step": 32790 }, { "entropy": 0.08477734699845314, "epoch": 7.644597272409372, "grad_norm": 1.0234375, "learning_rate": 4.788828981420255e-05, "loss": 0.0117, "mean_token_accuracy": 0.9955092072486877, "num_tokens": 101456184.0, "step": 32795 }, { "entropy": 0.058824803587049244, "epoch": 7.645762909430004, "grad_norm": 2.234375, "learning_rate": 4.7887440781976915e-05, "loss": 0.0124, "mean_token_accuracy": 0.9953935146331787, "num_tokens": 101488985.0, "step": 32800 }, { "entropy": 0.05272114276885986, "epoch": 7.646928546450635, "grad_norm": 1.3828125, "learning_rate": 4.788659159485891e-05, "loss": 0.0138, "mean_token_accuracy": 0.9938934504985809, "num_tokens": 101507299.0, "step": 32805 }, { "entropy": 0.0796303316950798, "epoch": 7.648094183471267, "grad_norm": 3.28125, "learning_rate": 4.7885742252861205e-05, "loss": 0.0185, "mean_token_accuracy": 0.9956585705280304, "num_tokens": 101518078.0, "step": 32810 }, { "entropy": 0.07179453428834677, "epoch": 7.649259820491899, "grad_norm": 3.34375, "learning_rate": 4.788489275599646e-05, "loss": 0.0091, "mean_token_accuracy": 0.9973179042339325, "num_tokens": 101540068.0, "step": 32815 }, { "entropy": 0.05889113489538431, "epoch": 7.65042545751253, "grad_norm": 3.328125, "learning_rate": 4.788404310427734e-05, "loss": 0.0128, "mean_token_accuracy": 0.9952363848686219, "num_tokens": 101560366.0, "step": 32820 }, { "entropy": 0.06677535912021995, "epoch": 7.651591094533162, "grad_norm": 0.37109375, "learning_rate": 4.788319329771652e-05, "loss": 0.006, "mean_token_accuracy": 0.9987049341201782, "num_tokens": 101578158.0, "step": 32825 }, { "entropy": 0.06673195157200099, "epoch": 7.652756731553794, "grad_norm": 0.6015625, "learning_rate": 4.7882343336326675e-05, "loss": 0.0122, "mean_token_accuracy": 0.9957050621509552, "num_tokens": 101590083.0, "step": 32830 }, { "entropy": 0.07282912842929364, "epoch": 7.653922368574426, "grad_norm": 1.3203125, "learning_rate": 4.788149322012048e-05, "loss": 0.0196, "mean_token_accuracy": 0.994915121793747, "num_tokens": 101610498.0, "step": 32835 }, { "entropy": 0.05825249031186104, "epoch": 7.655088005595058, "grad_norm": 2.921875, "learning_rate": 4.7880642949110594e-05, "loss": 0.0069, "mean_token_accuracy": 0.9975422143936157, "num_tokens": 101631178.0, "step": 32840 }, { "entropy": 0.083549792971462, "epoch": 7.65625364261569, "grad_norm": 0.365234375, "learning_rate": 4.7879792523309715e-05, "loss": 0.0113, "mean_token_accuracy": 0.9959293603897095, "num_tokens": 101647880.0, "step": 32845 }, { "entropy": 0.07544692466035485, "epoch": 7.657419279636321, "grad_norm": 0.359375, "learning_rate": 4.787894194273052e-05, "loss": 0.0189, "mean_token_accuracy": 0.9943213880062103, "num_tokens": 101666788.0, "step": 32850 }, { "entropy": 0.06447355775162578, "epoch": 7.658584916656953, "grad_norm": 1.0, "learning_rate": 4.787809120738568e-05, "loss": 0.0074, "mean_token_accuracy": 0.9963658154010773, "num_tokens": 101694306.0, "step": 32855 }, { "entropy": 0.060479212738573554, "epoch": 7.6597505536775845, "grad_norm": 1.6171875, "learning_rate": 4.7877240317287896e-05, "loss": 0.0113, "mean_token_accuracy": 0.996816223859787, "num_tokens": 101716460.0, "step": 32860 }, { "entropy": 0.0611457291059196, "epoch": 7.660916190698217, "grad_norm": 1.671875, "learning_rate": 4.787638927244985e-05, "loss": 0.0143, "mean_token_accuracy": 0.9953365206718445, "num_tokens": 101740617.0, "step": 32865 }, { "entropy": 0.062300180457532404, "epoch": 7.662081827718849, "grad_norm": 2.515625, "learning_rate": 4.7875538072884234e-05, "loss": 0.0129, "mean_token_accuracy": 0.9958526492118835, "num_tokens": 101762005.0, "step": 32870 }, { "entropy": 0.06913834474980832, "epoch": 7.66324746473948, "grad_norm": 0.314453125, "learning_rate": 4.787468671860374e-05, "loss": 0.0125, "mean_token_accuracy": 0.995975774526596, "num_tokens": 101783436.0, "step": 32875 }, { "entropy": 0.04262585397809744, "epoch": 7.664413101760112, "grad_norm": 1.6640625, "learning_rate": 4.787383520962106e-05, "loss": 0.0111, "mean_token_accuracy": 0.9966529786586762, "num_tokens": 101803270.0, "step": 32880 }, { "entropy": 0.05681615024805069, "epoch": 7.665578738780743, "grad_norm": 2.046875, "learning_rate": 4.78729835459489e-05, "loss": 0.0196, "mean_token_accuracy": 0.9942091941833496, "num_tokens": 101819136.0, "step": 32885 }, { "entropy": 0.06326032225042581, "epoch": 7.666744375801375, "grad_norm": 0.5390625, "learning_rate": 4.787213172759995e-05, "loss": 0.0168, "mean_token_accuracy": 0.9958334505558014, "num_tokens": 101833599.0, "step": 32890 }, { "entropy": 0.07607240015640855, "epoch": 7.6679100128220075, "grad_norm": 0.546875, "learning_rate": 4.787127975458692e-05, "loss": 0.0095, "mean_token_accuracy": 0.9953685104846954, "num_tokens": 101852948.0, "step": 32895 }, { "entropy": 0.06080687679350376, "epoch": 7.669075649842639, "grad_norm": 2.140625, "learning_rate": 4.78704276269225e-05, "loss": 0.0237, "mean_token_accuracy": 0.9924868643283844, "num_tokens": 101866462.0, "step": 32900 }, { "entropy": 0.05896556191146374, "epoch": 7.670241286863271, "grad_norm": 1.8671875, "learning_rate": 4.786957534461941e-05, "loss": 0.0118, "mean_token_accuracy": 0.9962682902812958, "num_tokens": 101878867.0, "step": 32905 }, { "entropy": 0.055679400265216825, "epoch": 7.671406923883903, "grad_norm": 1.0390625, "learning_rate": 4.786872290769036e-05, "loss": 0.0185, "mean_token_accuracy": 0.9933323562145233, "num_tokens": 101891123.0, "step": 32910 }, { "entropy": 0.044333659764379266, "epoch": 7.672572560904534, "grad_norm": 0.82421875, "learning_rate": 4.786787031614804e-05, "loss": 0.009, "mean_token_accuracy": 0.996362054347992, "num_tokens": 101920179.0, "step": 32915 }, { "entropy": 0.05670885499566793, "epoch": 7.673738197925166, "grad_norm": 0.77734375, "learning_rate": 4.78670175700052e-05, "loss": 0.0109, "mean_token_accuracy": 0.9959345281124115, "num_tokens": 101944715.0, "step": 32920 }, { "entropy": 0.07925061210989952, "epoch": 7.674903834945798, "grad_norm": 0.470703125, "learning_rate": 4.7866164669274526e-05, "loss": 0.0089, "mean_token_accuracy": 0.9954618394374848, "num_tokens": 101974097.0, "step": 32925 }, { "entropy": 0.05602764692157507, "epoch": 7.6760694719664295, "grad_norm": 0.80078125, "learning_rate": 4.786531161396874e-05, "loss": 0.0167, "mean_token_accuracy": 0.9930595636367798, "num_tokens": 101991808.0, "step": 32930 }, { "entropy": 0.06822612164542079, "epoch": 7.677235108987062, "grad_norm": 0.2294921875, "learning_rate": 4.7864458404100575e-05, "loss": 0.0188, "mean_token_accuracy": 0.9941983044147491, "num_tokens": 102010298.0, "step": 32935 }, { "entropy": 0.07209388744086027, "epoch": 7.678400746007693, "grad_norm": 2.296875, "learning_rate": 4.786360503968275e-05, "loss": 0.0276, "mean_token_accuracy": 0.9932683050632477, "num_tokens": 102024598.0, "step": 32940 }, { "entropy": 0.07343168566003441, "epoch": 7.679566383028325, "grad_norm": 1.6953125, "learning_rate": 4.7862751520727976e-05, "loss": 0.0225, "mean_token_accuracy": 0.9909487366676331, "num_tokens": 102042228.0, "step": 32945 }, { "entropy": 0.06603799071162939, "epoch": 7.680732020048957, "grad_norm": 1.2578125, "learning_rate": 4.786189784724899e-05, "loss": 0.0191, "mean_token_accuracy": 0.9925129771232605, "num_tokens": 102056086.0, "step": 32950 }, { "entropy": 0.04232637556269765, "epoch": 7.681897657069588, "grad_norm": 0.30078125, "learning_rate": 4.7861044019258536e-05, "loss": 0.0048, "mean_token_accuracy": 0.9985785365104676, "num_tokens": 102088710.0, "step": 32955 }, { "entropy": 0.046622142335399984, "epoch": 7.68306329409022, "grad_norm": 0.8515625, "learning_rate": 4.786019003676931e-05, "loss": 0.0112, "mean_token_accuracy": 0.9971826553344727, "num_tokens": 102117410.0, "step": 32960 }, { "entropy": 0.04436410292983055, "epoch": 7.684228931110852, "grad_norm": 0.35546875, "learning_rate": 4.785933589979409e-05, "loss": 0.0091, "mean_token_accuracy": 0.9968259632587433, "num_tokens": 102139165.0, "step": 32965 }, { "entropy": 0.05858845524489879, "epoch": 7.685394568131484, "grad_norm": 0.8671875, "learning_rate": 4.785848160834558e-05, "loss": 0.0054, "mean_token_accuracy": 0.9990635514259338, "num_tokens": 102149919.0, "step": 32970 }, { "entropy": 0.08302914574742318, "epoch": 7.686560205152116, "grad_norm": 1.6484375, "learning_rate": 4.785762716243653e-05, "loss": 0.0106, "mean_token_accuracy": 0.9958298087120057, "num_tokens": 102162836.0, "step": 32975 }, { "entropy": 0.061361842602491376, "epoch": 7.687725842172748, "grad_norm": 0.392578125, "learning_rate": 4.7856772562079675e-05, "loss": 0.0175, "mean_token_accuracy": 0.9943647503852844, "num_tokens": 102190595.0, "step": 32980 }, { "entropy": 0.06810544840991498, "epoch": 7.688891479193379, "grad_norm": 0.69140625, "learning_rate": 4.785591780728777e-05, "loss": 0.0174, "mean_token_accuracy": 0.9955644488334656, "num_tokens": 102200770.0, "step": 32985 }, { "entropy": 0.07041889689862728, "epoch": 7.690057116214011, "grad_norm": 1.75, "learning_rate": 4.785506289807356e-05, "loss": 0.0274, "mean_token_accuracy": 0.9925129294395447, "num_tokens": 102218653.0, "step": 32990 }, { "entropy": 0.04661326901987195, "epoch": 7.691222753234642, "grad_norm": 0.291015625, "learning_rate": 4.785420783444978e-05, "loss": 0.0125, "mean_token_accuracy": 0.9967496514320373, "num_tokens": 102248406.0, "step": 32995 }, { "entropy": 0.06012960709631443, "epoch": 7.6923883902552745, "grad_norm": 3.8125, "learning_rate": 4.785335261642918e-05, "loss": 0.0259, "mean_token_accuracy": 0.9954575836658478, "num_tokens": 102264295.0, "step": 33000 }, { "entropy": 0.06518603842705488, "epoch": 7.693554027275907, "grad_norm": 1.53125, "learning_rate": 4.785249724402453e-05, "loss": 0.0142, "mean_token_accuracy": 0.9950007736682892, "num_tokens": 102283898.0, "step": 33005 }, { "entropy": 0.07277072016149759, "epoch": 7.694719664296538, "grad_norm": 0.265625, "learning_rate": 4.7851641717248574e-05, "loss": 0.0173, "mean_token_accuracy": 0.9951655566692352, "num_tokens": 102301866.0, "step": 33010 }, { "entropy": 0.04626034093089402, "epoch": 7.69588530131717, "grad_norm": 0.412109375, "learning_rate": 4.785078603611407e-05, "loss": 0.0067, "mean_token_accuracy": 0.9977597951889038, "num_tokens": 102326359.0, "step": 33015 }, { "entropy": 0.06575447116047144, "epoch": 7.697050938337801, "grad_norm": 3.203125, "learning_rate": 4.784993020063377e-05, "loss": 0.0205, "mean_token_accuracy": 0.9942839860916137, "num_tokens": 102344300.0, "step": 33020 }, { "entropy": 0.05121599268168211, "epoch": 7.698216575358433, "grad_norm": 0.1943359375, "learning_rate": 4.784907421082046e-05, "loss": 0.0076, "mean_token_accuracy": 0.9972567915916443, "num_tokens": 102369108.0, "step": 33025 }, { "entropy": 0.06364568127319217, "epoch": 7.699382212379065, "grad_norm": 0.423828125, "learning_rate": 4.784821806668688e-05, "loss": 0.0252, "mean_token_accuracy": 0.9911703109741211, "num_tokens": 102392748.0, "step": 33030 }, { "entropy": 0.08194961175322532, "epoch": 7.700547849399697, "grad_norm": 2.25, "learning_rate": 4.7847361768245804e-05, "loss": 0.0176, "mean_token_accuracy": 0.9945807337760926, "num_tokens": 102406579.0, "step": 33035 }, { "entropy": 0.05632104352116585, "epoch": 7.701713486420329, "grad_norm": 0.76171875, "learning_rate": 4.784650531550999e-05, "loss": 0.0122, "mean_token_accuracy": 0.9953259468078614, "num_tokens": 102423760.0, "step": 33040 }, { "entropy": 0.08072004318237305, "epoch": 7.702879123440961, "grad_norm": 0.345703125, "learning_rate": 4.784564870849223e-05, "loss": 0.0233, "mean_token_accuracy": 0.9934710383415222, "num_tokens": 102445159.0, "step": 33045 }, { "entropy": 0.0482025190256536, "epoch": 7.704044760461592, "grad_norm": 0.94140625, "learning_rate": 4.7844791947205295e-05, "loss": 0.0089, "mean_token_accuracy": 0.9969426095485687, "num_tokens": 102473665.0, "step": 33050 }, { "entropy": 0.04185782624408603, "epoch": 7.705210397482224, "grad_norm": 1.203125, "learning_rate": 4.7843935031661936e-05, "loss": 0.0099, "mean_token_accuracy": 0.9968387722969055, "num_tokens": 102502207.0, "step": 33055 }, { "entropy": 0.055229269759729506, "epoch": 7.706376034502856, "grad_norm": 0.240234375, "learning_rate": 4.7843077961874955e-05, "loss": 0.0173, "mean_token_accuracy": 0.9945455610752105, "num_tokens": 102522659.0, "step": 33060 }, { "entropy": 0.05512173883616924, "epoch": 7.7075416715234875, "grad_norm": 0.33203125, "learning_rate": 4.7842220737857125e-05, "loss": 0.0193, "mean_token_accuracy": 0.9933101654052734, "num_tokens": 102544870.0, "step": 33065 }, { "entropy": 0.05951900091022253, "epoch": 7.70870730854412, "grad_norm": 1.609375, "learning_rate": 4.7841363359621225e-05, "loss": 0.0197, "mean_token_accuracy": 0.9944490492343903, "num_tokens": 102556250.0, "step": 33070 }, { "entropy": 0.05257823131978512, "epoch": 7.709872945564751, "grad_norm": 0.37109375, "learning_rate": 4.784050582718005e-05, "loss": 0.0054, "mean_token_accuracy": 0.997491991519928, "num_tokens": 102598786.0, "step": 33075 }, { "entropy": 0.0675898913294077, "epoch": 7.711038582585383, "grad_norm": 2.625, "learning_rate": 4.783964814054638e-05, "loss": 0.0138, "mean_token_accuracy": 0.9958470046520234, "num_tokens": 102625008.0, "step": 33080 }, { "entropy": 0.06182341612875462, "epoch": 7.712204219606015, "grad_norm": 0.51171875, "learning_rate": 4.7838790299732996e-05, "loss": 0.0105, "mean_token_accuracy": 0.9968319714069367, "num_tokens": 102638009.0, "step": 33085 }, { "entropy": 0.04808502923697233, "epoch": 7.713369856626646, "grad_norm": 4.6875, "learning_rate": 4.78379323047527e-05, "loss": 0.0134, "mean_token_accuracy": 0.9957450985908508, "num_tokens": 102660401.0, "step": 33090 }, { "entropy": 0.06274218112230301, "epoch": 7.714535493647278, "grad_norm": 2.0625, "learning_rate": 4.783707415561829e-05, "loss": 0.0291, "mean_token_accuracy": 0.9942092418670654, "num_tokens": 102683677.0, "step": 33095 }, { "entropy": 0.07062279777601362, "epoch": 7.7157011306679095, "grad_norm": 1.3359375, "learning_rate": 4.783621585234255e-05, "loss": 0.0153, "mean_token_accuracy": 0.9945529520511627, "num_tokens": 102704196.0, "step": 33100 }, { "entropy": 0.06789557654410601, "epoch": 7.716866767688542, "grad_norm": 0.6875, "learning_rate": 4.7835357394938295e-05, "loss": 0.0274, "mean_token_accuracy": 0.9927381813526154, "num_tokens": 102718236.0, "step": 33105 }, { "entropy": 0.0625371436122805, "epoch": 7.718032404709174, "grad_norm": 1.4296875, "learning_rate": 4.7834498783418305e-05, "loss": 0.0082, "mean_token_accuracy": 0.9966192424297333, "num_tokens": 102742988.0, "step": 33110 }, { "entropy": 0.07199394702911377, "epoch": 7.719198041729806, "grad_norm": 2.84375, "learning_rate": 4.78336400177954e-05, "loss": 0.0148, "mean_token_accuracy": 0.9946137011051178, "num_tokens": 102753814.0, "step": 33115 }, { "entropy": 0.0699772285297513, "epoch": 7.720363678750437, "grad_norm": 1.9375, "learning_rate": 4.783278109808238e-05, "loss": 0.0178, "mean_token_accuracy": 0.9951143622398376, "num_tokens": 102770221.0, "step": 33120 }, { "entropy": 0.09867776576429606, "epoch": 7.721529315771069, "grad_norm": 1.59375, "learning_rate": 4.783192202429205e-05, "loss": 0.0198, "mean_token_accuracy": 0.994517570734024, "num_tokens": 102782732.0, "step": 33125 }, { "entropy": 0.107102907076478, "epoch": 7.7226949527917, "grad_norm": 6.25, "learning_rate": 4.783106279643722e-05, "loss": 0.1153, "mean_token_accuracy": 0.9768224596977234, "num_tokens": 102803666.0, "step": 33130 }, { "entropy": 0.056758302915841344, "epoch": 7.7238605898123325, "grad_norm": 0.9765625, "learning_rate": 4.783020341453071e-05, "loss": 0.0275, "mean_token_accuracy": 0.9949622571468353, "num_tokens": 102825348.0, "step": 33135 }, { "entropy": 0.057881328649818896, "epoch": 7.725026226832965, "grad_norm": 0.4921875, "learning_rate": 4.782934387858533e-05, "loss": 0.012, "mean_token_accuracy": 0.9954042911529541, "num_tokens": 102848920.0, "step": 33140 }, { "entropy": 0.04482985120266676, "epoch": 7.726191863853596, "grad_norm": 3.640625, "learning_rate": 4.7828484188613896e-05, "loss": 0.0118, "mean_token_accuracy": 0.9950476944446563, "num_tokens": 102874383.0, "step": 33145 }, { "entropy": 0.047770484909415246, "epoch": 7.727357500874228, "grad_norm": 1.9921875, "learning_rate": 4.782762434462922e-05, "loss": 0.008, "mean_token_accuracy": 0.9952480673789978, "num_tokens": 102902687.0, "step": 33150 }, { "entropy": 0.06177355572581291, "epoch": 7.728523137894859, "grad_norm": 0.228515625, "learning_rate": 4.782676434664414e-05, "loss": 0.007, "mean_token_accuracy": 0.9983032703399658, "num_tokens": 102923480.0, "step": 33155 }, { "entropy": 0.03888240284286439, "epoch": 7.729688774915491, "grad_norm": 0.5703125, "learning_rate": 4.782590419467147e-05, "loss": 0.0082, "mean_token_accuracy": 0.9970541417598724, "num_tokens": 102968286.0, "step": 33160 }, { "entropy": 0.05827982537448406, "epoch": 7.730854411936123, "grad_norm": 1.34375, "learning_rate": 4.782504388872404e-05, "loss": 0.018, "mean_token_accuracy": 0.9947967290878296, "num_tokens": 102980808.0, "step": 33165 }, { "entropy": 0.06765324827283621, "epoch": 7.7320200489567545, "grad_norm": 0.2138671875, "learning_rate": 4.7824183428814674e-05, "loss": 0.0093, "mean_token_accuracy": 0.9967410266399384, "num_tokens": 102994850.0, "step": 33170 }, { "entropy": 0.07319668047130108, "epoch": 7.733185685977387, "grad_norm": 1.4453125, "learning_rate": 4.78233228149562e-05, "loss": 0.0244, "mean_token_accuracy": 0.9916601479053497, "num_tokens": 103004716.0, "step": 33175 }, { "entropy": 0.06414531394839287, "epoch": 7.734351322998019, "grad_norm": 0.90625, "learning_rate": 4.782246204716146e-05, "loss": 0.0088, "mean_token_accuracy": 0.9972757875919342, "num_tokens": 103024745.0, "step": 33180 }, { "entropy": 0.07337948856875301, "epoch": 7.73551696001865, "grad_norm": 1.4765625, "learning_rate": 4.782160112544328e-05, "loss": 0.0204, "mean_token_accuracy": 0.9944812595844269, "num_tokens": 103043048.0, "step": 33185 }, { "entropy": 0.07922273678705097, "epoch": 7.736682597039282, "grad_norm": 0.82421875, "learning_rate": 4.7820740049814506e-05, "loss": 0.0154, "mean_token_accuracy": 0.9938981294631958, "num_tokens": 103064617.0, "step": 33190 }, { "entropy": 0.055163250956684354, "epoch": 7.737848234059914, "grad_norm": 0.65234375, "learning_rate": 4.7819878820287976e-05, "loss": 0.0087, "mean_token_accuracy": 0.9972365856170654, "num_tokens": 103092281.0, "step": 33195 }, { "entropy": 0.07809178866446018, "epoch": 7.739013871080545, "grad_norm": 2.734375, "learning_rate": 4.781901743687653e-05, "loss": 0.0176, "mean_token_accuracy": 0.993689090013504, "num_tokens": 103101742.0, "step": 33200 }, { "entropy": 0.05465510156936944, "epoch": 7.7401795081011775, "grad_norm": 1.4765625, "learning_rate": 4.7818155899593015e-05, "loss": 0.0141, "mean_token_accuracy": 0.9954380929470062, "num_tokens": 103137325.0, "step": 33205 }, { "entropy": 0.05313769178465009, "epoch": 7.741345145121809, "grad_norm": 0.5625, "learning_rate": 4.781729420845027e-05, "loss": 0.0198, "mean_token_accuracy": 0.99625164270401, "num_tokens": 103157786.0, "step": 33210 }, { "entropy": 0.07087142560631036, "epoch": 7.742510782142441, "grad_norm": 1.7421875, "learning_rate": 4.781643236346115e-05, "loss": 0.0223, "mean_token_accuracy": 0.9946645021438598, "num_tokens": 103174318.0, "step": 33215 }, { "entropy": 0.058909991011023524, "epoch": 7.743676419163073, "grad_norm": 1.75, "learning_rate": 4.781557036463852e-05, "loss": 0.0171, "mean_token_accuracy": 0.9935064792633057, "num_tokens": 103195835.0, "step": 33220 }, { "entropy": 0.06861430993303656, "epoch": 7.744842056183704, "grad_norm": 0.2255859375, "learning_rate": 4.7814708211995206e-05, "loss": 0.0162, "mean_token_accuracy": 0.995832359790802, "num_tokens": 103225740.0, "step": 33225 }, { "entropy": 0.07119888961315154, "epoch": 7.746007693204336, "grad_norm": 0.92578125, "learning_rate": 4.781384590554409e-05, "loss": 0.0195, "mean_token_accuracy": 0.9929039359092713, "num_tokens": 103240560.0, "step": 33230 }, { "entropy": 0.12398752514272929, "epoch": 7.7471733302249675, "grad_norm": 4.78125, "learning_rate": 4.781298344529801e-05, "loss": 0.146, "mean_token_accuracy": 0.9746751546859741, "num_tokens": 103269285.0, "step": 33235 }, { "entropy": 0.06968305222690105, "epoch": 7.7483389672456, "grad_norm": 2.578125, "learning_rate": 4.781212083126984e-05, "loss": 0.0212, "mean_token_accuracy": 0.9939125597476959, "num_tokens": 103278316.0, "step": 33240 }, { "entropy": 0.057888449355959895, "epoch": 7.749504604266232, "grad_norm": 4.15625, "learning_rate": 4.7811258063472433e-05, "loss": 0.0176, "mean_token_accuracy": 0.9953479826450348, "num_tokens": 103301857.0, "step": 33245 }, { "entropy": 0.05492489393800497, "epoch": 7.750670241286863, "grad_norm": 1.1640625, "learning_rate": 4.781039514191866e-05, "loss": 0.0132, "mean_token_accuracy": 0.995477843284607, "num_tokens": 103330999.0, "step": 33250 }, { "entropy": 0.05832458529621363, "epoch": 7.751835878307495, "grad_norm": 1.984375, "learning_rate": 4.7809532066621396e-05, "loss": 0.0085, "mean_token_accuracy": 0.9947230100631714, "num_tokens": 103353067.0, "step": 33255 }, { "entropy": 0.06644787210971118, "epoch": 7.753001515328127, "grad_norm": 1.5390625, "learning_rate": 4.78086688375935e-05, "loss": 0.0112, "mean_token_accuracy": 0.9967861711978913, "num_tokens": 103366334.0, "step": 33260 }, { "entropy": 0.09004889465868474, "epoch": 7.754167152348758, "grad_norm": 4.6875, "learning_rate": 4.780780545484783e-05, "loss": 0.0269, "mean_token_accuracy": 0.992378157377243, "num_tokens": 103374930.0, "step": 33265 }, { "entropy": 0.08529338352382183, "epoch": 7.75533278936939, "grad_norm": 0.70703125, "learning_rate": 4.78069419183973e-05, "loss": 0.0123, "mean_token_accuracy": 0.9963854551315308, "num_tokens": 103386330.0, "step": 33270 }, { "entropy": 0.05681054722517729, "epoch": 7.7564984263900225, "grad_norm": 0.6328125, "learning_rate": 4.780607822825475e-05, "loss": 0.0077, "mean_token_accuracy": 0.9969890892505646, "num_tokens": 103401501.0, "step": 33275 }, { "entropy": 0.07758706733584404, "epoch": 7.757664063410654, "grad_norm": 0.7421875, "learning_rate": 4.780521438443307e-05, "loss": 0.0242, "mean_token_accuracy": 0.9923904359340667, "num_tokens": 103409858.0, "step": 33280 }, { "entropy": 0.08151622787117958, "epoch": 7.758829700431286, "grad_norm": 0.54296875, "learning_rate": 4.780435038694515e-05, "loss": 0.049, "mean_token_accuracy": 0.9898855566978455, "num_tokens": 103432904.0, "step": 33285 }, { "entropy": 0.11001786850392818, "epoch": 7.759995337451917, "grad_norm": 3.5, "learning_rate": 4.780348623580387e-05, "loss": 0.0171, "mean_token_accuracy": 0.9956152141094208, "num_tokens": 103440705.0, "step": 33290 }, { "entropy": 0.05951671497896314, "epoch": 7.761160974472549, "grad_norm": 0.310546875, "learning_rate": 4.7802621931022105e-05, "loss": 0.009, "mean_token_accuracy": 0.9959829747676849, "num_tokens": 103462340.0, "step": 33295 }, { "entropy": 0.07016182951629162, "epoch": 7.762326611493181, "grad_norm": 1.2578125, "learning_rate": 4.7801757472612756e-05, "loss": 0.0148, "mean_token_accuracy": 0.9939250349998474, "num_tokens": 103472282.0, "step": 33300 }, { "entropy": 0.06460611652582884, "epoch": 7.7634922485138125, "grad_norm": 0.275390625, "learning_rate": 4.780089286058871e-05, "loss": 0.0112, "mean_token_accuracy": 0.9950837731361389, "num_tokens": 103501179.0, "step": 33305 }, { "entropy": 0.06389118535444141, "epoch": 7.764657885534445, "grad_norm": 2.09375, "learning_rate": 4.7800028094962856e-05, "loss": 0.0089, "mean_token_accuracy": 0.9971124291419983, "num_tokens": 103523931.0, "step": 33310 }, { "entropy": 0.059523440059274436, "epoch": 7.765823522555077, "grad_norm": 0.609375, "learning_rate": 4.779916317574809e-05, "loss": 0.0077, "mean_token_accuracy": 0.9953795492649078, "num_tokens": 103546158.0, "step": 33315 }, { "entropy": 0.047978917602449656, "epoch": 7.766989159575708, "grad_norm": 0.26171875, "learning_rate": 4.779829810295731e-05, "loss": 0.0112, "mean_token_accuracy": 0.9941164374351501, "num_tokens": 103591627.0, "step": 33320 }, { "entropy": 0.053133474104106425, "epoch": 7.76815479659634, "grad_norm": 0.73046875, "learning_rate": 4.7797432876603415e-05, "loss": 0.0101, "mean_token_accuracy": 0.9951233327388763, "num_tokens": 103625207.0, "step": 33325 }, { "entropy": 0.09868288524448872, "epoch": 7.769320433616972, "grad_norm": 0.33203125, "learning_rate": 4.779656749669931e-05, "loss": 0.0073, "mean_token_accuracy": 0.9969569325447083, "num_tokens": 103644136.0, "step": 33330 }, { "entropy": 0.06480435077100992, "epoch": 7.770486070637603, "grad_norm": 0.85546875, "learning_rate": 4.77957019632579e-05, "loss": 0.0132, "mean_token_accuracy": 0.997107309103012, "num_tokens": 103660756.0, "step": 33335 }, { "entropy": 0.06808798797428608, "epoch": 7.771651707658235, "grad_norm": 0.5, "learning_rate": 4.779483627629208e-05, "loss": 0.0135, "mean_token_accuracy": 0.9981209456920623, "num_tokens": 103674964.0, "step": 33340 }, { "entropy": 0.06963288504630327, "epoch": 7.772817344678867, "grad_norm": 0.33984375, "learning_rate": 4.779397043581477e-05, "loss": 0.0139, "mean_token_accuracy": 0.996001148223877, "num_tokens": 103701505.0, "step": 33345 }, { "entropy": 0.07459334554150701, "epoch": 7.773982981699499, "grad_norm": 1.765625, "learning_rate": 4.779310444183888e-05, "loss": 0.0245, "mean_token_accuracy": 0.9941957771778107, "num_tokens": 103715081.0, "step": 33350 }, { "entropy": 0.08988549634814262, "epoch": 7.775148618720131, "grad_norm": 1.71875, "learning_rate": 4.7792238294377326e-05, "loss": 0.0748, "mean_token_accuracy": 0.9865087509155274, "num_tokens": 103737406.0, "step": 33355 }, { "entropy": 0.0655953474342823, "epoch": 7.776314255740762, "grad_norm": 2.375, "learning_rate": 4.7791371993443004e-05, "loss": 0.0408, "mean_token_accuracy": 0.9900922536849975, "num_tokens": 103756899.0, "step": 33360 }, { "entropy": 0.0665599879808724, "epoch": 7.777479892761394, "grad_norm": 1.2421875, "learning_rate": 4.779050553904886e-05, "loss": 0.0272, "mean_token_accuracy": 0.9948042571544647, "num_tokens": 103776091.0, "step": 33365 }, { "entropy": 0.05449905479326844, "epoch": 7.778645529782025, "grad_norm": 0.4921875, "learning_rate": 4.778963893120779e-05, "loss": 0.0141, "mean_token_accuracy": 0.9956022024154663, "num_tokens": 103818424.0, "step": 33370 }, { "entropy": 0.05944770090281963, "epoch": 7.7798111668026575, "grad_norm": 2.125, "learning_rate": 4.7788772169932735e-05, "loss": 0.0276, "mean_token_accuracy": 0.9925519466400147, "num_tokens": 103838159.0, "step": 33375 }, { "entropy": 0.0551123920828104, "epoch": 7.78097680382329, "grad_norm": 0.7734375, "learning_rate": 4.778790525523661e-05, "loss": 0.0085, "mean_token_accuracy": 0.997332113981247, "num_tokens": 103877255.0, "step": 33380 }, { "entropy": 0.06190832667052746, "epoch": 7.782142440843921, "grad_norm": 0.216796875, "learning_rate": 4.7787038187132345e-05, "loss": 0.0143, "mean_token_accuracy": 0.9960220634937287, "num_tokens": 103901221.0, "step": 33385 }, { "entropy": 0.060510965529829264, "epoch": 7.783308077864553, "grad_norm": 1.453125, "learning_rate": 4.778617096563286e-05, "loss": 0.0146, "mean_token_accuracy": 0.9958796203136444, "num_tokens": 103934690.0, "step": 33390 }, { "entropy": 0.06902567390352488, "epoch": 7.784473714885185, "grad_norm": 0.2314453125, "learning_rate": 4.778530359075111e-05, "loss": 0.0064, "mean_token_accuracy": 0.9963374018669129, "num_tokens": 103974900.0, "step": 33395 }, { "entropy": 0.062181627936661245, "epoch": 7.785639351905816, "grad_norm": 2.96875, "learning_rate": 4.77844360625e-05, "loss": 0.027, "mean_token_accuracy": 0.9922132313251495, "num_tokens": 103998158.0, "step": 33400 }, { "entropy": 0.06246403036639094, "epoch": 7.786804988926448, "grad_norm": 3.265625, "learning_rate": 4.778356838089248e-05, "loss": 0.0206, "mean_token_accuracy": 0.9934264302253724, "num_tokens": 104010695.0, "step": 33405 }, { "entropy": 0.058712884597480294, "epoch": 7.7879706259470804, "grad_norm": 0.64453125, "learning_rate": 4.778270054594149e-05, "loss": 0.02, "mean_token_accuracy": 0.9937012672424317, "num_tokens": 104022834.0, "step": 33410 }, { "entropy": 0.06700698286294937, "epoch": 7.789136262967712, "grad_norm": 0.345703125, "learning_rate": 4.7781832557659975e-05, "loss": 0.017, "mean_token_accuracy": 0.9955753743648529, "num_tokens": 104038301.0, "step": 33415 }, { "entropy": 0.06809659153223038, "epoch": 7.790301899988344, "grad_norm": 0.59375, "learning_rate": 4.7780964416060866e-05, "loss": 0.0218, "mean_token_accuracy": 0.9940990149974823, "num_tokens": 104049127.0, "step": 33420 }, { "entropy": 0.06645800778642297, "epoch": 7.791467537008975, "grad_norm": 1.46875, "learning_rate": 4.7780096121157115e-05, "loss": 0.0292, "mean_token_accuracy": 0.9924209117889404, "num_tokens": 104069094.0, "step": 33425 }, { "entropy": 0.07412951868027448, "epoch": 7.792633174029607, "grad_norm": 0.48828125, "learning_rate": 4.777922767296167e-05, "loss": 0.0079, "mean_token_accuracy": 0.9977226316928863, "num_tokens": 104095882.0, "step": 33430 }, { "entropy": 0.059772913716733454, "epoch": 7.793798811050239, "grad_norm": 0.609375, "learning_rate": 4.777835907148748e-05, "loss": 0.0155, "mean_token_accuracy": 0.9950716972351075, "num_tokens": 104110924.0, "step": 33435 }, { "entropy": 0.09825005661696196, "epoch": 7.79496444807087, "grad_norm": 1.328125, "learning_rate": 4.77774903167475e-05, "loss": 0.0728, "mean_token_accuracy": 0.9886581361293793, "num_tokens": 104144096.0, "step": 33440 }, { "entropy": 0.04443741850554943, "epoch": 7.7961300850915025, "grad_norm": 0.255859375, "learning_rate": 4.777662140875467e-05, "loss": 0.0051, "mean_token_accuracy": 0.9977137744426727, "num_tokens": 104184663.0, "step": 33445 }, { "entropy": 0.0897174721583724, "epoch": 7.797295722112135, "grad_norm": 0.97265625, "learning_rate": 4.7775752347521965e-05, "loss": 0.0179, "mean_token_accuracy": 0.9940895915031434, "num_tokens": 104205061.0, "step": 33450 }, { "entropy": 0.0796100415289402, "epoch": 7.798461359132766, "grad_norm": 1.2890625, "learning_rate": 4.777488313306234e-05, "loss": 0.03, "mean_token_accuracy": 0.9946964383125305, "num_tokens": 104216186.0, "step": 33455 }, { "entropy": 0.07518827449530363, "epoch": 7.799626996153398, "grad_norm": 3.15625, "learning_rate": 4.7774013765388745e-05, "loss": 0.0168, "mean_token_accuracy": 0.9950263619422912, "num_tokens": 104225544.0, "step": 33460 }, { "entropy": 0.06233833208680153, "epoch": 7.80079263317403, "grad_norm": 1.40625, "learning_rate": 4.777314424451416e-05, "loss": 0.0077, "mean_token_accuracy": 0.9979846239089966, "num_tokens": 104249410.0, "step": 33465 }, { "entropy": 0.059140789741650227, "epoch": 7.801958270194661, "grad_norm": 1.453125, "learning_rate": 4.7772274570451535e-05, "loss": 0.0143, "mean_token_accuracy": 0.9955178260803222, "num_tokens": 104286115.0, "step": 33470 }, { "entropy": 0.06378463245928287, "epoch": 7.803123907215293, "grad_norm": 2.203125, "learning_rate": 4.777140474321385e-05, "loss": 0.0189, "mean_token_accuracy": 0.9947424292564392, "num_tokens": 104319276.0, "step": 33475 }, { "entropy": 0.053663753625005486, "epoch": 7.804289544235925, "grad_norm": 0.392578125, "learning_rate": 4.777053476281407e-05, "loss": 0.0083, "mean_token_accuracy": 0.9964630961418152, "num_tokens": 104339566.0, "step": 33480 }, { "entropy": 0.07534630130976439, "epoch": 7.805455181256557, "grad_norm": 1.8984375, "learning_rate": 4.7769664629265174e-05, "loss": 0.0159, "mean_token_accuracy": 0.9969941914081574, "num_tokens": 104354412.0, "step": 33485 }, { "entropy": 0.06900246925652027, "epoch": 7.806620818277189, "grad_norm": 1.640625, "learning_rate": 4.7768794342580124e-05, "loss": 0.0274, "mean_token_accuracy": 0.9949246644973755, "num_tokens": 104363262.0, "step": 33490 }, { "entropy": 0.06793273855000734, "epoch": 7.80778645529782, "grad_norm": 0.6953125, "learning_rate": 4.776792390277191e-05, "loss": 0.008, "mean_token_accuracy": 0.9973601698875427, "num_tokens": 104384275.0, "step": 33495 }, { "entropy": 0.06260851919651031, "epoch": 7.808952092318452, "grad_norm": 2.203125, "learning_rate": 4.776705330985351e-05, "loss": 0.0123, "mean_token_accuracy": 0.9972707688808441, "num_tokens": 104404175.0, "step": 33500 }, { "entropy": 0.06454886039718985, "epoch": 7.810117729339083, "grad_norm": 1.0546875, "learning_rate": 4.77661825638379e-05, "loss": 0.0158, "mean_token_accuracy": 0.9947829186916352, "num_tokens": 104423332.0, "step": 33505 }, { "entropy": 0.037727932911366224, "epoch": 7.811283366359715, "grad_norm": 1.140625, "learning_rate": 4.7765311664738065e-05, "loss": 0.0053, "mean_token_accuracy": 0.997829121351242, "num_tokens": 104457532.0, "step": 33510 }, { "entropy": 0.06598635371774435, "epoch": 7.8124490033803475, "grad_norm": 1.4296875, "learning_rate": 4.776444061256699e-05, "loss": 0.0187, "mean_token_accuracy": 0.9948759615421295, "num_tokens": 104474783.0, "step": 33515 }, { "entropy": 0.043385568913072346, "epoch": 7.813614640400979, "grad_norm": 1.296875, "learning_rate": 4.776356940733767e-05, "loss": 0.008, "mean_token_accuracy": 0.9977335274219513, "num_tokens": 104503133.0, "step": 33520 }, { "entropy": 0.06043797004967928, "epoch": 7.814780277421611, "grad_norm": 0.357421875, "learning_rate": 4.776269804906309e-05, "loss": 0.0068, "mean_token_accuracy": 0.997977751493454, "num_tokens": 104522082.0, "step": 33525 }, { "entropy": 0.06230600643903017, "epoch": 7.815945914442243, "grad_norm": 2.8125, "learning_rate": 4.776182653775625e-05, "loss": 0.0187, "mean_token_accuracy": 0.9945006728172302, "num_tokens": 104542746.0, "step": 33530 }, { "entropy": 0.06799766402691602, "epoch": 7.817111551462874, "grad_norm": 0.58203125, "learning_rate": 4.7760954873430146e-05, "loss": 0.0204, "mean_token_accuracy": 0.9932066082954407, "num_tokens": 104559473.0, "step": 33535 }, { "entropy": 0.059759671241045, "epoch": 7.818277188483506, "grad_norm": 0.30859375, "learning_rate": 4.776008305609776e-05, "loss": 0.0129, "mean_token_accuracy": 0.9955129384994507, "num_tokens": 104583010.0, "step": 33540 }, { "entropy": 0.06374518619850278, "epoch": 7.819442825504138, "grad_norm": 0.25, "learning_rate": 4.775921108577211e-05, "loss": 0.0175, "mean_token_accuracy": 0.9949056446552277, "num_tokens": 104598695.0, "step": 33545 }, { "entropy": 0.04200270352885127, "epoch": 7.82060846252477, "grad_norm": 0.349609375, "learning_rate": 4.77583389624662e-05, "loss": 0.0125, "mean_token_accuracy": 0.9944587707519531, "num_tokens": 104624382.0, "step": 33550 }, { "entropy": 0.0661899745464325, "epoch": 7.821774099545402, "grad_norm": 0.82421875, "learning_rate": 4.775746668619302e-05, "loss": 0.0199, "mean_token_accuracy": 0.9944428563117981, "num_tokens": 104644759.0, "step": 33555 }, { "entropy": 0.09902186430990696, "epoch": 7.822939736566033, "grad_norm": 1.453125, "learning_rate": 4.7756594256965584e-05, "loss": 0.0204, "mean_token_accuracy": 0.9947905838489532, "num_tokens": 104656623.0, "step": 33560 }, { "entropy": 0.10063904188573361, "epoch": 7.824105373586665, "grad_norm": 1.140625, "learning_rate": 4.775572167479689e-05, "loss": 0.0133, "mean_token_accuracy": 0.9956040382385254, "num_tokens": 104667933.0, "step": 33565 }, { "entropy": 0.045405428390949965, "epoch": 7.825271010607297, "grad_norm": 0.83984375, "learning_rate": 4.7754848939699975e-05, "loss": 0.0064, "mean_token_accuracy": 0.9983605444431305, "num_tokens": 104690189.0, "step": 33570 }, { "entropy": 0.06263646613806487, "epoch": 7.826436647627928, "grad_norm": 1.4140625, "learning_rate": 4.775397605168783e-05, "loss": 0.0126, "mean_token_accuracy": 0.99606973528862, "num_tokens": 104713442.0, "step": 33575 }, { "entropy": 0.060155317559838294, "epoch": 7.8276022846485604, "grad_norm": 3.09375, "learning_rate": 4.775310301077348e-05, "loss": 0.0159, "mean_token_accuracy": 0.994491446018219, "num_tokens": 104734783.0, "step": 33580 }, { "entropy": 0.047922109439969066, "epoch": 7.8287679216691926, "grad_norm": 0.125, "learning_rate": 4.775222981696995e-05, "loss": 0.0103, "mean_token_accuracy": 0.9948363065719604, "num_tokens": 104763217.0, "step": 33585 }, { "entropy": 0.0798790443688631, "epoch": 7.829933558689824, "grad_norm": 0.98828125, "learning_rate": 4.775135647029025e-05, "loss": 0.0131, "mean_token_accuracy": 0.9965567708015441, "num_tokens": 104775163.0, "step": 33590 }, { "entropy": 0.062330286018550396, "epoch": 7.831099195710456, "grad_norm": 0.32421875, "learning_rate": 4.77504829707474e-05, "loss": 0.0103, "mean_token_accuracy": 0.9961469352245331, "num_tokens": 104803548.0, "step": 33595 }, { "entropy": 0.06328217554837465, "epoch": 7.832264832731088, "grad_norm": 2.78125, "learning_rate": 4.774960931835444e-05, "loss": 0.017, "mean_token_accuracy": 0.9947004914283752, "num_tokens": 104830200.0, "step": 33600 }, { "entropy": 0.06574259772896766, "epoch": 7.833430469751719, "grad_norm": 2.609375, "learning_rate": 4.7748735513124375e-05, "loss": 0.0106, "mean_token_accuracy": 0.9949138760566711, "num_tokens": 104850235.0, "step": 33605 }, { "entropy": 0.04857909232378006, "epoch": 7.834596106772351, "grad_norm": 0.63671875, "learning_rate": 4.774786155507026e-05, "loss": 0.007, "mean_token_accuracy": 0.9977436184883117, "num_tokens": 104875446.0, "step": 33610 }, { "entropy": 0.060628737695515154, "epoch": 7.8357617437929825, "grad_norm": 0.57421875, "learning_rate": 4.774698744420512e-05, "loss": 0.0156, "mean_token_accuracy": 0.9954569041728973, "num_tokens": 104894570.0, "step": 33615 }, { "entropy": 0.07210960108786821, "epoch": 7.836927380813615, "grad_norm": 0.359375, "learning_rate": 4.774611318054197e-05, "loss": 0.0136, "mean_token_accuracy": 0.9972477376461029, "num_tokens": 104905388.0, "step": 33620 }, { "entropy": 0.06413645837455988, "epoch": 7.838093017834247, "grad_norm": 2.078125, "learning_rate": 4.774523876409387e-05, "loss": 0.0268, "mean_token_accuracy": 0.9931788384914398, "num_tokens": 104924156.0, "step": 33625 }, { "entropy": 0.07827889760956168, "epoch": 7.839258654854878, "grad_norm": 0.51953125, "learning_rate": 4.774436419487385e-05, "loss": 0.035, "mean_token_accuracy": 0.9918962776660919, "num_tokens": 104947611.0, "step": 33630 }, { "entropy": 0.050841915979981424, "epoch": 7.84042429187551, "grad_norm": 1.34375, "learning_rate": 4.7743489472894955e-05, "loss": 0.0122, "mean_token_accuracy": 0.9957148313522339, "num_tokens": 104971187.0, "step": 33635 }, { "entropy": 0.07467871066182852, "epoch": 7.841589928896141, "grad_norm": 1.9765625, "learning_rate": 4.774261459817022e-05, "loss": 0.0136, "mean_token_accuracy": 0.9967439770698547, "num_tokens": 104982093.0, "step": 33640 }, { "entropy": 0.06240941435098648, "epoch": 7.842755565916773, "grad_norm": 3.59375, "learning_rate": 4.77417395707127e-05, "loss": 0.0165, "mean_token_accuracy": 0.995259428024292, "num_tokens": 105001148.0, "step": 33645 }, { "entropy": 0.06917629651725292, "epoch": 7.8439212029374055, "grad_norm": 3.234375, "learning_rate": 4.7740864390535434e-05, "loss": 0.0178, "mean_token_accuracy": 0.994339919090271, "num_tokens": 105015498.0, "step": 33650 }, { "entropy": 0.051767791528254746, "epoch": 7.845086839958037, "grad_norm": 0.6796875, "learning_rate": 4.773998905765147e-05, "loss": 0.0071, "mean_token_accuracy": 0.9966930508613586, "num_tokens": 105045073.0, "step": 33655 }, { "entropy": 0.06975855696946383, "epoch": 7.846252476978669, "grad_norm": 1.8203125, "learning_rate": 4.7739113572073876e-05, "loss": 0.0122, "mean_token_accuracy": 0.9956272125244141, "num_tokens": 105072520.0, "step": 33660 }, { "entropy": 0.07446997575461864, "epoch": 7.847418113999301, "grad_norm": 1.5546875, "learning_rate": 4.77382379338157e-05, "loss": 0.0257, "mean_token_accuracy": 0.9942268848419189, "num_tokens": 105083649.0, "step": 33665 }, { "entropy": 0.06689784284681081, "epoch": 7.848583751019932, "grad_norm": 0.39453125, "learning_rate": 4.773736214289e-05, "loss": 0.0076, "mean_token_accuracy": 0.9982570469379425, "num_tokens": 105107999.0, "step": 33670 }, { "entropy": 0.07380356825888157, "epoch": 7.849749388040564, "grad_norm": 0.470703125, "learning_rate": 4.773648619930983e-05, "loss": 0.0083, "mean_token_accuracy": 0.996808648109436, "num_tokens": 105128184.0, "step": 33675 }, { "entropy": 0.06525698453187942, "epoch": 7.850915025061196, "grad_norm": 4.71875, "learning_rate": 4.7735610103088245e-05, "loss": 0.0144, "mean_token_accuracy": 0.9928626179695129, "num_tokens": 105155039.0, "step": 33680 }, { "entropy": 0.06640646066516638, "epoch": 7.8520806620818275, "grad_norm": 1.6015625, "learning_rate": 4.7734733854238324e-05, "loss": 0.0091, "mean_token_accuracy": 0.9936692476272583, "num_tokens": 105181828.0, "step": 33685 }, { "entropy": 0.08131216876208783, "epoch": 7.85324629910246, "grad_norm": 0.91015625, "learning_rate": 4.773385745277313e-05, "loss": 0.026, "mean_token_accuracy": 0.9937609195709228, "num_tokens": 105194395.0, "step": 33690 }, { "entropy": 0.05827680192887783, "epoch": 7.854411936123091, "grad_norm": 1.2734375, "learning_rate": 4.773298089870573e-05, "loss": 0.0159, "mean_token_accuracy": 0.9963181138038635, "num_tokens": 105207309.0, "step": 33695 }, { "entropy": 0.07168765515089034, "epoch": 7.855577573143723, "grad_norm": 1.703125, "learning_rate": 4.773210419204919e-05, "loss": 0.0124, "mean_token_accuracy": 0.9961957156658172, "num_tokens": 105220229.0, "step": 33700 }, { "entropy": 0.062171673867851496, "epoch": 7.856743210164355, "grad_norm": 0.2021484375, "learning_rate": 4.773122733281659e-05, "loss": 0.0083, "mean_token_accuracy": 0.9970798075199128, "num_tokens": 105238170.0, "step": 33705 }, { "entropy": 0.04381891647353768, "epoch": 7.857908847184986, "grad_norm": 0.462890625, "learning_rate": 4.773035032102099e-05, "loss": 0.0055, "mean_token_accuracy": 0.997516006231308, "num_tokens": 105285631.0, "step": 33710 }, { "entropy": 0.04537671413272619, "epoch": 7.859074484205618, "grad_norm": 0.33203125, "learning_rate": 4.772947315667549e-05, "loss": 0.0139, "mean_token_accuracy": 0.9939434051513671, "num_tokens": 105310695.0, "step": 33715 }, { "entropy": 0.05652222605422139, "epoch": 7.8602401212262505, "grad_norm": 1.3828125, "learning_rate": 4.7728595839793155e-05, "loss": 0.0088, "mean_token_accuracy": 0.9973005592823029, "num_tokens": 105328469.0, "step": 33720 }, { "entropy": 0.0721471224911511, "epoch": 7.861405758246882, "grad_norm": 1.078125, "learning_rate": 4.7727718370387074e-05, "loss": 0.0242, "mean_token_accuracy": 0.9938137114048005, "num_tokens": 105348789.0, "step": 33725 }, { "entropy": 0.054934839438647035, "epoch": 7.862571395267514, "grad_norm": 0.275390625, "learning_rate": 4.772684074847033e-05, "loss": 0.0084, "mean_token_accuracy": 0.9964746117591858, "num_tokens": 105372261.0, "step": 33730 }, { "entropy": 0.060262033343315126, "epoch": 7.863737032288146, "grad_norm": 0.2255859375, "learning_rate": 4.7725962974056007e-05, "loss": 0.0074, "mean_token_accuracy": 0.99892857670784, "num_tokens": 105401147.0, "step": 33735 }, { "entropy": 0.07648681541904807, "epoch": 7.864902669308777, "grad_norm": 2.6875, "learning_rate": 4.772508504715719e-05, "loss": 0.0147, "mean_token_accuracy": 0.9954568028450013, "num_tokens": 105424983.0, "step": 33740 }, { "entropy": 0.06310393549501896, "epoch": 7.866068306329409, "grad_norm": 0.80859375, "learning_rate": 4.772420696778699e-05, "loss": 0.0171, "mean_token_accuracy": 0.9937354981899261, "num_tokens": 105435754.0, "step": 33745 }, { "entropy": 0.08068116065114736, "epoch": 7.8672339433500404, "grad_norm": 1.125, "learning_rate": 4.7723328735958475e-05, "loss": 0.022, "mean_token_accuracy": 0.9926752507686615, "num_tokens": 105456206.0, "step": 33750 }, { "entropy": 0.10241719000041485, "epoch": 7.8683995803706726, "grad_norm": 1.609375, "learning_rate": 4.772245035168475e-05, "loss": 0.0188, "mean_token_accuracy": 0.9940951466560364, "num_tokens": 105475697.0, "step": 33755 }, { "entropy": 0.05356983579695225, "epoch": 7.869565217391305, "grad_norm": 0.359375, "learning_rate": 4.772157181497892e-05, "loss": 0.0072, "mean_token_accuracy": 0.9973737239837647, "num_tokens": 105507022.0, "step": 33760 }, { "entropy": 0.08068993408232927, "epoch": 7.870730854411936, "grad_norm": 0.2294921875, "learning_rate": 4.772069312585408e-05, "loss": 0.0151, "mean_token_accuracy": 0.9943787693977356, "num_tokens": 105530124.0, "step": 33765 }, { "entropy": 0.0713021919131279, "epoch": 7.871896491432568, "grad_norm": 0.61328125, "learning_rate": 4.771981428432333e-05, "loss": 0.0104, "mean_token_accuracy": 0.9964679121971131, "num_tokens": 105542720.0, "step": 33770 }, { "entropy": 0.0819464897736907, "epoch": 7.873062128453199, "grad_norm": 4.3125, "learning_rate": 4.771893529039978e-05, "loss": 0.0303, "mean_token_accuracy": 0.9911946177482605, "num_tokens": 105563421.0, "step": 33775 }, { "entropy": 0.07584189437329769, "epoch": 7.874227765473831, "grad_norm": 1.484375, "learning_rate": 4.7718056144096526e-05, "loss": 0.0135, "mean_token_accuracy": 0.9951583981513977, "num_tokens": 105578641.0, "step": 33780 }, { "entropy": 0.058252211194485426, "epoch": 7.875393402494463, "grad_norm": 1.2421875, "learning_rate": 4.77171768454267e-05, "loss": 0.017, "mean_token_accuracy": 0.9964474558830261, "num_tokens": 105600238.0, "step": 33785 }, { "entropy": 0.08072509821504355, "epoch": 7.876559039515095, "grad_norm": 0.92578125, "learning_rate": 4.771629739440339e-05, "loss": 0.0243, "mean_token_accuracy": 0.995055878162384, "num_tokens": 105625046.0, "step": 33790 }, { "entropy": 0.0600465914234519, "epoch": 7.877724676535727, "grad_norm": 0.333984375, "learning_rate": 4.7715417791039726e-05, "loss": 0.0086, "mean_token_accuracy": 0.9962651550769805, "num_tokens": 105643980.0, "step": 33795 }, { "entropy": 0.08397406414151191, "epoch": 7.878890313556359, "grad_norm": 0.87890625, "learning_rate": 4.771453803534881e-05, "loss": 0.0222, "mean_token_accuracy": 0.9933609664440155, "num_tokens": 105662117.0, "step": 33800 }, { "entropy": 0.04541693087667227, "epoch": 7.88005595057699, "grad_norm": 0.314453125, "learning_rate": 4.7713658127343776e-05, "loss": 0.0065, "mean_token_accuracy": 0.9967815399169921, "num_tokens": 105695778.0, "step": 33805 }, { "entropy": 0.07490654885768891, "epoch": 7.881221587597622, "grad_norm": 2.296875, "learning_rate": 4.771277806703773e-05, "loss": 0.0176, "mean_token_accuracy": 0.9951118290424347, "num_tokens": 105707859.0, "step": 33810 }, { "entropy": 0.07078452426940203, "epoch": 7.882387224618254, "grad_norm": 0.95703125, "learning_rate": 4.771189785444381e-05, "loss": 0.0158, "mean_token_accuracy": 0.9953491032123566, "num_tokens": 105722345.0, "step": 33815 }, { "entropy": 0.06641866527497768, "epoch": 7.8835528616388855, "grad_norm": 0.296875, "learning_rate": 4.7711017489575134e-05, "loss": 0.0136, "mean_token_accuracy": 0.995814448595047, "num_tokens": 105741946.0, "step": 33820 }, { "entropy": 0.09044511755928397, "epoch": 7.884718498659518, "grad_norm": 1.421875, "learning_rate": 4.7710136972444816e-05, "loss": 0.0151, "mean_token_accuracy": 0.9930476427078248, "num_tokens": 105764921.0, "step": 33825 }, { "entropy": 0.05839778780937195, "epoch": 7.885884135680149, "grad_norm": 1.3515625, "learning_rate": 4.770925630306601e-05, "loss": 0.0093, "mean_token_accuracy": 0.9965133607387543, "num_tokens": 105797265.0, "step": 33830 }, { "entropy": 0.06722655799239874, "epoch": 7.887049772700781, "grad_norm": 2.15625, "learning_rate": 4.770837548145184e-05, "loss": 0.0203, "mean_token_accuracy": 0.9924581229686738, "num_tokens": 105809418.0, "step": 33835 }, { "entropy": 0.07799172587692738, "epoch": 7.888215409721413, "grad_norm": 4.8125, "learning_rate": 4.770749450761543e-05, "loss": 0.0218, "mean_token_accuracy": 0.9934140980243683, "num_tokens": 105827257.0, "step": 33840 }, { "entropy": 0.058118250127881764, "epoch": 7.889381046742044, "grad_norm": 3.625, "learning_rate": 4.770661338156993e-05, "loss": 0.0112, "mean_token_accuracy": 0.9950781047344208, "num_tokens": 105852545.0, "step": 33845 }, { "entropy": 0.06989272115752101, "epoch": 7.890546683762676, "grad_norm": 0.5703125, "learning_rate": 4.7705732103328466e-05, "loss": 0.0179, "mean_token_accuracy": 0.9932778656482697, "num_tokens": 105875208.0, "step": 33850 }, { "entropy": 0.05829099677503109, "epoch": 7.891712320783308, "grad_norm": 1.640625, "learning_rate": 4.770485067290419e-05, "loss": 0.0148, "mean_token_accuracy": 0.9961495161056518, "num_tokens": 105891957.0, "step": 33855 }, { "entropy": 0.06487886887043715, "epoch": 7.89287795780394, "grad_norm": 2.375, "learning_rate": 4.7703969090310244e-05, "loss": 0.0202, "mean_token_accuracy": 0.9944393396377563, "num_tokens": 105905309.0, "step": 33860 }, { "entropy": 0.07422820059582591, "epoch": 7.894043594824572, "grad_norm": 0.48046875, "learning_rate": 4.7703087355559764e-05, "loss": 0.0076, "mean_token_accuracy": 0.996603262424469, "num_tokens": 105929609.0, "step": 33865 }, { "entropy": 0.0708279337733984, "epoch": 7.895209231845204, "grad_norm": 1.3125, "learning_rate": 4.7702205468665904e-05, "loss": 0.008, "mean_token_accuracy": 0.99549121260643, "num_tokens": 105951474.0, "step": 33870 }, { "entropy": 0.04628618396818638, "epoch": 7.896374868865835, "grad_norm": 0.416015625, "learning_rate": 4.770132342964182e-05, "loss": 0.0072, "mean_token_accuracy": 0.9972758531570435, "num_tokens": 105982339.0, "step": 33875 }, { "entropy": 0.08831856437027455, "epoch": 7.897540505886467, "grad_norm": 0.73828125, "learning_rate": 4.7700441238500667e-05, "loss": 0.0176, "mean_token_accuracy": 0.996027272939682, "num_tokens": 105996369.0, "step": 33880 }, { "entropy": 0.07061892207711935, "epoch": 7.898706142907098, "grad_norm": 5.21875, "learning_rate": 4.769955889525558e-05, "loss": 0.0129, "mean_token_accuracy": 0.9928837418556213, "num_tokens": 106023378.0, "step": 33885 }, { "entropy": 0.05240669772028923, "epoch": 7.8998717799277305, "grad_norm": 0.3515625, "learning_rate": 4.769867639991974e-05, "loss": 0.0075, "mean_token_accuracy": 0.9973884999752045, "num_tokens": 106049844.0, "step": 33890 }, { "entropy": 0.05810522306710482, "epoch": 7.901037416948363, "grad_norm": 0.35546875, "learning_rate": 4.769779375250629e-05, "loss": 0.006, "mean_token_accuracy": 0.9974211573600769, "num_tokens": 106083133.0, "step": 33895 }, { "entropy": 0.057801909372210504, "epoch": 7.902203053968994, "grad_norm": 1.234375, "learning_rate": 4.7696910953028395e-05, "loss": 0.0072, "mean_token_accuracy": 0.9975810825824738, "num_tokens": 106108191.0, "step": 33900 }, { "entropy": 0.06211001239717007, "epoch": 7.903368690989626, "grad_norm": 3.4375, "learning_rate": 4.769602800149922e-05, "loss": 0.0133, "mean_token_accuracy": 0.9950418591499328, "num_tokens": 106123112.0, "step": 33905 }, { "entropy": 0.05631450889632106, "epoch": 7.904534328010257, "grad_norm": 0.35546875, "learning_rate": 4.769514489793194e-05, "loss": 0.0089, "mean_token_accuracy": 0.9966521799564362, "num_tokens": 106142631.0, "step": 33910 }, { "entropy": 0.06154237762093544, "epoch": 7.905699965030889, "grad_norm": 0.96484375, "learning_rate": 4.7694261642339706e-05, "loss": 0.0126, "mean_token_accuracy": 0.9955787539482117, "num_tokens": 106170787.0, "step": 33915 }, { "entropy": 0.05892001828178763, "epoch": 7.906865602051521, "grad_norm": 0.25390625, "learning_rate": 4.76933782347357e-05, "loss": 0.0173, "mean_token_accuracy": 0.994269210100174, "num_tokens": 106194157.0, "step": 33920 }, { "entropy": 0.058598081674426795, "epoch": 7.9080312390721526, "grad_norm": 1.109375, "learning_rate": 4.7692494675133094e-05, "loss": 0.0094, "mean_token_accuracy": 0.9951113820075989, "num_tokens": 106219023.0, "step": 33925 }, { "entropy": 0.07916519045829773, "epoch": 7.909196876092785, "grad_norm": 1.2421875, "learning_rate": 4.769161096354506e-05, "loss": 0.0256, "mean_token_accuracy": 0.9923042953014374, "num_tokens": 106239168.0, "step": 33930 }, { "entropy": 0.07624795585870743, "epoch": 7.910362513113417, "grad_norm": 1.796875, "learning_rate": 4.769072709998478e-05, "loss": 0.0191, "mean_token_accuracy": 0.9953700006008148, "num_tokens": 106263742.0, "step": 33935 }, { "entropy": 0.07141708973795176, "epoch": 7.911528150134048, "grad_norm": 1.0859375, "learning_rate": 4.768984308446544e-05, "loss": 0.0086, "mean_token_accuracy": 0.9971637487411499, "num_tokens": 106282014.0, "step": 33940 }, { "entropy": 0.07009767638519407, "epoch": 7.91269378715468, "grad_norm": 1.1796875, "learning_rate": 4.7688958917000195e-05, "loss": 0.0142, "mean_token_accuracy": 0.9976205289363861, "num_tokens": 106314706.0, "step": 33945 }, { "entropy": 0.05547423539683223, "epoch": 7.913859424175312, "grad_norm": 0.84765625, "learning_rate": 4.768807459760226e-05, "loss": 0.0142, "mean_token_accuracy": 0.9966548085212708, "num_tokens": 106356405.0, "step": 33950 }, { "entropy": 0.14163271840661765, "epoch": 7.915025061195943, "grad_norm": 2.71875, "learning_rate": 4.768719012628481e-05, "loss": 0.1602, "mean_token_accuracy": 0.9762783288955689, "num_tokens": 106383544.0, "step": 33955 }, { "entropy": 0.07611956689506769, "epoch": 7.9161906982165755, "grad_norm": 3.171875, "learning_rate": 4.7686305503061024e-05, "loss": 0.0154, "mean_token_accuracy": 0.9944826066493988, "num_tokens": 106400123.0, "step": 33960 }, { "entropy": 0.05730560040101409, "epoch": 7.917356335237207, "grad_norm": 0.353515625, "learning_rate": 4.768542072794411e-05, "loss": 0.0079, "mean_token_accuracy": 0.9983284890651702, "num_tokens": 106416830.0, "step": 33965 }, { "entropy": 0.04461151892319322, "epoch": 7.918521972257839, "grad_norm": 0.609375, "learning_rate": 4.768453580094724e-05, "loss": 0.0091, "mean_token_accuracy": 0.9960005044937134, "num_tokens": 106445076.0, "step": 33970 }, { "entropy": 0.07526362799108029, "epoch": 7.919687609278471, "grad_norm": 1.2734375, "learning_rate": 4.7683650722083636e-05, "loss": 0.0117, "mean_token_accuracy": 0.9961690485477448, "num_tokens": 106462778.0, "step": 33975 }, { "entropy": 0.06253966316580772, "epoch": 7.920853246299102, "grad_norm": 0.447265625, "learning_rate": 4.7682765491366484e-05, "loss": 0.0131, "mean_token_accuracy": 0.9950137913227082, "num_tokens": 106496519.0, "step": 33980 }, { "entropy": 0.05731949470937252, "epoch": 7.922018883319734, "grad_norm": 0.267578125, "learning_rate": 4.768188010880897e-05, "loss": 0.018, "mean_token_accuracy": 0.9952464640140534, "num_tokens": 106523851.0, "step": 33985 }, { "entropy": 0.0510101712308824, "epoch": 7.923184520340366, "grad_norm": 0.341796875, "learning_rate": 4.768099457442432e-05, "loss": 0.0107, "mean_token_accuracy": 0.996779203414917, "num_tokens": 106546787.0, "step": 33990 }, { "entropy": 0.0640136405825615, "epoch": 7.924350157360998, "grad_norm": 0.462890625, "learning_rate": 4.7680108888225724e-05, "loss": 0.0175, "mean_token_accuracy": 0.9934268534183502, "num_tokens": 106558030.0, "step": 33995 }, { "entropy": 0.06371219847351313, "epoch": 7.92551579438163, "grad_norm": 1.890625, "learning_rate": 4.7679223050226396e-05, "loss": 0.0104, "mean_token_accuracy": 0.9967857897281647, "num_tokens": 106575073.0, "step": 34000 }, { "entropy": 0.06602727882564068, "epoch": 7.926681431402262, "grad_norm": 1.2734375, "learning_rate": 4.7678337060439536e-05, "loss": 0.0159, "mean_token_accuracy": 0.9948717474937439, "num_tokens": 106587825.0, "step": 34005 }, { "entropy": 0.058292522095143795, "epoch": 7.927847068422893, "grad_norm": 1.4140625, "learning_rate": 4.767745091887837e-05, "loss": 0.0204, "mean_token_accuracy": 0.9944105803966522, "num_tokens": 106617867.0, "step": 34010 }, { "entropy": 0.07741288328543305, "epoch": 7.929012705443525, "grad_norm": 1.921875, "learning_rate": 4.76765646255561e-05, "loss": 0.019, "mean_token_accuracy": 0.9938458442687989, "num_tokens": 106635957.0, "step": 34015 }, { "entropy": 0.06108499057590962, "epoch": 7.930178342464156, "grad_norm": 1.8984375, "learning_rate": 4.767567818048594e-05, "loss": 0.015, "mean_token_accuracy": 0.9948182106018066, "num_tokens": 106653606.0, "step": 34020 }, { "entropy": 0.07961751464754344, "epoch": 7.931343979484788, "grad_norm": 0.416015625, "learning_rate": 4.7674791583681115e-05, "loss": 0.0084, "mean_token_accuracy": 0.9968226969242096, "num_tokens": 106676928.0, "step": 34025 }, { "entropy": 0.08433617986738681, "epoch": 7.9325096165054205, "grad_norm": 0.91015625, "learning_rate": 4.767390483515485e-05, "loss": 0.0266, "mean_token_accuracy": 0.9931154727935791, "num_tokens": 106687073.0, "step": 34030 }, { "entropy": 0.05512349987402558, "epoch": 7.933675253526052, "grad_norm": 0.5234375, "learning_rate": 4.767301793492035e-05, "loss": 0.0156, "mean_token_accuracy": 0.9950350284576416, "num_tokens": 106707843.0, "step": 34035 }, { "entropy": 0.053560327272862196, "epoch": 7.934840890546684, "grad_norm": 1.859375, "learning_rate": 4.767213088299086e-05, "loss": 0.016, "mean_token_accuracy": 0.9937402963638305, "num_tokens": 106739062.0, "step": 34040 }, { "entropy": 0.06507365573197603, "epoch": 7.936006527567315, "grad_norm": 1.3125, "learning_rate": 4.76712436793796e-05, "loss": 0.0093, "mean_token_accuracy": 0.9949987173080445, "num_tokens": 106753445.0, "step": 34045 }, { "entropy": 0.05009383130818605, "epoch": 7.937172164587947, "grad_norm": 1.0390625, "learning_rate": 4.767035632409979e-05, "loss": 0.0083, "mean_token_accuracy": 0.9968866109848022, "num_tokens": 106771659.0, "step": 34050 }, { "entropy": 0.06271151565015316, "epoch": 7.938337801608579, "grad_norm": 0.875, "learning_rate": 4.7669468817164676e-05, "loss": 0.0102, "mean_token_accuracy": 0.9934817552566528, "num_tokens": 106784329.0, "step": 34055 }, { "entropy": 0.06390997301787138, "epoch": 7.9395034386292105, "grad_norm": 0.92578125, "learning_rate": 4.7668581158587486e-05, "loss": 0.0174, "mean_token_accuracy": 0.9963392674922943, "num_tokens": 106807272.0, "step": 34060 }, { "entropy": 0.05805385177955032, "epoch": 7.940669075649843, "grad_norm": 2.375, "learning_rate": 4.766769334838146e-05, "loss": 0.0144, "mean_token_accuracy": 0.9943688631057739, "num_tokens": 106835599.0, "step": 34065 }, { "entropy": 0.06601453013718128, "epoch": 7.941834712670475, "grad_norm": 1.0859375, "learning_rate": 4.7666805386559826e-05, "loss": 0.0137, "mean_token_accuracy": 0.9940655350685119, "num_tokens": 106856149.0, "step": 34070 }, { "entropy": 0.062201010249555114, "epoch": 7.943000349691106, "grad_norm": 0.3125, "learning_rate": 4.766591727313584e-05, "loss": 0.0073, "mean_token_accuracy": 0.9955526292324066, "num_tokens": 106890142.0, "step": 34075 }, { "entropy": 0.06021309243515134, "epoch": 7.944165986711738, "grad_norm": 0.2431640625, "learning_rate": 4.7665029008122725e-05, "loss": 0.0243, "mean_token_accuracy": 0.994538813829422, "num_tokens": 106916598.0, "step": 34080 }, { "entropy": 0.06776752769947052, "epoch": 7.94533162373237, "grad_norm": 0.94140625, "learning_rate": 4.7664140591533745e-05, "loss": 0.0052, "mean_token_accuracy": 0.9975357413291931, "num_tokens": 106937426.0, "step": 34085 }, { "entropy": 0.06775364875793458, "epoch": 7.946497260753001, "grad_norm": 2.15625, "learning_rate": 4.7663252023382145e-05, "loss": 0.0163, "mean_token_accuracy": 0.9948492228984833, "num_tokens": 106948882.0, "step": 34090 }, { "entropy": 0.07773761413991451, "epoch": 7.947662897773633, "grad_norm": 1.8671875, "learning_rate": 4.766236330368117e-05, "loss": 0.0231, "mean_token_accuracy": 0.991237860918045, "num_tokens": 106963377.0, "step": 34095 }, { "entropy": 0.06178758256137371, "epoch": 7.948828534794265, "grad_norm": 0.87109375, "learning_rate": 4.7661474432444065e-05, "loss": 0.015, "mean_token_accuracy": 0.9954757153987884, "num_tokens": 106979240.0, "step": 34100 }, { "entropy": 0.13852319195866586, "epoch": 7.949994171814897, "grad_norm": 3.5, "learning_rate": 4.76605854096841e-05, "loss": 0.1877, "mean_token_accuracy": 0.9742585599422455, "num_tokens": 107002385.0, "step": 34105 }, { "entropy": 0.08786424957215785, "epoch": 7.951159808835529, "grad_norm": 2.59375, "learning_rate": 4.765969623541452e-05, "loss": 0.0205, "mean_token_accuracy": 0.9931396842002869, "num_tokens": 107011802.0, "step": 34110 }, { "entropy": 0.06776084508746863, "epoch": 7.95232544585616, "grad_norm": 1.125, "learning_rate": 4.765880690964859e-05, "loss": 0.0206, "mean_token_accuracy": 0.9957813143730163, "num_tokens": 107028043.0, "step": 34115 }, { "entropy": 0.04915742976590991, "epoch": 7.953491082876792, "grad_norm": 0.3671875, "learning_rate": 4.765791743239957e-05, "loss": 0.0126, "mean_token_accuracy": 0.9925608456134796, "num_tokens": 107059800.0, "step": 34120 }, { "entropy": 0.056366760190576314, "epoch": 7.954656719897424, "grad_norm": 0.46484375, "learning_rate": 4.765702780368072e-05, "loss": 0.0092, "mean_token_accuracy": 0.9966596603393555, "num_tokens": 107083510.0, "step": 34125 }, { "entropy": 0.0461616107262671, "epoch": 7.9558223569180555, "grad_norm": 0.35546875, "learning_rate": 4.765613802350532e-05, "loss": 0.0076, "mean_token_accuracy": 0.9946865618228913, "num_tokens": 107123417.0, "step": 34130 }, { "entropy": 0.07643718775361777, "epoch": 7.956987993938688, "grad_norm": 2.640625, "learning_rate": 4.765524809188661e-05, "loss": 0.0243, "mean_token_accuracy": 0.9927464604377747, "num_tokens": 107142894.0, "step": 34135 }, { "entropy": 0.06084223799407482, "epoch": 7.95815363095932, "grad_norm": 1.0078125, "learning_rate": 4.765435800883788e-05, "loss": 0.0115, "mean_token_accuracy": 0.996390950679779, "num_tokens": 107163073.0, "step": 34140 }, { "entropy": 0.0730635855346918, "epoch": 7.959319267979951, "grad_norm": 4.9375, "learning_rate": 4.7653467774372405e-05, "loss": 0.0116, "mean_token_accuracy": 0.9954229176044465, "num_tokens": 107174723.0, "step": 34145 }, { "entropy": 0.05009271074086428, "epoch": 7.960484905000583, "grad_norm": 0.79296875, "learning_rate": 4.765257738850345e-05, "loss": 0.0155, "mean_token_accuracy": 0.9943463981151581, "num_tokens": 107200870.0, "step": 34150 }, { "entropy": 0.0630310207605362, "epoch": 7.961650542021214, "grad_norm": 1.2265625, "learning_rate": 4.76516868512443e-05, "loss": 0.0227, "mean_token_accuracy": 0.9922509610652923, "num_tokens": 107216538.0, "step": 34155 }, { "entropy": 0.05934207225218415, "epoch": 7.962816179041846, "grad_norm": 0.287109375, "learning_rate": 4.7650796162608225e-05, "loss": 0.0066, "mean_token_accuracy": 0.9973390638828278, "num_tokens": 107244740.0, "step": 34160 }, { "entropy": 0.06261946465820074, "epoch": 7.9639818160624785, "grad_norm": 0.71875, "learning_rate": 4.764990532260851e-05, "loss": 0.0111, "mean_token_accuracy": 0.9966804802417755, "num_tokens": 107268063.0, "step": 34165 }, { "entropy": 0.05479574520140886, "epoch": 7.96514745308311, "grad_norm": 3.34375, "learning_rate": 4.7649014331258454e-05, "loss": 0.0201, "mean_token_accuracy": 0.994753235578537, "num_tokens": 107284612.0, "step": 34170 }, { "entropy": 0.062203343212604525, "epoch": 7.966313090103742, "grad_norm": 0.400390625, "learning_rate": 4.764812318857131e-05, "loss": 0.0106, "mean_token_accuracy": 0.9971283197402954, "num_tokens": 107302774.0, "step": 34175 }, { "entropy": 0.10029302248731256, "epoch": 7.967478727124373, "grad_norm": 0.59765625, "learning_rate": 4.7647231894560405e-05, "loss": 0.0059, "mean_token_accuracy": 0.998160594701767, "num_tokens": 107341090.0, "step": 34180 }, { "entropy": 0.08249510303139687, "epoch": 7.968644364145005, "grad_norm": 4.1875, "learning_rate": 4.7646340449238995e-05, "loss": 0.024, "mean_token_accuracy": 0.9922098696231842, "num_tokens": 107350811.0, "step": 34185 }, { "entropy": 0.05340167321264744, "epoch": 7.969810001165637, "grad_norm": 0.42578125, "learning_rate": 4.76454488526204e-05, "loss": 0.0111, "mean_token_accuracy": 0.9972420930862427, "num_tokens": 107381142.0, "step": 34190 }, { "entropy": 0.056400989554822444, "epoch": 7.970975638186268, "grad_norm": 0.474609375, "learning_rate": 4.7644557104717894e-05, "loss": 0.0251, "mean_token_accuracy": 0.994273841381073, "num_tokens": 107403837.0, "step": 34195 }, { "entropy": 0.05342116076499224, "epoch": 7.9721412752069005, "grad_norm": 0.390625, "learning_rate": 4.7643665205544786e-05, "loss": 0.0067, "mean_token_accuracy": 0.9962201118469238, "num_tokens": 107430524.0, "step": 34200 }, { "entropy": 0.0673133933916688, "epoch": 7.973306912227533, "grad_norm": 0.53515625, "learning_rate": 4.764277315511437e-05, "loss": 0.0095, "mean_token_accuracy": 0.9935947299003601, "num_tokens": 107448914.0, "step": 34205 }, { "entropy": 0.06266302475705743, "epoch": 7.974472549248164, "grad_norm": 1.4921875, "learning_rate": 4.764188095343996e-05, "loss": 0.0123, "mean_token_accuracy": 0.9956326484680176, "num_tokens": 107467145.0, "step": 34210 }, { "entropy": 0.056773718632757667, "epoch": 7.975638186268796, "grad_norm": 0.79296875, "learning_rate": 4.7640988600534834e-05, "loss": 0.0135, "mean_token_accuracy": 0.9961419343948364, "num_tokens": 107484762.0, "step": 34215 }, { "entropy": 0.05827885894104838, "epoch": 7.976803823289428, "grad_norm": 0.75, "learning_rate": 4.7640096096412326e-05, "loss": 0.0176, "mean_token_accuracy": 0.9965371251106262, "num_tokens": 107507967.0, "step": 34220 }, { "entropy": 0.06455299574881793, "epoch": 7.977969460310059, "grad_norm": 1.6484375, "learning_rate": 4.763920344108573e-05, "loss": 0.0129, "mean_token_accuracy": 0.994284588098526, "num_tokens": 107524367.0, "step": 34225 }, { "entropy": 0.05216892771422863, "epoch": 7.979135097330691, "grad_norm": 1.875, "learning_rate": 4.763831063456837e-05, "loss": 0.0094, "mean_token_accuracy": 0.9944133341312409, "num_tokens": 107549902.0, "step": 34230 }, { "entropy": 0.046560865826904774, "epoch": 7.980300734351323, "grad_norm": 1.1484375, "learning_rate": 4.7637417676873534e-05, "loss": 0.0112, "mean_token_accuracy": 0.996799236536026, "num_tokens": 107573291.0, "step": 34235 }, { "entropy": 0.07189444862306119, "epoch": 7.981466371371955, "grad_norm": 0.671875, "learning_rate": 4.7636524568014564e-05, "loss": 0.0158, "mean_token_accuracy": 0.9959348559379577, "num_tokens": 107583447.0, "step": 34240 }, { "entropy": 0.06508129709400237, "epoch": 7.982632008392587, "grad_norm": 1.8984375, "learning_rate": 4.763563130800476e-05, "loss": 0.0138, "mean_token_accuracy": 0.9959682047367096, "num_tokens": 107604534.0, "step": 34245 }, { "entropy": 0.04465307705104351, "epoch": 7.983797645413218, "grad_norm": 0.35546875, "learning_rate": 4.763473789685746e-05, "loss": 0.0142, "mean_token_accuracy": 0.9941910564899444, "num_tokens": 107624770.0, "step": 34250 }, { "entropy": 0.08091672882437706, "epoch": 7.98496328243385, "grad_norm": 0.51171875, "learning_rate": 4.763384433458596e-05, "loss": 0.0211, "mean_token_accuracy": 0.9940769791603088, "num_tokens": 107634611.0, "step": 34255 }, { "entropy": 0.05613311324268579, "epoch": 7.986128919454482, "grad_norm": 1.5, "learning_rate": 4.763295062120361e-05, "loss": 0.0139, "mean_token_accuracy": 0.9961461901664734, "num_tokens": 107650710.0, "step": 34260 }, { "entropy": 0.049251684360206126, "epoch": 7.987294556475113, "grad_norm": 0.419921875, "learning_rate": 4.763205675672372e-05, "loss": 0.0047, "mean_token_accuracy": 0.9977784931659699, "num_tokens": 107677762.0, "step": 34265 }, { "entropy": 0.06999004650861025, "epoch": 7.9884601934957455, "grad_norm": 2.484375, "learning_rate": 4.763116274115963e-05, "loss": 0.021, "mean_token_accuracy": 0.9948745965957642, "num_tokens": 107690370.0, "step": 34270 }, { "entropy": 0.07125404626131057, "epoch": 7.989625830516378, "grad_norm": 0.2353515625, "learning_rate": 4.7630268574524656e-05, "loss": 0.0099, "mean_token_accuracy": 0.9984432697296143, "num_tokens": 107704023.0, "step": 34275 }, { "entropy": 0.08081364408135414, "epoch": 7.990791467537009, "grad_norm": 2.046875, "learning_rate": 4.762937425683215e-05, "loss": 0.008, "mean_token_accuracy": 0.9953217029571533, "num_tokens": 107720117.0, "step": 34280 }, { "entropy": 0.0741135323420167, "epoch": 7.991957104557641, "grad_norm": 1.6640625, "learning_rate": 4.762847978809543e-05, "loss": 0.0134, "mean_token_accuracy": 0.9946883201599122, "num_tokens": 107730835.0, "step": 34285 }, { "entropy": 0.07226054593920708, "epoch": 7.993122741578272, "grad_norm": 1.2265625, "learning_rate": 4.7627585168327846e-05, "loss": 0.0169, "mean_token_accuracy": 0.9945705354213714, "num_tokens": 107760637.0, "step": 34290 }, { "entropy": 0.0748120654374361, "epoch": 7.994288378598904, "grad_norm": 2.1875, "learning_rate": 4.762669039754273e-05, "loss": 0.0241, "mean_token_accuracy": 0.9938279986381531, "num_tokens": 107770993.0, "step": 34295 }, { "entropy": 0.0834178514778614, "epoch": 7.995454015619536, "grad_norm": 1.9375, "learning_rate": 4.7625795475753436e-05, "loss": 0.054, "mean_token_accuracy": 0.9899437367916107, "num_tokens": 107796597.0, "step": 34300 }, { "entropy": 0.0649189880117774, "epoch": 7.996619652640168, "grad_norm": 2.03125, "learning_rate": 4.76249004029733e-05, "loss": 0.0144, "mean_token_accuracy": 0.9953900396823883, "num_tokens": 107818585.0, "step": 34305 }, { "entropy": 0.06638593953102827, "epoch": 7.9977852896608, "grad_norm": 1.453125, "learning_rate": 4.762400517921567e-05, "loss": 0.0159, "mean_token_accuracy": 0.9956220924854279, "num_tokens": 107840086.0, "step": 34310 }, { "entropy": 0.0659774586558342, "epoch": 7.998950926681431, "grad_norm": 0.40625, "learning_rate": 4.7623109804493895e-05, "loss": 0.0168, "mean_token_accuracy": 0.9956194818019867, "num_tokens": 107861379.0, "step": 34315 }, { "entropy": 0.11034517497238186, "epoch": 8.0, "grad_norm": 0.28515625, "learning_rate": 4.762221427882132e-05, "loss": 0.0774, "mean_token_accuracy": 0.9823514488008287, "num_tokens": 107884500.0, "step": 34320 }, { "entropy": 0.0570672795176506, "epoch": 8.001165637020632, "grad_norm": 0.1298828125, "learning_rate": 4.7621318602211315e-05, "loss": 0.0023, "mean_token_accuracy": 0.9987614572048187, "num_tokens": 107910348.0, "step": 34325 }, { "entropy": 0.056932621914893386, "epoch": 8.002331274041264, "grad_norm": 0.4609375, "learning_rate": 4.762042277467723e-05, "loss": 0.0048, "mean_token_accuracy": 0.9991077303886413, "num_tokens": 107928390.0, "step": 34330 }, { "entropy": 0.06185147017240524, "epoch": 8.003496911061895, "grad_norm": 0.796875, "learning_rate": 4.761952679623241e-05, "loss": 0.0038, "mean_token_accuracy": 0.9989405035972595, "num_tokens": 107945191.0, "step": 34335 }, { "entropy": 0.05853238175623119, "epoch": 8.004662548082527, "grad_norm": 0.38671875, "learning_rate": 4.761863066689022e-05, "loss": 0.0071, "mean_token_accuracy": 0.9990803480148316, "num_tokens": 107965503.0, "step": 34340 }, { "entropy": 0.07207233114168048, "epoch": 8.005828185103159, "grad_norm": 0.4765625, "learning_rate": 4.761773438666404e-05, "loss": 0.0042, "mean_token_accuracy": 0.9984596371650696, "num_tokens": 107983227.0, "step": 34345 }, { "entropy": 0.07162175551056862, "epoch": 8.00699382212379, "grad_norm": 1.3125, "learning_rate": 4.761683795556722e-05, "loss": 0.0103, "mean_token_accuracy": 0.9979361593723297, "num_tokens": 108005127.0, "step": 34350 }, { "entropy": 0.0725046245381236, "epoch": 8.008159459144423, "grad_norm": 0.267578125, "learning_rate": 4.761594137361312e-05, "loss": 0.0015, "mean_token_accuracy": 0.9999236643314362, "num_tokens": 108021479.0, "step": 34355 }, { "entropy": 0.04563398342579603, "epoch": 8.009325096165055, "grad_norm": 0.236328125, "learning_rate": 4.7615044640815123e-05, "loss": 0.003, "mean_token_accuracy": 0.9993497133255005, "num_tokens": 108040627.0, "step": 34360 }, { "entropy": 0.047726039029657844, "epoch": 8.010490733185685, "grad_norm": 0.349609375, "learning_rate": 4.76141477571866e-05, "loss": 0.0012, "mean_token_accuracy": 0.9991644740104675, "num_tokens": 108073823.0, "step": 34365 }, { "entropy": 0.05970633877441287, "epoch": 8.011656370206317, "grad_norm": 0.8984375, "learning_rate": 4.761325072274092e-05, "loss": 0.0034, "mean_token_accuracy": 0.9993616282939911, "num_tokens": 108093022.0, "step": 34370 }, { "entropy": 0.049816728476434945, "epoch": 8.01282200722695, "grad_norm": 1.2578125, "learning_rate": 4.761235353749146e-05, "loss": 0.0056, "mean_token_accuracy": 0.9986774921417236, "num_tokens": 108119311.0, "step": 34375 }, { "entropy": 0.05257010804489255, "epoch": 8.013987644247582, "grad_norm": 0.26953125, "learning_rate": 4.7611456201451604e-05, "loss": 0.0077, "mean_token_accuracy": 0.9974354028701782, "num_tokens": 108141035.0, "step": 34380 }, { "entropy": 0.06397520806640386, "epoch": 8.015153281268214, "grad_norm": 0.72265625, "learning_rate": 4.7610558714634724e-05, "loss": 0.0022, "mean_token_accuracy": 1.0, "num_tokens": 108155271.0, "step": 34385 }, { "entropy": 0.05282603679224849, "epoch": 8.016318918288844, "grad_norm": 0.1328125, "learning_rate": 4.76096610770542e-05, "loss": 0.0014, "mean_token_accuracy": 0.9998047232627869, "num_tokens": 108186139.0, "step": 34390 }, { "entropy": 0.05000446168705821, "epoch": 8.017484555309476, "grad_norm": 0.09423828125, "learning_rate": 4.7608763288723434e-05, "loss": 0.0012, "mean_token_accuracy": 0.9998030006885529, "num_tokens": 108205726.0, "step": 34395 }, { "entropy": 0.057174905110150574, "epoch": 8.018650192330108, "grad_norm": 0.201171875, "learning_rate": 4.76078653496558e-05, "loss": 0.0021, "mean_token_accuracy": 0.9996845364570618, "num_tokens": 108226331.0, "step": 34400 }, { "entropy": 0.06045698598027229, "epoch": 8.01981582935074, "grad_norm": 1.71875, "learning_rate": 4.760696725986469e-05, "loss": 0.0043, "mean_token_accuracy": 0.9985334575176239, "num_tokens": 108245610.0, "step": 34405 }, { "entropy": 0.042647901270538566, "epoch": 8.020981466371373, "grad_norm": 0.0546875, "learning_rate": 4.760606901936349e-05, "loss": 0.0026, "mean_token_accuracy": 0.999316680431366, "num_tokens": 108277590.0, "step": 34410 }, { "entropy": 0.06458899248391389, "epoch": 8.022147103392003, "grad_norm": 1.9296875, "learning_rate": 4.7605170628165604e-05, "loss": 0.0033, "mean_token_accuracy": 0.9993538677692413, "num_tokens": 108304890.0, "step": 34415 }, { "entropy": 0.06744192838668824, "epoch": 8.023312740412635, "grad_norm": 0.1630859375, "learning_rate": 4.760427208628443e-05, "loss": 0.004, "mean_token_accuracy": 0.9989646315574646, "num_tokens": 108323253.0, "step": 34420 }, { "entropy": 0.05032515674829483, "epoch": 8.024478377433267, "grad_norm": 0.453125, "learning_rate": 4.760337339373336e-05, "loss": 0.0063, "mean_token_accuracy": 0.9982992768287658, "num_tokens": 108358355.0, "step": 34425 }, { "entropy": 0.062434613704681396, "epoch": 8.0256440144539, "grad_norm": 0.5234375, "learning_rate": 4.76024745505258e-05, "loss": 0.002, "mean_token_accuracy": 0.9996152639389038, "num_tokens": 108382542.0, "step": 34430 }, { "entropy": 0.05185540029779077, "epoch": 8.026809651474531, "grad_norm": 0.490234375, "learning_rate": 4.7601575556675135e-05, "loss": 0.0058, "mean_token_accuracy": 0.9983523845672607, "num_tokens": 108422836.0, "step": 34435 }, { "entropy": 0.07135622762143612, "epoch": 8.027975288495163, "grad_norm": 0.5, "learning_rate": 4.76006764121948e-05, "loss": 0.0052, "mean_token_accuracy": 0.9981033325195312, "num_tokens": 108436442.0, "step": 34440 }, { "entropy": 0.06478116279467941, "epoch": 8.029140925515794, "grad_norm": 0.064453125, "learning_rate": 4.759977711709818e-05, "loss": 0.0034, "mean_token_accuracy": 0.9992549657821655, "num_tokens": 108462678.0, "step": 34445 }, { "entropy": 0.049783976096659896, "epoch": 8.030306562536426, "grad_norm": 0.11962890625, "learning_rate": 4.759887767139869e-05, "loss": 0.0026, "mean_token_accuracy": 0.9983406960964203, "num_tokens": 108484473.0, "step": 34450 }, { "entropy": 0.07408700212836265, "epoch": 8.031472199557058, "grad_norm": 3.125, "learning_rate": 4.759797807510975e-05, "loss": 0.0135, "mean_token_accuracy": 0.9985843777656556, "num_tokens": 108492586.0, "step": 34455 }, { "entropy": 0.1893187090754509, "epoch": 8.03263783657769, "grad_norm": 0.1494140625, "learning_rate": 4.759707832824477e-05, "loss": 0.3292, "mean_token_accuracy": 0.9610701024532318, "num_tokens": 108528181.0, "step": 34460 }, { "entropy": 0.04200795106589794, "epoch": 8.033803473598322, "grad_norm": 0.2490234375, "learning_rate": 4.7596178430817156e-05, "loss": 0.0018, "mean_token_accuracy": 0.9991814613342285, "num_tokens": 108550517.0, "step": 34465 }, { "entropy": 0.06480789603665471, "epoch": 8.034969110618952, "grad_norm": 0.2001953125, "learning_rate": 4.759527838284035e-05, "loss": 0.0043, "mean_token_accuracy": 0.9986863672733307, "num_tokens": 108564419.0, "step": 34470 }, { "entropy": 0.07080358192324639, "epoch": 8.036134747639585, "grad_norm": 0.0517578125, "learning_rate": 4.7594378184327745e-05, "loss": 0.0047, "mean_token_accuracy": 0.9989554643630981, "num_tokens": 108576162.0, "step": 34475 }, { "entropy": 0.051558745186775924, "epoch": 8.037300384660217, "grad_norm": 0.234375, "learning_rate": 4.759347783529279e-05, "loss": 0.0013, "mean_token_accuracy": 0.999577134847641, "num_tokens": 108605987.0, "step": 34480 }, { "entropy": 0.05720545100048184, "epoch": 8.038466021680849, "grad_norm": 1.109375, "learning_rate": 4.759257733574889e-05, "loss": 0.0082, "mean_token_accuracy": 0.9988365888595581, "num_tokens": 108625363.0, "step": 34485 }, { "entropy": 0.0536104841157794, "epoch": 8.03963165870148, "grad_norm": 0.466796875, "learning_rate": 4.7591676685709486e-05, "loss": 0.0033, "mean_token_accuracy": 0.9984508991241455, "num_tokens": 108659582.0, "step": 34490 }, { "entropy": 0.06404474405571818, "epoch": 8.040797295722113, "grad_norm": 1.0546875, "learning_rate": 4.7590775885188e-05, "loss": 0.0071, "mean_token_accuracy": 0.9984810829162598, "num_tokens": 108676935.0, "step": 34495 }, { "entropy": 0.0648849243298173, "epoch": 8.041962932742743, "grad_norm": 0.279296875, "learning_rate": 4.758987493419787e-05, "loss": 0.0047, "mean_token_accuracy": 0.998368215560913, "num_tokens": 108699414.0, "step": 34500 }, { "entropy": 0.05447390619665384, "epoch": 8.043128569763375, "grad_norm": 0.26171875, "learning_rate": 4.758897383275253e-05, "loss": 0.0038, "mean_token_accuracy": 0.9990447640419007, "num_tokens": 108717800.0, "step": 34505 }, { "entropy": 0.04812881154939532, "epoch": 8.044294206784008, "grad_norm": 0.28515625, "learning_rate": 4.7588072580865416e-05, "loss": 0.0046, "mean_token_accuracy": 0.9990270733833313, "num_tokens": 108737812.0, "step": 34510 }, { "entropy": 0.030604275315999983, "epoch": 8.04545984380464, "grad_norm": 0.171875, "learning_rate": 4.758717117854997e-05, "loss": 0.0013, "mean_token_accuracy": 0.9998698234558105, "num_tokens": 108779109.0, "step": 34515 }, { "entropy": 0.06884174533188343, "epoch": 8.046625480825272, "grad_norm": 0.76171875, "learning_rate": 4.7586269625819624e-05, "loss": 0.0019, "mean_token_accuracy": 0.9994652390480041, "num_tokens": 108790310.0, "step": 34520 }, { "entropy": 0.06846872791647911, "epoch": 8.047791117845902, "grad_norm": 0.455078125, "learning_rate": 4.7585367922687826e-05, "loss": 0.0096, "mean_token_accuracy": 0.996958750486374, "num_tokens": 108808837.0, "step": 34525 }, { "entropy": 0.07946307212114334, "epoch": 8.048956754866534, "grad_norm": 0.423828125, "learning_rate": 4.758446606916803e-05, "loss": 0.0374, "mean_token_accuracy": 0.9925918638706207, "num_tokens": 108828868.0, "step": 34530 }, { "entropy": 0.0731971831060946, "epoch": 8.050122391887166, "grad_norm": 0.4609375, "learning_rate": 4.758356406527367e-05, "loss": 0.0041, "mean_token_accuracy": 0.9987062692642212, "num_tokens": 108845014.0, "step": 34535 }, { "entropy": 0.06874092482030392, "epoch": 8.051288028907798, "grad_norm": 0.9609375, "learning_rate": 4.758266191101821e-05, "loss": 0.0019, "mean_token_accuracy": 0.9991285920143127, "num_tokens": 108875202.0, "step": 34540 }, { "entropy": 0.07375312838703393, "epoch": 8.05245366592843, "grad_norm": 0.10546875, "learning_rate": 4.7581759606415085e-05, "loss": 0.0025, "mean_token_accuracy": 0.9993589997291565, "num_tokens": 108887896.0, "step": 34545 }, { "entropy": 0.06638930989429355, "epoch": 8.05361930294906, "grad_norm": 0.1943359375, "learning_rate": 4.7580857151477775e-05, "loss": 0.0069, "mean_token_accuracy": 0.9990043401718139, "num_tokens": 108906837.0, "step": 34550 }, { "entropy": 0.05511264381930232, "epoch": 8.054784939969693, "grad_norm": 0.1669921875, "learning_rate": 4.7579954546219715e-05, "loss": 0.0035, "mean_token_accuracy": 0.9984936416149139, "num_tokens": 108935559.0, "step": 34555 }, { "entropy": 0.08696108423173428, "epoch": 8.055950576990325, "grad_norm": 1.2890625, "learning_rate": 4.757905179065437e-05, "loss": 0.0047, "mean_token_accuracy": 0.9991724133491516, "num_tokens": 108946896.0, "step": 34560 }, { "entropy": 0.06002172166481614, "epoch": 8.057116214010957, "grad_norm": 1.4765625, "learning_rate": 4.75781488847952e-05, "loss": 0.0063, "mean_token_accuracy": 0.9990639746189117, "num_tokens": 108973470.0, "step": 34565 }, { "entropy": 0.06480908133089543, "epoch": 8.05828185103159, "grad_norm": 0.21875, "learning_rate": 4.757724582865568e-05, "loss": 0.0017, "mean_token_accuracy": 0.999623441696167, "num_tokens": 108994182.0, "step": 34570 }, { "entropy": 0.18375195730477573, "epoch": 8.059447488052221, "grad_norm": 4.6875, "learning_rate": 4.7576342622249263e-05, "loss": 0.2482, "mean_token_accuracy": 0.9745356976985932, "num_tokens": 109016581.0, "step": 34575 }, { "entropy": 0.049759179912507535, "epoch": 8.060613125072852, "grad_norm": 0.06591796875, "learning_rate": 4.757543926558943e-05, "loss": 0.0009, "mean_token_accuracy": 0.9999882936477661, "num_tokens": 109042756.0, "step": 34580 }, { "entropy": 0.06753350887447596, "epoch": 8.061778762093484, "grad_norm": 0.25390625, "learning_rate": 4.757453575868963e-05, "loss": 0.0037, "mean_token_accuracy": 0.9990081429481507, "num_tokens": 109061726.0, "step": 34585 }, { "entropy": 0.053464246727526185, "epoch": 8.062944399114116, "grad_norm": 0.240234375, "learning_rate": 4.757363210156336e-05, "loss": 0.0019, "mean_token_accuracy": 0.9998391628265381, "num_tokens": 109097127.0, "step": 34590 }, { "entropy": 0.061327016353607176, "epoch": 8.064110036134748, "grad_norm": 0.58203125, "learning_rate": 4.757272829422407e-05, "loss": 0.0019, "mean_token_accuracy": 0.9997431099414825, "num_tokens": 109122097.0, "step": 34595 }, { "entropy": 0.05932259801775217, "epoch": 8.06527567315538, "grad_norm": 2.328125, "learning_rate": 4.7571824336685266e-05, "loss": 0.0085, "mean_token_accuracy": 0.9978394627571106, "num_tokens": 109135503.0, "step": 34600 }, { "entropy": 0.05971184335649014, "epoch": 8.06644131017601, "grad_norm": 1.1015625, "learning_rate": 4.75709202289604e-05, "loss": 0.0014, "mean_token_accuracy": 0.999554580450058, "num_tokens": 109160791.0, "step": 34605 }, { "entropy": 0.14114007484167815, "epoch": 8.067606947196643, "grad_norm": 3.96875, "learning_rate": 4.757001597106297e-05, "loss": 0.1574, "mean_token_accuracy": 0.978978055715561, "num_tokens": 109184846.0, "step": 34610 }, { "entropy": 0.0502942712046206, "epoch": 8.068772584217275, "grad_norm": 1.6484375, "learning_rate": 4.7569111563006464e-05, "loss": 0.0055, "mean_token_accuracy": 0.9975858092308044, "num_tokens": 109208910.0, "step": 34615 }, { "entropy": 0.06697574276477099, "epoch": 8.069938221237907, "grad_norm": 1.03125, "learning_rate": 4.756820700480435e-05, "loss": 0.0077, "mean_token_accuracy": 0.997665673494339, "num_tokens": 109232222.0, "step": 34620 }, { "entropy": 0.05864384537562728, "epoch": 8.071103858258539, "grad_norm": 0.173828125, "learning_rate": 4.756730229647014e-05, "loss": 0.0023, "mean_token_accuracy": 0.9985059797763824, "num_tokens": 109261238.0, "step": 34625 }, { "entropy": 0.05310507658869028, "epoch": 8.072269495279171, "grad_norm": 2.8125, "learning_rate": 4.756639743801729e-05, "loss": 0.0096, "mean_token_accuracy": 0.9981181919574738, "num_tokens": 109293773.0, "step": 34630 }, { "entropy": 0.055432358756661415, "epoch": 8.073435132299801, "grad_norm": 0.31640625, "learning_rate": 4.7565492429459327e-05, "loss": 0.0017, "mean_token_accuracy": 0.9994365215301514, "num_tokens": 109305075.0, "step": 34635 }, { "entropy": 0.06921327225863934, "epoch": 8.074600769320433, "grad_norm": 0.1806640625, "learning_rate": 4.756458727080973e-05, "loss": 0.0364, "mean_token_accuracy": 0.9931345105171203, "num_tokens": 109328180.0, "step": 34640 }, { "entropy": 0.09400631617754698, "epoch": 8.075766406341065, "grad_norm": 0.11474609375, "learning_rate": 4.7563681962082e-05, "loss": 0.0019, "mean_token_accuracy": 0.9998493969440461, "num_tokens": 109340085.0, "step": 34645 }, { "entropy": 0.06292316848412156, "epoch": 8.076932043361698, "grad_norm": 0.78515625, "learning_rate": 4.756277650328963e-05, "loss": 0.0029, "mean_token_accuracy": 0.9987617790699005, "num_tokens": 109361790.0, "step": 34650 }, { "entropy": 0.04522802149876952, "epoch": 8.07809768038233, "grad_norm": 0.1240234375, "learning_rate": 4.756187089444613e-05, "loss": 0.0083, "mean_token_accuracy": 0.9972607851028442, "num_tokens": 109391813.0, "step": 34655 }, { "entropy": 0.07555988952517509, "epoch": 8.07926331740296, "grad_norm": 0.35546875, "learning_rate": 4.7560965135565e-05, "loss": 0.0046, "mean_token_accuracy": 0.9993071317672729, "num_tokens": 109424065.0, "step": 34660 }, { "entropy": 0.07265737438574434, "epoch": 8.080428954423592, "grad_norm": 2.0625, "learning_rate": 4.756005922665975e-05, "loss": 0.0037, "mean_token_accuracy": 0.998919528722763, "num_tokens": 109435647.0, "step": 34665 }, { "entropy": 0.06973394704982638, "epoch": 8.081594591444224, "grad_norm": 0.5, "learning_rate": 4.7559153167743886e-05, "loss": 0.0037, "mean_token_accuracy": 0.9993018686771393, "num_tokens": 109455917.0, "step": 34670 }, { "entropy": 0.06335199475288392, "epoch": 8.082760228464856, "grad_norm": 0.201171875, "learning_rate": 4.7558246958830916e-05, "loss": 0.0023, "mean_token_accuracy": 0.9993453681468963, "num_tokens": 109466263.0, "step": 34675 }, { "entropy": 0.058289121463894845, "epoch": 8.083925865485488, "grad_norm": 1.4453125, "learning_rate": 4.755734059993436e-05, "loss": 0.0046, "mean_token_accuracy": 0.9986499547958374, "num_tokens": 109479761.0, "step": 34680 }, { "entropy": 0.06979324482381344, "epoch": 8.085091502506119, "grad_norm": 2.59375, "learning_rate": 4.755643409106772e-05, "loss": 0.0075, "mean_token_accuracy": 0.9977883517742157, "num_tokens": 109493751.0, "step": 34685 }, { "entropy": 0.06401132261380553, "epoch": 8.08625713952675, "grad_norm": 0.263671875, "learning_rate": 4.755552743224453e-05, "loss": 0.0091, "mean_token_accuracy": 0.9982850253582001, "num_tokens": 109514572.0, "step": 34690 }, { "entropy": 0.07194490609690547, "epoch": 8.087422776547383, "grad_norm": 0.47265625, "learning_rate": 4.7554620623478294e-05, "loss": 0.0208, "mean_token_accuracy": 0.9971080601215363, "num_tokens": 109539657.0, "step": 34695 }, { "entropy": 0.06034817099571228, "epoch": 8.088588413568015, "grad_norm": 0.33203125, "learning_rate": 4.755371366478255e-05, "loss": 0.0026, "mean_token_accuracy": 0.9994564294815064, "num_tokens": 109560886.0, "step": 34700 }, { "entropy": 0.05557254049926996, "epoch": 8.089754050588647, "grad_norm": 0.39453125, "learning_rate": 4.755280655617081e-05, "loss": 0.0045, "mean_token_accuracy": 0.9980686604976654, "num_tokens": 109591798.0, "step": 34705 }, { "entropy": 0.05787776857614517, "epoch": 8.09091968760928, "grad_norm": 0.439453125, "learning_rate": 4.75518992976566e-05, "loss": 0.003, "mean_token_accuracy": 0.9983800172805786, "num_tokens": 109614304.0, "step": 34710 }, { "entropy": 0.07849300522357225, "epoch": 8.09208532462991, "grad_norm": 0.58203125, "learning_rate": 4.755099188925346e-05, "loss": 0.0017, "mean_token_accuracy": 0.999658054113388, "num_tokens": 109628888.0, "step": 34715 }, { "entropy": 0.05869809268042445, "epoch": 8.093250961650542, "grad_norm": 0.1923828125, "learning_rate": 4.7550084330974906e-05, "loss": 0.0044, "mean_token_accuracy": 0.9989194750785828, "num_tokens": 109655835.0, "step": 34720 }, { "entropy": 0.06568187102675438, "epoch": 8.094416598671174, "grad_norm": 1.84375, "learning_rate": 4.7549176622834476e-05, "loss": 0.0048, "mean_token_accuracy": 0.9967410624027252, "num_tokens": 109675362.0, "step": 34725 }, { "entropy": 0.07911296645179391, "epoch": 8.095582235691806, "grad_norm": 0.1171875, "learning_rate": 4.754826876484572e-05, "loss": 0.0051, "mean_token_accuracy": 0.998859316110611, "num_tokens": 109695961.0, "step": 34730 }, { "entropy": 0.05620421562343836, "epoch": 8.096747872712438, "grad_norm": 0.6171875, "learning_rate": 4.754736075702216e-05, "loss": 0.0051, "mean_token_accuracy": 0.9992101728916168, "num_tokens": 109727604.0, "step": 34735 }, { "entropy": 0.06297731064260007, "epoch": 8.097913509733068, "grad_norm": 0.47265625, "learning_rate": 4.754645259937733e-05, "loss": 0.0027, "mean_token_accuracy": 0.9980818867683411, "num_tokens": 109747309.0, "step": 34740 }, { "entropy": 0.06897974638268352, "epoch": 8.0990791467537, "grad_norm": 0.59375, "learning_rate": 4.754554429192479e-05, "loss": 0.0034, "mean_token_accuracy": 0.9988614618778229, "num_tokens": 109769049.0, "step": 34745 }, { "entropy": 0.056776642613112924, "epoch": 8.100244783774333, "grad_norm": 1.3203125, "learning_rate": 4.754463583467808e-05, "loss": 0.0036, "mean_token_accuracy": 0.9992092728614808, "num_tokens": 109792119.0, "step": 34750 }, { "entropy": 0.07602698244154453, "epoch": 8.101410420794965, "grad_norm": 0.33203125, "learning_rate": 4.754372722765073e-05, "loss": 0.0018, "mean_token_accuracy": 0.9994484543800354, "num_tokens": 109801617.0, "step": 34755 }, { "entropy": 0.0649228509515524, "epoch": 8.102576057815597, "grad_norm": 1.4765625, "learning_rate": 4.7542818470856295e-05, "loss": 0.0023, "mean_token_accuracy": 0.9990556597709656, "num_tokens": 109812394.0, "step": 34760 }, { "entropy": 0.07589486986398697, "epoch": 8.103741694836229, "grad_norm": 0.6015625, "learning_rate": 4.754190956430834e-05, "loss": 0.0013, "mean_token_accuracy": 0.9998491406440735, "num_tokens": 109831667.0, "step": 34765 }, { "entropy": 0.0709638643078506, "epoch": 8.10490733185686, "grad_norm": 0.5703125, "learning_rate": 4.7541000508020415e-05, "loss": 0.0016, "mean_token_accuracy": 0.9997025489807129, "num_tokens": 109857628.0, "step": 34770 }, { "entropy": 0.09596365876495838, "epoch": 8.106072968877491, "grad_norm": 0.89453125, "learning_rate": 4.7540091302006065e-05, "loss": 0.0024, "mean_token_accuracy": 0.9993020176887513, "num_tokens": 109867038.0, "step": 34775 }, { "entropy": 0.04796134922653437, "epoch": 8.107238605898123, "grad_norm": 0.365234375, "learning_rate": 4.753918194627886e-05, "loss": 0.0016, "mean_token_accuracy": 0.999389237165451, "num_tokens": 109894744.0, "step": 34780 }, { "entropy": 0.0685146126896143, "epoch": 8.108404242918756, "grad_norm": 1.59375, "learning_rate": 4.753827244085234e-05, "loss": 0.0105, "mean_token_accuracy": 0.9986864030361176, "num_tokens": 109905112.0, "step": 34785 }, { "entropy": 0.050441629253327847, "epoch": 8.109569879939388, "grad_norm": 0.275390625, "learning_rate": 4.7537362785740084e-05, "loss": 0.0012, "mean_token_accuracy": 0.9994386732578278, "num_tokens": 109927501.0, "step": 34790 }, { "entropy": 0.05915358606725931, "epoch": 8.110735516960018, "grad_norm": 0.2177734375, "learning_rate": 4.7536452980955656e-05, "loss": 0.0075, "mean_token_accuracy": 0.9971899330615998, "num_tokens": 109956240.0, "step": 34795 }, { "entropy": 0.058942811191082, "epoch": 8.11190115398065, "grad_norm": 0.466796875, "learning_rate": 4.7535543026512616e-05, "loss": 0.0083, "mean_token_accuracy": 0.9983623147010803, "num_tokens": 109971126.0, "step": 34800 }, { "entropy": 0.06020897924900055, "epoch": 8.113066791001282, "grad_norm": 0.263671875, "learning_rate": 4.753463292242453e-05, "loss": 0.0029, "mean_token_accuracy": 0.9993130803108216, "num_tokens": 109992921.0, "step": 34805 }, { "entropy": 0.05796087365597487, "epoch": 8.114232428021914, "grad_norm": 1.1875, "learning_rate": 4.7533722668704975e-05, "loss": 0.0038, "mean_token_accuracy": 0.9987258195877076, "num_tokens": 110003456.0, "step": 34810 }, { "entropy": 0.074308517947793, "epoch": 8.115398065042546, "grad_norm": 1.7890625, "learning_rate": 4.7532812265367534e-05, "loss": 0.0019, "mean_token_accuracy": 0.9994267642498016, "num_tokens": 110013095.0, "step": 34815 }, { "entropy": 0.06139457672834396, "epoch": 8.116563702063177, "grad_norm": 0.41015625, "learning_rate": 4.753190171242576e-05, "loss": 0.0033, "mean_token_accuracy": 0.998884916305542, "num_tokens": 110026782.0, "step": 34820 }, { "entropy": 0.05485517792403698, "epoch": 8.117729339083809, "grad_norm": 1.109375, "learning_rate": 4.753099100989324e-05, "loss": 0.0026, "mean_token_accuracy": 0.9990606665611267, "num_tokens": 110039842.0, "step": 34825 }, { "entropy": 0.07333013117313385, "epoch": 8.11889497610444, "grad_norm": 0.51171875, "learning_rate": 4.753008015778357e-05, "loss": 0.0048, "mean_token_accuracy": 0.9987470507621765, "num_tokens": 110053856.0, "step": 34830 }, { "entropy": 0.05965627012774348, "epoch": 8.120060613125073, "grad_norm": 0.4140625, "learning_rate": 4.752916915611031e-05, "loss": 0.0023, "mean_token_accuracy": 0.9996311247348786, "num_tokens": 110069531.0, "step": 34835 }, { "entropy": 0.04760171640664339, "epoch": 8.121226250145705, "grad_norm": 0.6875, "learning_rate": 4.7528258004887064e-05, "loss": 0.0016, "mean_token_accuracy": 0.9989043712615967, "num_tokens": 110088907.0, "step": 34840 }, { "entropy": 0.050096172746270894, "epoch": 8.122391887166337, "grad_norm": 0.380859375, "learning_rate": 4.75273467041274e-05, "loss": 0.0015, "mean_token_accuracy": 0.9991250276565552, "num_tokens": 110114696.0, "step": 34845 }, { "entropy": 0.06456055156886578, "epoch": 8.123557524186968, "grad_norm": 1.1015625, "learning_rate": 4.752643525384491e-05, "loss": 0.0042, "mean_token_accuracy": 0.9991252899169922, "num_tokens": 110127399.0, "step": 34850 }, { "entropy": 0.059490696713328364, "epoch": 8.1247231612076, "grad_norm": 1.8828125, "learning_rate": 4.7525523654053194e-05, "loss": 0.0116, "mean_token_accuracy": 0.9978362441062927, "num_tokens": 110146673.0, "step": 34855 }, { "entropy": 0.07245905827730895, "epoch": 8.125888798228232, "grad_norm": 0.24609375, "learning_rate": 4.752461190476584e-05, "loss": 0.0014, "mean_token_accuracy": 0.9997872352600098, "num_tokens": 110158175.0, "step": 34860 }, { "entropy": 0.0838863905519247, "epoch": 8.127054435248864, "grad_norm": 0.8359375, "learning_rate": 4.752370000599644e-05, "loss": 0.0058, "mean_token_accuracy": 0.9980123996734619, "num_tokens": 110167746.0, "step": 34865 }, { "entropy": 0.06200479865074158, "epoch": 8.128220072269496, "grad_norm": 0.28515625, "learning_rate": 4.752278795775861e-05, "loss": 0.006, "mean_token_accuracy": 0.9985685169696807, "num_tokens": 110196220.0, "step": 34870 }, { "entropy": 0.08028090875595809, "epoch": 8.129385709290126, "grad_norm": 0.1845703125, "learning_rate": 4.752187576006593e-05, "loss": 0.0027, "mean_token_accuracy": 0.9991875410079956, "num_tokens": 110209683.0, "step": 34875 }, { "entropy": 0.05563276670873165, "epoch": 8.130551346310758, "grad_norm": 3.0, "learning_rate": 4.7520963412932004e-05, "loss": 0.007, "mean_token_accuracy": 0.9985905766487122, "num_tokens": 110240404.0, "step": 34880 }, { "entropy": 0.06231335056945682, "epoch": 8.13171698333139, "grad_norm": 0.1875, "learning_rate": 4.752005091637044e-05, "loss": 0.0019, "mean_token_accuracy": 0.9995314836502075, "num_tokens": 110267847.0, "step": 34885 }, { "entropy": 0.054864462465047836, "epoch": 8.132882620352023, "grad_norm": 0.2412109375, "learning_rate": 4.751913827039485e-05, "loss": 0.004, "mean_token_accuracy": 0.9984963834285736, "num_tokens": 110281664.0, "step": 34890 }, { "entropy": 0.056126052886247633, "epoch": 8.134048257372655, "grad_norm": 0.2333984375, "learning_rate": 4.751822547501884e-05, "loss": 0.0032, "mean_token_accuracy": 0.9994036316871643, "num_tokens": 110311704.0, "step": 34895 }, { "entropy": 0.053292475175112486, "epoch": 8.135213894393287, "grad_norm": 2.25, "learning_rate": 4.751731253025602e-05, "loss": 0.0045, "mean_token_accuracy": 0.9993162214756012, "num_tokens": 110334636.0, "step": 34900 }, { "entropy": 0.046154945995658635, "epoch": 8.136379531413917, "grad_norm": 0.27734375, "learning_rate": 4.751639943612e-05, "loss": 0.006, "mean_token_accuracy": 0.998881858587265, "num_tokens": 110352633.0, "step": 34905 }, { "entropy": 0.07911163456737995, "epoch": 8.13754516843455, "grad_norm": 1.8359375, "learning_rate": 4.7515486192624404e-05, "loss": 0.0116, "mean_token_accuracy": 0.9959693968296051, "num_tokens": 110363195.0, "step": 34910 }, { "entropy": 0.06139131411910057, "epoch": 8.138710805455181, "grad_norm": 1.2109375, "learning_rate": 4.7514572799782845e-05, "loss": 0.0036, "mean_token_accuracy": 0.9995906531810761, "num_tokens": 110377365.0, "step": 34915 }, { "entropy": 0.05661859530955553, "epoch": 8.139876442475813, "grad_norm": 0.6640625, "learning_rate": 4.7513659257608937e-05, "loss": 0.0052, "mean_token_accuracy": 0.9978150308132172, "num_tokens": 110393155.0, "step": 34920 }, { "entropy": 0.059014659747481345, "epoch": 8.141042079496446, "grad_norm": 2.59375, "learning_rate": 4.7512745566116306e-05, "loss": 0.0045, "mean_token_accuracy": 0.9981638669967652, "num_tokens": 110415426.0, "step": 34925 }, { "entropy": 0.05055002924054861, "epoch": 8.142207716517076, "grad_norm": 0.74609375, "learning_rate": 4.751183172531858e-05, "loss": 0.0031, "mean_token_accuracy": 0.9986936986446381, "num_tokens": 110440385.0, "step": 34930 }, { "entropy": 0.09240026883780957, "epoch": 8.143373353537708, "grad_norm": 0.9296875, "learning_rate": 4.7510917735229395e-05, "loss": 0.0066, "mean_token_accuracy": 0.9980834245681762, "num_tokens": 110448900.0, "step": 34935 }, { "entropy": 0.0694688574410975, "epoch": 8.14453899055834, "grad_norm": 1.25, "learning_rate": 4.7510003595862354e-05, "loss": 0.0027, "mean_token_accuracy": 0.999321436882019, "num_tokens": 110467815.0, "step": 34940 }, { "entropy": 0.06604335568845272, "epoch": 8.145704627578972, "grad_norm": 0.74609375, "learning_rate": 4.7509089307231114e-05, "loss": 0.0029, "mean_token_accuracy": 0.9987660408020019, "num_tokens": 110484450.0, "step": 34945 }, { "entropy": 0.04780478095635772, "epoch": 8.146870264599604, "grad_norm": 0.65625, "learning_rate": 4.75081748693493e-05, "loss": 0.008, "mean_token_accuracy": 0.9983278274536133, "num_tokens": 110505707.0, "step": 34950 }, { "entropy": 0.08801990495994687, "epoch": 8.148035901620235, "grad_norm": 0.21484375, "learning_rate": 4.750726028223054e-05, "loss": 0.0361, "mean_token_accuracy": 0.9941394925117493, "num_tokens": 110533523.0, "step": 34955 }, { "entropy": 0.093077028170228, "epoch": 8.149201538640867, "grad_norm": 0.359375, "learning_rate": 4.7506345545888475e-05, "loss": 0.008, "mean_token_accuracy": 0.997499966621399, "num_tokens": 110541514.0, "step": 34960 }, { "entropy": 0.06018462534993887, "epoch": 8.150367175661499, "grad_norm": 0.408203125, "learning_rate": 4.750543066033675e-05, "loss": 0.0071, "mean_token_accuracy": 0.9981537163257599, "num_tokens": 110571866.0, "step": 34965 }, { "entropy": 0.06054795626550913, "epoch": 8.151532812682131, "grad_norm": 0.2060546875, "learning_rate": 4.750451562558901e-05, "loss": 0.0063, "mean_token_accuracy": 0.9983173251152039, "num_tokens": 110590691.0, "step": 34970 }, { "entropy": 0.08489367999136448, "epoch": 8.152698449702763, "grad_norm": 0.2001953125, "learning_rate": 4.7503600441658886e-05, "loss": 0.0021, "mean_token_accuracy": 0.9990243017673492, "num_tokens": 110607698.0, "step": 34975 }, { "entropy": 0.06136819664388895, "epoch": 8.153864086723395, "grad_norm": 2.328125, "learning_rate": 4.750268510856003e-05, "loss": 0.0064, "mean_token_accuracy": 0.9964813113212585, "num_tokens": 110622126.0, "step": 34980 }, { "entropy": 0.054417230654507875, "epoch": 8.155029723744025, "grad_norm": 0.28515625, "learning_rate": 4.750176962630611e-05, "loss": 0.0044, "mean_token_accuracy": 0.9996447026729584, "num_tokens": 110655232.0, "step": 34985 }, { "entropy": 0.08106433693319559, "epoch": 8.156195360764658, "grad_norm": 2.03125, "learning_rate": 4.750085399491075e-05, "loss": 0.0041, "mean_token_accuracy": 0.9984701454639435, "num_tokens": 110666627.0, "step": 34990 }, { "entropy": 0.06782528571784496, "epoch": 8.15736099778529, "grad_norm": 0.5546875, "learning_rate": 4.7499938214387616e-05, "loss": 0.0021, "mean_token_accuracy": 0.9992859661579132, "num_tokens": 110676515.0, "step": 34995 }, { "entropy": 0.059601149149239065, "epoch": 8.158526634805922, "grad_norm": 0.45703125, "learning_rate": 4.7499022284750367e-05, "loss": 0.0011, "mean_token_accuracy": 0.9995229601860046, "num_tokens": 110701167.0, "step": 35000 }, { "entropy": 0.061528265848755835, "epoch": 8.159692271826554, "grad_norm": 0.1337890625, "learning_rate": 4.749810620601265e-05, "loss": 0.0076, "mean_token_accuracy": 0.9985450446605683, "num_tokens": 110728803.0, "step": 35005 }, { "entropy": 0.0531819636002183, "epoch": 8.160857908847184, "grad_norm": 0.173828125, "learning_rate": 4.749718997818814e-05, "loss": 0.0016, "mean_token_accuracy": 0.9998791217803955, "num_tokens": 110749977.0, "step": 35010 }, { "entropy": 0.06418558629229665, "epoch": 8.162023545867816, "grad_norm": 0.267578125, "learning_rate": 4.749627360129048e-05, "loss": 0.0015, "mean_token_accuracy": 0.9985103130340576, "num_tokens": 110773354.0, "step": 35015 }, { "entropy": 0.05865657143294811, "epoch": 8.163189182888448, "grad_norm": 0.326171875, "learning_rate": 4.749535707533335e-05, "loss": 0.0111, "mean_token_accuracy": 0.9982908666133881, "num_tokens": 110793997.0, "step": 35020 }, { "entropy": 0.04337018961086869, "epoch": 8.16435481990908, "grad_norm": 1.09375, "learning_rate": 4.749444040033042e-05, "loss": 0.0047, "mean_token_accuracy": 0.9983787119388581, "num_tokens": 110833879.0, "step": 35025 }, { "entropy": 0.05562132876366377, "epoch": 8.165520456929713, "grad_norm": 0.419921875, "learning_rate": 4.749352357629534e-05, "loss": 0.0014, "mean_token_accuracy": 0.9996047914028168, "num_tokens": 110858174.0, "step": 35030 }, { "entropy": 0.06040436141192913, "epoch": 8.166686093950345, "grad_norm": 0.64453125, "learning_rate": 4.74926066032418e-05, "loss": 0.0012, "mean_token_accuracy": 0.9999040007591248, "num_tokens": 110869837.0, "step": 35035 }, { "entropy": 0.05392250213772058, "epoch": 8.167851730970975, "grad_norm": 0.26953125, "learning_rate": 4.7491689481183454e-05, "loss": 0.0032, "mean_token_accuracy": 0.9994827568531036, "num_tokens": 110885984.0, "step": 35040 }, { "entropy": 0.05607498260214925, "epoch": 8.169017367991607, "grad_norm": 0.65625, "learning_rate": 4.7490772210134007e-05, "loss": 0.0025, "mean_token_accuracy": 0.9998294651508332, "num_tokens": 110915562.0, "step": 35045 }, { "entropy": 0.06420133570209145, "epoch": 8.17018300501224, "grad_norm": 0.466796875, "learning_rate": 4.7489854790107104e-05, "loss": 0.001, "mean_token_accuracy": 0.9996419370174408, "num_tokens": 110942745.0, "step": 35050 }, { "entropy": 0.047397821210324764, "epoch": 8.171348642032871, "grad_norm": 1.265625, "learning_rate": 4.7488937221116446e-05, "loss": 0.0022, "mean_token_accuracy": 0.9984205842018128, "num_tokens": 110963960.0, "step": 35055 }, { "entropy": 0.0502863303758204, "epoch": 8.172514279053503, "grad_norm": 0.30078125, "learning_rate": 4.748801950317571e-05, "loss": 0.0018, "mean_token_accuracy": 0.9994450092315674, "num_tokens": 110988319.0, "step": 35060 }, { "entropy": 0.05009726490825415, "epoch": 8.173679916074134, "grad_norm": 1.703125, "learning_rate": 4.748710163629858e-05, "loss": 0.0028, "mean_token_accuracy": 0.9994385838508606, "num_tokens": 111018445.0, "step": 35065 }, { "entropy": 0.07948374161496759, "epoch": 8.174845553094766, "grad_norm": 0.09912109375, "learning_rate": 4.748618362049875e-05, "loss": 0.0021, "mean_token_accuracy": 0.9990148901939392, "num_tokens": 111044428.0, "step": 35070 }, { "entropy": 0.06243461025878787, "epoch": 8.176011190115398, "grad_norm": 1.8203125, "learning_rate": 4.7485265455789894e-05, "loss": 0.0065, "mean_token_accuracy": 0.998746132850647, "num_tokens": 111070780.0, "step": 35075 }, { "entropy": 0.061110925208777186, "epoch": 8.17717682713603, "grad_norm": 5.25, "learning_rate": 4.748434714218571e-05, "loss": 0.0052, "mean_token_accuracy": 0.9985956192016602, "num_tokens": 111088837.0, "step": 35080 }, { "entropy": 0.06326413620263338, "epoch": 8.178342464156662, "grad_norm": 1.7890625, "learning_rate": 4.74834286796999e-05, "loss": 0.0057, "mean_token_accuracy": 0.9978280127048492, "num_tokens": 111115422.0, "step": 35085 }, { "entropy": 0.05515581658110023, "epoch": 8.179508101177293, "grad_norm": 1.2890625, "learning_rate": 4.7482510068346145e-05, "loss": 0.0052, "mean_token_accuracy": 0.9984103560447692, "num_tokens": 111129777.0, "step": 35090 }, { "entropy": 0.06779031567275524, "epoch": 8.180673738197925, "grad_norm": 1.0546875, "learning_rate": 4.748159130813816e-05, "loss": 0.0117, "mean_token_accuracy": 0.9983104467391968, "num_tokens": 111145557.0, "step": 35095 }, { "entropy": 0.05313057005405426, "epoch": 8.181839375218557, "grad_norm": 0.0771484375, "learning_rate": 4.748067239908963e-05, "loss": 0.0046, "mean_token_accuracy": 0.9984862327575683, "num_tokens": 111169517.0, "step": 35100 }, { "entropy": 0.07514633145183325, "epoch": 8.183005012239189, "grad_norm": 0.90625, "learning_rate": 4.747975334121426e-05, "loss": 0.0045, "mean_token_accuracy": 0.9984767377376557, "num_tokens": 111182469.0, "step": 35105 }, { "entropy": 0.0630564677529037, "epoch": 8.184170649259821, "grad_norm": 0.1298828125, "learning_rate": 4.7478834134525756e-05, "loss": 0.0051, "mean_token_accuracy": 0.9989075541496277, "num_tokens": 111214410.0, "step": 35110 }, { "entropy": 0.058083107136189936, "epoch": 8.185336286280453, "grad_norm": 0.734375, "learning_rate": 4.747791477903783e-05, "loss": 0.0103, "mean_token_accuracy": 0.9987756073474884, "num_tokens": 111234767.0, "step": 35115 }, { "entropy": 0.04894097140058875, "epoch": 8.186501923301083, "grad_norm": 0.6328125, "learning_rate": 4.7476995274764186e-05, "loss": 0.002, "mean_token_accuracy": 0.9996597945690155, "num_tokens": 111261638.0, "step": 35120 }, { "entropy": 0.06282207742333412, "epoch": 8.187667560321715, "grad_norm": 1.21875, "learning_rate": 4.747607562171854e-05, "loss": 0.0075, "mean_token_accuracy": 0.9990973234176636, "num_tokens": 111271401.0, "step": 35125 }, { "entropy": 0.07177103348076344, "epoch": 8.188833197342348, "grad_norm": 2.234375, "learning_rate": 4.74751558199146e-05, "loss": 0.005, "mean_token_accuracy": 0.9983224630355835, "num_tokens": 111280641.0, "step": 35130 }, { "entropy": 0.09346720837056637, "epoch": 8.18999883436298, "grad_norm": 0.0859375, "learning_rate": 4.747423586936608e-05, "loss": 0.0024, "mean_token_accuracy": 0.9991428554058075, "num_tokens": 111288976.0, "step": 35135 }, { "entropy": 0.06719689685851335, "epoch": 8.191164471383612, "grad_norm": 2.5625, "learning_rate": 4.74733157700867e-05, "loss": 0.0044, "mean_token_accuracy": 0.9985256791114807, "num_tokens": 111298730.0, "step": 35140 }, { "entropy": 0.08412168696522712, "epoch": 8.192330108404242, "grad_norm": 2.921875, "learning_rate": 4.7472395522090186e-05, "loss": 0.0323, "mean_token_accuracy": 0.9948005080223083, "num_tokens": 111317180.0, "step": 35145 }, { "entropy": 0.06580277308821678, "epoch": 8.193495745424874, "grad_norm": 0.7890625, "learning_rate": 4.747147512539025e-05, "loss": 0.003, "mean_token_accuracy": 0.9991651713848114, "num_tokens": 111331434.0, "step": 35150 }, { "entropy": 0.041099204868078235, "epoch": 8.194661382445506, "grad_norm": 1.75, "learning_rate": 4.747055458000063e-05, "loss": 0.0019, "mean_token_accuracy": 0.9992423474788665, "num_tokens": 111369635.0, "step": 35155 }, { "entropy": 0.0656563414260745, "epoch": 8.195827019466138, "grad_norm": 0.3203125, "learning_rate": 4.7469633885935037e-05, "loss": 0.0034, "mean_token_accuracy": 0.9993948400020599, "num_tokens": 111391309.0, "step": 35160 }, { "entropy": 0.06426733545958996, "epoch": 8.19699265648677, "grad_norm": 1.3828125, "learning_rate": 4.746871304320721e-05, "loss": 0.0046, "mean_token_accuracy": 0.999400395154953, "num_tokens": 111406533.0, "step": 35165 }, { "entropy": 0.06955592483282089, "epoch": 8.198158293507403, "grad_norm": 3.875, "learning_rate": 4.746779205183088e-05, "loss": 0.0143, "mean_token_accuracy": 0.9961112380027771, "num_tokens": 111418189.0, "step": 35170 }, { "entropy": 0.045057310909032824, "epoch": 8.199323930528033, "grad_norm": 0.328125, "learning_rate": 4.746687091181977e-05, "loss": 0.0046, "mean_token_accuracy": 0.9985878467559814, "num_tokens": 111449378.0, "step": 35175 }, { "entropy": 0.049578765965998174, "epoch": 8.200489567548665, "grad_norm": 0.40234375, "learning_rate": 4.7465949623187635e-05, "loss": 0.0035, "mean_token_accuracy": 0.9987337410449981, "num_tokens": 111474869.0, "step": 35180 }, { "entropy": 0.04917066413909197, "epoch": 8.201655204569297, "grad_norm": 0.173828125, "learning_rate": 4.74650281859482e-05, "loss": 0.0017, "mean_token_accuracy": 0.9996753096580505, "num_tokens": 111502113.0, "step": 35185 }, { "entropy": 0.07581342617049813, "epoch": 8.20282084158993, "grad_norm": 0.40234375, "learning_rate": 4.74641066001152e-05, "loss": 0.0028, "mean_token_accuracy": 0.9993069410324097, "num_tokens": 111519087.0, "step": 35190 }, { "entropy": 0.051918433699756864, "epoch": 8.203986478610561, "grad_norm": 2.921875, "learning_rate": 4.7463184865702386e-05, "loss": 0.0062, "mean_token_accuracy": 0.9992042243480682, "num_tokens": 111557289.0, "step": 35195 }, { "entropy": 0.06914200708270073, "epoch": 8.205152115631192, "grad_norm": 2.671875, "learning_rate": 4.74622629827235e-05, "loss": 0.0062, "mean_token_accuracy": 0.9993867039680481, "num_tokens": 111567208.0, "step": 35200 }, { "entropy": 0.07257007220759988, "epoch": 8.206317752651824, "grad_norm": 1.3125, "learning_rate": 4.746134095119229e-05, "loss": 0.0114, "mean_token_accuracy": 0.9968818962574005, "num_tokens": 111579305.0, "step": 35205 }, { "entropy": 0.0569312437903136, "epoch": 8.207483389672456, "grad_norm": 0.98046875, "learning_rate": 4.7460418771122505e-05, "loss": 0.0097, "mean_token_accuracy": 0.9975073218345643, "num_tokens": 111610993.0, "step": 35210 }, { "entropy": 0.06311609046533703, "epoch": 8.208649026693088, "grad_norm": 0.68359375, "learning_rate": 4.7459496442527895e-05, "loss": 0.0029, "mean_token_accuracy": 0.9987077414989471, "num_tokens": 111635088.0, "step": 35215 }, { "entropy": 0.05613302402198315, "epoch": 8.20981466371372, "grad_norm": 2.25, "learning_rate": 4.7458573965422206e-05, "loss": 0.0047, "mean_token_accuracy": 0.9986424922943116, "num_tokens": 111648821.0, "step": 35220 }, { "entropy": 0.06687151882797479, "epoch": 8.21098030073435, "grad_norm": 0.224609375, "learning_rate": 4.745765133981921e-05, "loss": 0.0021, "mean_token_accuracy": 0.9990479350090027, "num_tokens": 111680389.0, "step": 35225 }, { "entropy": 0.05649174377322197, "epoch": 8.212145937754983, "grad_norm": 1.8125, "learning_rate": 4.745672856573265e-05, "loss": 0.0075, "mean_token_accuracy": 0.9981228053569794, "num_tokens": 111703894.0, "step": 35230 }, { "entropy": 0.054273920319974425, "epoch": 8.213311574775615, "grad_norm": 2.078125, "learning_rate": 4.7455805643176295e-05, "loss": 0.0074, "mean_token_accuracy": 0.9993931233882904, "num_tokens": 111726855.0, "step": 35235 }, { "entropy": 0.049174747243523595, "epoch": 8.214477211796247, "grad_norm": 0.2021484375, "learning_rate": 4.74548825721639e-05, "loss": 0.0025, "mean_token_accuracy": 0.9997669696807862, "num_tokens": 111771810.0, "step": 35240 }, { "entropy": 0.08168933633714914, "epoch": 8.215642848816879, "grad_norm": 0.203125, "learning_rate": 4.745395935270923e-05, "loss": 0.0034, "mean_token_accuracy": 0.9992255866527557, "num_tokens": 111795911.0, "step": 35245 }, { "entropy": 0.08296185713261366, "epoch": 8.216808485837511, "grad_norm": 0.103515625, "learning_rate": 4.745303598482606e-05, "loss": 0.0151, "mean_token_accuracy": 0.995855188369751, "num_tokens": 111826674.0, "step": 35250 }, { "entropy": 0.07323675518855452, "epoch": 8.217974122858141, "grad_norm": 0.97265625, "learning_rate": 4.7452112468528156e-05, "loss": 0.0149, "mean_token_accuracy": 0.997208696603775, "num_tokens": 111856193.0, "step": 35255 }, { "entropy": 0.059016761183738706, "epoch": 8.219139759878773, "grad_norm": 0.1572265625, "learning_rate": 4.7451188803829284e-05, "loss": 0.0011, "mean_token_accuracy": 0.9998023688793183, "num_tokens": 111865608.0, "step": 35260 }, { "entropy": 0.054262810340151194, "epoch": 8.220305396899406, "grad_norm": 0.765625, "learning_rate": 4.745026499074322e-05, "loss": 0.0027, "mean_token_accuracy": 0.9982050716876983, "num_tokens": 111886789.0, "step": 35265 }, { "entropy": 0.05554175898432732, "epoch": 8.221471033920038, "grad_norm": 0.515625, "learning_rate": 4.744934102928373e-05, "loss": 0.002, "mean_token_accuracy": 0.9989612340927124, "num_tokens": 111906176.0, "step": 35270 }, { "entropy": 0.045644909329712394, "epoch": 8.22263667094067, "grad_norm": 0.203125, "learning_rate": 4.7448416919464607e-05, "loss": 0.0016, "mean_token_accuracy": 0.9997182667255402, "num_tokens": 111942352.0, "step": 35275 }, { "entropy": 0.06484599094837903, "epoch": 8.2238023079613, "grad_norm": 0.2470703125, "learning_rate": 4.744749266129962e-05, "loss": 0.0099, "mean_token_accuracy": 0.9974075615406036, "num_tokens": 111964101.0, "step": 35280 }, { "entropy": 0.05839154152199626, "epoch": 8.224967944981932, "grad_norm": 1.1875, "learning_rate": 4.744656825480257e-05, "loss": 0.0032, "mean_token_accuracy": 0.9989475786685944, "num_tokens": 111989065.0, "step": 35285 }, { "entropy": 0.07809292711317539, "epoch": 8.226133582002564, "grad_norm": 0.7265625, "learning_rate": 4.744564369998721e-05, "loss": 0.0043, "mean_token_accuracy": 0.9986956357955933, "num_tokens": 112000207.0, "step": 35290 }, { "entropy": 0.050143054034560916, "epoch": 8.227299219023196, "grad_norm": 1.203125, "learning_rate": 4.7444718996867356e-05, "loss": 0.0033, "mean_token_accuracy": 0.9980408549308777, "num_tokens": 112021884.0, "step": 35295 }, { "entropy": 0.05025668404996395, "epoch": 8.228464856043828, "grad_norm": 0.125, "learning_rate": 4.744379414545678e-05, "loss": 0.0014, "mean_token_accuracy": 0.9998918652534485, "num_tokens": 112042919.0, "step": 35300 }, { "entropy": 0.0611870632506907, "epoch": 8.22963049306446, "grad_norm": 0.02685546875, "learning_rate": 4.744286914576927e-05, "loss": 0.0051, "mean_token_accuracy": 0.9994702398777008, "num_tokens": 112065484.0, "step": 35305 }, { "entropy": 0.07436528988182545, "epoch": 8.230796130085091, "grad_norm": 0.1767578125, "learning_rate": 4.744194399781863e-05, "loss": 0.0037, "mean_token_accuracy": 0.9982783615589141, "num_tokens": 112078227.0, "step": 35310 }, { "entropy": 0.042807167023420335, "epoch": 8.231961767105723, "grad_norm": 0.416015625, "learning_rate": 4.744101870161866e-05, "loss": 0.0025, "mean_token_accuracy": 0.9995752155780793, "num_tokens": 112103737.0, "step": 35315 }, { "entropy": 0.05692212600260973, "epoch": 8.233127404126355, "grad_norm": 1.6484375, "learning_rate": 4.744009325718314e-05, "loss": 0.0044, "mean_token_accuracy": 0.9979093134403229, "num_tokens": 112123907.0, "step": 35320 }, { "entropy": 0.05411262274719775, "epoch": 8.234293041146987, "grad_norm": 2.875, "learning_rate": 4.7439167664525876e-05, "loss": 0.0037, "mean_token_accuracy": 0.9989370942115784, "num_tokens": 112154557.0, "step": 35325 }, { "entropy": 0.060514886677265164, "epoch": 8.23545867816762, "grad_norm": 0.7421875, "learning_rate": 4.743824192366068e-05, "loss": 0.0069, "mean_token_accuracy": 0.9985116302967072, "num_tokens": 112179242.0, "step": 35330 }, { "entropy": 0.05649018418043852, "epoch": 8.23662431518825, "grad_norm": 0.09814453125, "learning_rate": 4.743731603460134e-05, "loss": 0.0014, "mean_token_accuracy": 0.9998935043811799, "num_tokens": 112210248.0, "step": 35335 }, { "entropy": 0.06745369862765074, "epoch": 8.237789952208882, "grad_norm": 0.08642578125, "learning_rate": 4.743638999736169e-05, "loss": 0.0034, "mean_token_accuracy": 0.9993011593818665, "num_tokens": 112228105.0, "step": 35340 }, { "entropy": 0.05459690997377038, "epoch": 8.238955589229514, "grad_norm": 0.50390625, "learning_rate": 4.74354638119555e-05, "loss": 0.003, "mean_token_accuracy": 0.9975790500640869, "num_tokens": 112243364.0, "step": 35345 }, { "entropy": 0.0657209831289947, "epoch": 8.240121226250146, "grad_norm": 2.484375, "learning_rate": 4.743453747839661e-05, "loss": 0.0023, "mean_token_accuracy": 0.9995322525501251, "num_tokens": 112274116.0, "step": 35350 }, { "entropy": 0.05723834093660116, "epoch": 8.241286863270778, "grad_norm": 0.255859375, "learning_rate": 4.743361099669882e-05, "loss": 0.0024, "mean_token_accuracy": 0.9994523048400878, "num_tokens": 112290556.0, "step": 35355 }, { "entropy": 0.059019094612449405, "epoch": 8.242452500291408, "grad_norm": 0.1796875, "learning_rate": 4.743268436687595e-05, "loss": 0.0012, "mean_token_accuracy": 0.9999679744243621, "num_tokens": 112312452.0, "step": 35360 }, { "entropy": 0.05994194708764553, "epoch": 8.24361813731204, "grad_norm": 0.416015625, "learning_rate": 4.743175758894182e-05, "loss": 0.0039, "mean_token_accuracy": 0.9986020386219024, "num_tokens": 112324430.0, "step": 35365 }, { "entropy": 0.07612526267766953, "epoch": 8.244783774332673, "grad_norm": 0.458984375, "learning_rate": 4.743083066291024e-05, "loss": 0.0029, "mean_token_accuracy": 0.9977519989013672, "num_tokens": 112363036.0, "step": 35370 }, { "entropy": 0.05744869448244572, "epoch": 8.245949411353305, "grad_norm": 1.359375, "learning_rate": 4.7429903588795044e-05, "loss": 0.0038, "mean_token_accuracy": 0.9983863711357117, "num_tokens": 112376745.0, "step": 35375 }, { "entropy": 0.07884750552475453, "epoch": 8.247115048373937, "grad_norm": 0.138671875, "learning_rate": 4.742897636661005e-05, "loss": 0.0098, "mean_token_accuracy": 0.9982732713222504, "num_tokens": 112395514.0, "step": 35380 }, { "entropy": 0.056860837060958146, "epoch": 8.248280685394569, "grad_norm": 0.19921875, "learning_rate": 4.742804899636908e-05, "loss": 0.0136, "mean_token_accuracy": 0.9961659133434295, "num_tokens": 112411532.0, "step": 35385 }, { "entropy": 0.13037989661097527, "epoch": 8.2494463224152, "grad_norm": 0.64453125, "learning_rate": 4.742712147808597e-05, "loss": 0.1419, "mean_token_accuracy": 0.9767031192779541, "num_tokens": 112433524.0, "step": 35390 }, { "entropy": 0.0718831043690443, "epoch": 8.250611959435831, "grad_norm": 0.69140625, "learning_rate": 4.742619381177455e-05, "loss": 0.0053, "mean_token_accuracy": 0.9996219515800476, "num_tokens": 112443322.0, "step": 35395 }, { "entropy": 0.04233479611575604, "epoch": 8.251777596456463, "grad_norm": 0.2470703125, "learning_rate": 4.742526599744865e-05, "loss": 0.0013, "mean_token_accuracy": 0.9995525121688843, "num_tokens": 112478937.0, "step": 35400 }, { "entropy": 0.07610894702374935, "epoch": 8.252943233477096, "grad_norm": 0.1650390625, "learning_rate": 4.74243380351221e-05, "loss": 0.0131, "mean_token_accuracy": 0.9966908633708954, "num_tokens": 112495918.0, "step": 35405 }, { "entropy": 0.08622381035238505, "epoch": 8.254108870497728, "grad_norm": 0.84375, "learning_rate": 4.742340992480875e-05, "loss": 0.0016, "mean_token_accuracy": 0.9995142340660095, "num_tokens": 112514007.0, "step": 35410 }, { "entropy": 0.06141715543344617, "epoch": 8.255274507518358, "grad_norm": 0.58984375, "learning_rate": 4.7422481666522423e-05, "loss": 0.0025, "mean_token_accuracy": 0.9987404525279999, "num_tokens": 112537499.0, "step": 35415 }, { "entropy": 0.08039052113890648, "epoch": 8.25644014453899, "grad_norm": 1.4921875, "learning_rate": 4.7421553260276973e-05, "loss": 0.0056, "mean_token_accuracy": 0.9989081561565399, "num_tokens": 112546351.0, "step": 35420 }, { "entropy": 0.056267809588462114, "epoch": 8.257605781559622, "grad_norm": 2.890625, "learning_rate": 4.742062470608625e-05, "loss": 0.0025, "mean_token_accuracy": 0.9991233944892883, "num_tokens": 112575017.0, "step": 35425 }, { "entropy": 0.05321586560457945, "epoch": 8.258771418580254, "grad_norm": 0.85546875, "learning_rate": 4.741969600396408e-05, "loss": 0.0066, "mean_token_accuracy": 0.9979334831237793, "num_tokens": 112604847.0, "step": 35430 }, { "entropy": 0.04675264293327928, "epoch": 8.259937055600886, "grad_norm": 0.20703125, "learning_rate": 4.741876715392433e-05, "loss": 0.0012, "mean_token_accuracy": 0.9998992204666137, "num_tokens": 112626161.0, "step": 35435 }, { "entropy": 0.08684096895158291, "epoch": 8.261102692621517, "grad_norm": 2.0, "learning_rate": 4.7417838155980835e-05, "loss": 0.0037, "mean_token_accuracy": 0.9993700981140137, "num_tokens": 112635620.0, "step": 35440 }, { "entropy": 0.06560825593769551, "epoch": 8.262268329642149, "grad_norm": 1.203125, "learning_rate": 4.7416909010147456e-05, "loss": 0.003, "mean_token_accuracy": 0.9992285370826721, "num_tokens": 112661918.0, "step": 35445 }, { "entropy": 0.058400828018784524, "epoch": 8.263433966662781, "grad_norm": 0.51953125, "learning_rate": 4.7415979716438055e-05, "loss": 0.0088, "mean_token_accuracy": 0.9980306982994079, "num_tokens": 112681038.0, "step": 35450 }, { "entropy": 0.09542725849896669, "epoch": 8.264599603683413, "grad_norm": 3.40625, "learning_rate": 4.7415050274866483e-05, "loss": 0.0969, "mean_token_accuracy": 0.9781103134155273, "num_tokens": 112701804.0, "step": 35455 }, { "entropy": 0.05795524828135967, "epoch": 8.265765240704045, "grad_norm": 1.53125, "learning_rate": 4.741412068544659e-05, "loss": 0.005, "mean_token_accuracy": 0.9989598095417023, "num_tokens": 112719943.0, "step": 35460 }, { "entropy": 0.05817284304648638, "epoch": 8.266930877724677, "grad_norm": 0.23046875, "learning_rate": 4.741319094819226e-05, "loss": 0.0075, "mean_token_accuracy": 0.9974438846111298, "num_tokens": 112735325.0, "step": 35465 }, { "entropy": 0.06478349603712559, "epoch": 8.268096514745308, "grad_norm": 0.03857421875, "learning_rate": 4.741226106311733e-05, "loss": 0.0028, "mean_token_accuracy": 0.9993962347507477, "num_tokens": 112751179.0, "step": 35470 }, { "entropy": 0.04304729863069952, "epoch": 8.26926215176594, "grad_norm": 0.302734375, "learning_rate": 4.7411331030235684e-05, "loss": 0.0035, "mean_token_accuracy": 0.9986756980419159, "num_tokens": 112785380.0, "step": 35475 }, { "entropy": 0.039424076396971944, "epoch": 8.270427788786572, "grad_norm": 0.23828125, "learning_rate": 4.741040084956118e-05, "loss": 0.0016, "mean_token_accuracy": 0.9992639780044555, "num_tokens": 112809960.0, "step": 35480 }, { "entropy": 0.09679261557757854, "epoch": 8.271593425807204, "grad_norm": 0.333984375, "learning_rate": 4.74094705211077e-05, "loss": 0.0673, "mean_token_accuracy": 0.9878584563732147, "num_tokens": 112840958.0, "step": 35485 }, { "entropy": 0.059682253934443, "epoch": 8.272759062827836, "grad_norm": 0.33984375, "learning_rate": 4.74085400448891e-05, "loss": 0.0021, "mean_token_accuracy": 0.9995380342006683, "num_tokens": 112866924.0, "step": 35490 }, { "entropy": 0.05326059451326728, "epoch": 8.273924699848466, "grad_norm": 0.84765625, "learning_rate": 4.7407609420919275e-05, "loss": 0.004, "mean_token_accuracy": 0.9986121416091919, "num_tokens": 112890112.0, "step": 35495 }, { "entropy": 0.07074670540168881, "epoch": 8.275090336869098, "grad_norm": 0.201171875, "learning_rate": 4.740667864921209e-05, "loss": 0.002, "mean_token_accuracy": 0.9993785321712494, "num_tokens": 112917769.0, "step": 35500 }, { "entropy": 0.05106313647702336, "epoch": 8.27625597388973, "grad_norm": 0.765625, "learning_rate": 4.7405747729781416e-05, "loss": 0.0048, "mean_token_accuracy": 0.9989130139350891, "num_tokens": 112934554.0, "step": 35505 }, { "entropy": 0.0686172442510724, "epoch": 8.277421610910363, "grad_norm": 0.55859375, "learning_rate": 4.740481666264115e-05, "loss": 0.0051, "mean_token_accuracy": 0.9985272109508514, "num_tokens": 112956525.0, "step": 35510 }, { "entropy": 0.06048934049904346, "epoch": 8.278587247930995, "grad_norm": 0.283203125, "learning_rate": 4.740388544780517e-05, "loss": 0.0016, "mean_token_accuracy": 0.9995243310928345, "num_tokens": 112979928.0, "step": 35515 }, { "entropy": 0.09104947121813893, "epoch": 8.279752884951627, "grad_norm": 0.58203125, "learning_rate": 4.740295408528737e-05, "loss": 0.0045, "mean_token_accuracy": 0.9983532786369324, "num_tokens": 112999457.0, "step": 35520 }, { "entropy": 0.04913658211007714, "epoch": 8.280918521972257, "grad_norm": 0.279296875, "learning_rate": 4.740202257510162e-05, "loss": 0.0011, "mean_token_accuracy": 0.9999531686306, "num_tokens": 113026574.0, "step": 35525 }, { "entropy": 0.06502987816929817, "epoch": 8.28208415899289, "grad_norm": 0.2265625, "learning_rate": 4.7401090917261826e-05, "loss": 0.0023, "mean_token_accuracy": 0.9996796190738678, "num_tokens": 113046634.0, "step": 35530 }, { "entropy": 0.05888591511175036, "epoch": 8.283249796013521, "grad_norm": 0.4765625, "learning_rate": 4.740015911178187e-05, "loss": 0.004, "mean_token_accuracy": 0.9976631879806519, "num_tokens": 113066170.0, "step": 35535 }, { "entropy": 0.08687619585543871, "epoch": 8.284415433034154, "grad_norm": 0.1650390625, "learning_rate": 4.739922715867565e-05, "loss": 0.0061, "mean_token_accuracy": 0.9987577140331269, "num_tokens": 113076695.0, "step": 35540 }, { "entropy": 0.06502792998217047, "epoch": 8.285581070054786, "grad_norm": 0.265625, "learning_rate": 4.739829505795707e-05, "loss": 0.0065, "mean_token_accuracy": 0.9979210734367371, "num_tokens": 113112146.0, "step": 35545 }, { "entropy": 0.060326622892171146, "epoch": 8.286746707075416, "grad_norm": 1.921875, "learning_rate": 4.739736280964002e-05, "loss": 0.003, "mean_token_accuracy": 0.9994000792503357, "num_tokens": 113137726.0, "step": 35550 }, { "entropy": 0.0668079487979412, "epoch": 8.287912344096048, "grad_norm": 0.859375, "learning_rate": 4.7396430413738394e-05, "loss": 0.0023, "mean_token_accuracy": 0.9990176737308503, "num_tokens": 113163502.0, "step": 35555 }, { "entropy": 0.051899380423128604, "epoch": 8.28907798111668, "grad_norm": 0.26171875, "learning_rate": 4.7395497870266115e-05, "loss": 0.0036, "mean_token_accuracy": 0.9989886045455932, "num_tokens": 113182112.0, "step": 35560 }, { "entropy": 0.0725651178508997, "epoch": 8.290243618137312, "grad_norm": 0.166015625, "learning_rate": 4.7394565179237084e-05, "loss": 0.0037, "mean_token_accuracy": 0.9993403315544128, "num_tokens": 113203507.0, "step": 35565 }, { "entropy": 0.04745776057243347, "epoch": 8.291409255157944, "grad_norm": 0.1953125, "learning_rate": 4.739363234066519e-05, "loss": 0.0018, "mean_token_accuracy": 0.9995150685310363, "num_tokens": 113221559.0, "step": 35570 }, { "entropy": 0.051142162969335915, "epoch": 8.292574892178575, "grad_norm": 0.15625, "learning_rate": 4.739269935456437e-05, "loss": 0.0014, "mean_token_accuracy": 0.9998042106628418, "num_tokens": 113252839.0, "step": 35575 }, { "entropy": 0.06874603973701596, "epoch": 8.293740529199207, "grad_norm": 0.314453125, "learning_rate": 4.739176622094852e-05, "loss": 0.003, "mean_token_accuracy": 0.9991725265979767, "num_tokens": 113265678.0, "step": 35580 }, { "entropy": 0.103728087246418, "epoch": 8.294906166219839, "grad_norm": 0.189453125, "learning_rate": 4.7390832939831554e-05, "loss": 0.0906, "mean_token_accuracy": 0.9850226819515229, "num_tokens": 113286175.0, "step": 35585 }, { "entropy": 0.05010626185685396, "epoch": 8.296071803240471, "grad_norm": 0.2431640625, "learning_rate": 4.7389899511227395e-05, "loss": 0.0062, "mean_token_accuracy": 0.9987168490886689, "num_tokens": 113303583.0, "step": 35590 }, { "entropy": 0.07682762825861573, "epoch": 8.297237440261103, "grad_norm": 0.671875, "learning_rate": 4.7388965935149955e-05, "loss": 0.003, "mean_token_accuracy": 0.9988443732261658, "num_tokens": 113328321.0, "step": 35595 }, { "entropy": 0.06315315756946802, "epoch": 8.298403077281735, "grad_norm": 0.578125, "learning_rate": 4.7388032211613166e-05, "loss": 0.0083, "mean_token_accuracy": 0.9949438393115997, "num_tokens": 113340699.0, "step": 35600 }, { "entropy": 0.08139186827465891, "epoch": 8.299568714302366, "grad_norm": 0.953125, "learning_rate": 4.738709834063094e-05, "loss": 0.0085, "mean_token_accuracy": 0.9987073481082916, "num_tokens": 113361554.0, "step": 35605 }, { "entropy": 0.06355111561715603, "epoch": 8.300734351322998, "grad_norm": 0.10595703125, "learning_rate": 4.738616432221721e-05, "loss": 0.0022, "mean_token_accuracy": 0.9995387196540833, "num_tokens": 113372946.0, "step": 35610 }, { "entropy": 0.054753214679658414, "epoch": 8.30189998834363, "grad_norm": 0.173828125, "learning_rate": 4.7385230156385894e-05, "loss": 0.0041, "mean_token_accuracy": 0.9989207983016968, "num_tokens": 113411917.0, "step": 35615 }, { "entropy": 0.08477168828248978, "epoch": 8.303065625364262, "grad_norm": 1.390625, "learning_rate": 4.738429584315093e-05, "loss": 0.0032, "mean_token_accuracy": 0.9975354373455048, "num_tokens": 113421543.0, "step": 35620 }, { "entropy": 0.03909264667890966, "epoch": 8.304231262384894, "grad_norm": 0.3515625, "learning_rate": 4.738336138252625e-05, "loss": 0.0016, "mean_token_accuracy": 0.9994762778282166, "num_tokens": 113453973.0, "step": 35625 }, { "entropy": 0.048014458548277614, "epoch": 8.305396899405524, "grad_norm": 0.890625, "learning_rate": 4.738242677452578e-05, "loss": 0.0039, "mean_token_accuracy": 0.9986834228038788, "num_tokens": 113473666.0, "step": 35630 }, { "entropy": 0.07936519216746092, "epoch": 8.306562536426156, "grad_norm": 0.9765625, "learning_rate": 4.7381492019163475e-05, "loss": 0.0025, "mean_token_accuracy": 0.9991873800754547, "num_tokens": 113488056.0, "step": 35635 }, { "entropy": 0.05001498758792877, "epoch": 8.307728173446788, "grad_norm": 0.92578125, "learning_rate": 4.7380557116453255e-05, "loss": 0.0013, "mean_token_accuracy": 0.9998580873012543, "num_tokens": 113525080.0, "step": 35640 }, { "entropy": 0.07388973170891404, "epoch": 8.30889381046742, "grad_norm": 1.0234375, "learning_rate": 4.737962206640907e-05, "loss": 0.0022, "mean_token_accuracy": 0.9986834764480591, "num_tokens": 113538373.0, "step": 35645 }, { "entropy": 0.05318211195990443, "epoch": 8.310059447488053, "grad_norm": 0.353515625, "learning_rate": 4.737868686904485e-05, "loss": 0.0043, "mean_token_accuracy": 0.9992851078510284, "num_tokens": 113563745.0, "step": 35650 }, { "entropy": 0.04736358020454645, "epoch": 8.311225084508685, "grad_norm": 1.2734375, "learning_rate": 4.737775152437456e-05, "loss": 0.0037, "mean_token_accuracy": 0.9990275919437408, "num_tokens": 113594405.0, "step": 35655 }, { "entropy": 0.055001238361001015, "epoch": 8.312390721529315, "grad_norm": 1.4921875, "learning_rate": 4.737681603241214e-05, "loss": 0.0051, "mean_token_accuracy": 0.9982485055923462, "num_tokens": 113611583.0, "step": 35660 }, { "entropy": 0.059361847769469026, "epoch": 8.313556358549947, "grad_norm": 0.322265625, "learning_rate": 4.737588039317153e-05, "loss": 0.0034, "mean_token_accuracy": 0.9991229712963104, "num_tokens": 113638985.0, "step": 35665 }, { "entropy": 0.06605250053107739, "epoch": 8.31472199557058, "grad_norm": 0.38671875, "learning_rate": 4.7374944606666694e-05, "loss": 0.0037, "mean_token_accuracy": 0.9987422049045562, "num_tokens": 113649240.0, "step": 35670 }, { "entropy": 0.08314755260944366, "epoch": 8.315887632591211, "grad_norm": 1.8515625, "learning_rate": 4.737400867291158e-05, "loss": 0.0051, "mean_token_accuracy": 0.9990105986595154, "num_tokens": 113657025.0, "step": 35675 }, { "entropy": 0.051615030877292155, "epoch": 8.317053269611844, "grad_norm": 0.734375, "learning_rate": 4.737307259192014e-05, "loss": 0.0043, "mean_token_accuracy": 0.9991748809814454, "num_tokens": 113680681.0, "step": 35680 }, { "entropy": 0.06697340840473771, "epoch": 8.318218906632474, "grad_norm": 0.31640625, "learning_rate": 4.737213636370635e-05, "loss": 0.0019, "mean_token_accuracy": 0.9999058306217193, "num_tokens": 113696503.0, "step": 35685 }, { "entropy": 0.07072263630107045, "epoch": 8.319384543653106, "grad_norm": 0.1982421875, "learning_rate": 4.737119998828415e-05, "loss": 0.0016, "mean_token_accuracy": 0.9999394178390503, "num_tokens": 113711994.0, "step": 35690 }, { "entropy": 0.059878239221870896, "epoch": 8.320550180673738, "grad_norm": 2.4375, "learning_rate": 4.737026346566751e-05, "loss": 0.0153, "mean_token_accuracy": 0.9969691872596741, "num_tokens": 113730409.0, "step": 35695 }, { "entropy": 0.06803951859474182, "epoch": 8.32171581769437, "grad_norm": 1.4375, "learning_rate": 4.7369326795870394e-05, "loss": 0.0019, "mean_token_accuracy": 1.0, "num_tokens": 113739693.0, "step": 35700 }, { "entropy": 0.056852425914257765, "epoch": 8.322881454715002, "grad_norm": 0.2490234375, "learning_rate": 4.736838997890678e-05, "loss": 0.0013, "mean_token_accuracy": 0.9997595429420472, "num_tokens": 113772385.0, "step": 35705 }, { "entropy": 0.06514825280755758, "epoch": 8.324047091735633, "grad_norm": 0.494140625, "learning_rate": 4.7367453014790627e-05, "loss": 0.0027, "mean_token_accuracy": 0.9989059031009674, "num_tokens": 113785680.0, "step": 35710 }, { "entropy": 0.07101746341213584, "epoch": 8.325212728756265, "grad_norm": 1.3984375, "learning_rate": 4.73665159035359e-05, "loss": 0.0062, "mean_token_accuracy": 0.9984484255313874, "num_tokens": 113802287.0, "step": 35715 }, { "entropy": 0.06135486271232367, "epoch": 8.326378365776897, "grad_norm": 0.52734375, "learning_rate": 4.736557864515658e-05, "loss": 0.0069, "mean_token_accuracy": 0.9988792181015015, "num_tokens": 113826041.0, "step": 35720 }, { "entropy": 0.04972120532765985, "epoch": 8.327544002797529, "grad_norm": 0.265625, "learning_rate": 4.7364641239666654e-05, "loss": 0.0046, "mean_token_accuracy": 0.9995426177978516, "num_tokens": 113865421.0, "step": 35725 }, { "entropy": 0.07017171997576951, "epoch": 8.328709639818161, "grad_norm": 0.216796875, "learning_rate": 4.736370368708008e-05, "loss": 0.0025, "mean_token_accuracy": 0.999553245306015, "num_tokens": 113885947.0, "step": 35730 }, { "entropy": 0.058057605568319556, "epoch": 8.329875276838793, "grad_norm": 0.2119140625, "learning_rate": 4.736276598741086e-05, "loss": 0.0029, "mean_token_accuracy": 0.9994979918003082, "num_tokens": 113913705.0, "step": 35735 }, { "entropy": 0.09026005379855633, "epoch": 8.331040913859423, "grad_norm": 2.78125, "learning_rate": 4.7361828140672956e-05, "loss": 0.0041, "mean_token_accuracy": 0.9987558662891388, "num_tokens": 113924075.0, "step": 35740 }, { "entropy": 0.08890927508473397, "epoch": 8.332206550880056, "grad_norm": 0.314453125, "learning_rate": 4.736089014688037e-05, "loss": 0.0074, "mean_token_accuracy": 0.9987519264221192, "num_tokens": 113944931.0, "step": 35745 }, { "entropy": 0.05832267887890339, "epoch": 8.333372187900688, "grad_norm": 0.1806640625, "learning_rate": 4.735995200604707e-05, "loss": 0.0075, "mean_token_accuracy": 0.9988953292369842, "num_tokens": 113968947.0, "step": 35750 }, { "entropy": 0.05671592140570283, "epoch": 8.33453782492132, "grad_norm": 0.2890625, "learning_rate": 4.735901371818706e-05, "loss": 0.0031, "mean_token_accuracy": 0.998402863740921, "num_tokens": 113996374.0, "step": 35755 }, { "entropy": 0.05853022150695324, "epoch": 8.335703461941952, "grad_norm": 0.2197265625, "learning_rate": 4.735807528331432e-05, "loss": 0.0035, "mean_token_accuracy": 0.998811411857605, "num_tokens": 114010996.0, "step": 35760 }, { "entropy": 0.06083117621019483, "epoch": 8.336869098962582, "grad_norm": 0.185546875, "learning_rate": 4.7357136701442864e-05, "loss": 0.0036, "mean_token_accuracy": 0.9989865601062775, "num_tokens": 114050055.0, "step": 35765 }, { "entropy": 0.053792219050228594, "epoch": 8.338034735983214, "grad_norm": 0.10400390625, "learning_rate": 4.735619797258666e-05, "loss": 0.0042, "mean_token_accuracy": 0.9987802326679229, "num_tokens": 114079476.0, "step": 35770 }, { "entropy": 0.07619495354592801, "epoch": 8.339200373003846, "grad_norm": 1.421875, "learning_rate": 4.735525909675972e-05, "loss": 0.0061, "mean_token_accuracy": 0.9981731057167054, "num_tokens": 114089238.0, "step": 35775 }, { "entropy": 0.06971075274050235, "epoch": 8.340366010024479, "grad_norm": 0.78125, "learning_rate": 4.735432007397605e-05, "loss": 0.0029, "mean_token_accuracy": 0.9989834904670716, "num_tokens": 114109164.0, "step": 35780 }, { "entropy": 0.046845695050433275, "epoch": 8.34153164704511, "grad_norm": 0.39453125, "learning_rate": 4.735338090424965e-05, "loss": 0.0026, "mean_token_accuracy": 0.9987184941768646, "num_tokens": 114152383.0, "step": 35785 }, { "entropy": 0.06434694388881326, "epoch": 8.342697284065743, "grad_norm": 0.328125, "learning_rate": 4.735244158759452e-05, "loss": 0.0026, "mean_token_accuracy": 0.9994235217571259, "num_tokens": 114170104.0, "step": 35790 }, { "entropy": 0.0462822025641799, "epoch": 8.343862921086373, "grad_norm": 0.267578125, "learning_rate": 4.735150212402466e-05, "loss": 0.001, "mean_token_accuracy": 0.9997472882270813, "num_tokens": 114195725.0, "step": 35795 }, { "entropy": 0.06865362599492073, "epoch": 8.345028558107005, "grad_norm": 0.30859375, "learning_rate": 4.735056251355409e-05, "loss": 0.0081, "mean_token_accuracy": 0.996992152929306, "num_tokens": 114217589.0, "step": 35800 }, { "entropy": 0.0582847012206912, "epoch": 8.346194195127637, "grad_norm": 0.53515625, "learning_rate": 4.734962275619681e-05, "loss": 0.0065, "mean_token_accuracy": 0.9987275958061218, "num_tokens": 114232250.0, "step": 35805 }, { "entropy": 0.05756397508084774, "epoch": 8.34735983214827, "grad_norm": 0.1748046875, "learning_rate": 4.734868285196685e-05, "loss": 0.0014, "mean_token_accuracy": 0.9999210953712463, "num_tokens": 114250794.0, "step": 35810 }, { "entropy": 0.06038464680314064, "epoch": 8.348525469168901, "grad_norm": 0.74609375, "learning_rate": 4.7347742800878206e-05, "loss": 0.0078, "mean_token_accuracy": 0.9986655592918396, "num_tokens": 114265626.0, "step": 35815 }, { "entropy": 0.04220847636461258, "epoch": 8.349691106189532, "grad_norm": 2.3125, "learning_rate": 4.734680260294491e-05, "loss": 0.0064, "mean_token_accuracy": 0.9986125826835632, "num_tokens": 114297113.0, "step": 35820 }, { "entropy": 0.05641756169497967, "epoch": 8.350856743210164, "grad_norm": 1.4609375, "learning_rate": 4.734586225818098e-05, "loss": 0.0029, "mean_token_accuracy": 0.9988239586353302, "num_tokens": 114312306.0, "step": 35825 }, { "entropy": 0.05419354699552059, "epoch": 8.352022380230796, "grad_norm": 1.53125, "learning_rate": 4.7344921766600425e-05, "loss": 0.0054, "mean_token_accuracy": 0.9972829222679138, "num_tokens": 114338143.0, "step": 35830 }, { "entropy": 0.061890093795955184, "epoch": 8.353188017251428, "grad_norm": 0.1689453125, "learning_rate": 4.734398112821728e-05, "loss": 0.0047, "mean_token_accuracy": 0.9984643816947937, "num_tokens": 114360916.0, "step": 35835 }, { "entropy": 0.0586331375874579, "epoch": 8.35435365427206, "grad_norm": 1.5546875, "learning_rate": 4.7343040343045586e-05, "loss": 0.0046, "mean_token_accuracy": 0.9992785274982452, "num_tokens": 114374324.0, "step": 35840 }, { "entropy": 0.09843996288254857, "epoch": 8.35551929129269, "grad_norm": 0.30078125, "learning_rate": 4.7342099411099336e-05, "loss": 0.0065, "mean_token_accuracy": 0.99708451628685, "num_tokens": 114392560.0, "step": 35845 }, { "entropy": 0.0483152624219656, "epoch": 8.356684928313323, "grad_norm": 0.396484375, "learning_rate": 4.734115833239259e-05, "loss": 0.0093, "mean_token_accuracy": 0.9988516390323638, "num_tokens": 114415426.0, "step": 35850 }, { "entropy": 0.061218463350087404, "epoch": 8.357850565333955, "grad_norm": 0.341796875, "learning_rate": 4.734021710693938e-05, "loss": 0.0015, "mean_token_accuracy": 0.9996902167797088, "num_tokens": 114435183.0, "step": 35855 }, { "entropy": 0.06922294609248639, "epoch": 8.359016202354587, "grad_norm": 2.484375, "learning_rate": 4.7339275734753717e-05, "loss": 0.0038, "mean_token_accuracy": 0.9989567220211029, "num_tokens": 114469970.0, "step": 35860 }, { "entropy": 0.052051532082259655, "epoch": 8.360181839375219, "grad_norm": 0.1611328125, "learning_rate": 4.7338334215849664e-05, "loss": 0.0026, "mean_token_accuracy": 0.9997234284877777, "num_tokens": 114490221.0, "step": 35865 }, { "entropy": 0.0687698122113943, "epoch": 8.361347476395851, "grad_norm": 0.6953125, "learning_rate": 4.7337392550241246e-05, "loss": 0.0069, "mean_token_accuracy": 0.999000883102417, "num_tokens": 114509080.0, "step": 35870 }, { "entropy": 0.05277932183817029, "epoch": 8.362513113416481, "grad_norm": 0.09326171875, "learning_rate": 4.73364507379425e-05, "loss": 0.0023, "mean_token_accuracy": 0.9998419165611268, "num_tokens": 114527650.0, "step": 35875 }, { "entropy": 0.059004738088697196, "epoch": 8.363678750437114, "grad_norm": 1.171875, "learning_rate": 4.73355087789675e-05, "loss": 0.0036, "mean_token_accuracy": 0.9990606307983398, "num_tokens": 114549999.0, "step": 35880 }, { "entropy": 0.05168069005012512, "epoch": 8.364844387457746, "grad_norm": 0.47265625, "learning_rate": 4.733456667333025e-05, "loss": 0.0019, "mean_token_accuracy": 0.9995020270347595, "num_tokens": 114571806.0, "step": 35885 }, { "entropy": 0.053898480255156755, "epoch": 8.366010024478378, "grad_norm": 0.859375, "learning_rate": 4.7333624421044834e-05, "loss": 0.0036, "mean_token_accuracy": 0.9992337822914124, "num_tokens": 114591540.0, "step": 35890 }, { "entropy": 0.06580785913392902, "epoch": 8.36717566149901, "grad_norm": 0.447265625, "learning_rate": 4.733268202212527e-05, "loss": 0.0044, "mean_token_accuracy": 0.9986721754074097, "num_tokens": 114608012.0, "step": 35895 }, { "entropy": 0.07490675896406174, "epoch": 8.36834129851964, "grad_norm": 2.40625, "learning_rate": 4.733173947658564e-05, "loss": 0.0063, "mean_token_accuracy": 0.9986124873161316, "num_tokens": 114628929.0, "step": 35900 }, { "entropy": 0.0590736048296094, "epoch": 8.369506935540272, "grad_norm": 0.1337890625, "learning_rate": 4.733079678443999e-05, "loss": 0.0017, "mean_token_accuracy": 0.9995599031448364, "num_tokens": 114641901.0, "step": 35905 }, { "entropy": 0.08702043211087584, "epoch": 8.370672572560904, "grad_norm": 0.2080078125, "learning_rate": 4.7329853945702366e-05, "loss": 0.0047, "mean_token_accuracy": 0.9992570400238037, "num_tokens": 114662085.0, "step": 35910 }, { "entropy": 0.06151698585599661, "epoch": 8.371838209581536, "grad_norm": 3.65625, "learning_rate": 4.7328910960386834e-05, "loss": 0.0083, "mean_token_accuracy": 0.9983277082443237, "num_tokens": 114675531.0, "step": 35915 }, { "entropy": 0.045510869659483436, "epoch": 8.373003846602169, "grad_norm": 0.52734375, "learning_rate": 4.732796782850746e-05, "loss": 0.0032, "mean_token_accuracy": 0.9992880523204803, "num_tokens": 114698590.0, "step": 35920 }, { "entropy": 0.06477670334279537, "epoch": 8.3741694836228, "grad_norm": 3.53125, "learning_rate": 4.73270245500783e-05, "loss": 0.0043, "mean_token_accuracy": 0.9987984895706177, "num_tokens": 114710536.0, "step": 35925 }, { "entropy": 0.0671735213138163, "epoch": 8.375335120643431, "grad_norm": 0.5625, "learning_rate": 4.732608112511343e-05, "loss": 0.0044, "mean_token_accuracy": 0.9994207143783569, "num_tokens": 114738586.0, "step": 35930 }, { "entropy": 0.07747904891148209, "epoch": 8.376500757664063, "grad_norm": 0.1796875, "learning_rate": 4.732513755362691e-05, "loss": 0.0041, "mean_token_accuracy": 0.9974416434764862, "num_tokens": 114758164.0, "step": 35935 }, { "entropy": 0.041080075595527885, "epoch": 8.377666394684695, "grad_norm": 0.396484375, "learning_rate": 4.73241938356328e-05, "loss": 0.0022, "mean_token_accuracy": 0.9994053184986115, "num_tokens": 114787467.0, "step": 35940 }, { "entropy": 0.0725083589553833, "epoch": 8.378832031705327, "grad_norm": 4.25, "learning_rate": 4.73232499711452e-05, "loss": 0.0147, "mean_token_accuracy": 0.997485488653183, "num_tokens": 114798080.0, "step": 35945 }, { "entropy": 0.04100953293964267, "epoch": 8.37999766872596, "grad_norm": 0.2314453125, "learning_rate": 4.732230596017816e-05, "loss": 0.0096, "mean_token_accuracy": 0.9985480844974518, "num_tokens": 114834393.0, "step": 35950 }, { "entropy": 0.06174456924200058, "epoch": 8.38116330574659, "grad_norm": 0.11962890625, "learning_rate": 4.732136180274576e-05, "loss": 0.0082, "mean_token_accuracy": 0.9990587592124939, "num_tokens": 114847723.0, "step": 35955 }, { "entropy": 0.072079146374017, "epoch": 8.382328942767222, "grad_norm": 1.375, "learning_rate": 4.732041749886209e-05, "loss": 0.0065, "mean_token_accuracy": 0.9978023052215577, "num_tokens": 114861770.0, "step": 35960 }, { "entropy": 0.05037323208525777, "epoch": 8.383494579787854, "grad_norm": 0.361328125, "learning_rate": 4.731947304854122e-05, "loss": 0.0019, "mean_token_accuracy": 0.9989248156547547, "num_tokens": 114887553.0, "step": 35965 }, { "entropy": 0.07071936894208193, "epoch": 8.384660216808486, "grad_norm": 1.9765625, "learning_rate": 4.731852845179724e-05, "loss": 0.0033, "mean_token_accuracy": 0.9990245401859283, "num_tokens": 114910384.0, "step": 35970 }, { "entropy": 0.04616665868088603, "epoch": 8.385825853829118, "grad_norm": 0.1845703125, "learning_rate": 4.731758370864423e-05, "loss": 0.0013, "mean_token_accuracy": 0.9997251272201538, "num_tokens": 114954754.0, "step": 35975 }, { "entropy": 0.07068393416702748, "epoch": 8.386991490849748, "grad_norm": 0.09814453125, "learning_rate": 4.731663881909628e-05, "loss": 0.0017, "mean_token_accuracy": 0.9998482525348663, "num_tokens": 114964733.0, "step": 35980 }, { "entropy": 0.0473877920769155, "epoch": 8.38815712787038, "grad_norm": 0.228515625, "learning_rate": 4.731569378316749e-05, "loss": 0.0018, "mean_token_accuracy": 0.999399745464325, "num_tokens": 114980569.0, "step": 35985 }, { "entropy": 0.051515743136405945, "epoch": 8.389322764891013, "grad_norm": 0.23046875, "learning_rate": 4.731474860087193e-05, "loss": 0.0021, "mean_token_accuracy": 0.9992481112480164, "num_tokens": 115022823.0, "step": 35990 }, { "entropy": 0.04221371039748192, "epoch": 8.390488401911645, "grad_norm": 0.154296875, "learning_rate": 4.731380327222371e-05, "loss": 0.0036, "mean_token_accuracy": 0.9989139676094055, "num_tokens": 115044356.0, "step": 35995 }, { "entropy": 0.061717442143708466, "epoch": 8.391654038932277, "grad_norm": 1.1875, "learning_rate": 4.7312857797236925e-05, "loss": 0.0048, "mean_token_accuracy": 0.9987213313579559, "num_tokens": 115065703.0, "step": 36000 }, { "entropy": 0.07057226775214076, "epoch": 8.392819675952909, "grad_norm": 2.0625, "learning_rate": 4.731191217592567e-05, "loss": 0.0037, "mean_token_accuracy": 0.9992031335830689, "num_tokens": 115092467.0, "step": 36005 }, { "entropy": 0.08439392279833555, "epoch": 8.39398531297354, "grad_norm": 0.265625, "learning_rate": 4.731096640830405e-05, "loss": 0.0336, "mean_token_accuracy": 0.9949727892875672, "num_tokens": 115130036.0, "step": 36010 }, { "entropy": 0.04997500581666827, "epoch": 8.395150949994171, "grad_norm": 1.109375, "learning_rate": 4.7310020494386156e-05, "loss": 0.0026, "mean_token_accuracy": 0.9988471925258636, "num_tokens": 115150815.0, "step": 36015 }, { "entropy": 0.07026242911815643, "epoch": 8.396316587014804, "grad_norm": 0.1328125, "learning_rate": 4.730907443418611e-05, "loss": 0.0018, "mean_token_accuracy": 0.9996152281761169, "num_tokens": 115160160.0, "step": 36020 }, { "entropy": 0.05926717910915613, "epoch": 8.397482224035436, "grad_norm": 0.3203125, "learning_rate": 4.730812822771801e-05, "loss": 0.0033, "mean_token_accuracy": 0.9973244667053223, "num_tokens": 115177388.0, "step": 36025 }, { "entropy": 0.06058362293988466, "epoch": 8.398647861056068, "grad_norm": 0.1904296875, "learning_rate": 4.730718187499595e-05, "loss": 0.0032, "mean_token_accuracy": 0.9996544599533081, "num_tokens": 115193357.0, "step": 36030 }, { "entropy": 0.08812159774824976, "epoch": 8.399813498076698, "grad_norm": 2.484375, "learning_rate": 4.730623537603408e-05, "loss": 0.0112, "mean_token_accuracy": 0.9962131261825562, "num_tokens": 115209695.0, "step": 36035 }, { "entropy": 0.0533585537225008, "epoch": 8.40097913509733, "grad_norm": 0.84375, "learning_rate": 4.730528873084648e-05, "loss": 0.0082, "mean_token_accuracy": 0.9977275252342224, "num_tokens": 115233579.0, "step": 36040 }, { "entropy": 0.04185967352241278, "epoch": 8.402144772117962, "grad_norm": 0.6953125, "learning_rate": 4.730434193944727e-05, "loss": 0.003, "mean_token_accuracy": 0.9987737834453583, "num_tokens": 115254772.0, "step": 36045 }, { "entropy": 0.06632612012326718, "epoch": 8.403310409138594, "grad_norm": 0.84375, "learning_rate": 4.730339500185059e-05, "loss": 0.0046, "mean_token_accuracy": 0.9984843134880066, "num_tokens": 115266893.0, "step": 36050 }, { "entropy": 0.0712279126048088, "epoch": 8.404476046159226, "grad_norm": 3.84375, "learning_rate": 4.7302447918070536e-05, "loss": 0.0027, "mean_token_accuracy": 0.9988064527511596, "num_tokens": 115292056.0, "step": 36055 }, { "entropy": 0.05562030803412199, "epoch": 8.405641683179859, "grad_norm": 0.5625, "learning_rate": 4.730150068812124e-05, "loss": 0.0047, "mean_token_accuracy": 0.9988888800144196, "num_tokens": 115312811.0, "step": 36060 }, { "entropy": 0.06178484875708819, "epoch": 8.406807320200489, "grad_norm": 0.076171875, "learning_rate": 4.730055331201683e-05, "loss": 0.0024, "mean_token_accuracy": 0.9996735990047455, "num_tokens": 115328524.0, "step": 36065 }, { "entropy": 0.04668997749686241, "epoch": 8.407972957221121, "grad_norm": 0.5234375, "learning_rate": 4.729960578977143e-05, "loss": 0.003, "mean_token_accuracy": 0.9984108448028565, "num_tokens": 115349273.0, "step": 36070 }, { "entropy": 0.06319479513913392, "epoch": 8.409138594241753, "grad_norm": 0.369140625, "learning_rate": 4.729865812139916e-05, "loss": 0.0012, "mean_token_accuracy": 0.9999687492847442, "num_tokens": 115369544.0, "step": 36075 }, { "entropy": 0.05766276512295008, "epoch": 8.410304231262385, "grad_norm": 0.10595703125, "learning_rate": 4.729771030691417e-05, "loss": 0.0036, "mean_token_accuracy": 0.9991033136844635, "num_tokens": 115383256.0, "step": 36080 }, { "entropy": 0.04213843066245317, "epoch": 8.411469868283017, "grad_norm": 0.91015625, "learning_rate": 4.7296762346330576e-05, "loss": 0.0038, "mean_token_accuracy": 0.998986440896988, "num_tokens": 115415891.0, "step": 36085 }, { "entropy": 0.07678450532257557, "epoch": 8.412635505303648, "grad_norm": 1.203125, "learning_rate": 4.7295814239662525e-05, "loss": 0.0023, "mean_token_accuracy": 0.9996045112609864, "num_tokens": 115426929.0, "step": 36090 }, { "entropy": 0.05705181276425719, "epoch": 8.41380114232428, "grad_norm": 0.84765625, "learning_rate": 4.729486598692414e-05, "loss": 0.0031, "mean_token_accuracy": 0.9984883546829224, "num_tokens": 115451739.0, "step": 36095 }, { "entropy": 0.05112549578770995, "epoch": 8.414966779344912, "grad_norm": 0.2265625, "learning_rate": 4.729391758812958e-05, "loss": 0.0017, "mean_token_accuracy": 0.9999086081981658, "num_tokens": 115480556.0, "step": 36100 }, { "entropy": 0.04785581501200795, "epoch": 8.416132416365544, "grad_norm": 0.48046875, "learning_rate": 4.729296904329298e-05, "loss": 0.0027, "mean_token_accuracy": 0.9994737505912781, "num_tokens": 115497081.0, "step": 36105 }, { "entropy": 0.06632670313119889, "epoch": 8.417298053386176, "grad_norm": 0.76953125, "learning_rate": 4.7292020352428477e-05, "loss": 0.002, "mean_token_accuracy": 0.9995335042476654, "num_tokens": 115506729.0, "step": 36110 }, { "entropy": 0.059855165053159, "epoch": 8.418463690406806, "grad_norm": 0.05908203125, "learning_rate": 4.729107151555022e-05, "loss": 0.008, "mean_token_accuracy": 0.9979570627212524, "num_tokens": 115520456.0, "step": 36115 }, { "entropy": 0.0672213020734489, "epoch": 8.419629327427439, "grad_norm": 0.2041015625, "learning_rate": 4.7290122532672366e-05, "loss": 0.0018, "mean_token_accuracy": 0.9995971262454987, "num_tokens": 115537752.0, "step": 36120 }, { "entropy": 0.05041234586387873, "epoch": 8.42079496444807, "grad_norm": 0.95703125, "learning_rate": 4.728917340380905e-05, "loss": 0.0018, "mean_token_accuracy": 0.9997573018074035, "num_tokens": 115563759.0, "step": 36125 }, { "entropy": 0.05824168622493744, "epoch": 8.421960601468703, "grad_norm": 0.125, "learning_rate": 4.7288224128974445e-05, "loss": 0.0039, "mean_token_accuracy": 0.998965859413147, "num_tokens": 115577018.0, "step": 36130 }, { "entropy": 0.06886055655777454, "epoch": 8.423126238489335, "grad_norm": 0.3046875, "learning_rate": 4.728727470818269e-05, "loss": 0.0021, "mean_token_accuracy": 0.9999520361423493, "num_tokens": 115588761.0, "step": 36135 }, { "entropy": 0.06945041492581368, "epoch": 8.424291875509967, "grad_norm": 0.173828125, "learning_rate": 4.728632514144796e-05, "loss": 0.0076, "mean_token_accuracy": 0.9981557667255402, "num_tokens": 115611607.0, "step": 36140 }, { "entropy": 0.07045771069824695, "epoch": 8.425457512530597, "grad_norm": 0.279296875, "learning_rate": 4.728537542878439e-05, "loss": 0.0045, "mean_token_accuracy": 0.9992258012294769, "num_tokens": 115628332.0, "step": 36145 }, { "entropy": 0.052793185226619244, "epoch": 8.42662314955123, "grad_norm": 0.26953125, "learning_rate": 4.728442557020616e-05, "loss": 0.0048, "mean_token_accuracy": 0.9986692845821381, "num_tokens": 115658797.0, "step": 36150 }, { "entropy": 0.05012588379904628, "epoch": 8.427788786571861, "grad_norm": 1.9609375, "learning_rate": 4.7283475565727424e-05, "loss": 0.0037, "mean_token_accuracy": 0.999320387840271, "num_tokens": 115687971.0, "step": 36155 }, { "entropy": 0.07577522285282612, "epoch": 8.428954423592494, "grad_norm": 0.52734375, "learning_rate": 4.7282525415362354e-05, "loss": 0.0028, "mean_token_accuracy": 0.9989284574985504, "num_tokens": 115701010.0, "step": 36160 }, { "entropy": 0.0740378656424582, "epoch": 8.430120060613126, "grad_norm": 1.859375, "learning_rate": 4.7281575119125124e-05, "loss": 0.0062, "mean_token_accuracy": 0.9979305446147919, "num_tokens": 115718645.0, "step": 36165 }, { "entropy": 0.06981781832873821, "epoch": 8.431285697633756, "grad_norm": 0.203125, "learning_rate": 4.7280624677029886e-05, "loss": 0.0078, "mean_token_accuracy": 0.9985656261444091, "num_tokens": 115739479.0, "step": 36170 }, { "entropy": 0.07181575652211905, "epoch": 8.432451334654388, "grad_norm": 0.419921875, "learning_rate": 4.7279674089090833e-05, "loss": 0.0029, "mean_token_accuracy": 0.999373197555542, "num_tokens": 115760286.0, "step": 36175 }, { "entropy": 0.07071037925779819, "epoch": 8.43361697167502, "grad_norm": 1.5859375, "learning_rate": 4.727872335532212e-05, "loss": 0.0061, "mean_token_accuracy": 0.9976220965385437, "num_tokens": 115769961.0, "step": 36180 }, { "entropy": 0.09611797733232379, "epoch": 8.434782608695652, "grad_norm": 5.28125, "learning_rate": 4.727777247573794e-05, "loss": 0.0296, "mean_token_accuracy": 0.9970374286174775, "num_tokens": 115795424.0, "step": 36185 }, { "entropy": 0.04846240486949682, "epoch": 8.435948245716284, "grad_norm": 0.94140625, "learning_rate": 4.727682145035246e-05, "loss": 0.0029, "mean_token_accuracy": 0.9988361775875092, "num_tokens": 115831652.0, "step": 36190 }, { "entropy": 0.057739018369466065, "epoch": 8.437113882736917, "grad_norm": 0.1474609375, "learning_rate": 4.727587027917987e-05, "loss": 0.0083, "mean_token_accuracy": 0.9980798184871673, "num_tokens": 115858353.0, "step": 36195 }, { "entropy": 0.06929563917219639, "epoch": 8.438279519757547, "grad_norm": 1.90625, "learning_rate": 4.727491896223435e-05, "loss": 0.0136, "mean_token_accuracy": 0.9970984637737275, "num_tokens": 115887873.0, "step": 36200 }, { "entropy": 0.05829632971435785, "epoch": 8.439445156778179, "grad_norm": 0.2080078125, "learning_rate": 4.727396749953009e-05, "loss": 0.005, "mean_token_accuracy": 0.9983768701553345, "num_tokens": 115904952.0, "step": 36205 }, { "entropy": 0.061249536275863645, "epoch": 8.440610793798811, "grad_norm": 0.98828125, "learning_rate": 4.727301589108127e-05, "loss": 0.0017, "mean_token_accuracy": 0.9995396554470062, "num_tokens": 115916917.0, "step": 36210 }, { "entropy": 0.05658513549715281, "epoch": 8.441776430819443, "grad_norm": 2.78125, "learning_rate": 4.7272064136902085e-05, "loss": 0.0054, "mean_token_accuracy": 0.998557734489441, "num_tokens": 115934530.0, "step": 36215 }, { "entropy": 0.049229751247912644, "epoch": 8.442942067840075, "grad_norm": 0.83984375, "learning_rate": 4.727111223700672e-05, "loss": 0.0054, "mean_token_accuracy": 0.9983462572097779, "num_tokens": 115974940.0, "step": 36220 }, { "entropy": 0.06888475380837918, "epoch": 8.444107704860706, "grad_norm": 0.6015625, "learning_rate": 4.727016019140938e-05, "loss": 0.0035, "mean_token_accuracy": 0.9989739239215851, "num_tokens": 115997818.0, "step": 36225 }, { "entropy": 0.06635741349309683, "epoch": 8.445273341881338, "grad_norm": 1.6875, "learning_rate": 4.7269208000124256e-05, "loss": 0.0167, "mean_token_accuracy": 0.9960039258003235, "num_tokens": 116007111.0, "step": 36230 }, { "entropy": 0.06352852582931519, "epoch": 8.44643897890197, "grad_norm": 0.671875, "learning_rate": 4.726825566316555e-05, "loss": 0.0035, "mean_token_accuracy": 0.9989105463027954, "num_tokens": 116019964.0, "step": 36235 }, { "entropy": 0.06089718565344811, "epoch": 8.447604615922602, "grad_norm": 0.369140625, "learning_rate": 4.726730318054745e-05, "loss": 0.0099, "mean_token_accuracy": 0.9988169968128204, "num_tokens": 116045247.0, "step": 36240 }, { "entropy": 0.044029067549854514, "epoch": 8.448770252943234, "grad_norm": 0.75390625, "learning_rate": 4.726635055228418e-05, "loss": 0.002, "mean_token_accuracy": 0.9987780928611756, "num_tokens": 116067328.0, "step": 36245 }, { "entropy": 0.050043806247413156, "epoch": 8.449935889963864, "grad_norm": 0.2099609375, "learning_rate": 4.726539777838993e-05, "loss": 0.0023, "mean_token_accuracy": 0.9993367373943329, "num_tokens": 116086931.0, "step": 36250 }, { "entropy": 0.060326000954955813, "epoch": 8.451101526984496, "grad_norm": 0.408203125, "learning_rate": 4.726444485887891e-05, "loss": 0.0026, "mean_token_accuracy": 0.9995485246181488, "num_tokens": 116112104.0, "step": 36255 }, { "entropy": 0.06028610188513994, "epoch": 8.452267164005129, "grad_norm": 2.734375, "learning_rate": 4.726349179376533e-05, "loss": 0.0055, "mean_token_accuracy": 0.9986335098743438, "num_tokens": 116128103.0, "step": 36260 }, { "entropy": 0.07007607705891132, "epoch": 8.45343280102576, "grad_norm": 0.765625, "learning_rate": 4.7262538583063404e-05, "loss": 0.0042, "mean_token_accuracy": 0.9987308144569397, "num_tokens": 116147244.0, "step": 36265 }, { "entropy": 0.0728133057244122, "epoch": 8.454598438046393, "grad_norm": 1.046875, "learning_rate": 4.726158522678734e-05, "loss": 0.003, "mean_token_accuracy": 0.9991988897323608, "num_tokens": 116165431.0, "step": 36270 }, { "entropy": 0.0499640004709363, "epoch": 8.455764075067025, "grad_norm": 1.6171875, "learning_rate": 4.726063172495137e-05, "loss": 0.0039, "mean_token_accuracy": 0.9992352545261383, "num_tokens": 116184587.0, "step": 36275 }, { "entropy": 0.06305303145200014, "epoch": 8.456929712087655, "grad_norm": 0.058349609375, "learning_rate": 4.725967807756969e-05, "loss": 0.0059, "mean_token_accuracy": 0.9978918075561524, "num_tokens": 116197306.0, "step": 36280 }, { "entropy": 0.06247878428548574, "epoch": 8.458095349108287, "grad_norm": 0.2001953125, "learning_rate": 4.725872428465653e-05, "loss": 0.0078, "mean_token_accuracy": 0.9982529759407044, "num_tokens": 116215388.0, "step": 36285 }, { "entropy": 0.05539757264778018, "epoch": 8.45926098612892, "grad_norm": 0.3515625, "learning_rate": 4.725777034622611e-05, "loss": 0.0018, "mean_token_accuracy": 0.9996000409126282, "num_tokens": 116242407.0, "step": 36290 }, { "entropy": 0.05653154449537397, "epoch": 8.460426623149552, "grad_norm": 0.2294921875, "learning_rate": 4.7256816262292665e-05, "loss": 0.0085, "mean_token_accuracy": 0.9972741007804871, "num_tokens": 116268238.0, "step": 36295 }, { "entropy": 0.077280986122787, "epoch": 8.461592260170184, "grad_norm": 0.037109375, "learning_rate": 4.725586203287041e-05, "loss": 0.0012, "mean_token_accuracy": 0.9990314662456512, "num_tokens": 116282961.0, "step": 36300 }, { "entropy": 0.05508853131905198, "epoch": 8.462757897190814, "grad_norm": 0.380859375, "learning_rate": 4.725490765797358e-05, "loss": 0.0012, "mean_token_accuracy": 0.9997860372066498, "num_tokens": 116311225.0, "step": 36305 }, { "entropy": 0.058704734221100806, "epoch": 8.463923534211446, "grad_norm": 0.16015625, "learning_rate": 4.725395313761641e-05, "loss": 0.0015, "mean_token_accuracy": 0.9997250258922576, "num_tokens": 116325286.0, "step": 36310 }, { "entropy": 0.05704579222947359, "epoch": 8.465089171232078, "grad_norm": 0.2890625, "learning_rate": 4.725299847181312e-05, "loss": 0.004, "mean_token_accuracy": 0.9993097305297851, "num_tokens": 116338229.0, "step": 36315 }, { "entropy": 0.05245425468310714, "epoch": 8.46625480825271, "grad_norm": 0.095703125, "learning_rate": 4.725204366057796e-05, "loss": 0.0115, "mean_token_accuracy": 0.9979504764080047, "num_tokens": 116356198.0, "step": 36320 }, { "entropy": 0.05498585319146514, "epoch": 8.467420445273342, "grad_norm": 0.2119140625, "learning_rate": 4.725108870392516e-05, "loss": 0.0041, "mean_token_accuracy": 0.9978257954120636, "num_tokens": 116381242.0, "step": 36325 }, { "entropy": 0.08065539970993996, "epoch": 8.468586082293974, "grad_norm": 0.51171875, "learning_rate": 4.725013360186895e-05, "loss": 0.0021, "mean_token_accuracy": 0.9993955552577972, "num_tokens": 116394168.0, "step": 36330 }, { "entropy": 0.07359606567770242, "epoch": 8.469751719314605, "grad_norm": 2.1875, "learning_rate": 4.72491783544236e-05, "loss": 0.0133, "mean_token_accuracy": 0.9973003745079041, "num_tokens": 116428556.0, "step": 36335 }, { "entropy": 0.061815372481942174, "epoch": 8.470917356335237, "grad_norm": 0.59375, "learning_rate": 4.724822296160332e-05, "loss": 0.0025, "mean_token_accuracy": 0.9987725913524628, "num_tokens": 116443891.0, "step": 36340 }, { "entropy": 0.0534939656034112, "epoch": 8.472082993355869, "grad_norm": 0.146484375, "learning_rate": 4.7247267423422386e-05, "loss": 0.0024, "mean_token_accuracy": 0.9995495617389679, "num_tokens": 116468462.0, "step": 36345 }, { "entropy": 0.058882415667176245, "epoch": 8.473248630376501, "grad_norm": 0.12890625, "learning_rate": 4.7246311739895035e-05, "loss": 0.0022, "mean_token_accuracy": 0.9995469570159912, "num_tokens": 116485331.0, "step": 36350 }, { "entropy": 0.06126846736297011, "epoch": 8.474414267397133, "grad_norm": 0.185546875, "learning_rate": 4.7245355911035506e-05, "loss": 0.004, "mean_token_accuracy": 0.9989485740661621, "num_tokens": 116499399.0, "step": 36355 }, { "entropy": 0.06335963495075703, "epoch": 8.475579904417764, "grad_norm": 0.9921875, "learning_rate": 4.724439993685807e-05, "loss": 0.0053, "mean_token_accuracy": 0.9982250690460205, "num_tokens": 116514649.0, "step": 36360 }, { "entropy": 0.06798462234437466, "epoch": 8.476745541438396, "grad_norm": 2.21875, "learning_rate": 4.724344381737697e-05, "loss": 0.0097, "mean_token_accuracy": 0.9951762199401856, "num_tokens": 116524731.0, "step": 36365 }, { "entropy": 0.047466957662254575, "epoch": 8.477911178459028, "grad_norm": 0.2890625, "learning_rate": 4.7242487552606475e-05, "loss": 0.003, "mean_token_accuracy": 0.9994238436222076, "num_tokens": 116555864.0, "step": 36370 }, { "entropy": 0.055692866630852225, "epoch": 8.47907681547966, "grad_norm": 0.4765625, "learning_rate": 4.724153114256083e-05, "loss": 0.0027, "mean_token_accuracy": 0.9994192600250245, "num_tokens": 116575010.0, "step": 36375 }, { "entropy": 0.06099151102825999, "epoch": 8.480242452500292, "grad_norm": 0.1552734375, "learning_rate": 4.7240574587254304e-05, "loss": 0.0044, "mean_token_accuracy": 0.9995291829109192, "num_tokens": 116592779.0, "step": 36380 }, { "entropy": 0.06979598198086023, "epoch": 8.481408089520922, "grad_norm": 0.91015625, "learning_rate": 4.723961788670117e-05, "loss": 0.0018, "mean_token_accuracy": 0.9991238117218018, "num_tokens": 116612943.0, "step": 36385 }, { "entropy": 0.054224473610520366, "epoch": 8.482573726541554, "grad_norm": 0.484375, "learning_rate": 4.7238661040915675e-05, "loss": 0.0018, "mean_token_accuracy": 0.9984688460826874, "num_tokens": 116636565.0, "step": 36390 }, { "entropy": 0.054101691022515296, "epoch": 8.483739363562186, "grad_norm": 0.546875, "learning_rate": 4.7237704049912095e-05, "loss": 0.0052, "mean_token_accuracy": 0.9988676548004151, "num_tokens": 116657239.0, "step": 36395 }, { "entropy": 0.06508222203701734, "epoch": 8.484905000582819, "grad_norm": 0.59765625, "learning_rate": 4.723674691370471e-05, "loss": 0.0034, "mean_token_accuracy": 0.9983641326427459, "num_tokens": 116670991.0, "step": 36400 }, { "entropy": 0.060155941732227804, "epoch": 8.48607063760345, "grad_norm": 0.73046875, "learning_rate": 4.723578963230777e-05, "loss": 0.0032, "mean_token_accuracy": 0.9993346929550171, "num_tokens": 116684413.0, "step": 36405 }, { "entropy": 0.06790580712258816, "epoch": 8.487236274624083, "grad_norm": 1.3125, "learning_rate": 4.723483220573557e-05, "loss": 0.0047, "mean_token_accuracy": 0.9989775061607361, "num_tokens": 116697951.0, "step": 36410 }, { "entropy": 0.09916951693594456, "epoch": 8.488401911644713, "grad_norm": 2.015625, "learning_rate": 4.723387463400238e-05, "loss": 0.003, "mean_token_accuracy": 0.9989169955253601, "num_tokens": 116705777.0, "step": 36415 }, { "entropy": 0.05130702285096049, "epoch": 8.489567548665345, "grad_norm": 1.75, "learning_rate": 4.723291691712248e-05, "loss": 0.004, "mean_token_accuracy": 0.9996195018291474, "num_tokens": 116730161.0, "step": 36420 }, { "entropy": 0.04962422214448452, "epoch": 8.490733185685977, "grad_norm": 0.2451171875, "learning_rate": 4.723195905511015e-05, "loss": 0.0017, "mean_token_accuracy": 0.9991441249847413, "num_tokens": 116748972.0, "step": 36425 }, { "entropy": 0.04863788112998009, "epoch": 8.49189882270661, "grad_norm": 0.2373046875, "learning_rate": 4.723100104797967e-05, "loss": 0.0012, "mean_token_accuracy": 0.9993934810161591, "num_tokens": 116778203.0, "step": 36430 }, { "entropy": 0.04388864999637008, "epoch": 8.493064459727242, "grad_norm": 0.3203125, "learning_rate": 4.723004289574533e-05, "loss": 0.003, "mean_token_accuracy": 0.9993597865104675, "num_tokens": 116802243.0, "step": 36435 }, { "entropy": 0.06913177091628313, "epoch": 8.494230096747872, "grad_norm": 0.51953125, "learning_rate": 4.722908459842141e-05, "loss": 0.005, "mean_token_accuracy": 0.998873645067215, "num_tokens": 116827393.0, "step": 36440 }, { "entropy": 0.07518032267689705, "epoch": 8.495395733768504, "grad_norm": 3.625, "learning_rate": 4.7228126156022216e-05, "loss": 0.0068, "mean_token_accuracy": 0.9983530879020691, "num_tokens": 116838490.0, "step": 36445 }, { "entropy": 0.07093150406144559, "epoch": 8.496561370789136, "grad_norm": 0.76171875, "learning_rate": 4.722716756856203e-05, "loss": 0.0028, "mean_token_accuracy": 0.9985154151916504, "num_tokens": 116855780.0, "step": 36450 }, { "entropy": 0.06148874796926975, "epoch": 8.497727007809768, "grad_norm": 0.271484375, "learning_rate": 4.7226208836055136e-05, "loss": 0.0033, "mean_token_accuracy": 0.9992707073688507, "num_tokens": 116873275.0, "step": 36455 }, { "entropy": 0.05633387309499085, "epoch": 8.4988926448304, "grad_norm": 0.357421875, "learning_rate": 4.7225249958515836e-05, "loss": 0.0043, "mean_token_accuracy": 0.9990444958209992, "num_tokens": 116897277.0, "step": 36460 }, { "entropy": 0.06432565553113818, "epoch": 8.500058281851032, "grad_norm": 2.546875, "learning_rate": 4.7224290935958444e-05, "loss": 0.0059, "mean_token_accuracy": 0.9976148426532745, "num_tokens": 116919333.0, "step": 36465 }, { "entropy": 0.05420015938580036, "epoch": 8.501223918871663, "grad_norm": 0.1484375, "learning_rate": 4.722333176839724e-05, "loss": 0.0026, "mean_token_accuracy": 0.9995829820632934, "num_tokens": 116933248.0, "step": 36470 }, { "entropy": 0.055879263393580916, "epoch": 8.502389555892295, "grad_norm": 0.2392578125, "learning_rate": 4.7222372455846536e-05, "loss": 0.0077, "mean_token_accuracy": 0.9986678183078765, "num_tokens": 116946484.0, "step": 36475 }, { "entropy": 0.07837851643562317, "epoch": 8.503555192912927, "grad_norm": 0.58203125, "learning_rate": 4.7221412998320636e-05, "loss": 0.0041, "mean_token_accuracy": 0.9984042167663574, "num_tokens": 116956517.0, "step": 36480 }, { "entropy": 0.07617838717997075, "epoch": 8.504720829933559, "grad_norm": 2.390625, "learning_rate": 4.722045339583385e-05, "loss": 0.0038, "mean_token_accuracy": 0.9988938868045807, "num_tokens": 116977417.0, "step": 36485 }, { "entropy": 0.04583414709195495, "epoch": 8.505886466954191, "grad_norm": 0.75390625, "learning_rate": 4.7219493648400474e-05, "loss": 0.0071, "mean_token_accuracy": 0.9980126678943634, "num_tokens": 117002359.0, "step": 36490 }, { "entropy": 0.041270645707845686, "epoch": 8.507052103974821, "grad_norm": 0.193359375, "learning_rate": 4.7218533756034835e-05, "loss": 0.0068, "mean_token_accuracy": 0.9986809670925141, "num_tokens": 117035657.0, "step": 36495 }, { "entropy": 0.05645858906209469, "epoch": 8.508217740995454, "grad_norm": 1.6171875, "learning_rate": 4.7217573718751243e-05, "loss": 0.0027, "mean_token_accuracy": 0.999339473247528, "num_tokens": 117052075.0, "step": 36500 }, { "entropy": 0.0792703942861408, "epoch": 8.509383378016086, "grad_norm": 0.099609375, "learning_rate": 4.7216613536564005e-05, "loss": 0.0021, "mean_token_accuracy": 0.9995384693145752, "num_tokens": 117070170.0, "step": 36505 }, { "entropy": 0.058775024861097334, "epoch": 8.510549015036718, "grad_norm": 0.1650390625, "learning_rate": 4.7215653209487444e-05, "loss": 0.0051, "mean_token_accuracy": 0.9987652480602265, "num_tokens": 117088783.0, "step": 36510 }, { "entropy": 0.052767443284392354, "epoch": 8.51171465205735, "grad_norm": 1.125, "learning_rate": 4.721469273753588e-05, "loss": 0.0071, "mean_token_accuracy": 0.9984904825687408, "num_tokens": 117114387.0, "step": 36515 }, { "entropy": 0.05728981187567115, "epoch": 8.51288028907798, "grad_norm": 0.2890625, "learning_rate": 4.721373212072364e-05, "loss": 0.0049, "mean_token_accuracy": 0.9978182494640351, "num_tokens": 117135319.0, "step": 36520 }, { "entropy": 0.07348077250644565, "epoch": 8.514045926098612, "grad_norm": 2.03125, "learning_rate": 4.721277135906504e-05, "loss": 0.0077, "mean_token_accuracy": 0.9993664264678955, "num_tokens": 117158108.0, "step": 36525 }, { "entropy": 0.04832636620849371, "epoch": 8.515211563119244, "grad_norm": 0.439453125, "learning_rate": 4.7211810452574415e-05, "loss": 0.0036, "mean_token_accuracy": 0.997581273317337, "num_tokens": 117186605.0, "step": 36530 }, { "entropy": 0.06315313652157784, "epoch": 8.516377200139877, "grad_norm": 0.171875, "learning_rate": 4.721084940126609e-05, "loss": 0.0011, "mean_token_accuracy": 1.0, "num_tokens": 117197974.0, "step": 36535 }, { "entropy": 0.06376685537397861, "epoch": 8.517542837160509, "grad_norm": 1.6015625, "learning_rate": 4.720988820515439e-05, "loss": 0.0064, "mean_token_accuracy": 0.9981851637363434, "num_tokens": 117208905.0, "step": 36540 }, { "entropy": 0.05537942638620734, "epoch": 8.51870847418114, "grad_norm": 0.296875, "learning_rate": 4.7208926864253655e-05, "loss": 0.0036, "mean_token_accuracy": 0.9994171023368835, "num_tokens": 117232817.0, "step": 36545 }, { "entropy": 0.05915795974433422, "epoch": 8.519874111201771, "grad_norm": 0.2255859375, "learning_rate": 4.7207965378578215e-05, "loss": 0.0035, "mean_token_accuracy": 0.9996034562587738, "num_tokens": 117245430.0, "step": 36550 }, { "entropy": 0.05175903402268887, "epoch": 8.521039748222403, "grad_norm": 0.18359375, "learning_rate": 4.720700374814241e-05, "loss": 0.0038, "mean_token_accuracy": 0.9987463653087616, "num_tokens": 117270560.0, "step": 36555 }, { "entropy": 0.06156531311571598, "epoch": 8.522205385243035, "grad_norm": 0.6484375, "learning_rate": 4.720604197296058e-05, "loss": 0.0044, "mean_token_accuracy": 0.9983774304389954, "num_tokens": 117280633.0, "step": 36560 }, { "entropy": 0.0584191894158721, "epoch": 8.523371022263667, "grad_norm": 0.421875, "learning_rate": 4.720508005304706e-05, "loss": 0.0054, "mean_token_accuracy": 0.9979436278343201, "num_tokens": 117301478.0, "step": 36565 }, { "entropy": 0.05222648493945599, "epoch": 8.5245366592843, "grad_norm": 0.1904296875, "learning_rate": 4.720411798841621e-05, "loss": 0.004, "mean_token_accuracy": 0.9989825844764709, "num_tokens": 117313296.0, "step": 36570 }, { "entropy": 0.056239666417241096, "epoch": 8.52570229630493, "grad_norm": 2.21875, "learning_rate": 4.720315577908236e-05, "loss": 0.0023, "mean_token_accuracy": 0.9994934022426605, "num_tokens": 117336909.0, "step": 36575 }, { "entropy": 0.07502004820853472, "epoch": 8.526867933325562, "grad_norm": 1.9609375, "learning_rate": 4.720219342505986e-05, "loss": 0.016, "mean_token_accuracy": 0.9961349189281463, "num_tokens": 117354325.0, "step": 36580 }, { "entropy": 0.08849140480160714, "epoch": 8.528033570346194, "grad_norm": 3.296875, "learning_rate": 4.7201230926363065e-05, "loss": 0.0082, "mean_token_accuracy": 0.9983762741088867, "num_tokens": 117364550.0, "step": 36585 }, { "entropy": 0.05067680682986975, "epoch": 8.529199207366826, "grad_norm": 0.69140625, "learning_rate": 4.720026828300632e-05, "loss": 0.0013, "mean_token_accuracy": 0.9997664391994476, "num_tokens": 117384001.0, "step": 36590 }, { "entropy": 0.07895645070821047, "epoch": 8.530364844387458, "grad_norm": 0.337890625, "learning_rate": 4.7199305495003995e-05, "loss": 0.0054, "mean_token_accuracy": 0.998946338891983, "num_tokens": 117397445.0, "step": 36595 }, { "entropy": 0.049645941238850355, "epoch": 8.53153048140809, "grad_norm": 3.4375, "learning_rate": 4.7198342562370436e-05, "loss": 0.0048, "mean_token_accuracy": 0.9991376340389252, "num_tokens": 117420151.0, "step": 36600 }, { "entropy": 0.06408117078244686, "epoch": 8.53269611842872, "grad_norm": 0.408203125, "learning_rate": 4.719737948512e-05, "loss": 0.0037, "mean_token_accuracy": 0.9996393918991089, "num_tokens": 117434708.0, "step": 36605 }, { "entropy": 0.06821114458143711, "epoch": 8.533861755449353, "grad_norm": 1.5234375, "learning_rate": 4.719641626326704e-05, "loss": 0.0046, "mean_token_accuracy": 0.9991869747638702, "num_tokens": 117445971.0, "step": 36610 }, { "entropy": 0.05026884274557233, "epoch": 8.535027392469985, "grad_norm": 0.2177734375, "learning_rate": 4.719545289682594e-05, "loss": 0.0029, "mean_token_accuracy": 0.998915296792984, "num_tokens": 117478835.0, "step": 36615 }, { "entropy": 0.05111234989017248, "epoch": 8.536193029490617, "grad_norm": 0.07666015625, "learning_rate": 4.719448938581105e-05, "loss": 0.0073, "mean_token_accuracy": 0.998233848810196, "num_tokens": 117510505.0, "step": 36620 }, { "entropy": 0.056202891655266284, "epoch": 8.537358666511249, "grad_norm": 0.3671875, "learning_rate": 4.719352573023674e-05, "loss": 0.0019, "mean_token_accuracy": 0.9993539452552795, "num_tokens": 117535677.0, "step": 36625 }, { "entropy": 0.05016994327306747, "epoch": 8.53852430353188, "grad_norm": 0.353515625, "learning_rate": 4.719256193011739e-05, "loss": 0.0025, "mean_token_accuracy": 0.9991106808185577, "num_tokens": 117561357.0, "step": 36630 }, { "entropy": 0.04156949240714312, "epoch": 8.539689940552512, "grad_norm": 0.2060546875, "learning_rate": 4.719159798546736e-05, "loss": 0.0042, "mean_token_accuracy": 0.9992599308490753, "num_tokens": 117594891.0, "step": 36635 }, { "entropy": 0.04975649286061525, "epoch": 8.540855577573144, "grad_norm": 0.25390625, "learning_rate": 4.719063389630103e-05, "loss": 0.001, "mean_token_accuracy": 0.9999047815799713, "num_tokens": 117627847.0, "step": 36640 }, { "entropy": 0.060001560300588605, "epoch": 8.542021214593776, "grad_norm": 1.1328125, "learning_rate": 4.7189669662632765e-05, "loss": 0.0027, "mean_token_accuracy": 0.9989645898342132, "num_tokens": 117641464.0, "step": 36645 }, { "entropy": 0.07360632680356502, "epoch": 8.543186851614408, "grad_norm": 0.68359375, "learning_rate": 4.7188705284476955e-05, "loss": 0.004, "mean_token_accuracy": 0.998823744058609, "num_tokens": 117650419.0, "step": 36650 }, { "entropy": 0.07439129706472158, "epoch": 8.544352488635038, "grad_norm": 0.5234375, "learning_rate": 4.7187740761847974e-05, "loss": 0.0035, "mean_token_accuracy": 0.9991745054721832, "num_tokens": 117666198.0, "step": 36655 }, { "entropy": 0.07077626138925552, "epoch": 8.54551812565567, "grad_norm": 0.4140625, "learning_rate": 4.718677609476021e-05, "loss": 0.0043, "mean_token_accuracy": 0.9988212764263154, "num_tokens": 117685003.0, "step": 36660 }, { "entropy": 0.06650689067319035, "epoch": 8.546683762676302, "grad_norm": 0.2490234375, "learning_rate": 4.7185811283228046e-05, "loss": 0.0032, "mean_token_accuracy": 0.9989586055278779, "num_tokens": 117708501.0, "step": 36665 }, { "entropy": 0.05205532372929156, "epoch": 8.547849399696934, "grad_norm": 3.265625, "learning_rate": 4.7184846327265865e-05, "loss": 0.0024, "mean_token_accuracy": 0.9995387613773346, "num_tokens": 117729794.0, "step": 36670 }, { "entropy": 0.05996727123856545, "epoch": 8.549015036717567, "grad_norm": 0.296875, "learning_rate": 4.718388122688806e-05, "loss": 0.0042, "mean_token_accuracy": 0.9982151925563812, "num_tokens": 117759396.0, "step": 36675 }, { "entropy": 0.08831366430968046, "epoch": 8.550180673738199, "grad_norm": 0.8984375, "learning_rate": 4.718291598210902e-05, "loss": 0.0016, "mean_token_accuracy": 0.9998996675014495, "num_tokens": 117771998.0, "step": 36680 }, { "entropy": 0.049416807992383835, "epoch": 8.551346310758829, "grad_norm": 0.859375, "learning_rate": 4.7181950592943134e-05, "loss": 0.0084, "mean_token_accuracy": 0.997665536403656, "num_tokens": 117817626.0, "step": 36685 }, { "entropy": 0.07952840253710747, "epoch": 8.552511947779461, "grad_norm": 0.34375, "learning_rate": 4.718098505940481e-05, "loss": 0.0014, "mean_token_accuracy": 0.999513429403305, "num_tokens": 117831508.0, "step": 36690 }, { "entropy": 0.06541987545788289, "epoch": 8.553677584800093, "grad_norm": 0.421875, "learning_rate": 4.7180019381508435e-05, "loss": 0.0064, "mean_token_accuracy": 0.9980966448783875, "num_tokens": 117859078.0, "step": 36695 }, { "entropy": 0.06116861402988434, "epoch": 8.554843221820725, "grad_norm": 0.74609375, "learning_rate": 4.717905355926841e-05, "loss": 0.0155, "mean_token_accuracy": 0.9986841559410096, "num_tokens": 117871420.0, "step": 36700 }, { "entropy": 0.0664717435836792, "epoch": 8.556008858841357, "grad_norm": 0.265625, "learning_rate": 4.717808759269914e-05, "loss": 0.0031, "mean_token_accuracy": 0.9988701105117798, "num_tokens": 117882255.0, "step": 36705 }, { "entropy": 0.0731908904388547, "epoch": 8.557174495861988, "grad_norm": 2.828125, "learning_rate": 4.717712148181503e-05, "loss": 0.0065, "mean_token_accuracy": 0.9986431419849395, "num_tokens": 117899097.0, "step": 36710 }, { "entropy": 0.05931155104190111, "epoch": 8.55834013288262, "grad_norm": 1.34375, "learning_rate": 4.7176155226630476e-05, "loss": 0.0064, "mean_token_accuracy": 0.9985710024833679, "num_tokens": 117919256.0, "step": 36715 }, { "entropy": 0.05108649954199791, "epoch": 8.559505769903252, "grad_norm": 0.2177734375, "learning_rate": 4.7175188827159897e-05, "loss": 0.0017, "mean_token_accuracy": 0.9996058583259583, "num_tokens": 117940881.0, "step": 36720 }, { "entropy": 0.05063903266564011, "epoch": 8.560671406923884, "grad_norm": 2.296875, "learning_rate": 4.71742222834177e-05, "loss": 0.0035, "mean_token_accuracy": 0.9989172637462616, "num_tokens": 117958438.0, "step": 36725 }, { "entropy": 0.04010082418099046, "epoch": 8.561837043944516, "grad_norm": 0.2353515625, "learning_rate": 4.71732555954183e-05, "loss": 0.0016, "mean_token_accuracy": 0.9995945632457733, "num_tokens": 117985829.0, "step": 36730 }, { "entropy": 0.03602319527417421, "epoch": 8.563002680965148, "grad_norm": 0.169921875, "learning_rate": 4.717228876317611e-05, "loss": 0.0041, "mean_token_accuracy": 0.9995147943496704, "num_tokens": 118015283.0, "step": 36735 }, { "entropy": 0.041568309720605615, "epoch": 8.564168317985779, "grad_norm": 0.435546875, "learning_rate": 4.7171321786705544e-05, "loss": 0.0033, "mean_token_accuracy": 0.9983395159244537, "num_tokens": 118055251.0, "step": 36740 }, { "entropy": 0.09163246341049672, "epoch": 8.56533395500641, "grad_norm": 0.11572265625, "learning_rate": 4.717035466602103e-05, "loss": 0.0009, "mean_token_accuracy": 0.9996281206607819, "num_tokens": 118073912.0, "step": 36745 }, { "entropy": 0.050749783497303726, "epoch": 8.566499592027043, "grad_norm": 0.703125, "learning_rate": 4.7169387401136976e-05, "loss": 0.005, "mean_token_accuracy": 0.9987952232360839, "num_tokens": 118095700.0, "step": 36750 }, { "entropy": 0.06472921185195446, "epoch": 8.567665229047675, "grad_norm": 0.369140625, "learning_rate": 4.7168419992067816e-05, "loss": 0.0034, "mean_token_accuracy": 0.999310165643692, "num_tokens": 118106780.0, "step": 36755 }, { "entropy": 0.0620553707703948, "epoch": 8.568830866068307, "grad_norm": 0.435546875, "learning_rate": 4.716745243882797e-05, "loss": 0.005, "mean_token_accuracy": 0.9971357882022858, "num_tokens": 118127361.0, "step": 36760 }, { "entropy": 0.0793293721973896, "epoch": 8.569996503088937, "grad_norm": 1.9140625, "learning_rate": 4.7166484741431865e-05, "loss": 0.0073, "mean_token_accuracy": 0.9988360643386841, "num_tokens": 118142418.0, "step": 36765 }, { "entropy": 0.05423343572765589, "epoch": 8.57116214010957, "grad_norm": 0.7890625, "learning_rate": 4.7165516899893934e-05, "loss": 0.0026, "mean_token_accuracy": 0.9994962751865387, "num_tokens": 118154318.0, "step": 36770 }, { "entropy": 0.060041078738868235, "epoch": 8.572327777130202, "grad_norm": 0.2255859375, "learning_rate": 4.716454891422861e-05, "loss": 0.0023, "mean_token_accuracy": 0.9993019044399262, "num_tokens": 118176358.0, "step": 36775 }, { "entropy": 0.06099687637761235, "epoch": 8.573493414150834, "grad_norm": 1.5234375, "learning_rate": 4.716358078445033e-05, "loss": 0.0062, "mean_token_accuracy": 0.9983492612838745, "num_tokens": 118195838.0, "step": 36780 }, { "entropy": 0.06963885650038719, "epoch": 8.574659051171466, "grad_norm": 2.875, "learning_rate": 4.716261251057352e-05, "loss": 0.0069, "mean_token_accuracy": 0.9993766665458679, "num_tokens": 118206136.0, "step": 36785 }, { "entropy": 0.052096801623702046, "epoch": 8.575824688192096, "grad_norm": 0.27734375, "learning_rate": 4.7161644092612624e-05, "loss": 0.0024, "mean_token_accuracy": 0.9995839893817902, "num_tokens": 118227387.0, "step": 36790 }, { "entropy": 0.04954950464889407, "epoch": 8.576990325212728, "grad_norm": 0.29296875, "learning_rate": 4.7160675530582084e-05, "loss": 0.0023, "mean_token_accuracy": 0.9993766367435455, "num_tokens": 118253458.0, "step": 36795 }, { "entropy": 0.0848664847202599, "epoch": 8.57815596223336, "grad_norm": 0.7421875, "learning_rate": 4.715970682449634e-05, "loss": 0.0016, "mean_token_accuracy": 0.9990188479423523, "num_tokens": 118277832.0, "step": 36800 }, { "entropy": 0.05834921356290579, "epoch": 8.579321599253992, "grad_norm": 2.234375, "learning_rate": 4.715873797436984e-05, "loss": 0.0025, "mean_token_accuracy": 0.9983950853347778, "num_tokens": 118291990.0, "step": 36805 }, { "entropy": 0.0686840882524848, "epoch": 8.580487236274625, "grad_norm": 0.349609375, "learning_rate": 4.715776898021702e-05, "loss": 0.0019, "mean_token_accuracy": 0.9983816087245941, "num_tokens": 118316014.0, "step": 36810 }, { "entropy": 0.07611836092546582, "epoch": 8.581652873295257, "grad_norm": 1.21875, "learning_rate": 4.715679984205236e-05, "loss": 0.0048, "mean_token_accuracy": 0.9976979672908783, "num_tokens": 118333823.0, "step": 36815 }, { "entropy": 0.07102014757692814, "epoch": 8.582818510315887, "grad_norm": 0.87109375, "learning_rate": 4.715583055989027e-05, "loss": 0.0026, "mean_token_accuracy": 0.9995433807373046, "num_tokens": 118343033.0, "step": 36820 }, { "entropy": 0.049099778011441234, "epoch": 8.583984147336519, "grad_norm": 0.53515625, "learning_rate": 4.715486113374523e-05, "loss": 0.0016, "mean_token_accuracy": 0.9993889331817627, "num_tokens": 118362082.0, "step": 36825 }, { "entropy": 0.05649897027760744, "epoch": 8.585149784357151, "grad_norm": 0.283203125, "learning_rate": 4.715389156363169e-05, "loss": 0.0017, "mean_token_accuracy": 0.9999586164951324, "num_tokens": 118376154.0, "step": 36830 }, { "entropy": 0.050150804314762355, "epoch": 8.586315421377783, "grad_norm": 0.1259765625, "learning_rate": 4.71529218495641e-05, "loss": 0.0024, "mean_token_accuracy": 0.9996409952640534, "num_tokens": 118407431.0, "step": 36835 }, { "entropy": 0.07235877737402915, "epoch": 8.587481058398415, "grad_norm": 5.21875, "learning_rate": 4.715195199155694e-05, "loss": 0.0092, "mean_token_accuracy": 0.9982260286808013, "num_tokens": 118426403.0, "step": 36840 }, { "entropy": 0.05380323426797986, "epoch": 8.588646695419046, "grad_norm": 1.1171875, "learning_rate": 4.7150981989624646e-05, "loss": 0.0025, "mean_token_accuracy": 0.9996935486793518, "num_tokens": 118440910.0, "step": 36845 }, { "entropy": 0.06442772559821605, "epoch": 8.589812332439678, "grad_norm": 0.74609375, "learning_rate": 4.71500118437817e-05, "loss": 0.0052, "mean_token_accuracy": 0.9995631098747253, "num_tokens": 118454440.0, "step": 36850 }, { "entropy": 0.06424436261877417, "epoch": 8.59097796946031, "grad_norm": 0.5234375, "learning_rate": 4.714904155404256e-05, "loss": 0.0044, "mean_token_accuracy": 0.9995743334293365, "num_tokens": 118477017.0, "step": 36855 }, { "entropy": 0.06407613288611173, "epoch": 8.592143606480942, "grad_norm": 0.384765625, "learning_rate": 4.71480711204217e-05, "loss": 0.003, "mean_token_accuracy": 0.9990883469581604, "num_tokens": 118498748.0, "step": 36860 }, { "entropy": 0.06708194902166724, "epoch": 8.593309243501574, "grad_norm": 0.2490234375, "learning_rate": 4.7147100542933585e-05, "loss": 0.0031, "mean_token_accuracy": 0.9988902091979981, "num_tokens": 118511220.0, "step": 36865 }, { "entropy": 0.05490873893722892, "epoch": 8.594474880522206, "grad_norm": 0.19140625, "learning_rate": 4.714612982159269e-05, "loss": 0.0099, "mean_token_accuracy": 0.9985246181488037, "num_tokens": 118531407.0, "step": 36870 }, { "entropy": 0.05289556067436933, "epoch": 8.595640517542837, "grad_norm": 0.4609375, "learning_rate": 4.71451589564135e-05, "loss": 0.0076, "mean_token_accuracy": 0.9989979445934296, "num_tokens": 118553472.0, "step": 36875 }, { "entropy": 0.056383601669222114, "epoch": 8.596806154563469, "grad_norm": 1.4453125, "learning_rate": 4.714418794741048e-05, "loss": 0.0027, "mean_token_accuracy": 0.999051034450531, "num_tokens": 118571136.0, "step": 36880 }, { "entropy": 0.07303898371756076, "epoch": 8.5979717915841, "grad_norm": 1.1015625, "learning_rate": 4.714321679459811e-05, "loss": 0.0016, "mean_token_accuracy": 0.9996551752090455, "num_tokens": 118580619.0, "step": 36885 }, { "entropy": 0.06205555312335491, "epoch": 8.599137428604733, "grad_norm": 0.2294921875, "learning_rate": 4.7142245497990863e-05, "loss": 0.0108, "mean_token_accuracy": 0.9981782078742981, "num_tokens": 118602080.0, "step": 36890 }, { "entropy": 0.05046365726739168, "epoch": 8.600303065625365, "grad_norm": 0.1552734375, "learning_rate": 4.7141274057603246e-05, "loss": 0.0012, "mean_token_accuracy": 0.9998971283435821, "num_tokens": 118632524.0, "step": 36895 }, { "entropy": 0.058700887486338615, "epoch": 8.601468702645995, "grad_norm": 1.828125, "learning_rate": 4.714030247344973e-05, "loss": 0.0067, "mean_token_accuracy": 0.9982360363006592, "num_tokens": 118644958.0, "step": 36900 }, { "entropy": 0.061605300009250644, "epoch": 8.602634339666627, "grad_norm": 2.953125, "learning_rate": 4.713933074554479e-05, "loss": 0.0121, "mean_token_accuracy": 0.9974478423595429, "num_tokens": 118668839.0, "step": 36905 }, { "entropy": 0.0644808927550912, "epoch": 8.60379997668726, "grad_norm": 2.796875, "learning_rate": 4.713835887390295e-05, "loss": 0.004, "mean_token_accuracy": 0.998391056060791, "num_tokens": 118690493.0, "step": 36910 }, { "entropy": 0.06565331649035215, "epoch": 8.604965613707892, "grad_norm": 0.9921875, "learning_rate": 4.713738685853867e-05, "loss": 0.0018, "mean_token_accuracy": 0.9990341901779175, "num_tokens": 118713103.0, "step": 36915 }, { "entropy": 0.056130803655833006, "epoch": 8.606131250728524, "grad_norm": 0.345703125, "learning_rate": 4.713641469946646e-05, "loss": 0.0031, "mean_token_accuracy": 0.9994325637817383, "num_tokens": 118739582.0, "step": 36920 }, { "entropy": 0.04501139735803008, "epoch": 8.607296887749154, "grad_norm": 0.2392578125, "learning_rate": 4.713544239670082e-05, "loss": 0.0023, "mean_token_accuracy": 0.9992862045764923, "num_tokens": 118767796.0, "step": 36925 }, { "entropy": 0.061400901339948175, "epoch": 8.608462524769786, "grad_norm": 1.3515625, "learning_rate": 4.7134469950256234e-05, "loss": 0.0016, "mean_token_accuracy": 0.9987517178058625, "num_tokens": 118783178.0, "step": 36930 }, { "entropy": 0.08621167857199907, "epoch": 8.609628161790418, "grad_norm": 0.470703125, "learning_rate": 4.713349736014721e-05, "loss": 0.0037, "mean_token_accuracy": 0.9996699929237366, "num_tokens": 118798519.0, "step": 36935 }, { "entropy": 0.07519685104489326, "epoch": 8.61079379881105, "grad_norm": 0.154296875, "learning_rate": 4.713252462638825e-05, "loss": 0.0018, "mean_token_accuracy": 0.9999246597290039, "num_tokens": 118812634.0, "step": 36940 }, { "entropy": 0.07892098985612392, "epoch": 8.611959435831682, "grad_norm": 0.5390625, "learning_rate": 4.7131551748993865e-05, "loss": 0.0042, "mean_token_accuracy": 0.9985935688018799, "num_tokens": 118823025.0, "step": 36945 }, { "entropy": 0.06437665317207575, "epoch": 8.613125072852315, "grad_norm": 0.3046875, "learning_rate": 4.7130578727978555e-05, "loss": 0.0009, "mean_token_accuracy": 0.999948388338089, "num_tokens": 118845104.0, "step": 36950 }, { "entropy": 0.06747263586148619, "epoch": 8.614290709872945, "grad_norm": 0.44921875, "learning_rate": 4.7129605563356826e-05, "loss": 0.0028, "mean_token_accuracy": 0.9994589805603027, "num_tokens": 118891752.0, "step": 36955 }, { "entropy": 0.05360950659960508, "epoch": 8.615456346893577, "grad_norm": 0.62890625, "learning_rate": 4.71286322551432e-05, "loss": 0.0031, "mean_token_accuracy": 0.998487788438797, "num_tokens": 118911281.0, "step": 36960 }, { "entropy": 0.06426534093916417, "epoch": 8.616621983914209, "grad_norm": 0.8203125, "learning_rate": 4.712765880335218e-05, "loss": 0.0082, "mean_token_accuracy": 0.997918164730072, "num_tokens": 118921605.0, "step": 36965 }, { "entropy": 0.055563436821103096, "epoch": 8.617787620934841, "grad_norm": 1.65625, "learning_rate": 4.712668520799829e-05, "loss": 0.0064, "mean_token_accuracy": 0.99852534532547, "num_tokens": 118934730.0, "step": 36970 }, { "entropy": 0.06757491566240788, "epoch": 8.618953257955473, "grad_norm": 3.203125, "learning_rate": 4.712571146909604e-05, "loss": 0.0055, "mean_token_accuracy": 0.9983723163604736, "num_tokens": 118946537.0, "step": 36975 }, { "entropy": 0.05890776924788952, "epoch": 8.620118894976104, "grad_norm": 0.984375, "learning_rate": 4.712473758665995e-05, "loss": 0.0049, "mean_token_accuracy": 0.9985151171684266, "num_tokens": 118957927.0, "step": 36980 }, { "entropy": 0.05017476119101048, "epoch": 8.621284531996736, "grad_norm": 1.3671875, "learning_rate": 4.712376356070456e-05, "loss": 0.0042, "mean_token_accuracy": 0.9995585024356842, "num_tokens": 118980899.0, "step": 36985 }, { "entropy": 0.06962464861571789, "epoch": 8.622450169017368, "grad_norm": 1.0625, "learning_rate": 4.712278939124437e-05, "loss": 0.0069, "mean_token_accuracy": 0.998376590013504, "num_tokens": 118992807.0, "step": 36990 }, { "entropy": 0.07402192037552595, "epoch": 8.623615806038, "grad_norm": 0.388671875, "learning_rate": 4.7121815078293926e-05, "loss": 0.0074, "mean_token_accuracy": 0.9978413581848145, "num_tokens": 119012183.0, "step": 36995 }, { "entropy": 0.06642863359302283, "epoch": 8.624781443058632, "grad_norm": 2.78125, "learning_rate": 4.712084062186774e-05, "loss": 0.0095, "mean_token_accuracy": 0.997755628824234, "num_tokens": 119025096.0, "step": 37000 }, { "entropy": 0.05094387661665678, "epoch": 8.625947080079264, "grad_norm": 0.373046875, "learning_rate": 4.711986602198035e-05, "loss": 0.0028, "mean_token_accuracy": 0.9983302295207978, "num_tokens": 119056220.0, "step": 37005 }, { "entropy": 0.04212229116819799, "epoch": 8.627112717099894, "grad_norm": 0.8671875, "learning_rate": 4.711889127864629e-05, "loss": 0.0025, "mean_token_accuracy": 0.9991898596286773, "num_tokens": 119081884.0, "step": 37010 }, { "entropy": 0.07550401668995618, "epoch": 8.628278354120527, "grad_norm": 0.322265625, "learning_rate": 4.711791639188009e-05, "loss": 0.0125, "mean_token_accuracy": 0.9969983458518982, "num_tokens": 119098151.0, "step": 37015 }, { "entropy": 0.0732094880193472, "epoch": 8.629443991141159, "grad_norm": 2.921875, "learning_rate": 4.71169413616963e-05, "loss": 0.0079, "mean_token_accuracy": 0.9970404207706451, "num_tokens": 119116875.0, "step": 37020 }, { "entropy": 0.058289825543761255, "epoch": 8.63060962816179, "grad_norm": 1.0859375, "learning_rate": 4.711596618810944e-05, "loss": 0.0019, "mean_token_accuracy": 0.9994800806045532, "num_tokens": 119139337.0, "step": 37025 }, { "entropy": 0.05543985888361931, "epoch": 8.631775265182423, "grad_norm": 1.015625, "learning_rate": 4.711499087113406e-05, "loss": 0.0026, "mean_token_accuracy": 0.998934018611908, "num_tokens": 119152298.0, "step": 37030 }, { "entropy": 0.0805392375215888, "epoch": 8.632940902203053, "grad_norm": 0.11328125, "learning_rate": 4.7114015410784705e-05, "loss": 0.0027, "mean_token_accuracy": 0.9997101426124573, "num_tokens": 119162999.0, "step": 37035 }, { "entropy": 0.05598073918372393, "epoch": 8.634106539223685, "grad_norm": 0.275390625, "learning_rate": 4.711303980707593e-05, "loss": 0.0052, "mean_token_accuracy": 0.9981026768684387, "num_tokens": 119190278.0, "step": 37040 }, { "entropy": 0.05474949460476637, "epoch": 8.635272176244317, "grad_norm": 0.26953125, "learning_rate": 4.7112064060022266e-05, "loss": 0.0068, "mean_token_accuracy": 0.998596829175949, "num_tokens": 119212347.0, "step": 37045 }, { "entropy": 0.05179757541045547, "epoch": 8.63643781326495, "grad_norm": 0.162109375, "learning_rate": 4.711108816963827e-05, "loss": 0.0011, "mean_token_accuracy": 0.9997380375862122, "num_tokens": 119237205.0, "step": 37050 }, { "entropy": 0.05135552119463682, "epoch": 8.637603450285582, "grad_norm": 0.3203125, "learning_rate": 4.711011213593849e-05, "loss": 0.003, "mean_token_accuracy": 0.9992295384407044, "num_tokens": 119260123.0, "step": 37055 }, { "entropy": 0.07269342504441738, "epoch": 8.638769087306212, "grad_norm": 2.734375, "learning_rate": 4.710913595893749e-05, "loss": 0.0092, "mean_token_accuracy": 0.9974887371063232, "num_tokens": 119269978.0, "step": 37060 }, { "entropy": 0.06440305057913065, "epoch": 8.639934724326844, "grad_norm": 0.310546875, "learning_rate": 4.710815963864981e-05, "loss": 0.0051, "mean_token_accuracy": 0.9989653289318084, "num_tokens": 119290205.0, "step": 37065 }, { "entropy": 0.0616750443354249, "epoch": 8.641100361347476, "grad_norm": 0.46484375, "learning_rate": 4.7107183175090034e-05, "loss": 0.0025, "mean_token_accuracy": 0.9990861892700196, "num_tokens": 119308798.0, "step": 37070 }, { "entropy": 0.052800285443663594, "epoch": 8.642265998368108, "grad_norm": 0.1826171875, "learning_rate": 4.710620656827269e-05, "loss": 0.0017, "mean_token_accuracy": 0.9986510217189789, "num_tokens": 119335956.0, "step": 37075 }, { "entropy": 0.04528741203248501, "epoch": 8.64343163538874, "grad_norm": 0.40234375, "learning_rate": 4.7105229818212363e-05, "loss": 0.0016, "mean_token_accuracy": 0.9998533010482789, "num_tokens": 119369936.0, "step": 37080 }, { "entropy": 0.05757056567817927, "epoch": 8.644597272409372, "grad_norm": 0.44921875, "learning_rate": 4.710425292492362e-05, "loss": 0.0038, "mean_token_accuracy": 0.9993304073810577, "num_tokens": 119385045.0, "step": 37085 }, { "entropy": 0.07393027395009995, "epoch": 8.645762909430003, "grad_norm": 2.21875, "learning_rate": 4.710327588842101e-05, "loss": 0.0058, "mean_token_accuracy": 0.9989072501659393, "num_tokens": 119403772.0, "step": 37090 }, { "entropy": 0.06416831258684397, "epoch": 8.646928546450635, "grad_norm": 0.06640625, "learning_rate": 4.7102298708719114e-05, "loss": 0.003, "mean_token_accuracy": 0.9994474589824677, "num_tokens": 119425992.0, "step": 37095 }, { "entropy": 0.05758734289556742, "epoch": 8.648094183471267, "grad_norm": 0.310546875, "learning_rate": 4.7101321385832506e-05, "loss": 0.0027, "mean_token_accuracy": 0.9988933801651001, "num_tokens": 119458267.0, "step": 37100 }, { "entropy": 0.059611622150987385, "epoch": 8.6492598204919, "grad_norm": 1.0078125, "learning_rate": 4.7100343919775755e-05, "loss": 0.0038, "mean_token_accuracy": 0.9986943900585175, "num_tokens": 119477394.0, "step": 37105 }, { "entropy": 0.05624700449407101, "epoch": 8.650425457512531, "grad_norm": 0.357421875, "learning_rate": 4.7099366310563436e-05, "loss": 0.0035, "mean_token_accuracy": 0.9987177789211273, "num_tokens": 119494246.0, "step": 37110 }, { "entropy": 0.04718580381013453, "epoch": 8.651591094533162, "grad_norm": 0.34765625, "learning_rate": 4.7098388558210125e-05, "loss": 0.0026, "mean_token_accuracy": 0.9990927994251251, "num_tokens": 119537066.0, "step": 37115 }, { "entropy": 0.05400928994640708, "epoch": 8.652756731553794, "grad_norm": 0.466796875, "learning_rate": 4.7097410662730404e-05, "loss": 0.0031, "mean_token_accuracy": 0.9985824465751648, "num_tokens": 119557231.0, "step": 37120 }, { "entropy": 0.0821190900169313, "epoch": 8.653922368574426, "grad_norm": 2.21875, "learning_rate": 4.7096432624138856e-05, "loss": 0.004, "mean_token_accuracy": 0.9990611732006073, "num_tokens": 119579195.0, "step": 37125 }, { "entropy": 0.0718055371195078, "epoch": 8.655088005595058, "grad_norm": 1.546875, "learning_rate": 4.709545444245006e-05, "loss": 0.0074, "mean_token_accuracy": 0.9984009981155395, "num_tokens": 119596227.0, "step": 37130 }, { "entropy": 0.0754847377538681, "epoch": 8.65625364261569, "grad_norm": 1.6171875, "learning_rate": 4.7094476117678615e-05, "loss": 0.0091, "mean_token_accuracy": 0.9982321262359619, "num_tokens": 119605099.0, "step": 37135 }, { "entropy": 0.05754865976050496, "epoch": 8.657419279636322, "grad_norm": 0.11083984375, "learning_rate": 4.7093497649839094e-05, "loss": 0.0024, "mean_token_accuracy": 0.9994746029376984, "num_tokens": 119621905.0, "step": 37140 }, { "entropy": 0.07236482677981257, "epoch": 8.658584916656952, "grad_norm": 0.216796875, "learning_rate": 4.709251903894609e-05, "loss": 0.0084, "mean_token_accuracy": 0.9981110274791718, "num_tokens": 119634260.0, "step": 37145 }, { "entropy": 0.044534440897405145, "epoch": 8.659750553677585, "grad_norm": 1.015625, "learning_rate": 4.709154028501421e-05, "loss": 0.0029, "mean_token_accuracy": 0.9993156492710114, "num_tokens": 119671143.0, "step": 37150 }, { "entropy": 0.04072251198813319, "epoch": 8.660916190698217, "grad_norm": 1.453125, "learning_rate": 4.709056138805803e-05, "loss": 0.0041, "mean_token_accuracy": 0.9978930294513703, "num_tokens": 119708077.0, "step": 37155 }, { "entropy": 0.060729991924017665, "epoch": 8.662081827718849, "grad_norm": 0.38671875, "learning_rate": 4.7089582348092155e-05, "loss": 0.0028, "mean_token_accuracy": 0.9985782206058502, "num_tokens": 119725703.0, "step": 37160 }, { "entropy": 0.06289136074483395, "epoch": 8.66324746473948, "grad_norm": 1.0859375, "learning_rate": 4.7088603165131184e-05, "loss": 0.0058, "mean_token_accuracy": 0.9981587767601013, "num_tokens": 119735588.0, "step": 37165 }, { "entropy": 0.0584659150801599, "epoch": 8.664413101760111, "grad_norm": 0.62890625, "learning_rate": 4.7087623839189716e-05, "loss": 0.0017, "mean_token_accuracy": 0.9993330657482147, "num_tokens": 119757051.0, "step": 37170 }, { "entropy": 0.06662820726633072, "epoch": 8.665578738780743, "grad_norm": 1.953125, "learning_rate": 4.7086644370282355e-05, "loss": 0.003, "mean_token_accuracy": 0.9991186201572418, "num_tokens": 119768192.0, "step": 37175 }, { "entropy": 0.0852966820821166, "epoch": 8.666744375801375, "grad_norm": 0.431640625, "learning_rate": 4.708566475842371e-05, "loss": 0.0515, "mean_token_accuracy": 0.9915950953960418, "num_tokens": 119789693.0, "step": 37180 }, { "entropy": 0.05510229915380478, "epoch": 8.667910012822007, "grad_norm": 0.322265625, "learning_rate": 4.708468500362839e-05, "loss": 0.0106, "mean_token_accuracy": 0.9975094437599182, "num_tokens": 119820800.0, "step": 37185 }, { "entropy": 0.05542775820940733, "epoch": 8.66907564984264, "grad_norm": 0.30078125, "learning_rate": 4.708370510591099e-05, "loss": 0.0019, "mean_token_accuracy": 0.9986540079116821, "num_tokens": 119839424.0, "step": 37190 }, { "entropy": 0.05367962615564466, "epoch": 8.67024128686327, "grad_norm": 0.91015625, "learning_rate": 4.7082725065286146e-05, "loss": 0.005, "mean_token_accuracy": 0.9973668694496155, "num_tokens": 119854949.0, "step": 37195 }, { "entropy": 0.0600742656737566, "epoch": 8.671406923883902, "grad_norm": 0.859375, "learning_rate": 4.708174488176845e-05, "loss": 0.0033, "mean_token_accuracy": 0.999224054813385, "num_tokens": 119870736.0, "step": 37200 }, { "entropy": 0.054541374929249285, "epoch": 8.672572560904534, "grad_norm": 2.34375, "learning_rate": 4.708076455537253e-05, "loss": 0.0102, "mean_token_accuracy": 0.9980546355247497, "num_tokens": 119889971.0, "step": 37205 }, { "entropy": 0.05923604611307383, "epoch": 8.673738197925166, "grad_norm": 0.58984375, "learning_rate": 4.707978408611299e-05, "loss": 0.0019, "mean_token_accuracy": 0.9991122007369995, "num_tokens": 119905624.0, "step": 37210 }, { "entropy": 0.04516512975096702, "epoch": 8.674903834945798, "grad_norm": 0.341796875, "learning_rate": 4.707880347400447e-05, "loss": 0.0028, "mean_token_accuracy": 0.9992085576057435, "num_tokens": 119939204.0, "step": 37215 }, { "entropy": 0.07773965373635291, "epoch": 8.67606947196643, "grad_norm": 1.5546875, "learning_rate": 4.707782271906158e-05, "loss": 0.0109, "mean_token_accuracy": 0.9983750343322754, "num_tokens": 119949517.0, "step": 37220 }, { "entropy": 0.054637028463184835, "epoch": 8.67723510898706, "grad_norm": 1.296875, "learning_rate": 4.707684182129894e-05, "loss": 0.0027, "mean_token_accuracy": 0.9994326770305634, "num_tokens": 119971182.0, "step": 37225 }, { "entropy": 0.047064932715147736, "epoch": 8.678400746007693, "grad_norm": 0.22265625, "learning_rate": 4.70758607807312e-05, "loss": 0.0063, "mean_token_accuracy": 0.99907346367836, "num_tokens": 119996422.0, "step": 37230 }, { "entropy": 0.07008444052189589, "epoch": 8.679566383028325, "grad_norm": 0.49609375, "learning_rate": 4.707487959737296e-05, "loss": 0.0023, "mean_token_accuracy": 0.9998964786529541, "num_tokens": 120006560.0, "step": 37235 }, { "entropy": 0.04979623667895794, "epoch": 8.680732020048957, "grad_norm": 1.015625, "learning_rate": 4.707389827123887e-05, "loss": 0.0045, "mean_token_accuracy": 0.999424010515213, "num_tokens": 120033004.0, "step": 37240 }, { "entropy": 0.04261988895013928, "epoch": 8.68189765706959, "grad_norm": 0.251953125, "learning_rate": 4.707291680234356e-05, "loss": 0.0051, "mean_token_accuracy": 0.998600310087204, "num_tokens": 120065940.0, "step": 37245 }, { "entropy": 0.058392337942495945, "epoch": 8.68306329409022, "grad_norm": 0.07958984375, "learning_rate": 4.7071935190701657e-05, "loss": 0.0021, "mean_token_accuracy": 0.9992400527000427, "num_tokens": 120088547.0, "step": 37250 }, { "entropy": 0.0629306823015213, "epoch": 8.684228931110852, "grad_norm": 0.035888671875, "learning_rate": 4.70709534363278e-05, "loss": 0.0031, "mean_token_accuracy": 0.9986060202121735, "num_tokens": 120101167.0, "step": 37255 }, { "entropy": 0.06437505278736352, "epoch": 8.685394568131484, "grad_norm": 1.0234375, "learning_rate": 4.706997153923663e-05, "loss": 0.0054, "mean_token_accuracy": 0.9984506249427796, "num_tokens": 120113468.0, "step": 37260 }, { "entropy": 0.06895528947934508, "epoch": 8.686560205152116, "grad_norm": 0.2890625, "learning_rate": 4.70689894994428e-05, "loss": 0.004, "mean_token_accuracy": 0.9981598138809205, "num_tokens": 120132392.0, "step": 37265 }, { "entropy": 0.07193691097199917, "epoch": 8.687725842172748, "grad_norm": 0.984375, "learning_rate": 4.706800731696094e-05, "loss": 0.0024, "mean_token_accuracy": 0.9991972386837006, "num_tokens": 120144937.0, "step": 37270 }, { "entropy": 0.06414989028126002, "epoch": 8.68889147919338, "grad_norm": 1.0390625, "learning_rate": 4.70670249918057e-05, "loss": 0.0035, "mean_token_accuracy": 0.9996701180934906, "num_tokens": 120154910.0, "step": 37275 }, { "entropy": 0.04639833634719252, "epoch": 8.69005711621401, "grad_norm": 0.1298828125, "learning_rate": 4.7066042523991726e-05, "loss": 0.0032, "mean_token_accuracy": 0.9986519098281861, "num_tokens": 120189822.0, "step": 37280 }, { "entropy": 0.05604259353131056, "epoch": 8.691222753234642, "grad_norm": 0.1318359375, "learning_rate": 4.706505991353367e-05, "loss": 0.0073, "mean_token_accuracy": 0.9979808628559113, "num_tokens": 120210678.0, "step": 37285 }, { "entropy": 0.06102119609713554, "epoch": 8.692388390255275, "grad_norm": 0.66015625, "learning_rate": 4.7064077160446186e-05, "loss": 0.0021, "mean_token_accuracy": 0.9990385472774506, "num_tokens": 120226506.0, "step": 37290 }, { "entropy": 0.06351532833650708, "epoch": 8.693554027275907, "grad_norm": 0.462890625, "learning_rate": 4.7063094264743926e-05, "loss": 0.0045, "mean_token_accuracy": 0.9987837851047516, "num_tokens": 120247799.0, "step": 37295 }, { "entropy": 0.061118031945079566, "epoch": 8.694719664296539, "grad_norm": 1.234375, "learning_rate": 4.706211122644155e-05, "loss": 0.0032, "mean_token_accuracy": 0.9995421290397644, "num_tokens": 120271327.0, "step": 37300 }, { "entropy": 0.05696291178464889, "epoch": 8.695885301317169, "grad_norm": 0.50390625, "learning_rate": 4.70611280455537e-05, "loss": 0.0045, "mean_token_accuracy": 0.9981352150440216, "num_tokens": 120285859.0, "step": 37305 }, { "entropy": 0.05032131155021489, "epoch": 8.697050938337801, "grad_norm": 0.3671875, "learning_rate": 4.706014472209506e-05, "loss": 0.0054, "mean_token_accuracy": 0.999362564086914, "num_tokens": 120315442.0, "step": 37310 }, { "entropy": 0.0457731731235981, "epoch": 8.698216575358433, "grad_norm": 1.109375, "learning_rate": 4.7059161256080284e-05, "loss": 0.0037, "mean_token_accuracy": 0.9987881302833557, "num_tokens": 120341149.0, "step": 37315 }, { "entropy": 0.06630498059093952, "epoch": 8.699382212379065, "grad_norm": 1.046875, "learning_rate": 4.705817764752404e-05, "loss": 0.0032, "mean_token_accuracy": 0.9983902394771575, "num_tokens": 120360094.0, "step": 37320 }, { "entropy": 0.052767217718064786, "epoch": 8.700547849399697, "grad_norm": 0.201171875, "learning_rate": 4.7057193896440984e-05, "loss": 0.0018, "mean_token_accuracy": 0.9992530107498169, "num_tokens": 120405568.0, "step": 37325 }, { "entropy": 0.07418045364320278, "epoch": 8.701713486420328, "grad_norm": 2.140625, "learning_rate": 4.705621000284579e-05, "loss": 0.004, "mean_token_accuracy": 0.9987151980400085, "num_tokens": 120415566.0, "step": 37330 }, { "entropy": 0.06375032095238567, "epoch": 8.70287912344096, "grad_norm": 0.7265625, "learning_rate": 4.705522596675314e-05, "loss": 0.0021, "mean_token_accuracy": 0.9989543080329895, "num_tokens": 120431073.0, "step": 37335 }, { "entropy": 0.060994432866573335, "epoch": 8.704044760461592, "grad_norm": 0.28125, "learning_rate": 4.705424178817769e-05, "loss": 0.0029, "mean_token_accuracy": 0.9991658449172973, "num_tokens": 120447912.0, "step": 37340 }, { "entropy": 0.05534893814474344, "epoch": 8.705210397482224, "grad_norm": 0.7734375, "learning_rate": 4.7053257467134125e-05, "loss": 0.0049, "mean_token_accuracy": 0.9992211759090424, "num_tokens": 120480904.0, "step": 37345 }, { "entropy": 0.07157540544867516, "epoch": 8.706376034502856, "grad_norm": 0.64453125, "learning_rate": 4.705227300363713e-05, "loss": 0.0018, "mean_token_accuracy": 0.9997665584087372, "num_tokens": 120500577.0, "step": 37350 }, { "entropy": 0.07752133421599865, "epoch": 8.707541671523488, "grad_norm": 0.4375, "learning_rate": 4.705128839770137e-05, "loss": 0.0075, "mean_token_accuracy": 0.9980410099029541, "num_tokens": 120513824.0, "step": 37355 }, { "entropy": 0.0577980768866837, "epoch": 8.708707308544119, "grad_norm": 0.52734375, "learning_rate": 4.705030364934154e-05, "loss": 0.0051, "mean_token_accuracy": 0.9989452242851258, "num_tokens": 120532064.0, "step": 37360 }, { "entropy": 0.06213938985019922, "epoch": 8.70987294556475, "grad_norm": 2.078125, "learning_rate": 4.7049318758572316e-05, "loss": 0.0051, "mean_token_accuracy": 0.9985716700553894, "num_tokens": 120555377.0, "step": 37365 }, { "entropy": 0.04146665600128472, "epoch": 8.711038582585383, "grad_norm": 0.267578125, "learning_rate": 4.7048333725408386e-05, "loss": 0.0022, "mean_token_accuracy": 0.9992313742637634, "num_tokens": 120588649.0, "step": 37370 }, { "entropy": 0.08321618214249611, "epoch": 8.712204219606015, "grad_norm": 0.234375, "learning_rate": 4.704734854986443e-05, "loss": 0.0041, "mean_token_accuracy": 0.9995436012744904, "num_tokens": 120600085.0, "step": 37375 }, { "entropy": 0.045752164581790565, "epoch": 8.713369856626647, "grad_norm": 0.3203125, "learning_rate": 4.704636323195516e-05, "loss": 0.0018, "mean_token_accuracy": 0.9973626852035522, "num_tokens": 120634174.0, "step": 37380 }, { "entropy": 0.05586595851927996, "epoch": 8.714535493647277, "grad_norm": 0.2109375, "learning_rate": 4.7045377771695254e-05, "loss": 0.0034, "mean_token_accuracy": 0.9989406883716583, "num_tokens": 120648975.0, "step": 37385 }, { "entropy": 0.07887803390622139, "epoch": 8.71570113066791, "grad_norm": 0.59765625, "learning_rate": 4.7044392169099406e-05, "loss": 0.0022, "mean_token_accuracy": 0.9995689630508423, "num_tokens": 120662651.0, "step": 37390 }, { "entropy": 0.04403102770447731, "epoch": 8.716866767688542, "grad_norm": 0.451171875, "learning_rate": 4.704340642418231e-05, "loss": 0.0058, "mean_token_accuracy": 0.9982415914535523, "num_tokens": 120697029.0, "step": 37395 }, { "entropy": 0.07757324762642384, "epoch": 8.718032404709174, "grad_norm": 0.15234375, "learning_rate": 4.704242053695868e-05, "loss": 0.0035, "mean_token_accuracy": 0.9992213308811188, "num_tokens": 120706590.0, "step": 37400 }, { "entropy": 0.048875637259334324, "epoch": 8.719198041729806, "grad_norm": 1.6796875, "learning_rate": 4.7041434507443195e-05, "loss": 0.0057, "mean_token_accuracy": 0.9986173272132873, "num_tokens": 120730767.0, "step": 37405 }, { "entropy": 0.07898346781730652, "epoch": 8.720363678750438, "grad_norm": 1.125, "learning_rate": 4.704044833565058e-05, "loss": 0.0049, "mean_token_accuracy": 0.9988845109939575, "num_tokens": 120742874.0, "step": 37410 }, { "entropy": 0.06409270605072379, "epoch": 8.721529315771068, "grad_norm": 0.031005859375, "learning_rate": 4.7039462021595524e-05, "loss": 0.0079, "mean_token_accuracy": 0.9972718060016632, "num_tokens": 120765821.0, "step": 37415 }, { "entropy": 0.064057532325387, "epoch": 8.7226949527917, "grad_norm": 0.7890625, "learning_rate": 4.703847556529275e-05, "loss": 0.0086, "mean_token_accuracy": 0.9980102241039276, "num_tokens": 120775932.0, "step": 37420 }, { "entropy": 0.052418787498027086, "epoch": 8.723860589812332, "grad_norm": 0.5234375, "learning_rate": 4.7037488966756947e-05, "loss": 0.0021, "mean_token_accuracy": 0.9994726002216339, "num_tokens": 120812212.0, "step": 37425 }, { "entropy": 0.07726996615529061, "epoch": 8.725026226832965, "grad_norm": 0.78515625, "learning_rate": 4.7036502226002846e-05, "loss": 0.0135, "mean_token_accuracy": 0.9986808180809021, "num_tokens": 120835862.0, "step": 37430 }, { "entropy": 0.0491664957255125, "epoch": 8.726191863853597, "grad_norm": 0.40234375, "learning_rate": 4.7035515343045154e-05, "loss": 0.006, "mean_token_accuracy": 0.9971634745597839, "num_tokens": 120849769.0, "step": 37435 }, { "entropy": 0.07634511133655905, "epoch": 8.727357500874227, "grad_norm": 0.61328125, "learning_rate": 4.703452831789858e-05, "loss": 0.0026, "mean_token_accuracy": 0.9992483615875244, "num_tokens": 120871055.0, "step": 37440 }, { "entropy": 0.062133604381233457, "epoch": 8.72852313789486, "grad_norm": 0.265625, "learning_rate": 4.7033541150577855e-05, "loss": 0.0057, "mean_token_accuracy": 0.9979267299175263, "num_tokens": 120896972.0, "step": 37445 }, { "entropy": 0.050842320267111066, "epoch": 8.729688774915491, "grad_norm": 0.87890625, "learning_rate": 4.7032553841097685e-05, "loss": 0.0022, "mean_token_accuracy": 0.9996079862117767, "num_tokens": 120920337.0, "step": 37450 }, { "entropy": 0.06864991504698992, "epoch": 8.730854411936123, "grad_norm": 0.21875, "learning_rate": 4.703156638947281e-05, "loss": 0.0046, "mean_token_accuracy": 0.9985345125198364, "num_tokens": 120929393.0, "step": 37455 }, { "entropy": 0.06609613662585616, "epoch": 8.732020048956755, "grad_norm": 1.546875, "learning_rate": 4.703057879571793e-05, "loss": 0.0021, "mean_token_accuracy": 0.9994767427444458, "num_tokens": 120942647.0, "step": 37460 }, { "entropy": 0.08063123216852545, "epoch": 8.733185685977386, "grad_norm": 0.44921875, "learning_rate": 4.70295910598478e-05, "loss": 0.0028, "mean_token_accuracy": 0.9994502007961273, "num_tokens": 120957571.0, "step": 37465 }, { "entropy": 0.04949803929775953, "epoch": 8.734351322998018, "grad_norm": 0.365234375, "learning_rate": 4.7028603181877124e-05, "loss": 0.0014, "mean_token_accuracy": 0.999658590555191, "num_tokens": 120988159.0, "step": 37470 }, { "entropy": 0.0553031301125884, "epoch": 8.73551696001865, "grad_norm": 0.30859375, "learning_rate": 4.702761516182065e-05, "loss": 0.0032, "mean_token_accuracy": 0.9988710165023804, "num_tokens": 121009241.0, "step": 37475 }, { "entropy": 0.05864779045805335, "epoch": 8.736682597039282, "grad_norm": 0.7265625, "learning_rate": 4.70266269996931e-05, "loss": 0.0079, "mean_token_accuracy": 0.9978500843048096, "num_tokens": 121026722.0, "step": 37480 }, { "entropy": 0.07253898419439793, "epoch": 8.737848234059914, "grad_norm": 1.90625, "learning_rate": 4.7025638695509205e-05, "loss": 0.0047, "mean_token_accuracy": 0.9983980894088745, "num_tokens": 121042962.0, "step": 37485 }, { "entropy": 0.051535771181806925, "epoch": 8.739013871080546, "grad_norm": 0.1982421875, "learning_rate": 4.702465024928372e-05, "loss": 0.0018, "mean_token_accuracy": 0.9981653451919555, "num_tokens": 121080611.0, "step": 37490 }, { "entropy": 0.05589711694046855, "epoch": 8.740179508101177, "grad_norm": 0.51953125, "learning_rate": 4.702366166103137e-05, "loss": 0.0021, "mean_token_accuracy": 0.9993067443370819, "num_tokens": 121105123.0, "step": 37495 }, { "entropy": 0.05385690657421947, "epoch": 8.741345145121809, "grad_norm": 0.412109375, "learning_rate": 4.70226729307669e-05, "loss": 0.0039, "mean_token_accuracy": 0.999626749753952, "num_tokens": 121129332.0, "step": 37500 }, { "entropy": 0.062184521462768316, "epoch": 8.74251078214244, "grad_norm": 0.322265625, "learning_rate": 4.7021684058505054e-05, "loss": 0.0028, "mean_token_accuracy": 0.9991961359977722, "num_tokens": 121145454.0, "step": 37505 }, { "entropy": 0.060806962009519336, "epoch": 8.743676419163073, "grad_norm": 0.1689453125, "learning_rate": 4.702069504426058e-05, "loss": 0.012, "mean_token_accuracy": 0.9983984470367432, "num_tokens": 121164100.0, "step": 37510 }, { "entropy": 0.05776356812566519, "epoch": 8.744842056183705, "grad_norm": 2.734375, "learning_rate": 4.7019705888048214e-05, "loss": 0.0055, "mean_token_accuracy": 0.998740166425705, "num_tokens": 121178031.0, "step": 37515 }, { "entropy": 0.07295375410467386, "epoch": 8.746007693204335, "grad_norm": 0.349609375, "learning_rate": 4.7018716589882724e-05, "loss": 0.0026, "mean_token_accuracy": 0.9994298398494721, "num_tokens": 121195358.0, "step": 37520 }, { "entropy": 0.05422674883157015, "epoch": 8.747173330224967, "grad_norm": 2.109375, "learning_rate": 4.701772714977885e-05, "loss": 0.0035, "mean_token_accuracy": 0.9994776785373688, "num_tokens": 121231081.0, "step": 37525 }, { "entropy": 0.06482381774112582, "epoch": 8.7483389672456, "grad_norm": 0.24609375, "learning_rate": 4.7016737567751346e-05, "loss": 0.0011, "mean_token_accuracy": 0.9999204099178314, "num_tokens": 121257291.0, "step": 37530 }, { "entropy": 0.05208018375560641, "epoch": 8.749504604266232, "grad_norm": 0.5078125, "learning_rate": 4.7015747843814974e-05, "loss": 0.0028, "mean_token_accuracy": 0.9984355688095092, "num_tokens": 121284555.0, "step": 37535 }, { "entropy": 0.046364251803606746, "epoch": 8.750670241286864, "grad_norm": 0.453125, "learning_rate": 4.701475797798449e-05, "loss": 0.0027, "mean_token_accuracy": 0.9997131705284119, "num_tokens": 121319114.0, "step": 37540 }, { "entropy": 0.04175833626650274, "epoch": 8.751835878307496, "grad_norm": 0.37890625, "learning_rate": 4.701376797027465e-05, "loss": 0.0099, "mean_token_accuracy": 0.9990463435649872, "num_tokens": 121359675.0, "step": 37545 }, { "entropy": 0.061986826453357936, "epoch": 8.753001515328126, "grad_norm": 3.03125, "learning_rate": 4.7012777820700226e-05, "loss": 0.0096, "mean_token_accuracy": 0.9976721167564392, "num_tokens": 121371584.0, "step": 37550 }, { "entropy": 0.046063092350959775, "epoch": 8.754167152348758, "grad_norm": 0.3671875, "learning_rate": 4.701178752927598e-05, "loss": 0.0057, "mean_token_accuracy": 0.9985677540302277, "num_tokens": 121412011.0, "step": 37555 }, { "entropy": 0.08060178887099027, "epoch": 8.75533278936939, "grad_norm": 2.046875, "learning_rate": 4.701079709601666e-05, "loss": 0.0342, "mean_token_accuracy": 0.9890652418136596, "num_tokens": 121452091.0, "step": 37560 }, { "entropy": 0.06487112455070018, "epoch": 8.756498426390023, "grad_norm": 2.078125, "learning_rate": 4.700980652093706e-05, "loss": 0.0066, "mean_token_accuracy": 0.9989998698234558, "num_tokens": 121463790.0, "step": 37565 }, { "entropy": 0.07172239758074284, "epoch": 8.757664063410655, "grad_norm": 2.6875, "learning_rate": 4.700881580405194e-05, "loss": 0.0076, "mean_token_accuracy": 0.9977281510829925, "num_tokens": 121477631.0, "step": 37570 }, { "entropy": 0.05877719409763813, "epoch": 8.758829700431285, "grad_norm": 0.345703125, "learning_rate": 4.7007824945376074e-05, "loss": 0.0019, "mean_token_accuracy": 0.9990646779537201, "num_tokens": 121502532.0, "step": 37575 }, { "entropy": 0.041250808723270894, "epoch": 8.759995337451917, "grad_norm": 0.34375, "learning_rate": 4.7006833944924236e-05, "loss": 0.0015, "mean_token_accuracy": 0.9991530656814576, "num_tokens": 121539955.0, "step": 37580 }, { "entropy": 0.06379560967907309, "epoch": 8.76116097447255, "grad_norm": 0.546875, "learning_rate": 4.70058428027112e-05, "loss": 0.0024, "mean_token_accuracy": 0.9996963918209076, "num_tokens": 121558100.0, "step": 37585 }, { "entropy": 0.06465062042698264, "epoch": 8.762326611493181, "grad_norm": 1.4921875, "learning_rate": 4.700485151875176e-05, "loss": 0.0015, "mean_token_accuracy": 0.99923455119133, "num_tokens": 121583763.0, "step": 37590 }, { "entropy": 0.06468153018504381, "epoch": 8.763492248513813, "grad_norm": 0.4296875, "learning_rate": 4.700386009306069e-05, "loss": 0.0024, "mean_token_accuracy": 0.9996022522449494, "num_tokens": 121604954.0, "step": 37595 }, { "entropy": 0.06871114894747735, "epoch": 8.764657885534444, "grad_norm": 0.98828125, "learning_rate": 4.700286852565276e-05, "loss": 0.01, "mean_token_accuracy": 0.997947508096695, "num_tokens": 121614592.0, "step": 37600 }, { "entropy": 0.04772069398313761, "epoch": 8.765823522555076, "grad_norm": 0.345703125, "learning_rate": 4.700187681654277e-05, "loss": 0.0042, "mean_token_accuracy": 0.9992965459823608, "num_tokens": 121638138.0, "step": 37605 }, { "entropy": 0.06294498881325125, "epoch": 8.766989159575708, "grad_norm": 0.400390625, "learning_rate": 4.700088496574551e-05, "loss": 0.0037, "mean_token_accuracy": 0.9988208770751953, "num_tokens": 121656429.0, "step": 37610 }, { "entropy": 0.07124684005975723, "epoch": 8.76815479659634, "grad_norm": 0.158203125, "learning_rate": 4.699989297327577e-05, "loss": 0.002, "mean_token_accuracy": 0.999472838640213, "num_tokens": 121667496.0, "step": 37615 }, { "entropy": 0.049650049302726984, "epoch": 8.769320433616972, "grad_norm": 0.181640625, "learning_rate": 4.6998900839148326e-05, "loss": 0.0029, "mean_token_accuracy": 0.9990351021289825, "num_tokens": 121693895.0, "step": 37620 }, { "entropy": 0.04391482761129737, "epoch": 8.770486070637604, "grad_norm": 0.12890625, "learning_rate": 4.6997908563377986e-05, "loss": 0.0052, "mean_token_accuracy": 0.9973374605178833, "num_tokens": 121741181.0, "step": 37625 }, { "entropy": 0.06794363204389811, "epoch": 8.771651707658235, "grad_norm": 0.96875, "learning_rate": 4.699691614597955e-05, "loss": 0.0046, "mean_token_accuracy": 0.9987189173698425, "num_tokens": 121751435.0, "step": 37630 }, { "entropy": 0.06100701736286283, "epoch": 8.772817344678867, "grad_norm": 0.259765625, "learning_rate": 4.6995923586967796e-05, "loss": 0.0023, "mean_token_accuracy": 0.9995702564716339, "num_tokens": 121772227.0, "step": 37635 }, { "entropy": 0.06428485959768296, "epoch": 8.773982981699499, "grad_norm": 3.421875, "learning_rate": 4.699493088635755e-05, "loss": 0.0101, "mean_token_accuracy": 0.9976272761821747, "num_tokens": 121793079.0, "step": 37640 }, { "entropy": 0.07052382789552211, "epoch": 8.77514861872013, "grad_norm": 1.53125, "learning_rate": 4.69939380441636e-05, "loss": 0.0048, "mean_token_accuracy": 0.9983150899410248, "num_tokens": 121803848.0, "step": 37645 }, { "entropy": 0.0693971360102296, "epoch": 8.776314255740763, "grad_norm": 1.34375, "learning_rate": 4.699294506040076e-05, "loss": 0.0052, "mean_token_accuracy": 0.9960627436637879, "num_tokens": 121818194.0, "step": 37650 }, { "entropy": 0.09031069064512849, "epoch": 8.777479892761393, "grad_norm": 1.2734375, "learning_rate": 4.6991951935083824e-05, "loss": 0.0029, "mean_token_accuracy": 0.9992822110652924, "num_tokens": 121833034.0, "step": 37655 }, { "entropy": 0.0589916012249887, "epoch": 8.778645529782025, "grad_norm": 2.390625, "learning_rate": 4.6990958668227615e-05, "loss": 0.0034, "mean_token_accuracy": 0.9991851389408112, "num_tokens": 121847066.0, "step": 37660 }, { "entropy": 0.06416790802031755, "epoch": 8.779811166802657, "grad_norm": 1.71875, "learning_rate": 4.6989965259846926e-05, "loss": 0.004, "mean_token_accuracy": 0.9991792678833008, "num_tokens": 121857992.0, "step": 37665 }, { "entropy": 0.055831207521259786, "epoch": 8.78097680382329, "grad_norm": 0.15625, "learning_rate": 4.698897170995658e-05, "loss": 0.0209, "mean_token_accuracy": 0.9955951690673828, "num_tokens": 121904833.0, "step": 37670 }, { "entropy": 0.0680293409153819, "epoch": 8.782142440843922, "grad_norm": 0.890625, "learning_rate": 4.698797801857141e-05, "loss": 0.0091, "mean_token_accuracy": 0.9971361041069031, "num_tokens": 121914517.0, "step": 37675 }, { "entropy": 0.04976560343056917, "epoch": 8.783308077864554, "grad_norm": 0.70703125, "learning_rate": 4.6986984185706206e-05, "loss": 0.002, "mean_token_accuracy": 0.9991954684257507, "num_tokens": 121934035.0, "step": 37680 }, { "entropy": 0.08044440317898989, "epoch": 8.784473714885184, "grad_norm": 0.193359375, "learning_rate": 4.6985990211375805e-05, "loss": 0.0332, "mean_token_accuracy": 0.9900187373161315, "num_tokens": 121958662.0, "step": 37685 }, { "entropy": 0.06931205466389656, "epoch": 8.785639351905816, "grad_norm": 0.2158203125, "learning_rate": 4.6984996095595014e-05, "loss": 0.0025, "mean_token_accuracy": 0.9993624806404113, "num_tokens": 121972128.0, "step": 37690 }, { "entropy": 0.04132307339459658, "epoch": 8.786804988926448, "grad_norm": 0.2294921875, "learning_rate": 4.698400183837867e-05, "loss": 0.0022, "mean_token_accuracy": 0.9990397334098816, "num_tokens": 122011044.0, "step": 37695 }, { "entropy": 0.061664972454309464, "epoch": 8.78797062594708, "grad_norm": 0.69921875, "learning_rate": 4.6983007439741586e-05, "loss": 0.0026, "mean_token_accuracy": 0.9992005169391632, "num_tokens": 122021209.0, "step": 37700 }, { "entropy": 0.05934088248759508, "epoch": 8.789136262967713, "grad_norm": 1.859375, "learning_rate": 4.698201289969861e-05, "loss": 0.0056, "mean_token_accuracy": 0.9981985509395599, "num_tokens": 122052417.0, "step": 37705 }, { "entropy": 0.05349223613739014, "epoch": 8.790301899988343, "grad_norm": 1.0078125, "learning_rate": 4.698101821826455e-05, "loss": 0.0025, "mean_token_accuracy": 0.9989639759063721, "num_tokens": 122074536.0, "step": 37710 }, { "entropy": 0.0589401114732027, "epoch": 8.791467537008975, "grad_norm": 1.3828125, "learning_rate": 4.6980023395454256e-05, "loss": 0.0045, "mean_token_accuracy": 0.9988614022731781, "num_tokens": 122089899.0, "step": 37715 }, { "entropy": 0.06001086411997676, "epoch": 8.792633174029607, "grad_norm": 2.296875, "learning_rate": 4.697902843128255e-05, "loss": 0.0044, "mean_token_accuracy": 0.9989833116531373, "num_tokens": 122112483.0, "step": 37720 }, { "entropy": 0.05685397181659937, "epoch": 8.79379881105024, "grad_norm": 0.197265625, "learning_rate": 4.697803332576428e-05, "loss": 0.0119, "mean_token_accuracy": 0.9974867701530457, "num_tokens": 122126367.0, "step": 37725 }, { "entropy": 0.07201154017820954, "epoch": 8.794964448070871, "grad_norm": 0.73046875, "learning_rate": 4.697703807891426e-05, "loss": 0.003, "mean_token_accuracy": 0.999059921503067, "num_tokens": 122145494.0, "step": 37730 }, { "entropy": 0.047375439945608375, "epoch": 8.796130085091502, "grad_norm": 0.1416015625, "learning_rate": 4.6976042690747366e-05, "loss": 0.0011, "mean_token_accuracy": 0.9999362409114838, "num_tokens": 122165067.0, "step": 37735 }, { "entropy": 0.07775508984923363, "epoch": 8.797295722112134, "grad_norm": 0.36328125, "learning_rate": 4.697504716127842e-05, "loss": 0.0054, "mean_token_accuracy": 0.9980196058750153, "num_tokens": 122175557.0, "step": 37740 }, { "entropy": 0.066469413228333, "epoch": 8.798461359132766, "grad_norm": 0.10693359375, "learning_rate": 4.6974051490522256e-05, "loss": 0.003, "mean_token_accuracy": 0.9987132906913757, "num_tokens": 122188226.0, "step": 37745 }, { "entropy": 0.061857113242149354, "epoch": 8.799626996153398, "grad_norm": 0.99609375, "learning_rate": 4.697305567849375e-05, "loss": 0.0046, "mean_token_accuracy": 0.9985567629337311, "num_tokens": 122198763.0, "step": 37750 }, { "entropy": 0.07967776516452432, "epoch": 8.80079263317403, "grad_norm": 1.59375, "learning_rate": 4.697205972520773e-05, "loss": 0.0049, "mean_token_accuracy": 0.9984486401081085, "num_tokens": 122216833.0, "step": 37755 }, { "entropy": 0.05474145282059908, "epoch": 8.801958270194662, "grad_norm": 1.890625, "learning_rate": 4.697106363067905e-05, "loss": 0.0048, "mean_token_accuracy": 0.9994183659553528, "num_tokens": 122235649.0, "step": 37760 }, { "entropy": 0.0869576308876276, "epoch": 8.803123907215292, "grad_norm": 0.3828125, "learning_rate": 4.697006739492257e-05, "loss": 0.0037, "mean_token_accuracy": 0.9988507032394409, "num_tokens": 122249504.0, "step": 37765 }, { "entropy": 0.05836290316656232, "epoch": 8.804289544235925, "grad_norm": 0.328125, "learning_rate": 4.696907101795314e-05, "loss": 0.0029, "mean_token_accuracy": 0.998987078666687, "num_tokens": 122269940.0, "step": 37770 }, { "entropy": 0.06279167141765356, "epoch": 8.805455181256557, "grad_norm": 2.15625, "learning_rate": 4.6968074499785615e-05, "loss": 0.0063, "mean_token_accuracy": 0.9981760680675507, "num_tokens": 122287840.0, "step": 37775 }, { "entropy": 0.07444556690752506, "epoch": 8.806620818277189, "grad_norm": 0.921875, "learning_rate": 4.696707784043486e-05, "loss": 0.0052, "mean_token_accuracy": 0.9971623122692108, "num_tokens": 122296145.0, "step": 37780 }, { "entropy": 0.052605046331882475, "epoch": 8.807786455297821, "grad_norm": 0.81640625, "learning_rate": 4.6966081039915735e-05, "loss": 0.0114, "mean_token_accuracy": 0.9969050943851471, "num_tokens": 122336505.0, "step": 37785 }, { "entropy": 0.0693147087469697, "epoch": 8.808952092318451, "grad_norm": 2.09375, "learning_rate": 4.696508409824311e-05, "loss": 0.006, "mean_token_accuracy": 0.9984557271003723, "num_tokens": 122349602.0, "step": 37790 }, { "entropy": 0.07108534360304475, "epoch": 8.810117729339083, "grad_norm": 2.5, "learning_rate": 4.696408701543184e-05, "loss": 0.0111, "mean_token_accuracy": 0.9988550305366516, "num_tokens": 122385144.0, "step": 37795 }, { "entropy": 0.07313933782279491, "epoch": 8.811283366359715, "grad_norm": 0.58984375, "learning_rate": 4.696308979149679e-05, "loss": 0.0089, "mean_token_accuracy": 0.9986230254173278, "num_tokens": 122400947.0, "step": 37800 }, { "entropy": 0.05324373729526997, "epoch": 8.812449003380348, "grad_norm": 0.255859375, "learning_rate": 4.696209242645285e-05, "loss": 0.0041, "mean_token_accuracy": 0.9986553013324737, "num_tokens": 122420296.0, "step": 37805 }, { "entropy": 0.06737412475049495, "epoch": 8.81361464040098, "grad_norm": 2.046875, "learning_rate": 4.696109492031488e-05, "loss": 0.0113, "mean_token_accuracy": 0.9984551131725311, "num_tokens": 122453040.0, "step": 37810 }, { "entropy": 0.05885831089690328, "epoch": 8.814780277421612, "grad_norm": 0.08935546875, "learning_rate": 4.696009727309775e-05, "loss": 0.0022, "mean_token_accuracy": 0.999238908290863, "num_tokens": 122478826.0, "step": 37815 }, { "entropy": 0.06913261096924543, "epoch": 8.815945914442242, "grad_norm": 0.234375, "learning_rate": 4.6959099484816336e-05, "loss": 0.0027, "mean_token_accuracy": 0.9983415126800537, "num_tokens": 122495715.0, "step": 37820 }, { "entropy": 0.05994777157902718, "epoch": 8.817111551462874, "grad_norm": 1.6015625, "learning_rate": 4.695810155548553e-05, "loss": 0.0171, "mean_token_accuracy": 0.9962732553482055, "num_tokens": 122521314.0, "step": 37825 }, { "entropy": 0.07015992011874914, "epoch": 8.818277188483506, "grad_norm": 0.25, "learning_rate": 4.6957103485120204e-05, "loss": 0.0028, "mean_token_accuracy": 0.999011218547821, "num_tokens": 122541171.0, "step": 37830 }, { "entropy": 0.06579969152808189, "epoch": 8.819442825504138, "grad_norm": 1.53125, "learning_rate": 4.6956105273735234e-05, "loss": 0.0086, "mean_token_accuracy": 0.998229706287384, "num_tokens": 122553154.0, "step": 37835 }, { "entropy": 0.07723803166300058, "epoch": 8.82060846252477, "grad_norm": 0.59375, "learning_rate": 4.6955106921345516e-05, "loss": 0.0032, "mean_token_accuracy": 0.9987864375114441, "num_tokens": 122575622.0, "step": 37840 }, { "entropy": 0.05942539321258664, "epoch": 8.8217740995454, "grad_norm": 0.75390625, "learning_rate": 4.695410842796594e-05, "loss": 0.0061, "mean_token_accuracy": 0.9983376860618591, "num_tokens": 122600863.0, "step": 37845 }, { "entropy": 0.04993642652407289, "epoch": 8.822939736566033, "grad_norm": 0.50390625, "learning_rate": 4.695310979361137e-05, "loss": 0.0061, "mean_token_accuracy": 0.9980162382125854, "num_tokens": 122624134.0, "step": 37850 }, { "entropy": 0.07489926908165216, "epoch": 8.824105373586665, "grad_norm": 4.21875, "learning_rate": 4.695211101829673e-05, "loss": 0.004, "mean_token_accuracy": 0.9985229194164276, "num_tokens": 122638432.0, "step": 37855 }, { "entropy": 0.06193410158157349, "epoch": 8.825271010607297, "grad_norm": 0.08837890625, "learning_rate": 4.695111210203689e-05, "loss": 0.006, "mean_token_accuracy": 0.9991264760494232, "num_tokens": 122655695.0, "step": 37860 }, { "entropy": 0.04280728902667761, "epoch": 8.82643664762793, "grad_norm": 0.70703125, "learning_rate": 4.6950113044846754e-05, "loss": 0.0026, "mean_token_accuracy": 0.9995523571968079, "num_tokens": 122675293.0, "step": 37865 }, { "entropy": 0.03909464376047254, "epoch": 8.82760228464856, "grad_norm": 0.2080078125, "learning_rate": 4.694911384674122e-05, "loss": 0.0025, "mean_token_accuracy": 0.9993949353694915, "num_tokens": 122703106.0, "step": 37870 }, { "entropy": 0.06501968447118997, "epoch": 8.828767921669192, "grad_norm": 0.26171875, "learning_rate": 4.694811450773519e-05, "loss": 0.0039, "mean_token_accuracy": 0.999257355928421, "num_tokens": 122718218.0, "step": 37875 }, { "entropy": 0.05192201929166913, "epoch": 8.829933558689824, "grad_norm": 1.28125, "learning_rate": 4.6947115027843556e-05, "loss": 0.0038, "mean_token_accuracy": 0.9991044104099274, "num_tokens": 122743671.0, "step": 37880 }, { "entropy": 0.049569260654971005, "epoch": 8.831099195710456, "grad_norm": 0.1220703125, "learning_rate": 4.694611540708123e-05, "loss": 0.002, "mean_token_accuracy": 0.9995582580566407, "num_tokens": 122767291.0, "step": 37885 }, { "entropy": 0.050462238024920225, "epoch": 8.832264832731088, "grad_norm": 1.15625, "learning_rate": 4.6945115645463114e-05, "loss": 0.0045, "mean_token_accuracy": 0.9993569672107696, "num_tokens": 122790047.0, "step": 37890 }, { "entropy": 0.05963395088911057, "epoch": 8.83343046975172, "grad_norm": 0.046875, "learning_rate": 4.694411574300412e-05, "loss": 0.0027, "mean_token_accuracy": 0.9995575189590454, "num_tokens": 122802120.0, "step": 37895 }, { "entropy": 0.0568438459187746, "epoch": 8.83459610677235, "grad_norm": 0.3359375, "learning_rate": 4.6943115699719155e-05, "loss": 0.0053, "mean_token_accuracy": 0.9985347628593445, "num_tokens": 122840584.0, "step": 37900 }, { "entropy": 0.08491461314260959, "epoch": 8.835761743792983, "grad_norm": 0.55078125, "learning_rate": 4.694211551562313e-05, "loss": 0.0051, "mean_token_accuracy": 0.9972289741039276, "num_tokens": 122853087.0, "step": 37905 }, { "entropy": 0.056212511658668515, "epoch": 8.836927380813615, "grad_norm": 0.076171875, "learning_rate": 4.694111519073096e-05, "loss": 0.0031, "mean_token_accuracy": 0.9995512783527374, "num_tokens": 122863495.0, "step": 37910 }, { "entropy": 0.05507175326347351, "epoch": 8.838093017834247, "grad_norm": 1.859375, "learning_rate": 4.6940114725057567e-05, "loss": 0.0029, "mean_token_accuracy": 0.9987462341785431, "num_tokens": 122880791.0, "step": 37915 }, { "entropy": 0.05416063591837883, "epoch": 8.839258654854879, "grad_norm": 1.4296875, "learning_rate": 4.693911411861786e-05, "loss": 0.0024, "mean_token_accuracy": 0.998863023519516, "num_tokens": 122899547.0, "step": 37920 }, { "entropy": 0.05549648702144623, "epoch": 8.84042429187551, "grad_norm": 1.3125, "learning_rate": 4.6938113371426766e-05, "loss": 0.0051, "mean_token_accuracy": 0.9996604323387146, "num_tokens": 122931372.0, "step": 37925 }, { "entropy": 0.03992574992589652, "epoch": 8.841589928896141, "grad_norm": 0.6171875, "learning_rate": 4.693711248349921e-05, "loss": 0.0018, "mean_token_accuracy": 0.9993171095848083, "num_tokens": 122956032.0, "step": 37930 }, { "entropy": 0.05518424436450005, "epoch": 8.842755565916773, "grad_norm": 2.578125, "learning_rate": 4.693611145485011e-05, "loss": 0.0037, "mean_token_accuracy": 0.9986071944236755, "num_tokens": 122967343.0, "step": 37935 }, { "entropy": 0.04317314065992832, "epoch": 8.843921202937405, "grad_norm": 0.349609375, "learning_rate": 4.693511028549439e-05, "loss": 0.006, "mean_token_accuracy": 0.998054838180542, "num_tokens": 122996226.0, "step": 37940 }, { "entropy": 0.05350646786391735, "epoch": 8.845086839958038, "grad_norm": 0.2353515625, "learning_rate": 4.693410897544699e-05, "loss": 0.0056, "mean_token_accuracy": 0.9985206425189972, "num_tokens": 123027698.0, "step": 37945 }, { "entropy": 0.05209854450076819, "epoch": 8.84625247697867, "grad_norm": 0.39453125, "learning_rate": 4.6933107524722835e-05, "loss": 0.0016, "mean_token_accuracy": 0.9998648226261139, "num_tokens": 123061500.0, "step": 37950 }, { "entropy": 0.050198511127382515, "epoch": 8.8474181139993, "grad_norm": 0.240234375, "learning_rate": 4.6932105933336854e-05, "loss": 0.0058, "mean_token_accuracy": 0.9992323398590088, "num_tokens": 123076939.0, "step": 37955 }, { "entropy": 0.06413730578497052, "epoch": 8.848583751019932, "grad_norm": 0.59765625, "learning_rate": 4.693110420130399e-05, "loss": 0.0013, "mean_token_accuracy": 0.9995835602283478, "num_tokens": 123093348.0, "step": 37960 }, { "entropy": 0.04941152567043901, "epoch": 8.849749388040564, "grad_norm": 0.416015625, "learning_rate": 4.693010232863918e-05, "loss": 0.0106, "mean_token_accuracy": 0.9988395273685455, "num_tokens": 123113623.0, "step": 37965 }, { "entropy": 0.06690901890397072, "epoch": 8.850915025061196, "grad_norm": 3.828125, "learning_rate": 4.6929100315357364e-05, "loss": 0.035, "mean_token_accuracy": 0.9943624317646027, "num_tokens": 123142491.0, "step": 37970 }, { "entropy": 0.06515284590423107, "epoch": 8.852080662081828, "grad_norm": 1.359375, "learning_rate": 4.692809816147347e-05, "loss": 0.0035, "mean_token_accuracy": 0.9986517250537872, "num_tokens": 123156097.0, "step": 37975 }, { "entropy": 0.045817646011710164, "epoch": 8.853246299102459, "grad_norm": 2.421875, "learning_rate": 4.692709586700246e-05, "loss": 0.0048, "mean_token_accuracy": 0.9982234835624695, "num_tokens": 123182011.0, "step": 37980 }, { "entropy": 0.05612170891836286, "epoch": 8.85441193612309, "grad_norm": 0.265625, "learning_rate": 4.692609343195926e-05, "loss": 0.0024, "mean_token_accuracy": 0.999575287103653, "num_tokens": 123201193.0, "step": 37985 }, { "entropy": 0.07323649059981108, "epoch": 8.855577573143723, "grad_norm": 0.1806640625, "learning_rate": 4.692509085635884e-05, "loss": 0.0031, "mean_token_accuracy": 0.9975827276706696, "num_tokens": 123242077.0, "step": 37990 }, { "entropy": 0.04696584166958928, "epoch": 8.856743210164355, "grad_norm": 0.5703125, "learning_rate": 4.692408814021614e-05, "loss": 0.0059, "mean_token_accuracy": 0.9984638512134552, "num_tokens": 123265675.0, "step": 37995 }, { "entropy": 0.06086919633671641, "epoch": 8.857908847184987, "grad_norm": 0.298828125, "learning_rate": 4.6923085283546106e-05, "loss": 0.0011, "mean_token_accuracy": 0.999669861793518, "num_tokens": 123286453.0, "step": 38000 }, { "entropy": 0.0598510229960084, "epoch": 8.859074484205617, "grad_norm": 3.15625, "learning_rate": 4.69220822863637e-05, "loss": 0.0084, "mean_token_accuracy": 0.9975327312946319, "num_tokens": 123298214.0, "step": 38005 }, { "entropy": 0.05113176926970482, "epoch": 8.86024012122625, "grad_norm": 1.1484375, "learning_rate": 4.692107914868387e-05, "loss": 0.0023, "mean_token_accuracy": 0.9995282173156739, "num_tokens": 123313774.0, "step": 38010 }, { "entropy": 0.06497044824063777, "epoch": 8.861405758246882, "grad_norm": 1.34375, "learning_rate": 4.692007587052159e-05, "loss": 0.0039, "mean_token_accuracy": 0.9993992626667023, "num_tokens": 123323123.0, "step": 38015 }, { "entropy": 0.07354457750916481, "epoch": 8.862571395267514, "grad_norm": 0.189453125, "learning_rate": 4.69190724518918e-05, "loss": 0.0116, "mean_token_accuracy": 0.9962685942649842, "num_tokens": 123342630.0, "step": 38020 }, { "entropy": 0.053354914858937265, "epoch": 8.863737032288146, "grad_norm": 0.62109375, "learning_rate": 4.691806889280948e-05, "loss": 0.003, "mean_token_accuracy": 0.9992679178714752, "num_tokens": 123365283.0, "step": 38025 }, { "entropy": 0.0487318092957139, "epoch": 8.864902669308778, "grad_norm": 0.453125, "learning_rate": 4.691706519328958e-05, "loss": 0.0051, "mean_token_accuracy": 0.9992010533809662, "num_tokens": 123406472.0, "step": 38030 }, { "entropy": 0.05246655810624361, "epoch": 8.866068306329408, "grad_norm": 1.765625, "learning_rate": 4.691606135334708e-05, "loss": 0.0032, "mean_token_accuracy": 0.9991471171379089, "num_tokens": 123434661.0, "step": 38035 }, { "entropy": 0.060769391059875486, "epoch": 8.86723394335004, "grad_norm": 0.3671875, "learning_rate": 4.691505737299694e-05, "loss": 0.0055, "mean_token_accuracy": 0.9982271492481232, "num_tokens": 123451385.0, "step": 38040 }, { "entropy": 0.08358767367899418, "epoch": 8.868399580370673, "grad_norm": 0.1591796875, "learning_rate": 4.691405325225413e-05, "loss": 0.0037, "mean_token_accuracy": 0.9988424837589264, "num_tokens": 123465541.0, "step": 38045 }, { "entropy": 0.04848737036809325, "epoch": 8.869565217391305, "grad_norm": 1.7734375, "learning_rate": 4.6913048991133636e-05, "loss": 0.0068, "mean_token_accuracy": 0.996968537569046, "num_tokens": 123486840.0, "step": 38050 }, { "entropy": 0.0603315188549459, "epoch": 8.870730854411937, "grad_norm": 3.234375, "learning_rate": 4.6912044589650414e-05, "loss": 0.009, "mean_token_accuracy": 0.997438782453537, "num_tokens": 123512872.0, "step": 38055 }, { "entropy": 0.05510821873322129, "epoch": 8.871896491432567, "grad_norm": 0.59765625, "learning_rate": 4.691104004781946e-05, "loss": 0.0037, "mean_token_accuracy": 0.9990927755832673, "num_tokens": 123538521.0, "step": 38060 }, { "entropy": 0.055203709099441764, "epoch": 8.8730621284532, "grad_norm": 0.45703125, "learning_rate": 4.691003536565574e-05, "loss": 0.002, "mean_token_accuracy": 0.9989315450191498, "num_tokens": 123566004.0, "step": 38065 }, { "entropy": 0.053858717624098065, "epoch": 8.874227765473831, "grad_norm": 1.0234375, "learning_rate": 4.690903054317424e-05, "loss": 0.0022, "mean_token_accuracy": 0.9992274165153503, "num_tokens": 123587755.0, "step": 38070 }, { "entropy": 0.04106435338035226, "epoch": 8.875393402494463, "grad_norm": 0.447265625, "learning_rate": 4.690802558038994e-05, "loss": 0.0035, "mean_token_accuracy": 0.9998286664485931, "num_tokens": 123608853.0, "step": 38075 }, { "entropy": 0.055756100453436375, "epoch": 8.876559039515096, "grad_norm": 0.328125, "learning_rate": 4.690702047731782e-05, "loss": 0.0039, "mean_token_accuracy": 0.9991030275821686, "num_tokens": 123624845.0, "step": 38080 }, { "entropy": 0.05964929591864347, "epoch": 8.877724676535728, "grad_norm": 0.1396484375, "learning_rate": 4.690601523397289e-05, "loss": 0.0064, "mean_token_accuracy": 0.9988237857818604, "num_tokens": 123634948.0, "step": 38085 }, { "entropy": 0.06275531104765833, "epoch": 8.878890313556358, "grad_norm": 0.19921875, "learning_rate": 4.690500985037012e-05, "loss": 0.0029, "mean_token_accuracy": 0.9983062148094177, "num_tokens": 123661599.0, "step": 38090 }, { "entropy": 0.05262847691774368, "epoch": 8.88005595057699, "grad_norm": 0.2197265625, "learning_rate": 4.690400432652451e-05, "loss": 0.0034, "mean_token_accuracy": 0.9996294438838959, "num_tokens": 123686160.0, "step": 38095 }, { "entropy": 0.057931592734530565, "epoch": 8.881221587597622, "grad_norm": 0.32421875, "learning_rate": 4.690299866245105e-05, "loss": 0.0025, "mean_token_accuracy": 0.999240392446518, "num_tokens": 123706409.0, "step": 38100 }, { "entropy": 0.06777220210060478, "epoch": 8.882387224618254, "grad_norm": 0.6484375, "learning_rate": 4.690199285816473e-05, "loss": 0.0029, "mean_token_accuracy": 0.998669707775116, "num_tokens": 123718962.0, "step": 38105 }, { "entropy": 0.05980415008962155, "epoch": 8.883552861638886, "grad_norm": 0.0625, "learning_rate": 4.690098691368056e-05, "loss": 0.002, "mean_token_accuracy": 0.9993426620960235, "num_tokens": 123743234.0, "step": 38110 }, { "entropy": 0.07145977187901735, "epoch": 8.884718498659517, "grad_norm": 0.1953125, "learning_rate": 4.6899980829013534e-05, "loss": 0.002, "mean_token_accuracy": 0.9988697230815887, "num_tokens": 123771453.0, "step": 38115 }, { "entropy": 0.06307173445820809, "epoch": 8.885884135680149, "grad_norm": 0.09130859375, "learning_rate": 4.6898974604178655e-05, "loss": 0.001, "mean_token_accuracy": 0.9999496757984161, "num_tokens": 123788361.0, "step": 38120 }, { "entropy": 0.06684031132608652, "epoch": 8.887049772700781, "grad_norm": 1.890625, "learning_rate": 4.689796823919094e-05, "loss": 0.0025, "mean_token_accuracy": 0.9992721199989318, "num_tokens": 123799970.0, "step": 38125 }, { "entropy": 0.04606306701898575, "epoch": 8.888215409721413, "grad_norm": 0.42578125, "learning_rate": 4.689696173406537e-05, "loss": 0.0011, "mean_token_accuracy": 0.9999196231365204, "num_tokens": 123826915.0, "step": 38130 }, { "entropy": 0.06843077950179577, "epoch": 8.889381046742045, "grad_norm": 1.5234375, "learning_rate": 4.6895955088816964e-05, "loss": 0.0034, "mean_token_accuracy": 0.9991783380508423, "num_tokens": 123836264.0, "step": 38135 }, { "entropy": 0.057823483273386954, "epoch": 8.890546683762675, "grad_norm": 1.1015625, "learning_rate": 4.689494830346074e-05, "loss": 0.0043, "mean_token_accuracy": 0.9989975571632386, "num_tokens": 123849316.0, "step": 38140 }, { "entropy": 0.05925882076844573, "epoch": 8.891712320783308, "grad_norm": 0.0498046875, "learning_rate": 4.6893941378011716e-05, "loss": 0.0028, "mean_token_accuracy": 0.9984163522720337, "num_tokens": 123887079.0, "step": 38145 }, { "entropy": 0.06463732328265906, "epoch": 8.89287795780394, "grad_norm": 0.357421875, "learning_rate": 4.689293431248488e-05, "loss": 0.0041, "mean_token_accuracy": 0.9998124241828918, "num_tokens": 123910484.0, "step": 38150 }, { "entropy": 0.04970733867958188, "epoch": 8.894043594824572, "grad_norm": 0.275390625, "learning_rate": 4.689192710689528e-05, "loss": 0.0027, "mean_token_accuracy": 0.9995087504386901, "num_tokens": 123933985.0, "step": 38155 }, { "entropy": 0.040503044705837966, "epoch": 8.895209231845204, "grad_norm": 0.1494140625, "learning_rate": 4.689091976125791e-05, "loss": 0.0052, "mean_token_accuracy": 0.9990995824337006, "num_tokens": 123980387.0, "step": 38160 }, { "entropy": 0.04687797641381621, "epoch": 8.896374868865836, "grad_norm": 0.07177734375, "learning_rate": 4.688991227558781e-05, "loss": 0.0025, "mean_token_accuracy": 0.9996799647808075, "num_tokens": 124006601.0, "step": 38165 }, { "entropy": 0.05270485226064921, "epoch": 8.897540505886466, "grad_norm": 0.1845703125, "learning_rate": 4.688890464989999e-05, "loss": 0.0051, "mean_token_accuracy": 0.9976952373981476, "num_tokens": 124045129.0, "step": 38170 }, { "entropy": 0.049232575856149194, "epoch": 8.898706142907098, "grad_norm": 0.3515625, "learning_rate": 4.688789688420948e-05, "loss": 0.0013, "mean_token_accuracy": 0.998599624633789, "num_tokens": 124077367.0, "step": 38175 }, { "entropy": 0.07646393990144133, "epoch": 8.89987177992773, "grad_norm": 1.0703125, "learning_rate": 4.688688897853131e-05, "loss": 0.0008, "mean_token_accuracy": 0.999968695640564, "num_tokens": 124095346.0, "step": 38180 }, { "entropy": 0.05587264345958829, "epoch": 8.901037416948363, "grad_norm": 0.140625, "learning_rate": 4.68858809328805e-05, "loss": 0.0036, "mean_token_accuracy": 0.998901629447937, "num_tokens": 124124179.0, "step": 38185 }, { "entropy": 0.06073369812220335, "epoch": 8.902203053968995, "grad_norm": 1.65625, "learning_rate": 4.688487274727209e-05, "loss": 0.0055, "mean_token_accuracy": 0.9984927356243134, "num_tokens": 124145302.0, "step": 38190 }, { "entropy": 0.04628473650664091, "epoch": 8.903368690989625, "grad_norm": 1.2734375, "learning_rate": 4.688386442172111e-05, "loss": 0.0027, "mean_token_accuracy": 0.998195481300354, "num_tokens": 124179163.0, "step": 38195 }, { "entropy": 0.04713690942153335, "epoch": 8.904534328010257, "grad_norm": 0.58203125, "learning_rate": 4.688285595624261e-05, "loss": 0.0029, "mean_token_accuracy": 0.9986393749713898, "num_tokens": 124209097.0, "step": 38200 }, { "entropy": 0.056856155022978784, "epoch": 8.90569996503089, "grad_norm": 0.205078125, "learning_rate": 4.68818473508516e-05, "loss": 0.0018, "mean_token_accuracy": 0.9994652688503265, "num_tokens": 124239155.0, "step": 38205 }, { "entropy": 0.06115551386028528, "epoch": 8.906865602051521, "grad_norm": 0.11279296875, "learning_rate": 4.688083860556315e-05, "loss": 0.0026, "mean_token_accuracy": 0.9995314121246338, "num_tokens": 124253044.0, "step": 38210 }, { "entropy": 0.07185219712555409, "epoch": 8.908031239072153, "grad_norm": 0.3984375, "learning_rate": 4.6879829720392276e-05, "loss": 0.003, "mean_token_accuracy": 0.9995608389377594, "num_tokens": 124263476.0, "step": 38215 }, { "entropy": 0.06399134192615748, "epoch": 8.909196876092786, "grad_norm": 0.2255859375, "learning_rate": 4.687882069535403e-05, "loss": 0.0079, "mean_token_accuracy": 0.9987414300441741, "num_tokens": 124284712.0, "step": 38220 }, { "entropy": 0.04301735432818532, "epoch": 8.910362513113416, "grad_norm": 0.369140625, "learning_rate": 4.687781153046347e-05, "loss": 0.0028, "mean_token_accuracy": 0.9982926070690155, "num_tokens": 124315576.0, "step": 38225 }, { "entropy": 0.048022180516272786, "epoch": 8.911528150134048, "grad_norm": 1.5390625, "learning_rate": 4.6876802225735626e-05, "loss": 0.0016, "mean_token_accuracy": 0.9999112784862518, "num_tokens": 124345674.0, "step": 38230 }, { "entropy": 0.0699318254366517, "epoch": 8.91269378715468, "grad_norm": 2.609375, "learning_rate": 4.687579278118557e-05, "loss": 0.0056, "mean_token_accuracy": 0.9987849235534668, "num_tokens": 124357459.0, "step": 38235 }, { "entropy": 0.10346001256257295, "epoch": 8.913859424175312, "grad_norm": 1.2734375, "learning_rate": 4.687478319682833e-05, "loss": 0.0029, "mean_token_accuracy": 0.9995729744434356, "num_tokens": 124369169.0, "step": 38240 }, { "entropy": 0.08160105086863041, "epoch": 8.915025061195944, "grad_norm": 3.0625, "learning_rate": 4.6873773472678975e-05, "loss": 0.0087, "mean_token_accuracy": 0.9984677612781525, "num_tokens": 124377276.0, "step": 38245 }, { "entropy": 0.049109653756022456, "epoch": 8.916190698216575, "grad_norm": 0.291015625, "learning_rate": 4.6872763608752566e-05, "loss": 0.0041, "mean_token_accuracy": 0.9981947541236877, "num_tokens": 124402590.0, "step": 38250 }, { "entropy": 0.08780342619866133, "epoch": 8.917356335237207, "grad_norm": 1.6953125, "learning_rate": 4.687175360506415e-05, "loss": 0.0055, "mean_token_accuracy": 0.9984165310859681, "num_tokens": 124416878.0, "step": 38255 }, { "entropy": 0.0503659694455564, "epoch": 8.918521972257839, "grad_norm": 0.37109375, "learning_rate": 4.6870743461628785e-05, "loss": 0.0043, "mean_token_accuracy": 0.9995361506938935, "num_tokens": 124447276.0, "step": 38260 }, { "entropy": 0.048782548774033783, "epoch": 8.919687609278471, "grad_norm": 0.37890625, "learning_rate": 4.686973317846155e-05, "loss": 0.0049, "mean_token_accuracy": 0.9984687030315399, "num_tokens": 124481843.0, "step": 38265 }, { "entropy": 0.0999077981337905, "epoch": 8.920853246299103, "grad_norm": 1.5234375, "learning_rate": 4.6868722755577486e-05, "loss": 0.0019, "mean_token_accuracy": 0.9997912287712097, "num_tokens": 124492998.0, "step": 38270 }, { "entropy": 0.055358598567545415, "epoch": 8.922018883319733, "grad_norm": 0.1435546875, "learning_rate": 4.6867712192991685e-05, "loss": 0.0046, "mean_token_accuracy": 0.9990233540534973, "num_tokens": 124517996.0, "step": 38275 }, { "entropy": 0.04316076897084713, "epoch": 8.923184520340365, "grad_norm": 1.21875, "learning_rate": 4.6866701490719206e-05, "loss": 0.0066, "mean_token_accuracy": 0.9968756437301636, "num_tokens": 124541992.0, "step": 38280 }, { "entropy": 0.060758284851908685, "epoch": 8.924350157360998, "grad_norm": 0.578125, "learning_rate": 4.6865690648775115e-05, "loss": 0.0036, "mean_token_accuracy": 0.9993073165416717, "num_tokens": 124555858.0, "step": 38285 }, { "entropy": 0.048283427767455575, "epoch": 8.92551579438163, "grad_norm": 0.66796875, "learning_rate": 4.6864679667174494e-05, "loss": 0.0027, "mean_token_accuracy": 0.999762213230133, "num_tokens": 124577980.0, "step": 38290 }, { "entropy": 0.05046857642009854, "epoch": 8.926681431402262, "grad_norm": 0.58984375, "learning_rate": 4.6863668545932415e-05, "loss": 0.0044, "mean_token_accuracy": 0.9984736323356629, "num_tokens": 124600566.0, "step": 38295 }, { "entropy": 0.0775529894977808, "epoch": 8.927847068422894, "grad_norm": 0.76953125, "learning_rate": 4.686265728506395e-05, "loss": 0.0017, "mean_token_accuracy": 0.9988628149032592, "num_tokens": 124612207.0, "step": 38300 }, { "entropy": 0.06363149667158723, "epoch": 8.929012705443524, "grad_norm": 1.578125, "learning_rate": 4.686164588458418e-05, "loss": 0.0045, "mean_token_accuracy": 0.9986072659492493, "num_tokens": 124627386.0, "step": 38305 }, { "entropy": 0.054681507777422665, "epoch": 8.930178342464156, "grad_norm": 1.0625, "learning_rate": 4.68606343445082e-05, "loss": 0.018, "mean_token_accuracy": 0.9967020988464356, "num_tokens": 124653093.0, "step": 38310 }, { "entropy": 0.06164302993565798, "epoch": 8.931343979484788, "grad_norm": 0.455078125, "learning_rate": 4.685962266485107e-05, "loss": 0.0041, "mean_token_accuracy": 0.9991594314575195, "num_tokens": 124666686.0, "step": 38315 }, { "entropy": 0.06907703513279558, "epoch": 8.93250961650542, "grad_norm": 0.1484375, "learning_rate": 4.6858610845627895e-05, "loss": 0.0018, "mean_token_accuracy": 0.9997636675834656, "num_tokens": 124685401.0, "step": 38320 }, { "entropy": 0.07067847475409508, "epoch": 8.933675253526053, "grad_norm": 0.390625, "learning_rate": 4.685759888685376e-05, "loss": 0.0033, "mean_token_accuracy": 0.9993339359760285, "num_tokens": 124696379.0, "step": 38325 }, { "entropy": 0.06941550485789776, "epoch": 8.934840890546683, "grad_norm": 0.43359375, "learning_rate": 4.6856586788543746e-05, "loss": 0.0029, "mean_token_accuracy": 0.9992957353591919, "num_tokens": 124711477.0, "step": 38330 }, { "entropy": 0.04978791456669569, "epoch": 8.936006527567315, "grad_norm": 0.376953125, "learning_rate": 4.685557455071295e-05, "loss": 0.002, "mean_token_accuracy": 0.9994521677494049, "num_tokens": 124726622.0, "step": 38335 }, { "entropy": 0.04930603364482522, "epoch": 8.937172164587947, "grad_norm": 0.1748046875, "learning_rate": 4.685456217337646e-05, "loss": 0.0063, "mean_token_accuracy": 0.9980187237262725, "num_tokens": 124754390.0, "step": 38340 }, { "entropy": 0.05293769268319011, "epoch": 8.93833780160858, "grad_norm": 0.333984375, "learning_rate": 4.685354965654939e-05, "loss": 0.0038, "mean_token_accuracy": 0.9984555840492249, "num_tokens": 124776615.0, "step": 38345 }, { "entropy": 0.05897071985527873, "epoch": 8.939503438629211, "grad_norm": 0.314453125, "learning_rate": 4.685253700024682e-05, "loss": 0.004, "mean_token_accuracy": 0.9994755685329437, "num_tokens": 124813214.0, "step": 38350 }, { "entropy": 0.07651661131531, "epoch": 8.940669075649843, "grad_norm": 0.609375, "learning_rate": 4.685152420448386e-05, "loss": 0.0066, "mean_token_accuracy": 0.9984458088874817, "num_tokens": 124822339.0, "step": 38355 }, { "entropy": 0.0461973468773067, "epoch": 8.941834712670474, "grad_norm": 0.375, "learning_rate": 4.685051126927561e-05, "loss": 0.0031, "mean_token_accuracy": 0.9989622116088868, "num_tokens": 124848166.0, "step": 38360 }, { "entropy": 0.17166866697371005, "epoch": 8.943000349691106, "grad_norm": 0.28515625, "learning_rate": 4.684949819463717e-05, "loss": 0.2022, "mean_token_accuracy": 0.9561330854892731, "num_tokens": 124867564.0, "step": 38365 }, { "entropy": 0.06029870919883251, "epoch": 8.944165986711738, "grad_norm": 0.2470703125, "learning_rate": 4.684848498058364e-05, "loss": 0.0027, "mean_token_accuracy": 0.9992516517639161, "num_tokens": 124889029.0, "step": 38370 }, { "entropy": 0.058437543269246814, "epoch": 8.94533162373237, "grad_norm": 0.1552734375, "learning_rate": 4.6847471627130145e-05, "loss": 0.003, "mean_token_accuracy": 0.9987386465072632, "num_tokens": 124903062.0, "step": 38375 }, { "entropy": 0.057839416153728965, "epoch": 8.946497260753002, "grad_norm": 2.34375, "learning_rate": 4.684645813429179e-05, "loss": 0.0148, "mean_token_accuracy": 0.9975367724895478, "num_tokens": 124925603.0, "step": 38380 }, { "entropy": 0.07765648029744625, "epoch": 8.947662897773633, "grad_norm": 0.37890625, "learning_rate": 4.684544450208368e-05, "loss": 0.0021, "mean_token_accuracy": 0.9997811794281006, "num_tokens": 124936529.0, "step": 38385 }, { "entropy": 0.05938598429784179, "epoch": 8.948828534794265, "grad_norm": 0.123046875, "learning_rate": 4.684443073052095e-05, "loss": 0.0079, "mean_token_accuracy": 0.9987027943134308, "num_tokens": 124957318.0, "step": 38390 }, { "entropy": 0.12599324379116297, "epoch": 8.949994171814897, "grad_norm": 0.404296875, "learning_rate": 4.684341681961869e-05, "loss": 0.0921, "mean_token_accuracy": 0.9860056221485138, "num_tokens": 124980658.0, "step": 38395 }, { "entropy": 0.06396199259907007, "epoch": 8.951159808835529, "grad_norm": 0.88671875, "learning_rate": 4.684240276939204e-05, "loss": 0.007, "mean_token_accuracy": 0.9991120994091034, "num_tokens": 125005084.0, "step": 38400 }, { "entropy": 0.05854538474231959, "epoch": 8.952325445856161, "grad_norm": 0.4921875, "learning_rate": 4.684138857985611e-05, "loss": 0.0012, "mean_token_accuracy": 0.9997228562831879, "num_tokens": 125033670.0, "step": 38405 }, { "entropy": 0.05626933202147484, "epoch": 8.953491082876791, "grad_norm": 1.3046875, "learning_rate": 4.684037425102603e-05, "loss": 0.0099, "mean_token_accuracy": 0.9971358001232147, "num_tokens": 125065295.0, "step": 38410 }, { "entropy": 0.06237728837877512, "epoch": 8.954656719897423, "grad_norm": 0.314453125, "learning_rate": 4.6839359782916916e-05, "loss": 0.0024, "mean_token_accuracy": 0.9991385698318481, "num_tokens": 125093192.0, "step": 38415 }, { "entropy": 0.050046677514910695, "epoch": 8.955822356918056, "grad_norm": 2.53125, "learning_rate": 4.68383451755439e-05, "loss": 0.0117, "mean_token_accuracy": 0.9974133551120759, "num_tokens": 125108996.0, "step": 38420 }, { "entropy": 0.054030220536515114, "epoch": 8.956987993938688, "grad_norm": 0.1572265625, "learning_rate": 4.683733042892211e-05, "loss": 0.0044, "mean_token_accuracy": 0.9980381071567536, "num_tokens": 125130684.0, "step": 38425 }, { "entropy": 0.06457688459195196, "epoch": 8.95815363095932, "grad_norm": 0.384765625, "learning_rate": 4.683631554306668e-05, "loss": 0.0029, "mean_token_accuracy": 0.9989829778671264, "num_tokens": 125158330.0, "step": 38430 }, { "entropy": 0.07588558178395033, "epoch": 8.959319267979952, "grad_norm": 2.59375, "learning_rate": 4.6835300517992755e-05, "loss": 0.0069, "mean_token_accuracy": 0.998191100358963, "num_tokens": 125182823.0, "step": 38435 }, { "entropy": 0.055414431262761354, "epoch": 8.960484905000582, "grad_norm": 0.64453125, "learning_rate": 4.683428535371544e-05, "loss": 0.0026, "mean_token_accuracy": 0.9980771243572235, "num_tokens": 125201575.0, "step": 38440 }, { "entropy": 0.05777478665113449, "epoch": 8.961650542021214, "grad_norm": 2.234375, "learning_rate": 4.68332700502499e-05, "loss": 0.0049, "mean_token_accuracy": 0.9983273804187774, "num_tokens": 125212087.0, "step": 38445 }, { "entropy": 0.06871578618884086, "epoch": 8.962816179041846, "grad_norm": 0.72265625, "learning_rate": 4.683225460761126e-05, "loss": 0.0088, "mean_token_accuracy": 0.9972965836524963, "num_tokens": 125223395.0, "step": 38450 }, { "entropy": 0.05714446417987347, "epoch": 8.963981816062478, "grad_norm": 0.349609375, "learning_rate": 4.683123902581468e-05, "loss": 0.0017, "mean_token_accuracy": 0.9998061537742615, "num_tokens": 125249399.0, "step": 38455 }, { "entropy": 0.06488155499100685, "epoch": 8.96514745308311, "grad_norm": 0.2255859375, "learning_rate": 4.683022330487528e-05, "loss": 0.003, "mean_token_accuracy": 0.9994613707065583, "num_tokens": 125278360.0, "step": 38460 }, { "entropy": 0.06843510447070003, "epoch": 8.966313090103741, "grad_norm": 0.87890625, "learning_rate": 4.682920744480822e-05, "loss": 0.0098, "mean_token_accuracy": 0.9972912669181824, "num_tokens": 125294562.0, "step": 38465 }, { "entropy": 0.05685213636606932, "epoch": 8.967478727124373, "grad_norm": 2.234375, "learning_rate": 4.6828191445628643e-05, "loss": 0.004, "mean_token_accuracy": 0.998846048116684, "num_tokens": 125323621.0, "step": 38470 }, { "entropy": 0.05628944206982851, "epoch": 8.968644364145005, "grad_norm": 1.203125, "learning_rate": 4.682717530735171e-05, "loss": 0.0063, "mean_token_accuracy": 0.998843890428543, "num_tokens": 125337092.0, "step": 38475 }, { "entropy": 0.06336890961974859, "epoch": 8.969810001165637, "grad_norm": 0.25, "learning_rate": 4.682615902999255e-05, "loss": 0.0054, "mean_token_accuracy": 0.9988363981246948, "num_tokens": 125355100.0, "step": 38480 }, { "entropy": 0.049503239383921024, "epoch": 8.97097563818627, "grad_norm": 0.9140625, "learning_rate": 4.682514261356634e-05, "loss": 0.0046, "mean_token_accuracy": 0.9989173531532287, "num_tokens": 125372821.0, "step": 38485 }, { "entropy": 0.06290744915604592, "epoch": 8.972141275206901, "grad_norm": 2.265625, "learning_rate": 4.682412605808823e-05, "loss": 0.0045, "mean_token_accuracy": 0.9985226809978485, "num_tokens": 125386583.0, "step": 38490 }, { "entropy": 0.06669617369771004, "epoch": 8.973306912227532, "grad_norm": 0.5078125, "learning_rate": 4.6823109363573375e-05, "loss": 0.0091, "mean_token_accuracy": 0.9965139627456665, "num_tokens": 125398934.0, "step": 38495 }, { "entropy": 0.05491267712786794, "epoch": 8.974472549248164, "grad_norm": 1.59375, "learning_rate": 4.682209253003693e-05, "loss": 0.0022, "mean_token_accuracy": 0.998519778251648, "num_tokens": 125419781.0, "step": 38500 }, { "entropy": 0.06229007430374622, "epoch": 8.975638186268796, "grad_norm": 1.125, "learning_rate": 4.682107555749408e-05, "loss": 0.0026, "mean_token_accuracy": 0.9979374289512635, "num_tokens": 125436392.0, "step": 38505 }, { "entropy": 0.052480381168425086, "epoch": 8.976803823289428, "grad_norm": 0.255859375, "learning_rate": 4.682005844595996e-05, "loss": 0.0058, "mean_token_accuracy": 0.9982479214668274, "num_tokens": 125467487.0, "step": 38510 }, { "entropy": 0.07493066936731338, "epoch": 8.97796946031006, "grad_norm": 0.1796875, "learning_rate": 4.6819041195449755e-05, "loss": 0.0037, "mean_token_accuracy": 0.9987802803516388, "num_tokens": 125477378.0, "step": 38515 }, { "entropy": 0.06157904057763517, "epoch": 8.97913509733069, "grad_norm": 0.171875, "learning_rate": 4.681802380597862e-05, "loss": 0.0048, "mean_token_accuracy": 0.9989335715770722, "num_tokens": 125498711.0, "step": 38520 }, { "entropy": 0.039980428479611876, "epoch": 8.980300734351323, "grad_norm": 0.220703125, "learning_rate": 4.6817006277561745e-05, "loss": 0.0024, "mean_token_accuracy": 0.9981152474880218, "num_tokens": 125543250.0, "step": 38525 }, { "entropy": 0.06129704499617219, "epoch": 8.981466371371955, "grad_norm": 0.30859375, "learning_rate": 4.681598861021429e-05, "loss": 0.0032, "mean_token_accuracy": 0.9974672019481658, "num_tokens": 125568173.0, "step": 38530 }, { "entropy": 0.07135277204215526, "epoch": 8.982632008392587, "grad_norm": 2.703125, "learning_rate": 4.681497080395143e-05, "loss": 0.0081, "mean_token_accuracy": 0.997850650548935, "num_tokens": 125578244.0, "step": 38535 }, { "entropy": 0.06666518365964294, "epoch": 8.983797645413219, "grad_norm": 0.953125, "learning_rate": 4.681395285878835e-05, "loss": 0.0051, "mean_token_accuracy": 0.9992936909198761, "num_tokens": 125589548.0, "step": 38540 }, { "entropy": 0.05618604850023985, "epoch": 8.98496328243385, "grad_norm": 0.54296875, "learning_rate": 4.6812934774740223e-05, "loss": 0.0059, "mean_token_accuracy": 0.9985435128211975, "num_tokens": 125615502.0, "step": 38545 }, { "entropy": 0.04700936172157526, "epoch": 8.986128919454481, "grad_norm": 1.1171875, "learning_rate": 4.6811916551822235e-05, "loss": 0.0029, "mean_token_accuracy": 0.9995196282863616, "num_tokens": 125642561.0, "step": 38550 }, { "entropy": 0.06371036674827338, "epoch": 8.987294556475113, "grad_norm": 0.3984375, "learning_rate": 4.681089819004956e-05, "loss": 0.0036, "mean_token_accuracy": 0.9984392046928405, "num_tokens": 125665212.0, "step": 38555 }, { "entropy": 0.0539166197180748, "epoch": 8.988460193495746, "grad_norm": 0.26953125, "learning_rate": 4.680987968943739e-05, "loss": 0.0023, "mean_token_accuracy": 0.999776154756546, "num_tokens": 125701838.0, "step": 38560 }, { "entropy": 0.07485632486641407, "epoch": 8.989625830516378, "grad_norm": 1.0625, "learning_rate": 4.68088610500009e-05, "loss": 0.0041, "mean_token_accuracy": 0.9985475897789001, "num_tokens": 125716351.0, "step": 38565 }, { "entropy": 0.06469077356159687, "epoch": 8.99079146753701, "grad_norm": 0.443359375, "learning_rate": 4.6807842271755306e-05, "loss": 0.0023, "mean_token_accuracy": 0.9997498154640198, "num_tokens": 125731984.0, "step": 38570 }, { "entropy": 0.05720685347914696, "epoch": 8.99195710455764, "grad_norm": 1.5546875, "learning_rate": 4.680682335471577e-05, "loss": 0.0054, "mean_token_accuracy": 0.9984744906425476, "num_tokens": 125746456.0, "step": 38575 }, { "entropy": 0.04150606356561184, "epoch": 8.993122741578272, "grad_norm": 0.072265625, "learning_rate": 4.680580429889751e-05, "loss": 0.0042, "mean_token_accuracy": 0.9995882928371429, "num_tokens": 125770312.0, "step": 38580 }, { "entropy": 0.06257959809154272, "epoch": 8.994288378598904, "grad_norm": 0.07421875, "learning_rate": 4.6804785104315714e-05, "loss": 0.0065, "mean_token_accuracy": 0.9992784261703491, "num_tokens": 125789938.0, "step": 38585 }, { "entropy": 0.10159647967666388, "epoch": 8.995454015619536, "grad_norm": 1.625, "learning_rate": 4.680376577098557e-05, "loss": 0.0816, "mean_token_accuracy": 0.9853849947452545, "num_tokens": 125809595.0, "step": 38590 }, { "entropy": 0.056752304825931785, "epoch": 8.996619652640168, "grad_norm": 1.515625, "learning_rate": 4.680274629892228e-05, "loss": 0.0041, "mean_token_accuracy": 0.9988771855831147, "num_tokens": 125824181.0, "step": 38595 }, { "entropy": 0.051522306725382806, "epoch": 8.997785289660799, "grad_norm": 1.078125, "learning_rate": 4.680172668814106e-05, "loss": 0.0033, "mean_token_accuracy": 0.9990641057491303, "num_tokens": 125840965.0, "step": 38600 }, { "entropy": 0.06658210419118404, "epoch": 8.998950926681431, "grad_norm": 0.1044921875, "learning_rate": 4.68007069386571e-05, "loss": 0.0047, "mean_token_accuracy": 0.9983263134956359, "num_tokens": 125854797.0, "step": 38605 }, { "entropy": 0.06707908130354351, "epoch": 9.0, "grad_norm": 0.5703125, "learning_rate": 4.679968705048562e-05, "loss": 0.0043, "mean_token_accuracy": 0.9990188876787821, "num_tokens": 125865250.0, "step": 38610 }, { "entropy": 0.04199001295492053, "epoch": 9.001165637020632, "grad_norm": 0.1748046875, "learning_rate": 4.679866702364181e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999545395374299, "num_tokens": 125890465.0, "step": 38615 }, { "entropy": 0.19603591002523899, "epoch": 9.002331274041264, "grad_norm": 5.21875, "learning_rate": 4.679764685814089e-05, "loss": 0.2893, "mean_token_accuracy": 0.9659708976745606, "num_tokens": 125923536.0, "step": 38620 }, { "entropy": 0.04350639209151268, "epoch": 9.003496911061895, "grad_norm": 0.1865234375, "learning_rate": 4.679662655399806e-05, "loss": 0.0027, "mean_token_accuracy": 0.9994899988174438, "num_tokens": 125965804.0, "step": 38625 }, { "entropy": 0.06196370590478182, "epoch": 9.004662548082527, "grad_norm": 0.1298828125, "learning_rate": 4.6795606111228565e-05, "loss": 0.0007, "mean_token_accuracy": 1.0, "num_tokens": 125983321.0, "step": 38630 }, { "entropy": 0.07306013368070126, "epoch": 9.005828185103159, "grad_norm": 0.2333984375, "learning_rate": 4.679458552984759e-05, "loss": 0.0031, "mean_token_accuracy": 0.9990762650966645, "num_tokens": 125994538.0, "step": 38635 }, { "entropy": 0.052279172092676164, "epoch": 9.00699382212379, "grad_norm": 0.05517578125, "learning_rate": 4.679356480987036e-05, "loss": 0.0027, "mean_token_accuracy": 0.9994321703910828, "num_tokens": 126019599.0, "step": 38640 }, { "entropy": 0.06070118434727192, "epoch": 9.008159459144423, "grad_norm": 0.037109375, "learning_rate": 4.679254395131211e-05, "loss": 0.0012, "mean_token_accuracy": 0.9997816622257233, "num_tokens": 126031811.0, "step": 38645 }, { "entropy": 0.057560394145548345, "epoch": 9.009325096165055, "grad_norm": 0.016357421875, "learning_rate": 4.679152295418805e-05, "loss": 0.0014, "mean_token_accuracy": 0.9993645131587983, "num_tokens": 126048597.0, "step": 38650 }, { "entropy": 0.0726345956325531, "epoch": 9.010490733185685, "grad_norm": 0.111328125, "learning_rate": 4.679050181851341e-05, "loss": 0.002, "mean_token_accuracy": 0.9994578123092651, "num_tokens": 126060395.0, "step": 38655 }, { "entropy": 0.05440345862880349, "epoch": 9.011656370206317, "grad_norm": 0.63671875, "learning_rate": 4.678948054430341e-05, "loss": 0.0039, "mean_token_accuracy": 0.9978732764720917, "num_tokens": 126079809.0, "step": 38660 }, { "entropy": 0.06478796168230475, "epoch": 9.01282200722695, "grad_norm": 0.138671875, "learning_rate": 4.678845913157328e-05, "loss": 0.0016, "mean_token_accuracy": 0.9992167770862579, "num_tokens": 126108676.0, "step": 38665 }, { "entropy": 0.04735125498846173, "epoch": 9.013987644247582, "grad_norm": 1.9140625, "learning_rate": 4.678743758033826e-05, "loss": 0.0052, "mean_token_accuracy": 0.9993732571601868, "num_tokens": 126131661.0, "step": 38670 }, { "entropy": 0.07930247653275728, "epoch": 9.015153281268214, "grad_norm": 0.3046875, "learning_rate": 4.6786415890613576e-05, "loss": 0.0006, "mean_token_accuracy": 1.0, "num_tokens": 126142115.0, "step": 38675 }, { "entropy": 0.07448353804647923, "epoch": 9.016318918288844, "grad_norm": 0.09912109375, "learning_rate": 4.678539406241446e-05, "loss": 0.004, "mean_token_accuracy": 0.9995809614658355, "num_tokens": 126170355.0, "step": 38680 }, { "entropy": 0.06288553746417165, "epoch": 9.017484555309476, "grad_norm": 0.099609375, "learning_rate": 4.6784372095756155e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999343454837799, "num_tokens": 126200626.0, "step": 38685 }, { "entropy": 0.06611028239130974, "epoch": 9.018650192330108, "grad_norm": 0.1455078125, "learning_rate": 4.67833499906539e-05, "loss": 0.0017, "mean_token_accuracy": 0.9992486298084259, "num_tokens": 126211632.0, "step": 38690 }, { "entropy": 0.058050422370433806, "epoch": 9.01981582935074, "grad_norm": 0.1953125, "learning_rate": 4.678232774712293e-05, "loss": 0.0011, "mean_token_accuracy": 0.9996413052082062, "num_tokens": 126233280.0, "step": 38695 }, { "entropy": 0.0486367829144001, "epoch": 9.020981466371373, "grad_norm": 0.07568359375, "learning_rate": 4.6781305365178495e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999536752700806, "num_tokens": 126255103.0, "step": 38700 }, { "entropy": 0.05647154543548823, "epoch": 9.022147103392003, "grad_norm": 0.2060546875, "learning_rate": 4.678028284483583e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999893784523011, "num_tokens": 126283378.0, "step": 38705 }, { "entropy": 0.05234082993119955, "epoch": 9.023312740412635, "grad_norm": 0.2177734375, "learning_rate": 4.67792601861102e-05, "loss": 0.0011, "mean_token_accuracy": 0.9999635398387909, "num_tokens": 126312391.0, "step": 38710 }, { "entropy": 0.07205567061901093, "epoch": 9.024478377433267, "grad_norm": 0.01806640625, "learning_rate": 4.6778237389016835e-05, "loss": 0.0006, "mean_token_accuracy": 1.0, "num_tokens": 126327468.0, "step": 38715 }, { "entropy": 0.07139207748696208, "epoch": 9.0256440144539, "grad_norm": 0.1083984375, "learning_rate": 4.6777214453571e-05, "loss": 0.0007, "mean_token_accuracy": 1.0, "num_tokens": 126341775.0, "step": 38720 }, { "entropy": 0.0707708889618516, "epoch": 9.026809651474531, "grad_norm": 0.10498046875, "learning_rate": 4.677619137978794e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999778509140015, "num_tokens": 126361618.0, "step": 38725 }, { "entropy": 0.0603253073990345, "epoch": 9.027975288495163, "grad_norm": 0.06494140625, "learning_rate": 4.677516816768292e-05, "loss": 0.0008, "mean_token_accuracy": 0.9999888360500335, "num_tokens": 126383105.0, "step": 38730 }, { "entropy": 0.07485587485134601, "epoch": 9.029140925515794, "grad_norm": 0.28125, "learning_rate": 4.6774144817271195e-05, "loss": 0.0021, "mean_token_accuracy": 0.9997031033039093, "num_tokens": 126403286.0, "step": 38735 }, { "entropy": 0.06576101407408715, "epoch": 9.030306562536426, "grad_norm": 1.234375, "learning_rate": 4.677312132856801e-05, "loss": 0.0052, "mean_token_accuracy": 0.9995265305042267, "num_tokens": 126415973.0, "step": 38740 }, { "entropy": 0.08720704466104508, "epoch": 9.031472199557058, "grad_norm": 0.26953125, "learning_rate": 4.6772097701588646e-05, "loss": 0.004, "mean_token_accuracy": 0.9993067860603333, "num_tokens": 126428004.0, "step": 38745 }, { "entropy": 0.06274453792721033, "epoch": 9.03263783657769, "grad_norm": 0.173828125, "learning_rate": 4.677107393634835e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 126442681.0, "step": 38750 }, { "entropy": 0.04070065952837467, "epoch": 9.033803473598322, "grad_norm": 0.609375, "learning_rate": 4.677005003286241e-05, "loss": 0.0019, "mean_token_accuracy": 0.9995027482509613, "num_tokens": 126467633.0, "step": 38755 }, { "entropy": 0.08652214827015996, "epoch": 9.034969110618952, "grad_norm": 0.0751953125, "learning_rate": 4.6769025991146076e-05, "loss": 0.0011, "mean_token_accuracy": 0.9998975336551666, "num_tokens": 126502224.0, "step": 38760 }, { "entropy": 0.05523251341655851, "epoch": 9.036134747639585, "grad_norm": 0.1298828125, "learning_rate": 4.676800181121462e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 126522879.0, "step": 38765 }, { "entropy": 0.06810875101946294, "epoch": 9.037300384660217, "grad_norm": 0.0849609375, "learning_rate": 4.676697749308332e-05, "loss": 0.0007, "mean_token_accuracy": 0.999539029598236, "num_tokens": 126537183.0, "step": 38770 }, { "entropy": 0.06015290655195713, "epoch": 9.038466021680849, "grad_norm": 0.33203125, "learning_rate": 4.676595303676745e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999792695045471, "num_tokens": 126552182.0, "step": 38775 }, { "entropy": 0.06348831951618195, "epoch": 9.03963165870148, "grad_norm": 0.07861328125, "learning_rate": 4.676492844228227e-05, "loss": 0.0009, "mean_token_accuracy": 1.0, "num_tokens": 126561944.0, "step": 38780 }, { "entropy": 0.06204386427998543, "epoch": 9.040797295722113, "grad_norm": 0.546875, "learning_rate": 4.676390370964309e-05, "loss": 0.0007, "mean_token_accuracy": 0.9997840166091919, "num_tokens": 126573591.0, "step": 38785 }, { "entropy": 0.06764180241152644, "epoch": 9.041962932742743, "grad_norm": 0.345703125, "learning_rate": 4.676287883886516e-05, "loss": 0.0011, "mean_token_accuracy": 0.9997762858867645, "num_tokens": 126585784.0, "step": 38790 }, { "entropy": 0.05596046075224877, "epoch": 9.043128569763375, "grad_norm": 0.23828125, "learning_rate": 4.676185382996378e-05, "loss": 0.0013, "mean_token_accuracy": 0.9999856591224671, "num_tokens": 126601959.0, "step": 38795 }, { "entropy": 0.05647138254716992, "epoch": 9.044294206784008, "grad_norm": 0.1416015625, "learning_rate": 4.676082868295423e-05, "loss": 0.0019, "mean_token_accuracy": 0.9988505721092225, "num_tokens": 126620139.0, "step": 38800 }, { "entropy": 0.050310919806361196, "epoch": 9.04545984380464, "grad_norm": 0.1474609375, "learning_rate": 4.675980339785179e-05, "loss": 0.0016, "mean_token_accuracy": 0.999627536535263, "num_tokens": 126645308.0, "step": 38805 }, { "entropy": 0.05990322157740593, "epoch": 9.046625480825272, "grad_norm": 0.72265625, "learning_rate": 4.675877797467176e-05, "loss": 0.004, "mean_token_accuracy": 0.9993937849998474, "num_tokens": 126657453.0, "step": 38810 }, { "entropy": 0.04789342461153865, "epoch": 9.047791117845902, "grad_norm": 0.1611328125, "learning_rate": 4.675775241342942e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999779224395752, "num_tokens": 126690452.0, "step": 38815 }, { "entropy": 0.07110001184046269, "epoch": 9.048956754866534, "grad_norm": 0.046630859375, "learning_rate": 4.675672671414006e-05, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 126705011.0, "step": 38820 }, { "entropy": 0.0685180657543242, "epoch": 9.050122391887166, "grad_norm": 0.09814453125, "learning_rate": 4.6755700876819e-05, "loss": 0.0013, "mean_token_accuracy": 0.9996334731578826, "num_tokens": 126725173.0, "step": 38825 }, { "entropy": 0.061684256233274934, "epoch": 9.051288028907798, "grad_norm": 0.06396484375, "learning_rate": 4.67546749014815e-05, "loss": 0.0011, "mean_token_accuracy": 0.9999591529369354, "num_tokens": 126750446.0, "step": 38830 }, { "entropy": 0.07004572823643684, "epoch": 9.05245366592843, "grad_norm": 0.03173828125, "learning_rate": 4.675364878814289e-05, "loss": 0.0016, "mean_token_accuracy": 0.9995744705200196, "num_tokens": 126762479.0, "step": 38835 }, { "entropy": 0.08703018184751272, "epoch": 9.05361930294906, "grad_norm": 0.06298828125, "learning_rate": 4.675262253681845e-05, "loss": 0.0007, "mean_token_accuracy": 0.9999328851699829, "num_tokens": 126775726.0, "step": 38840 }, { "entropy": 0.05530893374234438, "epoch": 9.054784939969693, "grad_norm": 0.07666015625, "learning_rate": 4.67515961475235e-05, "loss": 0.0022, "mean_token_accuracy": 0.9998360633850097, "num_tokens": 126799072.0, "step": 38845 }, { "entropy": 0.05625711902976036, "epoch": 9.055950576990325, "grad_norm": 0.04638671875, "learning_rate": 4.6750569620273324e-05, "loss": 0.0006, "mean_token_accuracy": 0.9993377506732941, "num_tokens": 126813234.0, "step": 38850 }, { "entropy": 0.08031964246183634, "epoch": 9.057116214010957, "grad_norm": 0.064453125, "learning_rate": 4.6749542955083253e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 126840453.0, "step": 38855 }, { "entropy": 0.05963180642575026, "epoch": 9.05828185103159, "grad_norm": 0.039306640625, "learning_rate": 4.674851615196858e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 126850720.0, "step": 38860 }, { "entropy": 0.07368194349110127, "epoch": 9.059447488052221, "grad_norm": 0.61328125, "learning_rate": 4.674748921094462e-05, "loss": 0.0025, "mean_token_accuracy": 0.9993014574050904, "num_tokens": 126865529.0, "step": 38865 }, { "entropy": 0.06885956060141325, "epoch": 9.060613125072852, "grad_norm": 0.0908203125, "learning_rate": 4.6746462132026686e-05, "loss": 0.0024, "mean_token_accuracy": 0.9986264824867248, "num_tokens": 126884470.0, "step": 38870 }, { "entropy": 0.06841649003326893, "epoch": 9.061778762093484, "grad_norm": 0.06640625, "learning_rate": 4.67454349152301e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 126895503.0, "step": 38875 }, { "entropy": 0.06709451526403427, "epoch": 9.062944399114116, "grad_norm": 0.78515625, "learning_rate": 4.674440756057017e-05, "loss": 0.0017, "mean_token_accuracy": 0.9993994951248169, "num_tokens": 126908015.0, "step": 38880 }, { "entropy": 0.05293879248201847, "epoch": 9.064110036134748, "grad_norm": 0.1376953125, "learning_rate": 4.674338006806222e-05, "loss": 0.0013, "mean_token_accuracy": 0.9995798289775848, "num_tokens": 126928878.0, "step": 38885 }, { "entropy": 0.07902012132108212, "epoch": 9.06527567315538, "grad_norm": 0.439453125, "learning_rate": 4.674235243772156e-05, "loss": 0.0007, "mean_token_accuracy": 0.9999373495578766, "num_tokens": 126950751.0, "step": 38890 }, { "entropy": 0.06013939660042524, "epoch": 9.06644131017601, "grad_norm": 0.2119140625, "learning_rate": 4.674132466956354e-05, "loss": 0.0029, "mean_token_accuracy": 0.9996572375297547, "num_tokens": 126962951.0, "step": 38895 }, { "entropy": 0.04780583553947508, "epoch": 9.067606947196643, "grad_norm": 0.19140625, "learning_rate": 4.674029676360346e-05, "loss": 0.0011, "mean_token_accuracy": 0.9996591031551361, "num_tokens": 126992608.0, "step": 38900 }, { "entropy": 0.06537991110235453, "epoch": 9.068772584217275, "grad_norm": 0.0615234375, "learning_rate": 4.6739268719856657e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999897062778473, "num_tokens": 127019628.0, "step": 38905 }, { "entropy": 0.04793858341872692, "epoch": 9.069938221237907, "grad_norm": 0.67578125, "learning_rate": 4.673824053833846e-05, "loss": 0.0026, "mean_token_accuracy": 0.9994303047657013, "num_tokens": 127039633.0, "step": 38910 }, { "entropy": 0.042522894544526936, "epoch": 9.071103858258539, "grad_norm": 0.21484375, "learning_rate": 4.6737212219064204e-05, "loss": 0.0012, "mean_token_accuracy": 0.9996679306030274, "num_tokens": 127070798.0, "step": 38915 }, { "entropy": 0.037787226028740406, "epoch": 9.072269495279171, "grad_norm": 0.0240478515625, "learning_rate": 4.673618376204922e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999767899513244, "num_tokens": 127100035.0, "step": 38920 }, { "entropy": 0.04557211743667722, "epoch": 9.073435132299801, "grad_norm": 0.049072265625, "learning_rate": 4.6735155167308844e-05, "loss": 0.0009, "mean_token_accuracy": 0.9997222900390625, "num_tokens": 127130459.0, "step": 38925 }, { "entropy": 0.05762959066778421, "epoch": 9.074600769320433, "grad_norm": 0.030029296875, "learning_rate": 4.6734126434858416e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999673962593079, "num_tokens": 127143328.0, "step": 38930 }, { "entropy": 0.039792079012840986, "epoch": 9.075766406341065, "grad_norm": 0.1298828125, "learning_rate": 4.673309756471327e-05, "loss": 0.0014, "mean_token_accuracy": 0.9997629404067994, "num_tokens": 127172420.0, "step": 38935 }, { "entropy": 0.0473951231688261, "epoch": 9.076932043361698, "grad_norm": 0.0986328125, "learning_rate": 4.6732068556888755e-05, "loss": 0.0009, "mean_token_accuracy": 0.9999626576900482, "num_tokens": 127203870.0, "step": 38940 }, { "entropy": 0.05528845563530922, "epoch": 9.07809768038233, "grad_norm": 0.1748046875, "learning_rate": 4.673103941140021e-05, "loss": 0.0007, "mean_token_accuracy": 0.9999382495880127, "num_tokens": 127228127.0, "step": 38945 }, { "entropy": 0.046199146658182144, "epoch": 9.07926331740296, "grad_norm": 0.8515625, "learning_rate": 4.673001012826298e-05, "loss": 0.0028, "mean_token_accuracy": 0.9992288112640381, "num_tokens": 127256065.0, "step": 38950 }, { "entropy": 0.052226276509463786, "epoch": 9.080428954423592, "grad_norm": 0.2431640625, "learning_rate": 4.6728980707492426e-05, "loss": 0.0039, "mean_token_accuracy": 0.999572080373764, "num_tokens": 127273454.0, "step": 38955 }, { "entropy": 0.048444395791739224, "epoch": 9.081594591444224, "grad_norm": 0.138671875, "learning_rate": 4.6727951149103884e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999888062477111, "num_tokens": 127294859.0, "step": 38960 }, { "entropy": 0.06333124712109565, "epoch": 9.082760228464856, "grad_norm": 0.033447265625, "learning_rate": 4.672692145311271e-05, "loss": 0.0012, "mean_token_accuracy": 0.9994173049926758, "num_tokens": 127311275.0, "step": 38965 }, { "entropy": 0.04639546973630786, "epoch": 9.083925865485488, "grad_norm": 0.056640625, "learning_rate": 4.672589161953426e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 127328407.0, "step": 38970 }, { "entropy": 0.050291641149669886, "epoch": 9.085091502506119, "grad_norm": 0.34375, "learning_rate": 4.672486164838389e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999376535415649, "num_tokens": 127355314.0, "step": 38975 }, { "entropy": 0.05147938821464777, "epoch": 9.08625713952675, "grad_norm": 2.1875, "learning_rate": 4.672383153967695e-05, "loss": 0.0022, "mean_token_accuracy": 0.999790358543396, "num_tokens": 127382802.0, "step": 38980 }, { "entropy": 0.045143406558781865, "epoch": 9.087422776547383, "grad_norm": 0.087890625, "learning_rate": 4.672280129342882e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999670028686524, "num_tokens": 127418533.0, "step": 38985 }, { "entropy": 0.06548787970095873, "epoch": 9.088588413568015, "grad_norm": 0.3984375, "learning_rate": 4.672177090965484e-05, "loss": 0.001, "mean_token_accuracy": 0.9999308168888092, "num_tokens": 127429872.0, "step": 38990 }, { "entropy": 0.0844784826040268, "epoch": 9.089754050588647, "grad_norm": 0.29296875, "learning_rate": 4.672074038837039e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 127441571.0, "step": 38995 }, { "entropy": 0.05351873962208629, "epoch": 9.09091968760928, "grad_norm": 0.1865234375, "learning_rate": 4.671970972959083e-05, "loss": 0.0013, "mean_token_accuracy": 0.999194449186325, "num_tokens": 127470617.0, "step": 39000 }, { "entropy": 0.0646541254594922, "epoch": 9.09208532462991, "grad_norm": 0.375, "learning_rate": 4.671867893333154e-05, "loss": 0.0006, "mean_token_accuracy": 1.0, "num_tokens": 127486521.0, "step": 39005 }, { "entropy": 0.06300574182532728, "epoch": 9.093250961650542, "grad_norm": 0.11328125, "learning_rate": 4.671764799960787e-05, "loss": 0.0009, "mean_token_accuracy": 0.9999026715755462, "num_tokens": 127515347.0, "step": 39010 }, { "entropy": 0.07608828879892826, "epoch": 9.094416598671174, "grad_norm": 0.34765625, "learning_rate": 4.6716616928435215e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999895751476288, "num_tokens": 127537060.0, "step": 39015 }, { "entropy": 0.09740983368828893, "epoch": 9.095582235691806, "grad_norm": 0.0849609375, "learning_rate": 4.671558571982893e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 127550040.0, "step": 39020 }, { "entropy": 0.15485297152772545, "epoch": 9.096747872712438, "grad_norm": 0.07080078125, "learning_rate": 4.6714554373804404e-05, "loss": 0.2097, "mean_token_accuracy": 0.9618768632411957, "num_tokens": 127577627.0, "step": 39025 }, { "entropy": 0.0448100233450532, "epoch": 9.097913509733068, "grad_norm": 0.71484375, "learning_rate": 4.671352289037701e-05, "loss": 0.0013, "mean_token_accuracy": 0.9998813092708587, "num_tokens": 127601452.0, "step": 39030 }, { "entropy": 0.051290947990491983, "epoch": 9.0990791467537, "grad_norm": 0.1455078125, "learning_rate": 4.671249126956214e-05, "loss": 0.0008, "mean_token_accuracy": 0.9999408841133117, "num_tokens": 127637347.0, "step": 39035 }, { "entropy": 0.07046590894460678, "epoch": 9.100244783774333, "grad_norm": 0.0322265625, "learning_rate": 4.671145951137516e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999824047088623, "num_tokens": 127652235.0, "step": 39040 }, { "entropy": 0.07944189198315144, "epoch": 9.101410420794965, "grad_norm": 0.5703125, "learning_rate": 4.671042761583147e-05, "loss": 0.0187, "mean_token_accuracy": 0.9976053655147552, "num_tokens": 127671703.0, "step": 39045 }, { "entropy": 0.06307450551539659, "epoch": 9.102576057815597, "grad_norm": 0.05419921875, "learning_rate": 4.670939558294645e-05, "loss": 0.0009, "mean_token_accuracy": 0.9993710696697236, "num_tokens": 127685689.0, "step": 39050 }, { "entropy": 0.08569299336522818, "epoch": 9.103741694836229, "grad_norm": 0.94921875, "learning_rate": 4.6708363412735486e-05, "loss": 0.0036, "mean_token_accuracy": 0.9993679821491241, "num_tokens": 127711429.0, "step": 39055 }, { "entropy": 0.05434844773262739, "epoch": 9.10490733185686, "grad_norm": 0.26953125, "learning_rate": 4.670733110521398e-05, "loss": 0.0034, "mean_token_accuracy": 0.9994161069393158, "num_tokens": 127736568.0, "step": 39060 }, { "entropy": 0.07459601685404778, "epoch": 9.106072968877491, "grad_norm": 0.07763671875, "learning_rate": 4.6706298660397306e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 127755668.0, "step": 39065 }, { "entropy": 0.060200719721615316, "epoch": 9.107238605898123, "grad_norm": 0.154296875, "learning_rate": 4.6705266078300886e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 127768457.0, "step": 39070 }, { "entropy": 0.0584429781883955, "epoch": 9.108404242918756, "grad_norm": 0.68359375, "learning_rate": 4.6704233358940094e-05, "loss": 0.001, "mean_token_accuracy": 0.9996918320655823, "num_tokens": 127778860.0, "step": 39075 }, { "entropy": 0.060449579171836375, "epoch": 9.109569879939388, "grad_norm": 0.08642578125, "learning_rate": 4.6703200502330345e-05, "loss": 0.001, "mean_token_accuracy": 0.9997960925102234, "num_tokens": 127810046.0, "step": 39080 }, { "entropy": 0.03814925597980619, "epoch": 9.110735516960018, "grad_norm": 0.1728515625, "learning_rate": 4.670216750848703e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999839961528778, "num_tokens": 127843080.0, "step": 39085 }, { "entropy": 0.06509601455181838, "epoch": 9.11190115398065, "grad_norm": 0.0311279296875, "learning_rate": 4.670113437742556e-05, "loss": 0.0009, "mean_token_accuracy": 1.0, "num_tokens": 127867744.0, "step": 39090 }, { "entropy": 0.05478460993617773, "epoch": 9.113066791001282, "grad_norm": 0.94140625, "learning_rate": 4.670010110916133e-05, "loss": 0.0007, "mean_token_accuracy": 0.9996774196624756, "num_tokens": 127895035.0, "step": 39095 }, { "entropy": 0.060867223888635635, "epoch": 9.114232428021914, "grad_norm": 0.302734375, "learning_rate": 4.6699067703709766e-05, "loss": 0.0012, "mean_token_accuracy": 0.99984050989151, "num_tokens": 127907214.0, "step": 39100 }, { "entropy": 0.09498673398047686, "epoch": 9.115398065042546, "grad_norm": 0.20703125, "learning_rate": 4.669803416108626e-05, "loss": 0.0054, "mean_token_accuracy": 0.9991422772407532, "num_tokens": 127932050.0, "step": 39105 }, { "entropy": 0.05927578574046492, "epoch": 9.116563702063177, "grad_norm": 0.0390625, "learning_rate": 4.669700048130622e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 127952338.0, "step": 39110 }, { "entropy": 0.06774403676390647, "epoch": 9.117729339083809, "grad_norm": 0.9921875, "learning_rate": 4.6695966664385087e-05, "loss": 0.0013, "mean_token_accuracy": 0.999355947971344, "num_tokens": 127963600.0, "step": 39115 }, { "entropy": 0.05179953342303634, "epoch": 9.11889497610444, "grad_norm": 1.3515625, "learning_rate": 4.669493271033825e-05, "loss": 0.002, "mean_token_accuracy": 0.9997416198253631, "num_tokens": 127990422.0, "step": 39120 }, { "entropy": 0.06711933370679617, "epoch": 9.120060613125073, "grad_norm": 0.2177734375, "learning_rate": 4.6693898619181144e-05, "loss": 0.0021, "mean_token_accuracy": 0.9995348811149597, "num_tokens": 128001948.0, "step": 39125 }, { "entropy": 0.0793570352718234, "epoch": 9.121226250145705, "grad_norm": 0.08837890625, "learning_rate": 4.669286439092917e-05, "loss": 0.0017, "mean_token_accuracy": 0.9998459160327912, "num_tokens": 128019219.0, "step": 39130 }, { "entropy": 0.061221508868038656, "epoch": 9.122391887166337, "grad_norm": 0.045166015625, "learning_rate": 4.669183002559777e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 128036619.0, "step": 39135 }, { "entropy": 0.06286525307223201, "epoch": 9.123557524186968, "grad_norm": 0.04638671875, "learning_rate": 4.6690795523202355e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 128048687.0, "step": 39140 }, { "entropy": 0.03850018824450672, "epoch": 9.1247231612076, "grad_norm": 0.2275390625, "learning_rate": 4.668976088375836e-05, "loss": 0.003, "mean_token_accuracy": 0.9997575998306274, "num_tokens": 128072663.0, "step": 39145 }, { "entropy": 0.0629619574174285, "epoch": 9.125888798228232, "grad_norm": 0.02099609375, "learning_rate": 4.66887261072812e-05, "loss": 0.0015, "mean_token_accuracy": 0.9994776368141174, "num_tokens": 128084948.0, "step": 39150 }, { "entropy": 0.06467896215617656, "epoch": 9.127054435248864, "grad_norm": 0.189453125, "learning_rate": 4.668769119378632e-05, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 128094427.0, "step": 39155 }, { "entropy": 0.053331601060926916, "epoch": 9.128220072269496, "grad_norm": 0.0238037109375, "learning_rate": 4.668665614328914e-05, "loss": 0.0014, "mean_token_accuracy": 0.9996485114097595, "num_tokens": 128118089.0, "step": 39160 }, { "entropy": 0.048440984450280666, "epoch": 9.129385709290126, "grad_norm": 0.173828125, "learning_rate": 4.6685620955805104e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999478876590728, "num_tokens": 128133298.0, "step": 39165 }, { "entropy": 0.056567294523119926, "epoch": 9.130551346310758, "grad_norm": 0.10107421875, "learning_rate": 4.6684585631349644e-05, "loss": 0.0008, "mean_token_accuracy": 0.9995827078819275, "num_tokens": 128152849.0, "step": 39170 }, { "entropy": 0.05280892346054315, "epoch": 9.13171698333139, "grad_norm": 0.076171875, "learning_rate": 4.668355016993819e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 128166039.0, "step": 39175 }, { "entropy": 0.06629458190873265, "epoch": 9.132882620352023, "grad_norm": 0.1669921875, "learning_rate": 4.66825145715862e-05, "loss": 0.0007, "mean_token_accuracy": 0.9999472379684449, "num_tokens": 128183728.0, "step": 39180 }, { "entropy": 0.04763224720954895, "epoch": 9.134048257372655, "grad_norm": 0.11572265625, "learning_rate": 4.66814788363091e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 128199350.0, "step": 39185 }, { "entropy": 0.07102838419377804, "epoch": 9.135213894393287, "grad_norm": 0.1953125, "learning_rate": 4.668044296412234e-05, "loss": 0.0031, "mean_token_accuracy": 0.9999483764171601, "num_tokens": 128220784.0, "step": 39190 }, { "entropy": 0.0709263451397419, "epoch": 9.136379531413917, "grad_norm": 1.140625, "learning_rate": 4.667940695504137e-05, "loss": 0.0015, "mean_token_accuracy": 0.9998858451843262, "num_tokens": 128229335.0, "step": 39195 }, { "entropy": 0.059687384590506556, "epoch": 9.13754516843455, "grad_norm": 0.0986328125, "learning_rate": 4.667837080908164e-05, "loss": 0.0022, "mean_token_accuracy": 0.999447512626648, "num_tokens": 128246055.0, "step": 39200 }, { "entropy": 0.054660573275759815, "epoch": 9.138710805455181, "grad_norm": 0.05029296875, "learning_rate": 4.667733452625859e-05, "loss": 0.001, "mean_token_accuracy": 0.9999875128269196, "num_tokens": 128267602.0, "step": 39205 }, { "entropy": 0.05184516375884414, "epoch": 9.139876442475813, "grad_norm": 0.08740234375, "learning_rate": 4.667629810658768e-05, "loss": 0.0051, "mean_token_accuracy": 0.9993105053901672, "num_tokens": 128291298.0, "step": 39210 }, { "entropy": 0.04663265328854323, "epoch": 9.141042079496446, "grad_norm": 0.08447265625, "learning_rate": 4.667526155008436e-05, "loss": 0.001, "mean_token_accuracy": 0.999936830997467, "num_tokens": 128306082.0, "step": 39215 }, { "entropy": 0.06637532748281956, "epoch": 9.142207716517076, "grad_norm": 0.1240234375, "learning_rate": 4.6674224856764096e-05, "loss": 0.0019, "mean_token_accuracy": 0.9994987487792969, "num_tokens": 128316316.0, "step": 39220 }, { "entropy": 0.06515855994075537, "epoch": 9.143373353537708, "grad_norm": 0.0322265625, "learning_rate": 4.667318802664234e-05, "loss": 0.0004, "mean_token_accuracy": 0.9996699690818787, "num_tokens": 128329256.0, "step": 39225 }, { "entropy": 0.06536714136600494, "epoch": 9.14453899055834, "grad_norm": 0.1357421875, "learning_rate": 4.6672151059734555e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999700427055359, "num_tokens": 128352667.0, "step": 39230 }, { "entropy": 0.07236762633547186, "epoch": 9.145704627578972, "grad_norm": 1.75, "learning_rate": 4.6671113956056194e-05, "loss": 0.0011, "mean_token_accuracy": 0.9995726466178894, "num_tokens": 128371166.0, "step": 39235 }, { "entropy": 0.048507886566221715, "epoch": 9.146870264599604, "grad_norm": 0.267578125, "learning_rate": 4.667007671562274e-05, "loss": 0.0007, "mean_token_accuracy": 0.9994894981384277, "num_tokens": 128410651.0, "step": 39240 }, { "entropy": 0.058967319689691065, "epoch": 9.148035901620235, "grad_norm": 0.5234375, "learning_rate": 4.6669039338449636e-05, "loss": 0.0016, "mean_token_accuracy": 0.9997023820877076, "num_tokens": 128423915.0, "step": 39245 }, { "entropy": 0.0643064547330141, "epoch": 9.149201538640867, "grad_norm": 0.1025390625, "learning_rate": 4.666800182455238e-05, "loss": 0.0023, "mean_token_accuracy": 0.9993151903152466, "num_tokens": 128436003.0, "step": 39250 }, { "entropy": 0.05355655811727047, "epoch": 9.150367175661499, "grad_norm": 1.0390625, "learning_rate": 4.6666964173946415e-05, "loss": 0.0066, "mean_token_accuracy": 0.9981539249420166, "num_tokens": 128466715.0, "step": 39255 }, { "entropy": 0.0643747929483652, "epoch": 9.151532812682131, "grad_norm": 0.095703125, "learning_rate": 4.666592638664724e-05, "loss": 0.0007, "mean_token_accuracy": 0.9996855318546295, "num_tokens": 128480809.0, "step": 39260 }, { "entropy": 0.07963294740766287, "epoch": 9.152698449702763, "grad_norm": 0.091796875, "learning_rate": 4.6664888462670295e-05, "loss": 0.0175, "mean_token_accuracy": 0.9961723029613495, "num_tokens": 128520211.0, "step": 39265 }, { "entropy": 0.0579895157366991, "epoch": 9.153864086723395, "grad_norm": 0.017822265625, "learning_rate": 4.666385040203109e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 128536577.0, "step": 39270 }, { "entropy": 0.053957913815975186, "epoch": 9.155029723744025, "grad_norm": 0.036865234375, "learning_rate": 4.66628122047451e-05, "loss": 0.0007, "mean_token_accuracy": 0.9999778687953949, "num_tokens": 128555849.0, "step": 39275 }, { "entropy": 0.05544408448040485, "epoch": 9.156195360764658, "grad_norm": 3.296875, "learning_rate": 4.666177387082779e-05, "loss": 0.0014, "mean_token_accuracy": 0.9991379320621491, "num_tokens": 128569598.0, "step": 39280 }, { "entropy": 0.05070274667814374, "epoch": 9.15736099778529, "grad_norm": 0.0380859375, "learning_rate": 4.666073540029465e-05, "loss": 0.0008, "mean_token_accuracy": 1.0, "num_tokens": 128586309.0, "step": 39285 }, { "entropy": 0.0538034837692976, "epoch": 9.158526634805922, "grad_norm": 0.05615234375, "learning_rate": 4.665969679316117e-05, "loss": 0.0012, "mean_token_accuracy": 0.9999155402183533, "num_tokens": 128617861.0, "step": 39290 }, { "entropy": 0.04511388307437301, "epoch": 9.159692271826554, "grad_norm": 0.0556640625, "learning_rate": 4.665865804944284e-05, "loss": 0.0019, "mean_token_accuracy": 0.9997424304485321, "num_tokens": 128647469.0, "step": 39295 }, { "entropy": 0.04719185484573245, "epoch": 9.160857908847184, "grad_norm": 0.044677734375, "learning_rate": 4.665761916915513e-05, "loss": 0.012, "mean_token_accuracy": 0.9976949393749237, "num_tokens": 128666573.0, "step": 39300 }, { "entropy": 0.047588223777711394, "epoch": 9.162023545867816, "grad_norm": 0.146484375, "learning_rate": 4.6656580152313554e-05, "loss": 0.0004, "mean_token_accuracy": 0.999963355064392, "num_tokens": 128694201.0, "step": 39305 }, { "entropy": 0.049427392426878214, "epoch": 9.163189182888448, "grad_norm": 0.96875, "learning_rate": 4.665554099893359e-05, "loss": 0.0013, "mean_token_accuracy": 0.9998815894126892, "num_tokens": 128722755.0, "step": 39310 }, { "entropy": 0.047885213326662776, "epoch": 9.16435481990908, "grad_norm": 0.21875, "learning_rate": 4.665450170903074e-05, "loss": 0.001, "mean_token_accuracy": 0.9990951359272003, "num_tokens": 128751245.0, "step": 39315 }, { "entropy": 0.06532157673500479, "epoch": 9.165520456929713, "grad_norm": 0.111328125, "learning_rate": 4.6653462282620504e-05, "loss": 0.0005, "mean_token_accuracy": 0.999978369474411, "num_tokens": 128777772.0, "step": 39320 }, { "entropy": 0.06396664790809155, "epoch": 9.166686093950345, "grad_norm": 0.14453125, "learning_rate": 4.6652422719718374e-05, "loss": 0.0006, "mean_token_accuracy": 0.9997727274894714, "num_tokens": 128790482.0, "step": 39325 }, { "entropy": 0.06470691915601492, "epoch": 9.167851730970975, "grad_norm": 0.036376953125, "learning_rate": 4.6651383020339855e-05, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 128801350.0, "step": 39330 }, { "entropy": 0.08019221909344196, "epoch": 9.169017367991607, "grad_norm": 0.2451171875, "learning_rate": 4.665034318450045e-05, "loss": 0.0006, "mean_token_accuracy": 0.999977171421051, "num_tokens": 128819447.0, "step": 39335 }, { "entropy": 0.06017429428175092, "epoch": 9.17018300501224, "grad_norm": 0.0203857421875, "learning_rate": 4.664930321221567e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999876737594604, "num_tokens": 128847434.0, "step": 39340 }, { "entropy": 0.05513544147834182, "epoch": 9.171348642032871, "grad_norm": 0.333984375, "learning_rate": 4.664826310350102e-05, "loss": 0.0011, "mean_token_accuracy": 0.9994580030441285, "num_tokens": 128878814.0, "step": 39345 }, { "entropy": 0.06086507327854633, "epoch": 9.172514279053503, "grad_norm": 0.08203125, "learning_rate": 4.6647222858372004e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 128897376.0, "step": 39350 }, { "entropy": 0.05507046952843666, "epoch": 9.173679916074134, "grad_norm": 0.0615234375, "learning_rate": 4.6646182476844135e-05, "loss": 0.0007, "mean_token_accuracy": 0.9995168089866638, "num_tokens": 128912835.0, "step": 39355 }, { "entropy": 0.05077353697270155, "epoch": 9.174845553094766, "grad_norm": 0.34375, "learning_rate": 4.664514195893293e-05, "loss": 0.0007, "mean_token_accuracy": 0.9998065769672394, "num_tokens": 128930582.0, "step": 39360 }, { "entropy": 0.044340075273066756, "epoch": 9.176011190115398, "grad_norm": 0.341796875, "learning_rate": 4.6644101304653904e-05, "loss": 0.0016, "mean_token_accuracy": 0.9997381329536438, "num_tokens": 128956920.0, "step": 39365 }, { "entropy": 0.05113975182175636, "epoch": 9.17717682713603, "grad_norm": 0.2119140625, "learning_rate": 4.664306051402257e-05, "loss": 0.003, "mean_token_accuracy": 0.998814046382904, "num_tokens": 128978761.0, "step": 39370 }, { "entropy": 0.0629701149649918, "epoch": 9.178342464156662, "grad_norm": 0.12451171875, "learning_rate": 4.664201958705445e-05, "loss": 0.0026, "mean_token_accuracy": 0.9992149472236633, "num_tokens": 129001845.0, "step": 39375 }, { "entropy": 0.05342851486057043, "epoch": 9.179508101177293, "grad_norm": 0.058349609375, "learning_rate": 4.6640978523765075e-05, "loss": 0.0008, "mean_token_accuracy": 1.0, "num_tokens": 129016277.0, "step": 39380 }, { "entropy": 0.07416110690683127, "epoch": 9.180673738197925, "grad_norm": 0.07421875, "learning_rate": 4.6639937324169966e-05, "loss": 0.0005, "mean_token_accuracy": 0.9997787594795227, "num_tokens": 129027129.0, "step": 39385 }, { "entropy": 0.0515309227630496, "epoch": 9.181839375218557, "grad_norm": 0.62109375, "learning_rate": 4.6638895988284634e-05, "loss": 0.0006, "mean_token_accuracy": 0.99997638463974, "num_tokens": 129042418.0, "step": 39390 }, { "entropy": 0.0686886103823781, "epoch": 9.183005012239189, "grad_norm": 0.328125, "learning_rate": 4.6637854516124616e-05, "loss": 0.0012, "mean_token_accuracy": 0.9993710696697236, "num_tokens": 129059775.0, "step": 39395 }, { "entropy": 0.05704756639897823, "epoch": 9.184170649259821, "grad_norm": 0.0556640625, "learning_rate": 4.663681290770545e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 129079156.0, "step": 39400 }, { "entropy": 0.04882788760587573, "epoch": 9.185336286280453, "grad_norm": 0.2099609375, "learning_rate": 4.663577116304266e-05, "loss": 0.001, "mean_token_accuracy": 0.9994452178478241, "num_tokens": 129104080.0, "step": 39405 }, { "entropy": 0.051801460422575475, "epoch": 9.186501923301083, "grad_norm": 0.1806640625, "learning_rate": 4.663472928215178e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 129123118.0, "step": 39410 }, { "entropy": 0.07059294497594237, "epoch": 9.187667560321715, "grad_norm": 0.134765625, "learning_rate": 4.6633687265048344e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999624252319336, "num_tokens": 129142115.0, "step": 39415 }, { "entropy": 0.04961256729438901, "epoch": 9.188833197342348, "grad_norm": 0.232421875, "learning_rate": 4.66326451117479e-05, "loss": 0.0008, "mean_token_accuracy": 0.9996217608451843, "num_tokens": 129171901.0, "step": 39420 }, { "entropy": 0.04771387707442045, "epoch": 9.18999883436298, "grad_norm": 0.5390625, "learning_rate": 4.663160282226597e-05, "loss": 0.0021, "mean_token_accuracy": 0.9998019516468049, "num_tokens": 129212825.0, "step": 39425 }, { "entropy": 0.062344396207481625, "epoch": 9.191164471383612, "grad_norm": 0.8515625, "learning_rate": 4.663056039661811e-05, "loss": 0.0014, "mean_token_accuracy": 0.9993386328220367, "num_tokens": 129228522.0, "step": 39430 }, { "entropy": 0.06107481122016907, "epoch": 9.192330108404242, "grad_norm": 0.193359375, "learning_rate": 4.662951783481987e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 129242723.0, "step": 39435 }, { "entropy": 0.06025108881294727, "epoch": 9.193495745424874, "grad_norm": 0.52734375, "learning_rate": 4.662847513688678e-05, "loss": 0.0018, "mean_token_accuracy": 0.9987414538860321, "num_tokens": 129267902.0, "step": 39440 }, { "entropy": 0.0548030543141067, "epoch": 9.194661382445506, "grad_norm": 0.039794921875, "learning_rate": 4.66274323028344e-05, "loss": 0.0013, "mean_token_accuracy": 0.9995507836341858, "num_tokens": 129301913.0, "step": 39445 }, { "entropy": 0.05716030802577734, "epoch": 9.195827019466138, "grad_norm": 1.7578125, "learning_rate": 4.662638933267827e-05, "loss": 0.0017, "mean_token_accuracy": 0.9993688941001893, "num_tokens": 129321794.0, "step": 39450 }, { "entropy": 0.04664466297253966, "epoch": 9.19699265648677, "grad_norm": 0.0849609375, "learning_rate": 4.662534622643395e-05, "loss": 0.0119, "mean_token_accuracy": 0.9973366379737854, "num_tokens": 129355666.0, "step": 39455 }, { "entropy": 0.058199040777981284, "epoch": 9.198158293507403, "grad_norm": 0.14453125, "learning_rate": 4.6624302984116996e-05, "loss": 0.0017, "mean_token_accuracy": 0.9995925843715667, "num_tokens": 129391276.0, "step": 39460 }, { "entropy": 0.064020661637187, "epoch": 9.199323930528033, "grad_norm": 0.07080078125, "learning_rate": 4.662325960574297e-05, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 129408426.0, "step": 39465 }, { "entropy": 0.04356107474304736, "epoch": 9.200489567548665, "grad_norm": 0.09619140625, "learning_rate": 4.6622216091327403e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999892115592957, "num_tokens": 129435871.0, "step": 39470 }, { "entropy": 0.06568758022040129, "epoch": 9.201655204569297, "grad_norm": 0.055419921875, "learning_rate": 4.662117244088588e-05, "loss": 0.0015, "mean_token_accuracy": 0.9998577475547791, "num_tokens": 129453203.0, "step": 39475 }, { "entropy": 0.05361274089664221, "epoch": 9.20282084158993, "grad_norm": 0.04345703125, "learning_rate": 4.662012865443396e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999482154846191, "num_tokens": 129470797.0, "step": 39480 }, { "entropy": 0.050313870795071124, "epoch": 9.203986478610561, "grad_norm": 0.07861328125, "learning_rate": 4.66190847319872e-05, "loss": 0.0032, "mean_token_accuracy": 0.9993630588054657, "num_tokens": 129495570.0, "step": 39485 }, { "entropy": 0.041808879002928734, "epoch": 9.205152115631192, "grad_norm": 0.1298828125, "learning_rate": 4.661804067356118e-05, "loss": 0.0009, "mean_token_accuracy": 0.9996637821197509, "num_tokens": 129530054.0, "step": 39490 }, { "entropy": 0.06288732271641492, "epoch": 9.206317752651824, "grad_norm": 0.0238037109375, "learning_rate": 4.661699647917145e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 129544100.0, "step": 39495 }, { "entropy": 0.043396432511508466, "epoch": 9.207483389672456, "grad_norm": 1.0234375, "learning_rate": 4.6615952148833587e-05, "loss": 0.0015, "mean_token_accuracy": 0.9995144903659821, "num_tokens": 129582973.0, "step": 39500 }, { "entropy": 0.045847236458212134, "epoch": 9.208649026693088, "grad_norm": 0.1396484375, "learning_rate": 4.6614907682563177e-05, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 129609091.0, "step": 39505 }, { "entropy": 0.07774993143975735, "epoch": 9.20981466371372, "grad_norm": 0.265625, "learning_rate": 4.661386308037577e-05, "loss": 0.0006, "mean_token_accuracy": 1.0, "num_tokens": 129619771.0, "step": 39510 }, { "entropy": 0.07045667059719563, "epoch": 9.21098030073435, "grad_norm": 0.034423828125, "learning_rate": 4.661281834228697e-05, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 129629617.0, "step": 39515 }, { "entropy": 0.05546746281906963, "epoch": 9.212145937754983, "grad_norm": 0.85546875, "learning_rate": 4.661177346831234e-05, "loss": 0.0016, "mean_token_accuracy": 0.9997250974178314, "num_tokens": 129652976.0, "step": 39520 }, { "entropy": 0.04471162809059024, "epoch": 9.213311574775615, "grad_norm": 0.234375, "learning_rate": 4.661072845846746e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 129678217.0, "step": 39525 }, { "entropy": 0.040480018593370914, "epoch": 9.214477211796247, "grad_norm": 0.10986328125, "learning_rate": 4.660968331276791e-05, "loss": 0.0008, "mean_token_accuracy": 0.9999004542827606, "num_tokens": 129720188.0, "step": 39530 }, { "entropy": 0.060479015298187735, "epoch": 9.215642848816879, "grad_norm": 0.06591796875, "learning_rate": 4.660863803122928e-05, "loss": 0.0039, "mean_token_accuracy": 0.9987586557865142, "num_tokens": 129731917.0, "step": 39535 }, { "entropy": 0.06452511474490166, "epoch": 9.216808485837511, "grad_norm": 0.029296875, "learning_rate": 4.660759261386717e-05, "loss": 0.0021, "mean_token_accuracy": 0.9995082080364227, "num_tokens": 129743041.0, "step": 39540 }, { "entropy": 0.048442641738802195, "epoch": 9.217974122858141, "grad_norm": 0.134765625, "learning_rate": 4.6606547060697145e-05, "loss": 0.001, "mean_token_accuracy": 0.9992859959602356, "num_tokens": 129772704.0, "step": 39545 }, { "entropy": 0.06779967844486237, "epoch": 9.219139759878773, "grad_norm": 0.05810546875, "learning_rate": 4.6605501371734804e-05, "loss": 0.0041, "mean_token_accuracy": 0.9993433296680451, "num_tokens": 129782439.0, "step": 39550 }, { "entropy": 0.04868889665231109, "epoch": 9.220305396899406, "grad_norm": 0.12451171875, "learning_rate": 4.660445554699575e-05, "loss": 0.0009, "mean_token_accuracy": 0.9996962010860443, "num_tokens": 129817338.0, "step": 39555 }, { "entropy": 0.047591369785368445, "epoch": 9.221471033920038, "grad_norm": 0.380859375, "learning_rate": 4.660340958649557e-05, "loss": 0.0006, "mean_token_accuracy": 0.9998111546039581, "num_tokens": 129848844.0, "step": 39560 }, { "entropy": 0.06939591914415359, "epoch": 9.22263667094067, "grad_norm": 0.23828125, "learning_rate": 4.660236349024985e-05, "loss": 0.0008, "mean_token_accuracy": 0.9993773400783539, "num_tokens": 129871471.0, "step": 39565 }, { "entropy": 0.05595292616635561, "epoch": 9.2238023079613, "grad_norm": 0.05908203125, "learning_rate": 4.6601317258274214e-05, "loss": 0.0006, "mean_token_accuracy": 1.0, "num_tokens": 129885167.0, "step": 39570 }, { "entropy": 0.06748554203659296, "epoch": 9.224967944981932, "grad_norm": 2.265625, "learning_rate": 4.660027089058424e-05, "loss": 0.0015, "mean_token_accuracy": 0.9995475113391876, "num_tokens": 129894753.0, "step": 39575 }, { "entropy": 0.08345902096480132, "epoch": 9.226133582002564, "grad_norm": 0.060302734375, "learning_rate": 4.6599224387195537e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 129907997.0, "step": 39580 }, { "entropy": 0.0697068564593792, "epoch": 9.227299219023196, "grad_norm": 0.330078125, "learning_rate": 4.659817774812372e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999857664108276, "num_tokens": 129926056.0, "step": 39585 }, { "entropy": 0.045826638396829364, "epoch": 9.228464856043828, "grad_norm": 0.0986328125, "learning_rate": 4.659713097338438e-05, "loss": 0.0008, "mean_token_accuracy": 0.9999352037906647, "num_tokens": 129956239.0, "step": 39590 }, { "entropy": 0.05160574847832322, "epoch": 9.22963049306446, "grad_norm": 0.01611328125, "learning_rate": 4.659608406299314e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999892354011536, "num_tokens": 129980573.0, "step": 39595 }, { "entropy": 0.049797141645103696, "epoch": 9.230796130085091, "grad_norm": 0.197265625, "learning_rate": 4.65950370169656e-05, "loss": 0.0025, "mean_token_accuracy": 0.9998175919055938, "num_tokens": 130000508.0, "step": 39600 }, { "entropy": 0.06450515007600188, "epoch": 9.231961767105723, "grad_norm": 0.1669921875, "learning_rate": 4.659398983531739e-05, "loss": 0.0008, "mean_token_accuracy": 0.9999364972114563, "num_tokens": 130031099.0, "step": 39605 }, { "entropy": 0.05456445217132568, "epoch": 9.233127404126355, "grad_norm": 0.051025390625, "learning_rate": 4.65929425180641e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999895572662354, "num_tokens": 130061066.0, "step": 39610 }, { "entropy": 0.05030291229486465, "epoch": 9.234293041146987, "grad_norm": 0.1748046875, "learning_rate": 4.659189506522137e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999889194965362, "num_tokens": 130087255.0, "step": 39615 }, { "entropy": 0.05618452290073037, "epoch": 9.23545867816762, "grad_norm": 0.083984375, "learning_rate": 4.6590847476804803e-05, "loss": 0.0008, "mean_token_accuracy": 0.9997907936573028, "num_tokens": 130104918.0, "step": 39620 }, { "entropy": 0.07257263027131558, "epoch": 9.23662431518825, "grad_norm": 0.0849609375, "learning_rate": 4.658979975283003e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 130117627.0, "step": 39625 }, { "entropy": 0.05578606994822621, "epoch": 9.237789952208882, "grad_norm": 0.19140625, "learning_rate": 4.6588751893312676e-05, "loss": 0.0007, "mean_token_accuracy": 0.9997557580471039, "num_tokens": 130141689.0, "step": 39630 }, { "entropy": 0.05225554727949202, "epoch": 9.238955589229514, "grad_norm": 0.248046875, "learning_rate": 4.6587703898268353e-05, "loss": 0.001, "mean_token_accuracy": 0.9997475385665894, "num_tokens": 130167512.0, "step": 39635 }, { "entropy": 0.0557454289868474, "epoch": 9.240121226250146, "grad_norm": 0.0162353515625, "learning_rate": 4.65866557677127e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 130180103.0, "step": 39640 }, { "entropy": 0.06349661573767662, "epoch": 9.241286863270778, "grad_norm": 0.220703125, "learning_rate": 4.658560750166134e-05, "loss": 0.0044, "mean_token_accuracy": 0.9994892954826355, "num_tokens": 130197775.0, "step": 39645 }, { "entropy": 0.048552720621228215, "epoch": 9.242452500291408, "grad_norm": 0.05615234375, "learning_rate": 4.658455910012991e-05, "loss": 0.0005, "mean_token_accuracy": 0.9997536957263946, "num_tokens": 130217603.0, "step": 39650 }, { "entropy": 0.050778523366898295, "epoch": 9.24361813731204, "grad_norm": 1.265625, "learning_rate": 4.658351056313404e-05, "loss": 0.0033, "mean_token_accuracy": 0.9992936372756958, "num_tokens": 130249221.0, "step": 39655 }, { "entropy": 0.059298649057745935, "epoch": 9.244783774332673, "grad_norm": 0.1015625, "learning_rate": 4.6582461890689354e-05, "loss": 0.0011, "mean_token_accuracy": 0.9999761939048767, "num_tokens": 130267756.0, "step": 39660 }, { "entropy": 0.05675745829939842, "epoch": 9.245949411353305, "grad_norm": 0.1533203125, "learning_rate": 4.6581413082811514e-05, "loss": 0.0021, "mean_token_accuracy": 0.9996394336223602, "num_tokens": 130287265.0, "step": 39665 }, { "entropy": 0.07041055001318455, "epoch": 9.247115048373937, "grad_norm": 2.921875, "learning_rate": 4.658036413951614e-05, "loss": 0.0054, "mean_token_accuracy": 0.9992876887321472, "num_tokens": 130297803.0, "step": 39670 }, { "entropy": 0.056393209099769595, "epoch": 9.248280685394569, "grad_norm": 0.875, "learning_rate": 4.657931506081889e-05, "loss": 0.0009, "mean_token_accuracy": 0.9997631192207337, "num_tokens": 130314472.0, "step": 39675 }, { "entropy": 0.060738119576126334, "epoch": 9.2494463224152, "grad_norm": 0.1259765625, "learning_rate": 4.657826584673538e-05, "loss": 0.001, "mean_token_accuracy": 0.9999781250953674, "num_tokens": 130328923.0, "step": 39680 }, { "entropy": 0.0554684535600245, "epoch": 9.250611959435831, "grad_norm": 0.107421875, "learning_rate": 4.6577216497281275e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999745726585388, "num_tokens": 130344597.0, "step": 39685 }, { "entropy": 0.05555789954960346, "epoch": 9.251777596456463, "grad_norm": 0.0966796875, "learning_rate": 4.657616701247222e-05, "loss": 0.001, "mean_token_accuracy": 0.9996960461139679, "num_tokens": 130365321.0, "step": 39690 }, { "entropy": 0.056109121069312096, "epoch": 9.252943233477096, "grad_norm": 0.1630859375, "learning_rate": 4.657511739232387e-05, "loss": 0.0009, "mean_token_accuracy": 0.9996614396572113, "num_tokens": 130385100.0, "step": 39695 }, { "entropy": 0.06600268706679344, "epoch": 9.254108870497728, "grad_norm": 0.29296875, "learning_rate": 4.657406763685187e-05, "loss": 0.0032, "mean_token_accuracy": 0.9989592075347901, "num_tokens": 130397765.0, "step": 39700 }, { "entropy": 0.059470337629318235, "epoch": 9.255274507518358, "grad_norm": 0.1259765625, "learning_rate": 4.657301774607187e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 130408436.0, "step": 39705 }, { "entropy": 0.04764430820941925, "epoch": 9.25644014453899, "grad_norm": 0.2294921875, "learning_rate": 4.6571967719999524e-05, "loss": 0.0008, "mean_token_accuracy": 0.9999611496925354, "num_tokens": 130428273.0, "step": 39710 }, { "entropy": 0.04987523974850774, "epoch": 9.257605781559622, "grad_norm": 0.126953125, "learning_rate": 4.6570917558650495e-05, "loss": 0.0013, "mean_token_accuracy": 0.9997695863246918, "num_tokens": 130441734.0, "step": 39715 }, { "entropy": 0.03264869209378958, "epoch": 9.258771418580254, "grad_norm": 0.1611328125, "learning_rate": 4.656986726204044e-05, "loss": 0.0007, "mean_token_accuracy": 0.9999175131320953, "num_tokens": 130487344.0, "step": 39720 }, { "entropy": 0.05826353104785085, "epoch": 9.259937055600886, "grad_norm": 0.2109375, "learning_rate": 4.656881683018503e-05, "loss": 0.0014, "mean_token_accuracy": 0.9995794057846069, "num_tokens": 130507878.0, "step": 39725 }, { "entropy": 0.06025405712425709, "epoch": 9.261102692621517, "grad_norm": 0.1376953125, "learning_rate": 4.6567766263099925e-05, "loss": 0.0028, "mean_token_accuracy": 0.9991832613945008, "num_tokens": 130527697.0, "step": 39730 }, { "entropy": 0.07366060577332974, "epoch": 9.262268329642149, "grad_norm": 0.1142578125, "learning_rate": 4.656671556080078e-05, "loss": 0.0014, "mean_token_accuracy": 0.9995670974254608, "num_tokens": 130538675.0, "step": 39735 }, { "entropy": 0.07155503174290061, "epoch": 9.263433966662781, "grad_norm": 0.08984375, "learning_rate": 4.656566472330326e-05, "loss": 0.0004, "mean_token_accuracy": 0.9996869444847107, "num_tokens": 130563391.0, "step": 39740 }, { "entropy": 0.037785251764580606, "epoch": 9.264599603683413, "grad_norm": 0.1484375, "learning_rate": 4.6564613750623054e-05, "loss": 0.0011, "mean_token_accuracy": 0.9999007999897003, "num_tokens": 130594860.0, "step": 39745 }, { "entropy": 0.07801759839057923, "epoch": 9.265765240704045, "grad_norm": 0.0257568359375, "learning_rate": 4.656356264277582e-05, "loss": 0.001, "mean_token_accuracy": 0.9996275663375854, "num_tokens": 130616197.0, "step": 39750 }, { "entropy": 0.05990429036319256, "epoch": 9.266930877724677, "grad_norm": 0.0184326171875, "learning_rate": 4.656251139977724e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999756574630737, "num_tokens": 130640769.0, "step": 39755 }, { "entropy": 0.066241804510355, "epoch": 9.268096514745308, "grad_norm": 0.43359375, "learning_rate": 4.6561460021642974e-05, "loss": 0.0004, "mean_token_accuracy": 0.9997572839260102, "num_tokens": 130656744.0, "step": 39760 }, { "entropy": 0.06499183923006058, "epoch": 9.26926215176594, "grad_norm": 0.1767578125, "learning_rate": 4.656040850838872e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999786257743836, "num_tokens": 130675036.0, "step": 39765 }, { "entropy": 0.0647053737193346, "epoch": 9.270427788786572, "grad_norm": 0.026123046875, "learning_rate": 4.6559356860030146e-05, "loss": 0.0036, "mean_token_accuracy": 0.9986259639263153, "num_tokens": 130687371.0, "step": 39770 }, { "entropy": 0.06163849774748087, "epoch": 9.271593425807204, "grad_norm": 0.59375, "learning_rate": 4.655830507658293e-05, "loss": 0.0005, "mean_token_accuracy": 0.999989140033722, "num_tokens": 130708757.0, "step": 39775 }, { "entropy": 0.078371412307024, "epoch": 9.272759062827836, "grad_norm": 0.043212890625, "learning_rate": 4.6557253158062765e-05, "loss": 0.0024, "mean_token_accuracy": 0.9997084558010101, "num_tokens": 130721840.0, "step": 39780 }, { "entropy": 0.05995170101523399, "epoch": 9.273924699848466, "grad_norm": 1.4921875, "learning_rate": 4.655620110448533e-05, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 130738473.0, "step": 39785 }, { "entropy": 0.05594620313495398, "epoch": 9.275090336869098, "grad_norm": 0.1279296875, "learning_rate": 4.6555148915866316e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999889075756073, "num_tokens": 130764506.0, "step": 39790 }, { "entropy": 0.05183468423783779, "epoch": 9.27625597388973, "grad_norm": 0.267578125, "learning_rate": 4.6554096592221406e-05, "loss": 0.0003, "mean_token_accuracy": 0.9997442483901977, "num_tokens": 130781308.0, "step": 39795 }, { "entropy": 0.04706506533548236, "epoch": 9.277421610910363, "grad_norm": 0.09814453125, "learning_rate": 4.655304413356631e-05, "loss": 0.0006, "mean_token_accuracy": 0.9997573494911194, "num_tokens": 130815566.0, "step": 39800 }, { "entropy": 0.07166340351104736, "epoch": 9.278587247930995, "grad_norm": 0.048095703125, "learning_rate": 4.65519915399167e-05, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 130826851.0, "step": 39805 }, { "entropy": 0.06749443728476763, "epoch": 9.279752884951627, "grad_norm": 0.5859375, "learning_rate": 4.6550938811288285e-05, "loss": 0.0013, "mean_token_accuracy": 0.9997927844524384, "num_tokens": 130854821.0, "step": 39810 }, { "entropy": 0.04661665195599198, "epoch": 9.280918521972257, "grad_norm": 0.2294921875, "learning_rate": 4.654988594769676e-05, "loss": 0.0014, "mean_token_accuracy": 0.9997952282428741, "num_tokens": 130874687.0, "step": 39815 }, { "entropy": 0.08724109530448913, "epoch": 9.28208415899289, "grad_norm": 0.053955078125, "learning_rate": 4.654883294915782e-05, "loss": 0.0181, "mean_token_accuracy": 0.9981146395206452, "num_tokens": 130901068.0, "step": 39820 }, { "entropy": 0.08578491769731045, "epoch": 9.283249796013521, "grad_norm": 0.1259765625, "learning_rate": 4.654777981568717e-05, "loss": 0.0007, "mean_token_accuracy": 0.9997267782688141, "num_tokens": 130911620.0, "step": 39825 }, { "entropy": 0.040522088576108214, "epoch": 9.284415433034154, "grad_norm": 0.1826171875, "learning_rate": 4.654672654730052e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999558866024018, "num_tokens": 130942124.0, "step": 39830 }, { "entropy": 0.06131479572504759, "epoch": 9.285581070054786, "grad_norm": 0.10693359375, "learning_rate": 4.654567314401356e-05, "loss": 0.0018, "mean_token_accuracy": 0.9995066106319428, "num_tokens": 130975937.0, "step": 39835 }, { "entropy": 0.05530798006802797, "epoch": 9.286746707075416, "grad_norm": 0.052734375, "learning_rate": 4.654461960584202e-05, "loss": 0.0016, "mean_token_accuracy": 0.9994952142238617, "num_tokens": 130988252.0, "step": 39840 }, { "entropy": 0.054783723689615725, "epoch": 9.287912344096048, "grad_norm": 0.1787109375, "learning_rate": 4.654356593280159e-05, "loss": 0.0019, "mean_token_accuracy": 0.9993468701839447, "num_tokens": 131023807.0, "step": 39845 }, { "entropy": 0.05474893264472484, "epoch": 9.28907798111668, "grad_norm": 0.1357421875, "learning_rate": 4.6542512124907995e-05, "loss": 0.0008, "mean_token_accuracy": 0.9996736764907836, "num_tokens": 131050692.0, "step": 39850 }, { "entropy": 0.05652236472815275, "epoch": 9.290243618137312, "grad_norm": 0.453125, "learning_rate": 4.654145818217694e-05, "loss": 0.002, "mean_token_accuracy": 0.9995680093765259, "num_tokens": 131067738.0, "step": 39855 }, { "entropy": 0.06417035115882755, "epoch": 9.291409255157944, "grad_norm": 0.05126953125, "learning_rate": 4.6540404104624144e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999670863151551, "num_tokens": 131093753.0, "step": 39860 }, { "entropy": 0.06767412256449461, "epoch": 9.292574892178575, "grad_norm": 0.20703125, "learning_rate": 4.6539349892265324e-05, "loss": 0.0011, "mean_token_accuracy": 0.9997669279575347, "num_tokens": 131106916.0, "step": 39865 }, { "entropy": 0.052419050503522156, "epoch": 9.293740529199207, "grad_norm": 0.0301513671875, "learning_rate": 4.6538295545116206e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999543309211731, "num_tokens": 131135751.0, "step": 39870 }, { "entropy": 0.04931356767192483, "epoch": 9.294906166219839, "grad_norm": 0.2265625, "learning_rate": 4.6537241063192504e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999555230140686, "num_tokens": 131159416.0, "step": 39875 }, { "entropy": 0.055171745270490645, "epoch": 9.296071803240471, "grad_norm": 0.04931640625, "learning_rate": 4.653618644650995e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 131172872.0, "step": 39880 }, { "entropy": 0.05928640691563487, "epoch": 9.297237440261103, "grad_norm": 0.037353515625, "learning_rate": 4.6535131695084255e-05, "loss": 0.0015, "mean_token_accuracy": 0.9997416019439698, "num_tokens": 131184609.0, "step": 39885 }, { "entropy": 0.04858807288110256, "epoch": 9.298403077281735, "grad_norm": 0.09326171875, "learning_rate": 4.653407680893116e-05, "loss": 0.0054, "mean_token_accuracy": 0.9997319042682647, "num_tokens": 131217075.0, "step": 39890 }, { "entropy": 0.04870633902028203, "epoch": 9.299568714302366, "grad_norm": 0.06884765625, "learning_rate": 4.65330217880664e-05, "loss": 0.0004, "mean_token_accuracy": 0.999982088804245, "num_tokens": 131236468.0, "step": 39895 }, { "entropy": 0.06770975720137358, "epoch": 9.300734351322998, "grad_norm": 0.109375, "learning_rate": 4.653196663250569e-05, "loss": 0.0034, "mean_token_accuracy": 0.9991342008113862, "num_tokens": 131254232.0, "step": 39900 }, { "entropy": 0.051382277719676496, "epoch": 9.30189998834363, "grad_norm": 0.05810546875, "learning_rate": 4.653091134226478e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999669253826141, "num_tokens": 131279555.0, "step": 39905 }, { "entropy": 0.05385262481868267, "epoch": 9.303065625364262, "grad_norm": 0.1787109375, "learning_rate": 4.652985591735939e-05, "loss": 0.0007, "mean_token_accuracy": 0.99990394115448, "num_tokens": 131290450.0, "step": 39910 }, { "entropy": 0.05229444522410631, "epoch": 9.304231262384894, "grad_norm": 1.8671875, "learning_rate": 4.652880035780527e-05, "loss": 0.0045, "mean_token_accuracy": 0.9993044972419739, "num_tokens": 131314618.0, "step": 39915 }, { "entropy": 0.06551222130656242, "epoch": 9.305396899405524, "grad_norm": 0.302734375, "learning_rate": 4.652774466361815e-05, "loss": 0.0013, "mean_token_accuracy": 0.9998004019260407, "num_tokens": 131325817.0, "step": 39920 }, { "entropy": 0.07701365770772099, "epoch": 9.306562536426156, "grad_norm": 0.04833984375, "learning_rate": 4.652668883481379e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999864161014557, "num_tokens": 131345787.0, "step": 39925 }, { "entropy": 0.06038803607225418, "epoch": 9.307728173446788, "grad_norm": 0.205078125, "learning_rate": 4.652563287140792e-05, "loss": 0.0063, "mean_token_accuracy": 0.9997166514396667, "num_tokens": 131368437.0, "step": 39930 }, { "entropy": 0.0589005762245506, "epoch": 9.30889381046742, "grad_norm": 0.232421875, "learning_rate": 4.652457677341629e-05, "loss": 0.0016, "mean_token_accuracy": 0.9997618794441223, "num_tokens": 131393813.0, "step": 39935 }, { "entropy": 0.06715304385870695, "epoch": 9.310059447488053, "grad_norm": 0.279296875, "learning_rate": 4.6523520540854644e-05, "loss": 0.0015, "mean_token_accuracy": 0.9997555017471313, "num_tokens": 131406815.0, "step": 39940 }, { "entropy": 0.05549956224858761, "epoch": 9.311225084508685, "grad_norm": 0.043212890625, "learning_rate": 4.652246417373873e-05, "loss": 0.0012, "mean_token_accuracy": 0.9997076034545899, "num_tokens": 131418869.0, "step": 39945 }, { "entropy": 0.05067304102703929, "epoch": 9.312390721529315, "grad_norm": 0.2021484375, "learning_rate": 4.652140767208431e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999592304229736, "num_tokens": 131441594.0, "step": 39950 }, { "entropy": 0.06731152012944222, "epoch": 9.313556358549947, "grad_norm": 0.0302734375, "learning_rate": 4.652035103590713e-05, "loss": 0.0022, "mean_token_accuracy": 0.9997109830379486, "num_tokens": 131451038.0, "step": 39955 }, { "entropy": 0.05239680912345648, "epoch": 9.31472199557058, "grad_norm": 0.0927734375, "learning_rate": 4.6519294265222954e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 131473175.0, "step": 39960 }, { "entropy": 0.05735105858184397, "epoch": 9.315887632591211, "grad_norm": 0.0216064453125, "learning_rate": 4.651823736004753e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999798119068146, "num_tokens": 131489315.0, "step": 39965 }, { "entropy": 0.04591607404872775, "epoch": 9.317053269611844, "grad_norm": 0.388671875, "learning_rate": 4.6517180320396624e-05, "loss": 0.0022, "mean_token_accuracy": 0.9985618889331818, "num_tokens": 131515106.0, "step": 39970 }, { "entropy": 0.059040891379117964, "epoch": 9.318218906632474, "grad_norm": 0.07373046875, "learning_rate": 4.6516123146285995e-05, "loss": 0.0006, "mean_token_accuracy": 1.0, "num_tokens": 131536732.0, "step": 39975 }, { "entropy": 0.05244419164955616, "epoch": 9.319384543653106, "grad_norm": 0.1201171875, "learning_rate": 4.651506583773141e-05, "loss": 0.0022, "mean_token_accuracy": 0.9999791860580445, "num_tokens": 131563292.0, "step": 39980 }, { "entropy": 0.06199465803802014, "epoch": 9.320550180673738, "grad_norm": 0.041259765625, "learning_rate": 4.651400839474863e-05, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 131572957.0, "step": 39985 }, { "entropy": 0.04819637825712562, "epoch": 9.32171581769437, "grad_norm": 0.04345703125, "learning_rate": 4.651295081735344e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 131592367.0, "step": 39990 }, { "entropy": 0.05749725298956036, "epoch": 9.322881454715002, "grad_norm": 1.6953125, "learning_rate": 4.651189310556158e-05, "loss": 0.0014, "mean_token_accuracy": 0.9995614051818847, "num_tokens": 131619144.0, "step": 39995 }, { "entropy": 0.04768976755440235, "epoch": 9.324047091735633, "grad_norm": 0.1337890625, "learning_rate": 4.651083525938885e-05, "loss": 0.0008, "mean_token_accuracy": 0.9995601296424865, "num_tokens": 131639167.0, "step": 40000 }, { "entropy": 0.049379723891615865, "epoch": 9.325212728756265, "grad_norm": 0.130859375, "learning_rate": 4.6509777278851015e-05, "loss": 0.0019, "mean_token_accuracy": 0.9997111022472381, "num_tokens": 131664826.0, "step": 40005 }, { "entropy": 0.07187463045120239, "epoch": 9.326378365776897, "grad_norm": 0.09765625, "learning_rate": 4.650871916396384e-05, "loss": 0.0006, "mean_token_accuracy": 1.0, "num_tokens": 131673814.0, "step": 40010 }, { "entropy": 0.053065793495625256, "epoch": 9.327544002797529, "grad_norm": 1.6875, "learning_rate": 4.650766091474312e-05, "loss": 0.0013, "mean_token_accuracy": 0.9996896207332611, "num_tokens": 131695962.0, "step": 40015 }, { "entropy": 0.06435949224978685, "epoch": 9.328709639818161, "grad_norm": 0.11474609375, "learning_rate": 4.650660253120462e-05, "loss": 0.0007, "mean_token_accuracy": 0.9999523639678956, "num_tokens": 131723799.0, "step": 40020 }, { "entropy": 0.04089836934581399, "epoch": 9.329875276838793, "grad_norm": 0.5546875, "learning_rate": 4.650554401336414e-05, "loss": 0.0005, "mean_token_accuracy": 0.9998673915863037, "num_tokens": 131748144.0, "step": 40025 }, { "entropy": 0.06390482522547244, "epoch": 9.331040913859423, "grad_norm": 0.5234375, "learning_rate": 4.650448536123745e-05, "loss": 0.0061, "mean_token_accuracy": 0.9995572030544281, "num_tokens": 131767343.0, "step": 40030 }, { "entropy": 0.0792075976729393, "epoch": 9.332206550880056, "grad_norm": 0.1826171875, "learning_rate": 4.6503426574840337e-05, "loss": 0.003, "mean_token_accuracy": 0.9991525411605835, "num_tokens": 131777276.0, "step": 40035 }, { "entropy": 0.04253325518220663, "epoch": 9.333372187900688, "grad_norm": 0.01611328125, "learning_rate": 4.650236765418859e-05, "loss": 0.0007, "mean_token_accuracy": 0.9999195039272308, "num_tokens": 131827608.0, "step": 40040 }, { "entropy": 0.0662536833435297, "epoch": 9.33453782492132, "grad_norm": 0.1416015625, "learning_rate": 4.6501308599298e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 131838436.0, "step": 40045 }, { "entropy": 0.04661760358139873, "epoch": 9.335703461941952, "grad_norm": 0.0225830078125, "learning_rate": 4.6500249410184365e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999865055084228, "num_tokens": 131860393.0, "step": 40050 }, { "entropy": 0.05046624289825559, "epoch": 9.336869098962582, "grad_norm": 0.1865234375, "learning_rate": 4.6499190086863476e-05, "loss": 0.0033, "mean_token_accuracy": 0.9986062943935394, "num_tokens": 131896517.0, "step": 40055 }, { "entropy": 0.043066446855664256, "epoch": 9.338034735983214, "grad_norm": 0.06396484375, "learning_rate": 4.649813062935112e-05, "loss": 0.0049, "mean_token_accuracy": 0.9996003627777099, "num_tokens": 131927175.0, "step": 40060 }, { "entropy": 0.0638367710635066, "epoch": 9.339200373003846, "grad_norm": 0.043701171875, "learning_rate": 4.649707103766311e-05, "loss": 0.003, "mean_token_accuracy": 0.999473923444748, "num_tokens": 131941489.0, "step": 40065 }, { "entropy": 0.052022173814475534, "epoch": 9.340366010024479, "grad_norm": 0.1455078125, "learning_rate": 4.649601131181524e-05, "loss": 0.0018, "mean_token_accuracy": 0.9986912429332733, "num_tokens": 131977336.0, "step": 40070 }, { "entropy": 0.04792749881744385, "epoch": 9.34153164704511, "grad_norm": 0.0927734375, "learning_rate": 4.64949514518233e-05, "loss": 0.0008, "mean_token_accuracy": 0.9998324334621429, "num_tokens": 132000748.0, "step": 40075 }, { "entropy": 0.04526071464642882, "epoch": 9.342697284065743, "grad_norm": 0.06591796875, "learning_rate": 4.649389145770311e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 132020054.0, "step": 40080 }, { "entropy": 0.1261118996888399, "epoch": 9.343862921086373, "grad_norm": 0.056396484375, "learning_rate": 4.649283132947047e-05, "loss": 0.1482, "mean_token_accuracy": 0.9802892565727234, "num_tokens": 132041911.0, "step": 40085 }, { "entropy": 0.08652307861484587, "epoch": 9.345028558107005, "grad_norm": 0.1513671875, "learning_rate": 4.6491771067141186e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999862551689148, "num_tokens": 132058753.0, "step": 40090 }, { "entropy": 0.05955226179212332, "epoch": 9.346194195127637, "grad_norm": 0.134765625, "learning_rate": 4.6490710670731075e-05, "loss": 0.0005, "mean_token_accuracy": 0.999969094991684, "num_tokens": 132080898.0, "step": 40095 }, { "entropy": 0.05750239389017224, "epoch": 9.34735983214827, "grad_norm": 0.05322265625, "learning_rate": 4.648965014025595e-05, "loss": 0.003, "mean_token_accuracy": 0.9995156407356263, "num_tokens": 132106290.0, "step": 40100 }, { "entropy": 0.05300535634160042, "epoch": 9.348525469168901, "grad_norm": 0.19140625, "learning_rate": 4.6488589475731614e-05, "loss": 0.0008, "mean_token_accuracy": 0.9998235583305359, "num_tokens": 132129402.0, "step": 40105 }, { "entropy": 0.06626793816685676, "epoch": 9.349691106189532, "grad_norm": 0.5390625, "learning_rate": 4.648752867717389e-05, "loss": 0.0016, "mean_token_accuracy": 0.9997787594795227, "num_tokens": 132146370.0, "step": 40110 }, { "entropy": 0.06416756641119718, "epoch": 9.350856743210164, "grad_norm": 2.078125, "learning_rate": 4.6486467744598596e-05, "loss": 0.0007, "mean_token_accuracy": 0.99963099360466, "num_tokens": 132157773.0, "step": 40115 }, { "entropy": 0.051140221767127514, "epoch": 9.352022380230796, "grad_norm": 1.1875, "learning_rate": 4.6485406678021546e-05, "loss": 0.0012, "mean_token_accuracy": 0.9996812820434571, "num_tokens": 132182802.0, "step": 40120 }, { "entropy": 0.05248261038213968, "epoch": 9.353188017251428, "grad_norm": 0.185546875, "learning_rate": 4.648434547745858e-05, "loss": 0.0013, "mean_token_accuracy": 0.9997892618179322, "num_tokens": 132204779.0, "step": 40125 }, { "entropy": 0.06364800855517387, "epoch": 9.35435365427206, "grad_norm": 0.0302734375, "learning_rate": 4.64832841429255e-05, "loss": 0.002, "mean_token_accuracy": 0.999296373128891, "num_tokens": 132216288.0, "step": 40130 }, { "entropy": 0.061640280019491914, "epoch": 9.35551929129269, "grad_norm": 0.08154296875, "learning_rate": 4.6482222674438147e-05, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 132229424.0, "step": 40135 }, { "entropy": 0.04232308492064476, "epoch": 9.356684928313323, "grad_norm": 0.90625, "learning_rate": 4.6481161072012334e-05, "loss": 0.0007, "mean_token_accuracy": 0.9995788276195526, "num_tokens": 132247241.0, "step": 40140 }, { "entropy": 0.06577129755169153, "epoch": 9.357850565333955, "grad_norm": 0.050048828125, "learning_rate": 4.648009933566391e-05, "loss": 0.0012, "mean_token_accuracy": 0.9996784567832947, "num_tokens": 132261612.0, "step": 40145 }, { "entropy": 0.055283906683325765, "epoch": 9.359016202354587, "grad_norm": 0.10498046875, "learning_rate": 4.64790374654087e-05, "loss": 0.0008, "mean_token_accuracy": 0.9997281730175018, "num_tokens": 132279653.0, "step": 40150 }, { "entropy": 0.057182838954031465, "epoch": 9.360181839375219, "grad_norm": 0.1591796875, "learning_rate": 4.6477975461262535e-05, "loss": 0.0035, "mean_token_accuracy": 0.9997178077697754, "num_tokens": 132308947.0, "step": 40155 }, { "entropy": 0.05807437375187874, "epoch": 9.361347476395851, "grad_norm": 0.8671875, "learning_rate": 4.647691332324124e-05, "loss": 0.0014, "mean_token_accuracy": 0.9994499266147614, "num_tokens": 132319937.0, "step": 40160 }, { "entropy": 0.05894612278789282, "epoch": 9.362513113416481, "grad_norm": 0.11572265625, "learning_rate": 4.647585105136068e-05, "loss": 0.0016, "mean_token_accuracy": 0.9993421077728272, "num_tokens": 132329099.0, "step": 40165 }, { "entropy": 0.060213756002485755, "epoch": 9.363678750437114, "grad_norm": 0.083984375, "learning_rate": 4.647478864563668e-05, "loss": 0.0008, "mean_token_accuracy": 0.9995322227478027, "num_tokens": 132367396.0, "step": 40170 }, { "entropy": 0.054358705133199695, "epoch": 9.364844387457746, "grad_norm": 0.0908203125, "learning_rate": 4.6473726106085076e-05, "loss": 0.0011, "mean_token_accuracy": 0.9993378758430481, "num_tokens": 132398650.0, "step": 40175 }, { "entropy": 0.06580995731055736, "epoch": 9.366010024478378, "grad_norm": 0.0576171875, "learning_rate": 4.647266343272172e-05, "loss": 0.0007, "mean_token_accuracy": 0.9998428761959076, "num_tokens": 132418663.0, "step": 40180 }, { "entropy": 0.09032328445464373, "epoch": 9.36717566149901, "grad_norm": 0.47265625, "learning_rate": 4.647160062556246e-05, "loss": 0.0009, "mean_token_accuracy": 0.999989515542984, "num_tokens": 132436254.0, "step": 40185 }, { "entropy": 0.04208869868889451, "epoch": 9.36834129851964, "grad_norm": 0.373046875, "learning_rate": 4.647053768462313e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999783515930176, "num_tokens": 132467998.0, "step": 40190 }, { "entropy": 0.061600181832909585, "epoch": 9.369506935540272, "grad_norm": 0.478515625, "learning_rate": 4.646947460991961e-05, "loss": 0.0019, "mean_token_accuracy": 0.999874085187912, "num_tokens": 132491206.0, "step": 40195 }, { "entropy": 0.059306449443101886, "epoch": 9.370672572560904, "grad_norm": 0.0986328125, "learning_rate": 4.646841140146771e-05, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 132511274.0, "step": 40200 }, { "entropy": 0.04950930029153824, "epoch": 9.371838209581536, "grad_norm": 0.047119140625, "learning_rate": 4.646734805928332e-05, "loss": 0.0016, "mean_token_accuracy": 0.9993799746036529, "num_tokens": 132531833.0, "step": 40205 }, { "entropy": 0.07071809405460953, "epoch": 9.373003846602169, "grad_norm": 0.1640625, "learning_rate": 4.646628458338228e-05, "loss": 0.0013, "mean_token_accuracy": 0.9997894406318665, "num_tokens": 132556517.0, "step": 40210 }, { "entropy": 0.05042863581329584, "epoch": 9.3741694836228, "grad_norm": 0.384765625, "learning_rate": 4.6465220973780455e-05, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 132571158.0, "step": 40215 }, { "entropy": 0.057017339020967485, "epoch": 9.375335120643431, "grad_norm": 0.271484375, "learning_rate": 4.64641572304937e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 132582965.0, "step": 40220 }, { "entropy": 0.06408351408317685, "epoch": 9.376500757664063, "grad_norm": 0.09716796875, "learning_rate": 4.646309335353787e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999407470226288, "num_tokens": 132609368.0, "step": 40225 }, { "entropy": 0.055050110910087824, "epoch": 9.377666394684695, "grad_norm": 0.11474609375, "learning_rate": 4.646202934292884e-05, "loss": 0.0005, "mean_token_accuracy": 0.9998552739620209, "num_tokens": 132634237.0, "step": 40230 }, { "entropy": 0.0654405765235424, "epoch": 9.378832031705327, "grad_norm": 1.15625, "learning_rate": 4.646096519868248e-05, "loss": 0.0104, "mean_token_accuracy": 0.9984454095363617, "num_tokens": 132655238.0, "step": 40235 }, { "entropy": 0.05522510912269354, "epoch": 9.37999766872596, "grad_norm": 0.6953125, "learning_rate": 4.6459900920814645e-05, "loss": 0.0033, "mean_token_accuracy": 0.9997252762317658, "num_tokens": 132666985.0, "step": 40240 }, { "entropy": 0.06411749050021172, "epoch": 9.38116330574659, "grad_norm": 0.28515625, "learning_rate": 4.645883650934121e-05, "loss": 0.0018, "mean_token_accuracy": 0.9997101426124573, "num_tokens": 132676241.0, "step": 40245 }, { "entropy": 0.04640081143006682, "epoch": 9.382328942767222, "grad_norm": 0.130859375, "learning_rate": 4.645777196427805e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999883055686951, "num_tokens": 132696465.0, "step": 40250 }, { "entropy": 0.06816220059990882, "epoch": 9.383494579787854, "grad_norm": 0.10009765625, "learning_rate": 4.6456707285641036e-05, "loss": 0.0005, "mean_token_accuracy": 0.999319726228714, "num_tokens": 132707483.0, "step": 40255 }, { "entropy": 0.055040829442441465, "epoch": 9.384660216808486, "grad_norm": 0.1337890625, "learning_rate": 4.645564247344604e-05, "loss": 0.0015, "mean_token_accuracy": 0.9996013402938843, "num_tokens": 132728906.0, "step": 40260 }, { "entropy": 0.06292527839541436, "epoch": 9.385825853829118, "grad_norm": 0.75, "learning_rate": 4.645457752770895e-05, "loss": 0.001, "mean_token_accuracy": 0.9999794006347656, "num_tokens": 132756625.0, "step": 40265 }, { "entropy": 0.05493353232741356, "epoch": 9.386991490849748, "grad_norm": 0.1337890625, "learning_rate": 4.645351244844565e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 132769179.0, "step": 40270 }, { "entropy": 0.06623713504523039, "epoch": 9.38815712787038, "grad_norm": 0.89453125, "learning_rate": 4.6452447235672e-05, "loss": 0.0018, "mean_token_accuracy": 0.9996913552284241, "num_tokens": 132781794.0, "step": 40275 }, { "entropy": 0.04920391724444926, "epoch": 9.389322764891013, "grad_norm": 0.294921875, "learning_rate": 4.6451381889403895e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999530673027038, "num_tokens": 132810819.0, "step": 40280 }, { "entropy": 0.05621719462797046, "epoch": 9.390488401911645, "grad_norm": 0.1572265625, "learning_rate": 4.645031640965722e-05, "loss": 0.0006, "mean_token_accuracy": 0.999968808889389, "num_tokens": 132836542.0, "step": 40285 }, { "entropy": 0.06389943808317185, "epoch": 9.391654038932277, "grad_norm": 0.65234375, "learning_rate": 4.644925079644788e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999892890453339, "num_tokens": 132855480.0, "step": 40290 }, { "entropy": 0.059335741028189656, "epoch": 9.392819675952909, "grad_norm": 0.058837890625, "learning_rate": 4.644818504979174e-05, "loss": 0.0011, "mean_token_accuracy": 0.9992248058319092, "num_tokens": 132865545.0, "step": 40295 }, { "entropy": 0.04476952906697988, "epoch": 9.39398531297354, "grad_norm": 0.038818359375, "learning_rate": 4.644711916970471e-05, "loss": 0.0009, "mean_token_accuracy": 1.0, "num_tokens": 132882310.0, "step": 40300 }, { "entropy": 0.05240940302610397, "epoch": 9.395150949994171, "grad_norm": 0.058349609375, "learning_rate": 4.644605315620266e-05, "loss": 0.0009, "mean_token_accuracy": 0.999613881111145, "num_tokens": 132911227.0, "step": 40305 }, { "entropy": 0.03990960521623492, "epoch": 9.396316587014804, "grad_norm": 0.06298828125, "learning_rate": 4.644498700930152e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999681174755096, "num_tokens": 132937528.0, "step": 40310 }, { "entropy": 0.0688298974186182, "epoch": 9.397482224035436, "grad_norm": 0.039794921875, "learning_rate": 4.6443920729017154e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 132947749.0, "step": 40315 }, { "entropy": 0.07430526856333017, "epoch": 9.398647861056068, "grad_norm": 0.201171875, "learning_rate": 4.644285431536549e-05, "loss": 0.0149, "mean_token_accuracy": 0.9950378179550171, "num_tokens": 132976251.0, "step": 40320 }, { "entropy": 0.04790102792903781, "epoch": 9.399813498076698, "grad_norm": 0.103515625, "learning_rate": 4.644178776836241e-05, "loss": 0.0011, "mean_token_accuracy": 0.9993688344955445, "num_tokens": 133016802.0, "step": 40325 }, { "entropy": 0.07281846143305301, "epoch": 9.40097913509733, "grad_norm": 0.08740234375, "learning_rate": 4.644072108802383e-05, "loss": 0.0004, "mean_token_accuracy": 0.9998487114906311, "num_tokens": 133033477.0, "step": 40330 }, { "entropy": 0.04617578536272049, "epoch": 9.402144772117962, "grad_norm": 0.1357421875, "learning_rate": 4.643965427436564e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999640166759491, "num_tokens": 133065597.0, "step": 40335 }, { "entropy": 0.05054793646559119, "epoch": 9.403310409138594, "grad_norm": 0.21875, "learning_rate": 4.643858732740377e-05, "loss": 0.0011, "mean_token_accuracy": 0.9994157433509827, "num_tokens": 133090321.0, "step": 40340 }, { "entropy": 0.0676423467695713, "epoch": 9.404476046159226, "grad_norm": 0.103515625, "learning_rate": 4.643752024715412e-05, "loss": 0.0012, "mean_token_accuracy": 0.9997844815254211, "num_tokens": 133101643.0, "step": 40345 }, { "entropy": 0.05842981785535813, "epoch": 9.405641683179859, "grad_norm": 0.02490234375, "learning_rate": 4.64364530336326e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999783515930176, "num_tokens": 133121728.0, "step": 40350 }, { "entropy": 0.04886232684366405, "epoch": 9.406807320200489, "grad_norm": 0.23046875, "learning_rate": 4.643538568685511e-05, "loss": 0.001, "mean_token_accuracy": 0.9995041966438294, "num_tokens": 133153226.0, "step": 40355 }, { "entropy": 0.050247212126851085, "epoch": 9.407972957221121, "grad_norm": 0.047119140625, "learning_rate": 4.64343182068376e-05, "loss": 0.0002, "mean_token_accuracy": 0.9998877048492432, "num_tokens": 133180024.0, "step": 40360 }, { "entropy": 0.04787283539772034, "epoch": 9.409138594241753, "grad_norm": 0.03173828125, "learning_rate": 4.643325059359596e-05, "loss": 0.0005, "mean_token_accuracy": 0.999958747625351, "num_tokens": 133203070.0, "step": 40365 }, { "entropy": 0.05310492250137031, "epoch": 9.410304231262385, "grad_norm": 0.13671875, "learning_rate": 4.643218284714611e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999827980995178, "num_tokens": 133221629.0, "step": 40370 }, { "entropy": 0.08067540870979428, "epoch": 9.411469868283017, "grad_norm": 0.08349609375, "learning_rate": 4.6431114967503985e-05, "loss": 0.0013, "mean_token_accuracy": 0.9996642708778382, "num_tokens": 133249735.0, "step": 40375 }, { "entropy": 0.06417739875614643, "epoch": 9.412635505303648, "grad_norm": 0.04150390625, "learning_rate": 4.64300469546855e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 133261895.0, "step": 40380 }, { "entropy": 0.059690887294709685, "epoch": 9.41380114232428, "grad_norm": 0.1259765625, "learning_rate": 4.642897880870658e-05, "loss": 0.0006, "mean_token_accuracy": 0.9997668981552124, "num_tokens": 133276430.0, "step": 40385 }, { "entropy": 0.05419606175273657, "epoch": 9.414966779344912, "grad_norm": 0.025146484375, "learning_rate": 4.642791052958316e-05, "loss": 0.0005, "mean_token_accuracy": 0.999883234500885, "num_tokens": 133303994.0, "step": 40390 }, { "entropy": 0.06044567935168743, "epoch": 9.416132416365544, "grad_norm": 0.07177734375, "learning_rate": 4.642684211733117e-05, "loss": 0.0006, "mean_token_accuracy": 1.0, "num_tokens": 133314143.0, "step": 40395 }, { "entropy": 0.04812805755063891, "epoch": 9.417298053386176, "grad_norm": 0.2080078125, "learning_rate": 4.642577357196654e-05, "loss": 0.0008, "mean_token_accuracy": 0.9999467551708221, "num_tokens": 133338336.0, "step": 40400 }, { "entropy": 0.0419490784406662, "epoch": 9.418463690406806, "grad_norm": 0.054931640625, "learning_rate": 4.642470489350519e-05, "loss": 0.0008, "mean_token_accuracy": 0.9999777257442475, "num_tokens": 133378960.0, "step": 40405 }, { "entropy": 0.06326623265631497, "epoch": 9.419629327427439, "grad_norm": 0.1181640625, "learning_rate": 4.6423636081963074e-05, "loss": 0.0008, "mean_token_accuracy": 0.9999671995639801, "num_tokens": 133406067.0, "step": 40410 }, { "entropy": 0.05259804669767618, "epoch": 9.42079496444807, "grad_norm": 0.125, "learning_rate": 4.642256713735612e-05, "loss": 0.0012, "mean_token_accuracy": 0.9996414184570312, "num_tokens": 133424898.0, "step": 40415 }, { "entropy": 0.09150759177282453, "epoch": 9.421960601468703, "grad_norm": 0.021728515625, "learning_rate": 4.642149805970026e-05, "loss": 0.074, "mean_token_accuracy": 0.9797709345817566, "num_tokens": 133446510.0, "step": 40420 }, { "entropy": 0.05307275112718344, "epoch": 9.423126238489335, "grad_norm": 0.07080078125, "learning_rate": 4.642042884901146e-05, "loss": 0.0072, "mean_token_accuracy": 0.9985364437103271, "num_tokens": 133469814.0, "step": 40425 }, { "entropy": 0.07435050662606954, "epoch": 9.424291875509967, "grad_norm": 0.080078125, "learning_rate": 4.641935950530564e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 133488449.0, "step": 40430 }, { "entropy": 0.0618276858702302, "epoch": 9.425457512530597, "grad_norm": 0.69921875, "learning_rate": 4.641829002859876e-05, "loss": 0.0021, "mean_token_accuracy": 0.9993707299232483, "num_tokens": 133506729.0, "step": 40435 }, { "entropy": 0.045803864300251004, "epoch": 9.42662314955123, "grad_norm": 0.09228515625, "learning_rate": 4.641722041890676e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999457180500031, "num_tokens": 133535231.0, "step": 40440 }, { "entropy": 0.06190860979259014, "epoch": 9.427788786571861, "grad_norm": 0.035400390625, "learning_rate": 4.641615067624559e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 133546578.0, "step": 40445 }, { "entropy": 0.06924508688971401, "epoch": 9.428954423592494, "grad_norm": 0.0751953125, "learning_rate": 4.64150808006312e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999842643737793, "num_tokens": 133562598.0, "step": 40450 }, { "entropy": 0.05628445390611887, "epoch": 9.430120060613126, "grad_norm": 0.08154296875, "learning_rate": 4.641401079207955e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 133576149.0, "step": 40455 }, { "entropy": 0.05225107911974192, "epoch": 9.431285697633756, "grad_norm": 0.376953125, "learning_rate": 4.6412940650606597e-05, "loss": 0.0017, "mean_token_accuracy": 0.999372273683548, "num_tokens": 133597360.0, "step": 40460 }, { "entropy": 0.05286563150584698, "epoch": 9.432451334654388, "grad_norm": 0.07568359375, "learning_rate": 4.6411870376228284e-05, "loss": 0.0006, "mean_token_accuracy": 1.0, "num_tokens": 133608823.0, "step": 40465 }, { "entropy": 0.06937075927853584, "epoch": 9.43361697167502, "grad_norm": 1.0546875, "learning_rate": 4.6410799968960586e-05, "loss": 0.0029, "mean_token_accuracy": 0.9996330261230468, "num_tokens": 133619534.0, "step": 40470 }, { "entropy": 0.04530189856886864, "epoch": 9.434782608695652, "grad_norm": 0.17578125, "learning_rate": 4.640972942881945e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999788701534271, "num_tokens": 133643820.0, "step": 40475 }, { "entropy": 0.06963036190718412, "epoch": 9.435948245716284, "grad_norm": 0.008056640625, "learning_rate": 4.6408658755820855e-05, "loss": 0.0008, "mean_token_accuracy": 0.9994134902954102, "num_tokens": 133655961.0, "step": 40480 }, { "entropy": 0.05203675013035536, "epoch": 9.437113882736917, "grad_norm": 0.047119140625, "learning_rate": 4.640758794998076e-05, "loss": 0.0034, "mean_token_accuracy": 0.9996129751205445, "num_tokens": 133679471.0, "step": 40485 }, { "entropy": 0.07076594727113843, "epoch": 9.438279519757547, "grad_norm": 2.484375, "learning_rate": 4.6406517011315124e-05, "loss": 0.0165, "mean_token_accuracy": 0.9988285422325134, "num_tokens": 133696136.0, "step": 40490 }, { "entropy": 0.03876218590885401, "epoch": 9.439445156778179, "grad_norm": 0.14453125, "learning_rate": 4.640544593983992e-05, "loss": 0.0008, "mean_token_accuracy": 0.9997322976589202, "num_tokens": 133736117.0, "step": 40495 }, { "entropy": 0.057139042066410184, "epoch": 9.440610793798811, "grad_norm": 0.05810546875, "learning_rate": 4.640437473557113e-05, "loss": 0.0011, "mean_token_accuracy": 0.9994039118289948, "num_tokens": 133755791.0, "step": 40500 }, { "entropy": 0.054068625625222924, "epoch": 9.441776430819443, "grad_norm": 0.10107421875, "learning_rate": 4.640330339852472e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999571919441224, "num_tokens": 133787308.0, "step": 40505 }, { "entropy": 0.05481251748278737, "epoch": 9.442942067840075, "grad_norm": 0.12109375, "learning_rate": 4.6402231928716664e-05, "loss": 0.0007, "mean_token_accuracy": 0.9997755348682403, "num_tokens": 133800285.0, "step": 40510 }, { "entropy": 0.06906182300299406, "epoch": 9.444107704860706, "grad_norm": 0.024658203125, "learning_rate": 4.6401160326162934e-05, "loss": 0.0003, "mean_token_accuracy": 0.9996212124824524, "num_tokens": 133819035.0, "step": 40515 }, { "entropy": 0.04636044520884752, "epoch": 9.445273341881338, "grad_norm": 0.046142578125, "learning_rate": 4.640008859087952e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 133853991.0, "step": 40520 }, { "entropy": 0.05268848827108741, "epoch": 9.44643897890197, "grad_norm": 0.091796875, "learning_rate": 4.6399016722882404e-05, "loss": 0.0019, "mean_token_accuracy": 0.9995454549789429, "num_tokens": 133868111.0, "step": 40525 }, { "entropy": 0.07035997025668621, "epoch": 9.447604615922602, "grad_norm": 0.345703125, "learning_rate": 4.639794472218756e-05, "loss": 0.0047, "mean_token_accuracy": 0.9991369843482971, "num_tokens": 133878376.0, "step": 40530 }, { "entropy": 0.05296846874989569, "epoch": 9.448770252943234, "grad_norm": 0.11865234375, "learning_rate": 4.639687258881097e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999543309211731, "num_tokens": 133906468.0, "step": 40535 }, { "entropy": 0.056552316341549155, "epoch": 9.449935889963864, "grad_norm": 0.08935546875, "learning_rate": 4.6395800322768634e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 133925080.0, "step": 40540 }, { "entropy": 0.06830793656408787, "epoch": 9.451101526984496, "grad_norm": 0.2216796875, "learning_rate": 4.6394727924076535e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 133937381.0, "step": 40545 }, { "entropy": 0.061395585630089046, "epoch": 9.452267164005129, "grad_norm": 0.1416015625, "learning_rate": 4.639365539275067e-05, "loss": 0.0148, "mean_token_accuracy": 0.998397821187973, "num_tokens": 133963585.0, "step": 40550 }, { "entropy": 0.048311513382941484, "epoch": 9.45343280102576, "grad_norm": 0.1630859375, "learning_rate": 4.6392582728807014e-05, "loss": 0.0017, "mean_token_accuracy": 0.9994284033775329, "num_tokens": 133989337.0, "step": 40555 }, { "entropy": 0.04738085251301527, "epoch": 9.454598438046393, "grad_norm": 0.291015625, "learning_rate": 4.639150993226158e-05, "loss": 0.0007, "mean_token_accuracy": 1.0, "num_tokens": 134015143.0, "step": 40560 }, { "entropy": 0.07007132191210985, "epoch": 9.455764075067025, "grad_norm": 0.201171875, "learning_rate": 4.6390437003130366e-05, "loss": 0.031, "mean_token_accuracy": 0.9956527829170227, "num_tokens": 134042838.0, "step": 40565 }, { "entropy": 0.049748735316097734, "epoch": 9.456929712087655, "grad_norm": 0.04833984375, "learning_rate": 4.638936394142936e-05, "loss": 0.001, "mean_token_accuracy": 0.9994413435459137, "num_tokens": 134060084.0, "step": 40570 }, { "entropy": 0.06348553579300642, "epoch": 9.458095349108287, "grad_norm": 0.1865234375, "learning_rate": 4.638829074717456e-05, "loss": 0.0005, "mean_token_accuracy": 0.9996629536151886, "num_tokens": 134082574.0, "step": 40575 }, { "entropy": 0.054871161840856074, "epoch": 9.45926098612892, "grad_norm": 0.14453125, "learning_rate": 4.6387217420381986e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 134096670.0, "step": 40580 }, { "entropy": 0.054104173369705674, "epoch": 9.460426623149552, "grad_norm": 0.76953125, "learning_rate": 4.638614396106763e-05, "loss": 0.0049, "mean_token_accuracy": 0.9995225071907043, "num_tokens": 134124932.0, "step": 40585 }, { "entropy": 0.06244059270247817, "epoch": 9.461592260170184, "grad_norm": 0.302734375, "learning_rate": 4.6385070369247495e-05, "loss": 0.0008, "mean_token_accuracy": 0.9998060464859009, "num_tokens": 134145979.0, "step": 40590 }, { "entropy": 0.07192586399614811, "epoch": 9.462757897190814, "grad_norm": 0.92578125, "learning_rate": 4.6383996644937606e-05, "loss": 0.002, "mean_token_accuracy": 0.9992907822132111, "num_tokens": 134156439.0, "step": 40595 }, { "entropy": 0.07092061564326287, "epoch": 9.463923534211446, "grad_norm": 0.341796875, "learning_rate": 4.638292278815396e-05, "loss": 0.0018, "mean_token_accuracy": 0.9998226940631867, "num_tokens": 134171371.0, "step": 40600 }, { "entropy": 0.0636997226625681, "epoch": 9.465089171232078, "grad_norm": 2.234375, "learning_rate": 4.638184879891258e-05, "loss": 0.0019, "mean_token_accuracy": 0.999472576379776, "num_tokens": 134191017.0, "step": 40605 }, { "entropy": 0.05375488083809614, "epoch": 9.46625480825271, "grad_norm": 1.5078125, "learning_rate": 4.638077467722947e-05, "loss": 0.001, "mean_token_accuracy": 0.9997506201267242, "num_tokens": 134210202.0, "step": 40610 }, { "entropy": 0.05368066322989762, "epoch": 9.467420445273342, "grad_norm": 0.1728515625, "learning_rate": 4.637970042312065e-05, "loss": 0.0012, "mean_token_accuracy": 0.9999517619609832, "num_tokens": 134236096.0, "step": 40615 }, { "entropy": 0.07572010587900876, "epoch": 9.468586082293974, "grad_norm": 0.07373046875, "learning_rate": 4.6378626036602145e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 134247013.0, "step": 40620 }, { "entropy": 0.04615337757859379, "epoch": 9.469751719314605, "grad_norm": 1.796875, "learning_rate": 4.637755151768998e-05, "loss": 0.0012, "mean_token_accuracy": 0.9996350347995758, "num_tokens": 134269849.0, "step": 40625 }, { "entropy": 0.07043313197791576, "epoch": 9.470917356335237, "grad_norm": 0.03662109375, "learning_rate": 4.637647686640015e-05, "loss": 0.0006, "mean_token_accuracy": 1.0, "num_tokens": 134281666.0, "step": 40630 }, { "entropy": 0.04748814105987549, "epoch": 9.472082993355869, "grad_norm": 0.1318359375, "learning_rate": 4.637540208274872e-05, "loss": 0.0012, "mean_token_accuracy": 0.9997235715389252, "num_tokens": 134305770.0, "step": 40635 }, { "entropy": 0.07811860628426075, "epoch": 9.473248630376501, "grad_norm": 1.4296875, "learning_rate": 4.637432716675168e-05, "loss": 0.002, "mean_token_accuracy": 0.9995238184928894, "num_tokens": 134314655.0, "step": 40640 }, { "entropy": 0.05494981547817588, "epoch": 9.474414267397133, "grad_norm": 0.03759765625, "learning_rate": 4.637325211842508e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 134337243.0, "step": 40645 }, { "entropy": 0.05441898424178362, "epoch": 9.475579904417764, "grad_norm": 1.4140625, "learning_rate": 4.637217693778494e-05, "loss": 0.0016, "mean_token_accuracy": 0.9998080611228943, "num_tokens": 134351430.0, "step": 40650 }, { "entropy": 0.07939355503767728, "epoch": 9.476745541438396, "grad_norm": 2.4375, "learning_rate": 4.6371101624847305e-05, "loss": 0.0014, "mean_token_accuracy": 0.9993265986442565, "num_tokens": 134364615.0, "step": 40655 }, { "entropy": 0.05908539802767336, "epoch": 9.477911178459028, "grad_norm": 0.162109375, "learning_rate": 4.63700261796282e-05, "loss": 0.0027, "mean_token_accuracy": 0.9996601998806, "num_tokens": 134387056.0, "step": 40660 }, { "entropy": 0.046356960525736214, "epoch": 9.47907681547966, "grad_norm": 0.05322265625, "learning_rate": 4.636895060214366e-05, "loss": 0.0008, "mean_token_accuracy": 0.9999344050884247, "num_tokens": 134425884.0, "step": 40665 }, { "entropy": 0.062171243224292995, "epoch": 9.480242452500292, "grad_norm": 0.040283203125, "learning_rate": 4.636787489240973e-05, "loss": 0.001, "mean_token_accuracy": 0.9997522413730622, "num_tokens": 134448675.0, "step": 40670 }, { "entropy": 0.06885421648621559, "epoch": 9.481408089520922, "grad_norm": 0.8984375, "learning_rate": 4.636679905044245e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 134459310.0, "step": 40675 }, { "entropy": 0.06276169000193477, "epoch": 9.482573726541554, "grad_norm": 0.060791015625, "learning_rate": 4.636572307625785e-05, "loss": 0.0008, "mean_token_accuracy": 0.9999372661113739, "num_tokens": 134475261.0, "step": 40680 }, { "entropy": 0.06280481340363622, "epoch": 9.483739363562186, "grad_norm": 0.86328125, "learning_rate": 4.636464696987199e-05, "loss": 0.0006, "mean_token_accuracy": 0.9996323525905609, "num_tokens": 134500387.0, "step": 40685 }, { "entropy": 0.08110145181417465, "epoch": 9.484905000582819, "grad_norm": 0.09716796875, "learning_rate": 4.636357073130091e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 134514011.0, "step": 40690 }, { "entropy": 0.041450155060738327, "epoch": 9.48607063760345, "grad_norm": 0.1767578125, "learning_rate": 4.636249436056066e-05, "loss": 0.0008, "mean_token_accuracy": 0.9995937526226044, "num_tokens": 134542359.0, "step": 40695 }, { "entropy": 0.05160027435049415, "epoch": 9.487236274624083, "grad_norm": 0.0810546875, "learning_rate": 4.636141785766729e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 134561048.0, "step": 40700 }, { "entropy": 0.07677444498986005, "epoch": 9.488401911644713, "grad_norm": 1.1875, "learning_rate": 4.6360341222636846e-05, "loss": 0.001, "mean_token_accuracy": 0.9994491636753082, "num_tokens": 134576808.0, "step": 40705 }, { "entropy": 0.07004327522590756, "epoch": 9.489567548665345, "grad_norm": 0.09375, "learning_rate": 4.635926445548539e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 134588413.0, "step": 40710 }, { "entropy": 0.0489866410382092, "epoch": 9.490733185685977, "grad_norm": 0.0849609375, "learning_rate": 4.635818755622898e-05, "loss": 0.0037, "mean_token_accuracy": 0.9992427587509155, "num_tokens": 134605793.0, "step": 40715 }, { "entropy": 0.04987419592216611, "epoch": 9.49189882270661, "grad_norm": 0.0810546875, "learning_rate": 4.635711052488367e-05, "loss": 0.0022, "mean_token_accuracy": 0.9993611574172974, "num_tokens": 134631089.0, "step": 40720 }, { "entropy": 0.059205600060522556, "epoch": 9.493064459727242, "grad_norm": 0.07861328125, "learning_rate": 4.6356033361465515e-05, "loss": 0.0007, "mean_token_accuracy": 0.9999550521373749, "num_tokens": 134656109.0, "step": 40725 }, { "entropy": 0.06425941651687025, "epoch": 9.494230096747872, "grad_norm": 0.2109375, "learning_rate": 4.635495606599059e-05, "loss": 0.0125, "mean_token_accuracy": 0.997954660654068, "num_tokens": 134687865.0, "step": 40730 }, { "entropy": 0.056310771498829126, "epoch": 9.495395733768504, "grad_norm": 0.185546875, "learning_rate": 4.635387863847494e-05, "loss": 0.0016, "mean_token_accuracy": 0.999619048833847, "num_tokens": 134699790.0, "step": 40735 }, { "entropy": 0.04697222877293825, "epoch": 9.496561370789136, "grad_norm": 0.0308837890625, "learning_rate": 4.635280107893465e-05, "loss": 0.0029, "mean_token_accuracy": 0.9995614051818847, "num_tokens": 134715296.0, "step": 40740 }, { "entropy": 0.06893054358661174, "epoch": 9.497727007809768, "grad_norm": 0.11962890625, "learning_rate": 4.6351723387385784e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 134734337.0, "step": 40745 }, { "entropy": 0.04734471701085567, "epoch": 9.4988926448304, "grad_norm": 0.033935546875, "learning_rate": 4.6350645563844394e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 134753683.0, "step": 40750 }, { "entropy": 0.0547360529191792, "epoch": 9.500058281851032, "grad_norm": 0.01385498046875, "learning_rate": 4.6349567608326585e-05, "loss": 0.0007, "mean_token_accuracy": 0.9999753832817078, "num_tokens": 134778207.0, "step": 40755 }, { "entropy": 0.059074167534708975, "epoch": 9.501223918871663, "grad_norm": 0.162109375, "learning_rate": 4.63484895208484e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 134794641.0, "step": 40760 }, { "entropy": 0.05270886849611998, "epoch": 9.502389555892295, "grad_norm": 0.12255859375, "learning_rate": 4.634741130142593e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999666035175323, "num_tokens": 134821573.0, "step": 40765 }, { "entropy": 0.056115038227289915, "epoch": 9.503555192912927, "grad_norm": 0.034912109375, "learning_rate": 4.634633295007525e-05, "loss": 0.0027, "mean_token_accuracy": 0.9993288278579712, "num_tokens": 134839327.0, "step": 40770 }, { "entropy": 0.03874876936897635, "epoch": 9.504720829933559, "grad_norm": 0.1455078125, "learning_rate": 4.634525446681245e-05, "loss": 0.0008, "mean_token_accuracy": 0.9999327003955841, "num_tokens": 134882343.0, "step": 40775 }, { "entropy": 0.05923546832054853, "epoch": 9.505886466954191, "grad_norm": 0.115234375, "learning_rate": 4.634417585165359e-05, "loss": 0.0009, "mean_token_accuracy": 0.999529767036438, "num_tokens": 134897391.0, "step": 40780 }, { "entropy": 0.05450958535075188, "epoch": 9.507052103974821, "grad_norm": 0.1357421875, "learning_rate": 4.634309710461476e-05, "loss": 0.0009, "mean_token_accuracy": 0.9994285702705383, "num_tokens": 134920605.0, "step": 40785 }, { "entropy": 0.03137501548044384, "epoch": 9.508217740995454, "grad_norm": 0.0262451171875, "learning_rate": 4.634201822571207e-05, "loss": 0.0003, "mean_token_accuracy": 0.999960595369339, "num_tokens": 134959815.0, "step": 40790 }, { "entropy": 0.058038387820124625, "epoch": 9.509383378016086, "grad_norm": 0.041259765625, "learning_rate": 4.6340939214961575e-05, "loss": 0.0014, "mean_token_accuracy": 0.999402391910553, "num_tokens": 134972976.0, "step": 40795 }, { "entropy": 0.04890700094401836, "epoch": 9.510549015036718, "grad_norm": 0.2373046875, "learning_rate": 4.633986007237939e-05, "loss": 0.0005, "mean_token_accuracy": 0.999665379524231, "num_tokens": 135004943.0, "step": 40800 }, { "entropy": 0.04844941468909383, "epoch": 9.51171465205735, "grad_norm": 0.037109375, "learning_rate": 4.6338780797981584e-05, "loss": 0.0018, "mean_token_accuracy": 0.9996458530426026, "num_tokens": 135029697.0, "step": 40805 }, { "entropy": 0.06579445358365774, "epoch": 9.51288028907798, "grad_norm": 0.10888671875, "learning_rate": 4.6337701391784266e-05, "loss": 0.0005, "mean_token_accuracy": 0.9995369851589203, "num_tokens": 135043437.0, "step": 40810 }, { "entropy": 0.06171910166740417, "epoch": 9.514045926098612, "grad_norm": 0.025390625, "learning_rate": 4.633662185380353e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999885797500611, "num_tokens": 135063057.0, "step": 40815 }, { "entropy": 0.06348031088709831, "epoch": 9.515211563119244, "grad_norm": 0.09326171875, "learning_rate": 4.633554218405547e-05, "loss": 0.0017, "mean_token_accuracy": 0.9990782618522644, "num_tokens": 135073222.0, "step": 40820 }, { "entropy": 0.05706033930182457, "epoch": 9.516377200139877, "grad_norm": 0.0869140625, "learning_rate": 4.633446238255619e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 135085320.0, "step": 40825 }, { "entropy": 0.05939996847882867, "epoch": 9.517542837160509, "grad_norm": 3.109375, "learning_rate": 4.6333382449321776e-05, "loss": 0.0014, "mean_token_accuracy": 0.9996251404285431, "num_tokens": 135100091.0, "step": 40830 }, { "entropy": 0.04892335725016892, "epoch": 9.51870847418114, "grad_norm": 1.046875, "learning_rate": 4.6332302384368355e-05, "loss": 0.0088, "mean_token_accuracy": 0.9980179607868195, "num_tokens": 135116547.0, "step": 40835 }, { "entropy": 0.06294927066192031, "epoch": 9.519874111201771, "grad_norm": 0.9453125, "learning_rate": 4.633122218771202e-05, "loss": 0.0021, "mean_token_accuracy": 0.9986855506896972, "num_tokens": 135135187.0, "step": 40840 }, { "entropy": 0.054868426825851205, "epoch": 9.521039748222403, "grad_norm": 0.138671875, "learning_rate": 4.6330141859368875e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999448001384735, "num_tokens": 135155106.0, "step": 40845 }, { "entropy": 0.0764088025316596, "epoch": 9.522205385243035, "grad_norm": 0.40625, "learning_rate": 4.632906139935503e-05, "loss": 0.0007, "mean_token_accuracy": 0.9999555706977844, "num_tokens": 135174411.0, "step": 40850 }, { "entropy": 0.0597866203635931, "epoch": 9.523371022263667, "grad_norm": 0.138671875, "learning_rate": 4.632798080768661e-05, "loss": 0.0023, "mean_token_accuracy": 0.9996915280818939, "num_tokens": 135195708.0, "step": 40855 }, { "entropy": 0.04914709161967039, "epoch": 9.5245366592843, "grad_norm": 0.41796875, "learning_rate": 4.632690008437971e-05, "loss": 0.0007, "mean_token_accuracy": 0.9999254047870636, "num_tokens": 135217479.0, "step": 40860 }, { "entropy": 0.052582056075334546, "epoch": 9.52570229630493, "grad_norm": 0.119140625, "learning_rate": 4.632581922945046e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 135243405.0, "step": 40865 }, { "entropy": 0.07694977000355721, "epoch": 9.526867933325562, "grad_norm": 0.208984375, "learning_rate": 4.6324738242914966e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999651193618775, "num_tokens": 135260625.0, "step": 40870 }, { "entropy": 0.041239157784730196, "epoch": 9.528033570346194, "grad_norm": 0.1875, "learning_rate": 4.632365712478935e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 135291113.0, "step": 40875 }, { "entropy": 0.052362867165356874, "epoch": 9.529199207366826, "grad_norm": 0.064453125, "learning_rate": 4.632257587508975e-05, "loss": 0.0004, "mean_token_accuracy": 0.999953818321228, "num_tokens": 135308792.0, "step": 40880 }, { "entropy": 0.06618333477526903, "epoch": 9.530364844387458, "grad_norm": 0.271484375, "learning_rate": 4.632149449383226e-05, "loss": 0.0008, "mean_token_accuracy": 0.999617338180542, "num_tokens": 135327668.0, "step": 40885 }, { "entropy": 0.07062805388122798, "epoch": 9.53153048140809, "grad_norm": 0.16796875, "learning_rate": 4.6320412981033015e-05, "loss": 0.0015, "mean_token_accuracy": 0.9999792397022247, "num_tokens": 135347719.0, "step": 40890 }, { "entropy": 0.06933510228991509, "epoch": 9.53269611842872, "grad_norm": 0.0849609375, "learning_rate": 4.631933133670815e-05, "loss": 0.0009, "mean_token_accuracy": 1.0, "num_tokens": 135358370.0, "step": 40895 }, { "entropy": 0.046752724703401326, "epoch": 9.533861755449353, "grad_norm": 0.054443359375, "learning_rate": 4.63182495608738e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999893844127655, "num_tokens": 135380333.0, "step": 40900 }, { "entropy": 0.05584818883799016, "epoch": 9.535027392469985, "grad_norm": 0.07861328125, "learning_rate": 4.631716765354607e-05, "loss": 0.0003, "mean_token_accuracy": 0.999989253282547, "num_tokens": 135401439.0, "step": 40905 }, { "entropy": 0.06718082525767385, "epoch": 9.536193029490617, "grad_norm": 0.77734375, "learning_rate": 4.6316085614741115e-05, "loss": 0.0009, "mean_token_accuracy": 0.9996930599212647, "num_tokens": 135423014.0, "step": 40910 }, { "entropy": 0.06601903475821018, "epoch": 9.537358666511249, "grad_norm": 0.0269775390625, "learning_rate": 4.6315003444475065e-05, "loss": 0.0004, "mean_token_accuracy": 0.9995391726493835, "num_tokens": 135438969.0, "step": 40915 }, { "entropy": 0.06503906883299351, "epoch": 9.53852430353188, "grad_norm": 2.140625, "learning_rate": 4.631392114276406e-05, "loss": 0.001, "mean_token_accuracy": 0.999578058719635, "num_tokens": 135452833.0, "step": 40920 }, { "entropy": 0.06475102892145515, "epoch": 9.539689940552512, "grad_norm": 0.08935546875, "learning_rate": 4.6312838709624226e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 135471217.0, "step": 40925 }, { "entropy": 0.03920008554123342, "epoch": 9.540855577573144, "grad_norm": 0.1962890625, "learning_rate": 4.6311756145071705e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999875962734223, "num_tokens": 135500601.0, "step": 40930 }, { "entropy": 0.03399303937330842, "epoch": 9.542021214593776, "grad_norm": 0.0185546875, "learning_rate": 4.631067344912266e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999543249607086, "num_tokens": 135542941.0, "step": 40935 }, { "entropy": 0.055815706960856915, "epoch": 9.543186851614408, "grad_norm": 1.5859375, "learning_rate": 4.630959062179321e-05, "loss": 0.0029, "mean_token_accuracy": 0.9992071092128754, "num_tokens": 135557143.0, "step": 40940 }, { "entropy": 0.07005295492708682, "epoch": 9.544352488635038, "grad_norm": 0.06787109375, "learning_rate": 4.6308507663099524e-05, "loss": 0.0006, "mean_token_accuracy": 1.0, "num_tokens": 135568449.0, "step": 40945 }, { "entropy": 0.05475952555425465, "epoch": 9.54551812565567, "grad_norm": 0.06298828125, "learning_rate": 4.630742457305774e-05, "loss": 0.0004, "mean_token_accuracy": 0.9997912287712097, "num_tokens": 135591304.0, "step": 40950 }, { "entropy": 0.050777725968509914, "epoch": 9.546683762676302, "grad_norm": 0.68359375, "learning_rate": 4.6306341351684e-05, "loss": 0.0006, "mean_token_accuracy": 0.9998898327350616, "num_tokens": 135612736.0, "step": 40955 }, { "entropy": 0.0599704397842288, "epoch": 9.547849399696934, "grad_norm": 0.1416015625, "learning_rate": 4.630525799899447e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 135629357.0, "step": 40960 }, { "entropy": 0.041774158645421264, "epoch": 9.549015036717567, "grad_norm": 0.04248046875, "learning_rate": 4.6304174515005294e-05, "loss": 0.0016, "mean_token_accuracy": 0.9995239317417145, "num_tokens": 135651757.0, "step": 40965 }, { "entropy": 0.04553243769332767, "epoch": 9.550180673738199, "grad_norm": 0.029296875, "learning_rate": 4.630309089973264e-05, "loss": 0.0023, "mean_token_accuracy": 0.9994342923164368, "num_tokens": 135676830.0, "step": 40970 }, { "entropy": 0.05818598102778196, "epoch": 9.551346310758829, "grad_norm": 0.09130859375, "learning_rate": 4.630200715319266e-05, "loss": 0.0015, "mean_token_accuracy": 0.9994813144207001, "num_tokens": 135705468.0, "step": 40975 }, { "entropy": 0.0411489917896688, "epoch": 9.552511947779461, "grad_norm": 0.0927734375, "learning_rate": 4.6300923275401506e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999789834022522, "num_tokens": 135731968.0, "step": 40980 }, { "entropy": 0.045835569687187674, "epoch": 9.553677584800093, "grad_norm": 0.01300048828125, "learning_rate": 4.629983926637536e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999888777732849, "num_tokens": 135752618.0, "step": 40985 }, { "entropy": 0.05435104519128799, "epoch": 9.554843221820725, "grad_norm": 0.193359375, "learning_rate": 4.629875512613037e-05, "loss": 0.0015, "mean_token_accuracy": 0.9998708009719849, "num_tokens": 135771384.0, "step": 40990 }, { "entropy": 0.06642589755356312, "epoch": 9.556008858841357, "grad_norm": 0.6015625, "learning_rate": 4.6297670854682705e-05, "loss": 0.0007, "mean_token_accuracy": 0.9997084558010101, "num_tokens": 135783644.0, "step": 40995 }, { "entropy": 0.056930082757025956, "epoch": 9.557174495861988, "grad_norm": 0.055419921875, "learning_rate": 4.629658645204854e-05, "loss": 0.003, "mean_token_accuracy": 0.9992791295051575, "num_tokens": 135812769.0, "step": 41000 }, { "entropy": 0.05946479961276054, "epoch": 9.55834013288262, "grad_norm": 0.09619140625, "learning_rate": 4.629550191824404e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 135825284.0, "step": 41005 }, { "entropy": 0.04301239885389805, "epoch": 9.559505769903252, "grad_norm": 0.040771484375, "learning_rate": 4.629441725328537e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999394893646241, "num_tokens": 135847689.0, "step": 41010 }, { "entropy": 0.049624948669224976, "epoch": 9.560671406923884, "grad_norm": 0.080078125, "learning_rate": 4.6293332457188724e-05, "loss": 0.0012, "mean_token_accuracy": 0.9996513247489929, "num_tokens": 135863808.0, "step": 41015 }, { "entropy": 0.037809535302221774, "epoch": 9.561837043944516, "grad_norm": 0.19140625, "learning_rate": 4.629224752997026e-05, "loss": 0.0012, "mean_token_accuracy": 0.9994710803031921, "num_tokens": 135904205.0, "step": 41020 }, { "entropy": 0.0786101894453168, "epoch": 9.563002680965148, "grad_norm": 0.37109375, "learning_rate": 4.629116247164616e-05, "loss": 0.0194, "mean_token_accuracy": 0.9954461216926574, "num_tokens": 135932346.0, "step": 41025 }, { "entropy": 0.05533070582896471, "epoch": 9.564168317985779, "grad_norm": 0.46484375, "learning_rate": 4.6290077282232594e-05, "loss": 0.0014, "mean_token_accuracy": 0.9999791443347931, "num_tokens": 135957361.0, "step": 41030 }, { "entropy": 0.059603410121053454, "epoch": 9.56533395500641, "grad_norm": 1.0234375, "learning_rate": 4.6288991961745764e-05, "loss": 0.0012, "mean_token_accuracy": 0.999586033821106, "num_tokens": 135982860.0, "step": 41035 }, { "entropy": 0.0578294038772583, "epoch": 9.566499592027043, "grad_norm": 0.140625, "learning_rate": 4.628790651020184e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 136001350.0, "step": 41040 }, { "entropy": 0.06318863546475768, "epoch": 9.567665229047675, "grad_norm": 0.051025390625, "learning_rate": 4.628682092761702e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999873399734497, "num_tokens": 136017372.0, "step": 41045 }, { "entropy": 0.0619500283151865, "epoch": 9.568830866068307, "grad_norm": 0.11279296875, "learning_rate": 4.628573521400748e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999892354011536, "num_tokens": 136040839.0, "step": 41050 }, { "entropy": 0.05457418002188206, "epoch": 9.569996503088937, "grad_norm": 0.0159912109375, "learning_rate": 4.6284649369389416e-05, "loss": 0.0007, "mean_token_accuracy": 0.9999674022197723, "num_tokens": 136069858.0, "step": 41055 }, { "entropy": 0.057254419988021256, "epoch": 9.57116214010957, "grad_norm": 0.12353515625, "learning_rate": 4.6283563393779014e-05, "loss": 0.0007, "mean_token_accuracy": 0.999933785200119, "num_tokens": 136101309.0, "step": 41060 }, { "entropy": 0.07205391302704811, "epoch": 9.572327777130202, "grad_norm": 0.07763671875, "learning_rate": 4.628247728719247e-05, "loss": 0.0011, "mean_token_accuracy": 0.9997916638851165, "num_tokens": 136115648.0, "step": 41065 }, { "entropy": 0.06587801650166511, "epoch": 9.573493414150834, "grad_norm": 0.376953125, "learning_rate": 4.6281391049645986e-05, "loss": 0.0027, "mean_token_accuracy": 0.9996513187885284, "num_tokens": 136136716.0, "step": 41070 }, { "entropy": 0.09622649177908897, "epoch": 9.574659051171466, "grad_norm": 0.09521484375, "learning_rate": 4.628030468115574e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 136147763.0, "step": 41075 }, { "entropy": 0.05813981788232923, "epoch": 9.575824688192096, "grad_norm": 0.0380859375, "learning_rate": 4.627921818173796e-05, "loss": 0.0011, "mean_token_accuracy": 0.9996845960617066, "num_tokens": 136166803.0, "step": 41080 }, { "entropy": 0.06320654163137078, "epoch": 9.576990325212728, "grad_norm": 0.1328125, "learning_rate": 4.627813155140882e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 136191064.0, "step": 41085 }, { "entropy": 0.041704765520989895, "epoch": 9.57815596223336, "grad_norm": 0.04150390625, "learning_rate": 4.6277044790184546e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 136205964.0, "step": 41090 }, { "entropy": 0.056818116828799246, "epoch": 9.579321599253992, "grad_norm": 0.373046875, "learning_rate": 4.627595789808133e-05, "loss": 0.0008, "mean_token_accuracy": 0.9994535505771637, "num_tokens": 136227356.0, "step": 41095 }, { "entropy": 0.09946035724133254, "epoch": 9.580487236274625, "grad_norm": 0.1796875, "learning_rate": 4.627487087511538e-05, "loss": 0.0561, "mean_token_accuracy": 0.992105895280838, "num_tokens": 136263847.0, "step": 41100 }, { "entropy": 0.07794980946928262, "epoch": 9.581652873295257, "grad_norm": 0.05322265625, "learning_rate": 4.6273783721302904e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 136274021.0, "step": 41105 }, { "entropy": 0.06599203050136566, "epoch": 9.582818510315887, "grad_norm": 0.39453125, "learning_rate": 4.6272696436660125e-05, "loss": 0.0012, "mean_token_accuracy": 0.9998664915561676, "num_tokens": 136294119.0, "step": 41110 }, { "entropy": 0.057835323922336104, "epoch": 9.583984147336519, "grad_norm": 1.6796875, "learning_rate": 4.627160902120324e-05, "loss": 0.0027, "mean_token_accuracy": 0.9999575018882751, "num_tokens": 136318752.0, "step": 41115 }, { "entropy": 0.04515928709879517, "epoch": 9.585149784357151, "grad_norm": 0.65234375, "learning_rate": 4.627052147494847e-05, "loss": 0.0015, "mean_token_accuracy": 0.9992202460765839, "num_tokens": 136333968.0, "step": 41120 }, { "entropy": 0.06753161698579788, "epoch": 9.586315421377783, "grad_norm": 0.048583984375, "learning_rate": 4.6269433797912034e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999408602714539, "num_tokens": 136352206.0, "step": 41125 }, { "entropy": 0.06529767028987407, "epoch": 9.587481058398415, "grad_norm": 0.03369140625, "learning_rate": 4.6268345990110156e-05, "loss": 0.0007, "mean_token_accuracy": 0.9998778998851776, "num_tokens": 136361336.0, "step": 41130 }, { "entropy": 0.05374781196005642, "epoch": 9.588646695419046, "grad_norm": 0.0286865234375, "learning_rate": 4.6267258051559046e-05, "loss": 0.0004, "mean_token_accuracy": 0.9996373355388641, "num_tokens": 136387286.0, "step": 41135 }, { "entropy": 0.06637289561331272, "epoch": 9.589812332439678, "grad_norm": 0.040283203125, "learning_rate": 4.626616998227493e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999774336814881, "num_tokens": 136407787.0, "step": 41140 }, { "entropy": 0.052790251560509205, "epoch": 9.59097796946031, "grad_norm": 0.10009765625, "learning_rate": 4.6265081782274046e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 136431041.0, "step": 41145 }, { "entropy": 0.06665988322347402, "epoch": 9.592143606480942, "grad_norm": 0.140625, "learning_rate": 4.6263993451572594e-05, "loss": 0.001, "mean_token_accuracy": 0.9994082868099212, "num_tokens": 136441999.0, "step": 41150 }, { "entropy": 0.05983887910842896, "epoch": 9.593309243501574, "grad_norm": 0.1552734375, "learning_rate": 4.6262904990186826e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999689280986785, "num_tokens": 136465318.0, "step": 41155 }, { "entropy": 0.061148155480623245, "epoch": 9.594474880522206, "grad_norm": 0.427734375, "learning_rate": 4.6261816398132965e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 136478002.0, "step": 41160 }, { "entropy": 0.05825490411370993, "epoch": 9.595640517542837, "grad_norm": 0.1416015625, "learning_rate": 4.626072767542724e-05, "loss": 0.0011, "mean_token_accuracy": 0.9997474730014801, "num_tokens": 136492488.0, "step": 41165 }, { "entropy": 0.09653316643089056, "epoch": 9.596806154563469, "grad_norm": 0.1103515625, "learning_rate": 4.625963882208589e-05, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 136507177.0, "step": 41170 }, { "entropy": 0.03887947797775269, "epoch": 9.5979717915841, "grad_norm": 0.07421875, "learning_rate": 4.625854983812515e-05, "loss": 0.0005, "mean_token_accuracy": 0.9998583555221557, "num_tokens": 136535310.0, "step": 41175 }, { "entropy": 0.047138193342834714, "epoch": 9.599137428604733, "grad_norm": 0.478515625, "learning_rate": 4.625746072356126e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 136551768.0, "step": 41180 }, { "entropy": 0.07058442085981369, "epoch": 9.600303065625365, "grad_norm": 0.1298828125, "learning_rate": 4.625637147841046e-05, "loss": 0.0046, "mean_token_accuracy": 0.9993247389793396, "num_tokens": 136562332.0, "step": 41185 }, { "entropy": 0.05666069518774748, "epoch": 9.601468702645995, "grad_norm": 0.0517578125, "learning_rate": 4.625528210268898e-05, "loss": 0.0014, "mean_token_accuracy": 0.9998417735099793, "num_tokens": 136575910.0, "step": 41190 }, { "entropy": 0.06339784916490317, "epoch": 9.602634339666627, "grad_norm": 0.041259765625, "learning_rate": 4.625419259641308e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999178290367127, "num_tokens": 136586032.0, "step": 41195 }, { "entropy": 0.03752074474468827, "epoch": 9.60379997668726, "grad_norm": 0.162109375, "learning_rate": 4.625310295959901e-05, "loss": 0.0007, "mean_token_accuracy": 0.9998230099678039, "num_tokens": 136607533.0, "step": 41200 }, { "entropy": 0.07200399097055196, "epoch": 9.604965613707892, "grad_norm": 0.10693359375, "learning_rate": 4.6252013192263e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 136626736.0, "step": 41205 }, { "entropy": 0.07442393903620541, "epoch": 9.606131250728524, "grad_norm": 2.859375, "learning_rate": 4.625092329442131e-05, "loss": 0.0064, "mean_token_accuracy": 0.9989759445190429, "num_tokens": 136647055.0, "step": 41210 }, { "entropy": 0.04741760222241283, "epoch": 9.607296887749154, "grad_norm": 0.1396484375, "learning_rate": 4.62498332660902e-05, "loss": 0.001, "mean_token_accuracy": 0.9996415734291076, "num_tokens": 136680106.0, "step": 41215 }, { "entropy": 0.07332726744934917, "epoch": 9.608462524769786, "grad_norm": 3.328125, "learning_rate": 4.6248743107285905e-05, "loss": 0.0025, "mean_token_accuracy": 0.9996515691280365, "num_tokens": 136699318.0, "step": 41220 }, { "entropy": 0.05996815362013876, "epoch": 9.609628161790418, "grad_norm": 0.029541015625, "learning_rate": 4.62476528180247e-05, "loss": 0.0006, "mean_token_accuracy": 0.999965512752533, "num_tokens": 136719358.0, "step": 41225 }, { "entropy": 0.06359810242429376, "epoch": 9.61079379881105, "grad_norm": 2.921875, "learning_rate": 4.624656239832283e-05, "loss": 0.0036, "mean_token_accuracy": 0.9995666563510894, "num_tokens": 136748699.0, "step": 41230 }, { "entropy": 0.054721962008625266, "epoch": 9.611959435831682, "grad_norm": 0.0361328125, "learning_rate": 4.624547184819656e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999768316745759, "num_tokens": 136767794.0, "step": 41235 }, { "entropy": 0.04269056161865592, "epoch": 9.613125072852315, "grad_norm": 0.1767578125, "learning_rate": 4.624438116766215e-05, "loss": 0.0006, "mean_token_accuracy": 0.9996411383152009, "num_tokens": 136797142.0, "step": 41240 }, { "entropy": 0.055638442654162644, "epoch": 9.614290709872945, "grad_norm": 0.05712890625, "learning_rate": 4.6243290356735865e-05, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 136809958.0, "step": 41245 }, { "entropy": 0.048937355354428294, "epoch": 9.615456346893577, "grad_norm": 0.12890625, "learning_rate": 4.624219941543397e-05, "loss": 0.0011, "mean_token_accuracy": 0.999721109867096, "num_tokens": 136835763.0, "step": 41250 }, { "entropy": 0.07546586729586124, "epoch": 9.616621983914209, "grad_norm": 2.28125, "learning_rate": 4.6241108343772735e-05, "loss": 0.0025, "mean_token_accuracy": 0.9991138696670532, "num_tokens": 136846408.0, "step": 41255 }, { "entropy": 0.04572682408615947, "epoch": 9.617787620934841, "grad_norm": 0.10009765625, "learning_rate": 4.6240017141768424e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999783515930176, "num_tokens": 136889137.0, "step": 41260 }, { "entropy": 0.06971332840621472, "epoch": 9.618953257955473, "grad_norm": 0.11474609375, "learning_rate": 4.623892580943731e-05, "loss": 0.0018, "mean_token_accuracy": 0.9996296286582946, "num_tokens": 136898647.0, "step": 41265 }, { "entropy": 0.05883047040551901, "epoch": 9.620118894976104, "grad_norm": 0.09716796875, "learning_rate": 4.623783434679567e-05, "loss": 0.0015, "mean_token_accuracy": 0.9993464052677155, "num_tokens": 136924038.0, "step": 41270 }, { "entropy": 0.052692413609474895, "epoch": 9.621284531996736, "grad_norm": 1.546875, "learning_rate": 4.623674275385978e-05, "loss": 0.0057, "mean_token_accuracy": 0.9981497287750244, "num_tokens": 136944410.0, "step": 41275 }, { "entropy": 0.06412827260792256, "epoch": 9.622450169017368, "grad_norm": 0.0400390625, "learning_rate": 4.623565103064591e-05, "loss": 0.0008, "mean_token_accuracy": 0.9999616265296936, "num_tokens": 136956566.0, "step": 41280 }, { "entropy": 0.06835590191185474, "epoch": 9.623615806038, "grad_norm": 0.07958984375, "learning_rate": 4.623455917717035e-05, "loss": 0.0008, "mean_token_accuracy": 0.9986554086208344, "num_tokens": 136987616.0, "step": 41285 }, { "entropy": 0.06827231720089913, "epoch": 9.624781443058632, "grad_norm": 0.2119140625, "learning_rate": 4.623346719344937e-05, "loss": 0.0014, "mean_token_accuracy": 0.9994838953018188, "num_tokens": 137009985.0, "step": 41290 }, { "entropy": 0.06732688648626209, "epoch": 9.625947080079264, "grad_norm": 0.1376953125, "learning_rate": 4.623237507949926e-05, "loss": 0.002, "mean_token_accuracy": 0.9995437383651733, "num_tokens": 137033747.0, "step": 41295 }, { "entropy": 0.04815381010994315, "epoch": 9.627112717099894, "grad_norm": 0.06591796875, "learning_rate": 4.6231282835336306e-05, "loss": 0.0002, "mean_token_accuracy": 0.999988979101181, "num_tokens": 137057779.0, "step": 41300 }, { "entropy": 0.05480205034837127, "epoch": 9.628278354120527, "grad_norm": 0.65234375, "learning_rate": 4.623019046097679e-05, "loss": 0.0012, "mean_token_accuracy": 0.9996123731136322, "num_tokens": 137074912.0, "step": 41305 }, { "entropy": 0.06899555269628763, "epoch": 9.629443991141159, "grad_norm": 1.2109375, "learning_rate": 4.6229097956437e-05, "loss": 0.0012, "mean_token_accuracy": 0.9994629144668579, "num_tokens": 137094135.0, "step": 41310 }, { "entropy": 0.06584795317612588, "epoch": 9.63060962816179, "grad_norm": 0.05859375, "learning_rate": 4.622800532173324e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 137116123.0, "step": 41315 }, { "entropy": 0.04917632364667952, "epoch": 9.631775265182423, "grad_norm": 0.546875, "learning_rate": 4.6226912556881786e-05, "loss": 0.0011, "mean_token_accuracy": 0.9999045550823211, "num_tokens": 137144216.0, "step": 41320 }, { "entropy": 0.05311643872410059, "epoch": 9.632940902203053, "grad_norm": 0.0174560546875, "learning_rate": 4.622581966189894e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 137159915.0, "step": 41325 }, { "entropy": 0.041334444051608445, "epoch": 9.634106539223685, "grad_norm": 0.10791015625, "learning_rate": 4.6224726636801e-05, "loss": 0.0012, "mean_token_accuracy": 0.9996349096298218, "num_tokens": 137189497.0, "step": 41330 }, { "entropy": 0.05849685426801443, "epoch": 9.635272176244317, "grad_norm": 1.078125, "learning_rate": 4.622363348160426e-05, "loss": 0.0017, "mean_token_accuracy": 0.999852204322815, "num_tokens": 137213906.0, "step": 41335 }, { "entropy": 0.06110354913398623, "epoch": 9.63643781326495, "grad_norm": 0.52734375, "learning_rate": 4.622254019632503e-05, "loss": 0.0006, "mean_token_accuracy": 0.9995953798294067, "num_tokens": 137242822.0, "step": 41340 }, { "entropy": 0.045990268047899005, "epoch": 9.637603450285582, "grad_norm": 0.1455078125, "learning_rate": 4.6221446780979594e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999152421951294, "num_tokens": 137260574.0, "step": 41345 }, { "entropy": 0.04493403248488903, "epoch": 9.638769087306212, "grad_norm": 1.1953125, "learning_rate": 4.622035323558428e-05, "loss": 0.0008, "mean_token_accuracy": 0.9997753381729126, "num_tokens": 137283109.0, "step": 41350 }, { "entropy": 0.06357506215572357, "epoch": 9.639934724326844, "grad_norm": 0.033935546875, "learning_rate": 4.621925956015538e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 137297805.0, "step": 41355 }, { "entropy": 0.057191804703325036, "epoch": 9.641100361347476, "grad_norm": 0.251953125, "learning_rate": 4.6218165754709204e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 137321489.0, "step": 41360 }, { "entropy": 0.05335645116865635, "epoch": 9.642265998368108, "grad_norm": 0.212890625, "learning_rate": 4.621707181926206e-05, "loss": 0.0018, "mean_token_accuracy": 0.9995932877063751, "num_tokens": 137338002.0, "step": 41365 }, { "entropy": 0.05435511860996485, "epoch": 9.64343163538874, "grad_norm": 0.1044921875, "learning_rate": 4.621597775383027e-05, "loss": 0.0004, "mean_token_accuracy": 0.999989116191864, "num_tokens": 137357160.0, "step": 41370 }, { "entropy": 0.06837129667401314, "epoch": 9.644597272409372, "grad_norm": 1.703125, "learning_rate": 4.6214883558430144e-05, "loss": 0.0016, "mean_token_accuracy": 0.9996031939983367, "num_tokens": 137367199.0, "step": 41375 }, { "entropy": 0.05769171183928847, "epoch": 9.645762909430003, "grad_norm": 0.0693359375, "learning_rate": 4.621378923307799e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 137383857.0, "step": 41380 }, { "entropy": 0.0587625206913799, "epoch": 9.646928546450635, "grad_norm": 0.10986328125, "learning_rate": 4.6212694777790134e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999234795570373, "num_tokens": 137402085.0, "step": 41385 }, { "entropy": 0.06259220317006112, "epoch": 9.648094183471267, "grad_norm": 0.02880859375, "learning_rate": 4.62116001925829e-05, "loss": 0.0008, "mean_token_accuracy": 0.9997849702835083, "num_tokens": 137423375.0, "step": 41390 }, { "entropy": 0.08494818285107612, "epoch": 9.6492598204919, "grad_norm": 1.3828125, "learning_rate": 4.6210505477472596e-05, "loss": 0.0007, "mean_token_accuracy": 0.9999876201152802, "num_tokens": 137440289.0, "step": 41395 }, { "entropy": 0.049837575666606425, "epoch": 9.650425457512531, "grad_norm": 0.01239013671875, "learning_rate": 4.620941063247556e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 137453733.0, "step": 41400 }, { "entropy": 0.054591873474419114, "epoch": 9.651591094533162, "grad_norm": 0.13671875, "learning_rate": 4.620831565760811e-05, "loss": 0.0004, "mean_token_accuracy": 0.9996463716030121, "num_tokens": 137480363.0, "step": 41405 }, { "entropy": 0.05170064996927977, "epoch": 9.652756731553794, "grad_norm": 0.76171875, "learning_rate": 4.6207220552886575e-05, "loss": 0.0018, "mean_token_accuracy": 0.9996154427528381, "num_tokens": 137502929.0, "step": 41410 }, { "entropy": 0.05048687579110265, "epoch": 9.653922368574426, "grad_norm": 0.3203125, "learning_rate": 4.620612531832728e-05, "loss": 0.0008, "mean_token_accuracy": 0.9997367084026336, "num_tokens": 137530077.0, "step": 41415 }, { "entropy": 0.06301561882719398, "epoch": 9.655088005595058, "grad_norm": 0.1796875, "learning_rate": 4.620502995394657e-05, "loss": 0.0013, "mean_token_accuracy": 0.9993149757385253, "num_tokens": 137560535.0, "step": 41420 }, { "entropy": 0.047367416135966775, "epoch": 9.65625364261569, "grad_norm": 0.050537109375, "learning_rate": 4.620393445976077e-05, "loss": 0.001, "mean_token_accuracy": 0.9998161792755127, "num_tokens": 137574452.0, "step": 41425 }, { "entropy": 0.04114743801765144, "epoch": 9.657419279636322, "grad_norm": 0.337890625, "learning_rate": 4.620283883578621e-05, "loss": 0.0026, "mean_token_accuracy": 0.9996399223804474, "num_tokens": 137607358.0, "step": 41430 }, { "entropy": 0.05030833915807307, "epoch": 9.658584916656952, "grad_norm": 0.134765625, "learning_rate": 4.620174308203924e-05, "loss": 0.0007, "mean_token_accuracy": 0.9999332249164581, "num_tokens": 137640131.0, "step": 41435 }, { "entropy": 0.04978873711079359, "epoch": 9.659750553677585, "grad_norm": 0.1708984375, "learning_rate": 4.620064719853618e-05, "loss": 0.0005, "mean_token_accuracy": 0.9998363316059112, "num_tokens": 137665585.0, "step": 41440 }, { "entropy": 0.09417327288538217, "epoch": 9.660916190698217, "grad_norm": 0.0498046875, "learning_rate": 4.61995511852934e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 137680253.0, "step": 41445 }, { "entropy": 0.06099643521010876, "epoch": 9.662081827718849, "grad_norm": 1.75, "learning_rate": 4.619845504232722e-05, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 137689510.0, "step": 41450 }, { "entropy": 0.05705720148980618, "epoch": 9.66324746473948, "grad_norm": 0.111328125, "learning_rate": 4.6197358769653985e-05, "loss": 0.0015, "mean_token_accuracy": 0.9993055582046508, "num_tokens": 137703980.0, "step": 41455 }, { "entropy": 0.06064219465479255, "epoch": 9.664413101760111, "grad_norm": 0.201171875, "learning_rate": 4.619626236729006e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999850571155549, "num_tokens": 137720798.0, "step": 41460 }, { "entropy": 0.057933222688734534, "epoch": 9.665578738780743, "grad_norm": 0.083984375, "learning_rate": 4.6195165835251775e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 137734242.0, "step": 41465 }, { "entropy": 0.10937680806964636, "epoch": 9.666744375801375, "grad_norm": 0.09130859375, "learning_rate": 4.61940691735555e-05, "loss": 0.0729, "mean_token_accuracy": 0.990872037410736, "num_tokens": 137757622.0, "step": 41470 }, { "entropy": 0.08084865333512425, "epoch": 9.667910012822007, "grad_norm": 0.09130859375, "learning_rate": 4.6192972382217566e-05, "loss": 0.0008, "mean_token_accuracy": 0.9995614051818847, "num_tokens": 137778970.0, "step": 41475 }, { "entropy": 0.0488979984074831, "epoch": 9.66907564984264, "grad_norm": 0.06494140625, "learning_rate": 4.619187546125434e-05, "loss": 0.0008, "mean_token_accuracy": 0.9999493420124054, "num_tokens": 137806820.0, "step": 41480 }, { "entropy": 0.05168530084192753, "epoch": 9.67024128686327, "grad_norm": 0.91015625, "learning_rate": 4.619077841068218e-05, "loss": 0.0012, "mean_token_accuracy": 0.9999542713165284, "num_tokens": 137834902.0, "step": 41485 }, { "entropy": 0.07210210133343935, "epoch": 9.671406923883902, "grad_norm": 0.0849609375, "learning_rate": 4.6189681230517446e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 137855260.0, "step": 41490 }, { "entropy": 0.08489122875034809, "epoch": 9.672572560904534, "grad_norm": 0.10595703125, "learning_rate": 4.618858392077649e-05, "loss": 0.0018, "mean_token_accuracy": 0.9996070742607117, "num_tokens": 137865330.0, "step": 41495 }, { "entropy": 0.046634336933493614, "epoch": 9.673738197925166, "grad_norm": 0.056396484375, "learning_rate": 4.618748648147568e-05, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 137879045.0, "step": 41500 }, { "entropy": 0.05105196684598923, "epoch": 9.674903834945798, "grad_norm": 0.154296875, "learning_rate": 4.6186388912631376e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999715268611908, "num_tokens": 137902675.0, "step": 41505 }, { "entropy": 0.05192754324525595, "epoch": 9.67606947196643, "grad_norm": 0.380859375, "learning_rate": 4.6185291214259954e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999767959117889, "num_tokens": 137931187.0, "step": 41510 }, { "entropy": 0.0527059281244874, "epoch": 9.67723510898706, "grad_norm": 0.1005859375, "learning_rate": 4.618419338637778e-05, "loss": 0.0013, "mean_token_accuracy": 0.9996753036975861, "num_tokens": 137955839.0, "step": 41515 }, { "entropy": 0.06725910743698478, "epoch": 9.678400746007693, "grad_norm": 2.859375, "learning_rate": 4.618309542900122e-05, "loss": 0.0075, "mean_token_accuracy": 0.9991827428340911, "num_tokens": 137973813.0, "step": 41520 }, { "entropy": 0.06493516713380813, "epoch": 9.679566383028325, "grad_norm": 0.0625, "learning_rate": 4.618199734214664e-05, "loss": 0.0009, "mean_token_accuracy": 0.9998061001300812, "num_tokens": 137992578.0, "step": 41525 }, { "entropy": 0.07467560321092606, "epoch": 9.680732020048957, "grad_norm": 0.041748046875, "learning_rate": 4.6180899125830425e-05, "loss": 0.0012, "mean_token_accuracy": 0.9997429311275482, "num_tokens": 138002434.0, "step": 41530 }, { "entropy": 0.04612938100472093, "epoch": 9.68189765706959, "grad_norm": 0.04150390625, "learning_rate": 4.617980078006894e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 138022320.0, "step": 41535 }, { "entropy": 0.05380245940759778, "epoch": 9.68306329409022, "grad_norm": 0.03515625, "learning_rate": 4.617870230487858e-05, "loss": 0.0015, "mean_token_accuracy": 0.9998850584030151, "num_tokens": 138042476.0, "step": 41540 }, { "entropy": 0.04941651728004217, "epoch": 9.684228931110852, "grad_norm": 0.337890625, "learning_rate": 4.617760370027571e-05, "loss": 0.0004, "mean_token_accuracy": 0.9998548626899719, "num_tokens": 138066165.0, "step": 41545 }, { "entropy": 0.07211229205131531, "epoch": 9.685394568131484, "grad_norm": 0.1162109375, "learning_rate": 4.6176504966276726e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 138075102.0, "step": 41550 }, { "entropy": 0.048767372686415913, "epoch": 9.686560205152116, "grad_norm": 0.06982421875, "learning_rate": 4.617540610289799e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 138090963.0, "step": 41555 }, { "entropy": 0.07153227487578988, "epoch": 9.687725842172748, "grad_norm": 0.390625, "learning_rate": 4.6174307110155905e-05, "loss": 0.0009, "mean_token_accuracy": 0.9998355269432068, "num_tokens": 138108155.0, "step": 41560 }, { "entropy": 0.04240287458524108, "epoch": 9.68889147919338, "grad_norm": 0.1123046875, "learning_rate": 4.617320798806686e-05, "loss": 0.0012, "mean_token_accuracy": 0.9999687135219574, "num_tokens": 138144713.0, "step": 41565 }, { "entropy": 0.0623874238692224, "epoch": 9.69005711621401, "grad_norm": 0.04833984375, "learning_rate": 4.6172108736647234e-05, "loss": 0.0017, "mean_token_accuracy": 0.9997009575366974, "num_tokens": 138164675.0, "step": 41570 }, { "entropy": 0.055486177653074266, "epoch": 9.691222753234642, "grad_norm": 0.0286865234375, "learning_rate": 4.617100935591342e-05, "loss": 0.0009, "mean_token_accuracy": 0.9997545659542084, "num_tokens": 138188695.0, "step": 41575 }, { "entropy": 0.045479421876370905, "epoch": 9.692388390255275, "grad_norm": 0.10107421875, "learning_rate": 4.616990984588182e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 138203879.0, "step": 41580 }, { "entropy": 0.053182793455198406, "epoch": 9.693554027275907, "grad_norm": 0.26171875, "learning_rate": 4.6168810206568816e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 138227419.0, "step": 41585 }, { "entropy": 0.04395967880263925, "epoch": 9.694719664296539, "grad_norm": 0.2392578125, "learning_rate": 4.6167710437990826e-05, "loss": 0.0007, "mean_token_accuracy": 0.9999681532382965, "num_tokens": 138253694.0, "step": 41590 }, { "entropy": 0.05410387422889471, "epoch": 9.695885301317169, "grad_norm": 0.181640625, "learning_rate": 4.616661054016422e-05, "loss": 0.001, "mean_token_accuracy": 0.999914425611496, "num_tokens": 138290391.0, "step": 41595 }, { "entropy": 0.059342716634273526, "epoch": 9.697050938337801, "grad_norm": 0.06396484375, "learning_rate": 4.616551051310543e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 138303049.0, "step": 41600 }, { "entropy": 0.05551241189241409, "epoch": 9.698216575358433, "grad_norm": 0.298828125, "learning_rate": 4.616441035683084e-05, "loss": 0.0021, "mean_token_accuracy": 0.9993800580501556, "num_tokens": 138331547.0, "step": 41605 }, { "entropy": 0.07022674959152937, "epoch": 9.699382212379065, "grad_norm": 0.1748046875, "learning_rate": 4.616331007135686e-05, "loss": 0.0008, "mean_token_accuracy": 0.9995691180229187, "num_tokens": 138352702.0, "step": 41610 }, { "entropy": 0.07191054821014405, "epoch": 9.700547849399697, "grad_norm": 0.236328125, "learning_rate": 4.6162209656699895e-05, "loss": 0.0014, "mean_token_accuracy": 0.9997867822647095, "num_tokens": 138362706.0, "step": 41615 }, { "entropy": 0.06880695289000868, "epoch": 9.701713486420328, "grad_norm": 0.134765625, "learning_rate": 4.6161109112876355e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999779343605042, "num_tokens": 138381870.0, "step": 41620 }, { "entropy": 0.05174048515036702, "epoch": 9.70287912344096, "grad_norm": 0.1416015625, "learning_rate": 4.616000843990265e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999647498130798, "num_tokens": 138413997.0, "step": 41625 }, { "entropy": 0.04192606620490551, "epoch": 9.704044760461592, "grad_norm": 0.0260009765625, "learning_rate": 4.61589076377952e-05, "loss": 0.0005, "mean_token_accuracy": 0.9997602283954621, "num_tokens": 138450577.0, "step": 41630 }, { "entropy": 0.06087205857038498, "epoch": 9.705210397482224, "grad_norm": 0.275390625, "learning_rate": 4.6157806706570406e-05, "loss": 0.001, "mean_token_accuracy": 0.9993548393249512, "num_tokens": 138462445.0, "step": 41635 }, { "entropy": 0.06275629969313741, "epoch": 9.706376034502856, "grad_norm": 0.177734375, "learning_rate": 4.6156705646244694e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999878585338593, "num_tokens": 138480334.0, "step": 41640 }, { "entropy": 0.0454404903575778, "epoch": 9.707541671523488, "grad_norm": 2.1875, "learning_rate": 4.615560445683448e-05, "loss": 0.0017, "mean_token_accuracy": 0.9993854582309722, "num_tokens": 138503994.0, "step": 41645 }, { "entropy": 0.06599656357429921, "epoch": 9.708707308544119, "grad_norm": 0.0308837890625, "learning_rate": 4.6154503138356186e-05, "loss": 0.0008, "mean_token_accuracy": 0.9999789416790008, "num_tokens": 138523903.0, "step": 41650 }, { "entropy": 0.047876712214201686, "epoch": 9.70987294556475, "grad_norm": 0.185546875, "learning_rate": 4.6153401690826235e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999779760837555, "num_tokens": 138545309.0, "step": 41655 }, { "entropy": 0.05285617532208562, "epoch": 9.711038582585383, "grad_norm": 0.224609375, "learning_rate": 4.615230011426105e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999444663524628, "num_tokens": 138568014.0, "step": 41660 }, { "entropy": 0.06128771533258259, "epoch": 9.712204219606015, "grad_norm": 0.125, "learning_rate": 4.615119840867705e-05, "loss": 0.0005, "mean_token_accuracy": 0.9996610164642334, "num_tokens": 138587152.0, "step": 41665 }, { "entropy": 0.05335593800991774, "epoch": 9.713369856626647, "grad_norm": 0.058837890625, "learning_rate": 4.6150096574090674e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999667584896088, "num_tokens": 138621828.0, "step": 41670 }, { "entropy": 0.05973550733178854, "epoch": 9.714535493647277, "grad_norm": 1.7890625, "learning_rate": 4.614899461051835e-05, "loss": 0.0012, "mean_token_accuracy": 0.9995299935340881, "num_tokens": 138639928.0, "step": 41675 }, { "entropy": 0.06601793747395276, "epoch": 9.71570113066791, "grad_norm": 2.21875, "learning_rate": 4.614789251797651e-05, "loss": 0.0025, "mean_token_accuracy": 0.9994257688522339, "num_tokens": 138660350.0, "step": 41680 }, { "entropy": 0.04857506472617388, "epoch": 9.716866767688542, "grad_norm": 0.07080078125, "learning_rate": 4.614679029648157e-05, "loss": 0.0011, "mean_token_accuracy": 0.9991885244846344, "num_tokens": 138687417.0, "step": 41685 }, { "entropy": 0.04925138596445322, "epoch": 9.718032404709174, "grad_norm": 0.0198974609375, "learning_rate": 4.614568794604999e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 138712767.0, "step": 41690 }, { "entropy": 0.049935361836105585, "epoch": 9.719198041729806, "grad_norm": 0.2099609375, "learning_rate": 4.6144585466698204e-05, "loss": 0.0009, "mean_token_accuracy": 0.9995476484298706, "num_tokens": 138739256.0, "step": 41695 }, { "entropy": 0.06216288320720196, "epoch": 9.720363678750438, "grad_norm": 0.04150390625, "learning_rate": 4.6143482858442644e-05, "loss": 0.0005, "mean_token_accuracy": 0.9996389865875244, "num_tokens": 138755595.0, "step": 41700 }, { "entropy": 0.03968005385249853, "epoch": 9.721529315771068, "grad_norm": 0.193359375, "learning_rate": 4.614238012129975e-05, "loss": 0.0006, "mean_token_accuracy": 0.9995426118373871, "num_tokens": 138792739.0, "step": 41705 }, { "entropy": 0.06131018102169037, "epoch": 9.7226949527917, "grad_norm": 0.10595703125, "learning_rate": 4.6141277255285974e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 138803123.0, "step": 41710 }, { "entropy": 0.05428974824026227, "epoch": 9.723860589812332, "grad_norm": 0.953125, "learning_rate": 4.614017426041776e-05, "loss": 0.0009, "mean_token_accuracy": 1.0, "num_tokens": 138818913.0, "step": 41715 }, { "entropy": 0.07367751612327993, "epoch": 9.725026226832965, "grad_norm": 0.099609375, "learning_rate": 4.6139071136711545e-05, "loss": 0.0241, "mean_token_accuracy": 0.9963607609272003, "num_tokens": 138849574.0, "step": 41720 }, { "entropy": 0.05248088203370571, "epoch": 9.726191863853597, "grad_norm": 0.07177734375, "learning_rate": 4.6137967884183786e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999784231185913, "num_tokens": 138869838.0, "step": 41725 }, { "entropy": 0.052217611204832794, "epoch": 9.727357500874227, "grad_norm": 0.029296875, "learning_rate": 4.613686450285094e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999537229537964, "num_tokens": 138887125.0, "step": 41730 }, { "entropy": 0.058260629698634145, "epoch": 9.72852313789486, "grad_norm": 0.08544921875, "learning_rate": 4.613576099272944e-05, "loss": 0.001, "mean_token_accuracy": 0.9992043793201446, "num_tokens": 138909178.0, "step": 41735 }, { "entropy": 0.07376976646482944, "epoch": 9.729688774915491, "grad_norm": 0.06787109375, "learning_rate": 4.613465735383576e-05, "loss": 0.0044, "mean_token_accuracy": 0.9981070458889008, "num_tokens": 138917466.0, "step": 41740 }, { "entropy": 0.07478579832240939, "epoch": 9.730854411936123, "grad_norm": 0.022216796875, "learning_rate": 4.613355358618636e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 138934280.0, "step": 41745 }, { "entropy": 0.04984639883041382, "epoch": 9.732020048956755, "grad_norm": 0.0966796875, "learning_rate": 4.6132449689797686e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999508142471314, "num_tokens": 138965992.0, "step": 41750 }, { "entropy": 0.05700819352641702, "epoch": 9.733185685977386, "grad_norm": 0.76171875, "learning_rate": 4.61313456646862e-05, "loss": 0.0018, "mean_token_accuracy": 0.9998159170150757, "num_tokens": 138982514.0, "step": 41755 }, { "entropy": 0.05794349061325192, "epoch": 9.734351322998018, "grad_norm": 0.72265625, "learning_rate": 4.613024151086838e-05, "loss": 0.0017, "mean_token_accuracy": 0.9998278856277466, "num_tokens": 138999275.0, "step": 41760 }, { "entropy": 0.06126205716282129, "epoch": 9.73551696001865, "grad_norm": 0.06298828125, "learning_rate": 4.6129137228360665e-05, "loss": 0.0005, "mean_token_accuracy": 0.9998766958713532, "num_tokens": 139012755.0, "step": 41765 }, { "entropy": 0.06369129400700331, "epoch": 9.736682597039282, "grad_norm": 0.0400390625, "learning_rate": 4.612803281717954e-05, "loss": 0.0007, "mean_token_accuracy": 0.9996539771556854, "num_tokens": 139023890.0, "step": 41770 }, { "entropy": 0.055011258088052274, "epoch": 9.737848234059914, "grad_norm": 0.08642578125, "learning_rate": 4.612692827734146e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 139036019.0, "step": 41775 }, { "entropy": 0.06937271989881992, "epoch": 9.739013871080546, "grad_norm": 0.240234375, "learning_rate": 4.6125823608862916e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999895572662354, "num_tokens": 139055200.0, "step": 41780 }, { "entropy": 0.051238779630512, "epoch": 9.740179508101177, "grad_norm": 0.08154296875, "learning_rate": 4.6124718811760366e-05, "loss": 0.0019, "mean_token_accuracy": 0.9998046875, "num_tokens": 139076963.0, "step": 41785 }, { "entropy": 0.050241039227694274, "epoch": 9.741345145121809, "grad_norm": 0.115234375, "learning_rate": 4.612361388605028e-05, "loss": 0.0028, "mean_token_accuracy": 0.9995464980602264, "num_tokens": 139102393.0, "step": 41790 }, { "entropy": 0.050885317660868165, "epoch": 9.74251078214244, "grad_norm": 0.1328125, "learning_rate": 4.6122508831749145e-05, "loss": 0.0015, "mean_token_accuracy": 0.9997395157814026, "num_tokens": 139122773.0, "step": 41795 }, { "entropy": 0.04559088721871376, "epoch": 9.743676419163073, "grad_norm": 0.482421875, "learning_rate": 4.6121403648873435e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999362051486969, "num_tokens": 139164920.0, "step": 41800 }, { "entropy": 0.058302114717662334, "epoch": 9.744842056183705, "grad_norm": 0.48828125, "learning_rate": 4.6120298337439624e-05, "loss": 0.0007, "mean_token_accuracy": 1.0, "num_tokens": 139181052.0, "step": 41805 }, { "entropy": 0.0644199687987566, "epoch": 9.746007693204335, "grad_norm": 0.1591796875, "learning_rate": 4.61191928974642e-05, "loss": 0.0019, "mean_token_accuracy": 0.9990922808647156, "num_tokens": 139202201.0, "step": 41810 }, { "entropy": 0.04844171958975494, "epoch": 9.747173330224967, "grad_norm": 0.072265625, "learning_rate": 4.6118087328963655e-05, "loss": 0.0007, "mean_token_accuracy": 0.9993967711925507, "num_tokens": 139233615.0, "step": 41815 }, { "entropy": 0.04493777519091964, "epoch": 9.7483389672456, "grad_norm": 0.061767578125, "learning_rate": 4.611698163195446e-05, "loss": 0.0008, "mean_token_accuracy": 0.9998775362968445, "num_tokens": 139256815.0, "step": 41820 }, { "entropy": 0.05382460365071893, "epoch": 9.749504604266232, "grad_norm": 0.11767578125, "learning_rate": 4.6115875806453105e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999469757080078, "num_tokens": 139277285.0, "step": 41825 }, { "entropy": 0.045704776979982854, "epoch": 9.750670241286864, "grad_norm": 0.10009765625, "learning_rate": 4.6114769852476086e-05, "loss": 0.0008, "mean_token_accuracy": 0.9996791303157806, "num_tokens": 139309332.0, "step": 41830 }, { "entropy": 0.04757586750201881, "epoch": 9.751835878307496, "grad_norm": 0.10009765625, "learning_rate": 4.6113663770039886e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999895751476288, "num_tokens": 139333860.0, "step": 41835 }, { "entropy": 0.053784340433776376, "epoch": 9.753001515328126, "grad_norm": 0.1279296875, "learning_rate": 4.611255755916101e-05, "loss": 0.009, "mean_token_accuracy": 0.9990944027900696, "num_tokens": 139357575.0, "step": 41840 }, { "entropy": 0.0499279048293829, "epoch": 9.754167152348758, "grad_norm": 0.0211181640625, "learning_rate": 4.6111451219855946e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999744951725006, "num_tokens": 139371869.0, "step": 41845 }, { "entropy": 0.06379685923457146, "epoch": 9.75533278936939, "grad_norm": 0.2197265625, "learning_rate": 4.611034475214119e-05, "loss": 0.0022, "mean_token_accuracy": 0.9996087431907654, "num_tokens": 139391127.0, "step": 41850 }, { "entropy": 0.04493435020558536, "epoch": 9.756498426390023, "grad_norm": 0.0224609375, "learning_rate": 4.6109238156033247e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999648272991181, "num_tokens": 139419874.0, "step": 41855 }, { "entropy": 0.051786020305007695, "epoch": 9.757664063410655, "grad_norm": 0.6796875, "learning_rate": 4.610813143154861e-05, "loss": 0.0005, "mean_token_accuracy": 0.9998482525348663, "num_tokens": 139435046.0, "step": 41860 }, { "entropy": 0.060211922880262135, "epoch": 9.758829700431285, "grad_norm": 0.1474609375, "learning_rate": 4.610702457870378e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999869108200073, "num_tokens": 139453747.0, "step": 41865 }, { "entropy": 0.06323374919593334, "epoch": 9.759995337451917, "grad_norm": 0.1005859375, "learning_rate": 4.610591759751528e-05, "loss": 0.0011, "mean_token_accuracy": 0.999958622455597, "num_tokens": 139475495.0, "step": 41870 }, { "entropy": 0.05418466171249747, "epoch": 9.76116097447255, "grad_norm": 0.0537109375, "learning_rate": 4.61048104879996e-05, "loss": 0.0044, "mean_token_accuracy": 0.999826854467392, "num_tokens": 139508996.0, "step": 41875 }, { "entropy": 0.049932948080822825, "epoch": 9.762326611493181, "grad_norm": 0.061279296875, "learning_rate": 4.610370325017325e-05, "loss": 0.0012, "mean_token_accuracy": 0.9995410740375519, "num_tokens": 139537999.0, "step": 41880 }, { "entropy": 0.053233534656465055, "epoch": 9.763492248513813, "grad_norm": 0.0869140625, "learning_rate": 4.610259588405275e-05, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 139553111.0, "step": 41885 }, { "entropy": 0.05464749345555901, "epoch": 9.764657885534444, "grad_norm": 0.1513671875, "learning_rate": 4.61014883896546e-05, "loss": 0.001, "mean_token_accuracy": 0.9997717440128326, "num_tokens": 139572733.0, "step": 41890 }, { "entropy": 0.06824347507208586, "epoch": 9.765823522555076, "grad_norm": 1.4609375, "learning_rate": 4.610038076699532e-05, "loss": 0.0017, "mean_token_accuracy": 0.9993533074855805, "num_tokens": 139581909.0, "step": 41895 }, { "entropy": 0.06773032071068882, "epoch": 9.766989159575708, "grad_norm": 0.1015625, "learning_rate": 4.609927301609143e-05, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 139602961.0, "step": 41900 }, { "entropy": 0.06699398197233677, "epoch": 9.76815479659634, "grad_norm": 0.1474609375, "learning_rate": 4.609816513695945e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999671339988708, "num_tokens": 139625934.0, "step": 41905 }, { "entropy": 0.06858031582087279, "epoch": 9.769320433616972, "grad_norm": 0.12890625, "learning_rate": 4.609705712961589e-05, "loss": 0.0014, "mean_token_accuracy": 0.9996108949184418, "num_tokens": 139635226.0, "step": 41910 }, { "entropy": 0.07053380329161882, "epoch": 9.770486070637604, "grad_norm": 2.640625, "learning_rate": 4.6095948994077276e-05, "loss": 0.0014, "mean_token_accuracy": 0.9996874988079071, "num_tokens": 139647792.0, "step": 41915 }, { "entropy": 0.062039784714579584, "epoch": 9.771651707658235, "grad_norm": 0.310546875, "learning_rate": 4.6094840730360136e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 139661792.0, "step": 41920 }, { "entropy": 0.05674058310687542, "epoch": 9.772817344678867, "grad_norm": 0.17578125, "learning_rate": 4.6093732338480996e-05, "loss": 0.0015, "mean_token_accuracy": 0.9998069524765014, "num_tokens": 139673558.0, "step": 41925 }, { "entropy": 0.057136714551597835, "epoch": 9.773982981699499, "grad_norm": 0.06787109375, "learning_rate": 4.6092623818456376e-05, "loss": 0.0017, "mean_token_accuracy": 0.9996316730976105, "num_tokens": 139693234.0, "step": 41930 }, { "entropy": 0.056234538089483976, "epoch": 9.77514861872013, "grad_norm": 0.23046875, "learning_rate": 4.609151517030281e-05, "loss": 0.0008, "mean_token_accuracy": 0.9999083876609802, "num_tokens": 139738061.0, "step": 41935 }, { "entropy": 0.10356351681984961, "epoch": 9.776314255740763, "grad_norm": 0.1728515625, "learning_rate": 4.609040639403684e-05, "loss": 0.0942, "mean_token_accuracy": 0.9904897749423981, "num_tokens": 139771667.0, "step": 41940 }, { "entropy": 0.049145910609513524, "epoch": 9.777479892761393, "grad_norm": 0.318359375, "learning_rate": 4.608929748967498e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999116659164429, "num_tokens": 139797587.0, "step": 41945 }, { "entropy": 0.04926884109154343, "epoch": 9.778645529782025, "grad_norm": 0.1357421875, "learning_rate": 4.608818845723378e-05, "loss": 0.0018, "mean_token_accuracy": 0.9998193085193634, "num_tokens": 139823734.0, "step": 41950 }, { "entropy": 0.06200911607593298, "epoch": 9.779811166802657, "grad_norm": 0.10888671875, "learning_rate": 4.6087079296729774e-05, "loss": 0.0015, "mean_token_accuracy": 0.999501520395279, "num_tokens": 139843061.0, "step": 41955 }, { "entropy": 0.06600505784153939, "epoch": 9.78097680382329, "grad_norm": 0.44140625, "learning_rate": 4.6085970008179496e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999466836452484, "num_tokens": 139861857.0, "step": 41960 }, { "entropy": 0.05788415623828769, "epoch": 9.782142440843922, "grad_norm": 0.06396484375, "learning_rate": 4.6084860591599497e-05, "loss": 0.0011, "mean_token_accuracy": 0.9997093617916107, "num_tokens": 139881212.0, "step": 41965 }, { "entropy": 0.04697915324941278, "epoch": 9.783308077864554, "grad_norm": 0.1025390625, "learning_rate": 4.608375104700631e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999893486499787, "num_tokens": 139911891.0, "step": 41970 }, { "entropy": 0.051614006608724596, "epoch": 9.784473714885184, "grad_norm": 0.0272216796875, "learning_rate": 4.608264137441648e-05, "loss": 0.0018, "mean_token_accuracy": 0.9994884729385376, "num_tokens": 139935765.0, "step": 41975 }, { "entropy": 0.06123457215726376, "epoch": 9.785639351905816, "grad_norm": 0.3828125, "learning_rate": 4.608153157384657e-05, "loss": 0.0018, "mean_token_accuracy": 0.9993305623531341, "num_tokens": 139953020.0, "step": 41980 }, { "entropy": 0.049324381072074173, "epoch": 9.786804988926448, "grad_norm": 0.1025390625, "learning_rate": 4.6080421645313106e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 139976015.0, "step": 41985 }, { "entropy": 0.06161579126492143, "epoch": 9.78797062594708, "grad_norm": 3.046875, "learning_rate": 4.6079311588832654e-05, "loss": 0.0033, "mean_token_accuracy": 0.9989308536052703, "num_tokens": 139988700.0, "step": 41990 }, { "entropy": 0.06346913799643517, "epoch": 9.789136262967713, "grad_norm": 3.4375, "learning_rate": 4.6078201404421754e-05, "loss": 0.0017, "mean_token_accuracy": 0.9995437502861023, "num_tokens": 140002194.0, "step": 41995 }, { "entropy": 0.07399707436561584, "epoch": 9.790301899988343, "grad_norm": 1.15625, "learning_rate": 4.607709109209698e-05, "loss": 0.0013, "mean_token_accuracy": 0.9995833337306976, "num_tokens": 140015771.0, "step": 42000 }, { "entropy": 0.056039393041282894, "epoch": 9.791467537008975, "grad_norm": 0.014404296875, "learning_rate": 4.607598065187487e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999895513057708, "num_tokens": 140038244.0, "step": 42005 }, { "entropy": 0.044340645615011456, "epoch": 9.792633174029607, "grad_norm": 0.20703125, "learning_rate": 4.607487008377199e-05, "loss": 0.0023, "mean_token_accuracy": 0.9995433449745178, "num_tokens": 140056539.0, "step": 42010 }, { "entropy": 0.06757498234510421, "epoch": 9.79379881105024, "grad_norm": 0.87109375, "learning_rate": 4.607375938780491e-05, "loss": 0.0014, "mean_token_accuracy": 0.9990313827991486, "num_tokens": 140074674.0, "step": 42015 }, { "entropy": 0.04378555864095688, "epoch": 9.794964448070871, "grad_norm": 0.103515625, "learning_rate": 4.6072648563990174e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 140090614.0, "step": 42020 }, { "entropy": 0.047512101009488106, "epoch": 9.796130085091502, "grad_norm": 0.50390625, "learning_rate": 4.607153761234435e-05, "loss": 0.0004, "mean_token_accuracy": 0.999925297498703, "num_tokens": 140115414.0, "step": 42025 }, { "entropy": 0.046311000688001515, "epoch": 9.797295722112134, "grad_norm": 0.1923828125, "learning_rate": 4.6070426532884016e-05, "loss": 0.0007, "mean_token_accuracy": 0.9999650657176972, "num_tokens": 140144210.0, "step": 42030 }, { "entropy": 0.05368498945608735, "epoch": 9.798461359132766, "grad_norm": 0.384765625, "learning_rate": 4.606931532562572e-05, "loss": 0.0025, "mean_token_accuracy": 0.9998359262943268, "num_tokens": 140170962.0, "step": 42035 }, { "entropy": 0.05441671833395958, "epoch": 9.799626996153398, "grad_norm": 0.59375, "learning_rate": 4.6068203990586054e-05, "loss": 0.0044, "mean_token_accuracy": 0.9996001899242402, "num_tokens": 140212748.0, "step": 42040 }, { "entropy": 0.060715678706765176, "epoch": 9.80079263317403, "grad_norm": 1.3046875, "learning_rate": 4.606709252778158e-05, "loss": 0.0033, "mean_token_accuracy": 0.9981433093547821, "num_tokens": 140223455.0, "step": 42045 }, { "entropy": 0.06080267839133739, "epoch": 9.801958270194662, "grad_norm": 0.043212890625, "learning_rate": 4.6065980937228865e-05, "loss": 0.0035, "mean_token_accuracy": 0.999952882528305, "num_tokens": 140256426.0, "step": 42050 }, { "entropy": 0.0371818239800632, "epoch": 9.803123907215292, "grad_norm": 0.06103515625, "learning_rate": 4.6064869218944487e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999569833278656, "num_tokens": 140295217.0, "step": 42055 }, { "entropy": 0.0610663041472435, "epoch": 9.804289544235925, "grad_norm": 0.640625, "learning_rate": 4.6063757372945036e-05, "loss": 0.0009, "mean_token_accuracy": 0.9996368050575256, "num_tokens": 140318166.0, "step": 42060 }, { "entropy": 0.060305179469287394, "epoch": 9.805455181256557, "grad_norm": 0.07568359375, "learning_rate": 4.606264539924708e-05, "loss": 0.0006, "mean_token_accuracy": 0.9997816622257233, "num_tokens": 140329547.0, "step": 42065 }, { "entropy": 0.06976679842919112, "epoch": 9.806620818277189, "grad_norm": 0.060302734375, "learning_rate": 4.6061533297867196e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 140342013.0, "step": 42070 }, { "entropy": 0.07246024245396257, "epoch": 9.807786455297821, "grad_norm": 0.3203125, "learning_rate": 4.606042106882198e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 140357969.0, "step": 42075 }, { "entropy": 0.05506151458248496, "epoch": 9.808952092318451, "grad_norm": 0.09033203125, "learning_rate": 4.6059308712128005e-05, "loss": 0.0008, "mean_token_accuracy": 1.0, "num_tokens": 140381723.0, "step": 42080 }, { "entropy": 0.058845835737884045, "epoch": 9.810117729339083, "grad_norm": 0.208984375, "learning_rate": 4.605819622780187e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999732673168182, "num_tokens": 140406918.0, "step": 42085 }, { "entropy": 0.053271705750375986, "epoch": 9.811283366359715, "grad_norm": 1.546875, "learning_rate": 4.605708361586015e-05, "loss": 0.0017, "mean_token_accuracy": 0.9994361996650696, "num_tokens": 140428351.0, "step": 42090 }, { "entropy": 0.06628831773996353, "epoch": 9.812449003380348, "grad_norm": 1.71875, "learning_rate": 4.605597087631945e-05, "loss": 0.0017, "mean_token_accuracy": 0.9993700981140137, "num_tokens": 140440728.0, "step": 42095 }, { "entropy": 0.04875661414116621, "epoch": 9.81361464040098, "grad_norm": 0.09619140625, "learning_rate": 4.605485800919635e-05, "loss": 0.0011, "mean_token_accuracy": 0.9992801308631897, "num_tokens": 140466856.0, "step": 42100 }, { "entropy": 0.10183219909667969, "epoch": 9.814780277421612, "grad_norm": 2.03125, "learning_rate": 4.605374501450745e-05, "loss": 0.1021, "mean_token_accuracy": 0.9827478647232055, "num_tokens": 140493294.0, "step": 42105 }, { "entropy": 0.05042801667004824, "epoch": 9.815945914442242, "grad_norm": 0.06982421875, "learning_rate": 4.605263189226935e-05, "loss": 0.0009, "mean_token_accuracy": 0.9998808383941651, "num_tokens": 140527597.0, "step": 42110 }, { "entropy": 0.05014429837465286, "epoch": 9.817111551462874, "grad_norm": 0.1396484375, "learning_rate": 4.6051518642498635e-05, "loss": 0.001, "mean_token_accuracy": 0.9996078431606292, "num_tokens": 140545375.0, "step": 42115 }, { "entropy": 0.07266998011618853, "epoch": 9.818277188483506, "grad_norm": 0.1904296875, "learning_rate": 4.605040526521193e-05, "loss": 0.004, "mean_token_accuracy": 0.9994434773921966, "num_tokens": 140575461.0, "step": 42120 }, { "entropy": 0.04460279541090131, "epoch": 9.819442825504138, "grad_norm": 0.048583984375, "learning_rate": 4.604929176042581e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 140600307.0, "step": 42125 }, { "entropy": 0.06226409394294023, "epoch": 9.82060846252477, "grad_norm": 0.3671875, "learning_rate": 4.6048178128156895e-05, "loss": 0.0007, "mean_token_accuracy": 0.9999888122081757, "num_tokens": 140621568.0, "step": 42130 }, { "entropy": 0.06694214371964335, "epoch": 9.8217740995454, "grad_norm": 0.154296875, "learning_rate": 4.604706436842178e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999789297580719, "num_tokens": 140647240.0, "step": 42135 }, { "entropy": 0.06501425355672837, "epoch": 9.822939736566033, "grad_norm": 0.271484375, "learning_rate": 4.604595048123709e-05, "loss": 0.0048, "mean_token_accuracy": 0.9990079879760743, "num_tokens": 140655241.0, "step": 42140 }, { "entropy": 0.05788929592818022, "epoch": 9.824105373586665, "grad_norm": 0.2255859375, "learning_rate": 4.604483646661942e-05, "loss": 0.0007, "mean_token_accuracy": 0.9995981812477112, "num_tokens": 140668117.0, "step": 42145 }, { "entropy": 0.05733644086867571, "epoch": 9.825271010607297, "grad_norm": 0.0673828125, "learning_rate": 4.6043722324585385e-05, "loss": 0.0042, "mean_token_accuracy": 0.9997166275978089, "num_tokens": 140693790.0, "step": 42150 }, { "entropy": 0.0445280559360981, "epoch": 9.82643664762793, "grad_norm": 0.3515625, "learning_rate": 4.60426080551516e-05, "loss": 0.0029, "mean_token_accuracy": 0.9996988236904144, "num_tokens": 140717437.0, "step": 42155 }, { "entropy": 0.05390657912939787, "epoch": 9.82760228464856, "grad_norm": 0.322265625, "learning_rate": 4.6041493658334685e-05, "loss": 0.0032, "mean_token_accuracy": 0.9997225880622864, "num_tokens": 140731334.0, "step": 42160 }, { "entropy": 0.057373590767383575, "epoch": 9.828767921669192, "grad_norm": 0.1435546875, "learning_rate": 4.6040379134151245e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999842882156372, "num_tokens": 140759282.0, "step": 42165 }, { "entropy": 0.0680330197326839, "epoch": 9.829933558689824, "grad_norm": 0.51171875, "learning_rate": 4.603926448261791e-05, "loss": 0.0009, "mean_token_accuracy": 0.9999366581439972, "num_tokens": 140778046.0, "step": 42170 }, { "entropy": 0.044882729463279245, "epoch": 9.831099195710456, "grad_norm": 0.138671875, "learning_rate": 4.6038149703751296e-05, "loss": 0.0009, "mean_token_accuracy": 0.9997124314308167, "num_tokens": 140798807.0, "step": 42175 }, { "entropy": 0.05645276643335819, "epoch": 9.832264832731088, "grad_norm": 0.2490234375, "learning_rate": 4.603703479756803e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 140809550.0, "step": 42180 }, { "entropy": 0.07319327550940216, "epoch": 9.83343046975172, "grad_norm": 0.03369140625, "learning_rate": 4.603591976408474e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 140833990.0, "step": 42185 }, { "entropy": 0.050815807469189164, "epoch": 9.83459610677235, "grad_norm": 0.12890625, "learning_rate": 4.603480460331804e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999539196491242, "num_tokens": 140866343.0, "step": 42190 }, { "entropy": 0.04648887384682894, "epoch": 9.835761743792983, "grad_norm": 1.3203125, "learning_rate": 4.603368931528457e-05, "loss": 0.0013, "mean_token_accuracy": 0.9995949506759644, "num_tokens": 140888272.0, "step": 42195 }, { "entropy": 0.0669131524860859, "epoch": 9.836927380813615, "grad_norm": 1.4140625, "learning_rate": 4.6032573900000956e-05, "loss": 0.0022, "mean_token_accuracy": 0.9996757328510284, "num_tokens": 140900672.0, "step": 42200 }, { "entropy": 0.05158605314791202, "epoch": 9.838093017834247, "grad_norm": 0.181640625, "learning_rate": 4.603145835748383e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999863147735596, "num_tokens": 140921895.0, "step": 42205 }, { "entropy": 0.05032551847398281, "epoch": 9.839258654854879, "grad_norm": 0.041015625, "learning_rate": 4.6030342687749825e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999283790588379, "num_tokens": 140948592.0, "step": 42210 }, { "entropy": 0.03406341969966888, "epoch": 9.84042429187551, "grad_norm": 0.62890625, "learning_rate": 4.602922689081559e-05, "loss": 0.0007, "mean_token_accuracy": 0.9996773183345795, "num_tokens": 140977425.0, "step": 42215 }, { "entropy": 0.05516029987484217, "epoch": 9.841589928896141, "grad_norm": 0.6640625, "learning_rate": 4.6028110966697745e-05, "loss": 0.0007, "mean_token_accuracy": 0.9999792456626893, "num_tokens": 141008850.0, "step": 42220 }, { "entropy": 0.0569972931407392, "epoch": 9.842755565916773, "grad_norm": 0.134765625, "learning_rate": 4.602699491541294e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999261140823364, "num_tokens": 141043992.0, "step": 42225 }, { "entropy": 0.08039007391780614, "epoch": 9.843921202937405, "grad_norm": 0.7421875, "learning_rate": 4.6025878736977817e-05, "loss": 0.0007, "mean_token_accuracy": 1.0, "num_tokens": 141055054.0, "step": 42230 }, { "entropy": 0.06913083251565695, "epoch": 9.845086839958038, "grad_norm": 0.07080078125, "learning_rate": 4.602476243140902e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999841928482056, "num_tokens": 141070624.0, "step": 42235 }, { "entropy": 0.07660244479775428, "epoch": 9.84625247697867, "grad_norm": 0.19140625, "learning_rate": 4.602364599872319e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999557554721832, "num_tokens": 141088148.0, "step": 42240 }, { "entropy": 0.06507962569594383, "epoch": 9.8474181139993, "grad_norm": 0.037353515625, "learning_rate": 4.602252943893698e-05, "loss": 0.0012, "mean_token_accuracy": 0.9996131360530853, "num_tokens": 141099636.0, "step": 42245 }, { "entropy": 0.09644481688737869, "epoch": 9.848583751019932, "grad_norm": 0.275390625, "learning_rate": 4.602141275206704e-05, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 141108319.0, "step": 42250 }, { "entropy": 0.06280481992289424, "epoch": 9.849749388040564, "grad_norm": 0.0303955078125, "learning_rate": 4.602029593813001e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999892771244049, "num_tokens": 141129078.0, "step": 42255 }, { "entropy": 0.05322151854634285, "epoch": 9.850915025061196, "grad_norm": 0.578125, "learning_rate": 4.601917899714256e-05, "loss": 0.0013, "mean_token_accuracy": 0.9996376812458039, "num_tokens": 141139131.0, "step": 42260 }, { "entropy": 0.05685127004981041, "epoch": 9.852080662081828, "grad_norm": 0.076171875, "learning_rate": 4.6018061929121335e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999690473079681, "num_tokens": 141163027.0, "step": 42265 }, { "entropy": 0.04338296307250857, "epoch": 9.853246299102459, "grad_norm": 0.142578125, "learning_rate": 4.6016944734083e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999771177768707, "num_tokens": 141185371.0, "step": 42270 }, { "entropy": 0.0789628304541111, "epoch": 9.85441193612309, "grad_norm": 0.051513671875, "learning_rate": 4.60158274120442e-05, "loss": 0.0009, "mean_token_accuracy": 0.9997252762317658, "num_tokens": 141197892.0, "step": 42275 }, { "entropy": 0.05556338932365179, "epoch": 9.855577573143723, "grad_norm": 0.050537109375, "learning_rate": 4.601470996302161e-05, "loss": 0.0022, "mean_token_accuracy": 0.9993739485740661, "num_tokens": 141209692.0, "step": 42280 }, { "entropy": 0.06027071522548795, "epoch": 9.856743210164355, "grad_norm": 0.79296875, "learning_rate": 4.601359238703188e-05, "loss": 0.0008, "mean_token_accuracy": 0.9999654769897461, "num_tokens": 141240991.0, "step": 42285 }, { "entropy": 0.052701130695641044, "epoch": 9.857908847184987, "grad_norm": 0.169921875, "learning_rate": 4.601247468409169e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999734103679657, "num_tokens": 141261119.0, "step": 42290 }, { "entropy": 0.050172841548919676, "epoch": 9.859074484205617, "grad_norm": 0.045654296875, "learning_rate": 4.6011356854217695e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 141278454.0, "step": 42295 }, { "entropy": 0.05537276756949723, "epoch": 9.86024012122625, "grad_norm": 0.07568359375, "learning_rate": 4.601023889742657e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999526858329773, "num_tokens": 141315661.0, "step": 42300 }, { "entropy": 0.048711988516151904, "epoch": 9.861405758246882, "grad_norm": 0.6015625, "learning_rate": 4.6009120813734985e-05, "loss": 0.0024, "mean_token_accuracy": 0.9993138968944549, "num_tokens": 141342496.0, "step": 42305 }, { "entropy": 0.05058379173278808, "epoch": 9.862571395267514, "grad_norm": 0.0269775390625, "learning_rate": 4.600800260315961e-05, "loss": 0.0008, "mean_token_accuracy": 0.9999231338500977, "num_tokens": 141366952.0, "step": 42310 }, { "entropy": 0.049850033968687056, "epoch": 9.863737032288146, "grad_norm": 0.047119140625, "learning_rate": 4.600688426571711e-05, "loss": 0.0015, "mean_token_accuracy": 0.9995084285736084, "num_tokens": 141385183.0, "step": 42315 }, { "entropy": 0.04541545612737537, "epoch": 9.864902669308778, "grad_norm": 0.2578125, "learning_rate": 4.6005765801424184e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999175369739532, "num_tokens": 141408920.0, "step": 42320 }, { "entropy": 0.05511685479432345, "epoch": 9.866068306329408, "grad_norm": 1.546875, "learning_rate": 4.600464721029749e-05, "loss": 0.0013, "mean_token_accuracy": 0.9994753241539002, "num_tokens": 141424253.0, "step": 42325 }, { "entropy": 0.05739809488877654, "epoch": 9.86723394335004, "grad_norm": 0.6015625, "learning_rate": 4.6003528492353714e-05, "loss": 0.0008, "mean_token_accuracy": 0.9998377740383149, "num_tokens": 141443989.0, "step": 42330 }, { "entropy": 0.08445545099675655, "epoch": 9.868399580370673, "grad_norm": 0.2353515625, "learning_rate": 4.600240964760954e-05, "loss": 0.0032, "mean_token_accuracy": 0.9985578715801239, "num_tokens": 141460527.0, "step": 42335 }, { "entropy": 0.0682337274774909, "epoch": 9.869565217391305, "grad_norm": 1.6484375, "learning_rate": 4.600129067608165e-05, "loss": 0.001, "mean_token_accuracy": 1.0, "num_tokens": 141473532.0, "step": 42340 }, { "entropy": 0.05082208625972271, "epoch": 9.870730854411937, "grad_norm": 0.026611328125, "learning_rate": 4.600017157778673e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 141490093.0, "step": 42345 }, { "entropy": 0.0639322081580758, "epoch": 9.871896491432567, "grad_norm": 0.06640625, "learning_rate": 4.5999052352741464e-05, "loss": 0.0011, "mean_token_accuracy": 0.99965980052948, "num_tokens": 141515258.0, "step": 42350 }, { "entropy": 0.06224740967154503, "epoch": 9.8730621284532, "grad_norm": 0.2431640625, "learning_rate": 4.599793300096255e-05, "loss": 0.0014, "mean_token_accuracy": 0.9995000004768372, "num_tokens": 141525066.0, "step": 42355 }, { "entropy": 0.06684228293597698, "epoch": 9.874227765473831, "grad_norm": 0.0791015625, "learning_rate": 4.599681352246666e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 141535957.0, "step": 42360 }, { "entropy": 0.07877216022461653, "epoch": 9.875393402494463, "grad_norm": 0.396484375, "learning_rate": 4.599569391727051e-05, "loss": 0.0025, "mean_token_accuracy": 0.9993925511837005, "num_tokens": 141554599.0, "step": 42365 }, { "entropy": 0.0896750895306468, "epoch": 9.876559039515096, "grad_norm": 0.1875, "learning_rate": 4.599457418539079e-05, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 141569724.0, "step": 42370 }, { "entropy": 0.041266056802123786, "epoch": 9.877724676535728, "grad_norm": 0.1787109375, "learning_rate": 4.5993454326844183e-05, "loss": 0.0031, "mean_token_accuracy": 0.9990445971488953, "num_tokens": 141608109.0, "step": 42375 }, { "entropy": 0.055930220521986485, "epoch": 9.878890313556358, "grad_norm": 0.046875, "learning_rate": 4.59923343416474e-05, "loss": 0.0019, "mean_token_accuracy": 0.9999681651592255, "num_tokens": 141642229.0, "step": 42380 }, { "entropy": 0.05225104205310345, "epoch": 9.88005595057699, "grad_norm": 0.97265625, "learning_rate": 4.599121422981714e-05, "loss": 0.0017, "mean_token_accuracy": 0.9991886019706726, "num_tokens": 141657383.0, "step": 42385 }, { "entropy": 0.05562779037281871, "epoch": 9.881221587597622, "grad_norm": 0.0308837890625, "learning_rate": 4.5990093991370105e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999800384044647, "num_tokens": 141675797.0, "step": 42390 }, { "entropy": 0.0824700677767396, "epoch": 9.882387224618254, "grad_norm": 0.236328125, "learning_rate": 4.5988973626323e-05, "loss": 0.0007, "mean_token_accuracy": 0.9999433159828186, "num_tokens": 141698010.0, "step": 42395 }, { "entropy": 0.054821851290762426, "epoch": 9.883552861638886, "grad_norm": 0.466796875, "learning_rate": 4.598785313469253e-05, "loss": 0.0007, "mean_token_accuracy": 0.9999880850315094, "num_tokens": 141723941.0, "step": 42400 }, { "entropy": 0.05956423785537481, "epoch": 9.884718498659517, "grad_norm": 2.25, "learning_rate": 4.5986732516495394e-05, "loss": 0.0013, "mean_token_accuracy": 0.9996835470199585, "num_tokens": 141736876.0, "step": 42405 }, { "entropy": 0.04881628667935729, "epoch": 9.885884135680149, "grad_norm": 0.185546875, "learning_rate": 4.598561177174832e-05, "loss": 0.0024, "mean_token_accuracy": 0.9995063245296478, "num_tokens": 141756360.0, "step": 42410 }, { "entropy": 0.051187887974083426, "epoch": 9.887049772700781, "grad_norm": 0.0162353515625, "learning_rate": 4.598449090046801e-05, "loss": 0.0013, "mean_token_accuracy": 0.9998794257640838, "num_tokens": 141800002.0, "step": 42415 }, { "entropy": 0.07245470732450485, "epoch": 9.888215409721413, "grad_norm": 0.052001953125, "learning_rate": 4.598336990267118e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 141809958.0, "step": 42420 }, { "entropy": 0.0633705073967576, "epoch": 9.889381046742045, "grad_norm": 0.265625, "learning_rate": 4.598224877837454e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999843239784241, "num_tokens": 141832394.0, "step": 42425 }, { "entropy": 0.054108756873756644, "epoch": 9.890546683762675, "grad_norm": 0.0859375, "learning_rate": 4.598112752759482e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999885141849518, "num_tokens": 141853885.0, "step": 42430 }, { "entropy": 0.0420980092138052, "epoch": 9.891712320783308, "grad_norm": 3.59375, "learning_rate": 4.5980006150348734e-05, "loss": 0.0053, "mean_token_accuracy": 0.999622642993927, "num_tokens": 141877275.0, "step": 42435 }, { "entropy": 0.04029533253051341, "epoch": 9.89287795780394, "grad_norm": 0.07861328125, "learning_rate": 4.597888464665299e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999532997608185, "num_tokens": 141901488.0, "step": 42440 }, { "entropy": 0.06203906927257776, "epoch": 9.894043594824572, "grad_norm": 0.03173828125, "learning_rate": 4.5977763016524336e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 141916010.0, "step": 42445 }, { "entropy": 0.04797376096248627, "epoch": 9.895209231845204, "grad_norm": 0.050048828125, "learning_rate": 4.597664125997947e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999784409999848, "num_tokens": 141949456.0, "step": 42450 }, { "entropy": 0.0533231851644814, "epoch": 9.896374868865836, "grad_norm": 0.1611328125, "learning_rate": 4.597551937703515e-05, "loss": 0.0011, "mean_token_accuracy": 0.9998474419116974, "num_tokens": 141971590.0, "step": 42455 }, { "entropy": 0.03850670075044036, "epoch": 9.897540505886466, "grad_norm": 0.07421875, "learning_rate": 4.597439736770807e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999563872814179, "num_tokens": 142008843.0, "step": 42460 }, { "entropy": 0.05502571975812316, "epoch": 9.898706142907098, "grad_norm": 0.0179443359375, "learning_rate": 4.597327523201499e-05, "loss": 0.0005, "mean_token_accuracy": 0.9997801601886749, "num_tokens": 142034549.0, "step": 42465 }, { "entropy": 0.05452665351331234, "epoch": 9.89987177992773, "grad_norm": 0.255859375, "learning_rate": 4.597215296997263e-05, "loss": 0.0027, "mean_token_accuracy": 0.9993485331535339, "num_tokens": 142049199.0, "step": 42470 }, { "entropy": 0.054085666034370664, "epoch": 9.901037416948363, "grad_norm": 0.267578125, "learning_rate": 4.597103058159773e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999523103237152, "num_tokens": 142067986.0, "step": 42475 }, { "entropy": 0.062441179621964694, "epoch": 9.902203053968995, "grad_norm": 0.05859375, "learning_rate": 4.596990806690702e-05, "loss": 0.0008, "mean_token_accuracy": 0.9998196721076965, "num_tokens": 142089011.0, "step": 42480 }, { "entropy": 0.04920492600649595, "epoch": 9.903368690989625, "grad_norm": 0.173828125, "learning_rate": 4.596878542591724e-05, "loss": 0.002, "mean_token_accuracy": 0.9997519671916961, "num_tokens": 142114493.0, "step": 42485 }, { "entropy": 0.04822962349280715, "epoch": 9.904534328010257, "grad_norm": 0.0615234375, "learning_rate": 4.5967662658645135e-05, "loss": 0.0022, "mean_token_accuracy": 0.9994761765003204, "num_tokens": 142135758.0, "step": 42490 }, { "entropy": 0.053807942755520345, "epoch": 9.90569996503089, "grad_norm": 0.1767578125, "learning_rate": 4.5966539765107434e-05, "loss": 0.0004, "mean_token_accuracy": 0.999971354007721, "num_tokens": 142157371.0, "step": 42495 }, { "entropy": 0.05380201395601034, "epoch": 9.906865602051521, "grad_norm": 0.03759765625, "learning_rate": 4.59654167453209e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999534487724304, "num_tokens": 142183055.0, "step": 42500 }, { "entropy": 0.06522977026179433, "epoch": 9.908031239072153, "grad_norm": 0.041015625, "learning_rate": 4.596429359930227e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999375700950622, "num_tokens": 142206833.0, "step": 42505 }, { "entropy": 0.06614532265812159, "epoch": 9.909196876092786, "grad_norm": 3.375, "learning_rate": 4.5963170327068286e-05, "loss": 0.0443, "mean_token_accuracy": 0.9935894548892975, "num_tokens": 142251545.0, "step": 42510 }, { "entropy": 0.04537292215973139, "epoch": 9.910362513113416, "grad_norm": 0.23046875, "learning_rate": 4.5962046928635706e-05, "loss": 0.0013, "mean_token_accuracy": 0.9999306857585907, "num_tokens": 142280886.0, "step": 42515 }, { "entropy": 0.08216597959399223, "epoch": 9.911528150134048, "grad_norm": 0.1396484375, "learning_rate": 4.596092340402128e-05, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 142294713.0, "step": 42520 }, { "entropy": 0.04948452245444059, "epoch": 9.91269378715468, "grad_norm": 0.051513671875, "learning_rate": 4.595979975324176e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999775171279908, "num_tokens": 142318059.0, "step": 42525 }, { "entropy": 0.05829036943614483, "epoch": 9.913859424175312, "grad_norm": 0.205078125, "learning_rate": 4.5958675976313895e-05, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 142336852.0, "step": 42530 }, { "entropy": 0.07351097948849201, "epoch": 9.915025061195944, "grad_norm": 0.029541015625, "learning_rate": 4.5957552073254456e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 142347592.0, "step": 42535 }, { "entropy": 0.0548619513399899, "epoch": 9.916190698216575, "grad_norm": 0.94140625, "learning_rate": 4.5956428044080194e-05, "loss": 0.0017, "mean_token_accuracy": 0.9996331691741943, "num_tokens": 142365828.0, "step": 42540 }, { "entropy": 0.08164572985842825, "epoch": 9.917356335237207, "grad_norm": 0.14453125, "learning_rate": 4.5955303888807865e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999838948249817, "num_tokens": 142382841.0, "step": 42545 }, { "entropy": 0.06366458348929882, "epoch": 9.918521972257839, "grad_norm": 0.050048828125, "learning_rate": 4.595417960745424e-05, "loss": 0.0014, "mean_token_accuracy": 0.9994186043739319, "num_tokens": 142394380.0, "step": 42550 }, { "entropy": 0.050054412055760625, "epoch": 9.919687609278471, "grad_norm": 0.1689453125, "learning_rate": 4.595305520003609e-05, "loss": 0.0009, "mean_token_accuracy": 0.9997562050819397, "num_tokens": 142414179.0, "step": 42555 }, { "entropy": 0.05203136084601283, "epoch": 9.920853246299103, "grad_norm": 0.1943359375, "learning_rate": 4.5951930666570164e-05, "loss": 0.0009, "mean_token_accuracy": 0.9996435225009919, "num_tokens": 142430075.0, "step": 42560 }, { "entropy": 0.053751358296722174, "epoch": 9.922018883319733, "grad_norm": 0.2578125, "learning_rate": 4.595080600707324e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999409317970276, "num_tokens": 142457226.0, "step": 42565 }, { "entropy": 0.06119176633656025, "epoch": 9.923184520340365, "grad_norm": 1.4375, "learning_rate": 4.594968122156209e-05, "loss": 0.0038, "mean_token_accuracy": 0.9991643846035003, "num_tokens": 142471257.0, "step": 42570 }, { "entropy": 0.04179687043651938, "epoch": 9.924350157360998, "grad_norm": 0.0390625, "learning_rate": 4.594855631005348e-05, "loss": 0.0031, "mean_token_accuracy": 0.9988110899925232, "num_tokens": 142502096.0, "step": 42575 }, { "entropy": 0.049814547039568426, "epoch": 9.92551579438163, "grad_norm": 0.21875, "learning_rate": 4.594743127256419e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999563336372376, "num_tokens": 142528528.0, "step": 42580 }, { "entropy": 0.07513895481824875, "epoch": 9.926681431402262, "grad_norm": 0.043212890625, "learning_rate": 4.594630610911099e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 142539302.0, "step": 42585 }, { "entropy": 0.05776936365291476, "epoch": 9.927847068422894, "grad_norm": 0.12109375, "learning_rate": 4.5945180819710673e-05, "loss": 0.0004, "mean_token_accuracy": 0.9997615098953248, "num_tokens": 142568093.0, "step": 42590 }, { "entropy": 0.0796022929251194, "epoch": 9.929012705443524, "grad_norm": 3.421875, "learning_rate": 4.594405540438e-05, "loss": 0.0008, "mean_token_accuracy": 0.9999736607074737, "num_tokens": 142585063.0, "step": 42595 }, { "entropy": 0.061672138934955004, "epoch": 9.930178342464156, "grad_norm": 0.6171875, "learning_rate": 4.5942929863135765e-05, "loss": 0.003, "mean_token_accuracy": 0.9992265224456787, "num_tokens": 142612185.0, "step": 42600 }, { "entropy": 0.06622321843169629, "epoch": 9.931343979484788, "grad_norm": 0.1005859375, "learning_rate": 4.594180419599474e-05, "loss": 0.0009, "mean_token_accuracy": 0.9999887228012085, "num_tokens": 142640222.0, "step": 42605 }, { "entropy": 0.04617886040359735, "epoch": 9.93250961650542, "grad_norm": 0.051513671875, "learning_rate": 4.594067840297372e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999405086040497, "num_tokens": 142673271.0, "step": 42610 }, { "entropy": 0.06401413748972118, "epoch": 9.933675253526053, "grad_norm": 0.046142578125, "learning_rate": 4.5939552484089485e-05, "loss": 0.0007, "mean_token_accuracy": 0.9999673068523407, "num_tokens": 142697553.0, "step": 42615 }, { "entropy": 0.06120873279869556, "epoch": 9.934840890546683, "grad_norm": 0.1904296875, "learning_rate": 4.593842643935884e-05, "loss": 0.0006, "mean_token_accuracy": 0.999710202217102, "num_tokens": 142718665.0, "step": 42620 }, { "entropy": 0.08795096650719643, "epoch": 9.936006527567315, "grad_norm": 0.07177734375, "learning_rate": 4.593730026879856e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 142725823.0, "step": 42625 }, { "entropy": 0.0548077093437314, "epoch": 9.937172164587947, "grad_norm": 0.0849609375, "learning_rate": 4.593617397242544e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 142741644.0, "step": 42630 }, { "entropy": 0.0867973305284977, "epoch": 9.93833780160858, "grad_norm": 1.9453125, "learning_rate": 4.5935047550256274e-05, "loss": 0.0042, "mean_token_accuracy": 0.9996503353118896, "num_tokens": 142770440.0, "step": 42635 }, { "entropy": 0.060863146930933, "epoch": 9.939503438629211, "grad_norm": 2.546875, "learning_rate": 4.5933921002307875e-05, "loss": 0.0017, "mean_token_accuracy": 0.9998944044113159, "num_tokens": 142784755.0, "step": 42640 }, { "entropy": 0.050374433491379024, "epoch": 9.940669075649843, "grad_norm": 0.2392578125, "learning_rate": 4.593279432859702e-05, "loss": 0.002, "mean_token_accuracy": 0.9991772949695588, "num_tokens": 142813398.0, "step": 42645 }, { "entropy": 0.06011525299400091, "epoch": 9.941834712670474, "grad_norm": 0.1806640625, "learning_rate": 4.5931667529140516e-05, "loss": 0.0006, "mean_token_accuracy": 0.9998737394809722, "num_tokens": 142827724.0, "step": 42650 }, { "entropy": 0.059044943377375605, "epoch": 9.943000349691106, "grad_norm": 0.177734375, "learning_rate": 4.593054060395517e-05, "loss": 0.0028, "mean_token_accuracy": 0.9989231169223786, "num_tokens": 142851612.0, "step": 42655 }, { "entropy": 0.057979169255122545, "epoch": 9.944165986711738, "grad_norm": 0.189453125, "learning_rate": 4.592941355305778e-05, "loss": 0.0017, "mean_token_accuracy": 0.9997536420822144, "num_tokens": 142876056.0, "step": 42660 }, { "entropy": 0.0455438518896699, "epoch": 9.94533162373237, "grad_norm": 0.078125, "learning_rate": 4.5928286376465156e-05, "loss": 0.0007, "mean_token_accuracy": 0.9999211847782135, "num_tokens": 142898663.0, "step": 42665 }, { "entropy": 0.06030480302870274, "epoch": 9.946497260753002, "grad_norm": 0.09765625, "learning_rate": 4.592715907419411e-05, "loss": 0.0024, "mean_token_accuracy": 0.9997260272502899, "num_tokens": 142908469.0, "step": 42670 }, { "entropy": 0.053213020972907546, "epoch": 9.947662897773633, "grad_norm": 0.197265625, "learning_rate": 4.5926031646261445e-05, "loss": 0.0009, "mean_token_accuracy": 0.9995137393474579, "num_tokens": 142921349.0, "step": 42675 }, { "entropy": 0.055709246825426814, "epoch": 9.948828534794265, "grad_norm": 0.02490234375, "learning_rate": 4.5924904092683974e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 142932311.0, "step": 42680 }, { "entropy": 0.05174860148690641, "epoch": 9.949994171814897, "grad_norm": 0.033935546875, "learning_rate": 4.5923776413478506e-05, "loss": 0.0008, "mean_token_accuracy": 0.9995390951633454, "num_tokens": 142970140.0, "step": 42685 }, { "entropy": 0.06010691300034523, "epoch": 9.951159808835529, "grad_norm": 0.609375, "learning_rate": 4.592264860866187e-05, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 142980699.0, "step": 42690 }, { "entropy": 0.05457963831722736, "epoch": 9.952325445856161, "grad_norm": 1.859375, "learning_rate": 4.592152067825087e-05, "loss": 0.002, "mean_token_accuracy": 0.999420291185379, "num_tokens": 142993009.0, "step": 42695 }, { "entropy": 0.15338611789047718, "epoch": 9.953491082876791, "grad_norm": 1.4140625, "learning_rate": 4.592039262226233e-05, "loss": 0.2135, "mean_token_accuracy": 0.9780685484409333, "num_tokens": 143016315.0, "step": 42700 }, { "entropy": 0.04838618785142899, "epoch": 9.954656719897423, "grad_norm": 0.1689453125, "learning_rate": 4.591926444071307e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999370753765107, "num_tokens": 143043190.0, "step": 42705 }, { "entropy": 0.06054220134392381, "epoch": 9.955822356918056, "grad_norm": 0.2578125, "learning_rate": 4.5918136133619924e-05, "loss": 0.0008, "mean_token_accuracy": 0.9999464511871338, "num_tokens": 143067614.0, "step": 42710 }, { "entropy": 0.053869908582419156, "epoch": 9.956987993938688, "grad_norm": 0.1669921875, "learning_rate": 4.59170077009997e-05, "loss": 0.001, "mean_token_accuracy": 0.9998924374580384, "num_tokens": 143104360.0, "step": 42715 }, { "entropy": 0.05953489625826478, "epoch": 9.95815363095932, "grad_norm": 0.0260009765625, "learning_rate": 4.591587914286923e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999382019042968, "num_tokens": 143121331.0, "step": 42720 }, { "entropy": 0.06068947771564126, "epoch": 9.959319267979952, "grad_norm": 1.90625, "learning_rate": 4.591475045924534e-05, "loss": 0.0021, "mean_token_accuracy": 0.9996376812458039, "num_tokens": 143137585.0, "step": 42725 }, { "entropy": 0.05690534273162484, "epoch": 9.960484905000582, "grad_norm": 0.038330078125, "learning_rate": 4.5913621650144866e-05, "loss": 0.001, "mean_token_accuracy": 1.0, "num_tokens": 143160525.0, "step": 42730 }, { "entropy": 0.0698689054697752, "epoch": 9.961650542021214, "grad_norm": 0.0927734375, "learning_rate": 4.5912492715584645e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 143170908.0, "step": 42735 }, { "entropy": 0.059409552905708554, "epoch": 9.962816179041846, "grad_norm": 0.578125, "learning_rate": 4.59113636555815e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999563694000244, "num_tokens": 143186397.0, "step": 42740 }, { "entropy": 0.05956787364557385, "epoch": 9.963981816062478, "grad_norm": 0.1748046875, "learning_rate": 4.591023447015227e-05, "loss": 0.0028, "mean_token_accuracy": 0.9996439158916474, "num_tokens": 143208394.0, "step": 42745 }, { "entropy": 0.061505304835736754, "epoch": 9.96514745308311, "grad_norm": 0.053466796875, "learning_rate": 4.5909105159313796e-05, "loss": 0.0006, "mean_token_accuracy": 1.0, "num_tokens": 143217121.0, "step": 42750 }, { "entropy": 0.05436940034851432, "epoch": 9.966313090103741, "grad_norm": 0.06689453125, "learning_rate": 4.5907975723082916e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999572575092316, "num_tokens": 143237233.0, "step": 42755 }, { "entropy": 0.06057135965675116, "epoch": 9.967478727124373, "grad_norm": 0.55859375, "learning_rate": 4.5906846161476475e-05, "loss": 0.0024, "mean_token_accuracy": 0.9984305322170257, "num_tokens": 143259643.0, "step": 42760 }, { "entropy": 0.044597274530678986, "epoch": 9.968644364145005, "grad_norm": 0.1962890625, "learning_rate": 4.5905716474511307e-05, "loss": 0.0006, "mean_token_accuracy": 0.9998950362205505, "num_tokens": 143283034.0, "step": 42765 }, { "entropy": 0.06079157404601574, "epoch": 9.969810001165637, "grad_norm": 0.212890625, "learning_rate": 4.590458666220427e-05, "loss": 0.0004, "mean_token_accuracy": 0.999790358543396, "num_tokens": 143299578.0, "step": 42770 }, { "entropy": 0.0736723642796278, "epoch": 9.97097563818627, "grad_norm": 0.212890625, "learning_rate": 4.59034567245722e-05, "loss": 0.0015, "mean_token_accuracy": 0.9993957698345184, "num_tokens": 143315016.0, "step": 42775 }, { "entropy": 0.07209227010607719, "epoch": 9.972141275206901, "grad_norm": 0.06787109375, "learning_rate": 4.590232666163196e-05, "loss": 0.0009, "mean_token_accuracy": 0.9989817023277283, "num_tokens": 143343550.0, "step": 42780 }, { "entropy": 0.052204666286706926, "epoch": 9.973306912227532, "grad_norm": 0.042236328125, "learning_rate": 4.5901196473400374e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 143365780.0, "step": 42785 }, { "entropy": 0.06627508513629436, "epoch": 9.974472549248164, "grad_norm": 0.0693359375, "learning_rate": 4.590006615989432e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 143376379.0, "step": 42790 }, { "entropy": 0.05749188121408224, "epoch": 9.975638186268796, "grad_norm": 0.4140625, "learning_rate": 4.589893572113065e-05, "loss": 0.0008, "mean_token_accuracy": 0.99997838139534, "num_tokens": 143395659.0, "step": 42795 }, { "entropy": 0.04636556897312403, "epoch": 9.976803823289428, "grad_norm": 0.0262451171875, "learning_rate": 4.589780515712622e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999452292919159, "num_tokens": 143415294.0, "step": 42800 }, { "entropy": 0.059783428255468604, "epoch": 9.97796946031006, "grad_norm": 0.41796875, "learning_rate": 4.589667446789787e-05, "loss": 0.0008, "mean_token_accuracy": 0.9999665439128875, "num_tokens": 143451315.0, "step": 42805 }, { "entropy": 0.04871940072625876, "epoch": 9.97913509733069, "grad_norm": 0.1123046875, "learning_rate": 4.589554365346248e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 143465653.0, "step": 42810 }, { "entropy": 0.07266805339604616, "epoch": 9.980300734351323, "grad_norm": 0.032958984375, "learning_rate": 4.5894412713836906e-05, "loss": 0.0018, "mean_token_accuracy": 0.9993788838386536, "num_tokens": 143475465.0, "step": 42815 }, { "entropy": 0.05912731643766165, "epoch": 9.981466371371955, "grad_norm": 0.0247802734375, "learning_rate": 4.589328164903802e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 143489561.0, "step": 42820 }, { "entropy": 0.05282509876415133, "epoch": 9.982632008392587, "grad_norm": 0.09716796875, "learning_rate": 4.589215045908267e-05, "loss": 0.0015, "mean_token_accuracy": 0.999288922548294, "num_tokens": 143516069.0, "step": 42825 }, { "entropy": 0.062217441760003565, "epoch": 9.983797645413219, "grad_norm": 0.9375, "learning_rate": 4.589101914398774e-05, "loss": 0.0027, "mean_token_accuracy": 0.9992483615875244, "num_tokens": 143530501.0, "step": 42830 }, { "entropy": 0.07485974226146937, "epoch": 9.98496328243385, "grad_norm": 0.2001953125, "learning_rate": 4.58898877037701e-05, "loss": 0.0003, "mean_token_accuracy": 0.9997389018535614, "num_tokens": 143550036.0, "step": 42835 }, { "entropy": 0.04717427408322692, "epoch": 9.986128919454481, "grad_norm": 0.74609375, "learning_rate": 4.58887561384466e-05, "loss": 0.0006, "mean_token_accuracy": 0.9997922837734222, "num_tokens": 143571664.0, "step": 42840 }, { "entropy": 0.049185032676905396, "epoch": 9.987294556475113, "grad_norm": 0.1650390625, "learning_rate": 4.588762444803414e-05, "loss": 0.0007, "mean_token_accuracy": 0.999546229839325, "num_tokens": 143599859.0, "step": 42845 }, { "entropy": 0.045813700463622806, "epoch": 9.988460193495746, "grad_norm": 0.09375, "learning_rate": 4.5886492632549575e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999786972999573, "num_tokens": 143629787.0, "step": 42850 }, { "entropy": 0.07945517208427191, "epoch": 9.989625830516378, "grad_norm": 0.1357421875, "learning_rate": 4.58853606920098e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 143655188.0, "step": 42855 }, { "entropy": 0.05326961185783148, "epoch": 9.99079146753701, "grad_norm": 0.2021484375, "learning_rate": 4.588422862643168e-05, "loss": 0.0123, "mean_token_accuracy": 0.9983867526054382, "num_tokens": 143680228.0, "step": 42860 }, { "entropy": 0.061639095610007646, "epoch": 9.99195710455764, "grad_norm": 0.65234375, "learning_rate": 4.58830964358321e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999892473220825, "num_tokens": 143697468.0, "step": 42865 }, { "entropy": 0.05384563766419888, "epoch": 9.993122741578272, "grad_norm": 0.13671875, "learning_rate": 4.588196412022795e-05, "loss": 0.0014, "mean_token_accuracy": 0.999766594171524, "num_tokens": 143720560.0, "step": 42870 }, { "entropy": 0.05485376380383968, "epoch": 9.994288378598904, "grad_norm": 0.177734375, "learning_rate": 4.58808316796361e-05, "loss": 0.0008, "mean_token_accuracy": 0.9998770773410797, "num_tokens": 143731863.0, "step": 42875 }, { "entropy": 0.06633745562285184, "epoch": 9.995454015619536, "grad_norm": 0.12255859375, "learning_rate": 4.587969911407344e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 143748516.0, "step": 42880 }, { "entropy": 0.041839384008198977, "epoch": 9.996619652640168, "grad_norm": 0.1923828125, "learning_rate": 4.587856642355687e-05, "loss": 0.0007, "mean_token_accuracy": 0.9999523460865021, "num_tokens": 143773800.0, "step": 42885 }, { "entropy": 0.03867072528228164, "epoch": 9.997785289660799, "grad_norm": 0.10888671875, "learning_rate": 4.5877433608103275e-05, "loss": 0.0012, "mean_token_accuracy": 0.9996968567371368, "num_tokens": 143807957.0, "step": 42890 }, { "entropy": 0.044101893063634635, "epoch": 9.998950926681431, "grad_norm": 0.06591796875, "learning_rate": 4.587630066772954e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999754369258881, "num_tokens": 143832484.0, "step": 42895 }, { "entropy": 0.05562862671083874, "epoch": 10.0, "grad_norm": 0.0458984375, "learning_rate": 4.587516760245257e-05, "loss": 0.0011, "mean_token_accuracy": 0.9997179905573527, "num_tokens": 143846000.0, "step": 42900 }, { "entropy": 0.05632606642320752, "epoch": 10.001165637020632, "grad_norm": 0.64453125, "learning_rate": 4.587403441228924e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 143862104.0, "step": 42905 }, { "entropy": 0.052796078659594056, "epoch": 10.002331274041264, "grad_norm": 0.09130859375, "learning_rate": 4.5872901097256474e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 143880552.0, "step": 42910 }, { "entropy": 0.06833078693598509, "epoch": 10.003496911061895, "grad_norm": 0.04443359375, "learning_rate": 4.587176765737116e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 143894599.0, "step": 42915 }, { "entropy": 0.03659242931753397, "epoch": 10.004662548082527, "grad_norm": 0.0216064453125, "learning_rate": 4.5870634092650186e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 143924185.0, "step": 42920 }, { "entropy": 0.06099397018551826, "epoch": 10.005828185103159, "grad_norm": 0.1708984375, "learning_rate": 4.586950040311048e-05, "loss": 0.0013, "mean_token_accuracy": 0.9995762705802917, "num_tokens": 143936581.0, "step": 42925 }, { "entropy": 0.06240507024340332, "epoch": 10.00699382212379, "grad_norm": 0.020263671875, "learning_rate": 4.586836658876893e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 143964883.0, "step": 42930 }, { "entropy": 0.05507293636910617, "epoch": 10.008159459144423, "grad_norm": 0.08203125, "learning_rate": 4.586723264964245e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999892473220825, "num_tokens": 143984179.0, "step": 42935 }, { "entropy": 0.05958865638822317, "epoch": 10.009325096165055, "grad_norm": 0.0712890625, "learning_rate": 4.586609858574794e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 144000657.0, "step": 42940 }, { "entropy": 0.06111275050789118, "epoch": 10.010490733185685, "grad_norm": 0.115234375, "learning_rate": 4.5864964397102324e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 144012450.0, "step": 42945 }, { "entropy": 0.05097737601026893, "epoch": 10.011656370206317, "grad_norm": 0.0166015625, "learning_rate": 4.58638300837225e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 144031839.0, "step": 42950 }, { "entropy": 0.0694691475480795, "epoch": 10.01282200722695, "grad_norm": 0.0234375, "learning_rate": 4.586269564562539e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 144050020.0, "step": 42955 }, { "entropy": 0.06655300110578537, "epoch": 10.013987644247582, "grad_norm": 0.0634765625, "learning_rate": 4.586156108282791e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 144061209.0, "step": 42960 }, { "entropy": 0.046417646063491705, "epoch": 10.015153281268214, "grad_norm": 0.0137939453125, "learning_rate": 4.586042639534699e-05, "loss": 0.0001, "mean_token_accuracy": 0.999989265203476, "num_tokens": 144085648.0, "step": 42965 }, { "entropy": 0.0366755124181509, "epoch": 10.016318918288844, "grad_norm": 0.0118408203125, "learning_rate": 4.585929158319952e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 144107360.0, "step": 42970 }, { "entropy": 0.053187299706041816, "epoch": 10.017484555309476, "grad_norm": 0.01251220703125, "learning_rate": 4.585815664640245e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 144129854.0, "step": 42975 }, { "entropy": 0.043479549791663887, "epoch": 10.018650192330108, "grad_norm": 0.032470703125, "learning_rate": 4.585702158497269e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999892294406891, "num_tokens": 144165456.0, "step": 42980 }, { "entropy": 0.046352763567119835, "epoch": 10.01981582935074, "grad_norm": 0.140625, "learning_rate": 4.585588639892717e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999870836734772, "num_tokens": 144186276.0, "step": 42985 }, { "entropy": 0.03585439161397517, "epoch": 10.020981466371373, "grad_norm": 0.0269775390625, "learning_rate": 4.5854751088282815e-05, "loss": 0.0008, "mean_token_accuracy": 0.9999523818492889, "num_tokens": 144220443.0, "step": 42990 }, { "entropy": 0.05620026285760105, "epoch": 10.022147103392003, "grad_norm": 0.236328125, "learning_rate": 4.585361565305656e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 144239270.0, "step": 42995 }, { "entropy": 0.06483755446970463, "epoch": 10.023312740412635, "grad_norm": 0.03173828125, "learning_rate": 4.585248009326532e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 144248183.0, "step": 43000 }, { "entropy": 0.039686411060392855, "epoch": 10.024478377433267, "grad_norm": 0.0673828125, "learning_rate": 4.5851344408926046e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 144273189.0, "step": 43005 }, { "entropy": 0.04534797742962837, "epoch": 10.0256440144539, "grad_norm": 0.08642578125, "learning_rate": 4.585020860005567e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999887585639954, "num_tokens": 144296910.0, "step": 43010 }, { "entropy": 0.046770491264760496, "epoch": 10.026809651474531, "grad_norm": 0.0147705078125, "learning_rate": 4.5849072666671116e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 144310853.0, "step": 43015 }, { "entropy": 0.061034923605620864, "epoch": 10.027975288495163, "grad_norm": 0.2138671875, "learning_rate": 4.5847936608789336e-05, "loss": 0.0003, "mean_token_accuracy": 0.9998384475708008, "num_tokens": 144324377.0, "step": 43020 }, { "entropy": 0.07491187937557697, "epoch": 10.029140925515794, "grad_norm": 0.06884765625, "learning_rate": 4.584680042642726e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 144334358.0, "step": 43025 }, { "entropy": 0.05644649108871817, "epoch": 10.030306562536426, "grad_norm": 0.2109375, "learning_rate": 4.584566411960184e-05, "loss": 0.001, "mean_token_accuracy": 0.9999792277812958, "num_tokens": 144360323.0, "step": 43030 }, { "entropy": 0.0484986113384366, "epoch": 10.031472199557058, "grad_norm": 0.06640625, "learning_rate": 4.5844527688330013e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 144383411.0, "step": 43035 }, { "entropy": 0.061395833175629376, "epoch": 10.03263783657769, "grad_norm": 0.0216064453125, "learning_rate": 4.584339113262873e-05, "loss": 0.0005, "mean_token_accuracy": 0.9997023820877076, "num_tokens": 144396897.0, "step": 43040 }, { "entropy": 0.05118941427208483, "epoch": 10.033803473598322, "grad_norm": 0.038330078125, "learning_rate": 4.584225445251493e-05, "loss": 0.0002, "mean_token_accuracy": 0.9996197700500489, "num_tokens": 144427047.0, "step": 43045 }, { "entropy": 0.05451108440756798, "epoch": 10.034969110618952, "grad_norm": 0.01434326171875, "learning_rate": 4.584111764800557e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 144439872.0, "step": 43050 }, { "entropy": 0.0546447460539639, "epoch": 10.036134747639585, "grad_norm": 0.048095703125, "learning_rate": 4.58399807191176e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 144454004.0, "step": 43055 }, { "entropy": 0.04826419707387686, "epoch": 10.037300384660217, "grad_norm": 0.044677734375, "learning_rate": 4.583884366586798e-05, "loss": 0.0007, "mean_token_accuracy": 0.9998730480670929, "num_tokens": 144486122.0, "step": 43060 }, { "entropy": 0.04088318916037679, "epoch": 10.038466021680849, "grad_norm": 0.031494140625, "learning_rate": 4.583770648827366e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 144510438.0, "step": 43065 }, { "entropy": 0.06502441363409162, "epoch": 10.03963165870148, "grad_norm": 0.1201171875, "learning_rate": 4.583656918635159e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999777853488923, "num_tokens": 144534876.0, "step": 43070 }, { "entropy": 0.04362553898245096, "epoch": 10.040797295722113, "grad_norm": 0.055419921875, "learning_rate": 4.583543176011874e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999790906906127, "num_tokens": 144568855.0, "step": 43075 }, { "entropy": 0.06911927331238985, "epoch": 10.041962932742743, "grad_norm": 0.026123046875, "learning_rate": 4.5834294209592055e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 144579570.0, "step": 43080 }, { "entropy": 0.060473337210714816, "epoch": 10.043128569763375, "grad_norm": 0.049072265625, "learning_rate": 4.5833156534788515e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 144607964.0, "step": 43085 }, { "entropy": 0.0562533063814044, "epoch": 10.044294206784008, "grad_norm": 0.0419921875, "learning_rate": 4.5832018735725066e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 144621562.0, "step": 43090 }, { "entropy": 0.06382388435304165, "epoch": 10.04545984380464, "grad_norm": 0.228515625, "learning_rate": 4.58308808124187e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 144641956.0, "step": 43095 }, { "entropy": 0.055002011358737946, "epoch": 10.046625480825272, "grad_norm": 0.050537109375, "learning_rate": 4.5829742764886365e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 144651672.0, "step": 43100 }, { "entropy": 0.05091058509424329, "epoch": 10.047791117845902, "grad_norm": 0.0703125, "learning_rate": 4.582860459314504e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 144673179.0, "step": 43105 }, { "entropy": 0.07972749415785074, "epoch": 10.048956754866534, "grad_norm": 0.1328125, "learning_rate": 4.582746629721169e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 144685739.0, "step": 43110 }, { "entropy": 0.04776022732257843, "epoch": 10.050122391887166, "grad_norm": 0.036865234375, "learning_rate": 4.5826327877103294e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999884486198425, "num_tokens": 144722168.0, "step": 43115 }, { "entropy": 0.05045590978115797, "epoch": 10.051288028907798, "grad_norm": 0.03564453125, "learning_rate": 4.5825189332836826e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 144742010.0, "step": 43120 }, { "entropy": 0.04327065721154213, "epoch": 10.05245366592843, "grad_norm": 0.06787109375, "learning_rate": 4.582405066442926e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999754190444946, "num_tokens": 144761437.0, "step": 43125 }, { "entropy": 0.053365825302898885, "epoch": 10.05361930294906, "grad_norm": 0.0181884765625, "learning_rate": 4.582291187189758e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 144775121.0, "step": 43130 }, { "entropy": 0.05459557678550482, "epoch": 10.054784939969693, "grad_norm": 0.049072265625, "learning_rate": 4.5821772955258766e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 144790489.0, "step": 43135 }, { "entropy": 0.07486938592046499, "epoch": 10.055950576990325, "grad_norm": 0.04052734375, "learning_rate": 4.58206339145298e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999890625476837, "num_tokens": 144809884.0, "step": 43140 }, { "entropy": 0.06250056177377701, "epoch": 10.057116214010957, "grad_norm": 0.062255859375, "learning_rate": 4.5819494749727673e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 144843171.0, "step": 43145 }, { "entropy": 0.07772515416145324, "epoch": 10.05828185103159, "grad_norm": 0.1142578125, "learning_rate": 4.581835546086936e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999781608581543, "num_tokens": 144863844.0, "step": 43150 }, { "entropy": 0.06099540926516056, "epoch": 10.059447488052221, "grad_norm": 0.0255126953125, "learning_rate": 4.581721604797186e-05, "loss": 0.0007, "mean_token_accuracy": 0.9997014939785004, "num_tokens": 144875069.0, "step": 43155 }, { "entropy": 0.04972206288948655, "epoch": 10.060613125072852, "grad_norm": 0.0150146484375, "learning_rate": 4.5816076511052156e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999792039394378, "num_tokens": 144894553.0, "step": 43160 }, { "entropy": 0.07169721573591233, "epoch": 10.061778762093484, "grad_norm": 0.064453125, "learning_rate": 4.5814936850127246e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999859213829041, "num_tokens": 144910261.0, "step": 43165 }, { "entropy": 0.06280940165743232, "epoch": 10.062944399114116, "grad_norm": 0.0157470703125, "learning_rate": 4.5813797065214114e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 144926046.0, "step": 43170 }, { "entropy": 0.06737614311277866, "epoch": 10.064110036134748, "grad_norm": 0.10009765625, "learning_rate": 4.581265715632977e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 144941287.0, "step": 43175 }, { "entropy": 0.06027284953743219, "epoch": 10.06527567315538, "grad_norm": 0.0159912109375, "learning_rate": 4.5811517123491197e-05, "loss": 0.0004, "mean_token_accuracy": 0.9996774196624756, "num_tokens": 144963613.0, "step": 43180 }, { "entropy": 0.05328985899686813, "epoch": 10.06644131017601, "grad_norm": 0.1435546875, "learning_rate": 4.5810376966715415e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 144984479.0, "step": 43185 }, { "entropy": 0.057596872001886366, "epoch": 10.067606947196643, "grad_norm": 10.0625, "learning_rate": 4.5809236686019404e-05, "loss": 0.002, "mean_token_accuracy": 0.9995798289775848, "num_tokens": 144995502.0, "step": 43190 }, { "entropy": 0.05896051302552223, "epoch": 10.068772584217275, "grad_norm": 0.0291748046875, "learning_rate": 4.580809628142018e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 145009177.0, "step": 43195 }, { "entropy": 0.048872120585292576, "epoch": 10.069938221237907, "grad_norm": 0.1181640625, "learning_rate": 4.5806955752934736e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 145038730.0, "step": 43200 }, { "entropy": 0.055742715485394, "epoch": 10.071103858258539, "grad_norm": 0.031494140625, "learning_rate": 4.5805815100580096e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 145048608.0, "step": 43205 }, { "entropy": 0.05829884652048349, "epoch": 10.072269495279171, "grad_norm": 0.034912109375, "learning_rate": 4.580467432437326e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 145065347.0, "step": 43210 }, { "entropy": 0.069200593046844, "epoch": 10.073435132299801, "grad_norm": 0.0308837890625, "learning_rate": 4.580353342433124e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 145077310.0, "step": 43215 }, { "entropy": 0.05498428577557206, "epoch": 10.074600769320433, "grad_norm": 0.0849609375, "learning_rate": 4.580239240047104e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999893128871917, "num_tokens": 145099238.0, "step": 43220 }, { "entropy": 0.0957097552716732, "epoch": 10.075766406341065, "grad_norm": 0.111328125, "learning_rate": 4.5801251252809686e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 145109037.0, "step": 43225 }, { "entropy": 0.06944717965088784, "epoch": 10.076932043361698, "grad_norm": 0.01708984375, "learning_rate": 4.5800109981364194e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 145127293.0, "step": 43230 }, { "entropy": 0.051800630846992136, "epoch": 10.07809768038233, "grad_norm": 0.039306640625, "learning_rate": 4.579896858615157e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999708771705628, "num_tokens": 145151103.0, "step": 43235 }, { "entropy": 0.05186516717076302, "epoch": 10.07926331740296, "grad_norm": 0.0260009765625, "learning_rate": 4.5797827067188847e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 145166630.0, "step": 43240 }, { "entropy": 0.07183772623538971, "epoch": 10.080428954423592, "grad_norm": 0.06884765625, "learning_rate": 4.579668542449304e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999724447727203, "num_tokens": 145179262.0, "step": 43245 }, { "entropy": 0.0538714830763638, "epoch": 10.081594591444224, "grad_norm": 0.435546875, "learning_rate": 4.579554365808118e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 145207774.0, "step": 43250 }, { "entropy": 0.06259715519845485, "epoch": 10.082760228464856, "grad_norm": 0.08740234375, "learning_rate": 4.579440176797028e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 145229440.0, "step": 43255 }, { "entropy": 0.050983167439699176, "epoch": 10.083925865485488, "grad_norm": 0.0361328125, "learning_rate": 4.5793259754177376e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 145242812.0, "step": 43260 }, { "entropy": 0.08244118951261044, "epoch": 10.085091502506119, "grad_norm": 0.033203125, "learning_rate": 4.5792117616719496e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 145251750.0, "step": 43265 }, { "entropy": 0.06357784420251847, "epoch": 10.08625713952675, "grad_norm": 0.0267333984375, "learning_rate": 4.579097535561367e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999827563762664, "num_tokens": 145268581.0, "step": 43270 }, { "entropy": 0.07260563299059868, "epoch": 10.087422776547383, "grad_norm": 0.072265625, "learning_rate": 4.578983297087693e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 145278177.0, "step": 43275 }, { "entropy": 0.04490415137261152, "epoch": 10.088588413568015, "grad_norm": 0.029052734375, "learning_rate": 4.5788690462526315e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 145292038.0, "step": 43280 }, { "entropy": 0.04673054702579975, "epoch": 10.089754050588647, "grad_norm": 0.01177978515625, "learning_rate": 4.5787547830578855e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 145309088.0, "step": 43285 }, { "entropy": 0.07409633975476027, "epoch": 10.09091968760928, "grad_norm": 0.035400390625, "learning_rate": 4.578640507505159e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 145321036.0, "step": 43290 }, { "entropy": 0.05097045348957181, "epoch": 10.09208532462991, "grad_norm": 0.10791015625, "learning_rate": 4.5785262195961567e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999896228313446, "num_tokens": 145345470.0, "step": 43295 }, { "entropy": 0.05900251679122448, "epoch": 10.093250961650542, "grad_norm": 0.0203857421875, "learning_rate": 4.5784119193325824e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 145355866.0, "step": 43300 }, { "entropy": 0.04195380094461143, "epoch": 10.094416598671174, "grad_norm": 0.026123046875, "learning_rate": 4.57829760671614e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999420940876007, "num_tokens": 145391273.0, "step": 43305 }, { "entropy": 0.06040984988212585, "epoch": 10.095582235691806, "grad_norm": 0.021484375, "learning_rate": 4.5781832817485345e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 145404008.0, "step": 43310 }, { "entropy": 0.05451854122802615, "epoch": 10.096747872712438, "grad_norm": 0.1279296875, "learning_rate": 4.578068944431471e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 145428004.0, "step": 43315 }, { "entropy": 0.07288854941725731, "epoch": 10.097913509733068, "grad_norm": 0.0146484375, "learning_rate": 4.577954594766653e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 145437308.0, "step": 43320 }, { "entropy": 0.04537887102924287, "epoch": 10.0990791467537, "grad_norm": 0.07470703125, "learning_rate": 4.5778402327557874e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 145466503.0, "step": 43325 }, { "entropy": 0.05773584768176079, "epoch": 10.100244783774333, "grad_norm": 0.057861328125, "learning_rate": 4.577725858400579e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999887704849243, "num_tokens": 145488191.0, "step": 43330 }, { "entropy": 0.06356157390400767, "epoch": 10.101410420794965, "grad_norm": 0.018798828125, "learning_rate": 4.577611471702733e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 145500787.0, "step": 43335 }, { "entropy": 0.05118578230030835, "epoch": 10.102576057815597, "grad_norm": 0.0164794921875, "learning_rate": 4.5774970726639546e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999885082244873, "num_tokens": 145524395.0, "step": 43340 }, { "entropy": 0.07704499680548907, "epoch": 10.103741694836229, "grad_norm": 0.02734375, "learning_rate": 4.5773826612859505e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 145535984.0, "step": 43345 }, { "entropy": 0.04595871288329363, "epoch": 10.10490733185686, "grad_norm": 0.1015625, "learning_rate": 4.577268237570427e-05, "loss": 0.0039, "mean_token_accuracy": 0.9998726367950439, "num_tokens": 145572556.0, "step": 43350 }, { "entropy": 0.05619607055559754, "epoch": 10.106072968877491, "grad_norm": 0.0299072265625, "learning_rate": 4.577153801519089e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 145588353.0, "step": 43355 }, { "entropy": 0.05760320192202926, "epoch": 10.107238605898123, "grad_norm": 0.01513671875, "learning_rate": 4.577039353133644e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999895393848419, "num_tokens": 145613438.0, "step": 43360 }, { "entropy": 0.06066440977156162, "epoch": 10.108404242918756, "grad_norm": 0.111328125, "learning_rate": 4.576924892415799e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999894499778748, "num_tokens": 145635962.0, "step": 43365 }, { "entropy": 0.05439153090119362, "epoch": 10.109569879939388, "grad_norm": 0.0184326171875, "learning_rate": 4.576810419367259e-05, "loss": 0.0004, "mean_token_accuracy": 0.999989140033722, "num_tokens": 145666330.0, "step": 43370 }, { "entropy": 0.06398234134539962, "epoch": 10.110735516960018, "grad_norm": 0.0252685546875, "learning_rate": 4.5766959339897325e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 145686348.0, "step": 43375 }, { "entropy": 0.06492618154734373, "epoch": 10.11190115398065, "grad_norm": 0.06298828125, "learning_rate": 4.576581436284926e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 145698197.0, "step": 43380 }, { "entropy": 0.05615097945556045, "epoch": 10.113066791001282, "grad_norm": 0.0478515625, "learning_rate": 4.576466926254547e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 145714330.0, "step": 43385 }, { "entropy": 0.047344126645475625, "epoch": 10.114232428021914, "grad_norm": 0.0849609375, "learning_rate": 4.576352403900304e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 145738619.0, "step": 43390 }, { "entropy": 0.05977406506426632, "epoch": 10.115398065042546, "grad_norm": 0.034423828125, "learning_rate": 4.5762378692239025e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 145761272.0, "step": 43395 }, { "entropy": 0.0675832625478506, "epoch": 10.116563702063177, "grad_norm": 0.060546875, "learning_rate": 4.576123322227053e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999685943126678, "num_tokens": 145782553.0, "step": 43400 }, { "entropy": 0.05482005216181278, "epoch": 10.117729339083809, "grad_norm": 0.04345703125, "learning_rate": 4.576008762911461e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 145801907.0, "step": 43405 }, { "entropy": 0.06072709150612354, "epoch": 10.11889497610444, "grad_norm": 0.047119140625, "learning_rate": 4.5758941912788364e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 145810627.0, "step": 43410 }, { "entropy": 0.04225499797612429, "epoch": 10.120060613125073, "grad_norm": 0.0546875, "learning_rate": 4.575779607330887e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999840795993805, "num_tokens": 145833376.0, "step": 43415 }, { "entropy": 0.06278912676498294, "epoch": 10.121226250145705, "grad_norm": 0.039306640625, "learning_rate": 4.5756650110693225e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 145852253.0, "step": 43420 }, { "entropy": 0.05084234895184636, "epoch": 10.122391887166337, "grad_norm": 0.07666015625, "learning_rate": 4.5755504024958493e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999851584434509, "num_tokens": 145870162.0, "step": 43425 }, { "entropy": 0.048171498160809276, "epoch": 10.123557524186968, "grad_norm": 0.125, "learning_rate": 4.575435781612179e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999879360198974, "num_tokens": 145899437.0, "step": 43430 }, { "entropy": 0.04637369932606816, "epoch": 10.1247231612076, "grad_norm": 0.0791015625, "learning_rate": 4.5753211484200195e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 145921647.0, "step": 43435 }, { "entropy": 0.046095291478559376, "epoch": 10.125888798228232, "grad_norm": 0.024169921875, "learning_rate": 4.57520650292108e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 145952959.0, "step": 43440 }, { "entropy": 0.03387051681056619, "epoch": 10.127054435248864, "grad_norm": 0.11865234375, "learning_rate": 4.57509184511707e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999893724918365, "num_tokens": 145988615.0, "step": 43445 }, { "entropy": 0.07406612485647202, "epoch": 10.128220072269496, "grad_norm": 0.00927734375, "learning_rate": 4.5749771750097e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 146009149.0, "step": 43450 }, { "entropy": 0.05293272449634969, "epoch": 10.129385709290126, "grad_norm": 0.0269775390625, "learning_rate": 4.5748624926006806e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999882042407989, "num_tokens": 146036249.0, "step": 43455 }, { "entropy": 0.07195199579000473, "epoch": 10.130551346310758, "grad_norm": 0.2451171875, "learning_rate": 4.5747477978917196e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 146046878.0, "step": 43460 }, { "entropy": 0.049850366963073614, "epoch": 10.13171698333139, "grad_norm": 0.0308837890625, "learning_rate": 4.5746330908845286e-05, "loss": 0.001, "mean_token_accuracy": 0.9996632993221283, "num_tokens": 146067672.0, "step": 43465 }, { "entropy": 0.046344765927642584, "epoch": 10.132882620352023, "grad_norm": 0.294921875, "learning_rate": 4.574518371580818e-05, "loss": 0.0071, "mean_token_accuracy": 0.9994359970092773, "num_tokens": 146102759.0, "step": 43470 }, { "entropy": 0.058653326518833634, "epoch": 10.134048257372655, "grad_norm": 0.0208740234375, "learning_rate": 4.574403639982298e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 146124319.0, "step": 43475 }, { "entropy": 0.06518659181892872, "epoch": 10.135213894393287, "grad_norm": 0.0206298828125, "learning_rate": 4.57428889609068e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 146136593.0, "step": 43480 }, { "entropy": 0.05135323880240321, "epoch": 10.136379531413917, "grad_norm": 0.048583984375, "learning_rate": 4.574174139907675e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 146158241.0, "step": 43485 }, { "entropy": 0.18844906222075225, "epoch": 10.13754516843455, "grad_norm": 5.40625, "learning_rate": 4.574059371434993e-05, "loss": 0.2538, "mean_token_accuracy": 0.9702293097972869, "num_tokens": 146197255.0, "step": 43490 }, { "entropy": 0.03568605692125857, "epoch": 10.138710805455181, "grad_norm": 0.046630859375, "learning_rate": 4.573944590674347e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999648869037628, "num_tokens": 146230349.0, "step": 43495 }, { "entropy": 0.05035318732261658, "epoch": 10.139876442475813, "grad_norm": 0.0361328125, "learning_rate": 4.573829797627449e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 146243193.0, "step": 43500 }, { "entropy": 0.06015748176723719, "epoch": 10.141042079496446, "grad_norm": 0.03857421875, "learning_rate": 4.573714992296008e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999892771244049, "num_tokens": 146266143.0, "step": 43505 }, { "entropy": 0.07109114537015557, "epoch": 10.142207716517076, "grad_norm": 0.0537109375, "learning_rate": 4.573600174681738e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 146284877.0, "step": 43510 }, { "entropy": 0.06056566163897514, "epoch": 10.143373353537708, "grad_norm": 0.1669921875, "learning_rate": 4.573485344786351e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999842584133148, "num_tokens": 146310349.0, "step": 43515 }, { "entropy": 0.040589083032682535, "epoch": 10.14453899055834, "grad_norm": 0.06689453125, "learning_rate": 4.5733705026115584e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 146340851.0, "step": 43520 }, { "entropy": 0.06647374760359526, "epoch": 10.145704627578972, "grad_norm": 0.05419921875, "learning_rate": 4.5732556481590736e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 146354591.0, "step": 43525 }, { "entropy": 0.0501191689632833, "epoch": 10.146870264599604, "grad_norm": 0.01611328125, "learning_rate": 4.573140781430608e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 146369355.0, "step": 43530 }, { "entropy": 0.0644486665725708, "epoch": 10.148035901620235, "grad_norm": 0.10791015625, "learning_rate": 4.573025902427876e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 146378703.0, "step": 43535 }, { "entropy": 0.048806856386363506, "epoch": 10.149201538640867, "grad_norm": 0.0177001953125, "learning_rate": 4.5729110111525904e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999835252761841, "num_tokens": 146406804.0, "step": 43540 }, { "entropy": 0.060486155655235053, "epoch": 10.150367175661499, "grad_norm": 0.0186767578125, "learning_rate": 4.572796107606463e-05, "loss": 0.0001, "mean_token_accuracy": 0.9999890804290772, "num_tokens": 146435773.0, "step": 43545 }, { "entropy": 0.057251500664278865, "epoch": 10.151532812682131, "grad_norm": 0.0269775390625, "learning_rate": 4.572681191791208e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999665915966034, "num_tokens": 146462467.0, "step": 43550 }, { "entropy": 0.0698289708700031, "epoch": 10.152698449702763, "grad_norm": 1.71875, "learning_rate": 4.57256626370854e-05, "loss": 0.0159, "mean_token_accuracy": 0.9981661915779114, "num_tokens": 146489551.0, "step": 43555 }, { "entropy": 0.06030179150402546, "epoch": 10.153864086723395, "grad_norm": 0.0712890625, "learning_rate": 4.572451323360171e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 146502896.0, "step": 43560 }, { "entropy": 0.06003944650292396, "epoch": 10.155029723744025, "grad_norm": 0.01373291015625, "learning_rate": 4.572336370747816e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 146513190.0, "step": 43565 }, { "entropy": 0.0528954841196537, "epoch": 10.156195360764658, "grad_norm": 0.0537109375, "learning_rate": 4.572221405873189e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 146529473.0, "step": 43570 }, { "entropy": 0.05009576361626387, "epoch": 10.15736099778529, "grad_norm": 0.04638671875, "learning_rate": 4.572106428738003e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 146547608.0, "step": 43575 }, { "entropy": 0.05700251702219248, "epoch": 10.158526634805922, "grad_norm": 0.07421875, "learning_rate": 4.571991439343976e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 146575736.0, "step": 43580 }, { "entropy": 0.058728303946554664, "epoch": 10.159692271826554, "grad_norm": 0.049560546875, "learning_rate": 4.571876437692818e-05, "loss": 0.0101, "mean_token_accuracy": 0.9980162858963013, "num_tokens": 146603676.0, "step": 43585 }, { "entropy": 0.05301784612238407, "epoch": 10.160857908847184, "grad_norm": 0.0189208984375, "learning_rate": 4.571761423786247e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 146622318.0, "step": 43590 }, { "entropy": 0.07999129854142666, "epoch": 10.162023545867816, "grad_norm": 0.034423828125, "learning_rate": 4.571646397625977e-05, "loss": 0.0067, "mean_token_accuracy": 0.9984594106674194, "num_tokens": 146645929.0, "step": 43595 }, { "entropy": 0.07351434212177992, "epoch": 10.163189182888448, "grad_norm": 0.06787109375, "learning_rate": 4.5715313592137235e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 146661177.0, "step": 43600 }, { "entropy": 0.05000458974391222, "epoch": 10.16435481990908, "grad_norm": 0.025146484375, "learning_rate": 4.5714163085512024e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999616265296936, "num_tokens": 146687619.0, "step": 43605 }, { "entropy": 0.05602185511961579, "epoch": 10.165520456929713, "grad_norm": 0.1865234375, "learning_rate": 4.571301245640128e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999705493450165, "num_tokens": 146710382.0, "step": 43610 }, { "entropy": 0.0893972160294652, "epoch": 10.166686093950345, "grad_norm": 0.080078125, "learning_rate": 4.5711861704822165e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 146722652.0, "step": 43615 }, { "entropy": 0.053737116511911154, "epoch": 10.167851730970975, "grad_norm": 0.019775390625, "learning_rate": 4.5710710830791846e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999891698360444, "num_tokens": 146751021.0, "step": 43620 }, { "entropy": 0.04591502221301198, "epoch": 10.169017367991607, "grad_norm": 0.026611328125, "learning_rate": 4.5709559834327475e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999671936035156, "num_tokens": 146784280.0, "step": 43625 }, { "entropy": 0.06887899702414871, "epoch": 10.17018300501224, "grad_norm": 0.030029296875, "learning_rate": 4.570840871544622e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999736905097961, "num_tokens": 146823176.0, "step": 43630 }, { "entropy": 0.045278893690556286, "epoch": 10.171348642032871, "grad_norm": 0.020751953125, "learning_rate": 4.570725747416525e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 146861771.0, "step": 43635 }, { "entropy": 0.06565503738820552, "epoch": 10.172514279053503, "grad_norm": 0.062255859375, "learning_rate": 4.5706106110501725e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 146883279.0, "step": 43640 }, { "entropy": 0.07009013332426547, "epoch": 10.173679916074134, "grad_norm": 0.06201171875, "learning_rate": 4.570495462447281e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 146893649.0, "step": 43645 }, { "entropy": 0.06315956339240074, "epoch": 10.174845553094766, "grad_norm": 0.0498046875, "learning_rate": 4.570380301609568e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 146904111.0, "step": 43650 }, { "entropy": 0.056829939410090444, "epoch": 10.176011190115398, "grad_norm": 0.0247802734375, "learning_rate": 4.570265128538751e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 146917594.0, "step": 43655 }, { "entropy": 0.05483479988761246, "epoch": 10.17717682713603, "grad_norm": 0.01177978515625, "learning_rate": 4.5701499432365474e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 146936107.0, "step": 43660 }, { "entropy": 0.04961704201996327, "epoch": 10.178342464156662, "grad_norm": 0.0194091796875, "learning_rate": 4.570034745704674e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 146954632.0, "step": 43665 }, { "entropy": 0.047052628640085456, "epoch": 10.179508101177293, "grad_norm": 0.019287109375, "learning_rate": 4.56991953594485e-05, "loss": 0.0008, "mean_token_accuracy": 0.999933409690857, "num_tokens": 146974621.0, "step": 43670 }, { "entropy": 0.06985645415261388, "epoch": 10.180673738197925, "grad_norm": 0.0380859375, "learning_rate": 4.569804313958792e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 146998567.0, "step": 43675 }, { "entropy": 0.05309359449893236, "epoch": 10.181839375218557, "grad_norm": 0.0380859375, "learning_rate": 4.569689079748219e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 147017026.0, "step": 43680 }, { "entropy": 0.058295493759214875, "epoch": 10.183005012239189, "grad_norm": 0.0201416015625, "learning_rate": 4.569573833314849e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 147034319.0, "step": 43685 }, { "entropy": 0.05003042472526431, "epoch": 10.184170649259821, "grad_norm": 0.0263671875, "learning_rate": 4.5694585746604e-05, "loss": 0.0029, "mean_token_accuracy": 0.9996855318546295, "num_tokens": 147052788.0, "step": 43690 }, { "entropy": 0.04796261852607131, "epoch": 10.185336286280453, "grad_norm": 0.06396484375, "learning_rate": 4.5693433037865905e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 147076027.0, "step": 43695 }, { "entropy": 0.04751263782382011, "epoch": 10.186501923301083, "grad_norm": 0.036376953125, "learning_rate": 4.569228020695141e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 147099569.0, "step": 43700 }, { "entropy": 0.050695561431348325, "epoch": 10.187667560321715, "grad_norm": 0.019775390625, "learning_rate": 4.569112725387769e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 147117475.0, "step": 43705 }, { "entropy": 0.04532196051441133, "epoch": 10.188833197342348, "grad_norm": 0.06640625, "learning_rate": 4.568997417866195e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 147147684.0, "step": 43710 }, { "entropy": 0.052403824776411055, "epoch": 10.18999883436298, "grad_norm": 0.038330078125, "learning_rate": 4.5688820981321364e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 147172304.0, "step": 43715 }, { "entropy": 0.054084846563637254, "epoch": 10.191164471383612, "grad_norm": 0.0125732421875, "learning_rate": 4.5687667661873154e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 147191474.0, "step": 43720 }, { "entropy": 0.06174150565639138, "epoch": 10.192330108404242, "grad_norm": 0.01422119140625, "learning_rate": 4.5686514220334495e-05, "loss": 0.0004, "mean_token_accuracy": 0.999846625328064, "num_tokens": 147206627.0, "step": 43725 }, { "entropy": 0.07426982149481773, "epoch": 10.193495745424874, "grad_norm": 0.033935546875, "learning_rate": 4.56853606567226e-05, "loss": 0.0025, "mean_token_accuracy": 0.9997890293598175, "num_tokens": 147216221.0, "step": 43730 }, { "entropy": 0.04849873506464064, "epoch": 10.194661382445506, "grad_norm": 0.0361328125, "learning_rate": 4.5684206971054666e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 147243363.0, "step": 43735 }, { "entropy": 0.05038243047893047, "epoch": 10.195827019466138, "grad_norm": 0.01953125, "learning_rate": 4.568305316334789e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 147265800.0, "step": 43740 }, { "entropy": 0.06462544896639884, "epoch": 10.19699265648677, "grad_norm": 0.0169677734375, "learning_rate": 4.568189923361949e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999897480010986, "num_tokens": 147294528.0, "step": 43745 }, { "entropy": 0.04675520276650787, "epoch": 10.198158293507403, "grad_norm": 0.0262451171875, "learning_rate": 4.5680745181886665e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 147316504.0, "step": 43750 }, { "entropy": 0.06306911818683147, "epoch": 10.199323930528033, "grad_norm": 0.38671875, "learning_rate": 4.567959100816663e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 147338573.0, "step": 43755 }, { "entropy": 0.05053731370717287, "epoch": 10.200489567548665, "grad_norm": 0.0257568359375, "learning_rate": 4.567843671247658e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 147349623.0, "step": 43760 }, { "entropy": 0.04980887686833739, "epoch": 10.201655204569297, "grad_norm": 0.078125, "learning_rate": 4.567728229483374e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 147370342.0, "step": 43765 }, { "entropy": 0.06197886522859335, "epoch": 10.20282084158993, "grad_norm": 0.0498046875, "learning_rate": 4.567612775525533e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 147386351.0, "step": 43770 }, { "entropy": 0.049138284660875796, "epoch": 10.203986478610561, "grad_norm": 0.041748046875, "learning_rate": 4.567497309375854e-05, "loss": 0.0003, "mean_token_accuracy": 0.9997716903686523, "num_tokens": 147403614.0, "step": 43775 }, { "entropy": 0.0651495123282075, "epoch": 10.205152115631192, "grad_norm": 0.01806640625, "learning_rate": 4.567381831036062e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999730885028839, "num_tokens": 147428634.0, "step": 43780 }, { "entropy": 0.04915805528871715, "epoch": 10.206317752651824, "grad_norm": 0.0263671875, "learning_rate": 4.5672663405078775e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 147448143.0, "step": 43785 }, { "entropy": 0.07239555716514587, "epoch": 10.207483389672456, "grad_norm": 0.0166015625, "learning_rate": 4.5671508377930224e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 147467194.0, "step": 43790 }, { "entropy": 0.053422879241406915, "epoch": 10.208649026693088, "grad_norm": 0.029541015625, "learning_rate": 4.567035322893219e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 147486871.0, "step": 43795 }, { "entropy": 0.049904408678412435, "epoch": 10.20981466371372, "grad_norm": 0.0361328125, "learning_rate": 4.56691979581019e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 147509184.0, "step": 43800 }, { "entropy": 0.05743863061070442, "epoch": 10.21098030073435, "grad_norm": 0.022705078125, "learning_rate": 4.566804256545659e-05, "loss": 0.0006, "mean_token_accuracy": 0.9995495498180389, "num_tokens": 147523305.0, "step": 43805 }, { "entropy": 0.047840119106695056, "epoch": 10.212145937754983, "grad_norm": 0.026123046875, "learning_rate": 4.5666887051013466e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 147554093.0, "step": 43810 }, { "entropy": 0.05289085814729333, "epoch": 10.213311574775615, "grad_norm": 0.045166015625, "learning_rate": 4.566573141478978e-05, "loss": 0.001, "mean_token_accuracy": 0.9997448980808258, "num_tokens": 147583164.0, "step": 43815 }, { "entropy": 0.046384062990546225, "epoch": 10.214477211796247, "grad_norm": 0.0267333984375, "learning_rate": 4.566457565680275e-05, "loss": 0.0014, "mean_token_accuracy": 0.9995967745780945, "num_tokens": 147607328.0, "step": 43820 }, { "entropy": 0.05956487860530615, "epoch": 10.215642848816879, "grad_norm": 0.0225830078125, "learning_rate": 4.566341977706963e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 147632027.0, "step": 43825 }, { "entropy": 0.057423918321728704, "epoch": 10.216808485837511, "grad_norm": 0.062255859375, "learning_rate": 4.5662263775607636e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 147646068.0, "step": 43830 }, { "entropy": 0.0800579123198986, "epoch": 10.217974122858141, "grad_norm": 0.03125, "learning_rate": 4.566110765243401e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 147662620.0, "step": 43835 }, { "entropy": 0.0751149789430201, "epoch": 10.219139759878773, "grad_norm": 0.0263671875, "learning_rate": 4.5659951407565995e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 147676792.0, "step": 43840 }, { "entropy": 0.057619784399867056, "epoch": 10.220305396899406, "grad_norm": 0.08935546875, "learning_rate": 4.5658795041020834e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999888896942138, "num_tokens": 147701947.0, "step": 43845 }, { "entropy": 0.07080321535468101, "epoch": 10.221471033920038, "grad_norm": 0.037109375, "learning_rate": 4.565763855281576e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 147711260.0, "step": 43850 }, { "entropy": 0.042559560388326645, "epoch": 10.22263667094067, "grad_norm": 0.0191650390625, "learning_rate": 4.5656481942968035e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999866664409638, "num_tokens": 147741647.0, "step": 43855 }, { "entropy": 0.05230944976210594, "epoch": 10.2238023079613, "grad_norm": 0.0263671875, "learning_rate": 4.5655325211494896e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999886631965638, "num_tokens": 147771976.0, "step": 43860 }, { "entropy": 0.07495128940790892, "epoch": 10.224967944981932, "grad_norm": 0.0390625, "learning_rate": 4.565416835841358e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 147786026.0, "step": 43865 }, { "entropy": 0.060472107119858266, "epoch": 10.226133582002564, "grad_norm": 0.0242919921875, "learning_rate": 4.565301138374136e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999890267848969, "num_tokens": 147806748.0, "step": 43870 }, { "entropy": 0.04094974570907652, "epoch": 10.227299219023196, "grad_norm": 0.036376953125, "learning_rate": 4.5651854287495474e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 147829803.0, "step": 43875 }, { "entropy": 0.06098997332155705, "epoch": 10.228464856043828, "grad_norm": 0.0224609375, "learning_rate": 4.5650697069693185e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 147841209.0, "step": 43880 }, { "entropy": 0.062342527136206624, "epoch": 10.22963049306446, "grad_norm": 0.028564453125, "learning_rate": 4.564953973035174e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 147857426.0, "step": 43885 }, { "entropy": 0.054973821109160784, "epoch": 10.230796130085091, "grad_norm": 0.54296875, "learning_rate": 4.5648382269488396e-05, "loss": 0.0011, "mean_token_accuracy": 1.0, "num_tokens": 147876279.0, "step": 43890 }, { "entropy": 0.05381541419774294, "epoch": 10.231961767105723, "grad_norm": 0.01513671875, "learning_rate": 4.5647224687120416e-05, "loss": 0.0006, "mean_token_accuracy": 0.999957150220871, "num_tokens": 147904289.0, "step": 43895 }, { "entropy": 0.042642751894891265, "epoch": 10.233127404126355, "grad_norm": 0.0225830078125, "learning_rate": 4.564606698326506e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 147930693.0, "step": 43900 }, { "entropy": 0.06699599586427211, "epoch": 10.234293041146987, "grad_norm": 0.040771484375, "learning_rate": 4.5644909157939605e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 147939675.0, "step": 43905 }, { "entropy": 0.061661123856902124, "epoch": 10.23545867816762, "grad_norm": 0.09033203125, "learning_rate": 4.564375121116129e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 147954674.0, "step": 43910 }, { "entropy": 0.047797265090048315, "epoch": 10.23662431518825, "grad_norm": 0.03564453125, "learning_rate": 4.5642593142947404e-05, "loss": 0.0005, "mean_token_accuracy": 0.9997142851352692, "num_tokens": 147991518.0, "step": 43915 }, { "entropy": 0.04810534734278917, "epoch": 10.237789952208882, "grad_norm": 0.04443359375, "learning_rate": 4.5641434953315205e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999752402305603, "num_tokens": 148017831.0, "step": 43920 }, { "entropy": 0.06693259365856648, "epoch": 10.238955589229514, "grad_norm": 0.7890625, "learning_rate": 4.564027664228196e-05, "loss": 0.0015, "mean_token_accuracy": 0.9996845424175262, "num_tokens": 148029953.0, "step": 43925 }, { "entropy": 0.060024634934961796, "epoch": 10.240121226250146, "grad_norm": 0.015380859375, "learning_rate": 4.563911820986495e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 148044411.0, "step": 43930 }, { "entropy": 0.08300753589719534, "epoch": 10.241286863270778, "grad_norm": 0.0299072265625, "learning_rate": 4.563795965608145e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 148054513.0, "step": 43935 }, { "entropy": 0.057464625034481284, "epoch": 10.242452500291408, "grad_norm": 0.0164794921875, "learning_rate": 4.563680098094874e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 148070212.0, "step": 43940 }, { "entropy": 0.04505125693976879, "epoch": 10.24361813731204, "grad_norm": 0.026123046875, "learning_rate": 4.5635642184484076e-05, "loss": 0.0002, "mean_token_accuracy": 0.9993670880794525, "num_tokens": 148102139.0, "step": 43945 }, { "entropy": 0.07114105895161629, "epoch": 10.244783774332673, "grad_norm": 0.039306640625, "learning_rate": 4.563448326670475e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 148113838.0, "step": 43950 }, { "entropy": 0.04451997820287943, "epoch": 10.245949411353305, "grad_norm": 0.0203857421875, "learning_rate": 4.5633324227628044e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999620139598846, "num_tokens": 148135553.0, "step": 43955 }, { "entropy": 0.05354665564373136, "epoch": 10.247115048373937, "grad_norm": 0.0419921875, "learning_rate": 4.5632165067271246e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 148163500.0, "step": 43960 }, { "entropy": 0.07594393603503705, "epoch": 10.248280685394569, "grad_norm": 0.0147705078125, "learning_rate": 4.5631005785651625e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 148175929.0, "step": 43965 }, { "entropy": 0.07074794918298721, "epoch": 10.2494463224152, "grad_norm": 0.0233154296875, "learning_rate": 4.562984638278649e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 148184960.0, "step": 43970 }, { "entropy": 0.04213942000642419, "epoch": 10.250611959435831, "grad_norm": 0.0390625, "learning_rate": 4.562868685869311e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999892354011536, "num_tokens": 148218408.0, "step": 43975 }, { "entropy": 0.05735755441710353, "epoch": 10.251777596456463, "grad_norm": 0.0135498046875, "learning_rate": 4.5627527213388786e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 148244867.0, "step": 43980 }, { "entropy": 0.07946206014603377, "epoch": 10.252943233477096, "grad_norm": 0.09716796875, "learning_rate": 4.5626367446890806e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 148266308.0, "step": 43985 }, { "entropy": 0.04815430268645286, "epoch": 10.254108870497728, "grad_norm": 0.03125, "learning_rate": 4.562520755921647e-05, "loss": 0.0058, "mean_token_accuracy": 0.9998537182807923, "num_tokens": 148288621.0, "step": 43990 }, { "entropy": 0.04941950519569218, "epoch": 10.255274507518358, "grad_norm": 0.017333984375, "learning_rate": 4.562404755038305e-05, "loss": 0.0003, "mean_token_accuracy": 0.9997955024242401, "num_tokens": 148315529.0, "step": 43995 }, { "entropy": 0.07427412495017052, "epoch": 10.25644014453899, "grad_norm": 0.0233154296875, "learning_rate": 4.562288742040788e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 148325547.0, "step": 44000 }, { "entropy": 0.04696356984786689, "epoch": 10.257605781559622, "grad_norm": 0.392578125, "learning_rate": 4.5621727169308237e-05, "loss": 0.0011, "mean_token_accuracy": 0.9997361481189728, "num_tokens": 148354682.0, "step": 44005 }, { "entropy": 0.04103820836171508, "epoch": 10.258771418580254, "grad_norm": 0.0240478515625, "learning_rate": 4.562056679710142e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 148391296.0, "step": 44010 }, { "entropy": 0.0682970798574388, "epoch": 10.259937055600886, "grad_norm": 0.2431640625, "learning_rate": 4.561940630380475e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 148403790.0, "step": 44015 }, { "entropy": 0.04868617909960449, "epoch": 10.261102692621517, "grad_norm": 0.0240478515625, "learning_rate": 4.561824568943551e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 148423715.0, "step": 44020 }, { "entropy": 0.05339447120204568, "epoch": 10.262268329642149, "grad_norm": 0.013427734375, "learning_rate": 4.561708495401102e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 148439737.0, "step": 44025 }, { "entropy": 0.059647050127387045, "epoch": 10.263433966662781, "grad_norm": 0.021728515625, "learning_rate": 4.561592409754858e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 148451360.0, "step": 44030 }, { "entropy": 0.049726602528244256, "epoch": 10.264599603683413, "grad_norm": 0.0211181640625, "learning_rate": 4.5614763120065505e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999891757965088, "num_tokens": 148474790.0, "step": 44035 }, { "entropy": 0.04956343998201192, "epoch": 10.265765240704045, "grad_norm": 0.07177734375, "learning_rate": 4.5613602021579105e-05, "loss": 0.0008, "mean_token_accuracy": 0.9995555579662323, "num_tokens": 148503518.0, "step": 44040 }, { "entropy": 0.047596500441432, "epoch": 10.266930877724677, "grad_norm": 0.020751953125, "learning_rate": 4.5612440802106706e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 148529529.0, "step": 44045 }, { "entropy": 0.07422376275062562, "epoch": 10.268096514745308, "grad_norm": 0.1083984375, "learning_rate": 4.56112794616656e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 148543502.0, "step": 44050 }, { "entropy": 0.04665753650479019, "epoch": 10.26926215176594, "grad_norm": 0.0135498046875, "learning_rate": 4.5610118000273126e-05, "loss": 0.0007, "mean_token_accuracy": 1.0, "num_tokens": 148569697.0, "step": 44055 }, { "entropy": 0.05942644737660885, "epoch": 10.270427788786572, "grad_norm": 0.189453125, "learning_rate": 4.560895641794659e-05, "loss": 0.0005, "mean_token_accuracy": 0.99957115650177, "num_tokens": 148592942.0, "step": 44060 }, { "entropy": 0.059556891489773986, "epoch": 10.271593425807204, "grad_norm": 0.07666015625, "learning_rate": 4.5607794714703325e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 148615967.0, "step": 44065 }, { "entropy": 0.056974831596016885, "epoch": 10.272759062827836, "grad_norm": 0.037109375, "learning_rate": 4.5606632890560634e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 148646794.0, "step": 44070 }, { "entropy": 0.05813450757414103, "epoch": 10.273924699848466, "grad_norm": 0.037353515625, "learning_rate": 4.560547094553586e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999894082546235, "num_tokens": 148669486.0, "step": 44075 }, { "entropy": 0.06914445832371711, "epoch": 10.275090336869098, "grad_norm": 0.0159912109375, "learning_rate": 4.5604308879646326e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 148680933.0, "step": 44080 }, { "entropy": 0.05653935582377016, "epoch": 10.27625597388973, "grad_norm": 0.03125, "learning_rate": 4.5603146692909356e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 148703638.0, "step": 44085 }, { "entropy": 0.050142921041697264, "epoch": 10.277421610910363, "grad_norm": 0.0184326171875, "learning_rate": 4.5601984385342275e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 148720770.0, "step": 44090 }, { "entropy": 0.051549593452364206, "epoch": 10.278587247930995, "grad_norm": 0.033203125, "learning_rate": 4.5600821956962424e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999508917331695, "num_tokens": 148754773.0, "step": 44095 }, { "entropy": 0.06519920136779547, "epoch": 10.279752884951627, "grad_norm": 0.0771484375, "learning_rate": 4.559965940778713e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 148772964.0, "step": 44100 }, { "entropy": 0.059289961494505404, "epoch": 10.280918521972257, "grad_norm": 0.1865234375, "learning_rate": 4.559849673783374e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 148787873.0, "step": 44105 }, { "entropy": 0.06752317813225091, "epoch": 10.28208415899289, "grad_norm": 0.0228271484375, "learning_rate": 4.559733394711958e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 148802377.0, "step": 44110 }, { "entropy": 0.04858728079125285, "epoch": 10.283249796013521, "grad_norm": 0.427734375, "learning_rate": 4.559617103566199e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999848067760467, "num_tokens": 148836126.0, "step": 44115 }, { "entropy": 0.04201396256685257, "epoch": 10.284415433034154, "grad_norm": 0.134765625, "learning_rate": 4.559500800347831e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 148855915.0, "step": 44120 }, { "entropy": 0.08059627935290337, "epoch": 10.285581070054786, "grad_norm": 0.0234375, "learning_rate": 4.559384485058589e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 148874005.0, "step": 44125 }, { "entropy": 0.044201585277915, "epoch": 10.286746707075416, "grad_norm": 0.013427734375, "learning_rate": 4.559268157700206e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 148895357.0, "step": 44130 }, { "entropy": 0.053767064865678546, "epoch": 10.287912344096048, "grad_norm": 0.00860595703125, "learning_rate": 4.559151818274419e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 148908927.0, "step": 44135 }, { "entropy": 0.0679398087784648, "epoch": 10.28907798111668, "grad_norm": 0.0218505859375, "learning_rate": 4.5590354667829606e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 148919535.0, "step": 44140 }, { "entropy": 0.05648909267038107, "epoch": 10.290243618137312, "grad_norm": 0.068359375, "learning_rate": 4.5589191032275665e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 148941648.0, "step": 44145 }, { "entropy": 0.05573352323845029, "epoch": 10.291409255157944, "grad_norm": 0.01904296875, "learning_rate": 4.558802727609972e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 148969075.0, "step": 44150 }, { "entropy": 0.0648234311491251, "epoch": 10.292574892178575, "grad_norm": 0.05517578125, "learning_rate": 4.558686339931912e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 148987435.0, "step": 44155 }, { "entropy": 0.05349012557417154, "epoch": 10.293740529199207, "grad_norm": 0.0361328125, "learning_rate": 4.558569940195122e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 148999688.0, "step": 44160 }, { "entropy": 0.059884701296687125, "epoch": 10.294906166219839, "grad_norm": 0.33984375, "learning_rate": 4.558453528401339e-05, "loss": 0.0023, "mean_token_accuracy": 0.9997921824455261, "num_tokens": 149022227.0, "step": 44165 }, { "entropy": 0.06587412673979998, "epoch": 10.296071803240471, "grad_norm": 0.01300048828125, "learning_rate": 4.5583371045522975e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 149035404.0, "step": 44170 }, { "entropy": 0.06613674918189645, "epoch": 10.297237440261103, "grad_norm": 0.1201171875, "learning_rate": 4.558220668649734e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 149063640.0, "step": 44175 }, { "entropy": 0.07241187756881118, "epoch": 10.298403077281735, "grad_norm": 0.12353515625, "learning_rate": 4.558104220695385e-05, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 149088361.0, "step": 44180 }, { "entropy": 0.05228366181254387, "epoch": 10.299568714302366, "grad_norm": 0.0927734375, "learning_rate": 4.557987760690986e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 149111431.0, "step": 44185 }, { "entropy": 0.05424889232963324, "epoch": 10.300734351322998, "grad_norm": 0.1416015625, "learning_rate": 4.557871288638275e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999862551689148, "num_tokens": 149128277.0, "step": 44190 }, { "entropy": 0.0512849391438067, "epoch": 10.30189998834363, "grad_norm": 0.10498046875, "learning_rate": 4.557754804538987e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999889254570007, "num_tokens": 149150972.0, "step": 44195 }, { "entropy": 0.06689666323363781, "epoch": 10.303065625364262, "grad_norm": 0.08447265625, "learning_rate": 4.55763830839486e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 149170074.0, "step": 44200 }, { "entropy": 0.060508682392537594, "epoch": 10.304231262384894, "grad_norm": 0.2451171875, "learning_rate": 4.557521800207632e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999882757663727, "num_tokens": 149198425.0, "step": 44205 }, { "entropy": 0.07404768951237202, "epoch": 10.305396899405524, "grad_norm": 0.0322265625, "learning_rate": 4.557405279979039e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 149206503.0, "step": 44210 }, { "entropy": 0.05456382445991039, "epoch": 10.306562536426156, "grad_norm": 0.0213623046875, "learning_rate": 4.5572887477108184e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 149226799.0, "step": 44215 }, { "entropy": 0.0596602609846741, "epoch": 10.307728173446788, "grad_norm": 0.0272216796875, "learning_rate": 4.557172203404709e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999854505062103, "num_tokens": 149247136.0, "step": 44220 }, { "entropy": 0.07430512486025691, "epoch": 10.30889381046742, "grad_norm": 0.040771484375, "learning_rate": 4.5570556470624475e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 149259111.0, "step": 44225 }, { "entropy": 0.048424013145267966, "epoch": 10.310059447488053, "grad_norm": 0.02490234375, "learning_rate": 4.5569390786857725e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 149285200.0, "step": 44230 }, { "entropy": 0.0636987192556262, "epoch": 10.311225084508685, "grad_norm": 0.027099609375, "learning_rate": 4.5568224982764225e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999885141849518, "num_tokens": 149312337.0, "step": 44235 }, { "entropy": 0.03847971297800541, "epoch": 10.312390721529315, "grad_norm": 0.0196533203125, "learning_rate": 4.556705905836135e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 149333963.0, "step": 44240 }, { "entropy": 0.05180752845481038, "epoch": 10.313556358549947, "grad_norm": 0.064453125, "learning_rate": 4.5565893013666495e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 149360883.0, "step": 44245 }, { "entropy": 0.054119398910552265, "epoch": 10.31472199557058, "grad_norm": 0.02392578125, "learning_rate": 4.556472684869704e-05, "loss": 0.0008, "mean_token_accuracy": 1.0, "num_tokens": 149387376.0, "step": 44250 }, { "entropy": 0.07822755612432956, "epoch": 10.315887632591211, "grad_norm": 0.036865234375, "learning_rate": 4.556356056347038e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 149395371.0, "step": 44255 }, { "entropy": 0.06847381629049779, "epoch": 10.317053269611844, "grad_norm": 0.23046875, "learning_rate": 4.5562394158003906e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 149407104.0, "step": 44260 }, { "entropy": 0.06189041472971439, "epoch": 10.318218906632474, "grad_norm": 0.0125732421875, "learning_rate": 4.5561227632315e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 149418449.0, "step": 44265 }, { "entropy": 0.05848919115960598, "epoch": 10.319384543653106, "grad_norm": 0.046142578125, "learning_rate": 4.556006098642107e-05, "loss": 0.0006, "mean_token_accuracy": 0.9998461544513703, "num_tokens": 149432000.0, "step": 44270 }, { "entropy": 0.047734024748206136, "epoch": 10.320550180673738, "grad_norm": 0.03173828125, "learning_rate": 4.555889422033951e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 149477400.0, "step": 44275 }, { "entropy": 0.0656460294034332, "epoch": 10.32171581769437, "grad_norm": 0.05810546875, "learning_rate": 4.555772733408772e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 149499213.0, "step": 44280 }, { "entropy": 0.07494772598147392, "epoch": 10.322881454715002, "grad_norm": 0.015625, "learning_rate": 4.5556560327683085e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 149511555.0, "step": 44285 }, { "entropy": 0.04982608687132597, "epoch": 10.324047091735633, "grad_norm": 0.0208740234375, "learning_rate": 4.5555393201143025e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 149527136.0, "step": 44290 }, { "entropy": 0.05682174563407898, "epoch": 10.325212728756265, "grad_norm": 0.0498046875, "learning_rate": 4.555422595448494e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 149540226.0, "step": 44295 }, { "entropy": 0.05086037190631032, "epoch": 10.326378365776897, "grad_norm": 0.01019287109375, "learning_rate": 4.555305858772622e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 149553341.0, "step": 44300 }, { "entropy": 0.06853776425123215, "epoch": 10.327544002797529, "grad_norm": 0.021240234375, "learning_rate": 4.555189110088429e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999893784523011, "num_tokens": 149570177.0, "step": 44305 }, { "entropy": 0.07153114788234234, "epoch": 10.328709639818161, "grad_norm": 0.03271484375, "learning_rate": 4.555072349397656e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 149584394.0, "step": 44310 }, { "entropy": 0.0588343221694231, "epoch": 10.329875276838793, "grad_norm": 0.0311279296875, "learning_rate": 4.5549555767020425e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 149599151.0, "step": 44315 }, { "entropy": 0.06345712374895811, "epoch": 10.331040913859423, "grad_norm": 0.0303955078125, "learning_rate": 4.5548387920033306e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 149619838.0, "step": 44320 }, { "entropy": 0.06382895335555076, "epoch": 10.332206550880056, "grad_norm": 0.0301513671875, "learning_rate": 4.554721995303262e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999876201152802, "num_tokens": 149648081.0, "step": 44325 }, { "entropy": 0.06957971584051847, "epoch": 10.333372187900688, "grad_norm": 0.0712890625, "learning_rate": 4.554605186603578e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 149673066.0, "step": 44330 }, { "entropy": 0.04718512548133731, "epoch": 10.33453782492132, "grad_norm": 0.08740234375, "learning_rate": 4.554488365906021e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 149702132.0, "step": 44335 }, { "entropy": 0.05456408876925707, "epoch": 10.335703461941952, "grad_norm": 0.0137939453125, "learning_rate": 4.5543715332123316e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 149720216.0, "step": 44340 }, { "entropy": 0.05885306540876627, "epoch": 10.336869098962582, "grad_norm": 0.1591796875, "learning_rate": 4.5542546885242535e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 149740075.0, "step": 44345 }, { "entropy": 0.044297129614278674, "epoch": 10.338034735983214, "grad_norm": 0.01348876953125, "learning_rate": 4.554137831843528e-05, "loss": 0.0001, "mean_token_accuracy": 0.999986058473587, "num_tokens": 149764370.0, "step": 44350 }, { "entropy": 0.05702058784663677, "epoch": 10.339200373003846, "grad_norm": 0.025146484375, "learning_rate": 4.554020963171898e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 149776968.0, "step": 44355 }, { "entropy": 0.057385982014238834, "epoch": 10.340366010024479, "grad_norm": 0.0274658203125, "learning_rate": 4.553904082511106e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999786972999573, "num_tokens": 149814014.0, "step": 44360 }, { "entropy": 0.047365696541965006, "epoch": 10.34153164704511, "grad_norm": 0.017578125, "learning_rate": 4.553787189862895e-05, "loss": 0.0001, "mean_token_accuracy": 0.9997907936573028, "num_tokens": 149839742.0, "step": 44365 }, { "entropy": 0.05646844375878572, "epoch": 10.342697284065743, "grad_norm": 0.02587890625, "learning_rate": 4.553670285229008e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 149851155.0, "step": 44370 }, { "entropy": 0.058501888811588285, "epoch": 10.343862921086373, "grad_norm": 0.07421875, "learning_rate": 4.553553368611188e-05, "loss": 0.001, "mean_token_accuracy": 0.9993243217468262, "num_tokens": 149865248.0, "step": 44375 }, { "entropy": 0.06298747211694718, "epoch": 10.345028558107005, "grad_norm": 0.0211181640625, "learning_rate": 4.55343644001118e-05, "loss": 0.0004, "mean_token_accuracy": 0.9994152069091797, "num_tokens": 149878095.0, "step": 44380 }, { "entropy": 0.06646413435228168, "epoch": 10.346194195127637, "grad_norm": 0.1279296875, "learning_rate": 4.553319499430725e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999783575534821, "num_tokens": 149906489.0, "step": 44385 }, { "entropy": 0.05169821633026004, "epoch": 10.34735983214827, "grad_norm": 0.0654296875, "learning_rate": 4.5532025468715675e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999868750572205, "num_tokens": 149934017.0, "step": 44390 }, { "entropy": 0.06346538104116917, "epoch": 10.348525469168901, "grad_norm": 0.0311279296875, "learning_rate": 4.5530855823354526e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 149944762.0, "step": 44395 }, { "entropy": 0.049836630932986736, "epoch": 10.349691106189532, "grad_norm": 0.0260009765625, "learning_rate": 4.552968605824125e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999893188476563, "num_tokens": 149970492.0, "step": 44400 }, { "entropy": 0.06093708393163979, "epoch": 10.350856743210164, "grad_norm": 0.03271484375, "learning_rate": 4.552851617339326e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 149989599.0, "step": 44405 }, { "entropy": 0.06286875121295452, "epoch": 10.352022380230796, "grad_norm": 0.018310546875, "learning_rate": 4.5527346168828025e-05, "loss": 0.0003, "mean_token_accuracy": 0.9997101426124573, "num_tokens": 150000033.0, "step": 44410 }, { "entropy": 0.05082651572301984, "epoch": 10.353188017251428, "grad_norm": 0.0179443359375, "learning_rate": 4.5526176044562985e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 150028772.0, "step": 44415 }, { "entropy": 0.14444229658693075, "epoch": 10.35435365427206, "grad_norm": 5.0625, "learning_rate": 4.5525005800615585e-05, "loss": 0.1725, "mean_token_accuracy": 0.9819698393344879, "num_tokens": 150070583.0, "step": 44420 }, { "entropy": 0.08742633331567048, "epoch": 10.35551929129269, "grad_norm": 0.04150390625, "learning_rate": 4.552383543700328e-05, "loss": 0.0012, "mean_token_accuracy": 0.9999787509441376, "num_tokens": 150091656.0, "step": 44425 }, { "entropy": 0.05972275361418724, "epoch": 10.356684928313323, "grad_norm": 0.072265625, "learning_rate": 4.552266495374352e-05, "loss": 0.0014, "mean_token_accuracy": 0.9994139850139618, "num_tokens": 150108536.0, "step": 44430 }, { "entropy": 0.07121363878250123, "epoch": 10.357850565333955, "grad_norm": 0.0303955078125, "learning_rate": 4.5521494350853764e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 150117275.0, "step": 44435 }, { "entropy": 0.049450202379375696, "epoch": 10.359016202354587, "grad_norm": 0.057861328125, "learning_rate": 4.552032362835146e-05, "loss": 0.0043, "mean_token_accuracy": 0.9999368906021118, "num_tokens": 150151732.0, "step": 44440 }, { "entropy": 0.06740026157349348, "epoch": 10.360181839375219, "grad_norm": 0.017578125, "learning_rate": 4.551915278625406e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 150161764.0, "step": 44445 }, { "entropy": 0.07839980935677886, "epoch": 10.361347476395851, "grad_norm": 0.01544189453125, "learning_rate": 4.5517981824579034e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 150177487.0, "step": 44450 }, { "entropy": 0.04942024620249867, "epoch": 10.362513113416481, "grad_norm": 0.034423828125, "learning_rate": 4.551681074334384e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 150201662.0, "step": 44455 }, { "entropy": 0.05563904563896358, "epoch": 10.363678750437114, "grad_norm": 0.078125, "learning_rate": 4.5515639542565946e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999758481979371, "num_tokens": 150224405.0, "step": 44460 }, { "entropy": 0.05794704202562571, "epoch": 10.364844387457746, "grad_norm": 0.044189453125, "learning_rate": 4.55144682222628e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 150244252.0, "step": 44465 }, { "entropy": 0.0887676641345024, "epoch": 10.366010024478378, "grad_norm": 0.1484375, "learning_rate": 4.551329678245189e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 150255084.0, "step": 44470 }, { "entropy": 0.06721811229363084, "epoch": 10.36717566149901, "grad_norm": 0.0284423828125, "learning_rate": 4.5512125223150656e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 150271849.0, "step": 44475 }, { "entropy": 0.07131273578852415, "epoch": 10.36834129851964, "grad_norm": 0.08935546875, "learning_rate": 4.55109535443766e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 150297391.0, "step": 44480 }, { "entropy": 0.061979838041588665, "epoch": 10.369506935540272, "grad_norm": 0.02783203125, "learning_rate": 4.550978174614717e-05, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 150315599.0, "step": 44485 }, { "entropy": 0.06966787008568645, "epoch": 10.370672572560904, "grad_norm": 0.023681640625, "learning_rate": 4.550860982847985e-05, "loss": 0.0008, "mean_token_accuracy": 0.9997916400432587, "num_tokens": 150344717.0, "step": 44490 }, { "entropy": 0.054807602986693384, "epoch": 10.371838209581536, "grad_norm": 0.123046875, "learning_rate": 4.550743779139211e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 150362497.0, "step": 44495 }, { "entropy": 0.06291921064257622, "epoch": 10.373003846602169, "grad_norm": 0.01434326171875, "learning_rate": 4.550626563490143e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 150378889.0, "step": 44500 }, { "entropy": 0.05275652538985014, "epoch": 10.3741694836228, "grad_norm": 0.0595703125, "learning_rate": 4.550509335902529e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 150401787.0, "step": 44505 }, { "entropy": 0.02468291549012065, "epoch": 10.375335120643431, "grad_norm": 0.01904296875, "learning_rate": 4.5503920963781156e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999732375144958, "num_tokens": 150443772.0, "step": 44510 }, { "entropy": 0.06841404465958476, "epoch": 10.376500757664063, "grad_norm": 0.0458984375, "learning_rate": 4.550274844918653e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 150468813.0, "step": 44515 }, { "entropy": 0.05546982828527689, "epoch": 10.377666394684695, "grad_norm": 0.0419921875, "learning_rate": 4.550157581525889e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 150489590.0, "step": 44520 }, { "entropy": 0.05243567517027259, "epoch": 10.378832031705327, "grad_norm": 0.0966796875, "learning_rate": 4.550040306201571e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999762177467346, "num_tokens": 150509588.0, "step": 44525 }, { "entropy": 0.07348482059314847, "epoch": 10.37999766872596, "grad_norm": 0.05810546875, "learning_rate": 4.5499230189474496e-05, "loss": 0.0004, "mean_token_accuracy": 0.9998459160327912, "num_tokens": 150522711.0, "step": 44530 }, { "entropy": 0.05543697997927666, "epoch": 10.38116330574659, "grad_norm": 0.0260009765625, "learning_rate": 4.5498057197652726e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 150537080.0, "step": 44535 }, { "entropy": 0.052227580547332765, "epoch": 10.382328942767222, "grad_norm": 0.0211181640625, "learning_rate": 4.549688408656789e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999841928482056, "num_tokens": 150556156.0, "step": 44540 }, { "entropy": 0.051933661289513114, "epoch": 10.383494579787854, "grad_norm": 0.0279541015625, "learning_rate": 4.549571085623749e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 150573522.0, "step": 44545 }, { "entropy": 0.05841661635786295, "epoch": 10.384660216808486, "grad_norm": 0.07568359375, "learning_rate": 4.549453750667901e-05, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 150602640.0, "step": 44550 }, { "entropy": 0.08519041016697884, "epoch": 10.385825853829118, "grad_norm": 0.035888671875, "learning_rate": 4.549336403790996e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 150612023.0, "step": 44555 }, { "entropy": 0.06693806014955044, "epoch": 10.386991490849748, "grad_norm": 0.01470947265625, "learning_rate": 4.549219044994782e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 150633112.0, "step": 44560 }, { "entropy": 0.06107230139896273, "epoch": 10.38815712787038, "grad_norm": 0.025634765625, "learning_rate": 4.54910167428101e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 150661439.0, "step": 44565 }, { "entropy": 0.0495242724660784, "epoch": 10.389322764891013, "grad_norm": 0.0181884765625, "learning_rate": 4.548984291651431e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999850571155549, "num_tokens": 150687714.0, "step": 44570 }, { "entropy": 0.05606261203065514, "epoch": 10.390488401911645, "grad_norm": 0.279296875, "learning_rate": 4.548866897107794e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999887406826019, "num_tokens": 150710467.0, "step": 44575 }, { "entropy": 0.05214941115118563, "epoch": 10.391654038932277, "grad_norm": 0.021728515625, "learning_rate": 4.54874949065185e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 150732070.0, "step": 44580 }, { "entropy": 0.0623500001616776, "epoch": 10.392819675952909, "grad_norm": 0.23046875, "learning_rate": 4.548632072285349e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 150746236.0, "step": 44585 }, { "entropy": 0.052656793221831324, "epoch": 10.39398531297354, "grad_norm": 0.1015625, "learning_rate": 4.548514642010043e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 150769149.0, "step": 44590 }, { "entropy": 0.05154966562986374, "epoch": 10.395150949994171, "grad_norm": 0.01055908203125, "learning_rate": 4.5483971998276834e-05, "loss": 0.0004, "mean_token_accuracy": 0.999904578924179, "num_tokens": 150794023.0, "step": 44595 }, { "entropy": 0.058824519719928504, "epoch": 10.396316587014804, "grad_norm": 0.0233154296875, "learning_rate": 4.54827974574002e-05, "loss": 0.0005, "mean_token_accuracy": 0.999965363740921, "num_tokens": 150821718.0, "step": 44600 }, { "entropy": 0.05286461748182773, "epoch": 10.397482224035436, "grad_norm": 0.062255859375, "learning_rate": 4.548162279748805e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 150832539.0, "step": 44605 }, { "entropy": 0.07234964519739151, "epoch": 10.398647861056068, "grad_norm": 0.07373046875, "learning_rate": 4.54804480185579e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 150843175.0, "step": 44610 }, { "entropy": 0.05047530680894852, "epoch": 10.399813498076698, "grad_norm": 0.02734375, "learning_rate": 4.5479273120627266e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 150862858.0, "step": 44615 }, { "entropy": 0.051994080375880006, "epoch": 10.40097913509733, "grad_norm": 0.036865234375, "learning_rate": 4.547809810371367e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 150879504.0, "step": 44620 }, { "entropy": 0.051050873938947916, "epoch": 10.402144772117962, "grad_norm": 0.2470703125, "learning_rate": 4.547692296783463e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999808728694916, "num_tokens": 150902392.0, "step": 44625 }, { "entropy": 0.07185981124639511, "epoch": 10.403310409138594, "grad_norm": 0.026123046875, "learning_rate": 4.5475747713007674e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 150913796.0, "step": 44630 }, { "entropy": 0.08481485210359097, "epoch": 10.404476046159226, "grad_norm": 0.08740234375, "learning_rate": 4.547457233925031e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 150943637.0, "step": 44635 }, { "entropy": 0.06651202514767647, "epoch": 10.405641683179859, "grad_norm": 0.0223388671875, "learning_rate": 4.5473396846580096e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 150954871.0, "step": 44640 }, { "entropy": 0.04281379147432744, "epoch": 10.406807320200489, "grad_norm": 0.0303955078125, "learning_rate": 4.5472221235014535e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999888718128205, "num_tokens": 150988430.0, "step": 44645 }, { "entropy": 0.06591433379799128, "epoch": 10.407972957221121, "grad_norm": 0.024658203125, "learning_rate": 4.5471045504571166e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999487698078156, "num_tokens": 151015790.0, "step": 44650 }, { "entropy": 0.05050462838262319, "epoch": 10.409138594241753, "grad_norm": 0.01953125, "learning_rate": 4.546986965526751e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 151047021.0, "step": 44655 }, { "entropy": 0.08088272651657462, "epoch": 10.410304231262385, "grad_norm": 0.02880859375, "learning_rate": 4.5468693687121124e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 151066724.0, "step": 44660 }, { "entropy": 0.06316407779231667, "epoch": 10.411469868283017, "grad_norm": 0.03271484375, "learning_rate": 4.546751760014952e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 151090250.0, "step": 44665 }, { "entropy": 0.07306670425459742, "epoch": 10.412635505303648, "grad_norm": 0.0257568359375, "learning_rate": 4.546634139437025e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 151106420.0, "step": 44670 }, { "entropy": 0.06450021881610155, "epoch": 10.41380114232428, "grad_norm": 0.032470703125, "learning_rate": 4.546516506980084e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 151119827.0, "step": 44675 }, { "entropy": 0.05164213851094246, "epoch": 10.414966779344912, "grad_norm": 0.059326171875, "learning_rate": 4.546398862645885e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 151135806.0, "step": 44680 }, { "entropy": 0.06281434707343578, "epoch": 10.416132416365544, "grad_norm": 0.01251220703125, "learning_rate": 4.5462812064361806e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 151145452.0, "step": 44685 }, { "entropy": 0.07266581580042838, "epoch": 10.417298053386176, "grad_norm": 0.11865234375, "learning_rate": 4.546163538352725e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 151158311.0, "step": 44690 }, { "entropy": 0.0545011417940259, "epoch": 10.418463690406806, "grad_norm": 0.0302734375, "learning_rate": 4.5460458583972745e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 151175781.0, "step": 44695 }, { "entropy": 0.06281043328344822, "epoch": 10.419629327427439, "grad_norm": 0.041015625, "learning_rate": 4.545928166571582e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 151187268.0, "step": 44700 }, { "entropy": 0.045202738512307404, "epoch": 10.42079496444807, "grad_norm": 0.03759765625, "learning_rate": 4.5458104628774046e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 151215418.0, "step": 44705 }, { "entropy": 0.06615263698622584, "epoch": 10.421960601468703, "grad_norm": 0.05810546875, "learning_rate": 4.545692747316495e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 151231048.0, "step": 44710 }, { "entropy": 0.03947287751361728, "epoch": 10.423126238489335, "grad_norm": 0.051025390625, "learning_rate": 4.5455750198906114e-05, "loss": 0.0003, "mean_token_accuracy": 0.999989128112793, "num_tokens": 151274419.0, "step": 44715 }, { "entropy": 0.054187464900314807, "epoch": 10.424291875509967, "grad_norm": 0.3125, "learning_rate": 4.545457280601506e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999715149402618, "num_tokens": 151293841.0, "step": 44720 }, { "entropy": 0.064159763045609, "epoch": 10.425457512530597, "grad_norm": 0.06494140625, "learning_rate": 4.545339529450937e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 151319691.0, "step": 44725 }, { "entropy": 0.060284094978123905, "epoch": 10.42662314955123, "grad_norm": 0.56640625, "learning_rate": 4.545221766440659e-05, "loss": 0.0072, "mean_token_accuracy": 0.9996629536151886, "num_tokens": 151353006.0, "step": 44730 }, { "entropy": 0.06142573980614543, "epoch": 10.427788786571861, "grad_norm": 0.02587890625, "learning_rate": 4.545103991572428e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 151372233.0, "step": 44735 }, { "entropy": 0.05108613050542772, "epoch": 10.428954423592494, "grad_norm": 0.1513671875, "learning_rate": 4.5449862048480015e-05, "loss": 0.0064, "mean_token_accuracy": 0.9988063097000122, "num_tokens": 151410291.0, "step": 44740 }, { "entropy": 0.05234553683549166, "epoch": 10.430120060613126, "grad_norm": 0.051513671875, "learning_rate": 4.544868406269134e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 151435611.0, "step": 44745 }, { "entropy": 0.059111610800027845, "epoch": 10.431285697633756, "grad_norm": 0.018310546875, "learning_rate": 4.544750595837584e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 151454651.0, "step": 44750 }, { "entropy": 0.06212139260023832, "epoch": 10.432451334654388, "grad_norm": 0.12060546875, "learning_rate": 4.544632773555107e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999787032604217, "num_tokens": 151476996.0, "step": 44755 }, { "entropy": 0.04963586870580912, "epoch": 10.43361697167502, "grad_norm": 0.0198974609375, "learning_rate": 4.5445149394234596e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999685645103454, "num_tokens": 151495009.0, "step": 44760 }, { "entropy": 0.06097169406712055, "epoch": 10.434782608695652, "grad_norm": 0.01470947265625, "learning_rate": 4.5443970934444e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 151516934.0, "step": 44765 }, { "entropy": 0.06304800482466817, "epoch": 10.435948245716284, "grad_norm": 0.0230712890625, "learning_rate": 4.5442792356196846e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 151532322.0, "step": 44770 }, { "entropy": 0.045744067151099445, "epoch": 10.437113882736917, "grad_norm": 0.103515625, "learning_rate": 4.544161365951071e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999655902385711, "num_tokens": 151560417.0, "step": 44775 }, { "entropy": 0.05613409299403429, "epoch": 10.438279519757547, "grad_norm": 0.0634765625, "learning_rate": 4.5440434844403173e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 151585302.0, "step": 44780 }, { "entropy": 0.05387907605618238, "epoch": 10.439445156778179, "grad_norm": 0.021484375, "learning_rate": 4.543925591089181e-05, "loss": 0.0005, "mean_token_accuracy": 0.9995951414108276, "num_tokens": 151606647.0, "step": 44785 }, { "entropy": 0.09123647324740887, "epoch": 10.440610793798811, "grad_norm": 0.0294189453125, "learning_rate": 4.543807685899419e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 151619649.0, "step": 44790 }, { "entropy": 0.045269330497831106, "epoch": 10.441776430819443, "grad_norm": 0.016845703125, "learning_rate": 4.543689768872792e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 151649258.0, "step": 44795 }, { "entropy": 0.06868037879467011, "epoch": 10.442942067840075, "grad_norm": 0.048095703125, "learning_rate": 4.543571840011056e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 151659263.0, "step": 44800 }, { "entropy": 0.059371945634484294, "epoch": 10.444107704860706, "grad_norm": 0.216796875, "learning_rate": 4.54345389931597e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 151675122.0, "step": 44805 }, { "entropy": 0.08021088363602757, "epoch": 10.445273341881338, "grad_norm": 0.035400390625, "learning_rate": 4.5433359467892935e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 151699196.0, "step": 44810 }, { "entropy": 0.040476453490555286, "epoch": 10.44643897890197, "grad_norm": 0.035888671875, "learning_rate": 4.543217982432784e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 151719643.0, "step": 44815 }, { "entropy": 0.06719337925314903, "epoch": 10.447604615922602, "grad_norm": 0.0400390625, "learning_rate": 4.543100006248202e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999888837337494, "num_tokens": 151736668.0, "step": 44820 }, { "entropy": 0.055458275601267815, "epoch": 10.448770252943234, "grad_norm": 0.03369140625, "learning_rate": 4.542982018237306e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 151750220.0, "step": 44825 }, { "entropy": 0.0656410625204444, "epoch": 10.449935889963864, "grad_norm": 0.12060546875, "learning_rate": 4.542864018401855e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999665200710297, "num_tokens": 151773498.0, "step": 44830 }, { "entropy": 0.05906023997813463, "epoch": 10.451101526984496, "grad_norm": 0.08740234375, "learning_rate": 4.542746006743609e-05, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 151786423.0, "step": 44835 }, { "entropy": 0.06858272757381201, "epoch": 10.452267164005129, "grad_norm": 0.318359375, "learning_rate": 4.542627983264328e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 151804263.0, "step": 44840 }, { "entropy": 0.07860703244805337, "epoch": 10.45343280102576, "grad_norm": 0.1298828125, "learning_rate": 4.542509947965772e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 151812919.0, "step": 44845 }, { "entropy": 0.05585416806861758, "epoch": 10.454598438046393, "grad_norm": 0.020263671875, "learning_rate": 4.5423919008497e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 151829565.0, "step": 44850 }, { "entropy": 0.04956377726048231, "epoch": 10.455764075067025, "grad_norm": 0.04833984375, "learning_rate": 4.542273841917873e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999893546104431, "num_tokens": 151854530.0, "step": 44855 }, { "entropy": 0.054505852470174435, "epoch": 10.456929712087655, "grad_norm": 0.01190185546875, "learning_rate": 4.5421557711720506e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 151881715.0, "step": 44860 }, { "entropy": 0.04270871412009001, "epoch": 10.458095349108287, "grad_norm": 0.018310546875, "learning_rate": 4.5420376886139954e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 151900522.0, "step": 44865 }, { "entropy": 0.05425464333966375, "epoch": 10.45926098612892, "grad_norm": 0.038818359375, "learning_rate": 4.5419195942454665e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999880194664001, "num_tokens": 151919944.0, "step": 44870 }, { "entropy": 0.057429822022095324, "epoch": 10.460426623149552, "grad_norm": 0.041015625, "learning_rate": 4.5418014880682256e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999831736087799, "num_tokens": 151942421.0, "step": 44875 }, { "entropy": 0.06060660080984235, "epoch": 10.461592260170184, "grad_norm": 0.038818359375, "learning_rate": 4.541683370084033e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999891996383667, "num_tokens": 151962688.0, "step": 44880 }, { "entropy": 0.07495781434699893, "epoch": 10.462757897190814, "grad_norm": 0.035400390625, "learning_rate": 4.541565240294651e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 151982335.0, "step": 44885 }, { "entropy": 0.0536651149392128, "epoch": 10.463923534211446, "grad_norm": 0.00994873046875, "learning_rate": 4.541447098701841e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 152003296.0, "step": 44890 }, { "entropy": 0.07172315865755081, "epoch": 10.465089171232078, "grad_norm": 0.035400390625, "learning_rate": 4.5413289453073645e-05, "loss": 0.0008, "mean_token_accuracy": 0.9999733746051789, "num_tokens": 152019012.0, "step": 44895 }, { "entropy": 0.0632179843261838, "epoch": 10.46625480825271, "grad_norm": 0.0201416015625, "learning_rate": 4.5412107801129824e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 152029719.0, "step": 44900 }, { "entropy": 0.05272108670324087, "epoch": 10.467420445273342, "grad_norm": 0.09521484375, "learning_rate": 4.541092603120458e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 152051695.0, "step": 44905 }, { "entropy": 0.036588351242244244, "epoch": 10.468586082293974, "grad_norm": 0.034912109375, "learning_rate": 4.540974414331552e-05, "loss": 0.0001, "mean_token_accuracy": 0.9999890267848969, "num_tokens": 152084842.0, "step": 44910 }, { "entropy": 0.06178782312199473, "epoch": 10.469751719314605, "grad_norm": 0.0101318359375, "learning_rate": 4.540856213748029e-05, "loss": 0.0006, "mean_token_accuracy": 0.9998004019260407, "num_tokens": 152100061.0, "step": 44915 }, { "entropy": 0.05155020691454411, "epoch": 10.470917356335237, "grad_norm": 0.061767578125, "learning_rate": 4.5407380013716506e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 152116996.0, "step": 44920 }, { "entropy": 0.032804567646235225, "epoch": 10.472082993355869, "grad_norm": 0.1259765625, "learning_rate": 4.5406197772041784e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999784171581269, "num_tokens": 152160796.0, "step": 44925 }, { "entropy": 0.04573766337707639, "epoch": 10.473248630376501, "grad_norm": 0.035400390625, "learning_rate": 4.5405015412473764e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999883711338043, "num_tokens": 152190185.0, "step": 44930 }, { "entropy": 0.05881201233714819, "epoch": 10.474414267397133, "grad_norm": 0.0250244140625, "learning_rate": 4.540383293503008e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 152199805.0, "step": 44935 }, { "entropy": 0.06688203103840351, "epoch": 10.475579904417764, "grad_norm": 0.16796875, "learning_rate": 4.540265033972835e-05, "loss": 0.0019, "mean_token_accuracy": 0.9996675014495849, "num_tokens": 152217785.0, "step": 44940 }, { "entropy": 0.04776301896199584, "epoch": 10.476745541438396, "grad_norm": 0.04931640625, "learning_rate": 4.540146762658622e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999693930149078, "num_tokens": 152236196.0, "step": 44945 }, { "entropy": 0.05963452514261007, "epoch": 10.477911178459028, "grad_norm": 0.034912109375, "learning_rate": 4.540028479562132e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 152247581.0, "step": 44950 }, { "entropy": 0.04292171949055046, "epoch": 10.47907681547966, "grad_norm": 0.032958984375, "learning_rate": 4.53991018468513e-05, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 152286578.0, "step": 44955 }, { "entropy": 0.04976120926439762, "epoch": 10.480242452500292, "grad_norm": 1.5390625, "learning_rate": 4.539791878029378e-05, "loss": 0.0015, "mean_token_accuracy": 0.9997435867786407, "num_tokens": 152328454.0, "step": 44960 }, { "entropy": 0.06413271725177765, "epoch": 10.481408089520922, "grad_norm": 0.10791015625, "learning_rate": 4.539673559596642e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 152338536.0, "step": 44965 }, { "entropy": 0.047244657296687365, "epoch": 10.482573726541554, "grad_norm": 0.054443359375, "learning_rate": 4.539555229388685e-05, "loss": 0.0013, "mean_token_accuracy": 0.999899297952652, "num_tokens": 152372224.0, "step": 44970 }, { "entropy": 0.0595405210275203, "epoch": 10.483739363562186, "grad_norm": 0.0242919921875, "learning_rate": 4.5394368874072725e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 152391307.0, "step": 44975 }, { "entropy": 0.07148723490536213, "epoch": 10.484905000582819, "grad_norm": 0.042724609375, "learning_rate": 4.5393185336541684e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 152400417.0, "step": 44980 }, { "entropy": 0.06768373921513557, "epoch": 10.48607063760345, "grad_norm": 0.016357421875, "learning_rate": 4.5392001681311376e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 152411955.0, "step": 44985 }, { "entropy": 0.07932289410382509, "epoch": 10.487236274624083, "grad_norm": 0.0289306640625, "learning_rate": 4.539081790839945e-05, "loss": 0.0013, "mean_token_accuracy": 0.9995620608329773, "num_tokens": 152430368.0, "step": 44990 }, { "entropy": 0.050478702038526536, "epoch": 10.488401911644713, "grad_norm": 0.373046875, "learning_rate": 4.538963401782357e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999386966228485, "num_tokens": 152456906.0, "step": 44995 }, { "entropy": 0.05402056300081313, "epoch": 10.489567548665345, "grad_norm": 0.1923828125, "learning_rate": 4.5388450009601367e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999709129333496, "num_tokens": 152479603.0, "step": 45000 }, { "entropy": 0.04246578803285957, "epoch": 10.490733185685977, "grad_norm": 0.076171875, "learning_rate": 4.538726588375052e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999894201755524, "num_tokens": 152511917.0, "step": 45005 }, { "entropy": 0.046724597364664076, "epoch": 10.49189882270661, "grad_norm": 0.0230712890625, "learning_rate": 4.538608164028867e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 152538081.0, "step": 45010 }, { "entropy": 0.06224795989692211, "epoch": 10.493064459727242, "grad_norm": 0.1865234375, "learning_rate": 4.538489727923348e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 152560465.0, "step": 45015 }, { "entropy": 0.0667582368478179, "epoch": 10.494230096747872, "grad_norm": 0.058837890625, "learning_rate": 4.5383712800602616e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999842643737793, "num_tokens": 152577602.0, "step": 45020 }, { "entropy": 0.056950070336461066, "epoch": 10.495395733768504, "grad_norm": 0.017333984375, "learning_rate": 4.538252820441374e-05, "loss": 0.0004, "mean_token_accuracy": 0.9998468577861785, "num_tokens": 152598903.0, "step": 45025 }, { "entropy": 0.06032578498125076, "epoch": 10.496561370789136, "grad_norm": 0.0245361328125, "learning_rate": 4.538134349068451e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 152611466.0, "step": 45030 }, { "entropy": 0.06809374392032623, "epoch": 10.497727007809768, "grad_norm": 0.0257568359375, "learning_rate": 4.538015865943259e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 152621414.0, "step": 45035 }, { "entropy": 0.061638028174638745, "epoch": 10.4988926448304, "grad_norm": 0.0458984375, "learning_rate": 4.537897371067565e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 152633531.0, "step": 45040 }, { "entropy": 0.04594460809603333, "epoch": 10.500058281851032, "grad_norm": 0.14453125, "learning_rate": 4.537778864443137e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999785780906677, "num_tokens": 152665138.0, "step": 45045 }, { "entropy": 0.059464158676564696, "epoch": 10.501223918871663, "grad_norm": 0.0279541015625, "learning_rate": 4.537660346071741e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999689579010009, "num_tokens": 152685895.0, "step": 45050 }, { "entropy": 0.058475431287661195, "epoch": 10.502389555892295, "grad_norm": 0.064453125, "learning_rate": 4.5375418159551444e-05, "loss": 0.0018, "mean_token_accuracy": 1.0, "num_tokens": 152716201.0, "step": 45055 }, { "entropy": 0.03777650678530335, "epoch": 10.503555192912927, "grad_norm": 0.04638671875, "learning_rate": 4.537423274095115e-05, "loss": 0.0006, "mean_token_accuracy": 0.9998415231704711, "num_tokens": 152746424.0, "step": 45060 }, { "entropy": 0.06340892501175403, "epoch": 10.504720829933559, "grad_norm": 0.07177734375, "learning_rate": 4.5373047204934197e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999887526035309, "num_tokens": 152767071.0, "step": 45065 }, { "entropy": 0.06310759540647268, "epoch": 10.505886466954191, "grad_norm": 0.01708984375, "learning_rate": 4.5371861551518275e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 152779962.0, "step": 45070 }, { "entropy": 0.06093995273113251, "epoch": 10.507052103974821, "grad_norm": 0.0123291015625, "learning_rate": 4.537067578072105e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 152797320.0, "step": 45075 }, { "entropy": 0.04854728020727635, "epoch": 10.508217740995454, "grad_norm": 0.025634765625, "learning_rate": 4.536948989256021e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999867856502533, "num_tokens": 152817688.0, "step": 45080 }, { "entropy": 0.06601851750165225, "epoch": 10.509383378016086, "grad_norm": 0.09375, "learning_rate": 4.536830388705346e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 152829817.0, "step": 45085 }, { "entropy": 0.05404104976914823, "epoch": 10.510549015036718, "grad_norm": 0.671875, "learning_rate": 4.5367117764218436e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 152861766.0, "step": 45090 }, { "entropy": 0.05845761811360717, "epoch": 10.51171465205735, "grad_norm": 0.009521484375, "learning_rate": 4.536593152407287e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 152884718.0, "step": 45095 }, { "entropy": 0.048073732480406764, "epoch": 10.51288028907798, "grad_norm": 0.146484375, "learning_rate": 4.5364745166634426e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999864935874939, "num_tokens": 152920331.0, "step": 45100 }, { "entropy": 0.056105440203100446, "epoch": 10.514045926098612, "grad_norm": 0.060546875, "learning_rate": 4.5363558691920803e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 152938039.0, "step": 45105 }, { "entropy": 0.05227671144530177, "epoch": 10.515211563119244, "grad_norm": 0.0152587890625, "learning_rate": 4.53623720999497e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999771773815155, "num_tokens": 152960373.0, "step": 45110 }, { "entropy": 0.05718650780618191, "epoch": 10.516377200139877, "grad_norm": 0.259765625, "learning_rate": 4.53611853907388e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 152969974.0, "step": 45115 }, { "entropy": 0.06564706424251199, "epoch": 10.517542837160509, "grad_norm": 0.09765625, "learning_rate": 4.53599985643058e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999891757965088, "num_tokens": 152991494.0, "step": 45120 }, { "entropy": 0.061748063564300536, "epoch": 10.51870847418114, "grad_norm": 0.09375, "learning_rate": 4.53588116206684e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999875843524932, "num_tokens": 153015942.0, "step": 45125 }, { "entropy": 0.04634425612166524, "epoch": 10.519874111201771, "grad_norm": 0.08154296875, "learning_rate": 4.53576245598443e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999892115592957, "num_tokens": 153050371.0, "step": 45130 }, { "entropy": 0.06081217750906944, "epoch": 10.521039748222403, "grad_norm": 0.07080078125, "learning_rate": 4.5356437381851194e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 153065419.0, "step": 45135 }, { "entropy": 0.03465043311007321, "epoch": 10.522205385243035, "grad_norm": 0.044677734375, "learning_rate": 4.53552500867068e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 153097010.0, "step": 45140 }, { "entropy": 0.05195081997662783, "epoch": 10.523371022263667, "grad_norm": 0.028564453125, "learning_rate": 4.53540626744288e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 153123199.0, "step": 45145 }, { "entropy": 0.0526338548399508, "epoch": 10.5245366592843, "grad_norm": 0.0294189453125, "learning_rate": 4.5352875145034926e-05, "loss": 0.0019, "mean_token_accuracy": 0.9996376812458039, "num_tokens": 153139993.0, "step": 45150 }, { "entropy": 0.0496996458619833, "epoch": 10.52570229630493, "grad_norm": 0.034423828125, "learning_rate": 4.5351687498542863e-05, "loss": 0.0008, "mean_token_accuracy": 0.9999785900115967, "num_tokens": 153167766.0, "step": 45155 }, { "entropy": 0.046733937319368125, "epoch": 10.526867933325562, "grad_norm": 0.0693359375, "learning_rate": 4.535049973497033e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 153192549.0, "step": 45160 }, { "entropy": 0.06857496574521064, "epoch": 10.528033570346194, "grad_norm": 0.1640625, "learning_rate": 4.5349311854335035e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 153202218.0, "step": 45165 }, { "entropy": 0.054048574063926935, "epoch": 10.529199207366826, "grad_norm": 0.10693359375, "learning_rate": 4.53481238566547e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999850869178772, "num_tokens": 153222058.0, "step": 45170 }, { "entropy": 0.05584253640845418, "epoch": 10.530364844387458, "grad_norm": 0.026123046875, "learning_rate": 4.534693574194703e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 153235925.0, "step": 45175 }, { "entropy": 0.058231227286159995, "epoch": 10.53153048140809, "grad_norm": 0.0284423828125, "learning_rate": 4.5345747510229754e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 153254301.0, "step": 45180 }, { "entropy": 0.0526955584064126, "epoch": 10.53269611842872, "grad_norm": 0.099609375, "learning_rate": 4.534455916152057e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999891638755798, "num_tokens": 153276488.0, "step": 45185 }, { "entropy": 0.06668624244630336, "epoch": 10.533861755449353, "grad_norm": 0.2373046875, "learning_rate": 4.5343370695837215e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 153289492.0, "step": 45190 }, { "entropy": 0.06033003106713295, "epoch": 10.535027392469985, "grad_norm": 0.06298828125, "learning_rate": 4.5342182113197414e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 153309181.0, "step": 45195 }, { "entropy": 0.05025592148303985, "epoch": 10.536193029490617, "grad_norm": 0.02197265625, "learning_rate": 4.534099341361887e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 153335484.0, "step": 45200 }, { "entropy": 0.061456915270537135, "epoch": 10.537358666511249, "grad_norm": 0.0693359375, "learning_rate": 4.5339804597119325e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 153355824.0, "step": 45205 }, { "entropy": 0.041024369467049834, "epoch": 10.53852430353188, "grad_norm": 1.4296875, "learning_rate": 4.533861566371651e-05, "loss": 0.0025, "mean_token_accuracy": 0.9995857894420623, "num_tokens": 153386252.0, "step": 45210 }, { "entropy": 0.07406266573816538, "epoch": 10.539689940552512, "grad_norm": 0.09326171875, "learning_rate": 4.533742661342813e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999888896942138, "num_tokens": 153414762.0, "step": 45215 }, { "entropy": 0.07470035180449486, "epoch": 10.540855577573144, "grad_norm": 0.0164794921875, "learning_rate": 4.533623744627194e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 153425035.0, "step": 45220 }, { "entropy": 0.04976415578275919, "epoch": 10.542021214593776, "grad_norm": 0.181640625, "learning_rate": 4.533504816226567e-05, "loss": 0.0008, "mean_token_accuracy": 0.9999687135219574, "num_tokens": 153449577.0, "step": 45225 }, { "entropy": 0.056877507083117965, "epoch": 10.543186851614408, "grad_norm": 0.01507568359375, "learning_rate": 4.533385876142704e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 153461321.0, "step": 45230 }, { "entropy": 0.053165735118091105, "epoch": 10.544352488635038, "grad_norm": 0.0181884765625, "learning_rate": 4.533266924377379e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 153479189.0, "step": 45235 }, { "entropy": 0.05576419588178396, "epoch": 10.54551812565567, "grad_norm": 0.04296875, "learning_rate": 4.533147960932366e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 153500382.0, "step": 45240 }, { "entropy": 0.05502867121249437, "epoch": 10.546683762676302, "grad_norm": 0.2890625, "learning_rate": 4.5330289858094396e-05, "loss": 0.0016, "mean_token_accuracy": 0.9996170341968537, "num_tokens": 153523829.0, "step": 45245 }, { "entropy": 0.05236600376665592, "epoch": 10.547849399696934, "grad_norm": 0.020263671875, "learning_rate": 4.532909999010373e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 153540333.0, "step": 45250 }, { "entropy": 0.047642444260418415, "epoch": 10.549015036717567, "grad_norm": 0.022216796875, "learning_rate": 4.5327910005369414e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 153558943.0, "step": 45255 }, { "entropy": 0.06373915579169989, "epoch": 10.550180673738199, "grad_norm": 0.053466796875, "learning_rate": 4.5326719903909176e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 153570140.0, "step": 45260 }, { "entropy": 0.05331633798778057, "epoch": 10.551346310758829, "grad_norm": 0.035400390625, "learning_rate": 4.5325529685740775e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 153587093.0, "step": 45265 }, { "entropy": 0.05336455744691193, "epoch": 10.552511947779461, "grad_norm": 0.1240234375, "learning_rate": 4.5324339350881964e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999893069267273, "num_tokens": 153613028.0, "step": 45270 }, { "entropy": 0.04905619155615568, "epoch": 10.553677584800093, "grad_norm": 0.01806640625, "learning_rate": 4.532314889935048e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 153635947.0, "step": 45275 }, { "entropy": 0.07199727166444063, "epoch": 10.554843221820725, "grad_norm": 0.10791015625, "learning_rate": 4.5321958331164074e-05, "loss": 0.0004, "mean_token_accuracy": 0.9998493969440461, "num_tokens": 153648137.0, "step": 45280 }, { "entropy": 0.07504986342974007, "epoch": 10.556008858841357, "grad_norm": 0.125, "learning_rate": 4.53207676463405e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999784767627716, "num_tokens": 153677806.0, "step": 45285 }, { "entropy": 0.06699912054464222, "epoch": 10.557174495861988, "grad_norm": 0.022705078125, "learning_rate": 4.531957684489753e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 153697423.0, "step": 45290 }, { "entropy": 0.04461238384246826, "epoch": 10.55834013288262, "grad_norm": 0.0260009765625, "learning_rate": 4.53183859268529e-05, "loss": 0.0011, "mean_token_accuracy": 0.9996108174324035, "num_tokens": 153728335.0, "step": 45295 }, { "entropy": 0.04771164106205106, "epoch": 10.559505769903252, "grad_norm": 0.06298828125, "learning_rate": 4.531719489222438e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 153754782.0, "step": 45300 }, { "entropy": 0.04681495912373066, "epoch": 10.560671406923884, "grad_norm": 0.0264892578125, "learning_rate": 4.531600374102973e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999852657318116, "num_tokens": 153778119.0, "step": 45305 }, { "entropy": 0.05865760799497366, "epoch": 10.561837043944516, "grad_norm": 0.0179443359375, "learning_rate": 4.53148124732867e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 153797206.0, "step": 45310 }, { "entropy": 0.05252798534929752, "epoch": 10.563002680965148, "grad_norm": 0.043701171875, "learning_rate": 4.531362108901306e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 153825044.0, "step": 45315 }, { "entropy": 0.07219594558700919, "epoch": 10.564168317985779, "grad_norm": 0.0147705078125, "learning_rate": 4.5312429588226593e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999789476394654, "num_tokens": 153849354.0, "step": 45320 }, { "entropy": 0.0687197322025895, "epoch": 10.56533395500641, "grad_norm": 0.021240234375, "learning_rate": 4.531123797094503e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 153868137.0, "step": 45325 }, { "entropy": 0.04669546764343977, "epoch": 10.566499592027043, "grad_norm": 0.09912109375, "learning_rate": 4.531004623718618e-05, "loss": 0.0012, "mean_token_accuracy": 0.9997701466083526, "num_tokens": 153893125.0, "step": 45330 }, { "entropy": 0.061506491433829066, "epoch": 10.567665229047675, "grad_norm": 0.11279296875, "learning_rate": 4.530885438696778e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999893963336944, "num_tokens": 153918793.0, "step": 45335 }, { "entropy": 0.047557246033102275, "epoch": 10.568830866068307, "grad_norm": 0.01171875, "learning_rate": 4.530766242030762e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 153944278.0, "step": 45340 }, { "entropy": 0.05632236008532345, "epoch": 10.569996503088937, "grad_norm": 0.357421875, "learning_rate": 4.530647033722347e-05, "loss": 0.0008, "mean_token_accuracy": 0.9999646008014679, "num_tokens": 153967238.0, "step": 45345 }, { "entropy": 0.04898662269115448, "epoch": 10.57116214010957, "grad_norm": 0.015869140625, "learning_rate": 4.53052781377331e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 153989791.0, "step": 45350 }, { "entropy": 0.06470034439116716, "epoch": 10.572327777130202, "grad_norm": 0.016357421875, "learning_rate": 4.53040858218543e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 154000301.0, "step": 45355 }, { "entropy": 0.04806965310126543, "epoch": 10.573493414150834, "grad_norm": 0.7890625, "learning_rate": 4.530289338960484e-05, "loss": 0.0008, "mean_token_accuracy": 0.9995983958244323, "num_tokens": 154026538.0, "step": 45360 }, { "entropy": 0.1315355844795704, "epoch": 10.574659051171466, "grad_norm": 3.5, "learning_rate": 4.530170084100251e-05, "loss": 0.1375, "mean_token_accuracy": 0.9813533067703247, "num_tokens": 154046506.0, "step": 45365 }, { "entropy": 0.05549485264346003, "epoch": 10.575824688192096, "grad_norm": 0.048828125, "learning_rate": 4.5300508176065074e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999867022037506, "num_tokens": 154074785.0, "step": 45370 }, { "entropy": 0.03857192704454064, "epoch": 10.576990325212728, "grad_norm": 0.0181884765625, "learning_rate": 4.529931539481034e-05, "loss": 0.001, "mean_token_accuracy": 0.9999784290790558, "num_tokens": 154115955.0, "step": 45375 }, { "entropy": 0.06022445512935519, "epoch": 10.57815596223336, "grad_norm": 0.057861328125, "learning_rate": 4.529812249725608e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 154131298.0, "step": 45380 }, { "entropy": 0.06327858474105597, "epoch": 10.579321599253992, "grad_norm": 0.1318359375, "learning_rate": 4.529692948342008e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999776542186737, "num_tokens": 154160613.0, "step": 45385 }, { "entropy": 0.06446757782250642, "epoch": 10.580487236274625, "grad_norm": 0.044677734375, "learning_rate": 4.5295736353320135e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 154177442.0, "step": 45390 }, { "entropy": 0.06058889739215374, "epoch": 10.581652873295257, "grad_norm": 0.0189208984375, "learning_rate": 4.5294543106974036e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 154188632.0, "step": 45395 }, { "entropy": 0.05697990693151951, "epoch": 10.582818510315887, "grad_norm": 0.0224609375, "learning_rate": 4.5293349744399574e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999100744724274, "num_tokens": 154202061.0, "step": 45400 }, { "entropy": 0.049638493172824386, "epoch": 10.583984147336519, "grad_norm": 0.049560546875, "learning_rate": 4.529215626561455e-05, "loss": 0.0003, "mean_token_accuracy": 0.999984622001648, "num_tokens": 154219966.0, "step": 45405 }, { "entropy": 0.062036858219653365, "epoch": 10.585149784357151, "grad_norm": 0.06201171875, "learning_rate": 4.529096267063676e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 154236505.0, "step": 45410 }, { "entropy": 0.04993337215855718, "epoch": 10.586315421377783, "grad_norm": 0.0159912109375, "learning_rate": 4.5289768959483985e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999766051769257, "num_tokens": 154267797.0, "step": 45415 }, { "entropy": 0.05615239506587386, "epoch": 10.587481058398415, "grad_norm": 0.0296630859375, "learning_rate": 4.528857513217405e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 154286964.0, "step": 45420 }, { "entropy": 0.057061603758484125, "epoch": 10.588646695419046, "grad_norm": 0.09716796875, "learning_rate": 4.5287381188724745e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999895453453064, "num_tokens": 154307212.0, "step": 45425 }, { "entropy": 0.05221607079729438, "epoch": 10.589812332439678, "grad_norm": 0.03759765625, "learning_rate": 4.528618712915386e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 154326983.0, "step": 45430 }, { "entropy": 0.06867706384509802, "epoch": 10.59097796946031, "grad_norm": 0.020263671875, "learning_rate": 4.528499295347923e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 154339509.0, "step": 45435 }, { "entropy": 0.05803119344636798, "epoch": 10.592143606480942, "grad_norm": 0.0166015625, "learning_rate": 4.5283798661718635e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999827980995178, "num_tokens": 154360403.0, "step": 45440 }, { "entropy": 0.05287682805210352, "epoch": 10.593309243501574, "grad_norm": 0.111328125, "learning_rate": 4.5282604253889896e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999831140041351, "num_tokens": 154380649.0, "step": 45445 }, { "entropy": 0.057697663782164454, "epoch": 10.594474880522206, "grad_norm": 0.0159912109375, "learning_rate": 4.528140973001083e-05, "loss": 0.0002, "mean_token_accuracy": 0.9998168468475341, "num_tokens": 154408373.0, "step": 45450 }, { "entropy": 0.05991329280659556, "epoch": 10.595640517542837, "grad_norm": 0.02685546875, "learning_rate": 4.5280215090099235e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 154421905.0, "step": 45455 }, { "entropy": 0.05662989765405655, "epoch": 10.596806154563469, "grad_norm": 0.1142578125, "learning_rate": 4.527902033417293e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 154442673.0, "step": 45460 }, { "entropy": 0.05534627726301551, "epoch": 10.5979717915841, "grad_norm": 0.03759765625, "learning_rate": 4.527782546224973e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 154462617.0, "step": 45465 }, { "entropy": 0.06300230165943503, "epoch": 10.599137428604733, "grad_norm": 0.0177001953125, "learning_rate": 4.527663047434746e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999778687953949, "num_tokens": 154482320.0, "step": 45470 }, { "entropy": 0.06169234653934837, "epoch": 10.600303065625365, "grad_norm": 0.0289306640625, "learning_rate": 4.527543537048392e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999802231788635, "num_tokens": 154498935.0, "step": 45475 }, { "entropy": 0.0762554974295199, "epoch": 10.601468702645995, "grad_norm": 0.10205078125, "learning_rate": 4.527424015067696e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999882817268372, "num_tokens": 154521923.0, "step": 45480 }, { "entropy": 0.045643165893852713, "epoch": 10.602634339666627, "grad_norm": 0.05322265625, "learning_rate": 4.527304481494438e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999760866165162, "num_tokens": 154550699.0, "step": 45485 }, { "entropy": 0.06465123752132058, "epoch": 10.60379997668726, "grad_norm": 0.03076171875, "learning_rate": 4.527184936330401e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 154582411.0, "step": 45490 }, { "entropy": 0.05987545819953084, "epoch": 10.604965613707892, "grad_norm": 0.080078125, "learning_rate": 4.5270653795773676e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999893605709076, "num_tokens": 154607072.0, "step": 45495 }, { "entropy": 0.05700749680399895, "epoch": 10.606131250728524, "grad_norm": 0.02197265625, "learning_rate": 4.52694581123712e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 154619307.0, "step": 45500 }, { "entropy": 0.050329295732080934, "epoch": 10.607296887749154, "grad_norm": 0.0703125, "learning_rate": 4.526826231311443e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 154638654.0, "step": 45505 }, { "entropy": 0.06347782760858536, "epoch": 10.608462524769786, "grad_norm": 0.0272216796875, "learning_rate": 4.5267066398021174e-05, "loss": 0.005, "mean_token_accuracy": 0.9992537200450897, "num_tokens": 154652379.0, "step": 45510 }, { "entropy": 0.0653164304792881, "epoch": 10.609628161790418, "grad_norm": 0.0194091796875, "learning_rate": 4.526587036710928e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 154665309.0, "step": 45515 }, { "entropy": 0.10336335860192776, "epoch": 10.61079379881105, "grad_norm": 0.0186767578125, "learning_rate": 4.526467422039658e-05, "loss": 0.051, "mean_token_accuracy": 0.9942165672779083, "num_tokens": 154689631.0, "step": 45520 }, { "entropy": 0.04605412124656141, "epoch": 10.611959435831682, "grad_norm": 0.404296875, "learning_rate": 4.526347795790091e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999624967575074, "num_tokens": 154719478.0, "step": 45525 }, { "entropy": 0.06172304879873991, "epoch": 10.613125072852315, "grad_norm": 0.0245361328125, "learning_rate": 4.5262281579640106e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 154742502.0, "step": 45530 }, { "entropy": 0.06773852966725827, "epoch": 10.614290709872945, "grad_norm": 0.032958984375, "learning_rate": 4.526108508563201e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 154752874.0, "step": 45535 }, { "entropy": 0.06439133267849684, "epoch": 10.615456346893577, "grad_norm": 0.01202392578125, "learning_rate": 4.5259888475894454e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999867498874664, "num_tokens": 154776941.0, "step": 45540 }, { "entropy": 0.05004207184538245, "epoch": 10.616621983914209, "grad_norm": 0.043212890625, "learning_rate": 4.5258691750445295e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 154799157.0, "step": 45545 }, { "entropy": 0.05588539754971862, "epoch": 10.617787620934841, "grad_norm": 0.025390625, "learning_rate": 4.5257494909302366e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 154818796.0, "step": 45550 }, { "entropy": 0.04940931098535657, "epoch": 10.618953257955473, "grad_norm": 0.016845703125, "learning_rate": 4.525629795248353e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 154840657.0, "step": 45555 }, { "entropy": 0.05910136494785547, "epoch": 10.620118894976104, "grad_norm": 0.0157470703125, "learning_rate": 4.525510088000662e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 154852226.0, "step": 45560 }, { "entropy": 0.0557030210737139, "epoch": 10.621284531996736, "grad_norm": 0.027099609375, "learning_rate": 4.525390369188949e-05, "loss": 0.0003, "mean_token_accuracy": 0.9997493743896484, "num_tokens": 154878594.0, "step": 45565 }, { "entropy": 0.05533176232129335, "epoch": 10.622450169017368, "grad_norm": 0.76953125, "learning_rate": 4.525270638814999e-05, "loss": 0.0006, "mean_token_accuracy": 0.9996478855609894, "num_tokens": 154898526.0, "step": 45570 }, { "entropy": 0.058144222619012, "epoch": 10.623615806038, "grad_norm": 0.0361328125, "learning_rate": 4.525150896880599e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 154925922.0, "step": 45575 }, { "entropy": 0.062190312519669534, "epoch": 10.624781443058632, "grad_norm": 0.1767578125, "learning_rate": 4.525031143387533e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 154946470.0, "step": 45580 }, { "entropy": 0.08605777956545353, "epoch": 10.625947080079264, "grad_norm": 1.84375, "learning_rate": 4.5249113783375855e-05, "loss": 0.0004, "mean_token_accuracy": 0.9994974851608276, "num_tokens": 154955771.0, "step": 45585 }, { "entropy": 0.06026861779391766, "epoch": 10.627112717099894, "grad_norm": 0.0146484375, "learning_rate": 4.524791601732545e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 154967988.0, "step": 45590 }, { "entropy": 0.05325524704530835, "epoch": 10.628278354120527, "grad_norm": 0.017822265625, "learning_rate": 4.524671813574196e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999680340290069, "num_tokens": 154982691.0, "step": 45595 }, { "entropy": 0.06144336890429258, "epoch": 10.629443991141159, "grad_norm": 0.02294921875, "learning_rate": 4.5245520138643254e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999618291854858, "num_tokens": 155000448.0, "step": 45600 }, { "entropy": 0.052808401314541696, "epoch": 10.63060962816179, "grad_norm": 0.017822265625, "learning_rate": 4.5244322026047195e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 155018584.0, "step": 45605 }, { "entropy": 0.03916442524641752, "epoch": 10.631775265182423, "grad_norm": 0.06494140625, "learning_rate": 4.5243123797971644e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 155042625.0, "step": 45610 }, { "entropy": 0.049193437211215496, "epoch": 10.632940902203053, "grad_norm": 0.443359375, "learning_rate": 4.524192545443446e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 155057525.0, "step": 45615 }, { "entropy": 0.05743730580434203, "epoch": 10.634106539223685, "grad_norm": 0.06298828125, "learning_rate": 4.5240726995453536e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 155073628.0, "step": 45620 }, { "entropy": 0.06278720712289214, "epoch": 10.635272176244317, "grad_norm": 0.10595703125, "learning_rate": 4.523952842104673e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999800860881806, "num_tokens": 155105856.0, "step": 45625 }, { "entropy": 0.05687279971316457, "epoch": 10.63643781326495, "grad_norm": 0.11474609375, "learning_rate": 4.523832973123191e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999889671802521, "num_tokens": 155128659.0, "step": 45630 }, { "entropy": 0.06879281736910343, "epoch": 10.637603450285582, "grad_norm": 0.080078125, "learning_rate": 4.523713092602695e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 155143525.0, "step": 45635 }, { "entropy": 0.06541344905272126, "epoch": 10.638769087306212, "grad_norm": 0.185546875, "learning_rate": 4.523593200544974e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999894678592682, "num_tokens": 155163360.0, "step": 45640 }, { "entropy": 0.04915583487600088, "epoch": 10.639934724326844, "grad_norm": 0.025634765625, "learning_rate": 4.523473296951814e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 155187475.0, "step": 45645 }, { "entropy": 0.04946916834451258, "epoch": 10.641100361347476, "grad_norm": 0.03369140625, "learning_rate": 4.523353381825004e-05, "loss": 0.0002, "mean_token_accuracy": 0.999989241361618, "num_tokens": 155209442.0, "step": 45650 }, { "entropy": 0.05402911715209484, "epoch": 10.642265998368108, "grad_norm": 0.009765625, "learning_rate": 4.5232334551663326e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 155223725.0, "step": 45655 }, { "entropy": 0.05086983609944582, "epoch": 10.64343163538874, "grad_norm": 0.0228271484375, "learning_rate": 4.523113516977586e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 155250672.0, "step": 45660 }, { "entropy": 0.05399879598990083, "epoch": 10.644597272409372, "grad_norm": 0.1171875, "learning_rate": 4.5229935672605554e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999883711338043, "num_tokens": 155280736.0, "step": 45665 }, { "entropy": 0.0479968911036849, "epoch": 10.645762909430003, "grad_norm": 0.047607421875, "learning_rate": 4.5228736060170274e-05, "loss": 0.0009, "mean_token_accuracy": 1.0, "num_tokens": 155314206.0, "step": 45670 }, { "entropy": 0.04139636503532529, "epoch": 10.646928546450635, "grad_norm": 0.01904296875, "learning_rate": 4.522753633248792e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 155352210.0, "step": 45675 }, { "entropy": 0.0488029814325273, "epoch": 10.648094183471267, "grad_norm": 0.0966796875, "learning_rate": 4.522633648957637e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 155386905.0, "step": 45680 }, { "entropy": 0.034656665613874794, "epoch": 10.6492598204919, "grad_norm": 0.0908203125, "learning_rate": 4.522513653145352e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999785244464874, "num_tokens": 155429889.0, "step": 45685 }, { "entropy": 0.058836123626679185, "epoch": 10.650425457512531, "grad_norm": 0.053955078125, "learning_rate": 4.5223936458137276e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 155442744.0, "step": 45690 }, { "entropy": 0.04692996209487319, "epoch": 10.651591094533162, "grad_norm": 0.041748046875, "learning_rate": 4.5222736269645514e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 155476379.0, "step": 45695 }, { "entropy": 0.0463966122828424, "epoch": 10.652756731553794, "grad_norm": 0.01190185546875, "learning_rate": 4.522153596599614e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 155493833.0, "step": 45700 }, { "entropy": 0.05225074263289571, "epoch": 10.653922368574426, "grad_norm": 0.048095703125, "learning_rate": 4.522033554720705e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 155514467.0, "step": 45705 }, { "entropy": 0.05257317861542106, "epoch": 10.655088005595058, "grad_norm": 0.05908203125, "learning_rate": 4.5219135013296145e-05, "loss": 0.001, "mean_token_accuracy": 1.0, "num_tokens": 155547168.0, "step": 45710 }, { "entropy": 0.07927863914519548, "epoch": 10.65625364261569, "grad_norm": 0.0712890625, "learning_rate": 4.5217934364281324e-05, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 155565651.0, "step": 45715 }, { "entropy": 0.05985526898875833, "epoch": 10.657419279636322, "grad_norm": 0.0223388671875, "learning_rate": 4.52167336001805e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 155580332.0, "step": 45720 }, { "entropy": 0.062313320487737654, "epoch": 10.658584916656952, "grad_norm": 0.0211181640625, "learning_rate": 4.5215532721011563e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 155592185.0, "step": 45725 }, { "entropy": 0.06476543843746185, "epoch": 10.659750553677585, "grad_norm": 0.0233154296875, "learning_rate": 4.5214331726792436e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 155602391.0, "step": 45730 }, { "entropy": 0.06466516926884651, "epoch": 10.660916190698217, "grad_norm": 0.01214599609375, "learning_rate": 4.5213130617541016e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 155614267.0, "step": 45735 }, { "entropy": 0.07169229295104743, "epoch": 10.662081827718849, "grad_norm": 0.11474609375, "learning_rate": 4.521192939327522e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 155627312.0, "step": 45740 }, { "entropy": 0.08903030175715684, "epoch": 10.66324746473948, "grad_norm": 0.0111083984375, "learning_rate": 4.521072805401296e-05, "loss": 0.0249, "mean_token_accuracy": 0.9973048746585846, "num_tokens": 155653835.0, "step": 45745 }, { "entropy": 0.07297799242660404, "epoch": 10.664413101760111, "grad_norm": 0.040283203125, "learning_rate": 4.520952659977214e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999803006649017, "num_tokens": 155668901.0, "step": 45750 }, { "entropy": 0.08213113537058234, "epoch": 10.665578738780743, "grad_norm": 2.96875, "learning_rate": 4.520832503057069e-05, "loss": 0.0383, "mean_token_accuracy": 0.9947204291820526, "num_tokens": 155690681.0, "step": 45755 }, { "entropy": 0.06827007010579109, "epoch": 10.666744375801375, "grad_norm": 0.0225830078125, "learning_rate": 4.5207123346426513e-05, "loss": 0.0023, "mean_token_accuracy": 0.9998970210552216, "num_tokens": 155709925.0, "step": 45760 }, { "entropy": 0.06171514326706529, "epoch": 10.667910012822007, "grad_norm": 0.0274658203125, "learning_rate": 4.520592154735753e-05, "loss": 0.0141, "mean_token_accuracy": 0.9981945633888245, "num_tokens": 155740138.0, "step": 45765 }, { "entropy": 0.056561283254995945, "epoch": 10.66907564984264, "grad_norm": 0.03076171875, "learning_rate": 4.5204719633381676e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 155769312.0, "step": 45770 }, { "entropy": 0.07458315212279558, "epoch": 10.67024128686327, "grad_norm": 0.033203125, "learning_rate": 4.520351760451686e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 155779671.0, "step": 45775 }, { "entropy": 0.07302795853465796, "epoch": 10.671406923883902, "grad_norm": 0.02001953125, "learning_rate": 4.520231546078101e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 155793274.0, "step": 45780 }, { "entropy": 0.06928151659667492, "epoch": 10.672572560904534, "grad_norm": 0.0115966796875, "learning_rate": 4.5201113202192056e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 155806913.0, "step": 45785 }, { "entropy": 0.07158723613247275, "epoch": 10.673738197925166, "grad_norm": 0.0203857421875, "learning_rate": 4.5199910828767916e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999862253665924, "num_tokens": 155828497.0, "step": 45790 }, { "entropy": 0.06955606564879417, "epoch": 10.674903834945798, "grad_norm": 0.0181884765625, "learning_rate": 4.519870834052652e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 155838451.0, "step": 45795 }, { "entropy": 0.04313288903795183, "epoch": 10.67606947196643, "grad_norm": 0.09033203125, "learning_rate": 4.519750573748581e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 155858054.0, "step": 45800 }, { "entropy": 0.05261556897312403, "epoch": 10.67723510898706, "grad_norm": 0.078125, "learning_rate": 4.5196303019663715e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 155885122.0, "step": 45805 }, { "entropy": 0.055707470141351224, "epoch": 10.678400746007693, "grad_norm": 0.0277099609375, "learning_rate": 4.519510018707817e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 155905105.0, "step": 45810 }, { "entropy": 0.05604796879924834, "epoch": 10.679566383028325, "grad_norm": 0.0225830078125, "learning_rate": 4.519389723974709e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999892711639404, "num_tokens": 155939506.0, "step": 45815 }, { "entropy": 0.06491890689358115, "epoch": 10.680732020048957, "grad_norm": 0.0283203125, "learning_rate": 4.519269417768844e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 155959895.0, "step": 45820 }, { "entropy": 0.0519371272996068, "epoch": 10.68189765706959, "grad_norm": 0.01348876953125, "learning_rate": 4.519149100092015e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999669194221497, "num_tokens": 155983069.0, "step": 45825 }, { "entropy": 0.05116039318963885, "epoch": 10.68306329409022, "grad_norm": 0.01495361328125, "learning_rate": 4.519028770946016e-05, "loss": 0.0017, "mean_token_accuracy": 0.9999583482742309, "num_tokens": 156014111.0, "step": 45830 }, { "entropy": 0.04381315428763628, "epoch": 10.684228931110852, "grad_norm": 0.021484375, "learning_rate": 4.518908430332641e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 156031814.0, "step": 45835 }, { "entropy": 0.09437487740069628, "epoch": 10.685394568131484, "grad_norm": 0.0244140625, "learning_rate": 4.518788078253685e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 156042372.0, "step": 45840 }, { "entropy": 0.05893740877509117, "epoch": 10.686560205152116, "grad_norm": 0.0235595703125, "learning_rate": 4.5186677147109426e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 156054300.0, "step": 45845 }, { "entropy": 0.03940586084499955, "epoch": 10.687725842172748, "grad_norm": 0.04296875, "learning_rate": 4.518547339706208e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 156081050.0, "step": 45850 }, { "entropy": 0.05552921891212463, "epoch": 10.68889147919338, "grad_norm": 0.1005859375, "learning_rate": 4.518426953241277e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 156090944.0, "step": 45855 }, { "entropy": 0.03403740664944053, "epoch": 10.69005711621401, "grad_norm": 0.0233154296875, "learning_rate": 4.518306555317944e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 156125244.0, "step": 45860 }, { "entropy": 0.051710548158735034, "epoch": 10.691222753234642, "grad_norm": 0.041015625, "learning_rate": 4.518186145938005e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 156152733.0, "step": 45865 }, { "entropy": 0.046452429797500375, "epoch": 10.692388390255275, "grad_norm": 0.0184326171875, "learning_rate": 4.518065725103255e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 156176237.0, "step": 45870 }, { "entropy": 0.05762659339234233, "epoch": 10.693554027275907, "grad_norm": 0.01446533203125, "learning_rate": 4.51794529281549e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999857604503631, "num_tokens": 156203627.0, "step": 45875 }, { "entropy": 0.05502113308757543, "epoch": 10.694719664296539, "grad_norm": 0.0296630859375, "learning_rate": 4.5178248490765054e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 156220487.0, "step": 45880 }, { "entropy": 0.04440567507408559, "epoch": 10.695885301317169, "grad_norm": 0.07763671875, "learning_rate": 4.517704393888097e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999863386154175, "num_tokens": 156255165.0, "step": 45885 }, { "entropy": 0.038073526509106156, "epoch": 10.697050938337801, "grad_norm": 0.012939453125, "learning_rate": 4.517583927252062e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999889850616455, "num_tokens": 156292113.0, "step": 45890 }, { "entropy": 0.0740324473939836, "epoch": 10.698216575358433, "grad_norm": 0.03759765625, "learning_rate": 4.517463449170196e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 156304723.0, "step": 45895 }, { "entropy": 0.09565671551972628, "epoch": 10.699382212379065, "grad_norm": 0.05615234375, "learning_rate": 4.5173429596442955e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 156322150.0, "step": 45900 }, { "entropy": 0.06854218691587448, "epoch": 10.700547849399697, "grad_norm": 0.0252685546875, "learning_rate": 4.517222458676158e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 156333929.0, "step": 45905 }, { "entropy": 0.054353891499340536, "epoch": 10.701713486420328, "grad_norm": 0.04736328125, "learning_rate": 4.517101946267579e-05, "loss": 0.0007, "mean_token_accuracy": 0.9997032642364502, "num_tokens": 156347801.0, "step": 45910 }, { "entropy": 0.05088144233450294, "epoch": 10.70287912344096, "grad_norm": 0.0147705078125, "learning_rate": 4.516981422420356e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999614655971527, "num_tokens": 156362209.0, "step": 45915 }, { "entropy": 0.064677076600492, "epoch": 10.704044760461592, "grad_norm": 0.016357421875, "learning_rate": 4.516860887136287e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 156377040.0, "step": 45920 }, { "entropy": 0.050365370139479636, "epoch": 10.705210397482224, "grad_norm": 0.04443359375, "learning_rate": 4.516740340417168e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 156399740.0, "step": 45925 }, { "entropy": 0.04361268552020192, "epoch": 10.706376034502856, "grad_norm": 0.349609375, "learning_rate": 4.516619782264798e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999781847000122, "num_tokens": 156421815.0, "step": 45930 }, { "entropy": 0.0545174271799624, "epoch": 10.707541671523488, "grad_norm": 0.181640625, "learning_rate": 4.516499212680974e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999893307685852, "num_tokens": 156442944.0, "step": 45935 }, { "entropy": 0.04827831089496613, "epoch": 10.708707308544119, "grad_norm": 0.024658203125, "learning_rate": 4.5163786316674934e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 156471993.0, "step": 45940 }, { "entropy": 0.05747170811519027, "epoch": 10.70987294556475, "grad_norm": 0.053466796875, "learning_rate": 4.516258039226155e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 156488309.0, "step": 45945 }, { "entropy": 0.052263769414275886, "epoch": 10.711038582585383, "grad_norm": 0.05126953125, "learning_rate": 4.516137435358757e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 156520541.0, "step": 45950 }, { "entropy": 0.05521758422255516, "epoch": 10.712204219606015, "grad_norm": 0.052734375, "learning_rate": 4.516016820067096e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 156554999.0, "step": 45955 }, { "entropy": 0.07212651148438454, "epoch": 10.713369856626647, "grad_norm": 0.1484375, "learning_rate": 4.515896193352974e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 156566072.0, "step": 45960 }, { "entropy": 0.05750099988654256, "epoch": 10.714535493647277, "grad_norm": 0.029296875, "learning_rate": 4.515775555218187e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 156578638.0, "step": 45965 }, { "entropy": 0.06050075925886631, "epoch": 10.71570113066791, "grad_norm": 0.07177734375, "learning_rate": 4.515654905664535e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 156605760.0, "step": 45970 }, { "entropy": 0.05188793493434787, "epoch": 10.716866767688542, "grad_norm": 0.01214599609375, "learning_rate": 4.515534244693816e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999775767326355, "num_tokens": 156631387.0, "step": 45975 }, { "entropy": 0.06129320347681642, "epoch": 10.718032404709174, "grad_norm": 0.052001953125, "learning_rate": 4.5154135723078306e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 156650853.0, "step": 45980 }, { "entropy": 0.07202781811356544, "epoch": 10.719198041729806, "grad_norm": 0.01361083984375, "learning_rate": 4.515292888508377e-05, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 156672371.0, "step": 45985 }, { "entropy": 0.05794037450104952, "epoch": 10.720363678750438, "grad_norm": 0.03662109375, "learning_rate": 4.515172193297256e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 156693221.0, "step": 45990 }, { "entropy": 0.06433992027305066, "epoch": 10.721529315771068, "grad_norm": 0.1474609375, "learning_rate": 4.515051486676266e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999689877033233, "num_tokens": 156717306.0, "step": 45995 }, { "entropy": 0.05043230140581727, "epoch": 10.7226949527917, "grad_norm": 0.0281982421875, "learning_rate": 4.514930768647209e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999823093414306, "num_tokens": 156735116.0, "step": 46000 }, { "entropy": 0.053006827272474764, "epoch": 10.723860589812332, "grad_norm": 0.0216064453125, "learning_rate": 4.514810039211882e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 156760057.0, "step": 46005 }, { "entropy": 0.04849845627322793, "epoch": 10.725026226832965, "grad_norm": 0.130859375, "learning_rate": 4.514689298372088e-05, "loss": 0.0002, "mean_token_accuracy": 0.9998394846916199, "num_tokens": 156786045.0, "step": 46010 }, { "entropy": 0.07077356418594719, "epoch": 10.726191863853597, "grad_norm": 0.0289306640625, "learning_rate": 4.514568546129626e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999822556972504, "num_tokens": 156819310.0, "step": 46015 }, { "entropy": 0.06431170925498009, "epoch": 10.727357500874227, "grad_norm": 0.392578125, "learning_rate": 4.5144477824862976e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999893486499787, "num_tokens": 156843448.0, "step": 46020 }, { "entropy": 0.059689549077302215, "epoch": 10.72852313789486, "grad_norm": 0.423828125, "learning_rate": 4.514327007443903e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 156860177.0, "step": 46025 }, { "entropy": 0.05863263411447406, "epoch": 10.729688774915491, "grad_norm": 0.0791015625, "learning_rate": 4.514206221004242e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999895572662354, "num_tokens": 156882655.0, "step": 46030 }, { "entropy": 0.06030096784234047, "epoch": 10.730854411936123, "grad_norm": 0.048095703125, "learning_rate": 4.514085423169118e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999851763248444, "num_tokens": 156905529.0, "step": 46035 }, { "entropy": 0.06778318686410785, "epoch": 10.732020048956755, "grad_norm": 0.0546875, "learning_rate": 4.51396461394033e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 156928035.0, "step": 46040 }, { "entropy": 0.04988482487387955, "epoch": 10.733185685977386, "grad_norm": 0.18359375, "learning_rate": 4.513843793319681e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999284684658051, "num_tokens": 156978769.0, "step": 46045 }, { "entropy": 0.053737777099013326, "epoch": 10.734351322998018, "grad_norm": 0.050048828125, "learning_rate": 4.513722961308973e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999889194965362, "num_tokens": 157000562.0, "step": 46050 }, { "entropy": 0.0412151537835598, "epoch": 10.73551696001865, "grad_norm": 0.12060546875, "learning_rate": 4.5136021179100055e-05, "loss": 0.0006, "mean_token_accuracy": 1.0, "num_tokens": 157024160.0, "step": 46055 }, { "entropy": 0.04722663722932339, "epoch": 10.736682597039282, "grad_norm": 1.1640625, "learning_rate": 4.513481263124583e-05, "loss": 0.0012, "mean_token_accuracy": 0.9996932506561279, "num_tokens": 157035689.0, "step": 46060 }, { "entropy": 0.07388071976602077, "epoch": 10.737848234059914, "grad_norm": 0.0361328125, "learning_rate": 4.5133603969545054e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 157047511.0, "step": 46065 }, { "entropy": 0.05361750088632107, "epoch": 10.739013871080546, "grad_norm": 0.1484375, "learning_rate": 4.513239519401578e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999772608280182, "num_tokens": 157083063.0, "step": 46070 }, { "entropy": 0.07026615431532264, "epoch": 10.740179508101177, "grad_norm": 0.03271484375, "learning_rate": 4.513118630467599e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 157099612.0, "step": 46075 }, { "entropy": 0.05888519417494535, "epoch": 10.741345145121809, "grad_norm": 0.02197265625, "learning_rate": 4.512997730154375e-05, "loss": 0.0162, "mean_token_accuracy": 0.9984605610370636, "num_tokens": 157139175.0, "step": 46080 }, { "entropy": 0.0555834517814219, "epoch": 10.74251078214244, "grad_norm": 0.013916015625, "learning_rate": 4.512876818463707e-05, "loss": 0.0004, "mean_token_accuracy": 0.999962842464447, "num_tokens": 157160147.0, "step": 46085 }, { "entropy": 0.0965851410292089, "epoch": 10.743676419163073, "grad_norm": 0.021484375, "learning_rate": 4.5127558953973984e-05, "loss": 0.0307, "mean_token_accuracy": 0.9936910271644592, "num_tokens": 157198608.0, "step": 46090 }, { "entropy": 0.04910420798696578, "epoch": 10.744842056183705, "grad_norm": 0.07080078125, "learning_rate": 4.5126349609572515e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 157222442.0, "step": 46095 }, { "entropy": 0.04693204928189516, "epoch": 10.746007693204335, "grad_norm": 0.025146484375, "learning_rate": 4.5125140151450706e-05, "loss": 0.0004, "mean_token_accuracy": 0.9997084558010101, "num_tokens": 157239643.0, "step": 46100 }, { "entropy": 0.06487049106508494, "epoch": 10.747173330224967, "grad_norm": 0.029052734375, "learning_rate": 4.512393057962659e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 157254305.0, "step": 46105 }, { "entropy": 0.07090209368616343, "epoch": 10.7483389672456, "grad_norm": 0.01031494140625, "learning_rate": 4.5122720894118196e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 157263233.0, "step": 46110 }, { "entropy": 0.0584601731505245, "epoch": 10.749504604266232, "grad_norm": 0.044189453125, "learning_rate": 4.512151109494357e-05, "loss": 0.0012, "mean_token_accuracy": 0.9993981122970581, "num_tokens": 157292283.0, "step": 46115 }, { "entropy": 0.07151546142995358, "epoch": 10.750670241286864, "grad_norm": 0.119140625, "learning_rate": 4.512030118212076e-05, "loss": 0.0003, "mean_token_accuracy": 0.999989241361618, "num_tokens": 157312137.0, "step": 46120 }, { "entropy": 0.05258299568668008, "epoch": 10.751835878307496, "grad_norm": 0.0218505859375, "learning_rate": 4.511909115566779e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 157331948.0, "step": 46125 }, { "entropy": 0.058688655495643616, "epoch": 10.753001515328126, "grad_norm": 0.0224609375, "learning_rate": 4.511788101560271e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 157352978.0, "step": 46130 }, { "entropy": 0.05229517556726933, "epoch": 10.754167152348758, "grad_norm": 0.0194091796875, "learning_rate": 4.511667076194357e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 157364802.0, "step": 46135 }, { "entropy": 0.060772106423974036, "epoch": 10.75533278936939, "grad_norm": 0.1103515625, "learning_rate": 4.511546039470841e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 157380506.0, "step": 46140 }, { "entropy": 0.07069172933697701, "epoch": 10.756498426390023, "grad_norm": 0.2109375, "learning_rate": 4.511424991391528e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 157391149.0, "step": 46145 }, { "entropy": 0.048912717308849094, "epoch": 10.757664063410655, "grad_norm": 0.020751953125, "learning_rate": 4.511303931958224e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 157414839.0, "step": 46150 }, { "entropy": 0.05329639483243227, "epoch": 10.758829700431285, "grad_norm": 0.033935546875, "learning_rate": 4.5111828611727324e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 157431406.0, "step": 46155 }, { "entropy": 0.06239355998113751, "epoch": 10.759995337451917, "grad_norm": 0.28515625, "learning_rate": 4.51106177903686e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999683320522308, "num_tokens": 157458439.0, "step": 46160 }, { "entropy": 0.04378145812079311, "epoch": 10.76116097447255, "grad_norm": 0.0283203125, "learning_rate": 4.510940685552411e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 157486671.0, "step": 46165 }, { "entropy": 0.06428087167441845, "epoch": 10.762326611493181, "grad_norm": 0.025390625, "learning_rate": 4.5108195807211925e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 157496563.0, "step": 46170 }, { "entropy": 0.048792037460952994, "epoch": 10.763492248513813, "grad_norm": 0.0196533203125, "learning_rate": 4.510698464545009e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 157518160.0, "step": 46175 }, { "entropy": 0.05703177060931921, "epoch": 10.764657885534444, "grad_norm": 0.030029296875, "learning_rate": 4.510577337025668e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 157536400.0, "step": 46180 }, { "entropy": 0.058706553932279346, "epoch": 10.765823522555076, "grad_norm": 0.027587890625, "learning_rate": 4.5104561981649754e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 157557203.0, "step": 46185 }, { "entropy": 0.04635893451049924, "epoch": 10.766989159575708, "grad_norm": 0.03515625, "learning_rate": 4.510335047964736e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 157588463.0, "step": 46190 }, { "entropy": 0.05046179071068764, "epoch": 10.76815479659634, "grad_norm": 0.050048828125, "learning_rate": 4.510213886426758e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 157622759.0, "step": 46195 }, { "entropy": 0.06388626941479743, "epoch": 10.769320433616972, "grad_norm": 0.05224609375, "learning_rate": 4.510092713552847e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 157640856.0, "step": 46200 }, { "entropy": 0.04669735683128238, "epoch": 10.770486070637604, "grad_norm": 0.0238037109375, "learning_rate": 4.5099715293448106e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 157668198.0, "step": 46205 }, { "entropy": 0.06003896193578839, "epoch": 10.771651707658235, "grad_norm": 0.671875, "learning_rate": 4.5098503338044564e-05, "loss": 0.001, "mean_token_accuracy": 0.9995143592357636, "num_tokens": 157698155.0, "step": 46210 }, { "entropy": 0.05959401056170464, "epoch": 10.772817344678867, "grad_norm": 0.058837890625, "learning_rate": 4.50972912693359e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 157713655.0, "step": 46215 }, { "entropy": 0.06324677541851997, "epoch": 10.773982981699499, "grad_norm": 0.0233154296875, "learning_rate": 4.5096079087340196e-05, "loss": 0.0013, "mean_token_accuracy": 0.999968820810318, "num_tokens": 157733745.0, "step": 46220 }, { "entropy": 0.08075557686388493, "epoch": 10.77514861872013, "grad_norm": 0.046142578125, "learning_rate": 4.509486679207553e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 157742039.0, "step": 46225 }, { "entropy": 0.058673990145325663, "epoch": 10.776314255740763, "grad_norm": 0.041259765625, "learning_rate": 4.509365438355998e-05, "loss": 0.0006, "mean_token_accuracy": 1.0, "num_tokens": 157760679.0, "step": 46230 }, { "entropy": 0.06012842683121562, "epoch": 10.777479892761393, "grad_norm": 0.01239013671875, "learning_rate": 4.509244186181162e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 157780551.0, "step": 46235 }, { "entropy": 0.05824370728805661, "epoch": 10.778645529782025, "grad_norm": 0.052734375, "learning_rate": 4.509122922684853e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 157804333.0, "step": 46240 }, { "entropy": 0.05847919774241746, "epoch": 10.779811166802657, "grad_norm": 0.039306640625, "learning_rate": 4.50900164786888e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999836504459381, "num_tokens": 157819699.0, "step": 46245 }, { "entropy": 0.059690111130475995, "epoch": 10.78097680382329, "grad_norm": 0.03271484375, "learning_rate": 4.50888036173505e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 157832701.0, "step": 46250 }, { "entropy": 0.0757465548813343, "epoch": 10.782142440843922, "grad_norm": 0.380859375, "learning_rate": 4.508759064285173e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999886095523834, "num_tokens": 157851007.0, "step": 46255 }, { "entropy": 0.055960895121097566, "epoch": 10.783308077864554, "grad_norm": 0.1220703125, "learning_rate": 4.508637755521057e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999889612197876, "num_tokens": 157883389.0, "step": 46260 }, { "entropy": 0.05103008281439543, "epoch": 10.784473714885184, "grad_norm": 0.12060546875, "learning_rate": 4.5085164354445106e-05, "loss": 0.0001, "mean_token_accuracy": 0.9999893248081207, "num_tokens": 157905917.0, "step": 46265 }, { "entropy": 0.07026658989489079, "epoch": 10.785639351905816, "grad_norm": 0.0322265625, "learning_rate": 4.508395104057344e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 157926464.0, "step": 46270 }, { "entropy": 0.06333841476589441, "epoch": 10.786804988926448, "grad_norm": 0.038330078125, "learning_rate": 4.508273761361365e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999777853488923, "num_tokens": 157949618.0, "step": 46275 }, { "entropy": 0.04195570405572653, "epoch": 10.78797062594708, "grad_norm": 0.0208740234375, "learning_rate": 4.508152407358384e-05, "loss": 0.0009, "mean_token_accuracy": 1.0, "num_tokens": 158003139.0, "step": 46280 }, { "entropy": 0.06566681675612926, "epoch": 10.789136262967713, "grad_norm": 0.0439453125, "learning_rate": 4.5080310420502104e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 158012765.0, "step": 46285 }, { "entropy": 0.05819013142026961, "epoch": 10.790301899988343, "grad_norm": 0.0625, "learning_rate": 4.5079096654386534e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999180316925049, "num_tokens": 158041200.0, "step": 46290 }, { "entropy": 0.07342990711331368, "epoch": 10.791467537008975, "grad_norm": 0.0301513671875, "learning_rate": 4.5077882775255235e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999577939510346, "num_tokens": 158060295.0, "step": 46295 }, { "entropy": 0.04707777537405491, "epoch": 10.792633174029607, "grad_norm": 0.0830078125, "learning_rate": 4.5076668783126304e-05, "loss": 0.0004, "mean_token_accuracy": 0.99990394115448, "num_tokens": 158074145.0, "step": 46300 }, { "entropy": 0.04368063462898135, "epoch": 10.79379881105024, "grad_norm": 0.06689453125, "learning_rate": 4.507545467801785e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 158102757.0, "step": 46305 }, { "entropy": 0.0559994793497026, "epoch": 10.794964448070871, "grad_norm": 0.31640625, "learning_rate": 4.507424045994797e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999407052993774, "num_tokens": 158121960.0, "step": 46310 }, { "entropy": 0.06052645109593868, "epoch": 10.796130085091502, "grad_norm": 0.03955078125, "learning_rate": 4.507302612893478e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 158130632.0, "step": 46315 }, { "entropy": 0.05715808067470789, "epoch": 10.797295722112134, "grad_norm": 0.03515625, "learning_rate": 4.5071811684996365e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 158155311.0, "step": 46320 }, { "entropy": 0.06015035463497043, "epoch": 10.798461359132766, "grad_norm": 0.040771484375, "learning_rate": 4.507059712815086e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 158171695.0, "step": 46325 }, { "entropy": 0.06758764693513512, "epoch": 10.799626996153398, "grad_norm": 0.057861328125, "learning_rate": 4.506938245841636e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999662697315216, "num_tokens": 158196574.0, "step": 46330 }, { "entropy": 0.0585243116132915, "epoch": 10.80079263317403, "grad_norm": 0.04296875, "learning_rate": 4.506816767581099e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 158220399.0, "step": 46335 }, { "entropy": 0.054940407443791625, "epoch": 10.801958270194662, "grad_norm": 0.34765625, "learning_rate": 4.506695278035285e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 158244022.0, "step": 46340 }, { "entropy": 0.05187456281855703, "epoch": 10.803123907215292, "grad_norm": 0.138671875, "learning_rate": 4.506573777206006e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 158264788.0, "step": 46345 }, { "entropy": 0.07387198638170958, "epoch": 10.804289544235925, "grad_norm": 0.06640625, "learning_rate": 4.5064522650950745e-05, "loss": 0.0003, "mean_token_accuracy": 0.999989116191864, "num_tokens": 158285073.0, "step": 46350 }, { "entropy": 0.04432422863319516, "epoch": 10.805455181256557, "grad_norm": 0.01904296875, "learning_rate": 4.506330741704302e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999845564365387, "num_tokens": 158312097.0, "step": 46355 }, { "entropy": 0.047852301597595216, "epoch": 10.806620818277189, "grad_norm": 0.0181884765625, "learning_rate": 4.506209207035501e-05, "loss": 0.0008, "mean_token_accuracy": 1.0, "num_tokens": 158334327.0, "step": 46360 }, { "entropy": 0.0673111722804606, "epoch": 10.807786455297821, "grad_norm": 0.07470703125, "learning_rate": 4.506087661090483e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 158349202.0, "step": 46365 }, { "entropy": 0.043902119528502224, "epoch": 10.808952092318451, "grad_norm": 0.0225830078125, "learning_rate": 4.50596610387106e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999886274337768, "num_tokens": 158379655.0, "step": 46370 }, { "entropy": 0.0513251107186079, "epoch": 10.810117729339083, "grad_norm": 0.18359375, "learning_rate": 4.505844535379046e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999780595302582, "num_tokens": 158404138.0, "step": 46375 }, { "entropy": 0.07172129093669355, "epoch": 10.811283366359715, "grad_norm": 0.025634765625, "learning_rate": 4.505722955616254e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 158423565.0, "step": 46380 }, { "entropy": 0.048972241766750815, "epoch": 10.812449003380348, "grad_norm": 0.047607421875, "learning_rate": 4.505601364584495e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999634981155395, "num_tokens": 158450117.0, "step": 46385 }, { "entropy": 0.05351090729236603, "epoch": 10.81361464040098, "grad_norm": 0.01275634765625, "learning_rate": 4.5054797622855825e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 158465495.0, "step": 46390 }, { "entropy": 0.039100353326648477, "epoch": 10.814780277421612, "grad_norm": 0.267578125, "learning_rate": 4.505358148721332e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999880850315094, "num_tokens": 158501337.0, "step": 46395 }, { "entropy": 0.04310502801090479, "epoch": 10.815945914442242, "grad_norm": 0.028564453125, "learning_rate": 4.505236523893554e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 158520456.0, "step": 46400 }, { "entropy": 0.05818876605480909, "epoch": 10.817111551462874, "grad_norm": 0.03759765625, "learning_rate": 4.5051148878040646e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 158533301.0, "step": 46405 }, { "entropy": 0.074424147605896, "epoch": 10.818277188483506, "grad_norm": 0.0118408203125, "learning_rate": 4.5049932404546755e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 158550858.0, "step": 46410 }, { "entropy": 0.07016548365354539, "epoch": 10.819442825504138, "grad_norm": 0.1328125, "learning_rate": 4.504871581847202e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 158561725.0, "step": 46415 }, { "entropy": 0.05289169065654278, "epoch": 10.82060846252477, "grad_norm": 0.0196533203125, "learning_rate": 4.504749911983458e-05, "loss": 0.0059, "mean_token_accuracy": 0.9993242502212525, "num_tokens": 158586226.0, "step": 46420 }, { "entropy": 0.042893216293305156, "epoch": 10.8217740995454, "grad_norm": 0.0164794921875, "learning_rate": 4.504628230865258e-05, "loss": 0.0035, "mean_token_accuracy": 0.9993669390678406, "num_tokens": 158615230.0, "step": 46425 }, { "entropy": 0.06487907837145031, "epoch": 10.822939736566033, "grad_norm": 0.134765625, "learning_rate": 4.504506538494415e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999787032604217, "num_tokens": 158637921.0, "step": 46430 }, { "entropy": 0.07466004192829132, "epoch": 10.824105373586665, "grad_norm": 2.453125, "learning_rate": 4.504384834872745e-05, "loss": 0.0014, "mean_token_accuracy": 0.9998039186000824, "num_tokens": 158648773.0, "step": 46435 }, { "entropy": 0.05474613988772035, "epoch": 10.825271010607297, "grad_norm": 0.01953125, "learning_rate": 4.504263120002063e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 158677327.0, "step": 46440 }, { "entropy": 0.07186794616281986, "epoch": 10.82643664762793, "grad_norm": 0.01177978515625, "learning_rate": 4.504141393884183e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 158689482.0, "step": 46445 }, { "entropy": 0.06628307662904262, "epoch": 10.82760228464856, "grad_norm": 0.28515625, "learning_rate": 4.50401965652092e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 158701872.0, "step": 46450 }, { "entropy": 0.06336401142179966, "epoch": 10.828767921669192, "grad_norm": 0.029052734375, "learning_rate": 4.503897907914091e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 158715592.0, "step": 46455 }, { "entropy": 0.06207911539822817, "epoch": 10.829933558689824, "grad_norm": 0.029296875, "learning_rate": 4.503776148065509e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 158732624.0, "step": 46460 }, { "entropy": 0.0868097135797143, "epoch": 10.831099195710456, "grad_norm": 0.029296875, "learning_rate": 4.503654376976992e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 158743880.0, "step": 46465 }, { "entropy": 0.0856756535358727, "epoch": 10.832264832731088, "grad_norm": 0.018798828125, "learning_rate": 4.503532594650355e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 158757618.0, "step": 46470 }, { "entropy": 0.055490178242325786, "epoch": 10.83343046975172, "grad_norm": 0.03564453125, "learning_rate": 4.503410801087412e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 158772979.0, "step": 46475 }, { "entropy": 0.05386331751942634, "epoch": 10.83459610677235, "grad_norm": 0.029296875, "learning_rate": 4.503288996289982e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 158787215.0, "step": 46480 }, { "entropy": 0.04529502475634217, "epoch": 10.835761743792983, "grad_norm": 0.01336669921875, "learning_rate": 4.50316718025988e-05, "loss": 0.0008, "mean_token_accuracy": 0.9998682498931885, "num_tokens": 158803337.0, "step": 46485 }, { "entropy": 0.055859196837991475, "epoch": 10.836927380813615, "grad_norm": 0.02001953125, "learning_rate": 4.503045352998922e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 158818801.0, "step": 46490 }, { "entropy": 0.050908899027854206, "epoch": 10.838093017834247, "grad_norm": 0.00799560546875, "learning_rate": 4.502923514508926e-05, "loss": 0.0003, "mean_token_accuracy": 0.999989241361618, "num_tokens": 158840131.0, "step": 46495 }, { "entropy": 0.04821221772581339, "epoch": 10.839258654854879, "grad_norm": 0.024169921875, "learning_rate": 4.5028016647917064e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 158854498.0, "step": 46500 }, { "entropy": 0.06585240634158254, "epoch": 10.84042429187551, "grad_norm": 0.037109375, "learning_rate": 4.5026798038490826e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 158867837.0, "step": 46505 }, { "entropy": 0.051115707214921716, "epoch": 10.841589928896141, "grad_norm": 0.0201416015625, "learning_rate": 4.502557931682872e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 158882615.0, "step": 46510 }, { "entropy": 0.0701981533318758, "epoch": 10.842755565916773, "grad_norm": 0.014892578125, "learning_rate": 4.5024360482948885e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 158891294.0, "step": 46515 }, { "entropy": 0.054075379855930805, "epoch": 10.843921202937405, "grad_norm": 0.2353515625, "learning_rate": 4.502314153686953e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 158908722.0, "step": 46520 }, { "entropy": 0.059608714934438464, "epoch": 10.845086839958038, "grad_norm": 0.0146484375, "learning_rate": 4.502192247860882e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 158925185.0, "step": 46525 }, { "entropy": 0.051519249100238085, "epoch": 10.84625247697867, "grad_norm": 0.049072265625, "learning_rate": 4.502070330818493e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999891877174377, "num_tokens": 158944524.0, "step": 46530 }, { "entropy": 0.06343580950051546, "epoch": 10.8474181139993, "grad_norm": 0.07958984375, "learning_rate": 4.501948402561604e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 158957367.0, "step": 46535 }, { "entropy": 0.06394379865378141, "epoch": 10.848583751019932, "grad_norm": 0.030029296875, "learning_rate": 4.5018264630920335e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999693334102631, "num_tokens": 158977627.0, "step": 46540 }, { "entropy": 0.056028542760759595, "epoch": 10.849749388040564, "grad_norm": 0.0164794921875, "learning_rate": 4.5017045124116e-05, "loss": 0.0003, "mean_token_accuracy": 0.999985671043396, "num_tokens": 158994400.0, "step": 46545 }, { "entropy": 0.0522186104208231, "epoch": 10.850915025061196, "grad_norm": 0.134765625, "learning_rate": 4.501582550522121e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999893844127655, "num_tokens": 159015220.0, "step": 46550 }, { "entropy": 0.0532087117433548, "epoch": 10.852080662081828, "grad_norm": 0.029296875, "learning_rate": 4.5014605774254157e-05, "loss": 0.0002, "mean_token_accuracy": 0.999976909160614, "num_tokens": 159043131.0, "step": 46555 }, { "entropy": 0.0711324105039239, "epoch": 10.853246299102459, "grad_norm": 0.2255859375, "learning_rate": 4.5013385931233034e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999675273895263, "num_tokens": 159059578.0, "step": 46560 }, { "entropy": 0.0600858336314559, "epoch": 10.85441193612309, "grad_norm": 0.02783203125, "learning_rate": 4.501216597617602e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 159077354.0, "step": 46565 }, { "entropy": 0.05426020985469222, "epoch": 10.855577573143723, "grad_norm": 0.0283203125, "learning_rate": 4.501094590910132e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 159098793.0, "step": 46570 }, { "entropy": 0.054496968165040016, "epoch": 10.856743210164355, "grad_norm": 0.130859375, "learning_rate": 4.5009725730027115e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999775528907776, "num_tokens": 159135013.0, "step": 46575 }, { "entropy": 0.05475781839340925, "epoch": 10.857908847184987, "grad_norm": 0.05908203125, "learning_rate": 4.500850543897161e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999852359294892, "num_tokens": 159152297.0, "step": 46580 }, { "entropy": 0.05325304474681616, "epoch": 10.859074484205617, "grad_norm": 0.314453125, "learning_rate": 4.500728503595298e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 159172531.0, "step": 46585 }, { "entropy": 0.15666724601760507, "epoch": 10.86024012122625, "grad_norm": 0.01708984375, "learning_rate": 4.500606452098945e-05, "loss": 0.2038, "mean_token_accuracy": 0.9648593962192535, "num_tokens": 159195114.0, "step": 46590 }, { "entropy": 0.062145916000008586, "epoch": 10.861405758246882, "grad_norm": 0.07958984375, "learning_rate": 4.500484389409921e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 159214629.0, "step": 46595 }, { "entropy": 0.06958608105778694, "epoch": 10.862571395267514, "grad_norm": 0.037841796875, "learning_rate": 4.5003623155300455e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999890923500061, "num_tokens": 159239052.0, "step": 46600 }, { "entropy": 0.05691871186718345, "epoch": 10.863737032288146, "grad_norm": 0.031494140625, "learning_rate": 4.50024023046114e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 159266360.0, "step": 46605 }, { "entropy": 0.04313572673127055, "epoch": 10.864902669308778, "grad_norm": 0.058349609375, "learning_rate": 4.500118134205023e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999039709568024, "num_tokens": 159287232.0, "step": 46610 }, { "entropy": 0.06896574310958385, "epoch": 10.866068306329408, "grad_norm": 0.02978515625, "learning_rate": 4.499996026763517e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 159296067.0, "step": 46615 }, { "entropy": 0.05110810687765479, "epoch": 10.86723394335004, "grad_norm": 0.0361328125, "learning_rate": 4.499873908138442e-05, "loss": 0.0061, "mean_token_accuracy": 0.9997102320194244, "num_tokens": 159324520.0, "step": 46620 }, { "entropy": 0.050146241392940286, "epoch": 10.868399580370673, "grad_norm": 0.076171875, "learning_rate": 4.49975177833162e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 159342964.0, "step": 46625 }, { "entropy": 0.049869694840162994, "epoch": 10.869565217391305, "grad_norm": 0.02490234375, "learning_rate": 4.4996296373448706e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 159376393.0, "step": 46630 }, { "entropy": 0.055807786621153356, "epoch": 10.870730854411937, "grad_norm": 0.01806640625, "learning_rate": 4.499507485180016e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 159391769.0, "step": 46635 }, { "entropy": 0.044060825975611805, "epoch": 10.871896491432567, "grad_norm": 0.034423828125, "learning_rate": 4.499385321838877e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 159425953.0, "step": 46640 }, { "entropy": 0.051666715648025274, "epoch": 10.8730621284532, "grad_norm": 0.0673828125, "learning_rate": 4.499263147323276e-05, "loss": 0.0005, "mean_token_accuracy": 0.9997613370418549, "num_tokens": 159443888.0, "step": 46645 }, { "entropy": 0.06478715725243092, "epoch": 10.874227765473831, "grad_norm": 0.09912109375, "learning_rate": 4.499140961635035e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999750196933747, "num_tokens": 159466030.0, "step": 46650 }, { "entropy": 0.05626236498355865, "epoch": 10.875393402494463, "grad_norm": 0.026123046875, "learning_rate": 4.499018764775975e-05, "loss": 0.0002, "mean_token_accuracy": 0.999988055229187, "num_tokens": 159495164.0, "step": 46655 }, { "entropy": 0.06515727676451206, "epoch": 10.876559039515096, "grad_norm": 0.03466796875, "learning_rate": 4.4988965567479186e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 159512208.0, "step": 46660 }, { "entropy": 0.0587312781251967, "epoch": 10.877724676535728, "grad_norm": 0.048095703125, "learning_rate": 4.498774337552688e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 159532669.0, "step": 46665 }, { "entropy": 0.0911343522835523, "epoch": 10.878890313556358, "grad_norm": 0.016845703125, "learning_rate": 4.4986521071921064e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 159547697.0, "step": 46670 }, { "entropy": 0.09437187728472055, "epoch": 10.88005595057699, "grad_norm": 0.0279541015625, "learning_rate": 4.498529865667995e-05, "loss": 0.0334, "mean_token_accuracy": 0.9862868785858154, "num_tokens": 159574345.0, "step": 46675 }, { "entropy": 0.056803678441792724, "epoch": 10.881221587597622, "grad_norm": 0.111328125, "learning_rate": 4.498407612982178e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 159597605.0, "step": 46680 }, { "entropy": 0.0474022360984236, "epoch": 10.882387224618254, "grad_norm": 0.146484375, "learning_rate": 4.4982853491364786e-05, "loss": 0.0012, "mean_token_accuracy": 0.999748706817627, "num_tokens": 159634069.0, "step": 46685 }, { "entropy": 0.062248018197715284, "epoch": 10.883552861638886, "grad_norm": 0.0177001953125, "learning_rate": 4.498163074132718e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 159649192.0, "step": 46690 }, { "entropy": 0.05585647188127041, "epoch": 10.884718498659517, "grad_norm": 0.09716796875, "learning_rate": 4.498040787972721e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999870717525482, "num_tokens": 159669437.0, "step": 46695 }, { "entropy": 0.05814083656296134, "epoch": 10.885884135680149, "grad_norm": 0.02197265625, "learning_rate": 4.4979184906583105e-05, "loss": 0.0003, "mean_token_accuracy": 0.999438202381134, "num_tokens": 159690161.0, "step": 46700 }, { "entropy": 0.052232743427157405, "epoch": 10.887049772700781, "grad_norm": 0.01806640625, "learning_rate": 4.49779618219131e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 159705554.0, "step": 46705 }, { "entropy": 0.05564236463978887, "epoch": 10.888215409721413, "grad_norm": 0.0284423828125, "learning_rate": 4.497673862573545e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 159722870.0, "step": 46710 }, { "entropy": 0.050008386839181185, "epoch": 10.889381046742045, "grad_norm": 0.01416015625, "learning_rate": 4.497551531806837e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999751627445221, "num_tokens": 159748801.0, "step": 46715 }, { "entropy": 0.05154750719666481, "epoch": 10.890546683762675, "grad_norm": 0.06103515625, "learning_rate": 4.497429189893012e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 159770977.0, "step": 46720 }, { "entropy": 0.060008395742624995, "epoch": 10.891712320783308, "grad_norm": 0.0556640625, "learning_rate": 4.4973068368338935e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 159793862.0, "step": 46725 }, { "entropy": 0.04886681037023664, "epoch": 10.89287795780394, "grad_norm": 0.10302734375, "learning_rate": 4.4971844726313055e-05, "loss": 0.0005, "mean_token_accuracy": 0.9997793674468994, "num_tokens": 159821291.0, "step": 46730 }, { "entropy": 0.06569123174995184, "epoch": 10.894043594824572, "grad_norm": 0.01202392578125, "learning_rate": 4.497062097287074e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 159832940.0, "step": 46735 }, { "entropy": 0.0484948112629354, "epoch": 10.895209231845204, "grad_norm": 0.0242919921875, "learning_rate": 4.496939710803022e-05, "loss": 0.0002, "mean_token_accuracy": 0.999989491701126, "num_tokens": 159856860.0, "step": 46740 }, { "entropy": 0.06075437283143401, "epoch": 10.896374868865836, "grad_norm": 0.01416015625, "learning_rate": 4.496817313180976e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999761402606964, "num_tokens": 159888601.0, "step": 46745 }, { "entropy": 0.052485890313982965, "epoch": 10.897540505886466, "grad_norm": 0.0235595703125, "learning_rate": 4.49669490442276e-05, "loss": 0.0007, "mean_token_accuracy": 0.9999137938022613, "num_tokens": 159918599.0, "step": 46750 }, { "entropy": 0.043839930184185506, "epoch": 10.898706142907098, "grad_norm": 0.024658203125, "learning_rate": 4.496572484530201e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999860525131226, "num_tokens": 159943656.0, "step": 46755 }, { "entropy": 0.070591782592237, "epoch": 10.89987177992773, "grad_norm": 0.034423828125, "learning_rate": 4.4964500535051224e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 159955076.0, "step": 46760 }, { "entropy": 0.05396025264635682, "epoch": 10.901037416948363, "grad_norm": 0.032470703125, "learning_rate": 4.4963276113493516e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999826550483704, "num_tokens": 159985512.0, "step": 46765 }, { "entropy": 0.039633904490619896, "epoch": 10.902203053968995, "grad_norm": 0.016845703125, "learning_rate": 4.496205158064713e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 160008250.0, "step": 46770 }, { "entropy": 0.06101240310817957, "epoch": 10.903368690989625, "grad_norm": 1.9453125, "learning_rate": 4.496082693653033e-05, "loss": 0.0013, "mean_token_accuracy": 0.9997867822647095, "num_tokens": 160022169.0, "step": 46775 }, { "entropy": 0.037630663625895976, "epoch": 10.904534328010257, "grad_norm": 0.2021484375, "learning_rate": 4.495960218116138e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999630093574524, "num_tokens": 160053029.0, "step": 46780 }, { "entropy": 0.049717345740646124, "epoch": 10.90569996503089, "grad_norm": 0.032958984375, "learning_rate": 4.495837731455854e-05, "loss": 0.0002, "mean_token_accuracy": 0.999989253282547, "num_tokens": 160077100.0, "step": 46785 }, { "entropy": 0.09623774792999029, "epoch": 10.906865602051521, "grad_norm": 0.00897216796875, "learning_rate": 4.495715233674008e-05, "loss": 0.0667, "mean_token_accuracy": 0.9903517365455627, "num_tokens": 160099240.0, "step": 46790 }, { "entropy": 0.06155298855155707, "epoch": 10.908031239072153, "grad_norm": 0.0281982421875, "learning_rate": 4.495592724772427e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999715268611908, "num_tokens": 160123176.0, "step": 46795 }, { "entropy": 0.047423532139509915, "epoch": 10.909196876092786, "grad_norm": 0.01092529296875, "learning_rate": 4.4954702047529354e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 160149639.0, "step": 46800 }, { "entropy": 0.05371604720130563, "epoch": 10.910362513113416, "grad_norm": 0.017822265625, "learning_rate": 4.4953476736173624e-05, "loss": 0.0008, "mean_token_accuracy": 0.9999548852443695, "num_tokens": 160173688.0, "step": 46805 }, { "entropy": 0.05411314023658633, "epoch": 10.911528150134048, "grad_norm": 0.058837890625, "learning_rate": 4.4952251313675354e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 160190111.0, "step": 46810 }, { "entropy": 0.10832246728241443, "epoch": 10.91269378715468, "grad_norm": 0.0186767578125, "learning_rate": 4.49510257800528e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 160200466.0, "step": 46815 }, { "entropy": 0.05945599777624011, "epoch": 10.913859424175312, "grad_norm": 0.036865234375, "learning_rate": 4.494980013532424e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 160216988.0, "step": 46820 }, { "entropy": 0.06891081742942333, "epoch": 10.915025061195944, "grad_norm": 0.06787109375, "learning_rate": 4.494857437950797e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 160232389.0, "step": 46825 }, { "entropy": 0.06697693895548582, "epoch": 10.916190698216575, "grad_norm": 0.0238037109375, "learning_rate": 4.494734851262224e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 160250277.0, "step": 46830 }, { "entropy": 0.0647424777969718, "epoch": 10.917356335237207, "grad_norm": 0.00958251953125, "learning_rate": 4.494612253468534e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 160265324.0, "step": 46835 }, { "entropy": 0.06983225960284471, "epoch": 10.918521972257839, "grad_norm": 0.050537109375, "learning_rate": 4.494489644571556e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 160280959.0, "step": 46840 }, { "entropy": 0.05160042904317379, "epoch": 10.919687609278471, "grad_norm": 0.064453125, "learning_rate": 4.4943670245731176e-05, "loss": 0.0007, "mean_token_accuracy": 0.9999881267547608, "num_tokens": 160305171.0, "step": 46845 }, { "entropy": 0.067512839846313, "epoch": 10.920853246299103, "grad_norm": 0.02685546875, "learning_rate": 4.494244393475047e-05, "loss": 0.0068, "mean_token_accuracy": 0.9995011687278748, "num_tokens": 160331333.0, "step": 46850 }, { "entropy": 0.05696789734065533, "epoch": 10.922018883319733, "grad_norm": 0.02783203125, "learning_rate": 4.4941217512791736e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999892592430115, "num_tokens": 160350125.0, "step": 46855 }, { "entropy": 0.05613101264461875, "epoch": 10.923184520340365, "grad_norm": 0.1064453125, "learning_rate": 4.493999097987325e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 160366326.0, "step": 46860 }, { "entropy": 0.0436920034699142, "epoch": 10.924350157360998, "grad_norm": 0.07177734375, "learning_rate": 4.4938764336013314e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999226748943328, "num_tokens": 160405365.0, "step": 46865 }, { "entropy": 0.0488032216206193, "epoch": 10.92551579438163, "grad_norm": 0.0177001953125, "learning_rate": 4.493753758123021e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 160424476.0, "step": 46870 }, { "entropy": 0.057715128920972344, "epoch": 10.926681431402262, "grad_norm": 0.1884765625, "learning_rate": 4.4936310715542235e-05, "loss": 0.0007, "mean_token_accuracy": 0.9995918929576874, "num_tokens": 160454477.0, "step": 46875 }, { "entropy": 0.07326898816972971, "epoch": 10.927847068422894, "grad_norm": 0.03076171875, "learning_rate": 4.493508373896768e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 160466285.0, "step": 46880 }, { "entropy": 0.07479946129024029, "epoch": 10.929012705443524, "grad_norm": 0.322265625, "learning_rate": 4.493385665152485e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999879539012909, "num_tokens": 160482048.0, "step": 46885 }, { "entropy": 0.04308393271639943, "epoch": 10.930178342464156, "grad_norm": 0.033203125, "learning_rate": 4.493262945323203e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999845504760743, "num_tokens": 160511233.0, "step": 46890 }, { "entropy": 0.05206470335833728, "epoch": 10.931343979484788, "grad_norm": 0.0125732421875, "learning_rate": 4.4931402144107525e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999897003173828, "num_tokens": 160532775.0, "step": 46895 }, { "entropy": 0.0450290129519999, "epoch": 10.93250961650542, "grad_norm": 0.259765625, "learning_rate": 4.493017472416964e-05, "loss": 0.0007, "mean_token_accuracy": 0.9999562263488769, "num_tokens": 160554549.0, "step": 46900 }, { "entropy": 0.05536229070276022, "epoch": 10.933675253526053, "grad_norm": 0.059814453125, "learning_rate": 4.4928947193436675e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999775588512421, "num_tokens": 160584324.0, "step": 46905 }, { "entropy": 0.06466497108340263, "epoch": 10.934840890546683, "grad_norm": 0.052001953125, "learning_rate": 4.492771955192693e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 160611199.0, "step": 46910 }, { "entropy": 0.05528691895306111, "epoch": 10.936006527567315, "grad_norm": 0.0546875, "learning_rate": 4.492649179965872e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999731123447418, "num_tokens": 160635898.0, "step": 46915 }, { "entropy": 0.05148557173088193, "epoch": 10.937172164587947, "grad_norm": 0.015869140625, "learning_rate": 4.492526393665034e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999773383140564, "num_tokens": 160659185.0, "step": 46920 }, { "entropy": 0.046163988672196864, "epoch": 10.93833780160858, "grad_norm": 0.00921630859375, "learning_rate": 4.4924035962920114e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 160675283.0, "step": 46925 }, { "entropy": 0.06352129988372326, "epoch": 10.939503438629211, "grad_norm": 0.11572265625, "learning_rate": 4.492280787848634e-05, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 160700580.0, "step": 46930 }, { "entropy": 0.053053854452446106, "epoch": 10.940669075649843, "grad_norm": 0.03466796875, "learning_rate": 4.492157968336734e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 160720175.0, "step": 46935 }, { "entropy": 0.05099070845171809, "epoch": 10.941834712670474, "grad_norm": 0.154296875, "learning_rate": 4.492035137758141e-05, "loss": 0.0004, "mean_token_accuracy": 0.999974662065506, "num_tokens": 160743372.0, "step": 46940 }, { "entropy": 0.057633578404784204, "epoch": 10.943000349691106, "grad_norm": 0.0233154296875, "learning_rate": 4.4919122961146895e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 160756529.0, "step": 46945 }, { "entropy": 0.03396057607606053, "epoch": 10.944165986711738, "grad_norm": 0.03173828125, "learning_rate": 4.4917894434082094e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999751329421998, "num_tokens": 160805843.0, "step": 46950 }, { "entropy": 0.0497897163964808, "epoch": 10.94533162373237, "grad_norm": 0.08837890625, "learning_rate": 4.4916665796405335e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 160827488.0, "step": 46955 }, { "entropy": 0.057886832021176816, "epoch": 10.946497260753002, "grad_norm": 0.0208740234375, "learning_rate": 4.4915437048134926e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 160843657.0, "step": 46960 }, { "entropy": 0.06596755646169186, "epoch": 10.947662897773633, "grad_norm": 0.0361328125, "learning_rate": 4.4914208189289203e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 160854533.0, "step": 46965 }, { "entropy": 0.06427727779373527, "epoch": 10.948828534794265, "grad_norm": 0.1240234375, "learning_rate": 4.491297921988648e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999892592430115, "num_tokens": 160872862.0, "step": 46970 }, { "entropy": 0.050669254176318644, "epoch": 10.949994171814897, "grad_norm": 0.015869140625, "learning_rate": 4.4911750139945085e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 160891437.0, "step": 46975 }, { "entropy": 0.060540583729743955, "epoch": 10.951159808835529, "grad_norm": 0.080078125, "learning_rate": 4.4910520949483355e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 160900617.0, "step": 46980 }, { "entropy": 0.0568017577752471, "epoch": 10.952325445856161, "grad_norm": 0.0146484375, "learning_rate": 4.490929164851961e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 160917838.0, "step": 46985 }, { "entropy": 0.06554779931902885, "epoch": 10.953491082876791, "grad_norm": 0.27734375, "learning_rate": 4.4908062237072176e-05, "loss": 0.0004, "mean_token_accuracy": 0.9997151017189025, "num_tokens": 160929706.0, "step": 46990 }, { "entropy": 0.060347382165491584, "epoch": 10.954656719897423, "grad_norm": 0.043701171875, "learning_rate": 4.49068327151594e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 160951452.0, "step": 46995 }, { "entropy": 0.05922064566984773, "epoch": 10.955822356918056, "grad_norm": 0.07763671875, "learning_rate": 4.49056030827996e-05, "loss": 0.0012, "mean_token_accuracy": 0.9995283007621765, "num_tokens": 160972788.0, "step": 47000 }, { "entropy": 0.04474808312952518, "epoch": 10.956987993938688, "grad_norm": 0.01904296875, "learning_rate": 4.490437334001112e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 160986235.0, "step": 47005 }, { "entropy": 0.05137424096465111, "epoch": 10.95815363095932, "grad_norm": 0.04052734375, "learning_rate": 4.49031434868123e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999865055084228, "num_tokens": 161001817.0, "step": 47010 }, { "entropy": 0.0649847850203514, "epoch": 10.959319267979952, "grad_norm": 0.0308837890625, "learning_rate": 4.4901913523221474e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999863147735596, "num_tokens": 161028501.0, "step": 47015 }, { "entropy": 0.05216891895979643, "epoch": 10.960484905000582, "grad_norm": 0.12890625, "learning_rate": 4.490068344925699e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 161043699.0, "step": 47020 }, { "entropy": 0.059045640379190446, "epoch": 10.961650542021214, "grad_norm": 0.01092529296875, "learning_rate": 4.489945326493717e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 161055626.0, "step": 47025 }, { "entropy": 0.039859526231884955, "epoch": 10.962816179041846, "grad_norm": 0.0186767578125, "learning_rate": 4.489822297028039e-05, "loss": 0.0007, "mean_token_accuracy": 0.9999716818332672, "num_tokens": 161084220.0, "step": 47030 }, { "entropy": 0.0398352628108114, "epoch": 10.963981816062478, "grad_norm": 0.023681640625, "learning_rate": 4.489699256530497e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 161120641.0, "step": 47035 }, { "entropy": 0.07531622983515263, "epoch": 10.96514745308311, "grad_norm": 0.043212890625, "learning_rate": 4.489576205002926e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999892890453339, "num_tokens": 161139755.0, "step": 47040 }, { "entropy": 0.0534830316901207, "epoch": 10.966313090103741, "grad_norm": 0.0147705078125, "learning_rate": 4.489453142447163e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999890923500061, "num_tokens": 161168833.0, "step": 47045 }, { "entropy": 0.0778751790523529, "epoch": 10.967478727124373, "grad_norm": 1.2109375, "learning_rate": 4.4893300688650396e-05, "loss": 0.0008, "mean_token_accuracy": 0.9997368395328522, "num_tokens": 161180900.0, "step": 47050 }, { "entropy": 0.047051459364593026, "epoch": 10.968644364145005, "grad_norm": 0.0184326171875, "learning_rate": 4.489206984258394e-05, "loss": 0.0004, "mean_token_accuracy": 0.9999641895294189, "num_tokens": 161207943.0, "step": 47055 }, { "entropy": 0.07386998878791928, "epoch": 10.969810001165637, "grad_norm": 0.02392578125, "learning_rate": 4.48908388862906e-05, "loss": 0.0003, "mean_token_accuracy": 0.9999883055686951, "num_tokens": 161238852.0, "step": 47060 }, { "entropy": 0.06613675840198993, "epoch": 10.97097563818627, "grad_norm": 0.0179443359375, "learning_rate": 4.488960781978874e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 161247685.0, "step": 47065 }, { "entropy": 0.06145631754770875, "epoch": 10.972141275206901, "grad_norm": 0.08203125, "learning_rate": 4.488837664309671e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 161261824.0, "step": 47070 }, { "entropy": 0.05057299751788378, "epoch": 10.973306912227532, "grad_norm": 0.0546875, "learning_rate": 4.488714535623286e-05, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 161289547.0, "step": 47075 }, { "entropy": 0.05460757054388523, "epoch": 10.974472549248164, "grad_norm": 0.015625, "learning_rate": 4.4885913959215575e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 161305775.0, "step": 47080 }, { "entropy": 0.04940736414864659, "epoch": 10.975638186268796, "grad_norm": 0.0189208984375, "learning_rate": 4.48846824520632e-05, "loss": 0.0084, "mean_token_accuracy": 0.9985802948474884, "num_tokens": 161331070.0, "step": 47085 }, { "entropy": 0.04240815499797464, "epoch": 10.976803823289428, "grad_norm": 0.0218505859375, "learning_rate": 4.48834508347941e-05, "loss": 0.0005, "mean_token_accuracy": 0.99996976852417, "num_tokens": 161369429.0, "step": 47090 }, { "entropy": 0.049704886972904205, "epoch": 10.97796946031006, "grad_norm": 0.0155029296875, "learning_rate": 4.4882219107426655e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 161394674.0, "step": 47095 }, { "entropy": 0.07719376794993878, "epoch": 10.97913509733069, "grad_norm": 0.1513671875, "learning_rate": 4.488098726997921e-05, "loss": 0.0008, "mean_token_accuracy": 1.0, "num_tokens": 161405704.0, "step": 47100 }, { "entropy": 0.03706503426656127, "epoch": 10.980300734351323, "grad_norm": 0.10791015625, "learning_rate": 4.4879755322470146e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 161445858.0, "step": 47105 }, { "entropy": 0.05613263482227922, "epoch": 10.981466371371955, "grad_norm": 0.126953125, "learning_rate": 4.4878523264917825e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 161467141.0, "step": 47110 }, { "entropy": 0.04681310541927815, "epoch": 10.982632008392587, "grad_norm": 0.044677734375, "learning_rate": 4.487729109734063e-05, "loss": 0.0006, "mean_token_accuracy": 0.9999776303768158, "num_tokens": 161499793.0, "step": 47115 }, { "entropy": 0.04940250525251031, "epoch": 10.983797645413219, "grad_norm": 0.09228515625, "learning_rate": 4.487605881975693e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 161522900.0, "step": 47120 }, { "entropy": 0.06434418596327304, "epoch": 10.98496328243385, "grad_norm": 0.05712890625, "learning_rate": 4.4874826432185094e-05, "loss": 0.0009, "mean_token_accuracy": 0.9998936176300048, "num_tokens": 161532228.0, "step": 47125 }, { "entropy": 0.05575905358418822, "epoch": 10.986128919454481, "grad_norm": 0.1240234375, "learning_rate": 4.487359393464351e-05, "loss": 0.0001, "mean_token_accuracy": 0.999989265203476, "num_tokens": 161557700.0, "step": 47130 }, { "entropy": 0.06682649105787278, "epoch": 10.987294556475113, "grad_norm": 0.0208740234375, "learning_rate": 4.487236132715054e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 161568934.0, "step": 47135 }, { "entropy": 0.04514258056879043, "epoch": 10.988460193495746, "grad_norm": 0.130859375, "learning_rate": 4.487112860972458e-05, "loss": 0.0005, "mean_token_accuracy": 0.9999634981155395, "num_tokens": 161592366.0, "step": 47140 }, { "entropy": 0.06385567653924226, "epoch": 10.989625830516378, "grad_norm": 0.0277099609375, "learning_rate": 4.4869895782384e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 161608094.0, "step": 47145 }, { "entropy": 0.057418633997440335, "epoch": 10.99079146753701, "grad_norm": 0.05712890625, "learning_rate": 4.486866284514719e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 161627583.0, "step": 47150 }, { "entropy": 0.048961544316262005, "epoch": 10.99195710455764, "grad_norm": 0.2294921875, "learning_rate": 4.486742979803254e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999533712863922, "num_tokens": 161645226.0, "step": 47155 }, { "entropy": 0.08157783066853881, "epoch": 10.993122741578272, "grad_norm": 0.0625, "learning_rate": 4.486619664105843e-05, "loss": 0.0103, "mean_token_accuracy": 0.9992714405059815, "num_tokens": 161668211.0, "step": 47160 }, { "entropy": 0.04942806586623192, "epoch": 10.994288378598904, "grad_norm": 0.0341796875, "learning_rate": 4.486496337424325e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 161688874.0, "step": 47165 }, { "entropy": 0.054880039487034085, "epoch": 10.995454015619536, "grad_norm": 0.095703125, "learning_rate": 4.486372999760538e-05, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 161727249.0, "step": 47170 }, { "entropy": 0.048123320937156676, "epoch": 10.996619652640168, "grad_norm": 0.0228271484375, "learning_rate": 4.486249651116323e-05, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 161755299.0, "step": 47175 }, { "entropy": 0.07769046975299716, "epoch": 10.997785289660799, "grad_norm": 0.03857421875, "learning_rate": 4.4861262914935174e-05, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 161769230.0, "step": 47180 }, { "entropy": 0.06101873740553856, "epoch": 10.998950926681431, "grad_norm": 0.1005859375, "learning_rate": 4.4860029208939616e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999891459941864, "num_tokens": 161791922.0, "step": 47185 }, { "entropy": 0.0431011370383203, "epoch": 11.0, "grad_norm": 0.75390625, "learning_rate": 4.485879539319496e-05, "loss": 0.0025, "mean_token_accuracy": 0.9998559223281013, "num_tokens": 161826750.0, "step": 47190 } ], "logging_steps": 5, "max_steps": 128700, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.311995803053312e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }